|
from fastapi import FastAPI, HTTPException |
|
from pydantic import BaseModel |
|
from nemoguardrails import LLMRails, RailsConfig |
|
from typing import Any, Dict, Union |
|
import os |
|
from langchain_community.llms import LlamaCpp |
|
from langchain_openai import ChatOpenAI |
|
|
|
# Defaults for a local OpenAI-compatible server. They must be applied BEFORE
# the client below is built: os.getenv() is evaluated at construction time, so
# setting the defaults afterwards would leave the client with None for both the
# base URL and the API key on a clean environment. setdefault() never
# overrides a value that is already present in the environment.
os.environ.setdefault("OPENAI_API_KEY", "sk-no-key-needed")
os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8001/v1")
os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:8001/v1")

# Chat client used directly by the /chat_raw endpoint.
llm = ChatOpenAI(
    base_url=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
    model="kai-model:latest",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Read the guardrails definition from the relative "./config" path and build
# the rails engine that the /chat endpoint sends every message through.
config = RailsConfig.from_path("./config")
rails = LLMRails(config=config)

# Application object that the route decorators below register against.
app = FastAPI(title="Guardrailed LLM API")
|
|
|
# Request body shared by the /chat and /chat_raw endpoints.
class ChatRequest(BaseModel):
    # Text of the user's message; forwarded as-is to the rails or the model.
    message: str
|
|
|
def _normalize_response(r: Union[str, Dict[str, Any]]) -> str: |
|
if isinstance(r, str): |
|
return r |
|
if isinstance(r, dict): |
|
for k in ("content", "output", "text"): |
|
if k in r and isinstance(r[k], str): |
|
return r[k] |
|
return str(r) |
|
|
|
@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    """Apply NeMo Guardrails to the request and return the generated reply.

    Generation is delegated to the llama.cpp OpenAI-compatible server
    configured in OPENAI_API_BASE.

    Args:
        request: Body carrying the user's ``message``.

    Returns:
        A dict ``{"response": <text>}`` where the text is the rails result
        flattened by ``_normalize_response``.

    Raises:
        HTTPException: Status 500 for any error raised while generating or
            normalizing the reply; the original exception is chained as the
            cause.
    """
    # NOTE(review): the llama.cpp / OPENAI_API_BASE wiring mentioned in the
    # docstring is not visible in this function -- presumably it comes from
    # the rails configuration; confirm.
    try:
        resp = await rails.generate_async(
            messages=[{"role": "user", "content": request.message}]
        )
        return {"response": _normalize_response(resp)}
    except Exception as e:
        # Top-level boundary: convert any failure into an HTTP 500 while
        # keeping the original exception attached via explicit chaining.
        # NOTE(review): the detail string exposes the internal error text to
        # the client; kept unchanged because callers may rely on it.
        raise HTTPException(
            status_code=500, detail=f"{type(e).__name__}: {e}"
        ) from e
|
|
|
@app.get("/health")
def health_check():
    # Liveness probe: reports the backend URL taken from the environment,
    # preferring OPENAI_API_BASE and falling back to OPENAI_BASE_URL when the
    # former is unset or empty.
    backend_url = os.getenv("OPENAI_API_BASE")
    if not backend_url:
        backend_url = os.getenv("OPENAI_BASE_URL")

    # "rails_config_loaded" is a constant True; it is not probed at call time.
    payload = {
        "status": "ok",
        "openai_api_base": backend_url,
        "rails_config_loaded": True,
    }
    return payload
|
|
|
@app.post("/chat_raw")
def chat_raw(r: ChatRequest):
    """Send the message straight to the model, without going through the rails.

    Args:
        r: Body carrying the user's ``message``.

    Returns:
        A dict ``{"text": <reply>}`` holding the model's reply content.
    """
    reply = llm.invoke(r.message)
    # invoke() on a chat model returns a message object rather than a string;
    # expose only its content so "text" really is text. The getattr fallback
    # keeps the previous behaviour if the result has no ``content`` attribute.
    return {"text": getattr(reply, "content", reply)}
|
|
|
|
|
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when the file is run as a script.
    import uvicorn

    # Listen on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|