from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from nemoguardrails import LLMRails, RailsConfig
from typing import Any, Dict, Union
import os
from langchain_openai import ChatOpenAI

# --- Configure the OpenAI-like provider (llama.cpp server) ---
# Adjust if you use a different host/port. These must be set *before* the
# ChatOpenAI client below is built, since it reads them at construction time.
os.environ.setdefault("OPENAI_API_KEY", "sk-no-key-needed")  # dummy
os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8001/v1")
os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:8001/v1")  # for compatibility

llm = ChatOpenAI(
    base_url=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
    model="kai-model:latest",  # must match what your llama_cpp.server exposes
)

# --- Load your guardrails configuration ---
# Expected layout:
#   ./config/
#     config.yml
#     rails/*.co   (your flows/policies)
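#
# A minimal config.yml sketch (assumption: the model entry names the same
# model the llama.cpp server exposes):
#
#   models:
#     - type: main
#       engine: openai
#       model: kai-model:latest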
config = RailsConfig.from_path("./config")
rails = LLMRails(config)  # <- do NOT pass an LLM here; use the OpenAI provider from config/env

app = FastAPI(title="Guardrailed LLM API")

class ChatRequest(BaseModel):
    message: str

def _normalize_response(r: Union[str, Dict[str, Any]]) -> str:
    if isinstance(r, str):
        return r
    if isinstance(r, dict):
        # Different nemoguardrails versions return different keys.
        for k in ("content", "output", "text"):
            if k in r and isinstance(r[k], str):
                return r[k]
    return str(r)
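
# e.g. _normalize_response("hi") == "hi"
#      _normalize_response({"content": "hi"}) == "hi"
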
@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    """
    Apply NeMo Guardrails to the request and delegate generation to the
    OpenAI-like llama.cpp server configured in OPENAI_API_BASE.
    """
    try:
        resp = await rails.generate_async(
            messages=[{"role": "user", "content": request.message}]
        )
        return {"response": _normalize_response(resp)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {e}")
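
# Example request (assuming the app listens on port 8000):
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello"}'
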
@app.get("/health")
def health_check():
    return {
        "status": "ok",
        "openai_api_base": os.getenv("OPENAI_API_BASE") or os.getenv("OPENAI_BASE_URL"),
        "rails_config_loaded": True,
    }

@app.post("/chat_raw")
def chat_raw(r: ChatRequest):
    # Debug endpoint: same llm instance, but bypasses the guardrails.
    # ChatOpenAI.invoke returns an AIMessage; return its .content string.
    return {"text": llm.invoke(r.message).content}

if __name__ == "__main__":
    # Development: uvicorn. In production, run gunicorn from the terminal.
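    # A possible production invocation (module name "main" is an assumption;
    # adjust it to this file's actual name):
    #   gunicorn -k uvicorn.workers.UvicornWorker -b 0.0.0.0:8000 main:app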
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)