from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from nemoguardrails import LLMRails, RailsConfig
from typing import Any, Dict, Union
import os
from langchain_openai import ChatOpenAI

# --- Configure the OpenAI-like provider (llama.cpp server) ---
# Adjust if you use a different host/port.
# These defaults must be set before ChatOpenAI reads them below.
os.environ.setdefault("OPENAI_API_KEY", "sk-no-key-needed")       # dummy key, ignored by llama.cpp
os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8001/v1")
os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:8001/v1")  # for compatibility with newer clients

llm = ChatOpenAI(
    base_url=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
    model="kai-model:latest",  # must match what your llama_cpp.server exposes
)

# --- Load your guardrails configuration ---
# Expected structure:
# ./config/
#   config.yml
#   rails/*.co  (your flows/policies)
config = RailsConfig.from_path("./config")
rails = LLMRails(config)   # <- do NOT pass an LLM here; use the OpenAI provider from config/env
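# A minimal sketch of what ./config/config.yml might contain so the rails use the
# same OpenAI-compatible endpoint (engine and model name are assumptions; adapt to
# your setup):
#
#   models:
#     - type: main
#       engine: openai
#       model: kai-model:latest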

app = FastAPI(title="Guardrailed LLM API")

class ChatRequest(BaseModel):
    message: str

def _normalize_response(r: Union[str, Dict[str, Any]]) -> str:
    if isinstance(r, str):
        return r
    if isinstance(r, dict):
        for k in ("content", "output", "text"):  # different versions return different keys
            if k in r and isinstance(r[k], str):
                return r[k]
    return str(r)

@app.post("/chat")
async def chat_endpoint(request: ChatRequest):
    """
    Aplica NeMo Guardrails a la petición y delega la generación al servidor OpenAI-like de llama.cpp
    configurado en OPENAI_API_BASE.
    """
    try:
        resp = await rails.generate_async(
            messages=[{"role": "user", "content": request.message}]
        )
        return {"response": _normalize_response(resp)}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {e}")

@app.get("/health")
def health_check():
    return {
        "status": "ok",
        "openai_api_base": os.getenv("OPENAI_API_BASE") or os.getenv("OPENAI_BASE_URL"),
        "rails_config_loaded": True,
    }

@app.post("/chat_raw")
def chat_raw(r: ChatRequest):
    return {"text": llm.invoke(r.message)}  # same llm instance


if __name__ == "__main__":
    # Development: run with uvicorn. In production, use gunicorn from the terminal.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
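
# --- Example usage (illustrative; assumes the API is running on localhost:8000) ---
#   curl -X POST http://localhost:8000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"message": "Hello there"}'
#   curl http://localhost:8000/health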