aferrmt committed on
Commit
e582e30
·
1 Parent(s): 363d883

0.1 Adding chat_raw endpoint to improve response time

Browse files
.gitignore CHANGED
@@ -2,4 +2,5 @@ myvenv/
2
  data/
3
  __pycache__/
4
  *.gguf
5
- *.ipynb
 
 
2
  data/
3
  __pycache__/
4
  *.gguf
5
+ *.ipynb
6
+ llama.cpp
config/config.yml CHANGED
@@ -1,11 +1,10 @@
1
  models:
2
  - type: main
3
- engine: ollama
4
  model: kai-model:latest # Use your actual model name
5
  parameters:
6
- base_url: http://127.0.0.1:11434
7
- temperature: 0.3
8
- top_p: 0.9
9
 
10
  instructions:
11
  - type: general
 
1
  models:
2
  - type: main
3
+ engine: openai
4
  model: kai-model:latest # Use your actual model name
5
  parameters:
6
+ openai_api_base: http://localhost:8001/v1
7
+
 
8
 
9
  instructions:
10
  - type: general
git-diagnostics-2025-08-11-1504.zip ADDED
Binary file (786 Bytes). View file
 
main.py CHANGED
@@ -1,43 +1,73 @@
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
3
- from llama_cpp import Llama
4
  from nemoguardrails import LLMRails, RailsConfig
 
5
  import os
6
  from langchain_community.llms import LlamaCpp
 
7
 
8
-
9
- app = FastAPI()
10
- MODEL_PATH = "./kai-model-7.2B-Q4_0.gguf"
11
- llm = LlamaCpp(
12
- model_path="./kai-model-7.2B-Q4_0.gguf",
13
- temperature=0.7,
14
- top_k=40,
15
- top_p=0.95
16
  )
17
 
18
- # Load guardrails configuration
 
 
 
 
 
 
 
 
 
 
19
  config = RailsConfig.from_path("./config")
20
- rails = LLMRails(config, llm=llm)
 
 
21
 
22
  class ChatRequest(BaseModel):
23
  message: str
24
 
 
 
 
 
 
 
 
 
 
25
  @app.post("/chat")
26
  async def chat_endpoint(request: ChatRequest):
 
 
 
 
27
  try:
28
- # Generate response with guardrails
29
- response = await rails.generate_async(
30
  messages=[{"role": "user", "content": request.message}]
31
  )
32
- return {"response": response["content"]}
33
  except Exception as e:
34
- raise HTTPException(status_code=500, detail=str(e))
35
 
36
  @app.get("/health")
37
  def health_check():
38
- return {"status": "ok", "model": MODEL_PATH}
 
 
 
 
 
 
 
 
39
 
40
 
41
  if __name__ == "__main__":
 
42
  import uvicorn
43
- uvicorn.run(main, host="127.0.0.1", port=8000)
 
1
  from fastapi import FastAPI, HTTPException
2
  from pydantic import BaseModel
 
3
  from nemoguardrails import LLMRails, RailsConfig
4
+ from typing import Any, Dict, Union
5
  import os
6
  from langchain_community.llms import LlamaCpp
7
+ from langchain_openai import ChatOpenAI
8
 
9
# --- Configure the OpenAI-compatible provider (llama.cpp server) ---
# These defaults must be applied BEFORE the client below is constructed,
# because ChatOpenAI reads them via os.getenv(); adjust host/port as needed.
os.environ.setdefault("OPENAI_API_KEY", "sk-no-key-needed")  # dummy
os.environ.setdefault("OPENAI_API_BASE", "http://localhost:8001/v1")
os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:8001/v1")  # for compatibility

# Chat client pointed at the OpenAI-like llama.cpp server.
llm = ChatOpenAI(
    base_url=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
    model="kai-model:latest",  # must match what your llama_cpp.server exposes
)

# --- Load the guardrails configuration ---
# Expected layout:
#   ./config/
#     config.yml
#     rails/*.co   (flows/policies)
config = RailsConfig.from_path("./config")
rails = LLMRails(config)  # <- do NOT pass an LLM here; use the OpenAI provider from config/env

app = FastAPI(title="Guardrailed LLM API")
30
 
31
# Request payload shared by the /chat and /chat_raw endpoints.
class ChatRequest(BaseModel):
    message: str
33
 
34
+ def _normalize_response(r: Union[str, Dict[str, Any]]) -> str:
35
+ if isinstance(r, str):
36
+ return r
37
+ if isinstance(r, dict):
38
+ for k in ("content", "output", "text"): # distintas versiones/devuelven claves distintas
39
+ if k in r and isinstance(r[k], str):
40
+ return r[k]
41
+ return str(r)
42
+
43
  @app.post("/chat")
44
  async def chat_endpoint(request: ChatRequest):
45
+ """
46
+ Aplica NeMo Guardrails a la petición y delega la generación al servidor OpenAI-like de llama.cpp
47
+ configurado en OPENAI_API_BASE.
48
+ """
49
  try:
50
+ resp = await rails.generate_async(
 
51
  messages=[{"role": "user", "content": request.message}]
52
  )
53
+ return {"response": _normalize_response(resp)}
54
  except Exception as e:
55
+ raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {e}")
56
 
57
  @app.get("/health")
58
  def health_check():
59
+ return {
60
+ "status": "ok",
61
+ "openai_api_base": os.getenv("OPENAI_API_BASE") or os.getenv("OPENAI_BASE_URL"),
62
+ "rails_config_loaded": True,
63
+ }
64
+
65
+ @app.post("/chat_raw")
66
+ def chat_raw(r: ChatRequest):
67
+ return {"text": llm.invoke(r.message)} # same llm instance
68
 
69
 
70
if __name__ == "__main__":
    # Development entry point; in production launch via gunicorn/uvicorn CLI.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)