caiocampos-hotmart committed
Commit a6981fd · Parent: d0289d7

chore: add logs

Files changed:
- README.md +21 -1
- app.py +48 -3
- requirements.txt +2 -1
README.md
CHANGED
@@ -50,10 +50,30 @@ Verifica status da API.
 
 ## Como usar
 
+### Local
+
 ```bash
 curl -X POST "http://localhost:7860/chat" \
   -H "Content-Type: application/json" \
-  -d '{"message": "
+  -d '{"message": "Explique machine learning em 3 frases", "max_tokens": 150, "temperature": 0.3}'
+```
+
+### Hugging Face Spaces
+
+```bash
+curl -X POST "https://caiiofc-llm-agent-api.hf.space/chat" \
+  -H "Content-Type: application/json" \
+  -d '{"message": "Explique machine learning em 3 frases", "max_tokens": 100, "temperature": 0.75}'
+```
+
+### Teste de saúde
+
+```bash
+# Local
+curl http://localhost:7860/health
+
+# HF Spaces
+curl https://caiiofc-llm-agent-api.hf.space/health
 ```
 
 ## Tecnologias
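The README examples above use curl; an equivalent Python client is sketched below for convenience. This is an illustration only, not part of the commit — it assumes the `requests` package is available, and since the response schema is not visible in this diff, the JSON is printed unchanged.

```python
# Hypothetical Python client mirroring the curl examples in the README.
# Assumes `requests` is installed and the API is reachable at the given base URL.
import requests

BASE_URL = "http://localhost:7860"  # or "https://caiiofc-llm-agent-api.hf.space"

payload = {
    "message": "Explique machine learning em 3 frases",
    "max_tokens": 150,
    "temperature": 0.3,
}

resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json())  # response field names are not shown in this diff
```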
app.py
CHANGED
@@ -3,6 +3,8 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
+import psutil
+import multiprocessing
 
 app = FastAPI(title="LLM Agent API", version="1.0.0")
 
@@ -20,25 +22,42 @@ class LocalLLMAgent:
         model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
 
         if not os.path.exists(model_path):
-            print("Baixando modelo...")
+            print("📥 Baixando modelo Llama-2-7B-Chat (Q4_K_M)...")
+            print("   Isso pode levar alguns minutos...")
             model_path = hf_hub_download(
                 repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                 filename="llama-2-7b-chat.Q4_K_M.gguf",
                 local_dir="./"
             )
-            print("Modelo baixado com sucesso!")
+            print("✅ Modelo baixado com sucesso!")
+        else:
+            print("📁 Modelo já existe, carregando...")
 
+        # Configura para usar todas as CPUs disponíveis
+        n_threads = multiprocessing.cpu_count()
+        print(f"🔧 Configurando llama-cpp-python:")
+        print(f"   - CPUs disponíveis: {n_threads}")
+        print(f"   - Threads: {n_threads}")
+        print(f"   - Contexto: 2048 tokens")
+
+        print("🚀 Inicializando modelo...")
         self.llm = Llama(
             model_path=model_path,
             chat_format="llama-2",
             n_ctx=2048,
+            n_threads=n_threads,
+            n_threads_batch=n_threads,
             verbose=False
         )
+        print(f"✅ Modelo carregado! Usando {n_threads} threads")
         self.messages = [
             {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
         ]
 
-    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.
+    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
+        print(f"💬 Nova mensagem: {message[:50]}{'...' if len(message) > 50 else ''}")
+        print(f"   Parâmetros: max_tokens={max_tokens}, temperature={temperature}")
+
         self.messages.append({"role": "user", "content": message})
 
         response = self.llm.create_chat_completion(
@@ -50,6 +69,7 @@ class LocalLLMAgent:
         assistant_message = response['choices'][0]['message']['content']
         self.messages.append({"role": "assistant", "content": assistant_message})
 
+        print(f"✅ Resposta gerada ({len(assistant_message)} chars)")
         return assistant_message
 
 # Inicializa o agente globalmente
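For context, the hunks above boil down to the following standalone sketch: detect the core count, log basic host resources, and hand the same thread counts to `llama_cpp.Llama`. This is an illustration under assumptions, not code from the repo — it presumes `llama-cpp-python` and `psutil` are installed and that the GGUF file is already on disk.

```python
# Hypothetical standalone sketch of the CPU tuning added in this commit:
# use every visible core for llama.cpp's generation and batch thread pools.
import multiprocessing

import psutil
from llama_cpp import Llama

n_threads = multiprocessing.cpu_count()
total_gb = psutil.virtual_memory().total / (1024 ** 3)
print(f"threads={n_threads}, ram={total_gb:.2f} GB")

llm = Llama(
    model_path="./llama-2-7b-chat.Q4_K_M.gguf",  # path taken from the diff; file assumed present
    chat_format="llama-2",
    n_ctx=2048,
    n_threads=n_threads,        # threads used during generation
    n_threads_batch=n_threads,  # threads used during prompt processing
    verbose=False,
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Explique machine learning em 3 frases"}],
    max_tokens=100,
    temperature=0.75,
)
print(out["choices"][0]["message"]["content"])
```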
@@ -57,8 +77,18 @@ agent = None
 
 @app.on_event("startup")
 async def startup_event():
+    print("=== INICIANDO LLM AGENT API ===")
+    print(f"CPUs disponíveis: {multiprocessing.cpu_count()}")
+    print(f"Memória total: {round(psutil.virtual_memory().total / (1024**3), 2)} GB")
+
     global agent
     agent = LocalLLMAgent()
+
+    print("✅ API pronta para uso!")
+    print("Endpoints disponíveis:")
+    print("   - POST /chat")
+    print("   - GET /health")
+    print("   - GET /system")
 
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(request: ChatRequest):
@@ -71,4 +101,19 @@ async def chat_endpoint(request: ChatRequest):
 async def health_check():
     return {"status": "healthy"}
 
+@app.get("/system")
+async def system_info():
+    cpu_count = multiprocessing.cpu_count()
+    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
+    memory = psutil.virtual_memory()
+
+    return {
+        "cpu_cores": cpu_count,
+        "cpu_usage_per_core": cpu_percent,
+        "cpu_usage_total": psutil.cpu_percent(interval=1),
+        "memory_total_gb": round(memory.total / (1024**3), 2),
+        "memory_used_gb": round(memory.used / (1024**3), 2),
+        "memory_percent": memory.percent
+    }
+
 # Removido - uvicorn será executado pelo Dockerfile
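A quick way to exercise the endpoints the startup log now advertises is sketched below; `requests` is assumed to be installed (it is not in this repo's requirements) and the local default port is used.

```python
# Hypothetical smoke test for the endpoints announced in the startup log.
import requests

BASE_URL = "http://localhost:7860"

print(requests.get(f"{BASE_URL}/health", timeout=10).json())  # expected: {"status": "healthy"}
print(requests.get(f"{BASE_URL}/system", timeout=10).json())  # CPU core count, per-core usage, memory snapshot
```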
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ llama-cpp-python==0.2.11 --extra-index-url https://abetlen.github.io/llama-cpp-p
 huggingface-hub
 fastapi
 uvicorn
-pydantic
+pydantic
+psutil