caiocampos-hotmart committed
Commit a6981fd · Parent: d0289d7

chore: add logs

Files changed:
- README.md +21 -1
- app.py +48 -3
- requirements.txt +2 -1
README.md
CHANGED
@@ -50,10 +50,30 @@ Verifica status da API.
 
 ## Como usar
 
+### Local
+
 ```bash
 curl -X POST "http://localhost:7860/chat" \
   -H "Content-Type: application/json" \
-  -d '{"message": "
+  -d '{"message": "Explique machine learning em 3 frases", "max_tokens": 150, "temperature": 0.3}'
+```
+
+### Hugging Face Spaces
+
+```bash
+curl -X POST "https://caiiofc-llm-agent-api.hf.space/chat" \
+  -H "Content-Type: application/json" \
+  -d '{"message": "Explique machine learning em 3 frases", "max_tokens": 100, "temperature": 0.75}'
+```
+
+### Teste de saúde
+
+```bash
+# Local
+curl http://localhost:7860/health
+
+# HF Spaces
+curl https://caiiofc-llm-agent-api.hf.space/health
 ```
 
 ## Tecnologias
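The README examples above use curl; an equivalent Python client is sketched below for convenience. This is an illustration only, not part of the commit — it assumes the `requests` package is available, and since the response schema is not visible in this diff, the JSON is printed unchanged.

```python
# Hypothetical Python client mirroring the curl examples in the README.
# Assumes `requests` is installed and the API is reachable at the given base URL.
import requests

BASE_URL = "http://localhost:7860"  # or "https://caiiofc-llm-agent-api.hf.space"

payload = {
    "message": "Explique machine learning em 3 frases",
    "max_tokens": 150,
    "temperature": 0.3,
}

resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json())  # response field names are not shown in this diff
```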
app.py
CHANGED
@@ -3,6 +3,8 @@ from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 import os
+import psutil
+import multiprocessing
 
 app = FastAPI(title="LLM Agent API", version="1.0.0")
 
@@ -20,25 +22,42 @@ class LocalLLMAgent:
         model_path = "./llama-2-7b-chat.Q4_K_M.gguf"
 
         if not os.path.exists(model_path):
-            print("Baixando modelo...")
+            print("📥 Baixando modelo Llama-2-7B-Chat (Q4_K_M)...")
+            print("   Isso pode levar alguns minutos...")
             model_path = hf_hub_download(
                 repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                 filename="llama-2-7b-chat.Q4_K_M.gguf",
                 local_dir="./"
             )
-            print("Modelo baixado com sucesso!")
+            print("✅ Modelo baixado com sucesso!")
+        else:
+            print("📁 Modelo já existe, carregando...")
 
+        # Configura para usar todas as CPUs disponíveis
+        n_threads = multiprocessing.cpu_count()
+        print(f"🔧 Configurando llama-cpp-python:")
+        print(f"   - CPUs disponíveis: {n_threads}")
+        print(f"   - Threads: {n_threads}")
+        print(f"   - Contexto: 2048 tokens")
+
+        print("🚀 Inicializando modelo...")
         self.llm = Llama(
             model_path=model_path,
             chat_format="llama-2",
             n_ctx=2048,
+            n_threads=n_threads,
+            n_threads_batch=n_threads,
             verbose=False
         )
+        print(f"✅ Modelo carregado! Usando {n_threads} threads")
         self.messages = [
             {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
         ]
 
-    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.
+    def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
+        print(f"💬 Nova mensagem: {message[:50]}{'...' if len(message) > 50 else ''}")
+        print(f"   Parâmetros: max_tokens={max_tokens}, temperature={temperature}")
+
         self.messages.append({"role": "user", "content": message})
 
         response = self.llm.create_chat_completion(
@@ -50,6 +69,7 @@ class LocalLLMAgent:
         assistant_message = response['choices'][0]['message']['content']
         self.messages.append({"role": "assistant", "content": assistant_message})
 
+        print(f"✅ Resposta gerada ({len(assistant_message)} chars)")
         return assistant_message
 
 # Inicializa o agente globalmente
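For context, the hunks above boil down to the following standalone sketch: detect the core count, log basic host resources, and hand the same thread counts to `llama_cpp.Llama`. This is an illustration under assumptions, not code from the repo — it presumes `llama-cpp-python` and `psutil` are installed and that the GGUF file is already on disk.

```python
# Hypothetical standalone sketch of the CPU tuning added in this commit:
# use every visible core for llama.cpp's generation and batch thread pools.
import multiprocessing

import psutil
from llama_cpp import Llama

n_threads = multiprocessing.cpu_count()
total_gb = psutil.virtual_memory().total / (1024 ** 3)
print(f"threads={n_threads}, ram={total_gb:.2f} GB")

llm = Llama(
    model_path="./llama-2-7b-chat.Q4_K_M.gguf",  # path taken from the diff; file assumed present
    chat_format="llama-2",
    n_ctx=2048,
    n_threads=n_threads,        # threads used during generation
    n_threads_batch=n_threads,  # threads used during prompt processing
    verbose=False,
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Explique machine learning em 3 frases"}],
    max_tokens=100,
    temperature=0.75,
)
print(out["choices"][0]["message"]["content"])
```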
@@ -57,8 +77,18 @@ agent = None
 
 @app.on_event("startup")
 async def startup_event():
+    print("=== INICIANDO LLM AGENT API ===")
+    print(f"CPUs disponíveis: {multiprocessing.cpu_count()}")
+    print(f"Memória total: {round(psutil.virtual_memory().total / (1024**3), 2)} GB")
+
     global agent
     agent = LocalLLMAgent()
+
+    print("✅ API pronta para uso!")
+    print("Endpoints disponíveis:")
+    print("   - POST /chat")
+    print("   - GET /health")
+    print("   - GET /system")
 
 @app.post("/chat", response_model=ChatResponse)
 async def chat_endpoint(request: ChatRequest):
@@ -71,4 +101,19 @@ async def chat_endpoint(request: ChatRequest):
 async def health_check():
     return {"status": "healthy"}
 
+@app.get("/system")
+async def system_info():
+    cpu_count = multiprocessing.cpu_count()
+    cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
+    memory = psutil.virtual_memory()
+
+    return {
+        "cpu_cores": cpu_count,
+        "cpu_usage_per_core": cpu_percent,
+        "cpu_usage_total": psutil.cpu_percent(interval=1),
+        "memory_total_gb": round(memory.total / (1024**3), 2),
+        "memory_used_gb": round(memory.used / (1024**3), 2),
+        "memory_percent": memory.percent
+    }
+
 # Removido - uvicorn será executado pelo Dockerfile
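A quick way to exercise the endpoints the startup log now advertises is sketched below; `requests` is assumed to be installed (it is not in this repo's requirements) and the local default port is used.

```python
# Hypothetical smoke test for the endpoints announced in the startup log.
import requests

BASE_URL = "http://localhost:7860"

print(requests.get(f"{BASE_URL}/health", timeout=10).json())  # expected: {"status": "healthy"}
print(requests.get(f"{BASE_URL}/system", timeout=10).json())  # CPU core count, per-core usage, memory snapshot
```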
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ llama-cpp-python==0.2.11 --extra-index-url https://abetlen.github.io/llama-cpp-p
 huggingface-hub
 fastapi
 uvicorn
-pydantic
+pydantic
+psutil