caiocampos-hotmart committed on
Commit a6981fd · 1 Parent(s): d0289d7

chore: add logs

Files changed (3):
  1. README.md +21 -1
  2. app.py +48 -3
  3. requirements.txt +2 -1
README.md CHANGED
@@ -50,10 +50,30 @@ Checks API status.

  ## How to use

+ ### Local
+
  ```bash
  curl -X POST "http://localhost:7860/chat" \
    -H "Content-Type: application/json" \
-   -d '{"message": "Hello!"}'
+   -d '{"message": "Explain machine learning in 3 sentences", "max_tokens": 150, "temperature": 0.3}'
+ ```
+
+ ### Hugging Face Spaces
+
+ ```bash
+ curl -X POST "https://caiiofc-llm-agent-api.hf.space/chat" \
+   -H "Content-Type: application/json" \
+   -d '{"message": "Explain machine learning in 3 sentences", "max_tokens": 100, "temperature": 0.75}'
+ ```
+
+ ### Health check
+
+ ```bash
+ # Local
+ curl http://localhost:7860/health
+
+ # HF Spaces
+ curl https://caiiofc-llm-agent-api.hf.space/health
  ```

  ## Technologies
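
The curl calls in the README hunk above map directly onto a small Python client. The sketch below is illustrative only (it is not part of this commit); it assumes the request fields shown in the README examples (`message`, `max_tokens`, `temperature`) and the local port 7860.

```python
# Minimal Python client mirroring the README curl examples (sketch, not part of the commit).
import requests

BASE_URL = "http://localhost:7860"  # assumption: local run; swap for the HF Spaces URL above

def chat(message: str, max_tokens: int = 150, temperature: float = 0.3) -> dict:
    """POST /chat with the same JSON fields used in the curl examples."""
    resp = requests.post(
        f"{BASE_URL}/chat",
        json={"message": message, "max_tokens": max_tokens, "temperature": temperature},
        timeout=300,  # CPU-only inference can take a while
    )
    resp.raise_for_status()
    return resp.json()

if __name__ == "__main__":
    print(requests.get(f"{BASE_URL}/health", timeout=10).json())  # expects {"status": "healthy"}
    print(chat("Explain machine learning in 3 sentences"))
```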
app.py CHANGED
@@ -3,6 +3,8 @@ from pydantic import BaseModel
  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
  import os
+ import psutil
+ import multiprocessing

  app = FastAPI(title="LLM Agent API", version="1.0.0")

@@ -20,25 +22,42 @@ class LocalLLMAgent:
          model_path = "./llama-2-7b-chat.Q4_K_M.gguf"

          if not os.path.exists(model_path):
-             print("Downloading model...")
+             print("📥 Downloading Llama-2-7B-Chat model (Q4_K_M)...")
+             print("   This may take a few minutes...")
              model_path = hf_hub_download(
                  repo_id="TheBloke/Llama-2-7B-Chat-GGUF",
                  filename="llama-2-7b-chat.Q4_K_M.gguf",
                  local_dir="./"
              )
-             print("Model downloaded successfully!")
+             print("Model downloaded successfully!")
+         else:
+             print("📁 Model already exists, loading...")

+         # Configure to use all available CPUs
+         n_threads = multiprocessing.cpu_count()
+         print(f"🔧 Configuring llama-cpp-python:")
+         print(f"   - Available CPUs: {n_threads}")
+         print(f"   - Threads: {n_threads}")
+         print(f"   - Context: 2048 tokens")
+
+         print("🚀 Initializing model...")
          self.llm = Llama(
              model_path=model_path,
              chat_format="llama-2",
              n_ctx=2048,
+             n_threads=n_threads,
+             n_threads_batch=n_threads,
              verbose=False
          )
+         print(f"✅ Model loaded! Using {n_threads} threads")
          self.messages = [
              {"role": "system", "content": "Responda sempre em português brasileiro de forma natural e conversacional."}
          ]

-     def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.7) -> str:
+     def chat(self, message: str, max_tokens: int = 100, temperature: float = 0.75) -> str:
+         print(f"💬 New message: {message[:50]}{'...' if len(message) > 50 else ''}")
+         print(f"   Parameters: max_tokens={max_tokens}, temperature={temperature}")
+
          self.messages.append({"role": "user", "content": message})

          response = self.llm.create_chat_completion(
@@ -50,6 +69,7 @@ class LocalLLMAgent:
          assistant_message = response['choices'][0]['message']['content']
          self.messages.append({"role": "assistant", "content": assistant_message})

+         print(f"✅ Response generated ({len(assistant_message)} chars)")
          return assistant_message

  # Initialize the agent globally
@@ -57,8 +77,18 @@ agent = None

  @app.on_event("startup")
  async def startup_event():
+     print("=== STARTING LLM AGENT API ===")
+     print(f"Available CPUs: {multiprocessing.cpu_count()}")
+     print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB")
+
      global agent
      agent = LocalLLMAgent()
+
+     print("✅ API ready for use!")
+     print("Available endpoints:")
+     print("   - POST /chat")
+     print("   - GET /health")
+     print("   - GET /system")

  @app.post("/chat", response_model=ChatResponse)
  async def chat_endpoint(request: ChatRequest):
@@ -71,4 +101,19 @@ async def chat_endpoint(request: ChatRequest):
  async def health_check():
      return {"status": "healthy"}

+ @app.get("/system")
+ async def system_info():
+     cpu_count = multiprocessing.cpu_count()
+     cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
+     memory = psutil.virtual_memory()
+
+     return {
+         "cpu_cores": cpu_count,
+         "cpu_usage_per_core": cpu_percent,
+         "cpu_usage_total": psutil.cpu_percent(interval=1),
+         "memory_total_gb": round(memory.total / (1024**3), 2),
+         "memory_used_gb": round(memory.used / (1024**3), 2),
+         "memory_percent": memory.percent
+     }
+
  # Removed - uvicorn will be run by the Dockerfile
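
The `GET /system` endpoint added in the last hunk returns the `psutil` snapshot as JSON. Below is a minimal consumer sketch (not part of the commit); the field names follow the `system_info()` handler above, and the local base URL is an assumption. Note that the handler calls `psutil.cpu_percent(interval=1)` twice, so each request blocks for roughly two seconds before responding.

```python
# Sketch of a consumer for the new /system endpoint (illustrative, not part of the commit).
import requests

BASE_URL = "http://localhost:7860"  # assumption: local deployment

def print_system_snapshot() -> None:
    # Field names match the dict returned by system_info() in app.py.
    info = requests.get(f"{BASE_URL}/system", timeout=30).json()
    print(f"CPU cores: {info['cpu_cores']}")
    print(f"Total CPU usage: {info['cpu_usage_total']}%")
    print(f"Per-core usage: {info['cpu_usage_per_core']}")
    print(f"Memory: {info['memory_used_gb']} / {info['memory_total_gb']} GB ({info['memory_percent']}%)")

if __name__ == "__main__":
    print_system_snapshot()
```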
requirements.txt CHANGED
@@ -2,4 +2,5 @@ llama-cpp-python==0.2.11 --extra-index-url https://abetlen.github.io/llama-cpp-p
  huggingface-hub
  fastapi
  uvicorn
- pydantic
+ pydantic
+ psutil
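
`psutil` is the only new third-party dependency; together with the standard-library `multiprocessing` module it backs the new startup logs and the `/system` endpoint. A quick sanity check of those calls outside the API might look like this (illustrative sketch; values are machine-specific):

```python
# Standalone check of the metrics calls that app.py now relies on (sketch only).
import multiprocessing
import psutil

print(f"CPU cores: {multiprocessing.cpu_count()}")
print(f"Total memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB")
print(f"Per-core CPU usage: {psutil.cpu_percent(interval=1, percpu=True)}")
```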