habulaj committed
Commit f75c7ab · verified · 1 Parent(s): 39c4a66

Update routers/searchterm.py

Files changed (1)
  1. routers/searchterm.py +122 -208
routers/searchterm.py CHANGED
@@ -10,17 +10,13 @@ import uuid
 import time
 from pathlib import Path
 from urllib.parse import urlparse
-from typing import List, Dict, Any, Optional, Set, Tuple
+from typing import List, Dict, Any, Optional
 from fastapi import APIRouter, HTTPException, Body
 from fastapi.responses import FileResponse
 from newspaper import Article
 from threading import Timer
 from google import genai
 from google.genai import types
-from asyncio import Queue, create_task, gather
-from concurrent.futures import ThreadPoolExecutor
-import aiofiles
-import ujson  # Faster JSON
 
 router = APIRouter()
 
@@ -45,16 +41,10 @@ USER_AGENTS = [
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
 ]
 
-BLOCKED_DOMAINS = frozenset({  # frozenset is faster for lookups
-    "reddit.com", "www.reddit.com", "old.reddit.com",
-    "quora.com", "www.quora.com"
-})
+BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
+                   "quora.com", "www.quora.com"}
 
 MAX_TEXT_LENGTH = 4000
-MAX_CONCURRENT_SEARCHES = 30  # Increased
-MAX_CONCURRENT_EXTRACTIONS = 80  # Increased significantly
-EXTRACTION_TIMEOUT = 8  # Reduced
-HTTP_TIMEOUT = 10  # Reduced
 
 # Directory for temporary files
 TEMP_DIR = Path("/tmp")
@@ -63,30 +53,22 @@ TEMP_DIR.mkdir(exist_ok=True)
 # Dictionary used to track temporary files
 temp_files = {}
 
-# Thread pool for CPU-intensive operations
-thread_pool = ThreadPoolExecutor(max_workers=20)
-
-# Cache of blocked domains to avoid repeated checks
-domain_cache = {}
 
 def is_blocked_domain(url: str) -> bool:
     try:
         host = urlparse(url).netloc.lower()
-
-        # Cache lookup
-        if host in domain_cache:
-            return domain_cache[host]
-
-        is_blocked = any(host == b or host.endswith("." + b) for b in BLOCKED_DOMAINS)
-        domain_cache[host] = is_blocked
-        return is_blocked
+        return any(host == b or host.endswith("." + b) for b in BLOCKED_DOMAINS)
     except Exception:
         return False
 
+
 def clamp_text(text: str) -> str:
-    if not text or len(text) <= MAX_TEXT_LENGTH:
-        return text
-    return text[:MAX_TEXT_LENGTH]
+    if not text:
+        return ""
+    if len(text) > MAX_TEXT_LENGTH:
+        return text[:MAX_TEXT_LENGTH]
+    return text
+
 
 def get_realistic_headers() -> Dict[str, str]:
     return {
@@ -94,9 +76,9 @@ def get_realistic_headers() -> Dict[str, str]:
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
         "Accept-Language": "en-US,en;q=0.7,pt-BR;q=0.6",
         "Connection": "keep-alive",
-        "Accept-Encoding": "gzip, deflate, br",
     }
 
+
 def delete_temp_file(file_id: str, file_path: Path):
     """Removes the temporary file after it expires"""
     try:
@@ -107,16 +89,17 @@ def delete_temp_file(file_id: str, file_path: Path):
     except Exception as e:
         print(f"Erro ao remover arquivo temporário: {e}")
 
-async def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
-    """Creates an async temporary file and schedules its removal"""
+
+def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
+    """Creates a temporary file and schedules its removal"""
     file_id = str(uuid.uuid4())
     file_path = TEMP_DIR / f"fontes_{file_id}.txt"
 
-    # Save the JSON to the file asynchronously
-    async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
-        await f.write(ujson.dumps(data, ensure_ascii=False, indent=2))
+    # Save the JSON to the file
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
 
-    # Schedule removal in 24 hours
+    # Schedule removal in 24 hours (86400 seconds)
     timer = Timer(86400, delete_temp_file, args=[file_id, file_path])
     timer.start()
 
@@ -133,6 +116,7 @@ async def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
         "expires_in_hours": 24
     }
 
+
 async def generate_search_terms(context: str) -> List[str]:
     """Generates search terms using the Gemini model"""
     try:
@@ -179,7 +163,9 @@ Retorne apenas o JSON, sem mais nenhum texto."""
         ]
 
         generate_content_config = types.GenerateContentConfig(
-            thinking_config=types.ThinkingConfig(thinking_budget=0),
+            thinking_config=types.ThinkingConfig(
+                thinking_budget=0,
+            ),
         )
 
         # Collect the entire response from the stream
@@ -194,6 +180,7 @@ Retorne apenas o JSON, sem mais nenhum texto."""
 
         # Try to extract the JSON from the response
         try:
+            # Strip possible ```json and ``` fences from the response
            clean_response = full_response.strip()
            if clean_response.startswith("```json"):
                clean_response = clean_response[7:]
@@ -201,170 +188,92 @@ Retorne apenas o JSON, sem mais nenhum texto."""
                clean_response = clean_response[:-3]
            clean_response = clean_response.strip()
 
-            response_data = ujson.loads(clean_response)
+            # Parse the JSON
+            response_data = json.loads(clean_response)
             terms = response_data.get("terms", [])
 
+            # Basic validation
             if not isinstance(terms, list):
                 raise ValueError("Terms deve ser uma lista")
 
-            return terms[:20]
+            return terms[:20]  # Cap at a maximum of 20 terms
 
-        except (ujson.JSONDecodeError, ValueError) as e:
+        except (json.JSONDecodeError, ValueError) as e:
             print(f"Erro ao parsear resposta do Gemini: {e}")
+            print(f"Resposta recebida: {full_response}")
+            # Return an empty list on error
             return []
 
     except Exception as e:
         print(f"Erro ao gerar termos de pesquisa: {str(e)}")
         return []
 
-async def search_brave_batch(client: httpx.AsyncClient, terms: List[str]) -> List[Tuple[str, List[Dict[str, str]]]]:
-    """Searches multiple terms in parallel with optimizations"""
-    semaphore = asyncio.Semaphore(MAX_CONCURRENT_SEARCHES)
+
+async def search_brave_term(client: httpx.AsyncClient, term: str) -> List[Dict[str, str]]:
+    params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
 
-    async def search_single_term(term: str) -> Tuple[str, List[Dict[str, str]]]:
-        async with semaphore:
-            params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
-
-            try:
-                resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
-                if resp.status_code != 200:
-                    return (term, [])
+    try:
+        resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
+        if resp.status_code != 200:
+            return []
 
-                data = resp.json()
-                results = []
+        data = resp.json()
+        results: List[Dict[str, str]] = []
+
+        if "web" in data and "results" in data["web"]:
+            for item in data["web"]["results"]:
+                url = item.get("url")
+                age = item.get("age", "Unknown")
 
-                if "web" in data and "results" in data["web"]:
-                    for item in data["web"]["results"]:
-                        url = item.get("url")
-                        age = item.get("age", "Unknown")
-
-                        if url and not is_blocked_domain(url):
-                            results.append({"url": url, "age": age})
-
-                return (term, results)
-            except Exception as e:
-                print(f"Erro na busca do termo '{term}': {e}")
-                return (term, [])
-
-    # Run all searches in parallel
-    tasks = [search_single_term(term) for term in terms]
-    return await gather(*tasks, return_exceptions=False)
+                if url and not is_blocked_domain(url):
+                    results.append({"url": url, "age": age})
 
-def extract_with_trafilatura(html: str) -> str:
-    """CPU-intensive extraction run in the thread pool"""
-    try:
-        extracted = trafilatura.extract(html)
-        return extracted.strip() if extracted else ""
+        return results
     except Exception:
-        return ""
+        return []
+
 
-def extract_with_newspaper(url: str) -> str:
-    """newspaper-based extraction run in the thread pool"""
+async def extract_article_text(url: str, session: aiohttp.ClientSession) -> str:
     try:
         art = Article(url)
         art.config.browser_user_agent = random.choice(USER_AGENTS)
-        art.config.request_timeout = 6
+        art.config.request_timeout = 8
         art.config.number_threads = 1
+
         art.download()
         art.parse()
-        return (art.text or "").strip()
+        txt = (art.text or "").strip()
+        if txt and len(txt) > 100:
+            return clamp_text(txt)
     except Exception:
-        return ""
+        pass
 
-async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
-    """Optimized article extraction with a robust fallback"""
-
-    # Method 1: try trafilatura first (faster)
     try:
+        await asyncio.sleep(random.uniform(0.1, 0.3))
+
         headers = get_realistic_headers()
-        async with session.get(url, headers=headers, timeout=EXTRACTION_TIMEOUT) as resp:
-            if resp.status == 200:
-                html = await resp.text()
+        async with session.get(url, headers=headers, timeout=12) as resp:
+            if resp.status != 200:
+                return ""
 
-                # Quick paywall check
-                if not re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)",
-                                 html[:2000], re.I):
-
-                    # trafilatura extraction in the thread pool
-                    try:
-                        trafilatura_result = await asyncio.get_event_loop().run_in_executor(
-                            thread_pool, extract_with_trafilatura, html
-                        )
-
-                        if trafilatura_result and len(trafilatura_result.strip()) > 100:
-                            return clamp_text(trafilatura_result.strip())
-                    except Exception as e:
-                        print(f"Erro trafilatura para {url}: {e}")
-
-    except Exception as e:
-        print(f"Erro HTTP para {url}: {e}")
-
-    # Method 2: fall back to newspaper
-    try:
-        newspaper_result = await asyncio.get_event_loop().run_in_executor(
-            thread_pool, extract_with_newspaper, url
-        )
-
-        if newspaper_result and len(newspaper_result.strip()) > 100:
-            return clamp_text(newspaper_result.strip())
+            html = await resp.text()
 
-    except Exception as e:
-        print(f"Erro newspaper para {url}: {e}")
-
+            if re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)", html, re.I):
+                return ""
+
+            extracted = trafilatura.extract(html) or ""
+            extracted = extracted.strip()
+            if extracted and len(extracted) > 100:
+                return clamp_text(extracted)
+
+    except Exception:
+        pass
+
     return ""
 
-async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
-    """Processes URLs in optimized batches with detailed logging"""
-    semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
-    results = []
-    used_urls: Set[str] = set()
-    success_count = 0
-
-    async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
-        nonlocal success_count
-        async with semaphore:
-            if url in used_urls:
-                return None
-
-            try:
-                text = await extract_article_text_optimized(url, session)
-                if text:
-                    used_urls.add(url)
-                    success_count += 1
-                    print(f"✓ Extraído: {url[:60]}... ({len(text)} chars)")
-                    return {
-                        "term": term,
-                        "age": age,
-                        "url": url,
-                        "text": text
-                    }
-                else:
-                    print(f"✗ Falhou: {url[:60]}... (sem conteúdo)")
-            except Exception as e:
-                print(f"✗ Erro: {url[:60]}... - {str(e)[:50]}")
-
-            return None
-
-    # Create all tasks at once
-    tasks = []
-    for term, url, age in urls_data:
-        tasks.append(process_single_url(term, url, age))
-
-    print(f"Processando {len(tasks)} URLs com semáforo de {MAX_CONCURRENT_EXTRACTIONS}...")
-
-    # Process everything in parallel
-    processed_results = await gather(*tasks, return_exceptions=True)
-
-    # Filter valid results
-    valid_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
-
-    print(f"Sucesso: {success_count}/{len(urls_data)} URLs extraídas")
-    return valid_results
 
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
-    start_time = time.time()
-
     context = payload.get("context")
     if not context or not isinstance(context, str):
         raise HTTPException(status_code=400, detail="Campo 'context' é obrigatório e deve ser uma string.")
@@ -372,73 +281,78 @@ async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
     if len(context.strip()) == 0:
         raise HTTPException(status_code=400, detail="Campo 'context' não pode estar vazio.")
 
-    print(f"Iniciando geração de termos...")
     # Generate the search terms using Gemini
     terms = await generate_search_terms(context)
 
     if not terms:
         raise HTTPException(status_code=500, detail="Não foi possível gerar termos de pesquisa válidos.")
 
-    print(f"Termos gerados em {time.time() - start_time:.2f}s. Iniciando buscas...")
+    used_urls = set()
+    search_semaphore = asyncio.Semaphore(20)
+    extract_semaphore = asyncio.Semaphore(50)
 
-    # Optimized connection settings
-    connector = aiohttp.TCPConnector(
-        limit=200,  # Increased
-        limit_per_host=30,  # Increased
-        ttl_dns_cache=300,
-        use_dns_cache=True,
-        enable_cleanup_closed=True
-    )
-    timeout = aiohttp.ClientTimeout(total=HTTP_TIMEOUT, connect=5)
+    async def search_with_limit(client, term):
+        async with search_semaphore:
+            return await search_brave_term(client, term)
 
-    # Optimized HTTP client
-    http_client = httpx.AsyncClient(
-        timeout=HTTP_TIMEOUT,
-        limits=httpx.Limits(
-            max_connections=200,  # Increased
-            max_keepalive_connections=50  # Increased
-        ),
-        http2=True  # Enable HTTP/2 for better performance
-    )
+    async def process_term(session, term, search_results):
+        async with extract_semaphore:
+            for result in search_results:
+                url = result["url"]
+                age = result["age"]
+
+                if url in used_urls:
+                    continue
+
+                text = await extract_article_text(url, session)
+                if text:
+                    used_urls.add(url)
+                    return {
+                        "term": term,
+                        "age": age,
+                        "url": url,
+                        "text": text
+                    }
+            return None
+
+    connector = aiohttp.TCPConnector(limit=100, limit_per_host=15)
+    timeout = aiohttp.ClientTimeout(total=15)
 
-    try:
+    async with httpx.AsyncClient(
+        timeout=15.0,
+        limits=httpx.Limits(max_connections=100, max_keepalive_connections=25)
+    ) as http_client:
         async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
-            # Phase 1: search all terms in parallel
-            search_results = await search_brave_batch(http_client, terms)
-            print(f"Buscas concluídas em {time.time() - start_time:.2f}s. Iniciando extrações...")
-
-            # Phase 2: prepare data for batch extraction
-            urls_data = []
-            for term, results in search_results:
-                for result in results:
-                    urls_data.append((term, result["url"], result["age"]))
 
-            print(f"Processando {len(urls_data)} URLs...")
+            search_tasks = [search_with_limit(http_client, term) for term in terms]
+            search_results = await asyncio.gather(*search_tasks, return_exceptions=True)
 
-            # Phase 3: process all URLs in parallel
-            final_results = await process_urls_batch(session, urls_data)
+            process_tasks = []
+            for term, results in zip(terms, search_results):
+                if isinstance(results, list) and results:
+                    process_tasks.append(process_term(session, term, results))
 
-            print(f"Extração concluída em {time.time() - start_time:.2f}s. Salvando arquivo...")
-
-    finally:
-        await http_client.aclose()
+            if process_tasks:
+                processed_results = await asyncio.gather(*process_tasks, return_exceptions=True)
+                final_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
+            else:
+                final_results = []
 
-    # Phase 4: create the async temporary file
+    # Build the final JSON
     result_data = {"results": final_results}
-    temp_file_info = await create_temp_file(result_data)
 
-    total_time = time.time() - start_time
-    print(f"Processo completo em {total_time:.2f}s")
+    # Create the temporary file
+    temp_file_info = create_temp_file(result_data)
 
     return {
         "message": "Dados salvos em arquivo temporário",
         "total_results": len(final_results),
         "context": context,
        "generated_terms": terms,
-        "file_info": temp_file_info,
-        "processing_time": f"{total_time:.2f}s"
+        "file_info": temp_file_info
     }
 
+
 @router.get("/download-temp/{file_id}")
 async def download_temp_file(file_id: str):
     """Endpoint to download the temporary file"""