habulaj committed
Commit 8bc855b · verified · 1 Parent(s): 8081f24

Update routers/searchterm.py

Files changed (1)
  1. routers/searchterm.py +62 -55
routers/searchterm.py CHANGED
@@ -272,75 +272,77 @@ def extract_with_newspaper(url: str) -> str:
     return ""

 async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
-    """Optimized article extraction with parallelized methods"""
-
-    # Attempt 1: newspaper in a thread pool (in parallel with the HTTP download)
-    newspaper_task = asyncio.create_task(
-        asyncio.get_event_loop().run_in_executor(thread_pool, extract_with_newspaper, url)
-    )
-
-    # Attempt 2: HTTP download and trafilatura
     try:
         headers = get_realistic_headers()
         async with session.get(url, headers=headers, timeout=EXTRACTION_TIMEOUT) as resp:
-            if resp.status != 200:
-                # Wait for newspaper if the HTTP request failed
-                newspaper_result = await newspaper_task
-                return clamp_text(newspaper_result) if newspaper_result and len(newspaper_result) > 100 else ""

-            html = await resp.text()
-
-            # Quick paywall check
-            if re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)",
-                         html[:2000], re.I):  # Check only the beginning
-                newspaper_result = await newspaper_task
-                return clamp_text(newspaper_result) if newspaper_result and len(newspaper_result) > 100 else ""
-
-            # Extraction with trafilatura in a thread pool
-            trafilatura_task = asyncio.create_task(
-                asyncio.get_event_loop().run_in_executor(thread_pool, extract_with_trafilatura, html)
-            )
-
-            # Wait for both methods and take the best result
-            newspaper_result, trafilatura_result = await gather(newspaper_task, trafilatura_task)

-            # Pick the best result
-            best_result = ""
-            if trafilatura_result and len(trafilatura_result) > 100:
-                best_result = trafilatura_result
-            elif newspaper_result and len(newspaper_result) > 100:
-                best_result = newspaper_result
-
-            return clamp_text(best_result) if best_result else ""
-
-    except Exception:
-        # If everything fails, at least try newspaper
-        try:
-            newspaper_result = await newspaper_task
-            return clamp_text(newspaper_result) if newspaper_result and len(newspaper_result) > 100 else ""
-        except Exception:
-            return ""

 async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
-    """Process URLs in optimized batches"""
     semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
     results = []
     used_urls: Set[str] = set()

     async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
         async with semaphore:
             if url in used_urls:
                 return None
-
-            text = await extract_article_text_optimized(url, session)
-            if text:
-                used_urls.add(url)
-                return {
-                    "term": term,
-                    "age": age,
-                    "url": url,
-                    "text": text
-                }
         return None

     # Create all tasks at once
@@ -348,11 +350,16 @@ async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tup
     for term, url, age in urls_data:
         tasks.append(process_single_url(term, url, age))

     # Process everything in parallel
     processed_results = await gather(*tasks, return_exceptions=True)

     # Filter valid results
-    return [r for r in processed_results if r is not None and not isinstance(r, Exception)]

 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
@@ -391,7 +398,7 @@ async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
             max_connections=200,  # Increased
             max_keepalive_connections=50  # Increased
         ),
-        http2=True  # Enables HTTP/2
     )

     try:
 
     return ""

 async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
+    """Optimized article extraction with a robust fallback"""
+
+    # Method 1: try trafilatura first (faster)
     try:
         headers = get_realistic_headers()
         async with session.get(url, headers=headers, timeout=EXTRACTION_TIMEOUT) as resp:
+            if resp.status == 200:
+                html = await resp.text()
+
+                # Quick paywall check
+                if not re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)",
+                                 html[:2000], re.I):
+
+                    # Extraction with trafilatura in a thread pool
+                    try:
+                        trafilatura_result = await asyncio.get_event_loop().run_in_executor(
+                            thread_pool, extract_with_trafilatura, html
+                        )
+
+                        if trafilatura_result and len(trafilatura_result.strip()) > 100:
+                            return clamp_text(trafilatura_result.strip())
+                    except Exception as e:
+                        print(f"trafilatura error for {url}: {e}")
+
+    except Exception as e:
+        print(f"HTTP error for {url}: {e}")
+
+    # Method 2: fall back to newspaper
+    try:
+        newspaper_result = await asyncio.get_event_loop().run_in_executor(
+            thread_pool, extract_with_newspaper, url
+        )
+
+        if newspaper_result and len(newspaper_result.strip()) > 100:
+            return clamp_text(newspaper_result.strip())

+    except Exception as e:
+        print(f"newspaper error for {url}: {e}")
+
+    return ""

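The new control flow boils down to "run trafilatura on the fetched HTML first, fall back to newspaper only if that yields nothing usable". A minimal standalone sketch of that pattern follows; the helper bodies, pool size, and 10-second timeout are illustrative stand-ins for the module's own extract_with_trafilatura / extract_with_newspaper helpers and constants, not the values used in routers/searchterm.py.

import asyncio
from concurrent.futures import ThreadPoolExecutor

import aiohttp
import trafilatura
from newspaper import Article

thread_pool = ThreadPoolExecutor(max_workers=4)  # stand-in for the module-level pool

def extract_with_trafilatura(html: str) -> str:
    # trafilatura works on HTML that has already been downloaded
    return trafilatura.extract(html) or ""

def extract_with_newspaper(url: str) -> str:
    # newspaper downloads and parses the page itself
    article = Article(url)
    article.download()
    article.parse()
    return article.text or ""

async def extract_text(url: str, session: aiohttp.ClientSession) -> str:
    loop = asyncio.get_running_loop()
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            if resp.status == 200:
                html = await resp.text()
                # Method 1: trafilatura on the HTML we already have, off the event loop
                text = await loop.run_in_executor(thread_pool, extract_with_trafilatura, html)
                if text and len(text.strip()) > 100:
                    return text.strip()
    except Exception:
        pass  # fall through to the slower method
    # Method 2: newspaper fallback, also run in the thread pool
    return await loop.run_in_executor(thread_pool, extract_with_newspaper, url)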
 async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
+    """Process URLs in optimized batches, with detailed logging"""
     semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
     results = []
     used_urls: Set[str] = set()
+    success_count = 0

     async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
+        nonlocal success_count
         async with semaphore:
             if url in used_urls:
                 return None
+
+            try:
+                text = await extract_article_text_optimized(url, session)
+                if text:
+                    used_urls.add(url)
+                    success_count += 1
+                    print(f"✓ Extracted: {url[:60]}... ({len(text)} chars)")
+                    return {
+                        "term": term,
+                        "age": age,
+                        "url": url,
+                        "text": text
+                    }
+                else:
+                    print(f"✗ Failed: {url[:60]}... (no content)")
+            except Exception as e:
+                print(f"✗ Error: {url[:60]}... - {str(e)[:50]}")
+
         return None

     # Create all tasks at once

     for term, url, age in urls_data:
         tasks.append(process_single_url(term, url, age))

+    print(f"Processing {len(tasks)} URLs with a semaphore limit of {MAX_CONCURRENT_EXTRACTIONS}...")
+
     # Process everything in parallel
     processed_results = await gather(*tasks, return_exceptions=True)

     # Filter valid results
+    valid_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
+
+    print(f"Success: {success_count}/{len(urls_data)} URLs extracted")
+    return valid_results

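The call shape of process_urls_batch is unchanged by this commit, so a hypothetical standalone driver still works the same way. In the sketch below, the import path and the sample (term, url, age) triples are assumptions for illustration only.

import asyncio
import aiohttp

from routers.searchterm import process_urls_batch  # assumed import path

async def main() -> None:
    # (term, url, age) triples, matching urls_data: List[Tuple[str, str, str]]
    urls_data = [
        ("python asyncio", "https://example.com/article-1", "2d"),
        ("fastapi routers", "https://example.com/article-2", "5h"),
    ]
    async with aiohttp.ClientSession() as session:
        results = await process_urls_batch(session, urls_data)
        print(f"{len(results)} of {len(urls_data)} URLs yielded text")

if __name__ == "__main__":
    asyncio.run(main())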
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:

             max_connections=200,  # Increased
             max_keepalive_connections=50  # Increased
         ),
+        http2=True  # Enables HTTP/2 for better performance
     )

     try:
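The constructor these keyword arguments belong to sits outside the hunk; since max_connections and max_keepalive_connections match httpx.Limits, the configured client plausibly looks like the sketch below (an assumption, not shown in the diff). Note that http2=True requires httpx's optional HTTP/2 extra, i.e. pip install "httpx[http2]", which pulls in the h2 package.

import httpx

# Plausible shape of the client this hunk configures (assumption: httpx.AsyncClient)
client = httpx.AsyncClient(
    limits=httpx.Limits(
        max_connections=200,           # Increased
        max_keepalive_connections=50   # Increased
    ),
    http2=True  # Enables HTTP/2; requires the optional h2 dependency
)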