habulaj committed
Commit 8bc855b · verified · 1 Parent(s): 8081f24

Update routers/searchterm.py

Files changed (1)
  1. routers/searchterm.py +62 -55
routers/searchterm.py CHANGED
@@ -272,75 +272,77 @@ def extract_with_newspaper(url: str) -> str:
     return ""

 async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
-    """Optimized article extraction with parallelized methods"""
-
-    # Attempt 1: newspaper in a thread pool (in parallel with the HTTP download)
-    newspaper_task = asyncio.create_task(
-        asyncio.get_event_loop().run_in_executor(thread_pool, extract_with_newspaper, url)
-    )
-
-    # Attempt 2: HTTP download and trafilatura
     try:
         headers = get_realistic_headers()
         async with session.get(url, headers=headers, timeout=EXTRACTION_TIMEOUT) as resp:
-            if resp.status != 200:
-                # Wait for newspaper if the HTTP request failed
-                newspaper_result = await newspaper_task
-                return clamp_text(newspaper_result) if newspaper_result and len(newspaper_result) > 100 else ""

-            html = await resp.text()
-
-            # Quick paywall check
-            if re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)",
-                         html[:2000], re.I):  # Check only the beginning
-                newspaper_result = await newspaper_task
-                return clamp_text(newspaper_result) if newspaper_result and len(newspaper_result) > 100 else ""
-
-            # Extraction with trafilatura in a thread pool
-            trafilatura_task = asyncio.create_task(
-                asyncio.get_event_loop().run_in_executor(thread_pool, extract_with_trafilatura, html)
-            )
-
-            # Wait for both methods and take the best result
-            newspaper_result, trafilatura_result = await gather(newspaper_task, trafilatura_task)

-            # Pick the best result
-            best_result = ""
-            if trafilatura_result and len(trafilatura_result) > 100:
-                best_result = trafilatura_result
-            elif newspaper_result and len(newspaper_result) > 100:
-                best_result = newspaper_result
-
-            return clamp_text(best_result) if best_result else ""
-
-    except Exception:
-        # If everything fails, at least try newspaper
-        try:
-            newspaper_result = await newspaper_task
-            return clamp_text(newspaper_result) if newspaper_result and len(newspaper_result) > 100 else ""
-        except Exception:
-            return ""

 async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
-    """Process URLs in optimized batches"""
     semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
     results = []
     used_urls: Set[str] = set()

     async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
         async with semaphore:
             if url in used_urls:
                 return None
-
-            text = await extract_article_text_optimized(url, session)
-            if text:
-                used_urls.add(url)
-                return {
-                    "term": term,
-                    "age": age,
-                    "url": url,
-                    "text": text
-                }
         return None

     # Create all tasks at once
@@ -348,11 +350,16 @@ async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tup
     for term, url, age in urls_data:
         tasks.append(process_single_url(term, url, age))

     # Process everything in parallel
     processed_results = await gather(*tasks, return_exceptions=True)

     # Filter valid results
-    return [r for r in processed_results if r is not None and not isinstance(r, Exception)]

 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
@@ -391,7 +398,7 @@ async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
             max_connections=200,  # Increased
             max_keepalive_connections=50  # Increased
         ),
-        http2=True  # Enables HTTP/2
     )

     try:
 
     return ""

 async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
+    """Optimized article extraction with a robust fallback"""
+
+    # Method 1: try trafilatura first (faster)
     try:
         headers = get_realistic_headers()
         async with session.get(url, headers=headers, timeout=EXTRACTION_TIMEOUT) as resp:
+            if resp.status == 200:
+                html = await resp.text()
+
+                # Quick paywall check
+                if not re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)",
+                                 html[:2000], re.I):
+
+                    # Extraction with trafilatura in a thread pool
+                    try:
+                        trafilatura_result = await asyncio.get_event_loop().run_in_executor(
+                            thread_pool, extract_with_trafilatura, html
+                        )
+
+                        if trafilatura_result and len(trafilatura_result.strip()) > 100:
+                            return clamp_text(trafilatura_result.strip())
+                    except Exception as e:
+                        print(f"trafilatura error for {url}: {e}")
+
+    except Exception as e:
+        print(f"HTTP error for {url}: {e}")
+
+    # Method 2: fall back to newspaper
+    try:
+        newspaper_result = await asyncio.get_event_loop().run_in_executor(
+            thread_pool, extract_with_newspaper, url
+        )
+
+        if newspaper_result and len(newspaper_result.strip()) > 100:
+            return clamp_text(newspaper_result.strip())

+    except Exception as e:
+        print(f"newspaper error for {url}: {e}")
+
+    return ""

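The new control flow boils down to "run trafilatura on the fetched HTML first, fall back to newspaper only if that yields nothing usable". A minimal standalone sketch of that pattern follows; the helper bodies, pool size, and 10-second timeout are illustrative stand-ins for the module's own extract_with_trafilatura / extract_with_newspaper helpers and constants, not the values used in routers/searchterm.py.

import asyncio
from concurrent.futures import ThreadPoolExecutor

import aiohttp
import trafilatura
from newspaper import Article

thread_pool = ThreadPoolExecutor(max_workers=4)  # stand-in for the module-level pool

def extract_with_trafilatura(html: str) -> str:
    # trafilatura works on HTML that has already been downloaded
    return trafilatura.extract(html) or ""

def extract_with_newspaper(url: str) -> str:
    # newspaper downloads and parses the page itself
    article = Article(url)
    article.download()
    article.parse()
    return article.text or ""

async def extract_text(url: str, session: aiohttp.ClientSession) -> str:
    loop = asyncio.get_running_loop()
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as resp:
            if resp.status == 200:
                html = await resp.text()
                # Method 1: trafilatura on the HTML we already have, off the event loop
                text = await loop.run_in_executor(thread_pool, extract_with_trafilatura, html)
                if text and len(text.strip()) > 100:
                    return text.strip()
    except Exception:
        pass  # fall through to the slower method
    # Method 2: newspaper fallback, also run in the thread pool
    return await loop.run_in_executor(thread_pool, extract_with_newspaper, url)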
 async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
+    """Process URLs in optimized batches, with detailed logging"""
     semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
     results = []
     used_urls: Set[str] = set()
+    success_count = 0

     async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
+        nonlocal success_count
         async with semaphore:
             if url in used_urls:
                 return None
+
+            try:
+                text = await extract_article_text_optimized(url, session)
+                if text:
+                    used_urls.add(url)
+                    success_count += 1
+                    print(f"✓ Extracted: {url[:60]}... ({len(text)} chars)")
+                    return {
+                        "term": term,
+                        "age": age,
+                        "url": url,
+                        "text": text
+                    }
+                else:
+                    print(f"✗ Failed: {url[:60]}... (no content)")
+            except Exception as e:
+                print(f"✗ Error: {url[:60]}... - {str(e)[:50]}")
+
         return None

     # Create all tasks at once

     for term, url, age in urls_data:
         tasks.append(process_single_url(term, url, age))

+    print(f"Processing {len(tasks)} URLs with a semaphore limit of {MAX_CONCURRENT_EXTRACTIONS}...")
+
     # Process everything in parallel
     processed_results = await gather(*tasks, return_exceptions=True)

     # Filter valid results
+    valid_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
+
+    print(f"Success: {success_count}/{len(urls_data)} URLs extracted")
+    return valid_results

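The call shape of process_urls_batch is unchanged by this commit, so a hypothetical standalone driver still works the same way. In the sketch below, the import path and the sample (term, url, age) triples are assumptions for illustration only.

import asyncio
import aiohttp

from routers.searchterm import process_urls_batch  # assumed import path

async def main() -> None:
    # (term, url, age) triples, matching urls_data: List[Tuple[str, str, str]]
    urls_data = [
        ("python asyncio", "https://example.com/article-1", "2d"),
        ("fastapi routers", "https://example.com/article-2", "5h"),
    ]
    async with aiohttp.ClientSession() as session:
        results = await process_urls_batch(session, urls_data)
        print(f"{len(results)} of {len(urls_data)} URLs yielded text")

if __name__ == "__main__":
    asyncio.run(main())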
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:

             max_connections=200,  # Increased
             max_keepalive_connections=50  # Increased
         ),
+        http2=True  # Enables HTTP/2 for better performance
     )

     try:
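The constructor these keyword arguments belong to sits outside the hunk; since max_connections and max_keepalive_connections match httpx.Limits, the configured client plausibly looks like the sketch below (an assumption, not shown in the diff). Note that http2=True requires httpx's optional HTTP/2 extra, i.e. pip install "httpx[http2]", which pulls in the h2 package.

import httpx

# Plausible shape of the client this hunk configures (assumption: httpx.AsyncClient)
client = httpx.AsyncClient(
    limits=httpx.Limits(
        max_connections=200,           # Increased
        max_keepalive_connections=50   # Increased
    ),
    http2=True  # Enables HTTP/2; requires the optional h2 dependency
)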