Spaces:

habulaj
/

newapi-clone

Sleeping

App Files Files Community

habulaj committed 4 days ago

Commit

532f1bb

verified ·

1 Parent(s): 8bc855b

Update routers/searchterm.py

Browse files

Files changed (1) hide show

routers/searchterm.py +290 -195

routers/searchterm.py CHANGED Viewed

@@ -17,10 +17,8 @@ from newspaper import Article
 from threading import Timer
 from google import genai
 from google.genai import types
-from asyncio import Queue, create_task, gather
-from concurrent.futures import ThreadPoolExecutor
-import aiofiles
-import ujson  # JSON mais rápido
 router = APIRouter()
@@ -45,16 +43,12 @@ USER_AGENTS = [
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
 ]
-BLOCKED_DOMAINS = frozenset({  # frozenset é mais rápido para lookup
-    "reddit.com", "www.reddit.com", "old.reddit.com",
-    "quora.com", "www.quora.com"
-})
 MAX_TEXT_LENGTH = 4000
-MAX_CONCURRENT_SEARCHES = 30  # Aumentado
-MAX_CONCURRENT_EXTRACTIONS = 80  # Aumentado significativamente
-EXTRACTION_TIMEOUT = 8  # Reduzido
-HTTP_TIMEOUT = 10  # Reduzido
 # Diretório para arquivos temporários
 TEMP_DIR = Path("/tmp")
@@ -63,30 +57,25 @@ TEMP_DIR.mkdir(exist_ok=True)
 # Dicionário para controlar arquivos temporários
 temp_files = {}
-# Pool de threads para operações CPU-intensive
-thread_pool = ThreadPoolExecutor(max_workers=20)
-# Cache de domínios bloqueados para evitar verificações repetidas
-domain_cache = {}
 def is_blocked_domain(url: str) -> bool:
     try:
         host = urlparse(url).netloc.lower()
-        # Cache lookup
-        if host in domain_cache:
-            return domain_cache[host]
-        is_blocked = any(host == b or host.endswith("." + b) for b in BLOCKED_DOMAINS)
-        domain_cache[host] = is_blocked
-        return is_blocked
     except Exception:
         return False
 def clamp_text(text: str) -> str:
-    if not text or len(text) <= MAX_TEXT_LENGTH:
-        return text
-    return text[:MAX_TEXT_LENGTH]
 def get_realistic_headers() -> Dict[str, str]:
     return {
@@ -95,8 +84,11 @@ def get_realistic_headers() -> Dict[str, str]:
         "Accept-Language": "en-US,en;q=0.7,pt-BR;q=0.6",
         "Connection": "keep-alive",
         "Accept-Encoding": "gzip, deflate, br",
     }
 def delete_temp_file(file_id: str, file_path: Path):
     """Remove arquivo temporário após expiração"""
     try:
@@ -107,16 +99,17 @@ def delete_temp_file(file_id: str, file_path: Path):
     except Exception as e:
         print(f"Erro ao remover arquivo temporário: {e}")
-async def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
-    """Cria arquivo temporário assíncrono e agenda sua remoção"""
     file_id = str(uuid.uuid4())
     file_path = TEMP_DIR / f"fontes_{file_id}.txt"
-    # Salva o JSON no arquivo de forma assíncrona
-    async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
-        await f.write(ujson.dumps(data, ensure_ascii=False, indent=2))
-    # Agenda remoção em 24 horas
     timer = Timer(86400, delete_temp_file, args=[file_id, file_path])
     timer.start()
@@ -133,6 +126,22 @@ async def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
         "expires_in_hours": 24
     }
 async def generate_search_terms(context: str) -> List[str]:
     """Gera termos de pesquisa usando o modelo Gemini"""
     try:
@@ -179,7 +188,9 @@ Retorne apenas o JSON, sem mais nenhum texto."""
         ]
         generate_content_config = types.GenerateContentConfig(
-            thinking_config=types.ThinkingConfig(thinking_budget=0),
         )
         # Coletamos toda a resposta em stream
@@ -194,6 +205,7 @@ Retorne apenas o JSON, sem mais nenhum texto."""
         # Tenta extrair o JSON da resposta
         try:
             clean_response = full_response.strip()
             if clean_response.startswith("```json"):
                 clean_response = clean_response[7:]
@@ -201,170 +213,234 @@ Retorne apenas o JSON, sem mais nenhum texto."""
                 clean_response = clean_response[:-3]
             clean_response = clean_response.strip()
-            response_data = ujson.loads(clean_response)
             terms = response_data.get("terms", [])
             if not isinstance(terms, list):
                 raise ValueError("Terms deve ser uma lista")
-            return terms[:20]
-        except (ujson.JSONDecodeError, ValueError) as e:
             print(f"Erro ao parsear resposta do Gemini: {e}")
             return []
     except Exception as e:
         print(f"Erro ao gerar termos de pesquisa: {str(e)}")
         return []
 async def search_brave_batch(client: httpx.AsyncClient, terms: List[str]) -> List[Tuple[str, List[Dict[str, str]]]]:
-    """Busca múltiplos termos em paralelo com otimizações"""
-    semaphore = asyncio.Semaphore(MAX_CONCURRENT_SEARCHES)
     async def search_single_term(term: str) -> Tuple[str, List[Dict[str, str]]]:
-        async with semaphore:
-            params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
-            try:
-                resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
-                if resp.status_code != 200:
-                    return (term, [])
-                data = resp.json()
-                results = []
-                if "web" in data and "results" in data["web"]:
-                    for item in data["web"]["results"]:
-                        url = item.get("url")
-                        age = item.get("age", "Unknown")
-                        if url and not is_blocked_domain(url):
-                            results.append({"url": url, "age": age})
-                return (term, results)
-            except Exception as e:
-                print(f"Erro na busca do termo '{term}': {e}")
-                return (term, [])
-    # Executa todas as buscas em paralelo
-    tasks = [search_single_term(term) for term in terms]
-    return await gather(*tasks, return_exceptions=False)
-def extract_with_trafilatura(html: str) -> str:
-    """Extração CPU-intensive executada em thread pool"""
-    try:
-        extracted = trafilatura.extract(html)
-        return extracted.strip() if extracted else ""
-    except Exception:
-        return ""
-def extract_with_newspaper(url: str) -> str:
-    """Extração com newspaper executada em thread pool"""
-    try:
-        art = Article(url)
-        art.config.browser_user_agent = random.choice(USER_AGENTS)
-        art.config.request_timeout = 6
-        art.config.number_threads = 1
-        art.download()
-        art.parse()
-        return (art.text or "").strip()
-    except Exception:
-        return ""
-async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
-    """Extração de artigo otimizada com fallback robusto"""
-    # Método 1: Tentar com trafilatura primeiro (mais rápido)
     try:
         headers = get_realistic_headers()
-        async with session.get(url, headers=headers, timeout=EXTRACTION_TIMEOUT) as resp:
-            if resp.status == 200:
                 html = await resp.text()
-                # Verifica paywall rapidamente
-                if not re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)",
-                            html[:2000], re.I):
-                    # Extração com trafilatura em thread pool
                     try:
-                        trafilatura_result = await asyncio.get_event_loop().run_in_executor(
-                            thread_pool, extract_with_trafilatura, html
-                        )
-                        if trafilatura_result and len(trafilatura_result.strip()) > 100:
-                            return clamp_text(trafilatura_result.strip())
-                    except Exception as e:
-                        print(f"Erro trafilatura para {url}: {e}")
-    except Exception as e:
-        print(f"Erro HTTP para {url}: {e}")
-    # Método 2: Fallback para newspaper
-    try:
-        newspaper_result = await asyncio.get_event_loop().run_in_executor(
-            thread_pool, extract_with_newspaper, url
-        )
-        if newspaper_result and len(newspaper_result.strip()) > 100:
-            return clamp_text(newspaper_result.strip())
-    except Exception as e:
-        print(f"Erro newspaper para {url}: {e}")
-    return ""
-async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
-    """Processa URLs em lotes otimizados com logging detalhado"""
-    semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
-    results = []
-    used_urls: Set[str] = set()
-    success_count = 0
-    async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
-        nonlocal success_count
-        async with semaphore:
-            if url in used_urls:
-                return None
             try:
-                text = await extract_article_text_optimized(url, session)
-                if text:
-                    used_urls.add(url)
-                    success_count += 1
-                    print(f"✓ Extraído: {url[:60]}... ({len(text)} chars)")
                     return {
                         "term": term,
                         "age": age,
                         "url": url,
-                        "text": text
                     }
-                else:
-                    print(f"✗ Falhou: {url[:60]}... (sem conteúdo)")
-            except Exception as e:
-                print(f"✗ Erro: {url[:60]}... - {str(e)[:50]}")
-            return None
-    # Cria todas as tasks de uma vez
-    tasks = []
-    for term, url, age in urls_data:
-        tasks.append(process_single_url(term, url, age))
-    print(f"Processando {len(tasks)} URLs com semáforo de {MAX_CONCURRENT_EXTRACTIONS}...")
-    # Processa tudo em paralelo
-    processed_results = await gather(*tasks, return_exceptions=True)
-    # Filtra resultados válidos
-    valid_results = [r for r in processed_results if r is not None and not isinstance(r, Exception)]
-    print(f"Sucesso: {success_count}/{len(urls_data)} URLs extraídas")
-    return valid_results
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
-    start_time = time.time()
     context = payload.get("context")
     if not context or not isinstance(context, str):
         raise HTTPException(status_code=400, detail="Campo 'context' é obrigatório e deve ser uma string.")
@@ -372,73 +448,92 @@ async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
     if len(context.strip()) == 0:
         raise HTTPException(status_code=400, detail="Campo 'context' não pode estar vazio.")
-    print(f"Iniciando geração de termos...")
     # Gera os termos de pesquisa usando o Gemini
     terms = await generate_search_terms(context)
     if not terms:
         raise HTTPException(status_code=500, detail="Não foi possível gerar termos de pesquisa válidos.")
-    print(f"Termos gerados em {time.time() - start_time:.2f}s. Iniciando buscas...")
-    # Configurações otimizadas para conexões
     connector = aiohttp.TCPConnector(
-        limit=200,  # Aumentado
-        limit_per_host=30,  # Aumentado
-        ttl_dns_cache=300,
-        use_dns_cache=True,
-        enable_cleanup_closed=True
     )
-    timeout = aiohttp.ClientTimeout(total=HTTP_TIMEOUT, connect=5)
-    # Cliente HTTP otimizado
-    http_client = httpx.AsyncClient(
-        timeout=HTTP_TIMEOUT,
-        limits=httpx.Limits(
-            max_connections=200,  # Aumentado
-            max_keepalive_connections=50  # Aumentado
-        ),
-        http2=True  # Ativa HTTP/2 para melhor performance
     )
-    try:
         async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
-            # Fase 1: Busca todos os termos em paralelo
             search_results = await search_brave_batch(http_client, terms)
-            print(f"Buscas concluídas em {time.time() - start_time:.2f}s. Iniciando extrações...")
-            # Fase 2: Prepara dados para extração em lote
-            urls_data = []
             for term, results in search_results:
                 for result in results:
-                    urls_data.append((term, result["url"], result["age"]))
-            print(f"Processando {len(urls_data)} URLs...")
-            # Fase 3: Processa todas as URLs em paralelo
-            final_results = await process_urls_batch(session, urls_data)
-            print(f"Extração concluída em {time.time() - start_time:.2f}s. Salvando arquivo...")
-    finally:
-        await http_client.aclose()
-    # Fase 4: Cria arquivo temporário assíncrono
     result_data = {"results": final_results}
-    temp_file_info = await create_temp_file(result_data)
-    total_time = time.time() - start_time
-    print(f"Processo completo em {total_time:.2f}s")
     return {
         "message": "Dados salvos em arquivo temporário",
         "total_results": len(final_results),
         "context": context,
         "generated_terms": terms,
-        "file_info": temp_file_info,
-        "processing_time": f"{total_time:.2f}s"
     }
 @router.get("/download-temp/{file_id}")
 async def download_temp_file(file_id: str):
     """Endpoint para download do arquivo temporário"""

 from threading import Timer
 from google import genai
 from google.genai import types
+import concurrent.futures
+from collections import deque
 router = APIRouter()
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
 ]
+BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
+                   "quora.com", "www.quora.com"}
 MAX_TEXT_LENGTH = 4000
+MAX_CONCURRENT_EXTRACTIONS = 100  # Aumentado drasticamente
+MAX_CONCURRENT_SEARCHES = 50      # Aumentado para pesquisas
 # Diretório para arquivos temporários
 TEMP_DIR = Path("/tmp")
 # Dicionário para controlar arquivos temporários
 temp_files = {}
+# Pool de threads para operações CPU-intensivas
+THREAD_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=20)
 def is_blocked_domain(url: str) -> bool:
+    """Return True when the URL's host is a blocked domain or a subdomain of one.
+
+    The host is compared against every entry of BLOCKED_DOMAINS, both exactly
+    and as a ".<domain>" suffix. Any parsing error yields False, so
+    unparseable URLs are never treated as blocked.
+    """
     try:
+        # .hostname (unlike .netloc) drops userinfo and port and is already
+        # lower-cased, so "https://reddit.com:443/..." is matched as well.
+        host = urlparse(url).hostname or ""
+        return any(host == b or host.endswith("." + b) for b in BLOCKED_DOMAINS)
     except Exception:
         return False
 def clamp_text(text: str) -> str:
+    """Return ``text`` limited to MAX_TEXT_LENGTH characters ("" for empty or None input)."""
+    # Slicing a string that is already short enough returns it unchanged
+    return text[:MAX_TEXT_LENGTH] if text else ""
 def get_realistic_headers() -> Dict[str, str]:
     return {
         "Accept-Language": "en-US,en;q=0.7,pt-BR;q=0.6",
         "Connection": "keep-alive",
         "Accept-Encoding": "gzip, deflate, br",
+        "Cache-Control": "no-cache",
+        "Pragma": "no-cache",
     }
 def delete_temp_file(file_id: str, file_path: Path):
     """Remove arquivo temporário após expiração"""
     try:
     except Exception as e:
         print(f"Erro ao remover arquivo temporário: {e}")
+def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
+    """Cria arquivo temporário e agenda sua remoção"""
     file_id = str(uuid.uuid4())
     file_path = TEMP_DIR / f"fontes_{file_id}.txt"
+    # Salva o JSON no arquivo
+    with open(file_path, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+    # Agenda remoção em 24 horas (86400 segundos)
     timer = Timer(86400, delete_temp_file, args=[file_id, file_path])
     timer.start()
         "expires_in_hours": 24
     }
+def extract_text_cpu_intensive(html_content: str) -> str:
+    """Return the clamped main text of an HTML page, or "" when it is unusable.
+
+    Intended to be run in a worker thread. The empty string is returned when
+    the HTML matches the paywall/captcha pattern, when trafilatura yields 100
+    characters or fewer, or when any step raises.
+    """
+    # NOTE(review): the pattern is searched across the entire document, so a
+    # word such as "subscribe" anywhere on the page rejects it.
+    blocked_pattern = r"(paywall|subscribe|metered|registration|captcha|access denied)"
+    try:
+        if re.search(blocked_pattern, html_content, re.I) is not None:
+            return ""
+        body = (trafilatura.extract(html_content) or "").strip()
+        return clamp_text(body) if len(body) > 100 else ""
+    except Exception:
+        return ""
 async def generate_search_terms(context: str) -> List[str]:
     """Gera termos de pesquisa usando o modelo Gemini"""
     try:
         ]
         generate_content_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(
+                thinking_budget=0,
+            ),
         )
         # Coletamos toda a resposta em stream
         # Tenta extrair o JSON da resposta
         try:
+            # Remove possíveis ```json e ``` da resposta
             clean_response = full_response.strip()
             if clean_response.startswith("```json"):
                 clean_response = clean_response[7:]
                 clean_response = clean_response[:-3]
             clean_response = clean_response.strip()
+            # Parse do JSON
+            response_data = json.loads(clean_response)
             terms = response_data.get("terms", [])
+            # Validação básica
             if not isinstance(terms, list):
                 raise ValueError("Terms deve ser uma lista")
+            return terms[:20]  # Garante máximo de 20 termos
+        except (json.JSONDecodeError, ValueError) as e:
             print(f"Erro ao parsear resposta do Gemini: {e}")
+            print(f"Resposta recebida: {full_response}")
+            # Retorna uma lista vazia em caso de erro
             return []
     except Exception as e:
         print(f"Erro ao gerar termos de pesquisa: {str(e)}")
         return []
 async def search_brave_batch(client: httpx.AsyncClient, terms: List[str]) -> List[Tuple[str, List[Dict[str, str]]]]:
+    """Run one Brave search per term concurrently.
+
+    Returns a list of ``(term, hits)`` tuples, where each hit is a dict with
+    the keys "url", "age" and "term". A term whose request fails or answers
+    with a non-200 status contributes an empty hit list; blocked domains are
+    filtered out.
+    """
     async def search_single_term(term: str) -> Tuple[str, List[Dict[str, str]]]:
+        query = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
+        hits: List[Dict[str, str]] = []
+        try:
+            response = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=query)
+            if response.status_code != 200:
+                return term, []
+            payload = response.json()
+            if "web" in payload and "results" in payload["web"]:
+                for entry in payload["web"]["results"]:
+                    link = entry.get("url")
+                    published = entry.get("age", "Unknown")
+                    if not link or is_blocked_domain(link):
+                        continue
+                    hits.append({"url": link, "age": published, "term": term})
+        except Exception:
+            # Best effort: a failing term simply yields no hits
+            return term, []
+        return term, hits
+    # Fire every search at once; anything that is not a tuple is dropped
+    outcomes = await asyncio.gather(
+        *[search_single_term(term) for term in terms],
+        return_exceptions=True,
+    )
+    return [outcome for outcome in outcomes if isinstance(outcome, tuple)]
+async def extract_content_ultra_fast(session: aiohttp.ClientSession, url_data: Dict[str, str]) -> Optional[Dict[str, Any]]:
+    """Extract the article text for one search hit using two extractors.
+
+    Newspaper3k downloads and parses the URL in a worker thread while the
+    page is fetched through ``session``. When the fetch succeeds, the HTML is
+    handed to ``extract_text_cpu_intensive`` in a second worker thread and the
+    first extractor to produce non-empty text wins. When the fetch fails
+    (non-200 status, timeout, client error or undecodable body) only the
+    newspaper result is used.
+
+    Args:
+        session: aiohttp session used for the direct download.
+        url_data: mapping with the keys "url", "term" and "age".
+
+    Returns:
+        A dict with "term", "age", "url", "text" (clamped via clamp_text) and
+        "method", or None when no extractor produced usable text.
+    """
+    url = url_data["url"]
+    term = url_data["term"]
+    age = url_data["age"]
+    def build_result(text: str, method: str) -> Dict[str, Any]:
+        return {
+            "term": term,
+            "age": age,
+            "url": url,
+            "text": clamp_text(text),
+            "method": method,
+        }
+    def newspaper_extract() -> Optional[str]:
+        # Blocking download + parse; executed in THREAD_POOL
+        try:
+            art = Article(url)
+            art.config.browser_user_agent = random.choice(USER_AGENTS)
+            art.config.request_timeout = 5
+            art.config.number_threads = 1
+            art.download()
+            art.parse()
+            text = (art.text or "").strip()
+            return text if len(text) > 100 else None
+        except Exception:
+            return None
+    try:
+        loop = asyncio.get_running_loop()
+        # Start newspaper right away so it overlaps with the HTTP download below
+        newspaper_task = loop.run_in_executor(THREAD_POOL, newspaper_extract)
+        html = None
+        fallback_method = "newspaper"
+        fallback_timeout = None  # non-200 answer: wait for newspaper as long as it needs
+        try:
+            request_timeout = aiohttp.ClientTimeout(total=8)
+            async with session.get(url, headers=get_realistic_headers(), timeout=request_timeout) as resp:
+                if resp.status == 200:
+                    html = await resp.text()
+        except (asyncio.TimeoutError, aiohttp.ClientError, UnicodeDecodeError):
+            # The direct download failed; give newspaper a short grace period
+            fallback_method = "newspaper_fallback"
+            fallback_timeout = 5
+        if html is None:
+            try:
+                newspaper_result = await asyncio.wait_for(newspaper_task, timeout=fallback_timeout)
+            except asyncio.TimeoutError:
+                return None
+            return build_result(newspaper_result, fallback_method) if newspaper_result else None
+        trafilatura_task = loop.run_in_executor(THREAD_POOL, extract_text_cpu_intensive, html)
+        try:
+            # Take the first extractor that yields text. An empty result from
+            # the faster one must not discard the slower one, so keep waiting
+            # instead of cancelling the remaining future.
+            for finished in asyncio.as_completed([newspaper_task, trafilatura_task], timeout=10):
+                text = await finished
+                if text:
+                    return build_result(text, "hybrid")
+        except asyncio.TimeoutError:
+            pass
+        return None
+    except Exception:
+        # Best effort: a single failing URL must never break the whole batch
+        return None
+async def process_urls_ultra_parallel(session: aiohttp.ClientSession, all_urls: List[Dict[str, str]], used_urls: Set[str]) -> List[Dict[str, Any]]:
+    """Extract article content from every not-yet-seen URL concurrently.
+
+    URLs already present in ``used_urls`` (or repeated inside ``all_urls``)
+    are skipped, and every URL scheduled here is added to ``used_urls``.
+    Extractions run under a semaphore of MAX_CONCURRENT_EXTRACTIONS with a
+    global budget of 30 seconds; extractions still running when the budget
+    expires are cancelled and the results finished so far are returned.
+
+    Args:
+        session: aiohttp session shared by all extractions.
+        all_urls: dicts carrying at least the key "url".
+        used_urls: set of URLs already processed; mutated in place.
+
+    Returns:
+        The dict results of the successful extractions, in submission order.
+    """
+    # Deduplicate up front; adding to used_urls immediately also covers
+    # repeats within this same batch.
+    unique_urls = []
+    for url_data in all_urls:
+        url = url_data["url"]
+        if url not in used_urls:
+            unique_urls.append(url_data)
+            used_urls.add(url)
+    if not unique_urls:
+        return []
+    print(f"Processando {len(unique_urls)} URLs únicas em paralelo...")
+    semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
+    async def extract_with_semaphore(url_data):
+        async with semaphore:
+            return await extract_content_ultra_fast(session, url_data)
+    # Real Task objects are required: bare coroutines have no done()/result(),
+    # and asyncio.wait (unlike wait_for + gather) leaves finished tasks intact
+    # when the timeout expires, so partial results stay retrievable.
+    tasks = [asyncio.create_task(extract_with_semaphore(url_data)) for url_data in unique_urls]
+    done, pending = await asyncio.wait(tasks, timeout=30)
+    if pending:
+        print("Timeout global atingido, retornando resultados parciais...")
+        for task in pending:
+            task.cancel()
+        # Let the cancelled tasks unwind before the session is closed
+        await asyncio.gather(*pending, return_exceptions=True)
+    valid_results = []
+    for task in tasks:
+        if task not in done or task.cancelled() or task.exception() is not None:
+            continue
+        result = task.result()
+        if isinstance(result, dict):
+            valid_results.append(result)
+    print(f"Extraídos {len(valid_results)} artigos de {len(unique_urls)} URLs")
+    return valid_results
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
+    # Endpoint flow: validate the "context" field, generate search terms from
+    # it, run the searches, extract article text from the hits, store the
+    # results in a temporary file and return a summary with that file's info.
+    # Raises HTTP 400 for a missing/empty context and HTTP 500 when no terms
+    # could be generated.
     context = payload.get("context")
     if not context or not isinstance(context, str):
         raise HTTPException(status_code=400, detail="Campo 'context' é obrigatório e deve ser uma string.")
     if len(context.strip()) == 0:
         raise HTTPException(status_code=400, detail="Campo 'context' não pode estar vazio.")
+    start_time = time.time()
+    print(f"Iniciando busca para contexto: {context[:100]}...")
+    # Generate the search terms using Gemini
     terms = await generate_search_terms(context)
     if not terms:
         raise HTTPException(status_code=500, detail="Não foi possível gerar termos de pesquisa válidos.")
+    print(f"Gerados {len(terms)} termos em {time.time() - start_time:.2f}s")
+    used_urls: Set[str] = set()
+    # aiohttp connection settings used for the article downloads
     connector = aiohttp.TCPConnector(
+        limit=200,           # Total connection limit
+        limit_per_host=50,   # Connection limit per host
+        keepalive_timeout=30,
+        enable_cleanup_closed=True,
+        force_close=False,
+        ttl_dns_cache=300,   # Cache DNS lookups for 5 minutes
     )
+    timeout = aiohttp.ClientTimeout(
+        total=25,           # Total timeout
+        connect=8,          # Connection timeout
+        sock_read=8         # Socket read timeout
     )
+    # HTTPX settings used for the search requests
+    http_limits = httpx.Limits(
+        max_connections=MAX_CONCURRENT_SEARCHES,
+        max_keepalive_connections=40
+    )
+    async with httpx.AsyncClient(timeout=12.0, limits=http_limits) as http_client:
         async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+            # Phase 1: batch search (all searches in parallel)
+            print("Fase 1: Executando pesquisas em paralelo...")
+            search_start = time.time()
             search_results = await search_brave_batch(http_client, terms)
+            print(f"Pesquisas concluídas em {time.time() - search_start:.2f}s")
+            # Phase 2: collect and flatten all URLs
+            all_urls = []
             for term, results in search_results:
                 for result in results:
+                    all_urls.append({
+                        "url": result["url"],
+                        "age": result["age"],
+                        "term": term
+                    })
+            print(f"Total de URLs coletadas: {len(all_urls)}")
+            # Phase 3: parallel extraction
+            # NOTE(review): the log line below says "Fase 2" although this is the third phase
+            print("Fase 2: Extraindo conteúdo em máximo paralelismo...")
+            extraction_start = time.time()
+            final_results = await process_urls_ultra_parallel(session, all_urls, used_urls)
+            print(f"Extração concluída em {time.time() - extraction_start:.2f}s")
+    total_time = time.time() - start_time
+    print(f"Processo completo em {total_time:.2f}s - {len(final_results)} artigos extraídos")
+    # Build the final JSON payload
     result_data = {"results": final_results}
+    # Write the temporary file (synchronous call inside this async handler)
+    temp_file_info = create_temp_file(result_data)
     return {
         "message": "Dados salvos em arquivo temporário",
         "total_results": len(final_results),
         "context": context,
         "generated_terms": terms,
+        "processing_time_seconds": round(total_time, 2),
+        "urls_processed": len(all_urls),
+        "file_info": temp_file_info
     }
 @router.get("/download-temp/{file_id}")
 async def download_temp_file(file_id: str):
     """Endpoint para download do arquivo temporário"""