Spaces:

habulaj
/

newapi-clone

Running

App Files Files Community

habulaj committed 4 days ago

Commit

39c4a66

verified ·

1 Parent(s): 532f1bb

Update routers/searchterm.py

Browse files

Files changed (1) hide show

routers/searchterm.py +195 -290

routers/searchterm.py CHANGED Viewed

@@ -17,8 +17,10 @@ from newspaper import Article
 from threading import Timer
 from google import genai
 from google.genai import types
-import concurrent.futures
-from collections import deque
 router = APIRouter()
@@ -43,12 +45,16 @@ USER_AGENTS = [
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
 ]
-BLOCKED_DOMAINS = {"reddit.com", "www.reddit.com", "old.reddit.com",
-                   "quora.com", "www.quora.com"}
 MAX_TEXT_LENGTH = 4000
-MAX_CONCURRENT_EXTRACTIONS = 100  # Aumentado drasticamente
-MAX_CONCURRENT_SEARCHES = 50      # Aumentado para pesquisas
 # Diretório para arquivos temporários
 TEMP_DIR = Path("/tmp")
@@ -57,25 +63,30 @@ TEMP_DIR.mkdir(exist_ok=True)
 # Dicionário para controlar arquivos temporários
 temp_files = {}
-# Pool de threads para operações CPU-intensivas
-THREAD_POOL = concurrent.futures.ThreadPoolExecutor(max_workers=20)
 def is_blocked_domain(url: str) -> bool:
     try:
         host = urlparse(url).netloc.lower()
-        return any(host == b or host.endswith("." + b) for b in BLOCKED_DOMAINS)
     except Exception:
         return False
 def clamp_text(text: str) -> str:
-    if not text:
-        return ""
-    if len(text) > MAX_TEXT_LENGTH:
-        return text[:MAX_TEXT_LENGTH]
-    return text
 def get_realistic_headers() -> Dict[str, str]:
     return {
@@ -84,11 +95,8 @@ def get_realistic_headers() -> Dict[str, str]:
         "Accept-Language": "en-US,en;q=0.7,pt-BR;q=0.6",
         "Connection": "keep-alive",
         "Accept-Encoding": "gzip, deflate, br",
-        "Cache-Control": "no-cache",
-        "Pragma": "no-cache",
     }
 def delete_temp_file(file_id: str, file_path: Path):
     """Remove arquivo temporário após expiração"""
     try:
@@ -99,17 +107,16 @@ def delete_temp_file(file_id: str, file_path: Path):
     except Exception as e:
         print(f"Erro ao remover arquivo temporário: {e}")
-def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
-    """Cria arquivo temporário e agenda sua remoção"""
     file_id = str(uuid.uuid4())
     file_path = TEMP_DIR / f"fontes_{file_id}.txt"
-    # Salva o JSON no arquivo
-    with open(file_path, 'w', encoding='utf-8') as f:
-        json.dump(data, f, ensure_ascii=False, indent=2)
-    # Agenda remoção em 24 horas (86400 segundos)
     timer = Timer(86400, delete_temp_file, args=[file_id, file_path])
     timer.start()
@@ -126,22 +133,6 @@ def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
         "expires_in_hours": 24
     }
-def extract_text_cpu_intensive(html_content: str) -> str:
-    """Função CPU-intensiva para extrair texto (roda em thread separada)"""
-    try:
-        if re.search(r"(paywall|subscribe|metered|registration|captcha|access denied)", html_content, re.I):
-            return ""
-        extracted = trafilatura.extract(html_content) or ""
-        extracted = extracted.strip()
-        if extracted and len(extracted) > 100:
-            return clamp_text(extracted)
-    except Exception:
-        pass
-    return ""
 async def generate_search_terms(context: str) -> List[str]:
     """Gera termos de pesquisa usando o modelo Gemini"""
     try:
@@ -188,9 +179,7 @@ Retorne apenas o JSON, sem mais nenhum texto."""
         ]
         generate_content_config = types.GenerateContentConfig(
-            thinking_config=types.ThinkingConfig(
-                thinking_budget=0,
-            ),
         )
         # Coletamos toda a resposta em stream
@@ -205,7 +194,6 @@ Retorne apenas o JSON, sem mais nenhum texto."""
         # Tenta extrair o JSON da resposta
         try:
-            # Remove possíveis ```json e ``` da resposta
             clean_response = full_response.strip()
             if clean_response.startswith("```json"):
                 clean_response = clean_response[7:]
@@ -213,234 +201,170 @@ Retorne apenas o JSON, sem mais nenhum texto."""
                 clean_response = clean_response[:-3]
             clean_response = clean_response.strip()
-            # Parse do JSON
-            response_data = json.loads(clean_response)
             terms = response_data.get("terms", [])
-            # Validação básica
             if not isinstance(terms, list):
                 raise ValueError("Terms deve ser uma lista")
-            return terms[:20]  # Garante máximo de 20 termos
-        except (json.JSONDecodeError, ValueError) as e:
             print(f"Erro ao parsear resposta do Gemini: {e}")
-            print(f"Resposta recebida: {full_response}")
-            # Retorna uma lista vazia em caso de erro
             return []
     except Exception as e:
         print(f"Erro ao gerar termos de pesquisa: {str(e)}")
         return []
 async def search_brave_batch(client: httpx.AsyncClient, terms: List[str]) -> List[Tuple[str, List[Dict[str, str]]]]:
-    """Realiza múltiplas pesquisas em paralelo com batch otimizado"""
     async def search_single_term(term: str) -> Tuple[str, List[Dict[str, str]]]:
-        params = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
-        try:
-            resp = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=params)
-            if resp.status_code != 200:
-                return term, []
-            data = resp.json()
-            results = []
-            if "web" in data and "results" in data["web"]:
-                for item in data["web"]["results"]:
-                    url = item.get("url")
-                    age = item.get("age", "Unknown")
-                    if url and not is_blocked_domain(url):
-                        results.append({"url": url, "age": age, "term": term})
-            return term, results
-        except Exception:
-            return term, []
-    # Executa todas as pesquisas em paralelo
-    search_tasks = [search_single_term(term) for term in terms]
-    results = await asyncio.gather(*search_tasks, return_exceptions=True)
-    # Filtra apenas resultados válidos
-    valid_results = [r for r in results if isinstance(r, tuple)]
-    return valid_results
-async def extract_content_ultra_fast(session: aiohttp.ClientSession, url_data: Dict[str, str]) -> Optional[Dict[str, Any]]:
-    """Extração de conteúdo ultra-rápida com fallbacks otimizados"""
-    url = url_data["url"]
-    term = url_data["term"]
-    age = url_data["age"]
-    # Primeira tentativa: Newspaper3k (mais rápido para muitos sites)
     try:
-        loop = asyncio.get_event_loop()
-        # Executa newspaper em thread separada
-        def newspaper_extract():
-            try:
-                art = Article(url)
-                art.config.browser_user_agent = random.choice(USER_AGENTS)
-                art.config.request_timeout = 5  # Reduzido para 5s
-                art.config.number_threads = 1
-                art.download()
-                art.parse()
-                text = (art.text or "").strip()
-                return text if text and len(text) > 100 else None
-            except Exception:
-                return None
-        # Tenta newspaper em paralelo com download HTTP
-        newspaper_task = loop.run_in_executor(THREAD_POOL, newspaper_extract)
-        # Download HTTP em paralelo
         headers = get_realistic_headers()
-        try:
-            async with session.get(url, headers=headers, timeout=8) as resp:  # Timeout reduzido
-                if resp.status != 200:
-                    # Se HTTP falhar, espera newspaper
-                    newspaper_result = await newspaper_task
-                    if newspaper_result:
-                        return {
-                            "term": term,
-                            "age": age,
-                            "url": url,
-                            "text": clamp_text(newspaper_result),
-                            "method": "newspaper"
-                        }
-                    return None
                 html = await resp.text()
-                # Executa extração de texto em thread separada
-                text_extraction_task = loop.run_in_executor(
-                    THREAD_POOL,
-                    extract_text_cpu_intensive,
-                    html
-                )
-                # Aguarda tanto newspaper quanto trafilatura, pega o primeiro que terminar
-                done, pending = await asyncio.wait(
-                    [newspaper_task, text_extraction_task],
-                    return_when=asyncio.FIRST_COMPLETED,
-                    timeout=10
-                )
-                # Cancela tarefas pendentes
-                for task in pending:
-                    task.cancel()
-                # Processa resultados
-                results = []
-                for task in done:
                     try:
-                        result = await task
-                        if result:
-                            results.append(result)
-                    except Exception:
-                        continue
-                # Retorna o melhor resultado
-                if results:
-                    # Prioriza o texto mais longo
-                    best_text = max(results, key=len)
-                    return {
-                        "term": term,
-                        "age": age,
-                        "url": url,
-                        "text": clamp_text(best_text),
-                        "method": "hybrid"
-                    }
-        except asyncio.TimeoutError:
-            # Se HTTP der timeout, ainda tenta newspaper
             try:
-                newspaper_result = await asyncio.wait_for(newspaper_task, timeout=5)
-                if newspaper_result:
                     return {
                         "term": term,
                         "age": age,
                         "url": url,
-                        "text": clamp_text(newspaper_result),
-                        "method": "newspaper_fallback"
                     }
-            except asyncio.TimeoutError:
-                pass
-    except Exception:
-        pass
-    return None
-async def process_urls_ultra_parallel(session: aiohttp.ClientSession, all_urls: List[Dict[str, str]], used_urls: Set[str]) -> List[Dict[str, Any]]:
-    """Processa URLs com máximo paralelismo"""
-    # Remove URLs duplicadas imediatamente
-    unique_urls = []
-    local_used = set()
-    for url_data in all_urls:
-        url = url_data["url"]
-        if url not in used_urls and url not in local_used:
-            unique_urls.append(url_data)
-            local_used.add(url)
-            used_urls.add(url)  # Adiciona ao set global imediatamente
-    if not unique_urls:
-        return []
-    print(f"Processando {len(unique_urls)} URLs únicas em paralelo...")
-    # Cria semáforo com limite alto
-    semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
-    async def extract_with_semaphore(url_data):
-        async with semaphore:
-            return await extract_content_ultra_fast(session, url_data)
-    # Executa TODAS as extrações em paralelo
-    tasks = [extract_with_semaphore(url_data) for url_data in unique_urls]
-    # Aguarda todas as tarefas com timeout global
-    try:
-        results = await asyncio.wait_for(
-            asyncio.gather(*tasks, return_exceptions=True),
-            timeout=30  # 30 segundos para todas as extrações
-        )
-        # Filtra resultados válidos
-        valid_results = [
-            r for r in results
-            if r is not None and not isinstance(r, Exception) and isinstance(r, dict)
-        ]
-        print(f"Extraídos {len(valid_results)} artigos de {len(unique_urls)} URLs")
-        return valid_results
-    except asyncio.TimeoutError:
-        print("Timeout global atingido, retornando resultados parciais...")
-        # Em caso de timeout, pega os resultados que já terminaram
-        completed_tasks = [task for task in tasks if task.done()]
-        valid_results = []
-        for task in completed_tasks:
-            try:
-                result = task.result()
-                if result is not None and isinstance(result, dict):
-                    valid_results.append(result)
-            except Exception:
-                continue
-        return valid_results
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
     context = payload.get("context")
     if not context or not isinstance(context, str):
         raise HTTPException(status_code=400, detail="Campo 'context' é obrigatório e deve ser uma string.")
@@ -448,92 +372,73 @@ async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
     if len(context.strip()) == 0:
         raise HTTPException(status_code=400, detail="Campo 'context' não pode estar vazio.")
-    start_time = time.time()
-    print(f"Iniciando busca para contexto: {context[:100]}...")
     # Gera os termos de pesquisa usando o Gemini
     terms = await generate_search_terms(context)
     if not terms:
         raise HTTPException(status_code=500, detail="Não foi possível gerar termos de pesquisa válidos.")
-    print(f"Gerados {len(terms)} termos em {time.time() - start_time:.2f}s")
-    used_urls: Set[str] = set()
-    # Configurações otimizadas para máxima velocidade
     connector = aiohttp.TCPConnector(
-        limit=200,           # Dobrou o limite total
-        limit_per_host=50,   # Aumentou limite por host
-        keepalive_timeout=30,
-        enable_cleanup_closed=True,
-        force_close=False,
-        ttl_dns_cache=300,   # Cache DNS por 5 minutos
     )
-    timeout = aiohttp.ClientTimeout(
-        total=25,           # Timeout total reduzido
-        connect=8,          # Timeout de conexão reduzido
-        sock_read=8         # Timeout de leitura reduzido
-    )
-    # Configurações HTTPX otimizadas
-    http_limits = httpx.Limits(
-        max_connections=MAX_CONCURRENT_SEARCHES,
-        max_keepalive_connections=40
     )
-    async with httpx.AsyncClient(timeout=12.0, limits=http_limits) as http_client:
         async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
-            # Fase 1: Busca em lote (todas as pesquisas em paralelo)
-            print("Fase 1: Executando pesquisas em paralelo...")
-            search_start = time.time()
             search_results = await search_brave_batch(http_client, terms)
-            print(f"Pesquisas concluídas em {time.time() - search_start:.2f}s")
-            # Fase 2: Coleta e organiza todas as URLs
-            all_urls = []
             for term, results in search_results:
                 for result in results:
-                    all_urls.append({
-                        "url": result["url"],
-                        "age": result["age"],
-                        "term": term
-                    })
-            print(f"Total de URLs coletadas: {len(all_urls)}")
-            # Fase 3: Extração ultra-paralela
-            print("Fase 2: Extraindo conteúdo em máximo paralelismo...")
-            extraction_start = time.time()
-            final_results = await process_urls_ultra_parallel(session, all_urls, used_urls)
-            print(f"Extração concluída em {time.time() - extraction_start:.2f}s")
-    total_time = time.time() - start_time
-    print(f"Processo completo em {total_time:.2f}s - {len(final_results)} artigos extraídos")
-    # Cria o JSON final
     result_data = {"results": final_results}
-    # Cria arquivo temporário
-    temp_file_info = create_temp_file(result_data)
     return {
         "message": "Dados salvos em arquivo temporário",
         "total_results": len(final_results),
         "context": context,
         "generated_terms": terms,
-        "processing_time_seconds": round(total_time, 2),
-        "urls_processed": len(all_urls),
-        "file_info": temp_file_info
     }
 @router.get("/download-temp/{file_id}")
 async def download_temp_file(file_id: str):
     """Endpoint para download do arquivo temporário"""

 from threading import Timer
 from google import genai
 from google.genai import types
+from asyncio import Queue, create_task, gather
+from concurrent.futures import ThreadPoolExecutor
+import aiofiles
+import ujson  # JSON mais rápido
 router = APIRouter()
     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
 ]
# Hosts whose pages are never fetched. A frozenset because this is fixed
# configuration; subdomain matching is done by is_blocked_domain.
BLOCKED_DOMAINS = frozenset({
    "reddit.com", "www.reddit.com", "old.reddit.com",
    "quora.com", "www.quora.com",
})
MAX_TEXT_LENGTH = 4000  # Extracted article text is truncated to this many characters
MAX_CONCURRENT_SEARCHES = 30  # Cap on simultaneous search requests (lowered from 50)
MAX_CONCURRENT_EXTRACTIONS = 80  # Cap on simultaneous article extractions (lowered from 100)
EXTRACTION_TIMEOUT = 8  # Seconds allowed for downloading a single article page
HTTP_TIMEOUT = 10  # Seconds; used for both the aiohttp session and the httpx client
 # Diretório para arquivos temporários
 TEMP_DIR = Path("/tmp")
 # Dicionário para controlar arquivos temporários
 temp_files = {}
# Thread pool used to run blocking extraction work off the event loop.
thread_pool = ThreadPoolExecutor(max_workers=20)

# Memo of host -> blocked verdict. Bounded so a long-running process cannot
# accumulate one entry for every host it has ever seen.
domain_cache = {}
_DOMAIN_CACHE_MAX_ENTRIES = 4096


def is_blocked_domain(url: str) -> bool:
    """Return True when *url* points at a blocked host or one of its subdomains.

    The comparison uses the parsed hostname (no port, no ``user:pass@`` prefix),
    so ``https://reddit.com:443/r/x`` is recognised as blocked. URLs that cannot
    be parsed, or that have no host, are treated as not blocked.
    """
    try:
        # ``hostname`` is already lower-cased and stripped of port/userinfo.
        host = urlparse(url).hostname or ""
        if not host:
            return False
        verdict = domain_cache.get(host)
        if verdict is None:
            verdict = any(
                host == blocked or host.endswith("." + blocked)
                for blocked in BLOCKED_DOMAINS
            )
            if len(domain_cache) >= _DOMAIN_CACHE_MAX_ENTRIES:
                # Cheap eviction: recomputing a verdict costs a few comparisons.
                domain_cache.clear()
            domain_cache[host] = verdict
        return verdict
    except (ValueError, TypeError, AttributeError):
        # Malformed URL or non-string input: fall back to "not blocked".
        return False
 def clamp_text(text: str) -> str:
     """Return *text* truncated to at most MAX_TEXT_LENGTH characters.

     Falsy input (``None`` or ``""``) yields ``""`` so the declared ``str``
     return type always holds.
     """
     if not text:
         return ""
     # Slicing is a no-op for text that is already short enough.
     return text[:MAX_TEXT_LENGTH]
 def get_realistic_headers() -> Dict[str, str]:
     return {
         "Accept-Language": "en-US,en;q=0.7,pt-BR;q=0.6",
         "Connection": "keep-alive",
         "Accept-Encoding": "gzip, deflate, br",
     }
 def delete_temp_file(file_id: str, file_path: Path):
     """Remove arquivo temporário após expiração"""
     try:
     except Exception as e:
         print(f"Erro ao remover arquivo temporário: {e}")
+async def create_temp_file(data: Dict[str, Any]) -> Dict[str, str]:
+    """Cria arquivo temporário assíncrono e agenda sua remoção"""
     file_id = str(uuid.uuid4())
     file_path = TEMP_DIR / f"fontes_{file_id}.txt"
+    # Salva o JSON no arquivo de forma assíncrona
+    async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
+        await f.write(ujson.dumps(data, ensure_ascii=False, indent=2))
+    # Agenda remoção em 24 horas
     timer = Timer(86400, delete_temp_file, args=[file_id, file_path])
     timer.start()
         "expires_in_hours": 24
     }
 async def generate_search_terms(context: str) -> List[str]:
     """Gera termos de pesquisa usando o modelo Gemini"""
     try:
         ]
         generate_content_config = types.GenerateContentConfig(
+            thinking_config=types.ThinkingConfig(thinking_budget=0),
         )
         # Coletamos toda a resposta em stream
         # Tenta extrair o JSON da resposta
         try:
             clean_response = full_response.strip()
             if clean_response.startswith("```json"):
                 clean_response = clean_response[7:]
                 clean_response = clean_response[:-3]
             clean_response = clean_response.strip()
+            response_data = ujson.loads(clean_response)
             terms = response_data.get("terms", [])
             if not isinstance(terms, list):
                 raise ValueError("Terms deve ser uma lista")
+            return terms[:20]
+        except (ujson.JSONDecodeError, ValueError) as e:
             print(f"Erro ao parsear resposta do Gemini: {e}")
             return []
     except Exception as e:
         print(f"Erro ao gerar termos de pesquisa: {str(e)}")
         return []
 async def search_brave_batch(client: httpx.AsyncClient, terms: List[str]) -> List[Tuple[str, List[Dict[str, str]]]]:
     """Run one Brave search per term concurrently.

     Returns one ``(term, results)`` pair per input term, in input order. Each
     result is a dict with ``"url"`` and ``"age"``; blocked domains are left
     out. A non-200 response or any error yields an empty result list for
     that term.
     """
     limiter = asyncio.Semaphore(MAX_CONCURRENT_SEARCHES)

     async def _search_one(term: str) -> Tuple[str, List[Dict[str, str]]]:
         async with limiter:
             query = {"q": term, "count": 10, "safesearch": "off", "summary": "false"}
             try:
                 response = await client.get(BRAVE_SEARCH_URL, headers=BRAVE_HEADERS, params=query)
                 if response.status_code != 200:
                     return (term, [])
                 payload = response.json()
                 hits = []
                 has_results = "web" in payload and "results" in payload["web"]
                 entries = payload["web"]["results"] if has_results else []
                 for entry in entries:
                     link = entry.get("url")
                     age = entry.get("age", "Unknown")
                     if not link or is_blocked_domain(link):
                         continue
                     hits.append({"url": link, "age": age})
                 return (term, hits)
             except Exception as e:
                 print(f"Erro na busca do termo '{term}': {e}")
                 return (term, [])

     # Launch every search at once; the semaphore bounds how many are in flight.
     pending = [_search_one(term) for term in terms]
     return await gather(*pending, return_exceptions=False)
def extract_with_trafilatura(html: str) -> str:
    """Extract the main text from *html* with trafilatura.

    Returns the stripped text, or ``""`` when nothing is extracted or the
    extractor raises.
    """
    try:
        content = trafilatura.extract(html)
        if not content:
            return ""
        return content.strip()
    except Exception:
        return ""
def extract_with_newspaper(url: str) -> str:
    """Download and parse *url* with newspaper's ``Article``.

    Returns the stripped article text, or ``""`` when the article has no text
    or any step raises.
    """
    try:
        article = Article(url)
        settings = article.config
        settings.browser_user_agent = random.choice(USER_AGENTS)
        settings.request_timeout = 6
        settings.number_threads = 1
        article.download()
        article.parse()
        body = article.text
        return body.strip() if body else ""
    except Exception:
        return ""
async def extract_article_text_optimized(url: str, session: aiohttp.ClientSession) -> str:
    """Return the article text for *url*, or ``""`` when nothing usable is found.

    Strategy:
      1. Download the page through *session* and run trafilatura on it, unless
         the first 2000 characters look like a paywall/captcha page.
      2. If that produced no text longer than 100 characters, fall back to
         newspaper, which downloads the page itself.

    Both extractors run in ``thread_pool`` so they do not block the event loop.
    The result is passed through ``clamp_text``. Errors are logged and never
    raised to the caller.
    """
    # get_running_loop() is the supported way to get the loop inside a coroutine.
    loop = asyncio.get_running_loop()

    # Method 1: HTTP download + trafilatura.
    html = ""
    try:
        headers = get_realistic_headers()
        request_timeout = aiohttp.ClientTimeout(total=EXTRACTION_TIMEOUT)
        async with session.get(url, headers=headers, timeout=request_timeout) as resp:
            if resp.status == 200:
                html = await resp.text()
    except Exception as e:
        print(f"Erro HTTP para {url}: {e}")

    # The connection is released before extraction starts, so slow parsing
    # does not hold a slot in the connector.
    if html and not re.search(
        r"(paywall|subscribe|metered|registration|captcha|access denied)",
        html[:2000],
        re.I,
    ):
        try:
            extracted = await loop.run_in_executor(thread_pool, extract_with_trafilatura, html)
            extracted = (extracted or "").strip()
            if len(extracted) > 100:
                return clamp_text(extracted)
        except Exception as e:
            print(f"Erro trafilatura para {url}: {e}")

    # Method 2: newspaper fallback.
    try:
        fallback = await loop.run_in_executor(thread_pool, extract_with_newspaper, url)
        fallback = (fallback or "").strip()
        if len(fallback) > 100:
            return clamp_text(fallback)
    except Exception as e:
        print(f"Erro newspaper para {url}: {e}")

    return ""
async def process_urls_batch(session: aiohttp.ClientSession, urls_data: List[Tuple[str, str, str]]) -> List[Dict[str, Any]]:
    """Extract article text for every distinct URL in *urls_data* concurrently.

    Args:
        session: aiohttp session handed to ``extract_article_text_optimized``.
        urls_data: ``(term, url, age)`` tuples. The same URL may appear under
            several terms.

    Returns:
        One dict per successfully extracted URL with keys ``term``, ``age``,
        ``url`` and ``text``. Each URL appears at most once, attributed to the
        first term that produced it.
    """
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_EXTRACTIONS)
    success_count = 0

    # Deduplicate BEFORE scheduling. Checking a shared "already used" set inside
    # the tasks does not work: all tasks start before any of them finishes, so
    # duplicates would each pass the check and be fetched and returned twice.
    first_seen: Dict[str, Tuple[str, str, str]] = {}
    for term, url, age in urls_data:
        first_seen.setdefault(url, (term, url, age))

    async def process_single_url(term: str, url: str, age: str) -> Optional[Dict[str, Any]]:
        nonlocal success_count
        async with semaphore:
            try:
                text = await extract_article_text_optimized(url, session)
            except Exception as e:
                print(f"✗ Erro: {url[:60]}... - {str(e)[:50]}")
                return None
            if not text:
                print(f"✗ Falhou: {url[:60]}... (sem conteúdo)")
                return None
            success_count += 1
            print(f"✓ Extraído: {url[:60]}... ({len(text)} chars)")
            return {
                "term": term,
                "age": age,
                "url": url,
                "text": text
            }

    # Schedule everything at once; the semaphore bounds the real concurrency.
    tasks = [process_single_url(*item) for item in first_seen.values()]
    print(f"Processando {len(tasks)} URLs com semáforo de {MAX_CONCURRENT_EXTRACTIONS}...")

    processed_results = await gather(*tasks, return_exceptions=True)

    # Keep only real results. Checking for dict also drops BaseException
    # instances (e.g. CancelledError) that an Exception check would let through.
    valid_results = [r for r in processed_results if isinstance(r, dict)]
    print(f"Sucesso: {success_count}/{len(tasks)} URLs extraídas")
    return valid_results
 @router.post("/search-terms")
 async def search_terms(payload: Dict[str, str] = Body(...)) -> Dict[str, Any]:
     # Pipeline: validate the 'context' field, ask Gemini for search terms,
     # search every term, extract the article text of every hit, write the
     # results to a temporary file and return a summary with the file info.
     # Raises HTTP 400 for a missing/blank context and HTTP 500 when no terms
     # could be generated.
     started_at = time.time()

     def elapsed() -> float:
         return time.time() - started_at

     context = payload.get("context")
     if not context or not isinstance(context, str):
         raise HTTPException(status_code=400, detail="Campo 'context' é obrigatório e deve ser uma string.")
     if not context.strip():
         raise HTTPException(status_code=400, detail="Campo 'context' não pode estar vazio.")

     print(f"Iniciando geração de termos...")
     terms = await generate_search_terms(context)
     if not terms:
         raise HTTPException(status_code=500, detail="Não foi possível gerar termos de pesquisa válidos.")
     print(f"Termos gerados em {elapsed():.2f}s. Iniciando buscas...")

     # aiohttp session settings used for the article downloads.
     connector = aiohttp.TCPConnector(
         limit=200,
         limit_per_host=30,
         ttl_dns_cache=300,
         use_dns_cache=True,
         enable_cleanup_closed=True,
     )
     timeout = aiohttp.ClientTimeout(total=HTTP_TIMEOUT, connect=5)

     # httpx client used for the search requests; closed in the finally below.
     pool_limits = httpx.Limits(max_connections=200, max_keepalive_connections=50)
     http_client = httpx.AsyncClient(timeout=HTTP_TIMEOUT, limits=pool_limits, http2=True)

     try:
         async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
             # Phase 1: search all terms concurrently.
             search_results = await search_brave_batch(http_client, terms)
             print(f"Buscas concluídas em {elapsed():.2f}s. Iniciando extrações...")

             # Phase 2: flatten the hits into (term, url, age) tuples.
             urls_data = [
                 (term, hit["url"], hit["age"])
                 for term, hits in search_results
                 for hit in hits
             ]
             print(f"Processando {len(urls_data)} URLs...")

             # Phase 3: extract every URL concurrently.
             final_results = await process_urls_batch(session, urls_data)
             print(f"Extração concluída em {elapsed():.2f}s. Salvando arquivo...")
     finally:
         await http_client.aclose()

     # Phase 4: write the results to a temporary file.
     temp_file_info = await create_temp_file({"results": final_results})

     total_time = elapsed()
     print(f"Processo completo em {total_time:.2f}s")

     return {
         "message": "Dados salvos em arquivo temporário",
         "total_results": len(final_results),
         "context": context,
         "generated_terms": terms,
         "file_info": temp_file_info,
         "processing_time": f"{total_time:.2f}s"
     }
 @router.get("/download-temp/{file_id}")
 async def download_temp_file(file_id: str):
     """Endpoint para download do arquivo temporário"""