Shreyas094 committed
Commit d07bea9 • 1 Parent(s): 07efc76
Update app.py
app.py CHANGED
@@ -7,6 +7,7 @@ from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry
 from trafilatura import fetch_url, extract
 from trafilatura import extract
+from requests.exceptions import Timeout
 from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 import json
@@ -87,11 +88,16 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url, max_chars=None):
+def scrape_with_trafilatura(url, max_chars=None, timeout=10):
     try:
-        downloaded = fetch_url(url)
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        downloaded = response.text
         content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
         return (content or "")[:max_chars] if max_chars else (content or "")
+    except Timeout:
+        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping {url} with Trafilatura: {e}")
         return ""
@@ -228,13 +234,13 @@ Remember to focus on financial aspects and implications in your assessment and s
         logger.error(f"Error assessing relevance and summarizing with LLM: {e}")
         return "Error: Unable to assess relevance and summarize"
 
-def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
+def scrape_full_content(url, scraper="trafilatura", max_chars=3000, timeout=10):
     try:
         logger.info(f"Scraping full content from: {url}")
 
         if scraper == "bs4":
             session = requests_retry_session()
-            response = session.get(url, timeout=
+            response = session.get(url, timeout=timeout)
             response.raise_for_status()
             soup = BeautifulSoup(response.content, 'html.parser')
 
@@ -246,11 +252,13 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000):
             else:
                 content = soup.get_text(strip=True, separator='\n')
         else:  # trafilatura
-            downloaded = fetch_url(url)
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+            content = scrape_with_trafilatura(url, max_chars, timeout)
 
         # Limit the content to max_chars
         return content[:max_chars] if content else ""
+    except Timeout:
+        logger.error(f"Timeout error while scraping full content from {url}")
+        return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
         return ""
@@ -298,7 +306,7 @@ from trafilatura.settings import use_config
 from urllib.request import urlopen, Request
 
 def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura", max_chars=3000, time_range="", language="all", category="",
-                      engines=[], safesearch=2, method="GET", llm_temperature=0.2):
+                      engines=[], safesearch=2, method="GET", llm_temperature=0.2, timeout=10):
     try:
         # Step 1: Rephrase the Query
         rephrased_query = rephrase_query(chat_history, query, temperature=llm_temperature)
@@ -407,7 +415,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
             config = use_config()
             config.set("DEFAULT", "USER_AGENT", ua)
 
-            content = scrape_with_trafilatura(url, max_chars)
+            content = scrape_with_trafilatura(url, max_chars, timeout=timeout)
 
             if content:
                 break
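For reference, a minimal standalone sketch of the timeout pattern this commit applies: an explicit timeout on requests.get plus a dedicated Timeout handler ahead of the generic except, so a stalled host returns an empty string instead of hanging. The helper name fetch_and_extract, the example URL, and the 3-second value are illustrative only, not part of the commit.

import logging

import requests
from requests.exceptions import Timeout
from trafilatura import extract

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def fetch_and_extract(url, max_chars=None, timeout=3):
    # Hypothetical helper mirroring the commit's pattern, not code from app.py.
    try:
        response = requests.get(url, timeout=timeout)  # raises Timeout if the host stalls
        response.raise_for_status()                     # raises HTTPError on 4xx/5xx
        content = extract(response.text, include_comments=False,
                          include_tables=True, no_fallback=False)
        return (content or "")[:max_chars] if max_chars else (content or "")
    except Timeout:
        logger.error(f"Timeout error while scraping {url}")
        return ""
    except Exception as e:
        logger.error(f"Error scraping {url}: {e}")
        return ""

if __name__ == "__main__":
    # A slow or unreachable host now fails fast instead of blocking the caller.
    print(fetch_and_extract("https://example.com", max_chars=500))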