Shreyas094 committed
Commit c6a0be6
Parent(s): 6552a74
Update app.py

app.py CHANGED
@@ -142,35 +142,50 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 
-def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
+def scrape_with_trafilatura(url, max_chars=None, timeout=5):
+    """
+    Scrape web content using Trafilatura with simplified error handling and fallback options.
+
+    Args:
+        url (str): The URL to scrape
+        max_chars (int, optional): Maximum number of characters to return
+        timeout (int, optional): Request timeout in seconds
+
+    Returns:
+        str: Extracted content or empty string if extraction fails
+    """
     try:
+        # Make the request with timeout
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
-        downloaded = response.text
-        content = ""
-
-        if use_beautifulsoup:
-            soup = BeautifulSoup(downloaded, "lxml")
-            # Convert BeautifulSoup object to a string
-            html_string = str(soup)
-            # Use Trafilatura's extract function directly on the HTML string
-            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
 
-        #
-
-
-
+        # Extract content from the downloaded HTML
+        content = extract(
+            response.text,
+            include_comments=False,
+            include_tables=True,
+            no_fallback=False
+        )
 
-        # If extraction failed or BeautifulSoup was not used, extract from the downloaded text
+        # If first attempt fails, try direct URL extraction
         if not content:
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+            content = extract(
+                url,
+                include_comments=False,
+                include_tables=True,
+                no_fallback=False
+            )
+
+        # Return content with optional length limit
+        if content and max_chars:
+            return content[:max_chars]
+        return content or ""
 
-        return (content or "")[:max_chars] if max_chars else (content or "")
     except requests.Timeout:
-        logger.error(f"Timeout error while scraping {url} with Trafilatura")
+        logger.error(f"Timeout error while scraping {url}")
         return ""
     except Exception as e:
-        logger.error(f"Error scraping {url} with Trafilatura: {e}")
+        logger.error(f"Error scraping {url}: {str(e)}")
         return ""
 
 def rephrase_query(chat_history, query, temperature=0.2):
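After this change, scrape_with_trafilatura() always fetches the page with requests and passes the raw HTML to Trafilatura's extract(). A minimal usage sketch; the URL and the print are illustrative, not part of the commit:

    # Assumes the imports already present in app.py (requests, trafilatura's extract, logger)
    text = scrape_with_trafilatura("https://example.com", max_chars=500, timeout=5)
    print(text or "extraction failed or timed out")

One caveat on the fallback: trafilatura's extract() expects an HTML string (or parsed tree) rather than a URL, so the second extract(url, ...) attempt will most likely return None; fetching by URL is normally done with trafilatura.fetch_url(url) followed by extract().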
@@ -307,9 +322,23 @@ Remember to focus on financial aspects and implications in your assessment and summary.
         return "Error: Unable to assess relevance and summarize"
 
 def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
+    """
+    Unified content scraper that supports multiple scraping methods.
+
+    Args:
+        url (str): The URL to scrape
+        scraper (str): Scraping method to use ('bs4', 'trafilatura', 'scrapy', 'newspaper')
+        max_chars (int): Maximum number of characters to return
+        timeout (int): Request timeout in seconds
+
+    Returns:
+        str: Scraped content or empty string if scraping fails
+    """
     try:
         logger.info(f"Scraping full content from: {url}")
 
+        content = ""
+
         if scraper == "bs4":
             session = requests_retry_session()
             response = session.get(url, timeout=timeout)
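The bs4 branch calls a requests_retry_session() helper that is defined elsewhere in app.py and does not appear in this diff. For context, here is the common shape of such a helper built on urllib3's Retry; this is an assumed sketch, not the commit's actual definition:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    def requests_retry_session(retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None):
        # Assumed sketch: a Session that retries transient failures with exponential backoff
        session = session or requests.Session()
        retry = Retry(
            total=retries,
            read=retries,
            connect=retries,
            backoff_factor=backoff_factor,
            status_forcelist=status_forcelist,
        )
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        return session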
@@ -317,26 +346,39 @@ def scrape_full_content(url, scraper="bs4", max_chars=3000, timeout=5):
             soup = BeautifulSoup(response.content, 'html.parser')
 
             # Try to find the main content
-            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
+            main_content = (
+                soup.find('main') or
+                soup.find('article') or
+                soup.find('div', class_='content')
+            )
+
+            content = main_content.get_text(strip=True, separator='\n') if main_content else soup.get_text(strip=True, separator='\n')
 
-            if main_content:
-                content = main_content.get_text(strip=True, separator='\n')
-            else:
-                content = soup.get_text(strip=True, separator='\n')
         elif scraper == "trafilatura":
-            content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
+            content = scrape_with_trafilatura(url, max_chars, timeout)
+
         elif scraper == "scrapy":
             content = scrape_with_scrapy(url, timeout)
+
         elif scraper == "newspaper":
-
+            article = Article(url)
+            article.download()
+            article.parse()
+            content = article.text
+
         else:
             logger.error(f"Unknown scraper: {scraper}")
             return ""
 
-        # Limit the content to max_chars
-
-
-
+        # Standardize whitespace and limit content length
+        if content:
+            content = " ".join(content.split())  # Standardize whitespace
+            return content[:max_chars] if max_chars else content
+
+        return ""
+
+    except requests.Timeout:
+        logger.error(f"Timeout error while scraping {url}")
         return ""
     except Exception as e:
         logger.error(f"Error scraping full content from {url}: {e}")
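Note that the new closing block runs " ".join(content.split()) on every backend's output, which collapses all whitespace, including the newlines the bs4 branch just inserted via separator='\n'; callers therefore always receive single-line text. A quick way to exercise the unified entry point; the URL is illustrative, and the 'scrapy' and 'newspaper' branches require scrape_with_scrapy and newspaper's Article to be available:

    for scraper in ("bs4", "trafilatura", "newspaper"):
        content = scrape_full_content("https://example.com", scraper=scraper, max_chars=3000)
        print(f"{scraper}: {len(content)} chars")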