SearXNG-WebSearch-Agent

Running

App Files Files Community

Shreyas094 committed on Oct 2, 2024

Commit

e4abe82

•

1 Parent(s): 9bc0e06

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -6

app.py CHANGED Viewed

@@ -91,29 +91,35 @@ def scrape_with_bs4(url, session, max_chars=None):
         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
 def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
     try:
         response = requests.get(url, timeout=timeout)
         response.raise_for_status()
         downloaded = response.text
         content = ""
         if use_beautifulsoup:
             soup = BeautifulSoup(downloaded, "lxml")
             # Convert BeautifulSoup object to a string
             html_string = str(soup)
-            # Use Trafilatura's parse_html function
-            tree = parse_html(html_string)
-            content = extract(tree, include_comments=False, include_tables=True, no_fallback=False)
         # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
         if not content and use_beautifulsoup:
             logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
             content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
-        # If still no content, use the direct method
         if not content:
-            content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
         return (content or "")[:max_chars] if max_chars else (content or "")
-    except Timeout:
         logger.error(f"Timeout error while scraping {url} with Trafilatura")
         return ""
     except Exception as e:

         logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
         return ""
+from bs4 import BeautifulSoup
+from trafilatura import extract
+import requests
 def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
     try:
         response = requests.get(url, timeout=timeout)
         response.raise_for_status()
         downloaded = response.text
         content = ""
         if use_beautifulsoup:
             soup = BeautifulSoup(downloaded, "lxml")
             # Convert BeautifulSoup object to a string
             html_string = str(soup)
+            # Use Trafilatura's extract function directly on the HTML string
+            content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
         # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
         if not content and use_beautifulsoup:
             logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
             content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
+        # If still no content, use the URL directly
         if not content:
+            content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
         return (content or "")[:max_chars] if max_chars else (content or "")
+    except requests.Timeout:
         logger.error(f"Timeout error while scraping {url} with Trafilatura")
         return ""
     except Exception as e: