SearXNG-WebSearch-Agent

Running

Shreyas094 committed on Oct 2, 2024

Commit

9bc0e06

verified ·

1 Parent(s): 6c0f253

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -21,8 +21,8 @@ import os
 from dotenv import load_dotenv
 import certifi
 from bs4 import BeautifulSoup
-from trafilatura import extract
-from trafilatura.htmlprocessing import convert_tree
 # Load environment variables from a .env file
 load_dotenv()
@@ -99,8 +99,11 @@ def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=Fa
         content = ""
         if use_beautifulsoup:
             soup = BeautifulSoup(downloaded, "lxml")
-            lxml_tree = convert_tree(soup)[0]
-            content = extract(lxml_tree, include_comments=False, include_tables=True, no_fallback=False)
         # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
         if not content and use_beautifulsoup:

 from dotenv import load_dotenv
 import certifi
 from bs4 import BeautifulSoup
+from trafilatura.core import parse_html
 # Load environment variables from a .env file
 load_dotenv()
         content = ""
         if use_beautifulsoup:
             soup = BeautifulSoup(downloaded, "lxml")
+            # Convert BeautifulSoup object to a string
+            html_string = str(soup)
+            # Use Trafilatura's parse_html function
+            tree = parse_html(html_string)
+            content = extract(tree, include_comments=False, include_tables=True, no_fallback=False)
         # Fallback mechanism: if BeautifulSoup didn't yield results, try without it
         if not content and use_beautifulsoup: