Shreyas094
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -21,8 +21,8 @@ import os
|
|
21 |
from dotenv import load_dotenv
|
22 |
import certifi
|
23 |
from bs4 import BeautifulSoup
|
24 |
-
|
25 |
-
from trafilatura.
|
26 |
|
27 |
# Load environment variables from a .env file
|
28 |
load_dotenv()
|
@@ -99,8 +99,11 @@ def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=Fa
|
|
99 |
content = ""
|
100 |
if use_beautifulsoup:
|
101 |
soup = BeautifulSoup(downloaded, "lxml")
|
102 |
-
|
103 |
-
|
|
|
|
|
|
|
104 |
|
105 |
# Fallback mechanism: if BeautifulSoup didn't yield results, try without it
|
106 |
if not content and use_beautifulsoup:
|
|
|
21 |
from dotenv import load_dotenv
|
22 |
import certifi
|
23 |
from bs4 import BeautifulSoup
|
24 |
+
|
25 |
+
from trafilatura.core import parse_html
|
26 |
|
27 |
# Load environment variables from a .env file
|
28 |
load_dotenv()
|
|
|
99 |
content = ""
|
100 |
if use_beautifulsoup:
|
101 |
soup = BeautifulSoup(downloaded, "lxml")
|
102 |
+
# Convert BeautifulSoup object to a string
|
103 |
+
html_string = str(soup)
|
104 |
+
# Use Trafilatura's parse_html function
|
105 |
+
tree = parse_html(html_string)
|
106 |
+
content = extract(tree, include_comments=False, include_tables=True, no_fallback=False)
|
107 |
|
108 |
# Fallback mechanism: if BeautifulSoup didn't yield results, try without it
|
109 |
if not content and use_beautifulsoup:
|