Shreyas094
committed on
Commit
•
e4abe82
1
Parent(s):
9bc0e06
Update app.py
Browse files
app.py
CHANGED
@@ -91,29 +91,35 @@ def scrape_with_bs4(url, session, max_chars=None):
|
|
91 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
92 |
return ""
|
93 |
|
|
|
|
|
|
|
|
|
94 |
def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
|
95 |
try:
|
96 |
response = requests.get(url, timeout=timeout)
|
97 |
response.raise_for_status()
|
98 |
downloaded = response.text
|
99 |
content = ""
|
|
|
100 |
if use_beautifulsoup:
|
101 |
soup = BeautifulSoup(downloaded, "lxml")
|
102 |
# Convert BeautifulSoup object to a string
|
103 |
html_string = str(soup)
|
104 |
-
# Use Trafilatura's
|
105 |
-
|
106 |
-
content = extract(tree, include_comments=False, include_tables=True, no_fallback=False)
|
107 |
|
108 |
# Fallback mechanism: if BeautifulSoup didn't yield results, try without it
|
109 |
if not content and use_beautifulsoup:
|
110 |
logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
|
111 |
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
112 |
-
|
|
|
113 |
if not content:
|
114 |
-
content = extract(
|
|
|
115 |
return (content or "")[:max_chars] if max_chars else (content or "")
|
116 |
-
except Timeout:
|
117 |
logger.error(f"Timeout error while scraping {url} with Trafilatura")
|
118 |
return ""
|
119 |
except Exception as e:
|
|
|
91 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
92 |
return ""
|
93 |
|
94 |
+
from bs4 import BeautifulSoup
|
95 |
+
from trafilatura import extract
|
96 |
+
import requests
|
97 |
+
|
98 |
def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
|
99 |
try:
|
100 |
response = requests.get(url, timeout=timeout)
|
101 |
response.raise_for_status()
|
102 |
downloaded = response.text
|
103 |
content = ""
|
104 |
+
|
105 |
if use_beautifulsoup:
|
106 |
soup = BeautifulSoup(downloaded, "lxml")
|
107 |
# Convert BeautifulSoup object to a string
|
108 |
html_string = str(soup)
|
109 |
+
# Use Trafilatura's extract function directly on the HTML string
|
110 |
+
content = extract(html_string, include_comments=False, include_tables=True, no_fallback=False)
|
|
|
111 |
|
112 |
# Fallback mechanism: if BeautifulSoup didn't yield results, try without it
|
113 |
if not content and use_beautifulsoup:
|
114 |
logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
|
115 |
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
116 |
+
|
117 |
+
# If still no content, use the URL directly
|
118 |
if not content:
|
119 |
+
content = extract(url, include_comments=False, include_tables=True, no_fallback=False)
|
120 |
+
|
121 |
return (content or "")[:max_chars] if max_chars else (content or "")
|
122 |
+
except requests.Timeout:
|
123 |
logger.error(f"Timeout error while scraping {url} with Trafilatura")
|
124 |
return ""
|
125 |
except Exception as e:
|