refining scraper
components/fetchers/scraper.py
CHANGED
@@ -13,29 +13,49 @@ HEADERS = {
 }
 
 def clean_text(text: str) -> str:
-    # Remove …
-    …
-    cleaned = …
-    cleaned = " ".join(cleaned.split())
+    # Remove HTML tags, collapse whitespace
+    soup = BeautifulSoup(text, "html.parser")
+    cleaned = soup.get_text(separator=" ", strip=True)
+    cleaned = " ".join(cleaned.split())
     return cleaned
 
+def is_low_quality(text: str) -> bool:
+    """Detect navigation garbage, footers, or low-word-count dumps."""
+    if not text or len(text.split()) < 120:
+        return True
+    junk_markers = [
+        "subscribe", "click here", "latest headlines", "more from", "privacy policy",
+        "video", "terms of service", "back to top", "all rights reserved"
+    ]
+    return any(marker in text.lower() for marker in junk_markers)
+
 def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
+    # Try Trafilatura first
     try:
         response = requests.get(url, timeout=timeout, headers=HEADERS)
         if response.status_code == 200:
             html = response.text
             extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
-            if extracted:
-                return clean_text(extracted)
+            if extracted:
+                text = clean_text(extracted)
+                if not is_low_quality(text):
+                    return text
+                else:
+                    print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
     except Exception as e:
         print(f"⚠️ Trafilatura failed for {url}: {e}")
 
+    # Fallback to newspaper3k
     try:
         article = Article(url)
         article.download()
         article.parse()
-        if article.text:
-            return clean_text(article.text)
+        if article.text:
+            text = clean_text(article.text)
+            if not is_low_quality(text):
+                return text
+            else:
+                print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
     except Exception as e:
         print(f"⚠️ Newspaper3k failed for {url}: {e}")
 
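Note: the hunk relies on names defined above it in scraper.py (requests, trafilatura, BeautifulSoup, Article, Optional, HEADERS). A minimal sketch of the module preamble it presumably assumes follows; the real HEADERS contents sit outside the hunk, so the User-Agent value below is a placeholder.

# Sketch of the imports this hunk assumes; the actual file may differ.
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article  # newspaper3k

HEADERS = {
    # Placeholder; the real header set is not shown in this diff.
    "User-Agent": "Mozilla/5.0",
}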
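A quick way to exercise the new quality gate, with an illustrative URL (example.com stands in for a real article page):

if __name__ == "__main__":
    # A nav-bar fragment fails the 120-word minimum, so it is flagged
    assert is_low_quality("Home | News | Video | Subscribe")

    # Long text containing a junk marker is also flagged, because the
    # marker scan runs over the entire lowercased body
    body = " ".join(["word"] * 200) + " privacy policy"
    assert is_low_quality(body)

    result = scrape_url("https://example.com/article")  # illustrative URL
    print(result[:200] if result else "No usable text extracted")

Worth noting: since the marker scan matches anywhere in the body, an article that merely contains the word "video" or quotes "privacy policy" will be discarded. Restricting the scan to the first and last few sentences, or requiring two or more markers to co-occur, would make the filter less aggressive.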