ragV98 committed on
Commit
a62e0f6
·
1 Parent(s): 9d73da0

refining scraper

Browse files
Files changed (1) hide show
  1. components/fetchers/scraper.py +28 -8
components/fetchers/scraper.py CHANGED
@@ -13,29 +13,49 @@ HEADERS = {
13
  }
14
 
15
  def clean_text(text: str) -> str:
16
- # Remove excess whitespace, ads, and headings
17
- cleaned = text.replace("\n", " ").strip()
18
- cleaned = BeautifulSoup(cleaned, "html.parser").text # remove tags
19
- cleaned = " ".join(cleaned.split()) # remove multiple spaces
20
  return cleaned
21
 
 
 
 
 
 
 
 
 
 
 
22
  def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
 
23
  try:
24
  response = requests.get(url, timeout=timeout, headers=HEADERS)
25
  if response.status_code == 200:
26
  html = response.text
27
  extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
28
- if extracted and len(extracted.split()) > 100:
29
- return clean_text(extracted)
 
 
 
 
30
  except Exception as e:
31
  print(f"⚠️ Trafilatura failed for {url}: {e}")
32
 
 
33
  try:
34
  article = Article(url)
35
  article.download()
36
  article.parse()
37
- if article.text and len(article.text.split()) > 100:
38
- return clean_text(article.text)
 
 
 
 
39
  except Exception as e:
40
  print(f"⚠️ Newspaper3k failed for {url}: {e}")
41
 
 
13
  }
14
 
15
def clean_text(text: str) -> str:
    """Strip any HTML markup from *text* and collapse whitespace runs.

    Returns the plain-text content with words separated by single spaces.
    """
    # Parse once; get_text(strip=True) trims whitespace from each fragment
    # and joins the fragments with single spaces.
    plain = BeautifulSoup(text, "html.parser").get_text(separator=" ", strip=True)
    # Collapse any remaining internal whitespace runs to single spaces.
    return " ".join(plain.split())
21
 
22
def is_low_quality(text: str) -> bool:
    """Detect navigation garbage, footers, or low-word-count dumps.

    A text is considered low quality when it is empty, shorter than
    120 words, or contains any boilerplate marker as a case-insensitive
    substring.

    Returns:
        True when the text should be discarded, False when it looks
        like genuine article content.
    """
    if not text or len(text.split()) < 120:
        return True
    # Lowercase once; the original recomputed text.lower() for every
    # marker inside the any(...) generator.
    lowered = text.lower()
    junk_markers = [
        "subscribe", "click here", "latest headlines", "more from", "privacy policy",
        "video", "terms of service", "back to top", "all rights reserved"
    ]
    # NOTE(review): bare substring matching means common words such as
    # "video" can flag legitimate articles — consider word-boundary or
    # marker-density checks if good pages are being dropped.
    return any(marker in lowered for marker in junk_markers)
31
+
32
  def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
33
+ # Try Trafilatura first
34
  try:
35
  response = requests.get(url, timeout=timeout, headers=HEADERS)
36
  if response.status_code == 200:
37
  html = response.text
38
  extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
39
+ if extracted:
40
+ text = clean_text(extracted)
41
+ if not is_low_quality(text):
42
+ return text
43
+ else:
44
+ print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
45
  except Exception as e:
46
  print(f"⚠️ Trafilatura failed for {url}: {e}")
47
 
48
+ # Fallback to newspaper3k
49
  try:
50
  article = Article(url)
51
  article.download()
52
  article.parse()
53
+ if article.text:
54
+ text = clean_text(article.text)
55
+ if not is_low_quality(text):
56
+ return text
57
+ else:
58
+ print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
59
  except Exception as e:
60
  print(f"⚠️ Newspaper3k failed for {url}: {e}")
61