ragV98 committed
Commit 3521e98 · 1 Parent(s): 93ca074

improvements 1

components/fetchers/scraper.py CHANGED
@@ -12,36 +12,68 @@ HEADERS = {
     )
 }
 
+
 def clean_text(text: str) -> str:
-    # Remove HTML tags, collapse whitespace
+    """Remove HTML tags and collapse whitespace."""
     soup = BeautifulSoup(text, "html.parser")
     cleaned = soup.get_text(separator=" ", strip=True)
-    cleaned = " ".join(cleaned.split())
-    return cleaned
+    return " ".join(cleaned.split())
+
 
 def is_low_quality(text: str) -> bool:
-    """Detect navigation garbage, footers, or low-word-count dumps."""
-    if not text or len(text.split()) < 120:
+    """Heuristic to detect low-value content like navbars, footers, etc."""
+    if not text or len(text.split()) < 50:
         return True
+
     junk_markers = [
-        "subscribe", "click here", "latest headlines", "more from", "privacy policy",
-        "video", "terms of service", "back to top", "all rights reserved"
+        "subscribe", "click here", "latest headlines", "more from",
+        "privacy policy", "video", "terms of service", "back to top",
+        "all rights reserved", "advertisement", "read more", "sign in"
     ]
+
     return any(marker in text.lower() for marker in junk_markers)
 
+
+def fallback_html_extract(html: str) -> Optional[str]:
+    """Very basic content extractor as a last resort."""
+    try:
+        soup = BeautifulSoup(html, "html.parser")
+        paragraphs = soup.find_all("p")
+        text = " ".join(p.get_text(strip=True) for p in paragraphs)
+        cleaned = clean_text(text)
+        return cleaned if len(cleaned.split()) >= 50 else None
+    except Exception as e:
+        print(f"⚠️ Fallback extract failed: {e}")
+        return None
+
+
 def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
-    # Try Trafilatura first
+    """Extract meaningful text from a given URL using multiple methods."""
     try:
         response = requests.get(url, timeout=timeout, headers=HEADERS)
-        if response.status_code == 200:
-            html = response.text
-            extracted = trafilatura.extract(html, include_comments=False, include_tables=False)
-            if extracted:
-                text = clean_text(extracted)
-                if not is_low_quality(text):
-                    return text
-                else:
-                    print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
+        if response.status_code != 200:
+            print(f"⚠️ Bad status ({response.status_code}) for {url}")
+            return None
+
+        html = response.text
+
+        # Attempt trafilatura
+        extracted = trafilatura.extract(
+            html,
+            include_comments=False,
+            include_tables=False,
+            no_fallback=False
+        )
+
+        if extracted:
+            text = clean_text(extracted)
+            if not is_low_quality(text):
+                return text
+            else:
+                print(f"⚠️ Skipped low-quality text from Trafilatura: {url}")
+        else:
+            print(f"⚠️ Trafilatura extraction failed or empty: {url}")
+
     except Exception as e:
         print(f"⚠️ Trafilatura failed for {url}: {e}")
 
@@ -56,7 +88,19 @@ def scrape_url(url: str, timeout: int = 10) -> Optional[str]:
                 return text
             else:
                 print(f"⚠️ Skipped low-quality text from Newspaper3k: {url}")
+        else:
+            print(f"⚠️ Newspaper3k extracted no text: {url}")
     except Exception as e:
         print(f"⚠️ Newspaper3k failed for {url}: {e}")
 
+    # Final fallback to basic HTML parsing
+    try:
+        if html:
+            fallback = fallback_html_extract(html)
+            if fallback:
+                print(f"✅ Used fallback extractor for: {url}")
+                return fallback
+    except Exception as e:
+        print(f"⚠️ Final fallback failed for {url}: {e}")
+
     return None
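
The new control flow cascades through three extractors in order: Trafilatura, then Newspaper3k, then the bare paragraph-tag fallback. One caveat: if requests.get raises before html is assigned, the final fallback's `if html:` hits a NameError that its own except merely logs, so initializing html = None at the top of scrape_url would be a safe follow-up. A minimal driver sketch, with placeholder URLs that are not from the commit:

# Minimal driver for the tiered scraper; URLs below are placeholders.
from components.fetchers.scraper import scrape_url

for url in [
    "https://example.com/full-article",    # should pass the quality gate
    "https://example.com/nav-only-page",   # likely filtered by is_low_quality
]:
    text = scrape_url(url, timeout=10)
    status = f"{len(text.split())} words" if text else "no usable content"
    print(f"{url} -> {status}")
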
components/generators/daily_feed.py CHANGED
@@ -34,13 +34,13 @@ HEADERS = {
 def build_prompt(content: str, topic: str) -> str:
     base_instruction = (
         "You are Nuse’s official news summarizer — insightful, punchy, and always on point. 🧠✨\n"
-        "Your job is to scan the content below and extract the key news items. For each item, craft a crisp summary (25–30 words), add 1–2 fitting emojis, and make it pop.\n"
-        "List each summary on a new line starting with a dash (-). This is how Nuse keeps it clean and scannable.\n"
+        "Your job is to scan the content below and extract the key news items. For each item, craft a crisp summary (15–20 words), add 1–2 fitting emojis, and make it pop.\n"
+        "List each summary on a new line starting with a dash (-) and no numbers. This is how Nuse keeps it clean and scannable.\n"
         "\n"
         "Example format:\n"
-        "- India stuns Australia in a last-ball thriller at the World Cup finals 🏏🇮🇳\n"
-        "- U.S. imposes sweeping tariffs on Chinese tech giants, rattling global markets 📉🇺🇸\n"
-        "- Ceasefire breakthrough: Netanyahu bows to pressure after week-long escalation 🔥🕊️\n"
+        "- India stuns Australia in a last-ball thriller at the World Cup finals 🏏🇮🇳\n (15–20 words)"
+        "- U.S. imposes sweeping tariffs on Chinese tech giants, rattling global markets 📉🇺🇸\n (15–20 words)"
+        "- Ceasefire breakthrough: Netanyahu bows to pressure after week-long escalation 🔥🕊️\n (15–20 words)"
         "\n"
         "Be sharp. Be brief. No fluff. No preambles. Just the summaries.\n"
         "Return only the final summary block — no extra commentary, no prompt repetition."
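
Because the prompt now pins the output to dash-prefixed, unnumbered lines, downstream parsing can stay trivial. A minimal sketch under that contract; the helper name is mine, not from the repo:

# Hypothetical helper: split the model's summary block into items,
# relying on the dash-per-line format that build_prompt requests.
def parse_summary_block(block: str) -> list[str]:
    return [
        line.strip()[2:].strip()
        for line in block.splitlines()
        if line.strip().startswith("- ")
    ]

# parse_summary_block("- Headline one 🏏\n- Headline two 📉")
# -> ["Headline one 🏏", "Headline two 📉"]
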
pipeline/news_ingest.py CHANGED
@@ -63,7 +63,7 @@ async def main():
     for query in QUERIES:
         print(f"🔍 Searching for: {query}")
         try:
-            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
+            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=30)
             print(f" → Found {len(results)} links for '{query}'.")
 
             for item in results:
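
One caveat on raising num_results from 10 to 30: the Google Custom Search JSON API caps num at 10 per request, so fetch_google_news has to page with the start parameter to actually return 30 links. A sketch of that pagination, assuming the function wraps the REST endpoint directly; the body below is an illustration, not the repo's code:

# Hypothetical pagination sketch for fetch_google_news (not the repo's code):
# the CSE API returns at most 10 items per call, so num_results=30 needs
# three pages via the `start` offset (1, 11, 21).
import requests

def fetch_google_news(query: str, api_key: str, cse_id: str, num_results: int = 30):
    results = []
    for start in range(1, num_results + 1, 10):
        resp = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": api_key,
                "cx": cse_id,
                "q": query,
                "num": min(10, num_results - len(results)),
                "start": start,
            },
            timeout=10,
        )
        resp.raise_for_status()
        items = resp.json().get("items", [])
        if not items:
            break
        results.extend(items)
    return results[:num_results]
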