raghavNCI committed
Commit 2049c5d · 1 Parent(s): 929da00

headlines fix 3
nuse_modules/headlines_generator.py CHANGED
@@ -16,7 +16,6 @@ from models_initialization.mistral_registry import mistral_generate
 # ──────────────────────────────────────────────────────────────
 # CONFIG (Google News RSS, no external API keys needed)
 # ──────────────────────────────────────────────────────────────
-# Query strings passed into Google News RSS search feed
 _CATEGORIES: dict[str, str] = {
     "world": "world news",
     "india": "india top stories",
@@ -28,10 +27,12 @@ _CATEGORIES: dict[str, str] = {
 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_RSS_TIMEOUT = 10 # seconds
-_ARTICLE_TIMEOUT = 10 # seconds
+_RSS_TIMEOUT = 10  # seconds
+_ARTICLE_TIMEOUT = 10  # seconds
+_MIN_BODY_LENGTH = 120  # relaxed threshold so short briefs pass
 
 # Google News RSS search template
+
 def _rss_url(query: str) -> str:
     query = requests.utils.quote(query)
     return (
@@ -42,13 +43,35 @@ def _rss_url(query: str) -> str:
 # BoilerPy3 extractor (thread-safe singleton)
 _bp_extractor = extractors.ArticleExtractor()
 
+# Common browser UA to avoid 403s
+_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114 Safari/537.36"
+    )
+}
+
 # ──────────────────────────────────────────────────────────────
 # FETCH RSS + ARTICLE BODY
 # ──────────────────────────────────────────────────────────────
 
+def _follow_google_redirect(html: str) -> str | None:
+    """Extract the real URL from a Google News redirect HTML page."""
+    match = re.search(r'url=(https?[^"\']+)', html, flags=re.I)
+    return match.group(1) if match else None
+
+
 def _extract_fulltext(url: str) -> str:
     try:
-        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
+        resp = requests.get(url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT, allow_redirects=True)
+        html = resp.text
+
+        # If still on news.google.com and meta refresh present → follow manually
+        if "news.google.com" in resp.url and "http-equiv=\"refresh\"" in html.lower():
+            real_url = _follow_google_redirect(html)
+            if real_url:
+                html = requests.get(real_url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT).text
+
         text = _bp_extractor.get_content(html)
         return text or ""
     except Exception as e:
@@ -59,7 +82,7 @@ def _extract_fulltext(url: str) -> str:
 def _fetch_articles(query: str, wanted: int) -> List[dict]:
     feed_url = _rss_url(query)
     try:
-        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
+        feed = feedparser.parse(feed_url, request_headers=_HEADERS)
     except Exception as e:
         print(f"[RSS ERR] {query}: {e}")
         return []
@@ -74,19 +97,17 @@ def _fetch_articles(query: str, wanted: int) -> List[dict]:
             seen_links.add(link)
 
             body = _extract_fulltext(link)
-            if len(body) < 300:
-                continue  # skip trivial pages/homepages
-
-            collected.append(
-                {
-                    "title": entry.title,
-                    "url": link,
-                    "content": body,
-                    "pubDate": entry.get("published", ""),
-                    "image": None,  # RSS search feed rarely returns image; can scrape OG tag later
-                    "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
-                }
-            )
+            if len(body) < _MIN_BODY_LENGTH:
+                continue  # skip very short pages/homepages
+
+            collected.append({
+                "title": entry.title,
+                "url": link,
+                "content": body,
+                "pubDate": entry.get("published", ""),
+                "image": None,  # can scrape OG tag later
+                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+            })
             if len(collected) >= wanted:
                 break
 
@@ -97,6 +118,7 @@ def _fetch_articles(query: str, wanted: int) -> List[dict]:
 # ──────────────────────────────────────────────────────────────
 _RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
 
+
 def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
requirements.txt CHANGED
@@ -8,4 +8,6 @@ accelerate
 torch
 huggingface_hub
 boilerpy3==1.0.6
-feedparser
+feedparser
+newspaper3k
+nltk
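newspaper3k and nltk likely arrive together for a reason: newspaper3k's Article.nlp() (keyword and summary extraction) relies on NLTK's punkt tokenizer, which is not bundled with the pip package and must be downloaded once at runtime. A sketch of the presumed intended use; nothing in this commit wires newspaper3k in yet, and the URL is a placeholder:

    import nltk
    from newspaper import Article

    nltk.download("punkt")  # one-time fetch of the sentence tokenizer

    article = Article("https://example.com/story")  # placeholder URL
    article.download()  # fetch HTML
    article.parse()     # populates article.title, article.text, article.top_image
    article.nlp()       # populates article.keywords and article.summary

article.top_image would also be a ready-made way to fill the "image" field that the headlines module currently leaves as None.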