raghavNCI committed
Commit a37ba23 · 1 Parent(s): 7fdb7c1

using rss instead

Files changed (1)
  1. nuse_modules/headlines_generator.py +77 -87
nuse_modules/headlines_generator.py CHANGED
@@ -1,104 +1,96 @@
-# nuse_modules/headlines_generator.py
 from __future__ import annotations
 import datetime as _dt
 import json
 import os
 import re
 import time
-from typing import List, Dict, Optional
+from typing import List, Dict

 import requests
+import feedparser
+from boilerpy3 import extractors

 from clients.redis_client import redis_client as _r
 from models_initialization.mistral_registry import mistral_generate

 # ──────────────────────────────────────────────────────────────
-# CONFIG
+# CONFIG (Google News RSS, no external API keys needed)
 # ──────────────────────────────────────────────────────────────
-NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
-assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY missing (add to Space secrets or .env)"
-
-# Pure-query strings we’ll pass via &q=
+# Query strings passed into Google News RSS search feed
 _CATEGORIES: dict[str, str] = {
-    "world": "world news top stories",
-    "india": "india top headlines",
-    "finance": "business finance economy",
-    "sports": "sports news today",
-    "entertainment": "celebrity movies tv music",
+    "world": "world news",
+    "india": "india top stories",
+    "finance": "finance business economy",
+    "sports": "sports headlines",
+    "entertainment": "entertainment celebrity movies tv",
 }

 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_REQUEST_TIMEOUT = 10  # seconds
+_RSS_TIMEOUT = 10  # seconds
+_ARTICLE_TIMEOUT = 10  # seconds

-# ──────────────────────────────────────────────────────────────
-# NEWSDATA HELPER
-# ──────────────────────────────────────────────────────────────
-def _newsdata_url(
-    query: str,
-    page: int = 0,
-    language: str = "en",
-    size: int = 25,
-) -> str:
-    """
-    Build a Newsdata /latest request that always uses q=.
-    """
+# Google News RSS search template
+def _rss_url(query: str) -> str:
+    query = requests.utils.quote(query)
     return (
-        "https://newsdata.io/api/1/latest"
-        f"?apikey={NEWSDATA_API_KEY}"
-        f"&language={language}"
-        f"&size={size}"
-        f"&page={page}"
-        f"&q={requests.utils.quote(query)}"
+        "https://news.google.com/rss/search?q=" + query +
+        "&hl=en-US&gl=US&ceid=US:en"
     )

-def _fetch_articles(q: str, wanted: int) -> List[dict]:
-    """
-    Fetch up to `wanted` unique articles for the query string `q`.
-    """
+# BoilerPy3 extractor (thread-safe singleton)
+_bp_extractor = extractors.ArticleExtractor()
+
+# ──────────────────────────────────────────────────────────────
+# FETCH RSS + ARTICLE BODY
+# ──────────────────────────────────────────────────────────────
+
+def _extract_fulltext(url: str) -> str:
+    try:
+        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
+        text = _bp_extractor.get_content(html)
+        return text or ""
+    except Exception as e:
+        print(f"[SCRAPE ERR] {url}: {e}")
+        return ""
+
+
+def _fetch_articles(query: str, wanted: int) -> List[dict]:
+    feed_url = _rss_url(query)
+    try:
+        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
+    except Exception as e:
+        print(f"[RSS ERR] {query}: {e}")
+        return []
+
     collected: List[dict] = []
-    seen_urls: set[str] = set()
-    page = 0
-
-    while len(collected) < wanted and page < 5:  # hard stop at 5 pages
-        url = _newsdata_url(query=q, page=page)
-        try:
-            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
-            res.raise_for_status()
-            data = res.json()
-        except Exception as e:
-            print(f"[ERROR] Newsdata fetch failed ({q}, page {page}): {e}")
+    seen_links: set[str] = set()
+
+    for entry in feed.entries:
+        link = entry.link
+        if link in seen_links:
+            continue
+        seen_links.add(link)
+
+        body = _extract_fulltext(link)
+        if len(body) < 300:
+            continue  # skip trivial pages/homepages
+
+        collected.append(
+            {
+                "title": entry.title,
+                "url": link,
+                "content": body,
+                "pubDate": entry.get("published", ""),
+                "image": None,  # RSS search feed rarely returns image; can scrape OG tag later
+                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+            }
+        )
+        if len(collected) >= wanted:
             break

-        for item in data.get("results", []):
-            url_link = item.get("link")
-            if not url_link or url_link in seen_urls:
-                continue
-            seen_urls.add(url_link)
-
-            content = item.get("content") or item.get("full_description") or ""
-            if len(content) < 300:
-                continue  # skip short or empty articles
-
-            collected.append(
-                {
-                    "title": item.get("title"),
-                    "url": url_link,
-                    "content": content,
-                    "image": item.get("image_url"),
-                    "source_snippet": item.get("description") or "",
-                    "pubDate": item.get("pubDate"),
-                }
-            )
-            if len(collected) >= wanted:
-                break
-
-        if not data.get("nextPage"):
-            break
-        page += 1
-        time.sleep(0.4)  # gentle throttling
-    return collected[:wanted]
+    return collected

 # ──────────────────────────────────────────────────────────────
 # SUMMARISER
@@ -117,16 +109,16 @@ def _summarise(text: str) -> str:
 # ──────────────────────────────────────────────────────────────
 # REDIS KEY
 # ──────────────────────────────────────────────────────────────
+
 def _redis_key(date: str, cat: str) -> str:
     return f"headlines:{date}:{cat}"

 # ──────────────────────────────────────────────────────────────
 # MAIN ENTRY
 # ──────────────────────────────────────────────────────────────
+
 def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
-    """
-    Fetch, summarise, and cache today’s headlines for each category.
-    """
+    """Fetches, summarises, and caches headlines via Google News RSS."""
     date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
     all_results: Dict[str, List[dict]] = {}

@@ -137,16 +129,14 @@ def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dic
         summaries: List[dict] = []
         for art in articles:
             summary_txt = _summarise(art["content"])
-            summaries.append(
-                {
-                    "title": art["title"],
-                    "url": art["url"],
-                    "summary": summary_txt,
-                    "source_snippet": art["source_snippet"],
-                    "image": art["image"],
-                    "pubDate": art["pubDate"],
-                }
-            )
+            summaries.append({
+                "title": art["title"],
+                "url": art["url"],
+                "summary": summary_txt,
+                "source_snippet": art["source_snippet"],
+                "image": art["image"],
+                "pubDate": art["pubDate"],
+            })

         redis_key = _redis_key(date_str, cat)
         _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
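
For reference, a minimal sketch of how the reworked module might be exercised end to end, assuming the Space's existing clients.redis_client wiring; the choice of the "world" category and the manual read-back from Redis are illustrative only and are not part of this commit.

# Usage sketch (illustrative, not part of the commit): run the generator once,
# then read one category back from the Redis cache it populates.
import datetime as dt
import json

from clients.redis_client import redis_client
from nuse_modules.headlines_generator import generate_and_store_headlines

results = generate_and_store_headlines()                # fetch, summarise, cache every category
today = dt.datetime.utcnow().strftime("%Y-%m-%d")

cached = redis_client.get(f"headlines:{today}:world")   # key layout from _redis_key()
if cached:
    for item in json.loads(cached):                     # stored as a JSON list of summary dicts
        print(item["title"], "->", item["url"])

Since the RSS route needs no NEWSDATA_API_KEY, the only external requirements on this path are network access to news.google.com and the feedparser and boilerpy3 packages.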