raghavNCI
committed on
Commit · a37ba23
1 Parent(s): 7fdb7c1
using rss instead
nuse_modules/headlines_generator.py
CHANGED
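The commit swaps the Newsdata API for the Google News RSS search feed. For context, here is a rough sketch (not part of the diff, assuming feedparser and requests are installed) of querying the feed that the new _rss_url helper targets; title, link, and published are the entry fields the new _fetch_articles reads:

# Sketch only: same URL shape and libraries as the new module below.
import feedparser
import requests

query = requests.utils.quote("world news")
feed_url = "https://news.google.com/rss/search?q=" + query + "&hl=en-US&gl=US&ceid=US:en"

feed = feedparser.parse(feed_url)
for entry in feed.entries[:3]:
    # Each entry carries the fields _fetch_articles() consumes: title, link, published.
    print(entry.title, entry.link, entry.get("published", ""))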
@@ -1,104 +1,96 @@
- # nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
- from typing import List, Dict

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
- # CONFIG
# ──────────────────────────────────────────────────────────────
-
- assert NEWSDATA_API_KEY, "NEWSDATA_API_KEY missing (add to Space secrets or .env)"
-
- # Pure-query strings we'll pass via &q=
_CATEGORIES: dict[str, str] = {
-     "world": "world news
-     "india": "india top
-     "finance": "business
-     "sports": "sports
-     "entertainment": "celebrity movies tv
}

_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
-

- #
-
-
- def _newsdata_url(
-     query: str,
-     page: int = 0,
-     language: str = "en",
-     size: int = 25,
- ) -> str:
-     """
-     Build a Newsdata /latest request that always uses q=.
-     """
    return (
-         "https://
-
-         f"&language={language}"
-         f"&size={size}"
-         f"&page={page}"
-         f"&q={requests.utils.quote(query)}"
    )

-
-
-
-
    collected: List[dict] = []
-
-
-
-
-
-
-
-
-
-
            break

-
-         url_link = item.get("link")
-         if not url_link or url_link in seen_urls:
-             continue
-         seen_urls.add(url_link)
-
-         content = item.get("content") or item.get("full_description") or ""
-         if len(content) < 300:
-             continue  # skip short or empty articles
-
-         collected.append(
-             {
-                 "title": item.get("title"),
-                 "url": url_link,
-                 "content": content,
-                 "image": item.get("image_url"),
-                 "source_snippet": item.get("description") or "",
-                 "pubDate": item.get("pubDate"),
-             }
-         )
-         if len(collected) >= wanted:
-             break
-
-         if not data.get("nextPage"):
-             break
-         page += 1
-         time.sleep(0.4)  # gentle throttling
-     return collected[:wanted]

# ──────────────────────────────────────────────────────────────
# SUMMARISER
@@ -117,16 +109,16 @@ def _summarise(text: str) -> str:
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
-     """
-     Fetch, summarise, and cache today's headlines for each category.
-     """
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

@@ -137,16 +129,14 @@ def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
-             summaries.append(
-
-
-
-
-
-
-
-                 }
-             )

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)

from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
+ from typing import List, Dict

import requests
+ import feedparser
+ from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
+ # CONFIG (Google News RSS, no external API keys needed)
# ──────────────────────────────────────────────────────────────
+ # Query strings passed into Google News RSS search feed
_CATEGORIES: dict[str, str] = {
+     "world": "world news",
+     "india": "india top stories",
+     "finance": "finance business economy",
+     "sports": "sports headlines",
+     "entertainment": "entertainment celebrity movies tv",
}

_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
+ _RSS_TIMEOUT = 10  # seconds
+ _ARTICLE_TIMEOUT = 10  # seconds

+ # Google News RSS search template
+ def _rss_url(query: str) -> str:
+     query = requests.utils.quote(query)
    return (
+         "https://news.google.com/rss/search?q=" + query +
+         "&hl=en-US&gl=US&ceid=US:en"
    )

+ # BoilerPy3 extractor (thread-safe singleton)
+ _bp_extractor = extractors.ArticleExtractor()
+
+ # ──────────────────────────────────────────────────────────────
+ # FETCH RSS + ARTICLE BODY
+ # ──────────────────────────────────────────────────────────────
+
+ def _extract_fulltext(url: str) -> str:
+     try:
+         html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
+         text = _bp_extractor.get_content(html)
+         return text or ""
+     except Exception as e:
+         print(f"[SCRAPE ERR] {url}: {e}")
+         return ""
+
+
+ def _fetch_articles(query: str, wanted: int) -> List[dict]:
+     feed_url = _rss_url(query)
+     try:
+         feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
+     except Exception as e:
+         print(f"[RSS ERR] {query}: {e}")
+         return []
+
    collected: List[dict] = []
+     seen_links: set[str] = set()
+
+     for entry in feed.entries:
+         link = entry.link
+         if link in seen_links:
+             continue
+         seen_links.add(link)
+
+         body = _extract_fulltext(link)
+         if len(body) < 300:
+             continue  # skip trivial pages/homepages
+
+         collected.append(
+             {
+                 "title": entry.title,
+                 "url": link,
+                 "content": body,
+                 "pubDate": entry.get("published", ""),
+                 "image": None,  # RSS search feed rarely returns an image; can scrape the OG tag later
+                 "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+             }
+         )
+         if len(collected) >= wanted:
            break

+     return collected

# ──────────────────────────────────────────────────────────────
# SUMMARISER

# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
+
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
+
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
+     """Fetches, summarises, and caches headlines via Google News RSS."""
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
+             summaries.append({
+                 "title": art["title"],
+                 "url": art["url"],
+                 "summary": summary_txt,
+                 "source_snippet": art["source_snippet"],
+                 "image": art["image"],
+                 "pubDate": art["pubDate"],
+             })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
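After this change no API key is needed; a caller only needs the module and a configured Redis client. A minimal usage sketch, assuming the package is importable as nuse_modules.headlines_generator and Redis is reachable through clients.redis_client (the example date and the import path are illustrative, not part of the commit): generate_and_store_headlines() writes one JSON list per category under headlines:<date>:<category> with a 24-hour TTL, which can be read back like this:

# Sketch only: read back the cached summaries written by the new module.
import datetime as dt
import json

from clients.redis_client import redis_client
from nuse_modules.headlines_generator import generate_and_store_headlines, _redis_key

generate_and_store_headlines()  # fetch, summarise, and cache today's headlines

today = dt.datetime.utcnow().strftime("%Y-%m-%d")
raw = redis_client.get(_redis_key(today, "world"))  # keys look like "headlines:<date>:world"
if raw is not None:
    summaries = json.loads(raw)  # list of dicts: title, url, summary, source_snippet, image, pubDate
    for item in summaries:
        print(item["title"], "->", item["url"])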