raghavNCI
committed on
Commit
Β·
b2bd47e
1
Parent(s):
588f923
switching to newsdata
Browse files- nuse_modules/headlines_generator.py +126 -71
nuse_modules/headlines_generator.py
CHANGED
@@ -1,113 +1,168 @@
|
|
|
|
1 |
from __future__ import annotations
|
2 |
import datetime as _dt
|
3 |
-
import json, os
|
4 |
-
import
|
5 |
-
|
|
|
6 |
|
7 |
from clients.redis_client import redis_client as _r
|
8 |
-
from nuse_modules.google_search import search_google_news
|
9 |
from models_initialization.mistral_registry import mistral_generate
|
10 |
|
11 |
|
12 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
13 |
-
#
|
14 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
_CATEGORIES = {
|
16 |
-
"world": "world
|
17 |
-
"india": "
|
18 |
-
"finance": "business
|
19 |
-
"sports": "sports
|
20 |
-
"entertainment": "entertainment
|
21 |
}
|
22 |
|
23 |
_ARTICLES_PER_CAT = 5
|
24 |
_SUMMARY_TOKENS = 120
|
25 |
_REDIS_TTL_SECONDS = 24 * 3600
|
|
|
26 |
|
27 |
|
28 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
29 |
-
#
|
30 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
31 |
-
def
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
"""
|
42 |
-
|
43 |
"""
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
prompt = (
|
53 |
"You are a concise news assistant. Summarise the following article "
|
54 |
"in one sentence (<=25 words). Omit source and author names.\n\n"
|
55 |
-
"
|
56 |
-
f"ARTICLE:\n{content}"
|
57 |
)
|
58 |
-
|
59 |
-
|
60 |
-
# Remove repeated prompt instructions if echoed back
|
61 |
-
cleaned = re.sub(r"(you are.*?article[:\n]+)", "", raw_output, flags=re.IGNORECASE | re.DOTALL).strip()
|
62 |
-
return cleaned
|
63 |
|
64 |
|
|
|
|
|
|
|
65 |
def _redis_key(date: str, category: str) -> str:
|
66 |
return f"headlines:{date}:{category}"
|
67 |
|
68 |
|
69 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
70 |
-
#
|
71 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
72 |
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
|
73 |
"""
|
74 |
-
Fetches top articles per category, summarises them,
|
75 |
-
and returns the
|
76 |
"""
|
77 |
-
date_str
|
78 |
-
all_output = {}
|
79 |
-
|
80 |
-
for
|
81 |
-
print(f"[HEADLINES] {
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
summaries.append({
|
100 |
-
"title": item.get("title"),
|
101 |
-
"url": item.get("link"),
|
102 |
-
"summary": summary,
|
103 |
-
"source_snippet": item.get("snippet"),
|
104 |
-
"image": item.get("image"), # added in google_search.py
|
105 |
-
})
|
106 |
-
|
107 |
-
redis_key = _redis_key(date_str, cat)
|
108 |
_r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
|
109 |
-
|
110 |
-
all_output[cat] = summaries
|
111 |
print(f" β³ stored {len(summaries)} items in Redis ({redis_key})")
|
112 |
|
113 |
return all_output
|
|
|
1 |
+
# nuse_modules/headlines_generator.py
|
2 |
from __future__ import annotations
|
3 |
import datetime as _dt
|
4 |
+
import json, os, re, time
|
5 |
+
from typing import List, Dict, Optional
|
6 |
+
|
7 |
+
import requests
|
8 |
|
9 |
from clients.redis_client import redis_client as _r
|
|
|
10 |
from models_initialization.mistral_registry import mistral_generate
|
11 |
|
12 |
|
13 |
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
if not NEWSDATA_API_KEY:
    # `assert` is stripped when Python runs with -O, silently disabling this
    # check — raise explicitly so a missing key always fails fast at import.
    raise RuntimeError("❌ NEWSDATA_API_KEY is not set in env / Space secrets")

# Newsdata supports these canonical categories:
# 'world', 'business', 'science', 'technology', 'entertainment',
# 'sports', 'environment', 'politics'
# Maps our logical category keys -> Newsdata canonical category names.
_CATEGORIES = {
    "world": "world",
    "india": "world",  # use query filter for India
    "finance": "business",
    "sports": "sports",
    "entertainment": "entertainment",
}

_ARTICLES_PER_CAT = 5            # articles summarised per category
_SUMMARY_TOKENS = 120            # max new tokens for each LLM summary
_REDIS_TTL_SECONDS = 24 * 3600   # cached headlines expire after one day
_REQUEST_TIMEOUT = 10            # seconds per Newsdata HTTP request
|
34 |
|
35 |
|
36 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
37 |
+
# NEWSDATA FETCHER
|
38 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
39 |
+
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    """Build a Newsdata.io `news` endpoint URL.

    Args:
        category: Canonical Newsdata category (e.g. ``"world"``).
        query: Optional free-text filter appended as ``&q=``; URL-encoded here
            so spaces/unicode cannot produce a malformed request.
        page: Pagination value forwarded as ``&page=``.
            NOTE(review): the API's ``nextPage`` token is a string; callers may
            pass that token here as well — it is formatted verbatim.
        language: ISO language code filter.
        size: Number of articles requested per page.

    Returns:
        The fully-qualified request URL, including the API key.
    """
    # Local import keeps the module's top-level import surface unchanged.
    from urllib.parse import quote_plus

    base = (
        "https://newsdata.io/api/1/news"
        f"?apikey={NEWSDATA_API_KEY}"
        f"&language={language}"
        f"&category={category}"
        f"&size={size}"
        f"&page={page}"
    )
    if query:
        # Encode the query: the original interpolated it raw, which breaks
        # on multi-word or non-ASCII queries.
        base += f"&q={quote_plus(query)}"
    return base
57 |
+
|
58 |
+
|
59 |
+
def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).

    Pages through the Newsdata.io API (max 5 requests), de-duplicates by
    link, and drops items whose body is missing or under 300 characters.

    Args:
        cat_key: Our logical category key (e.g. "india") — drives the query filter.
        category: Canonical Newsdata category passed to the API.
        wanted: Maximum number of articles to return.

    Returns:
        List of normalised article dicts (title/url/content/image/
        source_snippet/pubDate), at most `wanted` long; shorter on API errors.
    """
    collected: List[dict] = []
    seen_links = set()
    next_page = 0   # 0 for the first request; thereafter the API's nextPage token
    pages = 0       # requests issued so far

    while len(collected) < wanted and pages < 5:  # safety cap on API calls
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            page=next_page,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:
            # Best-effort: log and return whatever we already collected.
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {pages}): {e}")
            break

        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue  # skip duplicates and items without a canonical URL
            seen_links.add(link)

            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies — too little to summarise

            collected.append(
                {
                    "title": item.get("title"),
                    "url": link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        # FIX: Newsdata paginates with an opaque `nextPage` token, not an
        # incrementing integer — feed the token back on the next request
        # (the original sent page=0,1,2…, which the API does not honour).
        next_page = data.get("nextPage")
        if not next_page:
            break  # no more pages
        pages += 1
        time.sleep(0.4)  # gentle throttle to stay under rate limits

    return collected[:wanted]
|
110 |
+
|
111 |
+
|
112 |
+
# ──────────────────────────────────────────────────────────────
# SUMMARISER
# ──────────────────────────────────────────────────────────────
# Strips the instruction preamble if the model echoes it back verbatim.
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)


def _summarise_article(body: str) -> str:
    """Return a one-sentence LLM summary of *body*, with any echoed prompt removed."""
    instructions = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
    )
    generated = mistral_generate(
        instructions + f"ARTICLE:\n{body}",
        max_new_tokens=_SUMMARY_TOKENS,
        temperature=0.3,
    )
    stripped = _CLEAN_RE.sub("", generated)
    return stripped.strip()
|
|
|
|
|
|
|
125 |
|
126 |
|
127 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
128 |
+
# REDIS KEY HELPERS
|
129 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
130 |
def _redis_key(date: str, category: str) -> str:
|
131 |
return f"headlines:{date}:{category}"
|
132 |
|
133 |
|
134 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
135 |
+
# MAIN ENTRY POINT
|
136 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
137 |
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.

    Args:
        today: Optional ``YYYY-MM-DD`` date string for the cache key;
            defaults to the current UTC date.

    Returns:
        Mapping of logical category -> list of summary dicts
        (title / url / summary / source_snippet / image / pubDate).
    """
    # FIX: datetime.utcnow() is deprecated (and naive); use an aware UTC now.
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summary = _summarise_article(art["content"])
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": summary,
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )

        # Cache per (date, category); entries expire after _REDIS_TTL_SECONDS.
        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output
|