raghavNCI committed
Commit 7fdb7c1 · Parent(s): 5557673
headlines fix 1

nuse_modules/headlines_generator.py CHANGED
@@ -1,7 +1,10 @@
 # nuse_modules/headlines_generator.py
 from __future__ import annotations
 import datetime as _dt
-import json
+import json
+import os
+import re
+import time
 from typing import List, Dict, Optional

 import requests
@@ -9,96 +12,79 @@ import requests
 from clients.redis_client import redis_client as _r
 from models_initialization.mistral_registry import mistral_generate

-
 # ──────────────────────────────────────────────────────────────
 # CONFIG
 # ──────────────────────────────────────────────────────────────
 NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
-assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY
-
-#
-
-
-
-    "
-    "
-    "
-    "sports": "sports",
-    "entertainment": "entertainment",
+assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY missing (add to Space secrets or .env)"
+
+# Pure-query strings we'll pass via &q=
+_CATEGORIES: dict[str, str] = {
+    "world": "world news top stories",
+    "india": "india top headlines",
+    "finance": "business finance economy",
+    "sports": "sports news today",
+    "entertainment": "celebrity movies tv music",
 }

 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_REQUEST_TIMEOUT = 10
-
+_REQUEST_TIMEOUT = 10  # seconds

 # ──────────────────────────────────────────────────────────────
-# NEWSDATA
+# NEWSDATA HELPER
 # ──────────────────────────────────────────────────────────────
 def _newsdata_url(
-
-    query: str | None = None,
-    category: str | None = None,
+    query: str,
     page: int = 0,
     language: str = "en",
     size: int = 25,
 ) -> str:
     """
-    Build
-    You may supply *either* query OR category (not both).
+    Build a Newsdata /latest request that always uses q=.
     """
-    base = (
+    return (
         "https://newsdata.io/api/1/latest"
         f"?apikey={NEWSDATA_API_KEY}"
         f"&language={language}"
         f"&size={size}"
         f"&page={page}"
+        f"&q={requests.utils.quote(query)}"
     )
-    if query:
-        base += f"&q={query}"
-    elif category:
-        base += f"&category={category}"
-    return base
-
-
-def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
+
+def _fetch_articles(q: str, wanted: int) -> List[dict]:
     """
-    Fetch up to `wanted` articles for
+    Fetch up to `wanted` unique articles for the query string `q`.
     """
     collected: List[dict] = []
-
+    seen_urls: set[str] = set()
     page = 0

-    while len(collected) < wanted and page < 5:  #
-        url = _newsdata_url(
-            category=category,
-            query="india" if cat_key == "india" else None,
-            page=page,
-        )
+    while len(collected) < wanted and page < 5:  # hard stop at 5 pages
+        url = _newsdata_url(query=q, page=page)
         try:
             res = requests.get(url, timeout=_REQUEST_TIMEOUT)
             res.raise_for_status()
             data = res.json()
         except Exception as e:
-            print(f"[ERROR] Newsdata fetch failed ({
+            print(f"[ERROR] Newsdata fetch failed ({q}, page {page}): {e}")
             break

         for item in data.get("results", []):
-
-            if not
+            url_link = item.get("link")
+            if not url_link or url_link in seen_urls:
                 continue
-
+            seen_urls.add(url_link)

             content = item.get("content") or item.get("full_description") or ""
-            if
-                continue  # skip short
+            if len(content) < 300:
+                continue  # skip short or empty articles

             collected.append(
                 {
                     "title": item.get("title"),
-                    "url":
+                    "url": url_link,
                     "content": content,
                     "image": item.get("image_url"),
                     "source_snippet": item.get("description") or "",
@@ -109,67 +95,62 @@ def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
             break

         if not data.get("nextPage"):
-            break
+            break
         page += 1
-        time.sleep(0.4)  # gentle
-
+        time.sleep(0.4)  # gentle throttling
     return collected[:wanted]

-
 # ──────────────────────────────────────────────────────────────
 # SUMMARISER
 # ──────────────────────────────────────────────────────────────
-
+_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

-def
+def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
         "in one sentence (<=25 words). Omit source and author names.\n\n"
-        f"ARTICLE:\n{
+        f"ARTICLE:\n{text}"
    )
     raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
-    return
-
+    return _RE_PROMPT_ECHO.sub("", raw).strip()

 # ──────────────────────────────────────────────────────────────
-# REDIS KEY
+# REDIS KEY
 # ──────────────────────────────────────────────────────────────
-def _redis_key(date: str,
-    return f"headlines:{date}:{
-
+def _redis_key(date: str, cat: str) -> str:
+    return f"headlines:{date}:{cat}"

 # ──────────────────────────────────────────────────────────────
-# MAIN ENTRY
+# MAIN ENTRY
 # ──────────────────────────────────────────────────────────────
 def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
     """
-
-    stores in Upstash Redis, and returns the payload for logging/tests.
+    Fetch, summarise, and cache today's headlines for each category.
     """
     date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
-
+    all_results: Dict[str, List[dict]] = {}

-    for
-        print(f"[HEADLINES] {
-        articles =
+    for cat, query in _CATEGORIES.items():
+        print(f"[HEADLINES] {cat.title()} …")
+        articles = _fetch_articles(query, _ARTICLES_PER_CAT)

         summaries: List[dict] = []
         for art in articles:
-
+            summary_txt = _summarise(art["content"])
             summaries.append(
                 {
                     "title": art["title"],
                     "url": art["url"],
-                    "summary":
+                    "summary": summary_txt,
                     "source_snippet": art["source_snippet"],
                     "image": art["image"],
                     "pubDate": art["pubDate"],
                 }
             )

-        redis_key = _redis_key(date_str,
+        redis_key = _redis_key(date_str, cat)
         _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
-
-        print(f" ↳ stored {len(summaries)} items
+        all_results[cat] = summaries
+        print(f" ↳ stored {len(summaries)} items → {redis_key}")

-    return
+    return all_results
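
For anyone consuming this cache elsewhere in the Space, a minimal read-back sketch (not part of this commit): it assumes the same clients.redis_client and the key scheme headlines:{date}:{cat} used above; the helper name get_cached_headlines is hypothetical.

import datetime as _dt
import json

from clients.redis_client import redis_client as _r


def get_cached_headlines(cat: str, date: str | None = None) -> list[dict]:
    """Return the summaries cached for `cat`, or [] once the 24 h TTL lapses."""
    date_str = date or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    raw = _r.get(f"headlines:{date_str}:{cat}")  # hypothetical read-back helper
    return json.loads(raw) if raw else []


# e.g. print today's cached world summaries
for item in get_cached_headlines("world"):
    print(item["summary"], "-", item["url"])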
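
The new _RE_PROMPT_ECHO guard appears to exist because instruct models sometimes echo the prompt before answering. A quick self-contained check of what it strips (the article text below is made up):

import re

_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

raw = (
    "You are a concise news assistant. Summarise the following article "
    "in one sentence.\n\nARTICLE:\nMarkets rallied on Friday."
)
# The lazy match runs from "You are" through "ARTICLE:\n", leaving only the answer.
assert _RE_PROMPT_ECHO.sub("", raw).strip() == "Markets rallied on Friday."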