raghavNCI committed
Commit 7fdb7c1 · 1 Parent(s): 5557673

headlines fix 1

Files changed (1): nuse_modules/headlines_generator.py (+52 -71)
nuse_modules/headlines_generator.py CHANGED

@@ -1,7 +1,10 @@
 # nuse_modules/headlines_generator.py
 from __future__ import annotations
 import datetime as _dt
-import json, os, re, time
+import json
+import os
+import re
+import time
 from typing import List, Dict, Optional
 
 import requests
@@ -9,96 +12,79 @@ import requests
 from clients.redis_client import redis_client as _r
 from models_initialization.mistral_registry import mistral_generate
 
-
 # ──────────────────────────────────────────────────────────────
 # CONFIG
 # ──────────────────────────────────────────────────────────────
 NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
-assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY is not set in env / Space secrets"
-
-# Newsdata supports these canonical categories:
-# 'world', 'business', 'science', 'technology', 'entertainment',
-# 'sports', 'environment', 'politics'
-_CATEGORIES = {
-    "world": "world",
-    "india": "world",  # use query filter for India
-    "finance": "business",
-    "sports": "sports",
-    "entertainment": "entertainment",
+assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY missing (add to Space secrets or .env)"
+
+# Pure-query strings we’ll pass via &q=
+_CATEGORIES: dict[str, str] = {
+    "world": "world news top stories",
+    "india": "india top headlines",
+    "finance": "business finance economy",
+    "sports": "sports news today",
+    "entertainment": "celebrity movies tv music",
 }
 
 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_REQUEST_TIMEOUT = 10
-
+_REQUEST_TIMEOUT = 10  # seconds
 
 # ──────────────────────────────────────────────────────────────
-# NEWSDATA FETCHER
+# NEWSDATA HELPER
 # ──────────────────────────────────────────────────────────────
 def _newsdata_url(
-    *,  # ← keyword-only for clarity
-    query: str | None = None,
-    category: str | None = None,
+    query: str,
     page: int = 0,
     language: str = "en",
     size: int = 25,
 ) -> str:
     """
-    Build the /latest endpoint URL.
-    You may supply *either* query OR category (not both).
+    Build a Newsdata /latest request that always uses q=.
     """
-    base = (
+    return (
         "https://newsdata.io/api/1/latest"
         f"?apikey={NEWSDATA_API_KEY}"
         f"&language={language}"
         f"&size={size}"
         f"&page={page}"
+        f"&q={requests.utils.quote(query)}"
     )
-    if query:
-        base += f"&q={query}"
-    elif category:
-        base += f"&category={category}"
-    return base
-
 
-
-def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
+def _fetch_articles(q: str, wanted: int) -> List[dict]:
     """
-    Fetch up to `wanted` articles for a given logical category (cat_key).
+    Fetch up to `wanted` unique articles for the query string `q`.
     """
     collected: List[dict] = []
-    seen_links = set()
+    seen_urls: set[str] = set()
     page = 0
 
-    while len(collected) < wanted and page < 5:  # safety cap
-        url = _newsdata_url(
-            category=category,
-            query="india" if cat_key == "india" else None,
-            page=page,
-        )
+    while len(collected) < wanted and page < 5:  # hard stop at 5 pages
+        url = _newsdata_url(query=q, page=page)
         try:
             res = requests.get(url, timeout=_REQUEST_TIMEOUT)
             res.raise_for_status()
             data = res.json()
         except Exception as e:
-            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page}): {e}")
+            print(f"[ERROR] Newsdata fetch failed ({q}, page {page}): {e}")
             break
 
         for item in data.get("results", []):
-            link = item.get("link")
-            if not link or link in seen_links:
+            url_link = item.get("link")
+            if not url_link or url_link in seen_urls:
                 continue
-            seen_links.add(link)
+            seen_urls.add(url_link)
 
             content = item.get("content") or item.get("full_description") or ""
-            if not content or len(content) < 300:
-                continue  # skip short / empty bodies
+            if len(content) < 300:
+                continue  # skip short or empty articles
 
             collected.append(
                 {
                     "title": item.get("title"),
-                    "url": link,
+                    "url": url_link,
                     "content": content,
                     "image": item.get("image_url"),
                     "source_snippet": item.get("description") or "",
@@ -109,67 +95,62 @@ def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
                 break
 
         if not data.get("nextPage"):
-            break  # no more pages
+            break
         page += 1
-        time.sleep(0.4)  # gentle throttle
-
+        time.sleep(0.4)  # gentle throttling
     return collected[:wanted]
 
-
 # ──────────────────────────────────────────────────────────────
 # SUMMARISER
 # ──────────────────────────────────────────────────────────────
-_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
+_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
 
-def _summarise_article(body: str) -> str:
+def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
         "in one sentence (<=25 words). Omit source and author names.\n\n"
-        f"ARTICLE:\n{body}"
+        f"ARTICLE:\n{text}"
     )
     raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
-    return _CLEAN_RE.sub("", raw).strip()
-
+    return _RE_PROMPT_ECHO.sub("", raw).strip()
 
 # ──────────────────────────────────────────────────────────────
-# REDIS KEY HELPERS
+# REDIS KEY
 # ──────────────────────────────────────────────────────────────
-def _redis_key(date: str, category: str) -> str:
-    return f"headlines:{date}:{category}"
-
+def _redis_key(date: str, cat: str) -> str:
+    return f"headlines:{date}:{cat}"
 
 # ──────────────────────────────────────────────────────────────
-# MAIN ENTRY POINT
+# MAIN ENTRY
 # ──────────────────────────────────────────────────────────────
 def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
     """
-    Fetches top articles per category via Newsdata.io, summarises them,
-    stores in Upstash Redis, and returns the payload for logging/tests.
+    Fetch, summarise, and cache today’s headlines for each category.
     """
     date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
-    all_output: Dict[str, List[dict]] = {}
+    all_results: Dict[str, List[dict]] = {}
 
-    for cat_key, newsdata_cat in _CATEGORIES.items():
-        print(f"[HEADLINES] {cat_key.title()} …")
-        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)
+    for cat, query in _CATEGORIES.items():
+        print(f"[HEADLINES] {cat.title()} …")
+        articles = _fetch_articles(query, _ARTICLES_PER_CAT)
 
         summaries: List[dict] = []
         for art in articles:
-            summary = _summarise_article(art["content"])
+            summary_txt = _summarise(art["content"])
             summaries.append(
                 {
                     "title": art["title"],
                     "url": art["url"],
-                    "summary": summary,
+                    "summary": summary_txt,
                     "source_snippet": art["source_snippet"],
                     "image": art["image"],
                     "pubDate": art["pubDate"],
                 }
             )
 
-        redis_key = _redis_key(date_str, cat_key)
+        redis_key = _redis_key(date_str, cat)
         _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
-        all_output[cat_key] = summaries
-        print(f"  ↳ stored {len(summaries)} items in Redis ({redis_key})")
+        all_results[cat] = summaries
+        print(f"  ↳ stored {len(summaries)} items → {redis_key}")
 
-    return all_output
+    return all_results
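
To exercise the new query-only flow end to end, a small smoke-test script works well. The sketch below is hypothetical (it does not ship with this commit) and assumes the Space's packages (`nuse_modules`, `clients`) are importable and that `NEWSDATA_API_KEY` and the Redis credentials are set in the environment:

```python
# smoke_test_headlines.py - hypothetical helper, not part of this commit.
# Assumes NEWSDATA_API_KEY and Redis credentials are configured, and that
# the Space's packages (nuse_modules, clients) are on PYTHONPATH.
import datetime as dt
import json

from clients.redis_client import redis_client
from nuse_modules.headlines_generator import generate_and_store_headlines

def main() -> None:
    # Run the full fetch -> summarise -> cache pipeline once.
    results = generate_and_store_headlines()

    # Each category should now be cached under headlines:<date>:<cat>.
    today = dt.datetime.utcnow().strftime("%Y-%m-%d")
    for cat in results:
        cached = redis_client.get(f"headlines:{today}:{cat}")
        items = json.loads(cached) if cached else []
        print(f"{cat}: {len(items)} cached summaries")
        if items:  # print one summary as a sanity check
            print("  sample:", items[0]["summary"][:80])

if __name__ == "__main__":
    main()
```

One detail worth noting on the refactor: `requests.utils.quote` is the `urllib.parse.quote` re-export, so the multi-word query strings in `_CATEGORIES` arrive percent-encoded (e.g. `india top headlines` is sent as `india%20top%20headlines`), whereas the old code interpolated the query into the URL unescaped.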