raghavNCI commited on
Commit
b2bd47e
·
1 Parent(s): 588f923

switching to newsdata

Browse files
Files changed (1) hide show
  1. nuse_modules/headlines_generator.py +126 -71
nuse_modules/headlines_generator.py CHANGED
@@ -1,113 +1,168 @@
 
1
  from __future__ import annotations
2
  import datetime as _dt
3
- import json, os
4
- import re
5
- from typing import List, Dict
 
6
 
7
  from clients.redis_client import redis_client as _r
8
- from nuse_modules.google_search import search_google_news
9
  from models_initialization.mistral_registry import mistral_generate
10
 
11
 
12
  # ──────────────────────────────────────────────────────────────
13
- # Config
14
  # ──────────────────────────────────────────────────────────────
 
 
 
 
 
 
15
  _CATEGORIES = {
16
- "world": "world news top stories",
17
- "india": "india top stories",
18
- "finance": "business finance economy today",
19
- "sports": "sports headlines today",
20
- "entertainment": "entertainment celebrity movie tv",
21
  }
22
 
23
  _ARTICLES_PER_CAT = 5
24
  _SUMMARY_TOKENS = 120
25
  _REDIS_TTL_SECONDS = 24 * 3600
 
26
 
27
 
28
  # ──────────────────────────────────────────────────────────────
29
- # Helpers
30
  # ──────────────────────────────────────────────────────────────
31
- def _dedupe_urls(articles: List[dict]) -> List[dict]:
32
- seen = set()
33
- out = []
34
- for art in articles:
35
- if art["link"] not in seen:
36
- seen.add(art["link"])
37
- out.append(art)
38
- return out
39
-
40
- def is_probably_article(url: str) -> bool:
 
 
 
 
 
 
 
 
 
 
 
41
  """
42
- Simple heuristic: filters out category pages, homepages, etc.
43
  """
44
- bad_patterns = [
45
- "/world", "/us", "/news", "/topics", "/home", "/video",
46
- "index.html", ".com/", ".org/", ".net/"
47
- ]
48
- return not any(url.rstrip("/").endswith(p.strip("/")) for p in bad_patterns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- def _summarise_article(content: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  prompt = (
53
  "You are a concise news assistant. Summarise the following article "
54
  "in one sentence (<=25 words). Omit source and author names.\n\n"
55
- "Some of these articles contain text which is not useful to the context so you can omit it."
56
- f"ARTICLE:\n{content}"
57
  )
58
- raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
59
-
60
- # Remove repeated prompt instructions if echoed back
61
- cleaned = re.sub(r"(you are.*?article[:\n]+)", "", raw_output, flags=re.IGNORECASE | re.DOTALL).strip()
62
- return cleaned
63
 
64
 
 
 
 
65
  def _redis_key(date: str, category: str) -> str:
66
  return f"headlines:{date}:{category}"
67
 
68
 
69
  # ──────────────────────────────────────────────────────────────
70
- # Main Generator
71
  # ──────────────────────────────────────────────────────────────
72
  def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
73
  """
74
- Fetches top articles per category, summarises them, stores in Redis,
75
- and returns the full payload (useful for logging / testing).
76
  """
77
- date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
78
- all_output = {}
79
-
80
- for cat, query in _CATEGORIES.items():
81
- print(f"[HEADLINES] {cat.title()} …")
82
-
83
- raw_articles = search_google_news([query], num_results=_ARTICLES_PER_CAT)
84
- raw_articles = _dedupe_urls(raw_articles)
85
-
86
- summaries = []
87
- for item in raw_articles:
88
-
89
- link = item.get("link")
90
-
91
- if not item.get("content"):
92
- continue
93
-
94
- if not is_probably_article(link):
95
- continue
96
-
97
- summary = _summarise_article(item["content"])
98
-
99
- summaries.append({
100
- "title": item.get("title"),
101
- "url": item.get("link"),
102
- "summary": summary,
103
- "source_snippet": item.get("snippet"),
104
- "image": item.get("image"), # added in google_search.py
105
- })
106
-
107
- redis_key = _redis_key(date_str, cat)
108
  _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
109
-
110
- all_output[cat] = summaries
111
  print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")
112
 
113
  return all_output
 
1
# nuse_modules/headlines_generator.py
from __future__ import annotations

import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
11
 
12
 
13
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
if not NEWSDATA_API_KEY:
    # Fail fast at import time. A bare `assert` is stripped under
    # `python -O`, which would silently send requests with apikey=None.
    raise RuntimeError("❌ NEWSDATA_API_KEY is not set in env / Space secrets")

# Newsdata supports these canonical categories:
# 'world', 'business', 'science', 'technology', 'entertainment',
# 'sports', 'environment', 'politics'
# Maps our logical category keys -> Newsdata canonical categories.
_CATEGORIES = {
    "world": "world",
    "india": "world",  # use query filter for India
    "finance": "business",
    "sports": "sports",
    "entertainment": "entertainment",
}

_ARTICLES_PER_CAT = 5            # articles kept per category
_SUMMARY_TOKENS = 120            # max new tokens per summary generation
_REDIS_TTL_SECONDS = 24 * 3600   # cached headlines expire after one day
_REQUEST_TIMEOUT = 10            # seconds per Newsdata HTTP request
34
 
35
 
36
  # ──────────────────────────────────────────────────────────────
37
+ # NEWSDATA FETCHER
38
  # ──────────────────────────────────────────────────────────────
39
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int | str = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    """
    Build a Newsdata.io /news request URL.

    Args:
        category: canonical Newsdata category (e.g. 'world', 'business').
        query: optional free-text ``q`` filter.
        page: page selector; Newsdata's ``nextPage`` token (str) is also accepted.
        language: ISO language code.
        size: articles per page (max is plan-dependent — TODO confirm).

    Returns:
        Fully percent-encoded request URL.
    """
    params = {
        "apikey": NEWSDATA_API_KEY,
        "language": language,
        "category": category,
        "size": size,
        "page": page,
    }
    if query:
        params["q"] = query
    # urlencode percent-escapes every value; the previous raw f-string
    # interpolation produced invalid/ambiguous URLs for queries containing
    # spaces, '&' or '#'.
    return "https://newsdata.io/api/1/news?" + urlencode(params)
57
+
58
+
59
def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).

    Pages through the Newsdata.io feed, deduplicating links and skipping
    items without a usable body. Network/HTTP failures abort the loop and
    return whatever was collected so far (best effort).
    """
    collected: List[dict] = []
    seen_links = set()
    next_page: Optional[str] = None  # Newsdata's opaque pagination token
    pages_fetched = 0

    while len(collected) < wanted and pages_fetched < 5:  # safety cap
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            # BUG FIX: Newsdata.io paginates via the opaque `nextPage` token
            # returned in each response, not an incrementing integer; passing
            # 0, 1, 2, ... re-fetches the same first page (or errors).
            # The first request keeps page=0, matching previous behaviour.
            page=next_page if next_page is not None else 0,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:  # best effort: log and stop paging
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {pages_fetched}): {e}")
            break

        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies

            collected.append(
                {
                    "title": item.get("title"),
                    "url": link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        next_page = data.get("nextPage")
        if not next_page:
            break  # no more pages
        pages_fetched += 1
        time.sleep(0.4)  # gentle throttle between pages

    return collected[:wanted]
110
+
111
+
112
+ # ──────────────────────────────────────────────────────────────
113
+ # SUMMARISER
114
+ # ──────────────────────────────────────────────────────────────
115
# Strips any echoed prompt instructions from the model's raw output.
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise_article(body: str) -> str:
    """Return a one-sentence (<=25 word) summary of *body* from the LLM."""
    instructions = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
    )
    raw = mistral_generate(
        f"{instructions}ARTICLE:\n{body}",
        max_new_tokens=_SUMMARY_TOKENS,
        temperature=0.3,
    )
    # Drop echoed instructions (if any) and surrounding whitespace.
    return _CLEAN_RE.sub("", raw).strip()
 
 
 
125
 
126
 
127
+ # ──────────────────────────────────────────────────────────────
128
+ # REDIS KEY HELPERS
129
+ # ──────────────────────────────────────────────────────────────
130
  def _redis_key(date: str, category: str) -> str:
131
  return f"headlines:{date}:{category}"
132
 
133
 
134
  # ──────────────────────────────────────────────────────────────
135
+ # MAIN ENTRY POINT
136
  # ──────────────────────────────────────────────────────────────
137
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.

    Args:
        today: optional 'YYYY-MM-DD' date override; defaults to the current
            UTC date.

    Returns:
        Mapping of logical category -> list of summary dicts with keys
        title / url / summary / source_snippet / image / pubDate.
    """
    # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
    # datetime; use an aware UTC "now" instead — same formatted date string.
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": _summarise_article(art["content"]),
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )

        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output