raghavNCI committed
Commit 7fdb7c1 · 1 Parent(s): 5557673

headlines fix 1

Files changed (1): nuse_modules/headlines_generator.py (+52 -71)
nuse_modules/headlines_generator.py CHANGED

@@ -1,7 +1,10 @@
 # nuse_modules/headlines_generator.py
 from __future__ import annotations
 import datetime as _dt
-import json, os, re, time
+import json
+import os
+import re
+import time
 from typing import List, Dict, Optional
 
 import requests
@@ -9,96 +12,79 @@ import requests
 from clients.redis_client import redis_client as _r
 from models_initialization.mistral_registry import mistral_generate
 
-
 # ──────────────────────────────────────────────────────────────
 # CONFIG
 # ──────────────────────────────────────────────────────────────
 NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
-assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY is not set in env / Space secrets"
-
-# Newsdata supports these canonical categories:
-# 'world', 'business', 'science', 'technology', 'entertainment',
-# 'sports', 'environment', 'politics'
-_CATEGORIES = {
-    "world": "world",
-    "india": "world",  # use query filter for India
-    "finance": "business",
-    "sports": "sports",
-    "entertainment": "entertainment",
+assert NEWSDATA_API_KEY, "❌ NEWSDATA_API_KEY missing (add to Space secrets or .env)"
+
+# Pure-query strings we’ll pass via &q=
+_CATEGORIES: dict[str, str] = {
+    "world": "world news top stories",
+    "india": "india top headlines",
+    "finance": "business finance economy",
+    "sports": "sports news today",
+    "entertainment": "celebrity movies tv music",
 }
 
 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_REQUEST_TIMEOUT = 10
-
+_REQUEST_TIMEOUT = 10  # seconds
 
 # ──────────────────────────────────────────────────────────────
-# NEWSDATA FETCHER
+# NEWSDATA HELPER
 # ──────────────────────────────────────────────────────────────
 def _newsdata_url(
-    *,  # ← keyword-only for clarity
-    query: str | None = None,
-    category: str | None = None,
+    query: str,
     page: int = 0,
     language: str = "en",
     size: int = 25,
 ) -> str:
     """
-    Build the /latest endpoint URL.
-    You may supply *either* query OR category (not both).
+    Build a Newsdata /latest request that always uses q=.
     """
-    base = (
+    return (
         "https://newsdata.io/api/1/latest"
         f"?apikey={NEWSDATA_API_KEY}"
         f"&language={language}"
         f"&size={size}"
         f"&page={page}"
+        f"&q={requests.utils.quote(query)}"
     )
-    if query:
-        base += f"&q={query}"
-    elif category:
-        base += f"&category={category}"
-    return base
-
 
-
-def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
+def _fetch_articles(q: str, wanted: int) -> List[dict]:
     """
-    Fetch up to `wanted` articles for a given logical category (cat_key).
+    Fetch up to `wanted` unique articles for the query string `q`.
     """
     collected: List[dict] = []
-    seen_links = set()
+    seen_urls: set[str] = set()
     page = 0
 
-    while len(collected) < wanted and page < 5:  # safety cap
-        url = _newsdata_url(
-            category=category,
-            query="india" if cat_key == "india" else None,
-            page=page,
-        )
+    while len(collected) < wanted and page < 5:  # hard stop at 5 pages
+        url = _newsdata_url(query=q, page=page)
         try:
             res = requests.get(url, timeout=_REQUEST_TIMEOUT)
             res.raise_for_status()
             data = res.json()
         except Exception as e:
-            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {page}): {e}")
+            print(f"[ERROR] Newsdata fetch failed ({q}, page {page}): {e}")
             break
 
         for item in data.get("results", []):
-            link = item.get("link")
-            if not link or link in seen_links:
+            url_link = item.get("link")
+            if not url_link or url_link in seen_urls:
                 continue
-            seen_links.add(link)
+            seen_urls.add(url_link)
 
             content = item.get("content") or item.get("full_description") or ""
-            if not content or len(content) < 300:
-                continue  # skip short / empty bodies
+            if len(content) < 300:
+                continue  # skip short or empty articles
 
             collected.append(
                 {
                     "title": item.get("title"),
-                    "url": link,
+                    "url": url_link,
                     "content": content,
                     "image": item.get("image_url"),
                     "source_snippet": item.get("description") or "",
@@ -109,67 +95,62 @@ def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
                 break
 
         if not data.get("nextPage"):
-            break  # no more pages
+            break
         page += 1
-        time.sleep(0.4)  # gentle throttle
-
+        time.sleep(0.4)  # gentle throttling
     return collected[:wanted]
 
-
 # ──────────────────────────────────────────────────────────────
 # SUMMARISER
 # ──────────────────────────────────────────────────────────────
-_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
+_RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
 
-def _summarise_article(body: str) -> str:
+def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
         "in one sentence (<=25 words). Omit source and author names.\n\n"
-        f"ARTICLE:\n{body}"
+        f"ARTICLE:\n{text}"
     )
     raw = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
-    return _CLEAN_RE.sub("", raw).strip()
-
+    return _RE_PROMPT_ECHO.sub("", raw).strip()
 
 # ──────────────────────────────────────────────────────────────
-# REDIS KEY HELPERS
+# REDIS KEY
 # ──────────────────────────────────────────────────────────────
-def _redis_key(date: str, category: str) -> str:
-    return f"headlines:{date}:{category}"
-
+def _redis_key(date: str, cat: str) -> str:
+    return f"headlines:{date}:{cat}"
 
 # ──────────────────────────────────────────────────────────────
-# MAIN ENTRY POINT
+# MAIN ENTRY
 # ──────────────────────────────────────────────────────────────
 def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
     """
-    Fetches top articles per category via Newsdata.io, summarises them,
-    stores in Upstash Redis, and returns the payload for logging/tests.
+    Fetch, summarise, and cache today’s headlines for each category.
     """
     date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
-    all_output: Dict[str, List[dict]] = {}
+    all_results: Dict[str, List[dict]] = {}
 
-    for cat_key, newsdata_cat in _CATEGORIES.items():
-        print(f"[HEADLINES] {cat_key.title()} …")
-        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)
+    for cat, query in _CATEGORIES.items():
+        print(f"[HEADLINES] {cat.title()} …")
+        articles = _fetch_articles(query, _ARTICLES_PER_CAT)
 
         summaries: List[dict] = []
         for art in articles:
-            summary = _summarise_article(art["content"])
+            summary_txt = _summarise(art["content"])
             summaries.append(
                 {
                     "title": art["title"],
                     "url": art["url"],
-                    "summary": summary,
+                    "summary": summary_txt,
                     "source_snippet": art["source_snippet"],
                     "image": art["image"],
                     "pubDate": art["pubDate"],
                 }
             )
 
-        redis_key = _redis_key(date_str, cat_key)
+        redis_key = _redis_key(date_str, cat)
         _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
-        all_output[cat_key] = summaries
-        print(f"  ↳ stored {len(summaries)} items in Redis ({redis_key})")
+        all_results[cat] = summaries
+        print(f"  ↳ stored {len(summaries)} items → {redis_key}")
 
-    return all_output
+    return all_results
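
To exercise the new query-only flow end to end, a small smoke-test script works well. The sketch below is hypothetical (it does not ship with this commit) and assumes the Space's packages (`nuse_modules`, `clients`) are importable and that `NEWSDATA_API_KEY` and the Redis credentials are set in the environment:

```python
# smoke_test_headlines.py - hypothetical helper, not part of this commit.
# Assumes NEWSDATA_API_KEY and Redis credentials are configured, and that
# the Space's packages (nuse_modules, clients) are on PYTHONPATH.
import datetime as dt
import json

from clients.redis_client import redis_client
from nuse_modules.headlines_generator import generate_and_store_headlines

def main() -> None:
    # Run the full fetch -> summarise -> cache pipeline once.
    results = generate_and_store_headlines()

    # Each category should now be cached under headlines:<date>:<cat>.
    today = dt.datetime.utcnow().strftime("%Y-%m-%d")
    for cat in results:
        cached = redis_client.get(f"headlines:{today}:{cat}")
        items = json.loads(cached) if cached else []
        print(f"{cat}: {len(items)} cached summaries")
        if items:  # print one summary as a sanity check
            print("  sample:", items[0]["summary"][:80])

if __name__ == "__main__":
    main()
```

One detail worth noting on the refactor: `requests.utils.quote` is the `urllib.parse.quote` re-export, so the multi-word query strings in `_CATEGORIES` arrive percent-encoded (e.g. `india top headlines` is sent as `india%20top%20headlines`), whereas the old code interpolated the query into the URL unescaped.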