raghavNCI commited on
Commit
b2bd47e
·
1 Parent(s): 588f923

switching to newsdata

Browse files
Files changed (1) hide show
  1. nuse_modules/headlines_generator.py +126 -71
nuse_modules/headlines_generator.py CHANGED
@@ -1,113 +1,168 @@
 
1
  from __future__ import annotations
2
  import datetime as _dt
3
- import json, os
4
- import re
5
- from typing import List, Dict
 
6
 
7
  from clients.redis_client import redis_client as _r
8
- from nuse_modules.google_search import search_google_news
9
  from models_initialization.mistral_registry import mistral_generate
10
 
11
 
12
  # ──────────────────────────────────────────────────────────────
13
- # Config
14
  # ──────────────────────────────────────────────────────────────
 
 
 
 
 
 
15
  _CATEGORIES = {
16
- "world": "world news top stories",
17
- "india": "india top stories",
18
- "finance": "business finance economy today",
19
- "sports": "sports headlines today",
20
- "entertainment": "entertainment celebrity movie tv",
21
  }
22
 
23
  _ARTICLES_PER_CAT = 5
24
  _SUMMARY_TOKENS = 120
25
  _REDIS_TTL_SECONDS = 24 * 3600
 
26
 
27
 
28
  # ──────────────────────────────────────────────────────────────
29
- # Helpers
30
  # ──────────────────────────────────────────────────────────────
31
- def _dedupe_urls(articles: List[dict]) -> List[dict]:
32
- seen = set()
33
- out = []
34
- for art in articles:
35
- if art["link"] not in seen:
36
- seen.add(art["link"])
37
- out.append(art)
38
- return out
39
-
40
- def is_probably_article(url: str) -> bool:
 
 
 
 
 
 
 
 
 
 
 
41
  """
42
- Simple heuristic: filters out category pages, homepages, etc.
43
  """
44
- bad_patterns = [
45
- "/world", "/us", "/news", "/topics", "/home", "/video",
46
- "index.html", ".com/", ".org/", ".net/"
47
- ]
48
- return not any(url.rstrip("/").endswith(p.strip("/")) for p in bad_patterns)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- def _summarise_article(content: str) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  prompt = (
53
  "You are a concise news assistant. Summarise the following article "
54
  "in one sentence (<=25 words). Omit source and author names.\n\n"
55
- "Some of these articles contain text which is not useful to the context so you can omit it."
56
- f"ARTICLE:\n{content}"
57
  )
58
- raw_output = mistral_generate(prompt, max_new_tokens=_SUMMARY_TOKENS, temperature=0.3)
59
-
60
- # Remove repeated prompt instructions if echoed back
61
- cleaned = re.sub(r"(you are.*?article[:\n]+)", "", raw_output, flags=re.IGNORECASE | re.DOTALL).strip()
62
- return cleaned
63
 
64
 
 
 
 
65
  def _redis_key(date: str, category: str) -> str:
66
  return f"headlines:{date}:{category}"
67
 
68
 
69
  # ──────────────────────────────────────────────────────────────
70
- # Main Generator
71
  # ──────────────────────────────────────────────────────────────
72
  def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
73
  """
74
- Fetches top articles per category, summarises them, stores in Redis,
75
- and returns the full payload (useful for logging / testing).
76
  """
77
- date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
78
- all_output = {}
79
-
80
- for cat, query in _CATEGORIES.items():
81
- print(f"[HEADLINES] {cat.title()} …")
82
-
83
- raw_articles = search_google_news([query], num_results=_ARTICLES_PER_CAT)
84
- raw_articles = _dedupe_urls(raw_articles)
85
-
86
- summaries = []
87
- for item in raw_articles:
88
-
89
- link = item.get("link")
90
-
91
- if not item.get("content"):
92
- continue
93
-
94
- if not is_probably_article(link):
95
- continue
96
-
97
- summary = _summarise_article(item["content"])
98
-
99
- summaries.append({
100
- "title": item.get("title"),
101
- "url": item.get("link"),
102
- "summary": summary,
103
- "source_snippet": item.get("snippet"),
104
- "image": item.get("image"), # added in google_search.py
105
- })
106
-
107
- redis_key = _redis_key(date_str, cat)
108
  _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
109
-
110
- all_output[cat] = summaries
111
  print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")
112
 
113
  return all_output
 
1
# nuse_modules/headlines_generator.py
from __future__ import annotations

import datetime as _dt
import json
import os
import re
import time
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate
11
 
12
 
13
# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
NEWSDATA_API_KEY = os.getenv("NEWSDATA_API_KEY")
if not NEWSDATA_API_KEY:
    # Fail fast at import time. A bare `assert` is stripped under
    # `python -O`, which would silently send requests with apikey=None.
    raise RuntimeError("❌ NEWSDATA_API_KEY is not set in env / Space secrets")

# Newsdata supports these canonical categories:
# 'world', 'business', 'science', 'technology', 'entertainment',
# 'sports', 'environment', 'politics'
# Maps our logical category keys -> Newsdata canonical categories.
_CATEGORIES = {
    "world": "world",
    "india": "world",  # use query filter for India
    "finance": "business",
    "sports": "sports",
    "entertainment": "entertainment",
}

_ARTICLES_PER_CAT = 5            # articles kept per category
_SUMMARY_TOKENS = 120            # max new tokens per summary generation
_REDIS_TTL_SECONDS = 24 * 3600   # cached headlines expire after one day
_REQUEST_TIMEOUT = 10            # seconds per Newsdata HTTP request
34
 
35
 
36
  # ──────────────────────────────────────────────────────────────
37
+ # NEWSDATA FETCHER
38
  # ──────────────────────────────────────────────────────────────
39
def _newsdata_url(
    category: str,
    query: Optional[str] = None,
    page: int | str = 0,
    language: str = "en",
    size: int = 25,
) -> str:
    """
    Build a Newsdata.io /news request URL.

    Args:
        category: canonical Newsdata category (e.g. 'world', 'business').
        query: optional free-text ``q`` filter.
        page: page selector; Newsdata's ``nextPage`` token (str) is also accepted.
        language: ISO language code.
        size: articles per page (max is plan-dependent — TODO confirm).

    Returns:
        Fully percent-encoded request URL.
    """
    params = {
        "apikey": NEWSDATA_API_KEY,
        "language": language,
        "category": category,
        "size": size,
        "page": page,
    }
    if query:
        params["q"] = query
    # urlencode percent-escapes every value; the previous raw f-string
    # interpolation produced invalid/ambiguous URLs for queries containing
    # spaces, '&' or '#'.
    return "https://newsdata.io/api/1/news?" + urlencode(params)
57
+
58
+
59
def _fetch_newsdata_articles(cat_key: str, category: str, wanted: int) -> List[dict]:
    """
    Fetch up to `wanted` articles for a given logical category (cat_key).

    Pages through the Newsdata.io feed, deduplicating links and skipping
    items without a usable body. Network/HTTP failures abort the loop and
    return whatever was collected so far (best effort).
    """
    collected: List[dict] = []
    seen_links = set()
    next_page: Optional[str] = None  # Newsdata's opaque pagination token
    pages_fetched = 0

    while len(collected) < wanted and pages_fetched < 5:  # safety cap
        url = _newsdata_url(
            category=category,
            query="india" if cat_key == "india" else None,
            # BUG FIX: Newsdata.io paginates via the opaque `nextPage` token
            # returned in each response, not an incrementing integer; passing
            # 0, 1, 2, ... re-fetches the same first page (or errors).
            # The first request keeps page=0, matching previous behaviour.
            page=next_page if next_page is not None else 0,
        )
        try:
            res = requests.get(url, timeout=_REQUEST_TIMEOUT)
            res.raise_for_status()
            data = res.json()
        except Exception as e:  # best effort: log and stop paging
            print(f"[ERROR] Newsdata fetch failed ({cat_key}, page {pages_fetched}): {e}")
            break

        for item in data.get("results", []):
            link = item.get("link")
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            content = item.get("content") or item.get("full_description") or ""
            if not content or len(content) < 300:
                continue  # skip short / empty bodies

            collected.append(
                {
                    "title": item.get("title"),
                    "url": link,
                    "content": content,
                    "image": item.get("image_url"),
                    "source_snippet": item.get("description") or "",
                    "pubDate": item.get("pubDate"),
                }
            )
            if len(collected) >= wanted:
                break

        next_page = data.get("nextPage")
        if not next_page:
            break  # no more pages
        pages_fetched += 1
        time.sleep(0.4)  # gentle throttle between pages

    return collected[:wanted]
110
+
111
+
112
+ # ──────────────────────────────────────────────────────────────
113
+ # SUMMARISER
114
+ # ──────────────────────────────────────────────────────────────
115
# Strips any echoed prompt instructions from the model's raw output.
_CLEAN_RE = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)

def _summarise_article(body: str) -> str:
    """Return a one-sentence (<=25 word) summary of *body* from the LLM."""
    instructions = (
        "You are a concise news assistant. Summarise the following article "
        "in one sentence (<=25 words). Omit source and author names.\n\n"
    )
    raw = mistral_generate(
        f"{instructions}ARTICLE:\n{body}",
        max_new_tokens=_SUMMARY_TOKENS,
        temperature=0.3,
    )
    # Drop echoed instructions (if any) and surrounding whitespace.
    return _CLEAN_RE.sub("", raw).strip()
 
 
 
125
 
126
 
127
+ # ──────────────────────────────────────────────────────────────
128
+ # REDIS KEY HELPERS
129
+ # ──────────────────────────────────────────────────────────────
130
  def _redis_key(date: str, category: str) -> str:
131
  return f"headlines:{date}:{category}"
132
 
133
 
134
  # ──────────────────────────────────────────────────────────────
135
+ # MAIN ENTRY POINT
136
  # ──────────────────────────────────────────────────────────────
137
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
    """
    Fetches top articles per category via Newsdata.io, summarises them,
    stores in Upstash Redis, and returns the payload for logging/tests.

    Args:
        today: optional 'YYYY-MM-DD' date override; defaults to the current
            UTC date.

    Returns:
        Mapping of logical category -> list of summary dicts with keys
        title / url / summary / source_snippet / image / pubDate.
    """
    # datetime.utcnow() is deprecated (Python 3.12+) and returns a naive
    # datetime; use an aware UTC "now" instead — same formatted date string.
    date_str = today or _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%d")
    all_output: Dict[str, List[dict]] = {}

    for cat_key, newsdata_cat in _CATEGORIES.items():
        print(f"[HEADLINES] {cat_key.title()} …")
        articles = _fetch_newsdata_articles(cat_key, newsdata_cat, _ARTICLES_PER_CAT)

        summaries: List[dict] = []
        for art in articles:
            summaries.append(
                {
                    "title": art["title"],
                    "url": art["url"],
                    "summary": _summarise_article(art["content"]),
                    "source_snippet": art["source_snippet"],
                    "image": art["image"],
                    "pubDate": art["pubDate"],
                }
            )

        redis_key = _redis_key(date_str, cat_key)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
        all_output[cat_key] = summaries
        print(f" ↳ stored {len(summaries)} items in Redis ({redis_key})")

    return all_output