raghavNCI
committed on
Commit · a37ba23
1 Parent(s): 7fdb7c1
using rss instead
nuse_modules/headlines_generator.py
CHANGED
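The commit swaps the Newsdata API for the Google News RSS search feed. For context, here is a rough sketch (not part of the diff, assuming feedparser and requests are installed) of querying the feed that the new _rss_url helper targets; title, link, and published are the entry fields the new _fetch_articles reads:

# Sketch only: same URL shape and libraries as the new module below.
import feedparser
import requests

query = requests.utils.quote("world news")
feed_url = "https://news.google.com/rss/search?q=" + query + "&hl=en-US&gl=US&ceid=US:en"

feed = feedparser.parse(feed_url)
for entry in feed.entries[:3]:
    # Each entry carries the fields _fetch_articles() consumes: title, link, published.
    print(entry.title, entry.link, entry.get("published", ""))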
@@ -1,104 +1,96 @@
- # nuse_modules/headlines_generator.py
from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
- from typing import List, Dict

import requests

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
- # CONFIG
# ──────────────────────────────────────────────────────────────
-
- assert NEWSDATA_API_KEY, "NEWSDATA_API_KEY missing (add to Space secrets or .env)"
-
- # Pure-query strings we'll pass via &q=
_CATEGORIES: dict[str, str] = {
-     "world": "world news
-     "india": "india top
-     "finance": "business
-     "sports": "sports
-     "entertainment": "celebrity movies tv
}

_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
-

- #
-
-
- def _newsdata_url(
-     query: str,
-     page: int = 0,
-     language: str = "en",
-     size: int = 25,
- ) -> str:
-     """
-     Build a Newsdata /latest request that always uses q=.
-     """
    return (
-         "https://
-
-         f"&language={language}"
-         f"&size={size}"
-         f"&page={page}"
-         f"&q={requests.utils.quote(query)}"
    )

-
-
-
-
    collected: List[dict] = []
-
-
-
-
-
-
-
-
-
-
            break

-
-         url_link = item.get("link")
-         if not url_link or url_link in seen_urls:
-             continue
-         seen_urls.add(url_link)
-
-         content = item.get("content") or item.get("full_description") or ""
-         if len(content) < 300:
-             continue  # skip short or empty articles
-
-         collected.append(
-             {
-                 "title": item.get("title"),
-                 "url": url_link,
-                 "content": content,
-                 "image": item.get("image_url"),
-                 "source_snippet": item.get("description") or "",
-                 "pubDate": item.get("pubDate"),
-             }
-         )
-         if len(collected) >= wanted:
-             break
-
-         if not data.get("nextPage"):
-             break
-         page += 1
-         time.sleep(0.4)  # gentle throttling
-     return collected[:wanted]

# ──────────────────────────────────────────────────────────────
# SUMMARISER
@@ -117,16 +109,16 @@ def _summarise(text: str) -> str:
# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
-     """
-     Fetch, summarise, and cache today's headlines for each category.
-     """
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

@@ -137,16 +129,14 @@ def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
-             summaries.append(
-
-
-
-
-
-
-
-                 }
-             )

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)

from __future__ import annotations
import datetime as _dt
import json
import os
import re
import time
+ from typing import List, Dict

import requests
+ import feedparser
+ from boilerpy3 import extractors

from clients.redis_client import redis_client as _r
from models_initialization.mistral_registry import mistral_generate

# ──────────────────────────────────────────────────────────────
+ # CONFIG (Google News RSS, no external API keys needed)
# ──────────────────────────────────────────────────────────────
+ # Query strings passed into Google News RSS search feed
_CATEGORIES: dict[str, str] = {
+     "world": "world news",
+     "india": "india top stories",
+     "finance": "finance business economy",
+     "sports": "sports headlines",
+     "entertainment": "entertainment celebrity movies tv",
}

_ARTICLES_PER_CAT = 5
_SUMMARY_TOKENS = 120
_REDIS_TTL_SECONDS = 24 * 3600
+ _RSS_TIMEOUT = 10  # seconds
+ _ARTICLE_TIMEOUT = 10  # seconds

+ # Google News RSS search template
+ def _rss_url(query: str) -> str:
+     query = requests.utils.quote(query)
    return (
+         "https://news.google.com/rss/search?q=" + query +
+         "&hl=en-US&gl=US&ceid=US:en"
    )

+ # BoilerPy3 extractor (thread-safe singleton)
+ _bp_extractor = extractors.ArticleExtractor()
+
+ # ──────────────────────────────────────────────────────────────
+ # FETCH RSS + ARTICLE BODY
+ # ──────────────────────────────────────────────────────────────
+
+ def _extract_fulltext(url: str) -> str:
+     try:
+         html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
+         text = _bp_extractor.get_content(html)
+         return text or ""
+     except Exception as e:
+         print(f"[SCRAPE ERR] {url}: {e}")
+         return ""
+
+
+ def _fetch_articles(query: str, wanted: int) -> List[dict]:
+     feed_url = _rss_url(query)
+     try:
+         feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
+     except Exception as e:
+         print(f"[RSS ERR] {query}: {e}")
+         return []
+
    collected: List[dict] = []
+     seen_links: set[str] = set()
+
+     for entry in feed.entries:
+         link = entry.link
+         if link in seen_links:
+             continue
+         seen_links.add(link)
+
+         body = _extract_fulltext(link)
+         if len(body) < 300:
+             continue  # skip trivial pages/homepages
+
+         collected.append(
+             {
+                 "title": entry.title,
+                 "url": link,
+                 "content": body,
+                 "pubDate": entry.get("published", ""),
+                 "image": None,  # RSS search feed rarely returns an image; can scrape the OG tag later
+                 "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+             }
+         )
+         if len(collected) >= wanted:
            break

+     return collected

# ──────────────────────────────────────────────────────────────
# SUMMARISER

# ──────────────────────────────────────────────────────────────
# REDIS KEY
# ──────────────────────────────────────────────────────────────
+
def _redis_key(date: str, cat: str) -> str:
    return f"headlines:{date}:{cat}"

# ──────────────────────────────────────────────────────────────
# MAIN ENTRY
# ──────────────────────────────────────────────────────────────
+
def generate_and_store_headlines(today: str | None = None) -> Dict[str, List[dict]]:
+     """Fetches, summarises, and caches headlines via Google News RSS."""
    date_str = today or _dt.datetime.utcnow().strftime("%Y-%m-%d")
    all_results: Dict[str, List[dict]] = {}

        summaries: List[dict] = []
        for art in articles:
            summary_txt = _summarise(art["content"])
+             summaries.append({
+                 "title": art["title"],
+                 "url": art["url"],
+                 "summary": summary_txt,
+                 "source_snippet": art["source_snippet"],
+                 "image": art["image"],
+                 "pubDate": art["pubDate"],
+             })

        redis_key = _redis_key(date_str, cat)
        _r.set(redis_key, json.dumps(summaries), ex=_REDIS_TTL_SECONDS)
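After this change no API key is needed; a caller only needs the module and a configured Redis client. A minimal usage sketch, assuming the package is importable as nuse_modules.headlines_generator and Redis is reachable through clients.redis_client (the example date and the import path are illustrative, not part of the commit): generate_and_store_headlines() writes one JSON list per category under headlines:<date>:<category> with a 24-hour TTL, which can be read back like this:

# Sketch only: read back the cached summaries written by the new module.
import datetime as dt
import json

from clients.redis_client import redis_client
from nuse_modules.headlines_generator import generate_and_store_headlines, _redis_key

generate_and_store_headlines()  # fetch, summarise, and cache today's headlines

today = dt.datetime.utcnow().strftime("%Y-%m-%d")
raw = redis_client.get(_redis_key(today, "world"))  # keys look like "headlines:<date>:world"
if raw is not None:
    summaries = json.loads(raw)  # list of dicts: title, url, summary, source_snippet, image, pubDate
    for item in summaries:
        print(item["title"], "->", item["url"])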