raghavNCI committed
Commit 2049c5d · 1 Parent(s): 929da00

headlines fix 3
nuse_modules/headlines_generator.py CHANGED
@@ -16,7 +16,6 @@ from models_initialization.mistral_registry import mistral_generate
 # ──────────────────────────────────────────────────────────────
 # CONFIG (Google News RSS, no external API keys needed)
 # ──────────────────────────────────────────────────────────────
-# Query strings passed into Google News RSS search feed
 _CATEGORIES: dict[str, str] = {
     "world": "world news",
     "india": "india top stories",
@@ -28,10 +27,12 @@ _CATEGORIES: dict[str, str] = {
 _ARTICLES_PER_CAT = 5
 _SUMMARY_TOKENS = 120
 _REDIS_TTL_SECONDS = 24 * 3600
-_RSS_TIMEOUT = 10 # seconds
-_ARTICLE_TIMEOUT = 10 # seconds
+_RSS_TIMEOUT = 10  # seconds
+_ARTICLE_TIMEOUT = 10  # seconds
+_MIN_BODY_LENGTH = 120  # relaxed threshold so short briefs pass
 
 # Google News RSS search template
+
 def _rss_url(query: str) -> str:
     query = requests.utils.quote(query)
     return (
@@ -42,13 +43,35 @@ def _rss_url(query: str) -> str:
 # BoilerPy3 extractor (thread-safe singleton)
 _bp_extractor = extractors.ArticleExtractor()
 
+# Common browser UA to avoid 403s
+_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114 Safari/537.36"
+    )
+}
+
 # ──────────────────────────────────────────────────────────────
 # FETCH RSS + ARTICLE BODY
 # ──────────────────────────────────────────────────────────────
 
+def _follow_google_redirect(html: str) -> str | None:
+    """Extract the real URL from a Google News redirect HTML page."""
+    match = re.search(r'url=(https?[^"\']+)', html, flags=re.I)
+    return match.group(1) if match else None
+
+
 def _extract_fulltext(url: str) -> str:
     try:
-        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=_ARTICLE_TIMEOUT).text
+        resp = requests.get(url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT, allow_redirects=True)
+        html = resp.text
+
+        # If still on news.google.com and meta refresh present → follow manually
+        if "news.google.com" in resp.url and "http-equiv=\"refresh\"" in html.lower():
+            real_url = _follow_google_redirect(html)
+            if real_url:
+                html = requests.get(real_url, headers=_HEADERS, timeout=_ARTICLE_TIMEOUT).text
+
         text = _bp_extractor.get_content(html)
         return text or ""
     except Exception as e:
@@ -59,7 +82,7 @@ def _extract_fulltext(url: str) -> str:
 def _fetch_articles(query: str, wanted: int) -> List[dict]:
     feed_url = _rss_url(query)
     try:
-        feed = feedparser.parse(feed_url, request_headers={"User-Agent": "Mozilla/5.0"})
+        feed = feedparser.parse(feed_url, request_headers=_HEADERS)
     except Exception as e:
         print(f"[RSS ERR] {query}: {e}")
         return []
@@ -74,19 +97,17 @@ def _fetch_articles(query: str, wanted: int) -> List[dict]:
             seen_links.add(link)
 
             body = _extract_fulltext(link)
-            if len(body) < 300:
-                continue  # skip trivial pages/homepages
-
-            collected.append(
-                {
-                    "title": entry.title,
-                    "url": link,
-                    "content": body,
-                    "pubDate": entry.get("published", ""),
-                    "image": None,  # RSS search feed rarely returns image; can scrape OG tag later
-                    "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
-                }
-            )
+            if len(body) < _MIN_BODY_LENGTH:
+                continue  # skip very short pages/homepages
+
+            collected.append({
+                "title": entry.title,
+                "url": link,
+                "content": body,
+                "pubDate": entry.get("published", ""),
+                "image": None,  # can scrape OG tag later
+                "source_snippet": re.sub(r"<.*?>", "", entry.summary) if hasattr(entry, "summary") else "",
+            })
             if len(collected) >= wanted:
                 break
 
@@ -97,6 +118,7 @@ def _fetch_articles(query: str, wanted: int) -> List[dict]:
 # ──────────────────────────────────────────────────────────────
 _RE_PROMPT_ECHO = re.compile(r"(you are.*?article[:\n]+)", re.IGNORECASE | re.DOTALL)
 
+
 def _summarise(text: str) -> str:
     prompt = (
         "You are a concise news assistant. Summarise the following article "
requirements.txt CHANGED
@@ -8,4 +8,6 @@ accelerate
 torch
 huggingface_hub
 boilerpy3==1.0.6
-feedparser
+feedparser
+newspaper3k
+nltk
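newspaper3k and nltk likely arrive together for a reason: newspaper3k's Article.nlp() (keyword and summary extraction) relies on NLTK's punkt tokenizer, which is not bundled with the pip package and must be downloaded once at runtime. A sketch of the presumed intended use; nothing in this commit wires newspaper3k in yet, and the URL is a placeholder:

    import nltk
    from newspaper import Article

    nltk.download("punkt")  # one-time fetch of the sentence tokenizer

    article = Article("https://example.com/story")  # placeholder URL
    article.download()  # fetch HTML
    article.parse()     # populates article.title, article.text, article.top_image
    article.nlp()       # populates article.keywords and article.summary

article.top_image would also be a ready-made way to fill the "image" field that the headlines module currently leaves as None.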