structural ingest change
components/generators/daily_feed.py
CHANGED
@@ -1,18 +1,10 @@
 import os
-import sys
 import json
-import requests
 import redis
-from typing import List, Dict
-from llama_index.core import VectorStoreIndex
-from llama_index.core.query_engine import RetrieverQueryEngine
+from typing import List, Dict
 from llama_index.core.schema import Document
-from llama_index.core.settings import Settings
 from components.LLMs.Mistral import call_mistral
 
-# ✅ Disable implicit LLM usage
-Settings.llm = None
-
 # Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
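With the llama_index retrieval stack and the Settings override gone, the module's external dependencies in this hunk shrink to the Redis client built from the two Upstash variables. Below is a minimal round-trip sketch of that dependency only; the key name and payload are placeholders, not the module's real feed.

import json
import os

import redis

# Same construction as in the diff: decode_responses=True makes get() return str,
# so json.loads() can be applied directly to the stored value.
r = redis.Redis.from_url(os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379"), decode_responses=True)

r.set("daily_feed_demo", json.dumps([{"topic": "tech", "feed": []}]))
print(json.loads(r.get("daily_feed_demo")))  # [{'topic': 'tech', 'feed': []}]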
@@ -20,11 +12,13 @@ REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 # ✅ Redis client
 redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
 
-# 📰
+# 📰 Topic list
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 
-#
-
+# 🧠 Flattened topic keys for JSON output
+TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
+
+# 🧠 Summarization prompt
 BASE_PROMPT = (
     "You are Nuse's official news summarizer – fast, sharp, and never generic.\n"
     "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
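For reference, the new TOPIC_KEYS constant just lowercases each topic label and strips the " news" suffix; those keys become the "topic" field of the cached JSON. A quick sanity check, assuming the TOPICS list shown in the hunk above:

TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
print(TOPIC_KEYS)  # ['india', 'world', 'tech', 'finance', 'sports']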
@@ -55,58 +49,55 @@
     "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
 )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+# 🧠 Categorize summary line into topic
+def categorize_summary(summary: str) -> str:
+    s = summary.lower()
+    if "india" in s or "modi" in s:
+        return "india"
+    elif any(x in s for x in ["us", "uk", "gaza", "china", "russia", "bangladesh", "israel", "trump", "biden", "world"]):
+        return "world"
+    elif any(x in s for x in ["ai", "tech", "space", "innovation", "startup", "software", "device"]):
+        return "tech"
+    elif any(x in s for x in ["market", "stock", "inflation", "finance", "fed", "reserve", "earnings", "revenue", "economy"]):
+        return "finance"
+    elif any(x in s for x in ["cricket", "football", "nba", "nfl", "sports", "match", "league", "tournament"]):
+        return "sports"
+    else:
+        return "world"
+
+# 🧪 Summarize the entire day's documents in one LLM pass
+def summarize_all_documents(documents: List[Document]) -> Dict[str, List[Dict]]:
+    merged_text = "\n\n---\n\n".join(doc.text.strip() for doc in documents if doc.text.strip())
+
+    print("\n🧠 Sending merged prompt to summarizer...\n")
+    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)
+
+    categorized_feed = {key: [] for key in TOPIC_KEYS}
 
     if summary_block:
         for line in summary_block.splitlines():
             line = line.strip()
-            if line.startswith("-")
-
-                if
-
-
+            if line.startswith("-"):
+                clean = line.lstrip("-–").strip()
+                if clean:
+                    topic_key = categorize_summary(clean)
+                    categorized_feed[topic_key].append({
+                        "summary": clean,
                         "image_url": "https://source.unsplash.com/800x600/?news",
-                        "article_link": f"https://google.com/search?q={
+                        "article_link": f"https://google.com/search?q={topic_key}+news"
                     })
+    return categorized_feed
 
-
-
-
-
-# ⚡ Generate and cache daily feed
+# Final callable to build and cache the feed
 def generate_and_cache_daily_feed(documents: List[Document]):
-
-
-    query_engine = RetrieverQueryEngine(retriever=retriever)
-
-    final_feed = []
-    for topic in TOPICS:
-        print(f"\nGenerating for: {topic}")
-        response = query_engine.query(topic)
-        docs = [str(node.get_content()) for node in response.source_nodes]
-        topic_feed = summarize_topic(docs, topic)
-        final_feed.append({
-            "topic": topic.lower().replace(" news", ""),
-            "feed": topic_feed
-        })
+    all_feed = summarize_all_documents(documents)
+    final_feed = [{"topic": topic, "feed": all_feed[topic]} for topic in TOPIC_KEYS]
 
     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed
 
-# 📦
+# 📦 Utility to read cached data
 def get_cached_daily_feed():
    cached = redis_client.get(REDIS_KEY)
    return json.loads(cached) if cached else []
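Taken together, the new flow is: merge every ingested document into one prompt, let call_mistral return a block of dash-prefixed summaries, route each summary to a topic with categorize_summary, and cache the per-topic feed as JSON. A sketch of how the module might be driven end to end follows; the import path is inferred from the file location, the sample articles are invented, and it assumes the Upstash variables and Mistral credentials are configured.

from llama_index.core.schema import Document

from components.generators.daily_feed import (
    generate_and_cache_daily_feed,
    get_cached_daily_feed,
)

# Invented sample input; in production these come from the ingest pipeline.
docs = [
    Document(text="The central bank held rates steady as inflation cooled to 4.1%."),
    Document(text="India clinched the series with a five-wicket win in the final ODI."),
]

feed = generate_and_cache_daily_feed(docs)   # one LLM pass over every document
print([entry["topic"] for entry in feed])    # ['india', 'world', 'tech', 'finance', 'sports']

# Each entry in the cached list looks like:
# {"topic": "india",
#  "feed": [{"summary": "...",
#            "image_url": "https://source.unsplash.com/800x600/?news",
#            "article_link": "https://google.com/search?q=india+news"}]}

cached = get_cached_daily_feed()             # reads the same JSON back from Redis
assert cached == feed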