structural changes

Decouple daily-feed generation from the ingest pipeline: daily_feed.py now loads documents back out of the persisted vector index itself, and the synchronous feed-generation call in news_ingest.py is commented out.

- components/generators/daily_feed.py  +31 -11
- pipeline/news_ingest.py  +3 -3
components/generators/daily_feed.py
CHANGED
@@ -2,20 +2,21 @@ import os
 import json
 import redis
 from typing import List, Dict
+from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
 from llama_index.core.schema import Document
+from llama_index.core.query_engine import RetrieverQueryEngine
 from components.LLMs.Mistral import call_mistral

 # 🔐 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
 REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+INDEX_DIR = os.environ.get("INDEX_DIR", "storage/index")

 # ✅ Redis client
 redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)

 # 📰 Topic list
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
-
-# 🧠 Flattened topic keys for JSON output
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]

 # 🧠 Summarization prompt
@@ -65,15 +66,29 @@ def categorize_summary(summary: str) -> str:
     else:
         return "world"

-# …
-def summarize_all_documents(documents: List[Document]) -> Dict[str, List[Dict]]:
-    …
+# 📥 Load all documents from the vector store
+def load_all_documents() -> List[Document]:
+    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
+    index = load_index_from_storage(storage_context)
+    retriever = index.as_retriever(similarity_top_k=50)
+    query_engine = RetrieverQueryEngine(retriever=retriever)
+
+    combined_docs = []
+    for topic in TOPICS:
+        response = query_engine.query(topic)
+        for node in response.source_nodes:
+            doc_text = str(node.get_content()).strip()
+            if doc_text:
+                combined_docs.append(doc_text)
+    return combined_docs

+# 🧪 Summarize entire day's content in one call
+def summarize_and_categorize(docs: List[str]) -> Dict[str, List[Dict]]:
+    merged_text = "\n\n---\n\n".join(docs)
     print("\n🧠 Sending merged prompt to summarizer...\n")
     summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)

     categorized_feed = {key: [] for key in TOPIC_KEYS}
-
     if summary_block:
         for line in summary_block.splitlines():
             line = line.strip()
@@ -88,16 +103,21 @@ def summarize_all_documents(documents: List[Document]) -> Dict[str, List[Dict]]:
             })
     return categorized_feed

-# 🚀 …
-def generate_and_cache_daily_feed(documents):
-    …
-    …
+# 🚀 Main callable
+def generate_and_cache_daily_feed():
+    docs = load_all_documents()
+    if not docs:
+        print("⚠️ No documents found in vector store.")
+        return []
+
+    feed_map = summarize_and_categorize(docs)
+    final_feed = [{"topic": topic, "feed": feed_map[topic]} for topic in TOPIC_KEYS]

     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed

-# 📦 …
+# 📦 Get cached data
 def get_cached_daily_feed():
     cached = redis_client.get(REDIS_KEY)
     return json.loads(cached) if cached else []
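After this change, daily_feed.py is self-contained: it pulls up to 50 nodes per topic from the persisted LlamaIndex store, summarizes everything in one Mistral call, and caches the categorized result in Redis. A minimal usage sketch, assuming the index has already been persisted under INDEX_DIR and the UPSTASH_REDIS_* variables are set (nothing below is part of this commit):

    from components.generators.daily_feed import (
        generate_and_cache_daily_feed,
        get_cached_daily_feed,
    )

    # Regenerates the feed from the vector store and writes it to Redis.
    feed = generate_and_cache_daily_feed()

    # Later readers (e.g. an API handler) can serve the cached copy;
    # each entry is {"topic": ..., "feed": [...]} per the code above.
    for section in get_cached_daily_feed():
        print(section["topic"], len(section["feed"]), "summaries")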
pipeline/news_ingest.py
CHANGED
@@ -101,10 +101,10 @@ async def main():
     documents = await build_documents(all_articles)
     get_or_build_index_from_docs(documents)

-    print("⚡ Generating daily feed...")
-    generate_and_cache_daily_feed(documents)  # ✅ SYNC CALL
+    # print("⚡ Generating daily feed...")
+    # generate_and_cache_daily_feed(documents)  # ✅ SYNC CALL

-    print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")
+    # print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")

 # 🚀 Entrypoint
 if __name__ == "__main__":
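With the synchronous call commented out of the ingest pipeline, nothing in this commit triggers feed generation anymore. A hypothetical standalone runner could fill that gap; the file name and wiring below are assumptions, not part of this diff:

    # run_daily_feed.py (hypothetical, not part of this commit)
    from components.generators.daily_feed import generate_and_cache_daily_feed

    if __name__ == "__main__":
        # Rebuild the cached feed from whatever the last ingest run indexed.
        generate_and_cache_daily_feed()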