ragV98 committed
Commit e4a76c1 · 1 Parent(s): a0be762

sequential headlines
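For orientation, a minimal sketch of the cached feed shape this commit produces, assuming the topic keys and summary fields defined in daily_feed.py; the ids and headline text below are invented, and the ids run sequentially across all topics rather than restarting per topic:

# Hypothetical shape of the cached feed after this change (values invented).
# Each entry also carries "image_url", "article_link" and "representative_title".
{
    "india": {
        1: {"summary": "First headline", "explanation": "Why it matters."},
        2: {"summary": "Second headline", "explanation": "Why it matters."},
        3: {"summary": "Third headline", "explanation": "Why it matters."}
    },
    "world": {
        4: {"summary": "Fourth headline", "explanation": "Why it matters."}
    }
}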

Files changed (1)
  1. components/generators/daily_feed.py +57 -48
components/generators/daily_feed.py CHANGED
@@ -2,12 +2,12 @@ import os
 import json
 import redis
 import numpy as np
-from typing import List, Dict
+from typing import List, Dict, Any
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 import logging
-import re
+import re

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,7 +27,7 @@ except Exception as e:
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]

-# 🧠 Summarization Prompt - REVISED (no change from previous)
+# 🧠 Summarization Prompt
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
     "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
@@ -40,6 +40,8 @@ BASE_PROMPT = (
 )

 # 📥 Load documents and metadata
+# This function will now only return 'text', 'title', 'url', 'source'
+# We remove 'headline_id' from this output as it will be newly generated for summaries
 def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     logging.info("Starting to load documents by topic from Upstash Vector Store...")
@@ -58,34 +60,35 @@ def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:

             for node in result.nodes:
                 content = node.get_content().strip()
-                headline_id = node.metadata.get("headline_id")
+                # We no longer need to retrieve headline_id here for the summarizer's purpose
+                # headline_id = node.metadata.get("headline_id")

                 title = node.metadata.get("title", "No Title")
                 url = node.metadata.get("url", "#")
                 source = node.metadata.get("source", "Unknown Source")

-                if content and headline_id is not None:
+                if content: # No longer checking for headline_id here
                     topic_docs[topic_key_for_filter].append({
                         "text": content,
-                        "headline_id": headline_id,
+                        # "headline_id": headline_id, # Removed
                         "title": title,
                         "url": url,
                         "source": source
                     })
-                elif content and headline_id is None:
-                    logging.warning(f"Node found without 'headline_id' for topic '{full_topic_name}': URL {node.metadata.get('url', 'N/A')}")
+                # Removed the warning for missing headline_id since we are not relying on it here

     except Exception as e:
         logging.error(f"❌ [load_docs_by_topic_with_refs Error]: {e}", exc_info=True)
     return topic_docs

 # 🧪 Topic summarizer
-def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
+# Now accepts 'current_global_id' to assign sequential IDs
+def summarize_topic(topic_key: str, docs: List[Dict], current_global_id: int) -> List[Dict]:
     if not docs:
         logging.warning(f"⚠️ No docs for topic: {topic_key}, skipping summarization.")
-        return []
+        return [], current_global_id # Return empty list and unchanged ID

-    representative_headline_id = docs[0].get("headline_id") if docs else None
+    # These representative fields are for generic summary context if no specific link
     representative_article_link = docs[0].get("url") if docs else f"https://google.com/search?q={topic_key}+news"
     representative_title = docs[0].get("title") if docs else f"Summary for {topic_key}"

@@ -93,7 +96,7 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:

     if not content:
         logging.warning(f"⚠️ No valid text content found in docs for topic: {topic_key}, skipping summarization.")
-        return []
+        return [], current_global_id

     content = content[:12000] # Truncate to avoid excessive token usage

@@ -113,89 +116,95 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:

         logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")

-        headlines = []
+        parsed_summaries = [] # Renamed for clarity
         for line in llm_output.splitlines():
             line = line.strip()
             if not line:
                 continue

-            # --- THE CRITICAL REGEX FIX ---
-            # Pattern:
-            # ^(?:[->•\d\.]+\s*)? -> Optional leading bullet/number
-            # (.*?) -> Non-greedy capture for headline (any characters until --)
-            # \s*--\s* -> The separator "--" with optional whitespace
-            # (.*) -> Greedy capture for explanation (rest of the line)
             match = re.match(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$', line)

             if match:
                 headline_text = match.group(1).strip()
                 explanation_text = match.group(2).strip()

-                # Further clean explanation_text if LLM adds unwanted intros despite prompt
                 explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()

-                # Basic validation: ensure both parts are reasonably non-empty
-                if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3: # Headline at least 2 words, explanation at least 3 words
-                    headlines.append({"summary": headline_text, "explanation": explanation_text})
+                if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3:
+                    parsed_summaries.append({"summary": headline_text, "explanation": explanation_text})
                 else:
                     logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")
             else:
                 logging.warning(f"Could not parse line: '{line}' for topic '{topic_key}'. Does it match 'Headline -- Explanation' format?")

         result = []
-        for h_item in headlines:
+        # Assign new sequential IDs here
+        for h_item in parsed_summaries:
             result.append({
                 "summary": h_item["summary"],
                 "explanation": h_item["explanation"],
-                "headline_id": representative_headline_id,
+                "id": current_global_id, # Assign the new sequential ID
                 "image_url": "https://source.unsplash.com/800x600/?news",
                 "article_link": representative_article_link,
                 "representative_title": representative_title
             })
+            current_global_id += 1 # Increment for the next summary

         logging.info(f"✅ Successfully generated {len(result)} summaries for topic '{topic_key}'.")
-        return result
+        return result, current_global_id # Return the summaries and the updated global ID
     except Exception as e:
         logging.error(f"❌ [Summarize topic '{topic_key}' Error]: {e}", exc_info=True)
-        return []
+        return [], current_global_id # Return empty and unchanged ID on error

 # 🚀 Generate and cache feed
 def generate_and_cache_daily_feed():
     try:
         logging.info("🆕 Generating daily feed...")
         topic_docs = load_docs_by_topic_with_refs()
-        feed_map = {}
-
-        for topic_key in TOPIC_KEYS:
-            try:
-                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
-                feed_map[topic_key] = summaries
-            except Exception as e:
-                logging.error(f"❌ [Topic summarization loop error for '{topic_key}']: {e}", exc_info=True)
-                feed_map[topic_key] = []
+
+        # This will hold the final structure you requested
+        final_feed_structured: Dict[str, Dict[int, Dict[str, Any]]] = {}
+        global_summary_id_counter = 1 # Initialize global counter for all summaries

-        final_feed = []
         for topic_display_name, topic_key in zip(TOPICS, TOPIC_KEYS):
-            topic_feed = feed_map.get(topic_key, [])
-            final_feed.append({
-                "topic": topic_display_name,
-                "feed": topic_feed
-            })
+            summaries_for_topic, updated_global_id = summarize_topic(
+                topic_key,
+                topic_docs.get(topic_key, []),
+                global_summary_id_counter # Pass the current global ID
+            )
+
+            # Update the global counter for the next topic
+            global_summary_id_counter = updated_global_id
+
+            # Store summaries in the desired {1: data, 2: data} format
+            topic_summary_map: Dict[int, Dict[str, Any]] = {}
+            for summary_item in summaries_for_topic:
+                # The 'id' key in summary_item already holds the sequential ID
+                topic_summary_map[summary_item["id"]] = {
+                    "summary": summary_item["summary"],
+                    "explanation": summary_item["explanation"],
+                    "image_url": summary_item["image_url"],
+                    "article_link": summary_item["article_link"],
+                    "representative_title": summary_item["representative_title"]
+                }
+
+            final_feed_structured[topic_key] = topic_summary_map

         # Cache to Redis
         try:
             cache_key = "daily_news_feed_cache"
-            redis_client.set(cache_key, json.dumps(final_feed, ensure_ascii=False))
+            # Dump the structured dictionary
+            redis_client.set(cache_key, json.dumps(final_feed_structured, ensure_ascii=False))
             redis_client.expire(cache_key, 86400)
             logging.info(f"✅ Cached feed under key '{cache_key}' with 24-hour expiry.")
         except Exception as e:
             logging.error(f"❌ [Redis cache error]: {e}", exc_info=True)

-        return final_feed
+        return final_feed_structured # Return the structured feed

     except Exception as e:
         logging.critical(f"❌ [generate_and_cache_daily_feed Overall Error]: {e}", exc_info=True)
-        return []
+        return {} # Return empty dict on overall error

 # 📦 Retrieve from cache
 def get_cached_daily_feed():
@@ -207,13 +216,13 @@ def get_cached_daily_feed():
             return json.loads(cached)
         else:
             logging.info(f"ℹ️ No cached data found under key '{cache_key}'.")
-            return []
+            return {} # Return empty dict if no cache
     except Exception as e:
         logging.error(f"❌ [get_cached_daily_feed Error]: {e}", exc_info=True)
-        return []
+        return {}

 # 🧪 Run if main
 if __name__ == "__main__":
     feed = generate_and_cache_daily_feed()
-    print("\n--- Generated Daily Feed ---")
+    print("\n--- Generated Daily Feed (Structured) ---")
     print(json.dumps(feed, indent=2, ensure_ascii=False))
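A minimal consumer sketch, not part of this commit, assuming components/generators/daily_feed.py is importable as components.generators.daily_feed and that the Redis cache has been populated; note that the json.dumps/json.loads round trip through Redis turns the integer summary ids into string keys:

# Minimal consumer sketch (assumes an importable module and a populated cache).
from components.generators.daily_feed import get_cached_daily_feed

feed = get_cached_daily_feed()  # {} when nothing is cached
for topic_key, summaries in feed.items():
    # The sequential ids come back as strings after the JSON round trip,
    # so sort them numerically before displaying.
    for summary_id in sorted(summaries, key=int):
        item = summaries[summary_id]
        print(f"[{summary_id}] {topic_key}: {item['summary']} -- {item['explanation']}")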