sequential headlines
components/generators/daily_feed.py
CHANGED
@@ -2,12 +2,12 @@ import os
 import json
 import redis
 import numpy as np
-from typing import List, Dict
+from typing import List, Dict, Any
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 import logging
-import re
+import re

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,7 +27,7 @@ except Exception as e:
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]

-# 🧠 Summarization Prompt
+# 🧠 Summarization Prompt
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
     "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
@@ -40,6 +40,8 @@ BASE_PROMPT = (
 )

 # 📥 Load documents and metadata
+# This function will now only return 'text', 'title', 'url', 'source'
+# We remove 'headline_id' from this output as it will be newly generated for summaries
 def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     logging.info("Starting to load documents by topic from Upstash Vector Store...")
@@ -58,34 +60,35 @@ def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:

             for node in result.nodes:
                 content = node.get_content().strip()
-                headline_id = node.metadata.get("headline_id")
+                # We no longer need to retrieve headline_id here for the summarizer's purpose
+                # headline_id = node.metadata.get("headline_id")

                 title = node.metadata.get("title", "No Title")
                 url = node.metadata.get("url", "#")
                 source = node.metadata.get("source", "Unknown Source")

-                if content and headline_id:
+                if content:  # No longer checking for headline_id here
                     topic_docs[topic_key_for_filter].append({
                         "text": content,
-                        "headline_id": headline_id,
+                        # "headline_id": headline_id,  # Removed
                         "title": title,
                         "url": url,
                         "source": source
                     })
-                else:
-                    logging.warning(f"Node found without 'headline_id' for topic '{full_topic_name}': URL {node.metadata.get('url', 'N/A')}")
+                # Removed the warning for missing headline_id since we are not relying on it here

     except Exception as e:
         logging.error(f"❌ [load_docs_by_topic_with_refs Error]: {e}", exc_info=True)
     return topic_docs

 # 🧪 Topic summarizer
-def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
+# Now accepts 'current_global_id' to assign sequential IDs
+def summarize_topic(topic_key: str, docs: List[Dict], current_global_id: int) -> List[Dict]:
     if not docs:
         logging.warning(f"⚠️ No docs for topic: {topic_key}, skipping summarization.")
-        return []
+        return [], current_global_id  # Return empty list and unchanged ID

-
+    # These representative fields are for generic summary context if no specific link
     representative_article_link = docs[0].get("url") if docs else f"https://google.com/search?q={topic_key}+news"
     representative_title = docs[0].get("title") if docs else f"Summary for {topic_key}"

@@ -93,7 +96,7 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:

     if not content:
         logging.warning(f"⚠️ No valid text content found in docs for topic: {topic_key}, skipping summarization.")
-        return []
+        return [], current_global_id

     content = content[:12000]  # Truncate to avoid excessive token usage

@@ -113,89 +116,95 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:

         logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")

-        headlines = []
+        parsed_summaries = []  # Renamed for clarity
         for line in llm_output.splitlines():
             line = line.strip()
             if not line:
                 continue

-            # --- THE CRITICAL REGEX FIX ---
-            # Pattern:
-            # ^(?:[->•\d\.]+\s*)? -> Optional leading bullet/number
-            # (.*?) -> Non-greedy capture for headline (any characters until --)
-            # \s*--\s* -> The separator "--" with optional whitespace
-            # (.*) -> Greedy capture for explanation (rest of the line)
             match = re.match(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$', line)

             if match:
                 headline_text = match.group(1).strip()
                 explanation_text = match.group(2).strip()

-                # Further clean explanation_text if LLM adds unwanted intros despite prompt
                 explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()

-
-
-                    headlines.append({"summary": headline_text, "explanation": explanation_text})
+                if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3:
+                    parsed_summaries.append({"summary": headline_text, "explanation": explanation_text})
                 else:
                     logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")
             else:
                 logging.warning(f"Could not parse line: '{line}' for topic '{topic_key}'. Does it match 'Headline -- Explanation' format?")

         result = []
-        for h_item in headlines:
+        # Assign new sequential IDs here
+        for h_item in parsed_summaries:
             result.append({
                 "summary": h_item["summary"],
                 "explanation": h_item["explanation"],
-                "
+                "id": current_global_id,  # Assign the new sequential ID
                 "image_url": "https://source.unsplash.com/800x600/?news",
                 "article_link": representative_article_link,
                 "representative_title": representative_title
             })
+            current_global_id += 1  # Increment for the next summary

         logging.info(f"✅ Successfully generated {len(result)} summaries for topic '{topic_key}'.")
-        return result
+        return result, current_global_id  # Return the summaries and the updated global ID
     except Exception as e:
         logging.error(f"❌ [Summarize topic '{topic_key}' Error]: {e}", exc_info=True)
-        return []
+        return [], current_global_id  # Return empty and unchanged ID on error

 # 🚀 Generate and cache feed
 def generate_and_cache_daily_feed():
     try:
         logging.info("🆕 Generating daily feed...")
         topic_docs = load_docs_by_topic_with_refs()
-
-
-
-
-                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
-                feed_map[topic_key] = summaries
-            except Exception as e:
-                logging.error(f"❌ [Topic summarization loop error for '{topic_key}']: {e}", exc_info=True)
-                feed_map[topic_key] = []
+
+        # This will hold the final structure you requested
+        final_feed_structured: Dict[str, Dict[int, Dict[str, Any]]] = {}
+        global_summary_id_counter = 1  # Initialize global counter for all summaries

-        final_feed = []
         for topic_display_name, topic_key in zip(TOPICS, TOPIC_KEYS):
-
-
-
-
-
+            summaries_for_topic, updated_global_id = summarize_topic(
+                topic_key,
+                topic_docs.get(topic_key, []),
+                global_summary_id_counter  # Pass the current global ID
+            )
+
+            # Update the global counter for the next topic
+            global_summary_id_counter = updated_global_id
+
+            # Store summaries in the desired {1: data, 2: data} format
+            topic_summary_map: Dict[int, Dict[str, Any]] = {}
+            for summary_item in summaries_for_topic:
+                # The 'id' key in summary_item already holds the sequential ID
+                topic_summary_map[summary_item["id"]] = {
+                    "summary": summary_item["summary"],
+                    "explanation": summary_item["explanation"],
+                    "image_url": summary_item["image_url"],
+                    "article_link": summary_item["article_link"],
+                    "representative_title": summary_item["representative_title"]
+                }
+
+            final_feed_structured[topic_key] = topic_summary_map

         # Cache to Redis
         try:
             cache_key = "daily_news_feed_cache"
-
+            # Dump the structured dictionary
+            redis_client.set(cache_key, json.dumps(final_feed_structured, ensure_ascii=False))
             redis_client.expire(cache_key, 86400)
             logging.info(f"✅ Cached feed under key '{cache_key}' with 24-hour expiry.")
         except Exception as e:
             logging.error(f"❌ [Redis cache error]: {e}", exc_info=True)

-        return
+        return final_feed_structured  # Return the structured feed

     except Exception as e:
         logging.critical(f"❌ [generate_and_cache_daily_feed Overall Error]: {e}", exc_info=True)
-        return
+        return {}  # Return empty dict on overall error

 # 📦 Retrieve from cache
 def get_cached_daily_feed():
@@ -207,13 +216,13 @@ def get_cached_daily_feed():
             return json.loads(cached)
         else:
             logging.info(f"ℹ️ No cached data found under key '{cache_key}'.")
-            return
+            return {}  # Return empty dict if no cache
     except Exception as e:
         logging.error(f"❌ [get_cached_daily_feed Error]: {e}", exc_info=True)
-        return
+        return {}

 # 🧪 Run if main
 if __name__ == "__main__":
     feed = generate_and_cache_daily_feed()
-    print("\n--- Generated Daily Feed ---")
+    print("\n--- Generated Daily Feed (Structured) ---")
     print(json.dumps(feed, indent=2, ensure_ascii=False))
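For reference, a minimal sketch of how the `Headline -- Explanation` parsing in `summarize_topic` behaves. The regex is copied from the diff; the sample LLM lines are invented for illustration.

```python
import re

# Hypothetical LLM output lines, invented for illustration only.
sample_lines = [
    "1. Markets rally on rate-cut hopes -- Cheaper borrowing could lift consumer spending.",
    "- Monsoon arrives early in Kerala -- Early rains shape sowing decisions across India.",
    "A line with no separator, which should be skipped",
]

# Same pattern as in the diff: optional leading bullet/number, headline, "--" separator, explanation.
pattern = re.compile(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$')

for line in sample_lines:
    match = pattern.match(line.strip())
    if match:
        headline, explanation = match.group(1).strip(), match.group(2).strip()
        print({"summary": headline, "explanation": explanation})
    else:
        print(f"Could not parse: {line!r}")
```

Lines without the `--` separator fail to match and are only logged as warnings, which is why the prompt insists on the exact `Headline -- Explanation` format.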
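With these changes the cached payload maps each topic key to a map of sequential, globally numbered summary IDs. A minimal sketch of the shape, with all values invented for illustration:

```python
import json

# Invented example of the structure cached under "daily_news_feed_cache";
# real summaries, links, and titles depend on the indexed articles.
example_feed = {
    "india": {
        1: {
            "summary": "Example headline about India",
            "explanation": "One sentence on why the story matters.",
            "image_url": "https://source.unsplash.com/800x600/?news",
            "article_link": "https://example.com/article",
            "representative_title": "Example article title",
        },
    },
    "world": {
        2: {
            "summary": "Example world headline",
            "explanation": "One sentence on why it matters.",
            "image_url": "https://source.unsplash.com/800x600/?news",
            "article_link": "https://example.com/article-2",
            "representative_title": "Another article title",
        },
    },
}

# json.dumps/json.loads turn the integer IDs into string keys, so a consumer of
# get_cached_daily_feed() would see {"india": {"1": {...}}, "world": {"2": {...}}}.
round_tripped = json.loads(json.dumps(example_feed, ensure_ascii=False))
print(list(round_tripped["india"].keys()))  # ['1']
```

Because the counter is threaded through `summarize_topic`, IDs continue across topics rather than restarting at 1 for each topic.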