ragV98 committed
Commit e4a76c1 · 1 Parent(s): a0be762

sequential headlines
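For orientation, a minimal sketch of the cached feed shape this commit produces, assuming the topic keys and summary fields defined in daily_feed.py; the ids and headline text below are invented, and the ids run sequentially across all topics rather than restarting per topic:

# Hypothetical shape of the cached feed after this change (values invented).
# Each entry also carries "image_url", "article_link" and "representative_title".
{
    "india": {
        1: {"summary": "First headline", "explanation": "Why it matters."},
        2: {"summary": "Second headline", "explanation": "Why it matters."},
        3: {"summary": "Third headline", "explanation": "Why it matters."}
    },
    "world": {
        4: {"summary": "Fourth headline", "explanation": "Why it matters."}
    }
}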

Files changed (1)
  1. components/generators/daily_feed.py +57 -48
components/generators/daily_feed.py CHANGED
@@ -2,12 +2,12 @@ import os
 import json
 import redis
 import numpy as np
-from typing import List, Dict
+from typing import List, Dict, Any
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 import logging
-import re
+import re

 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,7 +27,7 @@ except Exception as e:
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]

-# 🧠 Summarization Prompt - REVISED (no change from previous)
+# 🧠 Summarization Prompt
 BASE_PROMPT = (
     "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
     "Generate exactly 3 punchy headlines, each under 20 words. Each headline must be immediately followed by a concise, single-sentence explanation of why the story matters. Do NOT include phrases like 'this is important because', 'why this matters', 'explanation:', etc. Just state the logic directly."
@@ -40,6 +40,8 @@ BASE_PROMPT = (
 )

 # 📥 Load documents and metadata
+# This function will now only return 'text', 'title', 'url', 'source'
+# We remove 'headline_id' from this output as it will be newly generated for summaries
 def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
     topic_docs = {key: [] for key in TOPIC_KEYS}
     logging.info("Starting to load documents by topic from Upstash Vector Store...")
@@ -58,34 +60,35 @@ def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:

             for node in result.nodes:
                 content = node.get_content().strip()
-                headline_id = node.metadata.get("headline_id")
+                # We no longer need to retrieve headline_id here for the summarizer's purpose
+                # headline_id = node.metadata.get("headline_id")

                 title = node.metadata.get("title", "No Title")
                 url = node.metadata.get("url", "#")
                 source = node.metadata.get("source", "Unknown Source")

-                if content and headline_id is not None:
+                if content: # No longer checking for headline_id here
                     topic_docs[topic_key_for_filter].append({
                         "text": content,
-                        "headline_id": headline_id,
+                        # "headline_id": headline_id, # Removed
                         "title": title,
                         "url": url,
                         "source": source
                     })
-                elif content and headline_id is None:
-                    logging.warning(f"Node found without 'headline_id' for topic '{full_topic_name}': URL {node.metadata.get('url', 'N/A')}")
+                # Removed the warning for missing headline_id since we are not relying on it here

     except Exception as e:
         logging.error(f"❌ [load_docs_by_topic_with_refs Error]: {e}", exc_info=True)
     return topic_docs

 # 🧪 Topic summarizer
-def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:
+# Now accepts 'current_global_id' to assign sequential IDs
+def summarize_topic(topic_key: str, docs: List[Dict], current_global_id: int) -> List[Dict]:
     if not docs:
         logging.warning(f"⚠️ No docs for topic: {topic_key}, skipping summarization.")
-        return []
+        return [], current_global_id # Return empty list and unchanged ID

-    representative_headline_id = docs[0].get("headline_id") if docs else None
+    # These representative fields are for generic summary context if no specific link
     representative_article_link = docs[0].get("url") if docs else f"https://google.com/search?q={topic_key}+news"
     representative_title = docs[0].get("title") if docs else f"Summary for {topic_key}"

@@ -93,7 +96,7 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:

     if not content:
         logging.warning(f"⚠️ No valid text content found in docs for topic: {topic_key}, skipping summarization.")
-        return []
+        return [], current_global_id

     content = content[:12000] # Truncate to avoid excessive token usage

@@ -113,89 +116,95 @@ def summarize_topic(topic_key: str, docs: List[Dict]) -> List[Dict]:

         logging.info(f"Raw LLM output for topic '{topic_key}':\n---\n{llm_output}\n---")

-        headlines = []
+        parsed_summaries = [] # Renamed for clarity
         for line in llm_output.splitlines():
             line = line.strip()
             if not line:
                 continue

-            # --- THE CRITICAL REGEX FIX ---
-            # Pattern:
-            # ^(?:[->•\d\.]+\s*)? -> Optional leading bullet/number
-            # (.*?) -> Non-greedy capture for headline (any characters until --)
-            # \s*--\s* -> The separator "--" with optional whitespace
-            # (.*) -> Greedy capture for explanation (rest of the line)
             match = re.match(r'^(?:[->•\d\.]+\s*)?(.*?)\s*--\s*(.*)$', line)

             if match:
                 headline_text = match.group(1).strip()
                 explanation_text = match.group(2).strip()

-                # Further clean explanation_text if LLM adds unwanted intros despite prompt
                 explanation_text = re.sub(r'^(?:this is important because|why this matters because|this matters because|reason:|significance:)\s*', '', explanation_text, flags=re.IGNORECASE).strip()

-                # Basic validation: ensure both parts are reasonably non-empty
-                if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3: # Headline at least 2 words, explanation at least 3 words
-                    headlines.append({"summary": headline_text, "explanation": explanation_text})
+                if len(headline_text.split()) >= 2 and len(explanation_text.split()) >= 3:
+                    parsed_summaries.append({"summary": headline_text, "explanation": explanation_text})
                 else:
                     logging.warning(f"Skipping line due to short/empty headline or explanation after parsing: '{line}' for topic '{topic_key}'.")
             else:
                 logging.warning(f"Could not parse line: '{line}' for topic '{topic_key}'. Does it match 'Headline -- Explanation' format?")

         result = []
-        for h_item in headlines:
+        # Assign new sequential IDs here
+        for h_item in parsed_summaries:
             result.append({
                 "summary": h_item["summary"],
                 "explanation": h_item["explanation"],
-                "headline_id": representative_headline_id,
+                "id": current_global_id, # Assign the new sequential ID
                 "image_url": "https://source.unsplash.com/800x600/?news",
                 "article_link": representative_article_link,
                 "representative_title": representative_title
             })
+            current_global_id += 1 # Increment for the next summary

         logging.info(f"✅ Successfully generated {len(result)} summaries for topic '{topic_key}'.")
-        return result
+        return result, current_global_id # Return the summaries and the updated global ID
     except Exception as e:
         logging.error(f"❌ [Summarize topic '{topic_key}' Error]: {e}", exc_info=True)
-        return []
+        return [], current_global_id # Return empty and unchanged ID on error

 # 🚀 Generate and cache feed
 def generate_and_cache_daily_feed():
     try:
         logging.info("🆕 Generating daily feed...")
         topic_docs = load_docs_by_topic_with_refs()
-        feed_map = {}
-
-        for topic_key in TOPIC_KEYS:
-            try:
-                summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
-                feed_map[topic_key] = summaries
-            except Exception as e:
-                logging.error(f"❌ [Topic summarization loop error for '{topic_key}']: {e}", exc_info=True)
-                feed_map[topic_key] = []
+
+        # This will hold the final structure you requested
+        final_feed_structured: Dict[str, Dict[int, Dict[str, Any]]] = {}
+        global_summary_id_counter = 1 # Initialize global counter for all summaries

-        final_feed = []
         for topic_display_name, topic_key in zip(TOPICS, TOPIC_KEYS):
-            topic_feed = feed_map.get(topic_key, [])
-            final_feed.append({
-                "topic": topic_display_name,
-                "feed": topic_feed
-            })
+            summaries_for_topic, updated_global_id = summarize_topic(
+                topic_key,
+                topic_docs.get(topic_key, []),
+                global_summary_id_counter # Pass the current global ID
+            )
+
+            # Update the global counter for the next topic
+            global_summary_id_counter = updated_global_id
+
+            # Store summaries in the desired {1: data, 2: data} format
+            topic_summary_map: Dict[int, Dict[str, Any]] = {}
+            for summary_item in summaries_for_topic:
+                # The 'id' key in summary_item already holds the sequential ID
+                topic_summary_map[summary_item["id"]] = {
+                    "summary": summary_item["summary"],
+                    "explanation": summary_item["explanation"],
+                    "image_url": summary_item["image_url"],
+                    "article_link": summary_item["article_link"],
+                    "representative_title": summary_item["representative_title"]
+                }
+
+            final_feed_structured[topic_key] = topic_summary_map

         # Cache to Redis
         try:
             cache_key = "daily_news_feed_cache"
-            redis_client.set(cache_key, json.dumps(final_feed, ensure_ascii=False))
+            # Dump the structured dictionary
+            redis_client.set(cache_key, json.dumps(final_feed_structured, ensure_ascii=False))
             redis_client.expire(cache_key, 86400)
             logging.info(f"✅ Cached feed under key '{cache_key}' with 24-hour expiry.")
         except Exception as e:
             logging.error(f"❌ [Redis cache error]: {e}", exc_info=True)

-        return final_feed
+        return final_feed_structured # Return the structured feed

     except Exception as e:
         logging.critical(f"❌ [generate_and_cache_daily_feed Overall Error]: {e}", exc_info=True)
-        return []
+        return {} # Return empty dict on overall error

 # 📦 Retrieve from cache
 def get_cached_daily_feed():
@@ -207,13 +216,13 @@ def get_cached_daily_feed():
             return json.loads(cached)
         else:
             logging.info(f"ℹ️ No cached data found under key '{cache_key}'.")
-            return []
+            return {} # Return empty dict if no cache
     except Exception as e:
         logging.error(f"❌ [get_cached_daily_feed Error]: {e}", exc_info=True)
-        return []
+        return {}

 # 🧪 Run if main
 if __name__ == "__main__":
     feed = generate_and_cache_daily_feed()
-    print("\n--- Generated Daily Feed ---")
+    print("\n--- Generated Daily Feed (Structured) ---")
     print(json.dumps(feed, indent=2, ensure_ascii=False))
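A minimal consumer sketch, not part of this commit, assuming components/generators/daily_feed.py is importable as components.generators.daily_feed and that the Redis cache has been populated; note that the json.dumps/json.loads round trip through Redis turns the integer summary ids into string keys:

# Minimal consumer sketch (assumes an importable module and a populated cache).
from components.generators.daily_feed import get_cached_daily_feed

feed = get_cached_daily_feed()  # {} when nothing is cached
for topic_key, summaries in feed.items():
    # The sequential ids come back as strings after the JSON round trip,
    # so sort them numerically before displaying.
    for summary_id in sorted(summaries, key=int):
        item = summaries[summary_id]
        print(f"[{summary_id}] {topic_key}: {item['summary']} -- {item['explanation']}")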