ragV98 committed on
Commit
315bd36
·
1 Parent(s): f8625a7
Files changed (1)
  1. components/generators/daily_feed.py +26 -63
components/generators/daily_feed.py CHANGED
@@ -10,7 +10,7 @@ from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilte
 
 # 🔐 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")  # Using REDIS_KEY for the cache key, assuming UPSTASH_REDIS_TOKEN is meant for the cache key here
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
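Note: as the removed inline comment concedes, UPSTASH_REDIS_TOKEN is an auth credential, not a cache key name, so stashing it in REDIS_KEY invites confusion. A minimal sketch of the cleaner split, assuming a hypothetical DAILY_FEED_CACHE_KEY environment variable:

    import os

    # Hypothetical split: the credential stays a credential, and the cache
    # key name gets its own variable with a sensible default.
    REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
    REDIS_TOKEN = os.environ.get("UPSTASH_REDIS_TOKEN")  # auth only
    CACHE_KEY = os.environ.get("DAILY_FEED_CACHE_KEY", "daily_news_feed_cache")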
@@ -22,6 +22,7 @@ except Exception as e:
 
 # 📰 Topic list
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
+# This list correctly generates 'india', 'world', etc.
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
 # 🧠 Summarization prompt
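For reference, the comprehension lowercases each display name and drops the " news" suffix, so the two lists stay index-aligned:

    >>> TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
    >>> [t.lower().replace(" news", "") for t in TOPICS]
    ['india', 'world', 'tech', 'finance', 'sports']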
@@ -38,23 +39,20 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
         vector_store = get_upstash_vector_store()
         print("💡 Successfully retrieved Upstash vector store.")
 
-        # --- ADD THESE PRINT STATEMENTS ---
+        # Debugging prints (keep them for now, they are useful)
         print(f"DEBUG: TOPICS = {TOPICS}")
         print(f"DEBUG: TOPIC_KEYS = {TOPIC_KEYS}")
         print(f"DEBUG: Length of TOPICS = {len(TOPICS)}")
         print(f"DEBUG: Length of TOPIC_KEYS = {len(TOPIC_KEYS)}")
-        # ----------------------------------
 
-        for topic, key in zip(TOPICS, TOPIC_KEYS):
+        for full_topic_name, topic_key_for_filter in zip(TOPICS, TOPIC_KEYS):
             try:
-                # Upstash VectorStore expects the filter value to match the exact string
-                # of the topic as it was indexed. Make sure your 'topic' metadata
-                # in Upstash exactly matches the values in TOPICS (e.g., "India news").
-
-                # Construct MetadataFilters object
+                # *** THE CRITICAL CHANGE IS HERE ***
+                # Use 'topic_key_for_filter' (e.g., "india") which matches your stored metadata
+                # instead of 'full_topic_name' (e.g., "India news").
                 filters = MetadataFilters(
                     filters=[
-                        MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)
+                        MetadataFilter(key="topic", value=topic_key_for_filter, operator=FilterOperator.EQ)
                     ]
                 )
 
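Note: the filter change only helps if the documents were indexed with the short key (e.g. "india") as their topic metadata. A sketch of the matching indexing side, assuming nodes are written through the same Upstash store (the real writer lives in components.indexers.news_indexer):

    from llama_index.core.schema import TextNode

    # Hypothetical indexing sketch: the stored metadata value must equal the
    # value passed to MetadataFilter above ("india", not "India news").
    node = TextNode(
        text="Example article body ...",
        metadata={"topic": "india"},
    )
    # vector_store.add([node])  # same store the query side reads from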
@@ -65,27 +63,21 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
                     filters=filters  # Apply the metadata filter
                 )
 
-                # Removed the problematic .to_dict() call
-                print(f"🔎 Querying Upstash for topic: '{topic}'")
+                print(f"🔎 Querying Upstash for topic: '{full_topic_name}' using filter value '{topic_key_for_filter}'")
                 result = vector_store.query(query)
-                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{topic}'.")
+                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{full_topic_name}'.")
 
                 for node in result.nodes:
                     content = node.get_content().strip()
                     if content:
-                        topic_docs[key].append(content)
+                        topic_docs[topic_key_for_filter].append(content)
                     # Optional: Print metadata to verify filtering
                     # print(f"  Node metadata: {node.metadata}")
             except Exception as e:
-                print(f"❌ [Topic Metadata Filter error for '{topic}']: {e}")
-                # Optional: Log the full traceback for more detailed debugging
-                # import traceback
-                # traceback.print_exc()
+                print(f"❌ [Topic Metadata Filter error for '{full_topic_name}']: {e}")
 
     except Exception as e:
         print("❌ [load_all_documents_grouped_by_topic Error]", e)
-        # import traceback
-        # traceback.print_exc()
 
     return topic_docs
 
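The opening of the query construction falls outside the hunk; given the visible filters= argument, it presumably reads roughly as below (the query string and top-k are placeholders, not taken from the commit):

    # Hypothetical reconstruction of the hidden lines above 'filters=filters'.
    query = VectorStoreQuery(
        query_str=full_topic_name,  # assumed
        similarity_top_k=10,        # placeholder
        filters=filters,            # apply the metadata filter
    )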
@@ -97,38 +89,34 @@ def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
 
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
-        # Join documents, ensuring we don't exceed typical GPT-4 context window (approx 128k tokens, 12000 chars is safe)
         content = "\n\n---\n\n".join(docs)[:12000]
 
         print(f"🧠 Summarizing topic via OpenAI: {topic_key} ({len(docs)} documents)")
         completion = client.chat.completions.create(
-            model="gpt-4",  # Or "gpt-4o" for potentially better performance
+            model="gpt-4",
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
-            max_tokens=512,  # Enough tokens for 3 punchy headlines
-            temperature=0.7,  # A bit creative but focused
+            max_tokens=512,
+            temperature=0.7,
         )
 
         text = completion.choices[0].message.content.strip()
 
         summaries = []
-        # Parse the headlines, assuming they might be bullet points or lines
         for line in text.splitlines():
-            line = line.strip("-–‒ ")  # Remove common bullet characters
+            line = line.strip("-–‒ ")
            if line:
                 summaries.append({
                     "summary": line,
-                    "image_url": "https://source.unsplash.com/800x600/?news",  # Generic image, could be improved
-                    "article_link": f"https://google.com/search?q={topic_key}+news"  # Generic search link
+                    "image_url": "https://source.unsplash.com/800x600/?news",
+                    "article_link": f"https://google.com/search?q={topic_key}+news"
                 })
         return summaries
 
     except Exception as e:
         print(f"❌ [OpenAI Summarization Error for '{topic_key}']: {e}")
-        # import traceback
-        # traceback.print_exc()
         return []
 
 # 🚀 Main callable
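Note: the removed comment in this hunk conflates characters with tokens, and plain gpt-4 has an 8k-token window (128k belongs to gpt-4-turbo); 12,000 characters is roughly 3k tokens, so it fits either way. If the budget ever matters, a token-based cut is the safer sketch, assuming tiktoken is installed:

    import tiktoken

    def truncate_to_tokens(text: str, max_tokens: int = 6000, model: str = "gpt-4") -> str:
        # Encode, clip to the token budget, and decode back to text.
        enc = tiktoken.encoding_for_model(model)
        return enc.decode(enc.encode(text)[:max_tokens])

    # content = truncate_to_tokens("\n\n---\n\n".join(docs))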
@@ -144,37 +132,30 @@ def generate_and_cache_daily_feed():
             feed_map[topic_key] = summaries
         except Exception as e:
             print(f"❌ [Topic Loop Error for '{topic_key}']: {e}")
-            # import traceback
-            # traceback.print_exc()
             feed_map[topic_key] = []
 
-        final_feed = [{"topic": topic, "feed": feed_map[topic_key]} for topic, topic_key in zip(TOPICS, TOPIC_KEYS)]
+        # When creating final_feed, use TOPICS for the display name but TOPIC_KEYS for mapping
+        final_feed = [{"topic": display_name, "feed": feed_map[actual_key]}
+                      for display_name, actual_key in zip(TOPICS, TOPIC_KEYS)]
 
         try:
-            # Ensure the REDIS_KEY is suitable for a key name (e.g., not an API token itself)
-            # You might want a separate environment variable for the cache key, e.g., DAILY_FEED_CACHE_KEY
-            cache_key_name = "daily_news_feed_cache"  # A more descriptive key
+            cache_key_name = "daily_news_feed_cache"
             redis_client.set(cache_key_name, json.dumps(final_feed, ensure_ascii=False))
-            # Set an expiry for the cache, e.g., 24 hours (86400 seconds)
             redis_client.expire(cache_key_name, 86400)
             print(f"✅ Cached daily feed under key '{cache_key_name}' with 24-hour expiry.")
         except Exception as e:
             print("❌ [Redis Cache Error]", e)
-            # import traceback
-            # traceback.print_exc()
 
         return final_feed
 
     except Exception as e:
         print("❌ [generate_and_cache_daily_feed Overall Error]", e)
-        # import traceback
-        # traceback.print_exc()
         return []
 
 # 📦 Get cached data
 def get_cached_daily_feed():
     try:
-        cache_key_name = "daily_news_feed_cache"  # Use the same key name as in generate_and_cache_daily_feed
+        cache_key_name = "daily_news_feed_cache"
         cached = redis_client.get(cache_key_name)
         if cached:
             print(f"✅ Retrieved cached daily feed from '{cache_key_name}'.")
@@ -184,37 +165,19 @@ def get_cached_daily_feed():
         return []
     except Exception as e:
         print("❌ [get_cached_daily_feed Error]", e)
-        # import traceback
-        # traceback.print_exc()
         return []
 
 # Example of how to run it (for testing purposes, if this were the main script)
 if __name__ == "__main__":
     # Ensure your environment variables are set before running
     # os.environ["UPSTASH_REDIS_URL"] = "your_upstash_redis_url"
-    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"  # This should ideally be a unique key for caching, not the token
+    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"
     # os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
 
-    # For the UPSTASH_REDIS_TOKEN environment variable, if it's truly a Redis token
-    # that shouldn't be used as a cache key, you should define a separate environment
-    # variable for the cache key, or use a hardcoded string as I've done with "daily_news_feed_cache".
-    # For Upstash Vector connection, ensure UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN
-    # are configured in your `components.indexers.news_indexer.py`'s `get_upstash_vector_store` function.
-
-    # Generate and cache the feed
     generated_feed = generate_and_cache_daily_feed()
     print("\n--- Generated and Cached Feed ---")
-    # for item in generated_feed:
-    #     print(f"Topic: {item['topic']}")
-    #     for summary in item['feed']:
-    #         print(f"  - {summary['summary']}")
-    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))  # For full output
+    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))
 
-    # Retrieve from cache
     cached_feed = get_cached_daily_feed()
     print("\n--- Retrieved from Cache ---")
-    # for item in cached_feed:
-    #     print(f"Topic: {item['topic']}")
-    #     for summary in item['feed']:
-    #         print(f"  - {summary['summary']}")
-    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))  # For full output
+    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))
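The body between the 'Retrieved' print and the fallback return is outside the hunk; presumably the cached JSON string is decoded before being handed back, along these lines:

    # Hypothetical hidden branch: decode the cached payload back into Python objects.
    if cached:
        return json.loads(cached)
    return []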
 