ragV98 committed
Commit e51955e · Parent(s): 9f27402

let's try again

Files changed (1)
  1. components/generators/daily_feed.py +97 -20
components/generators/daily_feed.py CHANGED
@@ -6,11 +6,11 @@ from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core import StorageContext
-from llama_index.core.vector_stores.types import VectorStoreQuery
+from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator

 # 🔐 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")  # Using REDIS_KEY for the cache key, assuming UPSTASH_REDIS_TOKEN is meant for the cache key here
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

 # ✅ Redis client
@@ -36,64 +36,91 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:

     try:
         vector_store = get_upstash_vector_store()
+        print("💡 Successfully retrieved Upstash vector store.")

         for topic, key in zip(TOPICS, TOPIC_KEYS):
             try:
+                # Upstash VectorStore expects the filter value to match the exact string
+                # of the topic as it was indexed. Make sure your 'topic' metadata
+                # in Upstash exactly matches the values in TOPICS (e.g., "India news").
+
+                # Construct MetadataFilters object
+                filters = MetadataFilters(
+                    filters=[
+                        MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)
+                    ]
+                )
+
                 dummy_vector = np.random.rand(384).tolist()  # Assuming MiniLM embeddings
                 query = VectorStoreQuery(
                     query_embedding=dummy_vector,
-                    similarity_top_k=50,
-                    filters={"topic": topic}
+                    similarity_top_k=50,  # Retrieve enough documents for summarization
+                    filters=filters  # Apply the metadata filter
                 )
+
+                print(f"🔎 Querying Upstash for topic: '{topic}' with filters: {filters.to_dict()}")
                 result = vector_store.query(query)
+                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{topic}'.")

                 for node in result.nodes:
                     content = node.get_content().strip()
                     if content:
                         topic_docs[key].append(content)
+                        # Optional: Print metadata to verify filtering
+                        # print(f"  Node metadata: {node.metadata}")
             except Exception as e:
-                print(f"❌ [Topic Metadata Filter error: {key}]", e)
+                print(f"❌ [Topic Metadata Filter error for '{topic}']: {e}")
+                # Optional: Log the full traceback for more detailed debugging
+                # import traceback
+                # traceback.print_exc()

     except Exception as e:
         print("❌ [load_all_documents_grouped_by_topic Error]", e)
+        # import traceback
+        # traceback.print_exc()

     return topic_docs

 # 🧪 Summarize one topic at a time using OpenAI GPT-4
 def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
     if not docs:
-        print(f"⚠️ No docs found for topic: {topic_key}")
+        print(f"⚠️ No docs found for topic: {topic_key}, skipping summarization.")
         return []

     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
+        # Join documents, ensuring we don't exceed typical GPT-4 context window (approx 128k tokens, 12000 chars is safe)
         content = "\n\n---\n\n".join(docs)[:12000]

-        print(f"🧠 Summarizing topic via OpenAI: {topic_key}")
+        print(f"🧠 Summarizing topic via OpenAI: {topic_key} ({len(docs)} documents)")
         completion = client.chat.completions.create(
-            model="gpt-4",
+            model="gpt-4",  # Or "gpt-4o" for potentially better performance
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
-            max_tokens=512,
+            max_tokens=512,  # Enough tokens for 3 punchy headlines
+            temperature=0.7,  # A bit creative but focused
         )

         text = completion.choices[0].message.content.strip()

         summaries = []
+        # Parse the headlines, assuming they might be bullet points or lines
         for line in text.splitlines():
-            line = line.strip("-–• ")
+            line = line.strip("-–• ")  # Remove common bullet characters
             if line:
                 summaries.append({
                     "summary": line,
-                    "image_url": "https://source.unsplash.com/800x600/?news",
-                    "article_link": f"https://google.com/search?q={topic_key}+news"
+                    "image_url": "https://source.unsplash.com/800x600/?news",  # Generic image, could be improved
+                    "article_link": f"https://google.com/search?q={topic_key}+news"  # Generic search link
                 })
         return summaries

     except Exception as e:
-        print(f"❌ [OpenAI Summarization Error: {topic_key}]", e)
+        print(f"❌ [OpenAI Summarization Error for '{topic_key}']: {e}")
+        # import traceback
+        # traceback.print_exc()
         return []

 # 🚀 Main callable
@@ -108,28 +135,78 @@ def generate_and_cache_daily_feed():
                 summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
                 feed_map[topic_key] = summaries
             except Exception as e:
-                print(f"❌ [Topic Loop Error: {topic_key}]", e)
+                print(f"❌ [Topic Loop Error for '{topic_key}']: {e}")
+                # import traceback
+                # traceback.print_exc()
                 feed_map[topic_key] = []

-        final_feed = [{"topic": topic, "feed": feed_map[topic]} for topic in TOPIC_KEYS]
+        final_feed = [{"topic": topic, "feed": feed_map[topic_key]} for topic, topic_key in zip(TOPICS, TOPIC_KEYS)]

         try:
-            redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
-            print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
+            # Ensure the REDIS_KEY is suitable for a key name (e.g., not an API token itself)
+            # You might want a separate environment variable for the cache key, e.g., DAILY_FEED_CACHE_KEY
+            cache_key_name = "daily_news_feed_cache"  # A more descriptive key
+            redis_client.set(cache_key_name, json.dumps(final_feed, ensure_ascii=False))
+            # Set an expiry for the cache, e.g., 24 hours (86400 seconds)
+            redis_client.expire(cache_key_name, 86400)
+            print(f"✅ Cached daily feed under key '{cache_key_name}' with 24-hour expiry.")
         except Exception as e:
             print("❌ [Redis Cache Error]", e)
+            # import traceback
+            # traceback.print_exc()

         return final_feed

     except Exception as e:
-        print("❌ [generate_and_cache_daily_feed Error]", e)
+        print("❌ [generate_and_cache_daily_feed Overall Error]", e)
+        # import traceback
+        # traceback.print_exc()
         return []

 # 📦 Get cached data
 def get_cached_daily_feed():
     try:
-        cached = redis_client.get(REDIS_KEY)
-        return json.loads(cached) if cached else []
+        cache_key_name = "daily_news_feed_cache"  # Use the same key name as in generate_and_cache_daily_feed
+        cached = redis_client.get(cache_key_name)
+        if cached:
+            print(f"✅ Retrieved cached daily feed from '{cache_key_name}'.")
+            return json.loads(cached)
+        else:
+            print(f"ℹ️ No cached data found under key '{cache_key_name}'.")
+            return []
     except Exception as e:
         print("❌ [get_cached_daily_feed Error]", e)
+        # import traceback
+        # traceback.print_exc()
         return []
+
+# Example of how to run it (for testing purposes, if this were the main script)
+if __name__ == "__main__":
+    # Ensure your environment variables are set before running
+    # os.environ["UPSTASH_REDIS_URL"] = "your_upstash_redis_url"
+    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"  # This should ideally be a unique key for caching, not the token
+    # os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
+
+    # For the UPSTASH_REDIS_TOKEN environment variable, if it's truly a Redis token
+    # that shouldn't be used as a cache key, you should define a separate environment
+    # variable for the cache key, or use a hardcoded string as I've done with "daily_news_feed_cache".
+    # For Upstash Vector connection, ensure UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN
+    # are configured in your `components.indexers.news_indexer.py`'s `get_upstash_vector_store` function.
+
+    # Generate and cache the feed
+    generated_feed = generate_and_cache_daily_feed()
+    print("\n--- Generated and Cached Feed ---")
+    # for item in generated_feed:
+    #     print(f"Topic: {item['topic']}")
+    #     for summary in item['feed']:
+    #         print(f"  - {summary['summary']}")
+    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))  # For full output
+
+    # Retrieve from cache
+    cached_feed = get_cached_daily_feed()
+    print("\n--- Retrieved from Cache ---")
+    # for item in cached_feed:
+    #     print(f"Topic: {item['topic']}")
+    #     for summary in item['feed']:
+    #         print(f"  - {summary['summary']}")
+    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))  # For full output
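
The core change of this commit is the switch from a plain dict filter to a MetadataFilters object on the Upstash query. The standalone sketch below is not part of the commit; it is a minimal way to sanity-check that filtered querying works in isolation. It assumes the repo's get_upstash_vector_store() helper returns a llama_index-compatible Upstash vector store, that documents were indexed with a "topic" metadata key and 384-dimensional (MiniLM-style) embeddings, and the "India news" topic string is only a hypothetical example; the value must match the indexed string exactly, which is the failure mode the commit's comments warn about.

# sanity_check_topic_filter.py - minimal sketch, not part of the commit
import numpy as np

from components.indexers.news_indexer import get_upstash_vector_store
from llama_index.core.vector_stores.types import (
    VectorStoreQuery,
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

def preview_topic(topic: str, top_k: int = 5) -> None:
    """Run one metadata-filtered query and print each node's 'topic' metadata."""
    vector_store = get_upstash_vector_store()
    filters = MetadataFilters(
        filters=[MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)]
    )
    query = VectorStoreQuery(
        query_embedding=np.random.rand(384).tolist(),  # dummy vector; assumes a 384-dim index
        similarity_top_k=top_k,
        filters=filters,
    )
    result = vector_store.query(query)
    # If the filter string does not exactly match the indexed metadata value,
    # this returns no nodes at all rather than raising an error.
    for node in result.nodes or []:
        print(node.metadata.get("topic"), "->", node.get_content()[:80])

if __name__ == "__main__":
    preview_topic("India news")  # hypothetical topic string; must match the indexed value

Printing the "topic" metadata on each returned node makes it easy to confirm that the FilterOperator.EQ filter, rather than the random query vector, is doing the topic selection.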