fix 1
components/generators/daily_feed.py
CHANGED
@@ -10,7 +10,7 @@ from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter
 
 # 🔑 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
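Note: despite its name, REDIS_KEY holds the Upstash auth token, not a cache key; the cache key used further down is the hardcoded "daily_news_feed_cache". The client construction itself is outside this hunk; a minimal sketch of one way it may look, assuming the standard redis-py client:

    import os
    import redis

    REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
    # decode_responses=True makes get() return str instead of bytes
    redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)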
@@ -22,6 +22,7 @@ except Exception as e:
 
 # 📰 Topic list
 TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
+# This list correctly generates 'india', 'world', etc.
 TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
 
 # 🧠 Summarization prompt
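As the added comment notes, the comprehension derives a short lookup key from each display name; a standalone check of what it produces:

    TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
    # lowercase, then drop the " news" suffix: "India news" -> "india"
    TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
    assert TOPIC_KEYS == ["india", "world", "tech", "finance", "sports"]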
@@ -38,23 +39,20 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
         vector_store = get_upstash_vector_store()
         print("📡 Successfully retrieved Upstash vector store.")
 
-        #
+        # Debugging prints (keep them for now, they are useful)
         print(f"DEBUG: TOPICS = {TOPICS}")
         print(f"DEBUG: TOPIC_KEYS = {TOPIC_KEYS}")
         print(f"DEBUG: Length of TOPICS = {len(TOPICS)}")
         print(f"DEBUG: Length of TOPIC_KEYS = {len(TOPIC_KEYS)}")
-        # ----------------------------------
 
-        for topic in TOPICS:
+        for full_topic_name, topic_key_for_filter in zip(TOPICS, TOPIC_KEYS):
             try:
-                #
-                #
-                #
-
-                # Construct MetadataFilters object
+                # *** THE CRITICAL CHANGE IS HERE ***
+                # Use 'topic_key_for_filter' (e.g., "india") which matches your stored metadata
+                # instead of 'full_topic_name' (e.g., "India news").
                 filters = MetadataFilters(
                     filters=[
-                        MetadataFilter(key="topic", value=topic)
+                        MetadataFilter(key="topic", value=topic_key_for_filter, operator=FilterOperator.EQ)
                     ]
                 )
 
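Condensed, the fix iterates over display names and stored keys in lockstep and filters on the key. A minimal sketch under the same llama_index imports as the file; the embed_query helper and the similarity_top_k value are illustrative stand-ins, not taken from this diff:

    from llama_index.core.vector_stores.types import (
        VectorStoreQuery, MetadataFilters, MetadataFilter, FilterOperator,
    )

    for full_topic_name, topic_key_for_filter in zip(TOPICS, TOPIC_KEYS):
        # Filter on the short key ("india") that matches the stored metadata,
        # not the display name ("India news").
        filters = MetadataFilters(
            filters=[MetadataFilter(key="topic", value=topic_key_for_filter,
                                    operator=FilterOperator.EQ)]
        )
        query = VectorStoreQuery(
            query_embedding=embed_query(full_topic_name),  # hypothetical embedding helper
            similarity_top_k=50,                           # illustrative value
            filters=filters,
        )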
@@ -65,27 +63,21 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
                     filters=filters  # Apply the metadata filter
                 )
 
-
-                print(f"🔍 Querying Upstash for topic: '{topic}'")
+                print(f"🔍 Querying Upstash for topic: '{full_topic_name}' using filter value '{topic_key_for_filter}'")
                 result = vector_store.query(query)
-                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{topic}'")
+                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{full_topic_name}'.")
 
                 for node in result.nodes:
                     content = node.get_content().strip()
                     if content:
-                        topic_docs[topic].append(content)
+                        topic_docs[topic_key_for_filter].append(content)
                     # Optional: Print metadata to verify filtering
                     # print(f"  Node metadata: {node.metadata}")
             except Exception as e:
-                print(f"❌ [Topic Metadata Filter error for '{topic}']: {e}")
-                # Optional: Log the full traceback for more detailed debugging
-                # import traceback
-                # traceback.print_exc()
+                print(f"❌ [Topic Metadata Filter error for '{full_topic_name}']: {e}")
 
     except Exception as e:
         print("❌ [load_all_documents_grouped_by_topic Error]", e)
-        # import traceback
-        # traceback.print_exc()
 
     return topic_docs
 
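The append on topic_docs[topic_key_for_filter] assumes the dict was seeded with one list per key before the loop; that initialization is outside this hunk, but a plausible sketch is:

    from typing import Dict, List

    # One empty bucket per short topic key: {"india": [], "world": [], ...}
    topic_docs: Dict[str, List[str]] = {key: [] for key in TOPIC_KEYS}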
@@ -97,38 +89,34 @@ def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
 
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
-        # Join documents, ensuring we don't exceed typical GPT-4 context window (approx 128k tokens, 12000 chars is safe)
         content = "\n\n---\n\n".join(docs)[:12000]
 
         print(f"🧠 Summarizing topic via OpenAI: {topic_key} ({len(docs)} documents)")
         completion = client.chat.completions.create(
-            model="gpt-4",
+            model="gpt-4",
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
-            max_tokens=512,
-            temperature=0.7,
+            max_tokens=512,
+            temperature=0.7,
         )
 
         text = completion.choices[0].message.content.strip()
 
         summaries = []
-        # Parse the headlines, assuming they might be bullet points or lines
         for line in text.splitlines():
-            line = line.strip("-–• ")
+            line = line.strip("-–• ")
             if line:
                 summaries.append({
                     "summary": line,
-                    "image_url": "https://source.unsplash.com/800x600/?news",
-                    "article_link": f"https://google.com/search?q={topic_key}+news"
+                    "image_url": "https://source.unsplash.com/800x600/?news",
+                    "article_link": f"https://google.com/search?q={topic_key}+news"
                 })
         return summaries
 
     except Exception as e:
         print(f"❌ [OpenAI Summarization Error for '{topic_key}']: {e}")
-        # import traceback
-        # traceback.print_exc()
         return []
 
 # 🚀 Main callable
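The parser treats every non-empty line of the completion as one headline and trims leading/trailing bullet characters via str.strip; a quick standalone illustration:

    text = "- India launches lunar mission\n• Markets rally\n\n– Team wins final"
    # strip("-–• ") removes any mix of '-', '–', '•', and spaces from both ends
    headlines = [ln.strip("-–• ") for ln in text.splitlines() if ln.strip("-–• ")]
    assert headlines == ["India launches lunar mission", "Markets rally", "Team wins final"]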
@@ -144,37 +132,30 @@ def generate_and_cache_daily_feed():
                 feed_map[topic_key] = summaries
             except Exception as e:
                 print(f"❌ [Topic Loop Error for '{topic_key}']: {e}")
-                # import traceback
-                # traceback.print_exc()
                 feed_map[topic_key] = []
 
-
+        # When creating final_feed, use TOPICS for the display name but TOPIC_KEYS for mapping
+        final_feed = [{"topic": display_name, "feed": feed_map[actual_key]}
+                      for display_name, actual_key in zip(TOPICS, TOPIC_KEYS)]
 
         try:
-
-            # You might want a separate environment variable for the cache key, e.g., DAILY_FEED_CACHE_KEY
-            cache_key_name = "daily_news_feed_cache"  # A more descriptive key
+            cache_key_name = "daily_news_feed_cache"
             redis_client.set(cache_key_name, json.dumps(final_feed, ensure_ascii=False))
-            # Set an expiry for the cache, e.g., 24 hours (86400 seconds)
             redis_client.expire(cache_key_name, 86400)
             print(f"✅ Cached daily feed under key '{cache_key_name}' with 24-hour expiry.")
         except Exception as e:
             print("❌ [Redis Cache Error]", e)
-            # import traceback
-            # traceback.print_exc()
 
         return final_feed
 
     except Exception as e:
         print("❌ [generate_and_cache_daily_feed Overall Error]", e)
-        # import traceback
-        # traceback.print_exc()
         return []
 
 # 📦 Get cached data
 def get_cached_daily_feed():
     try:
-        cache_key_name = "daily_news_feed_cache"
+        cache_key_name = "daily_news_feed_cache"
         cached = redis_client.get(cache_key_name)
         if cached:
             print(f"✅ Retrieved cached daily feed from '{cache_key_name}'.")
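The set followed by expire works but takes two round trips and is not atomic; with a redis-py client the same effect is available as a single call (an alternative sketch, not what this diff does):

    # SETEX writes the value and the 24-hour TTL in one atomic command
    redis_client.setex(cache_key_name, 86400, json.dumps(final_feed, ensure_ascii=False))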
@@ -184,37 +165,19 @@ def get_cached_daily_feed():
             return []
     except Exception as e:
         print("❌ [get_cached_daily_feed Error]", e)
-        # import traceback
-        # traceback.print_exc()
         return []
 
 # Example of how to run it (for testing purposes, if this were the main script)
 if __name__ == "__main__":
     # Ensure your environment variables are set before running
     # os.environ["UPSTASH_REDIS_URL"] = "your_upstash_redis_url"
-    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"
+    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"
     # os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
 
-    # For the UPSTASH_REDIS_TOKEN environment variable, if it's truly a Redis token
-    # that shouldn't be used as a cache key, you should define a separate environment
-    # variable for the cache key, or use a hardcoded string as I've done with "daily_news_feed_cache".
-    # For Upstash Vector connection, ensure UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN
-    # are configured in your `components.indexers.news_indexer.py`'s `get_upstash_vector_store` function.
-
-    # Generate and cache the feed
     generated_feed = generate_and_cache_daily_feed()
     print("\n--- Generated and Cached Feed ---")
-    # for item in generated_feed:
-    #     print(f"Topic: {item['topic']}")
-    #     for summary in item['feed']:
-    #         print(f"  - {summary['summary']}")
-    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False)) # For full output
+    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))
 
-    # Retrieve from cache
    cached_feed = get_cached_daily_feed()
     print("\n--- Retrieved from Cache ---")
-    # for item in cached_feed:
-    #     print(f"Topic: {item['topic']}")
-    #     for summary in item['feed']:
-    #         print(f"  - {summary['summary']}")
-    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False)) # For full output
+    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))
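One caveat for consumers of get_cached_daily_feed: redis returns the stored JSON string (or bytes, depending on decode_responses), so callers still need json.loads. A usage sketch under that assumption:

    import json

    raw = redis_client.get("daily_news_feed_cache")
    feed = json.loads(raw) if raw else []
    for item in feed:
        print(item["topic"], len(item["feed"]))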