let's try again
components/generators/daily_feed.py
CHANGED
@@ -6,11 +6,11 @@ from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core import StorageContext
-from llama_index.core.vector_stores.types import VectorStoreQuery
+from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 
 # 🔑 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")  # Used as the cache key, assuming UPSTASH_REDIS_TOKEN holds a key name here rather than an auth token
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
@@ -36,64 +36,91 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
 
     try:
         vector_store = get_upstash_vector_store()
+        print("📡 Successfully retrieved Upstash vector store.")
 
         for topic, key in zip(TOPICS, TOPIC_KEYS):
             try:
+                # Upstash VectorStore expects the filter value to match the exact string
+                # of the topic as it was indexed. Make sure your 'topic' metadata
+                # in Upstash exactly matches the values in TOPICS (e.g., "India news").
+
+                # Construct MetadataFilters object
+                filters = MetadataFilters(
+                    filters=[
+                        MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)
+                    ]
+                )
+
                 dummy_vector = np.random.rand(384).tolist()  # Assuming MiniLM embeddings
                 query = VectorStoreQuery(
                     query_embedding=dummy_vector,
-                    similarity_top_k=50,
-                    filters=
+                    similarity_top_k=50,  # Retrieve enough documents for summarization
+                    filters=filters  # Apply the metadata filter
                 )
+
+                print(f"🔍 Querying Upstash for topic: '{topic}' with filters: {filters.to_dict()}")
                 result = vector_store.query(query)
+                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{topic}'.")
 
                 for node in result.nodes:
                     content = node.get_content().strip()
                     if content:
                         topic_docs[key].append(content)
+                        # Optional: print metadata to verify filtering
+                        # print(f"  Node metadata: {node.metadata}")
             except Exception as e:
-                print(f"❌ [Topic Metadata Filter error
+                print(f"❌ [Topic Metadata Filter error for '{topic}']: {e}")
+                # Optional: log the full traceback for more detailed debugging
+                # import traceback
+                # traceback.print_exc()
 
     except Exception as e:
         print("❌ [load_all_documents_grouped_by_topic Error]", e)
+        # import traceback
+        # traceback.print_exc()
 
     return topic_docs
 
 # 🧪 Summarize one topic at a time using OpenAI GPT-4
 def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
     if not docs:
-        print(f"⚠️ No docs found for topic: {topic_key}")
+        print(f"⚠️ No docs found for topic: {topic_key}, skipping summarization.")
         return []
 
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
+        # Join documents, keeping the prompt well within the GPT-4 context window (12000 chars is safe)
         content = "\n\n---\n\n".join(docs)[:12000]
 
-        print(f"🧠 Summarizing topic via OpenAI: {topic_key}")
+        print(f"🧠 Summarizing topic via OpenAI: {topic_key} ({len(docs)} documents)")
         completion = client.chat.completions.create(
-            model="gpt-4",
+            model="gpt-4",  # Or "gpt-4o" for potentially better performance
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
-            max_tokens=512,
+            max_tokens=512,  # Enough tokens for 3 punchy headlines
+            temperature=0.7,  # A bit creative but focused
         )
 
         text = completion.choices[0].message.content.strip()
 
         summaries = []
+        # Parse the headlines, assuming they might be bullet points or plain lines
         for line in text.splitlines():
-            line = line.strip("-–• ")
+            line = line.strip("-–• ")  # Remove common bullet characters
             if line:
                 summaries.append({
                     "summary": line,
-                    "image_url": "https://source.unsplash.com/800x600/?news",
-                    "article_link": f"https://google.com/search?q={topic_key}+news"
+                    "image_url": "https://source.unsplash.com/800x600/?news",  # Generic image, could be improved
+                    "article_link": f"https://google.com/search?q={topic_key}+news"  # Generic search link
                 })
         return summaries
 
     except Exception as e:
-        print(f"❌ [OpenAI Summarization Error
+        print(f"❌ [OpenAI Summarization Error for '{topic_key}']: {e}")
+        # import traceback
+        # traceback.print_exc()
         return []
 
 # 🚀 Main callable
@@ -108,28 +135,78 @@ def generate_and_cache_daily_feed():
             summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
             feed_map[topic_key] = summaries
         except Exception as e:
-            print(f"❌ [Topic Loop Error
+            print(f"❌ [Topic Loop Error for '{topic_key}']: {e}")
+            # import traceback
+            # traceback.print_exc()
             feed_map[topic_key] = []
 
-        final_feed = [{"topic": topic, "feed": feed_map[
+        final_feed = [{"topic": topic, "feed": feed_map[topic_key]} for topic, topic_key in zip(TOPICS, TOPIC_KEYS)]
 
         try:
-
-
+            # Ensure the REDIS_KEY is suitable for a key name (e.g., not an API token itself)
+            # You might want a separate environment variable for the cache key, e.g., DAILY_FEED_CACHE_KEY
+            cache_key_name = "daily_news_feed_cache"  # A more descriptive key
+            redis_client.set(cache_key_name, json.dumps(final_feed, ensure_ascii=False))
+            # Set an expiry for the cache, e.g., 24 hours (86400 seconds)
+            redis_client.expire(cache_key_name, 86400)
+            print(f"✅ Cached daily feed under key '{cache_key_name}' with 24-hour expiry.")
         except Exception as e:
             print("❌ [Redis Cache Error]", e)
+            # import traceback
+            # traceback.print_exc()
 
         return final_feed
 
     except Exception as e:
-        print("❌ [generate_and_cache_daily_feed Error]", e)
+        print("❌ [generate_and_cache_daily_feed Overall Error]", e)
+        # import traceback
+        # traceback.print_exc()
         return []
 
 # 📦 Get cached data
 def get_cached_daily_feed():
     try:
-
-
+        cache_key_name = "daily_news_feed_cache"  # Use the same key name as in generate_and_cache_daily_feed
+        cached = redis_client.get(cache_key_name)
+        if cached:
+            print(f"✅ Retrieved cached daily feed from '{cache_key_name}'.")
+            return json.loads(cached)
+        else:
+            print(f"ℹ️ No cached data found under key '{cache_key_name}'.")
+            return []
     except Exception as e:
         print("❌ [get_cached_daily_feed Error]", e)
+        # import traceback
+        # traceback.print_exc()
         return []
+
+# Example of how to run it (for testing purposes, if this were the main script)
+if __name__ == "__main__":
+    # Ensure your environment variables are set before running:
+    # os.environ["UPSTASH_REDIS_URL"] = "your_upstash_redis_url"
+    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"  # Ideally a unique key for caching, not the token
+    # os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
+
+    # If UPSTASH_REDIS_TOKEN is truly a Redis auth token that shouldn't be used as a
+    # cache key, define a separate environment variable for the cache key, or use a
+    # hardcoded string as done above with "daily_news_feed_cache".
+    # For the Upstash Vector connection, ensure UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN
+    # are configured for `get_upstash_vector_store` in `components/indexers/news_indexer.py`.
+
+    # Generate and cache the feed
+    generated_feed = generate_and_cache_daily_feed()
+    print("\n--- Generated and Cached Feed ---")
+    # for item in generated_feed:
+    #     print(f"Topic: {item['topic']}")
+    #     for summary in item['feed']:
+    #         print(f"  - {summary['summary']}")
+    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))  # For full output
+
+    # Retrieve from cache
+    cached_feed = get_cached_daily_feed()
+    print("\n--- Retrieved from Cache ---")
+    # for item in cached_feed:
+    #     print(f"Topic: {item['topic']}")
+    #     for summary in item['feed']:
+    #         print(f"  - {summary['summary']}")
+    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))  # For full output
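A note on the retrieval pattern above: the MetadataFilters equality filter does all of the selecting, and the random vector exists only because VectorStoreQuery requires an embedding. Below is a minimal sketch of that pattern in isolation, assuming the same llama_index types the diff imports; fetch_topic_docs and EMBED_DIM are illustrative names, not part of this repo.

import numpy as np
from llama_index.core.vector_stores.types import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    VectorStoreQuery,
)

EMBED_DIM = 384  # must match the dimension the index was built with (MiniLM here)

def fetch_topic_docs(vector_store, topic: str, top_k: int = 50) -> list:
    # The metadata filter does the real selection; the query vector only ranks results.
    filters = MetadataFilters(
        filters=[MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)]
    )
    # A seeded RNG keeps runs reproducible; the vector stays random (non-zero)
    # because some stores reject all-zero vectors under cosine similarity.
    rng = np.random.default_rng(0)
    query = VectorStoreQuery(
        query_embedding=rng.random(EMBED_DIM).tolist(),
        similarity_top_k=top_k,
        filters=filters,
    )
    result = vector_store.query(query)
    return [
        node.get_content().strip()
        for node in (result.nodes or [])
        if node.get_content().strip()
    ]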
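Relatedly, the "# Assuming MiniLM embeddings" comment hardcodes the 384 dimension; if the indexer's embedding model ever changes, every query fails with a dimension mismatch. A hedged alternative is to probe the model once, as sketched below; placeholder_vector is an illustrative helper, and the import path assumes a recent llama-index (0.10+).

import numpy as np
from llama_index.core.embeddings import BaseEmbedding

def placeholder_vector(embed_model: BaseEmbedding) -> list:
    # Embed a throwaway string once to learn the dimension, then build a random
    # (non-zero) placeholder of the right size.
    dim = len(embed_model.get_text_embedding("probe"))
    return np.random.rand(dim).tolist()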
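On the caching side, the diff writes the value and sets the expiry in two separate calls; redis-py's set accepts an ex argument, so both can happen atomically and the key can never linger without a TTL. A sketch follows, also reading the cache key from the DAILY_FEED_CACHE_KEY variable that the diff's own comment suggests (that variable name and cache_daily_feed are assumptions, not existing code).

import json
import os

import redis

CACHE_KEY = os.environ.get("DAILY_FEED_CACHE_KEY", "daily_news_feed_cache")

def cache_daily_feed(redis_client: redis.Redis, final_feed: list) -> None:
    # One atomic write: the value plus a 24-hour TTL (86400 seconds).
    redis_client.set(CACHE_KEY, json.dumps(final_feed, ensure_ascii=False), ex=86400)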