let's try again
components/generators/daily_feed.py
CHANGED
@@ -6,11 +6,11 @@ from typing import List, Dict
 from openai import OpenAI
 from components.indexers.news_indexer import get_upstash_vector_store
 from llama_index.core import StorageContext
-from llama_index.core.vector_stores.types import VectorStoreQuery
+from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator
 
 # 🔑 Environment variables
 REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
-REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
+REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")  # Used as the cache key, assuming UPSTASH_REDIS_TOKEN holds a key name here rather than an auth token
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 # ✅ Redis client
@@ -36,64 +36,91 @@ def load_all_documents_grouped_by_topic() -> Dict[str, List[str]]:
 
     try:
         vector_store = get_upstash_vector_store()
+        print("📡 Successfully retrieved Upstash vector store.")
 
         for topic, key in zip(TOPICS, TOPIC_KEYS):
             try:
+                # Upstash VectorStore expects the filter value to match the exact string
+                # of the topic as it was indexed. Make sure your 'topic' metadata
+                # in Upstash exactly matches the values in TOPICS (e.g., "India news").
+
+                # Construct MetadataFilters object
+                filters = MetadataFilters(
+                    filters=[
+                        MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)
+                    ]
+                )
+
                 dummy_vector = np.random.rand(384).tolist()  # Assuming MiniLM embeddings
                 query = VectorStoreQuery(
                     query_embedding=dummy_vector,
-                    similarity_top_k=50,
-                    filters=
+                    similarity_top_k=50,  # Retrieve enough documents for summarization
+                    filters=filters  # Apply the metadata filter
                 )
+
+                print(f"🔍 Querying Upstash for topic: '{topic}' with filters: {filters.to_dict()}")
                 result = vector_store.query(query)
+                print(f"➡️ Found {len(result.nodes)} nodes for topic: '{topic}'.")
 
                 for node in result.nodes:
                     content = node.get_content().strip()
                     if content:
                         topic_docs[key].append(content)
+                        # Optional: print metadata to verify filtering
+                        # print(f"  Node metadata: {node.metadata}")
             except Exception as e:
-                print(f"❌ [Topic Metadata Filter error
+                print(f"❌ [Topic Metadata Filter error for '{topic}']: {e}")
+                # Optional: log the full traceback for more detailed debugging
+                # import traceback
+                # traceback.print_exc()
 
     except Exception as e:
         print("❌ [load_all_documents_grouped_by_topic Error]", e)
+        # import traceback
+        # traceback.print_exc()
 
     return topic_docs
 
 # 🧪 Summarize one topic at a time using OpenAI GPT-4
 def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
     if not docs:
-        print(f"⚠️ No docs found for topic: {topic_key}")
+        print(f"⚠️ No docs found for topic: {topic_key}, skipping summarization.")
         return []
 
     try:
         client = OpenAI(api_key=OPENAI_API_KEY)
+        # Join documents, keeping the prompt well within the GPT-4 context window (12000 chars is safe)
         content = "\n\n---\n\n".join(docs)[:12000]
 
-        print(f"🧠 Summarizing topic via OpenAI: {topic_key}")
+        print(f"🧠 Summarizing topic via OpenAI: {topic_key} ({len(docs)} documents)")
         completion = client.chat.completions.create(
-            model="gpt-4",
+            model="gpt-4",  # Or "gpt-4o" for potentially better performance
             messages=[
                 {"role": "system", "content": BASE_PROMPT},
                 {"role": "user", "content": content},
             ],
-            max_tokens=512,
+            max_tokens=512,  # Enough tokens for 3 punchy headlines
+            temperature=0.7,  # A bit creative but focused
         )
 
         text = completion.choices[0].message.content.strip()
 
         summaries = []
+        # Parse the headlines, assuming they might be bullet points or plain lines
         for line in text.splitlines():
-            line = line.strip("-–• ")
+            line = line.strip("-–• ")  # Remove common bullet characters
             if line:
                 summaries.append({
                     "summary": line,
-                    "image_url": "https://source.unsplash.com/800x600/?news",
-                    "article_link": f"https://google.com/search?q={topic_key}+news"
+                    "image_url": "https://source.unsplash.com/800x600/?news",  # Generic image, could be improved
+                    "article_link": f"https://google.com/search?q={topic_key}+news"  # Generic search link
                 })
         return summaries
 
     except Exception as e:
-        print(f"❌ [OpenAI Summarization Error
+        print(f"❌ [OpenAI Summarization Error for '{topic_key}']: {e}")
+        # import traceback
+        # traceback.print_exc()
         return []
 
 # 🚀 Main callable
@@ -108,28 +135,78 @@ def generate_and_cache_daily_feed():
             summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
             feed_map[topic_key] = summaries
         except Exception as e:
-            print(f"❌ [Topic Loop Error
+            print(f"❌ [Topic Loop Error for '{topic_key}']: {e}")
+            # import traceback
+            # traceback.print_exc()
             feed_map[topic_key] = []
 
-        final_feed = [{"topic": topic, "feed": feed_map[
+        final_feed = [{"topic": topic, "feed": feed_map[topic_key]} for topic, topic_key in zip(TOPICS, TOPIC_KEYS)]
 
         try:
-
-
+            # Ensure the REDIS_KEY is suitable for a key name (e.g., not an API token itself)
+            # You might want a separate environment variable for the cache key, e.g., DAILY_FEED_CACHE_KEY
+            cache_key_name = "daily_news_feed_cache"  # A more descriptive key
+            redis_client.set(cache_key_name, json.dumps(final_feed, ensure_ascii=False))
+            # Set an expiry for the cache, e.g., 24 hours (86400 seconds)
+            redis_client.expire(cache_key_name, 86400)
+            print(f"✅ Cached daily feed under key '{cache_key_name}' with 24-hour expiry.")
         except Exception as e:
             print("❌ [Redis Cache Error]", e)
+            # import traceback
+            # traceback.print_exc()
 
         return final_feed
 
     except Exception as e:
-        print("❌ [generate_and_cache_daily_feed Error]", e)
+        print("❌ [generate_and_cache_daily_feed Overall Error]", e)
+        # import traceback
+        # traceback.print_exc()
         return []
 
 # 📦 Get cached data
 def get_cached_daily_feed():
     try:
-
-
+        cache_key_name = "daily_news_feed_cache"  # Use the same key name as in generate_and_cache_daily_feed
+        cached = redis_client.get(cache_key_name)
+        if cached:
+            print(f"✅ Retrieved cached daily feed from '{cache_key_name}'.")
+            return json.loads(cached)
+        else:
+            print(f"ℹ️ No cached data found under key '{cache_key_name}'.")
+            return []
     except Exception as e:
         print("❌ [get_cached_daily_feed Error]", e)
+        # import traceback
+        # traceback.print_exc()
         return []
+
+# Example of how to run it (for testing purposes, if this were the main script)
+if __name__ == "__main__":
+    # Ensure your environment variables are set before running:
+    # os.environ["UPSTASH_REDIS_URL"] = "your_upstash_redis_url"
+    # os.environ["UPSTASH_REDIS_TOKEN"] = "your_upstash_redis_token"  # Ideally a unique key for caching, not the token
+    # os.environ["OPENAI_API_KEY"] = "your_openai_api_key"
+
+    # If UPSTASH_REDIS_TOKEN is truly a Redis auth token that shouldn't be used as a
+    # cache key, define a separate environment variable for the cache key, or use a
+    # hardcoded string as done above with "daily_news_feed_cache".
+    # For the Upstash Vector connection, ensure UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN
+    # are configured for `get_upstash_vector_store` in `components/indexers/news_indexer.py`.
+
+    # Generate and cache the feed
+    generated_feed = generate_and_cache_daily_feed()
+    print("\n--- Generated and Cached Feed ---")
+    # for item in generated_feed:
+    #     print(f"Topic: {item['topic']}")
+    #     for summary in item['feed']:
+    #         print(f"  - {summary['summary']}")
+    # print(json.dumps(generated_feed, indent=2, ensure_ascii=False))  # For full output
+
+    # Retrieve from cache
+    cached_feed = get_cached_daily_feed()
+    print("\n--- Retrieved from Cache ---")
+    # for item in cached_feed:
+    #     print(f"Topic: {item['topic']}")
+    #     for summary in item['feed']:
+    #         print(f"  - {summary['summary']}")
+    # print(json.dumps(cached_feed, indent=2, ensure_ascii=False))  # For full output
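A note on the retrieval pattern above: the MetadataFilters equality filter does all of the selecting, and the random vector exists only because VectorStoreQuery requires an embedding. Below is a minimal sketch of that pattern in isolation, assuming the same llama_index types the diff imports; fetch_topic_docs and EMBED_DIM are illustrative names, not part of this repo.

import numpy as np
from llama_index.core.vector_stores.types import (
    FilterOperator,
    MetadataFilter,
    MetadataFilters,
    VectorStoreQuery,
)

EMBED_DIM = 384  # must match the dimension the index was built with (MiniLM here)

def fetch_topic_docs(vector_store, topic: str, top_k: int = 50) -> list:
    # The metadata filter does the real selection; the query vector only ranks results.
    filters = MetadataFilters(
        filters=[MetadataFilter(key="topic", value=topic, operator=FilterOperator.EQ)]
    )
    # A seeded RNG keeps runs reproducible; the vector stays random (non-zero)
    # because some stores reject all-zero vectors under cosine similarity.
    rng = np.random.default_rng(0)
    query = VectorStoreQuery(
        query_embedding=rng.random(EMBED_DIM).tolist(),
        similarity_top_k=top_k,
        filters=filters,
    )
    result = vector_store.query(query)
    return [
        node.get_content().strip()
        for node in (result.nodes or [])
        if node.get_content().strip()
    ]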
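Relatedly, the "# Assuming MiniLM embeddings" comment hardcodes the 384 dimension; if the indexer's embedding model ever changes, every query fails with a dimension mismatch. A hedged alternative is to probe the model once, as sketched below; placeholder_vector is an illustrative helper, and the import path assumes a recent llama-index (0.10+).

import numpy as np
from llama_index.core.embeddings import BaseEmbedding

def placeholder_vector(embed_model: BaseEmbedding) -> list:
    # Embed a throwaway string once to learn the dimension, then build a random
    # (non-zero) placeholder of the right size.
    dim = len(embed_model.get_text_embedding("probe"))
    return np.random.rand(dim).tolist()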
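On the caching side, the diff writes the value and sets the expiry in two separate calls; redis-py's set accepts an ex argument, so both can happen atomically and the key can never linger without a TTL. A sketch follows, also reading the cache key from the DAILY_FEED_CACHE_KEY variable that the diff's own comment suggests (that variable name and cache_daily_feed are assumptions, not existing code).

import json
import os

import redis

CACHE_KEY = os.environ.get("DAILY_FEED_CACHE_KEY", "daily_news_feed_cache")

def cache_daily_feed(redis_client: redis.Redis, final_feed: list) -> None:
    # One atomic write: the value plus a 24-hour TTL (86400 seconds).
    redis_client.set(CACHE_KEY, json.dumps(final_feed, ensure_ascii=False), ex=86400)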