Spaces:

broadfield-dev
/

RSS_News

Runtime error

App Files Community

broadfield-dev committed on Feb 22

Commit

7a82005

verified ·

1 Parent(s): 146f768

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -17

app.py CHANGED Viewed

@@ -39,16 +39,13 @@ def index():
     # Check if the database needs to be loaded (first time or empty)
     db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
     if not db_exists:
-        # First load: DB doesn't exist or is empty
         loading_complete = False
         logger.info("Downloading Chroma DB from Hugging Face Hub...")
         download_from_hf_hub()
         threading.Thread(target=load_feeds_in_background, daemon=True).start()
     elif not loading_complete:
-        # Background loading is still in progress from a previous request
-        pass  # Let it continue, spinner will show
     else:
-        # DB exists and loading is complete, no spinner needed
         loading_complete = True
     try:
@@ -58,7 +55,7 @@ def index():
             logger.info("No articles in DB yet")
             return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
-        # Process and categorize articles, getting only 10 most recent per category with strict deduplication
         enriched_articles = []
         seen_keys = set()
         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
@@ -82,27 +79,27 @@ def index():
                     "published": published,
                     "image": meta.get("image", "svg"),
                 })
         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
         categorized_articles = {}
         for article in enriched_articles:
             cat = article["category"]
             if cat not in categorized_articles:
                 categorized_articles[cat] = []
-            key = f"{article['title']}|{article['link']}|{article['published']}"
-            if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
-                categorized_articles[cat].append(article)
         for cat in categorized_articles:
-            unique_articles = []
-            seen_cat_keys = set()
-            for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
-                key = f"{article['title']}|{article['link']}|{article['published']}"
-                if key not in seen_cat_keys:
-                    seen_cat_keys.add(key)
-                    unique_articles.append(article)
-            categorized_articles[cat] = unique_articles[:10]
         logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
         return render_template("index.html",

     # Check if the database needs to be loaded (first time or empty)
     db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
     if not db_exists:
         loading_complete = False
         logger.info("Downloading Chroma DB from Hugging Face Hub...")
         download_from_hf_hub()
         threading.Thread(target=load_feeds_in_background, daemon=True).start()
     elif not loading_complete:
+        pass  # Let background loading continue
     else:
         loading_complete = True
     try:
             logger.info("No articles in DB yet")
             return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
+        # Process and categorize articles with strict deduplication
         enriched_articles = []
         seen_keys = set()
         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
                     "published": published,
                     "image": meta.get("image", "svg"),
                 })
+            else:
+                logger.debug(f"Duplicate found in DB: {key}")
+        # Sort by published date (stable sort)
         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
+        # Group by category and limit to 10 most recent per category
         categorized_articles = {}
         for article in enriched_articles:
             cat = article["category"]
             if cat not in categorized_articles:
                 categorized_articles[cat] = []
+            # No need for extra deduplication here; trust seen_keys
+            categorized_articles[cat].append(article)
+        # Limit to 10 most recent per category
         for cat in categorized_articles:
+            categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
+            # Log the first two items to check for duplicates
+            if len(categorized_articles[cat]) >= 2:
+                logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
         logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
         return render_template("index.html",