broadfield-dev committed on
Commit
7a82005
·
verified ·
1 Parent(s): 146f768

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -17
app.py CHANGED
@@ -39,16 +39,13 @@ def index():
39
  # Check if the database needs to be loaded (first time or empty)
40
  db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
41
  if not db_exists:
42
- # First load: DB doesn't exist or is empty
43
  loading_complete = False
44
  logger.info("Downloading Chroma DB from Hugging Face Hub...")
45
  download_from_hf_hub()
46
  threading.Thread(target=load_feeds_in_background, daemon=True).start()
47
  elif not loading_complete:
48
- # Background loading is still in progress from a previous request
49
- pass # Let it continue, spinner will show
50
  else:
51
- # DB exists and loading is complete, no spinner needed
52
  loading_complete = True
53
 
54
  try:
@@ -58,7 +55,7 @@ def index():
58
  logger.info("No articles in DB yet")
59
  return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
60
 
61
- # Process and categorize articles, getting only 10 most recent per category with strict deduplication
62
  enriched_articles = []
63
  seen_keys = set()
64
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
@@ -82,27 +79,27 @@ def index():
82
  "published": published,
83
  "image": meta.get("image", "svg"),
84
  })
 
 
85
 
 
86
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
87
 
 
88
  categorized_articles = {}
89
  for article in enriched_articles:
90
  cat = article["category"]
91
  if cat not in categorized_articles:
92
  categorized_articles[cat] = []
93
- key = f"{article['title']}|{article['link']}|{article['published']}"
94
- if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
95
- categorized_articles[cat].append(article)
96
-
97
  for cat in categorized_articles:
98
- unique_articles = []
99
- seen_cat_keys = set()
100
- for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
101
- key = f"{article['title']}|{article['link']}|{article['published']}"
102
- if key not in seen_cat_keys:
103
- seen_cat_keys.add(key)
104
- unique_articles.append(article)
105
- categorized_articles[cat] = unique_articles[:10]
106
 
107
  logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
108
  return render_template("index.html",
 
39
  # Check if the database needs to be loaded (first time or empty)
40
  db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
41
  if not db_exists:
 
42
  loading_complete = False
43
  logger.info("Downloading Chroma DB from Hugging Face Hub...")
44
  download_from_hf_hub()
45
  threading.Thread(target=load_feeds_in_background, daemon=True).start()
46
  elif not loading_complete:
47
+ pass # Let background loading continue
 
48
  else:
 
49
  loading_complete = True
50
 
51
  try:
 
55
  logger.info("No articles in DB yet")
56
  return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
57
 
58
+ # Process and categorize articles with strict deduplication
59
  enriched_articles = []
60
  seen_keys = set()
61
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
 
79
  "published": published,
80
  "image": meta.get("image", "svg"),
81
  })
82
+ else:
83
+ logger.debug(f"Duplicate found in DB: {key}")
84
 
85
+ # Sort by published date (stable sort)
86
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
87
 
88
+ # Group by category and limit to 10 most recent per category
89
  categorized_articles = {}
90
  for article in enriched_articles:
91
  cat = article["category"]
92
  if cat not in categorized_articles:
93
  categorized_articles[cat] = []
94
+ # No need for extra deduplication here; trust seen_keys
95
+ categorized_articles[cat].append(article)
96
+
97
+ # Limit to 10 most recent per category
98
  for cat in categorized_articles:
99
+ categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
100
+ # Log the first two items to check for duplicates
101
+ if len(categorized_articles[cat]) >= 2:
102
+ logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
 
 
 
 
103
 
104
  logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
105
  return render_template("index.html",