broadfield-dev committed on
Commit
6d6a251
·
verified ·
1 Parent(s): a13e6db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -25
app.py CHANGED
@@ -5,6 +5,7 @@ from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
5
  import logging
6
  import time
7
  from datetime import datetime
 
8
 
9
  app = Flask(__name__)
10
 
@@ -23,9 +24,8 @@ def load_feeds_in_background():
23
  articles = fetch_rss_feeds()
24
  logger.info(f"Fetched {len(articles)} articles")
25
  process_and_store_articles(articles)
26
- last_update_time = time.time() # Update timestamp when new articles are added
27
  logger.info("Background feed processing complete")
28
- # Upload updated DB to Hugging Face Hub
29
  upload_to_hf_hub()
30
  loading_complete = True
31
  except Exception as e:
@@ -61,11 +61,13 @@ def index():
61
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
62
  if not meta:
63
  continue
64
- title = meta.get("title", "No Title").strip()
65
- link = meta.get("link", "").strip()
 
66
  published = meta.get("published", "Unknown Date").strip()
67
- # Use a more robust key with normalized fields to prevent duplicates
68
- key = f"{title.lower()}|{link.lower()}|{published}"
 
69
  if key not in seen_keys:
70
  seen_keys.add(key)
71
  try:
@@ -75,13 +77,13 @@ def index():
75
  enriched_articles.append({
76
  "title": title,
77
  "link": link,
78
- "description": meta.get("original_description", "No Description"),
79
  "category": meta.get("category", "Uncategorized"),
80
  "published": published,
81
  "image": meta.get("image", "svg"),
82
  })
83
  else:
84
- logger.debug(f"Duplicate found in DB: {key}")
85
 
86
  # Sort by published date (stable sort)
87
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
@@ -100,7 +102,6 @@ def index():
100
  # Limit to 10 most recent per category and log top 2 for debugging
101
  for cat in categorized_articles:
102
  categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
103
- # Log the first two items to check for duplicates
104
  if len(categorized_articles[cat]) >= 2:
105
  logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
106
 
@@ -129,16 +130,18 @@ def search():
129
  seen_keys = set()
130
  for doc in results:
131
  meta = doc.metadata
132
- title = meta.get("title", "No Title").strip()
133
- link = meta.get("link", "").strip()
 
134
  published = meta.get("published", "Unknown Date").strip()
135
- key = f"{title}|{link}|{published}"
 
136
  if key not in seen_keys:
137
  seen_keys.add(key)
138
  enriched_articles.append({
139
  "title": title,
140
  "link": link,
141
- "description": meta.get("original_description", "No Description"),
142
  "category": meta.get("category", "Uncategorized"),
143
  "published": published,
144
  "image": meta.get("image", "svg"),
@@ -179,20 +182,22 @@ def get_updates():
179
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
180
  if not meta:
181
  continue
182
- title = meta.get("title", "No Title").strip()
183
- link = meta.get("link", "").strip()
 
184
  published = meta.get("published", "Unknown Date").strip()
185
- key = f"{title}|{link}|{published}"
 
186
  if key not in seen_keys:
187
  seen_keys.add(key)
188
  try:
189
  published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
190
  except (ValueError, TypeError):
191
- published = "1970-01-01T00:00:00" # Fallback to a very old date
192
  enriched_articles.append({
193
  "title": title,
194
  "link": link,
195
- "description": meta.get("original_description", "No Description"),
196
  "category": meta.get("category", "Uncategorized"),
197
  "published": published,
198
  "image": meta.get("image", "svg"),
@@ -204,7 +209,6 @@ def get_updates():
204
  cat = article["category"]
205
  if cat not in categorized_articles:
206
  categorized_articles[cat] = []
207
- # Extra deduplication for category
208
  key = f"{article['title']}|{article['link']}|{article['published']}"
209
  if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
210
  categorized_articles[cat].append(article)
@@ -214,7 +218,7 @@ def get_updates():
214
  unique_articles = []
215
  seen_cat_keys = set()
216
  for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
217
- key = f"{article['title']}|{article['link']}|{article['published']}"
218
  if key not in seen_cat_keys:
219
  seen_cat_keys.add(key)
220
  unique_articles.append(article)
@@ -237,20 +241,22 @@ def get_all_articles(category):
237
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
238
  if not meta or meta.get("category") != category:
239
  continue
240
- title = meta.get("title", "No Title").strip()
241
- link = meta.get("link", "").strip()
 
242
  published = meta.get("published", "Unknown Date").strip()
243
- key = f"{title}|{link}|{published}"
 
244
  if key not in seen_keys:
245
  seen_keys.add(key)
246
  try:
247
  published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
248
  except (ValueError, TypeError):
249
- published = "1970-01-01T00:00:00" # Fallback to a very old date
250
  enriched_articles.append({
251
  "title": title,
252
  "link": link,
253
- "description": meta.get("original_description", "No Description"),
254
  "category": meta.get("category", "Uncategorized"),
255
  "published": published,
256
  "image": meta.get("image", "svg"),
 
5
  import logging
6
  import time
7
  from datetime import datetime
8
+ import hashlib
9
 
10
  app = Flask(__name__)
11
 
 
24
  articles = fetch_rss_feeds()
25
  logger.info(f"Fetched {len(articles)} articles")
26
  process_and_store_articles(articles)
27
+ last_update_time = time.time()
28
  logger.info("Background feed processing complete")
 
29
  upload_to_hf_hub()
30
  loading_complete = True
31
  except Exception as e:
 
61
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
62
  if not meta:
63
  continue
64
+ title = meta.get("title", "No Title").strip().lower()
65
+ link = meta.get("link", "").strip().lower()
66
+ description = meta.get("original_description", "No Description").strip()
67
  published = meta.get("published", "Unknown Date").strip()
68
+ # Use a robust key with normalized fields and description hash for deduplication
69
+ description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
70
+ key = f"{title}|{link}|{published}|{description_hash}"
71
  if key not in seen_keys:
72
  seen_keys.add(key)
73
  try:
 
77
  enriched_articles.append({
78
  "title": title,
79
  "link": link,
80
+ "description": description,
81
  "category": meta.get("category", "Uncategorized"),
82
  "published": published,
83
  "image": meta.get("image", "svg"),
84
  })
85
  else:
86
+ logger.debug(f"Duplicate found in retrieval: {key}")
87
 
88
  # Sort by published date (stable sort)
89
  enriched_articles.sort(key=lambda x: x["published"], reverse=True)
 
102
  # Limit to 10 most recent per category and log top 2 for debugging
103
  for cat in categorized_articles:
104
  categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
 
105
  if len(categorized_articles[cat]) >= 2:
106
  logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
107
 
 
130
  seen_keys = set()
131
  for doc in results:
132
  meta = doc.metadata
133
+ title = meta.get("title", "No Title").strip().lower()
134
+ link = meta.get("link", "").strip().lower()
135
+ description = meta.get("original_description", "No Description").strip()
136
  published = meta.get("published", "Unknown Date").strip()
137
+ description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
138
+ key = f"{title}|{link}|{published}|{description_hash}"
139
  if key not in seen_keys:
140
  seen_keys.add(key)
141
  enriched_articles.append({
142
  "title": title,
143
  "link": link,
144
+ "description": description,
145
  "category": meta.get("category", "Uncategorized"),
146
  "published": published,
147
  "image": meta.get("image", "svg"),
 
182
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
183
  if not meta:
184
  continue
185
+ title = meta.get("title", "No Title").strip().lower()
186
+ link = meta.get("link", "").strip().lower()
187
+ description = meta.get("original_description", "No Description").strip()
188
  published = meta.get("published", "Unknown Date").strip()
189
+ description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
190
+ key = f"{title}|{link}|{published}|{description_hash}"
191
  if key not in seen_keys:
192
  seen_keys.add(key)
193
  try:
194
  published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
195
  except (ValueError, TypeError):
196
+ published = "1970-01-01T00:00:00"
197
  enriched_articles.append({
198
  "title": title,
199
  "link": link,
200
+ "description": description,
201
  "category": meta.get("category", "Uncategorized"),
202
  "published": published,
203
  "image": meta.get("image", "svg"),
 
209
  cat = article["category"]
210
  if cat not in categorized_articles:
211
  categorized_articles[cat] = []
 
212
  key = f"{article['title']}|{article['link']}|{article['published']}"
213
  if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
214
  categorized_articles[cat].append(article)
 
218
  unique_articles = []
219
  seen_cat_keys = set()
220
  for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
221
+ key = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
222
  if key not in seen_cat_keys:
223
  seen_cat_keys.add(key)
224
  unique_articles.append(article)
 
241
  for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
242
  if not meta or meta.get("category") != category:
243
  continue
244
+ title = meta.get("title", "No Title").strip().lower()
245
+ link = meta.get("link", "").strip().lower()
246
+ description = meta.get("original_description", "No Description").strip()
247
  published = meta.get("published", "Unknown Date").strip()
248
+ description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
249
+ key = f"{title}|{link}|{published}|{description_hash}"
250
  if key not in seen_keys:
251
  seen_keys.add(key)
252
  try:
253
  published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
254
  except (ValueError, TypeError):
255
+ published = "1970-01-01T00:00:00"
256
  enriched_articles.append({
257
  "title": title,
258
  "link": link,
259
+ "description": description,
260
  "category": meta.get("category", "Uncategorized"),
261
  "published": published,
262
  "image": meta.get("image", "svg"),