Update app.py
app.py CHANGED
@@ -5,6 +5,7 @@ from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
 import logging
 import time
 from datetime import datetime
+import hashlib
 
 app = Flask(__name__)
 
@@ -23,9 +24,8 @@ def load_feeds_in_background():
         articles = fetch_rss_feeds()
         logger.info(f"Fetched {len(articles)} articles")
         process_and_store_articles(articles)
-        last_update_time = time.time()
+        last_update_time = time.time()
         logger.info("Background feed processing complete")
-        # Upload updated DB to Hugging Face Hub
         upload_to_hf_hub()
         loading_complete = True
     except Exception as e:
@@ -61,11 +61,13 @@ def index():
     for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
         if not meta:
             continue
-        title = meta.get("title", "No Title").strip()
-        link = meta.get("link", "").strip()
+        title = meta.get("title", "No Title").strip().lower()
+        link = meta.get("link", "").strip().lower()
+        description = meta.get("original_description", "No Description").strip()
         published = meta.get("published", "Unknown Date").strip()
-        # Use a
-        key = f"{title}|{link}|{published}"
+        # Use a robust key with normalized fields and description hash for deduplication
+        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+        key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
             try:
@@ -75,13 +77,13 @@ def index():
             enriched_articles.append({
                 "title": title,
                 "link": link,
-                "description":
+                "description": description,
                 "category": meta.get("category", "Uncategorized"),
                 "published": published,
                 "image": meta.get("image", "svg"),
             })
         else:
-            logger.debug(f"Duplicate found in
+            logger.debug(f"Duplicate found in retrieval: {key}")
 
     # Sort by published date (stable sort)
     enriched_articles.sort(key=lambda x: x["published"], reverse=True)
@@ -100,7 +102,6 @@ def index():
     # Limit to 10 most recent per category and log top 2 for debugging
     for cat in categorized_articles:
         categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
-        # Log the first two items to check for duplicates
        if len(categorized_articles[cat]) >= 2:
             logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")
 
@@ -129,16 +130,18 @@ def search():
     seen_keys = set()
     for doc in results:
         meta = doc.metadata
-        title = meta.get("title", "No Title").strip()
-        link = meta.get("link", "").strip()
+        title = meta.get("title", "No Title").strip().lower()
+        link = meta.get("link", "").strip().lower()
+        description = meta.get("original_description", "No Description").strip()
         published = meta.get("published", "Unknown Date").strip()
-        key = f"{title}|{link}|{published}"
+        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+        key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
             enriched_articles.append({
                 "title": title,
                 "link": link,
-                "description":
+                "description": description,
                 "category": meta.get("category", "Uncategorized"),
                 "published": published,
                 "image": meta.get("image", "svg"),
@@ -179,20 +182,22 @@ def get_updates():
     for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
         if not meta:
             continue
-        title = meta.get("title", "No Title").strip()
-        link = meta.get("link", "").strip()
+        title = meta.get("title", "No Title").strip().lower()
+        link = meta.get("link", "").strip().lower()
+        description = meta.get("original_description", "No Description").strip()
         published = meta.get("published", "Unknown Date").strip()
-        key = f"{title}|{link}|{published}"
+        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+        key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
             try:
                 published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
             except (ValueError, TypeError):
-                published = "1970-01-01T00:00:00"
+                published = "1970-01-01T00:00:00"
             enriched_articles.append({
                 "title": title,
                 "link": link,
-                "description":
+                "description": description,
                 "category": meta.get("category", "Uncategorized"),
                 "published": published,
                 "image": meta.get("image", "svg"),
@@ -204,7 +209,6 @@ def get_updates():
         cat = article["category"]
         if cat not in categorized_articles:
             categorized_articles[cat] = []
-        # Extra deduplication for category
         key = f"{article['title']}|{article['link']}|{article['published']}"
         if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
             categorized_articles[cat].append(article)
@@ -214,7 +218,7 @@ def get_updates():
         unique_articles = []
         seen_cat_keys = set()
         for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
-            key = f"{article['title']}|{article['link']}|{article['published']}"
+            key = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
             if key not in seen_cat_keys:
                 seen_cat_keys.add(key)
                 unique_articles.append(article)
@@ -237,20 +241,22 @@ def get_all_articles(category):
     for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
         if not meta or meta.get("category") != category:
             continue
-        title = meta.get("title", "No Title").strip()
-        link = meta.get("link", "").strip()
+        title = meta.get("title", "No Title").strip().lower()
+        link = meta.get("link", "").strip().lower()
+        description = meta.get("original_description", "No Description").strip()
         published = meta.get("published", "Unknown Date").strip()
-        key = f"{title}|{link}|{published}"
+        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+        key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
             try:
                 published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
             except (ValueError, TypeError):
-                published = "1970-01-01T00:00:00"
+                published = "1970-01-01T00:00:00"
             enriched_articles.append({
                 "title": title,
                 "link": link,
-                "description":
+                "description": description,
                 "category": meta.get("category", "Uncategorized"),
                 "published": published,
                 "image": meta.get("image", "svg"),
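
For reference, the deduplication and date-normalization logic this commit repeats across index(), search(), get_updates(), and get_all_articles() can be exercised in isolation. Below is a minimal, self-contained sketch assuming the same metadata keys as the diff ("title", "link", "original_description", "published"); the sample records are invented for illustration, and MD5 serves here only as a compact content fingerprint, not a security measure.

# Standalone sketch of the deduplication key and date normalization above.
# Field names mirror the diff; sample values are hypothetical.
import hashlib
from datetime import datetime

def dedup_key(meta):
    # Normalized title|link|published|md5(description) key, as introduced in the diff.
    title = meta.get("title", "No Title").strip().lower()
    link = meta.get("link", "").strip().lower()
    description = meta.get("original_description", "No Description").strip()
    published = meta.get("published", "Unknown Date").strip()
    description_hash = hashlib.md5(description.encode("utf-8")).hexdigest()
    return f"{title}|{link}|{published}|{description_hash}"

def normalize_published(published):
    # ISO-format "YYYY-MM-DD HH:MM:SS" timestamps; fall back to the epoch sentinel.
    try:
        return (datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat()
                if "Unknown" not in published else published)
    except (ValueError, TypeError):
        return "1970-01-01T00:00:00"

if __name__ == "__main__":
    a = {"title": "Hello ", "link": "https://example.com/a",
         "original_description": "desc", "published": "2024-01-02 03:04:05"}
    b = {"title": "hello", "link": "HTTPS://EXAMPLE.COM/A",
         "original_description": "desc", "published": "2024-01-02 03:04:05"}
    assert dedup_key(a) == dedup_key(b)         # case/whitespace differences collapse
    print(normalize_published(a["published"]))  # -> 2024-01-02T03:04:05

Because the key folds in a hash of the description, two articles that share a title, link, and timestamp but differ in body text are no longer collapsed into one entry, while case and whitespace variants of the same article still deduplicate.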