Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import os
|
2 |
import threading
|
3 |
from flask import Flask, render_template, request, jsonify
|
4 |
-
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
|
5 |
import logging
|
6 |
import time
|
7 |
from datetime import datetime
|
@@ -61,12 +61,18 @@ def index():
|
|
61 |
for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
|
62 |
if not meta:
|
63 |
continue
|
64 |
-
title = meta.get("title", "No Title")
|
65 |
-
link = meta.get("link", "")
|
66 |
-
description = meta.get("original_description", "No Description")
|
67 |
published = meta.get("published", "Unknown Date").strip()
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
key = f"{title}|{link}|{published}|{description_hash}"
|
71 |
if key not in seen_keys:
|
72 |
seen_keys.add(key)
|
@@ -130,11 +136,17 @@ def search():
|
|
130 |
seen_keys = set()
|
131 |
for doc in results:
|
132 |
meta = doc.metadata
|
133 |
-
title = meta.get("title", "No Title")
|
134 |
-
link = meta.get("link", "")
|
135 |
-
description = meta.get("original_description", "No Description")
|
136 |
published = meta.get("published", "Unknown Date").strip()
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
key = f"{title}|{link}|{published}|{description_hash}"
|
139 |
if key not in seen_keys:
|
140 |
seen_keys.add(key)
|
@@ -182,11 +194,17 @@ def get_updates():
|
|
182 |
for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
|
183 |
if not meta:
|
184 |
continue
|
185 |
-
title = meta.get("title", "No Title")
|
186 |
-
link = meta.get("link", "")
|
187 |
-
description = meta.get("original_description", "No Description")
|
188 |
published = meta.get("published", "Unknown Date").strip()
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
key = f"{title}|{link}|{published}|{description_hash}"
|
191 |
if key not in seen_keys:
|
192 |
seen_keys.add(key)
|
@@ -218,7 +236,7 @@ def get_updates():
|
|
218 |
unique_articles = []
|
219 |
seen_cat_keys = set()
|
220 |
for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
|
221 |
-
key = f"{article['title']}|{article['link']}|{article['published']}"
|
222 |
if key not in seen_cat_keys:
|
223 |
seen_cat_keys.add(key)
|
224 |
unique_articles.append(article)
|
@@ -241,11 +259,17 @@ def get_all_articles(category):
|
|
241 |
for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
|
242 |
if not meta or meta.get("category") != category:
|
243 |
continue
|
244 |
-
title = meta.get("title", "No Title")
|
245 |
-
link = meta.get("link", "")
|
246 |
-
description = meta.get("original_description", "No Description")
|
247 |
published = meta.get("published", "Unknown Date").strip()
|
248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
key = f"{title}|{link}|{published}|{description_hash}"
|
250 |
if key not in seen_keys:
|
251 |
seen_keys.add(key)
|
|
|
1 |
import os
|
2 |
import threading
|
3 |
from flask import Flask, render_template, request, jsonify
|
4 |
+
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub, clean_text
|
5 |
import logging
|
6 |
import time
|
7 |
from datetime import datetime
|
|
|
61 |
for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
|
62 |
if not meta:
|
63 |
continue
|
64 |
+
title = meta.get("title", "No Title")
|
65 |
+
link = meta.get("link", "")
|
66 |
+
description = meta.get("original_description", "No Description")
|
67 |
published = meta.get("published", "Unknown Date").strip()
|
68 |
+
|
69 |
+
# Clean and normalize all fields
|
70 |
+
title = clean_text(title)
|
71 |
+
link = clean_text(link)
|
72 |
+
description = clean_text(description)
|
73 |
+
|
74 |
+
# Use a robust key with cleaned fields and description hash for deduplication
|
75 |
+
description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
|
76 |
key = f"{title}|{link}|{published}|{description_hash}"
|
77 |
if key not in seen_keys:
|
78 |
seen_keys.add(key)
|
|
|
136 |
seen_keys = set()
|
137 |
for doc in results:
|
138 |
meta = doc.metadata
|
139 |
+
title = meta.get("title", "No Title")
|
140 |
+
link = meta.get("link", "")
|
141 |
+
description = meta.get("original_description", "No Description")
|
142 |
published = meta.get("published", "Unknown Date").strip()
|
143 |
+
|
144 |
+
# Clean and normalize all fields
|
145 |
+
title = clean_text(title)
|
146 |
+
link = clean_text(link)
|
147 |
+
description = clean_text(description)
|
148 |
+
|
149 |
+
description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
|
150 |
key = f"{title}|{link}|{published}|{description_hash}"
|
151 |
if key not in seen_keys:
|
152 |
seen_keys.add(key)
|
|
|
194 |
for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
|
195 |
if not meta:
|
196 |
continue
|
197 |
+
title = meta.get("title", "No Title")
|
198 |
+
link = meta.get("link", "")
|
199 |
+
description = meta.get("original_description", "No Description")
|
200 |
published = meta.get("published", "Unknown Date").strip()
|
201 |
+
|
202 |
+
# Clean and normalize all fields
|
203 |
+
title = clean_text(title)
|
204 |
+
link = clean_text(link)
|
205 |
+
description = clean_text(description)
|
206 |
+
|
207 |
+
description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
|
208 |
key = f"{title}|{link}|{published}|{description_hash}"
|
209 |
if key not in seen_keys:
|
210 |
seen_keys.add(key)
|
|
|
236 |
unique_articles = []
|
237 |
seen_cat_keys = set()
|
238 |
for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
|
239 |
+
key = f"{clean_text(article['title'])}|{clean_text(article['link'])}|{article['published']}"
|
240 |
if key not in seen_cat_keys:
|
241 |
seen_cat_keys.add(key)
|
242 |
unique_articles.append(article)
|
|
|
259 |
for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
|
260 |
if not meta or meta.get("category") != category:
|
261 |
continue
|
262 |
+
title = meta.get("title", "No Title")
|
263 |
+
link = meta.get("link", "")
|
264 |
+
description = meta.get("original_description", "No Description")
|
265 |
published = meta.get("published", "Unknown Date").strip()
|
266 |
+
|
267 |
+
# Clean and normalize all fields
|
268 |
+
title = clean_text(title)
|
269 |
+
link = clean_text(link)
|
270 |
+
description = clean_text(description)
|
271 |
+
|
272 |
+
description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
|
273 |
key = f"{title}|{link}|{published}|{description_hash}"
|
274 |
if key not in seen_keys:
|
275 |
seen_keys.add(key)
|