broadfield-dev committed
Commit b5bbce9 · verified · 1 Parent(s): 4a45db6

Update app.py

Files changed (1):
  app.py +43 -19
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import threading
 from flask import Flask, render_template, request, jsonify
-from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
+from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub, clean_text
 import logging
 import time
 from datetime import datetime
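
The new deduplication logic depends on clean_text from rss_processor, whose implementation is not shown in this diff. As a point of reference, a minimal sketch of a normalizer consistent with how it is called below might look like this (assumed behavior: unescape and strip HTML, collapse whitespace, lowercase; the real function may differ):

import re
from html import unescape

def clean_text(text: str) -> str:
    # Assumed normalizer, for illustration only: unescape HTML entities,
    # drop tags, collapse whitespace, and lowercase for stable comparison.
    if not text:
        return ""
    text = unescape(text)
    text = re.sub(r"<[^>]+>", " ", text)  # strip HTML tags
    text = re.sub(r"\s+", " ", text)      # collapse runs of whitespace
    return text.strip().lower()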
@@ -61,12 +61,18 @@ def index():
     for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
         if not meta:
             continue
-        title = meta.get("title", "No Title").strip().lower()
-        link = meta.get("link", "").strip().lower()
-        description = meta.get("original_description", "No Description").strip()
+        title = meta.get("title", "No Title")
+        link = meta.get("link", "")
+        description = meta.get("original_description", "No Description")
         published = meta.get("published", "Unknown Date").strip()
-        # Use a robust key with normalized fields and description hash for deduplication
-        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+
+        # Clean and normalize all fields
+        title = clean_text(title)
+        link = clean_text(link)
+        description = clean_text(description)
+
+        # Use a robust key with cleaned fields and description hash for deduplication
+        description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
         key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
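
Each of the routes touched below repeats the same key-building pattern: clean the title, link, and description, then hash the cleaned description with SHA-256 (replacing the previous MD5) so long descriptions do not bloat the key. A standalone sketch of that pattern, using a hypothetical helper name (build_dedup_key is not part of app.py):

import hashlib
from rss_processor import clean_text  # as imported in app.py

def build_dedup_key(meta: dict) -> str:
    # Mirrors the key format in this commit: cleaned title and link,
    # raw published date, and a SHA-256 digest of the cleaned description.
    title = clean_text(meta.get("title", "No Title"))
    link = clean_text(meta.get("link", ""))
    description = clean_text(meta.get("original_description", "No Description"))
    published = meta.get("published", "Unknown Date").strip()
    description_hash = hashlib.sha256(description.encode("utf-8")).hexdigest()
    return f"{title}|{link}|{published}|{description_hash}"

Two feed entries that differ only in markup or whitespace inside the description now produce the same digest, so the second one is skipped by the if key not in seen_keys check.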
@@ -130,11 +136,17 @@ def search():
     seen_keys = set()
     for doc in results:
         meta = doc.metadata
-        title = meta.get("title", "No Title").strip().lower()
-        link = meta.get("link", "").strip().lower()
-        description = meta.get("original_description", "No Description").strip()
+        title = meta.get("title", "No Title")
+        link = meta.get("link", "")
+        description = meta.get("original_description", "No Description")
         published = meta.get("published", "Unknown Date").strip()
-        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+
+        # Clean and normalize all fields
+        title = clean_text(title)
+        link = clean_text(link)
+        description = clean_text(description)
+
+        description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
         key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
@@ -182,11 +194,17 @@ def get_updates():
     for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
         if not meta:
             continue
-        title = meta.get("title", "No Title").strip().lower()
-        link = meta.get("link", "").strip().lower()
-        description = meta.get("original_description", "No Description").strip()
+        title = meta.get("title", "No Title")
+        link = meta.get("link", "")
+        description = meta.get("original_description", "No Description")
         published = meta.get("published", "Unknown Date").strip()
-        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+
+        # Clean and normalize all fields
+        title = clean_text(title)
+        link = clean_text(link)
+        description = clean_text(description)
+
+        description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
         key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)
@@ -218,7 +236,7 @@ def get_updates():
     unique_articles = []
     seen_cat_keys = set()
     for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
-        key = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
+        key = f"{clean_text(article['title'])}|{clean_text(article['link'])}|{article['published']}"
         if key not in seen_cat_keys:
             seen_cat_keys.add(key)
             unique_articles.append(article)
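
Note that this per-category pass keys only on the cleaned title, cleaned link, and published date; unlike the loops above, no description hash is included, so entries that differ only in their description still collapse to a single article. A quick illustration with hypothetical article dicts, assuming clean_text trims and lowercases as in the sketch after the import hunk:

a = {"title": "Launch Day",  "link": "https://example.com/launch", "published": "2024-01-01"}
b = {"title": " LAUNCH DAY", "link": "https://example.com/launch", "published": "2024-01-01"}

def cat_key(article: dict) -> str:
    return f"{clean_text(article['title'])}|{clean_text(article['link'])}|{article['published']}"

assert cat_key(a) == cat_key(b)  # both normalize to the same key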
@@ -241,11 +259,17 @@ def get_all_articles(category):
     for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
         if not meta or meta.get("category") != category:
             continue
-        title = meta.get("title", "No Title").strip().lower()
-        link = meta.get("link", "").strip().lower()
-        description = meta.get("original_description", "No Description").strip()
+        title = meta.get("title", "No Title")
+        link = meta.get("link", "")
+        description = meta.get("original_description", "No Description")
         published = meta.get("published", "Unknown Date").strip()
-        description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+
+        # Clean and normalize all fields
+        title = clean_text(title)
+        link = clean_text(link)
+        description = clean_text(description)
+
+        description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
         key = f"{title}|{link}|{published}|{description_hash}"
         if key not in seen_keys:
             seen_keys.add(key)