broadfield-dev committed
Commit 1c7cefc · verified · 1 parent: a947e87

Create app.py

Files changed (1)
  1. app.py +259 -0
app.py ADDED
@@ -0,0 +1,259 @@
+ import os
+ import threading
+ from flask import Flask, render_template, request, jsonify
+ from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
+ import logging
+ import time
+ from datetime import datetime
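+
+ # Note: rss_processor is a project-local module (not part of this commit); it is
+ # assumed to provide the feed fetcher, the Chroma-backed vector store, and the
+ # helpers that sync the local chroma_db directory with a Hugging Face Hub repo.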
+
+ app = Flask(__name__)
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Global flags to track background loading state
+ loading_complete = False
+ last_update_time = time.time()
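+ # These globals are written by a daemon thread and read by request handlers.
+ # Plain assignments like these should be safe enough under CPython's GIL for a
+ # single-writer flag, but any compound update would need a threading.Lock.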
+
+ def load_feeds_in_background():
+     global loading_complete, last_update_time
+     try:
+         logger.info("Starting background RSS feed fetch")
+         articles = fetch_rss_feeds()
+         logger.info(f"Fetched {len(articles)} articles")
+         process_and_store_articles(articles)
+         last_update_time = time.time()  # Update timestamp when new articles are added
+         logger.info("Background feed processing complete")
+         # Upload updated DB to Hugging Face Hub
+         upload_to_hf_hub()
+         loading_complete = True
+     except Exception as e:
+         logger.error(f"Error in background feed loading: {e}")
+         loading_complete = True
+
+ @app.route('/')
+ def index():
+     global loading_complete
+     loading_complete = False  # Reset on each load
+
+     # Ensure Chroma DB is downloaded from Hugging Face Hub on first load
+     if not os.path.exists("chroma_db"):
+         logger.info("Downloading Chroma DB from Hugging Face Hub...")
+         download_from_hf_hub()
+
+     # Start background feed loading
+     threading.Thread(target=load_feeds_in_background, daemon=True).start()
+
+     try:
+         # Retrieve all articles from Chroma DB
+         all_docs = vector_db.get(include=['documents', 'metadatas'])
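+         # Assumption: vector_db is a LangChain Chroma store, so get() returns a
+         # dict shaped like {'ids': [...], 'documents': [...], 'metadatas': [...]}.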
+         if not all_docs.get('metadatas'):
+             logger.info("No articles in DB yet")
+             return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
+
+         # Process and categorize articles, keeping only the 10 most recent per category with strict deduplication
+         enriched_articles = []
+         seen_keys = set()
+         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+             if not meta:
+                 continue
+             title = meta.get("title", "No Title").strip()
+             link = meta.get("link", "").strip()
+             published = meta.get("published", "Unknown Date").strip()
+             # Use a robust key built from the trimmed fields to prevent duplicates
+             key = f"{title}|{link}|{published}"
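+             # e.g. "Some Headline|https://example.com/post|2025-01-15 08:30:00" (illustrative)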
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 # Try to parse the published date; fall back to string sorting
+                 try:
+                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
+                 except (ValueError, TypeError):
+                     # Fall back to a very old date for sorting if parsing fails
+                     published = "1970-01-01T00:00:00"
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": meta.get("original_description", "No Description"),
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         # Sort newest-first by published date (every value is a string at this point)
+         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
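+         # Note: this is a lexicographic sort. ISO-8601 timestamps happen to sort
+         # chronologically, but "Unknown Date" starts with 'U' and therefore sorts
+         # ahead of any digit-led timestamp when reverse=True.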
+
+         # Group by category and limit to the 10 most recent per category, with final deduplication
+         categorized_articles = {}
+         for article in enriched_articles:
+             cat = article["category"]
+             if cat not in categorized_articles:
+                 categorized_articles[cat] = []
+             # Add only if not already in the category list (extra deduplication)
+             key = f"{article['title']}|{article['link']}|{article['published']}"
+             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
+                 categorized_articles[cat].append(article)
+
+         # Limit to the 10 most recent per category and sort again for safety
+         for cat in categorized_articles:
+             unique_articles = []
+             seen_cat_keys = set()
+             for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
+                 key = f"{article['title']}|{article['link']}|{article['published']}"
+                 if key not in seen_cat_keys:
+                     seen_cat_keys.add(key)
+                     unique_articles.append(article)
+             categorized_articles[cat] = unique_articles[:10]
+
+         logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
+         return render_template("index.html",
+                                categorized_articles=categorized_articles,
+                                has_articles=True,
+                                loading=True)
+     except Exception as e:
+         logger.error(f"Error retrieving articles: {e}")
+         return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
+
+ @app.route('/search', methods=['POST'])
+ def search():
+     query = request.form.get('search')
+     if not query:
+         return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)
+
+     try:
+         logger.info(f"Searching for: {query}")
+         results = vector_db.similarity_search(query, k=10)
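+         # Assumption: similarity_search() follows the LangChain vector-store API and
+         # returns a list of Document objects whose .metadata mirrors what was stored.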
+         enriched_articles = []
+         seen_keys = set()
+         for doc in results:
+             meta = doc.metadata
+             title = meta.get("title", "No Title").strip()
+             link = meta.get("link", "").strip()
+             published = meta.get("published", "Unknown Date").strip()
+             key = f"{title}|{link}|{published}"
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": meta.get("original_description", "No Description"),
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": meta.get("published", "Unknown Date"),
+                     "image": meta.get("image", "svg"),
+                 })
+
+         categorized_articles = {}
+         for article in enriched_articles:
+             cat = article["category"]
+             categorized_articles.setdefault(cat, []).append(article)
+
+         return render_template("index.html", categorized_articles=categorized_articles, has_articles=bool(enriched_articles), loading=False)
+     except Exception as e:
+         logger.error(f"Search error: {e}")
+         return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)
+
+ @app.route('/check_loading')
+ def check_loading():
+     global loading_complete, last_update_time
+     if loading_complete:
+         return jsonify({"status": "complete", "last_update": last_update_time})
+     return jsonify({"status": "loading"}), 202
+
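+ # A client can poll /check_loading until the background fetch finishes.
+ # Minimal sketch of a hypothetical polling client (assumes the app is
+ # reachable on localhost:7860):
+ #
+ #     import requests, time
+ #     while requests.get("http://localhost:7860/check_loading").status_code == 202:
+ #         time.sleep(2)
+ #     # A 200 response carries {"status": "complete", "last_update": <epoch seconds>}
+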
+ @app.route('/get_updates')
+ def get_updates():
+     global last_update_time
+     try:
+         all_docs = vector_db.get(include=['documents', 'metadatas'])
+         if not all_docs.get('metadatas'):
+             return jsonify({"articles": [], "last_update": last_update_time})
+
+         enriched_articles = []
+         seen_keys = set()
+         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+             if not meta:
+                 continue
+             title = meta.get("title", "No Title").strip()
+             link = meta.get("link", "").strip()
+             published = meta.get("published", "Unknown Date").strip()
+             key = f"{title}|{link}|{published}"
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 try:
+                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
+                 except (ValueError, TypeError):
+                     published = "1970-01-01T00:00:00"  # Fall back to a very old date
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": meta.get("original_description", "No Description"),
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
+         categorized_articles = {}
+         for article in enriched_articles:
+             cat = article["category"]
+             if cat not in categorized_articles:
+                 categorized_articles[cat] = []
+             # Extra per-category deduplication
+             key = f"{article['title']}|{article['link']}|{article['published']}"
+             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
+                 categorized_articles[cat].append(article)
+
+         # Limit to the 10 most recent per category with final deduplication
+         for cat in categorized_articles:
+             unique_articles = []
+             seen_cat_keys = set()
+             for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
+                 key = f"{article['title']}|{article['link']}|{article['published']}"
+                 if key not in seen_cat_keys:
+                     seen_cat_keys.add(key)
+                     unique_articles.append(article)
+             categorized_articles[cat] = unique_articles[:10]
+
+         return jsonify({"articles": categorized_articles, "last_update": last_update_time})
+     except Exception as e:
+         logger.error(f"Error fetching updates: {e}")
+         return jsonify({"articles": {}, "last_update": last_update_time}), 500
+
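+ # Note: /get_updates and /get_all_articles repeat the enrichment and dedup logic
+ # from index(); if the key format or date handling changes, all three must stay
+ # in sync (or be extracted into a shared helper).
+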
+ @app.route('/get_all_articles/<category>')
+ def get_all_articles(category):
+     try:
+         all_docs = vector_db.get(include=['documents', 'metadatas'])
+         if not all_docs.get('metadatas'):
+             return jsonify({"articles": [], "category": category})
+
+         enriched_articles = []
+         seen_keys = set()
+         for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
+             if not meta or meta.get("category") != category:
+                 continue
+             title = meta.get("title", "No Title").strip()
+             link = meta.get("link", "").strip()
+             published = meta.get("published", "Unknown Date").strip()
+             key = f"{title}|{link}|{published}"
+             if key not in seen_keys:
+                 seen_keys.add(key)
+                 try:
+                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
+                 except (ValueError, TypeError):
+                     published = "1970-01-01T00:00:00"  # Fall back to a very old date
+                 enriched_articles.append({
+                     "title": title,
+                     "link": link,
+                     "description": meta.get("original_description", "No Description"),
+                     "category": meta.get("category", "Uncategorized"),
+                     "published": published,
+                     "image": meta.get("image", "svg"),
+                 })
+
+         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
+         return jsonify({"articles": enriched_articles, "category": category})
+     except Exception as e:
+         logger.error(f"Error fetching all articles for category {category}: {e}")
+         return jsonify({"articles": [], "category": category}), 500
+
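+ # Port 7860 is the default port a Hugging Face Space expects; binding to 0.0.0.0
+ # exposes the app inside the Space's container.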
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860)