Create app.py
app.py
ADDED
@@ -0,0 +1,259 @@
import os
import threading
from flask import Flask, render_template, request, jsonify
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
import logging
import time
from datetime import datetime

app = Flask(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global state to track background loading
loading_complete = False
last_update_time = time.time()

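# NOTE (assumption): rss_processor is not part of this commit. From its usage
# below, it is presumed to expose fetch_rss_feeds() -> list of article dicts,
# process_and_store_articles(articles), download_from_hf_hub() / upload_to_hf_hub()
# for syncing the chroma_db/ directory with the Hugging Face Hub, and a
# Chroma-style `vector_db` supporting .get(include=[...]) and
# .similarity_search(query, k). None of this is confirmed by the source.
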
def load_feeds_in_background():
    global loading_complete, last_update_time
    try:
        logger.info("Starting background RSS feed fetch")
        articles = fetch_rss_feeds()
        logger.info(f"Fetched {len(articles)} articles")
        process_and_store_articles(articles)
        last_update_time = time.time()  # Update timestamp when new articles are added
        logger.info("Background feed processing complete")
        # Upload updated DB to Hugging Face Hub
        upload_to_hf_hub()
        loading_complete = True
    except Exception as e:
        logger.error(f"Error in background feed loading: {e}")
        loading_complete = True

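# NOTE (assumption): loading_complete and last_update_time are plain module
# globals mutated from a daemon thread. Under CPython these flag-style
# assignments are safe enough for a polling UI, but this is not a general
# synchronization mechanism; a threading.Event would be the stricter choice.
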
@app.route('/')
def index():
    global loading_complete
    loading_complete = False  # Reset on each load

    # Ensure the Chroma DB is downloaded from the Hugging Face Hub on first load
    if not os.path.exists("chroma_db"):
        logger.info("Downloading Chroma DB from Hugging Face Hub...")
        download_from_hf_hub()

    # Start background feed loading
    threading.Thread(target=load_feeds_in_background, daemon=True).start()

    try:
        # Retrieve all articles from the Chroma DB
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            logger.info("No articles in DB yet")
            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

        # Process and categorize articles, keeping only the 10 most recent per category with strict deduplication
        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            # Use a composite key of trimmed fields to prevent duplicates
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                # Try to parse the published date; fall back to string sorting
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    # Fall back to a very old date for sorting if parsing fails
                    published = "1970-01-01T00:00:00"
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        # Sort by published date (ISO strings and fallbacks compare lexicographically)
        enriched_articles.sort(key=lambda x: x["published"], reverse=True)

        # Group by category and limit to the 10 most recent per category with final deduplication
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            # Add only if not already in the category list (extra deduplication)
            key = f"{article['title']}|{article['link']}|{article['published']}"
            if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                categorized_articles[cat].append(article)

        # Limit each category to its 10 most recent articles, sorting again for safety
        for cat in categorized_articles:
            unique_articles = []
            seen_cat_keys = set()
            for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
                key = f"{article['title']}|{article['link']}|{article['published']}"
                if key not in seen_cat_keys:
                    seen_cat_keys.add(key)
                    unique_articles.append(article)
            categorized_articles[cat] = unique_articles[:10]

        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
        return render_template("index.html",
                               categorized_articles=categorized_articles,
                               has_articles=True,
                               loading=True)
    except Exception as e:
        logger.error(f"Error retrieving articles: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

@app.route('/search', methods=['POST'])
def search():
    query = request.form.get('search')
    if not query:
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

    try:
        logger.info(f"Searching for: {query}")
        results = vector_db.similarity_search(query, k=10)
        enriched_articles = []
        seen_keys = set()
        for doc in results:
            meta = doc.metadata
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,  # use the stripped value, matching the dedup key
                    "image": meta.get("image", "svg"),
                })

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            categorized_articles.setdefault(cat, []).append(article)

        return render_template("index.html", categorized_articles=categorized_articles, has_articles=bool(enriched_articles), loading=False)
    except Exception as e:
        logger.error(f"Search error: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

@app.route('/check_loading')
def check_loading():
    global loading_complete, last_update_time
    if loading_complete:
        return jsonify({"status": "complete", "last_update": last_update_time})
    return jsonify({"status": "loading"}), 202

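# Expected responses from /check_loading, as implied by the handler above:
#   while loading: HTTP 202 with {"status": "loading"}
#   when complete: HTTP 200 with {"status": "complete", "last_update": <unix timestamp>}
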
@app.route('/get_updates')
def get_updates():
    global last_update_time
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "last_update": last_update_time})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"  # Fallback to a very old date
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            # Extra deduplication per category
            key = f"{article['title']}|{article['link']}|{article['published']}"
            if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                categorized_articles[cat].append(article)

        # Limit to the 10 most recent per category with final deduplication
        for cat in categorized_articles:
            unique_articles = []
            seen_cat_keys = set()
            for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
                key = f"{article['title']}|{article['link']}|{article['published']}"
                if key not in seen_cat_keys:
                    seen_cat_keys.add(key)
                    unique_articles.append(article)
            categorized_articles[cat] = unique_articles[:10]

        return jsonify({"articles": categorized_articles, "last_update": last_update_time})
    except Exception as e:
        logger.error(f"Error fetching updates: {e}")
        return jsonify({"articles": {}, "last_update": last_update_time}), 500

@app.route('/get_all_articles/<category>')
def get_all_articles(category):
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "category": category})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta or meta.get("category") != category:
                continue
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"  # Fallback to a very old date
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
        return jsonify({"articles": enriched_articles, "category": category})
    except Exception as e:
        logger.error(f"Error fetching all articles for category {category}: {e}")
        return jsonify({"articles": [], "category": category}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
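
Usage note: the routes above are built for a poll-then-refresh frontend. A minimal client sketch follows, assuming the app is reachable at http://localhost:7860 (the port bound above); the endpoint paths and JSON shapes are taken from the handlers, everything else is illustrative.

import time
import requests

BASE = "http://localhost:7860"  # assumption: local run on the port from app.run()

# Poll /check_loading; the server answers 202 while the background thread runs.
while True:
    resp = requests.get(f"{BASE}/check_loading", timeout=10)
    if resp.status_code == 200 and resp.json().get("status") == "complete":
        break
    time.sleep(2)

# Fetch the refreshed, categorized articles as JSON.
updates = requests.get(f"{BASE}/get_updates", timeout=10).json()
for category, articles in updates["articles"].items():
    print(category, len(articles))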