Update app.py
app.py
CHANGED
@@ -34,23 +34,29 @@ def load_feeds_in_background():
 
 @app.route('/')
 def index():
-    global loading_complete
-    loading_complete = False  # Reset on each load
+    global loading_complete, last_update_time
 
-    #
-
+    # Check if the database needs to be loaded (first time or empty)
+    db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
+    if not db_exists:
+        # First load: DB doesn't exist or is empty
+        loading_complete = False
         logger.info("Downloading Chroma DB from Hugging Face Hub...")
         download_from_hf_hub()
-
-
-
+        threading.Thread(target=load_feeds_in_background, daemon=True).start()
+    elif not loading_complete:
+        # Background loading is still in progress from a previous request
+        pass  # Let it continue, spinner will show
+    else:
+        # DB exists and loading is complete, no spinner needed
+        loading_complete = True
 
     try:
         # Retrieve all articles from Chroma DB
         all_docs = vector_db.get(include=['documents', 'metadatas'])
         if not all_docs.get('metadatas'):
             logger.info("No articles in DB yet")
-            return render_template("index.html", categorized_articles={}, has_articles=False, loading=
+            return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
 
         # Process and categorize articles, getting only 10 most recent per category with strict deduplication
         enriched_articles = []
@@ -61,15 +67,12 @@ def index():
             title = meta.get("title", "No Title").strip()
             link = meta.get("link", "").strip()
             published = meta.get("published", "Unknown Date").strip()
-            # Use a more robust key including trimmed fields to prevent duplicates
             key = f"{title}|{link}|{published}"
             if key not in seen_keys:
                 seen_keys.add(key)
-                # Try to parse published date, fallback to string sorting
                 try:
                     published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                 except (ValueError, TypeError):
-                    # Fallback to a very old date for sorting if parsing fails
                     published = "1970-01-01T00:00:00"
                 enriched_articles.append({
                     "title": title,
@@ -80,21 +83,17 @@ def index():
                     "image": meta.get("image", "svg"),
                 })
 
-        # Sort by published date (handle both datetime and string)
         enriched_articles.sort(key=lambda x: x["published"], reverse=True)
 
-        # Group by category and limit to 10 most recent per category with final deduplication
         categorized_articles = {}
         for article in enriched_articles:
             cat = article["category"]
             if cat not in categorized_articles:
                 categorized_articles[cat] = []
-            # Add only if not already in the category list (extra deduplication)
             key = f"{article['title']}|{article['link']}|{article['published']}"
             if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                 categorized_articles[cat].append(article)
 
-        # Limit to 10 most recent per category and sort again for safety
         for cat in categorized_articles:
             unique_articles = []
             seen_cat_keys = set()
@@ -109,10 +108,10 @@ def index():
         return render_template("index.html",
                                categorized_articles=categorized_articles,
                                has_articles=True,
-                               loading=
+                               loading=not loading_complete)
     except Exception as e:
         logger.error(f"Error retrieving articles: {e}")
-        return render_template("index.html", categorized_articles={}, has_articles=False, loading=
+        return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)
 
 @app.route('/search', methods=['POST'])
 def search():
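
For readers skimming the change, the first hunk's control flow is worth restating outside the route. Below is a minimal sketch, assuming the same module-level loading_complete flag, vector_db handle, download_from_hf_hub() helper, and load_feeds_in_background() worker as app.py; the ensure_feeds_loaded wrapper and its parameters are illustrative, not part of the app:

import os
import threading

loading_complete = True   # module-level flag, as in app.py
last_update_time = None   # named in the new `global` statement

def ensure_feeds_loaded(vector_db, download_from_hf_hub, load_feeds_in_background):
    # Mirrors the three branches the diff adds to index().
    global loading_complete
    # First load: the on-disk Chroma DB is missing or holds no documents yet.
    db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
    if not db_exists:
        loading_complete = False
        download_from_hf_hub()
        # Fetch feeds without blocking the request; daemon=True lets the
        # process exit even if the worker is still running.
        threading.Thread(target=load_feeds_in_background, daemon=True).start()
    elif not loading_complete:
        pass  # an earlier request already started loading; keep the spinner up
    else:
        loading_complete = True  # DB present and loaded; no spinner needed
    return not loading_complete  # what the template receives as `loading`

One consequence of this shape is that loading_complete is shared mutable state across requests; the background worker is expected to flip it back to True when it finishes, which is what lets the `elif` branch fall through while the spinner shows.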
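The dedup-and-cap pipeline from the later hunks is just as easy to exercise in isolation. A sketch under stated assumptions: app.py's two-pass per-category deduplication is condensed into a single pass here, MAX_PER_CATEGORY = 10 comes from the "10 most recent per category" comment, the "category" default is a guess, and the sample metadata at the bottom is fabricated:

from datetime import datetime

MAX_PER_CATEGORY = 10  # "10 most recent per category", per the diff's comments

def categorize(metadatas):
    # Dedupe on a title|link|published key, sort newest-first, cap per category.
    enriched, seen = [], set()
    for meta in metadatas:
        title = meta.get("title", "No Title").strip()
        link = meta.get("link", "").strip()
        published = meta.get("published", "Unknown Date").strip()
        key = f"{title}|{link}|{published}"
        if key in seen:
            continue  # strict global deduplication, as in the diff
        seen.add(key)
        # Normalize to ISO format; fall back to the epoch so unparseable
        # dates sort to the bottom, matching the diff's except branch.
        try:
            if "Unknown" not in published:
                published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat()
        except (ValueError, TypeError):
            published = "1970-01-01T00:00:00"
        enriched.append({"title": title, "link": link, "published": published,
                         "category": meta.get("category", "Uncategorized")})
    enriched.sort(key=lambda a: a["published"], reverse=True)
    categorized = {}
    for article in enriched:
        bucket = categorized.setdefault(article["category"], [])
        if len(bucket) < MAX_PER_CATEGORY:
            bucket.append(article)
    return categorized

# Fabricated demo data: the exact duplicate is dropped, the bad date sorts last.
sample = [
    {"title": "A", "link": "http://x/a", "published": "2024-05-01 10:00:00", "category": "Tech"},
    {"title": "A", "link": "http://x/a", "published": "2024-05-01 10:00:00", "category": "Tech"},
    {"title": "B", "link": "http://x/b", "published": "not a date", "category": "Tech"},
]
print({cat: [a["title"] for a in arts] for cat, arts in categorize(sample).items()})
# -> {'Tech': ['A', 'B']}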