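"""Flask front end for an RSS reader backed by a Chroma vector store.

Feeds are fetched and stored by rss_processor; the resulting Chroma DB is
synced to and from the Hugging Face Hub (download_from_hf_hub / upload_to_hf_hub).
"""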
import os
import threading
from flask import Flask, render_template, request, jsonify
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
import logging
import time
from datetime import datetime

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global flag to track background loading
loading_complete = False
last_update_time = time.time()
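
# Background refresh: fetch the feeds, process and store the articles, bump the
# update timestamp, then push the updated Chroma DB back to the Hugging Face Hub.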
def load_feeds_in_background():
    global loading_complete, last_update_time
    try:
        logger.info("Starting background RSS feed fetch")
        articles = fetch_rss_feeds()
        logger.info(f"Fetched {len(articles)} articles")
        process_and_store_articles(articles)
        last_update_time = time.time()  # Update timestamp when new articles are added
        logger.info("Background feed processing complete")
        # Upload updated DB to Hugging Face Hub
        upload_to_hf_hub()
        loading_complete = True
    except Exception as e:
        logger.error(f"Error in background feed loading: {e}")
        loading_complete = True
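
# Home page: make sure the Chroma DB exists locally, kick off a background
# refresh, then render up to 10 of the most recent stored articles per category.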
@app.route('/')
def index():
    global loading_complete
    loading_complete = False  # Reset on each load

    # Ensure Chroma DB is downloaded from Hugging Face Hub on first load
    if not os.path.exists("chroma_db"):
        logger.info("Downloading Chroma DB from Hugging Face Hub...")
        download_from_hf_hub()

    # Start background feed loading
    threading.Thread(target=load_feeds_in_background, daemon=True).start()

    try:
        # Retrieve all articles from Chroma DB
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            logger.info("No articles in DB yet")
            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

        # Process and categorize articles, getting 10 most recent per category
        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            key = f"{title}|{link}"
            if key not in seen_keys:
                seen_keys.add(key)
                # Try to parse published date, fallback to string sorting
                published = meta.get("published", "Unknown Date")
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    pass  # Keep as is if parsing fails
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        # Sort by published date (handle both datetime and string)
        enriched_articles.sort(key=lambda x: x["published"] if "Unknown" not in x["published"] else "1970-01-01", reverse=True)

        # Group by category and limit to 10 most recent per category
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)

        # Limit to 10 most recent per category
        for cat in categorized_articles:
            categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]

        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
        return render_template("index.html", categorized_articles=categorized_articles, has_articles=True, loading=True)
    except Exception as e:
        logger.error(f"Error retrieving articles: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)
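
# Search: run a similarity search over the vector store (top 10 matches) and
# render the de-duplicated results grouped by category in the same template.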
@app.route('/search', methods=['POST'])
def search():
    query = request.form.get('search')
    if not query:
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

    try:
        logger.info(f"Searching for: {query}")
        results = vector_db.similarity_search(query, k=10)
        enriched_articles = []
        seen_keys = set()
        for doc in results:
            meta = doc.metadata
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            key = f"{title}|{link}"
            if key not in seen_keys:
                seen_keys.add(key)
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": meta.get("published", "Unknown Date"),
                    "image": meta.get("image", "svg"),
                })

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            categorized_articles.setdefault(cat, []).append(article)

        return render_template("index.html", categorized_articles=categorized_articles, has_articles=bool(enriched_articles), loading=False)
    except Exception as e:
        logger.error(f"Search error: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)
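
# Status endpoint: responds 202 while the background refresh is still running,
# and 200 with the last update timestamp once it has finished.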
@app.route('/check_loading')
def check_loading():
    global loading_complete, last_update_time
    if loading_complete:
        return jsonify({"status": "complete", "last_update": last_update_time})
    return jsonify({"status": "loading"}), 202
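
# JSON view of the current categorized articles (10 most recent per category),
# presumably polled by the page so it can refresh without a full reload.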
@app.route('/get_updates')
def get_updates():
    global last_update_time
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "last_update": last_update_time})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            key = f"{title}|{link}"
            if key not in seen_keys:
                seen_keys.add(key)
                published = meta.get("published", "Unknown Date")
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    pass
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)

        # Limit to 10 most recent per category
        for cat in categorized_articles:
            categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]

        return jsonify({"articles": categorized_articles, "last_update": last_update_time})
    except Exception as e:
        logger.error(f"Error fetching updates: {e}")
        return jsonify({"articles": {}, "last_update": last_update_time}), 500
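
# Port 7860 is the default port a Hugging Face Space expects the app to listen on.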
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)