Spaces:
Running
Running
File size: 4,369 Bytes
3a7387c 33e2dac 7bafad1 cb518f2 d695e20 33e2dac 3a7387c cb518f2 33e2dac d695e20 3a7387c cb518f2 7bafad1 cb518f2 d695e20 33e2dac d695e20 33e2dac 3156b44 33e2dac 3156b44 33e2dac 3156b44 ce02056 1f5e987 ce02056 cb518f2 ce02056 3156b44 33e2dac 3156b44 33e2dac 3156b44 3a7387c ce02056 3a7387c 6680594 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import hashlib
import logging
import os
import time
from threading import Thread

from flask import Flask, render_template, request, Response, jsonify

from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
# Logging: configure the root logger at INFO and grab this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Flask application object that the route decorators in this module attach to.
app = Flask(__name__)
def load_feeds_in_background():
    """Fetch the RSS feeds, store the resulting articles, and log the timing.

    Intended as a thread target: it takes no arguments and returns nothing.
    Progress and the total elapsed wall-clock time are written to the module
    logger.
    """
    logger.info("Starting to fetch and process RSS feeds in background")
    started_at = time.time()

    fetched = fetch_rss_feeds()
    logger.info(f"Fetched {len(fetched)} articles")

    process_and_store_articles(fetched)
    logger.info("Articles processed and stored")

    elapsed = time.time() - started_at
    logger.info(f"RSS feed loading took {elapsed:.2f} seconds")
@app.route('/')
def loading():
    """Start feed loading on a daemon thread and return the loading page."""
    # Daemon thread: it will not keep the process alive on shutdown.
    Thread(target=load_feeds_in_background, daemon=True).start()
    return render_template("loading.html")
@app.route('/check_feeds', methods=['GET'])
def check_feeds():
    """Report whether the vector store already holds at least one document.

    Returns:
        A JSON response ``{"status": "loaded"}`` (default status code) when a
        probe query returns a document, ``({"status": "loading"}, 202)`` when
        it returns nothing, and ``({"status": "error", "message": ...}, 500)``
        when the probe raises.
    """
    try:
        # Simplified readiness probe: any single hit counts as "loaded".
        probe_hits = vector_db.similarity_search("news", k=1)
        if not probe_hits:
            return jsonify({"status": "loading"}), 202
        return jsonify({"status": "loaded"})
    except Exception as err:
        logger.error(f"Error checking feeds: {err}")
        return jsonify({"status": "error", "message": str(err)}), 500
# Seconds to sleep between readiness probes while feeds are still loading.
_FEED_POLL_INTERVAL_SECONDS = 1
# Upper bound on how long one /index request waits for feeds to appear.
_FEED_WAIT_TIMEOUT_SECONDS = 120


def _feeds_loaded():
    """Return True when a probe query against the vector store finds a document."""
    try:
        return bool(vector_db.similarity_search("news", k=1))
    except Exception:
        # Best effort: a failing probe counts as "not ready yet"; the caller's
        # deadline bounds how long we keep retrying.
        logger.exception("Error checking feeds")
        return False


def _wait_for_feeds(timeout=_FEED_WAIT_TIMEOUT_SECONDS,
                    interval=_FEED_POLL_INTERVAL_SECONDS):
    """Poll until feeds are loaded; return False if *timeout* seconds elapse first."""
    deadline = time.monotonic() + timeout
    while not _feeds_loaded():
        if time.monotonic() >= deadline:
            return False
        time.sleep(interval)
    return True


def _unique_articles(docs):
    """Convert documents to display dicts, keeping the first of each duplicate.

    Two documents are duplicates when they share the same title, link, and
    (truncated MD5) description hash.
    """
    unique = {}
    for doc in docs:
        meta = doc.metadata
        title = meta["title"]
        link = meta["link"]
        description = meta["original_description"]
        # Short hash keeps the key compact; it is used for de-duplication only.
        desc_hash = hashlib.md5(description.encode()).hexdigest()[:10]
        key = f"{title}|{link}|{desc_hash}"
        if key not in unique:
            unique[key] = {
                "title": title,
                "link": link,
                "description": description,
                "category": meta["category"],
                "published": meta["published"],
                "image": meta.get("image", "svg"),
            }
    return list(unique.values())


def _group_by_category(articles):
    """Group article dicts into ``{category: [article, ...]}`` preserving order."""
    grouped = {}
    for article in articles:
        grouped.setdefault(article["category"], []).append(article)
    return grouped


@app.route('/index', methods=['GET', 'POST'])
def index():
    """Render the categorized article listing, or search results for a POST.

    Waits (bounded by ``_FEED_WAIT_TIMEOUT_SECONDS``) until the vector store
    has at least one document. A POST carrying a non-empty ``search`` form
    field shows the top matches for that query; any other request shows all
    stored articles. Either way the articles are de-duplicated and grouped by
    category before rendering ``index.html``.

    Returns:
        The rendered ``index.html`` page, or ``(loading.html, 503)`` when the
        feeds did not become available before the wait timed out.
    """
    # The readiness probe is done directly rather than through check_feeds():
    # that view returns (response, status) tuples for the non-200 cases, which
    # have no ``status_code`` attribute to inspect.
    if not _wait_for_feeds():
        logger.warning(
            "Feeds not loaded after %s seconds", _FEED_WAIT_TIMEOUT_SECONDS
        )
        return render_template("loading.html"), 503

    query = request.form.get('search') if request.method == 'POST' else None
    if query:
        logger.info(f"Processing search query: {query}")
        articles = _unique_articles(vector_db.similarity_search(query, k=10))
        logger.info(f"Search returned {len(articles)} unique results")
    else:
        # Large k so that (presumably) every stored article is returned.
        articles = _unique_articles(vector_db.similarity_search("news", k=1000))
        logger.info(f"Enriched {len(articles)} unique articles for display")

    return render_template(
        "index.html",
        categorized_articles=_group_by_category(articles),
    )
if __name__ == "__main__":
    # Script entry point: serve on all interfaces, port 7860.
    # NOTE(review): 7860 looks like the hosting platform's expected port — confirm.
    app.run(host="0.0.0.0", port=7860)