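# Flask front end for an RSS reader backed by a Chroma vector store.
# rss_processor.py fetches and embeds the feeds; this app renders the
# stored articles and exposes a semantic /search endpoint on the same store.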
import hashlib
import logging
import os
import subprocess
import time

from flask import Flask, render_template, request, jsonify
# Document is used below to rebuild objects from vector_db.get() output.
# This import assumes a langchain-core setup; adjust it if rss_processor
# exposes its own Document class.
from langchain_core.documents import Document
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db

app = Flask(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
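
# Synchronous helper that runs the full fetch -> process -> store pipeline.
# Note that the routes below shell out to rss_processor.py in a subprocess
# instead, so requests can return immediately.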
def load_feeds_in_background():
    logger.info("Starting to fetch and process RSS feeds in background")
    start_time = time.time()
    articles = fetch_rss_feeds()
    logger.info(f"Fetched {len(articles)} articles")
    process_and_store_articles(articles)
    logger.info("Articles processed and stored")
    end_time = time.time()
    logger.info(f"RSS feed loading took {end_time - start_time:.2f} seconds")
@app.route('/')
def index():
    # Show all existing articles immediately, even if empty
    try:
        # Get all documents from Chroma DB using get()
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if all_docs.get('metadatas'):
            # get() returns parallel lists of texts and metadata, not Document
            # objects, so zip them back together.
            stored_docs = [
                Document(page_content=text or "", metadata=meta)
                for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
            ]
            logger.info(f"Found {len(stored_docs)} documents in vector DB")
        else:
            stored_docs = []
            logger.warning("No metadata or documents found in vector DB")

        # Deduplicate articles via a dict keyed on title, link, and a hash of
        # the full description
        unique_articles = {}
        for doc in stored_docs:
            if not doc.metadata:  # Handle potential None metadata
                continue
            title = doc.metadata.get("title", "No Title")
            link = doc.metadata.get("link", "")
            description = doc.metadata.get("original_description", "No Description")
            desc_hash = hashlib.md5(description.encode()).hexdigest()
            key = f"{title}|{link}|{desc_hash}"
            if key not in unique_articles:
                unique_articles[key] = {
                    "title": title,
                    "link": link,
                    "description": description,
                    "category": doc.metadata.get("category", "Uncategorized"),
                    "published": doc.metadata.get("published", "Unknown Date"),
                    "image": doc.metadata.get("image", "svg"),
                }
        enriched_articles = list(unique_articles.values())
        logger.info(f"Enriched {len(enriched_articles)} unique articles for display")
    except Exception as e:
        logger.error(f"Error retrieving documents from vector DB: {e}")
        enriched_articles = []  # Fallback if DB is empty or inaccessible

    # Start loading new feeds in the background
    subprocess.Popen(["python", "rss_processor.py", "load_feeds"])

    # Group articles by category for the template
    categorized_articles = {}
    for article in enriched_articles:
        categorized_articles.setdefault(article["category"], []).append(article)

    return render_template(
        "index.html",
        categorized_articles=categorized_articles,
        loading_new_feeds=True,
        has_articles=bool(enriched_articles),
    )
@app.route('/search', methods=['POST'])
def search():
    query = request.form.get('search')
    if query:
        logger.info(f"Processing search query: {query}")
        try:
            results = vector_db.similarity_search(query, k=10)
            # Same dedup scheme as the index route: title, link, description hash
            unique_search_articles = {}
            for doc in results:
                title = doc.metadata.get("title", "No Title")
                link = doc.metadata.get("link", "")
                description = doc.metadata.get("original_description", "No Description")
                desc_hash = hashlib.md5(description.encode()).hexdigest()
                key = f"{title}|{link}|{desc_hash}"
                if key not in unique_search_articles:
                    unique_search_articles[key] = {
                        "title": title,
                        "link": link,
                        "description": description,
                        "category": doc.metadata.get("category", "Uncategorized"),
                        "published": doc.metadata.get("published", "Unknown Date"),
                        "image": doc.metadata.get("image", "svg"),
                    }
            enriched_articles = list(unique_search_articles.values())
            logger.info(f"Search returned {len(enriched_articles)} unique results")
        except Exception as e:
            logger.error(f"Error performing search: {e}")
            enriched_articles = []

        categorized_articles = {}
        for article in enriched_articles:
            categorized_articles.setdefault(article["category"], []).append(article)

        return render_template(
            "index.html",
            categorized_articles=categorized_articles,
            loading_new_feeds=True,
            has_articles=bool(enriched_articles),
        )

    # Empty query: fall back to an empty page
    return render_template(
        "index.html", categorized_articles={}, loading_new_feeds=True, has_articles=False
    )
@app.route('/check_feeds', methods=['GET'])
def check_feeds():
    try:
        # Check if vector DB has any documents
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if all_docs.get('metadatas'):
            logger.info("Feeds loaded successfully in vector DB")
            return jsonify({"status": "loaded"})
        return jsonify({"status": "loading"}), 202
    except Exception as e:
        logger.error(f"Error checking feeds: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)