import os
import subprocess
import logging
import time
import hashlib

from flask import Flask, render_template, request, jsonify

from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def load_feeds_in_background():
    """Fetch every configured RSS feed, embed/store the articles, log timing."""
    logger.info("Starting to fetch and process RSS feeds in background")
    start_time = time.time()
    articles = fetch_rss_feeds()
    logger.info("Fetched %d articles", len(articles))
    process_and_store_articles(articles)
    logger.info("Articles processed and stored")
    logger.info("RSS feed loading took %.2f seconds", time.time() - start_time)


def _article_from_metadata(metadata):
    """Build the display dict the index template expects from one stored
    document's metadata, with the same defaults the original inlined."""
    return {
        "title": metadata.get("title", "No Title"),
        "link": metadata.get("link", ""),
        "description": metadata.get("original_description", "No Description"),
        "category": metadata.get("category", "Uncategorized"),
        "published": metadata.get("published", "Unknown Date"),
        "image": metadata.get("image", "svg"),
    }


def _dedupe_articles(metadatas):
    """Return unique article dicts from an iterable of metadata dicts.

    Uniqueness key is "title|link|md5(description)" so near-duplicate feed
    entries (same headline, same link, same body) collapse to one card.
    None/empty metadata entries are skipped.
    """
    unique_articles = {}
    for metadata in metadatas:
        if not metadata:  # Handle potential None metadata
            continue
        article = _article_from_metadata(metadata)
        desc_hash = hashlib.md5(article["description"].encode()).hexdigest()
        key = f"{article['title']}|{article['link']}|{desc_hash}"
        unique_articles.setdefault(key, article)
    return list(unique_articles.values())


def _categorize_articles(articles):
    """Group article dicts into {category: [article, ...]} for the template."""
    categorized_articles = {}
    for article in articles:
        categorized_articles.setdefault(article["category"], []).append(article)
    return categorized_articles


@app.route('/')
def index():
    """Render all stored articles immediately (even if the DB is empty) and
    kick off a background refresh of the feeds."""
    try:
        # Get all documents from Chroma DB using get()
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        # BUG FIX: the original wrapped each entry in a langchain `Document`
        # that was never imported, so any non-empty DB raised NameError here,
        # was swallowed by the except below, and the page always rendered
        # empty. It also indexed a string (`doc['documents'][0]`). Only the
        # metadata was ever used, so iterate it directly.
        metadatas = all_docs.get('metadatas') or []
        if metadatas:
            logger.info("Found %d documents in vector DB", len(metadatas))
        else:
            logger.warning("No metadata or documents found in vector DB")
        enriched_articles = _dedupe_articles(metadatas)
        logger.info("Enriched %d unique articles for display", len(enriched_articles))
    except Exception as e:
        logger.error("Error retrieving documents from vector DB: %s", e)
        enriched_articles = []  # Fallback if DB is empty or inaccessible

    # Start loading new feeds in the background
    subprocess.Popen(["python", "rss_processor.py", "load_feeds"])

    return render_template(
        "index.html",
        categorized_articles=_categorize_articles(enriched_articles),
        loading_new_feeds=True,
        has_articles=bool(enriched_articles),
    )


@app.route('/search', methods=['POST'])
def search():
    """Run a semantic similarity search over the vector DB and render the
    matching articles grouped by category; empty query renders an empty page."""
    query = request.form.get('search')
    if not query:
        return render_template(
            "index.html",
            categorized_articles={},
            loading_new_feeds=True,
            has_articles=False,
        )

    logger.info("Processing search query: %s", query)
    try:
        results = vector_db.similarity_search(query, k=10)
        enriched_articles = _dedupe_articles(doc.metadata for doc in results)
        logger.info("Search returned %d unique results", len(enriched_articles))
    except Exception as e:
        logger.error("Error performing search: %s", e)
        enriched_articles = []

    return render_template(
        "index.html",
        categorized_articles=_categorize_articles(enriched_articles),
        loading_new_feeds=True,
        has_articles=bool(enriched_articles),
    )


@app.route('/check_feeds', methods=['GET'])
def check_feeds():
    """Polling endpoint: 200/"loaded" once the vector DB has any documents,
    202/"loading" while it is still empty, 500 on DB errors."""
    try:
        # Check if vector DB has any documents
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if 'metadatas' in all_docs and all_docs['metadatas']:
            logger.info("Feeds loaded successfully in vector DB")
            return jsonify({"status": "loaded"})
        return jsonify({"status": "loading"}), 202
    except Exception as e:
        logger.error("Error checking feeds: %s", e)
        return jsonify({"status": "error", "message": str(e)}), 500


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)