import subprocess
import logging
import time
import hashlib

from flask import Flask, render_template, request, jsonify
# Document was used below but never imported; it is assumed to be LangChain's
# Document class, the type that vector_db.similarity_search returns. Adjust
# the import path if rss_processor uses a different integration.
from langchain_core.documents import Document
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_feeds_in_background():
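    """Synchronous feed refresh. Not currently called from any route; the
    index view shells out to rss_processor.py in a child process instead."""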
    logger.info("Starting to fetch and process RSS feeds in background")
    start_time = time.time()
    articles = fetch_rss_feeds()
    logger.info(f"Fetched {len(articles)} articles")
    process_and_store_articles(articles)
    logger.info("Articles processed and stored")
    end_time = time.time()
    logger.info(f"RSS feed loading took {end_time - start_time:.2f} seconds")

@app.route('/')
def index():
    # Show all existing articles immediately, even if empty
    try:
        # Get all documents from Chroma DB using get()
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if 'metadatas' in all_docs and all_docs['metadatas']:
            # get() returns parallel lists of raw document texts and metadata
            # dicts; pair them back up into Document objects.
            stored_docs = [
                Document(page_content=text or "", metadata=meta)
                for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
            ]
            logger.info(f"Found {len(stored_docs)} documents in vector DB")
        else:
            stored_docs = []
            logger.warning("No metadata or documents found in vector DB")
        # Dedupe articles in a dict keyed by title, link, and full-description
        # hash (see the _dedupe_articles sketch above)
        unique_articles = {}
        for doc in stored_docs:
            if not doc.metadata:  # Handle potential None metadata
                continue
            title = doc.metadata.get("title", "No Title")
            link = doc.metadata.get("link", "")
            description = doc.metadata.get("original_description", "No Description")
            desc_hash = hashlib.md5(description.encode()).hexdigest()
            key = f"{title}|{link}|{desc_hash}"
            if key not in unique_articles:
                unique_articles[key] = {
                    "title": title,
                    "link": link,
                    "description": description,
                    "category": doc.metadata.get("category", "Uncategorized"),
                    "published": doc.metadata.get("published", "Unknown Date"),
                    "image": doc.metadata.get("image", "svg"),
                }
        enriched_articles = list(unique_articles.values())
        logger.info(f"Enriched {len(enriched_articles)} unique articles for display")
    except Exception as e:
        logger.error(f"Error retrieving documents from vector DB: {e}")
        enriched_articles = []  # Fallback if DB is empty or inaccessible

    # Kick off a fresh feed load in a detached child process; this is
    # fire-and-forget, so overlapping loads are possible across requests.
    subprocess.Popen(["python", "rss_processor.py", "load_feeds"])

    categorized_articles = {}
    for article in enriched_articles:
        cat = article["category"]
        if cat not in categorized_articles:
            categorized_articles[cat] = []
        categorized_articles[cat].append(article)

    return render_template("index.html", categorized_articles=categorized_articles, loading_new_feeds=True, has_articles=bool(enriched_articles))

@app.route('/search', methods=['POST'])
def search():
    query = request.form.get('search')
    if query:
        logger.info(f"Processing search query: {query}")
        try:
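            # k=10 caps how many nearest matches Chroma returns; the loop
            # below then dedupes them the same way index() does.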
            results = vector_db.similarity_search(query, k=10)
            unique_search_articles = {}
            for doc in results:
                title = doc.metadata.get("title", "No Title")
                link = doc.metadata.get("link", "")
                description = doc.metadata.get("original_description", "No Description")
                desc_hash = hashlib.md5(description.encode()).hexdigest()
                key = f"{title}|{link}|{desc_hash}"
                if key not in unique_search_articles:
                    unique_search_articles[key] = {
                        "title": title,
                        "link": link,
                        "description": description,
                        "category": doc.metadata.get("category", "Uncategorized"),
                        "published": doc.metadata.get("published", "Unknown Date"),
                        "image": doc.metadata.get("image", "svg"),
                    }
            enriched_articles = list(unique_search_articles.values())
            logger.info(f"Search returned {len(enriched_articles)} unique results")
        except Exception as e:
            logger.error(f"Error performing search: {e}")
            enriched_articles = []

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)

        return render_template("index.html", categorized_articles=categorized_articles, loading_new_feeds=True, has_articles=bool(enriched_articles))
    return render_template("index.html", categorized_articles={}, loading_new_feeds=True, has_articles=False)

@app.route('/check_feeds', methods=['GET'])
def check_feeds():
    try:
        # Check if vector DB has any documents
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if 'metadatas' in all_docs and all_docs['metadatas']:
            logger.info("Feeds loaded successfully in vector DB")
            return jsonify({"status": "loaded"})
        return jsonify({"status": "loading"}), 202
    except Exception as e:
        logger.error(f"Error checking feeds: {e}")
        return jsonify({"status": "error", "message": str(e)}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)