import os
import threading
from flask import Flask, render_template, request, jsonify
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
import logging
import time
from datetime import datetime

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global flag to track background loading
loading_complete = False
last_update_time = time.time()

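# Background worker: fetch the RSS feeds, store new articles in the vector DB,
# then push the updated DB back to the Hugging Face Hub.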
def load_feeds_in_background():
    global loading_complete, last_update_time
    try:
        logger.info("Starting background RSS feed fetch")
        articles = fetch_rss_feeds()
        logger.info(f"Fetched {len(articles)} articles")
        process_and_store_articles(articles)
        last_update_time = time.time()  # Update timestamp when new articles are added
        logger.info("Background feed processing complete")
        # Upload updated DB to Hugging Face Hub
        upload_to_hf_hub()
        loading_complete = True
    except Exception as e:
        logger.error(f"Error in background feed loading: {e}")
        loading_complete = True

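# Main page: renders whatever is already in the vector DB and kicks off a background refresh.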
@app.route('/')
def index():
    global loading_complete
    loading_complete = False  # Reset on each load

    # Ensure Chroma DB is downloaded from Hugging Face Hub on first load
    if not os.path.exists("chroma_db"):
        logger.info("Downloading Chroma DB from Hugging Face Hub...")
        download_from_hf_hub()

    # Start background feed loading
    threading.Thread(target=load_feeds_in_background, daemon=True).start()

    try:
        # Retrieve all articles from Chroma DB
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            logger.info("No articles in DB yet")
            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

        # Process and categorize articles, getting 10 most recent per category
        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            key = f"{title}|{link}"
            if key not in seen_keys:
                seen_keys.add(key)
                # Try to parse published date, fallback to string sorting
                published = meta.get("published", "Unknown Date")
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    pass  # Keep as is if parsing fails
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        # Sort by published date (ISO strings sort lexicographically; unknown dates are treated as oldest)
        enriched_articles.sort(key=lambda x: x["published"] if "Unknown" not in x["published"] else "1970-01-01", reverse=True)

        # Group by category and limit to 10 most recent per category
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)
        
        # Limit to 10 most recent per category
        for cat in categorized_articles:
            categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"] if "Unknown" not in x["published"] else "1970-01-01", reverse=True)[:10]

        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
        return render_template("index.html", categorized_articles=categorized_articles, has_articles=True, loading=True)
    except Exception as e:
        logger.error(f"Error retrieving articles: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

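# Vector similarity search over the stored articles; returns the top matches grouped by category.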
@app.route('/search', methods=['POST'])
def search():
    query = request.form.get('search')
    if not query:
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

    try:
        logger.info(f"Searching for: {query}")
        results = vector_db.similarity_search(query, k=10)
        enriched_articles = []
        seen_keys = set()
        for doc in results:
            meta = doc.metadata
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            key = f"{title}|{link}"
            if key not in seen_keys:
                seen_keys.add(key)
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": meta.get("published", "Unknown Date"),
                    "image": meta.get("image", "svg"),
                })

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            categorized_articles.setdefault(cat, []).append(article)

        return render_template("index.html", categorized_articles=categorized_articles, has_articles=bool(enriched_articles), loading=False)
    except Exception as e:
        logger.error(f"Search error: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

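# Status endpoint (presumably polled by the frontend) reporting whether the background refresh has finished.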
@app.route('/check_loading')
def check_loading():
    global loading_complete, last_update_time
    if loading_complete:
        return jsonify({"status": "complete", "last_update": last_update_time})
    return jsonify({"status": "loading"}), 202

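# JSON endpoint returning the current categorized articles, likely used by the frontend to refresh the page in place.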
@app.route('/get_updates')
def get_updates():
    global last_update_time
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "last_update": last_update_time})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            key = f"{title}|{link}"
            if key not in seen_keys:
                seen_keys.add(key)
                published = meta.get("published", "Unknown Date")
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    pass
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"] if "Unknown" not in x["published"] else "1970-01-01", reverse=True)
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)

        # Limit to 10 most recent per category
        for cat in categorized_articles:
            categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"] if "Unknown" not in x["published"] else "1970-01-01", reverse=True)[:10]

        return jsonify({"articles": categorized_articles, "last_update": last_update_time})
    except Exception as e:
        logger.error(f"Error fetching updates: {e}")
        return jsonify({"articles": {}, "last_update": last_update_time}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
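
# Example requests against a locally running instance (assuming the default host/port above;
# the search term is just an illustrative value):
#   curl http://localhost:7860/                                      # render the index page
#   curl -X POST -d "search=climate" http://localhost:7860/search    # vector search
#   curl http://localhost:7860/check_loading                         # background-load status
#   curl http://localhost:7860/get_updates                           # categorized articles as JSON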