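"""Flask front-end for an RSS feed aggregator backed by a Chroma vector store.

Renders categorized article listings, serves a semantic search endpoint, and
keeps the local store in sync with the Hugging Face Hub via helpers imported
from rss_processor. Feed refreshes run in a background thread so the UI stays
responsive while new articles are fetched and stored.
"""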
import os
import threading
from flask import Flask, render_template, request, jsonify
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub, clean_text
import logging
import time
from datetime import datetime
import hashlib

app = Flask(__name__)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global flag to track background loading
loading_complete = False
last_update_time = time.time()

def load_feeds_in_background():
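    """Fetch the RSS feeds, store the articles, and push the refreshed DB to the Hub.

    Runs in a daemon thread started from the index route. Sets the module-level
    loading_complete flag when finished (also on failure) so /check_loading
    reports a final state.
    """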
    global loading_complete, last_update_time
    try:
        logger.info("Starting background RSS feed fetch")
        articles = fetch_rss_feeds()
        logger.info(f"Fetched {len(articles)} articles")
        process_and_store_articles(articles)
        last_update_time = time.time()
        logger.info("Background feed processing complete")
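        # Push the refreshed Chroma store back to the Hugging Face Hub so a later
        # cold start can download it instead of rebuilding from scratch.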
        upload_to_hf_hub()
        loading_complete = True
    except Exception as e:
        logger.error(f"Error in background feed loading: {e}")
        loading_complete = True

@app.route('/')
def index():
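    """Render the home page with articles grouped by category.

    If no usable local Chroma store exists, download a copy from the Hugging Face
    Hub and start a background feed refresh. Articles are deduplicated on a
    title/link/date/description-hash key, sorted by publish date, and capped at
    the 10 most recent per category.
    """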
    global loading_complete, last_update_time

    db_exists = os.path.exists("chroma_db") and vector_db.get().get('documents')
    if not db_exists:
        loading_complete = False
        logger.info("Downloading Chroma DB from Hugging Face Hub...")
        download_from_hf_hub()
        threading.Thread(target=load_feeds_in_background, daemon=True).start()
    # Otherwise serve whatever is already in the local store; loading_complete
    # keeps its current value until the background refresh finishes.

    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        total_docs = len(all_docs['documents']) if all_docs.get('documents') else 0
        logger.info(f"Total articles in DB: {total_docs}")
        if not all_docs.get('metadatas'):
            logger.info("No articles in DB yet")
            return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)

        # Process and categorize articles with strict deduplication
        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            description = meta.get("original_description", "No Description")
            published = meta.get("published", "Unknown Date").strip()

            # Clean and normalize all fields
            title = clean_text(title)
            link = clean_text(link)
            description = clean_text(description)

            # Use a robust key with cleaned fields and description hash for deduplication
            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
            key = f"{title}|{link}|{published}|{description_hash}"
            if key not in seen_keys:
                seen_keys.add(key)
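                # Normalize "YYYY-MM-DD HH:MM:SS" timestamps to ISO 8601 so the
                # lexicographic sort below is chronological.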
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })
            else:
                logger.debug(f"Duplicate found in retrieval: {key}")

        # Sort by published date (stable sort)
        enriched_articles.sort(key=lambda x: x["published"], reverse=True)

        # Group by category and limit to 10 most recent per category
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            categorized_articles[cat].append(article)

        # Sort categories alphabetically
        categorized_articles = dict(sorted(categorized_articles.items(), key=lambda x: x[0].lower()))

        # Limit to 10 most recent per category and log top 2 for debugging
        for cat in categorized_articles:
            categorized_articles[cat] = sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True)[:10]
            if len(categorized_articles[cat]) >= 2:
                logger.debug(f"Category {cat} top 2: {categorized_articles[cat][0]['title']} | {categorized_articles[cat][1]['title']}")

        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
        return render_template("index.html", 
                              categorized_articles=categorized_articles, 
                              has_articles=True, 
                              loading=not loading_complete)
    except Exception as e:
        logger.error(f"Error retrieving articles: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=not loading_complete)

@app.route('/search', methods=['POST'])
def search():
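    """Handle a search query with a vector similarity search over stored articles.

    The top matches are deduplicated, grouped by category, and returned as JSON.
    """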
    query = request.form.get('search')
    if not query:
        logger.info("Empty search query received")
        return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False})

    try:
        logger.info(f"Searching for: {query}")
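        # Top 10 nearest neighbours from the Chroma store (semantic match against the query)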
        results = vector_db.similarity_search(query, k=10)
        logger.info(f"Search returned {len(results)} results")
        
        enriched_articles = []
        seen_keys = set()
        for doc in results:
            meta = doc.metadata
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            description = meta.get("original_description", "No Description")
            published = meta.get("published", "Unknown Date").strip()

            # Clean and normalize all fields
            title = clean_text(title)
            link = clean_text(link)
            description = clean_text(description)

            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
            key = f"{title}|{link}|{published}|{description_hash}"
            if key not in seen_keys:
                seen_keys.add(key)
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            categorized_articles.setdefault(cat, []).append(article)

        logger.info(f"Found {len(enriched_articles)} unique articles across {len(categorized_articles)} categories")
        return jsonify({
            "categorized_articles": categorized_articles,
            "has_articles": bool(enriched_articles),
            "loading": False
        })
    except Exception as e:
        logger.error(f"Search error: {e}")
        return jsonify({"categorized_articles": {}, "has_articles": False, "loading": False}), 500

@app.route('/check_loading')
def check_loading():
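    """Report whether the background feed refresh has finished (202 while still loading)."""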
    global loading_complete, last_update_time
    if loading_complete:
        return jsonify({"status": "complete", "last_update": last_update_time})
    return jsonify({"status": "loading"}), 202

@app.route('/get_updates')
def get_updates():
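    """Return the current categorized articles as JSON, e.g. for client-side refreshes.

    Mirrors the index route's dedup/sort/limit logic but returns data instead of HTML.
    """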
    global last_update_time
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "last_update": last_update_time})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            description = meta.get("original_description", "No Description")
            published = meta.get("published", "Unknown Date").strip()

            # Clean and normalize all fields
            title = clean_text(title)
            link = clean_text(link)
            description = clean_text(description)

            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
            key = f"{title}|{link}|{published}|{description_hash}"
            if key not in seen_keys:
                seen_keys.add(key)
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            key = f"{article['title']}|{article['link']}|{article['published']}"
            if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                categorized_articles[cat].append(article)

        # Limit to 10 most recent per category with final deduplication
        for cat in categorized_articles:
            unique_articles = []
            seen_cat_keys = set()
            for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
                key = f"{clean_text(article['title'])}|{clean_text(article['link'])}|{article['published']}"
                if key not in seen_cat_keys:
                    seen_cat_keys.add(key)
                    unique_articles.append(article)
            categorized_articles[cat] = unique_articles[:10]

        return jsonify({"articles": categorized_articles, "last_update": last_update_time})
    except Exception as e:
        logger.error(f"Error fetching updates: {e}")
        return jsonify({"articles": {}, "last_update": last_update_time}), 500

@app.route('/get_all_articles/<category>')
def get_all_articles(category):
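    """Return every stored article for a single category as JSON, newest first."""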
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "category": category})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta or meta.get("category") != category:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            description = meta.get("original_description", "No Description")
            published = meta.get("published", "Unknown Date").strip()

            # Clean and normalize all fields
            title = clean_text(title)
            link = clean_text(link)
            description = clean_text(description)

            description_hash = hashlib.sha256(description.encode('utf-8')).hexdigest()
            key = f"{title}|{link}|{published}|{description_hash}"
            if key not in seen_keys:
                seen_keys.add(key)
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
        return jsonify({"articles": enriched_articles, "category": category})
    except Exception as e:
        logger.error(f"Error fetching all articles for category {category}: {e}")
        return jsonify({"articles": [], "category": category}), 500

@app.route('/card')
def card_load():
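    """Render the card.html template."""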
    return render_template("card.html")
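
# Entry point: `python app.py` serves the app on all interfaces at port 7860;
# a hosting platform may launch it differently.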

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)