Spaces:

broadfield-dev
/

grok_test

Running

File size: 4,369 Bytes

3a7387c
33e2dac
7bafad1
cb518f2
d695e20
33e2dac
3a7387c
 
 
cb518f2
 
 
 
33e2dac
 
d695e20
3a7387c
cb518f2
7bafad1
cb518f2
d695e20
 
33e2dac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d695e20
 
 
33e2dac
 
 
 
 
 
 
 
 
3156b44
 
33e2dac
 
 
 
 
 
3156b44
 
33e2dac
 
3156b44
 
 
 
 
 
 
ce02056
1f5e987
ce02056
 
cb518f2
ce02056
3156b44
 
33e2dac
 
 
 
 
3156b44
 
33e2dac
 
3156b44
 
 
 
 
 
 
3a7387c
ce02056
 
 
 
 
 
 
 
 
3a7387c
6680594

import os
from flask import Flask, render_template, request, Response, jsonify
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
import logging
import time
from threading import Thread

# Flask application object; the @app.route decorators below register views on it.
app = Flask(__name__)

# Setup logging: configure the root logger at INFO level and create the
# module-level logger used by the functions in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_feeds_in_background():
    """Fetch the RSS feeds, hand the articles to storage, and log the timing.

    Intended to run on a worker thread: it takes no arguments, returns
    nothing, and reports progress only through the module logger.
    """
    logger.info("Starting to fetch and process RSS feeds in background")
    started_at = time.time()

    fetched = fetch_rss_feeds()
    logger.info(f"Fetched {len(fetched)} articles")

    process_and_store_articles(fetched)
    logger.info("Articles processed and stored")

    # Wall-clock duration of the whole fetch + store pass.
    elapsed = time.time() - started_at
    logger.info(f"RSS feed loading took {elapsed:.2f} seconds")

@app.route('/')
def loading():
    """Serve the loading page and make sure a feed loader is running.

    A background loader thread is started only when no earlier loader is
    still alive. Previously every request to this route spawned another
    thread, so refreshing the page stacked up concurrent fetch/store passes
    over the same feeds.

    Returns:
        The rendered ``loading.html`` template.
    """
    # Function-scope import: the file only imports ``Thread`` from threading.
    import threading

    loader_name = "rss-feed-loader"
    # threading.enumerate() lists only threads that are currently alive, so a
    # name match means a load is still in progress.
    # NOTE: check-then-start is best-effort (not atomic) and only covers this
    # process; it is enough to stop page refreshes from piling up loaders.
    loader_running = any(
        worker.name == loader_name for worker in threading.enumerate()
    )
    if not loader_running:
        # Daemon thread so an in-flight load never blocks interpreter exit.
        Thread(
            target=load_feeds_in_background,
            name=loader_name,
            daemon=True,
        ).start()
    return render_template("loading.html")

@app.route('/check_feeds', methods=['GET'])
def check_feeds():
    """Report whether the vector store already holds at least one document.

    Returns:
        ``{"status": "loaded"}`` (default status) when the probe search finds
        a document, ``({"status": "loading"}, 202)`` when it finds none, and
        ``({"status": "error", "message": ...}, 500)`` if the probe raises.
    """
    try:
        # A one-result search for "news" serves as the "is anything stored?" probe.
        probe_hits = vector_db.similarity_search("news", k=1)
        if not probe_hits:
            return jsonify({"status": "loading"}), 202
        return jsonify({"status": "loaded"})
    except Exception as exc:
        logger.error(f"Error checking feeds: {exc}")
        error_body = {"status": "error", "message": str(exc)}
        return jsonify(error_body), 500

def _feeds_loaded():
    """Return True once ``check_feeds`` reports the "loaded" status."""
    result = check_feeds()
    # check_feeds returns a bare response on success but a
    # (response, status) tuple for the "loading" and "error" cases, so
    # normalise before inspecting it.
    if isinstance(result, tuple):
        response, status_code = result[0], result[1]
    else:
        response, status_code = result, result.status_code
    return status_code == 200 and response.get_json()["status"] == "loaded"


def _unique_articles(docs):
    """Convert search hits to display dicts, dropping duplicate articles.

    Two hits are the same article when title, link and a short hash of the
    description all match. First occurrence wins and order is preserved.
    """
    # Function-scope import: hashlib is not imported at file level.
    import hashlib

    unique = {}
    for doc in docs:
        meta = doc.metadata
        title = meta["title"]
        link = meta["link"]
        description = meta["original_description"]
        # Short hash keeps the key compact; a missing description hashes as
        # the empty string instead of raising on ``None.encode()``.
        digest = hashlib.md5((description or "").encode()).hexdigest()[:10]
        key = f"{title}|{link}|{digest}"
        if key not in unique:
            unique[key] = {
                "title": title,
                "link": link,
                "description": description,
                "category": meta["category"],
                "published": meta["published"],
                "image": meta.get("image", "svg"),
            }
    return list(unique.values())


@app.route('/index', methods=['GET', 'POST'])
def index():
    """Render the article listing, optionally filtered by a search query.

    GET shows up to 1000 stored articles; POST with a non-empty ``search``
    form field shows the 10 closest matches instead. Articles are
    de-duplicated and grouped by category before rendering.

    The route accepts POST as well as GET; with GET only, the search branch
    could never run.

    Returns:
        The rendered ``index.html`` template, or ``loading.html`` with status
        202 if the feeds are still not loaded after a bounded wait.
    """
    # Wait for the feeds, but never hold the request open indefinitely.
    max_wait_seconds = 30
    deadline = time.monotonic() + max_wait_seconds
    while not _feeds_loaded():
        if time.monotonic() >= deadline:
            # NOTE(review): assumes loading.html keeps polling and retries
            # this page once the feeds are ready -- confirm against template.
            return render_template("loading.html"), 202
        time.sleep(1)  # Check every second

    query = request.form.get('search') if request.method == 'POST' else None
    if query:
        logger.info(f"Processing search query: {query}")
        results = vector_db.similarity_search(query, k=10)
        enriched_articles = _unique_articles(results)
        logger.info(f"Search returned {len(enriched_articles)} unique results")
    else:
        # Large k so that (practically) every stored article is listed.
        stored_docs = vector_db.similarity_search("news", k=1000)
        enriched_articles = _unique_articles(stored_docs)
        logger.info(f"Enriched {len(enriched_articles)} unique articles for display")

    # Group for the template: category -> list of article dicts.
    categorized_articles = {}
    for article in enriched_articles:
        categorized_articles.setdefault(article["category"], []).append(article)

    return render_template("index.html", categorized_articles=categorized_articles)

# Script entry point: when this file is executed directly, start Flask's
# built-in server listening on all interfaces (0.0.0.0) at port 7860.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)