broadfield-dev committed
Commit 33e2dac · verified · 1 Parent(s): fd4dc15

Update app.py

Files changed (1)
  1. app.py +48 -17
app.py CHANGED
@@ -1,8 +1,9 @@
 import os
-from flask import Flask, render_template, request, Response
+from flask import Flask, render_template, request, Response, jsonify
 from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
 import logging
 import time
+from threading import Thread

 app = Flask(__name__)

@@ -10,13 +11,8 @@ app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-@app.route('/')
-def loading():
-    return render_template("loading.html")
-
-@app.route('/load_feeds', methods=['GET'])
-def load_feeds():
-    logger.info("Starting to fetch and process RSS feeds")
+def load_feeds_in_background():
+    logger.info("Starting to fetch and process RSS feeds in background")
     start_time = time.time()
     articles = fetch_rss_feeds()
     logger.info(f"Fetched {len(articles)} articles")
@@ -24,19 +20,50 @@ def load_feeds():
     logger.info("Articles processed and stored")
     end_time = time.time()
     logger.info(f"RSS feed loading took {end_time - start_time:.2f} seconds")
-    return Response("Feeds loaded", status=200)
+
+@app.route('/')
+def loading():
+    # Start loading feeds in a background thread
+    thread = Thread(target=load_feeds_in_background)
+    thread.daemon = True
+    thread.start()
+    return render_template("loading.html")
+
+@app.route('/check_feeds', methods=['GET'])
+def check_feeds():
+    try:
+        # Check if vector DB has documents (simplified check)
+        docs = vector_db.similarity_search("news", k=1)
+        if docs:
+            return jsonify({"status": "loaded"})
+        return jsonify({"status": "loading"}), 202
+    except Exception as e:
+        logger.error(f"Error checking feeds: {e}")
+        return jsonify({"status": "error", "message": str(e)}), 500

 @app.route('/index', methods=['GET'])
 def index():
-    stored_docs = vector_db.similarity_search("news", k=1000)  # Increased k to ensure all unique articles
-    # Use a set to ensure unique articles by title and link
+    # Poll until feeds are loaded
+    while True:
+        response = check_feeds()
+        if response.status_code == 200 and response.get_json()["status"] == "loaded":
+            break
+        time.sleep(1)  # Check every second
+
+    stored_docs = vector_db.similarity_search("news", k=1000)  # Increased k for all unique articles
+    # Use a set to ensure unique articles by title, link, and description hash
     unique_articles = {}
     for doc in stored_docs:
-        key = f"{doc.metadata['title']}|{doc.metadata['link']}"
+        import hashlib
+        title = doc.metadata["title"]
+        link = doc.metadata["link"]
+        desc = doc.metadata["original_description"]
+        desc_hash = hashlib.md5(desc.encode()).hexdigest()[:10]  # Short hash for uniqueness
+        key = f"{title}|{link}|{desc_hash}"
         if key not in unique_articles:
             unique_articles[key] = {
-                "title": doc.metadata["title"],
-                "link": doc.metadata["link"],
+                "title": title,
+                "link": link,
                 "description": doc.metadata["original_description"],
                 "category": doc.metadata["category"],
                 "published": doc.metadata["published"],
@@ -52,11 +79,15 @@ def index():
     results = vector_db.similarity_search(query, k=10)
     unique_search_articles = {}
     for doc in results:
-        key = f"{doc.metadata['title']}|{doc.metadata['link']}"
+        title = doc.metadata["title"]
+        link = doc.metadata["link"]
+        desc = doc.metadata["original_description"]
+        desc_hash = hashlib.md5(desc.encode()).hexdigest()[:10]
+        key = f"{title}|{link}|{desc_hash}"
         if key not in unique_search_articles:
             unique_search_articles[key] = {
-                "title": doc.metadata["title"],
-                "link": doc.metadata["link"],
+                "title": title,
+                "link": link,
                 "description": doc.metadata["original_description"],
                 "category": doc.metadata["category"],
                 "published": doc.metadata["published"],