broadfield-dev committed on
Commit
5d47c6a
·
verified ·
1 Parent(s): 2cfff23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -96
app.py CHANGED
@@ -1,10 +1,7 @@
1
  import os
2
- import subprocess
3
  from flask import Flask, render_template, request, jsonify
4
  from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
5
  import logging
6
- import time
7
- import hashlib
8
 
9
  app = Flask(__name__)
10
 
@@ -12,119 +9,91 @@ app = Flask(__name__)
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
- def load_feeds_in_background():
16
- logger.info("Starting to fetch and process RSS feeds in background")
17
- start_time = time.time()
18
- articles = fetch_rss_feeds()
19
- logger.info(f"Fetched {len(articles)} articles")
20
- process_and_store_articles(articles)
21
- logger.info("Articles processed and stored")
22
- end_time = time.time()
23
- logger.info(f"RSS feed loading took {end_time - start_time:.2f} seconds")
24
-
25
  @app.route('/')
26
  def index():
27
- # Show all existing articles immediately, even if empty
28
  try:
29
- # Get all documents from Chroma DB using get()
 
 
 
 
 
 
30
  all_docs = vector_db.get(include=['documents', 'metadatas'])
31
- if 'metadatas' in all_docs and all_docs['metadatas']:
32
- stored_docs = [
33
- Document(page_content=doc['documents'][0] if doc['documents'] else "", metadata=meta)
34
- for doc, meta in zip(all_docs['documents'], all_docs['metadatas'])
35
- ]
36
- logger.info(f"Found {len(stored_docs)} documents in vector DB")
37
- else:
38
- stored_docs = []
39
- logger.warning("No metadata or documents found in vector DB")
40
- # Use a set to ensure unique articles by title, link, and full description hash
41
- unique_articles = {}
42
- for doc in stored_docs:
43
- if not doc.metadata: # Handle potential None metadata
44
  continue
45
- title = doc.metadata.get("title", "No Title")
46
- link = doc.metadata.get("link", "")
47
- description = doc.metadata.get("original_description", "No Description")
48
- desc_hash = hashlib.md5(description.encode()).hexdigest()
49
- key = f"{title}|{link}|{desc_hash}"
50
- if key not in unique_articles:
51
- unique_articles[key] = {
52
  "title": title,
53
  "link": link,
54
  "description": description,
55
- "category": doc.metadata.get("category", "Uncategorized"),
56
- "published": doc.metadata.get("published", "Unknown Date"),
57
- "image": doc.metadata.get("image", "svg"),
58
- }
59
- enriched_articles = list(unique_articles.values())
60
- logger.info(f"Enriched {len(enriched_articles)} unique articles for display")
61
- except Exception as e:
62
- logger.error(f"Error retrieving documents from vector DB: {e}")
63
- enriched_articles = [] # Fallback if DB is empty or inaccessible
64
-
65
- # Start loading new feeds in the background
66
- subprocess.Popen(["python", "rss_processor.py", "load_feeds"])
67
 
68
- categorized_articles = {}
69
- for article in enriched_articles:
70
- cat = article["category"]
71
- if cat not in categorized_articles:
72
- categorized_articles[cat] = []
73
- categorized_articles[cat].append(article)
74
 
75
- return render_template("index.html", categorized_articles=categorized_articles, loading_new_feeds=True, has_articles=bool(enriched_articles))
 
 
 
76
 
77
  @app.route('/search', methods=['POST'])
78
  def search():
79
  query = request.form.get('search')
80
- if query:
81
- logger.info(f"Processing search query: {query}")
82
- try:
83
- results = vector_db.similarity_search(query, k=10)
84
- unique_search_articles = {}
85
- for doc in results:
86
- title = doc.metadata.get("title", "No Title")
87
- link = doc.metadata.get("link", "")
88
- description = doc.metadata.get("original_description", "No Description")
89
- desc_hash = hashlib.md5(description.encode()).hexdigest()
90
- key = f"{title}|{link}|{desc_hash}"
91
- if key not in unique_search_articles:
92
- unique_search_articles[key] = {
93
- "title": title,
94
- "link": link,
95
- "description": description,
96
- "category": doc.metadata.get("category", "Uncategorized"),
97
- "published": doc.metadata.get("published", "Unknown Date"),
98
- "image": doc.metadata.get("image", "svg"),
99
- }
100
- enriched_articles = list(unique_search_articles.values())
101
- logger.info(f"Search returned {len(enriched_articles)} unique results")
102
- except Exception as e:
103
- logger.error(f"Error performing search: {e}")
104
- enriched_articles = []
105
 
106
  categorized_articles = {}
107
  for article in enriched_articles:
108
  cat = article["category"]
109
- if cat not in categorized_articles:
110
- categorized_articles[cat] = []
111
- categorized_articles[cat].append(article)
112
-
113
- return render_template("index.html", categorized_articles=categorized_articles, loading_new_feeds=True, has_articles=bool(enriched_articles))
114
- return render_template("index.html", categorized_articles={}, loading_new_feeds=True, has_articles=False)
115
 
116
- @app.route('/check_feeds', methods=['GET'])
117
- def check_feeds():
118
- try:
119
- # Check if vector DB has any documents
120
- all_docs = vector_db.get(include=['documents', 'metadatas'])
121
- if 'metadatas' in all_docs and all_docs['metadatas']:
122
- logger.info("Feeds loaded successfully in vector DB")
123
- return jsonify({"status": "loaded"})
124
- return jsonify({"status": "loading"}), 202
125
  except Exception as e:
126
- logger.error(f"Error checking feeds: {e}")
127
- return jsonify({"status": "error", "message": str(e)}), 500
128
 
129
  if __name__ == "__main__":
130
  app.run(host="0.0.0.0", port=7860)
 
1
  import os
 
2
  from flask import Flask, render_template, request, jsonify
3
  from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
4
  import logging
 
 
5
 
6
  app = Flask(__name__)
7
 
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
 
 
 
 
 
 
 
 
 
 
12
@app.route('/')
def index():
    """Render the home page with all stored articles grouped by category.

    The view first refreshes the store by fetching the RSS feeds and passing
    them to ``process_and_store_articles``, then reads every entry back from
    the Chroma DB, de-duplicates the entries by (title, link) and groups them
    by their ``category`` metadata.

    Returns:
        The rendered ``index.html`` template. ``categorized_articles`` maps a
        category name to a list of article dicts (``title``, ``link``,
        ``description``, ``category``, ``published``, ``image``);
        ``has_articles`` is True only when at least one article is displayed.
        If the DB cannot be read, the template is rendered with no articles.
    """
    # Refresh the store. This is kept in its own try block so that a feed or
    # network failure does not hide articles that are already in the DB.
    # NOTE(review): this runs synchronously on every request to '/', not just
    # the first one, so page latency includes the full feed fetch -- consider
    # moving it to a background job.
    try:
        articles = fetch_rss_feeds()
        logger.info(f"Fetched {len(articles)} articles")
        process_and_store_articles(articles)
        logger.info("Articles processed and stored")
    except Exception as e:
        logger.exception(f"Error refreshing RSS feeds, showing stored articles: {e}")

    try:
        # Retrieve all articles from Chroma DB
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        metadatas = all_docs.get('metadatas')
        if not metadatas:
            logger.warning("No articles in DB yet")
            return render_template("index.html", categorized_articles={}, has_articles=False)

        # Only the metadata is needed for display; the document text is unused.
        enriched_articles = []
        seen_keys = set()
        for meta in metadatas:
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            # A tuple key cannot collide when a title or link contains '|'.
            key = (title, link)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            enriched_articles.append({
                "title": title,
                "link": link,
                "description": meta.get("original_description", "No Description"),
                "category": meta.get("category", "Uncategorized"),
                "published": meta.get("published", "Unknown Date"),
                "image": meta.get("image", "svg"),
            })
        logger.info(f"Displaying {len(enriched_articles)} unique articles")

        # Categorize articles
        categorized_articles = {}
        for article in enriched_articles:
            categorized_articles.setdefault(article["category"], []).append(article)

        # Every metadata entry may have been empty, so derive the flag from
        # what is actually displayed instead of hard-coding True.
        return render_template(
            "index.html",
            categorized_articles=categorized_articles,
            has_articles=bool(enriched_articles),
        )
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Error in index: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False)
59
 
60
@app.route('/search', methods=['POST'])
def search():
    """Run a similarity search for the submitted query and render the results.

    Reads the ``search`` form field, asks the vector DB for the 10 most
    similar documents, de-duplicates the hits by (title, link) and groups
    them by their ``category`` metadata.

    Returns:
        The rendered ``index.html`` template. ``categorized_articles`` maps a
        category name to a list of article dicts and ``has_articles`` is True
        when at least one result is displayed. A missing or blank query, or
        any error during the search, renders the template with no articles.
    """
    query = request.form.get('search')
    # Treat a whitespace-only submission like a missing one instead of
    # sending it to the vector search.
    if not query or not query.strip():
        return render_template("index.html", categorized_articles={}, has_articles=False)

    try:
        logger.info(f"Searching for: {query}")
        results = vector_db.similarity_search(query, k=10)

        enriched_articles = []
        seen_keys = set()
        for doc in results:
            meta = doc.metadata
            # Skip hits without metadata (as the index view does) so that one
            # bad entry does not raise and discard every other result.
            if not meta:
                continue
            title = meta.get("title", "No Title")
            link = meta.get("link", "")
            # A tuple key cannot collide when a title or link contains '|'.
            key = (title, link)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            enriched_articles.append({
                "title": title,
                "link": link,
                "description": meta.get("original_description", "No Description"),
                "category": meta.get("category", "Uncategorized"),
                "published": meta.get("published", "Unknown Date"),
                "image": meta.get("image", "svg"),
            })

        categorized_articles = {}
        for article in enriched_articles:
            categorized_articles.setdefault(article["category"], []).append(article)

        return render_template(
            "index.html",
            categorized_articles=categorized_articles,
            has_articles=bool(enriched_articles),
        )
    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Search error: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False)
97
 
98
  if __name__ == "__main__":
99
  app.run(host="0.0.0.0", port=7860)