broadfield-dev committed on
Commit
72c3c36
·
verified ·
1 Parent(s): 6c20801

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -33
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import subprocess
3
- from flask import Flask, render_template, request
4
  from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
5
  import logging
6
  import time
@@ -26,19 +26,25 @@ def load_feeds_in_background():
26
  def index():
27
  # Show all existing articles immediately, even if empty
28
  try:
29
- # Get all documents from Chroma DB
30
  all_docs = vector_db.get(include=['documents', 'metadatas'])
31
- stored_docs = [
32
- Document(page_content=doc['documents'], metadata=doc['metadatas'])
33
- for doc in all_docs['documents']
34
- ]
35
- logger.info(f"Found {len(stored_docs)} documents in vector DB")
 
 
 
 
36
  # Use a set to ensure unique articles by title, link, and full description hash
37
  unique_articles = {}
38
  for doc in stored_docs:
39
- title = doc.metadata["title"]
40
- link = doc.metadata["link"]
41
- description = doc.metadata["original_description"]
 
 
42
  desc_hash = hashlib.md5(description.encode()).hexdigest()
43
  key = f"{title}|{link}|{desc_hash}"
44
  if key not in unique_articles:
@@ -46,8 +52,8 @@ def index():
46
  "title": title,
47
  "link": link,
48
  "description": description,
49
- "category": doc.metadata["category"],
50
- "published": doc.metadata["published"],
51
  "image": doc.metadata.get("image", "svg"),
52
  }
53
  enriched_articles = list(unique_articles.values())
@@ -73,25 +79,29 @@ def search():
73
  query = request.form.get('search')
74
  if query:
75
  logger.info(f"Processing search query: {query}")
76
- results = vector_db.similarity_search(query, k=10)
77
- unique_search_articles = {}
78
- for doc in results:
79
- title = doc.metadata["title"]
80
- link = doc.metadata["link"]
81
- description = doc.metadata["original_description"]
82
- desc_hash = hashlib.md5(description.encode()).hexdigest()
83
- key = f"{title}|{link}|{desc_hash}"
84
- if key not in unique_search_articles:
85
- unique_search_articles[key] = {
86
- "title": title,
87
- "link": link,
88
- "description": description,
89
- "category": doc.metadata["category"],
90
- "published": doc.metadata["published"],
91
- "image": doc.metadata.get("image", "svg"),
92
- }
93
- enriched_articles = list(unique_search_articles.values())
94
- logger.info(f"Search returned {len(enriched_articles)} unique results")
 
 
 
 
95
 
96
  categorized_articles = {}
97
  for article in enriched_articles:
@@ -106,9 +116,9 @@ def search():
106
  @app.route('/check_feeds', methods=['GET'])
107
  def check_feeds():
108
  try:
109
- # Check if vector DB has new or updated documents
110
  all_docs = vector_db.get(include=['documents', 'metadatas'])
111
- if all_docs['documents']:
112
  logger.info("Feeds loaded successfully in vector DB")
113
  return jsonify({"status": "loaded"})
114
  return jsonify({"status": "loading"}), 202
 
1
  import os
2
  import subprocess
3
+ from flask import Flask, render_template, request, jsonify
4
  from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
5
  import logging
6
  import time
 
26
  def index():
27
  # Show all existing articles immediately, even if empty
28
  try:
29
+ # Get all documents from Chroma DB using get()
30
  all_docs = vector_db.get(include=['documents', 'metadatas'])
31
+ if 'metadatas' in all_docs and all_docs['metadatas']:
32
+ stored_docs = [
33
+ Document(page_content=doc['documents'][0] if doc['documents'] else "", metadata=meta)
34
+ for doc, meta in zip(all_docs['documents'], all_docs['metadatas'])
35
+ ]
36
+ logger.info(f"Found {len(stored_docs)} documents in vector DB")
37
+ else:
38
+ stored_docs = []
39
+ logger.warning("No metadata or documents found in vector DB")
40
  # Use a set to ensure unique articles by title, link, and full description hash
41
  unique_articles = {}
42
  for doc in stored_docs:
43
+ if not doc.metadata: # Handle potential None metadata
44
+ continue
45
+ title = doc.metadata.get("title", "No Title")
46
+ link = doc.metadata.get("link", "")
47
+ description = doc.metadata.get("original_description", "No Description")
48
  desc_hash = hashlib.md5(description.encode()).hexdigest()
49
  key = f"{title}|{link}|{desc_hash}"
50
  if key not in unique_articles:
 
52
  "title": title,
53
  "link": link,
54
  "description": description,
55
+ "category": doc.metadata.get("category", "Uncategorized"),
56
+ "published": doc.metadata.get("published", "Unknown Date"),
57
  "image": doc.metadata.get("image", "svg"),
58
  }
59
  enriched_articles = list(unique_articles.values())
 
79
  query = request.form.get('search')
80
  if query:
81
  logger.info(f"Processing search query: {query}")
82
+ try:
83
+ results = vector_db.similarity_search(query, k=10)
84
+ unique_search_articles = {}
85
+ for doc in results:
86
+ title = doc.metadata.get("title", "No Title")
87
+ link = doc.metadata.get("link", "")
88
+ description = doc.metadata.get("original_description", "No Description")
89
+ desc_hash = hashlib.md5(description.encode()).hexdigest()
90
+ key = f"{title}|{link}|{desc_hash}"
91
+ if key not in unique_search_articles:
92
+ unique_search_articles[key] = {
93
+ "title": title,
94
+ "link": link,
95
+ "description": description,
96
+ "category": doc.metadata.get("category", "Uncategorized"),
97
+ "published": doc.metadata.get("published", "Unknown Date"),
98
+ "image": doc.metadata.get("image", "svg"),
99
+ }
100
+ enriched_articles = list(unique_search_articles.values())
101
+ logger.info(f"Search returned {len(enriched_articles)} unique results")
102
+ except Exception as e:
103
+ logger.error(f"Error performing search: {e}")
104
+ enriched_articles = []
105
 
106
  categorized_articles = {}
107
  for article in enriched_articles:
 
116
  @app.route('/check_feeds', methods=['GET'])
117
  def check_feeds():
118
  try:
119
+ # Check if vector DB has any documents
120
  all_docs = vector_db.get(include=['documents', 'metadatas'])
121
+ if 'metadatas' in all_docs and all_docs['metadatas']:
122
  logger.info("Feeds loaded successfully in vector DB")
123
  return jsonify({"status": "loaded"})
124
  return jsonify({"status": "loading"}), 202