Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
-
from flask import Flask, render_template, request
|
4 |
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
|
5 |
import logging
|
6 |
import time
|
@@ -26,19 +26,25 @@ def load_feeds_in_background():
|
|
26 |
def index():
|
27 |
# Show all existing articles immediately, even if empty
|
28 |
try:
|
29 |
-
# Get all documents from Chroma DB
|
30 |
all_docs = vector_db.get(include=['documents', 'metadatas'])
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
36 |
# Use a dict keyed by title, link, and an MD5 hash of the full description to keep only unique articles
|
37 |
unique_articles = {}
|
38 |
for doc in stored_docs:
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
42 |
desc_hash = hashlib.md5(description.encode()).hexdigest()
|
43 |
key = f"{title}|{link}|{desc_hash}"
|
44 |
if key not in unique_articles:
|
@@ -46,8 +52,8 @@ def index():
|
|
46 |
"title": title,
|
47 |
"link": link,
|
48 |
"description": description,
|
49 |
-
"category": doc.metadata
|
50 |
-
"published": doc.metadata
|
51 |
"image": doc.metadata.get("image", "svg"),
|
52 |
}
|
53 |
enriched_articles = list(unique_articles.values())
|
@@ -73,25 +79,29 @@ def search():
|
|
73 |
query = request.form.get('search')
|
74 |
if query:
|
75 |
logger.info(f"Processing search query: {query}")
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
95 |
|
96 |
categorized_articles = {}
|
97 |
for article in enriched_articles:
|
@@ -106,9 +116,9 @@ def search():
|
|
106 |
@app.route('/check_feeds', methods=['GET'])
|
107 |
def check_feeds():
|
108 |
try:
|
109 |
-
# Check if vector DB has
|
110 |
all_docs = vector_db.get(include=['documents', 'metadatas'])
|
111 |
-
if all_docs['
|
112 |
logger.info("Feeds loaded successfully in vector DB")
|
113 |
return jsonify({"status": "loaded"})
|
114 |
return jsonify({"status": "loading"}), 202
|
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
+
from flask import Flask, render_template, request, jsonify
|
4 |
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
|
5 |
import logging
|
6 |
import time
|
|
|
26 |
def index():
|
27 |
# Show all existing articles immediately, even if empty
|
28 |
try:
|
29 |
+
# Get all documents from Chroma DB using get()
|
30 |
all_docs = vector_db.get(include=['documents', 'metadatas'])
|
31 |
+
if 'metadatas' in all_docs and all_docs['metadatas']:
|
32 |
+
stored_docs = [
|
33 |
+
Document(page_content=doc['documents'][0] if doc['documents'] else "", metadata=meta)
|
34 |
+
for doc, meta in zip(all_docs['documents'], all_docs['metadatas'])
|
35 |
+
]
|
36 |
+
logger.info(f"Found {len(stored_docs)} documents in vector DB")
|
37 |
+
else:
|
38 |
+
stored_docs = []
|
39 |
+
logger.warning("No metadata or documents found in vector DB")
|
40 |
# Use a dict keyed by title, link, and an MD5 hash of the full description to keep only unique articles
|
41 |
unique_articles = {}
|
42 |
for doc in stored_docs:
|
43 |
+
if not doc.metadata: # Handle potential None metadata
|
44 |
+
continue
|
45 |
+
title = doc.metadata.get("title", "No Title")
|
46 |
+
link = doc.metadata.get("link", "")
|
47 |
+
description = doc.metadata.get("original_description", "No Description")
|
48 |
desc_hash = hashlib.md5(description.encode()).hexdigest()
|
49 |
key = f"{title}|{link}|{desc_hash}"
|
50 |
if key not in unique_articles:
|
|
|
52 |
"title": title,
|
53 |
"link": link,
|
54 |
"description": description,
|
55 |
+
"category": doc.metadata.get("category", "Uncategorized"),
|
56 |
+
"published": doc.metadata.get("published", "Unknown Date"),
|
57 |
"image": doc.metadata.get("image", "svg"),
|
58 |
}
|
59 |
enriched_articles = list(unique_articles.values())
|
|
|
79 |
query = request.form.get('search')
|
80 |
if query:
|
81 |
logger.info(f"Processing search query: {query}")
|
82 |
+
try:
|
83 |
+
results = vector_db.similarity_search(query, k=10)
|
84 |
+
unique_search_articles = {}
|
85 |
+
for doc in results:
|
86 |
+
title = doc.metadata.get("title", "No Title")
|
87 |
+
link = doc.metadata.get("link", "")
|
88 |
+
description = doc.metadata.get("original_description", "No Description")
|
89 |
+
desc_hash = hashlib.md5(description.encode()).hexdigest()
|
90 |
+
key = f"{title}|{link}|{desc_hash}"
|
91 |
+
if key not in unique_search_articles:
|
92 |
+
unique_search_articles[key] = {
|
93 |
+
"title": title,
|
94 |
+
"link": link,
|
95 |
+
"description": description,
|
96 |
+
"category": doc.metadata.get("category", "Uncategorized"),
|
97 |
+
"published": doc.metadata.get("published", "Unknown Date"),
|
98 |
+
"image": doc.metadata.get("image", "svg"),
|
99 |
+
}
|
100 |
+
enriched_articles = list(unique_search_articles.values())
|
101 |
+
logger.info(f"Search returned {len(enriched_articles)} unique results")
|
102 |
+
except Exception as e:
|
103 |
+
logger.error(f"Error performing search: {e}")
|
104 |
+
enriched_articles = []
|
105 |
|
106 |
categorized_articles = {}
|
107 |
for article in enriched_articles:
|
|
|
116 |
@app.route('/check_feeds', methods=['GET'])
|
117 |
def check_feeds():
|
118 |
try:
|
119 |
+
# Check if vector DB has any documents
|
120 |
all_docs = vector_db.get(include=['documents', 'metadatas'])
|
121 |
+
if 'metadatas' in all_docs and all_docs['metadatas']:
|
122 |
logger.info("Feeds loaded successfully in vector DB")
|
123 |
return jsonify({"status": "loaded"})
|
124 |
return jsonify({"status": "loading"}), 202
|