Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db
|
|
4 |
import logging
|
5 |
import time
|
6 |
from threading import Thread
|
|
|
7 |
|
8 |
app = Flask(__name__)
|
9 |
|
@@ -32,9 +33,10 @@ def loading():
|
|
32 |
@app.route('/check_feeds', methods=['GET'])
|
33 |
def check_feeds():
|
34 |
try:
|
35 |
-
# Check if vector DB has documents
|
36 |
docs = vector_db.similarity_search("news", k=1)
|
37 |
if docs:
|
|
|
38 |
return jsonify({"status": "loaded"})
|
39 |
return jsonify({"status": "loading"}), 202
|
40 |
except Exception as e:
|
@@ -43,28 +45,20 @@ def check_feeds():
|
|
43 |
|
44 |
@app.route('/index', methods=['GET'])
|
45 |
def index():
|
46 |
-
#
|
47 |
-
while True:
|
48 |
-
response = check_feeds()
|
49 |
-
if response.status_code == 200 and response.get_json()["status"] == "loaded":
|
50 |
-
break
|
51 |
-
time.sleep(1) # Check every second
|
52 |
-
|
53 |
-
stored_docs = vector_db.similarity_search("news", k=1000) # Increased k for all unique articles
|
54 |
# Use a dict keyed by title, link, and description hash to keep only unique articles
|
55 |
unique_articles = {}
|
56 |
for doc in stored_docs:
|
57 |
-
import hashlib
|
58 |
title = doc.metadata["title"]
|
59 |
link = doc.metadata["link"]
|
60 |
-
|
61 |
-
desc_hash = hashlib.md5(
|
62 |
key = f"{title}|{link}|{desc_hash}"
|
63 |
if key not in unique_articles:
|
64 |
unique_articles[key] = {
|
65 |
"title": title,
|
66 |
"link": link,
|
67 |
-
"description":
|
68 |
"category": doc.metadata["category"],
|
69 |
"published": doc.metadata["published"],
|
70 |
"image": doc.metadata.get("image", "svg"),
|
@@ -81,14 +75,14 @@ def index():
|
|
81 |
for doc in results:
|
82 |
title = doc.metadata["title"]
|
83 |
link = doc.metadata["link"]
|
84 |
-
|
85 |
-
desc_hash = hashlib.md5(
|
86 |
key = f"{title}|{link}|{desc_hash}"
|
87 |
if key not in unique_search_articles:
|
88 |
unique_search_articles[key] = {
|
89 |
"title": title,
|
90 |
"link": link,
|
91 |
-
"description":
|
92 |
"category": doc.metadata["category"],
|
93 |
"published": doc.metadata["published"],
|
94 |
"image": doc.metadata.get("image", "svg"),
|
|
|
4 |
import logging
|
5 |
import time
|
6 |
from threading import Thread
|
7 |
+
import hashlib
|
8 |
|
9 |
app = Flask(__name__)
|
10 |
|
|
|
33 |
@app.route('/check_feeds', methods=['GET'])
|
34 |
def check_feeds():
|
35 |
try:
|
36 |
+
# Check if vector DB has documents
|
37 |
docs = vector_db.similarity_search("news", k=1)
|
38 |
if docs:
|
39 |
+
logger.info("Feeds loaded successfully in vector DB")
|
40 |
return jsonify({"status": "loaded"})
|
41 |
return jsonify({"status": "loading"}), 202
|
42 |
except Exception as e:
|
|
|
45 |
|
46 |
@app.route('/index', methods=['GET'])
|
47 |
def index():
|
48 |
+
stored_docs = vector_db.similarity_search("news", k=1000) # Ensure all unique articles
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
# Use a dict keyed by title, link, and description hash to keep only unique articles
|
50 |
unique_articles = {}
|
51 |
for doc in stored_docs:
|
|
|
52 |
title = doc.metadata["title"]
|
53 |
link = doc.metadata["link"]
|
54 |
+
description = doc.metadata["original_description"]
|
55 |
+
desc_hash = hashlib.md5(description.encode()).hexdigest()[:10]
|
56 |
key = f"{title}|{link}|{desc_hash}"
|
57 |
if key not in unique_articles:
|
58 |
unique_articles[key] = {
|
59 |
"title": title,
|
60 |
"link": link,
|
61 |
+
"description": description,
|
62 |
"category": doc.metadata["category"],
|
63 |
"published": doc.metadata["published"],
|
64 |
"image": doc.metadata.get("image", "svg"),
|
|
|
75 |
for doc in results:
|
76 |
title = doc.metadata["title"]
|
77 |
link = doc.metadata["link"]
|
78 |
+
description = doc.metadata["original_description"]
|
79 |
+
desc_hash = hashlib.md5(description.encode()).hexdigest()[:10]
|
80 |
key = f"{title}|{link}|{desc_hash}"
|
81 |
if key not in unique_search_articles:
|
82 |
unique_search_articles[key] = {
|
83 |
"title": title,
|
84 |
"link": link,
|
85 |
+
"description": description,
|
86 |
"category": doc.metadata["category"],
|
87 |
"published": doc.metadata["published"],
|
88 |
"image": doc.metadata.get("image", "svg"),
|