Spaces:
Running
Running
Update rss_processor.py
Browse files - rss_processor.py +7 -4
rss_processor.py
CHANGED
@@ -6,6 +6,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
|
|
6 |
from langchain.docstore.document import Document
|
7 |
import shutil
|
8 |
import logging
|
|
|
9 |
|
10 |
# Setup logging
|
11 |
logging.basicConfig(level=logging.INFO)
|
@@ -69,7 +70,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
|
|
69 |
|
70 |
def fetch_rss_feeds():
|
71 |
articles = []
|
72 |
-
seen_articles = set() # Track unique articles by title, link, and description
|
73 |
for feed_url in RSS_FEEDS:
|
74 |
try:
|
75 |
logger.info(f"Fetching feed: {feed_url}")
|
@@ -82,8 +83,9 @@ def fetch_rss_feeds():
|
|
82 |
title = entry.get("title", "No Title")
|
83 |
link = entry.get("link", "")
|
84 |
description = entry.get("summary", entry.get("description", "No Description"))
|
85 |
-
#
|
86 |
-
|
|
|
87 |
if article_key not in seen_articles:
|
88 |
seen_articles.add(article_key)
|
89 |
unique_count += 1
|
@@ -120,7 +122,8 @@ def process_and_store_articles(articles):
|
|
120 |
seen_docs = set() # Additional de-duplication at DB level
|
121 |
for article in articles:
|
122 |
try:
|
123 |
-
|
|
|
124 |
if key not in seen_docs:
|
125 |
seen_docs.add(key)
|
126 |
metadata = {
|
|
|
6 |
from langchain.docstore.document import Document
|
7 |
import shutil
|
8 |
import logging
|
9 |
+
import hashlib
|
10 |
|
11 |
# Setup logging
|
12 |
logging.basicConfig(level=logging.INFO)
|
|
|
70 |
|
71 |
def fetch_rss_feeds():
|
72 |
articles = []
|
73 |
+
seen_articles = set() # Track unique articles by title, link, and description hash
|
74 |
for feed_url in RSS_FEEDS:
|
75 |
try:
|
76 |
logger.info(f"Fetching feed: {feed_url}")
|
|
|
83 |
title = entry.get("title", "No Title")
|
84 |
link = entry.get("link", "")
|
85 |
description = entry.get("summary", entry.get("description", "No Description"))
|
86 |
+
# Use MD5 hash of description for uniqueness
|
87 |
+
desc_hash = hashlib.md5(description.encode()).hexdigest()[:10]
|
88 |
+
article_key = f"{title}|{link}|{desc_hash}"
|
89 |
if article_key not in seen_articles:
|
90 |
seen_articles.add(article_key)
|
91 |
unique_count += 1
|
|
|
122 |
seen_docs = set() # Additional de-duplication at DB level
|
123 |
for article in articles:
|
124 |
try:
|
125 |
+
desc_hash = hashlib.md5(article["description"].encode()).hexdigest()[:10]
|
126 |
+
key = f"{article['title']}|{article['link']}|{desc_hash}"
|
127 |
if key not in seen_docs:
|
128 |
seen_docs.add(key)
|
129 |
metadata = {
|