broadfield-dev committed on
Commit
6303399
·
verified ·
1 Parent(s): 33e2dac

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +7 -4
rss_processor.py CHANGED
@@ -6,6 +6,7 @@ from langchain.embeddings import HuggingFaceEmbeddings
6
  from langchain.docstore.document import Document
7
  import shutil
8
  import logging
 
9
 
10
  # Setup logging
11
  logging.basicConfig(level=logging.INFO)
@@ -69,7 +70,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
69
 
70
  def fetch_rss_feeds():
71
  articles = []
72
- seen_articles = set() # Track unique articles by title, link, and description
73
  for feed_url in RSS_FEEDS:
74
  try:
75
  logger.info(f"Fetching feed: {feed_url}")
@@ -82,8 +83,9 @@ def fetch_rss_feeds():
82
  title = entry.get("title", "No Title")
83
  link = entry.get("link", "")
84
  description = entry.get("summary", entry.get("description", "No Description"))
85
- # Create a unique key for deduplication (title, link, and description for stricter uniqueness)
86
- article_key = f"{title}|{link}|{description[:50]}" # Use first 50 chars of description to avoid overly long keys
 
87
  if article_key not in seen_articles:
88
  seen_articles.add(article_key)
89
  unique_count += 1
@@ -120,7 +122,8 @@ def process_and_store_articles(articles):
120
  seen_docs = set() # Additional de-duplication at DB level
121
  for article in articles:
122
  try:
123
- key = f"{article['title']}|{article['link']}|{article['description'][:50]}"
 
124
  if key not in seen_docs:
125
  seen_docs.add(key)
126
  metadata = {
 
6
  from langchain.docstore.document import Document
7
  import shutil
8
  import logging
9
+ import hashlib
10
 
11
  # Setup logging
12
  logging.basicConfig(level=logging.INFO)
 
70
 
71
  def fetch_rss_feeds():
72
  articles = []
73
+ seen_articles = set() # Track unique articles by title, link, and description hash
74
  for feed_url in RSS_FEEDS:
75
  try:
76
  logger.info(f"Fetching feed: {feed_url}")
 
83
  title = entry.get("title", "No Title")
84
  link = entry.get("link", "")
85
  description = entry.get("summary", entry.get("description", "No Description"))
86
+ # Use MD5 hash of description for uniqueness
87
+ desc_hash = hashlib.md5(description.encode()).hexdigest()[:10]
88
+ article_key = f"{title}|{link}|{desc_hash}"
89
  if article_key not in seen_articles:
90
  seen_articles.add(article_key)
91
  unique_count += 1
 
122
  seen_docs = set() # Additional de-duplication at DB level
123
  for article in articles:
124
  try:
125
+ desc_hash = hashlib.md5(article["description"].encode()).hexdigest()[:10]
126
+ key = f"{article['title']}|{article['link']}|{desc_hash}"
127
  if key not in seen_docs:
128
  seen_docs.add(key)
129
  metadata = {