broadfield-dev committed on
Commit
8179b58
·
verified ·
1 Parent(s): c4a29ef

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +7 -6
rss_processor.py CHANGED
@@ -69,7 +69,7 @@ vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_
69
 
70
  def fetch_rss_feeds():
71
  articles = []
72
- seen_articles = set() # Track unique articles by title and link
73
  for feed_url in RSS_FEEDS:
74
  try:
75
  logger.info(f"Fetching feed: {feed_url}")
@@ -78,11 +78,12 @@ def fetch_rss_feeds():
78
  logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
79
  continue
80
  unique_count = 0
81
- for entry in feed.entries[:100]:
82
  title = entry.get("title", "No Title")
83
  link = entry.get("link", "")
84
- # Create a unique key for deduplication (title and link)
85
- article_key = f"{title}|{link}"
 
86
  if article_key not in seen_articles:
87
  seen_articles.add(article_key)
88
  unique_count += 1
@@ -90,7 +91,7 @@ def fetch_rss_feeds():
90
  articles.append({
91
  "title": title,
92
  "link": link,
93
- "description": entry.get("summary", entry.get("description", "No Description")),
94
  "published": entry.get("published", "Unknown Date"),
95
  "category": categorize_feed(feed_url),
96
  "image": image if image else "",
@@ -119,7 +120,7 @@ def process_and_store_articles(articles):
119
  seen_docs = set() # Additional de-duplication at DB level
120
  for article in articles:
121
  try:
122
- key = f"{article['title']}|{article['link']}"
123
  if key not in seen_docs:
124
  seen_docs.add(key)
125
  metadata = {
 
69
 
70
  def fetch_rss_feeds():
71
  articles = []
72
+ seen_articles = set() # Track unique articles by title, link, and description
73
  for feed_url in RSS_FEEDS:
74
  try:
75
  logger.info(f"Fetching feed: {feed_url}")
 
78
  logger.warning(f"Failed to parse {feed_url}: {feed.bozo_exception}")
79
  continue
80
  unique_count = 0
81
+ for entry in feed.entries[:5]:
82
  title = entry.get("title", "No Title")
83
  link = entry.get("link", "")
84
+ description = entry.get("summary", entry.get("description", "No Description"))
85
+ # Create a unique key for deduplication (title, link, and description for stricter uniqueness)
86
+ article_key = f"{title}|{link}|{description[:50]}" # Use first 50 chars of description to avoid overly long keys
87
  if article_key not in seen_articles:
88
  seen_articles.add(article_key)
89
  unique_count += 1
 
91
  articles.append({
92
  "title": title,
93
  "link": link,
94
+ "description": description,
95
  "published": entry.get("published", "Unknown Date"),
96
  "category": categorize_feed(feed_url),
97
  "image": image if image else "",
 
120
  seen_docs = set() # Additional de-duplication at DB level
121
  for article in articles:
122
  try:
123
+ key = f"{article['title']}|{article['link']}|{article['description'][:50]}"
124
  if key not in seen_docs:
125
  seen_docs.add(key)
126
  metadata = {