broadfield-dev committed on
Commit
78dac58
·
verified ·
1 Parent(s): 3156b44

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +19 -15
rss_processor.py CHANGED
@@ -80,7 +80,7 @@ def fetch_rss_feeds():
80
  for entry in feed.entries[:5]:
81
  title = entry.get("title", "No Title")
82
  link = entry.get("link", "")
83
- # Create a unique key for deduplication
84
  article_key = f"{title}|{link}"
85
  if article_key not in seen_articles:
86
  seen_articles.add(article_key)
@@ -93,7 +93,7 @@ def fetch_rss_feeds():
93
  "category": categorize_feed(feed_url),
94
  "image": image if image else "",
95
  })
96
- logger.info(f"Processed {len(feed.entries[:5])} unique entries from {feed_url}")
97
  except Exception as e:
98
  logger.error(f"Error fetching {feed_url}: {e}")
99
  return articles
@@ -114,21 +114,25 @@ def categorize_feed(url):
114
 
115
  def process_and_store_articles(articles):
116
  documents = []
 
117
  for article in articles:
118
  try:
119
- metadata = {
120
- "title": article["title"] or "No Title",
121
- "link": article["link"] or "",
122
- "original_description": article["description"] or "No Description",
123
- "published": article["published"] or "Unknown Date",
124
- "category": article["category"] or "Uncategorized",
125
- "image": article["image"] or "",
126
- }
127
- doc = Document(
128
- page_content=article["description"] or "No Description",
129
- metadata=metadata
130
- )
131
- documents.append(doc)
 
 
 
132
  except Exception as e:
133
  logger.error(f"Error processing article {article['title']}: {e}")
134
  try:
 
80
  for entry in feed.entries[:5]:
81
  title = entry.get("title", "No Title")
82
  link = entry.get("link", "")
83
+ # Create a unique key for deduplication (title and link)
84
  article_key = f"{title}|{link}"
85
  if article_key not in seen_articles:
86
  seen_articles.add(article_key)
 
93
  "category": categorize_feed(feed_url),
94
  "image": image if image else "",
95
  })
96
+ logger.info(f"Processed {len([e for e in feed.entries[:5] if f'{e.get('title', 'No Title')}|{e.get('link', '')}' not in seen_articles])} unique entries from {feed_url}")
97
  except Exception as e:
98
  logger.error(f"Error fetching {feed_url}: {e}")
99
  return articles
 
114
 
115
  def process_and_store_articles(articles):
116
  documents = []
117
+ seen_docs = set() # Additional de-duplication at DB level
118
  for article in articles:
119
  try:
120
+ key = f"{article['title']}|{article['link']}"
121
+ if key not in seen_docs:
122
+ seen_docs.add(key)
123
+ metadata = {
124
+ "title": article["title"] or "No Title",
125
+ "link": article["link"] or "",
126
+ "original_description": article["description"] or "No Description",
127
+ "published": article["published"] or "Unknown Date",
128
+ "category": article["category"] or "Uncategorized",
129
+ "image": article["image"] or "",
130
+ }
131
+ doc = Document(
132
+ page_content=article["description"] or "No Description",
133
+ metadata=metadata
134
+ )
135
+ documents.append(doc)
136
  except Exception as e:
137
  logger.error(f"Error processing article {article['title']}: {e}")
138
  try: