Update rss_processor.py
rss_processor.py  CHANGED  (+13 -10)
@@ -9,7 +9,7 @@ import shutil
 import rss_feeds
 from datetime import datetime
 import dateutil.parser  # For flexible date parsing
-import hashlib
+import hashlib  # For generating unique hashes
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -52,7 +52,7 @@ def fetch_rss_feeds():
         for entry in feed.entries:
             if article_count >= MAX_ARTICLES_PER_FEED:
                 break
-            title = entry.get("title", "No Title").strip().lower()
+            title = entry.get("title", "No Title").strip().lower()
             link = entry.get("link", "").strip().lower()
             description = entry.get("summary", entry.get("description", "No Description")).strip()
 
@@ -68,10 +68,9 @@ def fetch_rss_feeds():
                         logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
                         continue
 
-            # Use a robust key for deduplication
-
-            key = f"{title}|{link}|{published}"
-
+            # Use a robust key for deduplication, including a hash of the description
+            description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+            key = f"{title}|{link}|{published}|{description_hash}"
             if key not in seen_keys:
                 seen_keys.add(key)
                 # Try multiple image sources
@@ -107,7 +106,6 @@ def fetch_rss_feeds():
     return articles
 
 def categorize_feed(url):
-    # (Unchanged, keeping your existing categorization logic)
     if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
         return "Academic Papers"
     elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
@@ -143,8 +141,13 @@ def process_and_store_articles(articles):
     existing_ids = set(vector_db.get()["ids"])  # Get existing document IDs to avoid duplicates
     for article in articles:
         try:
-            # Create a unique ID based on normalized fields
-            doc_id = f"{article['title'].lower().strip()}|{article['link'].lower().strip()}|{article['published']}"
+            # Create a unique ID based on normalized fields, including description hash
+            title = article["title"].lower().strip()
+            link = article["link"].lower().strip()
+            description = article["description"].strip()
+            published = article["published"]
+            description_hash = hashlib.md5(description.encode('utf-8')).hexdigest()
+            doc_id = f"{title}|{link}|{published}|{description_hash}"
             if doc_id in existing_ids:
                 logger.debug(f"Skipping duplicate in DB: {doc_id}")
                 continue
@@ -156,7 +159,7 @@ def process_and_store_articles(articles):
                 "category": article["category"],
                 "image": article["image"],
             }
-            doc = Document(page_content=article["description"], metadata=metadata)
+            doc = Document(page_content=description, metadata=metadata, id=doc_id)
             documents.append(doc)
         except Exception as e:
             logger.error(f"Error processing article {article['title']}: {e}")
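
For reference, the deduplication change in fetch_rss_feeds comes down to hashing each entry's description and folding that hash into the key, so two entries with the same title and link but different bodies are no longer collapsed into one. The snippet below is a standalone sketch of that idea; make_dedup_key and the sample entries are illustrative, not code from this repository.

import hashlib

def make_dedup_key(title, link, published, description):
    # Normalize the short fields and hash the (potentially long) description.
    description_hash = hashlib.md5(description.encode("utf-8")).hexdigest()
    return f"{title.strip().lower()}|{link.strip().lower()}|{published}|{description_hash}"

seen_keys = set()
entries = [
    {"title": "Same Title", "link": "https://example.com/a", "published": "2025-01-01", "summary": "First body"},
    {"title": "Same Title", "link": "https://example.com/a", "published": "2025-01-01", "summary": "Second body"},
]
kept = []
for entry in entries:
    key = make_dedup_key(entry["title"], entry["link"], entry["published"], entry["summary"])
    if key not in seen_keys:
        seen_keys.add(key)
        kept.append(entry)
# Both entries are kept: same title/link/published, but different description hashes.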
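Similarly, a minimal sketch of the ID scheme process_and_store_articles now applies before writing to the vector store, under the assumption that Document here is LangChain's langchain_core Document (recent versions accept the optional id argument used in the diff). build_doc_id and the sample article are hypothetical and for illustration only, not code from this repo.

import hashlib
from langchain_core.documents import Document

def build_doc_id(article):
    # Same normalization as the feed-level dedup key: lowercased title/link
    # plus an MD5 hash of the stripped description.
    title = article["title"].lower().strip()
    link = article["link"].lower().strip()
    description_hash = hashlib.md5(article["description"].strip().encode("utf-8")).hexdigest()
    return f"{title}|{link}|{article['published']}|{description_hash}"

article = {
    "title": "Example Article",
    "link": "https://example.com/article",
    "description": "Body text of the article.",
    "published": "2025-01-01",
    "category": "Academic Papers",
    "image": "",
}
existing_ids = set()  # in the real code this comes from vector_db.get()["ids"]
doc_id = build_doc_id(article)
if doc_id not in existing_ids:
    doc = Document(
        page_content=article["description"],
        metadata={"category": article["category"], "image": article["image"]},
        id=doc_id,
    )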