Update rss_processor.py
rss_processor.py  CHANGED  (+17 -13)
@@ -7,16 +7,18 @@ import logging
 from huggingface_hub import HfApi, login
 import shutil
 import rss_feeds
+from datetime import datetime
+import dateutil.parser  # For flexible date parsing
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Constants
-MAX_ARTICLES_PER_FEED =
+MAX_ARTICLES_PER_FEED = 5
 LOCAL_DB_DIR = "chroma_db"
 RSS_FEEDS = rss_feeds.RSS_FEEDS
-COLLECTION_NAME = "news_articles"
+COLLECTION_NAME = "news_articles"
 
 HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
 REPO_ID = "broadfield-dev/news-rag-db"
@@ -35,9 +37,6 @@ vector_db = Chroma(
     collection_name=COLLECTION_NAME
 )
 
-from datetime import datetime
-import dateutil.parser  # Add this dependency: pip install python-dateutil
-
 def fetch_rss_feeds():
     articles = []
     seen_keys = set()
@@ -52,8 +51,8 @@ def fetch_rss_feeds():
         for entry in feed.entries:
             if article_count >= MAX_ARTICLES_PER_FEED:
                 break
-            title = entry.get("title", "No Title").strip()
-            link = entry.get("link", "").strip()
+            title = entry.get("title", "No Title").strip().lower()  # Normalize case and whitespace
+            link = entry.get("link", "").strip().lower()
             description = entry.get("summary", entry.get("description", "No Description")).strip()
 
             # Try multiple date fields and parse flexibly
@@ -68,6 +67,7 @@ def fetch_rss_feeds():
                         logger.debug(f"Failed to parse {date_field} '{entry[date_field]}': {e}")
                         continue
 
+            # Use a robust key for deduplication
             key = f"{title}|{link}|{published}"
             if key not in seen_keys:
                 seen_keys.add(key)
@@ -81,8 +81,8 @@ def fetch_rss_feeds():
                 ]:
                     try:
                         img = img_source(entry)
-                        if img:
-                            image = img
+                        if img and isinstance(img, str) and img.strip():
+                            image = img.strip()
                             break
                     except (IndexError, AttributeError, TypeError):
                         continue
@@ -96,12 +96,15 @@ def fetch_rss_feeds():
                        "image": image,
                    })
                    article_count += 1
+               else:
+                   logger.debug(f"Duplicate article skipped in feed {feed_url}: {key}")
        except Exception as e:
            logger.error(f"Error fetching {feed_url}: {e}")
    logger.info(f"Total articles fetched: {len(articles)}")
    return articles
 
 def categorize_feed(url):
+    # (Unchanged, keeping your existing categorization logic)
     if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
         return "Academic Papers"
     elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
@@ -131,16 +134,17 @@ def categorize_feed(url):
     elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
         return "Physics"
     return "Uncategorized"
-
+
 def process_and_store_articles(articles):
     documents = []
     existing_ids = set(vector_db.get()["ids"])  # Get existing document IDs to avoid duplicates
     for article in articles:
         try:
-            # Create a unique ID
-            doc_id = f"{article['title']}|{article['link']}|{article['published']}"
+            # Create a unique ID based on normalized fields
+            doc_id = f"{article['title'].lower()}|{article['link'].lower()}|{article['published']}"
             if doc_id in existing_ids:
-                continue
+                logger.debug(f"Skipping duplicate in DB: {doc_id}")
+                continue
             metadata = {
                 "title": article["title"],
                 "link": article["link"],
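
For reference, a minimal standalone sketch of the normalize-then-dedup approach this commit introduces in fetch_rss_feeds. The make_key helper, the sample entry, and the list of date fields are illustrative assumptions, not code from this repo; only dateutil.parser.parse and the "title|link|published" key format come from the diff above.

# Minimal sketch of the normalization + dedup-key logic (illustrative only).
import dateutil.parser

def make_key(entry):
    # Normalize case and whitespace so near-identical entries collapse to one key.
    title = entry.get("title", "No Title").strip().lower()
    link = entry.get("link", "").strip().lower()
    published = "Unknown Date"
    for date_field in ("published", "updated", "pubDate"):  # assumed field names
        if entry.get(date_field):
            try:
                published = dateutil.parser.parse(entry[date_field]).strftime("%Y-%m-%d %H:%M:%S")
                break
            except (ValueError, TypeError):
                continue
    return f"{title}|{link}|{published}"

seen_keys = set()
sample = {
    "title": "  Example Headline ",
    "link": "https://example.com/article",
    "published": "Tue, 04 Mar 2025 10:00:00 GMT",
}
# The second entry differs only in case and whitespace, so it maps to the same key.
for entry in (sample, dict(sample, title="example headline")):
    key = make_key(entry)
    if key in seen_keys:
        print("duplicate skipped:", key)
    else:
        seen_keys.add(key)
        print("stored:", key)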