Update rss_processor.py
Browse files

rss_processor.py  +32 -45

rss_processor.py
CHANGED
@@ -13,9 +13,10 @@ logging.basicConfig(level=logging.INFO)
13      logger = logging.getLogger(__name__)
14
15      # Constants
16  -   MAX_ARTICLES_PER_FEED = 5
17      LOCAL_DB_DIR = "chroma_db"
18      RSS_FEEDS = rss_feeds.RSS_FEEDS
19
20      HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
21      REPO_ID = "broadfield-dev/news-rag-db"
@@ -24,9 +25,15 @@ REPO_ID = "broadfield-dev/news-rag-db"
24      login(token=HF_API_TOKEN)
25      hf_api = HfApi()
26
27  -   # Initialize embedding model
28      embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
29  -
30
31      def fetch_rss_feeds():
32          articles = []
@@ -66,40 +73,18 @@ def fetch_rss_feeds():
66          return articles
67
68      def categorize_feed(url):
69  -
70  -
71  -       elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
72  -           return "Business"
73  -       elif "investing.com" in url or "cnbc.com/market" in url or "marketwatch.com/market" in url or "fool.co.uk" in url or "zacks.com" in url or "seekingalpha.com" in url or "barrons.com" in url or "yahoofinance.com" in url:
74  -           return "Stocks & Markets"
75  -       elif "whitehouse.gov" in url or "state.gov" in url or "commerce.gov" in url or "transportation.gov" in url or "ed.gov" in url or "dol.gov" in url or "justice.gov" in url or "federalreserve.gov" in url or "occ.gov" in url or "sec.gov" in url or "bls.gov" in url or "usda.gov" in url or "gao.gov" in url or "cbo.gov" in url or "fema.gov" in url or "defense.gov" in url or "hhs.gov" in url or "energy.gov" in url or "interior.gov" in url:
76  -           return "Federal Government"
77  -       elif "weather.gov" in url or "metoffice.gov.uk" in url or "accuweather.com" in url or "weatherunderground.com" in url or "noaa.gov" in url or "wunderground.com" in url or "climate.gov" in url or "ecmwf.int" in url or "bom.gov.au" in url:
78  -           return "Weather"
79  -       elif "data.worldbank.org" in url or "imf.org" in url or "un.org" in url or "oecd.org" in url or "statista.com" in url or "kff.org" in url or "who.int" in url or "cdc.gov" in url or "bea.gov" in url or "census.gov" in url or "fdic.gov" in url:
80  -           return "Data & Statistics"
81  -       elif "nasa" in url or "spaceweatherlive" in url or "space" in url or "universetoday" in url or "skyandtelescope" in url or "esa" in url:
82  -           return "Space"
83  -       elif "sciencedaily" in url or "quantamagazine" in url or "smithsonianmag" in url or "popsci" in url or "discovermagazine" in url or "scientificamerican" in url or "newscientist" in url or "livescience" in url or "atlasobscura" in url:
84  -           return "Science"
85  -       elif "wired" in url or "techcrunch" in url or "arstechnica" in url or "gizmodo" in url or "theverge" in url:
86  -           return "Tech"
87  -       elif "horoscope" in url or "astrostyle" in url:
88  -           return "Astrology"
89  -       elif "cnn_allpolitics" in url or "bbci.co.uk/news/politics" in url or "reuters.com/arc/outboundfeeds/newsletter-politics" in url or "politico.com/rss/politics" in url or "thehill" in url:
90  -           return "Politics"
91  -       elif "weather" in url or "swpc.noaa.gov" in url or "foxweather" in url:
92  -           return "Earth Weather"
93  -       elif "vogue" in url:
94  -           return "Lifestyle"
95  -       elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
96  -           return "Physics"
97  -       return "Uncategorized"
98
99      def process_and_store_articles(articles):
100         documents = []
101         for article in articles:
102             try:
103                 metadata = {
104                     "title": article["title"],
105                     "link": article["link"],
@@ -108,7 +93,7 @@ def process_and_store_articles(articles):
108                     "category": article["category"],
109                     "image": article["image"],
110                 }
111 -               doc = Document(page_content=article["description"], metadata=metadata)
112                 documents.append(doc)
113             except Exception as e:
114                 logger.error(f"Error processing article {article['title']}: {e}")
@@ -116,26 +101,28 @@ def process_and_store_articles(articles):
116         if documents:
117             try:
118                 vector_db.add_documents(documents)
119 -
120             except Exception as e:
121                 logger.error(f"Error storing articles: {e}")
122
123     def download_from_hf_hub():
124 -       if
125 -
126 -
127 -
128 -
129 -
130 -
131 -
132 -
133
134     def upload_to_hf_hub():
135         if os.path.exists(LOCAL_DB_DIR):
136             try:
137 -
138 -           logger.info(f"Uploading Chroma DB to {REPO_ID}...")
139                 for root, _, files in os.walk(LOCAL_DB_DIR):
140                     for file in files:
141                         local_path = os.path.join(root, file)
rss_processor.py (new version, right-hand side of the diff):

13      logger = logging.getLogger(__name__)
14
15      # Constants
16  +   MAX_ARTICLES_PER_FEED = 5
17      LOCAL_DB_DIR = "chroma_db"
18      RSS_FEEDS = rss_feeds.RSS_FEEDS
19  +   COLLECTION_NAME = "news_articles"  # Explicitly name the collection
20
21      HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
22      REPO_ID = "broadfield-dev/news-rag-db"

25      login(token=HF_API_TOKEN)
26      hf_api = HfApi()
27
28  +   # Initialize embedding model (global, reusable)
29      embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
30  +
31  +   # Initialize vector DB with a specific collection name
32  +   vector_db = Chroma(
33  +       persist_directory=LOCAL_DB_DIR,
34  +       embedding_function=embedding_model,
35  +       collection_name=COLLECTION_NAME
36  +   )
37
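Naming the collection makes the same logical store addressable across restarts instead of relying on Chroma's default collection. As a rough usage sketch (not part of this commit; the query string is invented for illustration), the module-level vector_db defined above can be queried directly:

# Illustrative only: query the named "news_articles" collection built above.
results = vector_db.similarity_search("federal reserve rate decision", k=3)
for doc in results:
    print(doc.metadata.get("title"), "-", doc.metadata.get("link"))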
38      def fetch_rss_feeds():
39          articles = []

73          return articles
74
75      def categorize_feed(url):
76  +       # (Unchanged, keeping your existing categorization logic)
77  +       # ...
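The committed file leaves categorize_feed elided behind a placeholder comment; the logic it keeps is the long elif chain removed above. As a design note only (not the committed code), the same URL-substring matching can be written as a lookup table; the sketch below abridges the patterns to a few of the originals:

# Sketch of a table-driven equivalent of the elif chain (patterns abridged).
CATEGORY_PATTERNS = [
    ("Business", ("reuters.com/business", "bloomberg.com", "ft.com", "forbes.com")),
    ("Stocks & Markets", ("investing.com", "seekingalpha.com", "barrons.com")),
    ("Tech", ("wired", "techcrunch", "arstechnica", "gizmodo", "theverge")),
    # ... remaining categories follow the same shape
]

def categorize_feed_sketch(url):
    for category, patterns in CATEGORY_PATTERNS:
        if any(p in url for p in patterns):
            return category
    return "Uncategorized"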
78
79      def process_and_store_articles(articles):
80          documents = []
81  +       existing_ids = set(vector_db.get()["ids"])  # Get existing document IDs to avoid duplicates
82          for article in articles:
83              try:
84  +               # Create a unique ID for deduplication
85  +               doc_id = f"{article['title']}|{article['link']}|{article['published']}"
86  +               if doc_id in existing_ids:
87  +                   continue  # Skip if already in DB
88                  metadata = {
89                      "title": article["title"],
90                      "link": article["link"],

93                      "category": article["category"],
94                      "image": article["image"],
95                  }
96  +               doc = Document(page_content=article["description"], metadata=metadata, id=doc_id)
97                  documents.append(doc)
98              except Exception as e:
99                  logger.error(f"Error processing article {article['title']}: {e}")

101         if documents:
102             try:
103                 vector_db.add_documents(documents)
104 +               vector_db.persist()  # Explicitly persist changes
105 +               logger.info(f"Added {len(documents)} new articles to DB")
106             except Exception as e:
107                 logger.error(f"Error storing articles: {e}")
108
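The dedup scheme above keys each article on title|link|published and attaches that key via Document(..., id=doc_id). Whether the id field on Document is actually used for storage depends on the langchain version in play, so a common alternative is to hand the ids to add_documents explicitly. A minimal sketch under that assumption (not the committed code; metadata trimmed for brevity):

# Sketch: collect ids alongside documents and pass them to Chroma explicitly.
new_docs, new_ids = [], []
for article in articles:
    doc_id = f"{article['title']}|{article['link']}|{article['published']}"
    if doc_id in existing_ids:
        continue
    new_ids.append(doc_id)
    new_docs.append(Document(page_content=article["description"],
                             metadata={"title": article["title"], "link": article["link"]}))
if new_docs:
    vector_db.add_documents(new_docs, ids=new_ids)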
109     def download_from_hf_hub():
110 +       # Only download if the local DB doesn’t exist (initial setup)
111 +       if not os.path.exists(LOCAL_DB_DIR):
112 +           try:
113 +               hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
114 +               logger.info(f"Downloading Chroma DB from {REPO_ID}...")
115 +               hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
116 +           except Exception as e:
117 +               logger.error(f"Error downloading from Hugging Face Hub: {e}")
118 +               raise
119 +       else:
120 +           logger.info("Local Chroma DB already exists, skipping download.")
121
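For reference, the documented huggingface_hub call for pulling an entire repo is snapshot_download; a minimal sketch of the same initial-setup behaviour using it (an assumption about the intended logic, not the committed code):

from huggingface_hub import snapshot_download

# Sketch: fetch the dataset repo into LOCAL_DB_DIR only when it is missing.
if not os.path.exists(LOCAL_DB_DIR):
    snapshot_download(repo_id=REPO_ID, repo_type="dataset",
                      local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)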
122     def upload_to_hf_hub():
123         if os.path.exists(LOCAL_DB_DIR):
124             try:
125 +               logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
126                 for root, _, files in os.walk(LOCAL_DB_DIR):
127                     for file in files:
128                         local_path = os.path.join(root, file)
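The upload side walks LOCAL_DB_DIR and pushes files one by one; the diff is cut off at this point. As a sketch of an alternative (illustrative only; the committed code keeps the per-file loop), the whole directory can be pushed with a single upload_folder call:

# Sketch: push the whole Chroma directory to the dataset repo in one call.
if os.path.exists(LOCAL_DB_DIR):
    hf_api.upload_folder(folder_path=LOCAL_DB_DIR, repo_id=REPO_ID,
                         repo_type="dataset", token=HF_API_TOKEN)
    logger.info(f"Uploaded Chroma DB to {REPO_ID}")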