broadfield-dev committed on
Commit
430a9bd
·
verified ·
1 Parent(s): 1c7cefc

Create rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +278 -0
rss_processor.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import feedparser
3
+ from langchain.vectorstores import Chroma
4
+ from langchain.embeddings import HuggingFaceEmbeddings
5
+ from langchain.docstore.document import Document
6
+ import logging
7
+ from huggingface_hub import HfApi, login
8
+ import shutil
9
+
10
# Setup logging
# Configure the root logger at INFO level and create the module-level logger
# used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants
# Directory in which the Chroma vector store is persisted locally; the same
# directory is what gets synced with the Hugging Face Hub dataset repo.
LOCAL_DB_DIR = "chroma_db"

# Feed URLs polled by fetch_rss_feeds(), grouped by topic for readability.
# The section comments below are informational only: the category stored
# with each article is derived by categorize_feed() from substrings of the
# URL, not from the position of the URL in this list.
# NOTE(review): several entries look like HTML index pages rather than feed
# documents (e.g. the sec.gov ".htm" and occ.gov ".html" URLs) - confirm they
# actually parse as feeds.
# NOTE(review): "https://www.weather.gov/rss/" and "https://www.weather.gov/rss"
# differ only by the trailing slash and presumably resolve to the same feed.
RSS_FEEDS = [
    # Academic Papers (Published Papers Across Fields)
    "https://www.nature.com/nature/current_issue/rss",  # Nature - High-brow science
    "https://www.science.org/action/showFeed?type=etoc&feed=rss",  # Science journal
    "https://arxiv.org/rss/cs",  # Computer Science arXiv
    "https://arxiv.org/rss/physics",  # Physics arXiv
    "https://arxiv.org/rss/math",  # Mathematics arXiv
    "https://arxiv.org/rss/astro-ph",  # Astrophysics arXiv
    "https://arxiv.org/rss/biology",  # Biology arXiv
    "https://arxiv.org/rss/econ",  # Economics arXiv
    "https://arxiv.org/rss/stat",  # Statistics arXiv
    "https://arxiv.org/rss/quant-ph",  # Quantum Physics arXiv
    "https://www.plos.org/feed/",  # PLOS (Public Library of Science) - multidisciplinary
    "https://www.journals.uchicago.edu/action/showFeed?type=etoc&feed=rss&jc=pnl",  # Philosophy (e.g., Philosophy of Science)
    "https://www.annualreviews.org/action/showFeed?type=etoc&feed=rss&jc=anrevo",  # Annual Reviews (various fields)
    "https://www.jneurosci.org/rss/current.xml",  # Journal of Neuroscience
    "https://www.cell.com/cell/rss",  # Cell Press - Biology/Medicine
    "https://www.nejm.org/rss/all-articles.xml",  # New England Journal of Medicine
    "https://www.lancet.com/rss/lancet_current.xml",  # The Lancet - Medicine

    # Business
    "https://www.reuters.com/arc/outboundfeeds/business-news/?outputType=rss",  # Reuters Business
    "https://www.bloomberg.com/feeds/rss/businessweek.xml",  # Bloomberg Businessweek
    "https://www.ft.com/rss/home",  # Financial Times
    "https://www.marketwatch.com/rss/topstories",  # MarketWatch Business & Finance
    "https://www.cnbc.com/id/100003114/device/rss/rss.html",  # CNBC Business News
    "https://www.foxbusiness.com/rss",  # Fox Business
    "https://www.wsj.com/xml/rss/3_7085.xml",  # Wall Street Journal - Business & Finance
    "https://www.bworldonline.com/feed/",  # BusinessWorld Online (Philippines)
    "https://www.economist.com/business/rss.xml",  # The Economist - Business
    "https://www.forbes.com/business/feed/",  # Forbes Business

    # Stocks & Markets
    "https://www.investing.com/rss/news_25.rss",  # Investing.com Stocks & Markets
    "https://www.cnbc.com/id/100727362/device/rss/rss.html",  # CNBC Market Data
    "https://www.marketwatch.com/rss/marketpulse",  # MarketWatch Market News
    "https://www.fool.co.uk/feed/",  # Motley Fool UK - Stock Investing
    "https://www.zacks.com/rss/zc.xml",  # Zacks Investment Research
    "https://seekingalpha.com/feed.xml",  # Seeking Alpha - Stock Analysis
    "https://www.barrons.com/rss.xml",  # Barron's - Market News
    "https://www.yahoofinance.com/news/rss",  # Yahoo Finance - Stocks & Markets

    # Federal Government
    "https://www.whitehouse.gov/feed/",  # White House News
    "https://www.state.gov/rss-feeds/",  # U.S. Department of State
    "https://www.commerce.gov/feeds/rss",  # U.S. Department of Commerce
    "https://www.transportation.gov/feeds/rss",  # U.S. Department of Transportation
    "https://www.ed.gov/rss/",  # U.S. Department of Education
    "https://www.dol.gov/newsroom/rss-feeds",  # U.S. Department of Labor
    "https://www.justice.gov/feeds/opa",  # U.S. Department of Justice
    "https://www.federalreserve.gov/feeds/",  # Federal Reserve Board
    "https://www.occ.gov/rss-feeds.html",  # Office of the Comptroller of the Currency
    "https://www.sec.gov/news/sec-rss-feeds.htm",  # SEC News
    "https://www.bls.gov/rss/",  # U.S. Bureau of Labor Statistics
    "https://www.usda.gov/rss/",  # U.S. Department of Agriculture
    "https://www.gao.gov/rss/",  # Government Accountability Office
    "https://www.cbo.gov/rss/",  # Congressional Budget Office
    "https://www.fema.gov/rss/",  # Federal Emergency Management Agency
    "https://www.defense.gov/Newsroom/RSS/",  # U.S. Department of Defense
    "https://www.hhs.gov/rss/",  # U.S. Department of Health & Human Services
    "https://www.energy.gov/rss/",  # U.S. Department of Energy
    "https://www.interior.gov/rss",  # U.S. Department of the Interior

    # Weather
    "https://www.weather.gov/rss/",  # National Weather Service
    "https://www.metoffice.gov.uk/weather/warnings-and-advice/uk-warnings/rss",  # UK Met Office
    "https://www.accuweather.com/rss",  # AccuWeather
    "https://www.weatherunderground.com/rss",  # Weather Underground
    "https://www.noaa.gov/rss",  # NOAA News & Weather
    "https://www.wunderground.com/weather/rss/",  # Weather Underground Forecasts
    "https://www.climate.gov/rss",  # NOAA Climate.gov
    "https://www.ecmwf.int/rss",  # European Centre for Medium-Range Weather Forecasts
    "https://www.bom.gov.au/rss/feeds.xml",  # Australian Bureau of Meteorology

    # Data & Statistics
    "https://data.worldbank.org/rss",  # World Bank Data
    "https://www.imf.org/external/np/weblogs/rss.aspx",  # IMF Blog & Data
    "https://www.un.org/en/rss-feed",  # United Nations News & Data
    "https://www.oecd.org/rss/",  # OECD Data & Publications
    "https://www.statista.com/rss/",  # Statista Data Insights
    "https://www.kff.org/rss/",  # Kaiser Family Foundation Health Data
    "https://www.who.int/rss.xml",  # World Health Organization News & Data
    "https://www.cdc.gov/rss/",  # CDC Data & Alerts
    "https://www.bea.gov/rss/",  # U.S. Bureau of Economic Analysis
    "https://www.census.gov/rss/",  # U.S. Census Bureau
    "https://www.fdic.gov/rss/",  # FDIC Data & News

    # Existing Categories (Expanded)
    "https://www.nasa.gov/rss/dyn/breaking_news.rss",  # Space
    "https://www.sciencedaily.com/rss/top/science.xml",  # Science
    "https://www.wired.com/feed/rss",  # Tech
    "https://www.horoscope.com/us/horoscopes/general/rss/horoscope-rss.aspx",  # Astrology
    "http://rss.cnn.com/rss/cnn_allpolitics.rss",  # Politics
    "https://phys.org/rss-feed/physics-news/",  # Physics
    "https://www.spaceweatherlive.com/en/news/rss",  # Solar & Space
    "https://weather.com/feeds/rss",  # Earth Weather
    "https://feeds.bbci.co.uk/news/politics/rss.xml",  # Politics
    "https://www.reuters.com/arc/outboundfeeds/newsletter-politics/?outputType=xml",  # Politics
    "https://www.politico.com/rss/politics.xml",  # Politics
    "https://thehill.com/feed/",  # Politics
    "https://www.aps.org/publications/apsnews/updates/rss.cfm",  # Physics
    "https://www.quantamagazine.org/feed/",  # Science
    "https://www.sciencedaily.com/rss/matter_energy/physics.xml",  # Physics
    "https://physicsworld.com/feed/",  # Physics
    "https://www.swpc.noaa.gov/rss.xml",  # Solar & Space
    "https://feeds.bbci.co.uk/weather/feeds/rss/5day/world/",  # Earth Weather
    "https://www.weather.gov/rss",  # Earth Weather
    "https://www.foxweather.com/rss",  # Earth Weather
    "https://techcrunch.com/feed/",  # Tech
    "https://arstechnica.com/feed/",  # Tech
    "https://gizmodo.com/rss",  # Tech
    "https://www.theverge.com/rss/index.xml",  # Tech
    "https://www.space.com/feeds/all",  # Space
    "https://www.universetoday.com/feed/",  # Space
    "https://skyandtelescope.org/feed/",  # Space
    "https://www.esa.int/rss",  # Space
    "https://www.smithsonianmag.com/rss/",  # Science
    "https://www.popsci.com/rss.xml",  # Science
    "https://www.discovermagazine.com/rss",  # Science
    "https://www.atlasobscura.com/feeds/latest",  # Science
    "https://www.nature.com/nature.rss",  # Science
    "https://www.scientificamerican.com/rss/",  # Science
    "https://www.newscientist.com/feed/home/",  # Science
    "https://www.livescience.com/feeds/all",  # Science
    "https://astrostyle.com/feed/",  # Astrology
    "https://www.vogue.com/feed/rss",  # Lifestyle
]
143
+
144
# Hugging Face credentials and target dataset repository.
# The token is read from the DEMO_HF_API_TOKEN environment variable; the
# placeholder default is kept so HF_API_TOKEN always holds a string.
_HF_TOKEN_PLACEHOLDER = "YOUR_HF_API_TOKEN"
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", _HF_TOKEN_PLACEHOLDER)
REPO_ID = "broadfield-dev/news-rag-db"

# Initialize Hugging Face API
# Only log in when a real token was supplied: login() validates the token,
# so calling it with the placeholder would make a plain import of this
# module fail before any function could be used.
if HF_API_TOKEN and HF_API_TOKEN != _HF_TOKEN_PLACEHOLDER:
    login(token=HF_API_TOKEN)
else:
    logger.warning(
        "DEMO_HF_API_TOKEN is not set; skipping Hugging Face login. "
        "Hub upload/download will fail until a valid token is provided."
    )
hf_api = HfApi()

# Initialize embedding model and vector DB
# The sentence-transformers model turns article text into vectors; Chroma
# stores them under LOCAL_DB_DIR.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
154
+
155
def _first_media_url(entry):
    """Return the first media or thumbnail URL of a feed entry, else "svg"."""
    for field in ("media_content", "media_thumbnail"):
        # `or []` also covers a present-but-empty list, which would make a
        # bare `[0]` raise IndexError.
        media_items = entry.get(field) or []
        if media_items:
            url = media_items[0].get("url")
            if url:
                return url
    return "svg"


def fetch_rss_feeds():
    """Download every feed in RSS_FEEDS and return the de-duplicated articles.

    Returns:
        A list of dicts with the keys "title", "link", "description",
        "published", "category" and "image". Entries sharing the same
        title, link and published date are only returned once.

    Errors for a single feed are logged and that feed is skipped; they never
    propagate to the caller.
    """
    articles = []
    seen_keys = set()
    for feed_url in RSS_FEEDS:
        try:
            logger.info(f"Fetching {feed_url}")
            feed = feedparser.parse(feed_url)
            if feed.bozo:
                logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
                # bozo only says the document was not well-formed; feedparser
                # may still have recovered entries, so give up on the feed
                # only when nothing usable came back.
                if not feed.entries:
                    continue
            # The category depends only on the feed URL, so compute it once
            # per feed instead of once per entry.
            category = categorize_feed(feed_url)
            for entry in feed.entries:
                title = entry.get("title", "No Title").strip()
                link = entry.get("link", "").strip()
                description = entry.get("summary", entry.get("description", "No Description"))
                published = entry.get("published", "Unknown Date").strip()
                key = f"{title}|{link}|{published}"
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                articles.append({
                    "title": title,
                    "link": link,
                    "description": description,
                    "published": published,
                    "category": category,
                    "image": _first_media_url(entry),
                })
        except Exception as e:
            # Best-effort crawl: one broken feed must not stop the others.
            logger.error(f"Error fetching {feed_url}: {e}")
    logger.info(f"Total articles fetched: {len(articles)}")
    return articles
187
+
188
# Ordered (category, URL substrings) rules used by categorize_feed().
# The first rule containing a matching substring wins, so rules with more
# specific patterns have to appear before rules with broader ones.
_CATEGORY_RULES = (
    ("Academic Papers", (
        "nature", "science.org", "arxiv.org", "plos.org", "annualreviews.org",
        "journals.uchicago.edu", "jneurosci.org", "cell.com", "nejm.org",
        "lancet.com",
    )),
    # Checked before "Business": the generic "cnbc.com" / "marketwatch.com"
    # patterns there would otherwise swallow the market-specific feeds.
    ("Stocks & Markets", (
        "investing.com", "cnbc.com/market", "cnbc.com/id/100727362",
        "marketwatch.com/market", "marketwatch.com/rss/marketpulse",
        "fool.co.uk", "zacks.com", "seekingalpha.com", "barrons.com",
        "yahoofinance.com",
    )),
    ("Business", (
        "reuters.com/business", "reuters.com/arc/outboundfeeds/business-news",
        "bloomberg.com", "ft.com", "marketwatch.com", "cnbc.com",
        "foxbusiness.com", "wsj.com", "bworldonline.com", "economist.com",
        "forbes.com",
    )),
    ("Federal Government", (
        "whitehouse.gov", "state.gov", "commerce.gov", "transportation.gov",
        "ed.gov", "dol.gov", "justice.gov", "federalreserve.gov", "occ.gov",
        "sec.gov", "bls.gov", "usda.gov", "gao.gov", "cbo.gov", "fema.gov",
        "defense.gov", "hhs.gov", "energy.gov", "interior.gov",
    )),
    ("Weather", (
        "weather.gov", "metoffice.gov.uk", "accuweather.com",
        "weatherunderground.com", "noaa.gov", "wunderground.com",
        "climate.gov", "ecmwf.int", "bom.gov.au",
    )),
    ("Data & Statistics", (
        "data.worldbank.org", "imf.org", "un.org", "oecd.org", "statista.com",
        "kff.org", "who.int", "cdc.gov", "bea.gov", "census.gov", "fdic.gov",
    )),
    ("Space", (
        "nasa", "spaceweatherlive", "space", "universetoday",
        "skyandtelescope", "esa",
    )),
    ("Science", (
        "sciencedaily", "quantamagazine", "smithsonianmag", "popsci",
        "discovermagazine", "scientificamerican", "newscientist",
        "livescience", "atlasobscura",
    )),
    ("Tech", ("wired", "techcrunch", "arstechnica", "gizmodo", "theverge")),
    ("Astrology", ("horoscope", "astrostyle")),
    ("Politics", (
        "cnn_allpolitics", "bbci.co.uk/news/politics",
        "reuters.com/arc/outboundfeeds/newsletter-politics",
        "politico.com/rss/politics", "thehill",
    )),
    ("Earth Weather", ("weather", "swpc.noaa.gov", "foxweather")),
    ("Lifestyle", ("vogue",)),
    ("Physics", ("phys.org", "aps.org", "physicsworld")),
)


def categorize_feed(url):
    """Return the category name for a feed URL.

    The URL is compared against the substrings in _CATEGORY_RULES in order
    and the category of the first matching rule is returned. Matching is a
    plain, case-sensitive substring test on the whole URL.

    Args:
        url: Feed URL as listed in RSS_FEEDS.

    Returns:
        The category string, or "Uncategorized" when no rule matches.
    """
    for category, needles in _CATEGORY_RULES:
        if any(needle in url for needle in needles):
            return category
    return "Uncategorized"
218
+
219
def process_and_store_articles(articles):
    """Convert article dicts into Documents and add them to the vector DB.

    Args:
        articles: Iterable of dicts as produced by fetch_rss_feeds(), each
            with the keys "title", "link", "description", "published",
            "category" and "image".

    The description becomes the document text; all fields are kept as
    metadata. Articles that cannot be converted are logged and skipped, and
    a storage failure is logged rather than raised.
    """
    documents = []
    for article in articles:
        try:
            metadata = {
                "title": article["title"],
                "link": article["link"],
                "original_description": article["description"],
                "published": article["published"],
                "category": article["category"],
                "image": article["image"],
            }
            documents.append(Document(page_content=article["description"], metadata=metadata))
        except Exception as e:
            # Look the title up defensively: if the failure above was a
            # missing key, indexing article['title'] here would raise again
            # from inside the handler and abort the whole batch.
            title = article.get("title", "<unknown>") if isinstance(article, dict) else "<unknown>"
            logger.error(f"Error processing article {title}: {e}")

    if not documents:
        return

    try:
        vector_db.add_documents(documents)
        logger.info(f"Stored {len(documents)} articles in DB")
    except Exception as e:
        logger.error(f"Error storing articles: {e}")
242
+
243
def download_from_hf_hub():
    """Replace the local Chroma directory with the copy stored on the Hub.

    Any existing LOCAL_DB_DIR is removed first, the dataset repo REPO_ID is
    created if it does not exist yet, and its files are then downloaded
    into LOCAL_DB_DIR.

    Raises:
        Exception: Whatever the Hub client raised; the error is logged and
            re-raised unchanged.
    """
    # HfApi has no `download_repo` method; `snapshot_download` is the call
    # that fetches a whole repository. Imported locally so this function
    # does not depend on a change to the file-level import list.
    from huggingface_hub import snapshot_download

    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)
    try:
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Downloading Chroma DB from {REPO_ID}...")
        snapshot_download(
            repo_id=REPO_ID,
            repo_type="dataset",
            local_dir=LOCAL_DB_DIR,
            token=HF_API_TOKEN,
        )
    except Exception as e:
        logger.error(f"Error downloading from Hugging Face Hub: {e}")
        raise
253
+
254
def upload_to_hf_hub():
    """Upload the contents of LOCAL_DB_DIR to the dataset repo REPO_ID.

    Does nothing (apart from a warning) when LOCAL_DB_DIR does not exist.
    The repo is created if needed and the directory is uploaded with its
    relative layout preserved.

    Raises:
        Exception: Whatever the Hub client raised; the error is logged and
            re-raised unchanged.
    """
    if not os.path.exists(LOCAL_DB_DIR):
        logger.warning(f"Nothing to upload: {LOCAL_DB_DIR} does not exist")
        return
    try:
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Uploading Chroma DB to {REPO_ID}...")
        # upload_folder pushes the whole tree in one commit and builds the
        # in-repo paths itself, instead of one commit per file with paths
        # assembled from OS-specific separators.
        hf_api.upload_folder(
            folder_path=LOCAL_DB_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_API_TOKEN,
        )
        logger.info(f"Database uploaded to: {REPO_ID}")
    except Exception as e:
        logger.error(f"Error uploading to Hugging Face Hub: {e}")
        raise
274
+
275
if __name__ == "__main__":
    # Script entry point: fetch all feeds, index the articles in the local
    # vector DB, then publish the DB directory to the Hub.
    process_and_store_articles(fetch_rss_feeds())
    upload_to_hf_hub()