broadfield-dev committed on
Commit
4f97b8a
·
verified ·
1 Parent(s): bc16436

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +30 -3
rss_processor.py CHANGED
@@ -73,9 +73,36 @@ def fetch_rss_feeds():
73
  return articles
74
 
75
  def categorize_feed(url):
76
- # (Unchanged, keeping your existing categorization logic)
77
- # ...
78
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def process_and_store_articles(articles):
80
  documents = []
81
  existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
 
73
  return articles
74
 
75
  def categorize_feed(url):
76
+ if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
77
+ return "Academic Papers"
78
+ elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
79
+ return "Business"
80
+ elif "investing.com" in url or "cnbc.com/market" in url or "marketwatch.com/market" in url or "fool.co.uk" in url or "zacks.com" in url or "seekingalpha.com" in url or "barrons.com" in url or "yahoofinance.com" in url:
81
+ return "Stocks & Markets"
82
+ elif "whitehouse.gov" in url or "state.gov" in url or "commerce.gov" in url or "transportation.gov" in url or "ed.gov" in url or "dol.gov" in url or "justice.gov" in url or "federalreserve.gov" in url or "occ.gov" in url or "sec.gov" in url or "bls.gov" in url or "usda.gov" in url or "gao.gov" in url or "cbo.gov" in url or "fema.gov" in url or "defense.gov" in url or "hhs.gov" in url or "energy.gov" in url or "interior.gov" in url:
83
+ return "Federal Government"
84
+ elif "weather.gov" in url or "metoffice.gov.uk" in url or "accuweather.com" in url or "weatherunderground.com" in url or "noaa.gov" in url or "wunderground.com" in url or "climate.gov" in url or "ecmwf.int" in url or "bom.gov.au" in url:
85
+ return "Weather"
86
+ elif "data.worldbank.org" in url or "imf.org" in url or "un.org" in url or "oecd.org" in url or "statista.com" in url or "kff.org" in url or "who.int" in url or "cdc.gov" in url or "bea.gov" in url or "census.gov" in url or "fdic.gov" in url:
87
+ return "Data & Statistics"
88
+ elif "nasa" in url or "spaceweatherlive" in url or "space" in url or "universetoday" in url or "skyandtelescope" in url or "esa" in url:
89
+ return "Space"
90
+ elif "sciencedaily" in url or "quantamagazine" in url or "smithsonianmag" in url or "popsci" in url or "discovermagazine" in url or "scientificamerican" in url or "newscientist" in url or "livescience" in url or "atlasobscura" in url:
91
+ return "Science"
92
+ elif "wired" in url or "techcrunch" in url or "arstechnica" in url or "gizmodo" in url or "theverge" in url:
93
+ return "Tech"
94
+ elif "horoscope" in url or "astrostyle" in url:
95
+ return "Astrology"
96
+ elif "cnn_allpolitics" in url or "bbci.co.uk/news/politics" in url or "reuters.com/arc/outboundfeeds/newsletter-politics" in url or "politico.com/rss/politics" in url or "thehill" in url:
97
+ return "Politics"
98
+ elif "weather" in url or "swpc.noaa.gov" in url or "foxweather" in url:
99
+ return "Earth Weather"
100
+ elif "vogue" in url:
101
+ return "Lifestyle"
102
+ elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
103
+ return "Physics"
104
+ return "Uncategorized"
105
+
106
  def process_and_store_articles(articles):
107
  documents = []
108
  existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates