Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files- rss_processor.py +30 -3
rss_processor.py
CHANGED
@@ -73,9 +73,36 @@ def fetch_rss_feeds():
|
|
73 |
return articles
|
74 |
|
75 |
def categorize_feed(url):
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
def process_and_store_articles(articles):
|
80 |
documents = []
|
81 |
existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
|
|
|
73 |
return articles
|
74 |
|
75 |
# Ordered (category, substrings) rules used by categorize_feed.
# Rules are tried top to bottom and the first rule containing a substring of
# the URL wins, so a pattern that is a more specific form of another pattern
# (e.g. "cnbc.com/market" vs "cnbc.com") must appear in an EARLIER rule.
_FEED_CATEGORY_RULES = (
    ("Academic Papers", (
        "nature", "science.org", "arxiv.org", "plos.org", "annualreviews.org",
        "journals.uchicago.edu", "jneurosci.org", "cell.com", "nejm.org",
        "lancet.com",
    )),
    # Market sections of sites whose bare domain is listed under "Business";
    # checked first, otherwise the bare-domain match would always shadow them.
    ("Stocks & Markets", (
        "cnbc.com/market", "marketwatch.com/market",
    )),
    ("Business", (
        "reuters.com/business", "bloomberg.com", "ft.com", "marketwatch.com",
        "cnbc.com", "foxbusiness.com", "wsj.com", "bworldonline.com",
        "economist.com", "forbes.com",
    )),
    ("Stocks & Markets", (
        "investing.com", "fool.co.uk", "zacks.com", "seekingalpha.com",
        "barrons.com", "yahoofinance.com",
    )),
    ("Federal Government", (
        "whitehouse.gov", "state.gov", "commerce.gov", "transportation.gov",
        "ed.gov", "dol.gov", "justice.gov", "federalreserve.gov", "occ.gov",
        "sec.gov", "bls.gov", "usda.gov", "gao.gov", "cbo.gov", "fema.gov",
        "defense.gov", "hhs.gov", "energy.gov", "interior.gov",
    )),
    # "swpc.noaa.gov" contains "noaa.gov" (a "Weather" pattern), so it has to
    # be tested before the "Weather" rule to ever take effect.
    ("Earth Weather", (
        "swpc.noaa.gov",
    )),
    ("Weather", (
        "weather.gov", "metoffice.gov.uk", "accuweather.com",
        "weatherunderground.com", "noaa.gov", "wunderground.com",
        "climate.gov", "ecmwf.int", "bom.gov.au",
    )),
    ("Data & Statistics", (
        "data.worldbank.org", "imf.org", "un.org", "oecd.org", "statista.com",
        "kff.org", "who.int", "cdc.gov", "bea.gov", "census.gov", "fdic.gov",
    )),
    ("Space", (
        "nasa", "spaceweatherlive", "space", "universetoday",
        "skyandtelescope", "esa",
    )),
    ("Science", (
        "sciencedaily", "quantamagazine", "smithsonianmag", "popsci",
        "discovermagazine", "scientificamerican", "newscientist",
        "livescience", "atlasobscura",
    )),
    ("Tech", (
        "wired", "techcrunch", "arstechnica", "gizmodo", "theverge",
    )),
    ("Astrology", (
        "horoscope", "astrostyle",
    )),
    ("Politics", (
        "cnn_allpolitics", "bbci.co.uk/news/politics",
        "reuters.com/arc/outboundfeeds/newsletter-politics",
        "politico.com/rss/politics", "thehill",
    )),
    ("Earth Weather", (
        "weather", "foxweather",
    )),
    ("Lifestyle", (
        "vogue",
    )),
    ("Physics", (
        "phys.org", "aps.org", "physicsworld",
    )),
)


def categorize_feed(url: str) -> str:
    """Return the category label for a feed URL.

    The URL is matched by plain, case-sensitive substring containment against
    the ordered rules in ``_FEED_CATEGORY_RULES``; the first rule with a
    matching substring decides the category.

    Compared with the previous ``if``/``elif`` chain, three patterns that
    could never match (an earlier, broader substring always won) are now
    tested before the rule that shadowed them:

    * ``cnbc.com/market`` and ``marketwatch.com/market`` -> "Stocks & Markets"
      (previously always "Business").
    * ``swpc.noaa.gov`` -> "Earth Weather" (previously always "Weather").

    Every other URL gets the same category as before.

    Args:
        url: Feed URL to classify.

    Returns:
        The category name, or ``"Uncategorized"`` when no pattern matches
        (including for an empty string).
    """
    for category, needles in _FEED_CATEGORY_RULES:
        if any(needle in url for needle in needles):
            return category
    return "Uncategorized"
|
105 |
+
|
106 |
def process_and_store_articles(articles):
|
107 |
documents = []
|
108 |
existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
|