broadfield-dev commited on
Commit
bc16436
·
verified ·
1 Parent(s): 4868a00

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +32 -45
rss_processor.py CHANGED
@@ -13,9 +13,10 @@ logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
  # Constants
16
- MAX_ARTICLES_PER_FEED = 5 # Set to 5 for testing, increase later as needed
17
  LOCAL_DB_DIR = "chroma_db"
18
  RSS_FEEDS = rss_feeds.RSS_FEEDS
 
19
 
20
  HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
21
  REPO_ID = "broadfield-dev/news-rag-db"
@@ -24,9 +25,15 @@ REPO_ID = "broadfield-dev/news-rag-db"
24
  login(token=HF_API_TOKEN)
25
  hf_api = HfApi()
26
 
27
- # Initialize embedding model and vector DB
28
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
29
- vector_db = Chroma(persist_directory=LOCAL_DB_DIR, embedding_function=embedding_model)
 
 
 
 
 
 
30
 
31
  def fetch_rss_feeds():
32
  articles = []
@@ -66,40 +73,18 @@ def fetch_rss_feeds():
66
  return articles
67
 
68
  def categorize_feed(url):
69
- if "nature" in url or "science.org" in url or "arxiv.org" in url or "plos.org" in url or "annualreviews.org" in url or "journals.uchicago.edu" in url or "jneurosci.org" in url or "cell.com" in url or "nejm.org" in url or "lancet.com" in url:
70
- return "Academic Papers"
71
- elif "reuters.com/business" in url or "bloomberg.com" in url or "ft.com" in url or "marketwatch.com" in url or "cnbc.com" in url or "foxbusiness.com" in url or "wsj.com" in url or "bworldonline.com" in url or "economist.com" in url or "forbes.com" in url:
72
- return "Business"
73
- elif "investing.com" in url or "cnbc.com/market" in url or "marketwatch.com/market" in url or "fool.co.uk" in url or "zacks.com" in url or "seekingalpha.com" in url or "barrons.com" in url or "yahoofinance.com" in url:
74
- return "Stocks & Markets"
75
- elif "whitehouse.gov" in url or "state.gov" in url or "commerce.gov" in url or "transportation.gov" in url or "ed.gov" in url or "dol.gov" in url or "justice.gov" in url or "federalreserve.gov" in url or "occ.gov" in url or "sec.gov" in url or "bls.gov" in url or "usda.gov" in url or "gao.gov" in url or "cbo.gov" in url or "fema.gov" in url or "defense.gov" in url or "hhs.gov" in url or "energy.gov" in url or "interior.gov" in url:
76
- return "Federal Government"
77
- elif "weather.gov" in url or "metoffice.gov.uk" in url or "accuweather.com" in url or "weatherunderground.com" in url or "noaa.gov" in url or "wunderground.com" in url or "climate.gov" in url or "ecmwf.int" in url or "bom.gov.au" in url:
78
- return "Weather"
79
- elif "data.worldbank.org" in url or "imf.org" in url or "un.org" in url or "oecd.org" in url or "statista.com" in url or "kff.org" in url or "who.int" in url or "cdc.gov" in url or "bea.gov" in url or "census.gov" in url or "fdic.gov" in url:
80
- return "Data & Statistics"
81
- elif "nasa" in url or "spaceweatherlive" in url or "space" in url or "universetoday" in url or "skyandtelescope" in url or "esa" in url:
82
- return "Space"
83
- elif "sciencedaily" in url or "quantamagazine" in url or "smithsonianmag" in url or "popsci" in url or "discovermagazine" in url or "scientificamerican" in url or "newscientist" in url or "livescience" in url or "atlasobscura" in url:
84
- return "Science"
85
- elif "wired" in url or "techcrunch" in url or "arstechnica" in url or "gizmodo" in url or "theverge" in url:
86
- return "Tech"
87
- elif "horoscope" in url or "astrostyle" in url:
88
- return "Astrology"
89
- elif "cnn_allpolitics" in url or "bbci.co.uk/news/politics" in url or "reuters.com/arc/outboundfeeds/newsletter-politics" in url or "politico.com/rss/politics" in url or "thehill" in url:
90
- return "Politics"
91
- elif "weather" in url or "swpc.noaa.gov" in url or "foxweather" in url:
92
- return "Earth Weather"
93
- elif "vogue" in url:
94
- return "Lifestyle"
95
- elif "phys.org" in url or "aps.org" in url or "physicsworld" in url:
96
- return "Physics"
97
- return "Uncategorized"
98
 
99
  def process_and_store_articles(articles):
100
  documents = []
 
101
  for article in articles:
102
  try:
 
 
 
 
103
  metadata = {
104
  "title": article["title"],
105
  "link": article["link"],
@@ -108,7 +93,7 @@ def process_and_store_articles(articles):
108
  "category": article["category"],
109
  "image": article["image"],
110
  }
111
- doc = Document(page_content=article["description"], metadata=metadata)
112
  documents.append(doc)
113
  except Exception as e:
114
  logger.error(f"Error processing article {article['title']}: {e}")
@@ -116,26 +101,28 @@ def process_and_store_articles(articles):
116
  if documents:
117
  try:
118
  vector_db.add_documents(documents)
119
- logger.info(f"Stored {len(documents)} articles in DB")
 
120
  except Exception as e:
121
  logger.error(f"Error storing articles: {e}")
122
 
123
  def download_from_hf_hub():
124
- if os.path.exists(LOCAL_DB_DIR):
125
- shutil.rmtree(LOCAL_DB_DIR)
126
- try:
127
- hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
128
- logger.info(f"Downloading Chroma DB from {REPO_ID}...")
129
- hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
130
- except Exception as e:
131
- logger.error(f"Error downloading from Hugging Face Hub: {e}")
132
- raise
 
 
133
 
134
  def upload_to_hf_hub():
135
  if os.path.exists(LOCAL_DB_DIR):
136
  try:
137
- hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
138
- logger.info(f"Uploading Chroma DB to {REPO_ID}...")
139
  for root, _, files in os.walk(LOCAL_DB_DIR):
140
  for file in files:
141
  local_path = os.path.join(root, file)
 
13
  logger = logging.getLogger(__name__)
14
 
15
  # Constants
16
+ MAX_ARTICLES_PER_FEED = 5
17
  LOCAL_DB_DIR = "chroma_db"
18
  RSS_FEEDS = rss_feeds.RSS_FEEDS
19
+ COLLECTION_NAME = "news_articles" # Explicitly name the collection
20
 
21
  HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
22
  REPO_ID = "broadfield-dev/news-rag-db"
 
25
  login(token=HF_API_TOKEN)
26
  hf_api = HfApi()
27
 
28
+ # Initialize embedding model (global, reusable)
29
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
30
+
31
+ # Initialize vector DB with a specific collection name
32
+ vector_db = Chroma(
33
+ persist_directory=LOCAL_DB_DIR,
34
+ embedding_function=embedding_model,
35
+ collection_name=COLLECTION_NAME
36
+ )
37
 
38
  def fetch_rss_feeds():
39
  articles = []
 
73
  return articles
74
 
75
  def categorize_feed(url):
76
+ # (Unchanged, keeping your existing categorization logic)
77
+ # ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
  def process_and_store_articles(articles):
80
  documents = []
81
+ existing_ids = set(vector_db.get()["ids"]) # Get existing document IDs to avoid duplicates
82
  for article in articles:
83
  try:
84
+ # Create a unique ID for deduplication
85
+ doc_id = f"{article['title']}|{article['link']}|{article['published']}"
86
+ if doc_id in existing_ids:
87
+ continue # Skip if already in DB
88
  metadata = {
89
  "title": article["title"],
90
  "link": article["link"],
 
93
  "category": article["category"],
94
  "image": article["image"],
95
  }
96
+ doc = Document(page_content=article["description"], metadata=metadata, id=doc_id)
97
  documents.append(doc)
98
  except Exception as e:
99
  logger.error(f"Error processing article {article['title']}: {e}")
 
101
  if documents:
102
  try:
103
  vector_db.add_documents(documents)
104
+ vector_db.persist() # Explicitly persist changes
105
+ logger.info(f"Added {len(documents)} new articles to DB")
106
  except Exception as e:
107
  logger.error(f"Error storing articles: {e}")
108
 
109
  def download_from_hf_hub():
110
+ # Only download if the local DB doesn’t exist (initial setup)
111
+ if not os.path.exists(LOCAL_DB_DIR):
112
+ try:
113
+ hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
114
+ logger.info(f"Downloading Chroma DB from {REPO_ID}...")
115
+ hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
116
+ except Exception as e:
117
+ logger.error(f"Error downloading from Hugging Face Hub: {e}")
118
+ raise
119
+ else:
120
+ logger.info("Local Chroma DB already exists, skipping download.")
121
 
122
  def upload_to_hf_hub():
123
  if os.path.exists(LOCAL_DB_DIR):
124
  try:
125
+ logger.info(f"Uploading updated Chroma DB to {REPO_ID}...")
 
126
  for root, _, files in os.walk(LOCAL_DB_DIR):
127
  for file in files:
128
  local_path = os.path.join(root, file)