broadfield-dev committed on
Commit e727948 · verified · parent: 6a2f937

Update rss_processor.py

Files changed (1)
  1. rss_processor.py +41 -1
rss_processor.py CHANGED

@@ -4,6 +4,7 @@ from langchain.vectorstores import Chroma
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.docstore.document import Document
 import logging
+from huggingface_hub import HfApi, login
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -52,6 +53,12 @@ RSS_FEEDS = [
     "https://www.discovermagazine.com/rss",
     "https://www.atlasobscura.com/feeds/latest"
 ]
+HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
+REPO_ID = "broadfield-dev/news-rag-db"
+
+# Initialize Hugging Face API
+login(token=HF_API_TOKEN)
+hf_api = HfApi()
 
 # Initialize embedding model and vector DB
 embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
@@ -122,6 +129,39 @@ def process_and_store_articles(articles):
     except Exception as e:
         logger.error(f"Error storing articles: {e}")
 
+def download_from_hf_hub():
+    if os.path.exists(LOCAL_DB_DIR):
+        shutil.rmtree(LOCAL_DB_DIR)
+    try:
+        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
+        logger.info(f"Downloading Chroma DB from {REPO_ID}...")
+        hf_api.download_repo(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
+    except Exception as e:
+        logger.error(f"Error downloading from Hugging Face Hub: {e}")
+        raise
+
+def upload_to_hf_hub():
+    if os.path.exists(LOCAL_DB_DIR):
+        try:
+            hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
+            logger.info(f"Uploading Chroma DB to {REPO_ID}...")
+            for root, _, files in os.walk(LOCAL_DB_DIR):
+                for file in files:
+                    local_path = os.path.join(root, file)
+                    remote_path = os.path.relpath(local_path, LOCAL_DB_DIR)
+                    hf_api.upload_file(
+                        path_or_fileobj=local_path,
+                        path_in_repo=remote_path,
+                        repo_id=REPO_ID,
+                        repo_type="dataset",
+                        token=HF_API_TOKEN
+                    )
+            logger.info(f"Database uploaded to: {REPO_ID}")
+        except Exception as e:
+            logger.error(f"Error uploading to Hugging Face Hub: {e}")
+            raise
+
 if __name__ == "__main__":
     articles = fetch_rss_feeds()
-    process_and_store_articles(articles)
+    process_and_store_articles(articles)
+    upload_to_hf_hub()
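
A note on the new download helper: `HfApi.download_repo` is not a method I can find in the `huggingface_hub` API, so `download_from_hf_hub` would likely fail with an `AttributeError` as written. The helpers also rely on `os` and `shutil` being imported earlier in rss_processor.py (those imports are not visible in the hunks above), and the function is defined but never called from the `__main__` block in this commit. A minimal sketch of the same step using `snapshot_download`, which is the documented way to pull a whole dataset repo into a local directory, reusing the module-level `hf_api`, `REPO_ID`, `HF_API_TOKEN`, `LOCAL_DB_DIR`, and `logger` from the diff:

import os
import shutil

from huggingface_hub import snapshot_download

def download_from_hf_hub():
    # Start from a clean local copy so stale Chroma files are not mixed in.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)
    try:
        # Make sure the dataset repo exists before pulling it.
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Downloading Chroma DB from {REPO_ID}...")
        # snapshot_download fetches every file in the repo into local_dir.
        snapshot_download(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
    except Exception as e:
        logger.error(f"Error downloading from Hugging Face Hub: {e}")
        raise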
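
The upload helper pushes files one at a time with `upload_file`, which creates one Hub commit per file. If that becomes noisy, `HfApi.upload_folder` can mirror the whole directory in a single call; a sketch under the same assumptions (module-level `hf_api`, `REPO_ID`, `HF_API_TOKEN`, `LOCAL_DB_DIR`, `logger`):

import os

def upload_to_hf_hub():
    if not os.path.exists(LOCAL_DB_DIR):
        return
    try:
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Uploading Chroma DB to {REPO_ID}...")
        # upload_folder mirrors the directory tree into the dataset repo
        # in one commit, replacing the manual os.walk / upload_file loop.
        hf_api.upload_folder(
            folder_path=LOCAL_DB_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_API_TOKEN,
        )
        logger.info(f"Database uploaded to: {REPO_ID}")
    except Exception as e:
        logger.error(f"Error uploading to Hugging Face Hub: {e}")
        raise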
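
Since `download_from_hf_hub` exists but nothing calls it yet, each run works from whatever Chroma files happen to be on local disk before uploading. A hypothetical `__main__` variant that restores the hosted DB first, assuming the Chroma store at `LOCAL_DB_DIR` is opened for appending (not visible in this diff); the warning message is illustrative:

if __name__ == "__main__":
    try:
        download_from_hf_hub()          # pull the existing Chroma DB, if any
    except Exception:
        logger.warning("Could not restore DB from the Hub; starting fresh.")
    articles = fetch_rss_feeds()
    process_and_store_articles(articles)
    upload_to_hf_hub()                  # push the refreshed DB back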