Spaces:
Running
Running
Update rss_processor.py
Browse files — rss_processor.py (+41 −1)
rss_processor.py
CHANGED
@@ -4,6 +4,7 @@ from langchain.vectorstores import Chroma
|
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.docstore.document import Document
|
6 |
import logging
|
|
|
7 |
|
8 |
# Setup logging
|
9 |
logging.basicConfig(level=logging.INFO)
|
@@ -52,6 +53,12 @@ RSS_FEEDS = [
|
|
52 |
"https://www.discovermagazine.com/rss",
|
53 |
"https://www.atlasobscura.com/feeds/latest"
|
54 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
# Initialize embedding model and vector DB
|
57 |
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
@@ -122,6 +129,39 @@ def process_and_store_articles(articles):
|
|
122 |
except Exception as e:
|
123 |
logger.error(f"Error storing articles: {e}")
|
124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
if __name__ == "__main__":
|
126 |
articles = fetch_rss_feeds()
|
127 |
-
process_and_store_articles(articles)
|
|
|
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.docstore.document import Document
|
6 |
import logging
|
7 |
+
from huggingface_hub import HfApi, login
|
8 |
|
9 |
# Setup logging
|
10 |
logging.basicConfig(level=logging.INFO)
|
|
|
53 |
"https://www.discovermagazine.com/rss",
|
54 |
"https://www.atlasobscura.com/feeds/latest"
|
55 |
]
|
56 |
+
# Hugging Face Hub configuration for persisting the Chroma DB as a dataset repo.
# NOTE(review): the placeholder default means an unset DEMO_HF_API_TOKEN
# previously caused login() to fail at import time — guarded below.
HF_API_TOKEN = os.getenv("DEMO_HF_API_TOKEN", "YOUR_HF_API_TOKEN")
REPO_ID = "broadfield-dev/news-rag-db"

# Initialize Hugging Face API client. Only log in when a real token is
# configured; calling login() with the placeholder raises immediately and
# would make the whole module unimportable.
if HF_API_TOKEN and HF_API_TOKEN != "YOUR_HF_API_TOKEN":
    login(token=HF_API_TOKEN)
hf_api = HfApi()
62 |
|
63 |
# Initialize embedding model and vector DB
|
64 |
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
|
|
129 |
except Exception as e:
|
130 |
logger.error(f"Error storing articles: {e}")
|
131 |
|
132 |
+
def download_from_hf_hub():
    """Replace the local Chroma DB with a fresh copy from the HF dataset repo.

    Removes any existing LOCAL_DB_DIR first, then downloads the full
    repository snapshot into it.

    Raises:
        Exception: re-raised after logging, so callers do not proceed with a
            missing or stale database.
    """
    # Start clean: mixing a stale local DB with downloaded files could
    # corrupt the Chroma index.
    if os.path.exists(LOCAL_DB_DIR):
        shutil.rmtree(LOCAL_DB_DIR)
    try:
        # exist_ok=True makes the first run idempotent.
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Downloading Chroma DB from {REPO_ID}...")
        # BUG FIX: HfApi has no `download_repo` method (the original call
        # raised AttributeError every time); `snapshot_download` is the
        # supported API for fetching a whole repo into a local directory.
        hf_api.snapshot_download(repo_id=REPO_ID, repo_type="dataset", local_dir=LOCAL_DB_DIR, token=HF_API_TOKEN)
    except Exception as e:
        logger.error(f"Error downloading from Hugging Face Hub: {e}")
        raise
|
142 |
+
|
143 |
+
def upload_to_hf_hub():
    """Upload the local Chroma DB directory to the HF dataset repo.

    No-op when LOCAL_DB_DIR does not exist (nothing to persist yet).

    Raises:
        Exception: re-raised after logging, so the caller knows the remote
            copy was not refreshed.
    """
    # Guard clause: nothing to upload if the DB was never created.
    if not os.path.exists(LOCAL_DB_DIR):
        return
    try:
        hf_api.create_repo(repo_id=REPO_ID, repo_type="dataset", exist_ok=True, token=HF_API_TOKEN)
        logger.info(f"Uploading Chroma DB to {REPO_ID}...")
        # upload_folder pushes the whole tree in a single commit instead of
        # one commit per file (the previous os.walk + upload_file loop),
        # which is much faster and keeps the repo history clean. Relative
        # paths inside LOCAL_DB_DIR are preserved, matching the old
        # os.path.relpath behavior.
        hf_api.upload_folder(
            folder_path=LOCAL_DB_DIR,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_API_TOKEN,
        )
        logger.info(f"Database uploaded to: {REPO_ID}")
    except Exception as e:
        logger.error(f"Error uploading to Hugging Face Hub: {e}")
        raise
|
163 |
+
|
164 |
# Script entry point: pull the RSS feeds, index the articles into the
# vector DB, then push the refreshed database to the Hugging Face Hub.
if __name__ == "__main__":
    fetched = fetch_rss_feeds()
    process_and_store_articles(fetched)
    upload_to_hf_hub()
|