# inkling / update_dataset.py
# Author: nomadicsynth
# Commit 261056f: Add DatasetManager for handling dataset operations and
# update dataset with new papers
import os
from dataset_utils import DatasetManager
from app import init_embedding_model
# Name of the dataset handed to DatasetManager
# (presumably a Hugging Face Hub repository id -- confirm).
dataset_name = "nomadicsynth/arxiv-dataset-abstract-embeddings"
# Token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")
def main() -> None:
    """Update the dataset named by ``dataset_name`` with new papers.

    Builds the embedding model with ``init_embedding_model``, passes it to a
    ``DatasetManager`` together with the token, and then calls
    ``update_dataset_with_new_papers()`` on that manager.
    """
    # NOTE(review): HF_TOKEN is None when the environment variable is unset;
    # presumably the update needs an authenticated token -- confirm.

    # Initialize the embedding model at an explicitly requested revision
    embedding_model = init_embedding_model(
        model_name_or_path="nomadicsynth/research-compass-arxiv-abstracts-embedding-model",
        model_revision="2025-01-28_23-06-17-1epochs-12batch-32eval-512embed-final",
        hf_token=HF_TOKEN,
    )

    # Initialize DatasetManager with the embedding model
    dataset_manager = DatasetManager(
        dataset_name=dataset_name,
        hf_token=HF_TOKEN,
        embedding_model=embedding_model,
    )

    # Update the dataset with new papers
    dataset_manager.update_dataset_with_new_papers()


# Run the update only when executed as a script, not when imported.
if __name__ == "__main__":
    main()