import os
import logging
import pickle  # For saving the dataset locally

from datasets import load_dataset

from config import ConfigConstants


def load_data(data_set_name):
    """Return the "test" split of a RAGBench subset, using a local pickle cache.

    The dataset is looked up in ``<DATA_SET_PATH>local_datasets`` under the
    file name ``<data_set_name>_test.pkl``. When a readable cache file exists
    it is unpickled and returned; otherwise the subset is loaded with
    ``load_dataset("rungalileo/ragbench", data_set_name, split="test")`` and
    pickled to that file for later calls.

    Args:
        data_set_name: Configuration name passed to ``load_dataset``; also
            used to build the cache file name.

    Returns:
        The loaded dataset object (whatever ``load_dataset`` / the cache
        yields; it is expected to expose ``num_rows``).

    Raises:
        OSError: If the cache directory or cache file cannot be created,
            read, or written.
    """
    # NOTE(review): plain string concatenation, so DATA_SET_PATH is presumably
    # expected to end with a path separator. Kept as-is so that existing
    # cache locations do not move - confirm against ConfigConstants.
    local_path = ConfigConstants.DATA_SET_PATH + 'local_datasets'
    os.makedirs(local_path, exist_ok=True)

    dataset_file = os.path.join(local_path, f"{data_set_name}_test.pkl")

    dataset = None
    if os.path.exists(dataset_file):
        logging.info(f"Loading dataset {data_set_name} from local storage. File location {dataset_file}")
        try:
            # SECURITY: pickle.load executes arbitrary code from the file.
            # Only safe as long as the cache directory is trusted.
            with open(dataset_file, "rb") as f:
                dataset = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            # A truncated or corrupt cache (e.g. from an interrupted write)
            # is treated as a cache miss instead of failing every call.
            logging.warning(
                f"Cached dataset file {dataset_file} is unreadable; reloading from Hugging Face"
            )
            dataset = None

    if dataset is None:
        logging.info("Loading dataset from Hugging Face")
        dataset = load_dataset("rungalileo/ragbench", data_set_name, split="test")

        logging.info(f"Saving {data_set_name} dataset locally")
        # Write to a temporary file in the same directory and atomically move
        # it into place, so a failed dump never leaves a partial cache file
        # at the final path.
        tmp_file = f"{dataset_file}.{os.getpid()}.tmp"
        try:
            with open(tmp_file, "wb") as f:
                pickle.dump(dataset, f)
            os.replace(tmp_file, dataset_file)
        finally:
            # After a successful replace the temp file no longer exists; this
            # only cleans up after a failed dump.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    logging.info("Dataset loaded successfully")
    logging.info(f"Number of documents found: {dataset.num_rows}")
    return dataset