# Script to bootstrap a dataset
# Bootstrapping a dataset requires first running an SQL query in BigQuery and
# pasting its tab-separated results into this script.
# A dataset will then be created on the Hub with the initial data. This script
# will go step-by-step over the creation.
import os
import sys
from pathlib import Path

from datasets import Dataset
from huggingface_hub import dataset_info
from huggingface_hub.utils import RepositoryNotFoundError

# Define the library name you'd like to work on.
library_name = input("Library name: ").strip()
if not library_name:
    # An empty name would yield a ".csv" file and a malformed repo id.
    sys.exit("A library name is required.")

current_dir = Path(__file__).parent
csv_dir = current_dir / 'csv_files'
csv_path = csv_dir / f'{library_name}.csv'
repo_id = f'open-source-metrics/{library_name}-pip-installs'

# Make sure the local cache directory exists so that both the lookup below and
# the later write succeed on a fresh checkout.
csv_dir.mkdir(parents=True, exist_ok=True)

# Only ask the user to run the query when no results were saved previously.
if not csv_path.exists():
    # NOTE(review): the search string below is empty, so str.replace inserts
    # the library name between every character of the query. A placeholder
    # token matching query.sql appears to have been lost -- restore it here.
    query = (current_dir / 'query.sql').read_text(encoding='utf-8').replace("", library_name)

    print("Open the following link: https://console.cloud.google.com/bigquery?project=huggingface-ml\n")
    print(f"Run the following query:\n\n{query}\n\n")
    print("Please paste the results here (Ctrl+D once pasted): ")

    # Read everything up to EOF so multi-line results can be pasted in one go.
    csv_values = sys.stdin.read()
    csv_path.write_text(csv_values, encoding='utf-8')

# Probe the Hub so the user knows whether the repo is created or updated.
try:
    dataset_info(repo_id)
except RepositoryNotFoundError:
    dataset_exists = False
else:
    dataset_exists = True

if dataset_exists:
    print(f"Dataset {repo_id} already exists on the Hub and will be updated.")
else:
    print(f"Dataset {repo_id} does not exist on the Hub yet and will be created.")

# The saved results are parsed as tab-separated values.
dataset = Dataset.from_csv(str(csv_path), delimiter='\t')
dataset.push_to_hub(repo_id, private=True)

print("Dataset is pushed to Hub.")