Spaces:
Runtime error
Runtime error
# Script to bootstrap a dataset.
# Bootstrapping a dataset requires first running an SQL query in BigQuery and pasting its
# tab-separated results into this script, which saves them under csv_files/<library>.csv.
# A dataset will then be created on the Hub with the initial data. This script goes step by step through the creation.
import os | |
import sys | |
from pathlib import Path | |
from datasets import Dataset | |
from huggingface_hub import dataset_info | |
# Define the library name you'd like to work on. | |
from huggingface_hub.utils import RepositoryNotFoundError | |
# Ask which library to bootstrap. The name selects both the local CSV file
# (csv_files/<name>.csv) and the Hub repository
# (open-source-metrics/<name>-pip-installs).
library_name = input("Library name: ").strip()
if not library_name:
    # An empty name would yield ".csv" and "open-source-metrics/-pip-installs".
    raise SystemExit("A library name is required.")

current_dir = Path(__file__).parent
csv_dir = current_dir / 'csv_files'
csv_path = csv_dir / f'{library_name}.csv'
repo_id = f'open-source-metrics/{library_name}-pip-installs'

# Step 1: obtain the initial data. If no CSV has been saved for this library
# yet, show the BigQuery query to run and read the pasted results from stdin.
if not csv_path.exists():
    query = (current_dir / 'query.sql').read_text().replace("<PROJECT_NAME>", library_name)
    print("Open the following link: https://console.cloud.google.com/bigquery?project=huggingface-ml\n")
    print(f"Run the following query:\n\n{query}\n\n")
    print("Please paste the results here (Ctrl+D once pasted): ")
    csv_values = sys.stdin.read()

    # Refuse to persist an empty paste: an empty file would make every later
    # run skip this prompt and then fail while loading the CSV.
    if not csv_values.strip():
        raise SystemExit("No data was pasted; nothing has been written.")

    # The csv_files directory may not exist on a fresh checkout.
    csv_dir.mkdir(parents=True, exist_ok=True)
    # Write with an explicit encoding so the file does not depend on the
    # platform default.
    csv_path.write_text(csv_values, encoding='utf-8')

# Step 2: record whether the dataset repository already exists on the Hub.
# NOTE(review): nothing in this section reads `dataset_exists`; the push below
# runs whether or not the repository is already there. Confirm that pushing to
# an existing dataset is intended.
try:
    dataset_info(repo_id)
except RepositoryNotFoundError:
    dataset_exists = False
else:
    dataset_exists = True

# Step 3: load the tab-separated file and push it to the Hub as a private dataset.
dataset = Dataset.from_csv(str(csv_path), delimiter='\t')
dataset.push_to_hub(repo_id, private=True)
print("Dataset is pushed to Hub.")