File size: 1,436 Bytes
38e70c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# Script to bootstrap a dataset
# Bootstrapping a dataset requires first running an SQL query in BigQuery and pasting its results into this script,
# which stores them as a tab-separated file under csv_files/.
# A dataset will then be created on the Hub with the initial data. This script will go step-by-step over the creation.

import os
import sys
from pathlib import Path
from datasets import Dataset
from huggingface_hub import dataset_info

# Define the library name you'd like to work on.
from huggingface_hub.utils import RepositoryNotFoundError

# Ask which library to bootstrap. The name selects the cached CSV file, is
# substituted into the SQL query, and forms part of the Hub repository id.
library_name = input("Library name: ").strip()
if not library_name:
    # An empty name would yield '.csv' and 'open-source-metrics/-pip-installs'.
    sys.exit("A library name is required.")

current_dir = Path(__file__).parent
csv_dir = current_dir / 'csv_files'
csv_path = csv_dir / f'{library_name}.csv'
repo_id = f'open-source-metrics/{library_name}-pip-installs'

# Only ask the user to run the query when no cached results exist for this
# library. Checking the file itself (rather than listing the directory) also
# works when 'csv_files' has not been created yet.
if not csv_path.is_file():
    query = (current_dir / 'query.sql').read_text(encoding='utf-8').replace("<PROJECT_NAME>", library_name)

    print("Open the following link: https://console.cloud.google.com/bigquery?project=huggingface-ml\n")
    print(f"Run the following query:\n\n{query}\n\n")
    print("Please paste the results here (Ctrl+D once pasted): ")
    csv_values = sys.stdin.read()

    if not csv_values.strip():
        # Do not cache an empty file: later runs would silently reuse it.
        sys.exit("No results were pasted; nothing was written.")

    csv_dir.mkdir(parents=True, exist_ok=True)
    csv_path.write_text(csv_values, encoding='utf-8')

# Probe the Hub so the user is told when the push targets an existing dataset.
try:
    dataset_info(repo_id)
    dataset_exists = True
except RepositoryNotFoundError:
    dataset_exists = False

if dataset_exists:
    print(f"Dataset {repo_id} already exists on the Hub; pushing to the existing repository.")

# The pasted results are read as tab-separated values.
dataset = Dataset.from_csv(str(csv_path), delimiter='\t')
dataset.push_to_hub(repo_id, private=True)
print("Dataset is pushed to Hub.")