Upload 4 files
Browse files- app.py +101 -0
- defaults.py +7 -0
- hub.py +50 -0
- pages/🧑🌾 Domain Data Grower.py +15 -0
app.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
from regex import F
|
4 |
+
from defaults import (
|
5 |
+
DEFAULT_DOMAIN,
|
6 |
+
)
|
7 |
+
from hub import (
|
8 |
+
setup_dataset_on_hub,
|
9 |
+
duplicate_space_on_hub,
|
10 |
+
add_project_config_to_space_repo,
|
11 |
+
)
|
12 |
+
|
13 |
+
import streamlit as st
|
14 |
+
|
15 |
+
st.set_page_config(page_title="Domain Data Grower", page_icon="🧑🌾")
st.header("🧑🌾 Domain Data Grower")
st.divider()

################################################################################
# APP MARKDOWN
################################################################################

# Introductory copy shown under the second header; kept as one literal so the
# Markdown list and paragraph breaks stay exactly as authored.
_INTRO_MARKDOWN = """This space will set up your domain specific dataset project. It will
create the resources that you need to build a dataset. Those resources include:

- A dataset repository on the Hub
- Another space to define expert domain and run generation pipelines

For a complete overview of the project. Check out the README
"""

st.header("🌱 Create a domain specific dataset")
st.markdown(_INTRO_MARKDOWN)

# Navigation link to the page script under pages/.
st.page_link(
    page="pages/🧑🌾 Domain Data Grower.py",
    label="Domain Data Grower",
    icon="🧑🌾",
)
|
41 |
+
|
42 |
+
################################################################################
|
43 |
+
# CONFIGURATION
|
44 |
+
################################################################################
|
45 |
+
|
46 |
+
st.subheader("🌾 Project Configuration")

# Form inputs. They live at module level because the button handler below
# reads them directly.
project_name = st.text_input("Project Name", DEFAULT_DOMAIN)
hub_username = st.text_input("Hub Username", "argilla")
hub_token = st.text_input("Hub Token", type="password")
private_selector = st.checkbox("Private Space", value=False)

if st.button("🤗 Setup Project Resources"):
    # Everything is created under the namespace typed into "Hub Username".
    repo_id = f"{hub_username}/{project_name}"
    space_name = f"{project_name}_config_space"
    argilla_name = f"{project_name}_argilla_space"

    # Fully qualified Space ids. These exact ids are used for duplication, for
    # the links shown to the user and for the config upload at the end, so all
    # three always refer to the same repositories. Passing only the bare name
    # to the duplication call lets the Hub client choose the owner (per the
    # huggingface_hub docs: the account owning the token), which diverges from
    # `hub_username` whenever that is an organisation or another account.
    project_space_repo_id = f"{hub_username}/{space_name}"
    argilla_space_repo_id = f"{hub_username}/{argilla_name}"

    # 1. Dataset repository.
    setup_dataset_on_hub(
        repo_id=repo_id,
        hub_token=hub_token,
    )

    st.success(
        f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name}). Hold on the repo_id: {repo_id}, we will need it in the next steps."
    )

    # 2. Configuration Space, duplicated from the project template.
    duplicate_space_on_hub(
        source_repo="argilla/domain-specific-datasets-template",
        target_repo=project_space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Configuration Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{space_name})."
    )

    # 3. Argilla Space, duplicated from the Argilla template.
    duplicate_space_on_hub(
        source_repo="argilla/argilla-template-space",
        target_repo=argilla_space_repo_id,
        hub_token=hub_token,
        private=private_selector,
    )

    st.success(
        f"Argilla Space created. Check it out [here](https://huggingface.co/spaces/{hub_username}/{argilla_name})."
    )

    # 4. Push the project configuration into the configuration Space.
    # NOTE(review): the fixed pause presumably gives the Hub time to finish
    # setting up the freshly duplicated Spaces before a file is committed to
    # one of them - confirm whether it is still needed.
    seconds = 5

    with st.spinner(f"Adding project configuration to spaces in {seconds} seconds"):
        time.sleep(seconds)
        add_project_config_to_space_repo(
            dataset_repo_id=repo_id,
            hub_token=hub_token,
            project_name=project_name,
            argilla_space_repo_id=argilla_space_repo_id,
            project_space_repo_id=project_space_repo_id,
        )
|
defaults.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
# Location of the seed data. The path is relative, so it is resolved against
# the working directory of the process that imports this module.
SEED_DATA_PATH = "seed_data.json"

# Read with an explicit UTF-8 encoding: without it, open() decodes using the
# platform's locale encoding, which can mis-read non-ASCII text in the JSON on
# systems whose default is not UTF-8.
with open(SEED_DATA_PATH, encoding="utf-8") as f:
    # Parsed seed data, loaded once at import time.
    DEFAULT_DATA = json.load(f)

# Default project/domain name, taken from the "domain" key of the seed data.
DEFAULT_DOMAIN = DEFAULT_DATA["domain"]
|
hub.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
from huggingface_hub import duplicate_space, HfApi
|
4 |
+
|
5 |
+
|
6 |
+
# Module-wide Hub client, created without credentials; the functions in this
# module that use it pass the user's token explicitly on each call.
hf_api = HfApi()
|
7 |
+
|
8 |
+
|
9 |
+
def setup_dataset_on_hub(repo_id, hub_token):
    """Create an empty dataset repository on the Hub.

    Args:
        repo_id: Id of the dataset repository to create; the caller in this
            project passes it as ``"<namespace>/<name>"``.
        hub_token: Hub access token used to authenticate the request.

    Returns:
        None. The value returned by the Hub client is discarded.
    """
    hf_api.create_repo(repo_id=repo_id, repo_type="dataset", token=hub_token)
|
16 |
+
|
17 |
+
|
18 |
+
def duplicate_space_on_hub(source_repo, target_repo, hub_token, private=False):
    """Duplicate an existing Space into a new Space on the Hub.

    Args:
        source_repo: Id of the Space to copy from.
        target_repo: Id of the Space to create.
        hub_token: Hub access token used to authenticate the request.
        private: Whether the new Space should be private. Defaults to False.

    Returns:
        None. The value returned by ``duplicate_space`` is discarded.
    """
    duplicate_space(
        from_id=source_repo,
        to_id=target_repo,
        private=private,
        token=hub_token,
    )
|
22 |
+
|
23 |
+
|
24 |
+
def add_project_config_to_space_repo(
    dataset_repo_id,
    hub_token,
    project_name,
    argilla_space_repo_id,
    project_space_repo_id,
):
    """Upload a ``project_config.json`` file to the project Space.

    The file is a JSON object with the keys ``project_name``,
    ``argilla_space_repo_id``, ``project_space_repo_id`` and
    ``dataset_repo_id``. It is committed at the root of the Space identified
    by ``project_space_repo_id``. Nothing is written to local disk.

    Args:
        dataset_repo_id: Id of the project's dataset repository.
        hub_token: Hub access token used to authenticate the upload.
        project_name: Name of the project.
        argilla_space_repo_id: Id of the project's Argilla Space.
        project_space_repo_id: Id of the Space that receives the file.

    Returns:
        None. The value returned by the Hub client is discarded.
    """
    project_config = {
        "project_name": project_name,
        "argilla_space_repo_id": argilla_space_repo_id,
        "project_space_repo_id": project_space_repo_id,
        "dataset_repo_id": dataset_repo_id,
    }

    # Serialize in memory instead of going through a fixed-name file in the
    # working directory: that file was shared by every caller of this function
    # in the process, so two overlapping calls could upload each other's
    # content, and it was left behind afterwards. json.dumps emits ASCII-only
    # text by default, so the uploaded bytes match what json.dump wrote.
    payload = json.dumps(project_config).encode("utf-8")

    hf_api.upload_file(
        path_or_fileobj=payload,
        path_in_repo="project_config.json",
        token=hub_token,
        repo_id=project_space_repo_id,
        repo_type="space",
    )
|
pages/🧑🌾 Domain Data Grower.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import requests
|
3 |
+
|
4 |
+
|
5 |
+
# Raw GitHub URL of the project README. The path contains a full commit hash,
# so it points at one fixed revision of the file rather than a moving branch.
readme_location = "https://raw.githubusercontent.com/huggingface/data-is-better-together/4d7848149dcfe575b86517ca15e4aaa09dc9db74/domain-specific-datasets/README.md"
|
6 |
+
|
7 |
+
|
8 |
+
def open_markdown_file(url, timeout=10):
    """Download a text document over HTTP and return its body.

    Args:
        url: Address of the document to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on an unresponsive
            host, which would hang the page.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of returning the error page's
    # body, which the caller would otherwise render as if it were the README.
    response.raise_for_status()
    return response.text
|
11 |
+
|
12 |
+
|
13 |
+
# Fetched at module top level, so the download happens each time this page
# script is executed.
readme = open_markdown_file(readme_location)
|
14 |
+
|
15 |
+
# Render the downloaded README text as Markdown on this page.
st.markdown(readme)
|