Spaces:
Running
Running
Merge pull request #5 from lhoestq/load-local-only-by-default
Browse files- tagging_app.py +12 -7
tagging_app.py
CHANGED
@@ -3,12 +3,18 @@ import datasets
|
|
3 |
import json
|
4 |
import os
|
5 |
import streamlit as st
|
|
|
6 |
import yaml
|
7 |
-
|
8 |
from dataclasses import asdict
|
|
|
|
|
|
|
9 |
from glob import glob
|
10 |
from os.path import join as pjoin
|
11 |
|
|
|
|
|
|
|
12 |
st.set_page_config(
|
13 |
page_title="HF Dataset Tagging App",
|
14 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
@@ -132,7 +138,7 @@ def load_all_dataset_infos(dataset_list):
|
|
132 |
def load_existing_tags():
|
133 |
has_tags = {}
|
134 |
for fname in glob("saved_tags/*/*/tags.json"):
|
135 |
-
_, did, cid, _ = fname.split("/")
|
136 |
has_tags[did] = has_tags.get(did, {})
|
137 |
has_tags[did][cid] = fname
|
138 |
return has_tags
|
@@ -160,9 +166,9 @@ to pre-load the tag sets from another dataset or configuration to avoid too much
|
|
160 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
161 |
"""
|
162 |
|
163 |
-
all_dataset_ids = copy.deepcopy(get_dataset_list())
|
164 |
existing_tag_sets = load_existing_tags()
|
165 |
-
all_dataset_infos = load_all_dataset_infos(all_dataset_ids)
|
|
|
166 |
|
167 |
st.sidebar.markdown(app_desc)
|
168 |
|
@@ -181,6 +187,7 @@ dataset_id = st.sidebar.selectbox(
|
|
181 |
index=0,
|
182 |
)
|
183 |
|
|
|
184 |
if dataset_id == "local dataset":
|
185 |
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
186 |
if path_to_info not in ["/path/to/dataset/", ""]:
|
@@ -249,8 +256,6 @@ c2.markdown(f"### Writing tags for: {dataset_id} / {config_id}")
|
|
249 |
##########
|
250 |
c2.markdown("#### Pre-loading an existing tag set")
|
251 |
|
252 |
-
existing_tag_sets = load_existing_tags()
|
253 |
-
|
254 |
pre_loaded = {
|
255 |
"task_categories": [],
|
256 |
"task_ids": [],
|
@@ -442,7 +447,7 @@ with c3.beta_expander("Show JSON output for the current config"):
|
|
442 |
|
443 |
with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
|
444 |
task_saved_configs = dict([
|
445 |
-
(fname.split("/")[-2], json.load(open(fname)))
|
446 |
for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
|
447 |
])
|
448 |
aggregate_config = {}
|
|
|
3 |
import json
|
4 |
import os
|
5 |
import streamlit as st
|
6 |
+
import sys
|
7 |
import yaml
|
|
|
8 |
from dataclasses import asdict
|
9 |
+
from pathlib import Path
|
10 |
+
from typing import Dict
|
11 |
+
|
12 |
from glob import glob
|
13 |
from os.path import join as pjoin
|
14 |
|
15 |
+
|
16 |
+
load_remote_datasets = "--load_remote_datasets" in sys.argv[1:]
|
17 |
+
|
18 |
st.set_page_config(
|
19 |
page_title="HF Dataset Tagging App",
|
20 |
page_icon="https://huggingface.co/front/assets/huggingface_logo.svg",
|
|
|
138 |
def load_existing_tags():
|
139 |
has_tags = {}
|
140 |
for fname in glob("saved_tags/*/*/tags.json"):
|
141 |
+
_, did, cid, _ = fname.split(os.sep)
|
142 |
has_tags[did] = has_tags.get(did, {})
|
143 |
has_tags[did][cid] = fname
|
144 |
return has_tags
|
|
|
166 |
The tag sets are saved in JSON format, but you can print a YAML version in the right-most column to copy-paste to the config README.md
|
167 |
"""
|
168 |
|
|
|
169 |
existing_tag_sets = load_existing_tags()
|
170 |
+
all_dataset_ids = list(existing_tag_sets.keys()) if not load_remote_datasets else copy.deepcopy(get_dataset_list())
|
171 |
+
all_dataset_infos = {} if not load_remote_datasets else load_all_dataset_infos(all_dataset_ids)
|
172 |
|
173 |
st.sidebar.markdown(app_desc)
|
174 |
|
|
|
187 |
index=0,
|
188 |
)
|
189 |
|
190 |
+
all_info_dicts = {}
|
191 |
if dataset_id == "local dataset":
|
192 |
path_to_info = st.sidebar.text_input("Please enter the path to the folder where the dataset_infos.json file was generated", "/path/to/dataset/")
|
193 |
if path_to_info not in ["/path/to/dataset/", ""]:
|
|
|
256 |
##########
|
257 |
c2.markdown("#### Pre-loading an existing tag set")
|
258 |
|
|
|
|
|
259 |
pre_loaded = {
|
260 |
"task_categories": [],
|
261 |
"task_ids": [],
|
|
|
447 |
|
448 |
with c3.beta_expander("Show YAML output aggregating the tags saved for all configs"):
|
449 |
task_saved_configs = dict([
|
450 |
+
(Path(fname).parent.name, json.load(open(fname)))
|
451 |
for fname in glob(f"saved_tags/{dataset_id}/*/tags.json")
|
452 |
])
|
453 |
aggregate_config = {}
|