Spaces:
Sleeping
Sleeping
ZeroCommand
committed on
Commit
·
55aeb04
1
Parent(s):
b5a969d
use global queue and fix write configs
Browse files
- app_text_classification.py +22 -20
- io_utils.py +32 -47
- pipe.py +4 -0
- run_jobs.py +2 -2
- text_classification.py +4 -0
- text_classification_ui_helpers.py +9 -9
app_text_classification.py
CHANGED
@@ -25,8 +25,12 @@ CONFIG_PATH = "./config.yaml"
|
|
25 |
|
26 |
|
27 |
def get_demo(demo):
|
|
|
28 |
with gr.Row():
|
29 |
gr.Markdown(INTRODUCTION_MD)
|
|
|
|
|
|
|
30 |
with gr.Row():
|
31 |
model_id_input = gr.Textbox(
|
32 |
label="Hugging Face model id",
|
@@ -64,11 +68,11 @@ def get_demo(demo):
|
|
64 |
|
65 |
with gr.Accordion(label="Model Wrap Advance Config (optional)", open=False):
|
66 |
run_local = gr.Checkbox(value=True, label="Run in this Space")
|
67 |
-
use_inference = read_inference_type(
|
68 |
run_inference = gr.Checkbox(value=use_inference, label="Run with Inference API")
|
69 |
|
70 |
with gr.Accordion(label="Scanner Advance Config (optional)", open=False):
|
71 |
-
selected = read_scanners(
|
72 |
# currently we remove data_leakage from the default scanners
|
73 |
# Reason: data_leakage barely raises any issues and takes too many requests
|
74 |
# when using inference API, causing rate limit error
|
@@ -86,13 +90,23 @@ def get_demo(demo):
|
|
86 |
)
|
87 |
|
88 |
with gr.Row():
|
89 |
-
uid = uuid.uuid4()
|
90 |
-
uid_label = gr.Textbox(
|
91 |
-
label="Evaluation ID:", value=uid, visible=False, interactive=False
|
92 |
-
)
|
93 |
logs = gr.Textbox(label="Giskard Bot Evaluation Log:", visible=False)
|
94 |
demo.load(get_logs_file, uid_label, logs, every=0.5)
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
gr.on(
|
97 |
triggers=[label.change for label in column_mappings],
|
98 |
fn=write_column_mapping_to_config,
|
@@ -100,6 +114,7 @@ def get_demo(demo):
|
|
100 |
dataset_id_input,
|
101 |
dataset_config_input,
|
102 |
dataset_split_input,
|
|
|
103 |
*column_mappings,
|
104 |
],
|
105 |
)
|
@@ -107,6 +122,7 @@ def get_demo(demo):
|
|
107 |
gr.on(
|
108 |
triggers=[
|
109 |
model_id_input.change,
|
|
|
110 |
dataset_config_input.change,
|
111 |
dataset_split_input.change,
|
112 |
],
|
@@ -125,20 +141,6 @@ def get_demo(demo):
|
|
125 |
],
|
126 |
)
|
127 |
|
128 |
-
dataset_id_input.blur(
|
129 |
-
check_dataset_and_get_config, dataset_id_input, dataset_config_input
|
130 |
-
)
|
131 |
-
|
132 |
-
dataset_config_input.change(
|
133 |
-
check_dataset_and_get_split,
|
134 |
-
inputs=[dataset_id_input, dataset_config_input],
|
135 |
-
outputs=[dataset_split_input],
|
136 |
-
)
|
137 |
-
|
138 |
-
scanners.change(write_scanners, inputs=scanners)
|
139 |
-
|
140 |
-
run_inference.change(write_inference_type, inputs=[run_inference])
|
141 |
-
|
142 |
gr.on(
|
143 |
triggers=[
|
144 |
run_btn.click,
|
|
|
25 |
|
26 |
|
27 |
def get_demo(demo):
|
28 |
+
uid = uuid.uuid4()
|
29 |
with gr.Row():
|
30 |
gr.Markdown(INTRODUCTION_MD)
|
31 |
+
uid_label = gr.Textbox(
|
32 |
+
label="Evaluation ID:", value=uid, visible=False, interactive=False
|
33 |
+
)
|
34 |
with gr.Row():
|
35 |
model_id_input = gr.Textbox(
|
36 |
label="Hugging Face model id",
|
|
|
68 |
|
69 |
with gr.Accordion(label="Model Wrap Advance Config (optional)", open=False):
|
70 |
run_local = gr.Checkbox(value=True, label="Run in this Space")
|
71 |
+
use_inference = read_inference_type(uid) == "hf_inference_api"
|
72 |
run_inference = gr.Checkbox(value=use_inference, label="Run with Inference API")
|
73 |
|
74 |
with gr.Accordion(label="Scanner Advance Config (optional)", open=False):
|
75 |
+
selected = read_scanners(uid)
|
76 |
# currently we remove data_leakage from the default scanners
|
77 |
# Reason: data_leakage barely raises any issues and takes too many requests
|
78 |
# when using inference API, causing rate limit error
|
|
|
90 |
)
|
91 |
|
92 |
with gr.Row():
|
|
|
|
|
|
|
|
|
93 |
logs = gr.Textbox(label="Giskard Bot Evaluation Log:", visible=False)
|
94 |
demo.load(get_logs_file, uid_label, logs, every=0.5)
|
95 |
|
96 |
+
dataset_id_input.change(
|
97 |
+
check_dataset_and_get_config, inputs=[dataset_id_input, uid_label], outputs=[dataset_config_input]
|
98 |
+
)
|
99 |
+
|
100 |
+
dataset_config_input.change(
|
101 |
+
check_dataset_and_get_split,
|
102 |
+
inputs=[dataset_id_input, dataset_config_input],
|
103 |
+
outputs=[dataset_split_input],
|
104 |
+
)
|
105 |
+
|
106 |
+
scanners.change(write_scanners, inputs=[scanners, uid_label])
|
107 |
+
|
108 |
+
run_inference.change(write_inference_type, inputs=[run_inference, uid_label])
|
109 |
+
|
110 |
gr.on(
|
111 |
triggers=[label.change for label in column_mappings],
|
112 |
fn=write_column_mapping_to_config,
|
|
|
114 |
dataset_id_input,
|
115 |
dataset_config_input,
|
116 |
dataset_split_input,
|
117 |
+
uid_label,
|
118 |
*column_mappings,
|
119 |
],
|
120 |
)
|
|
|
122 |
gr.on(
|
123 |
triggers=[
|
124 |
model_id_input.change,
|
125 |
+
dataset_id_input.change,
|
126 |
dataset_config_input.change,
|
127 |
dataset_split_input.change,
|
128 |
],
|
|
|
141 |
],
|
142 |
)
|
143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
144 |
gr.on(
|
145 |
triggers=[
|
146 |
run_btn.click,
|
io_utils.py
CHANGED
@@ -1,50 +1,56 @@
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
-
|
4 |
import yaml
|
5 |
|
6 |
-
YAML_PATH = "./
|
7 |
-
PIPE_PATH = "./tmp/pipe"
|
8 |
-
|
9 |
|
10 |
class Dumper(yaml.Dumper):
|
11 |
def increase_indent(self, flow=False, *args, **kwargs):
|
12 |
return super().increase_indent(flow=flow, indentless=False)
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# read scanners from yaml file
|
16 |
# return a list of scanners
|
17 |
-
def read_scanners(
|
18 |
scanners = []
|
19 |
-
with open(
|
20 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
21 |
scanners = config.get("detectors", [])
|
|
|
22 |
return scanners
|
23 |
|
24 |
|
25 |
# convert a list of scanners to yaml file
|
26 |
-
def write_scanners(scanners):
|
27 |
-
|
28 |
-
with open(YAML_PATH, "r+") as f:
|
29 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
30 |
if config:
|
31 |
config["detectors"] = scanners
|
32 |
# save scanners to detectors in yaml
|
33 |
yaml.dump(config, f, Dumper=Dumper)
|
|
|
34 |
|
35 |
|
36 |
# read model_type from yaml file
|
37 |
-
def read_inference_type(
|
38 |
inference_type = ""
|
39 |
-
with open(
|
40 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
41 |
inference_type = config.get("inference_type", "")
|
|
|
42 |
return inference_type
|
43 |
|
44 |
|
45 |
# write model_type to yaml file
|
46 |
-
def write_inference_type(use_inference):
|
47 |
-
with open(
|
48 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
49 |
if use_inference:
|
50 |
config["inference_type"] = "hf_inference_api"
|
@@ -52,31 +58,34 @@ def write_inference_type(use_inference):
|
|
52 |
config["inference_type"] = "hf_pipeline"
|
53 |
# save inference_type to inference_type in yaml
|
54 |
yaml.dump(config, f, Dumper=Dumper)
|
55 |
-
|
56 |
|
57 |
# read column mapping from yaml file
|
58 |
-
def read_column_mapping(
|
59 |
column_mapping = {}
|
60 |
-
with open(
|
61 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
62 |
if config:
|
63 |
column_mapping = config.get("column_mapping", dict())
|
|
|
64 |
return column_mapping
|
65 |
|
66 |
|
67 |
# write column mapping to yaml file
|
68 |
-
def write_column_mapping(mapping):
|
69 |
-
with open(
|
70 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
|
|
71 |
if config is None:
|
72 |
return
|
73 |
if mapping is None and "column_mapping" in config.keys():
|
74 |
del config["column_mapping"]
|
75 |
else:
|
76 |
config["column_mapping"] = mapping
|
77 |
-
with open(
|
78 |
# save column_mapping to column_mapping in yaml
|
79 |
yaml.dump(config, f, Dumper=Dumper)
|
|
|
80 |
|
81 |
|
82 |
# convert column mapping dataframe to json
|
@@ -102,39 +111,15 @@ def write_log_to_user_file(id, log):
|
|
102 |
|
103 |
|
104 |
def save_job_to_pipe(id, job, lock):
|
105 |
-
if not os.path.exists("./tmp"):
|
106 |
-
os.makedirs("./tmp")
|
107 |
-
job = [str(i) for i in job]
|
108 |
-
job = ",".join(job)
|
109 |
-
print(job)
|
110 |
with lock:
|
111 |
-
|
112 |
-
# write each element in job
|
113 |
-
f.write(f"{id}@{job}\n")
|
114 |
-
|
115 |
|
116 |
def pop_job_from_pipe():
|
117 |
-
if
|
118 |
return
|
119 |
-
|
120 |
-
job = f.readline().strip()
|
121 |
-
remaining = f.readlines()
|
122 |
-
f.close()
|
123 |
-
with open(PIPE_PATH, "w") as f:
|
124 |
-
f.write("\n".join(remaining))
|
125 |
-
f.close()
|
126 |
-
if len(job) == 0:
|
127 |
-
return
|
128 |
-
job_info = job.split("\n")[0].split("@")
|
129 |
-
if len(job_info) != 2:
|
130 |
-
raise ValueError("Invalid job info: ", job_info)
|
131 |
-
|
132 |
write_log_to_user_file(job_info[0], f"Running job id {job_info[0]}\n")
|
133 |
-
command = job_info[1]
|
134 |
-
masked_command = command.copy()
|
135 |
-
hf_token_index = masked_command.index("--hf_token")
|
136 |
-
masked_command[hf_token_index + 1] = "hf_********"
|
137 |
-
write_log_to_user_file(job_info[0], f"Running command {masked_command}\n")
|
138 |
|
139 |
log_file = open(f"./tmp/{job_info[0]}_log", "a")
|
140 |
subprocess.Popen(
|
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
+
import pipe
|
4 |
import yaml
|
5 |
|
6 |
+
YAML_PATH = "./configs"
|
|
|
|
|
7 |
|
8 |
class Dumper(yaml.Dumper):
|
9 |
def increase_indent(self, flow=False, *args, **kwargs):
|
10 |
return super().increase_indent(flow=flow, indentless=False)
|
11 |
|
12 |
+
def get_yaml_path(uid):
|
13 |
+
if not os.path.exists(YAML_PATH):
|
14 |
+
os.makedirs(YAML_PATH)
|
15 |
+
if not os.path.exists(f"{YAML_PATH}/{uid}_config.yaml"):
|
16 |
+
os.system(f"cp {YAML_PATH}/config.yaml {YAML_PATH}/{uid}_config.yaml")
|
17 |
+
return f"{YAML_PATH}/{uid}_config.yaml"
|
18 |
|
19 |
# read scanners from yaml file
|
20 |
# return a list of scanners
|
21 |
+
def read_scanners(uid):
|
22 |
scanners = []
|
23 |
+
with open(get_yaml_path(uid), "r") as f:
|
24 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
25 |
scanners = config.get("detectors", [])
|
26 |
+
f.close()
|
27 |
return scanners
|
28 |
|
29 |
|
30 |
# convert a list of scanners to yaml file
|
31 |
+
def write_scanners(scanners, uid):
|
32 |
+
with open(get_yaml_path(uid), "r+") as f:
|
|
|
33 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
34 |
if config:
|
35 |
config["detectors"] = scanners
|
36 |
# save scanners to detectors in yaml
|
37 |
yaml.dump(config, f, Dumper=Dumper)
|
38 |
+
f.close()
|
39 |
|
40 |
|
41 |
# read model_type from yaml file
|
42 |
+
def read_inference_type(uid):
|
43 |
inference_type = ""
|
44 |
+
with open(get_yaml_path(uid), "r") as f:
|
45 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
46 |
inference_type = config.get("inference_type", "")
|
47 |
+
f.close()
|
48 |
return inference_type
|
49 |
|
50 |
|
51 |
# write model_type to yaml file
|
52 |
+
def write_inference_type(use_inference, uid):
|
53 |
+
with open(get_yaml_path(uid), "r+") as f:
|
54 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
55 |
if use_inference:
|
56 |
config["inference_type"] = "hf_inference_api"
|
|
|
58 |
config["inference_type"] = "hf_pipeline"
|
59 |
# save inference_type to inference_type in yaml
|
60 |
yaml.dump(config, f, Dumper=Dumper)
|
61 |
+
f.close()
|
62 |
|
63 |
# read column mapping from yaml file
|
64 |
+
def read_column_mapping(uid):
|
65 |
column_mapping = {}
|
66 |
+
with open(get_yaml_path(uid), "r") as f:
|
67 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
68 |
if config:
|
69 |
column_mapping = config.get("column_mapping", dict())
|
70 |
+
f.close()
|
71 |
return column_mapping
|
72 |
|
73 |
|
74 |
# write column mapping to yaml file
|
75 |
+
def write_column_mapping(mapping, uid):
|
76 |
+
with open(get_yaml_path(uid), "r") as f:
|
77 |
config = yaml.load(f, Loader=yaml.FullLoader)
|
78 |
+
f.close()
|
79 |
if config is None:
|
80 |
return
|
81 |
if mapping is None and "column_mapping" in config.keys():
|
82 |
del config["column_mapping"]
|
83 |
else:
|
84 |
config["column_mapping"] = mapping
|
85 |
+
with open(get_yaml_path(uid), "w") as f:
|
86 |
# save column_mapping to column_mapping in yaml
|
87 |
yaml.dump(config, f, Dumper=Dumper)
|
88 |
+
f.close()
|
89 |
|
90 |
|
91 |
# convert column mapping dataframe to json
|
|
|
111 |
|
112 |
|
113 |
def save_job_to_pipe(id, job, lock):
|
|
|
|
|
|
|
|
|
|
|
114 |
with lock:
|
115 |
+
pipe.jobs.append((id, job))
|
|
|
|
|
|
|
116 |
|
117 |
def pop_job_from_pipe():
|
118 |
+
if len(pipe.jobs) == 0:
|
119 |
return
|
120 |
+
job_info = pipe.jobs.pop()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
write_log_to_user_file(job_info[0], f"Running job id {job_info[0]}\n")
|
122 |
+
command = job_info[1]
|
|
|
|
|
|
|
|
|
123 |
|
124 |
log_file = open(f"./tmp/{job_info[0]}_log", "a")
|
125 |
subprocess.Popen(
|
pipe.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
def init():
|
3 |
+
global jobs
|
4 |
+
jobs = list()
|
run_jobs.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
import threading
|
2 |
import time
|
3 |
-
|
4 |
from io_utils import pop_job_from_pipe
|
5 |
|
6 |
|
@@ -11,6 +11,7 @@ def start_process_run_job():
|
|
11 |
thread = threading.Thread(target=run_job)
|
12 |
thread.daemon = True
|
13 |
thread.do_run = True
|
|
|
14 |
thread.start()
|
15 |
|
16 |
except Exception as e:
|
@@ -24,7 +25,6 @@ def stop_thread():
|
|
24 |
|
25 |
def run_job():
|
26 |
while True:
|
27 |
-
print(thread.do_run)
|
28 |
try:
|
29 |
pop_job_from_pipe()
|
30 |
time.sleep(10)
|
|
|
1 |
import threading
|
2 |
import time
|
3 |
+
import pipe
|
4 |
from io_utils import pop_job_from_pipe
|
5 |
|
6 |
|
|
|
11 |
thread = threading.Thread(target=run_job)
|
12 |
thread.daemon = True
|
13 |
thread.do_run = True
|
14 |
+
pipe.init()
|
15 |
thread.start()
|
16 |
|
17 |
except Exception as e:
|
|
|
25 |
|
26 |
def run_job():
|
27 |
while True:
|
|
|
28 |
try:
|
29 |
pop_job_from_pipe()
|
30 |
time.sleep(10)
|
text_classification.py
CHANGED
@@ -8,6 +8,10 @@ from transformers import pipeline
|
|
8 |
|
9 |
|
10 |
def get_labels_and_features_from_dataset(dataset_id, dataset_config, split):
|
|
|
|
|
|
|
|
|
11 |
try:
|
12 |
ds = datasets.load_dataset(dataset_id, dataset_config)[split]
|
13 |
dataset_features = ds.features
|
|
|
8 |
|
9 |
|
10 |
def get_labels_and_features_from_dataset(dataset_id, dataset_config, split):
|
11 |
+
if not dataset_config:
|
12 |
+
dataset_config = 'default'
|
13 |
+
if not split:
|
14 |
+
split = 'train'
|
15 |
try:
|
16 |
ds = datasets.load_dataset(dataset_id, dataset_config)[split]
|
17 |
dataset_features = ds.features
|
text_classification_ui_helpers.py
CHANGED
@@ -27,12 +27,10 @@ MAX_FEATURES = 20
|
|
27 |
HF_REPO_ID = "HF_REPO_ID"
|
28 |
HF_SPACE_ID = "SPACE_ID"
|
29 |
HF_WRITE_TOKEN = "HF_WRITE_TOKEN"
|
30 |
-
CONFIG_PATH = "./config.yaml"
|
31 |
|
32 |
-
|
33 |
-
def check_dataset_and_get_config(dataset_id):
|
34 |
try:
|
35 |
-
write_column_mapping(None)
|
36 |
configs = datasets.get_dataset_config_names(dataset_id)
|
37 |
return gr.Dropdown(configs, value=configs[0], visible=True)
|
38 |
except Exception:
|
@@ -50,14 +48,16 @@ def check_dataset_and_get_split(dataset_id, dataset_config):
|
|
50 |
pass
|
51 |
|
52 |
|
53 |
-
def write_column_mapping_to_config(dataset_id, dataset_config, dataset_split, *labels):
|
54 |
-
|
|
|
|
|
55 |
dataset_id, dataset_config, dataset_split
|
56 |
)
|
57 |
if labels is None:
|
58 |
return
|
59 |
labels = [*labels]
|
60 |
-
all_mappings = read_column_mapping(
|
61 |
|
62 |
if all_mappings is None:
|
63 |
all_mappings = dict()
|
@@ -73,7 +73,7 @@ def write_column_mapping_to_config(dataset_id, dataset_config, dataset_split, *l
|
|
73 |
if feat:
|
74 |
# TODO: Substitute 'text' with more features for zero-shot
|
75 |
all_mappings["features"]["text"] = feat
|
76 |
-
write_column_mapping(all_mappings)
|
77 |
|
78 |
|
79 |
def list_labels_and_features_from_dataset(ds_labels, ds_features, model_id2label):
|
@@ -178,7 +178,7 @@ def check_model_and_show_prediction(
|
|
178 |
|
179 |
|
180 |
def try_submit(m_id, d_id, config, split, local, uid):
|
181 |
-
all_mappings = read_column_mapping(
|
182 |
|
183 |
if all_mappings is None:
|
184 |
gr.Warning(CONFIRM_MAPPING_DETAILS_FAIL_RAW)
|
|
|
27 |
HF_REPO_ID = "HF_REPO_ID"
|
28 |
HF_SPACE_ID = "SPACE_ID"
|
29 |
HF_WRITE_TOKEN = "HF_WRITE_TOKEN"
|
|
|
30 |
|
31 |
+
def check_dataset_and_get_config(dataset_id, uid):
|
|
|
32 |
try:
|
33 |
+
write_column_mapping(None, uid) # reset column mapping
|
34 |
configs = datasets.get_dataset_config_names(dataset_id)
|
35 |
return gr.Dropdown(configs, value=configs[0], visible=True)
|
36 |
except Exception:
|
|
|
48 |
pass
|
49 |
|
50 |
|
51 |
+
def write_column_mapping_to_config(dataset_id, dataset_config, dataset_split, uid, *labels):
|
52 |
+
# TODO: Substitute 'text' with more features for zero-shot
|
53 |
+
# we are not using ds features because we only support "text" for now
|
54 |
+
ds_labels, _ = get_labels_and_features_from_dataset(
|
55 |
dataset_id, dataset_config, dataset_split
|
56 |
)
|
57 |
if labels is None:
|
58 |
return
|
59 |
labels = [*labels]
|
60 |
+
all_mappings = read_column_mapping(uid)
|
61 |
|
62 |
if all_mappings is None:
|
63 |
all_mappings = dict()
|
|
|
73 |
if feat:
|
74 |
# TODO: Substitute 'text' with more features for zero-shot
|
75 |
all_mappings["features"]["text"] = feat
|
76 |
+
write_column_mapping(all_mappings, uid)
|
77 |
|
78 |
|
79 |
def list_labels_and_features_from_dataset(ds_labels, ds_features, model_id2label):
|
|
|
178 |
|
179 |
|
180 |
def try_submit(m_id, d_id, config, split, local, uid):
|
181 |
+
all_mappings = read_column_mapping(uid)
|
182 |
|
183 |
if all_mappings is None:
|
184 |
gr.Warning(CONFIRM_MAPPING_DETAILS_FAIL_RAW)
|