Spaces:
Running
Running
File size: 6,172 Bytes
73e0168 f045267 73e0168 f045267 6017ce1 73e0168 f045267 73e0168 f045267 73e0168 f045267 bdb8322 73e0168 f045267 73e0168 f045267 73e0168 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import re
import subprocess
import yaml
import gradio as gr
import requests
from huggingface_hub import HfApi
# Command line of the job script that this UI wraps.
CMD = ["python", "run.py"]

# The title and emoji come from the YAML front matter of README.md, i.e. the
# text between the first two "---" lines. The file is read explicitly as UTF-8
# because it carries an emoji, which a locale-dependent default encoding may
# fail to decode.
with open("README.md", encoding="utf-8") as f:
    METADATA = yaml.safe_load(f.read().split("---\n")[1])
TITLE = METADATA["title"]
EMOJI = METADATA["emoji"]

# HELP is the decoded `--help` output of the job script (stdout, or stderr when
# stdout is empty), or False when the script exits with a non-zero status or
# cannot be run at all. Probing is best-effort: any failure disables the help
# page and the dry-run button instead of crashing the app at import time.
try:
    process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    HELP = (process.stdout or process.stderr).decode() if process.returncode == 0 else False
except Exception:
    HELP = False

# DRY_RUN is the dry-run flag exactly as spelled in the help text
# ("--dry-run" or "--dry_run"), or False when the help text does not mention it.
DRY_RUN = bool(HELP) and bool(m := re.search(r"--dry(-|_)run", HELP)) and m.group(0)
def update_pbars(pbars: dict[str, float], line: str):
    """Update the progress-bar table in place from one line of job output.

    A line is treated as a progress-bar line when it contains a percentage
    (e.g. ``42%`` or ``42.5%``) and one of the bar glyphs ``|``, ``█`` or ``▌``
    appears within the 10 characters that follow the line's first ``%``.
    For such a line, bars already at 100% are dropped, then the bar named by
    the text preceding the percentage (``"Progress"`` when that text is empty)
    is set to the parsed value divided by 100. Other lines leave ``pbars``
    untouched.

    Args:
        pbars: Mapping of bar description to completion fraction; mutated in place.
        line: One decoded line of output.
    """
    percent_match = re.search(r"\d+(?:\.\d+)?%", line)
    if not percent_match:
        return
    if not any(c in line.split("%")[1][:10] for c in "|█▌"):
        return
    # Snapshot the finished bars before deleting them: popping entries while
    # iterating ``pbars.items()`` raises
    # "RuntimeError: dictionary changed size during iteration".
    finished = [desc for desc, percent in pbars.items() if percent == 1.0]
    for desc in finished:
        del pbars[desc]
    percent = float(percent_match.group(0)[:-1]) / 100
    desc = line[:percent_match.start()].strip() or "Progress"
    pbars[desc] = percent
def dry_run(src, config, split, dst, query):
    """Run the job script with the dry-run flag and stream its stdout to the UI.

    Args:
        src: Source dataset, passed as ``--src``.
        config: Source subset, passed as ``--config``.
        split: Source split, passed as ``--split``.
        dst: Destination dataset, passed as ``--dst``.
        query: SQL query, passed as ``--query``.

    Yields:
        After every line of output, an update dict mapping ``output_markdown``
        to the log text accumulated so far and ``progress_labels`` to a hidden
        ``gr.Label``.

    Raises:
        gr.Error: If any of the five arguments is empty.
    """
    if not all([src, config, split, dst, query]):
        raise gr.Error("Please fill source, destination and query.")
    args = CMD + ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
    logs = ""
    # The context manager closes the stdout pipe and waits for the child when
    # the stream ends (or when this generator is closed early), so no pipe or
    # zombie process is left behind.
    with subprocess.Popen(args, stdout=subprocess.PIPE) as process:
        for line in process.stdout:
            logs += line.decode()
            yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
def run(src, config, split, dst, query):
    """Handler for the "Run" button; execution is not implemented yet.

    Raises:
        gr.Error: With a "please fill" message if any argument is empty,
            otherwise with ``"NotImplemented"``. The function never returns.
    """
    for field in (src, config, split, dst, query):
        if not field:
            raise gr.Error("Please fill source, destination and query.")
    raise gr.Error("NotImplemented")
# Library entries returned by the compatible-libraries endpoint are kept only
# when their "function" field is one of these names.
READ_FUNCTIONS = (
    "pl.read_parquet",
    "pl.read_csv",
    "pl.read_json",
)

# Maximum number of trending datasets requested for the source dropdown.
NUM_TRENDING_DATASETS = 10
# Main page: header, source/destination selectors, SQL query box, action
# buttons and output area.
# NOTE(review): the nesting below is reconstructed from a copy of the file
# whose indentation was lost; confirm it against the intended layout.
with gr.Blocks() as demo:
    # Header row: title on one side, login button on the other.
    with gr.Row():
        with gr.Column(scale=10):
            gr.Markdown(f"# {TITLE} {EMOJI}")
        with gr.Column():
            gr.LoginButton(scale=0.1)

    # Source selectors, an arrow, then the destination selector.
    with gr.Row():
        with gr.Column():
            with gr.Row():
                # Hidden holder for the loading codes of the selected dataset.
                loading_codes_json = gr.JSON([], visible=False)
                dataset_dropdown = gr.Dropdown(
                    label="Source Dataset",
                    allow_custom_value=True,
                    scale=10,
                )
                subset_dropdown = gr.Dropdown(
                    info="Subset",
                    allow_custom_value=True,
                    show_label=False,
                    visible=False,
                )
                split_dropdown = gr.Dropdown(
                    info="Split",
                    allow_custom_value=True,
                    show_label=False,
                    visible=False,
                )
        with gr.Column(scale=0.1, min_width=60):
            gr.HTML("<div style='font-size: 4em;'>→</div>")
        with gr.Column():
            dst_dropdown = gr.Dropdown(
                label="Destination Dataset",
                allow_custom_value=True,
            )

    query_textarea = gr.TextArea(
        label="SQL Query",
        placeholder="SELECT * FROM src;",
        value="SELECT * FROM src;",
        container=False,
        show_label=False,
    )

    with gr.Row():
        run_button = gr.Button("Run", scale=10, variant="primary")
        # The dry-run button exists only when the job script advertises the flag.
        if DRY_RUN:
            dry_run_button = gr.Button("Dry-Run")

    progress_labels = gr.Label(visible=False, label="Progress")
    output_markdown = gr.Markdown(label="Output logs")

    # Both buttons read the same five fields and write to the same two outputs.
    job_inputs = [dataset_dropdown, subset_dropdown, split_dropdown, dst_dropdown, query_textarea]
    job_outputs = [progress_labels, output_markdown]
    run_button.click(run, inputs=job_inputs, outputs=job_outputs)
    if DRY_RUN:
        dry_run_button.click(dry_run, inputs=job_inputs, outputs=job_outputs)
def show_subset_dropdown(dataset: str):
    """Compute the subset dropdown settings for ``dataset``.

    Queries the datasets-server ``compatible-libraries`` endpoint and keeps the
    loading codes of the first library whose function is in ``READ_FUNCTIONS``.

    Args:
        dataset: Dataset name; expected to contain a ``/`` that is neither its
            first nor its last character.

    Returns:
        A ``(dropdown_kwargs, loading_codes)`` pair. ``dropdown_kwargs`` holds
        ``choices``, ``value``, ``visible`` and ``key`` for ``gr.Dropdown``;
        ``loading_codes`` is the list the subsets were read from. For an empty
        or slash-less dataset name both are empty and no request is made, so
        callers can always unpack two values.
    """
    loading_codes = []
    if dataset and "/" in dataset.strip().strip("/"):
        # ``params`` URL-encodes the name, which users may type freely.
        resp = requests.get(
            "https://datasets-server.huggingface.co/compatible-libraries",
            params={"dataset": dataset},
            timeout=3,
        ).json()
        matching = [lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_FUNCTIONS]
        loading_codes = (matching[0] if matching else []) or []
    subsets = [loading_code["config_name"] for loading_code in loading_codes]
    subset = subsets[0] if subsets else ""
    settings = dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes)))
    return settings, loading_codes
def show_split_dropdown(subset: str, loading_codes: list[dict]):
    """Compute the split dropdown settings for ``subset``.

    Args:
        subset: Config name to look up in ``loading_codes``.
        loading_codes: Entries with a ``"config_name"`` key and an
            ``"arguments"`` dict containing ``"splits"``.

    Returns:
        A dict with ``choices`` (split names of the first entry matching
        ``subset``, or an empty list), ``value`` (first split or ``""``),
        ``visible`` (true only with more than one split) and ``key``.
    """
    splits_per_match = []
    for code in loading_codes:
        if code["config_name"] == subset:
            splits_per_match.append(list(code["arguments"]["splits"]))
    split_names = splits_per_match[0] if splits_per_match else []
    default_split = split_names[0] if split_names else ""
    return {
        "choices": split_names,
        "value": default_split,
        "visible": len(split_names) > 1,
        "key": hash(str(loading_codes) + subset),
    }
# NOTE(review): indentation was lost in this copy of the file; this handler
# presumably belongs inside the `with gr.Blocks() as demo:` body — confirm.
@demo.load(outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown])
def _fetch_datasets(request: gr.Request):
    """Fill the source selectors when the page loads.

    The dataset dropdown offers a fixed default dataset followed by the
    datasets returned by ``HfApi().list_datasets`` sorted by ``trendingScore``
    (limited to ``NUM_TRENDING_DATASETS``, default dataset not repeated).
    The subset and split dropdowns and the stored loading codes are
    initialised for the default dataset. ``request`` is not used.
    """
    default_dataset = "CohereForAI/Global-MMLU"
    trending = HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1)
    dataset_choices = [default_dataset]
    dataset_choices.extend(ds.id for ds in trending if ds.id != default_dataset)

    subset_kwargs, loading_codes = show_subset_dropdown(default_dataset)
    split_kwargs = show_split_dropdown(subset_kwargs["value"], loading_codes)

    updates = {}
    updates[dataset_dropdown] = gr.Dropdown(choices=dataset_choices, value=default_dataset)
    updates[loading_codes_json] = loading_codes
    updates[subset_dropdown] = gr.Dropdown(**subset_kwargs)
    updates[split_dropdown] = gr.Dropdown(**split_kwargs)
    return updates
# NOTE(review): indentation was lost in this copy of the file; this handler
# presumably belongs inside the `with gr.Blocks() as demo:` body — confirm.
@dataset_dropdown.select(inputs=[dataset_dropdown], outputs=[loading_codes_json, subset_dropdown, split_dropdown])
def _show_subset_dropdown(dataset: str):
    """Refresh the subset and split dropdowns when a source dataset is selected.

    The loading codes of the newly selected dataset are also written to
    ``loading_codes_json``. The subset-select handler reads them from there;
    without this update it would resolve splits from the previous dataset.

    Args:
        dataset: Current value of the source dataset dropdown.

    Returns:
        Updates for ``loading_codes_json``, ``subset_dropdown`` and
        ``split_dropdown``, keyed by component.
    """
    subsets, loading_codes = show_subset_dropdown(dataset)
    splits = show_split_dropdown(subsets["value"], loading_codes)
    return {
        loading_codes_json: loading_codes,
        subset_dropdown: gr.Dropdown(**subsets),
        split_dropdown: gr.Dropdown(**splits),
    }
# NOTE(review): indentation was lost in this copy of the file; this handler
# presumably belongs inside the `with gr.Blocks() as demo:` body — confirm.
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[split_dropdown])
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
    """Refresh the split dropdown when a subset is selected.

    Args:
        dataset: Current source dataset (received from the UI, not used here).
        subset: Selected subset name.
        loading_codes: Content of ``loading_codes_json``.

    Returns:
        An update for ``split_dropdown``, keyed by component.
    """
    split_kwargs = show_split_dropdown(subset, loading_codes)
    refreshed_dropdown = gr.Dropdown(**split_kwargs)
    return {split_dropdown: refreshed_dropdown}
# Extra pages. The help page is only registered when the job script's
# `--help` output could be captured.
# NOTE(review): the Jobs page is assumed to be unconditional; the original
# indentation was lost in this copy of the file — confirm.
if HELP:
    help_page = f"# Help\n\n```\n{HELP}\n```"
    with demo.route("Help", "/help"):
        gr.Markdown(help_page)

with demo.route("Jobs", "/jobs"):
    gr.Markdown("# Jobs")


if __name__ == "__main__":
    # Listen on all interfaces.
    launch_options = {"server_name": "0.0.0.0"}
    demo.launch(**launch_options)
|