Commit
·
a69bbb8
1
Parent(s):
1df21c4
feat: add support for file uploads
Browse files
src/distilabel_dataset_generator/__init__.py
CHANGED
@@ -7,7 +7,7 @@ from distilabel.utils.card.dataset_card import (
|
|
7 |
DistilabelDatasetCard,
|
8 |
size_categories_parser,
|
9 |
)
|
10 |
-
from huggingface_hub import DatasetCardData, HfApi
|
11 |
|
12 |
|
13 |
class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
|
|
|
7 |
DistilabelDatasetCard,
|
8 |
size_categories_parser,
|
9 |
)
|
10 |
+
from huggingface_hub import DatasetCardData, HfApi, upload_file
|
11 |
|
12 |
|
13 |
class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
|
src/distilabel_dataset_generator/apps/sft.py
CHANGED
@@ -1,9 +1,11 @@
|
|
|
|
1 |
import multiprocessing
|
2 |
import time
|
3 |
|
4 |
import gradio as gr
|
5 |
import pandas as pd
|
6 |
from distilabel.distiset import Distiset
|
|
|
7 |
|
8 |
from src.distilabel_dataset_generator.pipelines.sft import (
|
9 |
DEFAULT_DATASET_DESCRIPTIONS,
|
@@ -140,7 +142,7 @@ def generate_dataset(
|
|
140 |
distiset.push_to_hub(
|
141 |
repo_id=repo_id,
|
142 |
private=private,
|
143 |
-
include_script=
|
144 |
token=oauth_token,
|
145 |
)
|
146 |
|
@@ -155,6 +157,18 @@ def generate_dataset(
|
|
155 |
return pd.DataFrame(outputs)
|
156 |
|
157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
css = """
|
159 |
.main_ui_logged_out{opacity: 0.3; pointer-events: none}
|
160 |
"""
|
@@ -169,9 +183,9 @@ with gr.Blocks(
|
|
169 |
"To push the dataset to the Hugging Face Hub you need to sign in. This will only be used for pushing the dataset not for data generation."
|
170 |
)
|
171 |
with gr.Row():
|
172 |
-
gr.Column(
|
173 |
get_login_button()
|
174 |
-
gr.Column(
|
175 |
|
176 |
gr.Markdown("## Iterate on a sample dataset")
|
177 |
with gr.Column() as main_ui:
|
@@ -304,6 +318,17 @@ with gr.Blocks(
|
|
304 |
def hide_success_message():
    """Return a ``gr.Markdown`` constructed with ``visible=False``.

    Takes no arguments; the component it builds is the function's only result.
    """
    hidden_markdown = gr.Markdown(visible=False)
    return hidden_markdown
|
306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
sample_dataset.change(
|
308 |
fn=lambda x: x,
|
309 |
inputs=[sample_dataset],
|
@@ -326,23 +351,16 @@ with gr.Blocks(
|
|
326 |
],
|
327 |
outputs=[final_dataset],
|
328 |
show_progress=True,
|
|
|
|
|
|
|
|
|
329 |
).success(
|
330 |
fn=show_success_message,
|
331 |
inputs=[org_name, repo_name],
|
332 |
outputs=[success_message],
|
333 |
)
|
334 |
|
335 |
-
gr.Markdown("## Or run this pipeline locally with distilabel")
|
336 |
-
|
337 |
-
with gr.Accordion("Run this pipeline using distilabel", open=False):
|
338 |
-
pipeline_code = gr.Code(
|
339 |
-
value=generate_pipeline_code(
|
340 |
-
system_prompt.value, num_turns.value, num_rows.value
|
341 |
-
),
|
342 |
-
language="python",
|
343 |
-
label="Distilabel Pipeline Code",
|
344 |
-
)
|
345 |
-
|
346 |
system_prompt.change(
|
347 |
fn=generate_pipeline_code,
|
348 |
inputs=[system_prompt, num_turns, num_rows],
|
|
|
1 |
+
import io
|
2 |
import multiprocessing
|
3 |
import time
|
4 |
|
5 |
import gradio as gr
|
6 |
import pandas as pd
|
7 |
from distilabel.distiset import Distiset
|
8 |
+
from huggingface_hub import upload_file
|
9 |
|
10 |
from src.distilabel_dataset_generator.pipelines.sft import (
|
11 |
DEFAULT_DATASET_DESCRIPTIONS,
|
|
|
142 |
distiset.push_to_hub(
|
143 |
repo_id=repo_id,
|
144 |
private=private,
|
145 |
+
include_script=True,
|
146 |
token=oauth_token,
|
147 |
)
|
148 |
|
|
|
157 |
return pd.DataFrame(outputs)
|
158 |
|
159 |
|
160 |
+
def upload_pipeline_code(pipeline_code, org_name, repo_name, oauth_token):
    """Upload the pipeline script text as ``pipeline.py`` to a dataset repo.

    The text is UTF-8 encoded, wrapped in an in-memory binary buffer, and
    handed to ``upload_file`` with ``repo_type="dataset"``.

    Args:
        pipeline_code: Script source as a string (``.encode`` is called on it).
        org_name: First component of the target repo id.
        repo_name: Second component of the target repo id.
        oauth_token: Forwarded unchanged as the ``token`` argument.

    Returns:
        None. The result of ``upload_file`` is discarded.
    """
    # Encode first, then build the repo id, matching the original evaluation order.
    script_bytes = pipeline_code.encode("utf-8")
    target_repo = f"{org_name}/{repo_name}"

    # The context manager closes the buffer once the upload call returns.
    with io.BytesIO(script_bytes) as script_buffer:
        upload_file(
            path_or_fileobj=script_buffer,
            path_in_repo="pipeline.py",
            repo_id=target_repo,
            repo_type="dataset",
            token=oauth_token,
            commit_message="Include pipeline script",
        )
|
170 |
+
|
171 |
+
|
172 |
css = """
|
173 |
.main_ui_logged_out{opacity: 0.3; pointer-events: none}
|
174 |
"""
|
|
|
183 |
"To push the dataset to the Hugging Face Hub you need to sign in. This will only be used for pushing the dataset not for data generation."
|
184 |
)
|
185 |
with gr.Row():
|
186 |
+
gr.Column()
|
187 |
get_login_button()
|
188 |
+
gr.Column()
|
189 |
|
190 |
gr.Markdown("## Iterate on a sample dataset")
|
191 |
with gr.Column() as main_ui:
|
|
|
318 |
def hide_success_message():
    """Return a ``gr.Markdown`` constructed with ``visible=False``.

    Takes no arguments; the component it builds is the function's only result.
    """
    hidden_markdown = gr.Markdown(visible=False)
    return hidden_markdown
|
320 |
|
321 |
+
gr.Markdown("## Or run this pipeline locally with distilabel")
|
322 |
+
|
323 |
+
with gr.Accordion("Run this pipeline using distilabel", open=False):
|
324 |
+
pipeline_code = gr.Code(
|
325 |
+
value=generate_pipeline_code(
|
326 |
+
system_prompt.value, num_turns.value, num_rows.value
|
327 |
+
),
|
328 |
+
language="python",
|
329 |
+
label="Distilabel Pipeline Code",
|
330 |
+
)
|
331 |
+
|
332 |
sample_dataset.change(
|
333 |
fn=lambda x: x,
|
334 |
inputs=[sample_dataset],
|
|
|
351 |
],
|
352 |
outputs=[final_dataset],
|
353 |
show_progress=True,
|
354 |
+
).then(
|
355 |
+
fn=upload_pipeline_code,
|
356 |
+
inputs=[pipeline_code, org_name, repo_name, oauth_token],
|
357 |
+
outputs=[],
|
358 |
).success(
|
359 |
fn=show_success_message,
|
360 |
inputs=[org_name, repo_name],
|
361 |
outputs=[success_message],
|
362 |
)
|
363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
system_prompt.change(
|
365 |
fn=generate_pipeline_code,
|
366 |
inputs=[system_prompt, num_turns, num_rows],
|
src/distilabel_dataset_generator/utils.py
CHANGED
@@ -33,8 +33,7 @@ else:
|
|
33 |
|
34 |
def get_login_button():
    """Create a large "Sign in with Hugging Face!" login button and activate it.

    NOTE(review): the value returned is whatever ``.activate()`` returns, which
    is not necessarily the button itself — confirm before using the result.
    """
    login_button = gr.LoginButton(
        value="Sign in with Hugging Face!",
        size="lg",
    )
    return login_button.activate()
|
39 |
|
40 |
|
|
|
33 |
|
34 |
def get_login_button():
    """Create a large "Sign in with Hugging Face!" login button and activate it.

    The button is constructed with ``size="lg"`` and ``scale=2``.

    NOTE(review): the value returned is whatever ``.activate()`` returns, which
    is not necessarily the button itself — confirm before using the result.
    """
    login_button = gr.LoginButton(
        value="Sign in with Hugging Face!",
        size="lg",
        scale=2,
    )
    return login_button.activate()
|
38 |
|
39 |
|