Commit
•
fd2f716
1
Parent(s):
629fdb8
refactor package folders
Browse files
- app.py +1 -1
- pyproject.toml +1 -1
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/__init__.py +0 -0
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/_tabbedinterface.py +0 -0
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/app.py +5 -5
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/__init__.py +0 -0
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/base.py +2 -2
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/eval.py +5 -5
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/faq.py +0 -0
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/sft.py +5 -5
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/textcat.py +5 -5
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/constants.py +0 -0
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/__init__.py +0 -0
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/base.py +1 -1
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/embeddings.py +1 -1
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/eval.py +3 -3
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/sft.py +2 -2
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/textcat.py +3 -3
- src/{distilabel_dataset_generator → synthetic_dataset_generator}/utils.py +1 -1
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
demo.launch()
|
|
|
1 |
+
from synthetic_dataset_generator.app import demo
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
demo.launch()
|
pyproject.toml
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
[project]
|
2 |
-
name = "
|
3 |
version = "0.1.0"
|
4 |
description = "Build datasets using natural language"
|
5 |
authors = [
|
|
|
1 |
[project]
|
2 |
+
name = "synthetic-dataset-generator"
|
3 |
version = "0.1.0"
|
4 |
description = "Build datasets using natural language"
|
5 |
authors = [
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/__init__.py
RENAMED
File without changes
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/_tabbedinterface.py
RENAMED
File without changes
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/app.py
RENAMED
@@ -1,8 +1,8 @@
|
|
1 |
-
from
|
2 |
-
from
|
3 |
-
from
|
4 |
-
from
|
5 |
-
from
|
6 |
|
7 |
theme = "argilla/argilla-theme"
|
8 |
|
|
|
1 |
+
from synthetic_dataset_generator._tabbedinterface import TabbedInterface
|
2 |
+
from synthetic_dataset_generator.apps.eval import app as eval_app
|
3 |
+
from synthetic_dataset_generator.apps.faq import app as faq_app
|
4 |
+
from synthetic_dataset_generator.apps.sft import app as sft_app
|
5 |
+
from synthetic_dataset_generator.apps.textcat import app as textcat_app
|
6 |
|
7 |
theme = "argilla/argilla-theme"
|
8 |
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/__init__.py
RENAMED
File without changes
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/base.py
RENAMED
@@ -10,8 +10,8 @@ from distilabel.distiset import Distiset
|
|
10 |
from gradio import OAuthToken
|
11 |
from huggingface_hub import HfApi, upload_file
|
12 |
|
13 |
-
from
|
14 |
-
from
|
15 |
get_argilla_client,
|
16 |
)
|
17 |
|
|
|
10 |
from gradio import OAuthToken
|
11 |
from huggingface_hub import HfApi, upload_file
|
12 |
|
13 |
+
from synthetic_dataset_generator.constants import TEXTCAT_TASK
|
14 |
+
from synthetic_dataset_generator.utils import (
|
15 |
get_argilla_client,
|
16 |
)
|
17 |
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/eval.py
RENAMED
@@ -16,23 +16,23 @@ from distilabel.distiset import Distiset
|
|
16 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
17 |
from huggingface_hub import HfApi
|
18 |
|
19 |
-
from
|
20 |
hide_success_message,
|
21 |
show_success_message,
|
22 |
validate_argilla_user_workspace_dataset,
|
23 |
validate_push_to_hub,
|
24 |
)
|
25 |
-
from
|
26 |
-
from
|
27 |
get_embeddings,
|
28 |
get_sentence_embedding_dimensions,
|
29 |
)
|
30 |
-
from
|
31 |
generate_pipeline_code,
|
32 |
get_custom_evaluator,
|
33 |
get_ultrafeedback_evaluator,
|
34 |
)
|
35 |
-
from
|
36 |
column_to_list,
|
37 |
extract_column_names,
|
38 |
get_argilla_client,
|
|
|
16 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
17 |
from huggingface_hub import HfApi
|
18 |
|
19 |
+
from synthetic_dataset_generator.apps.base import (
|
20 |
hide_success_message,
|
21 |
show_success_message,
|
22 |
validate_argilla_user_workspace_dataset,
|
23 |
validate_push_to_hub,
|
24 |
)
|
25 |
+
from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
26 |
+
from synthetic_dataset_generator.pipelines.embeddings import (
|
27 |
get_embeddings,
|
28 |
get_sentence_embedding_dimensions,
|
29 |
)
|
30 |
+
from synthetic_dataset_generator.pipelines.eval import (
|
31 |
generate_pipeline_code,
|
32 |
get_custom_evaluator,
|
33 |
get_ultrafeedback_evaluator,
|
34 |
)
|
35 |
+
from synthetic_dataset_generator.utils import (
|
36 |
column_to_list,
|
37 |
extract_column_names,
|
38 |
get_argilla_client,
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/faq.py
RENAMED
File without changes
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/sft.py
RENAMED
@@ -9,25 +9,25 @@ from datasets import Dataset
|
|
9 |
from distilabel.distiset import Distiset
|
10 |
from huggingface_hub import HfApi
|
11 |
|
12 |
-
from
|
13 |
hide_success_message,
|
14 |
show_success_message,
|
15 |
validate_argilla_user_workspace_dataset,
|
16 |
validate_push_to_hub,
|
17 |
)
|
18 |
-
from
|
19 |
-
from
|
20 |
get_embeddings,
|
21 |
get_sentence_embedding_dimensions,
|
22 |
)
|
23 |
-
from
|
24 |
DEFAULT_DATASET_DESCRIPTIONS,
|
25 |
generate_pipeline_code,
|
26 |
get_magpie_generator,
|
27 |
get_prompt_generator,
|
28 |
get_response_generator,
|
29 |
)
|
30 |
-
from
|
31 |
get_argilla_client,
|
32 |
get_org_dropdown,
|
33 |
swap_visibility,
|
|
|
9 |
from distilabel.distiset import Distiset
|
10 |
from huggingface_hub import HfApi
|
11 |
|
12 |
+
from synthetic_dataset_generator.apps.base import (
|
13 |
hide_success_message,
|
14 |
show_success_message,
|
15 |
validate_argilla_user_workspace_dataset,
|
16 |
validate_push_to_hub,
|
17 |
)
|
18 |
+
from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE, SFT_AVAILABLE
|
19 |
+
from synthetic_dataset_generator.pipelines.embeddings import (
|
20 |
get_embeddings,
|
21 |
get_sentence_embedding_dimensions,
|
22 |
)
|
23 |
+
from synthetic_dataset_generator.pipelines.sft import (
|
24 |
DEFAULT_DATASET_DESCRIPTIONS,
|
25 |
generate_pipeline_code,
|
26 |
get_magpie_generator,
|
27 |
get_prompt_generator,
|
28 |
get_response_generator,
|
29 |
)
|
30 |
+
from synthetic_dataset_generator.utils import (
|
31 |
get_argilla_client,
|
32 |
get_org_dropdown,
|
33 |
swap_visibility,
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/apps/textcat.py
RENAMED
@@ -9,30 +9,30 @@ from datasets import ClassLabel, Dataset, Features, Sequence, Value
|
|
9 |
from distilabel.distiset import Distiset
|
10 |
from huggingface_hub import HfApi
|
11 |
|
12 |
-
from
|
13 |
-
from src.distilabel_dataset_generator.apps.base import (
|
14 |
hide_success_message,
|
15 |
show_success_message,
|
16 |
validate_argilla_user_workspace_dataset,
|
17 |
validate_push_to_hub,
|
18 |
)
|
19 |
-
from src.
|
20 |
get_embeddings,
|
21 |
get_sentence_embedding_dimensions,
|
22 |
)
|
23 |
-
from src.
|
24 |
DEFAULT_DATASET_DESCRIPTIONS,
|
25 |
generate_pipeline_code,
|
26 |
get_labeller_generator,
|
27 |
get_prompt_generator,
|
28 |
get_textcat_generator,
|
29 |
)
|
30 |
-
from src.
|
31 |
get_argilla_client,
|
32 |
get_org_dropdown,
|
33 |
get_preprocess_labels,
|
34 |
swap_visibility,
|
35 |
)
|
|
|
36 |
|
37 |
|
38 |
def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
|
|
|
9 |
from distilabel.distiset import Distiset
|
10 |
from huggingface_hub import HfApi
|
11 |
|
12 |
+
from src.synthetic_dataset_generator.apps.base import (
|
|
|
13 |
hide_success_message,
|
14 |
show_success_message,
|
15 |
validate_argilla_user_workspace_dataset,
|
16 |
validate_push_to_hub,
|
17 |
)
|
18 |
+
from src.synthetic_dataset_generator.pipelines.embeddings import (
|
19 |
get_embeddings,
|
20 |
get_sentence_embedding_dimensions,
|
21 |
)
|
22 |
+
from src.synthetic_dataset_generator.pipelines.textcat import (
|
23 |
DEFAULT_DATASET_DESCRIPTIONS,
|
24 |
generate_pipeline_code,
|
25 |
get_labeller_generator,
|
26 |
get_prompt_generator,
|
27 |
get_textcat_generator,
|
28 |
)
|
29 |
+
from src.synthetic_dataset_generator.utils import (
|
30 |
get_argilla_client,
|
31 |
get_org_dropdown,
|
32 |
get_preprocess_labels,
|
33 |
swap_visibility,
|
34 |
)
|
35 |
+
from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
36 |
|
37 |
|
38 |
def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/constants.py
RENAMED
File without changes
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/__init__.py
RENAMED
File without changes
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/base.py
RENAMED
@@ -1,4 +1,4 @@
|
|
1 |
-
from
|
2 |
|
3 |
TOKEN_INDEX = 0
|
4 |
|
|
|
1 |
+
from synthetic_dataset_generator.constants import API_KEYS
|
2 |
|
3 |
TOKEN_INDEX = 0
|
4 |
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/embeddings.py
RENAMED
@@ -3,7 +3,7 @@ from typing import List
|
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
from sentence_transformers.models import StaticEmbedding
|
5 |
|
6 |
-
from
|
7 |
|
8 |
static_embedding = StaticEmbedding.from_model2vec(STATIC_EMBEDDING_MODEL)
|
9 |
model = SentenceTransformer(modules=[static_embedding])
|
|
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
from sentence_transformers.models import StaticEmbedding
|
5 |
|
6 |
+
from synthetic_dataset_generator.constants import STATIC_EMBEDDING_MODEL
|
7 |
|
8 |
static_embedding = StaticEmbedding.from_model2vec(STATIC_EMBEDDING_MODEL)
|
9 |
model = SentenceTransformer(modules=[static_embedding])
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/eval.py
RENAMED
@@ -5,9 +5,9 @@ from distilabel.steps.tasks import (
|
|
5 |
UltraFeedback,
|
6 |
)
|
7 |
|
8 |
-
from
|
9 |
-
from
|
10 |
-
from
|
11 |
|
12 |
|
13 |
def get_ultrafeedback_evaluator(aspect, is_sample):
|
|
|
5 |
UltraFeedback,
|
6 |
)
|
7 |
|
8 |
+
from synthetic_dataset_generator.constants import BASE_URL, MODEL
|
9 |
+
from synthetic_dataset_generator.pipelines.base import _get_next_api_key
|
10 |
+
from synthetic_dataset_generator.utils import extract_column_names
|
11 |
|
12 |
|
13 |
def get_ultrafeedback_evaluator(aspect, is_sample):
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/sft.py
RENAMED
@@ -1,12 +1,12 @@
|
|
1 |
from distilabel.llms import InferenceEndpointsLLM
|
2 |
from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
|
3 |
|
4 |
-
from
|
5 |
BASE_URL,
|
6 |
MAGPIE_PRE_QUERY_TEMPLATE,
|
7 |
MODEL,
|
8 |
)
|
9 |
-
from
|
10 |
|
11 |
INFORMATION_SEEKING_PROMPT = (
|
12 |
"You are an AI assistant designed to provide accurate and concise information on a wide"
|
|
|
1 |
from distilabel.llms import InferenceEndpointsLLM
|
2 |
from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
|
3 |
|
4 |
+
from synthetic_dataset_generator.constants import (
|
5 |
BASE_URL,
|
6 |
MAGPIE_PRE_QUERY_TEMPLATE,
|
7 |
MODEL,
|
8 |
)
|
9 |
+
from synthetic_dataset_generator.pipelines.base import _get_next_api_key
|
10 |
|
11 |
INFORMATION_SEEKING_PROMPT = (
|
12 |
"You are an AI assistant designed to provide accurate and concise information on a wide"
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/pipelines/textcat.py
RENAMED
@@ -9,9 +9,9 @@ from distilabel.steps.tasks import (
|
|
9 |
)
|
10 |
from pydantic import BaseModel, Field
|
11 |
|
12 |
-
from
|
13 |
-
from
|
14 |
-
from
|
15 |
|
16 |
PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
|
17 |
|
|
|
9 |
)
|
10 |
from pydantic import BaseModel, Field
|
11 |
|
12 |
+
from synthetic_dataset_generator.constants import BASE_URL, MODEL
|
13 |
+
from synthetic_dataset_generator.pipelines.base import _get_next_api_key
|
14 |
+
from synthetic_dataset_generator.utils import get_preprocess_labels
|
15 |
|
16 |
PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
|
17 |
|
src/{distilabel_dataset_generator → synthetic_dataset_generator}/utils.py
RENAMED
@@ -12,7 +12,7 @@ from gradio.oauth import (
|
|
12 |
from huggingface_hub import whoami
|
13 |
from jinja2 import Environment, meta
|
14 |
|
15 |
-
from
|
16 |
|
17 |
|
18 |
def get_duplicate_button():
|
|
|
12 |
from huggingface_hub import whoami
|
13 |
from jinja2 import Environment, meta
|
14 |
|
15 |
+
from synthetic_dataset_generator.constants import argilla_client
|
16 |
|
17 |
|
18 |
def get_duplicate_button():
|