Spaces:
Runtime error
Runtime error
not working, don't know why
Browse files- a.py +51 -0
- app.py +1 -1
- examples/argilla-deployment.py +1 -1
- examples/hf-dedicated-or-tgi-deployment.py +1 -1
- examples/hf-serverless-deployment.py +1 -1
- examples/ollama-deployment.py +1 -1
- examples/openai-deployment.py +1 -1
- examples/vllm-deployment.py +1 -1
- src/synthetic_dataset_generator/__init__.py +2 -2
- src/synthetic_dataset_generator/__main__.py +1 -1
- src/synthetic_dataset_generator/app.py +6 -6
- src/synthetic_dataset_generator/apps/base.py +2 -2
- src/synthetic_dataset_generator/apps/chat.py +6 -6
- src/synthetic_dataset_generator/apps/eval.py +5 -5
- src/synthetic_dataset_generator/apps/rag.py +6 -6
- src/synthetic_dataset_generator/apps/textcat.py +6 -6
- src/synthetic_dataset_generator/pipelines/base.py +1 -1
- src/synthetic_dataset_generator/pipelines/chat.py +2 -2
- src/synthetic_dataset_generator/pipelines/embeddings.py +1 -1
- src/synthetic_dataset_generator/pipelines/eval.py +3 -3
- src/synthetic_dataset_generator/pipelines/rag.py +2 -2
- src/synthetic_dataset_generator/pipelines/textcat.py +3 -3
- src/synthetic_dataset_generator/utils.py +1 -1
a.py
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
|
4 |
+
def modify_synthetic_imports(file_path):
    """Rewrite imports of ``synthetic_dataset_generator`` to add the ``src.`` prefix.

    Reads *file_path*, replaces both ``from synthetic_dataset_generator ...``
    and ``import synthetic_dataset_generator`` statements with their
    ``src.``-prefixed equivalents, and writes the file back only when the
    content actually changed. Errors are reported to stdout, never raised.
    """
    try:
        with open(file_path, 'r') as file:
            content = file.read()

        # BUG FIX: the original patterns were byte-identical to their
        # replacements ('from src.synthetic_dataset_generator' replaced by the
        # very same string), so both substitutions were no-ops -- most likely
        # this script was accidentally run over its own source and rewrote its
        # own regex literals. Match only the un-prefixed form; the negative
        # lookahead (?!src\.) keeps the rewrite idempotent if the tool is run
        # more than once over the same tree.
        modified_content = re.sub(
            r'from (?!src\.)synthetic_dataset_generator',
            'from src.synthetic_dataset_generator',
            content
        )
        modified_content = re.sub(
            r'import (?!src\.)synthetic_dataset_generator',
            'import src.synthetic_dataset_generator',
            modified_content
        )

        # Only write if changes were made, so untouched files keep their mtime.
        if modified_content != content:
            with open(file_path, 'w') as file:
                file.write(modified_content)
            print(f"Modified imports in: {file_path}")

    except Exception as e:
        # Best-effort batch tool: report the failing file and keep going.
        print(f"Error processing {file_path}: {str(e)}")
|
30 |
+
|
31 |
+
def process_directory(start_path):
    """Walk *start_path* recursively and rewrite imports in every ``.py`` file."""
    for dirpath, _dirnames, filenames in os.walk(start_path):
        # Only Python sources are candidates for the import rewrite.
        for name in (n for n in filenames if n.endswith('.py')):
            modify_synthetic_imports(os.path.join(dirpath, name))
|
38 |
+
|
39 |
+
if __name__ == "__main__":
    import sys

    # Expect exactly one positional argument: the directory to process.
    args = sys.argv[1:]
    if len(args) != 1:
        print("Usage: python script.py <directory_path>")
        sys.exit(1)

    target = args[0]
    if not os.path.isdir(target):
        print(f"Error: {target} is not a valid directory")
        sys.exit(1)

    process_directory(target)
    print("Processing complete!")
|
app.py
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
-
from synthetic_dataset_generator import launch
|
2 |
|
3 |
launch()
|
|
|
1 |
+
from src.synthetic_dataset_generator import launch
|
2 |
|
3 |
launch()
|
examples/argilla-deployment.py
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
# ///
|
7 |
import os
|
8 |
|
9 |
-
from synthetic_dataset_generator import launch
|
10 |
|
11 |
# Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
|
12 |
os.environ["HF_TOKEN"] = "hf_..."
|
|
|
6 |
# ///
|
7 |
import os
|
8 |
|
9 |
+
from src.synthetic_dataset_generator import launch
|
10 |
|
11 |
# Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
|
12 |
os.environ["HF_TOKEN"] = "hf_..."
|
examples/hf-dedicated-or-tgi-deployment.py
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
# ///
|
7 |
import os
|
8 |
|
9 |
-
from synthetic_dataset_generator import launch
|
10 |
|
11 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
12 |
os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/" # dedicated endpoint/TGI
|
|
|
6 |
# ///
|
7 |
import os
|
8 |
|
9 |
+
from src.synthetic_dataset_generator import launch
|
10 |
|
11 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
12 |
os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/" # dedicated endpoint/TGI
|
examples/hf-serverless-deployment.py
CHANGED
@@ -6,7 +6,7 @@
|
|
6 |
# ///
|
7 |
import os
|
8 |
|
9 |
-
from synthetic_dataset_generator import launch
|
10 |
|
11 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
12 |
os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use instruct model
|
|
|
6 |
# ///
|
7 |
import os
|
8 |
|
9 |
+
from src.synthetic_dataset_generator import launch
|
10 |
|
11 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
12 |
os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use instruct model
|
examples/ollama-deployment.py
CHANGED
@@ -8,7 +8,7 @@
|
|
8 |
# ollama run qwen2.5:32b-instruct-q5_K_S
|
9 |
import os
|
10 |
|
11 |
-
from synthetic_dataset_generator import launch
|
12 |
|
13 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
14 |
os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/" # ollama base url
|
|
|
8 |
# ollama run qwen2.5:32b-instruct-q5_K_S
|
9 |
import os
|
10 |
|
11 |
+
from src.synthetic_dataset_generator import launch
|
12 |
|
13 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
14 |
os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/" # ollama base url
|
examples/openai-deployment.py
CHANGED
@@ -7,7 +7,7 @@
|
|
7 |
|
8 |
import os
|
9 |
|
10 |
-
from synthetic_dataset_generator import launch
|
11 |
|
12 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
13 |
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/" # openai base url
|
|
|
7 |
|
8 |
import os
|
9 |
|
10 |
+
from src.synthetic_dataset_generator import launch
|
11 |
|
12 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
13 |
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/" # openai base url
|
examples/vllm-deployment.py
CHANGED
@@ -7,7 +7,7 @@
|
|
7 |
# vllm serve Qwen/Qwen2.5-1.5B-Instruct
|
8 |
import os
|
9 |
|
10 |
-
from synthetic_dataset_generator import launch
|
11 |
|
12 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
13 |
os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/" # vllm base url
|
|
|
7 |
# vllm serve Qwen/Qwen2.5-1.5B-Instruct
|
8 |
import os
|
9 |
|
10 |
+
from src.synthetic_dataset_generator import launch
|
11 |
|
12 |
os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
|
13 |
os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/" # vllm base url
|
src/synthetic_dataset_generator/__init__.py
CHANGED
@@ -2,7 +2,7 @@ import inspect
|
|
2 |
|
3 |
from gradio import TabbedInterface
|
4 |
|
5 |
-
from synthetic_dataset_generator import ( # noqa
|
6 |
_distiset,
|
7 |
_inference_endpoints,
|
8 |
)
|
@@ -13,7 +13,7 @@ def launch(*args, **kwargs):
|
|
13 |
Based on the `TabbedInterface` from Gradio.
|
14 |
Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
|
15 |
"""
|
16 |
-
from synthetic_dataset_generator.app import demo
|
17 |
|
18 |
return demo.launch(*args, **kwargs)
|
19 |
|
|
|
2 |
|
3 |
from gradio import TabbedInterface
|
4 |
|
5 |
+
from src.synthetic_dataset_generator import ( # noqa
|
6 |
_distiset,
|
7 |
_inference_endpoints,
|
8 |
)
|
|
|
13 |
Based on the `TabbedInterface` from Gradio.
|
14 |
Parameters: https://www.gradio.app/docs/gradio/tabbedinterface
|
15 |
"""
|
16 |
+
from src.synthetic_dataset_generator.app import demo
|
17 |
|
18 |
return demo.launch(*args, **kwargs)
|
19 |
|
src/synthetic_dataset_generator/__main__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
if __name__ == "__main__":
|
2 |
-
from synthetic_dataset_generator import launch
|
3 |
|
4 |
launch()
|
|
|
1 |
if __name__ == "__main__":
|
2 |
+
from src.synthetic_dataset_generator import launch
|
3 |
|
4 |
launch()
|
src/synthetic_dataset_generator/app.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
-
from synthetic_dataset_generator._tabbedinterface import TabbedInterface
|
2 |
|
3 |
-
# from synthetic_dataset_generator.apps.eval import app as eval_app
|
4 |
-
from synthetic_dataset_generator.apps.rag import app as rag_app
|
5 |
-
from synthetic_dataset_generator.apps.about import app as about_app
|
6 |
-
from synthetic_dataset_generator.apps.chat import app as chat_app
|
7 |
-
from synthetic_dataset_generator.apps.textcat import app as textcat_app
|
8 |
|
9 |
theme = "argilla/argilla-theme"
|
10 |
|
|
|
1 |
+
from src.synthetic_dataset_generator._tabbedinterface import TabbedInterface
|
2 |
|
3 |
+
# from src.synthetic_dataset_generator.apps.eval import app as eval_app
|
4 |
+
from src.synthetic_dataset_generator.apps.rag import app as rag_app
|
5 |
+
from src.synthetic_dataset_generator.apps.about import app as about_app
|
6 |
+
from src.synthetic_dataset_generator.apps.chat import app as chat_app
|
7 |
+
from src.synthetic_dataset_generator.apps.textcat import app as textcat_app
|
8 |
|
9 |
theme = "argilla/argilla-theme"
|
10 |
|
src/synthetic_dataset_generator/apps/base.py
CHANGED
@@ -8,8 +8,8 @@ from datasets import Dataset, concatenate_datasets, load_dataset
|
|
8 |
from gradio import OAuthToken
|
9 |
from huggingface_hub import HfApi, upload_file, repo_exists
|
10 |
|
11 |
-
from synthetic_dataset_generator.constants import MAX_NUM_ROWS
|
12 |
-
from synthetic_dataset_generator.utils import get_argilla_client
|
13 |
|
14 |
|
15 |
def validate_argilla_user_workspace_dataset(
|
|
|
8 |
from gradio import OAuthToken
|
9 |
from huggingface_hub import HfApi, upload_file, repo_exists
|
10 |
|
11 |
+
from src.synthetic_dataset_generator.constants import MAX_NUM_ROWS
|
12 |
+
from src.synthetic_dataset_generator.utils import get_argilla_client
|
13 |
|
14 |
|
15 |
def validate_argilla_user_workspace_dataset(
|
src/synthetic_dataset_generator/apps/chat.py
CHANGED
@@ -10,7 +10,7 @@ from datasets import Dataset
|
|
10 |
from distilabel.distiset import Distiset
|
11 |
from huggingface_hub import HfApi
|
12 |
|
13 |
-
from synthetic_dataset_generator.apps.base import (
|
14 |
combine_datasets,
|
15 |
hide_success_message,
|
16 |
push_pipeline_code_to_hub,
|
@@ -19,25 +19,25 @@ from synthetic_dataset_generator.apps.base import (
|
|
19 |
validate_argilla_user_workspace_dataset,
|
20 |
validate_push_to_hub,
|
21 |
)
|
22 |
-
from synthetic_dataset_generator.constants import (
|
23 |
BASE_URL,
|
24 |
DEFAULT_BATCH_SIZE,
|
25 |
MODEL,
|
26 |
SFT_AVAILABLE,
|
27 |
)
|
28 |
-
from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
|
29 |
-
from synthetic_dataset_generator.pipelines.chat import (
|
30 |
DEFAULT_DATASET_DESCRIPTIONS,
|
31 |
generate_pipeline_code,
|
32 |
get_magpie_generator,
|
33 |
get_prompt_generator,
|
34 |
get_response_generator,
|
35 |
)
|
36 |
-
from synthetic_dataset_generator.pipelines.embeddings import (
|
37 |
get_embeddings,
|
38 |
get_sentence_embedding_dimensions,
|
39 |
)
|
40 |
-
from synthetic_dataset_generator.utils import (
|
41 |
get_argilla_client,
|
42 |
get_org_dropdown,
|
43 |
get_random_repo_name,
|
|
|
10 |
from distilabel.distiset import Distiset
|
11 |
from huggingface_hub import HfApi
|
12 |
|
13 |
+
from src.synthetic_dataset_generator.apps.base import (
|
14 |
combine_datasets,
|
15 |
hide_success_message,
|
16 |
push_pipeline_code_to_hub,
|
|
|
19 |
validate_argilla_user_workspace_dataset,
|
20 |
validate_push_to_hub,
|
21 |
)
|
22 |
+
from src.synthetic_dataset_generator.constants import (
|
23 |
BASE_URL,
|
24 |
DEFAULT_BATCH_SIZE,
|
25 |
MODEL,
|
26 |
SFT_AVAILABLE,
|
27 |
)
|
28 |
+
from src.synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
|
29 |
+
from src.synthetic_dataset_generator.pipelines.chat import (
|
30 |
DEFAULT_DATASET_DESCRIPTIONS,
|
31 |
generate_pipeline_code,
|
32 |
get_magpie_generator,
|
33 |
get_prompt_generator,
|
34 |
get_response_generator,
|
35 |
)
|
36 |
+
from src.synthetic_dataset_generator.pipelines.embeddings import (
|
37 |
get_embeddings,
|
38 |
get_sentence_embedding_dimensions,
|
39 |
)
|
40 |
+
from src.synthetic_dataset_generator.utils import (
|
41 |
get_argilla_client,
|
42 |
get_org_dropdown,
|
43 |
get_random_repo_name,
|
src/synthetic_dataset_generator/apps/eval.py
CHANGED
@@ -17,7 +17,7 @@ from gradio.oauth import OAuthToken #
|
|
17 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
18 |
from huggingface_hub import HfApi, repo_exists
|
19 |
|
20 |
-
from synthetic_dataset_generator.apps.base import (
|
21 |
combine_datasets,
|
22 |
get_iframe,
|
23 |
hide_success_message,
|
@@ -27,17 +27,17 @@ from synthetic_dataset_generator.apps.base import (
|
|
27 |
validate_argilla_user_workspace_dataset,
|
28 |
validate_push_to_hub,
|
29 |
)
|
30 |
-
from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
31 |
-
from synthetic_dataset_generator.pipelines.embeddings import (
|
32 |
get_embeddings,
|
33 |
get_sentence_embedding_dimensions,
|
34 |
)
|
35 |
-
from synthetic_dataset_generator.pipelines.eval import (
|
36 |
generate_pipeline_code,
|
37 |
get_custom_evaluator,
|
38 |
get_ultrafeedback_evaluator,
|
39 |
)
|
40 |
-
from synthetic_dataset_generator.utils import (
|
41 |
column_to_list,
|
42 |
extract_column_names,
|
43 |
get_argilla_client,
|
|
|
17 |
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
18 |
from huggingface_hub import HfApi, repo_exists
|
19 |
|
20 |
+
from src.synthetic_dataset_generator.apps.base import (
|
21 |
combine_datasets,
|
22 |
get_iframe,
|
23 |
hide_success_message,
|
|
|
27 |
validate_argilla_user_workspace_dataset,
|
28 |
validate_push_to_hub,
|
29 |
)
|
30 |
+
from src.synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
31 |
+
from src.synthetic_dataset_generator.pipelines.embeddings import (
|
32 |
get_embeddings,
|
33 |
get_sentence_embedding_dimensions,
|
34 |
)
|
35 |
+
from src.synthetic_dataset_generator.pipelines.eval import (
|
36 |
generate_pipeline_code,
|
37 |
get_custom_evaluator,
|
38 |
get_ultrafeedback_evaluator,
|
39 |
)
|
40 |
+
from src.synthetic_dataset_generator.utils import (
|
41 |
column_to_list,
|
42 |
extract_column_names,
|
43 |
get_argilla_client,
|
src/synthetic_dataset_generator/apps/rag.py
CHANGED
@@ -21,7 +21,7 @@ from huggingface_hub import HfApi
|
|
21 |
from unstructured.chunking.title import chunk_by_title
|
22 |
from unstructured.partition.auto import partition
|
23 |
|
24 |
-
from synthetic_dataset_generator.apps.base import (
|
25 |
combine_datasets,
|
26 |
get_iframe,
|
27 |
hide_success_message,
|
@@ -31,13 +31,13 @@ from synthetic_dataset_generator.apps.base import (
|
|
31 |
validate_argilla_user_workspace_dataset,
|
32 |
validate_push_to_hub,
|
33 |
)
|
34 |
-
from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
35 |
-
from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
|
36 |
-
from synthetic_dataset_generator.pipelines.embeddings import (
|
37 |
get_embeddings,
|
38 |
get_sentence_embedding_dimensions,
|
39 |
)
|
40 |
-
from synthetic_dataset_generator.pipelines.rag import (
|
41 |
DEFAULT_DATASET_DESCRIPTIONS,
|
42 |
get_chunks_generator,
|
43 |
get_prompt_generator,
|
@@ -45,7 +45,7 @@ from synthetic_dataset_generator.pipelines.rag import (
|
|
45 |
get_sentence_pair_generator,
|
46 |
get_response_generator,
|
47 |
)
|
48 |
-
from synthetic_dataset_generator.utils import (
|
49 |
column_to_list,
|
50 |
get_argilla_client,
|
51 |
get_org_dropdown,
|
|
|
21 |
from unstructured.chunking.title import chunk_by_title
|
22 |
from unstructured.partition.auto import partition
|
23 |
|
24 |
+
from src.synthetic_dataset_generator.apps.base import (
|
25 |
combine_datasets,
|
26 |
get_iframe,
|
27 |
hide_success_message,
|
|
|
31 |
validate_argilla_user_workspace_dataset,
|
32 |
validate_push_to_hub,
|
33 |
)
|
34 |
+
from src.synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
35 |
+
from src.synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
|
36 |
+
from src.synthetic_dataset_generator.pipelines.embeddings import (
|
37 |
get_embeddings,
|
38 |
get_sentence_embedding_dimensions,
|
39 |
)
|
40 |
+
from src.synthetic_dataset_generator.pipelines.rag import (
|
41 |
DEFAULT_DATASET_DESCRIPTIONS,
|
42 |
get_chunks_generator,
|
43 |
get_prompt_generator,
|
|
|
45 |
get_sentence_pair_generator,
|
46 |
get_response_generator,
|
47 |
)
|
48 |
+
from src.synthetic_dataset_generator.utils import (
|
49 |
column_to_list,
|
50 |
get_argilla_client,
|
51 |
get_org_dropdown,
|
src/synthetic_dataset_generator/apps/textcat.py
CHANGED
@@ -10,7 +10,7 @@ from datasets import ClassLabel, Dataset, Features, Sequence, Value
|
|
10 |
from distilabel.distiset import Distiset
|
11 |
from huggingface_hub import HfApi
|
12 |
|
13 |
-
from synthetic_dataset_generator.apps.base import (
|
14 |
combine_datasets,
|
15 |
hide_success_message,
|
16 |
push_pipeline_code_to_hub,
|
@@ -19,20 +19,20 @@ from synthetic_dataset_generator.apps.base import (
|
|
19 |
validate_argilla_user_workspace_dataset,
|
20 |
validate_push_to_hub,
|
21 |
)
|
22 |
-
from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
23 |
-
from synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
|
24 |
-
from synthetic_dataset_generator.pipelines.embeddings import (
|
25 |
get_embeddings,
|
26 |
get_sentence_embedding_dimensions,
|
27 |
)
|
28 |
-
from synthetic_dataset_generator.pipelines.textcat import (
|
29 |
DEFAULT_DATASET_DESCRIPTIONS,
|
30 |
generate_pipeline_code,
|
31 |
get_labeller_generator,
|
32 |
get_prompt_generator,
|
33 |
get_textcat_generator,
|
34 |
)
|
35 |
-
from synthetic_dataset_generator.utils import (
|
36 |
get_argilla_client,
|
37 |
get_org_dropdown,
|
38 |
get_preprocess_labels,
|
|
|
10 |
from distilabel.distiset import Distiset
|
11 |
from huggingface_hub import HfApi
|
12 |
|
13 |
+
from src.synthetic_dataset_generator.apps.base import (
|
14 |
combine_datasets,
|
15 |
hide_success_message,
|
16 |
push_pipeline_code_to_hub,
|
|
|
19 |
validate_argilla_user_workspace_dataset,
|
20 |
validate_push_to_hub,
|
21 |
)
|
22 |
+
from src.synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
|
23 |
+
from src.synthetic_dataset_generator.pipelines.base import get_rewritten_prompts
|
24 |
+
from src.synthetic_dataset_generator.pipelines.embeddings import (
|
25 |
get_embeddings,
|
26 |
get_sentence_embedding_dimensions,
|
27 |
)
|
28 |
+
from src.synthetic_dataset_generator.pipelines.textcat import (
|
29 |
DEFAULT_DATASET_DESCRIPTIONS,
|
30 |
generate_pipeline_code,
|
31 |
get_labeller_generator,
|
32 |
get_prompt_generator,
|
33 |
get_textcat_generator,
|
34 |
)
|
35 |
+
from src.synthetic_dataset_generator.utils import (
|
36 |
get_argilla_client,
|
37 |
get_org_dropdown,
|
38 |
get_preprocess_labels,
|
src/synthetic_dataset_generator/pipelines/base.py
CHANGED
@@ -4,7 +4,7 @@ import random
|
|
4 |
from distilabel.models import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
|
5 |
from distilabel.steps.tasks import TextGeneration
|
6 |
|
7 |
-
from synthetic_dataset_generator.constants import (
|
8 |
API_KEYS,
|
9 |
DEFAULT_BATCH_SIZE,
|
10 |
HUGGINGFACE_BASE_URL,
|
|
|
4 |
from distilabel.models import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
|
5 |
from distilabel.steps.tasks import TextGeneration
|
6 |
|
7 |
+
from src.synthetic_dataset_generator.constants import (
|
8 |
API_KEYS,
|
9 |
DEFAULT_BATCH_SIZE,
|
10 |
HUGGINGFACE_BASE_URL,
|
src/synthetic_dataset_generator/pipelines/chat.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1 |
from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
|
2 |
|
3 |
-
from synthetic_dataset_generator.constants import (
|
4 |
MAGPIE_PRE_QUERY_TEMPLATE,
|
5 |
MAX_NUM_TOKENS,
|
6 |
)
|
7 |
-
from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
|
8 |
|
9 |
INFORMATION_SEEKING_PROMPT = (
|
10 |
"You are an AI assistant designed to provide accurate and concise information on a wide"
|
|
|
1 |
from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
|
2 |
|
3 |
+
from src.synthetic_dataset_generator.constants import (
|
4 |
MAGPIE_PRE_QUERY_TEMPLATE,
|
5 |
MAX_NUM_TOKENS,
|
6 |
)
|
7 |
+
from src.synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
|
8 |
|
9 |
INFORMATION_SEEKING_PROMPT = (
|
10 |
"You are an AI assistant designed to provide accurate and concise information on a wide"
|
src/synthetic_dataset_generator/pipelines/embeddings.py
CHANGED
@@ -3,7 +3,7 @@ from typing import List
|
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
from sentence_transformers.models import StaticEmbedding
|
5 |
|
6 |
-
from synthetic_dataset_generator.constants import STATIC_EMBEDDING_MODEL
|
7 |
|
8 |
static_embedding = StaticEmbedding.from_model2vec(STATIC_EMBEDDING_MODEL)
|
9 |
model = SentenceTransformer(modules=[static_embedding])
|
|
|
3 |
from sentence_transformers import SentenceTransformer
|
4 |
from sentence_transformers.models import StaticEmbedding
|
5 |
|
6 |
+
from src.synthetic_dataset_generator.constants import STATIC_EMBEDDING_MODEL
|
7 |
|
8 |
static_embedding = StaticEmbedding.from_model2vec(STATIC_EMBEDDING_MODEL)
|
9 |
model = SentenceTransformer(modules=[static_embedding])
|
src/synthetic_dataset_generator/pipelines/eval.py
CHANGED
@@ -5,9 +5,9 @@ from distilabel.steps.tasks import (
|
|
5 |
UltraFeedback,
|
6 |
)
|
7 |
|
8 |
-
from synthetic_dataset_generator.constants import BASE_URL, MAX_NUM_TOKENS, MODEL
|
9 |
-
from synthetic_dataset_generator.pipelines.base import _get_next_api_key
|
10 |
-
from synthetic_dataset_generator.utils import extract_column_names
|
11 |
|
12 |
|
13 |
def get_ultrafeedback_evaluator(aspect, is_sample):
|
|
|
5 |
UltraFeedback,
|
6 |
)
|
7 |
|
8 |
+
from src.synthetic_dataset_generator.constants import BASE_URL, MAX_NUM_TOKENS, MODEL
|
9 |
+
from src.synthetic_dataset_generator.pipelines.base import _get_next_api_key
|
10 |
+
from src.synthetic_dataset_generator.utils import extract_column_names
|
11 |
|
12 |
|
13 |
def get_ultrafeedback_evaluator(aspect, is_sample):
|
src/synthetic_dataset_generator/pipelines/rag.py
CHANGED
@@ -8,8 +8,8 @@ from distilabel.steps.tasks import (
|
|
8 |
TextGeneration,
|
9 |
)
|
10 |
|
11 |
-
from synthetic_dataset_generator.constants import MAX_NUM_TOKENS
|
12 |
-
from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
|
13 |
|
14 |
DEFAULT_DATASET_DESCRIPTIONS = [
|
15 |
"A dataset to retrieve information from legal documents.",
|
|
|
8 |
TextGeneration,
|
9 |
)
|
10 |
|
11 |
+
from src.synthetic_dataset_generator.constants import MAX_NUM_TOKENS
|
12 |
+
from src.synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
|
13 |
|
14 |
DEFAULT_DATASET_DESCRIPTIONS = [
|
15 |
"A dataset to retrieve information from legal documents.",
|
src/synthetic_dataset_generator/pipelines/textcat.py
CHANGED
@@ -8,11 +8,11 @@ from distilabel.steps.tasks import (
|
|
8 |
)
|
9 |
from pydantic import BaseModel, Field
|
10 |
|
11 |
-
from synthetic_dataset_generator.constants import (
|
12 |
MAX_NUM_TOKENS,
|
13 |
)
|
14 |
-
from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
|
15 |
-
from synthetic_dataset_generator.utils import get_preprocess_labels
|
16 |
|
17 |
PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
|
18 |
|
|
|
8 |
)
|
9 |
from pydantic import BaseModel, Field
|
10 |
|
11 |
+
from src.synthetic_dataset_generator.constants import (
|
12 |
MAX_NUM_TOKENS,
|
13 |
)
|
14 |
+
from src.synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class
|
15 |
+
from src.synthetic_dataset_generator.utils import get_preprocess_labels
|
16 |
|
17 |
PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
|
18 |
|
src/synthetic_dataset_generator/utils.py
CHANGED
@@ -14,7 +14,7 @@ from gradio.oauth import (
|
|
14 |
from huggingface_hub import whoami
|
15 |
from jinja2 import Environment, meta
|
16 |
|
17 |
-
from synthetic_dataset_generator.constants import argilla_client
|
18 |
|
19 |
|
20 |
def get_duplicate_button():
|
|
|
14 |
from huggingface_hub import whoami
|
15 |
from jinja2 import Environment, meta
|
16 |
|
17 |
+
from src.synthetic_dataset_generator.constants import argilla_client
|
18 |
|
19 |
|
20 |
def get_duplicate_button():
|