Commit ffa2ee0
Parent: d15b1c7

update logic textcat for inferring labels
src/synthetic_dataset_generator/apps/base.py

@@ -4,6 +4,7 @@ from typing import Union
 
 import argilla as rg
 import gradio as gr
+from datasets import Dataset, concatenate_datasets, load_dataset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
 
@@ -75,6 +76,14 @@ def validate_push_to_hub(org_name, repo_name):
     return repo_id
 
 
+def combine_datasets(repo_id: str, dataset: Dataset) -> Dataset:
+    try:
+        existing_dataset = load_dataset(repo_id, split="train")
+        return concatenate_datasets([existing_dataset, dataset])
+    except Exception:
+        return dataset
+
+
 def show_success_message(org_name, repo_name) -> gr.Markdown:
     client = get_argilla_client()
     if client is None:
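
The new combine_datasets helper makes pushes additive: it loads the train split that already exists at repo_id and appends the freshly generated rows, falling back to the new rows alone when the repo cannot be read (for example on the first push). A minimal sketch of that append behaviour with in-memory datasets, where the column names and rows are hypothetical stand-ins for a Hub split:

    from datasets import Dataset, concatenate_datasets

    # Hypothetical stand-ins for the split already on the Hub and a new batch.
    existing = Dataset.from_dict({"text": ["old row"], "label": [0]})
    new_batch = Dataset.from_dict({"text": ["new row"], "label": [1]})

    # Same merge as combine_datasets: existing rows first, new rows appended.
    combined = concatenate_datasets([existing, new_batch])
    print(combined.num_rows)  # 2, so repeated pushes grow the dataset

Note that concatenate_datasets requires both datasets to share the same features, which is why the textcat app below casts the dataframe with an explicit Features schema before combining.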
src/synthetic_dataset_generator/apps/eval.py

@@ -18,6 +18,7 @@ from gradio_huggingfacehub_search import HuggingfaceHubSearch
 from huggingface_hub import HfApi, repo_exists
 
 from synthetic_dataset_generator.apps.base import (
+    combine_datasets,
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message,
@@ -355,7 +356,9 @@ def push_dataset_to_hub(
     pipeline_code: str,
 ):
     repo_id = validate_push_to_hub(org_name, repo_name)
-    distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
+    dataset = Dataset.from_pandas(dataframe)
+    dataset = combine_datasets(repo_id, dataset)
+    distiset = Distiset({"default": dataset})
     distiset.push_to_hub(
         repo_id=repo_id,
         private=private,
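
push_dataset_to_hub in eval.py now builds the Distiset from the combined dataset rather than wrapping the dataframe directly, so an existing evaluation dataset on the Hub is extended instead of overwritten. A sketch of the same flow with hypothetical dataframe contents; the combine and push calls are commented out because they need a real repo and authentication:

    import pandas as pd
    from datasets import Dataset
    from distilabel.distiset import Distiset

    dataframe = pd.DataFrame({"instruction": ["Rate this answer"], "rating": [5]})
    dataset = Dataset.from_pandas(dataframe)
    # dataset = combine_datasets(repo_id, dataset)  # append rows already on the Hub
    distiset = Distiset({"default": dataset})  # one named dataset configuration
    # distiset.push_to_hub(repo_id="org/dataset", private=True)  # needs HF auth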
src/synthetic_dataset_generator/apps/sft.py

@@ -10,6 +10,7 @@ from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 
 from synthetic_dataset_generator.apps.base import (
+    combine_datasets,
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message,
@@ -209,11 +210,18 @@ def push_dataset_to_hub(
     oauth_token: Union[gr.OAuthToken, None],
     private: bool,
     pipeline_code: str,
+    progress=gr.Progress(),
 ):
+    progress(0.0, desc="Validating")
     repo_id = validate_push_to_hub(org_name, repo_name)
+    progress(0.3, desc="Converting")
     original_dataframe = dataframe.copy(deep=True)
     dataframe = convert_dataframe_messages(dataframe)
-    distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
+    progress(0.7, desc="Creating dataset")
+    dataset = Dataset.from_pandas(dataframe)
+    dataset = combine_datasets(repo_id, dataset)
+    progress(0.9, desc="Pushing dataset")
+    distiset = Distiset({"default": dataset})
     distiset.push_to_hub(
         repo_id=repo_id,
         private=private,
@@ -222,6 +230,7 @@ def push_dataset_to_hub(
         create_pr=False,
     )
     push_pipeline_code_to_hub(pipeline_code, org_name, repo_name, oauth_token)
+    progress(1.0, desc="Dataset pushed")
     return original_dataframe
 
 
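
The progress=gr.Progress() default added here is Gradio's injection pattern: when the function runs as an event handler, Gradio swaps the default for a live tracker, and each progress(fraction, desc=...) call moves the progress bar shown over the output component. A self-contained sketch of the pattern; slow_push and its sleeps are hypothetical:

    import time

    import gradio as gr

    def slow_push(text: str, progress=gr.Progress()) -> str:
        progress(0.0, desc="Validating")
        time.sleep(0.5)
        progress(0.9, desc="Pushing dataset")
        time.sleep(0.5)
        progress(1.0, desc="Dataset pushed")
        return text

    demo = gr.Interface(fn=slow_push, inputs="text", outputs="text")
    # demo.launch()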
src/synthetic_dataset_generator/apps/textcat.py

@@ -11,6 +11,7 @@ from distilabel.distiset import Distiset
 from huggingface_hub import HfApi
 
 from src.synthetic_dataset_generator.apps.base import (
+    combine_datasets,
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message,
@@ -129,7 +130,9 @@ def generate_dataset(
         sampled_labels = random.sample(labels, num_labels)
         random.shuffle(sampled_labels)
         inputs.append(
-            {
+            {
+                "task": f"{system_prompt}. The text represents the following categories: {', '.join(sampled_labels)}"
+            }
         )
     batch = list(textcat_generator.process(inputs=inputs))
     textcat_results.extend(batch[0])
@@ -194,9 +197,13 @@ def push_dataset_to_hub(
     oauth_token: Union[gr.OAuthToken, None] = None,
     private: bool = False,
     pipeline_code: str = "",
+    progress=gr.Progress(),
 ):
+    progress(0.0, desc="Validating")
     repo_id = validate_push_to_hub(org_name, repo_name)
+    progress(0.3, desc="Preprocessing")
     labels = get_preprocess_labels(labels)
+    progress(0.7, desc="Creating dataset")
     if num_labels == 1:
         dataframe["label"] = dataframe["label"].replace("", None)
     features = Features(
@@ -209,7 +216,10 @@ def push_dataset_to_hub(
             "labels": Sequence(feature=ClassLabel(names=labels)),
         }
     )
-    distiset = Distiset({"default": Dataset.from_pandas(dataframe, features=features)})
+    dataset = Dataset.from_pandas(dataframe, features=features)
+    dataset = combine_datasets(repo_id, dataset)
+    distiset = Distiset({"default": dataset})
+    progress(0.9, desc="Pushing dataset")
     distiset.push_to_hub(
         repo_id=repo_id,
         private=private,
@@ -218,6 +228,7 @@ def push_dataset_to_hub(
         create_pr=False,
     )
     push_pipeline_code_to_hub(pipeline_code, org_name, repo_name, oauth_token)
+    progress(1.0, desc="Dataset pushed")
 
 
 def push_dataset(
@@ -439,7 +450,7 @@ with gr.Blocks() as app:
                     ("Ambiguous", "ambiguous"),
                     ("Mixed", "mixed"),
                 ],
-                value="
+                value="mixed",
                 label="Clarity",
                 info="Set how easily the correct label or labels can be identified.",
                 interactive=True,
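
The explicit Features schema is what keeps real label names on the pushed dataset: ClassLabel stores each label as an integer with an attached names table, and Sequence(feature=ClassLabel(...)) does the same per row for the multi-label case. A small sketch with hypothetical column names, labels, and sample row:

    import pandas as pd
    from datasets import ClassLabel, Dataset, Features, Sequence, Value

    labels = ["positive", "negative"]
    features = Features(
        {
            "text": Value("string"),
            "labels": Sequence(feature=ClassLabel(names=labels)),
        }
    )
    df = pd.DataFrame({"text": ["great value"], "labels": [[0]]})
    dataset = Dataset.from_pandas(df, features=features)
    print(dataset.features["labels"].feature.names)  # ['positive', 'negative']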
src/synthetic_dataset_generator/pipelines/textcat.py

@@ -26,16 +26,16 @@ Don't include the labels in the classification_task but only provide a high leve
 If a label is composed of multiple words, use a hyphen to separate them. For example, 'smartphone-review', 'customer-service', 'product-quality'.:
 
 Description: DavidMovieHouse is a cinema that has been in business for 10 years.
-Output: {"classification_task": "The company DavidMovieHouse is a cinema that has been in business for 10 years and has had customers reviews. Classify the customer reviews as", "labels": ["positive", "negative"]}
+Output: {"classification_task": "The company DavidMovieHouse is a cinema that has been in business for 10 years and has had customer reviews from varying customer groups. Classify the customer reviews as", "labels": ["positive", "negative"]}
 
 Description: A dataset that focuses on creating neo-ludite discussions about technologies within the AI space.
-Output: {"classification_task": "Neo-ludiite discussions about technologies within the AI space cover. Categorize the discussions into one of the following categories", "labels": ["tech-support", "tech-opposition"]}
+Output: {"classification_task": "Neo-luddite discussions about technologies within the AI space, held by different speakers. Categorize the discussions into one of the following categories", "labels": ["tech-support", "tech-opposition"]}
 
 Description: A dataset that covers the articles of a niche sports website called TheSportBlogs that focuses on female sports within the ballsport domain for the US market.
-Output: {"classification_task": "TechSportBlogs is a niche sports website that focuses on female sports within the ballsport domain for the US market. Determine the category of based on the article using the following categories", "labels": ["basketball", "volleyball", "tennis", "hockey", "baseball", "soccer"]}
+Output: {"classification_task": "TheSportBlogs is a niche sports website that focuses on female sports within the ballsport domain for the US market, written by different journalists. Determine the category of the article using the following categories", "labels": ["basketball", "volleyball", "tennis", "hockey", "baseball", "soccer"]}
 
 Description: A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has review with labels "data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"
-Output: {"classification_task": "A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has review with labels", "labels": ["data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"]}
+Output: {"classification_task": "A dataset covering customer reviews for an e-commerce website called Argilla that sells technology datasets within the open source Natural Language Processing space and has reviews from various customer demographics with labels", "labels": ["data-quality", "data-accuracy", "customer-service", "price", "product-availability", "shipping-speed"]}
 
 Description:
 """