"""Gradio app that turns uploaded documents into prompt-prefixed sentences."""

import os
import re

import gradio as gr
from huggingface_hub import InferenceClient
from nltk.tokenize import sent_tokenize

# NOTE(review): nothing in this file calls the client yet; it is kept so
# that importing the module has the same effect as before.
client = InferenceClient(
    "mistralai/Mixtral-8x7B-Instruct-v0.1"
)


def tokenize_sentences(file_content):
    """Split a document into sentences with NLTK's ``sent_tokenize``.

    Args:
        file_content: The document as ``bytes``/``bytearray`` (decoded as
            UTF-8, strictly) or as an already-decoded ``str``.

    Returns:
        The list of sentences found in the document.

    Raises:
        UnicodeDecodeError: If ``file_content`` is bytes that are not UTF-8.
    """
    # Accept text as well as bytes so callers that already decoded the
    # upload do not hit ``AttributeError: 'str' object has no attribute
    # 'decode'``.
    if isinstance(file_content, (bytes, bytearray)):
        file_content = file_content.decode()
    return sent_tokenize(file_content)


def generate_synthetic_data(prompt, sentences, data_size, toxicity_level, use_emoji):
    """Build one synthetic line per sentence and join them with newlines.

    Args:
        prompt: Text placed before every sentence as ``"<prompt>: <sentence>"``.
        sentences: Source sentences, in order.
        data_size: Maximum number of sentences to use. Coerced to ``int``
            because UI sliders may deliver floats; negatives count as 0.
        toxicity_level: ``"High"`` runs ``add_toxic_content``, ``"Low"`` runs
            ``remove_toxic_content``; any other value leaves the text alone.
        use_emoji: Truthy runs ``add_emojis``, falsy runs ``remove_emojis``.

    Returns:
        The generated lines joined by ``"\\n"`` (empty string if there are
        no sentences).
    """
    # A float or negative bound would either raise TypeError or silently
    # slice from the end of the list.
    limit = max(int(data_size), 0)

    # Resolve the per-request transforms once instead of on every sentence.
    if toxicity_level == "High":
        adjust_toxicity = add_toxic_content
    elif toxicity_level == "Low":
        adjust_toxicity = remove_toxic_content
    else:
        adjust_toxicity = None
    adjust_emoji = add_emojis if use_emoji else remove_emojis

    synthetic_data = []
    for sentence in sentences[:limit]:
        synthetic_sentence = f"{prompt}: {sentence}"
        if adjust_toxicity is not None:
            synthetic_sentence = adjust_toxicity(synthetic_sentence)
        synthetic_data.append(adjust_emoji(synthetic_sentence))
    return "\n".join(synthetic_data)


def add_toxic_content(text):
    """Placeholder: currently returns ``text`` unchanged."""
    # TODO: add code to make the text more toxic.
    return text


def remove_toxic_content(text):
    """Placeholder: currently returns ``text`` unchanged."""
    # TODO: add code to remove toxic content from the text.
    return text


def add_emojis(text):
    """Placeholder: currently returns ``text`` unchanged."""
    # TODO: add code to add emojis to the text.
    return text


def remove_emojis(text):
    """Placeholder: currently returns ``text`` unchanged."""
    # TODO: add code to remove emojis from the text.
    return text


def _read_upload_text(upload):
    """Return the text of one uploaded file, whichever form it arrives in.

    Handles raw ``bytes``, a filesystem path, an object exposing the path
    as ``.name`` (e.g. a temp-file wrapper), or a readable file object.
    Bytes are decoded as UTF-8 with undecodable bytes replaced, so one
    badly encoded file does not abort the whole request.

    NOTE(review): binary formats such as .pdf/.doc/.docx are not parsed
    here; their bytes are decoded as-is and will not yield useful text.
    """
    if isinstance(upload, (bytes, bytearray)):
        raw = bytes(upload)
    else:
        if isinstance(upload, (str, os.PathLike)):
            path = upload
        else:
            path = getattr(upload, "name", None)

        if isinstance(path, (str, os.PathLike)):
            # Re-open by path rather than trusting the state (position,
            # mode, closed-ness) of a handle someone else created.
            with open(path, "rb") as handle:
                raw = handle.read()
        elif hasattr(upload, "read"):
            raw = upload.read()
        else:
            raise TypeError(
                f"Unsupported upload type: {type(upload).__name__}"
            )

    if isinstance(raw, str):
        return raw
    return raw.decode("utf-8", errors="replace")


def generate(prompt, max_data_size=100, toxicity_level="Neutral", use_emoji=False, files=None):
    """Gradio callback: create synthetic data from the uploaded files.

    Args:
        prompt: Prefix applied to every sentence.
        max_data_size: Maximum number of sentences to include.
        toxicity_level: ``"High"``, ``"Low"`` or ``"Neutral"``.
        use_emoji: Whether to add (truthy) or remove (falsy) emojis.
        files: One upload or a list/tuple of uploads; each may be bytes, a
            path, or a file-like object. ``None`` or empty means no upload.

    Returns:
        The synthetic data as newline-separated text, or a message asking
        for a file when nothing was uploaded.
    """
    if not files:
        return "Please upload a file to generate synthetic data."

    # Tolerate a single upload as well as the list produced by
    # ``file_count="multiple"``.
    if not isinstance(files, (list, tuple)):
        files = [files]

    sentences = []
    for upload in files:
        # Decode exactly once; the previous code decoded here and again in
        # tokenize_sentences, which always failed on the second call.
        sentences.extend(tokenize_sentences(_read_upload_text(upload)))

    return generate_synthetic_data(
        prompt, sentences, max_data_size, toxicity_level, use_emoji
    )

# Input components for the interface. Their order must match the positional
# parameters of ``generate``: prompt, max_data_size, toxicity_level,
# use_emoji, files.
additional_inputs = [
    # Free-text instruction that is prefixed to every generated sentence.
    gr.Textbox(
        label="Prompt for Synthetic Data Generation",
        max_lines=1,
        interactive=True,
    ),
    # Upper bound on how many sentences are used.
    gr.Slider(
        label="Max Data Size",
        value=100,
        minimum=10,
        maximum=1000,
        step=10,
        interactive=True,
        info="The maximum number of sentences to include in the synthetic data",
    ),
    # Toxicity adjustment applied to each sentence.
    gr.Radio(
        label="Toxicity Level",
        choices=["High", "Low", "Neutral"],
        value="Neutral",
        interactive=True,
        info="Adjust the toxicity level of the synthetic data",
    ),
    # Emoji toggle.
    gr.Checkbox(
        label="Use Emoji",
        value=False,
        interactive=True,
        info="Add or remove emojis in the synthetic data",
    ),
    # Source documents; several files may be uploaded at once.
    gr.File(
        label="Upload PDF or Document",
        file_count="multiple",
        file_types=[".pdf", ".doc", ".docx", ".txt"],
        interactive=True,
    ),
]

# Build the UI around ``generate`` and start serving immediately, with the
# API page hidden.
demo = gr.Interface(
    fn=generate,
    inputs=additional_inputs,
    outputs="text",
    title="Synthetic-data-generation-aze",
    description="Generate synthetic data from uploaded files based on a given prompt and customization options.",
)
demo.launch(show_api=False)