|
from huggingface_hub import InferenceClient |
|
import gradio as gr |
|
import re |
|
from nltk.tokenize import sent_tokenize |
|
|
|
# Hugging Face inference client bound to the Mixtral 8x7B instruct model.
client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
|
|
def tokenize_sentences(file_content):
    """Split the text of one file into sentences.

    Args:
        file_content: The file's text, either as raw ``bytes``/``bytearray``
            (decoded here with the default UTF-8 codec) or as an
            already-decoded ``str``.

    Returns:
        The list of sentences produced by ``sent_tokenize``.

    Raises:
        UnicodeDecodeError: If byte input is not valid UTF-8.
    """
    # Callers may hand over text that is already decoded; calling
    # ``.decode()`` on a ``str`` raises AttributeError, so only decode bytes.
    if isinstance(file_content, (bytes, bytearray)):
        file_content = file_content.decode()
    return sent_tokenize(file_content)
|
|
|
def generate_synthetic_data(prompt, sentences, data_size, toxicity_level, use_emoji):
    """Build newline-separated synthetic lines from the leading sentences.

    Args:
        prompt: Text prefixed to every sentence as ``"<prompt>: <sentence>"``.
        sentences: Sequence of source sentences.
        data_size: Maximum number of sentences to use. Floats are truncated
            to an int and negative values are treated as 0; ``None`` means
            no limit.
        toxicity_level: ``"High"`` routes each line through
            ``add_toxic_content``, ``"Low"`` through ``remove_toxic_content``;
            any other value leaves the line untouched.
        use_emoji: When truthy each line goes through ``add_emojis``,
            otherwise through ``remove_emojis``.

    Returns:
        The generated lines joined with ``"\\n"`` (an empty string when no
        sentence is selected).
    """
    # UI sliders may deliver the value as a float (e.g. 100.0), which is not
    # a valid slice bound, and a negative bound would silently drop sentences
    # from the end instead of limiting the count.
    limit = None if data_size is None else max(int(data_size), 0)

    synthetic_data = []
    for sentence in sentences[:limit]:
        synthetic_sentence = f"{prompt}: {sentence}"

        if toxicity_level == "High":
            synthetic_sentence = add_toxic_content(synthetic_sentence)
        elif toxicity_level == "Low":
            synthetic_sentence = remove_toxic_content(synthetic_sentence)

        if use_emoji:
            synthetic_sentence = add_emojis(synthetic_sentence)
        else:
            synthetic_sentence = remove_emojis(synthetic_sentence)

        synthetic_data.append(synthetic_sentence)

    return "\n".join(synthetic_data)
|
|
|
def add_toxic_content(text):
    """Placeholder hook for the "High" toxicity setting.

    No transformation is implemented yet: ``text`` is handed back unchanged.
    """
    return text
|
|
|
def remove_toxic_content(text):
    """Placeholder hook for the "Low" toxicity setting.

    No filtering is implemented yet: ``text`` is handed back unchanged.
    """
    return text
|
|
|
def add_emojis(text):
    """Placeholder hook used when the "Use Emoji" option is enabled.

    No emoji insertion is implemented yet: ``text`` is handed back unchanged.
    """
    return text
|
|
|
# Code-point ranges treated as emoji (written as character-class content).
_EMOJI_CHARS = (
    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag pairs)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs, incl. skin-tone modifiers
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"  # miscellaneous symbols
    "\u2700-\u27BF"  # dingbats
)

# One emoji, its optional variation selector (U+FE0F), and any further emoji
# glued on with a zero-width joiner (U+200D).  The joiner is matched only
# between emoji so that scripts which use it on their own stay intact.
_EMOJI_PATTERN = re.compile(
    f"[{_EMOJI_CHARS}]\uFE0F?(?:\u200D[{_EMOJI_CHARS}]\uFE0F?)*"
)


def remove_emojis(text):
    """Return ``text`` with common emoji characters stripped out.

    Previously this was a no-op stub, so disabling the "Use Emoji" option
    left emojis from the source text in the output.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without characters from the emoji ranges listed in
        ``_EMOJI_CHARS``.  All other characters, including surrounding
        whitespace, are preserved as-is.
    """
    return _EMOJI_PATTERN.sub("", text)
|
|
|
def _read_upload(upload):
    """Return the raw bytes of one uploaded file."""
    if isinstance(upload, (bytes, bytearray)):
        return bytes(upload)
    # Otherwise the upload is a filesystem path, or a file wrapper that
    # exposes its path through ``.name``.
    path = getattr(upload, "name", upload)
    with open(path, "rb") as handle:
        return handle.read()


def generate(prompt, max_data_size=100, toxicity_level="Neutral", use_emoji=False, files=None):
    """Generate synthetic data from the uploaded files.

    Args:
        prompt: Text prefixed to every generated line.
        max_data_size: Maximum number of sentences to include.
        toxicity_level: ``"High"``, ``"Low"`` or ``"Neutral"``; forwarded to
            ``generate_synthetic_data``.
        use_emoji: Forwarded to ``generate_synthetic_data``.
        files: The uploads, each given as raw bytes, a file path, or an
            object whose ``.name`` is a file path.  A single upload that is
            not wrapped in a list/tuple is accepted as well.

    Returns:
        The synthetic data as newline-separated text, or a prompt asking for
        an upload when no file was supplied.

    Raises:
        OSError: If an uploaded path cannot be opened.
    """
    # Covers both "nothing uploaded" (None) and an empty selection ([]).
    if not files:
        return "Please upload a file to generate synthetic data."

    if not isinstance(files, (list, tuple)):
        files = [files]

    sentences = []
    for upload in files:
        # Pass raw bytes on: tokenize_sentences does the decoding, so
        # decoding here as well would decode twice and fail on the str.
        sentences.extend(tokenize_sentences(_read_upload(upload)))

    return generate_synthetic_data(prompt, sentences, max_data_size, toxicity_level, use_emoji)
|
|
|
# Input widgets for the interface.  They are listed in the same order as the
# parameters of generate(): prompt, max_data_size, toxicity_level, use_emoji,
# files.
additional_inputs = [
    # Free-text prompt, limited to a single line.
    gr.Textbox(label="Prompt for Synthetic Data Generation", max_lines=1, interactive=True),
    # Sentence cap: 10 to 1000 in steps of 10, starting at 100.
    gr.Slider(
        label="Max Data Size",
        value=100,
        minimum=10,
        maximum=1000,
        step=10,
        interactive=True,
        info="The maximum number of sentences to include in the synthetic data",
    ),
    # Toxicity selector; "Neutral" is preselected.
    gr.Radio(
        label="Toxicity Level",
        choices=["High", "Low", "Neutral"],
        value="Neutral",
        interactive=True,
        info="Adjust the toxicity level of the synthetic data",
    ),
    # Emoji toggle, off by default.
    gr.Checkbox(
        label="Use Emoji",
        value=False,
        interactive=True,
        info="Add or remove emojis in the synthetic data",
    ),
    # One or more source documents.
    gr.File(
        label="Upload PDF or Document",
        file_count="multiple",
        file_types=[".pdf", ".doc", ".docx", ".txt"],
        interactive=True,
    ),
]
|
|
|
# Wire generate() to the input widgets with a plain-text output, then start
# the app with the API page hidden.
demo = gr.Interface(
    fn=generate,
    inputs=additional_inputs,
    outputs="text",
    title="Synthetic-data-generation-aze",
    description="Generate synthetic data from uploaded files based on a given prompt and customization options.",
)
demo.launch(show_api=False)