# data_gen / app.py
# ramalMr's picture
# Update app.py
# 7bc64c3 verified
# raw
# history blame
# 3.22 kB
from huggingface_hub import InferenceClient
import gradio as gr
import re
from nltk.tokenize import sent_tokenize
# Hugging Face inference client bound to the Mixtral instruct model.
# NOTE(review): no code visible in this file uses `client` yet — confirm
# whether it is still needed.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def tokenize_sentences(file_content):
    """Split the text of one uploaded file into sentences.

    Args:
        file_content: The file's text, either as raw bytes (decoded with
            the default UTF-8 codec) or as an already-decoded ``str``.

    Returns:
        The list of sentences produced by NLTK's ``sent_tokenize``.

    Raises:
        UnicodeDecodeError: If ``file_content`` is bytes that are not
            valid UTF-8.
    """
    # Callers may hand over text that is already decoded; ``str`` has no
    # ``.decode()``, so only bytes-like input is decoded here.
    if isinstance(file_content, (bytes, bytearray)):
        file_content = file_content.decode()
    return sent_tokenize(file_content)
def generate_synthetic_data(prompt, sentences, data_size, toxicity_level, use_emoji):
    """Build newline-separated synthetic lines from the leading sentences.

    Only the first ``data_size`` entries of ``sentences`` are used. Each one
    is prefixed with ``prompt`` and then passed through the toxicity and
    emoji helpers.

    Args:
        prompt: Instruction text placed in front of every sentence.
        sentences: Source sentences to transform.
        data_size: Upper bound on how many sentences are consumed.
        toxicity_level: ``"High"`` routes the line through
            ``add_toxic_content``, ``"Low"`` through
            ``remove_toxic_content``; any other value leaves it untouched.
        use_emoji: Truthy selects ``add_emojis``, falsy ``remove_emojis``.

    Returns:
        A single string with one synthetic line per consumed sentence,
        joined by ``"\\n"``.
    """

    def _synthesize(source_sentence):
        # Prefix the sentence with the prompt instructions.
        line = f"{prompt}: {source_sentence}"

        # Toxicity adjustment; "Neutral" (or anything else) is a no-op.
        if toxicity_level == "High":
            line = add_toxic_content(line)
        elif toxicity_level == "Low":
            line = remove_toxic_content(line)

        # Exactly one of the two emoji helpers always runs.
        emoji_step = add_emojis if use_emoji else remove_emojis
        return emoji_step(line)

    selected = sentences[:data_size]
    return "\n".join([_synthesize(item) for item in selected])
def add_toxic_content(text):
    """Placeholder for making ``text`` more toxic.

    Not implemented yet: the input is handed back unchanged.
    """
    # TODO: add the actual toxicity-increasing transformation.
    return text
def remove_toxic_content(text):
    """Placeholder for stripping toxic content from ``text``.

    Not implemented yet: the input is handed back unchanged.
    """
    # TODO: add the actual toxicity-removal transformation.
    return text
def add_emojis(text):
    """Placeholder for decorating ``text`` with emojis.

    Not implemented yet: the input is handed back unchanged.
    """
    # TODO: add the actual emoji-insertion logic.
    return text
def remove_emojis(text):
    """Placeholder for stripping emojis out of ``text``.

    Not implemented yet: the input is handed back unchanged.
    """
    # TODO: add the actual emoji-removal logic.
    return text
def _read_upload_bytes(upload):
    """Return the raw bytes of a single uploaded-file value.

    Accepts raw bytes as-is; anything else is treated as a filesystem path,
    or as a file wrapper that exposes its path through a ``name`` attribute,
    and is read from disk in binary mode.
    """
    if isinstance(upload, (bytes, bytearray)):
        return bytes(upload)
    path = getattr(upload, "name", upload)
    with open(path, "rb") as handle:
        return handle.read()


def generate(prompt, max_data_size=100, toxicity_level="Neutral", use_emoji=False, files=None):
    """Produce synthetic data from the sentences of the uploaded files.

    Args:
        prompt: Instruction text prefixed to every generated line.
        max_data_size: Maximum number of sentences to use; converted to
            ``int`` because it is used as a slice bound downstream.
        toxicity_level: Forwarded to ``generate_synthetic_data``.
        use_emoji: Forwarded to ``generate_synthetic_data``.
        files: Uploaded files, each given as raw bytes, a path string, or a
            file wrapper with a ``name`` attribute. ``None`` or an empty
            list means nothing was uploaded.

    Returns:
        The generated text, or a message asking for an upload when no files
        were provided.
    """
    # Guard clause: covers both "no value" and "empty selection".
    if not files:
        return "Please upload a file to generate synthetic data."

    sentences = []
    for upload in files:
        # Pass bytes on and let tokenize_sentences do the single decode;
        # decoding here as well would decode the content twice.
        sentences.extend(tokenize_sentences(_read_upload_bytes(upload)))

    return generate_synthetic_data(
        prompt, sentences, int(max_data_size), toxicity_level, use_emoji
    )
# Input widgets for the interface. Their order must match the positional
# parameters of `generate`: prompt, max_data_size, toxicity_level,
# use_emoji, files.
additional_inputs = [
    # Free-text instruction applied to every sentence.
    gr.Textbox(
        label="Prompt for Synthetic Data Generation",
        interactive=True,
        max_lines=1,
    ),
    # Upper bound on the number of sentences used.
    gr.Slider(
        label="Max Data Size",
        info="The maximum number of sentences to include in the synthetic data",
        minimum=10,
        maximum=1000,
        step=10,
        value=100,
        interactive=True,
    ),
    # Toxicity adjustment applied to each generated line.
    gr.Radio(
        label="Toxicity Level",
        info="Adjust the toxicity level of the synthetic data",
        choices=["High", "Low", "Neutral"],
        value="Neutral",
        interactive=True,
    ),
    # Whether emojis are added to or removed from each generated line.
    gr.Checkbox(
        label="Use Emoji",
        info="Add or remove emojis in the synthetic data",
        value=False,
        interactive=True,
    ),
    # Source documents.
    # NOTE(review): .pdf/.doc/.docx are accepted here, but the processing
    # code in this file decodes upload content as plain text — confirm that
    # these binary formats are really supported.
    gr.File(
        label="Upload PDF or Document",
        file_types=[".pdf", ".doc", ".docx", ".txt"],
        file_count="multiple",
        interactive=True,
    ),
]
# Build the web UI around `generate` and start serving it immediately;
# `show_api=False` is passed through to `launch`.
gr.Interface(
    title="Synthetic-data-generation-aze",
    description="Generate synthetic data from uploaded files based on a given prompt and customization options.",
    fn=generate,
    inputs=additional_inputs,
    outputs="text",
).launch(show_api=False)