Spaces:

ramalMr
/

data_gen

Sleeping

App Files Files Community

data_gen / app.py

ramalMr

Update app.py

7bc64c3 verified 12 months ago

raw

history blame

3.22 kB

	from huggingface_hub import InferenceClient
	import gradio as gr
	import re
	from nltk.tokenize import sent_tokenize

	# Inference client bound to the Mixtral instruct checkpoint. Nothing in the
	# visible part of this file uses it after it is created.
	client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

	def tokenize_sentences(file_content):
	    """Split the text of one uploaded file into sentences.

	    Args:
	        file_content: The file's text, either as UTF-8 encoded ``bytes`` /
	            ``bytearray`` or as an already-decoded ``str``.

	    Returns:
	        The list of sentences produced by ``sent_tokenize``; an empty list
	        when the content is empty or whitespace only.

	    Raises:
	        UnicodeDecodeError: If byte input is not valid UTF-8.
	    """
	    # Callers may hand over text that is already decoded; ``str`` has no
	    # ``decode`` method, so only byte input is decoded here.
	    if isinstance(file_content, (bytes, bytearray)):
	        text = file_content.decode()
	    else:
	        text = file_content
	    # Nothing to split, so skip the tokenizer call entirely.
	    if not text.strip():
	        return []
	    return sent_tokenize(text)

	def generate_synthetic_data(prompt, sentences, data_size, toxicity_level, use_emoji):
	    """Turn the leading sentences into newline-separated synthetic lines.

	    Args:
	        prompt: Text placed in front of every sentence, followed by ``": "``.
	        sentences: Source sentences; only the first ``data_size`` are used.
	        data_size: Upper bound on how many sentences are processed.
	        toxicity_level: ``"High"`` routes each line through
	            ``add_toxic_content``, ``"Low"`` through ``remove_toxic_content``;
	            any other value leaves the line as is.
	        use_emoji: When truthy each line goes through ``add_emojis``,
	            otherwise through ``remove_emojis``.

	    Returns:
	        All processed lines joined with ``"\\n"``.
	    """

	    def render(sentence):
	        # One output line: prefix with the prompt, then toxicity, then emoji.
	        line = f"{prompt}: {sentence}"
	        if toxicity_level == "High":
	            line = add_toxic_content(line)
	        elif toxicity_level == "Low":
	            line = remove_toxic_content(line)
	        emoji_step = add_emojis if use_emoji else remove_emojis
	        return emoji_step(line)

	    return "\n".join(map(render, sentences[:data_size]))

	def add_toxic_content(text):
	    """Placeholder hook for making ``text`` more toxic.

	    No transformation is implemented; the input is returned unchanged.
	    """
	    return text

	def remove_toxic_content(text):
	    """Placeholder hook for stripping toxic content from ``text``.

	    No filtering is implemented; the input is returned unchanged.
	    """
	    return text

	def add_emojis(text):
	    """Placeholder hook for decorating ``text`` with emojis.

	    No decoration is implemented; the input is returned unchanged.
	    """
	    return text

	# Code-point ranges treated as emoji. This is an approximation covering the
	# common pictograph blocks, not the full Unicode emoji specification.
	_EMOJI_CHARS = (
	    "\U0001F1E6-\U0001F1FF"  # regional indicators (flag halves)
	    "\U0001F300-\U0001F5FF"  # symbols & pictographs, incl. skin-tone modifiers
	    "\U0001F600-\U0001F64F"  # emoticons
	    "\U0001F680-\U0001F6FF"  # transport & map symbols
	    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
	    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
	    "\u2600-\u26FF"  # miscellaneous symbols
	    "\u2700-\u27BF"  # dingbats
	)
	# An emoji, an optional variation selector, then any zero-width-joiner
	# continuations. The joiner is matched only inside such a sequence so that
	# joiners used by ordinary text are left alone.
	_EMOJI_RE = re.compile(
	    f"[{_EMOJI_CHARS}]\uFE0F?(?:\u200D[{_EMOJI_CHARS}]\uFE0F?)*"
	)


	def remove_emojis(text):
	    """Return ``text`` with emoji characters and emoji sequences removed.

	    Only the emoji themselves are dropped; surrounding whitespace is kept,
	    so an emoji between two spaces leaves both spaces behind.

	    Args:
	        text: The string to clean.

	    Returns:
	        ``text`` without any character sequence matched by ``_EMOJI_RE``.
	    """
	    return _EMOJI_RE.sub("", text)

	def _read_upload(upload):
	    """Return the raw bytes of one uploaded file.

	    Accepts in-memory bytes, a filesystem path, or a file-like wrapper that
	    exposes its path as ``.name``.
	    """
	    import os  # local import: only this helper needs it

	    if isinstance(upload, (bytes, bytearray)):
	        return bytes(upload)
	    path = upload if isinstance(upload, (str, os.PathLike)) else upload.name
	    with open(path, "rb") as handle:
	        return handle.read()


	def generate(prompt, max_data_size=100, toxicity_level="Neutral", use_emoji=False, files=None):
	    """Produce synthetic text from the uploaded files.

	    Args:
	        prompt: Text prefixed to every generated line.
	        max_data_size: Maximum number of sentences to use; coerced to ``int``
	            because it ends up as a slice bound.
	        toxicity_level: ``"High"``, ``"Low"`` or ``"Neutral"``; forwarded to
	            ``generate_synthetic_data``.
	        use_emoji: Forwarded to ``generate_synthetic_data``.
	        files: Uploaded files as bytes, paths, or objects with a ``.name``
	            path. ``None`` or an empty list means nothing was uploaded.

	    Returns:
	        The newline-joined synthetic data, or a prompt to upload a file when
	        no files were given.

	    Raises:
	        UnicodeDecodeError: If a file's content is not valid UTF-8 text;
	            raised while the content is being tokenized.
	    """
	    # An empty selection is treated the same as no upload at all.
	    if not files:
	        return "Please upload a file to generate synthetic data."

	    sentences = []
	    for upload in files:
	        # Pass bytes through: tokenize_sentences performs the decoding, so
	        # decoding here as well would fail on the second ``decode`` call.
	        sentences.extend(tokenize_sentences(_read_upload(upload)))
	    return generate_synthetic_data(
	        prompt, sentences, int(max_data_size), toxicity_level, use_emoji
	    )

	# Input widgets, created in the same order as generate()'s parameters:
	# prompt, max_data_size, toxicity_level, use_emoji, files.
	prompt_box = gr.Textbox(
	    label="Prompt for Synthetic Data Generation",
	    max_lines=1,
	    interactive=True,
	)
	size_slider = gr.Slider(
	    label="Max Data Size",
	    value=100,
	    minimum=10,
	    maximum=1000,
	    step=10,
	    interactive=True,
	    info="The maximum number of sentences to include in the synthetic data",
	)
	toxicity_radio = gr.Radio(
	    label="Toxicity Level",
	    choices=["High", "Low", "Neutral"],
	    value="Neutral",
	    interactive=True,
	    info="Adjust the toxicity level of the synthetic data",
	)
	emoji_checkbox = gr.Checkbox(
	    label="Use Emoji",
	    value=False,
	    interactive=True,
	    info="Add or remove emojis in the synthetic data",
	)
	file_upload = gr.File(
	    label="Upload PDF or Document",
	    file_count="multiple",
	    file_types=[".pdf", ".doc", ".docx", ".txt"],
	    interactive=True,
	)

	additional_inputs = [
	    prompt_box,
	    size_slider,
	    toxicity_radio,
	    emoji_checkbox,
	    file_upload,
	]

	# Wire generate() to the input widgets with a plain-text output, then start
	# the app with the API page disabled.
	demo = gr.Interface(
	    fn=generate,
	    inputs=additional_inputs,
	    outputs="text",
	    title="Synthetic-data-generation-aze",
	    description="Generate synthetic data from uploaded files based on a given prompt and customization options.",
	)
	demo.launch(show_api=False)