# NOTE(review): removed non-code extraction artifacts that preceded the script
# (a "File size" line, a row of git-blame commit hashes, and a line-number gutter).
from huggingface_hub import InferenceClient
import gradio as gr
import re
from nltk.tokenize import sent_tokenize
# Hugging Face inference client for the Mixtral-8x7B-Instruct model.
# NOTE(review): `client` is never used by any function below — generation is
# pure string formatting. Confirm whether a real model call was intended.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def tokenize_sentences(file_content):
    """Split raw file content into a list of sentences.

    Parameters
    ----------
    file_content : str | bytes
        Document text. ``bytes`` input is decoded as UTF-8 (invalid bytes
        replaced) before tokenization; ``str`` input is used as-is.

    Returns
    -------
    list[str]
        Sentences produced by NLTK's ``sent_tokenize``.
    """
    # Bug fix: generate() already decodes the uploaded bytes before calling
    # this function, and str has no .decode() in Python 3 — the original
    # unconditional file_content.decode() raised AttributeError. Only decode
    # when we actually receive bytes.
    if isinstance(file_content, bytes):
        file_content = file_content.decode("utf-8", errors="replace")
    return sent_tokenize(file_content)
def generate_synthetic_data(prompt, sentences, data_size, toxicity_level, use_emoji):
    """Build one synthetic line per input sentence, joined with newlines.

    Each of the first ``data_size`` sentences is prefixed with ``prompt``,
    then passed through the toxicity adjuster ("High" adds, "Low" removes,
    anything else leaves it alone) and the emoji adjuster (added when
    ``use_emoji`` is truthy, removed otherwise).
    """

    def _transform(sentence):
        # Prefix the instruction prompt onto the raw sentence.
        text = f"{prompt}: {sentence}"
        # Toxicity adjustment — "Neutral" (or any other value) is a no-op.
        if toxicity_level == "High":
            text = add_toxic_content(text)
        elif toxicity_level == "Low":
            text = remove_toxic_content(text)
        # Emoji adjustment is always applied one way or the other.
        return add_emojis(text) if use_emoji else remove_emojis(text)

    return "\n".join(_transform(s) for s in sentences[:data_size])
def add_toxic_content(text):
    """Placeholder for injecting toxic phrasing into *text*.

    Currently a no-op: returns the input unchanged.
    TODO: implement the actual transformation.
    """
    return text
def remove_toxic_content(text):
    """Placeholder for stripping toxic phrasing from *text*.

    Currently a no-op: returns the input unchanged.
    TODO: implement the actual transformation.
    """
    return text
def add_emojis(text):
    """Placeholder for decorating *text* with emojis.

    Currently a no-op: returns the input unchanged.
    TODO: implement the actual transformation.
    """
    return text
def remove_emojis(text):
    """Placeholder for stripping emojis from *text*.

    Currently a no-op: returns the input unchanged.
    TODO: implement the actual transformation.
    """
    return text
def generate(prompt, max_data_size=100, toxicity_level="Neutral", use_emoji=False, files=None):
    """Gradio entry point: build synthetic data from the uploaded files.

    Parameters
    ----------
    prompt : str
        Instruction prefixed onto every generated sentence.
    max_data_size : int
        Maximum number of sentences to include (default 100).
    toxicity_level : str
        "High", "Low", or "Neutral" — passed to generate_synthetic_data.
    use_emoji : bool
        Whether to add (True) or remove (False) emojis.
    files : list | None
        Uploaded file contents; assumes each element is the raw bytes of
        one file — TODO confirm against the gr.File component's output type.

    Returns
    -------
    str
        Newline-joined synthetic sentences, or a usage hint when no files
        were supplied.
    """
    # Robustness: treat an empty upload list the same as no upload at all
    # (the original `is not None` check let [] fall through and return "").
    if not files:
        return "Please upload a file to generate synthetic data."
    sentences = []
    for file in files:
        # Bug fix: the original decoded here AND tokenize_sentences decoded
        # again, which raises AttributeError on str in Python 3. Pass the raw
        # content straight through and let tokenize_sentences do the decoding.
        sentences.extend(tokenize_sentences(file))
    return generate_synthetic_data(prompt, sentences, max_data_size, toxicity_level, use_emoji)
# Gradio input widgets for the Interface below. The order must match the
# positional parameters of generate(): prompt, max_data_size,
# toxicity_level, use_emoji, files.
additional_inputs=[
    # -> prompt: free-text instruction prefixed onto each sentence.
    gr.Textbox(
        label="Prompt for Synthetic Data Generation",
        max_lines=1,
        interactive=True,
    ),
    # -> max_data_size: cap on how many sentences are emitted.
    gr.Slider(
        label="Max Data Size",
        value=100,
        minimum=10,
        maximum=1000,
        step=10,
        interactive=True,
        info="The maximum number of sentences to include in the synthetic data",
    ),
    # -> toxicity_level: choices mirror the branches in generate_synthetic_data.
    gr.Radio(
        label="Toxicity Level",
        choices=["High", "Low", "Neutral"],
        value="Neutral",
        interactive=True,
        info="Adjust the toxicity level of the synthetic data",
    ),
    # -> use_emoji: toggles add_emojis vs remove_emojis.
    gr.Checkbox(
        label="Use Emoji",
        value=False,
        interactive=True,
        info="Add or remove emojis in the synthetic data",
    ),
    # -> files: multiple uploads allowed; generate() reads their content.
    gr.File(
        label="Upload PDF or Document",
        file_count="multiple",
        file_types=[".pdf", ".doc", ".docx", ".txt"],
        interactive=True,
    )
]
# Build and launch the web UI. generate() receives the widget values in the
# order they are declared in additional_inputs.
# Fix: removed a stray " |" fused onto the end of the launch() line (an
# extraction artifact that made the script a syntax error).
gr.Interface(
    fn=generate,
    inputs=additional_inputs,
    outputs="text",
    title="Synthetic-data-generation-aze",
    description="Generate synthetic data from uploaded files based on a given prompt and customization options.",
).launch(show_api=False)  # show_api=False hides the API usage panel