# File size: 2,906 Bytes
# cd650c7 9cbb806 418de0b adba430 923f75f 418de0b 8115786 418de0b adba430 418de0b fa4d0d9 7b026a2 a5056fa 7b026a2 d53066f 7b026a2 418de0b a5056fa 418de0b a5056fa 7b026a2 418de0b fa4d0d9 418de0b 34421df 418de0b 9cbb806 8115786 418de0b 1fd65af 418de0b 1fd65af 418de0b adba430 9cbb806
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
import csv
import random
from io import BytesIO

import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient
# Module-level Hugging Face inference client bound to the Mixtral-8x7B-Instruct
# model; generate_sentences() issues every text-generation request through it.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def extract_text_from_excel(file, column="data"):
    """Read one column of an Excel sheet and join its cells into one string.

    Args:
        file: Path or binary file-like object accepted by
            ``pandas.read_excel``.
        column: Name of the column holding the source text. Defaults to
            ``"data"``, the column this function has always read.

    Returns:
        The non-missing cells of ``column``, converted to ``str`` and joined
        with single spaces (an empty string when the column has no values).

    Raises:
        KeyError: If the sheet has no column named ``column``.
    """
    df = pd.read_excel(file)
    if column not in df.columns:
        raise KeyError(
            f"Column {column!r} not found in the uploaded sheet; "
            f"available columns: {list(df.columns)}"
        )
    # Blank cells are read as NaN; without dropna() they would be stringified
    # to the literal "nan" and end up in the text used to build prompts.
    return " ".join(df[column].dropna().astype(str))
def generate_sentences(text, temperature, max_new_tokens, top_p, repetition_penalty):
    """Generate synthetic sentences for every sentence found in ``text``.

    ``text`` is split on ``'.'``, the pieces are shuffled, and each non-blank
    piece is sent as a prompt to the module-level ``client``. The streamed
    completion is split on ``'.'`` again and every non-blank piece is paired
    with the prompt that produced it.

    Args:
        text: Source text whose sentences are used as prompts.
        temperature: Sampling temperature; values below ``0.01`` are raised
            to ``0.01``.
        max_new_tokens: Upper bound on generated tokens per prompt.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A list of ``(original_sentence, generated_sentence)`` tuples. A prompt
        whose request fails is reported on stdout and contributes no rows.
    """
    # The UI slider allows 0.0, but sampling backends are expected to require
    # a strictly positive temperature; clamp so such requests do not all fail.
    temperature = max(float(temperature), 1e-2)

    # Identical for every prompt, so build it once outside the loop.
    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    sentences = text.split('.')
    random.shuffle(sentences)  # Shuffle sentences

    generated_data = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        try:
            stream = client.text_generation(
                sentence,
                **generate_kwargs,
                stream=True,
                details=True,
                return_full_text=False,
            )
            # Consume the whole stream inside the try: network errors can
            # surface while iterating, not only when the request is created.
            output = "".join(response.token.text for response in stream)
        except Exception as e:
            # Deliberately best-effort: one failed prompt must not abort the run.
            print(f"Error generating data for sentence '{sentence}': {e}")
            continue

        generated_data.extend(
            (sentence, piece.strip())
            for piece in output.split('.')
            if piece.strip()
        )
    return generated_data
def save_to_csv(data, filename="synthetic_data.csv"):
    """Write sentence pairs to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of ``(original_sentence, generated_sentence)`` rows.
        filename: Destination path; an existing file is overwritten.

    Returns:
        ``filename``, so callers can hand the written path on without
        repeating it.
    """
    # newline='' lets the csv module control line endings itself, which
    # avoids blank lines between rows on Windows.
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Original Sentence', 'Generated Sentence'])
        writer.writerows(data)
    return filename
def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
    """Gradio callback: turn an uploaded Excel file into a synthetic-data CSV.

    Extracts the text from ``file``, generates sentence pairs from it, and
    writes them to ``synthetic_data.csv`` in the working directory.

    Args:
        file: The uploaded Excel file as delivered by the ``gr.File`` input.
        temperature: Sampling temperature forwarded to generation.
        max_new_tokens: Maximum number of new tokens per prompt.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        Path of the written CSV file, for the ``gr.File`` output component.
    """
    # The original returned an undefined name `filename` (NameError). Define
    # the path here and pass it explicitly so write and return always agree.
    output_path = "synthetic_data.csv"
    text = extract_text_from_excel(file)
    data = generate_sentences(text, temperature, max_new_tokens, top_p, repetition_penalty)
    save_to_csv(data, output_path)
    # Returning the path directly avoids `gr.File.update`, which is not
    # available in newer Gradio releases.
    return output_path
# Build and start the web UI: one Excel upload plus four sampling controls in,
# one downloadable CSV out. Input order must match the parameters of generate().
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        gr.Slider(
            label="Temperature",
            value=0.9,
            minimum=0.0,
            maximum=1.0,
            step=0.05,
            interactive=True,
            info="Higher values produce more diverse outputs",
        ),
        gr.Slider(
            label="Max new tokens",
            value=256,
            minimum=0,
            maximum=5120,
            step=64,
            interactive=True,
            info="The maximum numbers of new tokens",
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            value=0.95,
            minimum=0.0,
            maximum=1,
            step=0.05,
            interactive=True,
            info="Higher values sample more low-probability tokens",
        ),
        gr.Slider(
            label="Repetition penalty",
            value=1.0,
            minimum=1.0,
            maximum=2.0,
            step=0.1,
            interactive=True,
            info="Penalize repeated tokens",
        ),
    ],
    outputs=gr.File(label="Synthetic Data"),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
).launch()