File size: 3,391 Bytes
cd650c7 9cbb806 418de0b 18cb91d 09b14bf adba430 923f75f 57e7335 8115786 57e7335 418de0b adba430 57e7335 418de0b fa4d0d9 18cb91d 09b14bf 7b026a2 18cb91d a5056fa 18cb91d d53066f 18cb91d 57e7335 18cb91d 57e7335 a5056fa 18cb91d a5056fa 79a6f49 09b14bf 7b026a2 18cb91d fa4d0d9 18cb91d 34421df 18cb91d 9cbb806 18cb91d 418de0b 1fd65af 418de0b 57e7335 1fd65af 79a6f49 1fd65af 18cb91d 418de0b adba430 09b14bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
import csv
import tempfile
import re
# Module-level inference client for the Mixtral-8x7B-Instruct model; generate()
# calls its text_generation() method once per input sentence.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def extract_text_from_excel(file, column_name):
    """Read one column of an Excel workbook and return it as a single string.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.
        column_name: Label of the column whose cells are concatenated. If the
            exact label is absent, the label with surrounding whitespace
            stripped is tried as well (the value typically comes from a free
            text input).

    Returns:
        The non-empty cells of the column, converted to ``str`` and joined
        with single spaces, in row order.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
        KeyError: If the column is not present in the sheet that
            ``pandas.read_excel`` loads by default.
    """
    if file is None:
        raise ValueError("No Excel file was provided.")

    df = pd.read_excel(file)

    if column_name not in df.columns:
        stripped = column_name.strip() if isinstance(column_name, str) else column_name
        if stripped not in df.columns:
            raise KeyError(
                f"Column {column_name!r} not found; available columns: {list(df.columns)}"
            )
        column_name = stripped

    # Drop empty cells before stringifying: astype(str) would otherwise turn
    # every missing value into the literal text "nan".
    return ' '.join(df[column_name].dropna().astype(str))
# Boundary between generated sentences: whitespace that follows '.', '!', '?'
# or ':'. Compiled once instead of on every loop iteration.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[\.\!\?:])[\s\n]+')

# Smallest temperature forwarded to the endpoint. The UI slider allows 0.0,
# which text-generation backends are expected to reject as non-positive.
_MIN_TEMPERATURE = 1e-2


def _chunk_text(chunk):
    """Return the text carried by one streamed chunk (plain str or detail object)."""
    if isinstance(chunk, str):
        # stream=True without details yields the generated text as plain strings.
        return chunk
    token = getattr(chunk, "token", None)
    if token is not None:
        return getattr(token, "text", "") or ""
    return getattr(chunk, "text", "") or ""


def _split_generated(output):
    """Split model output into stripped, non-empty sentences, dropping lone periods."""
    pieces = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(output))
    return [piece for piece in pieces if piece and piece != '.']


def generate(file, column_name, temperature, max_new_tokens, top_p, repetition_penalty, num_similar_sentences):
    """Generate synthetic sentences for each sentence found in an Excel column.

    The column text is split on '.', shuffled, and every non-empty sentence is
    sent as a prompt to the module-level ``client``. The model output is split
    into sentences and up to ``num_similar_sentences`` of them, chosen at
    random without replacement, are written next to the original sentence.

    Args:
        file: Uploaded Excel file, passed to ``extract_text_from_excel``.
        column_name: Column holding the source text.
        temperature: Sampling temperature; raised to a small positive minimum.
        max_new_tokens: Maximum number of tokens to generate per prompt.
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.
        num_similar_sentences: Maximum rows written per original sentence.

    Returns:
        Path of a temporary CSV file with the columns 'Original Sentence' and
        'Generated Sentence'. The file is not deleted automatically.
    """
    text = extract_text_from_excel(file, column_name)
    sentences = text.split('.')
    random.shuffle(sentences)  # Shuffle sentences

    # Loop-invariant request options; numeric UI values are normalised once.
    generate_kwargs = {
        "temperature": max(float(temperature), _MIN_TEMPERATURE),
        "max_new_tokens": int(max_new_tokens),
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }
    wanted = max(0, int(num_similar_sentences))

    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    with tempfile.NamedTemporaryFile(
        mode='w', newline='', delete=False, suffix='.csv', encoding='utf-8'
    ) as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            try:
                stream = client.text_generation(sentence, **generate_kwargs, stream=True, return_full_text=False)
                output = "".join(_chunk_text(chunk) for chunk in stream)
            except Exception as e:
                # Best effort: a failed request skips this sentence only.
                print(f"Error generating data for sentence '{sentence}': {e}")
                continue

            candidates = _split_generated(output)
            for generated_sentence in random.sample(candidates, min(wanted, len(candidates))):
                writer.writerow({'Original Sentence': sentence, 'Generated Sentence': generated_sentence})

        tmp_path = tmp.name

    return tmp_path
# Web UI: upload an Excel file, name the text column, tune the sampling
# parameters, and download the CSV produced by generate().
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        # A column name is a single line of text, so a plain Textbox is used.
        gr.Textbox(label="Column Name", placeholder="Enter the column name"),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
        gr.Slider(label="Number of similar sentences", value=10, minimum=1, maximum=20, step=1, interactive=True, info="Number of similar sentences to generate for each original sentence"),
    ],
    outputs=gr.File(label="Synthetic Data "),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
)
demo.launch()