File size: 2,906 Bytes
cd650c7
9cbb806
418de0b
 
 
 
adba430
923f75f
418de0b
8115786
418de0b
 
adba430
418de0b
 
 
fa4d0d9
7b026a2
 
 
 
 
 
a5056fa
7b026a2
 
 
 
 
 
 
 
d53066f
7b026a2
418de0b
 
 
 
a5056fa
418de0b
 
a5056fa
7b026a2
 
 
418de0b
fa4d0d9
418de0b
 
 
 
 
34421df
418de0b
 
 
 
 
9cbb806
8115786
418de0b
1fd65af
418de0b
1fd65af
 
 
 
 
418de0b
 
 
adba430
9cbb806
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import csv
import random
from io import BytesIO

import gradio as gr
import pandas as pd
from huggingface_hub import InferenceClient

# Module-level Hugging Face Hub inference client; generate_sentences() sends
# every prompt through this single instance.
_MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"
client = InferenceClient(_MODEL_ID)

def extract_text_from_excel(file):
    """Read an Excel sheet and join its ``data`` column into one string.

    Args:
        file: Whatever ``pandas.read_excel`` accepts (a path or a file-like
            object).

    Returns:
        The non-blank cells of the ``data`` column, each converted to ``str``
        and joined with single spaces. An empty string if the column has no
        non-blank cells.

    Raises:
        KeyError: If the sheet has no column named ``data``.
    """
    df = pd.read_excel(file)
    if 'data' not in df.columns:
        # Same exception type a plain df['data'] lookup raises, but with a
        # message that tells the uploader what is wrong with the sheet.
        raise KeyError(
            f"Expected a 'data' column in the uploaded sheet; found {list(df.columns)}"
        )

    # Blank cells are read as NaN; without dropna() they would be stringified
    # to the literal text "nan" and end up inside the prompts.
    cells = df['data'].dropna().astype(str)
    return ' '.join(cells)

def generate_sentences(text, temperature, max_new_tokens, top_p, repetition_penalty):
    """Generate model continuations for each sentence found in *text*.

    *text* is split on ``'.'``, the pieces are shuffled, and every non-empty
    piece is sent as a prompt through the module-level ``client``. Each
    streamed completion is split on ``'.'`` again and paired with the prompt
    that produced it.

    Args:
        text: Source text; sentences are delimited by ``'.'``.
        temperature: Sampling temperature. Values below 0.01 are raised to
            0.01.
        max_new_tokens: Upper bound on generated tokens per prompt. Converted
            to ``int`` and raised to at least 1.
        top_p: Nucleus-sampling threshold, forwarded as a float.
        repetition_penalty: Repetition penalty, forwarded as a float.

    Returns:
        A list of ``(original_sentence, generated_sentence)`` tuples. Prompts
        whose request fails contribute no rows; the error is printed.
    """
    sentences = text.split('.')
    random.shuffle(sentences)  # Randomize the order in which prompts are sent

    # The sampling settings do not depend on the sentence, so build them once.
    generate_kwargs = {
        # The UI sliders can deliver temperature 0.0 and max_new_tokens 0.
        # NOTE(review): text-generation backends reject non-positive values
        # for both when sampling, which would make every request fail; clamp
        # to the smallest usable values instead.
        "temperature": max(float(temperature), 1e-2),
        "max_new_tokens": max(int(max_new_tokens), 1),
        "top_p": float(top_p),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": True,
        "seed": 42,
    }

    generated_data = []

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        try:
            stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
            # Skip tokens flagged as special (e.g. an end-of-sequence marker)
            # so they do not leak into the generated sentences.
            output = "".join(
                response.token.text
                for response in stream
                if not getattr(response.token, "special", False)
            )
        except Exception as e:
            # Best-effort: one failed prompt must not abort the whole batch.
            print(f"Error generating data for sentence '{sentence}': {e}")
            continue

        pieces = (piece.strip() for piece in output.split('.'))
        generated_data.extend((sentence, piece) for piece in pieces if piece)

    return generated_data

def save_to_csv(data, filename="synthetic_data.csv"):
    """Write sentence pairs to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of rows, each an ``(original, generated)`` pair.
        filename: Destination path; an existing file is overwritten.

    Returns:
        *filename*, so callers can pass the written path on directly.
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, mode='w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(['Original Sentence', 'Generated Sentence'])
        writer.writerows(data)
    return filename

def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
    """Run the full pipeline for one upload: Excel -> generations -> CSV.

    Args:
        file: The uploaded Excel file, passed to ``extract_text_from_excel``.
        temperature: Forwarded to ``generate_sentences``.
        max_new_tokens: Forwarded to ``generate_sentences``.
        top_p: Forwarded to ``generate_sentences``.
        repetition_penalty: Forwarded to ``generate_sentences``.

    Returns:
        Path of the CSV file that was written, used as the value of the
        file output component.
    """
    # Keep the path in one local so the file that is written and the file
    # that is returned cannot drift apart.
    output_path = "synthetic_data.csv"

    text = extract_text_from_excel(file)
    data = generate_sentences(text, temperature, max_new_tokens, top_p, repetition_penalty)
    save_to_csv(data, output_path)

    # A plain file path is enough for a File output; no update object needed.
    return output_path

# Input widgets, listed in the same order as generate()'s parameters.
excel_upload = gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"])
temperature_slider = gr.Slider(
    label="Temperature",
    value=0.9,
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    interactive=True,
    info="Higher values produce more diverse outputs",
)
max_tokens_slider = gr.Slider(
    label="Max new tokens",
    value=256,
    minimum=0,
    maximum=5120,
    step=64,
    interactive=True,
    info="The maximum numbers of new tokens",
)
top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    value=0.95,
    minimum=0.0,
    maximum=1,
    step=0.05,
    interactive=True,
    info="Higher values sample more low-probability tokens",
)
repetition_slider = gr.Slider(
    label="Repetition penalty",
    value=1.0,
    minimum=1.0,
    maximum=2.0,
    step=0.1,
    interactive=True,
    info="Penalize repeated tokens",
)

# Build the app around generate() and start serving it immediately.
demo = gr.Interface(
    fn=generate,
    inputs=[
        excel_upload,
        temperature_slider,
        max_tokens_slider,
        top_p_slider,
        repetition_slider,
    ],
    outputs=gr.File(label="Synthetic Data"),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
)
demo.launch()