File size: 3,095 Bytes
838b223
cd650c7
9cbb806
418de0b
 
838b223
97425d1
838b223
18cb91d
 
09b14bf
adba430
923f75f
664305c
8115786
a1f8d56
664305c
 
 
adba430
838b223
 
 
97425d1
664305c
 
838b223
fa4d0d9
838b223
18cb91d
 
 
 
a5056fa
18cb91d
 
 
 
 
 
 
 
d53066f
18cb91d
838b223
18cb91d
 
97425d1
a5056fa
838b223
 
 
 
 
7b026a2
18cb91d
 
fa4d0d9
838b223
18cb91d
34421df
664305c
9cbb806
838b223
418de0b
1fd65af
418de0b
664305c
1fd65af
 
 
 
b3b73c4
664305c
418de0b
 
adba430
09b14bf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import json
from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
from io import BytesIO
import os
import io
import tempfile
import re

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

def extract_sentences_from_excel(file):
    df = pd.read_excel(file)
    text = ' '.join(df['Unnamed: 1'].astype(str))
    sentences = text.split('.')
    sentences = [s.strip() for s in sentences if s.strip()]
    return sentences

def save_to_json(data, filename="synthetic_data.json"):
    with open(filename, mode='a', encoding='utf-8') as file:
        json.dump(data, file, indent=4, ensure_ascii=False)

def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
    sentences = extract_sentences_from_excel(file)
    data = []

    with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as tmp:
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            generate_kwargs = {
                "temperature": temperature,
                "max_new_tokens": max_new_tokens,
                "top_p": top_p,
                "repetition_penalty": repetition_penalty,
                "do_sample": True,
                "seed": 42,
            }

            try:
                stream = client.text_generation(f"{prompt} Output the response in JSON format.", **generate_kwargs, stream=True, details=True, return_full_text=False)
                output = ""
                for response in stream:
                    output += response.token.text

                try:
                    json_output = json.loads(output)
                    data.append({"original_sentence": sentence, "generated_sentence": json_output})
                except json.JSONDecodeError:
                    print(f"Error decoding JSON for sentence '{sentence}': {output}")

            except Exception as e:
                print(f"Error generating data for sentence '{sentence}': {e}")

        save_to_json(data, tmp.name)
        tmp_path = tmp.name

    return tmp_path

gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here"),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
    ],
    outputs=gr.File(label="Synthetic Data "),
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
).launch()