Spaces:

ramalMr
/

data_gen

Sleeping

File size: 3,286 Bytes

cd650c7
9cbb806
418de0b
 
beea405
664305c
97425d1
d04019f
18cb91d
 
09b14bf
adba430
923f75f
664305c
8115786
664305c
 
 
 
adba430
664305c
97425d1
 
664305c
97425d1
664305c
 
fa4d0d9
18cb91d
664305c
18cb91d
d04019f
7b026a2
18cb91d
 
 
 
a5056fa
18cb91d
 
 
 
 
 
 
 
d53066f
18cb91d
664305c
18cb91d
 
97425d1
a5056fa
18cb91d
 
a5056fa
664305c
 
7b026a2
18cb91d
 
fa4d0d9
18cb91d
34421df
664305c
9cbb806
d04019f
418de0b
1fd65af
418de0b
664305c
1fd65af
 
 
 
b3b73c4
664305c
418de0b
 
adba430
09b14bf

from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
from io import BytesIO 
import csv
import os
import io 
import tempfile
import re

client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

def extract_sentences_from_excel(file, column="Column_Name"):
    """Read an Excel sheet and split one text column into sentences.

    All non-empty cells of ``column`` are joined with single spaces into one
    text, which is then split on ``'.'``. Because the cells are concatenated
    before splitting, a cell that does not end with a period runs on into the
    next cell's first sentence.

    Args:
        file: Whatever ``pandas.read_excel`` accepts as its first argument.
        column: Header of the column that holds the text. Defaults to
            ``"Column_Name"``, the header this function was originally
            hard-wired to, so existing single-argument calls are unaffected.

    Returns:
        A list of non-empty, whitespace-stripped text fragments.

    Raises:
        KeyError: If the sheet has no column named ``column``.
    """
    df = pd.read_excel(file)
    if column not in df.columns:
        raise KeyError(
            f"Column {column!r} not found in the Excel sheet; "
            f"available columns: {list(df.columns)}"
        )

    # Drop empty cells first: astype(str) would otherwise turn them into the
    # literal text "nan"/"None", which would end up inside the sentences.
    cells = df[column].dropna().astype(str)
    text = ' '.join(cells)

    fragments = (piece.strip() for piece in text.split('.'))
    return [piece for piece in fragments if piece]

def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    """Append a single ``(sentence, output)`` row to a CSV file.

    Args:
        sentence: Value written to the first column.
        output: Value written to the second column.
        filename: Path of the CSV file. It is opened in append mode with
            UTF-8 encoding, so it is created on first use and earlier rows
            are kept.
    """
    row = [sentence, output]
    # newline='' hands line-ending control to the csv module.
    with open(filename, 'a', newline='', encoding='utf-8') as handle:
        csv.writer(handle).writerow(row)

# Splits model output after '.', '!', '?' or ':' when followed by whitespace.
# Compiled once here instead of on every loop iteration.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[\.\!\?:])[\s\n]+')


def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
    """Generate synthetic sentences for every sentence of an Excel file.

    Each sentence returned by ``extract_sentences_from_excel(file)`` is sent
    to the module-level ``client`` as ``f"{prompt} {sentence}"``. The streamed
    completion is split into sentences and every piece is written as one row
    of a temporary CSV file with the columns ``Original Sentence`` and
    ``Generated Sentence``.

    Args:
        file: The uploaded Excel file, passed on unchanged to
            ``extract_sentences_from_excel``.
        prompt: Instruction text placed in front of every source sentence.
        temperature: Sampling temperature.
        max_new_tokens: Upper bound on generated tokens per request.
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        Path of the CSV file. It is created with ``delete=False``, so it is
        still on disk after this function returns.

    Raises:
        ValueError: If ``file`` is ``None`` (nothing was uploaded).
    """
    if file is None:
        raise ValueError("No Excel file was provided.")

    sentences = extract_sentences_from_excel(file)

    # The UI sliders can emit boundary values (temperature 0, top_p 1,
    # max_new_tokens 0). NOTE(review): text-generation-inference is understood
    # to reject temperature <= 0, top_p outside (0, 1) and max_new_tokens <= 0,
    # which would make every request fail - confirm against the deployed
    # backend. Values are normalised once, outside the loop.
    generate_kwargs = {
        "temperature": max(float(temperature), 1e-2),
        "max_new_tokens": max(int(max_new_tokens), 1),
        "repetition_penalty": float(repetition_penalty),
        "do_sample": True,
        "seed": 42,
    }
    top_p = float(top_p)
    if top_p < 1.0:
        generate_kwargs["top_p"] = max(top_p, 1e-2)
    # top_p >= 1.0 means "no nucleus truncation", so the key is left out.

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with tempfile.NamedTemporaryFile(mode='w', newline='', delete=False,
                                     suffix='.csv', encoding='utf-8') as tmp:
        fieldnames = ['Original Sentence', 'Generated Sentence']
        writer = csv.DictWriter(tmp, fieldnames=fieldnames)
        writer.writeheader()

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            try:
                stream = client.text_generation(
                    f"{prompt} {sentence}",
                    **generate_kwargs,
                    stream=True,
                    details=True,
                    return_full_text=False,
                )
                # Skip tokens flagged as special (e.g. end-of-sequence) so
                # their markers do not end up in the CSV.
                output = "".join(
                    response.token.text
                    for response in stream
                    if not getattr(response.token, "special", False)
                )
            except Exception as e:
                # Best effort: one failed request must not abort the batch.
                print(f"Error generating data for sentence '{sentence}': {e}")
                continue

            # Writing happens outside the try block so that a file error is
            # not misreported as a generation error.
            for piece in _SENTENCE_BOUNDARY.split(output):
                generated_sentence = piece.strip()
                if generated_sentence and generated_sentence != '.':
                    writer.writerow({'Original Sentence': sentence,
                                     'Generated Sentence': generated_sentence})

        tmp_path = tmp.name

    return tmp_path

# Input widgets, declared in the order of `generate`'s parameters:
# file, prompt, temperature, max_new_tokens, top_p, repetition_penalty.
excel_upload = gr.File(
    label="Upload Excel File",
    file_count="single",
    file_types=[".xlsx"],
)
prompt_box = gr.Textbox(label="Prompt", placeholder="Enter your prompt here")
temperature_slider = gr.Slider(
    label="Temperature",
    value=0.9,
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    interactive=True,
    info="Higher values produce more diverse outputs",
)
max_tokens_slider = gr.Slider(
    label="Max new tokens",
    value=256,
    minimum=0,
    maximum=5120,
    step=64,
    interactive=True,
    info="The maximum numbers of new tokens",
)
top_p_slider = gr.Slider(
    label="Top-p (nucleus sampling)",
    value=0.95,
    minimum=0.0,
    maximum=1,
    step=0.05,
    interactive=True,
    info="Higher values sample more low-probability tokens",
)
repetition_slider = gr.Slider(
    label="Repetition penalty",
    value=1.0,
    minimum=1.0,
    maximum=2.0,
    step=0.1,
    interactive=True,
    info="Penalize repeated tokens",
)

# Output widget: offers the CSV path returned by `generate` as a download.
csv_download = gr.File(label="Synthetic Data ")

demo = gr.Interface(
    fn=generate,
    inputs=[
        excel_upload,
        prompt_box,
        temperature_slider,
        max_tokens_slider,
        top_p_slider,
        repetition_slider,
    ],
    outputs=csv_download,
    title="SDG",
    description="AYE QABIL.",
    allow_flagging="never",
)

# Started unconditionally at import time, exactly as the original chained call.
demo.launch()