File size: 3,179 Bytes
838b223
cd650c7
9cbb806
418de0b
 
838b223
d435c8a
97425d1
838b223
18cb91d
 
09b14bf
aebf89a
adba430
923f75f
664305c
8115786
f5a3917
664305c
adba430
664305c
 
838b223
fa4d0d9
f5a3917
 
 
 
 
 
 
 
d53066f
f5a3917
 
5ff454a
f5a3917
 
 
 
 
 
cf4b1fe
f5a3917
 
7b026a2
f5a3917
fa4d0d9
f5a3917
 
 
5ff454a
 
 
 
 
f5a3917
 
 
 
 
34421df
f5a3917
 
9cbb806
f5a3917
838b223
418de0b
1fd65af
418de0b
664305c
1fd65af
 
 
 
b3b73c4
f5a3917
418de0b
f5a3917
adba430
09b14bf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import json
from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
from io import BytesIO
import csv
import os
import io
import tempfile
import re


# Module-level Hugging Face Inference API client for the Mixtral-8x7B
# instruct model; shared by every call in generate().
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

def extract_sentences_from_excel(file, column="metn"):
    """Read an Excel workbook and return one column's values as strings.

    Parameters
    ----------
    file : str | file-like
        Path or buffer accepted by ``pandas.read_excel``.
    column : str, optional
        Name of the column holding the sentences.  Defaults to ``"metn"``
        ("text" in Turkish), matching the expected upload format.

    Returns
    -------
    list[str]
        Every cell of *column* coerced to ``str`` (NaN cells become the
        string ``"nan"``, as with the original ``astype(str)`` behavior).

    Raises
    ------
    KeyError
        If *column* is not present in the spreadsheet — raised explicitly
        with a readable message instead of a bare pandas indexing error.
    """
    df = pd.read_excel(file)
    if column not in df.columns:
        raise KeyError(
            f"Column '{column}' not found in the uploaded Excel file"
        )
    return df[column].astype(str).tolist()

def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
    """Produce synthetic data for every sentence in the uploaded Excel file.

    For each sentence, streams a completion from the shared Mixtral client
    using the given sampling parameters, accumulates the raw model text,
    and finally writes all parsed results to ``synthetic_data.json``.

    Returns the output filename so Gradio can offer it as a download.
    Sentences that fail generation are logged and skipped (best effort).
    """
    results = []
    sampling_options = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,  # fixed seed keeps runs reproducible
    }

    for source_sentence in extract_sentences_from_excel(file):
        full_prompt = f"{prompt} Output the response in the following JSON format: {{'generated_sentence': 'The generated sentence text', 'confidence_score': 0.9}} {source_sentence}"
        try:
            token_stream = client.text_generation(
                full_prompt,
                **sampling_options,
                stream=True,
                details=True,
                return_full_text=False,
            )
            generated_text = "".join(chunk.token.text for chunk in token_stream)
            results.append(
                {"original_sentence": source_sentence, "generated_data": generated_text}
            )
        except Exception as exc:
            # Best-effort: report the failure and keep processing the rest.
            print(f"Error generating data for sentence '{source_sentence}': {exc}")

    output_filename = "synthetic_data.json"
    save_to_json(results, output_filename)

    return output_filename

def save_to_json(data, filename):
    """Parse raw model outputs and persist them as a UTF-8 JSON file.

    Each item in *data* must provide ``original_sentence`` and
    ``generated_data`` keys.  Every occurrence of the pseudo-JSON pattern
    ``{'generated_sentence': '...', 'confidence_score': N}`` inside
    ``generated_data`` is extracted with a regex; the sentences and their
    (float) scores are collected into parallel lists per item and the
    whole structure is dumped to *filename* with 4-space indentation.
    """
    # Compile once; the single-quoted pattern mirrors the format the
    # prompt asks the model to emit (not strict JSON).
    pattern = re.compile(
        r"{'generated_sentence': '(.+?)', 'confidence_score': ([\d\.]+)}"
    )

    records = [
        {
            'original_sentence': entry['original_sentence'],
            'generated_sentences': [s for s, _ in pairs],
            'confidence_scores': [float(c) for _, c in pairs],
        }
        for entry in data
        for pairs in (pattern.findall(entry['generated_data']),)
    ]

    with open(filename, mode='w', encoding='utf-8') as fh:
        json.dump(records, fh, indent=4, ensure_ascii=False)

# Gradio interface: wires generate() to a simple web UI and launches it.
gr.Interface(
    fn=generate,
    inputs=[
        # Upload must contain a 'metn' column (see extract_sentences_from_excel).
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here"),
        # Sliders map 1:1 onto generate()'s sampling parameters.
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
    ],
    # generate() returns a filename; Gradio serves it as a download.
    outputs=gr.File(label="Synthetic Data"),
    title="SDG",
    description=" *AYE* QABIL.",
    allow_flagging="never",
).launch()