File size: 3,179 Bytes
838b223 cd650c7 9cbb806 418de0b 838b223 d435c8a 97425d1 838b223 18cb91d 09b14bf aebf89a adba430 923f75f 664305c 8115786 f5a3917 664305c adba430 664305c 838b223 fa4d0d9 f5a3917 d53066f f5a3917 5ff454a f5a3917 cf4b1fe f5a3917 7b026a2 f5a3917 fa4d0d9 f5a3917 5ff454a f5a3917 34421df f5a3917 9cbb806 f5a3917 838b223 418de0b 1fd65af 418de0b 664305c 1fd65af b3b73c4 f5a3917 418de0b f5a3917 adba430 09b14bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import json
from huggingface_hub import InferenceClient
import gradio as gr
import random
import pandas as pd
from io import BytesIO
import csv
import os
import io
import tempfile
import re
# Hosted inference endpoint for the Mixtral-8x7B instruct model on Hugging Face.
# NOTE(review): no auth token is passed here — presumably relies on the
# environment / anonymous access; confirm rate limits for production use.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def extract_sentences_from_excel(file, column="metn"):
    """Read an Excel workbook and return one column's values as strings.

    Parameters
    ----------
    file : str | path-like | file-like
        Anything accepted by ``pandas.read_excel`` (the Gradio ``File``
        component passes a temp-file object/path here).
    column : str, optional
        Name of the column holding the sentences. Defaults to ``"metn"``
        ("text" in Turkish), preserving the original hard-coded behavior.

    Returns
    -------
    list[str]
        Column values coerced to ``str`` (NaN cells become the string "nan",
        matching the original ``astype(str)`` behavior).

    Raises
    ------
    KeyError
        If *column* is not present in the spreadsheet.
    """
    df = pd.read_excel(file)
    return df[column].astype(str).tolist()
def generate(file, prompt, temperature, max_new_tokens, top_p, repetition_penalty):
    """Run the model over every sentence in the uploaded Excel file.

    For each sentence, streams a completion from the shared ``client`` using
    *prompt* plus a pseudo-JSON output instruction, collects the streamed
    tokens into one string, and finally writes all results to
    ``synthetic_data.json`` via ``save_to_json``.

    Returns the output filename so Gradio can offer it for download.
    Sentences whose generation fails are logged to stdout and skipped.
    """
    sentences = extract_sentences_from_excel(file)
    records = []
    # Sampling configuration shared by every request; fixed seed for
    # reproducibility across runs.
    sampling_params = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )
    for sentence in sentences:
        full_prompt = (
            f"{prompt} Output the response in the following JSON format: "
            f"{{'generated_sentence': 'The generated sentence text', 'confidence_score': 0.9}} {sentence}"
        )
        try:
            stream = client.text_generation(
                full_prompt,
                **sampling_params,
                stream=True,
                details=True,
                return_full_text=False,
            )
            # Concatenate the streamed token texts into the raw model output.
            generated = "".join(chunk.token.text for chunk in stream)
            records.append({"original_sentence": sentence, "generated_data": generated})
        except Exception as e:
            # Best-effort: report and continue with the remaining sentences.
            print(f"Error generating data for sentence '{sentence}': {e}")
    output_path = "synthetic_data.json"
    save_to_json(records, output_path)
    return output_path
def save_to_json(data, filename):
    """Parse each item's raw model output and write structured JSON to disk.

    Parameters
    ----------
    data : list[dict]
        Items with keys ``'original_sentence'`` (str) and ``'generated_data'``
        (the raw streamed model text, expected to contain zero or more
        ``{'generated_sentence': '...', 'confidence_score': 0.9}`` fragments).
    filename : str
        Path of the JSON file to (over)write.

    Output schema per item: ``original_sentence`` (str),
    ``generated_sentences`` (list[str]), ``confidence_scores`` (list[float]).
    Items whose raw text contains no matching fragment get empty lists.
    """
    # The model is prompted to emit single-quoted pseudo-JSON, which
    # json.loads cannot parse, so the pairs are extracted with a regex.
    # Compiled once here instead of re-compiling inside the loop.
    pattern = re.compile(
        r"{'generated_sentence': '(.+?)', 'confidence_score': ([\d\.]+)}"
    )
    json_data = []
    for item in data:
        generated_sentences = []
        confidence_scores = []
        for match in pattern.finditer(item['generated_data']):
            generated_sentences.append(match.group(1))
            confidence_scores.append(float(match.group(2)))
        json_data.append({
            'original_sentence': item['original_sentence'],
            'generated_sentences': generated_sentences,
            'confidence_scores': confidence_scores
        })
    # ensure_ascii=False keeps non-ASCII (e.g. Turkish) characters readable.
    with open(filename, mode='w', encoding='utf-8') as file:
        json.dump(json_data, file, indent=4, ensure_ascii=False)
# Gradio UI: wires the `generate` function to an uploaded .xlsx file, a free-text
# prompt, and four sampling sliders; returns the generated JSON file for download.
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload Excel File", file_count="single", file_types=[".xlsx"]),
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here"),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
    ],
    # `generate` returns a filename, which gr.File serves as a downloadable artifact.
    outputs=gr.File(label="Synthetic Data"),
    title="SDG",
    description=" *AYE* QABIL.",
    allow_flagging="never",
).launch()