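# Synthetic data generator: extract the sentences from an uploaded PDF, generate
# a synthetic counterpart for each one with a hosted LLM, and return the results
# as a downloadable CSV through a Gradio interface.
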
from huggingface_hub import InferenceClient
import gradio as gr
import PyPDF2
import random
import csv
import os
import tempfile

# Initialize the inference client with your chosen model
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

def extract_text_from_pdf(file):
    pdf_reader = PyPDF2.PdfReader(file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() may return None for pages with no extractable text
        text += page.extract_text() or ""
    return text

def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow([sentence, output])


def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
    text = extract_text_from_pdf(file)
    sentences = text.split('.')
    random.shuffle(sentences)  # Randomize order so the CSV rows do not follow the document order

    # Write the CSV header once, if the file does not exist yet
    if not os.path.exists("synthetic_data.csv"):
        with open("synthetic_data.csv", mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Original Sentence", "Synthetic Data"])

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        generate_kwargs = {
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
            "top_p": top_p,
            "repetition_penalty": repetition_penalty,
            "do_sample": True,
            "seed": 42,
        }
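        # Stream the completion for this sentence and append the (sentence, output)
        # pair to the CSV; errors are recorded in the CSV as well so no row is lost.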

        try:
            stream = client.text_generation(sentence, **generate_kwargs, stream=True, details=True, return_full_text=False)
            output = ""
            for response in stream:
                output += response.token.text
            save_to_csv(sentence, output)
        except Exception as e:
            print(f"Error generating data for sentence '{sentence}': {e}")
            save_to_csv(sentence, f"Error: {e}")

    # Read the CSV back and encode it as bytes
    with open("synthetic_data.csv", "r", encoding="utf-8") as file:
        csv_content = file.read()
    csv_bytes = csv_content.encode()

    # Write the content to a temporary file so Gradio can serve it as a download
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp.write(csv_bytes)
        tmp_path = tmp.name

    return tmp_path
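
# Wire the generator into a Gradio UI: a PDF upload plus sliders for the
# sampling parameters, with the resulting CSV offered as a file download.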
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
    ],
    outputs=gr.File(label="Synthetic Data CSV"),
    title="Synthetic Data Generation",
    description="This tool generates synthetic data from the sentences in your PDF and saves it to a CSV file.",
    allow_flagging="never",
).launch()
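
# Running this file directly (e.g. `python app.py`, if that is how it is saved)
# starts the Gradio server and prints a local URL to open in the browser.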