# File size: 3,413 Bytes
# Commit hashes (blame gutter of the web file viewer):
#   cd650c7 a26857e adba430 438b552 fa4d0d9 daa6f24 34421df adba430 923f75f
#   adba430 fa4d0d9 34421df fa4d0d9 14ffea8 fa4d0d9 adba430 3c1274a fa4d0d9
#   1fd65af 3c1274a fa4d0d9 3c1274a fa4d0d9 43561b8 14ffea8 5b06a47 34421df
#   59ef8d0 cd650c7 1fd65af 5b06a47 adba430 fa4d0d9 adba430
# Line-number gutter of the web file viewer: 1-86
from huggingface_hub import InferenceClient
import gradio as gr
import PyPDF2
import random
import pandas as pd
from io import BytesIO
import csv
import os
import io
import tempfile
# Module-level inference client bound to the Mixtral-8x7B-Instruct model.
# `generate` reuses this single instance for every text-generation request.
# NOTE(review): no token is passed here, so credentials presumably come from
# the environment — confirm for the deployment.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF concatenated into one string.

    Args:
        file: Passed unchanged to ``PyPDF2.PdfReader``; anything that reader
            accepts works here.

    Returns:
        The page texts joined in page order with no separator. A page for
        which ``extract_text()`` yields nothing contributes an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # Iterate the pages directly and join once instead of index-looping with
    # repeated ``+=``. ``or ""`` keeps a page without extractable text from
    # breaking the join.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
def save_to_csv(sentence, output, filename="synthetic_data.csv"):
    """Append a single ``[sentence, output]`` row to a CSV file.

    Args:
        sentence: Value written to the first column.
        output: Value written to the second column.
        filename: Path of the CSV file; it is opened in append mode, so the
            file is created when it does not exist yet.
    """
    row = [sentence, output]
    # newline="" lets the csv module control line endings itself.
    with open(filename, "a", encoding="utf-8", newline="") as handle:
        csv.writer(handle).writerow(row)
def generate(file, temperature, max_new_tokens, top_p, repetition_penalty):
    """Generate synthetic text for each sentence of a PDF and return a CSV path.

    The PDF text is split on ``'.'``, blank fragments are dropped and the
    remaining sentences are shuffled. Each sentence is sent to the module-level
    ``client`` as a prompt, and the streamed tokens are joined into one output
    string. Every (sentence, output) pair is appended to a CSV file created
    for this call only, so rows from earlier or concurrent calls never end up
    in the returned file.

    Args:
        file: The uploaded PDF, forwarded to ``extract_text_from_pdf``.
        temperature: Sampling temperature taken from the UI slider.
        max_new_tokens: Upper bound on generated tokens per sentence.
        top_p: Nucleus-sampling threshold taken from the UI slider.
        repetition_penalty: Repetition penalty forwarded to the backend.

    Returns:
        Path of a temporary ``.csv`` file with the header row
        ``Original Sentence, Synthetic Data`` followed by one row per
        sentence. The file is not deleted automatically.
    """
    text = extract_text_from_pdf(file)
    sentences = [fragment.strip() for fragment in text.split('.')]
    sentences = [sentence for sentence in sentences if sentence]
    random.shuffle(sentences)  # Process the sentences in random order

    # NOTE(review): the sliders allow temperature 0, max_new_tokens 0 and
    # top_p 1; text-generation backends commonly reject those values, which
    # would turn every row into an error — confirm against the deployed
    # backend. The values are nudged into range here.
    temperature = max(float(temperature), 1e-2)
    max_new_tokens = max(int(max_new_tokens), 1)
    top_p = float(top_p)
    if top_p >= 1.0:
        top_p = None  # top_p of 1 means "no nucleus truncation"

    # Identical for every sentence, so build it once outside the loop.
    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    # Create a per-request CSV file and write the header row to it.
    with tempfile.NamedTemporaryFile(
        mode="w", newline="", encoding="utf-8", suffix=".csv", delete=False
    ) as tmp:
        csv.writer(tmp).writerow(["Original Sentence", "Synthetic Data"])
        tmp_path = tmp.name

    for sentence in sentences:
        try:
            stream = client.text_generation(
                sentence,
                **generate_kwargs,
                stream=True,
                details=True,
                return_full_text=False,
            )
            output = "".join(response.token.text for response in stream)
            save_to_csv(sentence, output, filename=tmp_path)
        except Exception as e:
            # Best effort: one failing sentence must not abort the whole run,
            # so the error is recorded in place of the generated text.
            print(f"Error generating data for sentence '{sentence}': {e}")
            save_to_csv(sentence, f"Error: {e}", filename=tmp_path)

    return tmp_path
# Build and start the web UI. The PDF upload and the four sliders are passed
# positionally to `generate`, so their order must match its parameter order;
# the CSV path that `generate` returns is offered as a file download.
gr.Interface(
    fn=generate,
    inputs=[
        gr.File(label="Upload PDF File", file_count="single", file_types=[".pdf"]),
        gr.Slider(label="Temperature", value=0.9, minimum=0.0, maximum=1.0, step=0.05, interactive=True, info="Higher values produce more diverse outputs"),
        gr.Slider(label="Max new tokens", value=256, minimum=0, maximum=5120, step=64, interactive=True, info="The maximum numbers of new tokens"),
        gr.Slider(label="Top-p (nucleus sampling)", value=0.95, minimum=0.0, maximum=1, step=0.05, interactive=True, info="Higher values sample more low-probability tokens"),
        gr.Slider(label="Repetition penalty", value=1.0, minimum=1.0, maximum=2.0, step=0.1, interactive=True, info="Penalize repeated tokens"),
    ],
    outputs=gr.File(label="Synthetic Data CSV"),
    title="Synthetic Data Generation",
    description="This tool generates synthetic data from the sentences in your PDF and saves it to a CSV file.",
    allow_flagging="never",
).launch()