BookTODataset / app.py
imseldrith's picture
Update app.py
5700651
raw
history blame
2.37 kB
import gradio as gr
import urllib.request
import PyPDF2
import re
import pandas as pd
from tqdm import tqdm
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: Value passed unchanged to the PyPDF2 reader constructor.

    Returns:
        str: The extracted page texts joined with no separator.
    """
    # PyPDF2 3.0 removed PdfFileReader / numPages / getPage / extractText.
    # Use the replacement API when the installed release provides it and
    # fall back to the legacy names on older releases.
    if hasattr(PyPDF2, "PdfReader"):
        reader = PyPDF2.PdfReader(pdf_file)
        # Substitute "" for a page that yields nothing so join() never
        # receives a non-string value.
        return "".join(page.extract_text() or "" for page in reader.pages)

    reader = PyPDF2.PdfFileReader(pdf_file)
    # One join instead of repeated "+=" on a growing string.
    return "".join(
        reader.getPage(index).extractText() for index in range(reader.numPages)
    )
def extract_text_from_txt(txt_file):
    """Read a text file and return its whole contents as one string.

    The file is always decoded as UTF-8 (a leading byte-order mark is
    dropped) instead of the platform's locale encoding, so the result does
    not depend on the machine the app runs on. Bytes that are not valid
    UTF-8 are replaced with U+FFFD rather than aborting the conversion.

    Args:
        txt_file: Path of the file to open.

    Returns:
        str: The decoded file contents.
    """
    with open(txt_file, "r", encoding="utf-8-sig", errors="replace") as file:
        return file.read()
def book_to_dataset(file, file_type):
    """Build a word-frequency table from a PDF or plain-text book.

    Args:
        file: Passed to ``extract_text_from_pdf`` or
            ``extract_text_from_txt`` depending on ``file_type``.
        file_type: ``"pdf"`` or ``"txt"``; matched case-insensitively so an
            upper-case extension such as ``"PDF"`` is accepted.

    Returns:
        pandas.DataFrame: Columns ``"Word"`` and ``"Frequency"``, one row per
        distinct ``\\w+`` match, in order of first appearance in the text.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
    """
    from collections import Counter

    # Non-string values fall through to the ValueError below, as before.
    kind = file_type.lower() if isinstance(file_type, str) else file_type
    if kind == "pdf":
        text = extract_text_from_pdf(file)
    elif kind == "txt":
        text = extract_text_from_txt(file)
    else:
        raise ValueError("Invalid file type")

    # Counter keeps keys in first-seen order, so the row order matches what
    # the manual dict-based tally produced.
    frequencies = Counter(re.findall(r'\w+', text))
    return pd.DataFrame(list(frequencies.items()), columns=["Word", "Frequency"])
def book_to_dataset_progress(file, file_type):
    """Build a word-frequency table from a book while showing a progress bar.

    Same result as a plain word count; the counting pass is wrapped in a
    ``tqdm`` bar labelled ``"Converting..."``.

    Args:
        file: Passed to ``extract_text_from_pdf`` or
            ``extract_text_from_txt`` depending on ``file_type``.
        file_type: ``"pdf"`` or ``"txt"``; matched case-insensitively so an
            upper-case extension such as ``"TXT"`` is accepted.

    Returns:
        pandas.DataFrame: Columns ``"Word"`` and ``"Frequency"``, one row per
        distinct ``\\w+`` match, in order of first appearance in the text.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
    """
    from collections import Counter

    # Non-string values fall through to the ValueError below, as before.
    kind = file_type.lower() if isinstance(file_type, str) else file_type
    if kind == "pdf":
        text = extract_text_from_pdf(file)
    elif kind == "txt":
        text = extract_text_from_txt(file)
    else:
        raise ValueError("Invalid file type")

    words = re.findall(r'\w+', text)
    # Counter consumes the tqdm-wrapped iterable, so the bar advances once
    # per word while the tally is built; key order is first-seen order.
    frequencies = Counter(tqdm(words, desc="Converting..."))
    return pd.DataFrame(list(frequencies.items()), columns=["Word", "Frequency"])
def book_converter(inputs):
    """Convert a book, given as a URL or a local file, to a word-frequency table.

    Args:
        inputs: Indexable sequence of three items. ``inputs[1]`` selects the
            mode. When it equals ``"URL"``, ``inputs[0]`` is the address to
            download. For any other value, ``inputs[0]`` is the file handed
            to the reader and ``inputs[2]`` is a string whose text after the
            last ``"."`` gives the file type.

    Returns:
        The value returned by ``book_to_dataset_progress``.

    Raises:
        ValueError: If the URL scheme is not http, https or ftp, if the URL
            path does not end in a file name, or (from
            ``book_to_dataset_progress``) if the file type is unsupported.
    """
    import os
    import tempfile
    from urllib.parse import unquote, urlsplit

    if inputs[1] == "URL":
        url = inputs[0]
        parts = urlsplit(url)
        # urlretrieve also accepts file: and data: URLs; the address comes
        # from the UI, so only allow remote schemes.
        if parts.scheme not in ("http", "https", "ftp"):
            raise ValueError("Invalid URL scheme")

        # Take the name from the path component only, so a "?query" or
        # "#fragment" suffix cannot end up inside the extension.
        file_name = os.path.basename(unquote(parts.path))
        if file_name in ("", ".", ".."):
            raise ValueError("URL does not name a file")
        file_type = file_name.split(".")[-1].lower()

        # Download into a throwaway directory: a remote-chosen name can no
        # longer overwrite files in the working directory, and the download
        # is removed once the table has been built.
        with tempfile.TemporaryDirectory() as download_dir:
            file = os.path.join(download_dir, file_name)
            urllib.request.urlretrieve(url, file)
            return book_to_dataset_progress(file, file_type)

    # NOTE(review): the path is read from inputs[0] but the type from
    # inputs[2]; kept as-is because the caller's contract is not visible
    # here -- confirm that both refer to the same upload.
    file = inputs[0]
    file_type = inputs[2].split(".")[-1].lower()
    return book_to_dataset_progress(file, file_type)
def _convert_from_ui(source, mode, upload):
    """Adapt Gradio's per-component arguments to ``book_converter``'s sequence.

    Gradio calls the wrapped function with one positional argument per input
    component, while ``book_converter`` expects a single indexable value.

    Args:
        source: Text typed into the textbox (the URL in ``"URL"`` mode).
        mode: Selected radio choice, ``"URL"`` or ``"File"``.
        upload: Value of the file component, or ``None`` when nothing was
            uploaded.

    Returns:
        The value returned by ``book_converter``.

    Raises:
        ValueError: If ``"File"`` mode is selected without an upload.
    """
    if mode == "URL":
        return book_converter([source, mode, None])
    if upload is None:
        raise ValueError("No file uploaded")
    # NOTE(review): depending on the Gradio release the upload arrives as a
    # path string or as a temp-file object exposing .name -- handle both.
    path = getattr(upload, "name", upload)
    # book_converter reads the file from item 0 and its extension from item 2.
    return book_converter([path, mode, path])


# NOTE(review): gr.inputs provides no Column container or FileUploader
# component, and Textbox takes no element_type argument; a plain list of
# components plus gr.inputs.File is used instead -- confirm against the
# installed Gradio release.
inputs = [
    gr.inputs.Textbox(lines=1, default="Enter URL or choose file", label="Input"),
    gr.inputs.Radio(["URL", "File"], default="URL"),
    # optional=True so "URL" mode can be submitted without an upload.
    gr.inputs.File(label="Choose file", optional=True),
]
interface = gr.Interface(
    _convert_from_ui,
    inputs,
    gr.outputs.Dataframe(),
    title="Book to Dataset Converter",
    description="Convert a book in pdf or txt format to a dataset that can be used to train AI models."
)
interface.launch()