Spaces:
Build error
Build error
import re
import urllib.parse
import urllib.request
from collections import Counter

import gradio as gr
import pandas as pd
import PyPDF2
from tqdm import tqdm
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_file: Passed unchanged to the PyPDF2 reader constructor.

    Returns:
        The extracted text of all pages, joined in page order with no
        separator. Pages that yield no text contribute an empty string.
    """
    # PyPDF2 3.0 removed PdfFileReader / numPages / getPage / extractText
    # (they now raise). Prefer the current API and fall back to the legacy
    # names only on installs that predate PdfReader.
    if hasattr(PyPDF2, "PdfReader"):
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in reader.pages)
    else:
        legacy_reader = PyPDF2.PdfFileReader(pdf_file)
        page_texts = (
            legacy_reader.getPage(index).extractText()
            for index in range(legacy_reader.numPages)
        )
    # join() avoids repeated string reallocation on large books.
    return "".join(page_text or "" for page_text in page_texts)
def extract_text_from_txt(txt_file):
    """Return the full contents of a text file.

    Args:
        txt_file: Path of the file to read; passed directly to ``open``.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Pin the encoding: without it the decoding depends on the host locale,
    # so the same book could produce different words on different machines.
    with open(txt_file, "r", encoding="utf-8") as file:
        return file.read()
def book_to_dataset(file, file_type):
    """Build a word-frequency table from a book file.

    Args:
        file: The book to read; handed to the extractor matching
            ``file_type``.
        file_type: ``"pdf"`` or ``"txt"``.

    Returns:
        A DataFrame with columns ``"Word"`` and ``"Frequency"``, one row per
        distinct ``\\w+`` match in the text, in order of first appearance.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
    """
    if file_type == "pdf":
        text = extract_text_from_pdf(file)
    elif file_type == "txt":
        text = extract_text_from_txt(file)
    else:
        raise ValueError("Invalid file type")

    # Counter is insertion-ordered, so rows keep first-appearance order
    # exactly as a manual dict tally would.
    word_counts = Counter(re.findall(r"\w+", text))
    return pd.DataFrame(list(word_counts.items()), columns=["Word", "Frequency"])
def book_to_dataset_progress(file, file_type):
    """Build a word-frequency table from a book file, showing progress.

    Same result as a plain word count, but the words are fed through a
    ``tqdm`` bar labelled ``"Converting..."`` while they are tallied.

    Args:
        file: The book to read; handed to the extractor matching
            ``file_type``.
        file_type: ``"pdf"`` or ``"txt"``.

    Returns:
        A DataFrame with columns ``"Word"`` and ``"Frequency"``, one row per
        distinct ``\\w+`` match in the text, in order of first appearance.

    Raises:
        ValueError: If ``file_type`` is neither ``"pdf"`` nor ``"txt"``.
    """
    if file_type == "pdf":
        text = extract_text_from_pdf(file)
    elif file_type == "txt":
        text = extract_text_from_txt(file)
    else:
        raise ValueError("Invalid file type")

    words = re.findall(r"\w+", text)
    # Counter consumes the tqdm iterator, so the bar still advances once per
    # word while the tally itself is done by the standard library.
    word_counts = Counter(tqdm(words, desc="Converting..."))
    return pd.DataFrame(list(word_counts.items()), columns=["Word", "Frequency"])
def book_converter(inputs):
    """Convert a book, given by URL or as a local file, to a word table.

    Args:
        inputs: Indexable with three items. ``inputs[1]`` selects the mode:
            ``"URL"`` downloads ``inputs[0]``; any other value takes the
            file branch, which reads ``inputs[0]`` as the file and takes the
            extension from the string ``inputs[2]``.

    Returns:
        Whatever ``book_to_dataset_progress`` returns for the resolved file
        and file type.

    Raises:
        ValueError: If the URL path does not end in a file name.
    """
    if inputs[1] == "URL":
        url = inputs[0]
        # Take the name from the URL *path* only; splitting the raw URL let a
        # "?query" or "#fragment" leak into the extension.
        file_name = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]
        if not file_name:
            # e.g. "https://host/books/" - nothing to save the download as.
            raise ValueError("URL does not name a file")
        # NOTE(review): the URL is user-supplied and urlretrieve accepts every
        # scheme urllib supports; restrict schemes here if this is exposed
        # publicly. The download is written into the current directory.
        urllib.request.urlretrieve(url, file_name)
        file = file_name
        file_type = file_name.rsplit(".", 1)[-1].lower()
    else:
        # NOTE(review): the path comes from inputs[0] but the extension from
        # inputs[2] - confirm against the caller that this pairing is intended.
        file = inputs[0]
        file_type = inputs[2].rsplit(".", 1)[-1].lower()
    return book_to_dataset_progress(file, file_type)
# Gradio UI wiring.
#
# The input components are a plain list: Gradio has no ``gr.inputs.Column`` or
# ``gr.inputs.FileUploader`` and ``Textbox`` takes no ``element_type``, so the
# top-level component classes are used instead.
inputs = [
    gr.Textbox(lines=1, placeholder="Enter URL or choose file", label="Input"),
    gr.Radio(["URL", "File"], value="URL", label="Source"),
    # NOTE(review): what gr.File hands to the callback (path string vs.
    # temp-file object) depends on the installed Gradio version - confirm.
    gr.File(label="Choose file"),
]


def _run_book_converter(source, mode, upload):
    """Adapt Gradio's one-argument-per-input call to ``book_converter``.

    Gradio calls the function with one positional argument per input
    component, while ``book_converter`` indexes a single sequence.
    """
    return book_converter([source, mode, upload])


interface = gr.Interface(
    fn=_run_book_converter,
    inputs=inputs,
    outputs=gr.Dataframe(),
    title="Book to Dataset Converter",
    description="Convert a book in pdf or txt format to a dataset that can be used to train AI models.",
)
interface.launch()