Spaces:
Build error
Build error
File size: 2,369 Bytes
5700651 c529144 1f46a45 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 c529144 5700651 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import re
import urllib.parse
import urllib.request
from collections import Counter

import gradio as gr
import pandas as pd
import PyPDF2
from tqdm import tqdm
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF to read. It is handed straight to the PyPDF2
            reader, so it must be whatever that reader accepts.

    Returns:
        A single string made of each page's extracted text, with no
        separator inserted between pages.
    """
    # PyPDF2 3.0 removed PdfFileReader / numPages / getPage / extractText.
    # Prefer the current API and keep the legacy calls only for releases
    # that predate PdfReader.
    if hasattr(PyPDF2, "PdfReader"):
        pages = PyPDF2.PdfReader(pdf_file).pages
        # "or ''" keeps join() safe should a page yield no text object.
        return "".join(page.extract_text() or "" for page in pages)

    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        pdf_reader.getPage(index).extractText()
        for index in range(pdf_reader.numPages)
    )
def extract_text_from_txt(txt_file):
    """Return the full contents of a text file as one string.

    Args:
        txt_file: Path of the file to read.

    Returns:
        The decoded text of the whole file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Pin the encoding: without it open() falls back to the locale's
    # default, so the same book could decode differently per machine.
    with open(txt_file, "r", encoding="utf-8") as handle:
        return handle.read()
def _load_book_text(file, file_type):
    """Return the raw text of *file*, picking the extractor by *file_type*."""
    if file_type == "pdf":
        return extract_text_from_pdf(file)
    if file_type == "txt":
        return extract_text_from_txt(file)
    raise ValueError("Invalid file type")


def _frequency_frame(words):
    """Count *words* and return the counts as a Word/Frequency DataFrame."""
    # Counter remembers first-seen order, so rows follow the order in which
    # each distinct word first appears in the iterable.
    counts = Counter(words)
    return pd.DataFrame(list(counts.items()), columns=["Word", "Frequency"])


def book_to_dataset(file, file_type):
    """Build a word-frequency table from a book.

    Words are the matches of the regex ``\\w+`` and are counted exactly as
    they appear, so differently-cased spellings are separate rows.

    Args:
        file: The book to read, passed to the extractor for *file_type*.
        file_type: ``"pdf"`` or ``"txt"``.

    Returns:
        A DataFrame with columns ``"Word"`` and ``"Frequency"``, one row per
        distinct word.

    Raises:
        ValueError: If *file_type* is neither ``"pdf"`` nor ``"txt"``.
    """
    text = _load_book_text(file, file_type)
    return _frequency_frame(re.findall(r"\w+", text))


def book_to_dataset_progress(file, file_type):
    """Build a word-frequency table from a book, showing a progress bar.

    Same result as ``book_to_dataset``; the only difference is that the
    counting pass is wrapped in a ``tqdm`` bar labelled "Converting...".

    Args:
        file: The book to read, passed to the extractor for *file_type*.
        file_type: ``"pdf"`` or ``"txt"``.

    Returns:
        A DataFrame with columns ``"Word"`` and ``"Frequency"``, one row per
        distinct word.

    Raises:
        ValueError: If *file_type* is neither ``"pdf"`` nor ``"txt"``.
    """
    text = _load_book_text(file, file_type)
    words = re.findall(r"\w+", text)
    return _frequency_frame(tqdm(words, desc="Converting..."))
def book_converter(inputs):
    """Turn a book given by URL or as a file into a word-frequency table.

    Args:
        inputs: Indexable holding three values:

            * ``inputs[0]`` -- the URL to download, or the file to read.
            * ``inputs[1]`` -- ``"URL"`` to download ``inputs[0]``; any other
              value selects the file branch.
            * ``inputs[2]`` -- file name whose extension picks the parser in
              the file branch (not read in the URL branch).

    Returns:
        Whatever ``book_to_dataset_progress`` returns for the resolved file.

    Raises:
        ValueError: If the URL's path does not end in a file name. Errors
            raised by the download or by ``book_to_dataset_progress``
            propagate unchanged.
    """
    if inputs[1] == "URL":
        url = inputs[0]
        # Derive the local name from the URL *path* only, so a query string
        # or fragment ("book.txt?download=1") cannot leak into the extension.
        file_name = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]
        if not file_name:
            raise ValueError(f"URL does not name a file: {url!r}")
        # NOTE(review): the URL is fetched exactly as supplied and saved in
        # the working directory; urlretrieve is not limited to http(s), so
        # restrict the scheme if this is exposed to untrusted users.
        urllib.request.urlretrieve(url, file_name)
        file = file_name
        extension_source = file_name
    else:
        # NOTE(review): content comes from inputs[0] while the extension
        # comes from inputs[2] -- confirm callers pass matching values.
        file = inputs[0]
        extension_source = inputs[2]
    # Lower-case so "BOOK.PDF" is treated the same as "book.pdf".
    file_type = extension_source.rsplit(".", 1)[-1].lower()
    return book_to_dataset_progress(file, file_type)
def _convert_from_components(source, mode, upload):
    """Adapt the three component values to ``book_converter``'s one argument.

    ``gr.Interface`` calls its function with one positional argument per
    input component, whereas ``book_converter`` takes a single sequence.
    """
    # The File component is expected to hand over a temp-file object (or
    # None when nothing was uploaded); book_converter needs a plain name.
    upload_name = getattr(upload, "name", upload)
    if mode != "URL":
        # book_converter reads the file from position 0 and the extension
        # from position 2, so in file mode both carry the uploaded path.
        source = upload_name
    return book_converter([source, mode, upload_name])


# A plain list of components replaces gr.inputs.Column / FileUploader and the
# element_type keyword, none of which Gradio provides.
inputs = [
    gr.inputs.Textbox(lines=1, default="Enter URL or choose file"),
    gr.inputs.Radio(["URL", "File"], default="URL"),
    gr.inputs.File(label="Choose file", optional=True),
]
interface = gr.Interface(
    _convert_from_components,
    inputs,
    gr.outputs.Dataframe(),
    title="Book to Dataset Converter",
    description="Convert a book in pdf or txt format to a dataset that can be used to train AI models.",
)
interface.launch()