Spaces:
Build error
Build error
File size: 2,665 Bytes
0014e32 5700651 c529144 1f46a45 c529144 0014e32 5700651 0014e32 5700651 c529144 0014e32 5700651 c529144 0014e32 c529144 0014e32 c529144 0014e32 c529144 0014e32 c529144 0014e32 c529144 e8e2695 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import streamlit as st
import urllib.request
import PyPDF2
import re
import pandas as pd
def convert_pdf_to_txt(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: The PDF to read. May be a path, a seekable binary
            file-like object, or the raw PDF content as ``bytes``.

    Returns:
        The concatenated text of all pages, in page order.
    """
    import io

    # The PyPDF2 readers need a path or a seekable stream; raw bytes (as
    # returned by a download or an upload's ``read()``) must be wrapped first.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_file = io.BytesIO(pdf_file)

    if hasattr(PyPDF2, "PdfReader"):
        # Current PyPDF2 API; the legacy names below were removed in 3.x.
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in reader.pages)
    else:
        # Legacy PyPDF2 (1.x) API, kept so older pinned installs still work.
        reader = PyPDF2.PdfFileReader(pdf_file)
        page_texts = (
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        )

    # Join once instead of growing a string with += for every page.
    return ''.join(page_text or '' for page_text in page_texts)
def preprocess_text(text):
    """Strip punctuation from ``text`` and lowercase what remains.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted; whitespace itself is kept.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
def download_book(url):
    """Fetch ``url`` and return the response body as bytes.

    Args:
        url: Location of the book to download.

    Returns:
        The raw bytes of the response body.

    Raises:
        urllib.error.URLError: If the resource cannot be fetched.
        ValueError: If ``url`` is empty or has no recognizable scheme.
    """
    # NOTE(review): the URL is not restricted to http(s); urlopen also
    # accepts other schemes it has handlers for (e.g. file:, data:).
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        return response.read()
def upload_book():
    """Show a pdf/txt file picker and return the chosen file's content.

    Returns:
        The result of ``read()`` on the uploaded file, or ``None`` when
        no file has been selected.
    """
    selection = st.file_uploader("Choose a book file", type=["pdf", "txt"])
    return selection.read() if selection is not None else None
def _book_to_text(data):
    """Return the text of a book given as raw bytes, or None if unsupported.

    The format is detected from the content itself: a PDF header near the
    start means PDF, otherwise the bytes must be valid UTF-8 text.
    """
    import io

    if b"%PDF-" in data[:1024]:
        # Hand the parser a seekable stream rather than raw bytes.
        return convert_pdf_to_txt(io.BytesIO(data))
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return None


def _show_dataset(data):
    """Convert raw book bytes into a one-row dataset and render it."""
    with st.spinner("Converting book to dataset..."):
        text = _book_to_text(data)
        if text is not None:
            dataset = pd.DataFrame({'text': [preprocess_text(text)]})
    if text is None:
        st.error("Invalid file format. Please upload a book in pdf or txt format.")
        return
    st.write(dataset)


def main():
    """Render the Streamlit page that turns a book into a dataset.

    The book comes either from a URL typed in the sidebar or from an
    uploaded file. Its text is extracted, preprocessed, and displayed as
    a single-row DataFrame with a ``text`` column.
    """
    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")
    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))

    if source == "URL":
        url = st.sidebar.text_input("Enter URL of book")
        if not st.button("Convert"):
            return
        with st.spinner("Downloading book..."):
            try:
                book = download_book(url)
            except (OSError, ValueError):
                # OSError covers urllib's URLError/HTTPError and socket
                # failures; ValueError is raised for an empty or
                # malformed URL.
                book = None
        if not book:
            st.error("Failed to download book")
            return
        _show_dataset(book)
        return

    # The uploader must be rendered on every run: a button is only True on
    # the run right after the click, so a widget nested under it would
    # vanish as soon as choosing a file triggers the next rerun.
    book = upload_book()
    if not st.button("Upload"):
        return
    if book is None:
        st.error("Failed to upload book")
        return
    _show_dataset(book)
# Streamlit executes the script as __main__, so this guard still starts the
# app while keeping the module importable without side effects.
if __name__ == "__main__":
    main()