"""Streamlit app that converts a book (PDF or plain text) into a one-row dataset."""

import io
import re
import urllib.request

import pandas as pd
import PyPDF2
import streamlit as st

# Only plain web URLs may be fetched; this keeps a user-supplied URL from
# reaching local files through schemes such as ``file://``.
_ALLOWED_URL_PREFIXES = ("http://", "https://")


def _extract_page_texts(pdf_stream):
    """Return the extracted text of every page in *pdf_stream*, in page order.

    Prefers the current PyPDF2 reader API and falls back to the legacy
    camelCase API when ``PyPDF2.PdfReader`` is not available.
    """
    if hasattr(PyPDF2, "PdfReader"):
        reader = PyPDF2.PdfReader(pdf_stream)
        # Guard against a page yielding no text object at all.
        return [page.extract_text() or "" for page in reader.pages]
    reader = PyPDF2.PdfFileReader(pdf_stream)
    return [reader.getPage(i).extractText() for i in range(reader.numPages)]


def convert_pdf_to_txt(pdf_file):
    """Extract the text of a PDF as a single string.

    Args:
        pdf_file: Either the raw PDF content as ``bytes``/``bytearray`` or a
            binary file-like object positioned at the start of the PDF.

    Returns:
        The text of all pages concatenated in page order.
    """
    if isinstance(pdf_file, (bytes, bytearray)):
        # The PDF reader needs a seekable stream, not a bytes object.
        pdf_file = io.BytesIO(pdf_file)
    return "".join(_extract_page_texts(pdf_file))


def preprocess_text(text):
    """Strip every character that is not a word character or whitespace, then lowercase."""
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower()


def download_book(url):
    """Download the resource at *url* and return its content.

    Args:
        url: An ``http://`` or ``https://`` URL given as a string.

    Returns:
        The response body as ``bytes``, or ``None`` when the URL is empty,
        uses another scheme, or the download fails.
    """
    if not url:
        return None
    url = url.strip()
    if not url.lower().startswith(_ALLOWED_URL_PREFIXES):
        return None
    try:
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (OSError, ValueError):
        # OSError covers URLError/HTTPError and socket failures; ValueError is
        # raised for malformed URLs.
        return None


def _select_uploaded_file():
    """Render the file-upload widget and return the uploaded file, or ``None``."""
    return st.file_uploader("Choose a book file", type=["pdf", "txt"])


def upload_book():
    """Render the file-upload widget and return the uploaded content.

    Returns:
        The uploaded file's content as ``bytes``, or ``None`` when no file
        has been uploaded yet.
    """
    uploaded_file = _select_uploaded_file()
    if uploaded_file is None:
        return None
    return uploaded_file.read()


def _text_to_dataset(text):
    """Build the one-row dataset (column ``text``) from raw book text."""
    return pd.DataFrame({'text': [preprocess_text(text)]})


def _run_url_flow():
    """Handle the "URL" source: download a PDF and show it as a dataset."""
    url = st.sidebar.text_input("Enter URL of book")
    if not st.button("Convert"):
        return
    with st.spinner("Downloading book..."):
        book = download_book(url)
    if book is None:
        st.error("Failed to download book")
        return  # Nothing to convert; do not fall through to the PDF parser.
    with st.spinner("Converting book to dataset..."):
        dataset = _text_to_dataset(convert_pdf_to_txt(book))
        st.write(dataset)


def _run_upload_flow():
    """Handle the "Upload" source: convert an uploaded PDF or text file."""
    # The uploader must be rendered on every run. If it were created only
    # after the button click it would vanish on the next rerun, before any
    # file could be processed.
    uploaded_file = _select_uploaded_file()
    if not st.button("Upload"):
        return
    if uploaded_file is None:
        st.error("Failed to upload book")
        return

    # Decide the format from the file name, not from the file's content bytes.
    file_name = uploaded_file.name.lower()
    content = uploaded_file.getvalue()
    if file_name.endswith(".pdf"):
        with st.spinner("Converting book to dataset..."):
            dataset = _text_to_dataset(convert_pdf_to_txt(content))
            st.write(dataset)
    elif file_name.endswith(".txt"):
        with st.spinner("Converting book to dataset..."):
            # Undecodable bytes become U+FFFD, which preprocess_text removes,
            # so one bad byte does not abort the whole conversion.
            text = content.decode('utf-8', errors='replace')
            dataset = _text_to_dataset(text)
            st.write(dataset)
    else:
        st.error("Invalid file format. Please upload a book in pdf or txt format.")


def main():
    """Render the app: choose a book source, convert it, and display the dataset."""
    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")

    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))
    if source == "URL":
        _run_url_flow()
    else:
        _run_upload_flow()


# Guarded so that importing this module does not start the UI.
if __name__ == "__main__":
    main()