File size: 2,665 Bytes
0014e32
5700651
c529144
1f46a45
c529144
 
0014e32
5700651
0014e32
 
 
5700651
c529144
0014e32
 
 
 
5700651
c529144
0014e32
 
 
 
c529144
0014e32
 
 
 
 
c529144
0014e32
 
 
 
c529144
0014e32
c529144
0014e32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c529144
e8e2695
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import streamlit as st
import urllib.request
import PyPDF2
import re
import pandas as pd

def convert_pdf_to_txt(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: The PDF to read. May be raw ``bytes``/``bytearray`` (as
            produced by ``download_book`` and ``upload_book``) or anything
            the PyPDF2 reader accepts directly (a path or a binary stream).

    Returns:
        The concatenated text of all pages, in page order.
    """
    import io  # local import: only needed to wrap in-memory PDF data

    # The PyPDF2 reader needs a path or a seekable stream; raw bytes must be
    # wrapped first or the reader cannot open them.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_file = io.BytesIO(pdf_file)

    # Newer PyPDF2 releases expose PdfReader / pages / extract_text and no
    # longer support the legacy names; fall back to the legacy API only when
    # the installed version predates the rename.
    if hasattr(PyPDF2, "PdfReader"):
        reader = PyPDF2.PdfReader(pdf_file)
        return ''.join(page.extract_text() or '' for page in reader.pages)

    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return ''.join(
        pdf_reader.getPage(i).extractText() for i in range(pdf_reader.numPages)
    )

def preprocess_text(text):
    """Normalize text by stripping punctuation and lowercasing it.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed; the remainder is lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercase string.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    without_punctuation = non_word_or_space.sub('', text)
    return without_punctuation.lower()

def download_book(url):
    """Fetch the raw bytes found at *url*.

    Args:
        url: A URL understood by ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``, or ``None`` when the URL is
        malformed or the request fails. ``main`` checks for ``None`` to
        report a failed download.
    """
    # NOTE(review): urlopen is not limited to http(s); if the URL comes from
    # untrusted users, consider validating the scheme before calling this.
    try:
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (OSError, ValueError):
        # URLError/HTTPError derive from OSError; ValueError is raised for
        # URLs with a missing or unknown scheme (e.g. an empty string).
        return None

def upload_book():
    """Render a file-upload widget and return the chosen file's contents.

    The widget accepts files with a ``pdf`` or ``txt`` extension.

    Returns:
        The result of ``read()`` on the uploaded file, or ``None`` when no
        file has been selected.
    """
    chosen = st.file_uploader("Choose a book file", type=["pdf", "txt"])
    return chosen.read() if chosen is not None else None

def _book_bytes_to_text(data):
    """Turn the raw bytes of a book into text.

    PDF content is recognised by its ``%PDF`` header rather than by file
    name, because only the bytes are available here. Anything else is
    treated as UTF-8 text.

    Args:
        data: Raw contents of the book.

    Returns:
        The extracted text, or ``None`` when the bytes are neither a PDF nor
        valid UTF-8.
    """
    import io  # local import: only needed to wrap in-memory PDF data

    # The header normally sits at offset 0; searching the first KiB also
    # accepts files that carry a few stray bytes before it.
    if b"%PDF" in data[:1024]:
        return convert_pdf_to_txt(io.BytesIO(data))
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return None


def _show_dataset(data):
    """Convert raw book bytes into a one-row dataset and display it."""
    with st.spinner("Converting book to dataset..."):
        text = _book_bytes_to_text(data)
        if text is None:
            st.error("Invalid file format. Please upload a book in pdf or txt format.")
            return
        dataset = pd.DataFrame({'text': [preprocess_text(text)]})
        st.write(dataset)


def main():
    """Render the Streamlit page that converts a book into a dataset.

    The book comes either from a URL typed into the sidebar or from an
    uploaded file. Its text is extracted, cleaned with ``preprocess_text``
    and shown as a single-row DataFrame with a ``text`` column.
    """
    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")

    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))

    if source == "URL":
        url = st.sidebar.text_input("Enter URL of book")
        if st.button("Convert"):
            with st.spinner("Downloading book..."):
                try:
                    book = download_book(url)
                except (OSError, ValueError):
                    # Covers network failures and malformed/empty URLs.
                    book = None
            if book is None:
                st.error("Failed to download book")
                # Stop here: there is nothing to convert.
                return
            _show_dataset(book)
    else:
        # The uploader must be rendered on every run. If it only existed
        # inside the button branch it would disappear on the rerun triggered
        # by picking a file, and the upload could never complete.
        uploaded_file = upload_book()
        if st.button("Upload"):
            if uploaded_file is None:
                st.error("Failed to upload book")
            else:
                _show_dataset(uploaded_file)

# Launch the app only when this file is executed as a script (for example
# via `streamlit run`), not when it is imported by another module.
if __name__ == "__main__":
    main()