Spaces:
Build error
Build error
Commit
·
0014e32
1
Parent(s):
5128b03
Update app.py
Browse files
app.py
CHANGED
@@ -1,76 +1,72 @@
|
|
1 |
-
import
|
2 |
import urllib.request
|
3 |
import PyPDF2
|
4 |
import re
|
5 |
import pandas as pd
|
6 |
-
from tqdm import tqdm
|
7 |
|
8 |
-
def
|
9 |
pdf_reader = PyPDF2.PdfFileReader(pdf_file)
|
10 |
-
text =
|
11 |
-
for
|
12 |
-
text += pdf_reader.getPage(
|
13 |
return text
|
14 |
|
15 |
-
def
|
16 |
-
|
17 |
-
|
|
|
18 |
return text
|
19 |
|
20 |
-
def
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
text = extract_text_from_txt(file)
|
25 |
-
else:
|
26 |
-
raise ValueError("Invalid file type")
|
27 |
-
words = re.findall(r'\w+', text)
|
28 |
-
words_frequency = {}
|
29 |
-
for word in words:
|
30 |
-
words_frequency[word] = words_frequency.get(word, 0) + 1
|
31 |
-
df = pd.DataFrame(list(words_frequency.items()), columns=["Word", "Frequency"])
|
32 |
-
return df
|
33 |
|
34 |
-
def
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
else:
|
40 |
-
raise ValueError("Invalid file type")
|
41 |
-
words = re.findall(r'\w+', text)
|
42 |
-
words_frequency = {}
|
43 |
-
for word in tqdm(words, desc="Converting..."):
|
44 |
-
words_frequency[word] = words_frequency.get(word, 0) + 1
|
45 |
-
df = pd.DataFrame(list(words_frequency.items()), columns=["Word", "Frequency"])
|
46 |
-
return df
|
47 |
|
48 |
-
def
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
urllib.request.urlretrieve(url, file_name)
|
53 |
-
file = file_name
|
54 |
-
file_type = file_name.split(".")[-1]
|
55 |
-
else:
|
56 |
-
file = inputs[0]
|
57 |
-
file_type = inputs[2].split(".")[-1]
|
58 |
-
return book_to_dataset_progress(file, file_type)
|
59 |
|
60 |
-
|
61 |
-
[
|
62 |
-
gr.inputs.Textbox(lines=1, default="Enter URL or choose file", element_type="url"),
|
63 |
-
gr.inputs.Radio(["URL", "File"], default="URL"),
|
64 |
-
gr.inputs.FileUploader(upload_label="Choose file", clear_label="Clear file",)
|
65 |
-
]
|
66 |
-
)
|
67 |
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
|
|
|
|
1 |
+
import streamlit as st
|
2 |
import urllib.request
|
3 |
import PyPDF2
|
4 |
import re
|
5 |
import pandas as pd
|
|
|
6 |
|
7 |
+
def convert_pdf_to_txt(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: The PDF to read. Raw ``bytes``/``bytearray`` content is
            wrapped in an in-memory stream; any other value is handed to the
            PyPDF2 reader unchanged.

    Returns:
        The text of all pages concatenated in page order.
    """
    import io

    # The callers in this file pass raw bytes (downloaded or uploaded
    # content), but the PyPDF2 readers expect a stream.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_file = io.BytesIO(pdf_file)

    # Prefer the snake_case API when the installed PyPDF2 provides it; the
    # camelCase names used originally are removed in PyPDF2 3.x.
    if hasattr(PyPDF2, "PdfReader"):
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Fallback for old PyPDF2 releases that only ship the legacy API.
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        pdf_reader.getPage(i).extractText() for i in range(pdf_reader.numPages)
    )
|
13 |
|
14 |
+
def preprocess_text(text):
    """Strip punctuation from *text* and convert it to lowercase.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is removed; whitespace, including
    newlines, is kept as-is.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Remove the unwanted characters first, then lowercase the remainder
    # (same order as the original statements).
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
|
19 |
|
20 |
+
def download_book(url):
    """Fetch the raw bytes of the book located at *url*.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``, or ``None`` when the URL is malformed
        or the download fails (the caller in this file checks for ``None``).
    """
    # NOTE(review): urlopen also accepts non-HTTP schemes such as file:// —
    # if `url` comes from untrusted users, consider restricting the scheme.
    try:
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(url) as response:
            return response.read()
    except (OSError, ValueError):
        # URLError/HTTPError and socket timeouts are OSError subclasses;
        # ValueError is raised for malformed URLs such as an empty string.
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
def upload_book():
    """Render the book uploader widget and return the chosen file's content.

    Returns:
        The uploaded file's bytes, or ``None`` while no file is selected.
    """
    uploaded_file = st.file_uploader("Choose a book file", type=["pdf", "txt"])
    if uploaded_file is None:
        return None
    # getvalue() returns the whole buffer regardless of the current stream
    # position, so the result cannot come back empty if the same file object
    # was already read; read() only returns what follows the position.
    return uploaded_file.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
+
def _looks_like_pdf(data):
    """Return True when *data* carries the ``%PDF-`` header near its start."""
    return b"%PDF-" in data[:1024]


def _book_bytes_to_dataset(data):
    """Convert raw book bytes (PDF or UTF-8 text) into a one-row DataFrame.

    Raises:
        UnicodeDecodeError: If *data* is not a PDF and is not valid UTF-8.
    """
    import io

    if _looks_like_pdf(data):
        # Wrap the bytes in a stream, which is what the PDF reader expects.
        text = convert_pdf_to_txt(io.BytesIO(data))
    else:
        text = data.decode('utf-8')
    return pd.DataFrame({'text': [preprocess_text(text)]})


def main():
    """Run the Streamlit page: obtain a book, convert it, show the dataset.

    The book comes either from a URL entered in the sidebar or from an
    uploaded file. Its content is detected as PDF or plain UTF-8 text from
    the bytes themselves, cleaned with ``preprocess_text`` and displayed as a
    single-row DataFrame with a ``text`` column.
    """
    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")

    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))

    if source == "URL":
        url = st.sidebar.text_input("Enter URL of book")
        if not st.button("Convert"):
            return
        with st.spinner("Downloading book..."):
            try:
                book = download_book(url)
            except (OSError, ValueError):
                # download_book may raise instead of returning None.
                book = None
        if not book:
            st.error("Failed to download book")
            return  # do not try to convert a failed download
    else:
        # Render the uploader on every run; creating it only after a button
        # click would make it vanish again on the next rerun.
        book = upload_book()
        if not st.button("Upload"):
            return
        if not book:
            st.error("Failed to upload book")
            return

    with st.spinner("Converting book to dataset..."):
        try:
            dataset = _book_bytes_to_dataset(book)
        except UnicodeDecodeError:
            dataset = None

    if dataset is None:
        st.error("Invalid file format. Please upload a book in pdf or txt format.")
        return
    st.write(dataset)
|
70 |
|
71 |
+
# Start the app only when this file is executed as a script.
if __name__ == '__main__':
    main()
|