imseldrith committed on
Commit
0014e32
·
1 Parent(s): 5128b03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -61
app.py CHANGED
@@ -1,76 +1,72 @@
1
- import gradio as gr
2
  import urllib.request
3
  import PyPDF2
4
  import re
5
  import pandas as pd
6
- from tqdm import tqdm
7
 
8
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, concatenated in page order.

    ``pdf_file`` is handed unchanged to ``PyPDF2.PdfFileReader``.
    """
    reader = PyPDF2.PdfFileReader(pdf_file)
    page_texts = (
        reader.getPage(index).extractText() for index in range(reader.numPages)
    )
    return "".join(page_texts)
14
 
15
def extract_text_from_txt(txt_file, encoding="utf-8"):
    """Read a text file and return its entire content as one string.

    Args:
        txt_file: Path of the file to read.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding.

    Returns:
        The decoded content of the file.
    """
    with open(txt_file, "r", encoding=encoding) as file:
        return file.read()
19
 
20
def book_to_dataset(file, file_type):
    """Build a word-frequency table from a book file.

    Args:
        file: The book, passed on to ``extract_text_from_pdf`` or
            ``extract_text_from_txt`` depending on ``file_type``.
        file_type: Either ``"pdf"`` or ``"txt"``.

    Returns:
        A DataFrame with the columns "Word" and "Frequency", one row per
        distinct ``\\w+`` match, in order of first appearance.

    Raises:
        ValueError: If ``file_type`` is neither "pdf" nor "txt".
    """
    from collections import Counter

    if file_type == "pdf":
        text = extract_text_from_pdf(file)
    elif file_type == "txt":
        text = extract_text_from_txt(file)
    else:
        raise ValueError("Invalid file type")

    # Counter keeps first-seen order, so rows match the manual dict version.
    words_frequency = Counter(re.findall(r'\w+', text))
    return pd.DataFrame(list(words_frequency.items()), columns=["Word", "Frequency"])
33
 
34
def book_to_dataset_progress(file, file_type):
    """Build a word-frequency table from a book file, showing a progress bar.

    Same contract as ``book_to_dataset``; the only difference is that the
    word iteration is wrapped in ``tqdm`` with the label "Converting...".

    Args:
        file: The book, passed on to ``extract_text_from_pdf`` or
            ``extract_text_from_txt`` depending on ``file_type``.
        file_type: Either ``"pdf"`` or ``"txt"``.

    Returns:
        A DataFrame with the columns "Word" and "Frequency", one row per
        distinct ``\\w+`` match, in order of first appearance.

    Raises:
        ValueError: If ``file_type`` is neither "pdf" nor "txt".
    """
    from collections import Counter

    if file_type == "pdf":
        text = extract_text_from_pdf(file)
    elif file_type == "txt":
        text = extract_text_from_txt(file)
    else:
        raise ValueError("Invalid file type")

    words = re.findall(r'\w+', text)
    # Counter consumes the tqdm iterator, so the bar advances once per word.
    words_frequency = Counter(tqdm(words, desc="Converting..."))
    return pd.DataFrame(list(words_frequency.items()), columns=["Word", "Frequency"])
47
 
48
def book_converter(inputs):
    """Resolve the selected book source and convert it to a word-frequency dataset.

    Args:
        inputs: Indexable triple ``(value, source, name)``. When ``source`` is
            "URL", ``value`` is the address to download. Otherwise ``value``
            is passed on as the file and ``name`` supplies the extension.

    Returns:
        Whatever ``book_to_dataset_progress`` returns for the resolved file.

    Raises:
        ValueError: If the URL scheme is not http, https or ftp, or the URL
            path does not end in a file name.
    """
    from urllib.parse import urlsplit

    if inputs[1] == "URL":
        url = inputs[0]
        parts = urlsplit(url)
        # The URL is user-supplied: refuse schemes such as file:// that
        # would read from the local machine instead of downloading.
        if parts.scheme not in ("http", "https", "ftp"):
            raise ValueError("Unsupported URL scheme")
        # Use only the path component so "?query" and "#fragment" do not
        # leak into the saved file name or the detected extension.
        file_name = parts.path.split("/")[-1]
        if not file_name:
            raise ValueError("URL does not point to a file")
        urllib.request.urlretrieve(url, file_name)
        file = file_name
    else:
        file = inputs[0]
        file_name = inputs[2]

    # Lower-case so "BOOK.PDF" is treated the same as "book.pdf".
    file_type = file_name.split(".")[-1].lower()
    return book_to_dataset_progress(file, file_type)
59
 
60
# Input widgets for the converter UI: a text box, a URL/File selector and a
# file chooser, in that order.
# NOTE(review): `element_type=` and `gr.inputs.FileUploader` do not appear in
# the gradio inputs API as far as I can tell - confirm against the installed
# gradio version.
inputs = (
    [
        gr.inputs.Textbox(lines=1, default="Enter URL or choose file", element_type="url"),
        gr.inputs.Radio(["URL", "File"], default="URL"),
        gr.inputs.FileUploader(upload_label="Choose file", clear_label="Clear file",)
    ]
)

# Wire book_converter to the widgets above and render its result as a table.
# NOTE(review): book_converter takes a single sequence argument; confirm how
# gr.Interface passes the three widget values to it.
interface = gr.Interface(
    book_converter,
    inputs,
    gr.outputs.Dataframe(),
    title="Book to Dataset Converter",
    description="Convert a book in pdf or txt format to a dataset that can be used to train AI models."
)

# Start the web UI as soon as the module is executed.
interface.launch()
 
 
1
+ import streamlit as st
2
  import urllib.request
3
  import PyPDF2
4
  import re
5
  import pandas as pd
 
6
 
7
def convert_pdf_to_txt(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_file: The PDF to read. Raw ``bytes``/``bytearray`` content is
            wrapped in an in-memory stream first; anything else is handed to
            ``PyPDF2.PdfFileReader`` unchanged.

    Returns:
        The extracted text of all pages joined into one string.
    """
    import io

    # Callers in this file pass downloaded/uploaded bytes, which the reader
    # cannot seek in directly.
    if isinstance(pdf_file, (bytes, bytearray)):
        pdf_file = io.BytesIO(pdf_file)

    # NOTE(review): PdfFileReader/numPages/getPage/extractText are the legacy
    # PyPDF2 names - confirm the pinned PyPDF2 version still provides them.
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    return "".join(
        pdf_reader.getPage(i).extractText() for i in range(pdf_reader.numPages)
    )
13
 
14
def preprocess_text(text):
    """Strip every character that is neither a word character nor whitespace, then lowercase."""
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
19
 
20
def download_book(url, timeout=30):
    """Download the resource at ``url`` and return its raw bytes.

    Args:
        url: Address of the book. Only ``http`` and ``https`` URLs are
            fetched, because the value comes from user input and other
            schemes (e.g. ``file://``) would read from the local machine.
        timeout: Seconds to wait on the connection before giving up.

    Returns:
        The response body as ``bytes``, or ``None`` when the URL is not
        acceptable or the download fails. The caller in this file tests the
        result against ``None``.
    """
    import http.client
    from urllib.parse import urlsplit

    try:
        if urlsplit(url).scheme not in ("http", "https"):
            return None
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read()
    except (OSError, ValueError, http.client.HTTPException):
        # URLError/HTTPError/timeouts are OSError subclasses; malformed URLs
        # raise ValueError.
        return None
 
 
 
 
 
 
 
 
 
24
 
25
def upload_book():
    """Render the book file picker and return the chosen file's bytes, or None if nothing is selected."""
    picked = st.file_uploader("Choose a book file", type=["pdf", "txt"])
    return picked.read() if picked is not None else None
 
 
 
 
 
 
 
 
30
 
31
def _show_dataset(text):
    """Clean ``text``, wrap it in a one-row DataFrame and render it on the page."""
    dataset = pd.DataFrame({'text': [preprocess_text(text)]})
    st.write(dataset)


def main():
    """Render the converter page and handle the URL and upload workflows.

    The sidebar selects the source. For "URL" the book is downloaded and
    parsed as a PDF once "Convert" is pressed. For "Upload" the picked file
    is parsed as PDF or decoded as UTF-8 text once "Upload" is pressed.
    """
    import io

    st.set_page_config(page_title="Book to Dataset Converter", page_icon=":book:", layout="wide")
    st.title("Book to Dataset Converter")
    st.write("This app allows you to convert a book to a dataset that can be used to train AI models.")

    source = st.sidebar.radio("Select source of book", ("URL", "Upload"))

    if source == "URL":
        url = st.sidebar.text_input("Enter URL of book")
        if not st.button("Convert"):
            return
        with st.spinner("Downloading book..."):
            try:
                book = download_book(url)
            except (OSError, ValueError):
                # Network failures and malformed URLs count as a failed download.
                book = None
        if book is None:
            st.error("Failed to download book")
            # Stop here: there is nothing to convert.
            return
        with st.spinner("Converting book to dataset..."):
            # The PDF reader needs a seekable stream, not raw bytes.
            _show_dataset(convert_pdf_to_txt(io.BytesIO(book)))
        return

    # Render the uploader on every run. If it only existed inside the button
    # branch it would vanish on the rerun triggered by picking a file.
    uploaded_file = upload_book()
    if not st.button("Upload"):
        return
    if uploaded_file is None:
        st.error("Failed to upload book")
        return

    # upload_book() returns the file content, so the type is detected from
    # the content itself: PDF files start with the "%PDF" signature.
    if uploaded_file[:1024].lstrip().startswith(b"%PDF"):
        with st.spinner("Converting book to dataset..."):
            _show_dataset(convert_pdf_to_txt(io.BytesIO(uploaded_file)))
        return

    try:
        text = uploaded_file.decode('utf-8')
    except UnicodeDecodeError:
        st.error("Invalid file format. Please upload a book in pdf or txt format.")
        return
    with st.spinner("Converting book to dataset..."):
        _show_dataset(text)
70
 
71
# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()