imseldrith committed on
Commit
5700651
·
1 Parent(s): 69f0100

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -66
app.py CHANGED
@@ -1,78 +1,77 @@
1
- #from flask import Flask, request, render_template
2
- import requests
3
  import PyPDF2
4
- import io
5
- import os
6
- import googletrans
7
  import re
8
  import pandas as pd
9
- import gradio as gr
10
-
11
-
12
def upload(url, file):
    """Build a one-sentence-per-row CSV dataset from an uploaded file or a URL.

    An uploaded file takes precedence over the URL when both are supplied.
    Only ``.pdf`` and ``.txt`` sources are accepted (extension match is
    case-insensitive). Text that is not detected as English is translated to
    English on a best-effort basis, then split into sentences and written to
    ``dataset.csv`` in the current working directory.

    Args:
        url: Address of a ``.pdf`` or ``.txt`` document; may be empty.
        file: The uploaded file, given either as a path string or as an
            object whose ``name`` attribute is the path on disk; may be
            ``None``.

    Returns:
        A status message: ``'Dataset created successfully!'`` on success,
        ``'Invalid file format'`` for an unsupported extension, or
        ``'No file or URL found'`` when neither input is provided.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        OSError: If the uploaded file cannot be opened.
    """
    from urllib.parse import urlsplit

    def read_pdf(stream):
        # Concatenate the text of every page, each followed by one space.
        reader = PyPDF2.PdfFileReader(stream)
        return ''.join(
            reader.getPage(index).extractText() + ' '
            for index in range(reader.getNumPages())
        )

    if file:
        # The arguments are the user's inputs and must not be overwritten.
        # Accept both a plain path and a temp-file wrapper exposing `.name`.
        path = getattr(file, 'name', file)
        file_extension = os.path.splitext(path)[1].lower()

        if file_extension == '.pdf':
            with open(path, 'rb') as stream:
                text = read_pdf(stream)
        elif file_extension == '.txt':
            with open(path, 'rb') as stream:
                text = stream.read().decode('utf-8')
        else:
            return 'Invalid file format'
    elif url:
        # Take the extension from the URL *path* so a query string or
        # fragment cannot hide it, and reject unsupported types before
        # spending a network round-trip on them.
        file_extension = os.path.splitext(urlsplit(url).path)[1].lower()
        if file_extension not in ('.pdf', '.txt'):
            return 'Invalid file format'

        response = requests.get(url, timeout=30)
        # Fail loudly instead of turning an HTTP error page into a dataset.
        response.raise_for_status()

        if file_extension == '.pdf':
            text = read_pdf(io.BytesIO(response.content))
        else:
            text = response.text
    else:
        return 'No file or URL found'

    # Translation depends on an external service, so it is deliberately
    # best-effort: on any failure the error is reported and the original
    # text is used unchanged.
    try:
        translator = googletrans.Translator()
        detected = translator.detect(text).lang
        if detected != 'en':
            text = translator.translate(text, dest='en').text
            src_lang = googletrans.LANGUAGES.get(detected, detected)
            print('The text has been translated from {} to English'.format(src_lang))
    except Exception as e:
        print('Error:', e)

    # Split on runs of sentence-ending punctuation and drop empty fragments.
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    lines = [piece for piece in fragments if piece]
    df = pd.DataFrame({'sentence': lines})

    # Save the dataset to a CSV file
    df.to_csv('dataset.csv', index=False)

    return 'Dataset created successfully!'
 
 
 
 
 
 
77
 
78
# Expose `upload` through a URL text box and a file picker; the status string
# it returns is rendered as plain text. The components are listed in the same
# order as the callback's (url, file) parameters.
gr.Interface(
    upload,
    inputs=[gr.Textbox(label='url'), gr.File(label='file')],
    outputs="text",
).launch()
 
1
+ import gradio as gr
2
+ import urllib.request
3
  import PyPDF2
 
 
 
4
  import re
5
  import pandas as pd
6
+ from tqdm import tqdm
 
 
 
 
 
7
 
8
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of *pdf_file* as a single string.

    ``pdf_file`` is handed to ``PyPDF2.PdfFileReader`` unchanged. Pages are
    visited in order and their extracted text is concatenated without any
    separator.
    """
    reader = PyPDF2.PdfFileReader(pdf_file)
    page_texts = [
        reader.getPage(index).extractText()
        for index in range(reader.numPages)
    ]
    return "".join(page_texts)
14
 
15
def extract_text_from_txt(txt_file):
    """Return the full contents of the text file at *txt_file*.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        txt_file: Path of the text file to read.

    Returns:
        The decoded file contents as one string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(txt_file, "r", encoding="utf-8") as file:
        return file.read()
 
 
 
 
 
 
 
 
 
 
 
19
 
20
def book_to_dataset(file, file_type):
    """Count how often each word occurs in a book.

    ``file`` is read with the PDF extractor when ``file_type`` is ``"pdf"``
    and with the plain-text extractor when it is ``"txt"``; any other value
    raises ``ValueError("Invalid file type")`` before anything is read.

    Returns a DataFrame with the columns ``Word`` and ``Frequency``, holding
    one row per distinct word in order of first appearance.
    """
    if file_type == "pdf":
        extract = extract_text_from_pdf
    elif file_type == "txt":
        extract = extract_text_from_txt
    else:
        raise ValueError("Invalid file type")

    tokens = re.findall(r'\w+', extract(file))

    # Seed every distinct word with zero (keeps first-seen order), then tally.
    tally = dict.fromkeys(tokens, 0)
    for token in tokens:
        tally[token] += 1

    rows = list(tally.items())
    return pd.DataFrame(rows, columns=["Word", "Frequency"])
33
 
34
def book_to_dataset_progress(file, file_type):
    """Count word occurrences in a book while showing a progress bar.

    ``file`` is read with the PDF extractor when ``file_type`` is ``"pdf"``
    and with the plain-text extractor when it is ``"txt"``; any other value
    raises ``ValueError("Invalid file type")`` before anything is read. The
    counting loop is wrapped in ``tqdm`` with the label ``Converting...``.

    Returns a DataFrame with the columns ``Word`` and ``Frequency``, holding
    one row per distinct word in order of first appearance.
    """
    if file_type == "pdf":
        extract = extract_text_from_pdf
    elif file_type == "txt":
        extract = extract_text_from_txt
    else:
        raise ValueError("Invalid file type")

    tokens = re.findall(r'\w+', extract(file))

    # Seed every distinct word with zero (keeps first-seen order), then tally
    # under the progress bar.
    tally = dict.fromkeys(tokens, 0)
    for token in tqdm(tokens, desc="Converting..."):
        tally[token] += 1

    rows = list(tally.items())
    return pd.DataFrame(rows, columns=["Word", "Frequency"])
 
47
 
48
def book_converter(inputs):
    """Convert the book selected in the UI into a word-frequency table.

    Args:
        inputs: Sequence ``(text, mode, upload)``. ``text`` is the URL box
            value; ``mode`` is ``"URL"`` to download ``text``, and any other
            value selects the uploaded file; ``upload`` is that file, given
            either as a path string or as an object whose ``name`` attribute
            is the path on disk.

    Returns:
        The result of ``book_to_dataset_progress`` for the resolved file.

    Raises:
        ValueError: If the URL is not an http(s) URL or its path has no file
            name. ``book_to_dataset_progress`` raises the same type for an
            unsupported file extension.
    """
    from urllib.parse import urlsplit

    if inputs[1] == "URL":
        url = inputs[0]
        parts = urlsplit(url)
        # The URL comes from the user: refuse file:// and other schemes so
        # the download cannot be used to read local files.
        if parts.scheme not in ("http", "https"):
            raise ValueError("Invalid URL")
        # Use only the path component so a query string or fragment does not
        # end up in the saved file name or the detected extension.
        file_name = parts.path.rsplit("/", 1)[-1]
        if not file_name:
            raise ValueError("Invalid URL")
        urllib.request.urlretrieve(url, file_name)
        file = file_name
    else:
        # In file mode the book is the upload (third input), not the text box.
        upload = inputs[2]
        file = getattr(upload, "name", upload)

    file_type = file.rsplit(".", 1)[-1].lower()
    return book_to_dataset_progress(file, file_type)
59
 
60
def _convert(text, mode, upload):
    """Adapt Gradio's per-component arguments to `book_converter`.

    Gradio calls the callback with one positional argument per input
    component, whereas `book_converter` expects a single sequence, so the
    values are packed here in component order.
    """
    return book_converter([text, mode, upload])


# Input components, in the order `book_converter` indexes them:
# [0] URL text, [1] source selector, [2] uploaded file.
inputs = [
    gr.inputs.Textbox(lines=1, default="Enter URL or choose file", label="URL"),
    gr.inputs.Radio(["URL", "File"], default="URL", label="Source"),
    gr.inputs.File(label="Choose file"),
]

interface = gr.Interface(
    _convert,
    inputs,
    gr.outputs.Dataframe(),
    title="Book to Dataset Converter",
    description="Convert a book in pdf or txt format to a dataset that can be used to train AI models.",
)

interface.launch()