BookTODataset / app.py
imseldrith's picture
Update app.py
c0b0220
raw
history blame
2.5 kB
#from flask import Flask, request, render_template
import requests
import PyPDF2
import io
import os
import googletrans
import re
import pandas as pd
import gradio as gr
# Extensions the converter knows how to read.
_SUPPORTED_EXTENSIONS = ('.pdf', '.txt')


def _local_path(file):
    """Return the filesystem path behind an uploaded-file value, or None.

    NOTE(review): the shape of the value Gradio hands to the callback depends
    on the installed Gradio version -- presumably either a plain path string
    or a temp-file wrapper exposing the path as ``.name``. Confirm against
    the version this app is deployed with.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return getattr(file, 'name', None)


def _url_extension(url):
    """Return the lower-cased file extension of *url*.

    The query string and fragment are dropped first so that
    ``book.pdf?download=1`` is still recognised as ``.pdf``.
    """
    bare_url = url.split('#', 1)[0].split('?', 1)[0]
    return os.path.splitext(bare_url)[1].lower()


def _pdf_to_text(stream):
    """Return the text of every page of the PDF in *stream*, space-separated.

    Uses the ``PdfReader`` API; the older ``PdfFileReader`` / ``getPage`` /
    ``extractText`` names were removed in PyPDF2 3.0.
    """
    reader = PyPDF2.PdfReader(stream)
    # extract_text() may come back empty for pages without a text layer.
    return ' '.join((page.extract_text() or '') for page in reader.pages)


def _translate_to_english(text):
    """Return *text* translated to English when it is in another language.

    Translation is best effort: any failure (network, unknown language,
    API change) is printed and the original text is returned unchanged.

    NOTE(review): assumes a googletrans release whose ``Translator`` methods
    are synchronous -- confirm against the pinned version.
    """
    try:
        translator = googletrans.Translator()
        # detect() reports a language *code* ('en', 'fr', ...); compare the
        # code itself, not the human-readable name from LANGUAGES.
        lang_code = translator.detect(text).lang
        if lang_code != 'en':
            text = translator.translate(text, dest='en').text
            src_lang = googletrans.LANGUAGES.get(lang_code, lang_code)
            print('The text has been translated from {} to English'.format(src_lang))
    except Exception as e:
        print('Error:', e)
    return text


def upload(url=None, file=None):
    """Turn a PDF or TXT book into a one-sentence-per-row CSV dataset.

    The text is read from *file* when given, otherwise downloaded from *url*.
    It is translated to English when needed, split into sentences on
    ``.``, ``!`` and ``?``, and written to ``dataset.csv`` in the current
    working directory under a single ``sentence`` column.

    Args:
        url: Address of a ``.pdf`` or ``.txt`` document, or an empty value.
        file: Uploaded file (a path, or an object exposing the path as
            ``.name``), or an empty value. Takes precedence over *url*.

    Returns:
        A status message: ``'Dataset created successfully!'``,
        ``'Invalid file format'`` or ``'No file or URL found'``.

    Raises:
        requests.RequestException: If the download fails or the server
            answers with an HTTP error status.
        OSError: If the uploaded file cannot be opened.
    """
    if file:
        path = _local_path(file)
        extension = os.path.splitext(path or '')[1].lower()
        if extension not in _SUPPORTED_EXTENSIONS:
            return 'Invalid file format'
        with open(path, 'rb') as handle:
            if extension == '.pdf':
                text = _pdf_to_text(handle)
            else:
                text = handle.read().decode('utf-8')
    elif url:
        # Validate the extension before fetching so an unsupported URL never
        # triggers a network request.
        extension = _url_extension(url)
        if extension not in _SUPPORTED_EXTENSIONS:
            return 'Invalid file format'
        response = requests.get(url, timeout=30)
        # Without this an HTML error page would be turned into a dataset.
        response.raise_for_status()
        if extension == '.pdf':
            text = _pdf_to_text(io.BytesIO(response.content))
        else:
            text = response.text
    else:
        return 'No file or URL found'

    text = _translate_to_english(text)

    # Convert the text to a dataset: one non-empty sentence per row.
    sentences = [
        piece.strip()
        for piece in re.split(r'[.!?]+', text)
        if piece.strip()
    ]
    pd.DataFrame({'sentence': sentences}).to_csv('dataset.csv', index=False)
    return 'Dataset created successfully!'


if __name__ == "__main__":
    # Components are declared here, in the same order as upload()'s
    # parameters; Gradio passes their values to the callback positionally.
    gr.Interface(
        upload,
        inputs=[gr.Textbox(label='url'), gr.File(label='file')],
        outputs="text",
    ).launch()