# Spaces: Build error
#from flask import Flask, request, render_template
import io
import os
import re
from urllib.parse import urlparse

import googletrans
import gradio as gr
import pandas as pd
import PyPDF2
import requests
# Extensions this app knows how to turn into text.
_SUPPORTED_EXTENSIONS = ('.pdf', '.txt')

# One or more sentence terminators in a row mark a sentence boundary.
_SENTENCE_BOUNDARY = re.compile(r'[.!?]+')


def _pdf_to_text(stream):
    """Return the text of every page of the PDF in *stream*, space-separated."""
    reader = PyPDF2.PdfReader(stream)
    return ' '.join(page.extract_text() or '' for page in reader.pages)


def _read_local_file(path, extension):
    """Return the text of the local ``.pdf`` or ``.txt`` file at *path*."""
    if extension == '.pdf':
        with open(path, 'rb') as handle:
            return _pdf_to_text(handle)
    with open(path, encoding='utf-8') as handle:
        return handle.read()


def _read_download(response, extension):
    """Return the text of an already-downloaded ``.pdf`` or ``.txt`` response."""
    if extension == '.pdf':
        return _pdf_to_text(io.BytesIO(response.content))
    return response.text


def _to_english(text):
    """Translate *text* to English when it is detected as another language.

    Translation is best-effort: any failure is printed and the text is
    returned unchanged so that the dataset can still be produced.
    """
    try:
        translator = googletrans.Translator()
        lang_code = translator.detect(text).lang
        # ``detect`` yields a language *code*; compare codes, not names.
        if lang_code != 'en':
            text = translator.translate(text, dest='en').text
            src_lang = googletrans.LANGUAGES.get(str(lang_code).lower(), lang_code)
            print('The text has been translated from {} to English'.format(src_lang))
    except Exception as e:  # deliberate best-effort boundary around the translator
        print('Error:', e)
    return text


def upload(url=None, file=None, *, output_path='dataset.csv'):
    """Build a one-sentence-per-row CSV dataset from a PDF/TXT file or URL.

    An uploaded file takes precedence over a URL when both are supplied.

    Args:
        url: Address of a ``.pdf`` or ``.txt`` document, or an empty value.
        file: Uploaded file, given either as a filesystem path or as an
            object whose ``name`` attribute is that path; may be ``None``.
        output_path: Where the CSV is written (column ``sentence``).

    Returns:
        A human-readable status message. Unsupported extensions, missing
        input and download failures are reported through this message.
    """
    url = (url or '').strip()

    if file:
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = getattr(file, 'name', '')
        extension = os.path.splitext(path)[1].lower()
        if extension not in _SUPPORTED_EXTENSIONS:
            return 'Invalid file format'
        text = _read_local_file(path, extension)
    elif url:
        # Look only at the URL path so a query string cannot hide the extension,
        # and reject unsupported types before spending a network round-trip.
        extension = os.path.splitext(urlparse(url).path)[1].lower()
        if extension not in _SUPPORTED_EXTENSIONS:
            return 'Invalid file format'
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            return 'Failed to download URL: {}'.format(exc)
        text = _read_download(response, extension)
    else:
        return 'No file or URL found'

    text = _to_english(text)

    # Convert the text to a dataset: one non-empty, trimmed sentence per row.
    fragments = (part.strip() for part in _SENTENCE_BOUNDARY.split(text))
    sentences = [fragment for fragment in fragments if fragment]
    pd.DataFrame({'sentence': sentences}).to_csv(output_path, index=False)
    return 'Dataset created successfully!'
# Gradio passes input values to the handler positionally, so the component
# order here must match the handler's parameter order: URL first, then file.
demo = gr.Interface(
    fn=upload,
    inputs=[gr.Textbox(label='url'), gr.File(label='file')],
    outputs="text",
)

# Only start the web server when run as a script, so importing this module
# (for tests or tooling) has no side effect beyond building the interface.
if __name__ == "__main__":
    demo.launch()