Spaces:
Build error
Build error
Commit
·
c529144
1
Parent(s):
5280420
Update app.py
Browse files
app.py
CHANGED
@@ -1,58 +1,84 @@
|
|
1 |
from flask import Flask, request, render_template
|
2 |
import requests
|
|
|
|
|
|
|
|
|
3 |
import re
|
4 |
-
import
|
5 |
-
from googletrans import Translator
|
6 |
-
|
7 |
-
nltk.download("punkt")
|
8 |
|
9 |
app = Flask(__name__)
|
10 |
|
11 |
-
|
12 |
-
translator = Translator(service_urls=['translate.google.com'])
|
13 |
-
return translator.translate(text, dest=dest, src=src).text
|
14 |
-
|
15 |
-
def text_processing(text, language='en'):
    """Encode *text* as a list of integer word ids.

    When ``language`` is not ``'en'`` the text is first translated to English
    with ``translate_text``. The (possibly translated) text is then reduced to
    ASCII letters and whitespace, tokenized with ``nltk.word_tokenize`` and
    each token is replaced by an integer id.

    Args:
        text: Raw input text.
        language: Language code of ``text``; any value other than ``'en'``
            triggers a translation to English.

    Returns:
        A list with one integer per token, in token order. Ids are assigned
        in order of first appearance, so equal input gives equal output.
    """
    # Translate BEFORE filtering: the regex below deletes every character
    # that is not an ASCII letter or whitespace, which would erase non-Latin
    # text before the translator ever saw it.
    if language != 'en':
        text = translate_text(text, dest='en')

    # remove special characters and numbers
    processed_text = re.sub(r'[^a-zA-Z\s]', '', text)

    # tokenize the text into words (done once, on the final text)
    tokens = nltk.word_tokenize(processed_text)

    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings instead would give ids that change between runs.
    word_to_int = {word: i for i, word in enumerate(dict.fromkeys(tokens))}

    # encode the words as integers
    return [word_to_int[word] for word in tokens]
|
37 |
-
|
38 |
-
@app.route("/", methods=['GET', 'POST'])
def index():
    """Render the form and, on POST, the integer encoding of the submission.

    A POST may carry a ``url`` form field (the page is downloaded and
    translated to English) or a ``file`` upload (decoded as UTF-8 and treated
    as English). A GET, or a POST with neither field, renders the bare form.

    Returns:
        The rendered ``index.html`` template, with ``encoded_text`` set when
        there was input to process.
    """
    if request.method != 'POST':
        return render_template('index.html')

    if 'url' in request.form:
        # NOTE(review): this fetches an arbitrary user-supplied URL; consider
        # restricting allowed hosts. The timeout keeps a stalled server from
        # blocking the worker indefinitely.
        response = requests.get(request.form['url'], timeout=30)
        # translate_text returns the translated text, not a language code, so
        # keep it as the text and mark the result as English instead of
        # passing it on as the ``language`` argument.
        text = translate_text(response.text, dest='en')
    elif 'file' in request.files:
        text = request.files['file'].read().decode('utf-8')
    else:
        # Nothing submitted: show the form instead of failing on an
        # unassigned ``text``.
        return render_template('index.html')

    encoded_text = text_processing(text, 'en')
    return render_template('index.html', encoded_text=encoded_text)
|
56 |
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
from flask import Flask, request, render_template
|
2 |
import requests
|
3 |
+
import PyPDF2
|
4 |
+
import io
|
5 |
+
import os
|
6 |
+
import googletrans
|
7 |
import re
|
8 |
+
import pandas as pd
|
|
|
|
|
|
|
9 |
|
10 |
app = Flask(__name__)
|
11 |
|
12 |
+
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    page = render_template('index.html')
    return page
|
15 |
|
16 |
+
def _pdf_bytes_to_text(raw):
    """Return the text of every page of the PDF in *raw*, each followed by a space."""
    # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
    # legacy PyPDF2 names used by this file; confirm the installed PyPDF2
    # version still provides them.
    reader = PyPDF2.PdfFileReader(io.BytesIO(raw))
    return ''.join(
        reader.getPage(number).extractText() + ' '
        for number in range(reader.getNumPages())
    )


def _translate_to_english(text):
    """Return *text* translated to English, or unchanged if detected as English."""
    translator = googletrans.Translator()
    # Detection is a Translator method; the result's ``lang`` is a language
    # code, which is what must be compared with 'en'.
    lang_code = translator.detect(text).lang
    if lang_code == 'en':
        return text

    translated = translator.translate(text, dest='en').text
    # LANGUAGES maps codes to readable names; fall back to the raw code.
    src_lang = googletrans.LANGUAGES.get(str(lang_code).lower(), lang_code)
    print('The text has been translated from {} to English'.format(src_lang))
    return translated


@app.route('/upload', methods=['POST'])
def upload():
    """Build ``dataset.csv`` (one sentence per row) from an upload or a URL.

    The request may carry a ``file`` upload or a ``url`` form field pointing
    at a ``.pdf`` or ``.txt`` document; the upload wins when both are given.
    The extracted text is translated to English when another language is
    detected, split into sentences on ``.``, ``!`` and ``?``, and written to
    ``dataset.csv`` in the working directory under a ``sentence`` column.

    Returns:
        A plain-text status message; a ``(message, 502)`` pair when the URL
        cannot be downloaded.
    """
    from urllib.parse import urlparse

    file = request.files.get('file')
    url = request.form.get('url')

    if file:
        # Read the upload straight from the request. Saving it under the
        # client-supplied filename would let the client choose the path and
        # would leave the stream at EOF for the reads below.
        file_extension = os.path.splitext(file.filename)[1].lower()
        payload = file.read()

        # Check file extension and read the content
        if file_extension == '.pdf':
            text = _pdf_bytes_to_text(payload)
        elif file_extension == '.txt':
            text = payload.decode('utf-8')
        else:
            return 'Invalid file format'
    elif url:
        # Take the extension from the URL path only, so a query string or
        # fragment after the file name does not hide it.
        file_extension = os.path.splitext(urlparse(url).path)[1].lower()
        if file_extension not in ('.pdf', '.txt'):
            return 'Invalid file format'

        # NOTE(review): this fetches an arbitrary user-supplied URL; consider
        # restricting allowed hosts.
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print('Error:', e)
            return 'Could not download the URL', 502

        if file_extension == '.pdf':
            text = _pdf_bytes_to_text(response.content)
        else:
            text = response.text
    else:
        return 'No file or URL found'

    # Check if the language of the text is English, otherwise translate it.
    # Translation is best-effort: on any failure the original text is kept.
    try:
        text = _translate_to_english(text)
    except Exception as e:
        print('Error:', e)

    # Convert the text to a dataset: one stripped, non-empty sentence per row
    sentences = [part.strip() for part in re.split(r'[.!?]+', text)]
    df = pd.DataFrame({'sentence': [s for s in sentences if s != '']})

    # Save the dataset to a CSV file (fixed name: each request overwrites it)
    df.to_csv('dataset.csv', index=False)

    return 'Dataset created successfully!'
|
82 |
+
|
83 |
+
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive debugger must
    # not be on by default: it lets anyone who can reach the port run code.
    # Set FLASK_DEBUG=1 to enable it for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|