imseldrith committed on
Commit
c529144
·
1 Parent(s): 5280420

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -49
app.py CHANGED
@@ -1,58 +1,84 @@
1
  from flask import Flask, request, render_template
2
  import requests
 
 
 
 
3
  import re
4
- import nltk
5
- from googletrans import Translator
6
-
7
- nltk.download("punkt")
8
 
9
  # Flask application object; the @app.route decorators in this file register views on it.
  app = Flask(__name__)
10
 
11
def translate_text(text, src='en', dest='en'):
    """Translate ``text`` from language ``src`` to language ``dest``.

    A fresh ``Translator`` pointed at ``translate.google.com`` is created
    on every call, and the ``.text`` attribute of its result is returned.
    """
    google_translator = Translator(service_urls=['translate.google.com'])
    translation = google_translator.translate(text, dest=dest, src=src)
    return translation.text
14
-
15
def text_processing(text, language='en'):
    """Encode ``text`` as a list of integer word ids.

    Args:
        text: Raw input text.
        language: Language code of ``text``. Anything other than ``'en'``
            causes the text to be translated to English before encoding.

    Returns:
        One integer per token, in token order. Ids are assigned in order of
        first appearance, so the same input always yields the same encoding.
    """
    # Translate BEFORE cleaning: the regex below keeps only ASCII letters and
    # whitespace, so running it first would erase most non-English text and
    # leave nothing to translate. 'auto' lets the service detect the source
    # language instead of relying on translate_text's src='en' default.
    if language != 'en':
        text = translate_text(text, src='auto', dest='en')

    # remove special characters and numbers
    processed_text = re.sub(r'[^a-zA-Z\s]', '', text)

    # tokenize the text into words (once, on the final English text)
    tokens = nltk.word_tokenize(processed_text)

    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings would give a different id assignment in every process.
    word_to_int = {word: i for i, word in enumerate(dict.fromkeys(tokens))}

    # encode the words as integers
    return [word_to_int[word] for word in tokens]
37
-
38
@app.route("/", methods=['GET', 'POST'])
def index():
    """Render the landing page and, on POST, the integer-encoded input text.

    A POST may carry either a ``url`` form field (the page is downloaded) or
    a ``file`` upload (decoded as UTF-8). When neither is supplied the plain
    page is rendered instead of failing on an unbound ``text`` variable.
    """
    if request.method != 'POST':
        return render_template('index.html')

    # Use truthiness rather than key presence: a form that contains both
    # inputs always submits the 'url' key (possibly empty), which would make
    # the file branch unreachable.
    url = request.form.get('url')
    uploaded = request.files.get('file')

    if url:
        # A timeout keeps an unresponsive host from blocking the worker forever.
        text = requests.get(url, timeout=30).text
        # text_processing expects a language *code*; the original stored the
        # translated text here, so the `!= 'en'` check was always true.
        language = Translator().detect(text).lang
    elif uploaded:
        text = uploaded.read().decode('utf-8')
        language = 'en'
    else:
        return render_template('index.html')

    encoded_text = text_processing(text, language)
    return render_template('index.html', encoded_text=encoded_text)
56
 
57
if __name__ == "__main__":
    # Run the Flask server on every network interface, port 7860, when this
    # file is executed as a script.
    app.run(port=7860, host="0.0.0.0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from flask import Flask, request, render_template
2
  import requests
3
+ import PyPDF2
4
+ import io
5
+ import os
6
+ import googletrans
7
  import re
8
+ import pandas as pd
 
 
 
9
 
10
  # Flask application object; the @app.route decorators in this file register views on it.
  app = Flask(__name__)
11
 
12
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page
15
 
16
def _pdf_text(stream):
    """Return the text of every page of the PDF in ``stream``, each followed by a space."""
    # PyPDF2 3.0 removed PdfFileReader/getNumPages/getPage/extractText, so
    # prefer the current names and fall back only on installs that predate them.
    if hasattr(PyPDF2, 'PdfReader'):
        reader = PyPDF2.PdfReader(stream)
        return ''.join((page.extract_text() or '') + ' ' for page in reader.pages)
    reader = PyPDF2.PdfFileReader(stream)
    return ''.join(
        reader.getPage(number).extractText() + ' '
        for number in range(reader.getNumPages())
    )


def _to_english(text):
    """Return ``text`` translated to English, or unchanged if it is already English."""
    translator = googletrans.Translator()
    # detect() is a Translator method (there is no module-level
    # googletrans.detect) and .lang is a language *code* such as 'en'.
    lang_code = translator.detect(text).lang
    if lang_code == 'en':
        return text

    translated = translator.translate(text, dest='en').text
    # LANGUAGES maps codes to names; it is used for the message only, never
    # for the comparison above.
    src_lang = googletrans.LANGUAGES.get(str(lang_code).lower(), lang_code)
    print('The text has been translated from {} to English'.format(src_lang))
    return translated


@app.route('/upload', methods=['POST'])
def upload():
    """Build ``dataset.csv`` (one sentence per row) from an uploaded file or a URL.

    The request supplies either a ``file`` upload or a ``url`` form field
    pointing at a ``.pdf`` or ``.txt`` document. The text is extracted,
    translated to English on a best-effort basis, split into sentences on
    ``.``, ``!`` and ``?``, and written to ``dataset.csv`` in the working
    directory under a single ``sentence`` column.

    Returns:
        A plain-text status message. Client errors (unsupported extension,
        no input) use status 400; a failed download uses status 502.
    """
    file = request.files.get('file')
    url = request.form.get('url')

    if file:
        # The upload is read straight from the request. The original first
        # saved it under the client-supplied name, which allowed writing to
        # arbitrary paths and left the stream at EOF, so every later read
        # returned nothing.
        file_extension = os.path.splitext(file.filename or '')[1].lower()
        if file_extension == '.pdf':
            text = _pdf_text(io.BytesIO(file.read()))
        elif file_extension == '.txt':
            # Undecodable bytes are replaced rather than raising a 500.
            text = file.read().decode('utf-8', errors='replace')
        else:
            return 'Invalid file format', 400
    elif url:
        from urllib.parse import urlparse

        # Take the extension from the URL *path* so that a query string
        # ("doc.pdf?dl=1") or upper-case suffix does not defeat the check.
        file_extension = os.path.splitext(urlparse(url).path)[1].lower()
        if file_extension not in ('.pdf', '.txt'):
            return 'Invalid file format', 400

        # NOTE(security): the server fetches a user-supplied URL (SSRF risk);
        # restrict the allowed hosts if this is exposed to untrusted users.
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print('Error:', e)
            return 'Could not download the URL', 502

        if file_extension == '.pdf':
            text = _pdf_text(io.BytesIO(response.content))
        else:
            text = response.text
    else:
        return 'No file or URL found', 400

    # Translation depends on an external web service, so it stays best-effort:
    # on any failure the untranslated text is used and the error is reported.
    if text.strip():
        try:
            text = _to_english(text)
        except Exception as e:
            print('Error:', e)

    # Convert the text to a dataset: one non-empty, stripped sentence per row.
    fragments = (piece.strip() for piece in re.split(r'[.!?]+', text))
    lines = [sentence for sentence in fragments if sentence]
    df = pd.DataFrame({'sentence': lines})

    # Save the dataset to a CSV file
    df.to_csv('dataset.csv', index=False)

    return 'Dataset created successfully!'
82
+
83
if __name__ == '__main__':
    # The Werkzeug debugger lets anyone who can reach the server run
    # arbitrary Python, so it must not be enabled unconditionally on a server
    # bound to 0.0.0.0. Opt in explicitly with FLASK_DEBUG=1 during development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)