Spaces:
Runtime error
Runtime error
Jekyll2000
committed on
Commit
•
d12008b
1
Parent(s):
ff74d32
Update app.py
Browse files
app.py
CHANGED
@@ -1,42 +1,20 @@
|
|
1 |
-
import
|
2 |
import zipfile, shutil, time
|
3 |
import os
|
4 |
import hashlib
|
5 |
-
#from streamlit_pdf_viewer import pdf_viewer
|
6 |
-
from streamlit import runtime
|
7 |
-
from streamlit.runtime.scriptrunner import get_script_run_ctx
|
8 |
-
from streamlit_js_eval import streamlit_js_eval
|
9 |
import secrets
|
10 |
-
|
11 |
import threading
|
12 |
-
from streamlit.runtime.scriptrunner import add_script_run_ctx
|
13 |
-
#import streamlit.components.v1 as components
|
14 |
-
from streamlit.runtime import get_instance
|
15 |
-
|
16 |
-
#from pypdf import PdfReader
|
17 |
import fitz
|
18 |
import glob
|
19 |
import logging
|
20 |
-
|
|
|
|
|
21 |
|
22 |
def get_remote_ip() -> str:
|
23 |
"""Get remote ip."""
|
|
|
24 |
|
25 |
-
try:
|
26 |
-
ctx = get_script_run_ctx()
|
27 |
-
if ctx is None:
|
28 |
-
return None
|
29 |
-
|
30 |
-
session_info = runtime.get_instance().get_client(ctx.session_id)
|
31 |
-
if session_info is None:
|
32 |
-
return None
|
33 |
-
except Exception as e:
|
34 |
-
return None
|
35 |
-
|
36 |
-
return session_info.request.remote_ip
|
37 |
-
|
38 |
-
|
39 |
-
# colab side make dir
|
40 |
def my_makedirs(path):
|
41 |
if not os.path.isdir(path):
|
42 |
os.makedirs(path)
|
@@ -46,170 +24,52 @@ def heart_beat():
|
|
46 |
Heartbeat function to track whether the session is alive
|
47 |
"""
|
48 |
thread = threading.Timer(interval=5, function=heart_beat)
|
|
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
add_script_run_ctx(thread)
|
54 |
-
|
55 |
-
# context is required to get session_id of the calling
|
56 |
-
# thread (which would be the script thread)
|
57 |
-
ctx = get_script_run_ctx()
|
58 |
-
|
59 |
-
# this is the main runtime, contains all the sessions
|
60 |
-
runtime = get_instance()
|
61 |
-
|
62 |
-
if runtime.is_active_session(session_id=ctx.session_id):
|
63 |
-
logging.info(f"{ctx.session_id} is alive.")
|
64 |
-
thread.start()
|
65 |
-
else:
|
66 |
-
if 'uniq' in st.session_state:
|
67 |
-
if os.path.isdir(f"removefolder/{st.session_state.uniq}"):
|
68 |
-
shutil.rmtree(f"removefolder/{st.session_state.uniq}")
|
69 |
-
logging.info(f"{ctx.session_id} is gone.")
|
70 |
-
return
|
71 |
-
|
72 |
-
# JavaScript to detect browser exit
|
73 |
-
EXIT_JS = """
|
74 |
-
<script>
|
75 |
-
window.addEventListener('beforeunload', function (event) {
|
76 |
-
fetch('/close_session', {method: 'POST'}).then(response => {
|
77 |
-
return response.text();
|
78 |
-
}).then(data => {
|
79 |
-
console.log(data);
|
80 |
-
});
|
81 |
-
});
|
82 |
-
</script>
|
83 |
-
"""
|
84 |
-
|
85 |
-
# Embed the JavaScript in the Streamlit app
|
86 |
-
#components.html(EXIT_JS)
|
87 |
-
streamlit_js_eval(js_expressions = EXIT_JS)
|
88 |
-
|
89 |
-
def main():
|
90 |
-
|
91 |
-
if 'uniq' not in st.session_state:
|
92 |
-
st.session_state.uniq = secrets.token_urlsafe()
|
93 |
-
|
94 |
-
temp_dir = st.session_state.uniq
|
95 |
-
my_makedirs(f"removefolder/{temp_dir}")
|
96 |
-
|
97 |
-
flag = True
|
98 |
-
if 'count' not in st.session_state:
|
99 |
-
st.session_state.count = 0
|
100 |
-
#tempolary
|
101 |
-
if 'temp' not in st.session_state:
|
102 |
-
st.session_state.temp = 0
|
103 |
-
|
104 |
-
if 'lang' not in st.session_state:
|
105 |
-
st.session_state.lang = ""
|
106 |
-
if 'result' not in st.session_state:
|
107 |
-
st.session_state.result = ""
|
108 |
-
|
109 |
-
apptitle = st.empty()
|
110 |
-
langs = st.empty()
|
111 |
-
description = st.empty()
|
112 |
-
obj_0 = st.empty()
|
113 |
-
obj_1 = st.empty()
|
114 |
-
|
115 |
-
apptitle.header("PDF file Translator π", divider="violet")
|
116 |
-
langs.write('This App can translate to <`Japanese`, `English`, `French`, `Chinese (traditional)`, `Chinese (simplified)`, `Russian`, `Korean`, `Vietnamese`, `Thai`, `Catalan`, `Sinhalese`, `Nepall`>')
|
117 |
-
description.write("""
|
118 |
-
It's easy to use.:black_cat: Just upload, select the language, and download the resulting .zip file.:package:
|
119 |
-
|
120 |
-
After uploading a PDF file and selecting the translation language, you can wait a while.
|
121 |
-
|
122 |
-
The original text, the original text and translation (a few lines of the original text followed by the translation) and the translation text will be compressed into a zip file and available for download.
|
123 |
-
|
124 |
-
When you press the download button, the compressed file will be downloaded, and if you select another translation language, the process will be repeated.:leftwards_arrow_with_hook:
|
125 |
-
|
126 |
-
:koala: The uploaded PDF file data will disappear <u>when you close the browser tab</u>. :eyes:
|
127 |
-
|
128 |
-
**Only PDF files can be uploaded.**
|
129 |
-
|
130 |
-
This translation app is useful for people who want to translate something or want to read something but cannot read it unless it is translated, and who want to quickly check the original text and the translation by comparing them in pairs. :yin_yang:
|
131 |
-
|
132 |
-
:full_moon_with_face: Even if the PDF file has many pages, there is no limit to the number of pages or characters.
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
#### FAQ :coffee:
|
137 |
-
|
138 |
-
:baby: **Q** : Does the translated text have information about line breaks and paragraphs? :coffee:
|
139 |
-
|
140 |
-
:robot_face: **A** : Line breaks and paragraphs are not reflected in the translated text.
|
141 |
-
|
142 |
-
The text extracted from the original text has the same position indexed with `:::info` as the translated text.
|
143 |
-
|
144 |
-
Regardless of the contents of the uploaded PDF or document, counting starts from zero and you can see which page or sentence you are in.
|
145 |
-
The original text has an `π°` : elephant mark after `:::info`, which are unicode characters representing Egyptian hieroglyphics.
|
146 |
-
|
147 |
-
π°00001-0;
|
148 |
-
|
149 |
-
Similarly, the translation of the original text is followed by a `π` : frog mark.
|
150 |
-
|
151 |
-
π00001-0;
|
152 |
-
|
153 |
-
:teapot: **Tips** : If you have a text editor with a pattern replacement function,
|
154 |
-
you can use the characters starting with
|
155 |
-
`:::info`
|
156 |
-
and ending with the line that contains only
|
157 |
-
`:::`
|
158 |
-
as a pattern to remove.
|
159 |
-
Try searching Google for keywords such as `grep and replace`.
|
160 |
|
161 |
-
|
162 |
-
|
163 |
-
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
-
uploaded_file = obj_1.file_uploader("UPLOAD your .pdf file", type="pdf")
|
166 |
-
####
|
167 |
if uploaded_file is not None:
|
168 |
-
flag = False
|
169 |
-
st.success("PDF file translator")
|
170 |
-
# hashed
|
171 |
raw_filename = uploaded_file.name
|
172 |
-
intext_0 = f'<span style="color:LavenderBlush;background:Orchid"> {raw_filename} </span>'
|
173 |
-
st.write(intext_0, unsafe_allow_html=True)
|
174 |
hashed_filename = hashlib.sha1(raw_filename.encode())
|
175 |
uploadedfilename = hashed_filename.hexdigest()
|
176 |
-
if "uploadedfilename" not in
|
177 |
-
|
178 |
-
|
179 |
-
if "book" not in st.session_state:
|
180 |
-
#pdf_viewer(input=uploaded_file.getvalue(), width=700, height=500)
|
181 |
|
|
|
182 |
my_makedirs(
|
183 |
-
f"removefolder/{temp_dir}/upload_folder_{
|
184 |
)
|
185 |
|
186 |
with open(
|
187 |
-
f'removefolder/{temp_dir}/upload_folder_{
|
188 |
'wb') as file:
|
189 |
file.write(uploaded_file.getvalue())
|
190 |
-
# pdf_viewer(input=f'{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf', width=700, height=500)
|
191 |
|
192 |
-
# read from PDF file
|
193 |
PDF = glob.glob(
|
194 |
-
f"removefolder/{temp_dir}/upload_folder_{
|
195 |
)
|
196 |
|
197 |
-
#doc = PdfReader(PDF[0])
|
198 |
doc = fitz.open(PDF[0])
|
199 |
-
# meta = doc.metadata
|
200 |
-
#page_count = len(doc.pages)
|
201 |
page_count = len(doc)
|
202 |
-
book = []
|
203 |
-
|
204 |
-
my_bar1 = progressbar1.progress(0)
|
205 |
-
|
206 |
-
from bs4 import BeautifulSoup
|
207 |
-
|
208 |
for index, page in enumerate(doc):
|
209 |
-
#page_text = page.extract_text()
|
210 |
-
#page_text = page.get_text(sort=True)
|
211 |
blocks = page.get_text("xml")
|
212 |
-
soup = BeautifulSoup(blocks,'lxml-xml')
|
213 |
page_text2 = ""
|
214 |
|
215 |
for tag0 in soup.find_all("block"):
|
@@ -222,177 +82,34 @@ Try searching Google for keywords such as `grep and replace`.
|
|
222 |
page_text2 += "\n"
|
223 |
temp_y_posi = y_posi
|
224 |
page_text2 += tag3.get("c")
|
225 |
-
|
226 |
page_text2 += "\n\n"
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
#for index, page in enumerate(doc.pages):
|
232 |
-
#for index, page in enumerate(doc):
|
233 |
-
# #page_text = page.extract_text()
|
234 |
-
# page_text = page.get_text(sort=True)
|
235 |
-
# book.append((index, page_text))
|
236 |
book.append((index, page_text2))
|
237 |
|
238 |
-
done = int(((index + 1) / page_count) * 100)
|
239 |
-
my_bar1.progress(done,
|
240 |
-
text=f"Reading Page Number : {index + 1}")
|
241 |
doc.close()
|
242 |
-
|
243 |
-
st.session_state.book = book
|
244 |
-
my_bar1.empty()
|
245 |
if os.path.isfile(
|
246 |
-
f"removefolder/{temp_dir}/upload_folder_{
|
247 |
):
|
248 |
shutil.rmtree(
|
249 |
-
f"removefolder/{temp_dir}/upload_folder_{
|
250 |
)
|
251 |
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
for key in st.session_state.keys():
|
256 |
-
if key == "count" or key == "temp" or key == "lang":
|
257 |
-
continue
|
258 |
-
else:
|
259 |
-
del st.session_state[key]
|
260 |
-
shutil.rmtree(f"removefolder/{temp_dir}")
|
261 |
-
# page reload
|
262 |
-
streamlit_js_eval(js_expressions="parent.window.location.reload()")
|
263 |
-
st.markdown("----")
|
264 |
-
|
265 |
-
plain_text1 = " π select target language π "
|
266 |
-
var_text1 = f'##### <span style="color:green">{plain_text1}</span>'
|
267 |
-
|
268 |
-
select = st.empty()
|
269 |
-
select.write(var_text1, unsafe_allow_html=True)
|
270 |
-
|
271 |
-
# select language
|
272 |
-
st.markdown("""
|
273 |
-
`ja`: **Japanese**,
|
274 |
-
`en`: **English**,
|
275 |
-
`fr`: **French**,
|
276 |
-
`zb-TW`: **Chinese (traditional)**,
|
277 |
-
`zh-CN`: **Chinese (simplified)**,
|
278 |
-
`ru`: **Russian**,
|
279 |
-
`ko`: **Korean**,
|
280 |
-
`vi`: **Vietnamese**,
|
281 |
-
`th`: **Thai**,
|
282 |
-
`tl`: **Tagalog**,
|
283 |
-
`ca`: **Catalan**,
|
284 |
-
`si`: **Sinhalese**,
|
285 |
-
`ne`: **Nepall**
|
286 |
-
""")
|
287 |
-
lang_code = [
|
288 |
-
"select language",
|
289 |
-
"Japanese",
|
290 |
-
"English",
|
291 |
-
"French",
|
292 |
-
"Chinese traditional",
|
293 |
-
"Chinese simplified",
|
294 |
-
"Russian",
|
295 |
-
"Korean",
|
296 |
-
"Vietnamese",
|
297 |
-
"Thai",
|
298 |
-
"Tagalog",
|
299 |
-
"Catalan",
|
300 |
-
"Sinhalese",
|
301 |
-
"Nepall"
|
302 |
-
]
|
303 |
-
sel = st.empty()
|
304 |
-
language = sel.radio(
|
305 |
-
label='translate to',
|
306 |
-
options=lang_code,
|
307 |
-
index=0,
|
308 |
-
key = f"select_lang{st.session_state.count}",
|
309 |
-
horizontal=True)
|
310 |
-
#language = sel.selectbox(
|
311 |
-
# 'translate to',
|
312 |
-
# lang_code,
|
313 |
-
# index=0,
|
314 |
-
# #placeholder = "select language",
|
315 |
-
# key=f"select_lang{st.session_state.count}")
|
316 |
-
|
317 |
-
statename = f"select_lang{st.session_state.count}"
|
318 |
-
if "target_lang" not in st.session_state:
|
319 |
-
st.session_state.target_lang = "UNSELECTED"
|
320 |
-
|
321 |
-
def reset_selected_lang():
|
322 |
-
st.session_state[statename] = "select language"
|
323 |
-
|
324 |
-
st.button('Reset Language', on_click=reset_selected_lang)
|
325 |
-
|
326 |
-
area = st.empty()
|
327 |
-
if flag:
|
328 |
-
if "select_lang" in st.session_state:
|
329 |
-
if st.session_state.select_lang != "select language":
|
330 |
-
area2 = st.empty()
|
331 |
-
plain_text2 = "βReset Languageβ"
|
332 |
-
empty_text = "β β"
|
333 |
-
var_text2 = f'<span style="color:#FF69B4">{plain_text2}</span>'
|
334 |
-
while flag:
|
335 |
-
area2.write(var_text2, unsafe_allow_html=True)
|
336 |
-
time.sleep(0.9)
|
337 |
-
area2.write(empty_text)
|
338 |
-
time.sleep(0.5)
|
339 |
-
|
340 |
-
while flag:
|
341 |
-
area.text("π€ upload PDF file π€")
|
342 |
-
time.sleep(1)
|
343 |
-
area.text("π₯ π₯")
|
344 |
-
time.sleep(0.8)
|
345 |
-
else:
|
346 |
-
if f"select_lang{st.session_state.count}" in st.session_state:
|
347 |
-
statename = f"select_lang{st.session_state.count}"
|
348 |
-
if st.session_state[statename] != "select language":
|
349 |
-
plain_text2 = "Reset Language"
|
350 |
-
var_text2 = f'<span style="color:gray">β² `{plain_text2}`</span>'
|
351 |
-
area.write(var_text2, unsafe_allow_html=True)
|
352 |
-
|
353 |
-
obj_0.empty()
|
354 |
-
obj_1.empty() # uploader hide
|
355 |
-
|
356 |
-
# pdf translator
|
357 |
-
#------------------------------------------
|
358 |
-
st.markdown("----")
|
359 |
-
st.success("translator")
|
360 |
-
|
361 |
-
if "book" in st.session_state:
|
362 |
-
book_data = st.session_state.book
|
363 |
-
page_count = len(book_data)
|
364 |
-
else:
|
365 |
-
page_count = 0
|
366 |
-
|
367 |
-
st.text(f"PDF total pages : {page_count}")
|
368 |
-
|
369 |
-
progressbar = st.empty()
|
370 |
-
my_bar = progressbar.progress(0)
|
371 |
-
|
372 |
-
#3
|
373 |
-
# from google.colab import output
|
374 |
-
import re
|
375 |
-
#from googletrans import Translator
|
376 |
-
from deep_translator import GoogleTranslator
|
377 |
-
|
378 |
-
title_name = re.sub("\.| |%|@|\"|\'", "_", f"{uploaded_file.name}")
|
379 |
-
|
380 |
-
if st.session_state.temp != int(st.session_state.count):
|
381 |
-
st.session_state.lang = "init"
|
382 |
-
st.session_state.temp = int(st.session_state.count)
|
383 |
|
384 |
if language not in lang_code[1:]:
|
385 |
language = None
|
386 |
|
387 |
-
if
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
|
392 |
-
description.empty()
|
393 |
-
|
394 |
my_makedirs(
|
395 |
-
f"removefolder/{temp_dir}/work_{
|
396 |
|
397 |
to = ""
|
398 |
match language:
|
@@ -425,34 +142,22 @@ Try searching Google for keywords such as `grep and replace`.
|
|
425 |
case _:
|
426 |
to = "unknown"
|
427 |
|
428 |
-
|
429 |
-
|
430 |
-
st.session_state.target_lang = to
|
431 |
|
432 |
-
|
433 |
-
|
434 |
-
work_area2 = st.empty()
|
435 |
-
#--------------------------------------
|
436 |
|
437 |
for index, page in enumerate(book_data):
|
438 |
page_text = page[1]
|
439 |
-
# print("\nPage Number:" + str(index))
|
440 |
-
done = int(((index + 1) / page_count) * 100)
|
441 |
-
my_bar.progress(done,
|
442 |
-
text=f"Working Page Number : {index + 1}")
|
443 |
-
# print(len(page_text))
|
444 |
-
# text_list = [s for s in page_text.split('\n') if s]
|
445 |
page_text = re.sub('\.', '.π', page_text)
|
446 |
text_list = [s for s in page_text.split('π')]
|
447 |
if len(text_list) < 1:
|
448 |
continue
|
449 |
|
450 |
-
limit = 0
|
451 |
temp_list = []
|
452 |
line_number = []
|
453 |
|
454 |
for n, line in enumerate(text_list):
|
455 |
-
|
456 |
line2 = re.sub(r"\s+", " ", line)
|
457 |
if line2 == "":
|
458 |
continue
|
@@ -470,142 +175,142 @@ Try searching Google for keywords such as `grep and replace`.
|
|
470 |
|
471 |
text_2 = text_
|
472 |
text_ = re.sub('π', "", text_)
|
473 |
-
#while (re.search('π', text_2)):
|
474 |
-
# num = line_number.pop(0)
|
475 |
-
# rep_words = f"πNO:{num}| "
|
476 |
-
# text_2 = text_2.replace('π', rep_words, 1)
|
477 |
line_number.clear()
|
478 |
|
479 |
-
# print(re.sub("π","\n", text_2))
|
480 |
-
#ts = Translator()
|
481 |
all_text_orig = f":::info\nπ°{index + 1:05d}" + f"-{n}" + f";\n:::\n{text_}\n"
|
482 |
|
483 |
for times in range(0, 5):
|
484 |
-
|
485 |
try:
|
486 |
tsd = GoogleTranslator(
|
487 |
source="auto",
|
488 |
target=to).translate(text=text_)
|
489 |
if tsd == None:
|
490 |
tsd = text_
|
491 |
-
#tsd = ts.translate(text_, src="en", dest="ja")
|
492 |
-
#translated_text = ts.translate(line, src="en", dest="ja").text
|
493 |
all_text_done = f":::info\nπ{index + 1:05d}" + f"-{n}" + f";\n:::\n{tsd}\n"
|
494 |
-
|
495 |
-
|
496 |
-
# all_text_orig += str(n) + "; " + tsd.pronunciation + "\n"
|
497 |
-
# print(index,n, line)
|
498 |
-
# print(index,n, tsd.text)
|
499 |
-
|
500 |
-
# print(all_text_orig)
|
501 |
-
# print(all_text_done + "\n")
|
502 |
-
if type(all_text_orig) is str and type(
|
503 |
-
all_text_done) is str:
|
504 |
-
|
505 |
-
# intext_1 = f'<span style="color:DimGray;background:GhostWhite">{all_text_orig}</span>'
|
506 |
-
# work_area1.markdown(intext_1, unsafe_allow_html=True)
|
507 |
-
work_area1.write(f"{all_text_orig}")
|
508 |
-
# intext_2 = f'<span style="color:LavenderBlush;background:Gray">{all_text_done}</span>'
|
509 |
-
work_area2.write(f"{all_text_done}")
|
510 |
-
# work_area2.markdown(intext_2, unsafe_allow_html=True)
|
511 |
-
|
512 |
-
with open(
|
513 |
-
f"removefolder/{temp_dir}/work_{st.session_state.count}/reuseMarkdown.txt",
|
514 |
-
"a") as tempf:
|
515 |
-
tempf.write(all_text_orig + "\n\n" +
|
516 |
-
all_text_done + "\n\n")
|
517 |
-
|
518 |
-
# st.session_state.result += all_text_orig + "\n\n"
|
519 |
-
# st.session_state.result += all_text_done + "\n\n"
|
520 |
-
|
521 |
-
# print(n, tsd.pronunciation)
|
522 |
with open(
|
523 |
-
f"removefolder/{temp_dir}/work_{
|
|
|
|
|
|
|
|
|
|
|
|
|
524 |
"a") as f:
|
525 |
f.write(all_text_orig + all_text_done +
|
526 |
"\n")
|
527 |
with open(
|
528 |
-
f"removefolder/{temp_dir}/work_{
|
529 |
"a") as f:
|
530 |
f.write(all_text_done + "\n")
|
531 |
|
532 |
break
|
533 |
|
534 |
except Exception as e:
|
535 |
-
print(e)
|
536 |
time.sleep(3)
|
537 |
continue
|
538 |
|
539 |
with open(
|
540 |
-
f"removefolder/{temp_dir}/work_{
|
541 |
"a") as f:
|
542 |
f.write(all_text_orig + "\n")
|
543 |
|
544 |
-
|
545 |
-
st.markdown("----")
|
546 |
-
|
547 |
my_makedirs(f"removefolder/{temp_dir}/download_section")
|
548 |
shutil.move(
|
549 |
-
f"removefolder/{temp_dir}/work_{
|
550 |
-
f"removefolder/{temp_dir}/download_section/reuseMarkdown_{
|
551 |
)
|
552 |
|
553 |
shutil.make_archive(
|
554 |
-
f'removefolder/{temp_dir}/download_section/{
|
555 |
-
format='zip'
|
556 |
-
root_dir=f'removefolder/{temp_dir}/work_{
|
557 |
-
|
558 |
shutil.rmtree(
|
559 |
-
f"removefolder/{temp_dir}/work_{
|
560 |
-
|
561 |
-
st.balloons()
|
562 |
-
|
563 |
-
work_area1.empty()
|
564 |
-
work_area2.empty()
|
565 |
-
|
566 |
-
#--------------------------------------
|
567 |
-
|
568 |
-
st.success("Download translated text files")
|
569 |
-
st.write(intext_0, unsafe_allow_html=True)
|
570 |
-
# plain_text3 = f"[ {st.session_state.target_lang} ] : translated text files"
|
571 |
-
plain_text3 = f"[ {language} ] : translated text files"
|
572 |
-
var_text3 = f'##### <span style="color:#FF69B4">{plain_text3}</span>'
|
573 |
-
|
574 |
-
translated = st.empty()
|
575 |
-
translated.write(var_text3, unsafe_allow_html=True)
|
576 |
|
577 |
if os.path.isfile(
|
578 |
-
f'removefolder/{temp_dir}/download_section/{
|
579 |
):
|
580 |
with open(
|
581 |
-
f"removefolder/{temp_dir}/download_section/{
|
582 |
"rb") as fpath:
|
583 |
-
|
584 |
-
label=f"DOWNLOAD .zip file",
|
585 |
-
data=fpath,
|
586 |
-
file_name=
|
587 |
-
f"{st.session_state.uploadedfilename}_{st.session_state.count}.zip",
|
588 |
-
mime="application/zip")
|
589 |
|
590 |
-
|
591 |
-
var_text4 = f'<span style="color:gray">β² `{plain_text4}` π </span>'
|
592 |
-
st.write(var_text4, unsafe_allow_html=True)
|
593 |
|
594 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
595 |
|
596 |
-
|
597 |
-
|
598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
599 |
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
608 |
|
609 |
if __name__ == "__main__":
|
610 |
heart_beat()
|
611 |
-
|
|
|
1 |
+
import gradio as gr
|
2 |
import zipfile, shutil, time
|
3 |
import os
|
4 |
import hashlib
|
|
|
|
|
|
|
|
|
5 |
import secrets
|
|
|
6 |
import threading
|
|
|
|
|
|
|
|
|
|
|
7 |
import fitz
|
8 |
import glob
|
9 |
import logging
|
10 |
+
from deep_translator import GoogleTranslator
|
11 |
+
import re
|
12 |
+
from bs4 import BeautifulSoup
|
13 |
|
14 |
def get_remote_ip() -> str:
|
15 |
"""Get remote ip."""
|
16 |
+
return "127.0.0.1" # Placeholder for remote IP in Gradio
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def my_makedirs(path):
|
19 |
if not os.path.isdir(path):
|
20 |
os.makedirs(path)
|
|
|
24 |
Heartbeat function to track whether the session is alive
|
25 |
"""
|
26 |
thread = threading.Timer(interval=5, function=heart_beat)
|
27 |
+
thread.start()
|
28 |
|
29 |
+
def main(uploaded_file, language):
|
30 |
+
if 'uniq' not in gr.session_state:
|
31 |
+
gr.session_state.uniq = secrets.token_urlsafe()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
temp_dir = gr.session_state.uniq
|
34 |
+
my_makedirs(f"removefolder/{temp_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
+
if 'count' not in gr.session_state:
|
37 |
+
gr.session_state.count = 0
|
38 |
+
if 'temp' not in gr.session_state:
|
39 |
+
gr.session_state.temp = 0
|
40 |
+
if 'lang' not in gr.session_state:
|
41 |
+
gr.session_state.lang = ""
|
42 |
+
if 'result' not in gr.session_state:
|
43 |
+
gr.session_state.result = ""
|
44 |
|
|
|
|
|
45 |
if uploaded_file is not None:
|
|
|
|
|
|
|
46 |
raw_filename = uploaded_file.name
|
|
|
|
|
47 |
hashed_filename = hashlib.sha1(raw_filename.encode())
|
48 |
uploadedfilename = hashed_filename.hexdigest()
|
49 |
+
if "uploadedfilename" not in gr.session_state:
|
50 |
+
gr.session_state.uploadedfilename = uploadedfilename
|
|
|
|
|
|
|
51 |
|
52 |
+
if "book" not in gr.session_state:
|
53 |
my_makedirs(
|
54 |
+
f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}"
|
55 |
)
|
56 |
|
57 |
with open(
|
58 |
+
f'removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/{uploadedfilename}.pdf',
|
59 |
'wb') as file:
|
60 |
file.write(uploaded_file.getvalue())
|
|
|
61 |
|
|
|
62 |
PDF = glob.glob(
|
63 |
+
f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/{uploadedfilename}.pdf"
|
64 |
)
|
65 |
|
|
|
66 |
doc = fitz.open(PDF[0])
|
|
|
|
|
67 |
page_count = len(doc)
|
68 |
+
book = []
|
69 |
+
|
|
|
|
|
|
|
|
|
70 |
for index, page in enumerate(doc):
|
|
|
|
|
71 |
blocks = page.get_text("xml")
|
72 |
+
soup = BeautifulSoup(blocks, 'lxml-xml')
|
73 |
page_text2 = ""
|
74 |
|
75 |
for tag0 in soup.find_all("block"):
|
|
|
82 |
page_text2 += "\n"
|
83 |
temp_y_posi = y_posi
|
84 |
page_text2 += tag3.get("c")
|
85 |
+
|
86 |
page_text2 += "\n\n"
|
87 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
book.append((index, page_text2))
|
89 |
|
|
|
|
|
|
|
90 |
doc.close()
|
91 |
+
gr.session_state.book = book
|
|
|
|
|
92 |
if os.path.isfile(
|
93 |
+
f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/{uploadedfilename}.pdf"
|
94 |
):
|
95 |
shutil.rmtree(
|
96 |
+
f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/"
|
97 |
)
|
98 |
|
99 |
+
if gr.session_state.temp != int(gr.session_state.count):
|
100 |
+
gr.session_state.lang = "init"
|
101 |
+
gr.session_state.temp = int(gr.session_state.count)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
if language not in lang_code[1:]:
|
104 |
language = None
|
105 |
|
106 |
+
if gr.session_state.lang != language and language is not None:
|
107 |
+
gr.session_state.count += 1
|
108 |
+
gr.session_state.result = ""
|
109 |
+
gr.session_state.lang = language
|
110 |
|
|
|
|
|
111 |
my_makedirs(
|
112 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}")
|
113 |
|
114 |
to = ""
|
115 |
match language:
|
|
|
142 |
case _:
|
143 |
to = "unknown"
|
144 |
|
145 |
+
gr.session_state.target_lang = to
|
|
|
|
|
146 |
|
147 |
+
book_data = gr.session_state.book
|
148 |
+
page_count = len(book_data)
|
|
|
|
|
149 |
|
150 |
for index, page in enumerate(book_data):
|
151 |
page_text = page[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
page_text = re.sub('\.', '.π', page_text)
|
153 |
text_list = [s for s in page_text.split('π')]
|
154 |
if len(text_list) < 1:
|
155 |
continue
|
156 |
|
|
|
157 |
temp_list = []
|
158 |
line_number = []
|
159 |
|
160 |
for n, line in enumerate(text_list):
|
|
|
161 |
line2 = re.sub(r"\s+", " ", line)
|
162 |
if line2 == "":
|
163 |
continue
|
|
|
175 |
|
176 |
text_2 = text_
|
177 |
text_ = re.sub('π', "", text_)
|
|
|
|
|
|
|
|
|
178 |
line_number.clear()
|
179 |
|
|
|
|
|
180 |
all_text_orig = f":::info\nπ°{index + 1:05d}" + f"-{n}" + f";\n:::\n{text_}\n"
|
181 |
|
182 |
for times in range(0, 5):
|
|
|
183 |
try:
|
184 |
tsd = GoogleTranslator(
|
185 |
source="auto",
|
186 |
target=to).translate(text=text_)
|
187 |
if tsd == None:
|
188 |
tsd = text_
|
|
|
|
|
189 |
all_text_done = f":::info\nπ{index + 1:05d}" + f"-{n}" + f";\n:::\n{tsd}\n"
|
190 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
with open(
|
192 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}/reuseMarkdown.txt",
|
193 |
+
"a") as tempf:
|
194 |
+
tempf.write(all_text_orig + "\n\n" +
|
195 |
+
all_text_done + "\n\n")
|
196 |
+
|
197 |
+
with open(
|
198 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}/{uploadedfilename}_done.txt",
|
199 |
"a") as f:
|
200 |
f.write(all_text_orig + all_text_done +
|
201 |
"\n")
|
202 |
with open(
|
203 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}/{uploadedfilename}_done_{language}.txt",
|
204 |
"a") as f:
|
205 |
f.write(all_text_done + "\n")
|
206 |
|
207 |
break
|
208 |
|
209 |
except Exception as e:
|
|
|
210 |
time.sleep(3)
|
211 |
continue
|
212 |
|
213 |
with open(
|
214 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}/{uploadedfilename}_orig.txt",
|
215 |
"a") as f:
|
216 |
f.write(all_text_orig + "\n")
|
217 |
|
|
|
|
|
|
|
218 |
my_makedirs(f"removefolder/{temp_dir}/download_section")
|
219 |
shutil.move(
|
220 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}/reuseMarkdown.txt",
|
221 |
+
f"removefolder/{temp_dir}/download_section/reuseMarkdown_{gr.session_state.count}.txt"
|
222 |
)
|
223 |
|
224 |
shutil.make_archive(
|
225 |
+
f'removefolder/{temp_dir}/download_section/{gr.session_state.uploadedfilename}_{gr.session_state.count}',
|
226 |
+
format='zip',
|
227 |
+
root_dir=f'removefolder/{temp_dir}/work_{gr.session_state.count}'
|
228 |
+
)
|
229 |
shutil.rmtree(
|
230 |
+
f"removefolder/{temp_dir}/work_{gr.session_state.count}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
if os.path.isfile(
|
233 |
+
f'removefolder/{temp_dir}/download_section/{gr.session_state.uploadedfilename}_{gr.session_state.count}.zip'
|
234 |
):
|
235 |
with open(
|
236 |
+
f"removefolder/{temp_dir}/download_section/{gr.session_state.uploadedfilename}_{gr.session_state.count}.zip",
|
237 |
"rb") as fpath:
|
238 |
+
return fpath.read()
|
|
|
|
|
|
|
|
|
|
|
239 |
|
240 |
+
return None
|
|
|
|
|
241 |
|
242 |
+
lang_code = [
|
243 |
+
"select language",
|
244 |
+
"Japanese",
|
245 |
+
"English",
|
246 |
+
"French",
|
247 |
+
"Chinese traditional",
|
248 |
+
"Chinese simplified",
|
249 |
+
"Russian",
|
250 |
+
"Korean",
|
251 |
+
"Vietnamese",
|
252 |
+
"Thai",
|
253 |
+
"Tagalog",
|
254 |
+
"Catalan",
|
255 |
+
"Sinhalese",
|
256 |
+
"Nepall"
|
257 |
+
]
|
258 |
|
259 |
+
iface = gr.Interface(
|
260 |
+
fn=main,
|
261 |
+
inputs=[
|
262 |
+
gr.inputs.File(label="UPLOAD your .pdf file"),
|
263 |
+
gr.inputs.Radio(lang_code, label="translate to", default="select language")
|
264 |
+
],
|
265 |
+
outputs=gr.outputs.File(label="DOWNLOAD .zip file"),
|
266 |
+
title="PDF file Translator π",
|
267 |
+
description="""
|
268 |
+
It's easy to use.:black_cat: Just upload, select the language, and download the resulting .zip file.:package:
|
269 |
|
270 |
+
After uploading a PDF file and selecting the translation language, you can wait a while.
|
271 |
+
|
272 |
+
The original text, the original text and translation (a few lines of the original text followed by the translation) and the translation text will be compressed into a zip file and available for download.
|
273 |
+
|
274 |
+
When you press the download button, the compressed file will be downloaded, and if you select another translation language, the process will be repeated.:leftwards_arrow_with_hook:
|
275 |
+
|
276 |
+
:koala: The uploaded PDF file data will disappear <u>when you close the browser tab</u>. :eyes:
|
277 |
+
|
278 |
+
**Only PDF files can be uploaded.**
|
279 |
+
|
280 |
+
This translation app is useful for people who want to translate something or want to read something but cannot read it unless it is translated, and who want to quickly check the original text and the translation by comparing them in pairs. :yin_yang:
|
281 |
+
|
282 |
+
:full_moon_with_face: Even if the PDF file has many pages, there is no limit to the number of pages or characters.
|
283 |
+
|
284 |
+
<u>The untranslated data will be retained until the browser is closed, but once the app page is closed, the connection will be cut off and the data will be deleted.</u> :thought_balloon:
|
285 |
+
|
286 |
+
#### FAQ :coffee:
|
287 |
+
|
288 |
+
:baby: **Q** : Does the translated text have information about line breaks and paragraphs? :coffee:
|
289 |
+
|
290 |
+
:robot_face: **A** : Line breaks and paragraphs are not reflected in the translated text.
|
291 |
+
|
292 |
+
The text extracted from the original text has the same position indexed with `:::info` as the translated text.
|
293 |
+
|
294 |
+
Regardless of the contents of the uploaded PDF or document, counting starts from zero and you can see which page or sentence you are in.
|
295 |
+
The original text has an `π°` : elephant mark after `:::info`, which are unicode characters representing Egyptian hieroglyphics.
|
296 |
+
|
297 |
+
π°00001-0;
|
298 |
+
|
299 |
+
Similarly, the translation of the original text is followed by a `π` : frog mark.
|
300 |
+
|
301 |
+
π00001-0;
|
302 |
+
|
303 |
+
:teapot: **Tips** : If you have a text editor with a pattern replacement function,
|
304 |
+
you can use the characters starting with
|
305 |
+
`:::info`
|
306 |
+
and ending with the line that contains only
|
307 |
+
`:::`
|
308 |
+
as a pattern to remove.
|
309 |
+
Try searching Google for keywords such as `grep and replace`.
|
310 |
+
""",
|
311 |
+
live=True
|
312 |
+
)
|
313 |
|
314 |
if __name__ == "__main__":
|
315 |
heart_beat()
|
316 |
+
iface.launch()
|