Jekyll2000 committed on
Commit
d12008b
•
1 Parent(s): ff74d32

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -430
app.py CHANGED
@@ -1,42 +1,20 @@
1
- import streamlit as st
2
  import zipfile, shutil, time
3
  import os
4
  import hashlib
5
- #from streamlit_pdf_viewer import pdf_viewer
6
- from streamlit import runtime
7
- from streamlit.runtime.scriptrunner import get_script_run_ctx
8
- from streamlit_js_eval import streamlit_js_eval
9
  import secrets
10
-
11
  import threading
12
- from streamlit.runtime.scriptrunner import add_script_run_ctx
13
- #import streamlit.components.v1 as components
14
- from streamlit.runtime import get_instance
15
-
16
- #from pypdf import PdfReader
17
  import fitz
18
  import glob
19
  import logging
20
-
 
 
21
 
22
  def get_remote_ip() -> str:
23
  """Get remote ip."""
 
24
 
25
- try:
26
- ctx = get_script_run_ctx()
27
- if ctx is None:
28
- return None
29
-
30
- session_info = runtime.get_instance().get_client(ctx.session_id)
31
- if session_info is None:
32
- return None
33
- except Exception as e:
34
- return None
35
-
36
- return session_info.request.remote_ip
37
-
38
-
39
- # colab side make dir
40
  def my_makedirs(path):
41
  if not os.path.isdir(path):
42
  os.makedirs(path)
@@ -46,170 +24,52 @@ def heart_beat():
46
  Heartbeat function to track whether the session is alive
47
  """
48
  thread = threading.Timer(interval=5, function=heart_beat)
 
49
 
50
- # insert context to the current thread, needed for
51
- # getting session specific attributes like st.session_state
52
-
53
- add_script_run_ctx(thread)
54
-
55
- # context is required to get session_id of the calling
56
- # thread (which would be the script thread)
57
- ctx = get_script_run_ctx()
58
-
59
- # this is the main runtime, contains all the sessions
60
- runtime = get_instance()
61
-
62
- if runtime.is_active_session(session_id=ctx.session_id):
63
- logging.info(f"{ctx.session_id} is alive.")
64
- thread.start()
65
- else:
66
- if 'uniq' in st.session_state:
67
- if os.path.isdir(f"removefolder/{st.session_state.uniq}"):
68
- shutil.rmtree(f"removefolder/{st.session_state.uniq}")
69
- logging.info(f"{ctx.session_id} is gone.")
70
- return
71
-
72
- # JavaScript to detect browser exit
73
- EXIT_JS = """
74
- <script>
75
- window.addEventListener('beforeunload', function (event) {
76
- fetch('/close_session', {method: 'POST'}).then(response => {
77
- return response.text();
78
- }).then(data => {
79
- console.log(data);
80
- });
81
- });
82
- </script>
83
- """
84
-
85
- # Embed the JavaScript in the Streamlit app
86
- #components.html(EXIT_JS)
87
- streamlit_js_eval(js_expressions = EXIT_JS)
88
-
89
- def main():
90
-
91
- if 'uniq' not in st.session_state:
92
- st.session_state.uniq = secrets.token_urlsafe()
93
-
94
- temp_dir = st.session_state.uniq
95
- my_makedirs(f"removefolder/{temp_dir}")
96
-
97
- flag = True
98
- if 'count' not in st.session_state:
99
- st.session_state.count = 0
100
- #tempolary
101
- if 'temp' not in st.session_state:
102
- st.session_state.temp = 0
103
-
104
- if 'lang' not in st.session_state:
105
- st.session_state.lang = ""
106
- if 'result' not in st.session_state:
107
- st.session_state.result = ""
108
-
109
- apptitle = st.empty()
110
- langs = st.empty()
111
- description = st.empty()
112
- obj_0 = st.empty()
113
- obj_1 = st.empty()
114
-
115
- apptitle.header("PDF file Translator π“†Š", divider="violet")
116
- langs.write('This App can translate to <`Japanese`, `English`, `French`, `Chinese (traditional)`, `Chinese (simplified)`, `Russian`, `Korean`, `Vietnamese`, `Thai`, `Catalan`, `Sinhalese`, `Nepall`>')
117
- description.write("""
118
- It's easy to use.:black_cat: Just upload, select the language, and download the resulting .zip file.:package:
119
-
120
- After uploading a PDF file and selecting the translation language, you can wait a while.
121
-
122
- The original text, the original text and translation (a few lines of the original text followed by the translation) and the translation text will be compressed into a zip file and available for download.
123
-
124
- When you press the download button, the compressed file will be downloaded, and if you select another translation language, the process will be repeated.:leftwards_arrow_with_hook:
125
-
126
- :koala: The uploaded PDF file data will disappear <u>when you close the browser tab</u>. :eyes:
127
-
128
- **Only PDF files can be uploaded.**
129
-
130
- This translation app is useful for people who want to translate something or want to read something but cannot read it unless it is translated, and who want to quickly check the original text and the translation by comparing them in pairs. :yin_yang:
131
-
132
- :full_moon_with_face: Even if the PDF file has many pages, there is no limit to the number of pages or characters.
133
 
134
- <u>The untranslated data will be retained until the browser is closed, but once the app page is closed, the connection will be cut off and the data will be deleted.</u> :thought_balloon:
135
-
136
- #### FAQ :coffee:
137
-
138
- :baby: **Q** : Does the translated text have information about line breaks and paragraphs? :coffee:
139
-
140
- :robot_face: **A** : Line breaks and paragraphs are not reflected in the translated text.
141
-
142
- The text extracted from the original text has the same position indexed with `:::info` as the translated text.
143
-
144
- Regardless of the contents of the uploaded PDF or document, counting starts from zero and you can see which page or sentence you are in.
145
- The original text has an `𓃰` : elephant mark after `:::info`, which are unicode characters representing Egyptian hieroglyphics.
146
-
147
- 𓃰00001-0;
148
-
149
- Similarly, the translation of the original text is followed by a `𓆏` : frog mark.
150
-
151
- 𓆏00001-0;
152
-
153
- :teapot: **Tips** : If you have a text editor with a pattern replacement function,
154
- you can use the characters starting with
155
- `:::info`
156
- and ending with the line that contains only
157
- `:::`
158
- as a pattern to remove.
159
- Try searching Google for keywords such as `grep and replace`.
160
 
161
- """, unsafe_allow_html=True)
162
- obj_0.success("PDF file uploader")
163
- # st.markdown(f"The remote ip is `{get_remote_ip()}`")
 
 
 
 
 
164
 
165
- uploaded_file = obj_1.file_uploader("UPLOAD your .pdf file", type="pdf")
166
- ####
167
  if uploaded_file is not None:
168
- flag = False
169
- st.success("PDF file translator")
170
- # hashed
171
  raw_filename = uploaded_file.name
172
- intext_0 = f'<span style="color:LavenderBlush;background:Orchid"> {raw_filename} </span>'
173
- st.write(intext_0, unsafe_allow_html=True)
174
  hashed_filename = hashlib.sha1(raw_filename.encode())
175
  uploadedfilename = hashed_filename.hexdigest()
176
- if "uploadedfilename" not in st.session_state:
177
- st.session_state.uploadedfilename = uploadedfilename
178
-
179
- if "book" not in st.session_state:
180
- #pdf_viewer(input=uploaded_file.getvalue(), width=700, height=500)
181
 
 
182
  my_makedirs(
183
- f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}"
184
  )
185
 
186
  with open(
187
- f'removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf',
188
  'wb') as file:
189
  file.write(uploaded_file.getvalue())
190
- # pdf_viewer(input=f'{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf', width=700, height=500)
191
 
192
- # read from PDF file
193
  PDF = glob.glob(
194
- f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf"
195
  )
196
 
197
- #doc = PdfReader(PDF[0])
198
  doc = fitz.open(PDF[0])
199
- # meta = doc.metadata
200
- #page_count = len(doc.pages)
201
  page_count = len(doc)
202
- book = [] # PDF text data pool
203
- progressbar1 = st.empty()
204
- my_bar1 = progressbar1.progress(0)
205
-
206
- from bs4 import BeautifulSoup
207
-
208
  for index, page in enumerate(doc):
209
- #page_text = page.extract_text()
210
- #page_text = page.get_text(sort=True)
211
  blocks = page.get_text("xml")
212
- soup = BeautifulSoup(blocks,'lxml-xml')
213
  page_text2 = ""
214
 
215
  for tag0 in soup.find_all("block"):
@@ -222,177 +82,34 @@ Try searching Google for keywords such as `grep and replace`.
222
  page_text2 += "\n"
223
  temp_y_posi = y_posi
224
  page_text2 += tag3.get("c")
225
-
226
  page_text2 += "\n\n"
227
-
228
-
229
-
230
-
231
- #for index, page in enumerate(doc.pages):
232
- #for index, page in enumerate(doc):
233
- # #page_text = page.extract_text()
234
- # page_text = page.get_text(sort=True)
235
- # book.append((index, page_text))
236
  book.append((index, page_text2))
237
 
238
- done = int(((index + 1) / page_count) * 100)
239
- my_bar1.progress(done,
240
- text=f"Reading Page Number : {index + 1}")
241
  doc.close()
242
-
243
- st.session_state.book = book
244
- my_bar1.empty()
245
  if os.path.isfile(
246
- f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}/{uploadedfilename}.pdf"
247
  ):
248
  shutil.rmtree(
249
- f"removefolder/{temp_dir}/upload_folder_{st.session_state.count}/"
250
  )
251
 
252
- ########
253
- reload_bt = st.empty()
254
- if reload_bt.button("Upload another PDF file"):
255
- for key in st.session_state.keys():
256
- if key == "count" or key == "temp" or key == "lang":
257
- continue
258
- else:
259
- del st.session_state[key]
260
- shutil.rmtree(f"removefolder/{temp_dir}")
261
- # page reload
262
- streamlit_js_eval(js_expressions="parent.window.location.reload()")
263
- st.markdown("----")
264
-
265
- plain_text1 = " 𓃠 select target language 𓃠 "
266
- var_text1 = f'##### <span style="color:green">{plain_text1}</span>'
267
-
268
- select = st.empty()
269
- select.write(var_text1, unsafe_allow_html=True)
270
-
271
- # select language
272
- st.markdown("""
273
- `ja`: **Japanese**,
274
- `en`: **English**,
275
- `fr`: **French**,
276
- `zb-TW`: **Chinese (traditional)**,
277
- `zh-CN`: **Chinese (simplified)**,
278
- `ru`: **Russian**,
279
- `ko`: **Korean**,
280
- `vi`: **Vietnamese**,
281
- `th`: **Thai**,
282
- `tl`: **Tagalog**,
283
- `ca`: **Catalan**,
284
- `si`: **Sinhalese**,
285
- `ne`: **Nepall**
286
- """)
287
- lang_code = [
288
- "select language",
289
- "Japanese",
290
- "English",
291
- "French",
292
- "Chinese traditional",
293
- "Chinese simplified",
294
- "Russian",
295
- "Korean",
296
- "Vietnamese",
297
- "Thai",
298
- "Tagalog",
299
- "Catalan",
300
- "Sinhalese",
301
- "Nepall"
302
- ]
303
- sel = st.empty()
304
- language = sel.radio(
305
- label='translate to',
306
- options=lang_code,
307
- index=0,
308
- key = f"select_lang{st.session_state.count}",
309
- horizontal=True)
310
- #language = sel.selectbox(
311
- # 'translate to',
312
- # lang_code,
313
- # index=0,
314
- # #placeholder = "select language",
315
- # key=f"select_lang{st.session_state.count}")
316
-
317
- statename = f"select_lang{st.session_state.count}"
318
- if "target_lang" not in st.session_state:
319
- st.session_state.target_lang = "UNSELECTED"
320
-
321
- def reset_selected_lang():
322
- st.session_state[statename] = "select language"
323
-
324
- st.button('Reset Language', on_click=reset_selected_lang)
325
-
326
- area = st.empty()
327
- if flag:
328
- if "select_lang" in st.session_state:
329
- if st.session_state.select_lang != "select language":
330
- area2 = st.empty()
331
- plain_text2 = "☟Reset Language☟"
332
- empty_text = "☟ ☟"
333
- var_text2 = f'<span style="color:#FF69B4">{plain_text2}</span>'
334
- while flag:
335
- area2.write(var_text2, unsafe_allow_html=True)
336
- time.sleep(0.9)
337
- area2.write(empty_text)
338
- time.sleep(0.5)
339
-
340
- while flag:
341
- area.text("π“€€ upload PDF file π“€€")
342
- time.sleep(1)
343
- area.text("π“€₯ π“€₯")
344
- time.sleep(0.8)
345
- else:
346
- if f"select_lang{st.session_state.count}" in st.session_state:
347
- statename = f"select_lang{st.session_state.count}"
348
- if st.session_state[statename] != "select language":
349
- plain_text2 = "Reset Language"
350
- var_text2 = f'<span style="color:gray">β–² `{plain_text2}`</span>'
351
- area.write(var_text2, unsafe_allow_html=True)
352
-
353
- obj_0.empty()
354
- obj_1.empty() # uploader hide
355
-
356
- # pdf translator
357
- #------------------------------------------
358
- st.markdown("----")
359
- st.success("translator")
360
-
361
- if "book" in st.session_state:
362
- book_data = st.session_state.book
363
- page_count = len(book_data)
364
- else:
365
- page_count = 0
366
-
367
- st.text(f"PDF total pages : {page_count}")
368
-
369
- progressbar = st.empty()
370
- my_bar = progressbar.progress(0)
371
-
372
- #3
373
- # from google.colab import output
374
- import re
375
- #from googletrans import Translator
376
- from deep_translator import GoogleTranslator
377
-
378
- title_name = re.sub("\.| |%|@|\"|\'", "_", f"{uploaded_file.name}")
379
-
380
- if st.session_state.temp != int(st.session_state.count):
381
- st.session_state.lang = "init"
382
- st.session_state.temp = int(st.session_state.count)
383
 
384
  if language not in lang_code[1:]:
385
  language = None
386
 
387
- if st.session_state.lang != language and language is not None:
388
- st.session_state.count += 1
389
- st.session_state.result = ""
390
- st.session_state.lang = language
391
 
392
- description.empty()
393
-
394
  my_makedirs(
395
- f"removefolder/{temp_dir}/work_{st.session_state.count}")
396
 
397
  to = ""
398
  match language:
@@ -425,34 +142,22 @@ Try searching Google for keywords such as `grep and replace`.
425
  case _:
426
  to = "unknown"
427
 
428
- st.info(f"translate to [ {language} ]")
429
-
430
- st.session_state.target_lang = to
431
 
432
- with st.container():
433
- work_area1 = st.empty()
434
- work_area2 = st.empty()
435
- #--------------------------------------
436
 
437
  for index, page in enumerate(book_data):
438
  page_text = page[1]
439
- # print("\nPage Number:" + str(index))
440
- done = int(((index + 1) / page_count) * 100)
441
- my_bar.progress(done,
442
- text=f"Working Page Number : {index + 1}")
443
- # print(len(page_text))
444
- # text_list = [s for s in page_text.split('\n') if s]
445
  page_text = re.sub('\.', '.π“‚€', page_text)
446
  text_list = [s for s in page_text.split('π“‚€')]
447
  if len(text_list) < 1:
448
  continue
449
 
450
- limit = 0
451
  temp_list = []
452
  line_number = []
453
 
454
  for n, line in enumerate(text_list):
455
-
456
  line2 = re.sub(r"\s+", " ", line)
457
  if line2 == "":
458
  continue
@@ -470,142 +175,142 @@ Try searching Google for keywords such as `grep and replace`.
470
 
471
  text_2 = text_
472
  text_ = re.sub('π“‚€', "", text_)
473
- #while (re.search('π“‚€', text_2)):
474
- # num = line_number.pop(0)
475
- # rep_words = f"𓃐NO:{num}| "
476
- # text_2 = text_2.replace('π“‚€', rep_words, 1)
477
  line_number.clear()
478
 
479
- # print(re.sub("𓃐","\n", text_2))
480
- #ts = Translator()
481
  all_text_orig = f":::info\n𓃰{index + 1:05d}" + f"-{n}" + f";\n:::\n{text_}\n"
482
 
483
  for times in range(0, 5):
484
-
485
  try:
486
  tsd = GoogleTranslator(
487
  source="auto",
488
  target=to).translate(text=text_)
489
  if tsd == None:
490
  tsd = text_
491
- #tsd = ts.translate(text_, src="en", dest="ja")
492
- #translated_text = ts.translate(line, src="en", dest="ja").text
493
  all_text_done = f":::info\n𓆏{index + 1:05d}" + f"-{n}" + f";\n:::\n{tsd}\n"
494
- #all_text_done = f"**{index:05d}" + f"-{n}" + "; " + tsd.text + "\n"
495
-
496
- # all_text_orig += str(n) + "; " + tsd.pronunciation + "\n"
497
- # print(index,n, line)
498
- # print(index,n, tsd.text)
499
-
500
- # print(all_text_orig)
501
- # print(all_text_done + "\n")
502
- if type(all_text_orig) is str and type(
503
- all_text_done) is str:
504
-
505
- # intext_1 = f'<span style="color:DimGray;background:GhostWhite">{all_text_orig}</span>'
506
- # work_area1.markdown(intext_1, unsafe_allow_html=True)
507
- work_area1.write(f"{all_text_orig}")
508
- # intext_2 = f'<span style="color:LavenderBlush;background:Gray">{all_text_done}</span>'
509
- work_area2.write(f"{all_text_done}")
510
- # work_area2.markdown(intext_2, unsafe_allow_html=True)
511
-
512
- with open(
513
- f"removefolder/{temp_dir}/work_{st.session_state.count}/reuseMarkdown.txt",
514
- "a") as tempf:
515
- tempf.write(all_text_orig + "\n\n" +
516
- all_text_done + "\n\n")
517
-
518
- # st.session_state.result += all_text_orig + "\n\n"
519
- # st.session_state.result += all_text_done + "\n\n"
520
-
521
- # print(n, tsd.pronunciation)
522
  with open(
523
- f"removefolder/{temp_dir}/work_{st.session_state.count}/{title_name}_done.txt",
 
 
 
 
 
 
524
  "a") as f:
525
  f.write(all_text_orig + all_text_done +
526
  "\n")
527
  with open(
528
- f"removefolder/{temp_dir}/work_{st.session_state.count}/{title_name}_done_{language}.txt",
529
  "a") as f:
530
  f.write(all_text_done + "\n")
531
 
532
  break
533
 
534
  except Exception as e:
535
- print(e)
536
  time.sleep(3)
537
  continue
538
 
539
  with open(
540
- f"removefolder/{temp_dir}/work_{st.session_state.count}/{title_name}_orig.txt",
541
  "a") as f:
542
  f.write(all_text_orig + "\n")
543
 
544
-
545
- st.markdown("----")
546
-
547
  my_makedirs(f"removefolder/{temp_dir}/download_section")
548
  shutil.move(
549
- f"removefolder/{temp_dir}/work_{st.session_state.count}/reuseMarkdown.txt",
550
- f"removefolder/{temp_dir}/download_section/reuseMarkdown_{st.session_state.count}.txt"
551
  )
552
 
553
  shutil.make_archive(
554
- f'removefolder/{temp_dir}/download_section/{st.session_state.uploadedfilename}_{st.session_state.count}',\
555
- format='zip',\
556
- root_dir=f'removefolder/{temp_dir}/work_{st.session_state.count}'\
557
- )
558
  shutil.rmtree(
559
- f"removefolder/{temp_dir}/work_{st.session_state.count}")
560
-
561
- st.balloons()
562
-
563
- work_area1.empty()
564
- work_area2.empty()
565
-
566
- #--------------------------------------
567
-
568
- st.success("Download translated text files")
569
- st.write(intext_0, unsafe_allow_html=True)
570
- # plain_text3 = f"[ {st.session_state.target_lang} ] : translated text files"
571
- plain_text3 = f"[ {language} ] : translated text files"
572
- var_text3 = f'##### <span style="color:#FF69B4">{plain_text3}</span>'
573
-
574
- translated = st.empty()
575
- translated.write(var_text3, unsafe_allow_html=True)
576
 
577
  if os.path.isfile(
578
- f'removefolder/{temp_dir}/download_section/{st.session_state.uploadedfilename}_{st.session_state.count}.zip'
579
  ):
580
  with open(
581
- f"removefolder/{temp_dir}/download_section/{st.session_state.uploadedfilename}_{st.session_state.count}.zip",
582
  "rb") as fpath:
583
- btn = st.download_button(
584
- label=f"DOWNLOAD .zip file",
585
- data=fpath,
586
- file_name=
587
- f"{st.session_state.uploadedfilename}_{st.session_state.count}.zip",
588
- mime="application/zip")
589
 
590
- plain_text4 = "download zipfile"
591
- var_text4 = f'<span style="color:gray">β–² `{plain_text4}` 𓁉 </span>'
592
- st.write(var_text4, unsafe_allow_html=True)
593
 
594
- st.markdown("----")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
595
 
596
- plain_text5 = " 𓀑 results 𓁙 "
597
- var_text5 = f'##### <span style="color:#20B2AA">{plain_text5}</span>'
598
- st.write(var_text5, unsafe_allow_html=True)
 
 
 
 
 
 
 
599
 
600
- tempf = open(
601
- f"removefolder/{temp_dir}/download_section/reuseMarkdown_{st.session_state.count}.txt"
602
- )
603
- all_result = tempf.read()
604
- tempf.close()
605
- st.write(intext_0, unsafe_allow_html=True)
606
- st.write(all_result, unsafe_allow_html=True)
607
- # st.write(st.session_state.result, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
 
609
  if __name__ == "__main__":
610
  heart_beat()
611
- main()
 
1
+ import gradio as gr
2
  import zipfile, shutil, time
3
  import os
4
  import hashlib
 
 
 
 
5
  import secrets
 
6
  import threading
 
 
 
 
 
7
  import fitz
8
  import glob
9
  import logging
10
+ from deep_translator import GoogleTranslator
11
+ import re
12
+ from bs4 import BeautifulSoup
13
 
14
  def get_remote_ip() -> str:
15
  """Get remote ip."""
16
+ return "127.0.0.1" # Placeholder for remote IP in Gradio
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def my_makedirs(path):
19
  if not os.path.isdir(path):
20
  os.makedirs(path)
 
24
  Heartbeat function to track whether the session is alive
25
  """
26
  thread = threading.Timer(interval=5, function=heart_beat)
27
+ thread.start()
28
 
29
+ def main(uploaded_file, language):
30
+ if 'uniq' not in gr.session_state:
31
+ gr.session_state.uniq = secrets.token_urlsafe()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ temp_dir = gr.session_state.uniq
34
+ my_makedirs(f"removefolder/{temp_dir}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ if 'count' not in gr.session_state:
37
+ gr.session_state.count = 0
38
+ if 'temp' not in gr.session_state:
39
+ gr.session_state.temp = 0
40
+ if 'lang' not in gr.session_state:
41
+ gr.session_state.lang = ""
42
+ if 'result' not in gr.session_state:
43
+ gr.session_state.result = ""
44
 
 
 
45
  if uploaded_file is not None:
 
 
 
46
  raw_filename = uploaded_file.name
 
 
47
  hashed_filename = hashlib.sha1(raw_filename.encode())
48
  uploadedfilename = hashed_filename.hexdigest()
49
+ if "uploadedfilename" not in gr.session_state:
50
+ gr.session_state.uploadedfilename = uploadedfilename
 
 
 
51
 
52
+ if "book" not in gr.session_state:
53
  my_makedirs(
54
+ f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}"
55
  )
56
 
57
  with open(
58
+ f'removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/{uploadedfilename}.pdf',
59
  'wb') as file:
60
  file.write(uploaded_file.getvalue())
 
61
 
 
62
  PDF = glob.glob(
63
+ f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/{uploadedfilename}.pdf"
64
  )
65
 
 
66
  doc = fitz.open(PDF[0])
 
 
67
  page_count = len(doc)
68
+ book = []
69
+
 
 
 
 
70
  for index, page in enumerate(doc):
 
 
71
  blocks = page.get_text("xml")
72
+ soup = BeautifulSoup(blocks, 'lxml-xml')
73
  page_text2 = ""
74
 
75
  for tag0 in soup.find_all("block"):
 
82
  page_text2 += "\n"
83
  temp_y_posi = y_posi
84
  page_text2 += tag3.get("c")
85
+
86
  page_text2 += "\n\n"
87
+
 
 
 
 
 
 
 
 
88
  book.append((index, page_text2))
89
 
 
 
 
90
  doc.close()
91
+ gr.session_state.book = book
 
 
92
  if os.path.isfile(
93
+ f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/{uploadedfilename}.pdf"
94
  ):
95
  shutil.rmtree(
96
+ f"removefolder/{temp_dir}/upload_folder_{gr.session_state.count}/"
97
  )
98
 
99
+ if gr.session_state.temp != int(gr.session_state.count):
100
+ gr.session_state.lang = "init"
101
+ gr.session_state.temp = int(gr.session_state.count)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  if language not in lang_code[1:]:
104
  language = None
105
 
106
+ if gr.session_state.lang != language and language is not None:
107
+ gr.session_state.count += 1
108
+ gr.session_state.result = ""
109
+ gr.session_state.lang = language
110
 
 
 
111
  my_makedirs(
112
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}")
113
 
114
  to = ""
115
  match language:
 
142
  case _:
143
  to = "unknown"
144
 
145
+ gr.session_state.target_lang = to
 
 
146
 
147
+ book_data = gr.session_state.book
148
+ page_count = len(book_data)
 
 
149
 
150
  for index, page in enumerate(book_data):
151
  page_text = page[1]
 
 
 
 
 
 
152
  page_text = re.sub('\.', '.π“‚€', page_text)
153
  text_list = [s for s in page_text.split('π“‚€')]
154
  if len(text_list) < 1:
155
  continue
156
 
 
157
  temp_list = []
158
  line_number = []
159
 
160
  for n, line in enumerate(text_list):
 
161
  line2 = re.sub(r"\s+", " ", line)
162
  if line2 == "":
163
  continue
 
175
 
176
  text_2 = text_
177
  text_ = re.sub('π“‚€', "", text_)
 
 
 
 
178
  line_number.clear()
179
 
 
 
180
  all_text_orig = f":::info\n𓃰{index + 1:05d}" + f"-{n}" + f";\n:::\n{text_}\n"
181
 
182
  for times in range(0, 5):
 
183
  try:
184
  tsd = GoogleTranslator(
185
  source="auto",
186
  target=to).translate(text=text_)
187
  if tsd == None:
188
  tsd = text_
 
 
189
  all_text_done = f":::info\n𓆏{index + 1:05d}" + f"-{n}" + f";\n:::\n{tsd}\n"
190
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  with open(
192
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}/reuseMarkdown.txt",
193
+ "a") as tempf:
194
+ tempf.write(all_text_orig + "\n\n" +
195
+ all_text_done + "\n\n")
196
+
197
+ with open(
198
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}/{uploadedfilename}_done.txt",
199
  "a") as f:
200
  f.write(all_text_orig + all_text_done +
201
  "\n")
202
  with open(
203
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}/{uploadedfilename}_done_{language}.txt",
204
  "a") as f:
205
  f.write(all_text_done + "\n")
206
 
207
  break
208
 
209
  except Exception as e:
 
210
  time.sleep(3)
211
  continue
212
 
213
  with open(
214
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}/{uploadedfilename}_orig.txt",
215
  "a") as f:
216
  f.write(all_text_orig + "\n")
217
 
 
 
 
218
  my_makedirs(f"removefolder/{temp_dir}/download_section")
219
  shutil.move(
220
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}/reuseMarkdown.txt",
221
+ f"removefolder/{temp_dir}/download_section/reuseMarkdown_{gr.session_state.count}.txt"
222
  )
223
 
224
  shutil.make_archive(
225
+ f'removefolder/{temp_dir}/download_section/{gr.session_state.uploadedfilename}_{gr.session_state.count}',
226
+ format='zip',
227
+ root_dir=f'removefolder/{temp_dir}/work_{gr.session_state.count}'
228
+ )
229
  shutil.rmtree(
230
+ f"removefolder/{temp_dir}/work_{gr.session_state.count}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
  if os.path.isfile(
233
+ f'removefolder/{temp_dir}/download_section/{gr.session_state.uploadedfilename}_{gr.session_state.count}.zip'
234
  ):
235
  with open(
236
+ f"removefolder/{temp_dir}/download_section/{gr.session_state.uploadedfilename}_{gr.session_state.count}.zip",
237
  "rb") as fpath:
238
+ return fpath.read()
 
 
 
 
 
239
 
240
+ return None
 
 
241
 
242
+ lang_code = [
243
+ "select language",
244
+ "Japanese",
245
+ "English",
246
+ "French",
247
+ "Chinese traditional",
248
+ "Chinese simplified",
249
+ "Russian",
250
+ "Korean",
251
+ "Vietnamese",
252
+ "Thai",
253
+ "Tagalog",
254
+ "Catalan",
255
+ "Sinhalese",
256
+ "Nepall"
257
+ ]
258
 
259
+ iface = gr.Interface(
260
+ fn=main,
261
+ inputs=[
262
+ gr.inputs.File(label="UPLOAD your .pdf file"),
263
+ gr.inputs.Radio(lang_code, label="translate to", default="select language")
264
+ ],
265
+ outputs=gr.outputs.File(label="DOWNLOAD .zip file"),
266
+ title="PDF file Translator π“†Š",
267
+ description="""
268
+ It's easy to use.:black_cat: Just upload, select the language, and download the resulting .zip file.:package:
269
 
270
+ After uploading a PDF file and selecting the translation language, you can wait a while.
271
+
272
+ The original text, the original text and translation (a few lines of the original text followed by the translation) and the translation text will be compressed into a zip file and available for download.
273
+
274
+ When you press the download button, the compressed file will be downloaded, and if you select another translation language, the process will be repeated.:leftwards_arrow_with_hook:
275
+
276
+ :koala: The uploaded PDF file data will disappear <u>when you close the browser tab</u>. :eyes:
277
+
278
+ **Only PDF files can be uploaded.**
279
+
280
+ This translation app is useful for people who want to translate something or want to read something but cannot read it unless it is translated, and who want to quickly check the original text and the translation by comparing them in pairs. :yin_yang:
281
+
282
+ :full_moon_with_face: Even if the PDF file has many pages, there is no limit to the number of pages or characters.
283
+
284
+ <u>The untranslated data will be retained until the browser is closed, but once the app page is closed, the connection will be cut off and the data will be deleted.</u> :thought_balloon:
285
+
286
+ #### FAQ :coffee:
287
+
288
+ :baby: **Q** : Does the translated text have information about line breaks and paragraphs? :coffee:
289
+
290
+ :robot_face: **A** : Line breaks and paragraphs are not reflected in the translated text.
291
+
292
+ The text extracted from the original text has the same position indexed with `:::info` as the translated text.
293
+
294
+ Regardless of the contents of the uploaded PDF or document, counting starts from zero and you can see which page or sentence you are in.
295
+ The original text has an `𓃰` : elephant mark after `:::info`, which are unicode characters representing Egyptian hieroglyphics.
296
+
297
+ 𓃰00001-0;
298
+
299
+ Similarly, the translation of the original text is followed by a `𓆏` : frog mark.
300
+
301
+ 𓆏00001-0;
302
+
303
+ :teapot: **Tips** : If you have a text editor with a pattern replacement function,
304
+ you can use the characters starting with
305
+ `:::info`
306
+ and ending with the line that contains only
307
+ `:::`
308
+ as a pattern to remove.
309
+ Try searching Google for keywords such as `grep and replace`.
310
+ """,
311
+ live=True
312
+ )
313
 
314
  if __name__ == "__main__":
315
  heart_beat()
316
+ iface.launch()