leonsimon23 commited on
Commit
65d246e
·
verified ·
1 Parent(s): e903a45

Create bak.txt

Browse files
Files changed (1) hide show
  1. bak.txt +500 -0
bak.txt ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+ from pdf2zh import __version__
5
+ from pdf2zh.pdf2zh import extract_text
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import pymupdf
10
+ import tqdm
11
+ import requests
12
+ import cgi
13
+
14
# Map each UI service label to a 3-tuple:
#   (pdf2zh service id, name of the env var holding its API key, default model id)
# A ``None`` in the 2nd/3rd slot means "no API key needed" / "no model choice".
service_map = {
    # "Google": ("google", None, None),
    # "DeepL": ("deepl", "DEEPL_AUTH_KEY", None),
    # "DeepLX": ("deeplx", "DEEPLX_AUTH_KEY", None),
    # "Ollama": ("ollama", None, "gemma2"),
    "OpenAI": ("openai", "OPENAI_API_KEY", "gpt-4o"),
    # "Azure": ("azure", "AZURE_APIKEY", None),
    # "Tencent": ("tencent", "TENCENT_SECRET_KEY", None),
}

# Map UI language labels to the language codes handed to the translator.
lang_map = {
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "German": "de",
    "Japanese": "ja",
    "Korean": "ko",
    "Russian": "ru",
    "Spanish": "es",
    "Italian": "it",
}

# Map UI page-range labels to zero-based page index lists (None = every page).
page_map = {
    "All": None,
    "First": [0],
    "First 5 pages": list(range(0, 5)),
}

# Demo mode is switched on by any non-empty PDF2ZH_DEMO environment variable.
flag_demo = bool(os.environ.get("PDF2ZH_DEMO"))

# reCAPTCHA site/secret keys. They are only consulted in demo mode, but are
# read unconditionally so the names always exist (None when unset).
client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
server_key = os.environ.get("PDF2ZH_SERVER_KEY")

if flag_demo:
    # The public demo is restricted to the Google backend and bounded page ranges.
    service_map = {
        "Google": ("google", None, None),
    }
    page_map = {
        "First": [0],
        "First 20 pages": list(range(0, 20)),
    }
53
+
54
+
55
def verify_recaptcha(response):
    """Validate a reCAPTCHA token against Google's ``siteverify`` endpoint.

    Args:
        response: The token produced by the reCAPTCHA widget in the browser.

    Returns:
        True when the endpoint reports ``success``; False otherwise, including
        when the verification request itself fails or returns invalid JSON.
    """
    recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"

    # The secret key and the raw token are deliberately NOT logged: both are
    # credentials and must not end up in server logs.
    data = {"secret": server_key, "response": response}
    try:
        # A timeout keeps a stalled verification call from hanging the request.
        result = requests.post(recaptcha_url, data=data, timeout=10).json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a non-JSON reply: treat as "not verified".
        print("reCAPTCHA", f"verification request failed: {exc}")
        return False

    success = bool(result.get("success"))
    print("reCAPTCHA", success)

    return success
66
+
67
+
68
def pdf_preview(file):
    """Render the first page of a PDF into an image array.

    Args:
        file: Path of the PDF to open with PyMuPDF.

    Returns:
        A ``numpy.uint8`` array of shape ``(height, width, 3)`` built from the
        pixmap samples of page 0.

    Raises:
        Whatever PyMuPDF raises when the file cannot be opened or has no
        first page; callers in this file catch ``Exception`` around the call.
    """
    doc = pymupdf.open(file)
    try:
        pix = doc[0].get_pixmap()
        # The reshape assumes 3 samples per pixel (RGB without alpha), which is
        # presumably what get_pixmap() yields by default -- TODO confirm.
        image = np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
    finally:
        # Always release the document so the file handle is not leaked.
        doc.close()
    return image
74
+
75
+
76
def upload_file(file, service, progress=gr.Progress()):
    """Validate an uploaded file and build its first-page preview.

    Args:
        file: Path of the uploaded file (may be empty/None).
        service: Currently selected service; not used by this handler.
        progress: Gradio progress tracker; not used by this handler.

    Returns:
        ``(file, preview_image)`` on success, or ``(None, None)`` when the
        path is missing/nonexistent or the preview cannot be rendered.
    """
    if file and os.path.exists(file):
        try:
            # Convert first page for preview
            return file, pdf_preview(file)
        except Exception as e:
            # Best effort: report the problem and fall through to the empty result.
            print(f"Error converting PDF: {e}")
    return None, None
89
+
90
+
91
def download_with_limit(url, save_path, size_limit):
    """Stream ``url`` into ``save_path`` while enforcing an optional size cap.

    The file name is taken from the ``Content-Disposition`` header when it can
    be parsed, otherwise from the path component of the URL. In both cases it
    is reduced to a bare file name so a remote server cannot write outside
    ``save_path``.

    Args:
        url: Address to download.
        save_path: Destination directory (a ``pathlib.Path``).
        size_limit: Maximum number of bytes to accept; falsy disables the cap.

    Returns:
        Path of the downloaded file inside ``save_path``.

    Raises:
        gr.Error: If the download grows beyond ``size_limit``.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import unquote, urlparse

    chunk_size = 1024
    total_size = 0
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        content = response.headers.get("Content-Disposition")
        try:
            _, params = cgi.parse_header(content)
            filename = params["filename"]
        except Exception:
            # No usable header: use the URL path only, so a query string or
            # fragment does not leak into the file name.
            filename = unquote(urlparse(url).path)

        # The name is attacker-controlled; drop any directory components
        # (both separator styles) to prevent path traversal.
        filename = os.path.basename(filename.replace("\\", "/"))
        if not filename or filename in (".", ".."):
            filename = "download.pdf"

        target = save_path / filename
        try:
            with open(target, "wb") as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    total_size += len(chunk)
                    if size_limit and total_size > size_limit:
                        raise gr.Error("Exceeds file size limit")
                    file.write(chunk)
        except Exception:
            # Do not leave a truncated or oversized file behind.
            if os.path.exists(target):
                os.remove(target)
            raise
    return target
109
+
110
+
111
def translate(
    file_type,
    file_input,
    link_input,
    service,
    apikey,
    model_id,
    lang_from,
    lang_to,
    page_range,
    recaptcha_response,
    progress=gr.Progress(),
):
    """Translate PDF content using selected service.

    Args:
        file_type: ``"File"`` to use ``file_input``; anything else uses ``link_input``.
        file_input: Path of the uploaded PDF.
        link_input: URL of a PDF to download.
        service: Key into ``service_map``.
        apikey: API key typed in the UI; only used when the service's
            environment variable is not already set.
        model_id: Model identifier appended to the service id.
        lang_from: Key into ``lang_map`` for the source language.
        lang_to: Key into ``lang_map`` for the target language.
        page_range: Key into ``page_map``.
        recaptcha_response: reCAPTCHA token, checked only in demo mode.
        progress: Gradio progress tracker.

    Returns:
        A 6-tuple: translated file path, preview image of the translation,
        dual-language file path, and three visibility updates for the output
        widgets.

    Raises:
        gr.Error: On failed reCAPTCHA, missing input, missing output files or
            a preview that cannot be rendered.
    """
    if flag_demo and not verify_recaptcha(recaptcha_response):
        raise gr.Error("reCAPTCHA fail")

    progress(0, desc="Starting translation...")

    output = Path("pdf2zh_files")
    output.mkdir(parents=True, exist_ok=True)

    if file_type == "File":
        if not file_input:
            raise gr.Error("No input")
        file_path = shutil.copy(file_input, output)
    else:
        if not link_input:
            raise gr.Error("No input")
        file_path = download_with_limit(
            link_input,
            output,
            5 * 1024 * 1024 if flag_demo else None,
        )

    filename = os.path.splitext(os.path.basename(file_path))[0]
    file_en = output / f"{filename}.pdf"
    file_zh = output / f"{filename}-zh.pdf"
    file_dual = output / f"{filename}-dual.pdf"

    selected_service, key_env, _default_model = service_map[service]
    # Only export a key the user actually supplied: os.environ rejects None,
    # and an empty string would pin a useless value for the process lifetime.
    # setdefault keeps a key already configured on the server authoritative.
    if key_env and apikey:
        os.environ.setdefault(key_env, apikey)
    selected_page = page_map[page_range]
    lang_from = lang_map[lang_from]
    lang_to = lang_map[lang_to]
    if selected_service == "google":
        # The Google backend is given "zh-CN" instead of the bare "zh" code.
        lang_from = "zh-CN" if lang_from == "zh" else lang_from
        lang_to = "zh-CN" if lang_to == "zh" else lang_to

    print(f"Files before translation: {os.listdir(output)}")

    def progress_bar(t: tqdm.tqdm):
        # tqdm's total may be None or 0; skip the update rather than crash.
        if t.total:
            progress(t.n / t.total, desc="Translating...")

    param = {
        "files": [file_en],
        "pages": selected_page,
        "lang_in": lang_from,
        "lang_out": lang_to,
        "service": f"{selected_service}:{model_id}",
        "output": output,
        "thread": 4,
        "callback": progress_bar,
    }
    print(param)
    extract_text(**param)
    print(f"Files after translation: {os.listdir(output)}")

    if not file_zh.exists() or not file_dual.exists():
        raise gr.Error("No output")

    try:
        translated_preview = pdf_preview(str(file_zh))
    except Exception as exc:
        # Keep the original failure attached for debugging.
        raise gr.Error("No preview") from exc

    progress(1.0, desc="Translation complete!")

    return (
        str(file_zh),
        translated_preview,
        str(file_dual),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )
198
+
199
+
200
# Global setup
# Shade table for the UI's primary hue, keyed by the gr.themes.Color keyword
# each value is passed as.
_blue_shades = {
    "c50": "#E8F3FF",
    "c100": "#BEDAFF",
    "c200": "#94BFFF",
    "c300": "#6AA1FF",
    "c400": "#4080FF",
    "c500": "#165DFF",  # Primary color
    "c600": "#0E42D2",
    "c700": "#0A2BA6",
    "c800": "#061D79",
    "c900": "#03114D",
    "c950": "#020B33",
}
custom_blue = gr.themes.Color(**_blue_shades)
214
+
215
# Build the Gradio UI: input/options column on the left, preview on the right.
# In demo mode the page additionally loads reCAPTCHA and renders its widget.

# JS hook shared by the file-type and upload events: renders the reCAPTCHA
# widget (errors such as "already rendered" are swallowed) and passes the
# first input through. Only built in demo mode.
recaptcha_render_js = (
    f"""
    (a,b)=>{{
        try{{
            grecaptcha.render('recaptcha-box',{{
                'sitekey':'{client_key}',
                'callback':'onVerify'
            }});
        }}catch(error){{}}
        return [a];
    }}
    """
    if flag_demo
    else ""
)

# The first configured service is the default, so the dropdown never starts
# on an entry that is missing from service_map.
default_service = next(iter(service_map))
default_model = service_map[default_service][2]

with gr.Blocks(
    title="PDFBestTranslate - PDF Translation with preserved formats",
    theme=gr.themes.Default(
        primary_hue=custom_blue, spacing_size="md", radius_size="lg"
    ),
    css="""
    .secondary-text {color: #999 !important;}
    footer {visibility: hidden}
    .env-warning {color: #dd5500 !important;}
    .env-success {color: #559900 !important;}
    /* Add dashed border to input-file class */
    .input-file {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        # background-color: #ffffff !important;
        transition: background-color 0.4s ease-out;
    }
    .input-file:hover {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        color: #165DFF !important;
        background-color: #E8F3FF !important;
        transition: background-color 0.2s ease-in;
    }
    .progress-bar-wrap {
        border-radius: 8px !important;
    }
    .progress-bar {
        border-radius: 8px !important;
    }
    # .input-file label {
    #     color: #165DFF !important;
    #     border: 1.2px dashed #165DFF !important;
    #     border-left: none !important;
    #     border-top: none !important;
    # }
    # .input-file .wrap {
    #     color: #165DFF !important;
    # }
    # .input-file .or {
    #     color: #165DFF !important;
    # }
    """,
    head=(
        """
        <script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
        <script type="text/javascript">
            var onVerify = function(token) {
                el=document.getElementById('verify').getElementsByTagName('textarea')[0];
                el.value=token;
                el.dispatchEvent(new Event('input'));
            };
        </script>
        """
        if flag_demo
        else ""
    ),
) as demo:
    gr.Markdown(
        # "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
        "# [PDFMathTranslate——科研之心免费提供(更多科研AI智能体请点击)](https://ai.linkagi.top)"
    )

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
            file_type = gr.Radio(
                choices=["File", "Link"],
                label="Type",
                value="File",
            )
            file_input = gr.File(
                label="File",
                file_count="single",
                file_types=[".pdf"],
                type="filepath",
                elem_classes=["input-file"],
            )
            link_input = gr.Textbox(
                label="Link",
                visible=False,
                interactive=True,
            )
            gr.Markdown("## Option(请先选择翻译模型)")
            with gr.Row():
                service = gr.Dropdown(
                    label="Service",
                    choices=list(service_map),
                    # Was hard-coded to "Google", which is absent from
                    # service_map outside demo mode.
                    value=default_service,
                )
                apikey = gr.Textbox(
                    label="API Key",
                    max_lines=1,
                    visible=False,
                )
            with gr.Row():
                lang_from = gr.Dropdown(
                    label="Translate from",
                    choices=list(lang_map),
                    value="English",
                )
                lang_to = gr.Dropdown(
                    label="Translate to",
                    choices=list(lang_map),
                    value="Chinese",
                )
            page_range = gr.Radio(
                choices=list(page_map),
                label="Pages",
                value=list(page_map.keys())[0],
            )
            # Pre-fill the model of the default service so translating without
            # touching the dropdown uses the same model a selection would set.
            # With no default model the textbox is created exactly as before.
            if default_model:
                model_id_kwargs = {"visible": True, "value": default_model}
            else:
                model_id_kwargs = {"visible": False}
            model_id = gr.Textbox(
                label="Model ID",
                interactive=True,
                **model_id_kwargs,
            )
            envs_status = "<span class='env-success'>- Properly configured.</span><br>"

            def details_wrapper(text_markdown):
                """Wrap a status snippet in the "Technical details" footer markup."""
                text = f"""
                <summary>Technical details</summary>
                {text_markdown}
                - GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>
                - GUI by: <a href="https://github.com/reycn">Rongxin</a><br>
                - Version: {__version__}
                """
                return text

            def env_var_checker(env_var_name: str) -> str:
                """Describe whether ``env_var_name`` is set, as footer markup.

                A falsy name means the service needs no key and is reported as
                properly configured.
                """
                if env_var_name:
                    if not os.environ.get(env_var_name):
                        envs_status = (
                            f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
                            + "</span><br>- Please make sure that the environment variables are properly configured "
                            + "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
                        )
                    else:
                        value = str(os.environ.get(env_var_name))
                        envs_status = "<span class='env-success'>- Properly configured.</span><br>"
                        # NOTE(review): this shows the first 13 characters of
                        # the secret to every visitor -- consider masking more.
                        envs_status += (
                            f"- {env_var_name}: <code>{value[:13]}***</code><br>"
                        )
                else:
                    envs_status = (
                        "<span class='env-success'>- Properly configured.</span><br>"
                    )
                return details_wrapper(envs_status)

            def on_select_service(service, evt: gr.EventData):
                """Refresh footer, model box and key box for the chosen service.

                Returns updates for ``tech_details_tog``, ``model_id`` and
                ``apikey``, in that order.
                """
                # The key box stays hidden and is never filled from the
                # environment: a hidden component's value is still sent to the
                # browser, which would expose the server's API key. translate()
                # uses os.environ.setdefault, so a configured environment key
                # is used regardless of this field.
                apikey_content = gr.update(visible=False)
                if service_map[service][2]:
                    model_visibility = gr.update(
                        visible=True, value=service_map[service][2]
                    )
                else:
                    model_visibility = gr.update(visible=False)
                return (
                    env_var_checker(service_map[service][1]),
                    model_visibility,
                    apikey_content,
                )

            def on_select_filetype(file_type):
                """Show the file picker for "File" and the URL box for "Link"."""
                return (
                    gr.update(visible=file_type == "File"),
                    gr.update(visible=file_type == "Link"),
                )

            output_title = gr.Markdown("## Translated", visible=False)
            output_file = gr.File(label="Download Translation", visible=False)
            output_file_dual = gr.File(
                label="Download Translation (Dual)", visible=False
            )
            recaptcha_response = gr.Textbox(
                label="reCAPTCHA Response", elem_id="verify", visible=False
            )
            recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
            translate_btn = gr.Button("Translate", variant="primary")
            tech_details_tog = gr.Markdown(
                details_wrapper(envs_status),
                elem_classes=["secondary-text"],
            )
            service.select(
                on_select_service, service, [tech_details_tog, model_id, apikey]
            )
            file_type.select(
                on_select_filetype,
                file_type,
                [file_input, link_input],
                js=recaptcha_render_js,
            )

        with gr.Column(scale=2):
            gr.Markdown("## Preview")
            preview = gr.Image(label="Document Preview", visible=True)

    # Event handlers
    file_input.upload(
        upload_file,
        inputs=[file_input, service],
        outputs=[file_input, preview],
        js=recaptcha_render_js,
    )

    # output_file / output_file_dual appear twice on purpose: translate()
    # returns a value and then a visibility update for each of them.
    translate_btn.click(
        translate,
        inputs=[
            file_type,
            file_input,
            link_input,
            service,
            apikey,
            model_id,
            lang_from,
            lang_to,
            page_range,
            recaptcha_response,
        ],
        outputs=[
            output_file,
            preview,
            output_file_dual,
            output_file,
            output_file_dual,
            output_title,
        ],
    ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
475
+
476
+
477
def setup_gui(share=False):
    """Launch the Gradio app.

    In demo mode the app is served on 0.0.0.0 with a 5 MB upload limit.
    Otherwise 0.0.0.0 and then 127.0.0.1 are tried in turn; if both launches
    raise, a final launch without an explicit host and with sharing enabled is
    attempted.

    Args:
        share: Forwarded to ``demo.launch`` for the explicit-host attempts in
            non-demo mode.
    """
    if flag_demo:
        demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
        return

    for host in ("0.0.0.0", "127.0.0.1"):
        try:
            demo.launch(server_name=host, debug=True, inbrowser=True, share=share)
        except Exception:
            print(
                f"Error launching GUI using {host}.\nThis may be caused by global mode of proxy software."
            )
        else:
            # Launch returned normally; no fallback needed.
            return

    # Both explicit hosts failed: last resort is a shared launch.
    demo.launch(debug=True, inbrowser=True, share=True)


# For auto-reloading while developing
if __name__ == "__main__":
    setup_gui()