leonsimon23 committed on
Commit
8b23ca3
·
verified ·
1 Parent(s): 2a78cd7

Upload 9 files

Browse files
pdf2zh/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import logging

# Package-level logger; submodules create their own via getLogger(__name__).
log = logging.getLogger(__name__)

__version__ = "1.8.4"
__author__ = "Byaidu"
pdf2zh/cache.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import tempfile
import os
import time
import hashlib
import shutil

# On-disk cache for translated paragraphs, keyed by content hashes.
# Lives under the system temp directory and is created eagerly at import
# time so the helpers below can assume it exists.
cache_dir = os.path.join(tempfile.gettempdir(), "cache")
os.makedirs(cache_dir, exist_ok=True)
# Name of the per-entry marker file holding the entry's last-update timestamp.
time_filename = "update_time"
# Maximum number of cache entry directories kept before eviction kicks in.
max_cache = 5
11
+
12
+
13
def deterministic_hash(obj):
    """Return a stable 20-hex-char digest of ``str(obj)``.

    Unlike the builtin ``hash``, the result is identical across interpreter
    runs, which makes it usable as an on-disk cache key.
    """
    digest = hashlib.sha256(str(obj).encode()).hexdigest()
    return digest[:20]
17
+
18
+
19
def get_dirs():
    """List the absolute paths of all entry directories inside the cache."""
    candidates = (os.path.join(cache_dir, name) for name in os.listdir(cache_dir))
    return [path for path in candidates if os.path.isdir(path)]
26
+
27
+
28
def get_time(dir):
    """Return the last-update timestamp recorded inside cache entry *dir*.

    Returns:
        The float timestamp read from the entry's marker file, or
        ``float("inf")`` when the marker file does not exist.

    Raises:
        ValueError: if the marker file exists but holds non-numeric content
            (``remove_extra`` relies on this to purge corrupt entries).
    """
    timefile = os.path.join(dir, time_filename)
    try:
        # Context manager closes the handle; the original leaked it.
        with open(timefile, encoding="utf-8") as f:
            return float(f.read())
    except FileNotFoundError:
        return float("inf")
38
+
39
+
40
def write_time(dir):
    """Stamp cache entry *dir* with the current time.

    Overwrites the entry's marker file with ``time.time()`` so eviction can
    order entries by recency.
    """
    timefile = os.path.join(dir, time_filename)
    # 'with' guarantees the handle is flushed and closed; the original
    # leaked the file object it opened inside print().
    with open(timefile, "w", encoding="utf-8") as f:
        f.write(str(time.time()))
44
+
45
+
46
def argmin(iterable):
    """Return the index of the smallest value in *iterable* (first on ties)."""
    best_index, _ = min(enumerate(iterable), key=lambda pair: pair[1])
    return best_index
48
+
49
+
50
def remove_extra():
    """Evict cache entries until at most ``max_cache`` remain.

    First purges entries whose timestamp marker is unreadable, then
    repeatedly removes the entry with the smallest (oldest) timestamp.
    """
    for dir in get_dirs():
        # get_time() returns inf for a *missing* marker, so only a marker
        # with corrupt (non-numeric) content raises here.
        # (The original also had a dead `os.remove(dir)` branch guarded by
        # `not os.path.isdir(dir)`, but get_dirs() only returns directories
        # and os.remove cannot delete a directory anyway — dropped.)
        try:
            get_time(dir)
        except Exception:  # was BaseException: don't swallow KeyboardInterrupt
            shutil.rmtree(dir)
    while True:
        dirs = get_dirs()
        if len(dirs) <= max_cache:
            break
        times = [get_time(dir) for dir in dirs]
        shutil.rmtree(dirs[argmin(times)])
68
+
69
+
70
def is_cached(hash_key):
    """Return True when a cache entry directory exists for *hash_key*."""
    return os.path.exists(os.path.join(cache_dir, hash_key))
73
+
74
+
75
def create_cache(hash_key):
    """Ensure the entry directory for *hash_key* exists and refresh its timestamp."""
    entry = os.path.join(cache_dir, hash_key)
    os.makedirs(entry, exist_ok=True)
    write_time(entry)
79
+
80
+
81
def load_paragraph(hash_key, hash_key_paragraph):
    """Return the cached translation for a paragraph, or None on a cache miss.

    Args:
        hash_key: cache entry (document/translator) key.
        hash_key_paragraph: key of the individual paragraph.
    """
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    try:
        # EAFP avoids the exists()/open() race of the original and closes
        # the handle, which the original leaked.
        with open(filename, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None
87
+
88
+
89
def write_paragraph(hash_key, hash_key_paragraph, paragraph):
    """Store *paragraph* in the cache under the given entry and paragraph keys."""
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # 'with' flushes and closes the handle; the original leaked the file
    # object it opened inside print().
    with open(filename, "w", encoding="utf-8") as f:
        f.write(paragraph)
pdf2zh/converter.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
2
+ from pdfminer.pdffont import PDFFont, PDFCIDFont
3
+ from pdfminer.converter import PDFConverter
4
+ from pdfminer.pdffont import PDFUnicodeNotDefined
5
+ from pdfminer.utils import apply_matrix_pt, mult_matrix
6
+ from pdfminer.layout import (
7
+ LTChar,
8
+ LTFigure,
9
+ LTLine,
10
+ LTPage,
11
+ )
12
+ import logging
13
+ import re
14
+ import concurrent.futures
15
+ import numpy as np
16
+ import unicodedata
17
+ from tenacity import retry, wait_fixed
18
+ from pdf2zh import cache
19
+ from pdf2zh.translator import (
20
+ BaseTranslator,
21
+ GoogleTranslator,
22
+ DeepLTranslator,
23
+ DeepLXTranslator,
24
+ OllamaTranslator,
25
+ OpenAITranslator,
26
+ AzureTranslator,
27
+ TencentTranslator,
28
+ )
29
+ from pymupdf import Font
30
+
31
+ log = logging.getLogger(__name__)
32
+
33
+
34
class PDFConverterEx(PDFConverter):
    """PDFConverter subclass that returns layout objects instead of writing output.

    The pdfminer base class is constructed with a ``None`` output file; the
    overridden page/figure hooks below hand the received layout back to the
    caller so it can be post-processed (translated and re-typeset).
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)

    def begin_page(self, page, ctm) -> None:
        # Override: replace the media box with the CTM-transformed crop box.
        (x0, y0, x1, y1) = page.cropbox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, page):
        # Override: return the rendering instruction stream.
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        # Override: propagate the enclosing page id onto the figure.
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item.pageid = self._stack[-1].pageid

    def end_figure(self, _: str) -> None:
        # Override: return the rendering instruction stream.
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        # Override: attach the original cid and font onto each rendered char.
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack: stash the original character code
        item.font = font  # hack: stash the original character font
        return item.adv
102
+
103
+
104
class Paragraph:
    """Record of a source-document paragraph's geometry and typography."""

    def __init__(self, y, x, x0, x1, size, font, brk):
        self.y: float = y  # initial vertical coordinate
        self.x: float = x  # initial horizontal coordinate
        self.x0: float = x0  # left boundary
        self.x1: float = x1  # right boundary
        self.size: float = size  # font size
        self.font: PDFFont = font  # font
        self.brk: bool = brk  # line-break (wrap) flag
113
+
114
+
115
+ # fmt: off
116
+ class TranslateConverter(PDFConverterEx):
117
+ def __init__(
118
+ self,
119
+ rsrcmgr,
120
+ vfont: str = None,
121
+ vchar: str = None,
122
+ thread: int = 0,
123
+ layout={},
124
+ lang_in: str = "",
125
+ lang_out: str = "",
126
+ service: str = "",
127
+ resfont: str = "",
128
+ noto: Font = None,
129
+ ) -> None:
130
+ super().__init__(rsrcmgr)
131
+ self.vfont = vfont
132
+ self.vchar = vchar
133
+ self.thread = thread
134
+ self.layout = layout
135
+ self.resfont = resfont
136
+ self.noto = noto
137
+ self.translator: BaseTranslator = None
138
+ param = service.split(":", 1)
139
+ if param[0] == "google":
140
+ self.translator = GoogleTranslator(service, lang_out, lang_in, None)
141
+ elif param[0] == "deepl":
142
+ self.translator = DeepLTranslator(service, lang_out, lang_in, None)
143
+ elif param[0] == "deeplx":
144
+ self.translator = DeepLXTranslator(service, lang_out, lang_in, None)
145
+ elif param[0] == "ollama":
146
+ self.translator = OllamaTranslator(service, lang_out, lang_in, param[1])
147
+ elif param[0] == "openai":
148
+ self.translator = OpenAITranslator(service, lang_out, lang_in, param[1])
149
+ elif param[0] == "azure":
150
+ self.translator = AzureTranslator(service, lang_out, lang_in, None)
151
+ elif param[0] == "tencent":
152
+ self.translator = TencentTranslator(service, lang_out, lang_in, None)
153
+ else:
154
+ raise ValueError("Unsupported translation service")
155
+
156
    def receive_layout(self, ltpage: LTPage):
        """Parse one page, translate its paragraphs, and re-typeset them.

        Returns the PDF text-rendering operator stream ("BT ... ET") for the
        translated page.
        """
        # paragraphs
        sstk: list[str] = []            # paragraph text stack
        pstk: list[Paragraph] = []      # paragraph attribute stack
        vbkt: int = 0                   # formula bracket nesting count
        # current formula group
        vstk: list[LTChar] = []         # formula symbol group
        vlstk: list[LTLine] = []        # formula line group
        vfix: float = 0                 # formula vertical offset
        # formula group stacks
        var: list[list[LTChar]] = []    # formula symbol group stack
        varl: list[list[LTLine]] = []   # formula line group stack
        varf: list[float] = []          # formula vertical offset stack
        vlen: list[float] = []          # formula width stack
        # global state
        lstk: list[LTLine] = []         # global line stack
        xt: LTChar = None               # previous character
        xt_cls: int = -1                # previous character's paragraph class
        vmax: float = ltpage.width / 4  # max width of an inline formula
        ops: str = ""                   # rendered operator stream

        def vflag(font: str, char: str):  # detect formula (and script) fonts
            font = font.split("+")[-1]    # strip the subset prefix from the font name
            if re.match(r"\(cid:", char):
                return True
            # rules based on the font name
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(  # latex fonts
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # rules based on the character set
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "  # not a space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]  # modifiers, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)  # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. parse the original document
        for child in ltpage:
            if isinstance(child, LTChar):
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure's height, so use layout.shape throughout
                h, w = layout.shape
                # look up the layout class of the current character
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if (  # does the current char belong to a formula?
                    cls == 0  # 1. class marks a reserved region
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. script font: scripts run ~0.76 and small caps ~0.799, 0.79 splits them; also covers enlarged initials
                    or vflag(child.fontname, child.get_text())  # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)  # 4. vertical writing
                ):
                    cur_v = True
                # does this bracket group belong to the formula?
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (  # has the current formula ended?
                    not cur_v  # 1. current char is not part of a formula
                    or cls != xt_cls  # 2. current char belongs to a different paragraph
                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)  # 3. wrap inside the paragraph: could be a long italic run or an in-paragraph fraction break; the threshold tells them apart
                ):
                    if vstk:
                        if (  # fix the formula's vertical offset from the text to its right
                            not cur_v  # 1. current char is not part of a formula
                            and cls == xt_cls  # 2. same paragraph as the previous char
                            and child.x0 > max([vch.x0 for vch in vstk])  # 3. current char sits right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        sstk[-1] += f"$v{len(var)}$"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # current char is plain text, or it is the first char of a formula
                if not vstk:
                    if cls == xt_cls:  # same paragraph as the previous char
                        if child.x0 > xt.x1 + 1:  # add an inline space
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:  # add a wrap space and mark the paragraph as wrapped
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:  # start a new paragraph from the current char
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
                if not cur_v:  # push text
                    if (  # update paragraph attributes from the current char
                        child.size > pstk[-1].size / 0.79  # 1. char is notably larger than the paragraph font
                        or len(sstk[-1].strip()) == 1  # 2. char is the paragraph's second glyph (handles enlarged initials)
                        or vflag(pstk[-1].font.fontname, "")  # 3. paragraph font is a formula font
                        or re.match(  # 4. paragraph font is bold
                            r"(.*Medi|.*Bold)",
                            pstk[-1].font.fontname,
                            re.IGNORECASE,
                        )
                    ):
                        pstk[-1].y -= child.size - pstk[-1].size  # hack: this vertical-position fix is imperfect but workable
                        pstk[-1].size = child.size
                        pstk[-1].font = child.font
                    sstk[-1] += child.get_text()
                else:  # push formula
                    if (  # fix the formula's vertical offset from the text to its left
                        not vstk  # 1. char starts the formula
                        and cls == xt_cls  # 2. same paragraph as the previous char
                        and child.x0 > xt.x0  # 3. previous char sits left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # update the paragraph bounds here, because a wrap inside a
                # paragraph may be followed by the start of a formula
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # remember the previous character
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):  # figure
                pass
            elif isinstance(child, LTLine):  # line
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure's height, so use layout.shape throughout
                h, w = layout.shape
                # look up the layout class of the current line
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:  # formula line
                    vlstk.append(child)
                else:  # global line
                    lstk.append(child)
            else:
                pass
        # flush the trailing formula
        if vstk:  # pop the formula group
            sstk[-1] += f"$v{len(var)}$"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for id, v in enumerate(var):  # compute formula widths
            l = max([vch.x1 for vch in v]) - v[0].x0
            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
            vlen.append(l)

        ############################################################
        # B. translate the paragraphs
        log.debug("\n==========[SSTACK]==========\n")
        hash_key = cache.deterministic_hash("PDFMathTranslate")
        cache.create_cache(hash_key)

        @retry(wait=wait_fixed(1))
        def worker(s: str):  # translate in worker threads
            try:
                hash_key_paragraph = cache.deterministic_hash(
                    (s, str(self.translator))
                )
                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # cache lookup
                if new is None:
                    new = self.translator.translate(s)
                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
                return new
            except BaseException as e:
                if log.isEnabledFor(logging.DEBUG):
                    log.exception(e)
                else:
                    log.exception(e, exc_info=False)
                raise e
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.thread
        ) as executor:
            news = list(executor.map(worker, sstk))

        ############################################################
        # C. typeset the new document
        def raw_string(fcur: str, cstk: str):  # hex-encode a string for the target font
            if fcur == 'noto':
                return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # choose the code length
                return "".join(["%04x" % ord(c) for c in cstk])
            else:
                return "".join(["%02x" % ord(c) for c in cstk])

        _x, _y = 0, 0
        for id, new in enumerate(news):
            x: float = pstk[id].x  # paragraph initial x
            y: float = pstk[id].y  # paragraph top bound
            x0: float = pstk[id].x0  # paragraph left bound
            x1: float = pstk[id].x1  # paragraph right bound
            size: float = pstk[id].size  # paragraph font size
            font: PDFFont = pstk[id].font  # paragraph font
            brk: bool = pstk[id].brk  # paragraph wrap flag
            cstk: str = ""  # current text buffer
            fcur: str = None  # current font id
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[id]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
                )  # match the $vn$ formula marker; the leading $ is sometimes dropped
                mod = 0  # glyph-modifier width
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception:
                        continue  # the translator may emit an out-of-range formula marker
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # glyph modifier
                        mod = var[vid][-1].width
                else:  # load a character
                    ch = new[ptr]
                    fcur_ = None
                    # the original font's encoding is error-prone, so it is abandoned here
                    # try:
                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
                    #         fcur_=self.fontid[font] # original font
                    # except:
                    #     pass
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # default non-Latin font
                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (  # flush the text buffer
                    fcur_ != fcur  # 1. font changed
                    or vy_regex  # 2. inserting a formula
                    or x + adv > x1 + 0.1 * size  # 3. reached the right bound (a whole line may be symbols, so allow for float error)
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # right bound reached and the original paragraph wraps
                    x = x0
                    lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
                    y -= size * lang_space.get(self.translator.lang_out, 1.1)  # most other languages fit 1.1
                if vy_regex:  # insert a formula
                    fix = 0
                    if fcur is not None:  # adjust vertical offset for an in-paragraph formula
                        fix = varf[vid]
                    for vch in var[vid]:  # typeset formula characters
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for l in varl[vid]:  # typeset formula lines
                        if l.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
                else:  # append to the text buffer
                    if not cstk:  # line start
                        tx = x
                        if x == x0 and ch == " ":  # drop the wrap space
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod  # glyph modifier
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # flush the remainder
            if cstk:
                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
        for l in lstk:  # typeset global lines
            if l.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops
pdf2zh/doclayout.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import cv2
3
+ import numpy as np
4
+ import ast
5
+ import onnx
6
+ import onnxruntime
7
+ from huggingface_hub import hf_hub_download
8
+
9
+
10
class DocLayoutModel(abc.ABC):
    """Interface for document layout-detection models."""

    @staticmethod
    def load_onnx():
        # Download (via the Hugging Face Hub cache) and load the ONNX
        # DocLayout-YOLO model.
        model = OnnxModel.from_pretrained(
            repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx",
            filename="doclayout_yolo_docstructbench_imgsz1024.onnx",
        )
        return model

    @staticmethod
    def load_available():
        # The ONNX backend is currently the only implementation.
        return DocLayoutModel.load_onnx()

    @property
    @abc.abstractmethod
    def stride(self) -> int:
        """Stride of the model input."""
        pass

    @abc.abstractmethod
    def predict(self, image, imgsz=1024, **kwargs) -> list:
        """
        Predict the layout of a document page.

        Args:
            image: The image of the document page.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            **kwargs: Additional arguments.
        """
        pass
40
+
41
+
42
class YoloResult:
    """Detection results from the ONNX model, sorted by confidence (descending)."""

    def __init__(self, boxes, names):
        parsed = [YoloBox(data=row) for row in boxes]
        parsed.sort(key=lambda box: box.conf, reverse=True)
        self.boxes = parsed
        self.names = names
49
+
50
+
51
class YoloBox:
    """One detection row: bounding box, confidence score, and class id."""

    def __init__(self, data):
        # Row layout: [x1, y1, x2, y2, ..., confidence, class].
        self.xyxy = data[0:4]
        self.conf = data[-2]
        self.cls = data[-1]
58
+
59
+
60
class OnnxModel(DocLayoutModel):
    """DocLayoutModel backed by an ONNX Runtime inference session."""

    def __init__(self, model_path: str):
        self.model_path = model_path

        # The model's stride and class-name table are stored as literals in
        # the ONNX metadata properties.
        model = onnx.load(model_path)
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])

        self.model = onnxruntime.InferenceSession(model.SerializeToString())

    @staticmethod
    def from_pretrained(repo_id: str, filename: str):
        """Download the model file from the Hugging Face Hub and load it."""
        pth = hf_hub_download(repo_id=repo_id, filename=filename)
        return OnnxModel(pth)

    @property
    def stride(self):
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of ``self.stride``.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio (preserve aspect ratio)
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding (114 is the conventional YOLO letterbox gray)
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes (in place on `boxes`)
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=1024, **kwargs):
        """Run layout detection on one page image; returns ``[YoloResult]``."""
        # Preprocess input image
        orig_h, orig_w = image.shape[:2]
        pix = self.resize_and_pad_image(image, new_shape=imgsz)
        pix = np.transpose(pix, (2, 0, 1))  # CHW
        pix = np.expand_dims(pix, axis=0)  # BCHW
        pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
        new_h, new_w = pix.shape[2:]

        # Run inference
        preds = self.model.run(None, {"images": pix})[0]

        # Postprocess predictions: keep detections above the 0.25 confidence
        # threshold, then map boxes back to the original image coordinates.
        preds = preds[preds[..., 4] > 0.25]
        preds[..., :4] = self.scale_boxes(
            (new_h, new_w), preds[..., :4], (orig_h, orig_w)
        )
        return [YoloResult(boxes=preds, names=self._names)]
pdf2zh/gui.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+ from pdf2zh import __version__
5
+ from pdf2zh.pdf2zh import extract_text
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import pymupdf
10
+ import tqdm
11
+ import requests
12
+ import cgi
13
+
14
# Map GUI service names to (pdf2zh service id, API-key env var, default model).
service_map = {
    "Google": ("google", None, None),
    "DeepL": ("deepl", "DEEPL_AUTH_KEY", None),
    "DeepLX": ("deeplx", "DEEPLX_AUTH_KEY", None),
    "Ollama": ("ollama", None, "gemma2"),
    "OpenAI": ("openai", "OPENAI_API_KEY", "gpt-4o"),
    "Azure": ("azure", "AZURE_APIKEY", None),
    "Tencent": ("tencent", "TENCENT_SECRET_KEY", None),
}
# GUI language label -> language code.
lang_map = {
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "German": "de",
    "Japanese": "ja",
    "Korean": "ko",
    "Russian": "ru",
    "Spanish": "es",
    "Italian": "it",
}
# GUI page-range label -> list of 0-based page indices (None = all pages).
page_map = {
    "All": None,
    "First": [0],
    "First 5 pages": list(range(0, 5)),
}

# Demo mode (PDF2ZH_DEMO env var): restrict services and page counts, and
# require reCAPTCHA verification via the client/server key pair below.
flag_demo = False
if os.environ.get("PDF2ZH_DEMO"):
    flag_demo = True
    service_map = {
        "Google": ("google", None, None),
    }
    page_map = {
        "First": [0],
        "First 20 pages": list(range(0, 20)),
    }
    # reCAPTCHA site (client) and secret (server) keys.
    client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
    server_key = os.environ.get("PDF2ZH_SERVER_KEY")
53
+
54
+
55
def verify_recaptcha(response):
    """Validate a reCAPTCHA token against Google's siteverify endpoint.

    Args:
        response: the client-side reCAPTCHA response token.

    Returns:
        Truthy when Google confirms the token, falsy otherwise.
    """
    recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"

    data = {"secret": server_key, "response": response}
    # Bounded timeout so a network stall cannot hang the GUI worker.
    # SECURITY FIX: the original printed server_key (the reCAPTCHA secret)
    # to stdout — never log secrets.
    result = requests.post(recaptcha_url, data=data, timeout=10).json()

    print("reCAPTCHA", result.get("success"))

    return result.get("success")
66
+
67
+
68
def pdf_preview(file):
    """Render the first page of a PDF as an RGB numpy array for preview."""
    first_page = pymupdf.open(file)[0]
    pix = first_page.get_pixmap()
    return np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
74
+
75
+
76
def upload_file(file, service, progress=gr.Progress()):
    """Handle file upload, validation, and initial preview."""
    if not file or not os.path.exists(file):
        return None, None

    try:
        # Render page 1 as the preview image.
        return file, pdf_preview(file)
    except Exception as e:
        print(f"Error converting PDF: {e}")
        return None, None
89
+
90
+
91
def download_with_limit(url, save_path, size_limit):
    """Stream *url* into directory *save_path*, enforcing an optional size cap.

    Args:
        url: source URL.
        save_path: destination directory (a ``pathlib.Path``).
        size_limit: maximum allowed byte count, or ``None`` for unlimited.

    Returns:
        Path of the downloaded file.

    Raises:
        gr.Error: when the download exceeds ``size_limit``.
    """
    chunk_size = 1024
    total_size = 0
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        try:
            # Parse the filename out of Content-Disposition with the email
            # header parser; the previously used cgi.parse_header comes from
            # the cgi module, which was removed in Python 3.13 (PEP 594).
            from email.message import Message

            msg = Message()
            msg["Content-Disposition"] = response.headers["Content-Disposition"]
            filename = msg.get_filename()
            if not filename:
                raise KeyError("filename")
        except Exception:
            # Missing/unparsable header: fall back to the URL basename.
            filename = os.path.basename(url)
        with open(save_path / filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                total_size += len(chunk)
                if size_limit and total_size > size_limit:
                    raise gr.Error("Exceeds file size limit")
                file.write(chunk)
    return save_path / filename
109
+
110
+
111
def translate(
    file_type,
    file_input,
    link_input,
    service,
    apikey,
    model_id,
    lang_from,
    lang_to,
    page_range,
    recaptcha_response,
    progress=gr.Progress(),
):
    """Translate PDF content using selected service.

    Returns:
        (translated file path, preview image, dual-language file path,
        three gr.update(visible=True) objects for the result widgets).

    Raises:
        gr.Error: on failed reCAPTCHA, missing input, oversized download,
            missing output, or preview failure.
    """
    if flag_demo and not verify_recaptcha(recaptcha_response):
        raise gr.Error("reCAPTCHA fail")

    progress(0, desc="Starting translation...")

    output = Path("pdf2zh_files")
    output.mkdir(parents=True, exist_ok=True)

    if file_type == "File":
        if not file_input:
            raise gr.Error("No input")
        file_path = shutil.copy(file_input, output)
    else:
        if not link_input:
            raise gr.Error("No input")
        file_path = download_with_limit(
            link_input,
            output,
            5 * 1024 * 1024 if flag_demo else None,
        )

    # Derive the expected input/output paths from the uploaded file's stem.
    # BUG FIX: these were hard-coded to "(unknown).pdf", so `filename` was
    # unused, extract_text was pointed at a non-existent input file, and the
    # -zh/-dual results could never be found.
    filename = os.path.splitext(os.path.basename(file_path))[0]
    file_en = output / f"{filename}.pdf"
    file_zh = output / f"{filename}-zh.pdf"
    file_dual = output / f"{filename}-dual.pdf"

    selected_service = service_map[service][0]
    if service_map[service][1]:
        # setdefault: an already-exported key takes precedence over the GUI field.
        os.environ.setdefault(service_map[service][1], apikey)
    selected_page = page_map[page_range]
    lang_from = lang_map[lang_from]
    lang_to = lang_map[lang_to]
    if selected_service == "google":
        # Google expects zh-CN rather than the bare zh code.
        lang_from = "zh-CN" if lang_from == "zh" else lang_from
        lang_to = "zh-CN" if lang_to == "zh" else lang_to

    print(f"Files before translation: {os.listdir(output)}")

    def progress_bar(t: tqdm.tqdm):
        # Forward pdf2zh's tqdm progress into the gradio progress bar.
        progress(t.n / t.total, desc="Translating...")

    param = {
        "files": [file_en],
        "pages": selected_page,
        "lang_in": lang_from,
        "lang_out": lang_to,
        "service": f"{selected_service}:{model_id}",
        "output": output,
        "thread": 4,
        "callback": progress_bar,
    }
    print(param)
    extract_text(**param)
    print(f"Files after translation: {os.listdir(output)}")

    if not file_zh.exists() or not file_dual.exists():
        raise gr.Error("No output")

    try:
        translated_preview = pdf_preview(str(file_zh))
    except Exception:
        raise gr.Error("No preview")

    progress(1.0, desc="Translation complete!")

    return (
        str(file_zh),
        translated_preview,
        str(file_dual),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )
198
+
199
+
200
+ # Global setup
201
+ custom_blue = gr.themes.Color(
202
+ c50="#E8F3FF",
203
+ c100="#BEDAFF",
204
+ c200="#94BFFF",
205
+ c300="#6AA1FF",
206
+ c400="#4080FF",
207
+ c500="#165DFF", # Primary color
208
+ c600="#0E42D2",
209
+ c700="#0A2BA6",
210
+ c800="#061D79",
211
+ c900="#03114D",
212
+ c950="#020B33",
213
+ )
214
+
215
# Gradio application layout and event wiring.  `flag_demo` toggles the hosted
# demo behaviour (file-size caption, reCAPTCHA scripts/JS); `service_map`,
# `lang_map`, `page_map`, `client_key`, `upload_file` and `translate` are
# defined earlier in this module.
with gr.Blocks(
    title="PDFMathTranslate - PDF Translation with preserved formats",
    theme=gr.themes.Default(
        primary_hue=custom_blue, spacing_size="md", radius_size="lg"
    ),
    css="""
    .secondary-text {color: #999 !important;}
    footer {visibility: hidden}
    .env-warning {color: #dd5500 !important;}
    .env-success {color: #559900 !important;}

    /* Add dashed border to input-file class */
    .input-file {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        # background-color: #ffffff !important;
        transition: background-color 0.4s ease-out;
    }

    .input-file:hover {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        color: #165DFF !important;
        background-color: #E8F3FF !important;
        transition: background-color 0.2s ease-in;
    }

    .progress-bar-wrap {
        border-radius: 8px !important;
    }
    .progress-bar {
        border-radius: 8px !important;
    }

    # .input-file label {
    #     color: #165DFF !important;
    #     border: 1.2px dashed #165DFF !important;
    #     border-left: none !important;
    #     border-top: none !important;
    # }
    # .input-file .wrap {
    #     color: #165DFF !important;
    # }
    # .input-file .or {
    #     color: #165DFF !important;
    # }
    """,
    head=(
        # Demo mode only: load Google reCAPTCHA and forward the token into the
        # hidden "verify" textbox so the backend can validate it.
        """
        <script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
        <script type="text/javascript">
            var onVerify = function(token) {
                el=document.getElementById('verify').getElementsByTagName('textarea')[0];
                el.value=token;
                el.dispatchEvent(new Event('input'));
            };
        </script>
        """
        if flag_demo
        else ""
    ),
) as demo:
    gr.Markdown(
        "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
    )

    with gr.Row():
        # Left column: inputs and options.
        with gr.Column(scale=1):
            gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
            file_type = gr.Radio(
                choices=["File", "Link"],
                label="Type",
                value="File",
            )
            file_input = gr.File(
                label="File",
                file_count="single",
                file_types=[".pdf"],
                type="filepath",
                elem_classes=["input-file"],
            )
            link_input = gr.Textbox(
                label="Link",
                visible=False,
                interactive=True,
            )
            gr.Markdown("## Option")
            with gr.Row():
                service = gr.Dropdown(
                    label="Service",
                    choices=service_map.keys(),
                    value="Google",
                )
                apikey = gr.Textbox(
                    label="API Key",
                    max_lines=1,
                    visible=False,
                )
            with gr.Row():
                lang_from = gr.Dropdown(
                    label="Translate from",
                    choices=lang_map.keys(),
                    value="English",
                )
                lang_to = gr.Dropdown(
                    label="Translate to",
                    choices=lang_map.keys(),
                    value="Chinese",
                )
            page_range = gr.Radio(
                choices=page_map.keys(),
                label="Pages",
                value=list(page_map.keys())[0],
            )
            model_id = gr.Textbox(
                label="Model ID",
                visible=False,
                interactive=True,
            )
            envs_status = "<span class='env-success'>- Properly configured.</span><br>"

            def details_wrapper(text_markdown):
                """Wrap *text_markdown* in the "Technical details" footer block."""
                text = f"""
                <summary>Technical details</summary>
                {text_markdown}
                - GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>
                - GUI by: <a href="https://github.com/reycn">Rongxin</a><br>
                - Version: {__version__}
                """
                return text

            def env_var_checker(env_var_name: str) -> str:
                """Return footer HTML reporting whether *env_var_name* is set.

                Only the first 13 characters of the value are shown, so API
                keys are never fully exposed in the UI.
                """
                if env_var_name:
                    if not os.environ.get(env_var_name):
                        envs_status = (
                            f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
                            + "</span><br>- Please make sure that the environment variables are properly configured "
                            + "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
                        )
                    else:
                        value = str(os.environ.get(env_var_name))
                        envs_status = "<span class='env-success'>- Properly configured.</span><br>"
                        envs_status += (
                            f"- {env_var_name}: <code>{value[:13]}***</code><br>"
                        )
                else:
                    envs_status = (
                        "<span class='env-success'>- Properly configured.</span><br>"
                    )
                return details_wrapper(envs_status)

            def on_select_service(service, evt: gr.EventData):
                """Show/hide the API-key and model-ID fields for the chosen service."""
                # service_map[service] is (backend_name, env_var_name, default_model).
                if service_map[service][1]:
                    apikey_content = gr.update(
                        visible=True, value=os.environ.get(service_map[service][1])
                    )
                else:
                    apikey_content = gr.update(visible=False)
                if service_map[service][2]:
                    model_visibility = gr.update(
                        visible=True, value=service_map[service][2]
                    )
                else:
                    model_visibility = gr.update(visible=False)
                return (
                    env_var_checker(service_map[service][1]),
                    model_visibility,
                    apikey_content,
                )

            def on_select_filetype(file_type):
                """Toggle between the file-upload and link input widgets."""
                return (
                    gr.update(visible=file_type == "File"),
                    gr.update(visible=file_type == "Link"),
                )

            output_title = gr.Markdown("## Translated", visible=False)
            output_file = gr.File(label="Download Translation", visible=False)
            output_file_dual = gr.File(
                label="Download Translation (Dual)", visible=False
            )
            recaptcha_response = gr.Textbox(
                label="reCAPTCHA Response", elem_id="verify", visible=False
            )
            recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
            translate_btn = gr.Button("Translate", variant="primary")
            tech_details_tog = gr.Markdown(
                details_wrapper(envs_status),
                elem_classes=["secondary-text"],
            )
            service.select(
                on_select_service, service, [tech_details_tog, model_id, apikey]
            )
            file_type.select(
                on_select_filetype,
                file_type,
                [file_input, link_input],
                js=(
                    # Demo mode: (re)render the reCAPTCHA widget on type change.
                    f"""
                    (a,b)=>{{
                        try{{
                            grecaptcha.render('recaptcha-box',{{
                                'sitekey':'{client_key}',
                                'callback':'onVerify'
                            }});
                        }}catch(error){{}}
                        return [a];
                    }}
                    """
                    if flag_demo
                    else ""
                ),
            )

        # Right column: rendered preview of the (translated) document.
        with gr.Column(scale=2):
            gr.Markdown("## Preview")
            preview = gr.Image(label="Document Preview", visible=True)

    # Event handlers
    file_input.upload(
        upload_file,
        inputs=[file_input, service],
        outputs=[file_input, preview],
        js=(
            # Demo mode: (re)render the reCAPTCHA widget after an upload.
            f"""
            (a,b)=>{{
                try{{
                    grecaptcha.render('recaptcha-box',{{
                        'sitekey':'{client_key}',
                        'callback':'onVerify'
                    }});
                }}catch(error){{}}
                return [a];
            }}
            """
            if flag_demo
            else ""
        ),
    )

    translate_btn.click(
        translate,
        inputs=[
            file_type,
            file_input,
            link_input,
            service,
            apikey,
            model_id,
            lang_from,
            lang_to,
            page_range,
            recaptcha_response,
        ],
        outputs=[
            output_file,
            preview,
            output_file_dual,
            output_file,
            output_file_dual,
            output_title,
        ],
        # Demo mode: reset the captcha after each translation attempt.
    ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
478
+
479
+
480
def setup_gui(share=False):
    """Launch the Gradio app.

    In demo mode the server binds to all interfaces with a 5 MB upload cap.
    Otherwise it tries 0.0.0.0 first, falls back to 127.0.0.1, and finally
    lets Gradio tunnel via share=True — each fallback is attempted when the
    previous bind fails (commonly caused by proxy software in global mode).
    """
    if flag_demo:
        demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
        return

    try:
        demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
        return
    except Exception:
        print(
            "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
        )

    try:
        demo.launch(
            server_name="127.0.0.1", debug=True, inbrowser=True, share=share
        )
        return
    except Exception:
        print(
            "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
        )

    # Last resort: let Gradio pick the binding and open a share tunnel.
    demo.launch(debug=True, inbrowser=True, share=True)
499
+
500
+
501
# For auto-reloading while developing
# Running this module directly starts the GUI with default settings (no share).
if __name__ == "__main__":
    setup_gui()
pdf2zh/high_level.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Functions that can be used for the most common use-cases for pdf2zh.six"""
2
+
3
+ from typing import BinaryIO
4
+ import numpy as np
5
+ import tqdm
6
+ from pymupdf import Document
7
+ from pdfminer.pdfpage import PDFPage
8
+ from pdfminer.pdfinterp import PDFResourceManager
9
+ from pdfminer.pdfdocument import PDFDocument
10
+ from pdfminer.pdfparser import PDFParser
11
+ from pdf2zh.converter import TranslateConverter
12
+ from pdf2zh.pdfinterp import PDFPageInterpreterEx
13
+ from pymupdf import Font
14
+
15
+
16
def extract_text_to_fp(
    inf: BinaryIO,
    pages=None,
    password: str = "",
    debug: bool = False,
    page_count: int = 0,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_en: Document = None,
    model=None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    resfont: str = "",
    noto: Font = None,
    callback: object = None,
    **kwarg,
) -> None:
    """Translate the text layer of the PDF read from *inf*.

    For each selected page a layout model (*model*) classifies regions; a
    rasterized mask (one int per pixel) marks which regions may be translated
    and which (figures, tables, formulas, captions) must be left alone.  The
    pages are then re-interpreted through PDFPageInterpreterEx, which records
    patched content streams into the returned ``obj_patch`` dict
    (xref -> new operator stream).

    Args:
        inf: binary stream of the source PDF (for pdfminer parsing).
        pages: optional container of 0-based page indices to process.
        password: PDF decryption password.
        doc_en: the same document opened with pymupdf, mutated in place
            (new xrefs are allocated for the patched content streams).
        model: layout-detection model exposing ``predict``.
        callback: optional progress callback receiving the tqdm instance.

    Returns:
        dict mapping xref numbers to replacement content streams.
    """
    rsrcmgr = PDFResourceManager()
    layout = {}
    device = TranslateConverter(
        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
    )

    assert device is not None
    obj_patch = {}
    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
    if pages:
        total_pages = len(pages)
    else:
        total_pages = page_count

    parser = PDFParser(inf)
    doc = PDFDocument(parser, password=password)
    with tqdm.tqdm(
        enumerate(PDFPage.create_pages(doc)),
        total=total_pages,
    ) as progress:
        for pageno, page in progress:
            if pages and (pageno not in pages):
                continue
            if callback:
                callback(progress)
            page.pageno = pageno
            pix = doc_en[page.pageno].get_pixmap()
            # BUG FIX: np.fromstring is deprecated and removed in modern NumPy;
            # np.frombuffer reads the same raw bytes without copying.  The
            # [:, :, ::-1] flip converts RGB -> BGR for the detector.
            # NOTE(review): assumes the pixmap has exactly 3 channels (no
            # alpha) — TODO confirm for PDFs with transparency.
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height, pix.width, 3
            )[:, :, ::-1]
            page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]
            # A kd-tree would be overkill here; render the boxes into a raster
            # mask instead, trading memory for lookup speed.
            box = np.ones((pix.height, pix.width))
            h, w = box.shape
            # Region classes that must NOT be translated.
            vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
            # First pass: mark translatable regions with a per-box id (i + 2).
            for i, d in enumerate(page_layout.boxes):
                if not page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = i + 2
            # Second pass: zero out protected regions so they win any overlap.
            for i, d in enumerate(page_layout.boxes):
                if page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = 0
            layout[page.pageno] = box
            # Allocate a fresh xref to hold the page's new instruction stream.
            page.page_xref = doc_en.get_new_xref()  # hack: new xref for the page
            doc_en.update_object(page.page_xref, "<<>>")
            doc_en.update_stream(page.page_xref, b"")
            doc_en[page.pageno].set_contents(page.page_xref)
            interpreter.process_page(page)

    device.close()
    return obj_patch
pdf2zh/pdf2zh.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """A command line tool for extracting text and images from PDF and
3
+ output it to plain text, html, xml or tags.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import os
10
+ import sys
11
+ import logging
12
+ from pathlib import Path
13
+ from typing import Any, Container, Iterable, List, Optional
14
+ import urllib.request
15
+ from pdfminer.pdfexceptions import PDFValueError
16
+
17
+ import pymupdf
18
+ import requests
19
+ import tempfile
20
+
21
+ from pdf2zh import __version__, log
22
+ from pdf2zh.high_level import extract_text_to_fp
23
+ from pdf2zh.doclayout import DocLayoutModel
24
+
25
+ logging.basicConfig()
26
+
27
+ model = DocLayoutModel.load_available()
28
+
29
+ resfont_map = {
30
+ "zh-CN": "china-ss",
31
+ "zh-TW": "china-ts",
32
+ "ja": "japan-s",
33
+ "ko": "korea-s",
34
+ }
35
+ noto_list = [
36
+ "am", # Amharic
37
+ "ar", # Arabic
38
+ "bn", # Bengali
39
+ "bg", # Bulgarian
40
+ "chr", # Cherokee
41
+ "el", # Greek
42
+ "gu", # Gujarati
43
+ "iw", # Hebrew
44
+ "hi", # Hindi
45
+ # "ja", # Japanese
46
+ "kn", # Kannada
47
+ # "ko", # Korean
48
+ "ml", # Malayalam
49
+ "mr", # Marathi
50
+ "ru", # Russian
51
+ "sr", # Serbian
52
+ # "zh-CN",# Chinese (PRC)
53
+ "ta", # Tamil
54
+ "te", # Telugu
55
+ "th", # Thai
56
+ # "zh-TW",# Chinese (Taiwan)
57
+ "ur", # Urdu
58
+ "uk", # Ukrainian
59
+ ]
60
+
61
+
62
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    http/https URLs are skipped: they are downloaded later by extract_text,
    so their absence locally is not an error.  Replaces two redundant list
    passes with a single filter using str.startswith's tuple form.
    """
    local_files = [f for f in files if not f.startswith(("http://", "https://"))]
    return [f for f in local_files if not os.path.exists(f)]
71
+
72
+
73
def extract_text(
    files: Iterable[str] = [],
    pages: Optional[Container[int]] = None,
    password: str = "",
    debug: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
):
    """Translate each PDF in *files*, writing ``<name>-zh.pdf`` (translated)
    and ``<name>-dual.pdf`` (interleaved original/translated pages) into the
    *output* directory.

    URLs are downloaded to ./pdf2zh_files first.  Fonts appropriate for
    *lang_out* are embedded into every page (CJK base fonts, a downloaded
    Noto font for other scripts, or china-ss as the fallback).

    Raises:
        PDFValueError: when *files* is empty or a download fails.
    """
    # NOTE: the mutable default for `files` is safe — it is only iterated,
    # never mutated.
    if debug:
        log.setLevel(logging.DEBUG)

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    for file in files:
        # BUG FIX: the original tested `file is str`, which compares the value
        # with the `str` type object and is always False, so the download
        # branch could never run.  isinstance() is the correct check.
        if isinstance(file, str) and (
            file.startswith("http://") or file.startswith("https://")
        ):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    # BUG FIX: os.mkdir(os.path.dirname("./pdf2zh_files"))
                    # tried to create ".", which always exists and raises.
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                )
        filename = os.path.splitext(os.path.basename(file))[0]

        # Decide which fonts must be embedded for the target language.
        font_list = [("tiro", None)]
        noto = None
        if lang_out in resfont_map:  # CJK
            resfont = resfont_map[lang_out]
            font_list.append((resfont, None))
        elif lang_out in noto_list:  # noto
            resfont = "noto"
            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
            if not os.path.exists(ttf_path):
                print("Downloading Noto font...")
                urllib.request.urlretrieve(
                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                    ttf_path,
                )
            font_list.append(("noto", ttf_path))
            noto = pymupdf.Font("noto", ttf_path)
        else:  # auto
            resfont = "china-ss"
            font_list.append(("china-ss", None))

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        # Register the fonts on every page and in every resource dictionary
        # (including XObject-level resources) so the patched content streams
        # can reference them.
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font[0]] = page.insert_font(font[0], font[1])
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # resources may live on an xobj
                try:  # xref reads/writes can fail on malformed objects
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font[0]}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref,
                                    f"{label}Font/{font[0]}",
                                    f"{font_id[font[0]]} 0 R",
                                )
                except Exception:
                    pass
        # BUG FIX: the f-strings below contained no placeholder ("(unknown)"),
        # leaving the computed `filename` unused; restore the intended
        # per-input naming.
        file_en = Path(output) / f"{filename}-en.pdf"
        doc_en.save(file_en)

        with open(file_en, "rb") as fp:
            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())

        # Splice the translated content streams back into the document.
        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        # Dual version: original pages interleaved with translated pages.
        doc_dual = pymupdf.open(file_en)
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(file_en)

    return
184
+
185
+
186
def create_parser() -> argparse.ArgumentParser:
    """Build the pdf2zh command-line parser.

    Fixes two help-string typos ("regex to math ..." -> "regex to match ...").
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        help="The regex to match font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        help="The regex to match character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="auto",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="auto",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )

    return parser
287
+
288
+
289
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse CLI arguments, expanding --pages into a list of 0-based indices.

    "3" becomes [2]; "3-5" becomes [2, 3, 4] (the CLI is 1-based inclusive).
    """
    parsed_args = create_parser().parse_args(args=args)

    if parsed_args.pages:
        indices: List[int] = []
        for token in parsed_args.pages.split(","):
            first, sep, last = token.partition("-")
            if sep:
                # Inclusive 1-based range -> half-open 0-based range.
                indices.extend(range(int(first) - 1, int(last)))
            else:
                indices.append(int(token) - 1)
        parsed_args.pages = indices

    return parsed_args
303
+
304
+
305
def main(args: Optional[List[str]] = None) -> int:
    """CLI entry point.

    Returns 0 on success (or after the GUI exits), -1 when any local input
    file is missing.
    """
    parsed_args = parse_args(args)

    # Report missing local files before doing any work; URLs are exempt.
    missing_files = check_files(parsed_args.files)
    if missing_files:
        print("The following files do not exist:", file=sys.stderr)
        for file in missing_files:
            print(f" {file}", file=sys.stderr)
        return -1
    if parsed_args.interactive:
        # GUI mode: import lazily so the CLI path never pays the gradio cost.
        from pdf2zh.gui import setup_gui

        setup_gui(parsed_args.share)
        return 0

    extract_text(**vars(parsed_args))
    return 0
322
+
323
+
324
+ if __name__ == "__main__":
325
+ sys.exit(main())
pdf2zh/pdfinterp.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any, Dict, Optional, Sequence, Tuple, cast
3
+ import numpy as np
4
+
5
+ from pdfminer import settings
6
+ from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
7
+ from pdfminer.pdfdevice import PDFDevice
8
+ from pdfminer.pdfinterp import (
9
+ PDFPageInterpreter,
10
+ PDFResourceManager,
11
+ PDFContentParser,
12
+ PDFInterpreterError,
13
+ Color,
14
+ PDFStackT,
15
+ LITERAL_FORM,
16
+ LITERAL_IMAGE,
17
+ )
18
+ from pdfminer.pdffont import PDFFont
19
+ from pdfminer.pdfpage import PDFPage
20
+ from pdfminer.pdftypes import (
21
+ PDFObjRef,
22
+ dict_value,
23
+ list_value,
24
+ resolve1,
25
+ stream_value,
26
+ )
27
+ from pdfminer.psexceptions import PSEOF
28
+ from pdfminer.psparser import (
29
+ PSKeyword,
30
+ keyword_name,
31
+ literal_name,
32
+ )
33
+ from pdfminer.utils import (
34
+ MATRIX_IDENTITY,
35
+ Matrix,
36
+ Rect,
37
+ mult_matrix,
38
+ apply_matrix_pt,
39
+ )
40
+
41
+ log = logging.getLogger(__name__)
42
+
43
+
44
+ def safe_float(o: Any) -> Optional[float]:
45
+ try:
46
+ return float(o)
47
+ except (TypeError, ValueError):
48
+ return None
49
+
50
+
51
class PDFPageInterpreterEx(PDFPageInterpreter):
    """Processor for the content of a PDF page.

    Extends pdfminer's PDFPageInterpreter so that, instead of only driving a
    device, it also RE-EMITS the operator stream it executes: each page (and
    each Form XObject) gets a patched content stream recorded in
    ``self.obj_patch`` (xref/objid -> new stream text).  Several paint
    operators are overridden to drop non-formula artwork so the translated
    text can be laid over the original page.

    Reference: PDF Reference, Appendix A, Operator Summary
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
    ) -> None:
        # obj_patch is shared with the caller and with dup()'d interpreters,
        # accumulating replacement streams across the whole document.
        self.rsrcmgr = rsrcmgr
        self.device = device
        self.obj_patch = obj_patch

    def dup(self) -> "PDFPageInterpreterEx":
        """Clone this interpreter (used when descending into Form XObjects)."""
        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)

    def init_resources(self, resources: Dict[object, object]) -> None:
        # Override: additionally record a font -> resource-id map (fontid)
        # and zero each font's descent.
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        self.fontid: Dict[PDFFont, object] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            # Resolve a colorspace spec to a PDFColorSpace, handling the
            # parameterized ICCBased and DeviceN forms explicitly.
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            # log.debug("Resource: %r: %r", k, v)
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                        spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
                    self.fontmap[fontid].descent = 0  # hack fix descent
                    self.fontid[self.fontmap[fontid]] = fontid
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    def do_S(self) -> None:
        # Override: only keep strokes that look like formula fraction bars.
        """Stroke path"""

        def is_black(color: Color) -> bool:
            if isinstance(color, Tuple):
                return sum(color) == 0
            else:
                return color == 0

        if (
            len(self.curpath) == 2
            and self.curpath[0][0] == "m"
            and self.curpath[1][0] == "l"
            and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
            == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
            and is_black(self.graphicstate.scolor)
        ):  # a standalone horizontal black line (likely part of a formula)
            # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
            self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
            self.curpath = []
            # Returning "n" makes execute() re-emit this op as a no-paint op.
            return "n"
        else:
            self.curpath = []

    ############################################################
    # Overrides that drop fill/stroke artwork (F/B variants): the original
    # painted page already carries it, so re-painting is suppressed.
    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)"""

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    ############################################################
    # Overrides that RETURN the operator's arguments (SCN family) so that
    # execute() can copy them verbatim into the re-emitted stream.
    def do_SCN(self) -> None:
        """Set color for stroking operations."""
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.scolor = cast(Color, args)
        return args

    def do_scn(self) -> None:
        """Set color for nonstroking operations"""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> None:
        """Set color for stroking operations"""
        return self.do_SCN()

    def do_sc(self) -> None:
        """Set color for nonstroking operations"""
        return self.do_scn()

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        # Override: record an obj_patch entry for Form XObjects as well.
        """Invoke named XObject"""
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        # log.debug("Processing xobj: %r", xobj)
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            ctm = mult_matrix(matrix, self.ctm)
            ops_base = interpreter.render_contents(
                resources,
                [xobj],
                ctm=ctm,
            )
            try:  # sometimes a font cannot be added to the form; skip patching then
                self.device.fontid = interpreter.fontid
                self.device.fontmap = interpreter.fontmap
                ops_new = self.device.end_figure(xobjid)
                # The new text ops are in page space; prepend the inverse CTM
                # so they land correctly inside the XObject's own space.
                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
                pos_inv = -np.mat(ctm[4:]) * ctm_inv
                a, b, c, d = ctm_inv.reshape(4).tolist()
                e, f = pos_inv.tolist()[0]
                self.obj_patch[self.xobjmap[xobjid].objid] = (
                    f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
                )
            except Exception:
                pass
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page: PDFPage) -> None:
        # Override: record the page's obj_patch entry (keyed by the new
        # page_xref allocated by the caller) and blank out the original
        # content streams.
        # log.debug("Processing page: %r", page)
        # print(page.mediabox,page.cropbox)
        # (x0, y0, x1, y1) = page.mediabox
        (x0, y0, x1, y1) = page.cropbox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        ops_new = self.device.end_page(page)
        # Rendering above subtracted the cropbox offset to get real
        # coordinates; add it back here with a cm when emitting.  ops_base may
        # contain images, so q/Q resets the matrix and lets the new text in
        # ops_new paint on top.
        self.obj_patch[page.page_xref] = (
            f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}"
        )
        for obj in page.contents:
            self.obj_patch[obj.objid] = ""

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> None:
        # Override: return the re-emitted instruction stream.
        """Render the content streams.

        This method may be called recursively.
        """
        # log.debug(
        #     "render_contents: resources=%r, streams=%r, ctm=%r",
        #     resources,
        #     streams,
        #     ctm,
        # )
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))

    def execute(self, streams: Sequence[object]) -> None:
        # Override: execute the operators AND rebuild the stream text,
        # filtering out text-painting ops (T*), which the device re-emits
        # translated, plus marked-content and inline-image ops.
        ops = ""
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                    "'",
                    "_q",
                )
                if hasattr(self, method):
                    func = getattr(self, method)
                    nargs = func.__code__.co_argcount - 1
                    if nargs:
                        args = self.pop(nargs)
                        # log.debug("exec: %s %r", name, args)
                        if len(args) == nargs:
                            func(*args)
                        if not (
                            name[0] == "T"
                            or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
                        ):  # Filter T-family text ops; EI takes an obj argument
                            # so it must be filtered too (only used by a few
                            # documents to draw rules); also filter
                            # marked-content ops.
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in args
                                ]
                            )
                            ops += f"{p} {name} "
                    else:
                        # log.debug("exec: %s", name)
                        targs = func()
                        if targs is None:
                            targs = []
                        if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in targs
                                ]
                            )
                            ops += f"{p} {name} "
                elif settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
            else:
                self.push(obj)
        # print('REV DATA',ops)
        return ops
pdf2zh/translator.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import hmac
3
+ import html
4
+ import logging
5
+ import os
6
+ import re
7
+ import time
8
+ from datetime import timezone, datetime
9
+
10
+ from json import dumps, loads
11
+ import unicodedata
12
+
13
+ import deepl
14
+ import ollama
15
+ import openai
16
+ import requests
17
+ from azure.ai.translation.text import TextTranslationClient
18
+ from azure.core.credentials import AzureKeyCredential
19
+
20
+
21
def remove_control_characters(s):
    """Return *s* with every Unicode control character (category "C*") removed."""
    kept = [ch for ch in s if not unicodedata.category(ch).startswith("C")]
    return "".join(kept)
23
+
24
+
25
class BaseTranslator:
    """Common interface shared by every translation backend.

    Subclasses override :meth:`translate`; the base class only stores the
    service name, the language pair, and the (optional) model identifier.
    """

    def __init__(self, service, lang_out, lang_in, model):
        self.service = service
        self.lang_out = lang_out
        self.lang_in = lang_in
        self.model = model

    def translate(self, text) -> str:
        """Return *text* rendered in ``lang_out``; implemented by subclasses."""

    def __str__(self):
        return "{} {} {}".format(self.service, self.lang_out, self.lang_in)
36
+
37
+
38
class GoogleTranslator(BaseTranslator):
    """Translate by scraping the mobile Google Translate HTML endpoint."""

    def __init__(self, service, lang_out, lang_in, model):
        # "auto" maps to the project defaults: English -> Simplified Chinese.
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        # Old IE user agent keeps the endpoint serving the simple HTML page.
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        truncated = text[:5000]  # google translate max length
        resp = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": truncated},
            headers=self.headers,
        )
        # The translated text sits in a span with class t0 / result-container.
        matches = re.findall(
            r'(?s)class="(?:t0|result-container)">(.*?)<', resp.text
        )
        if resp.status_code == 400:
            translated = "IRREPARABLE TRANSLATION ERROR"
        elif not matches:
            raise ValueError("Empty translation result")
        else:
            translated = html.unescape(matches[0])
        return remove_control_characters(translated)
66
+
67
+
68
class TencentTranslator(BaseTranslator):
    """Translate via Tencent Cloud TMT, signing requests manually with TC3-HMAC-SHA256."""

    def sign(self, key, msg):
        """One HMAC-SHA256 step of the TC3 signature derivation chain."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        server_url = "tmt.tencentcloudapi.com"
        try:
            # Fix: os.getenv() never raises KeyError, so the except below was
            # dead and missing credentials silently became None.  os.environ[]
            # raises, making the helpful error message actually fire.
            self.secret_id = os.environ["TENCENT_SECRET_ID"]
            self.secret_key = os.environ["TENCENT_SECRET_KEY"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.session = requests.Session()
        self.base_link = f"{server_url}"

    def translate(self, text):
        """POST one TextTranslate request; return "" when the response
        carries no TargetText (e.g. an API error payload)."""
        text = text[:5000]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        payloadx = dumps(data)
        hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
        # Canonical request per the TC3-HMAC-SHA256 spec: verb, URI, query,
        # canonical (lower-cased) headers, signed header list, payload hash.
        canonical_request = (
            "POST"
            + "\n"
            + "/"
            + "\n"
            + ""
            + "\n"
            + "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n"
            + "\n"
            + "content-type;host;x-tc-action"
            + "\n"
            + hashed_request_payload
        )

        timestamp = int(time.time())
        date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        hashed_canonical_request = hashlib.sha256(
            canonical_request.encode("utf-8")
        ).hexdigest()
        algorithm = "TC3-HMAC-SHA256"
        string_to_sign = (
            algorithm
            + "\n"
            + str(timestamp)
            + "\n"
            + credential_scope
            + "\n"
            + hashed_canonical_request
        )
        # Derive the signing key: secret -> date -> service -> "tc3_request".
        secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signed_headers = "content-type;host;x-tc-action"
        signature = hmac.new(
            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
        ).hexdigest()
        authorization = (
            algorithm
            + " "
            + "Credential="
            + str(self.secret_id)
            + "/"
            + credential_scope
            + ", "
            + "SignedHeaders="
            + signed_headers
            + ", "
            + "Signature="
            + signature
        )
        self.headers = {
            "Authorization": authorization,
            "Content-Type": "application/json; charset=utf-8",
            "Host": "tmt.tencentcloudapi.com",
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21",
        }

        response = self.session.post(
            "https://" + self.base_link,
            # Fix: send the exact bytes that were signed.  `json=data` would
            # re-serialize the dict, and any byte difference from `payloadx`
            # invalidates the TC3 signature.
            data=payloadx.encode("utf-8"),
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code == 200:
            result = loads(response.text)
        else:
            raise ValueError("HTTP error: " + str(response.status_code))
        # 2. Result test: tolerate error payloads by returning an empty
        # translation instead of raising (deliberate best-effort behavior).
        try:
            result = result["Response"]["TargetText"]
        except KeyError:
            result = ""
        return result
182
+
183
+
184
class DeepLXTranslator(BaseTranslator):
    """Translate via a DeepLX-compatible HTTP endpoint.

    Both environment variables are optional: without ``DEEPLX_AUTH_KEY`` the
    public route is used, and ``DEEPLX_SERVER_URL`` falls back to the hosted
    instance.
    """

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Fix: the original wrapped os.getenv in try/except KeyError, but
        # os.getenv never raises — the handler was dead code and is removed.
        auth_key = os.getenv("DEEPLX_AUTH_KEY")
        # Empty or unset -> hosted default (same truthiness the original used).
        server_url = os.getenv("DEEPLX_SERVER_URL") or "https://api.deeplx.org"

        self.session = requests.Session()
        server_url = str(server_url).rstrip("/")
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Return the ``data`` field of the DeepLX JSON response.

        Raises ValueError on a non-200 status or a response without ``data``.
        """
        text = text[:5000]  # request size cap, mirrors the other backends
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        result = loads(response.text)
        # 2. Result test.  Fix: the original had unreachable statements after
        # the return/raise (a dead `result = ""` and a dead length check);
        # they are removed with no change to reachable behavior.
        try:
            return result["data"]
        except KeyError:
            raise ValueError("No valid key in DeepLX's response")
240
+
241
+
242
class DeepLTranslator(BaseTranslator):
    """Translate via the official DeepL API through the ``deepl`` SDK."""

    def __init__(self, service, lang_out, lang_in, model):
        # DeepL expects upper-case language codes; "auto" maps to EN -> ZH.
        lang_out = "ZH" if lang_out == "auto" else lang_out
        lang_in = "EN" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # NOTE(review): this session is never used — all traffic goes through
        # the deepl SDK client below; presumably kept for parity with the
        # other translators.  TODO confirm before removing.
        self.session = requests.Session()
        auth_key = os.getenv("DEEPL_AUTH_KEY")  # required by deepl.Translator
        server_url = os.getenv("DEEPL_SERVER_URL")  # optional self-hosted URL
        self.client = deepl.Translator(auth_key, server_url=server_url)

    def translate(self, text):
        # Delegate entirely to the SDK; it handles HTTP and raises on errors.
        response = self.client.translate_text(
            text, target_lang=self.lang_out, source_lang=self.lang_in
        )
        return response.text
257
+
258
+
259
class OllamaTranslator(BaseTranslator):
    """Translate with a local LLM served by Ollama."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # temperature=0: random sampling could break the formula markers.
        self.options = {"temperature": 0}
        # Host comes from the OLLAMA_HOST environment variable.
        self.client = ollama.Client()

    def translate(self, text):
        system_message = {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        }
        user_message = {
            "role": "user",
            "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
        }
        reply = self.client.chat(
            model=self.model,
            options=self.options,
            messages=[system_message, user_message],
        )
        return reply["message"]["content"].strip()
284
+
285
+
286
class OpenAITranslator(BaseTranslator):
    """Translate with an OpenAI-compatible chat-completions endpoint."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # temperature=0: random sampling could break the formula markers.
        self.options = {"temperature": 0}
        # Endpoint/credentials come from OPENAI_BASE_URL / OPENAI_API_KEY.
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        system_message = {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        }
        user_message = {
            "role": "user",
            "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
        }
        completion = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[system_message, user_message],
        )
        return completion.choices[0].message.content.strip()
312
+
313
+
314
class AzureTranslator(BaseTranslator):
    """Translate via the Azure AI Translator (Text Translation) service."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-Hans"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)

        try:
            api_key = os.environ["AZURE_APIKEY"]
            endpoint = os.environ["AZURE_ENDPOINT"]
            region = os.environ["AZURE_REGION"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.client = TextTranslationClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
            region=region,
        )

        # Silence the very chatty Azure HTTP logging policy.
        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        logging.getLogger(
            "azure.core.pipeline.policies.http_logging_policy"
        ).setLevel(logging.WARNING)

    def translate(self, text) -> str:
        result = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )
        return result[0].translations[0].text