Spaces:
Running
Running
leonsimon23
committed on
Upload 9 files
Browse files- pdf2zh/__init__.py +6 -0
- pdf2zh/cache.py +91 -0
- pdf2zh/converter.py +456 -0
- pdf2zh/doclayout.py +163 -0
- pdf2zh/gui.py +503 -0
- pdf2zh/high_level.py +99 -0
- pdf2zh/pdf2zh.py +325 -0
- pdf2zh/pdfinterp.py +360 -0
- pdf2zh/translator.py +347 -0
pdf2zh/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""pdf2zh package: exposes the package logger and distribution metadata."""
import logging

# Package-level logger; submodules create their own via logging.getLogger(__name__).
log = logging.getLogger(__name__)

__version__ = "1.8.4"
__author__ = "Byaidu"
|
pdf2zh/cache.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tempfile
import os
import time
import hashlib
import shutil

# Translation cache root under the system temp dir (e.g. /tmp/cache).
cache_dir = os.path.join(tempfile.gettempdir(), "cache")
os.makedirs(cache_dir, exist_ok=True)
# Name of the per-entry file holding that entry's last-update timestamp.
time_filename = "update_time"
# Maximum number of cache entries kept before eviction kicks in.
max_cache = 5
|
11 |
+
|
12 |
+
|
13 |
+
def deterministic_hash(obj):
    """Return a stable 20-hex-character SHA-256 digest of ``str(obj)``."""
    digest = hashlib.sha256(str(obj).encode()).hexdigest()
    return digest[:20]
|
17 |
+
|
18 |
+
|
19 |
+
def get_dirs():
    """List the absolute paths of all subdirectories of the cache root."""
    candidates = (os.path.join(cache_dir, name) for name in os.listdir(cache_dir))
    return [path for path in candidates if os.path.isdir(path)]
|
26 |
+
|
27 |
+
|
28 |
+
def get_time(dir):
    """Return the eviction timestamp stored inside cache directory *dir*.

    A directory without a timestamp file sorts as ``float("-inf")`` so that
    argmin-based eviction in ``remove_extra()`` removes it first (the stated
    intent; the previous ``+inf`` actually made it the LAST to be evicted).
    A present-but-unparsable file raises ``ValueError``, which
    ``remove_extra()`` treats as a corrupt entry and deletes.
    """
    timefile = os.path.join(dir, time_filename)
    try:
        # Context manager closes the handle (the original open() leaked it).
        with open(timefile, encoding="utf-8") as f:
            return float(f.read())
    except FileNotFoundError:
        # No timestamp: treat as the oldest possible entry.
        return float("-inf")
|
38 |
+
|
39 |
+
|
40 |
+
def write_time(dir):
    """Record the current wall-clock time as *dir*'s eviction timestamp."""
    timefile = os.path.join(dir, time_filename)
    t = time.time()
    # Context manager closes the handle deterministically; the original
    # print(..., file=open(...)) leaked the file object.
    with open(timefile, "w", encoding="utf-8") as f:
        f.write(str(t))
|
44 |
+
|
45 |
+
|
46 |
+
def argmin(iterable):
    """Return the index of the smallest value in *iterable* (first on ties)."""
    indexed = enumerate(iterable)
    smallest = min(indexed, key=lambda pair: pair[1])
    return smallest[0]
|
48 |
+
|
49 |
+
|
50 |
+
def remove_extra():
    """Prune the cache: drop entries with corrupt timestamps, then evict the
    oldest entries until at most ``max_cache`` remain."""
    # get_dirs() returns only directories, so the original
    # "if not os.path.isdir(dir): os.remove(dir)" branch was dead code
    # (and os.remove() on a directory would raise anyway) — removed.
    for path in get_dirs():
        try:
            get_time(path)
        except Exception:
            # Timestamp file exists but is unreadable/unparsable: corrupt entry.
            shutil.rmtree(path)
    while True:
        dirs = get_dirs()
        if len(dirs) <= max_cache:
            break
        times = [get_time(d) for d in dirs]
        # Evict the entry with the smallest (oldest) timestamp.
        shutil.rmtree(dirs[argmin(times)])
|
68 |
+
|
69 |
+
|
70 |
+
def is_cached(hash_key):
    """Return True if a cache directory already exists for *hash_key*."""
    entry = os.path.join(cache_dir, hash_key)
    return os.path.exists(entry)
|
73 |
+
|
74 |
+
|
75 |
+
def create_cache(hash_key):
    """Ensure the cache directory for *hash_key* exists and refresh its timestamp."""
    entry = os.path.join(cache_dir, hash_key)
    os.makedirs(entry, exist_ok=True)
    write_time(entry)
|
79 |
+
|
80 |
+
|
81 |
+
def load_paragraph(hash_key, hash_key_paragraph):
    """Return the cached translation for a paragraph, or None on a cache miss."""
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # EAFP with a context manager: the original leaked the file handle and
    # had an exists()/open() race.
    try:
        with open(filename, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None
|
87 |
+
|
88 |
+
|
89 |
+
def write_paragraph(hash_key, hash_key_paragraph, paragraph):
    """Persist a translated *paragraph* under the cache entry for *hash_key*."""
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # Context manager closes the handle; the original
    # print(..., file=open(...)) leaked the file object.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(paragraph)
|
pdf2zh/converter.py
ADDED
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
2 |
+
from pdfminer.pdffont import PDFFont, PDFCIDFont
|
3 |
+
from pdfminer.converter import PDFConverter
|
4 |
+
from pdfminer.pdffont import PDFUnicodeNotDefined
|
5 |
+
from pdfminer.utils import apply_matrix_pt, mult_matrix
|
6 |
+
from pdfminer.layout import (
|
7 |
+
LTChar,
|
8 |
+
LTFigure,
|
9 |
+
LTLine,
|
10 |
+
LTPage,
|
11 |
+
)
|
12 |
+
import logging
|
13 |
+
import re
|
14 |
+
import concurrent.futures
|
15 |
+
import numpy as np
|
16 |
+
import unicodedata
|
17 |
+
from tenacity import retry, wait_fixed
|
18 |
+
from pdf2zh import cache
|
19 |
+
from pdf2zh.translator import (
|
20 |
+
BaseTranslator,
|
21 |
+
GoogleTranslator,
|
22 |
+
DeepLTranslator,
|
23 |
+
DeepLXTranslator,
|
24 |
+
OllamaTranslator,
|
25 |
+
OpenAITranslator,
|
26 |
+
AzureTranslator,
|
27 |
+
TencentTranslator,
|
28 |
+
)
|
29 |
+
from pymupdf import Font
|
30 |
+
|
31 |
+
log = logging.getLogger(__name__)
|
32 |
+
|
33 |
+
|
34 |
+
class PDFConverterEx(PDFConverter):
    """pdfminer ``PDFConverter`` with hooks overridden so that each page and
    figure returns its layout via ``receive_layout`` as a content stream,
    and each ``LTChar`` carries its original ``cid`` and font."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)

    def begin_page(self, page, ctm) -> None:
        # Override: replace the mediabox with the CTM-transformed cropbox.
        (x0, y0, x1, y1) = page.cropbox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, page):
        # Override: return the rendered instruction stream for the page.
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        # Override: propagate the enclosing page's pageid onto the figure.
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item.pageid = self._stack[-1].pageid

    def end_figure(self, _: str) -> None:
        # Override: return the rendered instruction stream for the figure.
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        # Override: build the LTChar as pdfminer does, then stash the original
        # character code and font on it for later re-typesetting.
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack: record the original character code
        item.font = font  # hack: record the original font
        return item.adv
|
102 |
+
|
103 |
+
|
104 |
+
class Paragraph:
    """Geometry and typography snapshot of one source-document paragraph."""

    def __init__(self, y, x, x0, x1, size, font, brk):
        self.y = y        # initial vertical coordinate
        self.x = x        # initial horizontal coordinate
        self.x0 = x0      # left boundary
        self.x1 = x1      # right boundary
        self.size = size  # font size
        self.font = font  # paragraph font (pdfminer PDFFont)
        self.brk = brk    # True if the source paragraph wraps onto a new line
|
113 |
+
|
114 |
+
|
115 |
+
# fmt: off
class TranslateConverter(PDFConverterEx):
    """Converter that re-typesets each page: parses paragraphs and inline
    formulas out of the layout, translates the paragraph text (with caching
    and a thread pool), and emits a fresh PDF content stream (``BT ... ET``).

    Formulas are protected from translation by replacing them with ``$vN$``
    placeholders that are re-inserted verbatim during typesetting.
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str = None,     # regex overriding formula-font detection
        vchar: str = None,     # regex overriding formula-char detection
        thread: int = 0,       # translation worker-thread count
        layout={},             # pageid -> per-pixel layout class array
        lang_in: str = "",
        lang_out: str = "",
        service: str = "",     # "name" or "name:model" selecting the translator
        resfont: str = "",     # fallback (non-Latin) font id
        noto: Font = None,     # pymupdf Noto font used when resfont == "noto"
    ) -> None:
        super().__init__(rsrcmgr)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        self.layout = layout
        self.resfont = resfont
        self.noto = noto
        self.translator: BaseTranslator = None
        # service is "name" or "name:model"; param[1] is the model when given.
        param = service.split(":", 1)
        if param[0] == "google":
            self.translator = GoogleTranslator(service, lang_out, lang_in, None)
        elif param[0] == "deepl":
            self.translator = DeepLTranslator(service, lang_out, lang_in, None)
        elif param[0] == "deeplx":
            self.translator = DeepLXTranslator(service, lang_out, lang_in, None)
        elif param[0] == "ollama":
            self.translator = OllamaTranslator(service, lang_out, lang_in, param[1])
        elif param[0] == "openai":
            self.translator = OpenAITranslator(service, lang_out, lang_in, param[1])
        elif param[0] == "azure":
            self.translator = AzureTranslator(service, lang_out, lang_in, None)
        elif param[0] == "tencent":
            self.translator = TencentTranslator(service, lang_out, lang_in, None)
        else:
            raise ValueError("Unsupported translation service")

    def receive_layout(self, ltpage: LTPage):
        # Paragraphs
        sstk: list[str] = []            # paragraph text stack
        pstk: list[Paragraph] = []      # paragraph attribute stack
        vbkt: int = 0                   # formula bracket nesting counter
        # Current formula group
        vstk: list[LTChar] = []         # formula glyph group
        vlstk: list[LTLine] = []        # formula line group
        vfix: float = 0                 # formula vertical offset
        # Formula group stacks
        var: list[list[LTChar]] = []    # formula glyph group stack
        varl: list[list[LTLine]] = []   # formula line group stack
        varf: list[float] = []          # formula vertical offset stack
        vlen: list[float] = []          # formula width stack
        # Globals
        lstk: list[LTLine] = []         # global line stack
        xt: LTChar = None               # previous character
        xt_cls: int = -1                # layout class of the previous character
        vmax: float = ltpage.width / 4  # maximum width of an inline formula
        ops: str = ""                   # rendered result

        def vflag(font: str, char: str):  # detect formula (and sub/superscript) fonts
            font = font.split("+")[-1]    # strip the subset prefix from the font name
            if re.match(r"\(cid:", char):
                return True
            # decision based on font-name rules
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(  # LaTeX fonts
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # decision based on character-category rules
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "  # not a space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]  # modifier letters, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)  # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. parse the source document
        for child in ltpage:
            if isinstance(child, LTChar):
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure-local height, so use layout.shape consistently
                h, w = layout.shape
                # look up the layout class of the current character
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if (  # does the current character belong to a formula?
                    cls == 0  # 1. class 0 marks a protected region
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. sub/superscript: scripts are ~0.76 and capitals ~0.799 of body size, 0.79 splits them while tolerating an enlarged initial letter
                    or vflag(child.fontname, child.get_text())  # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)  # 4. vertical writing
                ):
                    cur_v = True
                # does a bracket group belong to the formula?
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (  # has the current formula ended?
                    not cur_v  # 1. the current character is not part of a formula
                    or cls != xt_cls  # 2. the current character belongs to a different paragraph
                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)  # 3. in-paragraph line break: a long italic run or a wrapped fraction — a width threshold tells them apart
                ):
                    if vstk:
                        if (  # fix the formula's vertical offset using the text to its right
                            not cur_v  # 1. the current character is not part of a formula
                            and cls == xt_cls  # 2. the current character is in the same paragraph
                            and child.x0 > max([vch.x0 for vch in vstk])  # 3. the current character sits to the right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        sstk[-1] += f"$v{len(var)}$"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # the current character is plain text, or it starts a new formula
                if not vstk:
                    if cls == xt_cls:  # same paragraph as the previous character
                        if child.x0 > xt.x1 + 1:  # add an in-line space
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:  # add a wrap space and mark that the source paragraph wraps
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:  # start a new paragraph from the current character
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
                if not cur_v:  # push text
                    if (  # update the paragraph attributes from the current character
                        child.size > pstk[-1].size / 0.79  # 1. the character is markedly larger than the paragraph font
                        or len(sstk[-1].strip()) == 1  # 2. the character is the paragraph's second glyph (enlarged initial letter)
                        or vflag(pstk[-1].font.fontname, "")  # 3. the paragraph font is a formula font
                        or re.match(  # 4. the paragraph font is bold
                            r"(.*Medi|.*Bold)",
                            pstk[-1].font.fontname,
                            re.IGNORECASE,
                        )
                    ):
                        pstk[-1].y -= child.size - pstk[-1].size  # hack: this vertical correction is imperfect, but good enough for now
                        pstk[-1].size = child.size
                        pstk[-1].font = child.font
                    sstk[-1] += child.get_text()
                else:  # push formula
                    if (  # fix the formula's vertical offset using the text to its left
                        not vstk  # 1. this is the formula's first character
                        and cls == xt_cls  # 2. same paragraph as the previous character
                        and child.x0 > xt.x0  # 3. the previous character sits to the left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # update paragraph bounds; a wrapped line may start with a formula, so do this outside the branches above
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # remember the previous character
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):  # figure
                pass
            elif isinstance(child, LTLine):  # line
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure-local height, so use layout.shape consistently
                h, w = layout.shape
                # look up the layout class of the current line
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:  # formula line
                    vlstk.append(child)
                else:  # global line
                    lstk.append(child)
            else:
                pass
        # handle the tail
        if vstk:  # pop the last open formula
            sstk[-1] += f"$v{len(var)}$"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for id, v in enumerate(var):  # compute formula widths
            l = max([vch.x1 for vch in v]) - v[0].x0
            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
            vlen.append(l)

        ############################################################
        # B. translate the paragraphs
        log.debug("\n==========[SSTACK]==========\n")
        hash_key = cache.deterministic_hash("PDFMathTranslate")
        cache.create_cache(hash_key)

        @retry(wait=wait_fixed(1))
        def worker(s: str):  # multithreaded translation
            try:
                hash_key_paragraph = cache.deterministic_hash(
                    (s, str(self.translator))
                )
                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # cache lookup
                if new is None:
                    new = self.translator.translate(s)
                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
                return new
            except BaseException as e:
                if log.isEnabledFor(logging.DEBUG):
                    log.exception(e)
                else:
                    log.exception(e, exc_info=False)
                raise e
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.thread
        ) as executor:
            news = list(executor.map(worker, sstk))

        ############################################################
        # C. typeset the new document
        def raw_string(fcur: str, cstk: str):  # hex-encode a string for the content stream
            if fcur == 'noto':
                return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # choose the encoding width
                return "".join(["%04x" % ord(c) for c in cstk])
            else:
                return "".join(["%02x" % ord(c) for c in cstk])

        _x, _y = 0, 0
        for id, new in enumerate(news):
            x: float = pstk[id].x  # paragraph initial x
            y: float = pstk[id].y  # paragraph top edge
            x0: float = pstk[id].x0  # paragraph left edge
            x1: float = pstk[id].x1  # paragraph right edge
            size: float = pstk[id].size  # paragraph font size
            font: PDFFont = pstk[id].font  # paragraph font
            brk: bool = pstk[id].brk  # paragraph wrap flag
            cstk: str = ""  # current text buffer
            fcur: str = None  # current font id
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[id]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
                )  # match a $vn$ formula placeholder; the leading $ is sometimes dropped by the translator
                mod = 0  # glyph-modifier width
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception:
                        continue  # the translator may invent an out-of-range placeholder
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # glyph modifier
                        mod = var[vid][-1].width
                else:  # load text
                    ch = new[ptr]
                    fcur_ = None
                    # the original font's encoding is unreliable, so it is not reused
                    # try:
                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
                    #         fcur_=self.fontid[font]  # original font
                    # except:
                    #     pass
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # default non-Latin font
                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (  # flush the text buffer
                    fcur_ != fcur  # 1. the font changed
                    or vy_regex  # 2. a formula is being inserted
                    or x + adv > x1 + 0.1 * size  # 3. reached the right edge (a whole line may be placeholders, so allow for floating-point error)
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # reached the right edge and the source paragraph wraps
                    x = x0
                    lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
                    y -= size * lang_space.get(self.translator.lang_out, 1.1)  # 1.1 suits most other languages
                if vy_regex:  # insert the formula
                    fix = 0
                    if fcur is not None:  # in-paragraph formulas get the vertical-offset correction
                        fix = varf[vid]
                    for vch in var[vid]:  # typeset the formula's glyphs
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for l in varl[vid]:  # typeset the formula's lines
                        if l.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
                else:  # append to the text buffer
                    if not cstk:  # start of a line
                        tx = x
                        if x == x0 and ch == " ":  # drop the wrap space at the start of a line
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod  # glyph-modifier width
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # handle the tail of the paragraph
            if cstk:
                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
        for l in lstk:  # typeset the global lines
            if l.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops
|
pdf2zh/doclayout.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import abc
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
import ast
|
5 |
+
import onnx
|
6 |
+
import onnxruntime
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
|
9 |
+
|
10 |
+
class DocLayoutModel(abc.ABC):
    """Abstract interface for document-layout detection backends."""

    @staticmethod
    def load_onnx():
        """Load the DocLayout-YOLO DocStructBench ONNX model from the Hugging Face Hub."""
        model = OnnxModel.from_pretrained(
            repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx",
            filename="doclayout_yolo_docstructbench_imgsz1024.onnx",
        )
        return model

    @staticmethod
    def load_available():
        """Return the first available backend (currently ONNX only)."""
        return DocLayoutModel.load_onnx()

    @property
    @abc.abstractmethod
    def stride(self) -> int:
        """Stride of the model input."""
        pass

    @abc.abstractmethod
    def predict(self, image, imgsz=1024, **kwargs) -> list:
        """
        Predict the layout of a document page.

        Args:
            image: The image of the document page.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            **kwargs: Additional arguments.
        """
        pass
|
40 |
+
|
41 |
+
|
42 |
+
class YoloResult:
    """Detections for one image, sorted by descending confidence."""

    def __init__(self, boxes, names):
        parsed = [YoloBox(data=row) for row in boxes]
        parsed.sort(key=lambda box: box.conf, reverse=True)
        self.boxes = parsed
        self.names = names  # class-id -> class-name mapping
|
49 |
+
|
50 |
+
|
51 |
+
class YoloBox:
    """One detection row laid out as [x1, y1, x2, y2, ..., conf, cls]."""

    def __init__(self, data):
        # Coordinates come first; confidence and class id are the last two slots.
        self.xyxy = data[:4]
        self.conf, self.cls = data[-2], data[-1]
|
58 |
+
|
59 |
+
|
60 |
+
class OnnxModel(DocLayoutModel):
    """DocLayout-YOLO layout detector backed by ONNX Runtime."""

    def __init__(self, model_path: str):
        self.model_path = model_path

        model = onnx.load(model_path)
        # The exporter stores the stride and class names as model metadata.
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])

        self.model = onnxruntime.InferenceSession(model.SerializeToString())

    @staticmethod
    def from_pretrained(repo_id: str, filename: str):
        """Download *filename* from the Hub repository *repo_id* and load it."""
        pth = hf_hub_download(repo_id=repo_id, filename=filename)
        return OnnxModel(pth)

    @property
    def stride(self):
        # Input-alignment stride read from the model metadata.
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image

        Note: the padding alignment stride comes from ``self.stride``
        (model metadata), not from a parameter.
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio (preserve aspect ratio)
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding (gray 114 per YOLO convention)
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2).
            Note: *boxes* is also modified in place.
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=1024, **kwargs):
        """Run layout detection on one page image; returns ``[YoloResult]``."""
        # Preprocess input image
        orig_h, orig_w = image.shape[:2]
        pix = self.resize_and_pad_image(image, new_shape=imgsz)
        pix = np.transpose(pix, (2, 0, 1))  # CHW
        pix = np.expand_dims(pix, axis=0)  # BCHW
        pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
        new_h, new_w = pix.shape[2:]

        # Run inference
        preds = self.model.run(None, {"images": pix})[0]

        # Postprocess predictions: keep detections above 0.25 confidence,
        # then map boxes back to the original image coordinates.
        preds = preds[preds[..., 4] > 0.25]
        preds[..., :4] = self.scale_boxes(
            (new_h, new_w), preds[..., :4], (orig_h, orig_w)
        )
        return [YoloResult(boxes=preds, names=self._names)]
|
pdf2zh/gui.py
ADDED
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from pathlib import Path
|
4 |
+
from pdf2zh import __version__
|
5 |
+
from pdf2zh.pdf2zh import extract_text
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import numpy as np
|
9 |
+
import pymupdf
|
10 |
+
import tqdm
|
11 |
+
import requests
|
12 |
+
import cgi
|
13 |
+
|
14 |
+
# Map UI service names to (pdf2zh service id, API-key env var, default model id).
# A None env var means the service needs no key; a None model means no model box.
service_map = {
    "Google": ("google", None, None),
    "DeepL": ("deepl", "DEEPL_AUTH_KEY", None),
    "DeepLX": ("deeplx", "DEEPLX_AUTH_KEY", None),
    "Ollama": ("ollama", None, "gemma2"),
    "OpenAI": ("openai", "OPENAI_API_KEY", "gpt-4o"),
    "Azure": ("azure", "AZURE_APIKEY", None),
    "Tencent": ("tencent", "TENCENT_SECRET_KEY", None),
}
# UI language label -> language code handed to the translation backend.
lang_map = {
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "German": "de",
    "Japanese": "ja",
    "Korean": "ko",
    "Russian": "ru",
    "Spanish": "es",
    "Italian": "it",
}
# UI page-range label -> explicit list of 0-based page indices (None = all pages).
page_map = {
    "All": None,
    "First": [0],
    "First 5 pages": list(range(0, 5)),
}

# Demo mode (PDF2ZH_DEMO env var set): restrict services and page counts,
# and require reCAPTCHA verification before translating.
flag_demo = False
if os.environ.get("PDF2ZH_DEMO"):
    flag_demo = True
    service_map = {
        "Google": ("google", None, None),
    }
    page_map = {
        "First": [0],
        "First 20 pages": list(range(0, 20)),
    }
    # reCAPTCHA site key (browser side) and secret key (server side).
    # NOTE(review): assumed to belong inside the demo branch — the rendered
    # source lost indentation; confirm against upstream.
    client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
    server_key = os.environ.get("PDF2ZH_SERVER_KEY")
53 |
+
|
54 |
+
|
55 |
+
def verify_recaptcha(response):
    """Validate a reCAPTCHA token against Google's siteverify endpoint."""
    endpoint = "https://www.google.com/recaptcha/api/siteverify"

    print("reCAPTCHA", server_key, response)

    payload = {"secret": server_key, "response": response}
    outcome = requests.post(endpoint, data=payload).json()

    print("reCAPTCHA", outcome.get("success"))

    return outcome.get("success")
|
66 |
+
|
67 |
+
|
68 |
+
def pdf_preview(file):
    """Render the first page of *file* as an RGB ndarray of shape (h, w, 3)."""
    first_page = pymupdf.open(file)[0]
    pix = first_page.get_pixmap()
    return np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
|
74 |
+
|
75 |
+
|
76 |
+
def upload_file(file, service, progress=gr.Progress()):
    """Handle file upload, validation, and initial preview.

    Returns (path, preview image) on success, (None, None) when the file is
    missing or cannot be rendered.
    """
    if file and os.path.exists(file):
        try:
            # Convert first page for preview
            return file, pdf_preview(file)
        except Exception as e:
            print(f"Error converting PDF: {e}")
    return None, None
|
89 |
+
|
90 |
+
|
91 |
+
def download_with_limit(url, save_path, size_limit):
    """Stream *url* into the *save_path* directory, enforcing a byte limit.

    The file name comes from the Content-Disposition header when present,
    otherwise from the last path component of the URL.  Raises gr.Error if
    *size_limit* (bytes, may be None for unlimited) is exceeded.  Returns
    the path of the written file.
    """
    # cgi.parse_header was deprecated in Python 3.11 and removed in 3.13;
    # email.message parses the same RFC 2045-style header parameters.
    from email.message import Message

    chunk_size = 1024
    total_size = 0
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        content = response.headers.get("Content-Disposition")
        try:
            msg = Message()
            msg["content-disposition"] = content
            filename = msg.get_filename()
            if not filename:
                raise ValueError("no filename in Content-Disposition")
        except Exception:
            filename = os.path.basename(url)
        with open(save_path / filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                total_size += len(chunk)
                if size_limit and total_size > size_limit:
                    raise gr.Error("Exceeds file size limit")
                file.write(chunk)
    return save_path / filename
|
109 |
+
|
110 |
+
|
111 |
+
def translate(
    file_type,
    file_input,
    link_input,
    service,
    apikey,
    model_id,
    lang_from,
    lang_to,
    page_range,
    recaptcha_response,
    progress=gr.Progress(),
):
    """Translate PDF content using selected service.

    Returns (zh path, preview image, dual path, 3x gr.update(visible=True))
    for the output widgets; raises gr.Error on any validation failure.
    """
    # Demo deployments gate translation behind reCAPTCHA.
    if flag_demo and not verify_recaptcha(recaptcha_response):
        raise gr.Error("reCAPTCHA fail")

    progress(0, desc="Starting translation...")

    output = Path("pdf2zh_files")
    output.mkdir(parents=True, exist_ok=True)

    if file_type == "File":
        if not file_input:
            raise gr.Error("No input")
        # shutil.copy returns the destination path inside `output`.
        file_path = shutil.copy(file_input, output)
    else:
        if not link_input:
            raise gr.Error("No input")
        file_path = download_with_limit(
            link_input,
            output,
            5 * 1024 * 1024 if flag_demo else None,  # 5 MB cap in demo mode
        )

    filename = os.path.splitext(os.path.basename(file_path))[0]
    # NOTE(review): `filename` is computed but unused — the literal
    # "(unknown)" stems below look like template damage.  They do match the
    # output names written by pdf2zh.extract_text, so they are kept as-is;
    # confirm against the original source.
    file_en = output / f"(unknown).pdf"
    file_zh = output / f"(unknown)-zh.pdf"
    file_dual = output / f"(unknown)-dual.pdf"

    selected_service = service_map[service][0]
    if service_map[service][1]:
        # setdefault: a key already exported in the environment wins over
        # the value typed into the textbox.
        os.environ.setdefault(service_map[service][1], apikey)
    selected_page = page_map[page_range]
    lang_from = lang_map[lang_from]
    lang_to = lang_map[lang_to]
    if selected_service == "google":
        # Google Translate expects zh-CN rather than bare zh.
        lang_from = "zh-CN" if lang_from == "zh" else lang_from
        lang_to = "zh-CN" if lang_to == "zh" else lang_to

    print(f"Files before translation: {os.listdir(output)}")

    def progress_bar(t: tqdm.tqdm):
        # Bridge the backend's tqdm callback into the Gradio progress bar.
        progress(t.n / t.total, desc="Translating...")

    param = {
        "files": [file_en],
        "pages": selected_page,
        "lang_in": lang_from,
        "lang_out": lang_to,
        "service": f"{selected_service}:{model_id}",
        "output": output,
        "thread": 4,
        "callback": progress_bar,
    }
    print(param)
    extract_text(**param)
    print(f"Files after translation: {os.listdir(output)}")

    if not file_zh.exists() or not file_dual.exists():
        raise gr.Error("No output")

    try:
        translated_preview = pdf_preview(str(file_zh))
    except Exception:
        raise gr.Error("No preview")

    progress(1.0, desc="Translation complete!")

    return (
        str(file_zh),
        translated_preview,
        str(file_dual),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )
|
198 |
+
|
199 |
+
|
200 |
+
# Global setup
# Custom Gradio theme palette (Arco-Design-style blues).
custom_blue = gr.themes.Color(
    c50="#E8F3FF",
    c100="#BEDAFF",
    c200="#94BFFF",
    c300="#6AA1FF",
    c400="#4080FF",
    c500="#165DFF",  # Primary color
    c600="#0E42D2",
    c700="#0A2BA6",
    c800="#061D79",
    c900="#03114D",
    c950="#020B33",
)

# Top-level Gradio UI.  In demo mode a reCAPTCHA script is injected via
# `head`; the token is delivered through the hidden `recaptcha_response`
# textbox by the `onVerify` JS callback.
# NOTE(review): exact whitespace inside the css/head string literals could
# not be recovered from the rendered source — confirm against upstream.
with gr.Blocks(
    title="PDFMathTranslate - PDF Translation with preserved formats",
    theme=gr.themes.Default(
        primary_hue=custom_blue, spacing_size="md", radius_size="lg"
    ),
    css="""
    .secondary-text {color: #999 !important;}
    footer {visibility: hidden}
    .env-warning {color: #dd5500 !important;}
    .env-success {color: #559900 !important;}

    /* Add dashed border to input-file class */
    .input-file {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        # background-color: #ffffff !important;
        transition: background-color 0.4s ease-out;
    }

    .input-file:hover {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        color: #165DFF !important;
        background-color: #E8F3FF !important;
        transition: background-color 0.2s ease-in;
    }

    .progress-bar-wrap {
        border-radius: 8px !important;
    }
    .progress-bar {
        border-radius: 8px !important;
    }

    # .input-file label {
    #     color: #165DFF !important;
    #     border: 1.2px dashed #165DFF !important;
    #     border-left: none !important;
    #     border-top: none !important;
    # }
    # .input-file .wrap {
    #     color: #165DFF !important;
    # }
    # .input-file .or {
    #     color: #165DFF !important;
    # }
    """,
    head=(
        """
        <script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
        <script type="text/javascript">
        var onVerify = function(token) {
            el=document.getElementById('verify').getElementsByTagName('textarea')[0];
            el.value=token;
            el.dispatchEvent(new Event('input'));
        };
        </script>
        """
        if flag_demo
        else ""
    ),
) as demo:
    gr.Markdown(
        "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
    )

    with gr.Row():
        # Left column: inputs and options.
        with gr.Column(scale=1):
            gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
            file_type = gr.Radio(
                choices=["File", "Link"],
                label="Type",
                value="File",
            )
            file_input = gr.File(
                label="File",
                file_count="single",
                file_types=[".pdf"],
                type="filepath",
                elem_classes=["input-file"],
            )
            link_input = gr.Textbox(
                label="Link",
                visible=False,
                interactive=True,
            )
            gr.Markdown("## Option")
            with gr.Row():
                service = gr.Dropdown(
                    label="Service",
                    choices=service_map.keys(),
                    value="Google",
                )
                apikey = gr.Textbox(
                    label="API Key",
                    max_lines=1,
                    visible=False,
                )
            with gr.Row():
                lang_from = gr.Dropdown(
                    label="Translate from",
                    choices=lang_map.keys(),
                    value="English",
                )
                lang_to = gr.Dropdown(
                    label="Translate to",
                    choices=lang_map.keys(),
                    value="Chinese",
                )
            page_range = gr.Radio(
                choices=page_map.keys(),
                label="Pages",
                value=list(page_map.keys())[0],
            )
            model_id = gr.Textbox(
                label="Model ID",
                visible=False,
                interactive=True,
            )
            # Default status HTML shown in the "Technical details" panel.
            envs_status = "<span class='env-success'>- Properly configured.</span><br>"

            def details_wrapper(text_markdown):
                # Wrap a status fragment with the static project/version info.
                text = f"""
                <summary>Technical details</summary>
                {text_markdown}
                - GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>
                - GUI by: <a href="https://github.com/reycn">Rongxin</a><br>
                - Version: {__version__}
                """
                return text

            def env_var_checker(env_var_name: str) -> str:
                # Build the status HTML for the selected service's key env var.
                if env_var_name:
                    if not os.environ.get(env_var_name):
                        envs_status = (
                            f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
                            + "</span><br>- Please make sure that the environment variables are properly configured "
                            + "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
                        )
                    else:
                        # Show only a prefix of the configured secret.
                        value = str(os.environ.get(env_var_name))
                        envs_status = "<span class='env-success'>- Properly configured.</span><br>"
                        envs_status += (
                            f"- {env_var_name}: <code>{value[:13]}***</code><br>"
                        )
                else:
                    envs_status = (
                        "<span class='env-success'>- Properly configured.</span><br>"
                    )
                return details_wrapper(envs_status)

            def on_select_service(service, evt: gr.EventData):
                # Toggle the API-key box (pre-filled from the environment) and
                # the model-id box depending on the selected service.
                if service_map[service][1]:
                    apikey_content = gr.update(
                        visible=True, value=os.environ.get(service_map[service][1])
                    )
                else:
                    apikey_content = gr.update(visible=False)
                if service_map[service][2]:
                    model_visibility = gr.update(
                        visible=True, value=service_map[service][2]
                    )
                else:
                    model_visibility = gr.update(visible=False)
                return (
                    env_var_checker(service_map[service][1]),
                    model_visibility,
                    apikey_content,
                )

            def on_select_filetype(file_type):
                # Show exactly one of the File / Link inputs.
                return (
                    gr.update(visible=file_type == "File"),
                    gr.update(visible=file_type == "Link"),
                )

            # Output widgets are hidden until a translation succeeds.
            output_title = gr.Markdown("## Translated", visible=False)
            output_file = gr.File(label="Download Translation", visible=False)
            output_file_dual = gr.File(
                label="Download Translation (Dual)", visible=False
            )
            # Hidden textbox filled by the reCAPTCHA JS callback (demo mode).
            recaptcha_response = gr.Textbox(
                label="reCAPTCHA Response", elem_id="verify", visible=False
            )
            recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
            translate_btn = gr.Button("Translate", variant="primary")
            tech_details_tog = gr.Markdown(
                details_wrapper(envs_status),
                elem_classes=["secondary-text"],
            )
            service.select(
                on_select_service, service, [tech_details_tog, model_id, apikey]
            )
            file_type.select(
                on_select_filetype,
                file_type,
                [file_input, link_input],
                js=(
                    f"""
                    (a,b)=>{{
                        try{{
                            grecaptcha.render('recaptcha-box',{{
                                'sitekey':'{client_key}',
                                'callback':'onVerify'
                            }});
                        }}catch(error){{}}
                        return [a];
                    }}
                    """
                    if flag_demo
                    else ""
                ),
            )

        # Right column: preview of the first page.
        with gr.Column(scale=2):
            gr.Markdown("## Preview")
            preview = gr.Image(label="Document Preview", visible=True)

    # Event handlers
    file_input.upload(
        upload_file,
        inputs=[file_input, service],
        outputs=[file_input, preview],
        js=(
            f"""
            (a,b)=>{{
                try{{
                    grecaptcha.render('recaptcha-box',{{
                        'sitekey':'{client_key}',
                        'callback':'onVerify'
                    }});
                }}catch(error){{}}
                return [a];
            }}
            """
            if flag_demo
            else ""
        ),
    )

    translate_btn.click(
        translate,
        inputs=[
            file_type,
            file_input,
            link_input,
            service,
            apikey,
            model_id,
            lang_from,
            lang_to,
            page_range,
            recaptcha_response,
        ],
        outputs=[
            output_file,
            preview,
            output_file_dual,
            output_file,
            output_file_dual,
            output_title,
        ],
    ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
|
478 |
+
|
479 |
+
|
480 |
+
def setup_gui(share=False):
    """Launch the Gradio app, falling back to narrower bind addresses on failure."""
    if flag_demo:
        demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
        return
    try:
        demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
        return
    except Exception:
        print(
            "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
        )
    try:
        demo.launch(
            server_name="127.0.0.1", debug=True, inbrowser=True, share=share
        )
    except Exception:
        print(
            "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
        )
        # Last resort: let Gradio pick the address and force a share link.
        demo.launch(debug=True, inbrowser=True, share=True)
|
499 |
+
|
500 |
+
|
501 |
+
# For auto-reloading while developing
if __name__ == "__main__":
    setup_gui()
|
pdf2zh/high_level.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Functions that can be used for the most common use-cases for pdf2zh.six"""
|
2 |
+
|
3 |
+
from typing import BinaryIO
|
4 |
+
import numpy as np
|
5 |
+
import tqdm
|
6 |
+
from pymupdf import Document
|
7 |
+
from pdfminer.pdfpage import PDFPage
|
8 |
+
from pdfminer.pdfinterp import PDFResourceManager
|
9 |
+
from pdfminer.pdfdocument import PDFDocument
|
10 |
+
from pdfminer.pdfparser import PDFParser
|
11 |
+
from pdf2zh.converter import TranslateConverter
|
12 |
+
from pdf2zh.pdfinterp import PDFPageInterpreterEx
|
13 |
+
from pymupdf import Font
|
14 |
+
|
15 |
+
|
16 |
+
def extract_text_to_fp(
    inf: BinaryIO,
    pages=None,
    password: str = "",
    debug: bool = False,
    page_count: int = 0,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_en: Document = None,
    model=None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    resfont: str = "",
    noto: Font = None,
    callback: object = None,
    **kwarg,
) -> dict:
    """Run layout analysis + translation over the PDF in *inf*.

    For each selected page, a layout model is applied to a rasterized render
    of the page to mask regions that must not be translated (figures, tables,
    formulas, ...); the resulting mask is shared with the converter via
    ``layout``.  Returns ``obj_patch``, a mapping of xref -> rewritten
    content-stream text that the caller applies to the document.

    NOTE: the original annotated ``-> None`` but has always returned
    ``obj_patch``; the annotation is corrected here (annotation-only change).
    """
    rsrcmgr = PDFResourceManager()
    layout = {}
    device = TranslateConverter(
        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
    )

    assert device is not None
    obj_patch = {}
    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
    if pages:
        total_pages = len(pages)
    else:
        total_pages = page_count

    parser = PDFParser(inf)
    doc = PDFDocument(parser, password=password)
    with tqdm.tqdm(
        enumerate(PDFPage.create_pages(doc)),
        total=total_pages,
    ) as progress:
        for pageno, page in progress:
            if pages and (pageno not in pages):
                continue
            if callback:
                callback(progress)
            page.pageno = pageno
            pix = doc_en[page.pageno].get_pixmap()
            # BUG FIX: np.fromstring was deprecated for binary input and is
            # removed in NumPy 2.0; np.frombuffer is the exact replacement
            # (and what gui.pdf_preview already uses).
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height, pix.width, 3
            )[:, :, ::-1]  # RGB -> BGR for the layout model
            page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]
            # Instead of a kd-tree, rasterize the boxes into a bitmap and
            # trade memory for lookup speed.
            box = np.ones((pix.height, pix.width))
            h, w = box.shape
            # Classes whose regions must be left untranslated.
            vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
            # First pass: mark translatable boxes with a unique id (i + 2).
            for i, d in enumerate(page_layout.boxes):
                if not page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = i + 2
            # Second pass: zero out protected regions so they override overlaps.
            for i, d in enumerate(page_layout.boxes):
                if page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = 0
            layout[page.pageno] = box
            # Allocate a fresh xref to hold the rewritten content stream
            # (hack: attach the new xref to the page object).
            page.page_xref = doc_en.get_new_xref()
            doc_en.update_object(page.page_xref, "<<>>")
            doc_en.update_stream(page.page_xref, b"")
            doc_en[page.pageno].set_contents(page.page_xref)
            interpreter.process_page(page)

    device.close()
    return obj_patch
|
pdf2zh/pdf2zh.py
ADDED
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""A command line tool for extracting text and images from PDF and
|
3 |
+
output it to plain text, html, xml or tags.
|
4 |
+
"""
|
5 |
+
|
6 |
+
from __future__ import annotations
|
7 |
+
|
8 |
+
import argparse
|
9 |
+
import os
|
10 |
+
import sys
|
11 |
+
import logging
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import Any, Container, Iterable, List, Optional
|
14 |
+
import urllib.request
|
15 |
+
from pdfminer.pdfexceptions import PDFValueError
|
16 |
+
|
17 |
+
import pymupdf
|
18 |
+
import requests
|
19 |
+
import tempfile
|
20 |
+
|
21 |
+
from pdf2zh import __version__, log
|
22 |
+
from pdf2zh.high_level import extract_text_to_fp
|
23 |
+
from pdf2zh.doclayout import DocLayoutModel
|
24 |
+
|
25 |
+
logging.basicConfig()

# Layout-analysis model used to mask figures/tables/formulas before translation.
model = DocLayoutModel.load_available()

# Target-language code -> CJK font name bundled with pymupdf.
resfont_map = {
    "zh-CN": "china-ss",
    "zh-TW": "china-ts",
    "ja": "japan-s",
    "ko": "korea-s",
}
# Languages rendered with the downloadable Go Noto Universal fallback font
# (CJK languages are handled by resfont_map above instead).
noto_list = [
    "am",  # Amharic
    "ar",  # Arabic
    "bn",  # Bengali
    "bg",  # Bulgarian
    "chr",  # Cherokee
    "el",  # Greek
    "gu",  # Gujarati
    "iw",  # Hebrew
    "hi",  # Hindi
    # "ja", # Japanese
    "kn",  # Kannada
    # "ko", # Korean
    "ml",  # Malayalam
    "mr",  # Marathi
    "ru",  # Russian
    "sr",  # Serbian
    # "zh-CN",# Chinese (PRC)
    "ta",  # Tamil
    "te",  # Telugu
    "th",  # Thai
    # "zh-TW",# Chinese (Taiwan)
    "ur",  # Urdu
    "uk",  # Ukrainian
]
|
60 |
+
|
61 |
+
|
62 |
+
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    URLs (http/https) are skipped: they are downloaded later rather than
    read from the local filesystem.
    """
    local_paths = [
        path
        for path in files
        if not path.startswith(("http://", "https://"))
    ]
    return [path for path in local_paths if not os.path.exists(path)]
|
71 |
+
|
72 |
+
|
73 |
+
def extract_text(
    files: Iterable[str] = [],
    pages: Optional[Container[int]] = None,
    password: str = "",
    debug: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
):
    """Translate each PDF in *files*, writing `-zh` and `-dual` PDFs to *output*.

    URLs are downloaded to ./pdf2zh_files first.  A font capable of the
    target language is embedded into every page, the pages are run through
    the layout+translation pipeline (extract_text_to_fp), and the patched
    content streams are written back.

    Raises PDFValueError when no files are given or a download fails.
    """
    if debug:
        log.setLevel(logging.DEBUG)

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    for file in files:
        # BUG FIX: the original tested `file is str`, which compares against
        # the *type object* and is always False, so URLs were never
        # downloaded.  isinstance() is the intended check.
        if isinstance(file, str) and (
            file.startswith("http://") or file.startswith("https://")
        ):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        # BUG FIX: os.mkdir(os.path.dirname("./pdf2zh_files"))
                        # resolved to mkdir(".") and raised FileExistsError.
                        os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                )
        filename = os.path.splitext(os.path.basename(file))[0]
        # NOTE(review): `filename` is unused below — the literal "(unknown)"
        # output stems look like template damage, but gui.translate looks for
        # these exact names, so they are kept unchanged here for coherence.

        # Fonts to embed: "tiro" (Latin) plus one target-language-capable font.
        font_list = [("tiro", None)]
        noto = None
        if lang_out in resfont_map:  # CJK: use a pymupdf built-in font
            resfont = resfont_map[lang_out]
            font_list.append((resfont, None))
        elif lang_out in noto_list:  # non-CJK scripts: Go Noto Universal
            resfont = "noto"
            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
            if not os.path.exists(ttf_path):
                print("Downloading Noto font...")
                urllib.request.urlretrieve(
                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                    ttf_path,
                )
            font_list.append(("noto", ttf_path))
            noto = pymupdf.Font("noto", ttf_path)
        else:  # fallback
            resfont = "china-ss"
            font_list.append(("china-ss", None))

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font[0]] = page.insert_font(font[0], font[1])
        # Register the fonts in every resource dictionary (pages and XObjects)
        # that does not already reference them.
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # resources may live on an XObject
                try:  # xref reads/writes may fail on malformed objects
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font[0]}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref,
                                    f"{label}Font/{font[0]}",
                                    f"{font_id[font[0]]} 0 R",
                                )
                except Exception:
                    pass
        doc_en.save(Path(output) / f"(unknown)-en.pdf")

        with open(Path(output) / f"(unknown)-en.pdf", "rb") as fp:
            # locals() forwards pages/password/debug/vfont/... in one shot.
            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())

        # Apply the rewritten content streams produced by the converter.
        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        # Build the dual (interleaved original/translated) document.
        doc_zh = doc_en
        doc_dual = pymupdf.open(Path(output) / f"(unknown)-en.pdf")
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"(unknown)-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"(unknown)-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"(unknown)-en.pdf")

    return
|
184 |
+
|
185 |
+
|
186 |
+
def create_parser() -> argparse.ArgumentParser:
    """Build the pdf2zh command-line interface."""
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files", type=str, default=None, nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version", "-v", action="version", version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug", "-d", default=False, action="store_true",
        help="Use debug logging level.",
    )
    group = parser.add_argument_group(
        "Parser", description="Used during PDF parsing",
    )
    group.add_argument(
        "--pages", "-p", type=str,
        help="The list of page numbers to parse.",
    )
    group.add_argument(
        "--password", "-P", type=str, default="",
        help="The password to use for decrypting PDF file.",
    )
    group.add_argument(
        "--vfont", "-f", type=str, default="",
        help="The regex to math font name of formula.",
    )
    group.add_argument(
        "--vchar", "-c", type=str, default="",
        help="The regex to math character of formula.",
    )
    group.add_argument(
        "--lang-in", "-li", type=str, default="auto",
        help="The code of source language.",
    )
    group.add_argument(
        "--lang-out", "-lo", type=str, default="auto",
        help="The code of target language.",
    )
    group.add_argument(
        "--service", "-s", type=str, default="google",
        help="The service to use for translation.",
    )
    group.add_argument(
        "--output", "-o", type=str, default="",
        help="Output directory for files.",
    )
    group.add_argument(
        "--thread", "-t", type=int, default=4,
        help="The number of threads to execute translation.",
    )
    group.add_argument(
        "--interactive", "-i", action="store_true",
        help="Interact with GUI.",
    )
    group.add_argument(
        "--share", action="store_true",
        help="Enable Gradio Share",
    )
    return parser
|
287 |
+
|
288 |
+
|
289 |
+
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse CLI arguments, expanding --pages "2-4,7" into 0-based indices."""
    ns = create_parser().parse_args(args=args)

    if ns.pages:
        expanded = []
        for token in ns.pages.split(","):
            if "-" in token:
                # 1-based inclusive range, e.g. "2-4" -> [1, 2, 3]
                first, last = token.split("-")
                expanded.extend(range(int(first) - 1, int(last)))
            else:
                expanded.append(int(token) - 1)
        ns.pages = expanded

    return ns
|
303 |
+
|
304 |
+
|
305 |
+
def main(args: Optional[List[str]] = None) -> int:
    """CLI entry point: validate inputs, then translate or launch the GUI."""
    parsed_args = parse_args(args)

    missing = check_files(parsed_args.files)
    if missing:
        # Report every missing local file before bailing out.
        print("The following files do not exist:", file=sys.stderr)
        for path in missing:
            print(f" {path}", file=sys.stderr)
        return -1

    if parsed_args.interactive:
        # Imported lazily so the CLI works without the GUI dependencies.
        from pdf2zh.gui import setup_gui

        setup_gui(parsed_args.share)
        return 0

    extract_text(**vars(parsed_args))
    return 0
|
322 |
+
|
323 |
+
|
324 |
+
# Direct-execution entry point; exit status mirrors main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|
pdf2zh/pdfinterp.py
ADDED
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from typing import Any, Dict, Optional, Sequence, Tuple, cast
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from pdfminer import settings
|
6 |
+
from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
|
7 |
+
from pdfminer.pdfdevice import PDFDevice
|
8 |
+
from pdfminer.pdfinterp import (
|
9 |
+
PDFPageInterpreter,
|
10 |
+
PDFResourceManager,
|
11 |
+
PDFContentParser,
|
12 |
+
PDFInterpreterError,
|
13 |
+
Color,
|
14 |
+
PDFStackT,
|
15 |
+
LITERAL_FORM,
|
16 |
+
LITERAL_IMAGE,
|
17 |
+
)
|
18 |
+
from pdfminer.pdffont import PDFFont
|
19 |
+
from pdfminer.pdfpage import PDFPage
|
20 |
+
from pdfminer.pdftypes import (
|
21 |
+
PDFObjRef,
|
22 |
+
dict_value,
|
23 |
+
list_value,
|
24 |
+
resolve1,
|
25 |
+
stream_value,
|
26 |
+
)
|
27 |
+
from pdfminer.psexceptions import PSEOF
|
28 |
+
from pdfminer.psparser import (
|
29 |
+
PSKeyword,
|
30 |
+
keyword_name,
|
31 |
+
literal_name,
|
32 |
+
)
|
33 |
+
from pdfminer.utils import (
|
34 |
+
MATRIX_IDENTITY,
|
35 |
+
Matrix,
|
36 |
+
Rect,
|
37 |
+
mult_matrix,
|
38 |
+
apply_matrix_pt,
|
39 |
+
)
|
40 |
+
|
41 |
+
log = logging.getLogger(__name__)
|
42 |
+
|
43 |
+
|
44 |
+
def safe_float(o: Any) -> Optional[float]:
    """Convert *o* to ``float``, returning ``None`` when conversion fails."""
    try:
        value = float(o)
    except (TypeError, ValueError):
        return None
    return value
|
49 |
+
|
50 |
+
|
51 |
+
class PDFPageInterpreterEx(PDFPageInterpreter):
    """Processor for the content of a PDF page

    Reference: PDF Reference, Appendix A, Operator Summary

    Extends pdfminer's interpreter: besides driving ``device``, it rebuilds
    a patched content-stream string for every page/Form XObject it renders
    and stores it in ``obj_patch`` keyed by the PDF object id.
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
    ) -> None:
        # obj_patch maps PDF object id -> replacement content-stream text;
        # it is shared with the caller, which writes the patches back to the PDF.
        self.rsrcmgr = rsrcmgr
        self.device = device
        self.obj_patch = obj_patch

    def dup(self) -> "PDFPageInterpreterEx":
        # Fresh interpreter sharing the same manager/device/patch dict,
        # used when recursing into Form XObjects.
        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)

    def init_resources(self, resources: Dict[object, object]) -> None:
        # Overridden: additionally records resource font ids and zeroes each
        # font's descent (hack so translated text baselines line up).
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        # Reverse map: font object -> resource name, consumed by the device.
        self.fontid: Dict[PDFFont, object] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            # Resolve a colorspace spec (name or array form) to a PDFColorSpace.
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            # log.debug("Resource: %r: %r", k, v)
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                        spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
                    self.fontmap[fontid].descent = 0  # hack fix descent
                    self.fontid[self.fontmap[fontid]] = fontid
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    def do_S(self) -> None:
        # Overridden: filters out strokes that do not look like formula rules.
        """Stroke path"""

        def is_black(color: Color) -> bool:
            # Treat both scalar gray 0 and all-zero tuples as black.
            if isinstance(color, Tuple):
                return sum(color) == 0
            else:
                return color == 0

        if (
            len(self.curpath) == 2
            and self.curpath[0][0] == "m"
            and self.curpath[1][0] == "l"
            and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
            == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
            and is_black(self.graphicstate.scolor)
        ):  # standalone straight line, horizontal, black
            # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
            self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
            self.curpath = []
            # NOTE: returning "n" makes execute() re-emit a no-op "n" instead
            # of the original "S", dropping the stroke from the output stream.
            return "n"
        else:
            self.curpath = []

    ############################################################
    # Overridden: drop non-formula fill/stroke painting (F/B family).
    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)"""

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    ############################################################
    # Overridden: return the operand list (SCN family) so execute() can
    # re-emit the colour operators into the patched stream.
    def do_SCN(self) -> None:
        """Set color for stroking operations."""
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.scolor = cast(Color, args)
        return args

    def do_scn(self) -> None:
        """Set color for nonstroking operations"""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> None:
        """Set color for stroking operations"""
        return self.do_SCN()

    def do_sc(self) -> None:
        """Set color for nonstroking operations"""
        return self.do_scn()

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        # Overridden: records the Form XObject's patched stream in obj_patch.
        """Invoke named XObject"""
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        # log.debug("Processing xobj: %r", xobj)
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            ctm = mult_matrix(matrix, self.ctm)
            ops_base = interpreter.render_contents(
                resources,
                [xobj],
                ctm=ctm,
            )
            try:  # sometimes the form's fonts cannot be attached and this would blow up
                self.device.fontid = interpreter.fontid
                self.device.fontmap = interpreter.fontmap
                ops_new = self.device.end_figure(xobjid)
                # Invert the CTM so the regenerated ops (device space) can be
                # prefixed with a cm mapping them back into form space.
                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
                pos_inv = -np.mat(ctm[4:]) * ctm_inv
                a, b, c, d = ctm_inv.reshape(4).tolist()
                e, f = pos_inv.tolist()[0]
                self.obj_patch[self.xobjmap[xobjid].objid] = (
                    f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
                )
            except Exception:
                pass
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page: PDFPage) -> None:
        # Overridden: records the page's patched content stream in obj_patch.
        # log.debug("Processing page: %r", page)
        # print(page.mediabox,page.cropbox)
        # (x0, y0, x1, y1) = page.mediabox
        (x0, y0, x1, y1) = page.cropbox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        ops_new = self.device.end_page(page)
        # Rendering above subtracted the cropbox offset to obtain real
        # coordinates; emit a cm here to add the page offset back.
        self.obj_patch[page.page_xref] = (
            f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}"  # ops_base may contain images; the text in ops_new must be painted on top, so reset the position matrix with q/Q
        )
        for obj in page.contents:
            self.obj_patch[obj.objid] = ""

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> None:
        # Overridden: returns the rebuilt instruction stream.
        """Render the content streams.

        This method may be called recursively.
        """
        # log.debug(
        #     "render_contents: resources=%r, streams=%r, ctm=%r",
        #     resources,
        #     streams,
        #     ctm,
        # )
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))

    def execute(self, streams: Sequence[object]) -> None:
        # Overridden: interprets the streams as usual while re-serialising the
        # operators that should be kept into a new content-stream string.
        ops = ""
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                    "'",
                    "_q",
                )
                if hasattr(self, method):
                    func = getattr(self, method)
                    nargs = func.__code__.co_argcount - 1
                    if nargs:
                        args = self.pop(nargs)
                        # log.debug("exec: %s %r", name, args)
                        if len(args) == nargs:
                            func(*args)
                        if not (
                            name[0] == "T"
                            or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
                        ):  # Filter T-family text operators; EI takes an obj operand so it must be filtered too (only used in a few documents for drawing rules); also filter the marked-content operators.
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in args
                                ]
                            )
                            ops += f"{p} {name} "
                    else:
                        # log.debug("exec: %s", name)
                        targs = func()
                        if targs is None:
                            targs = []
                        if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in targs
                                ]
                            )
                            ops += f"{p} {name} "
                elif settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
            else:
                self.push(obj)
        # print('REV DATA',ops)
        return ops
|
pdf2zh/translator.py
ADDED
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import hashlib
|
2 |
+
import hmac
|
3 |
+
import html
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import re
|
7 |
+
import time
|
8 |
+
from datetime import timezone, datetime
|
9 |
+
|
10 |
+
from json import dumps, loads
|
11 |
+
import unicodedata
|
12 |
+
|
13 |
+
import deepl
|
14 |
+
import ollama
|
15 |
+
import openai
|
16 |
+
import requests
|
17 |
+
from azure.ai.translation.text import TextTranslationClient
|
18 |
+
from azure.core.credentials import AzureKeyCredential
|
19 |
+
|
20 |
+
|
21 |
+
def remove_control_characters(s):
    """Return *s* with all Unicode control characters (category ``C*``) removed."""
    kept = [ch for ch in s if not unicodedata.category(ch).startswith("C")]
    return "".join(kept)
|
23 |
+
|
24 |
+
|
25 |
+
class BaseTranslator:
    """Abstract base for translation services.

    Concrete subclasses implement :meth:`translate`; this class only stores
    the service name, the language pair and an optional model identifier.
    """

    def __init__(self, service, lang_out, lang_in, model):
        # Backend identifier (e.g. "google", "deepl").
        self.service = service
        # Target / source language codes, already resolved from "auto".
        self.lang_out = lang_out
        self.lang_in = lang_in
        # Model name; only meaningful for LLM-backed services.
        self.model = model

    def translate(self, text) -> str: ...  # noqa: E704

    def __str__(self):
        return f"{self.service} {self.lang_out} {self.lang_in}"
|
36 |
+
|
37 |
+
|
38 |
+
class GoogleTranslator(BaseTranslator):
    """Translator backed by the lightweight Google Translate mobile page."""

    def __init__(self, service, lang_out, lang_in, model):
        # Resolve "auto" to the project defaults (English -> Simplified Chinese).
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate *text*; raises ValueError when no result can be scraped."""
        text = text[:5000]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
        )
        # The mobile page wraps the translation in one of these two classes.
        matches = re.findall(
            r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
        )
        if response.status_code == 400:
            result = "IRREPARABLE TRANSLATION ERROR"
        elif not matches:
            raise ValueError("Empty translation result")
        else:
            result = html.unescape(matches[0])
        return remove_control_characters(result)
|
66 |
+
|
67 |
+
|
68 |
+
class TencentTranslator(BaseTranslator):
    """Translator backed by Tencent Cloud TMT (TextTranslate action).

    Each request is signed with Tencent's TC3-HMAC-SHA256 scheme; see the
    Tencent Cloud API signature v3 documentation for the canonical-request
    layout reproduced in :meth:`translate`.
    """

    def sign(self, key, msg):
        """Return the HMAC-SHA256 digest of *msg* (str) keyed with *key* (bytes)."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        try:
            server_url = "tmt.tencentcloudapi.com"
            # BUGFIX: os.getenv never raises KeyError (it returns None), so
            # the except clause below was dead and missing credentials were
            # silently used as the string "None" when signing. os.environ[...]
            # raises KeyError as the handler expects.
            self.secret_id = os.environ["TENCENT_SECRET_ID"]
            self.secret_key = os.environ["TENCENT_SECRET_KEY"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.session = requests.Session()
        self.base_link = f"{server_url}"

    def translate(self, text):
        """Translate *text*; returns "" when the response has no TargetText.

        Raises ValueError on a non-200 HTTP status.
        """
        text = text[:5000]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        # The payload hash must be computed over the exact JSON that
        # requests will send (json= uses json.dumps with default separators).
        payloadx = dumps(data)
        hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
        canonical_request = (
            "POST"
            + "\n"
            + "/"
            + "\n"
            + ""
            + "\n"
            + "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n"
            + "\n"
            + "content-type;host;x-tc-action"
            + "\n"
            + hashed_request_payload
        )

        timestamp = int(time.time())
        date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        hashed_canonical_request = hashlib.sha256(
            canonical_request.encode("utf-8")
        ).hexdigest()
        algorithm = "TC3-HMAC-SHA256"
        string_to_sign = (
            algorithm
            + "\n"
            + str(timestamp)
            + "\n"
            + credential_scope
            + "\n"
            + hashed_canonical_request
        )
        # Derive the signing key: secret -> date -> service -> "tc3_request".
        secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signed_headers = "content-type;host;x-tc-action"
        signature = hmac.new(
            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
        ).hexdigest()
        authorization = (
            algorithm
            + " "
            + "Credential="
            + str(self.secret_id)
            + "/"
            + credential_scope
            + ", "
            + "SignedHeaders="
            + signed_headers
            + ", "
            + "Signature="
            + signature
        )
        self.headers = {
            "Authorization": authorization,
            "Content-Type": "application/json; charset=utf-8",
            "Host": "tmt.tencentcloudapi.com",
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21",
        }

        response = self.session.post(
            "https://" + self.base_link,
            json=data,
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code == 200:
            result = loads(response.text)
        else:
            raise ValueError("HTTP error: " + str(response.status_code))
        # 2. Result test: a missing TargetText degrades to an empty string
        # (deliberate best-effort; callers treat "" as "no translation").
        try:
            result = result["Response"]["TargetText"]
        except KeyError:
            result = ""
        return result
|
182 |
+
|
183 |
+
|
184 |
+
class DeepLXTranslator(BaseTranslator):
    """Translator backed by a DeepLX server (unofficial DeepL front end)."""

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Both variables are optional: the auth key (if any) is embedded in
        # the URL path and the server URL falls back to the public instance.
        # (The previous try/except KeyError here was dead code: os.getenv
        # never raises, it returns None.)
        auth_key = os.getenv("DEEPLX_AUTH_KEY")
        server_url = os.getenv("DEEPLX_SERVER_URL") or "https://api.deeplx.org"

        self.session = requests.Session()
        server_url = server_url.rstrip("/")
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """POST *text* to the DeepLX endpoint and return the translation.

        Raises ValueError on a non-200 response, a response without a
        "data" field, or an empty translation result.
        """
        text = text[:5000]  # request size guard, mirrors the other backends
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        payload = loads(response.text)
        # 2. Result test
        try:
            result = payload["data"]
        except KeyError:
            raise ValueError("No valid key in DeepLX's response")
        # 3. Result length check (previously unreachable: an early return
        # inside the try block skipped it).
        if len(result) == 0:
            raise ValueError("Empty translation result")
        return result
|
240 |
+
|
241 |
+
|
242 |
+
class DeepLTranslator(BaseTranslator):
    """Translator using the official DeepL API via the ``deepl`` SDK."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "ZH"
        if lang_in == "auto":
            lang_in = "EN"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        # DEEPL_SERVER_URL is optional; None lets the SDK pick its default host.
        auth_key = os.getenv("DEEPL_AUTH_KEY")
        server_url = os.getenv("DEEPL_SERVER_URL")
        self.client = deepl.Translator(auth_key, server_url=server_url)

    def translate(self, text):
        result = self.client.translate_text(
            text, target_lang=self.lang_out, source_lang=self.lang_in
        )
        return result.text
|
257 |
+
|
258 |
+
|
259 |
+
class OllamaTranslator(BaseTranslator):
    """Translator using a locally served Ollama chat model."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # temperature 0: random sampling could corrupt the formula markers.
        self.options = {"temperature": 0}
        # The client honours the OLLAMA_HOST environment variable.
        self.client = ollama.Client()

    def translate(self, text):
        prompt = f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:"  # noqa: E501
        response = self.client.chat(
            model=self.model,
            options=self.options,
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional,authentic machine translation engine.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return response["message"]["content"].strip()
|
284 |
+
|
285 |
+
|
286 |
+
class OpenAITranslator(BaseTranslator):
    """Translator using an OpenAI-compatible chat-completions endpoint."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # temperature 0: random sampling could corrupt the formula markers.
        self.options = {"temperature": 0}
        # Endpoint and credentials come from OPENAI_BASE_URL / OPENAI_API_KEY.
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        prompt = f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:"  # noqa: E501
        response = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional,authentic machine translation engine.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content.strip()
|
312 |
+
|
313 |
+
|
314 |
+
class AzureTranslator(BaseTranslator):
    """Translator using the Azure AI Translator text-translation service."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-Hans"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)

        try:
            api_key = os.environ["AZURE_APIKEY"]
            endpoint = os.environ["AZURE_ENDPOINT"]
            region = os.environ["AZURE_REGION"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.client = TextTranslationClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
            region=region,
        )

        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        # The SDK's HTTP logging is noisy at INFO level; keep warnings only.
        logging.getLogger(
            "azure.core.pipeline.policies.http_logging_policy"
        ).setLevel(logging.WARNING)

    def translate(self, text) -> str:
        response = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )
        return response[0].translations[0].text
|