Spaces:
Running
Running
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager | |
from pdfminer.pdffont import PDFFont, PDFCIDFont | |
from pdfminer.converter import PDFConverter | |
from pdfminer.pdffont import PDFUnicodeNotDefined | |
from pdfminer.utils import apply_matrix_pt, mult_matrix | |
from pdfminer.layout import ( | |
LTChar, | |
LTFigure, | |
LTLine, | |
LTPage, | |
) | |
import logging | |
import re | |
import concurrent.futures | |
import numpy as np | |
import unicodedata | |
from tenacity import retry, wait_fixed | |
from pdf2zh import cache | |
from pdf2zh.translator import ( | |
BaseTranslator, | |
GoogleTranslator, | |
DeepLTranslator, | |
DeepLXTranslator, | |
OllamaTranslator, | |
OpenAITranslator, | |
AzureTranslator, | |
TencentTranslator, | |
) | |
from pymupdf import Font | |
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
class PDFConverterEx(PDFConverter):
    """PDFConverter whose page and figure hooks return ``receive_layout`` results.

    The overrides below rebuild the page box from the crop box, propagate the
    page id into figures, and tag every rendered character with its original
    character code and font object.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
    ) -> None:
        """Initialise the base converter with no output stream and no layout params."""
        super().__init__(rsrcmgr, None, "utf-8", 1, None)

    def begin_page(self, page, ctm) -> None:
        """Start a new ``LTPage`` sized from the page's transformed crop box."""
        left, bottom, right, top = page.cropbox
        corner_a = apply_matrix_pt(ctm, (left, bottom))
        corner_b = apply_matrix_pt(ctm, (right, top))
        # The page box is anchored at the origin; only the extent is kept.
        page_box = (
            0,
            0,
            abs(corner_a[0] - corner_b[0]),
            abs(corner_a[1] - corner_b[1]),
        )
        self.cur_item = LTPage(page.pageno, page_box)

    def end_page(self, page):
        """Return whatever ``receive_layout`` produces for the finished page."""
        finished_page = self.cur_item
        return self.receive_layout(finished_page)

    def begin_figure(self, name, bbox, matrix) -> None:
        """Push the current container and open a figure that inherits its page id."""
        parent = self.cur_item
        self._stack.append(parent)
        figure = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        figure.pageid = parent.pageid
        self.cur_item = figure

    def end_figure(self, _: str) -> None:
        """Close the current figure, attach it to its parent and process it.

        Returns the result of ``receive_layout`` for the closed figure.
        """
        finished = self.cur_item
        assert isinstance(finished, LTFigure), str(type(finished))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(finished)
        return self.receive_layout(finished)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Create an ``LTChar`` for ``cid``, add it to the current container, return its advance.

        The created item additionally carries ``cid`` (the original character
        code) and ``font`` (the original font object).
        """
        try:
            glyph_text = font.to_unichr(cid)
            assert isinstance(glyph_text, str), str(type(glyph_text))
        except PDFUnicodeNotDefined:
            glyph_text = self.handle_undefined_char(font, cid)
        char_item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            glyph_text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
        )
        self.cur_item.add(char_item)
        # Hack: stash the original character code and font on the layout item.
        char_item.cid = cid
        char_item.font = font
        return char_item.adv
class Paragraph:
    """Mutable record of the layout attributes of one paragraph.

    Attributes:
        y: initial vertical coordinate (float).
        x: initial horizontal coordinate (float).
        x0: left boundary (float).
        x1: right boundary (float).
        size: font size (float).
        font: font object (annotated as PDFFont in the original code).
        brk: line-break flag (bool).
    """

    def __init__(self, y, x, x0, x1, size, font, brk):
        # Start position.
        self.y, self.x = y, x
        # Horizontal extent.
        self.x0, self.x1 = x0, x1
        # Typography and line-break marker.
        self.size, self.font, self.brk = size, font, brk
# fmt: off | |
class TranslateConverter(PDFConverterEx):
    """Converter that translates a page's paragraphs and emits new PDF operators.

    ``receive_layout`` splits the characters of a layout into paragraphs and
    formula groups, sends the paragraph text (with ``$vN$`` placeholders
    standing in for formulas) to the configured translator, and typesets the
    translated text again as a PDF content-stream string.

    NOTE(review): ``self.fontmap`` and ``self.fontid`` are read by
    ``receive_layout`` but are never assigned in this class; presumably the
    caller attaches them before pages are processed -- confirm against the
    caller.
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str = None,
        vchar: str = None,
        thread: int = 0,
        layout=None,
        lang_in: str = "",
        lang_out: str = "",
        service: str = "",
        resfont: str = "",
        noto: Font = None,
    ) -> None:
        """Store the layout/translation settings and build the translator.

        Args:
            rsrcmgr: resource manager forwarded to ``PDFConverterEx``.
            vfont: optional regex; fonts whose name matches are treated as
                formula fonts (replaces the built-in font-name rule).
            vchar: optional regex; characters that match are treated as
                formula characters (replaces the built-in character rule).
            thread: number of translation worker threads. Values <= 0 fall
                back to a single worker in ``receive_layout``.
            layout: mapping indexed by page id; each value must expose
                ``.shape`` and 2-D indexing ``[row, col]`` returning the region
                class of that point. Defaults to a fresh empty dict.
            lang_in: source language code handed to the translator.
            lang_out: target language code handed to the translator.
            service: ``"name"`` or ``"name:model"`` translation service spec.
            resfont: font id used for characters the ``"tiro"`` font cannot
                represent.
            noto: font object used when the selected font id is ``"noto"``.

        Raises:
            ValueError: if the service is unknown, or if ``ollama``/``openai``
                is requested without a ``:model`` suffix.
        """
        super().__init__(rsrcmgr)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        # A literal ``{}`` default would be shared by every instance; create a
        # fresh dict instead. A caller-supplied mapping is kept by reference.
        self.layout = {} if layout is None else layout
        self.resfont = resfont
        self.noto = noto
        self.translator: BaseTranslator = None

        param = service.split(":", 1)
        name = param[0]
        model = param[1] if len(param) > 1 else None
        services_without_model = {
            "google": GoogleTranslator,
            "deepl": DeepLTranslator,
            "deeplx": DeepLXTranslator,
            "azure": AzureTranslator,
            "tencent": TencentTranslator,
        }
        services_with_model = {
            "ollama": OllamaTranslator,
            "openai": OpenAITranslator,
        }
        if name in services_without_model:
            self.translator = services_without_model[name](service, lang_out, lang_in, None)
        elif name in services_with_model:
            if model is None:
                # Previously surfaced as a bare IndexError on param[1].
                raise ValueError(
                    f"Translation service '{name}' requires a model, e.g. '{name}:<model>'"
                )
            self.translator = services_with_model[name](service, lang_out, lang_in, model)
        else:
            raise ValueError("Unsupported translation service")

    def receive_layout(self, ltpage: LTPage):
        """Translate the text of ``ltpage`` and return the re-typeset operators.

        The work happens in three phases: (A) parse the layout children into
        paragraphs and formula groups, (B) translate each paragraph (cached,
        multi-threaded), (C) typeset translated text, formulas and lines.

        Args:
            ltpage: layout container to process; ``end_figure`` also passes
                figures through this method.

        Returns:
            A string of PDF content-stream operators wrapped in ``BT ... ET``.
        """
        # Paragraphs
        sstk: list[str] = []            # paragraph text stack
        pstk: list[Paragraph] = []      # paragraph attribute stack
        vbkt: int = 0                   # open-bracket counter for formulas inside a paragraph
        # Current formula group
        vstk: list[LTChar] = []         # characters of the current formula
        vlstk: list[LTLine] = []        # lines of the current formula
        vfix: float = 0                 # vertical offset of the current formula
        # Finished formula groups
        var: list[list[LTChar]] = []    # formula character groups
        varl: list[list[LTLine]] = []   # formula line groups
        varf: list[float] = []          # formula vertical offsets
        vlen: list[float] = []          # formula widths
        # Globals
        lstk: list[LTLine] = []         # lines that do not belong to a formula
        xt: LTChar = None               # previous character
        xt_cls: int = -1                # region class of the previous character
        vmax: float = ltpage.width / 4  # maximum width of an inline formula
        ops: str = ""                   # rendered result

        def vflag(font: str, char: str):    # match formula (and sub/superscript) fonts
            font = font.split("+")[-1]      # strip the subset prefix from the font name
            if re.match(r"\(cid:", char):
                return True
            # Decision based on the font name
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(                                            # LaTeX fonts
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # Decision based on the character itself
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "                                     # not a plain space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]   # modifiers, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)          # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. Parse the original document
        # NOTE: ``xt`` is None until the first character has been handled; the
        # checks below only touch it after ``cls == xt_cls`` (or after
        # ``cls != xt_cls`` failed), which relies on region classes never being
        # -1 -- TODO confirm against the layout producer.
        for child in ltpage:
            if isinstance(child, LTChar):
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be the height inside a figure, so always use layout.shape
                h, w = layout.shape
                # Look up the region class of the current character
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if (                                                                                        # does the character belong to a formula?
                    cls == 0                                                                                # 1. class 0 is a reserved region
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. sub/superscript size: scripts are ~0.76 and capitals ~0.799, 0.79 sits in between; also covers drop caps
                    or vflag(child.fontname, child.get_text())                                              # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)                                      # 4. vertical text
                ):
                    cur_v = True
                # Decide whether a bracket group belongs to the formula
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (                                                        # has the current formula ended?
                    not cur_v                                               # 1. the character is not part of a formula
                    or cls != xt_cls                                        # 2. the character is in a different paragraph than the previous one
                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)          # 3. line break inside a paragraph: could be a long italic run or a wrapped fraction; the threshold tells them apart
                ):
                    if vstk:
                        if (                                                # use the text right of the formula to fix its vertical offset
                            not cur_v                                       # 1. the character is not part of a formula
                            and cls == xt_cls                               # 2. same paragraph as the previous character
                            and child.x0 > max(vch.x0 for vch in vstk)      # 3. the character is to the right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        sstk[-1] += f"$v{len(var)}$"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # The character is plain text, or it is the first character of a formula
                if not vstk:
                    if cls == xt_cls:               # same paragraph as the previous character
                        if child.x0 > xt.x1 + 1:    # add an inline space
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:      # add a line-break space and mark that the source paragraph wraps
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:                           # start a new paragraph from this character
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
                if not cur_v:                                               # push text
                    if (                                                    # adjust the paragraph attributes from this character
                        child.size > pstk[-1].size / 0.79                   # 1. the character is clearly larger than the paragraph font
                        or len(sstk[-1].strip()) == 1                       # 2. second character of the paragraph (drop-cap case)
                        or vflag(pstk[-1].font.fontname, "")                # 3. the paragraph font is a formula font
                        or re.match(                                        # 4. the paragraph font is bold
                            r"(.*Medi|.*Bold)",
                            pstk[-1].font.fontname,
                            re.IGNORECASE,
                        )
                    ):
                        pstk[-1].y -= child.size - pstk[-1].size            # hack: this vertical correction is known to be imperfect
                        pstk[-1].size = child.size
                        pstk[-1].font = child.font
                    sstk[-1] += child.get_text()
                else:                                                       # push formula character
                    if (                                                    # use the text left of the formula to fix its vertical offset
                        not vstk                                            # 1. first character of the formula
                        and cls == xt_cls                                   # 2. same paragraph as the previous character
                        and child.x0 > xt.x0                                # 3. the previous character is left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # Update the paragraph bounds; done out here because a wrapped line may start with a formula
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # Remember the previous character
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):   # figures
                pass
            elif isinstance(child, LTLine):     # lines
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be the height inside a figure, so always use layout.shape
                h, w = layout.shape
                # Look up the region class of the current line
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:      # line belonging to the current formula
                    vlstk.append(child)
                else:                           # global line
                    lstk.append(child)
            else:
                pass
        # Flush a formula that is still open at the end
        if vstk:
            sstk[-1] += f"$v{len(var)}$"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for fid, v in enumerate(var):   # compute formula widths
            fwidth = max(vch.x1 for vch in v) - v[0].x0
            log.debug(f'< {fwidth:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[fid])} > $v{fid}$ = {"".join([ch.get_text() for ch in v])}')
            vlen.append(fwidth)

        ############################################################
        # B. Translate paragraphs
        log.debug("\n==========[SSTACK]==========\n")
        hash_key = cache.deterministic_hash("PDFMathTranslate")
        cache.create_cache(hash_key)

        def worker(s: str):  # translate one paragraph (runs in a worker thread)
            try:
                hash_key_paragraph = cache.deterministic_hash(
                    (s, str(self.translator))
                )
                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # cache lookup
                if new is None:
                    new = self.translator.translate(s)
                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
                return new
            except BaseException as e:
                # Log (with traceback only in debug mode) and propagate.
                if log.isEnabledFor(logging.DEBUG):
                    log.exception(e)
                else:
                    log.exception(e, exc_info=False)
                raise e

        # ThreadPoolExecutor raises ValueError for max_workers <= 0, and the
        # constructor default is thread=0, so fall back to a single worker.
        # None is passed through unchanged (executor picks its own default).
        workers = self.thread
        if workers is not None and workers <= 0:
            workers = 1
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=workers
        ) as executor:
            news = list(executor.map(worker, sstk))

        ############################################################
        # C. Typeset the new document
        def raw_string(fcur: str, cstk: str):  # hex-encode a string for the given font id
            if fcur == 'noto':
                return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # choose the code length
                return "".join(["%04x" % ord(c) for c in cstk])
            else:
                return "".join(["%02x" % ord(c) for c in cstk])

        # Line-height factor per target language; built once instead of on every wrap.
        line_spacing = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}

        _x, _y = 0, 0
        for pid, new in enumerate(news):
            x: float = pstk[pid].x          # initial horizontal coordinate of the paragraph
            y: float = pstk[pid].y          # top edge of the paragraph
            x0: float = pstk[pid].x0        # left boundary
            x1: float = pstk[pid].x1        # right boundary
            size: float = pstk[pid].size    # font size
            font: PDFFont = pstk[pid].font  # font
            brk: bool = pstk[pid].brk       # whether the source paragraph wraps
            cstk: str = ""                  # pending text buffer
            fcur: str = None                # current font id
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[pid]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
                )  # match a $vN$ formula marker; the leading $ is sometimes dropped
                mod = 0  # width of a trailing text modifier
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except (ValueError, IndexError):
                        continue  # the translator may invent an out-of-range or malformed marker
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # text modifier
                        mod = var[vid][-1].width
                else:  # load a text character
                    ch = new[ptr]
                    fcur_ = None
                    # The original font's encoding is unreliable, so it is deliberately not reused.
                    try:
                        if self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        # Best effort: any failure means "tiro" cannot represent this character.
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # default non-Latin font
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (                                # flush the text buffer
                    fcur_ != fcur                   # 1. the font changed
                    or vy_regex                     # 2. a formula is inserted
                    or x + adv > x1 + 0.1 * size    # 3. right boundary reached (a whole line may be symbols; allow for float error)
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # right boundary reached and the source paragraph wraps
                    x = x0
                    y -= size * line_spacing.get(self.translator.lang_out, 1.1)  # 1.1 suits most other languages
                if vy_regex:  # insert the formula
                    fix = 0
                    if fcur is not None:  # apply the vertical offset for formulas inside a paragraph
                        fix = varf[vid]
                    for vch in var[vid]:  # typeset formula characters
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            # Debug aid: trace the pen path as thin lines.
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for line in varl[vid]:  # typeset formula lines
                        if line.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                            ops += f"ET q 1 0 0 1 {line.pts[0][0] + x - var[vid][0].x0:f} {line.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {line.linewidth:f} w 0 0 m {line.pts[1][0] - line.pts[0][0]:f} {line.pts[1][1] - line.pts[0][1]:f} l S Q BT "
                else:  # append to the text buffer
                    if not cstk:  # start of a run
                        tx = x
                        if x == x0 and ch == " ":  # drop the space produced by a paragraph line break
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod  # text modifier
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # Flush the remaining text of the paragraph
            if cstk:
                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
        for line in lstk:  # typeset global lines
            if line.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                ops += f"ET q 1 0 0 1 {line.pts[0][0]:f} {line.pts[0][1]:f} cm [] 0 d 0 J {line.linewidth:f} w 0 0 m {line.pts[1][0] - line.pts[0][0]:f} {line.pts[1][1] - line.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops