# sciwin_translate / pdf2zh / converter.py
# Uploaded by leonsimon23 ("Upload 9 files", commit 8b23ca3, verified).
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
from pdfminer.pdffont import PDFFont, PDFCIDFont
from pdfminer.converter import PDFConverter
from pdfminer.pdffont import PDFUnicodeNotDefined
from pdfminer.utils import apply_matrix_pt, mult_matrix
from pdfminer.layout import (
LTChar,
LTFigure,
LTLine,
LTPage,
)
import logging
import re
import concurrent.futures
import numpy as np
import unicodedata
from tenacity import retry, wait_fixed
from pdf2zh import cache
from pdf2zh.translator import (
BaseTranslator,
GoogleTranslator,
DeepLTranslator,
DeepLXTranslator,
OllamaTranslator,
OpenAITranslator,
AzureTranslator,
TencentTranslator,
)
from pymupdf import Font
log = logging.getLogger(__name__)
class PDFConverterEx(PDFConverter):
    """PDFConverter variant whose page/figure hooks return ``receive_layout`` output.

    Compared with the base converter, the overrides below:
    * size the page from its crop box,
    * propagate the page id into nested figures,
    * return whatever ``receive_layout`` produces from ``end_page`` /
      ``end_figure``,
    * tag every ``LTChar`` with the original character code and font object.
    """

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
    ) -> None:
        # No output stream and no layout-analysis parameters are passed on.
        super().__init__(rsrcmgr, None, "utf-8", 1, None)

    def begin_page(self, page, ctm) -> None:
        """Start a new page whose box is the transformed crop box, moved to the origin."""
        left, bottom, right, top = page.cropbox
        ax, ay = apply_matrix_pt(ctm, (left, bottom))
        bx, by = apply_matrix_pt(ctm, (right, top))
        page_box = (0, 0, abs(ax - bx), abs(ay - by))
        self.cur_item = LTPage(page.pageno, page_box)

    def end_page(self, page):
        """Finish the page and return the result of ``receive_layout`` for it."""
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        """Open a nested figure that inherits the page id of its parent item."""
        parent = self.cur_item
        self._stack.append(parent)
        figure = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        figure.pageid = parent.pageid
        self.cur_item = figure

    def end_figure(self, _: str) -> None:
        """Close the current figure, attach it to its parent and return its layout result."""
        figure = self.cur_item
        assert isinstance(figure, LTFigure), str(type(figure))
        parent = self._stack.pop()
        self.cur_item = parent
        parent.add(figure)
        return self.receive_layout(figure)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Create an ``LTChar`` for ``cid``, add it to the current item, return its advance."""
        try:
            text = font.to_unichr(cid)
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        else:
            assert isinstance(text, str), str(type(text))
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            font.char_width(cid),
            font.char_disp(cid),
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        # Hack: keep the original character code and font object on the glyph
        # so they can be re-emitted later.
        item.cid = cid
        item.font = font
        return item.adv
class Paragraph:
    """Mutable record of the layout attributes of one paragraph.

    Attributes:
        y: initial vertical coordinate.
        x: initial horizontal coordinate.
        x0: left boundary.
        x1: right boundary.
        size: font size.
        font: font object.
        brk: line-break flag.
    """

    def __init__(self, y, x, x0, x1, size, font, brk):
        # Starting position.
        self.y: float = y
        self.x: float = x
        # Horizontal extent.
        self.x0: float = x0
        self.x1: float = x1
        # Typography.
        self.size: float = size
        self.font: PDFFont = font
        # Line-break marker.
        self.brk: bool = brk
# fmt: off
class TranslateConverter(PDFConverterEx):
    """Converter that translates page text and re-emits it as PDF text operators.

    ``receive_layout`` reads ``self.fontmap`` and ``self.fontid``; neither is
    assigned in this class, so they are presumably attached to the instance by
    the caller before any page is processed (TODO confirm against the caller).
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str = None,
        vchar: str = None,
        thread: int = 0,
        layout=None,
        lang_in: str = "",
        lang_out: str = "",
        service: str = "",
        resfont: str = "",
        noto: Font = None,
    ) -> None:
        """Store the configuration and build the translator for ``service``.

        Args:
            rsrcmgr: resource manager forwarded to the base converter.
            vfont: optional regex; fonts matching it are treated as formula fonts.
            vchar: optional regex; characters matching it are treated as formula
                characters.
            thread: worker count for the translation thread pool; values
                below 1 fall back to a single worker.
            layout: mapping from page id to a 2-D class map indexed ``[y, x]``.
                Defaults to a fresh empty dict.
            lang_in: source language passed to the translator.
            lang_out: target language passed to the translator.
            service: ``"<name>"`` or ``"<name>:<model>"``.
            resfont: font id used for characters the ``"tiro"`` font cannot map.
            noto: font object used when the font id is ``"noto"``.

        Raises:
            ValueError: if the service name is not supported.
            IndexError: if ``ollama``/``openai`` is given without ``:<model>``.
        """
        super().__init__(rsrcmgr)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        # A fresh dict per instance instead of a shared mutable default.
        self.layout = {} if layout is None else layout
        self.resfont = resfont
        self.noto = noto
        self.translator: BaseTranslator = None
        param = service.split(":", 1)
        # service name -> (translator class, whether the ":model" suffix is passed on)
        translators = {
            "google": (GoogleTranslator, False),
            "deepl": (DeepLTranslator, False),
            "deeplx": (DeepLXTranslator, False),
            "ollama": (OllamaTranslator, True),
            "openai": (OpenAITranslator, True),
            "azure": (AzureTranslator, False),
            "tencent": (TencentTranslator, False),
        }
        if param[0] not in translators:
            raise ValueError("Unsupported translation service")
        translator_cls, takes_model = translators[param[0]]
        model = param[1] if takes_model else None
        self.translator = translator_cls(service, lang_out, lang_in, model)

    def receive_layout(self, ltpage: LTPage):
        """Translate the text of ``ltpage`` and return it as a PDF operator string.

        Works in three stages:
        A. walk the layout children, splitting characters into paragraph text
           and formula groups (replaced in the text by ``$vN$`` placeholders);
        B. translate every paragraph through the cache/translator in a thread
           pool;
        C. typeset the translated text, re-inserting the formula glyphs and
           lines at the placeholder positions.

        Args:
            ltpage: the page (or figure, when called from ``end_figure``) to
                process.

        Returns:
            A string of PDF content-stream operators wrapped in ``BT ... ET``.
        """
        # Paragraphs
        sstk: list[str] = []            # paragraph text stack
        pstk: list[Paragraph] = []      # paragraph attribute stack
        vbkt: int = 0                   # bracket counter for formulas inside a paragraph
        # Current formula group
        vstk: list[LTChar] = []         # formula glyphs
        vlstk: list[LTLine] = []        # formula lines
        vfix: float = 0                 # formula vertical offset
        # Finished formula groups
        var: list[list[LTChar]] = []    # glyphs of each formula
        varl: list[list[LTLine]] = []   # lines of each formula
        varf: list[float] = []          # vertical offset of each formula
        vlen: list[float] = []          # width of each formula
        # Global state
        lstk: list[LTLine] = []         # lines that belong to no formula
        xt: LTChar = None               # previous character
        xt_cls: int = -1                # layout class of the previous character
        vmax: float = ltpage.width / 4  # maximum width of an inline formula
        ops: list[str] = []             # rendered operators, joined once at the end

        def vflag(font: str, char: str):
            """Return True when the font name or the character looks like formula content."""
            font = font.split("+")[-1]  # drop the subset prefix of the font name
            if re.match(r"\(cid:", char):
                return True
            # Font-name rule
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(  # LaTeX fonts
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # Character-set rule
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "  # not a plain space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]  # modifiers, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)  # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. Parse the original document
        for child in ltpage:
            if isinstance(child, LTChar):
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be the height of a figure, so always use layout.shape
                h, w = layout.shape
                # Look up the layout class under the character origin
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if (  # does the current character belong to a formula?
                    cls == 0  # 1. class is the reserved region
                    # 2. sub/superscript font: scripts are about 0.76 and capitals
                    #    about 0.799 of the body size, so 0.79 splits them; the
                    #    length check tolerates an enlarged first letter
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)
                    or vflag(child.fontname, child.get_text())  # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)  # 4. vertical text
                ):
                    cur_v = True
                # Decide whether a bracket group belongs to the formula
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (  # does the current formula end here?
                    not cur_v  # 1. current character is not formula content
                    or cls != xt_cls  # 2. current and previous characters are in different paragraphs
                    # 3. line break inside a paragraph: either a long italic run or
                    #    a fraction wrapping; the threshold tells them apart
                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)
                ):
                    if vstk:
                        if (  # fix the vertical offset using the text right of the formula
                            not cur_v  # 1. current character is not formula content
                            and cls == xt_cls  # 2. same paragraph as the previous character
                            and child.x0 > max(vch.x0 for vch in vstk)  # 3. character is right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        sstk[-1] += f"$v{len(var)}$"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # Current character is plain text or the first character of a formula
                if not vstk:
                    if cls == xt_cls:  # same paragraph as the previous character
                        if child.x0 > xt.x1 + 1:  # add an inline space
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:  # add a line-break space and mark the paragraph as wrapped
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:  # start a new paragraph from the current character
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
                if not cur_v:  # push text
                    if (  # update the paragraph attributes from the current character
                        child.size > pstk[-1].size / 0.79  # 1. character is clearly larger than the paragraph font
                        or len(sstk[-1].strip()) == 1  # 2. second character of the paragraph (enlarged first letter)
                        or vflag(pstk[-1].font.fontname, "")  # 3. paragraph font is a formula font
                        or re.match(  # 4. paragraph font is bold
                            r"(.*Medi|.*Bold)",
                            pstk[-1].font.fontname,
                            re.IGNORECASE,
                        )
                    ):
                        # Hack: this vertical correction is known to be imperfect.
                        pstk[-1].y -= child.size - pstk[-1].size
                        pstk[-1].size = child.size
                        pstk[-1].font = child.font
                    sstk[-1] += child.get_text()
                else:  # push formula glyph
                    if (  # fix the vertical offset using the text left of the formula
                        not vstk  # 1. first character of the formula
                        and cls == xt_cls  # 2. same paragraph as the previous character
                        and child.x0 > xt.x0  # 3. previous character is left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # Update the paragraph bounds here (outside the branches) because a
                # wrapped line may start with a formula
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # Remember the previous character
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):  # figures
                pass
            elif isinstance(child, LTLine):  # lines
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be the height of a figure, so always use layout.shape
                h, w = layout.shape
                # Look up the layout class under the line origin
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:  # line belongs to the current formula
                    vlstk.append(child)
                else:  # global line
                    lstk.append(child)
            else:
                pass
        # Flush a formula that is still open at the end
        if vstk:
            sstk[-1] += f"$v{len(var)}$"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for idx, v in enumerate(var):  # compute formula widths
            width = max(vch.x1 for vch in v) - v[0].x0
            log.debug(f'< {width:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[idx])} > $v{idx}$ = {"".join([ch.get_text() for ch in v])}')
            vlen.append(width)

        ############################################################
        # B. Translate paragraphs
        log.debug("\n==========[SSTACK]==========\n")
        hash_key = cache.deterministic_hash("PDFMathTranslate")
        cache.create_cache(hash_key)

        @retry(wait=wait_fixed(1))  # only a fixed 1 s wait is configured, no stop condition
        def worker(s: str):
            """Translate one paragraph, consulting the paragraph cache first."""
            try:
                hash_key_paragraph = cache.deterministic_hash(
                    (s, str(self.translator))
                )
                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # cache lookup
                if new is None:
                    new = self.translator.translate(s)
                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
                return new
            except BaseException as e:
                # Full traceback only when debugging; always re-raise.
                if log.isEnabledFor(logging.DEBUG):
                    log.exception(e)
                else:
                    log.exception(e, exc_info=False)
                raise

        # ThreadPoolExecutor rejects max_workers <= 0, and the constructor default
        # for ``thread`` is 0, so fall back to a single worker in that case.
        workers = self.thread if (self.thread is None or self.thread > 0) else 1
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            news = list(executor.map(worker, sstk))

        ############################################################
        # C. Typeset the new document
        def raw_string(fcur: str, cstk: str):
            """Hex-encode ``cstk`` for font id ``fcur``."""
            if fcur == 'noto':
                return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # code length depends on the font type
                return "".join(["%04x" % ord(c) for c in cstk])
            else:
                return "".join(["%02x" % ord(c) for c in cstk])

        # Line-height factor per output language; other languages use 1.1.
        lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
        _x, _y = 0, 0
        for pid, new in enumerate(news):
            x: float = pstk[pid].x          # paragraph initial x
            y: float = pstk[pid].y          # paragraph top
            x0: float = pstk[pid].x0        # paragraph left bound
            x1: float = pstk[pid].x1        # paragraph right bound
            size: float = pstk[pid].size    # paragraph font size
            font: PDFFont = pstk[pid].font  # paragraph font
            brk: bool = pstk[pid].brk       # whether the source paragraph wrapped
            cstk: str = ""                  # pending text buffer
            fcur: str = None                # current font id
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[pid]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
                )  # match a $vN$ placeholder; the leading $ is sometimes dropped
                mod = 0  # width of a trailing text modifier
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception:
                        continue  # the translator may invent an out-of-range placeholder
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # text modifier
                        mod = var[vid][-1].width
                else:  # load a text character
                    ch = new[ptr]
                    fcur_ = None
                    # Encoding with the original font is error-prone, so it is not attempted.
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # default non-Latin font
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (  # flush the text buffer
                    fcur_ != fcur  # 1. font changed
                    or vy_regex  # 2. a formula is inserted
                    or x + adv > x1 + 0.1 * size  # 3. right bound reached (tolerance covers float error)
                ):
                    if cstk:
                        ops.append(f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ ")
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # right bound reached and the source paragraph wrapped
                    x = x0
                    y -= size * lang_space.get(self.translator.lang_out, 1.1)
                if vy_regex:  # insert the formula
                    fix = 0
                    if fcur is not None:  # apply the vertical offset only inside a paragraph
                        fix = varf[vid]
                    for vch in var[vid]:  # typeset formula glyphs
                        vc = chr(vch.cid)
                        vx = x + vch.x0 - var[vid][0].x0
                        vy = fix + y + vch.y0 - var[vid][0].y0
                        ops.append(f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {vx:f} {vy:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ ")
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (vx, vy)))
                            _x, _y = vx, vy
                    for line in varl[vid]:  # typeset formula lines
                        if line.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                            ops.append(f"ET q 1 0 0 1 {line.pts[0][0] + x - var[vid][0].x0:f} {line.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {line.linewidth:f} w 0 0 m {line.pts[1][0] - line.pts[0][0]:f} {line.pts[1][1] - line.pts[0][1]:f} l S Q BT ")
                else:  # append to the text buffer
                    if not cstk:  # start of a run
                        tx = x
                        if x == x0 and ch == " ":  # drop the space left by a line break
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod  # text modifier
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # Flush the remaining text of the paragraph
            if cstk:
                ops.append(f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ ")
        for line in lstk:  # typeset global lines
            if line.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                ops.append(f"ET q 1 0 0 1 {line.pts[0][0]:f} {line.pts[0][1]:f} cm [] 0 d 0 J {line.linewidth:f} w 0 0 m {line.pts[1][0] - line.pts[0][0]:f} {line.pts[1][1] - line.pts[0][1]:f} l S Q BT ")
        body = "".join(ops)
        return f"BT {body}ET "