Spaces:
Running
Running
leonsimon23
committed on
Upload 9 files
Browse files- pdf2zh/__init__.py +6 -0
- pdf2zh/cache.py +91 -0
- pdf2zh/converter.py +456 -0
- pdf2zh/doclayout.py +163 -0
- pdf2zh/gui.py +503 -0
- pdf2zh/high_level.py +99 -0
- pdf2zh/pdf2zh.py +325 -0
- pdf2zh/pdfinterp.py +360 -0
- pdf2zh/translator.py +347 -0
pdf2zh/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""pdf2zh package: exposes the package logger and distribution metadata."""
import logging

# Package-level logger; submodules create their own via logging.getLogger(__name__).
log = logging.getLogger(__name__)

__version__ = "1.8.4"
__author__ = "Byaidu"
|
pdf2zh/cache.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tempfile
import os
import time
import hashlib
import shutil

# Translation cache root under the system temp dir (e.g. /tmp/cache).
cache_dir = os.path.join(tempfile.gettempdir(), "cache")
os.makedirs(cache_dir, exist_ok=True)
# Name of the per-entry file holding that entry's last-update timestamp.
time_filename = "update_time"
# Maximum number of cache entries kept before eviction kicks in.
max_cache = 5
|
11 |
+
|
12 |
+
|
13 |
+
def deterministic_hash(obj):
    """Return a stable 20-hex-character SHA-256 digest of ``str(obj)``."""
    digest = hashlib.sha256(str(obj).encode()).hexdigest()
    return digest[:20]
|
17 |
+
|
18 |
+
|
19 |
+
def get_dirs():
    """List the absolute paths of all subdirectories of the cache root."""
    candidates = (os.path.join(cache_dir, name) for name in os.listdir(cache_dir))
    return [path for path in candidates if os.path.isdir(path)]
|
26 |
+
|
27 |
+
|
28 |
+
def get_time(dir):
    """Return the eviction timestamp stored inside cache directory *dir*.

    A directory without a timestamp file sorts as ``float("-inf")`` so that
    argmin-based eviction in ``remove_extra()`` removes it first (the stated
    intent; the previous ``+inf`` actually made it the LAST to be evicted).
    A present-but-unparsable file raises ``ValueError``, which
    ``remove_extra()`` treats as a corrupt entry and deletes.
    """
    timefile = os.path.join(dir, time_filename)
    try:
        # Context manager closes the handle (the original open() leaked it).
        with open(timefile, encoding="utf-8") as f:
            return float(f.read())
    except FileNotFoundError:
        # No timestamp: treat as the oldest possible entry.
        return float("-inf")
|
38 |
+
|
39 |
+
|
40 |
+
def write_time(dir):
    """Record the current wall-clock time as *dir*'s eviction timestamp."""
    timefile = os.path.join(dir, time_filename)
    t = time.time()
    # Context manager closes the handle deterministically; the original
    # print(..., file=open(...)) leaked the file object.
    with open(timefile, "w", encoding="utf-8") as f:
        f.write(str(t))
|
44 |
+
|
45 |
+
|
46 |
+
def argmin(iterable):
    """Return the index of the smallest value in *iterable* (first on ties)."""
    indexed = enumerate(iterable)
    smallest = min(indexed, key=lambda pair: pair[1])
    return smallest[0]
|
48 |
+
|
49 |
+
|
50 |
+
def remove_extra():
    """Prune the cache: drop entries with corrupt timestamps, then evict the
    oldest entries until at most ``max_cache`` remain."""
    # get_dirs() returns only directories, so the original
    # "if not os.path.isdir(dir): os.remove(dir)" branch was dead code
    # (and os.remove() on a directory would raise anyway) — removed.
    for path in get_dirs():
        try:
            get_time(path)
        except Exception:
            # Timestamp file exists but is unreadable/unparsable: corrupt entry.
            shutil.rmtree(path)
    while True:
        dirs = get_dirs()
        if len(dirs) <= max_cache:
            break
        times = [get_time(d) for d in dirs]
        # Evict the entry with the smallest (oldest) timestamp.
        shutil.rmtree(dirs[argmin(times)])
|
68 |
+
|
69 |
+
|
70 |
+
def is_cached(hash_key):
    """Return True if a cache directory already exists for *hash_key*."""
    entry = os.path.join(cache_dir, hash_key)
    return os.path.exists(entry)
|
73 |
+
|
74 |
+
|
75 |
+
def create_cache(hash_key):
    """Ensure the cache directory for *hash_key* exists and refresh its timestamp."""
    entry = os.path.join(cache_dir, hash_key)
    os.makedirs(entry, exist_ok=True)
    write_time(entry)
|
79 |
+
|
80 |
+
|
81 |
+
def load_paragraph(hash_key, hash_key_paragraph):
    """Return the cached translation for a paragraph, or None on a cache miss."""
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # EAFP with a context manager: the original leaked the file handle and
    # had an exists()/open() race.
    try:
        with open(filename, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        return None
|
87 |
+
|
88 |
+
|
89 |
+
def write_paragraph(hash_key, hash_key_paragraph, paragraph):
    """Persist a translated *paragraph* under the cache entry for *hash_key*."""
    filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
    # Context manager closes the handle; the original
    # print(..., file=open(...)) leaked the file object.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(paragraph)
|
pdf2zh/converter.py
ADDED
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
2 |
+
from pdfminer.pdffont import PDFFont, PDFCIDFont
|
3 |
+
from pdfminer.converter import PDFConverter
|
4 |
+
from pdfminer.pdffont import PDFUnicodeNotDefined
|
5 |
+
from pdfminer.utils import apply_matrix_pt, mult_matrix
|
6 |
+
from pdfminer.layout import (
|
7 |
+
LTChar,
|
8 |
+
LTFigure,
|
9 |
+
LTLine,
|
10 |
+
LTPage,
|
11 |
+
)
|
12 |
+
import logging
|
13 |
+
import re
|
14 |
+
import concurrent.futures
|
15 |
+
import numpy as np
|
16 |
+
import unicodedata
|
17 |
+
from tenacity import retry, wait_fixed
|
18 |
+
from pdf2zh import cache
|
19 |
+
from pdf2zh.translator import (
|
20 |
+
BaseTranslator,
|
21 |
+
GoogleTranslator,
|
22 |
+
DeepLTranslator,
|
23 |
+
DeepLXTranslator,
|
24 |
+
OllamaTranslator,
|
25 |
+
OpenAITranslator,
|
26 |
+
AzureTranslator,
|
27 |
+
TencentTranslator,
|
28 |
+
)
|
29 |
+
from pymupdf import Font
|
30 |
+
|
31 |
+
log = logging.getLogger(__name__)
|
32 |
+
|
33 |
+
|
34 |
+
class PDFConverterEx(PDFConverter):
    """pdfminer ``PDFConverter`` with hooks overridden so that each page and
    figure returns its layout via ``receive_layout`` as a content stream,
    and each ``LTChar`` carries its original ``cid`` and font."""

    def __init__(
        self,
        rsrcmgr: PDFResourceManager,
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)

    def begin_page(self, page, ctm) -> None:
        # Override: replace the mediabox with the CTM-transformed cropbox.
        (x0, y0, x1, y1) = page.cropbox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, page):
        # Override: return the rendered instruction stream for the page.
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        # Override: propagate the enclosing page's pageid onto the figure.
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        self.cur_item.pageid = self._stack[-1].pageid

    def end_figure(self, _: str) -> None:
        # Override: return the rendered instruction stream for the figure.
        fig = self.cur_item
        assert isinstance(self.cur_item, LTFigure), str(type(self.cur_item))
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        # Override: build the LTChar as pdfminer does, then stash the original
        # character code and font on it for later re-typesetting.
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), str(type(text))
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)
        item = LTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack: record the original character code
        item.font = font  # hack: record the original font
        return item.adv
|
102 |
+
|
103 |
+
|
104 |
+
class Paragraph:
    """Geometry and typography snapshot of one source-document paragraph."""

    def __init__(self, y, x, x0, x1, size, font, brk):
        self.y = y        # initial vertical coordinate
        self.x = x        # initial horizontal coordinate
        self.x0 = x0      # left boundary
        self.x1 = x1      # right boundary
        self.size = size  # font size
        self.font = font  # paragraph font (pdfminer PDFFont)
        self.brk = brk    # True if the source paragraph wraps onto a new line
|
113 |
+
|
114 |
+
|
115 |
+
# fmt: off
class TranslateConverter(PDFConverterEx):
    """Converter that re-typesets each page: parses paragraphs and inline
    formulas out of the layout, translates the paragraph text (with caching
    and a thread pool), and emits a fresh PDF content stream (``BT ... ET``).

    Formulas are protected from translation by replacing them with ``$vN$``
    placeholders that are re-inserted verbatim during typesetting.
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str = None,     # regex overriding formula-font detection
        vchar: str = None,     # regex overriding formula-char detection
        thread: int = 0,       # translation worker-thread count
        layout={},             # pageid -> per-pixel layout class array
        lang_in: str = "",
        lang_out: str = "",
        service: str = "",     # "name" or "name:model" selecting the translator
        resfont: str = "",     # fallback (non-Latin) font id
        noto: Font = None,     # pymupdf Noto font used when resfont == "noto"
    ) -> None:
        super().__init__(rsrcmgr)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        self.layout = layout
        self.resfont = resfont
        self.noto = noto
        self.translator: BaseTranslator = None
        # service is "name" or "name:model"; param[1] is the model when given.
        param = service.split(":", 1)
        if param[0] == "google":
            self.translator = GoogleTranslator(service, lang_out, lang_in, None)
        elif param[0] == "deepl":
            self.translator = DeepLTranslator(service, lang_out, lang_in, None)
        elif param[0] == "deeplx":
            self.translator = DeepLXTranslator(service, lang_out, lang_in, None)
        elif param[0] == "ollama":
            self.translator = OllamaTranslator(service, lang_out, lang_in, param[1])
        elif param[0] == "openai":
            self.translator = OpenAITranslator(service, lang_out, lang_in, param[1])
        elif param[0] == "azure":
            self.translator = AzureTranslator(service, lang_out, lang_in, None)
        elif param[0] == "tencent":
            self.translator = TencentTranslator(service, lang_out, lang_in, None)
        else:
            raise ValueError("Unsupported translation service")

    def receive_layout(self, ltpage: LTPage):
        # Paragraphs
        sstk: list[str] = []            # paragraph text stack
        pstk: list[Paragraph] = []      # paragraph attribute stack
        vbkt: int = 0                   # formula bracket nesting counter
        # Current formula group
        vstk: list[LTChar] = []         # formula glyph group
        vlstk: list[LTLine] = []        # formula line group
        vfix: float = 0                 # formula vertical offset
        # Formula group stacks
        var: list[list[LTChar]] = []    # formula glyph group stack
        varl: list[list[LTLine]] = []   # formula line group stack
        varf: list[float] = []          # formula vertical offset stack
        vlen: list[float] = []          # formula width stack
        # Globals
        lstk: list[LTLine] = []         # global line stack
        xt: LTChar = None               # previous character
        xt_cls: int = -1                # layout class of the previous character
        vmax: float = ltpage.width / 4  # maximum width of an inline formula
        ops: str = ""                   # rendered result

        def vflag(font: str, char: str):  # detect formula (and sub/superscript) fonts
            font = font.split("+")[-1]    # strip the subset prefix from the font name
            if re.match(r"\(cid:", char):
                return True
            # decision based on font-name rules
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(  # LaTeX fonts
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # decision based on character-category rules
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "  # not a space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]  # modifier letters, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)  # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. parse the source document
        for child in ltpage:
            if isinstance(child, LTChar):
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure-local height, so use layout.shape consistently
                h, w = layout.shape
                # look up the layout class of the current character
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if (  # does the current character belong to a formula?
                    cls == 0  # 1. class 0 marks a protected region
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. sub/superscript: scripts are ~0.76 and capitals ~0.799 of body size, 0.79 splits them while tolerating an enlarged initial letter
                    or vflag(child.fontname, child.get_text())  # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)  # 4. vertical writing
                ):
                    cur_v = True
                # does a bracket group belong to the formula?
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (  # has the current formula ended?
                    not cur_v  # 1. the current character is not part of a formula
                    or cls != xt_cls  # 2. the current character belongs to a different paragraph
                    or (abs(child.x0 - xt.x0) > vmax and cls != 0)  # 3. in-paragraph line break: a long italic run or a wrapped fraction — a width threshold tells them apart
                ):
                    if vstk:
                        if (  # fix the formula's vertical offset using the text to its right
                            not cur_v  # 1. the current character is not part of a formula
                            and cls == xt_cls  # 2. the current character is in the same paragraph
                            and child.x0 > max([vch.x0 for vch in vstk])  # 3. the current character sits to the right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        sstk[-1] += f"$v{len(var)}$"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # the current character is plain text, or it starts a new formula
                if not vstk:
                    if cls == xt_cls:  # same paragraph as the previous character
                        if child.x0 > xt.x1 + 1:  # add an in-line space
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:  # add a wrap space and mark that the source paragraph wraps
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:  # start a new paragraph from the current character
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, child.font, False))
                if not cur_v:  # push text
                    if (  # update the paragraph attributes from the current character
                        child.size > pstk[-1].size / 0.79  # 1. the character is markedly larger than the paragraph font
                        or len(sstk[-1].strip()) == 1  # 2. the character is the paragraph's second glyph (enlarged initial letter)
                        or vflag(pstk[-1].font.fontname, "")  # 3. the paragraph font is a formula font
                        or re.match(  # 4. the paragraph font is bold
                            r"(.*Medi|.*Bold)",
                            pstk[-1].font.fontname,
                            re.IGNORECASE,
                        )
                    ):
                        pstk[-1].y -= child.size - pstk[-1].size  # hack: this vertical correction is imperfect, but good enough for now
                        pstk[-1].size = child.size
                        pstk[-1].font = child.font
                    sstk[-1] += child.get_text()
                else:  # push formula
                    if (  # fix the formula's vertical offset using the text to its left
                        not vstk  # 1. this is the formula's first character
                        and cls == xt_cls  # 2. same paragraph as the previous character
                        and child.x0 > xt.x0  # 3. the previous character sits to the left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # update paragraph bounds; a wrapped line may start with a formula, so do this outside the branches above
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # remember the previous character
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):  # figure
                pass
            elif isinstance(child, LTLine):  # line
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be a figure-local height, so use layout.shape consistently
                h, w = layout.shape
                # look up the layout class of the current line
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:  # formula line
                    vlstk.append(child)
                else:  # global line
                    lstk.append(child)
            else:
                pass
        # handle the tail
        if vstk:  # pop the last open formula
            sstk[-1] += f"$v{len(var)}$"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for id, v in enumerate(var):  # compute formula widths
            l = max([vch.x1 for vch in v]) - v[0].x0
            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
            vlen.append(l)

        ############################################################
        # B. translate the paragraphs
        log.debug("\n==========[SSTACK]==========\n")
        hash_key = cache.deterministic_hash("PDFMathTranslate")
        cache.create_cache(hash_key)

        @retry(wait=wait_fixed(1))
        def worker(s: str):  # multithreaded translation
            try:
                hash_key_paragraph = cache.deterministic_hash(
                    (s, str(self.translator))
                )
                new = cache.load_paragraph(hash_key, hash_key_paragraph)  # cache lookup
                if new is None:
                    new = self.translator.translate(s)
                    cache.write_paragraph(hash_key, hash_key_paragraph, new)
                return new
            except BaseException as e:
                if log.isEnabledFor(logging.DEBUG):
                    log.exception(e)
                else:
                    log.exception(e, exc_info=False)
                raise e
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=self.thread
        ) as executor:
            news = list(executor.map(worker, sstk))

        ############################################################
        # C. typeset the new document
        def raw_string(fcur: str, cstk: str):  # hex-encode a string for the content stream
            if fcur == 'noto':
                return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # choose the encoding width
                return "".join(["%04x" % ord(c) for c in cstk])
            else:
                return "".join(["%02x" % ord(c) for c in cstk])

        _x, _y = 0, 0
        for id, new in enumerate(news):
            x: float = pstk[id].x  # paragraph initial x
            y: float = pstk[id].y  # paragraph top edge
            x0: float = pstk[id].x0  # paragraph left edge
            x1: float = pstk[id].x1  # paragraph right edge
            size: float = pstk[id].size  # paragraph font size
            font: PDFFont = pstk[id].font  # paragraph font
            brk: bool = pstk[id].brk  # paragraph wrap flag
            cstk: str = ""  # current text buffer
            fcur: str = None  # current font id
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {font.fontname} {brk} > {sstk[id]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
                )  # match a $vn$ formula placeholder; the leading $ is sometimes dropped by the translator
                mod = 0  # glyph-modifier width
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception:
                        continue  # the translator may invent an out-of-range placeholder
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # glyph modifier
                        mod = var[vid][-1].width
                else:  # load text
                    ch = new[ptr]
                    fcur_ = None
                    # the original font's encoding is unreliable, so it is not reused
                    # try:
                    #     if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
                    #         fcur_=self.fontid[font]  # original font
                    # except:
                    #     pass
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # default non-Latin font
                    # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (  # flush the text buffer
                    fcur_ != fcur  # 1. the font changed
                    or vy_regex  # 2. a formula is being inserted
                    or x + adv > x1 + 0.1 * size  # 3. reached the right edge (a whole line may be placeholders, so allow for floating-point error)
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # reached the right edge and the source paragraph wraps
                    x = x0
                    lang_space = {"zh-CN": 1.4, "zh-TW": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
                    y -= size * lang_space.get(self.translator.lang_out, 1.1)  # 1.1 suits most other languages
                if vy_regex:  # insert the formula
                    fix = 0
                    if fcur is not None:  # in-paragraph formulas get the vertical-offset correction
                        fix = varf[vid]
                    for vch in var[vid]:  # typeset the formula's glyphs
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for l in varl[vid]:  # typeset the formula's lines
                        if l.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
                else:  # append to the text buffer
                    if not cstk:  # start of a line
                        tx = x
                        if x == x0 and ch == " ":  # drop the wrap space at the start of a line
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod  # glyph-modifier width
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # handle the tail of the paragraph
            if cstk:
                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
        for l in lstk:  # typeset the global lines
            if l.linewidth < 5:  # hack: some documents use thick lines as image backgrounds
                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops
|
pdf2zh/doclayout.py
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import abc
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
import ast
|
5 |
+
import onnx
|
6 |
+
import onnxruntime
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
|
9 |
+
|
10 |
+
class DocLayoutModel(abc.ABC):
    """Abstract interface for document-layout detection backends."""

    @staticmethod
    def load_onnx():
        """Load the DocLayout-YOLO DocStructBench ONNX model from the Hugging Face Hub."""
        model = OnnxModel.from_pretrained(
            repo_id="wybxc/DocLayout-YOLO-DocStructBench-onnx",
            filename="doclayout_yolo_docstructbench_imgsz1024.onnx",
        )
        return model

    @staticmethod
    def load_available():
        """Return the first available backend (currently ONNX only)."""
        return DocLayoutModel.load_onnx()

    @property
    @abc.abstractmethod
    def stride(self) -> int:
        """Stride of the model input."""
        pass

    @abc.abstractmethod
    def predict(self, image, imgsz=1024, **kwargs) -> list:
        """
        Predict the layout of a document page.

        Args:
            image: The image of the document page.
            imgsz: Resize the image to this size. Must be a multiple of the stride.
            **kwargs: Additional arguments.
        """
        pass
|
40 |
+
|
41 |
+
|
42 |
+
class YoloResult:
    """Detections for one image, sorted by descending confidence."""

    def __init__(self, boxes, names):
        parsed = [YoloBox(data=row) for row in boxes]
        parsed.sort(key=lambda box: box.conf, reverse=True)
        self.boxes = parsed
        self.names = names  # class-id -> class-name mapping
|
49 |
+
|
50 |
+
|
51 |
+
class YoloBox:
    """One detection row laid out as [x1, y1, x2, y2, ..., conf, cls]."""

    def __init__(self, data):
        # Coordinates come first; confidence and class id are the last two slots.
        self.xyxy = data[:4]
        self.conf, self.cls = data[-2], data[-1]
|
58 |
+
|
59 |
+
|
60 |
+
class OnnxModel(DocLayoutModel):
    """DocLayout-YOLO layout detector backed by ONNX Runtime."""

    def __init__(self, model_path: str):
        self.model_path = model_path

        model = onnx.load(model_path)
        # The exporter stores the stride and class names as model metadata.
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])

        self.model = onnxruntime.InferenceSession(model.SerializeToString())

    @staticmethod
    def from_pretrained(repo_id: str, filename: str):
        """Download *filename* from the Hub repository *repo_id* and load it."""
        pth = hf_hub_download(repo_id=repo_id, filename=filename)
        return OnnxModel(pth)

    @property
    def stride(self):
        # Input-alignment stride read from the model metadata.
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image

        Note: the padding alignment stride comes from ``self.stride``
        (model metadata), not from a parameter.
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio (preserve aspect ratio)
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image, (resized_w, resized_h), interpolation=cv2.INTER_LINEAR
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding (gray 114 per YOLO convention)
        image = cv2.copyMakeBorder(
            image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2).
            Note: *boxes* is also modified in place.
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=1024, **kwargs):
        """Run layout detection on one page image; returns ``[YoloResult]``."""
        # Preprocess input image
        orig_h, orig_w = image.shape[:2]
        pix = self.resize_and_pad_image(image, new_shape=imgsz)
        pix = np.transpose(pix, (2, 0, 1))  # CHW
        pix = np.expand_dims(pix, axis=0)  # BCHW
        pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
        new_h, new_w = pix.shape[2:]

        # Run inference
        preds = self.model.run(None, {"images": pix})[0]

        # Postprocess predictions: keep detections above 0.25 confidence,
        # then map boxes back to the original image coordinates.
        preds = preds[preds[..., 4] > 0.25]
        preds[..., :4] = self.scale_boxes(
            (new_h, new_w), preds[..., :4], (orig_h, orig_w)
        )
        return [YoloResult(boxes=preds, names=self._names)]
|
pdf2zh/gui.py
ADDED
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
from pathlib import Path
|
4 |
+
from pdf2zh import __version__
|
5 |
+
from pdf2zh.pdf2zh import extract_text
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import numpy as np
|
9 |
+
import pymupdf
|
10 |
+
import tqdm
|
11 |
+
import requests
|
12 |
+
import cgi
|
13 |
+
|
14 |
+
# Map UI service names to (pdf2zh service id, API-key env var, default model id).
# A None env var means the service needs no key; a None model means no model box.
service_map = {
    "Google": ("google", None, None),
    "DeepL": ("deepl", "DEEPL_AUTH_KEY", None),
    "DeepLX": ("deeplx", "DEEPLX_AUTH_KEY", None),
    "Ollama": ("ollama", None, "gemma2"),
    "OpenAI": ("openai", "OPENAI_API_KEY", "gpt-4o"),
    "Azure": ("azure", "AZURE_APIKEY", None),
    "Tencent": ("tencent", "TENCENT_SECRET_KEY", None),
}
# UI language label -> language code handed to the translation backend.
lang_map = {
    "Chinese": "zh",
    "English": "en",
    "French": "fr",
    "German": "de",
    "Japanese": "ja",
    "Korean": "ko",
    "Russian": "ru",
    "Spanish": "es",
    "Italian": "it",
}
# UI page-range label -> explicit list of 0-based page indices (None = all pages).
page_map = {
    "All": None,
    "First": [0],
    "First 5 pages": list(range(0, 5)),
}

# Demo mode (PDF2ZH_DEMO env var set): restrict services and page counts,
# and require reCAPTCHA verification before translating.
flag_demo = False
if os.environ.get("PDF2ZH_DEMO"):
    flag_demo = True
    service_map = {
        "Google": ("google", None, None),
    }
    page_map = {
        "First": [0],
        "First 20 pages": list(range(0, 20)),
    }
    # reCAPTCHA site key (browser side) and secret key (server side).
    # NOTE(review): assumed to belong inside the demo branch — the rendered
    # source lost indentation; confirm against upstream.
    client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
    server_key = os.environ.get("PDF2ZH_SERVER_KEY")
53 |
+
|
54 |
+
|
55 |
+
def verify_recaptcha(response):
    """Validate a reCAPTCHA token against Google's siteverify endpoint."""
    endpoint = "https://www.google.com/recaptcha/api/siteverify"

    print("reCAPTCHA", server_key, response)

    payload = {"secret": server_key, "response": response}
    outcome = requests.post(endpoint, data=payload).json()

    print("reCAPTCHA", outcome.get("success"))

    return outcome.get("success")
|
66 |
+
|
67 |
+
|
68 |
+
def pdf_preview(file):
    """Render the first page of *file* as an RGB ndarray of shape (h, w, 3)."""
    first_page = pymupdf.open(file)[0]
    pix = first_page.get_pixmap()
    return np.frombuffer(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)
|
74 |
+
|
75 |
+
|
76 |
+
def upload_file(file, service, progress=gr.Progress()):
    """Handle file upload, validation, and initial preview.

    Returns (path, preview image) on success, (None, None) when the file is
    missing or cannot be rendered.
    """
    if file and os.path.exists(file):
        try:
            # Convert first page for preview
            return file, pdf_preview(file)
        except Exception as e:
            print(f"Error converting PDF: {e}")
    return None, None
|
89 |
+
|
90 |
+
|
91 |
+
def download_with_limit(url, save_path, size_limit):
    """Stream *url* into the *save_path* directory, enforcing a byte limit.

    The file name comes from the Content-Disposition header when present,
    otherwise from the last path component of the URL.  Raises gr.Error if
    *size_limit* (bytes, may be None for unlimited) is exceeded.  Returns
    the path of the written file.
    """
    # cgi.parse_header was deprecated in Python 3.11 and removed in 3.13;
    # email.message parses the same RFC 2045-style header parameters.
    from email.message import Message

    chunk_size = 1024
    total_size = 0
    with requests.get(url, stream=True, timeout=10) as response:
        response.raise_for_status()
        content = response.headers.get("Content-Disposition")
        try:
            msg = Message()
            msg["content-disposition"] = content
            filename = msg.get_filename()
            if not filename:
                raise ValueError("no filename in Content-Disposition")
        except Exception:
            filename = os.path.basename(url)
        with open(save_path / filename, "wb") as file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                total_size += len(chunk)
                if size_limit and total_size > size_limit:
                    raise gr.Error("Exceeds file size limit")
                file.write(chunk)
    return save_path / filename
|
109 |
+
|
110 |
+
|
111 |
+
def translate(
    file_type,
    file_input,
    link_input,
    service,
    apikey,
    model_id,
    lang_from,
    lang_to,
    page_range,
    recaptcha_response,
    progress=gr.Progress(),
):
    """Translate PDF content using selected service.

    Returns (zh path, preview image, dual path, 3x gr.update(visible=True))
    for the output widgets; raises gr.Error on any validation failure.
    """
    # Demo deployments gate translation behind reCAPTCHA.
    if flag_demo and not verify_recaptcha(recaptcha_response):
        raise gr.Error("reCAPTCHA fail")

    progress(0, desc="Starting translation...")

    output = Path("pdf2zh_files")
    output.mkdir(parents=True, exist_ok=True)

    if file_type == "File":
        if not file_input:
            raise gr.Error("No input")
        # shutil.copy returns the destination path inside `output`.
        file_path = shutil.copy(file_input, output)
    else:
        if not link_input:
            raise gr.Error("No input")
        file_path = download_with_limit(
            link_input,
            output,
            5 * 1024 * 1024 if flag_demo else None,  # 5 MB cap in demo mode
        )

    filename = os.path.splitext(os.path.basename(file_path))[0]
    # NOTE(review): `filename` is computed but unused — the literal
    # "(unknown)" stems below look like template damage.  They do match the
    # output names written by pdf2zh.extract_text, so they are kept as-is;
    # confirm against the original source.
    file_en = output / f"(unknown).pdf"
    file_zh = output / f"(unknown)-zh.pdf"
    file_dual = output / f"(unknown)-dual.pdf"

    selected_service = service_map[service][0]
    if service_map[service][1]:
        # setdefault: a key already exported in the environment wins over
        # the value typed into the textbox.
        os.environ.setdefault(service_map[service][1], apikey)
    selected_page = page_map[page_range]
    lang_from = lang_map[lang_from]
    lang_to = lang_map[lang_to]
    if selected_service == "google":
        # Google Translate expects zh-CN rather than bare zh.
        lang_from = "zh-CN" if lang_from == "zh" else lang_from
        lang_to = "zh-CN" if lang_to == "zh" else lang_to

    print(f"Files before translation: {os.listdir(output)}")

    def progress_bar(t: tqdm.tqdm):
        # Bridge the backend's tqdm callback into the Gradio progress bar.
        progress(t.n / t.total, desc="Translating...")

    param = {
        "files": [file_en],
        "pages": selected_page,
        "lang_in": lang_from,
        "lang_out": lang_to,
        "service": f"{selected_service}:{model_id}",
        "output": output,
        "thread": 4,
        "callback": progress_bar,
    }
    print(param)
    extract_text(**param)
    print(f"Files after translation: {os.listdir(output)}")

    if not file_zh.exists() or not file_dual.exists():
        raise gr.Error("No output")

    try:
        translated_preview = pdf_preview(str(file_zh))
    except Exception:
        raise gr.Error("No preview")

    progress(1.0, desc="Translation complete!")

    return (
        str(file_zh),
        translated_preview,
        str(file_dual),
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(visible=True),
    )
|
198 |
+
|
199 |
+
|
200 |
+
# Global setup
# Custom Gradio theme palette (Arco-Design-style blues).
custom_blue = gr.themes.Color(
    c50="#E8F3FF",
    c100="#BEDAFF",
    c200="#94BFFF",
    c300="#6AA1FF",
    c400="#4080FF",
    c500="#165DFF",  # Primary color
    c600="#0E42D2",
    c700="#0A2BA6",
    c800="#061D79",
    c900="#03114D",
    c950="#020B33",
)

# Top-level Gradio UI.  In demo mode a reCAPTCHA script is injected via
# `head`; the token is delivered through the hidden `recaptcha_response`
# textbox by the `onVerify` JS callback.
# NOTE(review): exact whitespace inside the css/head string literals could
# not be recovered from the rendered source — confirm against upstream.
with gr.Blocks(
    title="PDFMathTranslate - PDF Translation with preserved formats",
    theme=gr.themes.Default(
        primary_hue=custom_blue, spacing_size="md", radius_size="lg"
    ),
    css="""
    .secondary-text {color: #999 !important;}
    footer {visibility: hidden}
    .env-warning {color: #dd5500 !important;}
    .env-success {color: #559900 !important;}

    /* Add dashed border to input-file class */
    .input-file {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        # background-color: #ffffff !important;
        transition: background-color 0.4s ease-out;
    }

    .input-file:hover {
        border: 1.2px dashed #165DFF !important;
        border-radius: 6px !important;
        color: #165DFF !important;
        background-color: #E8F3FF !important;
        transition: background-color 0.2s ease-in;
    }

    .progress-bar-wrap {
        border-radius: 8px !important;
    }
    .progress-bar {
        border-radius: 8px !important;
    }

    # .input-file label {
    #     color: #165DFF !important;
    #     border: 1.2px dashed #165DFF !important;
    #     border-left: none !important;
    #     border-top: none !important;
    # }
    # .input-file .wrap {
    #     color: #165DFF !important;
    # }
    # .input-file .or {
    #     color: #165DFF !important;
    # }
    """,
    head=(
        """
        <script src="https://www.google.com/recaptcha/api.js?render=explicit" async defer></script>
        <script type="text/javascript">
        var onVerify = function(token) {
            el=document.getElementById('verify').getElementsByTagName('textarea')[0];
            el.value=token;
            el.dispatchEvent(new Event('input'));
        };
        </script>
        """
        if flag_demo
        else ""
    ),
) as demo:
    gr.Markdown(
        "# [PDFMathTranslate @ GitHub](https://github.com/Byaidu/PDFMathTranslate)"
    )

    with gr.Row():
        # Left column: inputs and options.
        with gr.Column(scale=1):
            gr.Markdown("## File | < 5 MB" if flag_demo else "## File")
            file_type = gr.Radio(
                choices=["File", "Link"],
                label="Type",
                value="File",
            )
            file_input = gr.File(
                label="File",
                file_count="single",
                file_types=[".pdf"],
                type="filepath",
                elem_classes=["input-file"],
            )
            link_input = gr.Textbox(
                label="Link",
                visible=False,
                interactive=True,
            )
            gr.Markdown("## Option")
            with gr.Row():
                service = gr.Dropdown(
                    label="Service",
                    choices=service_map.keys(),
                    value="Google",
                )
                apikey = gr.Textbox(
                    label="API Key",
                    max_lines=1,
                    visible=False,
                )
            with gr.Row():
                lang_from = gr.Dropdown(
                    label="Translate from",
                    choices=lang_map.keys(),
                    value="English",
                )
                lang_to = gr.Dropdown(
                    label="Translate to",
                    choices=lang_map.keys(),
                    value="Chinese",
                )
            page_range = gr.Radio(
                choices=page_map.keys(),
                label="Pages",
                value=list(page_map.keys())[0],
            )
            model_id = gr.Textbox(
                label="Model ID",
                visible=False,
                interactive=True,
            )
            # Default status HTML shown in the "Technical details" panel.
            envs_status = "<span class='env-success'>- Properly configured.</span><br>"

            def details_wrapper(text_markdown):
                # Wrap a status fragment with the static project/version info.
                text = f"""
                <summary>Technical details</summary>
                {text_markdown}
                - GitHub: <a href="https://github.com/Byaidu/PDFMathTranslate">Byaidu/PDFMathTranslate</a><br>
                - GUI by: <a href="https://github.com/reycn">Rongxin</a><br>
                - Version: {__version__}
                """
                return text

            def env_var_checker(env_var_name: str) -> str:
                # Build the status HTML for the selected service's key env var.
                if env_var_name:
                    if not os.environ.get(env_var_name):
                        envs_status = (
                            f"<span class='env-warning'>- Warning: environmental not found or error ({env_var_name})."
                            + "</span><br>- Please make sure that the environment variables are properly configured "
                            + "(<a href='https://github.com/Byaidu/PDFMathTranslate'>guide</a>).<br>"
                        )
                    else:
                        # Show only a prefix of the configured secret.
                        value = str(os.environ.get(env_var_name))
                        envs_status = "<span class='env-success'>- Properly configured.</span><br>"
                        envs_status += (
                            f"- {env_var_name}: <code>{value[:13]}***</code><br>"
                        )
                else:
                    envs_status = (
                        "<span class='env-success'>- Properly configured.</span><br>"
                    )
                return details_wrapper(envs_status)

            def on_select_service(service, evt: gr.EventData):
                # Toggle the API-key box (pre-filled from the environment) and
                # the model-id box depending on the selected service.
                if service_map[service][1]:
                    apikey_content = gr.update(
                        visible=True, value=os.environ.get(service_map[service][1])
                    )
                else:
                    apikey_content = gr.update(visible=False)
                if service_map[service][2]:
                    model_visibility = gr.update(
                        visible=True, value=service_map[service][2]
                    )
                else:
                    model_visibility = gr.update(visible=False)
                return (
                    env_var_checker(service_map[service][1]),
                    model_visibility,
                    apikey_content,
                )

            def on_select_filetype(file_type):
                # Show exactly one of the File / Link inputs.
                return (
                    gr.update(visible=file_type == "File"),
                    gr.update(visible=file_type == "Link"),
                )

            # Output widgets are hidden until a translation succeeds.
            output_title = gr.Markdown("## Translated", visible=False)
            output_file = gr.File(label="Download Translation", visible=False)
            output_file_dual = gr.File(
                label="Download Translation (Dual)", visible=False
            )
            # Hidden textbox filled by the reCAPTCHA JS callback (demo mode).
            recaptcha_response = gr.Textbox(
                label="reCAPTCHA Response", elem_id="verify", visible=False
            )
            recaptcha_box = gr.HTML('<div id="recaptcha-box"></div>')
            translate_btn = gr.Button("Translate", variant="primary")
            tech_details_tog = gr.Markdown(
                details_wrapper(envs_status),
                elem_classes=["secondary-text"],
            )
            service.select(
                on_select_service, service, [tech_details_tog, model_id, apikey]
            )
            file_type.select(
                on_select_filetype,
                file_type,
                [file_input, link_input],
                js=(
                    f"""
                    (a,b)=>{{
                        try{{
                            grecaptcha.render('recaptcha-box',{{
                                'sitekey':'{client_key}',
                                'callback':'onVerify'
                            }});
                        }}catch(error){{}}
                        return [a];
                    }}
                    """
                    if flag_demo
                    else ""
                ),
            )

        # Right column: preview of the first page.
        with gr.Column(scale=2):
            gr.Markdown("## Preview")
            preview = gr.Image(label="Document Preview", visible=True)

    # Event handlers
    file_input.upload(
        upload_file,
        inputs=[file_input, service],
        outputs=[file_input, preview],
        js=(
            f"""
            (a,b)=>{{
                try{{
                    grecaptcha.render('recaptcha-box',{{
                        'sitekey':'{client_key}',
                        'callback':'onVerify'
                    }});
                }}catch(error){{}}
                return [a];
            }}
            """
            if flag_demo
            else ""
        ),
    )

    translate_btn.click(
        translate,
        inputs=[
            file_type,
            file_input,
            link_input,
            service,
            apikey,
            model_id,
            lang_from,
            lang_to,
            page_range,
            recaptcha_response,
        ],
        outputs=[
            output_file,
            preview,
            output_file_dual,
            output_file,
            output_file_dual,
            output_title,
        ],
    ).then(lambda: None, js="()=>{grecaptcha.reset()}" if flag_demo else "")
|
478 |
+
|
479 |
+
|
480 |
+
def setup_gui(share=False):
    """Launch the Gradio app, falling back to narrower bind addresses on failure."""
    if flag_demo:
        demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
        return
    try:
        demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
        return
    except Exception:
        print(
            "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
        )
    try:
        demo.launch(
            server_name="127.0.0.1", debug=True, inbrowser=True, share=share
        )
    except Exception:
        print(
            "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
        )
        # Last resort: let Gradio pick the address and force a share link.
        demo.launch(debug=True, inbrowser=True, share=True)
|
499 |
+
|
500 |
+
|
501 |
+
# For auto-reloading while developing
if __name__ == "__main__":
    setup_gui()
|
pdf2zh/high_level.py
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Functions that can be used for the most common use-cases for pdf2zh.six"""
|
2 |
+
|
3 |
+
from typing import BinaryIO
|
4 |
+
import numpy as np
|
5 |
+
import tqdm
|
6 |
+
from pymupdf import Document
|
7 |
+
from pdfminer.pdfpage import PDFPage
|
8 |
+
from pdfminer.pdfinterp import PDFResourceManager
|
9 |
+
from pdfminer.pdfdocument import PDFDocument
|
10 |
+
from pdfminer.pdfparser import PDFParser
|
11 |
+
from pdf2zh.converter import TranslateConverter
|
12 |
+
from pdf2zh.pdfinterp import PDFPageInterpreterEx
|
13 |
+
from pymupdf import Font
|
14 |
+
|
15 |
+
|
16 |
+
def extract_text_to_fp(
    inf: BinaryIO,
    pages=None,
    password: str = "",
    debug: bool = False,
    page_count: int = 0,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_en: Document = None,
    model=None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    resfont: str = "",
    noto: Font = None,
    callback: object = None,
    **kwarg,
) -> dict:
    """Run layout analysis + translation over the PDF in *inf*.

    For each selected page, a layout model is applied to a rasterized render
    of the page to mask regions that must not be translated (figures, tables,
    formulas, ...); the resulting mask is shared with the converter via
    ``layout``.  Returns ``obj_patch``, a mapping of xref -> rewritten
    content-stream text that the caller applies to the document.

    NOTE: the original annotated ``-> None`` but has always returned
    ``obj_patch``; the annotation is corrected here (annotation-only change).
    """
    rsrcmgr = PDFResourceManager()
    layout = {}
    device = TranslateConverter(
        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
    )

    assert device is not None
    obj_patch = {}
    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
    if pages:
        total_pages = len(pages)
    else:
        total_pages = page_count

    parser = PDFParser(inf)
    doc = PDFDocument(parser, password=password)
    with tqdm.tqdm(
        enumerate(PDFPage.create_pages(doc)),
        total=total_pages,
    ) as progress:
        for pageno, page in progress:
            if pages and (pageno not in pages):
                continue
            if callback:
                callback(progress)
            page.pageno = pageno
            pix = doc_en[page.pageno].get_pixmap()
            # BUG FIX: np.fromstring was deprecated for binary input and is
            # removed in NumPy 2.0; np.frombuffer is the exact replacement
            # (and what gui.pdf_preview already uses).
            image = np.frombuffer(pix.samples, np.uint8).reshape(
                pix.height, pix.width, 3
            )[:, :, ::-1]  # RGB -> BGR for the layout model
            page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]
            # Instead of a kd-tree, rasterize the boxes into a bitmap and
            # trade memory for lookup speed.
            box = np.ones((pix.height, pix.width))
            h, w = box.shape
            # Classes whose regions must be left untranslated.
            vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
            # First pass: mark translatable boxes with a unique id (i + 2).
            for i, d in enumerate(page_layout.boxes):
                if not page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = i + 2
            # Second pass: zero out protected regions so they override overlaps.
            for i, d in enumerate(page_layout.boxes):
                if page_layout.names[int(d.cls)] in vcls:
                    x0, y0, x1, y1 = d.xyxy.squeeze()
                    x0, y0, x1, y1 = (
                        np.clip(int(x0 - 1), 0, w - 1),
                        np.clip(int(h - y1 - 1), 0, h - 1),
                        np.clip(int(x1 + 1), 0, w - 1),
                        np.clip(int(h - y0 + 1), 0, h - 1),
                    )
                    box[y0:y1, x0:x1] = 0
            layout[page.pageno] = box
            # Allocate a fresh xref to hold the rewritten content stream
            # (hack: attach the new xref to the page object).
            page.page_xref = doc_en.get_new_xref()
            doc_en.update_object(page.page_xref, "<<>>")
            doc_en.update_stream(page.page_xref, b"")
            doc_en[page.pageno].set_contents(page.page_xref)
            interpreter.process_page(page)

    device.close()
    return obj_patch
|
pdf2zh/pdf2zh.py
ADDED
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""A command line tool for extracting text and images from PDF and
|
3 |
+
output it to plain text, html, xml or tags.
|
4 |
+
"""
|
5 |
+
|
6 |
+
from __future__ import annotations
|
7 |
+
|
8 |
+
import argparse
|
9 |
+
import os
|
10 |
+
import sys
|
11 |
+
import logging
|
12 |
+
from pathlib import Path
|
13 |
+
from typing import Any, Container, Iterable, List, Optional
|
14 |
+
import urllib.request
|
15 |
+
from pdfminer.pdfexceptions import PDFValueError
|
16 |
+
|
17 |
+
import pymupdf
|
18 |
+
import requests
|
19 |
+
import tempfile
|
20 |
+
|
21 |
+
from pdf2zh import __version__, log
|
22 |
+
from pdf2zh.high_level import extract_text_to_fp
|
23 |
+
from pdf2zh.doclayout import DocLayoutModel
|
24 |
+
|
25 |
+
logging.basicConfig()

# Layout-analysis model used to mask figures/tables/formulas before translation.
model = DocLayoutModel.load_available()

# Target-language code -> CJK font name bundled with pymupdf.
resfont_map = {
    "zh-CN": "china-ss",
    "zh-TW": "china-ts",
    "ja": "japan-s",
    "ko": "korea-s",
}
# Languages rendered with the downloadable Go Noto Universal fallback font
# (CJK languages are handled by resfont_map above instead).
noto_list = [
    "am",  # Amharic
    "ar",  # Arabic
    "bn",  # Bengali
    "bg",  # Bulgarian
    "chr",  # Cherokee
    "el",  # Greek
    "gu",  # Gujarati
    "iw",  # Hebrew
    "hi",  # Hindi
    # "ja", # Japanese
    "kn",  # Kannada
    # "ko", # Korean
    "ml",  # Malayalam
    "mr",  # Marathi
    "ru",  # Russian
    "sr",  # Serbian
    # "zh-CN",# Chinese (PRC)
    "ta",  # Tamil
    "te",  # Telugu
    "th",  # Thai
    # "zh-TW",# Chinese (Taiwan)
    "ur",  # Urdu
    "uk",  # Ukrainian
]
|
60 |
+
|
61 |
+
|
62 |
+
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    URLs (http/https) are skipped: they are downloaded later rather than
    read from the local filesystem.
    """
    local_paths = [
        path
        for path in files
        if not path.startswith(("http://", "https://"))
    ]
    return [path for path in local_paths if not os.path.exists(path)]
|
71 |
+
|
72 |
+
|
73 |
+
def extract_text(
    files: Iterable[str] = [],
    pages: Optional[Container[int]] = None,
    password: str = "",
    debug: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
):
    """Translate each PDF in *files*, writing `-zh` and `-dual` PDFs to *output*.

    URLs are downloaded to ./pdf2zh_files first.  A font capable of the
    target language is embedded into every page, the pages are run through
    the layout+translation pipeline (extract_text_to_fp), and the patched
    content streams are written back.

    Raises PDFValueError when no files are given or a download fails.
    """
    if debug:
        log.setLevel(logging.DEBUG)

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    for file in files:
        # BUG FIX: the original tested `file is str`, which compares against
        # the *type object* and is always False, so URLs were never
        # downloaded.  isinstance() is the intended check.
        if isinstance(file, str) and (
            file.startswith("http://") or file.startswith("https://")
        ):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        # BUG FIX: os.mkdir(os.path.dirname("./pdf2zh_files"))
                        # resolved to mkdir(".") and raised FileExistsError.
                        os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                )
        filename = os.path.splitext(os.path.basename(file))[0]
        # NOTE(review): `filename` is unused below — the literal "(unknown)"
        # output stems look like template damage, but gui.translate looks for
        # these exact names, so they are kept unchanged here for coherence.

        # Fonts to embed: "tiro" (Latin) plus one target-language-capable font.
        font_list = [("tiro", None)]
        noto = None
        if lang_out in resfont_map:  # CJK: use a pymupdf built-in font
            resfont = resfont_map[lang_out]
            font_list.append((resfont, None))
        elif lang_out in noto_list:  # non-CJK scripts: Go Noto Universal
            resfont = "noto"
            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
            if not os.path.exists(ttf_path):
                print("Downloading Noto font...")
                urllib.request.urlretrieve(
                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                    ttf_path,
                )
            font_list.append(("noto", ttf_path))
            noto = pymupdf.Font("noto", ttf_path)
        else:  # fallback
            resfont = "china-ss"
            font_list.append(("china-ss", None))

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font[0]] = page.insert_font(font[0], font[1])
        # Register the fonts in every resource dictionary (pages and XObjects)
        # that does not already reference them.
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # resources may live on an XObject
                try:  # xref reads/writes may fail on malformed objects
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font[0]}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref,
                                    f"{label}Font/{font[0]}",
                                    f"{font_id[font[0]]} 0 R",
                                )
                except Exception:
                    pass
        doc_en.save(Path(output) / f"(unknown)-en.pdf")

        with open(Path(output) / f"(unknown)-en.pdf", "rb") as fp:
            # locals() forwards pages/password/debug/vfont/... in one shot.
            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())

        # Apply the rewritten content streams produced by the converter.
        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        # Build the dual (interleaved original/translated) document.
        doc_zh = doc_en
        doc_dual = pymupdf.open(Path(output) / f"(unknown)-en.pdf")
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"(unknown)-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"(unknown)-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"(unknown)-en.pdf")

    return
|
184 |
+
|
185 |
+
|
186 |
+
def create_parser() -> argparse.ArgumentParser:
    """Build the pdf2zh command-line interface."""
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files", type=str, default=None, nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version", "-v", action="version", version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug", "-d", default=False, action="store_true",
        help="Use debug logging level.",
    )
    group = parser.add_argument_group(
        "Parser", description="Used during PDF parsing",
    )
    group.add_argument(
        "--pages", "-p", type=str,
        help="The list of page numbers to parse.",
    )
    group.add_argument(
        "--password", "-P", type=str, default="",
        help="The password to use for decrypting PDF file.",
    )
    group.add_argument(
        "--vfont", "-f", type=str, default="",
        help="The regex to math font name of formula.",
    )
    group.add_argument(
        "--vchar", "-c", type=str, default="",
        help="The regex to math character of formula.",
    )
    group.add_argument(
        "--lang-in", "-li", type=str, default="auto",
        help="The code of source language.",
    )
    group.add_argument(
        "--lang-out", "-lo", type=str, default="auto",
        help="The code of target language.",
    )
    group.add_argument(
        "--service", "-s", type=str, default="google",
        help="The service to use for translation.",
    )
    group.add_argument(
        "--output", "-o", type=str, default="",
        help="Output directory for files.",
    )
    group.add_argument(
        "--thread", "-t", type=int, default=4,
        help="The number of threads to execute translation.",
    )
    group.add_argument(
        "--interactive", "-i", action="store_true",
        help="Interact with GUI.",
    )
    group.add_argument(
        "--share", action="store_true",
        help="Enable Gradio Share",
    )
    return parser
|
287 |
+
|
288 |
+
|
289 |
+
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse CLI arguments, expanding --pages "2-4,7" into 0-based indices."""
    ns = create_parser().parse_args(args=args)

    if ns.pages:
        expanded = []
        for token in ns.pages.split(","):
            if "-" in token:
                # 1-based inclusive range, e.g. "2-4" -> [1, 2, 3]
                first, last = token.split("-")
                expanded.extend(range(int(first) - 1, int(last)))
            else:
                expanded.append(int(token) - 1)
        ns.pages = expanded

    return ns
|
303 |
+
|
304 |
+
|
305 |
+
def main(args: Optional[List[str]] = None) -> int:
    """CLI entry point: validate inputs, then translate or launch the GUI."""
    parsed_args = parse_args(args)

    missing = check_files(parsed_args.files)
    if missing:
        # Report every missing local file before bailing out.
        print("The following files do not exist:", file=sys.stderr)
        for path in missing:
            print(f" {path}", file=sys.stderr)
        return -1

    if parsed_args.interactive:
        # Imported lazily so the CLI works without the GUI dependencies.
        from pdf2zh.gui import setup_gui

        setup_gui(parsed_args.share)
        return 0

    extract_text(**vars(parsed_args))
    return 0
|
322 |
+
|
323 |
+
|
324 |
+
# Direct-execution entry point; exit status mirrors main()'s return value.
if __name__ == "__main__":
    sys.exit(main())
|
pdf2zh/pdfinterp.py
ADDED
@@ -0,0 +1,360 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from typing import Any, Dict, Optional, Sequence, Tuple, cast
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from pdfminer import settings
|
6 |
+
from pdfminer.pdfcolor import PREDEFINED_COLORSPACE, PDFColorSpace
|
7 |
+
from pdfminer.pdfdevice import PDFDevice
|
8 |
+
from pdfminer.pdfinterp import (
|
9 |
+
PDFPageInterpreter,
|
10 |
+
PDFResourceManager,
|
11 |
+
PDFContentParser,
|
12 |
+
PDFInterpreterError,
|
13 |
+
Color,
|
14 |
+
PDFStackT,
|
15 |
+
LITERAL_FORM,
|
16 |
+
LITERAL_IMAGE,
|
17 |
+
)
|
18 |
+
from pdfminer.pdffont import PDFFont
|
19 |
+
from pdfminer.pdfpage import PDFPage
|
20 |
+
from pdfminer.pdftypes import (
|
21 |
+
PDFObjRef,
|
22 |
+
dict_value,
|
23 |
+
list_value,
|
24 |
+
resolve1,
|
25 |
+
stream_value,
|
26 |
+
)
|
27 |
+
from pdfminer.psexceptions import PSEOF
|
28 |
+
from pdfminer.psparser import (
|
29 |
+
PSKeyword,
|
30 |
+
keyword_name,
|
31 |
+
literal_name,
|
32 |
+
)
|
33 |
+
from pdfminer.utils import (
|
34 |
+
MATRIX_IDENTITY,
|
35 |
+
Matrix,
|
36 |
+
Rect,
|
37 |
+
mult_matrix,
|
38 |
+
apply_matrix_pt,
|
39 |
+
)
|
40 |
+
|
41 |
+
log = logging.getLogger(__name__)
|
42 |
+
|
43 |
+
|
44 |
+
def safe_float(o: Any) -> Optional[float]:
    """Convert *o* to ``float``, returning ``None`` when conversion fails."""
    try:
        value = float(o)
    except (TypeError, ValueError):
        return None
    return value
|
49 |
+
|
50 |
+
|
51 |
+
class PDFPageInterpreterEx(PDFPageInterpreter):
    """Processor for the content of a PDF page

    Reference: PDF Reference, Appendix A, Operator Summary

    Extends pdfminer's interpreter: besides driving ``device``, it rebuilds
    a patched content-stream string for every page/Form XObject it renders
    and stores it in ``obj_patch`` keyed by the PDF object id.
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
    ) -> None:
        # obj_patch maps PDF object id -> replacement content-stream text;
        # it is shared with the caller, which writes the patches back to the PDF.
        self.rsrcmgr = rsrcmgr
        self.device = device
        self.obj_patch = obj_patch

    def dup(self) -> "PDFPageInterpreterEx":
        # Fresh interpreter sharing the same manager/device/patch dict,
        # used when recursing into Form XObjects.
        return self.__class__(self.rsrcmgr, self.device, self.obj_patch)

    def init_resources(self, resources: Dict[object, object]) -> None:
        # Overridden: additionally records resource font ids and zeroes each
        # font's descent (hack so translated text baselines line up).
        """Prepare the fonts and XObjects listed in the Resource attribute."""
        self.resources = resources
        self.fontmap: Dict[object, PDFFont] = {}
        # Reverse map: font object -> resource name, consumed by the device.
        self.fontid: Dict[PDFFont, object] = {}
        self.xobjmap = {}
        self.csmap: Dict[str, PDFColorSpace] = PREDEFINED_COLORSPACE.copy()
        if not resources:
            return

        def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
            # Resolve a colorspace spec (name or array form) to a PDFColorSpace.
            if isinstance(spec, list):
                name = literal_name(spec[0])
            else:
                name = literal_name(spec)
            if name == "ICCBased" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, stream_value(spec[1])["N"])
            elif name == "DeviceN" and isinstance(spec, list) and len(spec) >= 2:
                return PDFColorSpace(name, len(list_value(spec[1])))
            else:
                return PREDEFINED_COLORSPACE.get(name)

        for k, v in dict_value(resources).items():
            # log.debug("Resource: %r: %r", k, v)
            if k == "Font":
                for fontid, spec in dict_value(v).items():
                    objid = None
                    if isinstance(spec, PDFObjRef):
                        objid = spec.objid
                        spec = dict_value(spec)
                    self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
                    self.fontmap[fontid].descent = 0  # hack fix descent
                    self.fontid[self.fontmap[fontid]] = fontid
            elif k == "ColorSpace":
                for csid, spec in dict_value(v).items():
                    colorspace = get_colorspace(resolve1(spec))
                    if colorspace is not None:
                        self.csmap[csid] = colorspace
            elif k == "ProcSet":
                self.rsrcmgr.get_procset(list_value(v))
            elif k == "XObject":
                for xobjid, xobjstrm in dict_value(v).items():
                    self.xobjmap[xobjid] = xobjstrm

    def do_S(self) -> None:
        # Overridden: filters out strokes that do not look like formula rules.
        """Stroke path"""

        def is_black(color: Color) -> bool:
            # Treat both scalar gray 0 and all-zero tuples as black.
            if isinstance(color, Tuple):
                return sum(color) == 0
            else:
                return color == 0

        if (
            len(self.curpath) == 2
            and self.curpath[0][0] == "m"
            and self.curpath[1][0] == "l"
            and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
            == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
            and is_black(self.graphicstate.scolor)
        ):  # standalone straight line, horizontal, black
            # print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
            self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
            self.curpath = []
            # NOTE: returning "n" makes execute() re-emit a no-op "n" instead
            # of the original "S", dropping the stroke from the output stream.
            return "n"
        else:
            self.curpath = []

    ############################################################
    # Overridden: drop non-formula fill/stroke painting (F/B family).
    def do_f(self) -> None:
        """Fill path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, False, True, False, self.curpath)
        self.curpath = []

    def do_F(self) -> None:
        """Fill path using nonzero winding number rule (obsolete)"""

    def do_f_a(self) -> None:
        """Fill path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, False, True, True, self.curpath)
        self.curpath = []

    def do_B(self) -> None:
        """Fill and stroke path using nonzero winding number rule"""
        # self.device.paint_path(self.graphicstate, True, True, False, self.curpath)
        self.curpath = []

    def do_B_a(self) -> None:
        """Fill and stroke path using even-odd rule"""
        # self.device.paint_path(self.graphicstate, True, True, True, self.curpath)
        self.curpath = []

    ############################################################
    # Overridden: return the operand list (SCN family) so execute() can
    # re-emit the colour operators into the patched stream.
    def do_SCN(self) -> None:
        """Set color for stroking operations."""
        if self.scs:
            n = self.scs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.scolor = cast(Color, args)
        return args

    def do_scn(self) -> None:
        """Set color for nonstroking operations"""
        if self.ncs:
            n = self.ncs.ncomponents
        else:
            if settings.STRICT:
                raise PDFInterpreterError("No colorspace specified!")
            n = 1
        args = self.pop(n)
        self.graphicstate.ncolor = cast(Color, args)
        return args

    def do_SC(self) -> None:
        """Set color for stroking operations"""
        return self.do_SCN()

    def do_sc(self) -> None:
        """Set color for nonstroking operations"""
        return self.do_scn()

    def do_Do(self, xobjid_arg: PDFStackT) -> None:
        # Overridden: records the Form XObject's patched stream in obj_patch.
        """Invoke named XObject"""
        xobjid = literal_name(xobjid_arg)
        try:
            xobj = stream_value(self.xobjmap[xobjid])
        except KeyError:
            if settings.STRICT:
                raise PDFInterpreterError("Undefined xobject id: %r" % xobjid)
            return
        # log.debug("Processing xobj: %r", xobj)
        subtype = xobj.get("Subtype")
        if subtype is LITERAL_FORM and "BBox" in xobj:
            interpreter = self.dup()
            bbox = cast(Rect, list_value(xobj["BBox"]))
            matrix = cast(Matrix, list_value(xobj.get("Matrix", MATRIX_IDENTITY)))
            # According to PDF reference 1.7 section 4.9.1, XObjects in
            # earlier PDFs (prior to v1.2) use the page's Resources entry
            # instead of having their own Resources entry.
            xobjres = xobj.get("Resources")
            if xobjres:
                resources = dict_value(xobjres)
            else:
                resources = self.resources.copy()
            self.device.begin_figure(xobjid, bbox, matrix)
            ctm = mult_matrix(matrix, self.ctm)
            ops_base = interpreter.render_contents(
                resources,
                [xobj],
                ctm=ctm,
            )
            try:  # sometimes the form's fonts cannot be attached and this would blow up
                self.device.fontid = interpreter.fontid
                self.device.fontmap = interpreter.fontmap
                ops_new = self.device.end_figure(xobjid)
                # Invert the CTM so the regenerated ops (device space) can be
                # prefixed with a cm mapping them back into form space.
                ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
                pos_inv = -np.mat(ctm[4:]) * ctm_inv
                a, b, c, d = ctm_inv.reshape(4).tolist()
                e, f = pos_inv.tolist()[0]
                self.obj_patch[self.xobjmap[xobjid].objid] = (
                    f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
                )
            except Exception:
                pass
        elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
            self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
            self.device.render_image(xobjid, xobj)
            self.device.end_figure(xobjid)
        else:
            # unsupported xobject type.
            pass

    def process_page(self, page: PDFPage) -> None:
        # Overridden: records the page's patched content stream in obj_patch.
        # log.debug("Processing page: %r", page)
        # print(page.mediabox,page.cropbox)
        # (x0, y0, x1, y1) = page.mediabox
        (x0, y0, x1, y1) = page.cropbox
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        self.device.begin_page(page, ctm)
        ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.fontid = self.fontid
        self.device.fontmap = self.fontmap
        ops_new = self.device.end_page(page)
        # Rendering above subtracted the cropbox offset to obtain real
        # coordinates; emit a cm here to add the page offset back.
        self.obj_patch[page.page_xref] = (
            f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}"  # ops_base may contain images; the text in ops_new must be painted on top, so reset the position matrix with q/Q
        )
        for obj in page.contents:
            self.obj_patch[obj.objid] = ""

    def render_contents(
        self,
        resources: Dict[object, object],
        streams: Sequence[object],
        ctm: Matrix = MATRIX_IDENTITY,
    ) -> None:
        # Overridden: returns the rebuilt instruction stream.
        """Render the content streams.

        This method may be called recursively.
        """
        # log.debug(
        #     "render_contents: resources=%r, streams=%r, ctm=%r",
        #     resources,
        #     streams,
        #     ctm,
        # )
        self.init_resources(resources)
        self.init_state(ctm)
        return self.execute(list_value(streams))

    def execute(self, streams: Sequence[object]) -> None:
        # Overridden: interprets the streams as usual while re-serialising the
        # operators that should be kept into a new content-stream string.
        ops = ""
        try:
            parser = PDFContentParser(streams)
        except PSEOF:
            # empty page
            return
        while True:
            try:
                (_, obj) = parser.nextobject()
            except PSEOF:
                break
            if isinstance(obj, PSKeyword):
                name = keyword_name(obj)
                method = "do_%s" % name.replace("*", "_a").replace('"', "_w").replace(
                    "'",
                    "_q",
                )
                if hasattr(self, method):
                    func = getattr(self, method)
                    nargs = func.__code__.co_argcount - 1
                    if nargs:
                        args = self.pop(nargs)
                        # log.debug("exec: %s %r", name, args)
                        if len(args) == nargs:
                            func(*args)
                        if not (
                            name[0] == "T"
                            or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
                        ):  # Filter T-family text operators; EI takes an obj operand so it must be filtered too (only used in a few documents for drawing rules); also filter the marked-content operators.
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in args
                                ]
                            )
                            ops += f"{p} {name} "
                    else:
                        # log.debug("exec: %s", name)
                        targs = func()
                        if targs is None:
                            targs = []
                        if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
                            p = " ".join(
                                [
                                    (
                                        f"{x:f}"
                                        if isinstance(x, float)
                                        else str(x).replace("'", "")
                                    )
                                    for x in targs
                                ]
                            )
                            ops += f"{p} {name} "
                elif settings.STRICT:
                    error_msg = "Unknown operator: %r" % name
                    raise PDFInterpreterError(error_msg)
            else:
                self.push(obj)
        # print('REV DATA',ops)
        return ops
|
pdf2zh/translator.py
ADDED
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import hashlib
|
2 |
+
import hmac
|
3 |
+
import html
|
4 |
+
import logging
|
5 |
+
import os
|
6 |
+
import re
|
7 |
+
import time
|
8 |
+
from datetime import timezone, datetime
|
9 |
+
|
10 |
+
from json import dumps, loads
|
11 |
+
import unicodedata
|
12 |
+
|
13 |
+
import deepl
|
14 |
+
import ollama
|
15 |
+
import openai
|
16 |
+
import requests
|
17 |
+
from azure.ai.translation.text import TextTranslationClient
|
18 |
+
from azure.core.credentials import AzureKeyCredential
|
19 |
+
|
20 |
+
|
21 |
+
def remove_control_characters(s):
    """Return *s* with all Unicode control characters (category ``C*``) removed."""
    kept = [ch for ch in s if not unicodedata.category(ch).startswith("C")]
    return "".join(kept)
|
23 |
+
|
24 |
+
|
25 |
+
class BaseTranslator:
    """Abstract base for translation services.

    Concrete subclasses implement :meth:`translate`; this class only stores
    the service name, the language pair and an optional model identifier.
    """

    def __init__(self, service, lang_out, lang_in, model):
        # Backend identifier (e.g. "google", "deepl").
        self.service = service
        # Target / source language codes, already resolved from "auto".
        self.lang_out = lang_out
        self.lang_in = lang_in
        # Model name; only meaningful for LLM-backed services.
        self.model = model

    def translate(self, text) -> str: ...  # noqa: E704

    def __str__(self):
        return f"{self.service} {self.lang_out} {self.lang_in}"
|
36 |
+
|
37 |
+
|
38 |
+
class GoogleTranslator(BaseTranslator):
    """Translator backed by the lightweight Google Translate mobile page."""

    def __init__(self, service, lang_out, lang_in, model):
        # Resolve "auto" to the project defaults (English -> Simplified Chinese).
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate *text*; raises ValueError when no result can be scraped."""
        text = text[:5000]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
        )
        # The mobile page wraps the translation in one of these two classes.
        matches = re.findall(
            r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
        )
        if response.status_code == 400:
            result = "IRREPARABLE TRANSLATION ERROR"
        elif not matches:
            raise ValueError("Empty translation result")
        else:
            result = html.unescape(matches[0])
        return remove_control_characters(result)
|
66 |
+
|
67 |
+
|
68 |
+
class TencentTranslator(BaseTranslator):
    """Translator backed by Tencent Cloud TMT (TextTranslate action).

    Each request is signed with Tencent's TC3-HMAC-SHA256 scheme; see the
    Tencent Cloud API signature v3 documentation for the canonical-request
    layout reproduced in :meth:`translate`.
    """

    def sign(self, key, msg):
        """Return the HMAC-SHA256 digest of *msg* (str) keyed with *key* (bytes)."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        try:
            server_url = "tmt.tencentcloudapi.com"
            # BUGFIX: os.getenv never raises KeyError (it returns None), so
            # the except clause below was dead and missing credentials were
            # silently used as the string "None" when signing. os.environ[...]
            # raises KeyError as the handler expects.
            self.secret_id = os.environ["TENCENT_SECRET_ID"]
            self.secret_key = os.environ["TENCENT_SECRET_KEY"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.session = requests.Session()
        self.base_link = f"{server_url}"

    def translate(self, text):
        """Translate *text*; returns "" when the response has no TargetText.

        Raises ValueError on a non-200 HTTP status.
        """
        text = text[:5000]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        # The payload hash must be computed over the exact JSON that
        # requests will send (json= uses json.dumps with default separators).
        payloadx = dumps(data)
        hashed_request_payload = hashlib.sha256(payloadx.encode("utf-8")).hexdigest()
        canonical_request = (
            "POST"
            + "\n"
            + "/"
            + "\n"
            + ""
            + "\n"
            + "content-type:application/json; charset=utf-8\nhost:tmt.tencentcloudapi.com\nx-tc-action:texttranslate\n"
            + "\n"
            + "content-type;host;x-tc-action"
            + "\n"
            + hashed_request_payload
        )

        timestamp = int(time.time())
        date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        hashed_canonical_request = hashlib.sha256(
            canonical_request.encode("utf-8")
        ).hexdigest()
        algorithm = "TC3-HMAC-SHA256"
        string_to_sign = (
            algorithm
            + "\n"
            + str(timestamp)
            + "\n"
            + credential_scope
            + "\n"
            + hashed_canonical_request
        )
        # Derive the signing key: secret -> date -> service -> "tc3_request".
        secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signed_headers = "content-type;host;x-tc-action"
        signature = hmac.new(
            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
        ).hexdigest()
        authorization = (
            algorithm
            + " "
            + "Credential="
            + str(self.secret_id)
            + "/"
            + credential_scope
            + ", "
            + "SignedHeaders="
            + signed_headers
            + ", "
            + "Signature="
            + signature
        )
        self.headers = {
            "Authorization": authorization,
            "Content-Type": "application/json; charset=utf-8",
            "Host": "tmt.tencentcloudapi.com",
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21",
        }

        response = self.session.post(
            "https://" + self.base_link,
            json=data,
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code == 200:
            result = loads(response.text)
        else:
            raise ValueError("HTTP error: " + str(response.status_code))
        # 2. Result test: a missing TargetText degrades to an empty string
        # (deliberate best-effort; callers treat "" as "no translation").
        try:
            result = result["Response"]["TargetText"]
        except KeyError:
            result = ""
        return result
|
182 |
+
|
183 |
+
|
184 |
+
class DeepLXTranslator(BaseTranslator):
    """Translator backed by a DeepLX server (unofficial DeepL front end)."""

    def __init__(self, service, lang_out, lang_in, model):
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Both variables are optional: the auth key (if any) is embedded in
        # the URL path and the server URL falls back to the public instance.
        # (The previous try/except KeyError here was dead code: os.getenv
        # never raises, it returns None.)
        auth_key = os.getenv("DEEPLX_AUTH_KEY")
        server_url = os.getenv("DEEPLX_SERVER_URL") or "https://api.deeplx.org"

        self.session = requests.Session()
        server_url = server_url.rstrip("/")
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """POST *text* to the DeepLX endpoint and return the translation.

        Raises ValueError on a non-200 response, a response without a
        "data" field, or an empty translation result.
        """
        text = text[:5000]  # request size guard, mirrors the other backends
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )
        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        payload = loads(response.text)
        # 2. Result test
        try:
            result = payload["data"]
        except KeyError:
            raise ValueError("No valid key in DeepLX's response")
        # 3. Result length check (previously unreachable: an early return
        # inside the try block skipped it).
        if len(result) == 0:
            raise ValueError("Empty translation result")
        return result
|
240 |
+
|
241 |
+
|
242 |
+
class DeepLTranslator(BaseTranslator):
    """Translator using the official DeepL API via the ``deepl`` SDK."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "ZH"
        if lang_in == "auto":
            lang_in = "EN"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        # DEEPL_SERVER_URL is optional; None lets the SDK pick its default host.
        auth_key = os.getenv("DEEPL_AUTH_KEY")
        server_url = os.getenv("DEEPL_SERVER_URL")
        self.client = deepl.Translator(auth_key, server_url=server_url)

    def translate(self, text):
        result = self.client.translate_text(
            text, target_lang=self.lang_out, source_lang=self.lang_in
        )
        return result.text
|
257 |
+
|
258 |
+
|
259 |
+
class OllamaTranslator(BaseTranslator):
    """Translator using a locally served Ollama chat model."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # temperature 0: random sampling could corrupt the formula markers.
        self.options = {"temperature": 0}
        # The client honours the OLLAMA_HOST environment variable.
        self.client = ollama.Client()

    def translate(self, text):
        prompt = f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:"  # noqa: E501
        response = self.client.chat(
            model=self.model,
            options=self.options,
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional,authentic machine translation engine.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return response["message"]["content"].strip()
|
284 |
+
|
285 |
+
|
286 |
+
class OpenAITranslator(BaseTranslator):
    """Translator using an OpenAI-compatible chat-completions endpoint."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # temperature 0: random sampling could corrupt the formula markers.
        self.options = {"temperature": 0}
        # Endpoint and credentials come from OPENAI_BASE_URL / OPENAI_API_KEY.
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        prompt = f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:"  # noqa: E501
        response = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[
                {
                    "role": "system",
                    "content": "You are a professional,authentic machine translation engine.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content.strip()
|
312 |
+
|
313 |
+
|
314 |
+
class AzureTranslator(BaseTranslator):
    """Translator using the Azure AI Translator text-translation service."""

    def __init__(self, service, lang_out, lang_in, model):
        if lang_out == "auto":
            lang_out = "zh-Hans"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)

        try:
            api_key = os.environ["AZURE_APIKEY"]
            endpoint = os.environ["AZURE_ENDPOINT"]
            region = os.environ["AZURE_REGION"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.client = TextTranslationClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
            region=region,
        )

        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        # The SDK's HTTP logging is noisy at INFO level; keep warnings only.
        logging.getLogger(
            "azure.core.pipeline.policies.http_logging_policy"
        ).setLevel(logging.WARNING)

    def translate(self, text) -> str:
        response = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )
        return response[0].translations[0].text
|