# sciwin_translate/pdf2zh/high_level.py
# Uploaded by leonsimon23 ("Upload 9 files", commit 8b23ca3, verified)
"""Functions that can be used for the most common use-cases for pdf2zh."""
from typing import BinaryIO
import numpy as np
import tqdm
from pymupdf import Document
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdf2zh.converter import TranslateConverter
from pdf2zh.pdfinterp import PDFPageInterpreterEx
from pymupdf import Font
def _build_layout_mask(page_layout, height: int, width: int):
    """Render the detected layout boxes of one page into a label mask.

    The mask is a ``(height, width)`` float array initialised to 1. Boxes are
    painted in two passes so that excluded classes always win over regular
    ones, whatever order the detector reported them in:

    1. every box whose class name is NOT in the excluded list is filled with
       ``index + 2`` (``index`` being its position in ``page_layout.boxes``);
    2. every box whose class name IS in the excluded list is filled with 0.

    Box rows are mirrored vertically (``height - y``) before painting.
    NOTE(review): presumably this converts from image coordinates (origin at
    the top-left) to PDF coordinates (origin at the bottom-left) -- confirm
    against the consumer of the mask.

    :param page_layout: detection result exposing ``boxes`` (each with
        ``xyxy`` and ``cls``) and a ``names`` mapping from class id to name.
    :param height: mask height in pixels.
    :param width: mask width in pixels.
    :return: the painted mask.
    """
    mask = np.ones((height, width))
    h, w = mask.shape
    excluded = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]

    def region(det):
        # Grow the box by one pixel on each side, flip it vertically and
        # clamp it to the mask.
        # NOTE(review): the upper bounds are clamped to w - 1 / h - 1 and then
        # used as exclusive slice ends, so the last row/column is never
        # painted; kept as-is to preserve the existing output.
        x0, y0, x1, y1 = det.xyxy.squeeze()
        return (
            np.clip(int(x0 - 1), 0, w - 1),
            np.clip(int(h - y1 - 1), 0, h - 1),
            np.clip(int(x1 + 1), 0, w - 1),
            np.clip(int(h - y0 + 1), 0, h - 1),
        )

    for index, det in enumerate(page_layout.boxes):
        if page_layout.names[int(det.cls)] not in excluded:
            x0, y0, x1, y1 = region(det)
            mask[y0:y1, x0:x1] = index + 2
    for det in page_layout.boxes:
        if page_layout.names[int(det.cls)] in excluded:
            x0, y0, x1, y1 = region(det)
            mask[y0:y1, x0:x1] = 0
    return mask


def extract_text_to_fp(
    inf: BinaryIO,
    pages=None,
    password: str = "",
    debug: bool = False,
    page_count: int = 0,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_en: Document = None,
    model=None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    resfont: str = "",
    noto: Font = None,
    callback: object = None,
    **kwarg,
) -> dict:
    """Run layout detection and the translating interpreter over a PDF.

    For every selected page the function renders the page from ``doc_en`` to
    a pixmap, runs ``model.predict`` on it, turns the detected boxes into a
    per-pixel layout mask stored in the ``layout`` dict shared with the
    ``TranslateConverter``, attaches a fresh empty content stream to the page
    in ``doc_en`` and finally lets ``PDFPageInterpreterEx`` process the page.

    :param inf: binary stream of the PDF, parsed with pdfminer.
    :param pages: optional container of zero-based page numbers to process;
        when falsy, every page is processed.
    :param password: password handed to ``PDFDocument``.
    :param debug: accepted but not used by this function.
    :param page_count: progress-bar total used when ``pages`` is falsy.
    :param vfont: forwarded to ``TranslateConverter``.
    :param vchar: forwarded to ``TranslateConverter``.
    :param thread: forwarded to ``TranslateConverter``.
    :param doc_en: pymupdf document holding the same pages; it is rendered
        for layout detection and modified in place (new xref per page).
    :param model: layout model; ``model.predict(image, imgsz=...)`` must
        return a sequence whose first item exposes ``boxes`` and ``names``.
    :param lang_in: forwarded to ``TranslateConverter``.
    :param lang_out: forwarded to ``TranslateConverter``.
    :param service: forwarded to ``TranslateConverter``.
    :param resfont: forwarded to ``TranslateConverter``.
    :param noto: forwarded to ``TranslateConverter``.
    :param callback: optional callable invoked with the tqdm progress bar
        once for each processed page.
    :param kwarg: extra keyword arguments, accepted and ignored.
    :return: the ``obj_patch`` dict that was handed to
        ``PDFPageInterpreterEx`` (presumably filled in by the interpreter
        while processing pages -- confirm there).
    """
    rsrcmgr = PDFResourceManager()
    layout = {}
    device = TranslateConverter(
        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
    )

    obj_patch = {}
    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)

    total_pages = len(pages) if pages else page_count

    parser = PDFParser(inf)
    doc = PDFDocument(parser, password=password)
    with tqdm.tqdm(total=total_pages) as progress:
        for pageno, page in enumerate(PDFPage.create_pages(doc)):
            if pages and (pageno not in pages):
                continue
            # Advance the bar only for pages that are actually processed so
            # that it never runs past ``total`` when a page subset is given.
            progress.update()
            if callback:
                callback(progress)
            page.pageno = pageno

            pix = doc_en[page.pageno].get_pixmap()
            # np.frombuffer replaces the deprecated binary mode of
            # np.fromstring; the copy keeps the array writable and detached
            # from the pixmap buffer, exactly as fromstring did.
            # The reshape assumes 3 samples per pixel; the channel axis is
            # then reversed (presumably RGB -> BGR for the model -- confirm).
            image = (
                np.frombuffer(pix.samples, np.uint8)
                .copy()
                .reshape(pix.height, pix.width, 3)[:, :, ::-1]
            )
            # Inference size: page height rounded down to a multiple of 32.
            page_layout = model.predict(image, imgsz=(pix.height // 32) * 32)[0]

            # A kd-tree is out of the question here; simply render the layout
            # into an image instead, trading space for time.
            layout[page.pageno] = _build_layout_mask(
                page_layout, pix.height, pix.width
            )

            # Create a new xref to hold the new instruction stream.
            page.page_xref = doc_en.get_new_xref()  # hack: new xref inserted for the page
            doc_en.update_object(page.page_xref, "<<>>")
            doc_en.update_stream(page.page_xref, b"")
            doc_en[page.pageno].set_contents(page.page_xref)
            interpreter.process_page(page)

    device.close()
    return obj_patch