Spaces:
Sleeping
Sleeping
"""Functions that can be used for the most common use-cases for pdf2zh.six""" | |
from typing import BinaryIO | |
import numpy as np | |
import tqdm | |
from pymupdf import Document | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.pdfdocument import PDFDocument | |
from pdfminer.pdfparser import PDFParser | |
from pdf2zh.converter import TranslateConverter | |
from pdf2zh.pdfinterp import PDFPageInterpreterEx | |
from pymupdf import Font | |
def _rasterize_layout(page_layout, height: int, width: int, skip_classes) -> np.ndarray:
    """Paint detected layout boxes into a ``(height, width)`` label array.

    The array starts filled with 1. Every box whose class name is NOT in
    *skip_classes* is painted with ``index + 2`` (its position in
    ``page_layout.boxes``); afterwards every box whose class name IS in
    *skip_classes* is painted with 0, so those regions override any
    overlapping box from the first pass.
    """
    mask = np.ones((height, width))

    def clipped_bounds(det):
        # Grow the box by one pixel on each side and flip the vertical axis
        # (presumably to match a bottom-left page origin -- confirm against
        # the converter that consumes this array), then clamp to the array.
        x0, y0, x1, y1 = det.xyxy.squeeze()
        return (
            np.clip(int(x0 - 1), 0, width - 1),
            np.clip(int(height - y1 - 1), 0, height - 1),
            np.clip(int(x1 + 1), 0, width - 1),
            np.clip(int(height - y0 + 1), 0, height - 1),
        )

    # First pass: label every box that is not in the skipped classes.
    for index, det in enumerate(page_layout.boxes):
        if page_layout.names[int(det.cls)] not in skip_classes:
            x0, y0, x1, y1 = clipped_bounds(det)
            mask[y0:y1, x0:x1] = index + 2
    # Second pass: skipped classes win over anything painted above.
    for det in page_layout.boxes:
        if page_layout.names[int(det.cls)] in skip_classes:
            x0, y0, x1, y1 = clipped_bounds(det)
            mask[y0:y1, x0:x1] = 0
    return mask


def extract_text_to_fp(
    inf: BinaryIO,
    pages=None,
    password: str = "",
    debug: bool = False,
    page_count: int = 0,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    doc_en: Document = None,
    model=None,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    resfont: str = "",
    noto: Font = None,
    callback: object = None,
    **kwarg,
) -> dict:
    """Run layout detection and the translating interpreter over a PDF.

    For every selected page the function renders the page from *doc_en* to a
    pixmap, feeds it to ``model.predict`` to obtain layout boxes, stores a
    per-pixel label array in the ``layout`` dict shared with the
    ``TranslateConverter``, attaches a fresh empty content stream to the page
    in *doc_en*, and finally lets ``PDFPageInterpreterEx`` process the page.

    Args:
        inf: Binary stream of the PDF handed to ``PDFParser``.
        pages: Optional collection of zero-based page numbers to process;
            when falsy, all pages are processed.
        password: Password passed to ``PDFDocument``.
        debug: Accepted for interface compatibility; not used here.
        page_count: Progress-bar total used when *pages* is falsy.
        vfont, vchar, thread, lang_in, lang_out, service, resfont, noto:
            Forwarded unchanged to ``TranslateConverter``.
        doc_en: PyMuPDF document that is rendered and modified in place.
            Required in practice even though the default is ``None``.
        model: Layout model exposing ``predict(image, imgsz=...)``.
            Required in practice even though the default is ``None``.
        callback: Optional callable invoked with the tqdm progress object
            before each processed page.
        **kwarg: Ignored extra keyword arguments.

    Returns:
        The ``obj_patch`` dict that was handed to ``PDFPageInterpreterEx``.
    """
    rsrcmgr = PDFResourceManager()
    layout = {}
    device = TranslateConverter(
        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
    )
    obj_patch = {}
    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)

    total_pages = len(pages) if pages else page_count
    # Class names whose regions are zeroed out in the layout array.
    vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]

    parser = PDFParser(inf)
    doc = PDFDocument(parser, password=password)
    with tqdm.tqdm(
        enumerate(PDFPage.create_pages(doc)),
        total=total_pages,
    ) as progress:
        for pageno, page in progress:
            if pages and (pageno not in pages):
                continue
            if callback:
                callback(progress)
            page.pageno = pageno

            pix = doc_en[page.pageno].get_pixmap()
            # np.fromstring's binary mode is deprecated; frombuffer is the
            # replacement. The copy keeps the array writable, as before.
            samples = np.frombuffer(pix.samples, np.uint8).copy()
            # Reverse the channel axis (3 channels assumed, as in the
            # original reshape) before handing the image to the model.
            image = samples.reshape(pix.height, pix.width, 3)[:, :, ::-1]
            page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]

            # A k-d tree is not practical here; rasterize the layout into a
            # page-sized array instead, trading memory for lookup time.
            layout[page.pageno] = _rasterize_layout(
                page_layout, pix.height, pix.width, vcls
            )

            # Create a new xref to hold the new instruction stream.
            page.page_xref = doc_en.get_new_xref()  # hack: new xref attached to the page
            doc_en.update_object(page.page_xref, "<<>>")
            doc_en.update_stream(page.page_xref, b"")
            doc_en[page.pageno].set_contents(page.page_xref)
            interpreter.process_page(page)

    device.close()
    return obj_patch