Spaces:

leonsimon23
/

sciwin_translate

Running

App Files Files Community

sciwin_translate / pdf2zh /high_level.py

leonsimon23

Upload 9 files

8b23ca3 verified 7 months ago

raw

history blame contribute delete

3.62 kB

	"""Functions that can be used for the most common use-cases for pdf2zh."""

	from typing import BinaryIO
	import numpy as np
	import tqdm
	from pymupdf import Document
	from pdfminer.pdfpage import PDFPage
	from pdfminer.pdfinterp import PDFResourceManager
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfparser import PDFParser
	from pdf2zh.converter import TranslateConverter
	from pdf2zh.pdfinterp import PDFPageInterpreterEx
	from pymupdf import Font


	def _clip_region(xyxy, height: int, width: int):
	    """Turn one detected box into clipped indices for the page mask.

	    The box is unpacked as ``x0, y0, x1, y1`` from ``xyxy.squeeze()``, grown
	    by one pixel on every side, mirrored along the vertical axis
	    (``y -> height - y``) and clamped to ``[0, width - 1]`` horizontally and
	    ``[0, height - 1]`` vertically.

	    Returns:
	        A tuple ``(x0, y0, x1, y1)`` meant to be used as
	        ``mask[y0:y1, x0:x1]``.
	    """
	    x0, y0, x1, y1 = xyxy.squeeze()
	    return (
	        np.clip(int(x0 - 1), 0, width - 1),
	        np.clip(int(height - y1 - 1), 0, height - 1),
	        np.clip(int(x1 + 1), 0, width - 1),
	        np.clip(int(height - y0 + 1), 0, height - 1),
	    )


	def extract_text_to_fp(
	    inf: BinaryIO,
	    pages=None,
	    password: str = "",
	    debug: bool = False,
	    page_count: int = 0,
	    vfont: str = "",
	    vchar: str = "",
	    thread: int = 0,
	    doc_en: Document = None,
	    model=None,
	    lang_in: str = "",
	    lang_out: str = "",
	    service: str = "",
	    resfont: str = "",
	    noto: Font = None,
	    callback: object = None,
	    **kwarg,
	) -> dict:
	    """Run layout detection and the translating interpreter over a PDF.

	    For every selected page the function renders the page from ``doc_en``,
	    asks ``model`` for a layout prediction, rasterizes the predicted boxes
	    into a per-pixel label mask, stores that mask in the ``layout`` dict
	    shared with the ``TranslateConverter``, attaches a fresh empty content
	    stream to the page in ``doc_en`` and finally hands the pdfminer page to
	    ``PDFPageInterpreterEx.process_page``.

	    Args:
	        inf: Binary stream of the PDF, parsed with pdfminer's ``PDFParser``.
	        pages: Optional collection of zero-based page indices to process;
	            when falsy every page is processed.
	        password: Password forwarded to ``PDFDocument``.
	        debug: Accepted for interface compatibility; not read here.
	        page_count: Progress-bar total used when ``pages`` is falsy.
	        vfont, vchar, thread, lang_in, lang_out, service, resfont, noto:
	            Forwarded unchanged to ``TranslateConverter``.
	        doc_en: PyMuPDF document that is rendered for layout detection and
	            whose pages receive the new content xref. Required despite the
	            ``None`` default, since it is indexed for every processed page.
	        model: Object exposing ``predict(image, imgsz=...)``; element ``[0]``
	            of its result must provide ``boxes`` and ``names``. Required
	            despite the ``None`` default.
	        callback: Optional callable invoked with the tqdm progress object
	            before each processed page.
	        **kwarg: Extra keyword arguments are accepted and ignored.

	    Returns:
	        The ``obj_patch`` dict that was handed to ``PDFPageInterpreterEx``
	        (presumably populated by the interpreter while processing pages;
	        verify against ``pdf2zh.pdfinterp``).
	    """
	    rsrcmgr = PDFResourceManager()
	    # Shared with the converter: filled below with one mask per page number.
	    layout = {}
	    device = TranslateConverter(
	        rsrcmgr, vfont, vchar, thread, layout, lang_in, lang_out, service, resfont, noto
	    )

	    obj_patch = {}
	    interpreter = PDFPageInterpreterEx(rsrcmgr, device, obj_patch)
	    total_pages = len(pages) if pages else page_count

	    # Layout classes whose area is zeroed in the mask instead of receiving
	    # a region id.
	    vcls = ("abandon", "figure", "table", "isolate_formula", "formula_caption")

	    parser = PDFParser(inf)
	    doc = PDFDocument(parser, password=password)
	    with tqdm.tqdm(
	        enumerate(PDFPage.create_pages(doc)),
	        total=total_pages,
	    ) as progress:
	        for pageno, page in progress:
	            if pages and (pageno not in pages):
	                continue
	            if callback:
	                callback(progress)
	            page.pageno = pageno
	            pix = doc_en[page.pageno].get_pixmap()
	            # np.fromstring on raw bytes is deprecated/removed in NumPy;
	            # frombuffer + copy yields the same writable, owned uint8 array.
	            # The trailing slice reverses the channel order (presumably
	            # RGB -> BGR for the model; confirm against the model's input).
	            image = (
	                np.frombuffer(pix.samples, dtype=np.uint8)
	                .copy()
	                .reshape(pix.height, pix.width, 3)[:, :, ::-1]
	            )
	            # imgsz is the page height rounded down to a multiple of 32.
	            page_layout = model.predict(image, imgsz=int(pix.height / 32) * 32)[0]
	            # A kd-tree is out of the question here; rasterize the regions
	            # into a page-sized mask instead, trading space for time.
	            box = np.ones((pix.height, pix.width))
	            h, w = box.shape
	            # First pass: every non-excluded box gets the label ``index + 2``
	            # (1 is the background value of the mask).
	            for i, d in enumerate(page_layout.boxes):
	                if page_layout.names[int(d.cls)] not in vcls:
	                    x0, y0, x1, y1 = _clip_region(d.xyxy, h, w)
	                    box[y0:y1, x0:x1] = i + 2
	            # Second pass: excluded classes overwrite whatever they overlap
	            # with 0, so they must be painted after the regular regions.
	            for d in page_layout.boxes:
	                if page_layout.names[int(d.cls)] in vcls:
	                    x0, y0, x1, y1 = _clip_region(d.xyxy, h, w)
	                    box[y0:y1, x0:x1] = 0
	            layout[page.pageno] = box
	            # Create a new xref to hold the new instruction stream.
	            page.page_xref = doc_en.get_new_xref()  # hack: new xref inserted for the page
	            doc_en.update_object(page.page_xref, "<<>>")
	            doc_en.update_stream(page.page_xref, b"")
	            doc_en[page.pageno].set_contents(page.page_xref)
	            interpreter.process_page(page)

	    device.close()
	    return obj_patch