Spaces:

leonsimon23
/

sciwin_translate

Running

App Files Files Community

sciwin_translate / pdf2zh /pdf2zh.py

leonsimon23

Upload 9 files

8b23ca3 verified 7 months ago

raw

history blame

9.59 kB

	#!/usr/bin/env python3
	"""A command line tool for extracting text and images from PDF and
	output it to plain text, html, xml or tags.
	"""

	from __future__ import annotations

	import argparse
	import os
	import sys
	import logging
	from pathlib import Path
	from typing import Any, Container, Iterable, List, Optional
	import urllib.request
	from pdfminer.pdfexceptions import PDFValueError

	import pymupdf
	import requests
	import tempfile

	from pdf2zh import __version__, log
	from pdf2zh.high_level import extract_text_to_fp
	from pdf2zh.doclayout import DocLayoutModel

	# Configure the root logger with the standard library's default handler
	# (only takes effect if no handler has been installed yet).
	logging.basicConfig()

	# Layout model, created once at import time and handed to
	# extract_text_to_fp(model=...) for every file processed by extract_text.
	# NOTE(review): load_available() presumably selects/loads whichever model
	# backend is installed; this may be slow or touch the network — confirm
	# in pdf2zh.doclayout.
	model = DocLayoutModel.load_available()

	# Target-language code -> name of the CJK font that extract_text embeds
	# (passed to page.insert_font with no font file).
	resfont_map = {
	    "zh-CN": "china-ss",
	    "zh-TW": "china-ts",
	    "ja": "japan-s",
	    "ko": "korea-s",
	}

	# Target-language codes for which extract_text downloads and embeds the
	# GoNotoKurrent font. Japanese ("ja"), Korean ("ko") and Chinese
	# ("zh-CN", "zh-TW") are intentionally absent: they are covered by
	# resfont_map above.
	noto_list = [
	    "am",  # Amharic
	    "ar",  # Arabic
	    "bn",  # Bengali
	    "bg",  # Bulgarian
	    "chr",  # Cherokee
	    "el",  # Greek
	    "gu",  # Gujarati
	    "iw",  # Hebrew
	    "hi",  # Hindi
	    "kn",  # Kannada
	    "ml",  # Malayalam
	    "mr",  # Marathi
	    "ru",  # Russian
	    "sr",  # Serbian
	    "ta",  # Tamil
	    "te",  # Telugu
	    "th",  # Thai
	    "ur",  # Urdu
	    "uk",  # Ukrainian
	]


	def check_files(files: List[str]) -> List[str]:
	    """Return the local paths in *files* that do not exist on disk.

	    Entries starting with ``http://`` or ``https://`` are treated as online
	    files and are never reported as missing.

	    Args:
	        files: File paths and/or URLs given by the user.

	    Returns:
	        The non-URL entries for which ``os.path.exists`` is false, in their
	        original order.
	    """
	    # str.startswith accepts a tuple, so both schemes are filtered in one pass.
	    return [
	        file
	        for file in files
	        if not file.startswith(("http://", "https://"))
	        and not os.path.exists(file)
	    ]


	def _download_pdf(url: str) -> str:
	    """Download *url* to ``./pdf2zh_files/tmp_download.pdf`` and return that path.

	    NOTE: every download reuses the same target path, so the outputs produced
	    for several URLs in one run all share the stem ``tmp_download``.

	    Raises:
	        PDFValueError: if the request fails, the server does not answer with
	            HTTP 200, or the file cannot be written.
	    """
	    print("Online files detected, downloading...")
	    try:
	        r = requests.get(url, allow_redirects=True)
	        if r.status_code != 200:
	            # Raises for 4xx/5xx; any other non-200 status is rejected below
	            # instead of silently leaving the URL un-downloaded.
	            r.raise_for_status()
	            raise PDFValueError(f"Unexpected HTTP status code: {r.status_code}")
	        if not os.path.exists("./pdf2zh_files"):
	            print("Making a temporary dir for downloading PDF files...")
	            os.makedirs("./pdf2zh_files", exist_ok=True)
	        with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
	            print(f"Writing the file: {url}...")
	            f.write(r.content)
	    except Exception as e:
	        raise PDFValueError(
	            f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
	        ) from e
	    return "./pdf2zh_files/tmp_download.pdf"


	def extract_text(
	    files: Iterable[str] = [],
	    pages: Optional[Container[int]] = None,
	    password: str = "",
	    debug: bool = False,
	    vfont: str = "",
	    vchar: str = "",
	    thread: int = 0,
	    lang_in: str = "",
	    lang_out: str = "",
	    service: str = "",
	    callback: object = None,
	    output: str = "",
	    **kwargs: Any,
	):
	    """Translate each PDF in *files* and write the results into *output*.

	    For every input the function embeds the fonts required by *lang_out*,
	    calls ``extract_text_to_fp`` to obtain replacement content streams, and
	    saves two documents in the *output* directory:

	    * ``<name>-zh.pdf``   - the patched (translated) document;
	    * ``<name>-dual.pdf`` - original and patched pages interleaved
	      (original page 1, patched page 1, original page 2, ...).

	    A temporary ``<name>-en.pdf`` is written to *output* and removed again.

	    Args:
	        files: Local paths or ``http://`` / ``https://`` URLs of PDF files.
	            URLs are downloaded first.
	        pages, password, vfont, vchar, thread, lang_in, service, callback:
	            Not interpreted here; forwarded to ``extract_text_to_fp``.
	        debug: When true, sets the package logger to ``DEBUG``.
	        lang_out: Target language code; also selects the embedded font
	            (``resfont_map`` entry, Noto for ``noto_list`` entries,
	            otherwise ``china-ss``).
	        output: Directory for the generated files ("" means the current
	            working directory).
	        **kwargs: Extra options; forwarded to ``extract_text_to_fp``.

	    Raises:
	        PDFValueError: if *files* is empty or a download fails.
	    """
	    if debug:
	        log.setLevel(logging.DEBUG)

	    if not files:
	        raise PDFValueError("Must provide files to work upon!")

	    for file in files:
	        # isinstance, not `is str`: the identity test was always False, so
	        # URLs were never downloaded.
	        if isinstance(file, str) and file.startswith(("http://", "https://")):
	            file = _download_pdf(file)
	        filename = os.path.splitext(os.path.basename(file))[0]

	        # "tiro" is always embedded; one more font is chosen from lang_out.
	        font_list = [("tiro", None)]
	        noto = None
	        if lang_out in resfont_map:  # CJK
	            resfont = resfont_map[lang_out]
	            font_list.append((resfont, None))
	        elif lang_out in noto_list:  # noto
	            resfont = "noto"
	            # The font file is cached in the system temp dir across runs.
	            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
	            if not os.path.exists(ttf_path):
	                print("Downloading Noto font...")
	                urllib.request.urlretrieve(
	                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
	                    ttf_path,
	                )
	            font_list.append(("noto", ttf_path))
	            noto = pymupdf.Font("noto", ttf_path)
	        else:  # auto
	            resfont = "china-ss"
	            font_list.append(("china-ss", None))

	        doc_en = pymupdf.open(file)
	        page_count = doc_en.page_count
	        font_id = {}
	        for page in doc_en:
	            for font in font_list:
	                # Keeps the xref of the last page's insertion for each font name.
	                font_id[font[0]] = page.insert_font(font[0], font[1])
	        xreflen = doc_en.xref_length()
	        for xref in range(1, xreflen):
	            # The font dictionary may sit under Resources/ or directly on the
	            # object (e.g. resources that belong to an XObject).
	            for label in ["Resources/", ""]:
	                try:
	                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
	                    if font_res[0] == "dict":
	                        for font in font_list:
	                            font_exist = doc_en.xref_get_key(
	                                xref, f"{label}Font/{font[0]}"
	                            )
	                            if font_exist[0] == "null":
	                                doc_en.xref_set_key(
	                                    xref,
	                                    f"{label}Font/{font[0]}",
	                                    f"{font_id[font[0]]} 0 R",
	                                )
	                except Exception:
	                    # Deliberately best-effort: reading/writing an xref may
	                    # fail, and such objects are simply skipped.
	                    pass
	        doc_en.save(Path(output) / f"{filename}-en.pdf")

	        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
	            # CAUTION: every local variable of this function is forwarded as a
	            # keyword argument, so local names here are part of the call
	            # contract with extract_text_to_fp. Do not rename or add locals
	            # without checking that function's signature.
	            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())

	        # obj_patch maps an object id to its replacement stream text.
	        for obj_id, ops_new in obj_patch.items():
	            doc_en.update_stream(obj_id, ops_new.encode())

	        # doc_zh is the same (now patched) document object as doc_en.
	        doc_zh = doc_en
	        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
	        doc_dual.insert_file(doc_zh)
	        # Move each appended patched page directly behind its original page.
	        for id in range(page_count):
	            doc_dual.move_page(page_count + id, id * 2 + 1)
	        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
	        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
	        doc_zh.close()
	        doc_dual.close()
	        os.remove(Path(output) / f"{filename}-en.pdf")

	    return


	def create_parser() -> argparse.ArgumentParser:
	    """Build the command-line parser for the pdf2zh tool.

	    The module docstring is used as the program description. All options
	    except ``files``, ``--version`` and ``--debug`` live in the "Parser"
	    argument group.

	    Returns:
	        A configured ``argparse.ArgumentParser``.
	    """
	    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
	    parser.add_argument(
	        "files",
	        type=str,
	        default=None,
	        nargs="*",
	        help="One or more paths to PDF files.",
	    )
	    parser.add_argument(
	        "--version",
	        "-v",
	        action="version",
	        version=f"pdf2zh v{__version__}",
	    )
	    parser.add_argument(
	        "--debug",
	        "-d",
	        default=False,
	        action="store_true",
	        help="Use debug logging level.",
	    )
	    parse_params = parser.add_argument_group(
	        "Parser",
	        description="Used during PDF parsing",
	    )
	    # Kept as a raw string here; parse_args converts it into page indices.
	    parse_params.add_argument(
	        "--pages",
	        "-p",
	        type=str,
	        help="The list of page numbers to parse.",
	    )
	    parse_params.add_argument(
	        "--password",
	        "-P",
	        type=str,
	        default="",
	        help="The password to use for decrypting PDF file.",
	    )
	    parse_params.add_argument(
	        "--vfont",
	        "-f",
	        type=str,
	        default="",
	        help="The regex to match font name of formula.",
	    )
	    parse_params.add_argument(
	        "--vchar",
	        "-c",
	        type=str,
	        default="",
	        help="The regex to match character of formula.",
	    )
	    parse_params.add_argument(
	        "--lang-in",
	        "-li",
	        type=str,
	        default="auto",
	        help="The code of source language.",
	    )
	    parse_params.add_argument(
	        "--lang-out",
	        "-lo",
	        type=str,
	        default="auto",
	        help="The code of target language.",
	    )
	    parse_params.add_argument(
	        "--service",
	        "-s",
	        type=str,
	        default="google",
	        help="The service to use for translation.",
	    )
	    parse_params.add_argument(
	        "--output",
	        "-o",
	        type=str,
	        default="",
	        help="Output directory for files.",
	    )
	    parse_params.add_argument(
	        "--thread",
	        "-t",
	        type=int,
	        default=4,
	        help="The number of threads to execute translation.",
	    )
	    parse_params.add_argument(
	        "--interactive",
	        "-i",
	        action="store_true",
	        help="Interact with GUI.",
	    )
	    parse_params.add_argument(
	        "--share",
	        action="store_true",
	        help="Enable Gradio Share",
	    )

	    return parser


	def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
	    """Parse command-line arguments and normalise ``--pages``.

	    ``--pages`` is given as a comma-separated list of 1-based page numbers
	    and inclusive ranges (for example ``1,3,5-7``). It is replaced on the
	    returned namespace by a list of 0-based page indices
	    (``[0, 2, 4, 5, 6]`` for the example).

	    Args:
	        args: Argument list to parse; ``None`` lets argparse read
	            ``sys.argv``.

	    Returns:
	        The parsed namespace, with ``pages`` converted when it was supplied.

	    Raises:
	        SystemExit: via ``parser.error`` when ``--pages`` is malformed, the
	            same way argparse reports every other usage error.
	    """
	    parser = create_parser()
	    parsed_args = parser.parse_args(args=args)

	    if parsed_args.pages:
	        pages = []
	        try:
	            for p in parsed_args.pages.split(","):
	                if "-" in p:
	                    start, end = p.split("-")
	                    # 1-based inclusive range -> 0-based indices.
	                    pages.extend(range(int(start) - 1, int(end)))
	                else:
	                    pages.append(int(p) - 1)
	        except ValueError:
	            # Covers non-numeric tokens, empty tokens ("1,") and ranges with
	            # the wrong number of parts ("1-2-3", "-3").
	            parser.error(
	                f"invalid --pages value {parsed_args.pages!r}; "
	                "expected comma-separated page numbers or ranges, e.g. 1,3,5-7"
	            )
	        parsed_args.pages = pages

	    return parsed_args


	def main(args: Optional[List[str]] = None) -> int:
	    """Command-line entry point.

	    Parses *args*, refuses to continue when any local input file is missing,
	    and then either starts the GUI (``--interactive``) or translates the
	    given files.

	    Returns:
	        ``0`` on success, ``-1`` when at least one input file does not exist.
	    """
	    options = parse_args(args)

	    absent = check_files(options.files)
	    if absent:
	        print("The following files do not exist:", file=sys.stderr)
	        for path in absent:
	            print(f" {path}", file=sys.stderr)
	        return -1

	    if not options.interactive:
	        # Every parsed option is passed through as a keyword argument.
	        extract_text(**vars(options))
	        return 0

	    # Imported only when needed so the GUI module is not loaded for plain
	    # command-line runs.
	    from pdf2zh.gui import setup_gui

	    setup_gui(options.share)
	    return 0


	# Script entry: the integer returned by main() becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())