PDFTranslate / pdf2zh /pdf2zh.py
sanbo
update sth. at 2024-11-26 16:15:47
9b0f4a0
raw
history blame
9.05 kB
#!/usr/bin/env python3
"""A command line tool for extracting text and images from PDF and
output it to plain text, html, xml or tags.
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional
import pymupdf
import requests
from pdf2zh import __version__
from pdf2zh.pdfexceptions import PDFValueError
if TYPE_CHECKING:
from pdf2zh.layout import LAParams
from pdf2zh.utils import AnyIO
# (outfile suffix, output type) pairs; extract_text() consults these to pick
# an output type from the outfile name when output_type is left at "text".
OUTPUT_TYPES = (
    (".htm", "html"),
    (".html", "html"),
    (".xml", "xml"),
    (".tag", "tag"),
)
def setup_log() -> None:
    """Configure root logging and quieten the doclayout_yolo logger.

    Calls ``logging.basicConfig()`` and, when ``doclayout_yolo`` can be
    imported, raises its ``utils.LOGGER`` threshold to ``WARNING``.  An
    ``ImportError`` is ignored.
    """
    logging.basicConfig()
    try:
        import doclayout_yolo

        yolo_logger = doclayout_yolo.utils.LOGGER
        yolo_logger.setLevel(logging.WARNING)
    except ImportError:
        # doclayout_yolo is not importable: there is no logger to adjust.
        return None
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    Entries starting with ``http://`` or ``https://`` are treated as online
    files and are never reported as missing.  Input order is preserved.
    """
    remote_prefixes = ("http://", "https://")
    return [
        path
        for path in files
        if not path.startswith(remote_prefixes) and not os.path.exists(path)
    ]
def float_or_disabled(x: str) -> Optional[float]:
    """Parse *x* as a float, or return ``None`` for the word "disabled".

    The "disabled" comparison ignores case and surrounding whitespace.

    Raises:
        argparse.ArgumentTypeError: if *x* is neither "disabled" nor a
            string accepted by ``float()``.
    """
    if x.lower().strip() != "disabled":
        try:
            return float(x)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid float value: {x}")
    return None
def extract_text(
    files: Iterable[str] = [],
    outfile: str = "-",
    laparams: Optional[LAParams] = None,
    output_type: str = "text",
    codec: str = "utf-8",
    strip_control: bool = False,
    maxpages: int = 0,
    pages: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    debug: bool = False,
    disable_caching: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
) -> AnyIO:
    """Process every PDF in *files* and write ``-zh`` and ``-dual`` PDFs.

    For each entry the function:

    1. downloads it to ``./pdf2zh_files/tmp_download.pdf`` when it is an
       ``http://`` / ``https://`` URL;
    2. opens it with pymupdf, inserts the "china-ss" and "tiro" fonts on
       every page and registers them in every existing ``Font`` resource
       dictionary that lacks them;
    3. saves that document as ``<name>-en.pdf`` under *output* and hands the
       file to ``pdf2zh.high_level.extract_text_to_fp``, which returns a
       mapping of object id -> replacement content stream;
    4. applies those streams, then saves ``<name>-zh.pdf`` (patched pages)
       and ``<name>-dual.pdf`` (pages of the saved copy interleaved with the
       patched pages), and removes the intermediate ``<name>-en.pdf``.

    Args:
        files: Local paths and/or http(s) URLs of the PDFs to process.
        outfile: Only inspected for its suffix (see ``OUTPUT_TYPES``) to
            override *output_type* when that is still ``"text"``.
        output: Directory the result PDFs are written to; the empty string
            resolves to the current working directory.
        **kwargs: Extra keyword arguments, accepted so callers can pass a
            whole argparse namespace.

    Every parameter and every local variable that exists at the call site
    (``model``, ``doc_en``, ``page_count``, ...) is forwarded to
    ``extract_text_to_fp`` through ``**locals()``; which of them it actually
    consumes is not visible from this function.

    Returns:
        ``None`` (despite the ``AnyIO`` annotation).

    Raises:
        PDFValueError: if *files* is empty or a download fails.
    """
    import pdf2zh.high_level
    from pdf2zh.doclayout import DocLayoutModel

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    # Let the suffix of an explicit outfile choose the output type.
    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    # NOTE: local names in this function are part of the contract with
    # extract_text_to_fp because of the **locals() call below.  Do not rename
    # or remove them (including the otherwise unused ``outfp``) without
    # checking that callee.
    outfp: AnyIO = sys.stdout
    model = DocLayoutModel.load_available()

    for file in files:
        if file.startswith("http://") or file.startswith("https://"):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        # Create the download directory itself.  The previous
                        # os.mkdir(os.path.dirname(...)) resolved to
                        # os.mkdir(".") and always raised FileExistsError.
                        os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                ) from e
        filename = os.path.splitext(os.path.basename(file))[0]

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        font_list = ["china-ss", "tiro"]
        font_id = {}
        # insert_font's return value is used below as the font's object number.
        for page in doc_en:
            for font in font_list:
                font_id[font] = page.insert_font(font)
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # the resources may belong to an XObject
                try:  # reading/writing an xref may fail
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
                                )
                except Exception:
                    # Deliberately best-effort: skip objects that cannot be
                    # inspected or patched.
                    pass
        doc_en.save(Path(output) / f"{filename}-en.pdf")

        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
            obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())

        # Replace each reported content stream with its new operators.
        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
        doc_dual.insert_file(doc_zh)
        # The inserted pages sit after the first ``page_count`` pages; move
        # inserted page i to index 2*i + 1 so the two sets alternate.
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"{filename}-en.pdf")

    return
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser for the tool.

    The parser description is the module docstring.  Besides the positional
    ``files`` list it defines ``--version`` and ``--debug`` at the top level
    and a "Parser" argument group with ``--pages``, ``--password``,
    ``--vfont``, ``--vchar``, ``--lang-in``, ``--lang-out``, ``--service``,
    ``--output``, ``--thread``, ``--interactive`` and ``--share``.

    Returns:
        A freshly constructed ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )

    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        # Help text fixed: "math" was a typo for "match".
        help="The regex to match font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        # Help text fixed: "math" was a typo for "match".
        help="The regex to match character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="auto",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="auto",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )

    return parser
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse *args* and expand ``--pages`` into zero-based page indices.

    The ``--pages`` value is a comma-separated list whose items are either a
    single 1-based page number or an inclusive 1-based ``start-end`` range;
    for example ``"1-3,5"`` becomes ``[0, 1, 2, 4]``.  When ``--pages`` is
    absent or empty the attribute is left as parsed.

    Returns:
        The parsed namespace, with ``pages`` replaced by the expanded list
        when a page spec was given.
    """
    namespace = create_parser().parse_args(args=args)
    spec = namespace.pages
    if spec:
        zero_based = []
        for token in spec.split(","):
            if "-" not in token:
                zero_based.append(int(token) - 1)
                continue
            first, last = token.split("-")
            # 1-based inclusive range -> 0-based half-open range.
            zero_based += range(int(first) - 1, int(last))
        namespace.pages = zero_based
    return namespace
def main(args: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Parses *args*, checks that every local input file exists, and then
    either launches the GUI (``--interactive``) or sets up logging and runs
    ``extract_text`` with the parsed options.

    Returns:
        ``-1`` when at least one input file is missing (the missing paths
        are listed on stderr), otherwise ``0``.
    """
    options = parse_args(args)

    absent = check_files(options.files)
    if absent:
        print("The following files do not exist:", file=sys.stderr)
        for path in absent:
            print(f" {path}", file=sys.stderr)
        return -1

    if not options.interactive:
        setup_log()
        extract_text(**vars(options))
    else:
        # Imported only when the GUI is actually requested.
        from pdf2zh.gui import setup_gui

        setup_gui(options.share)
    return 0
if __name__ == "__main__":
    # Run the CLI exactly once and use main()'s return value as the process
    # exit status.  (A duplicated sys.exit(main()) line was removed.)
    sys.exit(main())