#!/usr/bin/env python3 """A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags. """ from __future__ import annotations import argparse import logging import os import sys from pathlib import Path from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional import pymupdf import requests from pdf2zh import __version__ from pdf2zh.pdfexceptions import PDFValueError if TYPE_CHECKING: from pdf2zh.layout import LAParams from pdf2zh.utils import AnyIO OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag")) def setup_log() -> None: logging.basicConfig() try: import doclayout_yolo doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING) except ImportError: pass def check_files(files: List[str]) -> List[str]: files = [ f for f in files if not f.startswith("http://") ] # exclude online files, http files = [ f for f in files if not f.startswith("https://") ] # exclude online files, https missing_files = [file for file in files if not os.path.exists(file)] return missing_files def float_or_disabled(x: str) -> Optional[float]: if x.lower().strip() == "disabled": return None try: return float(x) except ValueError: raise argparse.ArgumentTypeError(f"invalid float value: {x}") def extract_text( files: Iterable[str] = [], outfile: str = "-", laparams: Optional[LAParams] = None, output_type: str = "text", codec: str = "utf-8", strip_control: bool = False, maxpages: int = 0, pages: Optional[Container[int]] = None, password: str = "", scale: float = 1.0, rotation: int = 0, layoutmode: str = "normal", output_dir: Optional[str] = None, debug: bool = False, disable_caching: bool = False, vfont: str = "", vchar: str = "", thread: int = 0, lang_in: str = "", lang_out: str = "", service: str = "", callback: object = None, output: str = "", **kwargs: Any, ) -> AnyIO: import pdf2zh.high_level from pdf2zh.doclayout import DocLayoutModel if not files: raise PDFValueError("Must provide files to work upon!") if output_type == "text" and outfile != "-": for override, alttype in OUTPUT_TYPES: if outfile.endswith(override): output_type = alttype outfp: AnyIO = sys.stdout model = DocLayoutModel.load_available() for file in files: if file.startswith("http://") or file.startswith("https://"): print("Online files detected, downloading...") try: r = requests.get(file, allow_redirects=True) if r.status_code == 200: if not os.path.exists("./pdf2zh_files"): print("Making a temporary dir for downloading PDF files...") os.mkdir(os.path.dirname("./pdf2zh_files")) with open("./pdf2zh_files/tmp_download.pdf", "wb") as f: print(f"Writing the file: {file}...") f.write(r.content) file = "./pdf2zh_files/tmp_download.pdf" else: r.raise_for_status() except Exception as e: raise PDFValueError( f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}" ) filename = os.path.splitext(os.path.basename(file))[0] doc_en = pymupdf.open(file) page_count = doc_en.page_count font_list = ["china-ss", "tiro"] font_id = {} for page in doc_en: for font in font_list: font_id[font] = page.insert_font(font) xreflen = doc_en.xref_length() for xref in range(1, xreflen): for label in ["Resources/", ""]: # 可能是基于 xobj 的 res try: # xref 读写可能出错 font_res = doc_en.xref_get_key(xref, f"{label}Font") if font_res[0] == "dict": for font in font_list: font_exist = doc_en.xref_get_key( xref, f"{label}Font/{font}" ) if font_exist[0] == "null": doc_en.xref_set_key( xref, f"{label}Font/{font}", f"{font_id[font]} 0 R" ) except Exception: pass doc_en.save(Path(output) / f"{filename}-en.pdf") with open(Path(output) / f"{filename}-en.pdf", "rb") as fp: obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals()) for obj_id, ops_new in obj_patch.items(): # ops_old=doc_en.xref_stream(obj_id) # print(obj_id) # print(ops_old) # print(ops_new.encode()) doc_en.update_stream(obj_id, ops_new.encode()) doc_zh = doc_en doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf") doc_dual.insert_file(doc_zh) for id in range(page_count): doc_dual.move_page(page_count + id, id * 2 + 1) doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1) doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1) doc_zh.close() doc_dual.close() os.remove(Path(output) / f"{filename}-en.pdf") return def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", type=str, default=None, nargs="*", help="One or more paths to PDF files.", ) parser.add_argument( "--version", "-v", action="version", version=f"pdf2zh v{__version__}", ) parser.add_argument( "--debug", "-d", default=False, action="store_true", help="Use debug logging level.", ) parse_params = parser.add_argument_group( "Parser", description="Used during PDF parsing", ) parse_params.add_argument( "--pages", "-p", type=str, help="The list of page numbers to parse.", ) parse_params.add_argument( "--password", "-P", type=str, default="", help="The password to use for decrypting PDF file.", ) parse_params.add_argument( "--vfont", "-f", type=str, default="", help="The regex to math font name of formula.", ) parse_params.add_argument( "--vchar", "-c", type=str, default="", help="The regex to math character of formula.", ) parse_params.add_argument( "--lang-in", "-li", type=str, default="auto", help="The code of source language.", ) parse_params.add_argument( "--lang-out", "-lo", type=str, default="auto", help="The code of target language.", ) parse_params.add_argument( "--service", "-s", type=str, default="google", help="The service to use for translation.", ) parse_params.add_argument( "--output", "-o", type=str, default="", help="Output directory for files.", ) parse_params.add_argument( "--thread", "-t", type=int, default=4, help="The number of threads to execute translation.", ) parse_params.add_argument( "--interactive", "-i", action="store_true", help="Interact with GUI.", ) parse_params.add_argument( "--share", action="store_true", help="Enable Gradio Share", ) return parser def parse_args(args: Optional[List[str]]) -> argparse.Namespace: parsed_args = create_parser().parse_args(args=args) if parsed_args.pages: pages = [] for p in parsed_args.pages.split(","): if "-" in p: start, end = p.split("-") pages.extend(range(int(start) - 1, int(end))) else: pages.append(int(p) - 1) parsed_args.pages = pages return parsed_args def main(args: Optional[List[str]] = None) -> int: parsed_args = parse_args(args) missing_files = check_files(parsed_args.files) if missing_files: print("The following files do not exist:", file=sys.stderr) for file in missing_files: print(f" {file}", file=sys.stderr) return -1 if parsed_args.interactive: from pdf2zh.gui import setup_gui setup_gui(parsed_args.share) return 0 setup_log() extract_text(**vars(parsed_args)) return 0 if __name__ == "__main__": sys.exit(main()) sys.exit(main())