#!/usr/bin/env python3 """A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags. """ from __future__ import annotations import argparse import os import sys import logging from pathlib import Path from typing import Any, Container, Iterable, List, Optional import urllib.request from pdfminer.pdfexceptions import PDFValueError import pymupdf import requests import tempfile from pdf2zh import __version__, log from pdf2zh.high_level import extract_text_to_fp from pdf2zh.doclayout import DocLayoutModel logging.basicConfig() model = DocLayoutModel.load_available() resfont_map = { "zh-CN": "china-ss", "zh-TW": "china-ts", "ja": "japan-s", "ko": "korea-s", } noto_list = [ "am", # Amharic "ar", # Arabic "bn", # Bengali "bg", # Bulgarian "chr", # Cherokee "el", # Greek "gu", # Gujarati "iw", # Hebrew "hi", # Hindi # "ja", # Japanese "kn", # Kannada # "ko", # Korean "ml", # Malayalam "mr", # Marathi "ru", # Russian "sr", # Serbian # "zh-CN",# Chinese (PRC) "ta", # Tamil "te", # Telugu "th", # Thai # "zh-TW",# Chinese (Taiwan) "ur", # Urdu "uk", # Ukrainian ] def check_files(files: List[str]) -> List[str]: files = [ f for f in files if not f.startswith("http://") ] # exclude online files, http files = [ f for f in files if not f.startswith("https://") ] # exclude online files, https missing_files = [file for file in files if not os.path.exists(file)] return missing_files def extract_text( files: Iterable[str] = [], pages: Optional[Container[int]] = None, password: str = "", debug: bool = False, vfont: str = "", vchar: str = "", thread: int = 0, lang_in: str = "", lang_out: str = "", service: str = "", callback: object = None, output: str = "", **kwargs: Any, ): if debug: log.setLevel(logging.DEBUG) if not files: raise PDFValueError("Must provide files to work upon!") for file in files: if file is str and (file.startswith("http://") or file.startswith("https://")): print("Online files detected, downloading...") try: r = requests.get(file, allow_redirects=True) if r.status_code == 200: if not os.path.exists("./pdf2zh_files"): print("Making a temporary dir for downloading PDF files...") os.mkdir(os.path.dirname("./pdf2zh_files")) with open("./pdf2zh_files/tmp_download.pdf", "wb") as f: print(f"Writing the file: {file}...") f.write(r.content) file = "./pdf2zh_files/tmp_download.pdf" else: r.raise_for_status() except Exception as e: raise PDFValueError( f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}" ) filename = os.path.splitext(os.path.basename(file))[0] font_list = [("tiro", None)] noto = None if lang_out in resfont_map: # CJK resfont = resfont_map[lang_out] font_list.append((resfont, None)) elif lang_out in noto_list: # noto resfont = "noto" ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf") if not os.path.exists(ttf_path): print("Downloading Noto font...") urllib.request.urlretrieve( "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf", ttf_path, ) font_list.append(("noto", ttf_path)) noto = pymupdf.Font("noto", ttf_path) else: # auto resfont = "china-ss" font_list.append(("china-ss", None)) doc_en = pymupdf.open(file) page_count = doc_en.page_count # font_list = [("china-ss", None), ("tiro", None)] font_id = {} for page in doc_en: for font in font_list: font_id[font[0]] = page.insert_font(font[0], font[1]) xreflen = doc_en.xref_length() for xref in range(1, xreflen): for label in ["Resources/", ""]: # 可能是基于 xobj 的 res try: # xref 读写可能出错 font_res = doc_en.xref_get_key(xref, f"{label}Font") if font_res[0] == "dict": for font in font_list: font_exist = doc_en.xref_get_key( xref, f"{label}Font/{font[0]}" ) if font_exist[0] == "null": doc_en.xref_set_key( xref, f"{label}Font/{font[0]}", f"{font_id[font[0]]} 0 R", ) except Exception: pass doc_en.save(Path(output) / f"{filename}-en.pdf") with open(Path(output) / f"{filename}-en.pdf", "rb") as fp: obj_patch: dict = extract_text_to_fp(fp, model=model, **locals()) for obj_id, ops_new in obj_patch.items(): # ops_old=doc_en.xref_stream(obj_id) # print(obj_id) # print(ops_old) # print(ops_new.encode()) doc_en.update_stream(obj_id, ops_new.encode()) doc_zh = doc_en doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf") doc_dual.insert_file(doc_zh) for id in range(page_count): doc_dual.move_page(page_count + id, id * 2 + 1) doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1) doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1) doc_zh.close() doc_dual.close() os.remove(Path(output) / f"{filename}-en.pdf") return def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", type=str, default=None, nargs="*", help="One or more paths to PDF files.", ) parser.add_argument( "--version", "-v", action="version", version=f"pdf2zh v{__version__}", ) parser.add_argument( "--debug", "-d", default=False, action="store_true", help="Use debug logging level.", ) parse_params = parser.add_argument_group( "Parser", description="Used during PDF parsing", ) parse_params.add_argument( "--pages", "-p", type=str, help="The list of page numbers to parse.", ) parse_params.add_argument( "--password", "-P", type=str, default="", help="The password to use for decrypting PDF file.", ) parse_params.add_argument( "--vfont", "-f", type=str, default="", help="The regex to math font name of formula.", ) parse_params.add_argument( "--vchar", "-c", type=str, default="", help="The regex to math character of formula.", ) parse_params.add_argument( "--lang-in", "-li", type=str, default="auto", help="The code of source language.", ) parse_params.add_argument( "--lang-out", "-lo", type=str, default="auto", help="The code of target language.", ) parse_params.add_argument( "--service", "-s", type=str, default="google", help="The service to use for translation.", ) parse_params.add_argument( "--output", "-o", type=str, default="", help="Output directory for files.", ) parse_params.add_argument( "--thread", "-t", type=int, default=4, help="The number of threads to execute translation.", ) parse_params.add_argument( "--interactive", "-i", action="store_true", help="Interact with GUI.", ) parse_params.add_argument( "--share", action="store_true", help="Enable Gradio Share", ) return parser def parse_args(args: Optional[List[str]]) -> argparse.Namespace: parsed_args = create_parser().parse_args(args=args) if parsed_args.pages: pages = [] for p in parsed_args.pages.split(","): if "-" in p: start, end = p.split("-") pages.extend(range(int(start) - 1, int(end))) else: pages.append(int(p) - 1) parsed_args.pages = pages return parsed_args def main(args: Optional[List[str]] = None) -> int: parsed_args = parse_args(args) missing_files = check_files(parsed_args.files) if missing_files: print("The following files do not exist:", file=sys.stderr) for file in missing_files: print(f" {file}", file=sys.stderr) return -1 if parsed_args.interactive: from pdf2zh.gui import setup_gui setup_gui(parsed_args.share) return 0 extract_text(**vars(parsed_args)) return 0 if __name__ == "__main__": sys.exit(main())