Spaces:
Running
Running
#!/usr/bin/env python3 | |
"""A command line tool for extracting text and images from PDF and | |
output it to plain text, html, xml or tags. | |
""" | |
from __future__ import annotations | |
import argparse | |
import os | |
import sys | |
import logging | |
from pathlib import Path | |
from typing import Any, Container, Iterable, List, Optional | |
import urllib.request | |
from pdfminer.pdfexceptions import PDFValueError | |
import pymupdf | |
import requests | |
import tempfile | |
from pdf2zh import __version__, log | |
from pdf2zh.high_level import extract_text_to_fp | |
from pdf2zh.doclayout import DocLayoutModel | |
# Attach the default handler to the root logger so log records are emitted.
logging.basicConfig()

# Loaded once at import time and handed to extract_text_to_fp() on every run.
# NOTE(review): presumably the document-layout detection model -- confirm in
# pdf2zh.doclayout.
model = DocLayoutModel.load_available()

# Target language code -> font name that is registered WITHOUT a font file
# (extract_text() passes these to page.insert_font() with file=None).
resfont_map = {
    "zh-CN": "china-ss",  # Chinese (PRC)
    "zh-TW": "china-ts",  # Chinese (Taiwan)
    "ja": "japan-s",  # Japanese
    "ko": "korea-s",  # Korean
}

# Target language codes that extract_text() renders with the downloaded
# GoNotoKurrent font.  The CJK codes (ja, ko, zh-CN, zh-TW) are intentionally
# absent: they are keys of resfont_map, which is consulted first.
#
#   am  Amharic    ar  Arabic     bn  Bengali    bg  Bulgarian  chr Cherokee
#   el  Greek      gu  Gujarati   iw  Hebrew     hi  Hindi      kn  Kannada
#   ml  Malayalam  mr  Marathi    ru  Russian    sr  Serbian    ta  Tamil
#   te  Telugu     th  Thai       ur  Urdu       uk  Ukrainian
noto_list = [
    "am", "ar", "bn", "bg", "chr",
    "el", "gu", "iw", "hi", "kn",
    "ml", "mr", "ru", "sr", "ta",
    "te", "th", "ur", "uk",
]
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    Entries beginning with ``http://`` or ``https://`` are treated as online
    files: they are skipped and therefore never reported as missing.

    Args:
        files: File paths and/or http(s) URLs.

    Returns:
        The non-URL entries for which ``os.path.exists`` is false, in their
        original order.
    """
    # str.startswith accepts a tuple, so both schemes are tested in one call.
    online_prefixes = ("http://", "https://")
    return [
        file
        for file in files
        if not file.startswith(online_prefixes) and not os.path.exists(file)
    ]
def extract_text(
    files: Iterable[str] = (),
    pages: Optional[Container[int]] = None,
    password: str = "",
    debug: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
) -> None:
    """Process every PDF in *files* and write the results into *output*.

    For each entry the function

    1. downloads it to ``./pdf2zh_files/tmp_download.pdf`` when it is an
       http(s) URL;
    2. chooses the output fonts from *lang_out*: a code in ``resfont_map``
       selects that font, a code in ``noto_list`` selects the GoNotoKurrent
       font (downloaded into the system temp directory on first use), and
       anything else falls back to ``"china-ss"``; ``"tiro"`` is always added;
    3. registers those fonts on every page and in every font resource
       dictionary found while scanning the xref table;
    4. saves an intermediate ``<name>-en.pdf``, feeds it to
       ``extract_text_to_fp`` and applies the returned
       ``{object id: new content stream}`` patches to the open document
       (presumably the translated text -- that function lives outside this
       file);
    5. saves ``<name>-zh.pdf`` (patched document) and ``<name>-dual.pdf``
       (each page of the intermediate file followed by its patched
       counterpart), then deletes the intermediate file.

    Args:
        files: Local paths and/or http(s) URLs of the PDFs to process.
        pages, password, vfont, vchar, thread, lang_in, service, callback,
            kwargs: Not read here; forwarded to ``extract_text_to_fp``.
        debug: When true, ``log`` is switched to ``logging.DEBUG``.
        lang_out: Target language code; drives the font selection above and
            is forwarded as well.
        output: Directory that receives the generated files.

    Raises:
        PDFValueError: If *files* is empty or a download fails.

    Note:
        ``extract_text_to_fp`` is called with ``**locals()``, i.e. it receives
        every local variable of this function by name.  Locals (including the
        builtin-shadowing ``id``) are therefore kept exactly as they were; do
        not rename or remove any without checking that function's signature.
        Every downloaded URL is stored under the same temporary name, so the
        outputs for URL inputs are all called ``tmp_download-*.pdf``.
    """
    if debug:
        log.setLevel(logging.DEBUG)

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    for file in files:
        # isinstance(), not `file is str`: the identity test compared the
        # value with the `str` type itself, was never true, and left the
        # download branch unreachable.
        if isinstance(file, str) and file.startswith(("http://", "https://")):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                    # The former os.mkdir(os.path.dirname(...)) resolved to
                    # os.mkdir(".") and raised FileExistsError.
                    os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                ) from e
        filename = os.path.splitext(os.path.basename(file))[0]

        # (font name, font file or None) pairs to register in the document.
        font_list = [("tiro", None)]
        noto = None
        if lang_out in resfont_map:  # CJK
            resfont = resfont_map[lang_out]
            font_list.append((resfont, None))
        elif lang_out in noto_list:  # noto
            resfont = "noto"
            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
            if not os.path.exists(ttf_path):
                print("Downloading Noto font...")
                urllib.request.urlretrieve(
                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                    ttf_path,
                )
            font_list.append(("noto", ttf_path))
            noto = pymupdf.Font("noto", ttf_path)
        else:  # auto
            resfont = "china-ss"
            font_list.append(("china-ss", None))

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        # Register each font on every page, remembering the xref it received.
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font[0]] = page.insert_font(font[0], font[1])
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            # The font dict may sit under Resources/ or directly on the object
            # (possibly resources that belong to an XObject).
            for label in ["Resources/", ""]:
                try:  # reading/writing an xref entry may fail
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font[0]}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref,
                                    f"{label}Font/{font[0]}",
                                    f"{font_id[font[0]]} 0 R",
                                )
                except Exception:
                    # Deliberately best-effort: objects that cannot be
                    # inspected or patched are skipped.
                    pass
        doc_en.save(Path(output) / f"{filename}-en.pdf")

        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
            # Every local defined so far is forwarded by name (see Note).
            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())

        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        # Re-open the unpatched intermediate file, append the patched pages,
        # then move patched page i to sit right after source page i.
        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"{filename}-en.pdf")

    return
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for pdf2zh.

    The module docstring is used as the program description.  Besides the
    positional ``files`` and the global ``--version`` / ``--debug`` flags, all
    options live in the "Parser" argument group.

    Returns:
        A fully configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        # Help text previously read "regex to math"; "match" is meant.
        help="The regex to match font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        # Help text previously read "regex to math"; "match" is meant.
        help="The regex to match character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="auto",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="auto",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )
    return parser
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse *args* and expand ``--pages`` into zero-based page indices.

    The ``--pages`` value is a comma-separated list whose items are either a
    single 1-based page number (``"5"``) or an inclusive 1-based range
    (``"2-4"``).  When present it is replaced on the namespace by a flat list
    of 0-based indices; otherwise the namespace is returned untouched.

    Args:
        args: Argument strings, or ``None`` to let argparse read ``sys.argv``.

    Returns:
        The populated namespace.
    """
    namespace = create_parser().parse_args(args=args)
    spec = namespace.pages
    if not spec:
        return namespace

    indices = []
    for item in spec.split(","):
        if "-" not in item:
            # Single page: shift from 1-based to 0-based.
            indices.append(int(item) - 1)
            continue
        # "a-b" covers pages a..b inclusive, i.e. indices a-1 .. b-1.
        first, last = item.split("-")
        indices.extend(range(int(first) - 1, int(last)))
    namespace.pages = indices
    return namespace
def main(args: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Parses the arguments, reports local input files that do not exist, and
    then either launches the GUI (``--interactive``) or runs ``extract_text``
    with the parsed options.

    Args:
        args: Argument strings, or ``None`` to use ``sys.argv``.

    Returns:
        ``-1`` when at least one local input file is missing, ``0`` otherwise.
    """
    options = parse_args(args)

    absent = check_files(options.files)
    if absent:
        print("The following files do not exist:", file=sys.stderr)
        for path in absent:
            print(f" {path}", file=sys.stderr)
        return -1

    if not options.interactive:
        extract_text(**vars(options))
        return 0

    # Imported lazily so the GUI module is only loaded when requested.
    from pdf2zh.gui import setup_gui

    setup_gui(options.share)
    return 0
# Run the CLI when executed as a script; main()'s return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())