Spaces:
Running
Running
#!/usr/bin/env python3 | |
"""A command line tool for extracting text and images from PDF and | |
output it to plain text, html, xml or tags. | |
""" | |
from __future__ import annotations | |
import argparse | |
import logging | |
import os | |
import sys | |
from pathlib import Path | |
from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional | |
import pymupdf | |
import requests | |
from pdf2zh import __version__ | |
from pdf2zh.pdfexceptions import PDFValueError | |
if TYPE_CHECKING: | |
from pdf2zh.layout import LAParams | |
from pdf2zh.utils import AnyIO | |
# Pairs of (output-file suffix, output type). extract_text() scans these to
# pick an output type from the outfile name when the type is still "text".
OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))
def setup_log() -> None:
    """Configure root logging and quieten the optional layout-model logger.

    ``logging.basicConfig()`` is always called. When ``doclayout_yolo`` can be
    imported, its ``utils.LOGGER`` is set to WARNING level; when the import
    fails, that step is skipped silently.
    """
    logging.basicConfig()
    try:
        import doclayout_yolo as layout_pkg

        layout_logger = layout_pkg.utils.LOGGER
        layout_logger.setLevel(logging.WARNING)
    except ImportError:
        # The package is optional; without it there is nothing to silence.
        return
def check_files(files: List[str]) -> List[str]:
    """Return the local paths in *files* that do not exist on disk.

    Entries beginning with ``http://`` or ``https://`` are treated as online
    files and are never reported as missing. Input order is preserved.
    """
    remote_prefixes = ("http://", "https://")
    absent = []
    for candidate in files:
        if candidate.startswith(remote_prefixes):
            continue  # online file: nothing to look up locally
        if not os.path.exists(candidate):
            absent.append(candidate)
    return absent
def float_or_disabled(x: str) -> Optional[float]:
    """Convert a command-line value to a float, or ``None`` for "disabled".

    Intended for use as an argparse ``type=`` callable.

    Args:
        x: Raw command-line value.

    Returns:
        ``None`` when *x* equals "disabled" (case-insensitive, surrounding
        whitespace ignored); otherwise ``float(x)``.

    Raises:
        argparse.ArgumentTypeError: If *x* is neither "disabled" nor a valid
            float. The underlying ``ValueError`` is kept as ``__cause__``.
    """
    if x.lower().strip() == "disabled":
        return None
    try:
        return float(x)
    except ValueError as exc:
        # Chain explicitly so the traceback shows this as a deliberate
        # translation rather than an error raised while handling another.
        raise argparse.ArgumentTypeError(f"invalid float value: {x}") from exc
def extract_text(
    files: Iterable[str] = [],
    outfile: str = "-",
    laparams: Optional[LAParams] = None,
    output_type: str = "text",
    codec: str = "utf-8",
    strip_control: bool = False,
    maxpages: int = 0,
    pages: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    debug: bool = False,
    disable_caching: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
) -> AnyIO:
    """Process each input PDF and write ``-zh`` and ``-dual`` PDFs to *output*.

    For every entry in *files*:

    1. If it is an ``http://``/``https://`` URL, download it to
       ``./pdf2zh_files/tmp_download.pdf`` and continue with that file.
    2. Open it with pymupdf, insert the "china-ss" and "tiro" fonts on every
       page and register them in every existing ``Font`` resource dictionary.
    3. Save the result as ``<name>-en.pdf`` in *output* and pass it to
       ``pdf2zh.high_level.extract_text_to_fp``, which returns a mapping of
       object id to replacement stream text.
    4. Apply those replacement streams, then save ``<name>-zh.pdf`` (patched
       pages) and ``<name>-dual.pdf`` (each saved ``-en`` page followed by its
       patched counterpart), and delete the intermediate ``-en`` file.

    Args:
        files: Paths or URLs of the PDFs to process.
        outfile: Only used to infer *output_type* from its suffix.
        output: Directory in which the result PDFs are written.
        **kwargs: Extra options, forwarded as described below.

    All parameters and the function's other local variables are forwarded to
    ``extract_text_to_fp`` as keyword arguments via ``locals()``; most
    parameters are not used directly in this function.

    Returns:
        Nothing is returned in practice (bare ``return``), despite the
        ``AnyIO`` annotation.

    Raises:
        PDFValueError: If *files* is empty, or a download fails.
    """
    import pdf2zh.high_level
    from pdf2zh.doclayout import DocLayoutModel

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    outfp: AnyIO = sys.stdout
    model = DocLayoutModel.load_available()

    for file in files:
        if file.startswith("http://") or file.startswith("https://"):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        # Create the download directory itself. The previous
                        # os.mkdir(os.path.dirname(...)) resolved to
                        # os.mkdir("."), which always raised FileExistsError.
                        os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                ) from e
        filename = os.path.splitext(os.path.basename(file))[0]

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        font_list = ["china-ss", "tiro"]
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font] = page.insert_font(font)
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            # The font resources may hang off an XObject's own "Resources"
            # entry, or sit directly on the object.
            for label in ["Resources/", ""]:
                try:  # reading or writing an xref key may fail
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
                                )
                except Exception:
                    # Best effort: skip objects whose keys cannot be accessed.
                    pass
        doc_en.save(Path(output) / f"{filename}-en.pdf")

        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
            # NOTE: every local name in this function is forwarded as a keyword
            # argument here, so renaming or removing a local (including the
            # otherwise unused ones) changes what the callee receives.
            obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())

        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        # Re-open the saved, unpatched copy and append the patched pages to it.
        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
        doc_dual.insert_file(doc_zh)
        # Interleave: move each appended page to sit right after its original.
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"{filename}-en.pdf")

    return
def create_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser.

    The parser uses the module docstring as its description. It defines the
    positional ``files`` list, ``--version`` and ``--debug``, and a "Parser"
    argument group holding the remaining options (pages, password, formula
    regexes, languages, service, output directory, thread count and the
    GUI-related flags).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        help="The regex to match font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        help="The regex to match character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="auto",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="auto",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )
    return parser
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse command-line arguments and expand the ``--pages`` specification.

    ``--pages`` is a comma-separated list of 1-based page numbers and
    inclusive ``start-end`` ranges (for example ``1-3,5``). When it is given,
    ``pages`` on the returned namespace is replaced by a list of the
    corresponding 0-based page indices; otherwise it is left as parsed.
    """
    namespace = create_parser().parse_args(args=args)
    if not namespace.pages:
        return namespace

    zero_based = []
    for token in namespace.pages.split(","):
        if "-" not in token:
            zero_based.append(int(token) - 1)
            continue
        first, last = token.split("-")
        # 1-based inclusive range -> 0-based half-open range.
        zero_based.extend(range(int(first) - 1, int(last)))
    namespace.pages = zero_based
    return namespace
def main(args: Optional[List[str]] = None) -> int:
    """Command-line entry point.

    Parses *args*, reports any local input files that do not exist, then
    either launches the GUI (``--interactive``) or sets up logging and runs
    ``extract_text`` with the parsed options.

    Returns:
        ``-1`` if any local input file is missing, otherwise ``0``.
    """
    options = parse_args(args)

    absent = check_files(options.files)
    if absent:
        report = ["The following files do not exist:"]
        report.extend(f" {path}" for path in absent)
        print("\n".join(report), file=sys.stderr)
        return -1

    if not options.interactive:
        setup_log()
        extract_text(**vars(options))
        return 0

    # Imported lazily so the GUI module is only loaded when requested.
    from pdf2zh.gui import setup_gui

    setup_gui(options.share)
    return 0
if __name__ == "__main__":
    # Run the CLI once and use main()'s return value as the exit status.
    sys.exit(main())