leonsimon23's picture
Upload 9 files
8b23ca3 verified
raw
history blame
9.59 kB
#!/usr/bin/env python3
"""A command line tool for extracting text and images from PDF and
output it to plain text, html, xml or tags.
"""
from __future__ import annotations
import argparse
import os
import sys
import logging
from pathlib import Path
from typing import Any, Container, Iterable, List, Optional
import urllib.request
from pdfminer.pdfexceptions import PDFValueError
import pymupdf
import requests
import tempfile
from pdf2zh import __version__, log
from pdf2zh.high_level import extract_text_to_fp
from pdf2zh.doclayout import DocLayoutModel
# Configure the root logger with the standard library's default handler.
logging.basicConfig()
# Load the layout model once at import time; extract_text() passes this same
# object to extract_text_to_fp() for every file it processes.
model = DocLayoutModel.load_available()
# Target-language code -> font name used for the translated text.
# extract_text() consults this mapping before falling back to noto_list.
resfont_map = {
    "zh-CN": "china-ss",  # Chinese (PRC)
    "zh-TW": "china-ts",  # Chinese (Taiwan)
    "ja": "japan-s",  # Japanese
    "ko": "korea-s",  # Korean
}
# Target-language codes for which extract_text() uses the downloaded
# GoNotoKurrent font. Japanese ("ja"), Korean ("ko") and Chinese
# ("zh-CN", "zh-TW") are left out of this list; those codes are keys of
# resfont_map instead.
noto_list = [
    "am",  # Amharic
    "ar",  # Arabic
    "bn",  # Bengali
    "bg",  # Bulgarian
    "chr",  # Cherokee
    "el",  # Greek
    "gu",  # Gujarati
    "iw",  # Hebrew
    "hi",  # Hindi
    "kn",  # Kannada
    "ml",  # Malayalam
    "mr",  # Marathi
    "ru",  # Russian
    "sr",  # Serbian
    "ta",  # Tamil
    "te",  # Telugu
    "th",  # Thai
    "ur",  # Urdu
    "uk",  # Ukrainian
]
def check_files(files: List[str]) -> List[str]:
    """Return the entries of ``files`` that are local paths missing from disk.

    Entries that start with ``http://`` or ``https://`` are treated as online
    files and are never reported, whether or not they are reachable.
    """
    online_prefixes = ("http://", "https://")
    return [
        path
        for path in files
        if not path.startswith(online_prefixes) and not os.path.exists(path)
    ]
def extract_text(
    files: Iterable[str] = (),
    pages: Optional[Container[int]] = None,
    password: str = "",
    debug: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
) -> None:
    """Translate each PDF in ``files`` and write the results into ``output``.

    For every input the function:

    1. downloads it to ``./pdf2zh_files/tmp_download.pdf`` when the entry is
       an ``http://`` / ``https://`` URL;
    2. registers the fonts needed for ``lang_out`` on every page and in every
       font resource dictionary that lacks them;
    3. saves an intermediate ``<name>-en.pdf`` and hands it to
       ``extract_text_to_fp`` to obtain replacement content streams;
    4. writes ``<name>-zh.pdf`` (patched document) and ``<name>-dual.pdf``
       (pages of the intermediate and patched documents interleaved), then
       deletes the intermediate file.

    Args:
        files: Local paths or http(s) URLs of the PDFs to process.
        pages, password, vfont, vchar, thread, lang_in, service, callback:
            Not interpreted here; forwarded to ``extract_text_to_fp``.
        debug: When true, set the package logger to ``DEBUG``.
        lang_out: Target language code; also selects the font to embed
            (``resfont_map``, then ``noto_list``, otherwise ``china-ss``).
        output: Directory in which the result files are written.
        **kwargs: Extra options accepted from callers; forwarded as-is.

    Raises:
        PDFValueError: If ``files`` is empty or a download fails.
    """
    if debug:
        log.setLevel(logging.DEBUG)

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    # NOTE: extract_text_to_fp() below is called with **locals(), so every
    # local variable of this function is forwarded by name. Local names
    # (including `id`, `r`, `f`) are therefore kept exactly as they were.
    for file in files:
        # isinstance() is required here: `file is str` compares the value with
        # the type object itself and is never true for an actual string.
        if isinstance(file, str) and file.startswith(("http://", "https://")):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        # Create the directory itself; dirname() of this path
                        # is "." and mkdir(".") always fails.
                        os.makedirs("./pdf2zh_files", exist_ok=True)
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                ) from e
        filename = os.path.splitext(os.path.basename(file))[0]

        # Each entry is (font name, font file path or None).
        font_list = [("tiro", None)]
        noto = None
        if lang_out in resfont_map:  # CJK
            resfont = resfont_map[lang_out]
            font_list.append((resfont, None))
        elif lang_out in noto_list:  # noto
            resfont = "noto"
            # The font file is cached in the system temp dir between runs.
            ttf_path = os.path.join(tempfile.gettempdir(), "GoNotoKurrent-Regular.ttf")
            if not os.path.exists(ttf_path):
                print("Downloading Noto font...")
                urllib.request.urlretrieve(
                    "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
                    ttf_path,
                )
            font_list.append(("noto", ttf_path))
            noto = pymupdf.Font("noto", ttf_path)
        else:  # auto
            resfont = "china-ss"
            font_list.append(("china-ss", None))

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        # Insert every font on every page, remembering the xref it received.
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font[0]] = page.insert_font(font[0], font[1])
        # Also add the fonts to any other object that carries a font
        # dictionary, either directly or under its Resources entry.
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # the resources may belong to an XObject
                try:  # reading or writing an xref may fail
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font[0]}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref,
                                    f"{label}Font/{font[0]}",
                                    f"{font_id[font[0]]} 0 R",
                                )
                except Exception:
                    # Best effort: skip objects that cannot be inspected or
                    # patched, but leave a trace for debugging.
                    log.debug(
                        "Skipping font patch for xref %s (%sFont)",
                        xref,
                        label,
                        exc_info=True,
                    )
        doc_en.save(Path(output) / f"{filename}-en.pdf")

        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
            obj_patch: dict = extract_text_to_fp(fp, model=model, **locals())

        # Replace each returned object's stream with its new operators.
        for obj_id, ops_new in obj_patch.items():
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        # The dual document starts from the saved intermediate file, gets the
        # patched pages appended, then each appended page is moved so that it
        # sits next to its counterpart.
        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"{filename}-en.pdf")
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the command line interface.

    ``files``, ``--version`` and ``--debug`` are registered on the parser
    itself; all remaining options live in an argument group titled "Parser".
    Arguments are registered in the order listed so the help output keeps
    its layout.
    """
    cli = argparse.ArgumentParser(description=__doc__, add_help=True)

    # (option strings, add_argument keyword settings) for the parser itself.
    top_level_specs = [
        (
            ("files",),
            {
                "type": str,
                "default": None,
                "nargs": "*",
                "help": "One or more paths to PDF files.",
            },
        ),
        (
            ("--version", "-v"),
            {"action": "version", "version": f"pdf2zh v{__version__}"},
        ),
        (
            ("--debug", "-d"),
            {
                "default": False,
                "action": "store_true",
                "help": "Use debug logging level.",
            },
        ),
    ]
    for names, settings in top_level_specs:
        cli.add_argument(*names, **settings)

    parsing_group = cli.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )

    # Same layout as above, for the options of the "Parser" group.
    parsing_specs = [
        (
            ("--pages", "-p"),
            {"type": str, "help": "The list of page numbers to parse."},
        ),
        (
            ("--password", "-P"),
            {
                "type": str,
                "default": "",
                "help": "The password to use for decrypting PDF file.",
            },
        ),
        (
            ("--vfont", "-f"),
            {
                "type": str,
                "default": "",
                "help": "The regex to math font name of formula.",
            },
        ),
        (
            ("--vchar", "-c"),
            {
                "type": str,
                "default": "",
                "help": "The regex to math character of formula.",
            },
        ),
        (
            ("--lang-in", "-li"),
            {
                "type": str,
                "default": "auto",
                "help": "The code of source language.",
            },
        ),
        (
            ("--lang-out", "-lo"),
            {
                "type": str,
                "default": "auto",
                "help": "The code of target language.",
            },
        ),
        (
            ("--service", "-s"),
            {
                "type": str,
                "default": "google",
                "help": "The service to use for translation.",
            },
        ),
        (
            ("--output", "-o"),
            {
                "type": str,
                "default": "",
                "help": "Output directory for files.",
            },
        ),
        (
            ("--thread", "-t"),
            {
                "type": int,
                "default": 4,
                "help": "The number of threads to execute translation.",
            },
        ),
        (
            ("--interactive", "-i"),
            {"action": "store_true", "help": "Interact with GUI."},
        ),
        (
            ("--share",),
            {"action": "store_true", "help": "Enable Gradio Share"},
        ),
    ]
    for names, settings in parsing_specs:
        parsing_group.add_argument(*names, **settings)

    return cli
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse ``args`` and expand ``--pages`` into zero-based page indices.

    The ``--pages`` value is a comma-separated list whose items are either a
    single 1-based page number (``"3"``) or an inclusive 1-based range
    (``"2-5"``). When the option is absent or empty the namespace is returned
    unchanged.
    """
    namespace = create_parser().parse_args(args=args)

    page_spec = namespace.pages
    if not page_spec:
        return namespace

    selected = []
    for item in page_spec.split(","):
        if "-" not in item:
            selected.append(int(item) - 1)
            continue
        first, last = item.split("-")
        # Inclusive 1-based range -> half-open zero-based range.
        selected += range(int(first) - 1, int(last))

    namespace.pages = selected
    return namespace
def main(args: Optional[List[str]] = None) -> int:
    """Run the command line tool and return its exit status.

    Returns ``-1`` after listing, on stderr, any local input files that do
    not exist. Otherwise returns ``0`` once either the GUI has been started
    (``--interactive``) or the given files have been processed.
    """
    options = parse_args(args)

    absent = check_files(options.files)
    if absent:
        print("The following files do not exist:", file=sys.stderr)
        for path in absent:
            print(f" {path}", file=sys.stderr)
        return -1

    if not options.interactive:
        extract_text(**vars(options))
        return 0

    # The GUI module is imported only when it is actually requested.
    from pdf2zh.gui import setup_gui

    setup_gui(options.share)
    return 0
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())