Spaces:

sanbo1200
/

PDFTranslate

Running

File size: 9,053 Bytes

9b0f4a0

#!/usr/bin/env python3
"""A command line tool for extracting text and images from PDF and
output it to plain text, html, xml or tags.
"""

from __future__ import annotations

import argparse
import logging
import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional

import pymupdf
import requests

from pdf2zh import __version__
from pdf2zh.pdfexceptions import PDFValueError

if TYPE_CHECKING:
    from pdf2zh.layout import LAParams
    from pdf2zh.utils import AnyIO

OUTPUT_TYPES = ((".htm", "html"), (".html", "html"), (".xml", "xml"), (".tag", "tag"))


def setup_log() -> None:
    logging.basicConfig()

    try:
        import doclayout_yolo

        doclayout_yolo.utils.LOGGER.setLevel(logging.WARNING)
    except ImportError:
        pass


def check_files(files: List[str]) -> List[str]:
    files = [
        f for f in files if not f.startswith("http://")
    ]  # exclude online files, http
    files = [
        f for f in files if not f.startswith("https://")
    ]  # exclude online files, https
    missing_files = [file for file in files if not os.path.exists(file)]
    return missing_files


def float_or_disabled(x: str) -> Optional[float]:
    if x.lower().strip() == "disabled":
        return None
    try:
        return float(x)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid float value: {x}")


def extract_text(
    files: Iterable[str] = [],
    outfile: str = "-",
    laparams: Optional[LAParams] = None,
    output_type: str = "text",
    codec: str = "utf-8",
    strip_control: bool = False,
    maxpages: int = 0,
    pages: Optional[Container[int]] = None,
    password: str = "",
    scale: float = 1.0,
    rotation: int = 0,
    layoutmode: str = "normal",
    output_dir: Optional[str] = None,
    debug: bool = False,
    disable_caching: bool = False,
    vfont: str = "",
    vchar: str = "",
    thread: int = 0,
    lang_in: str = "",
    lang_out: str = "",
    service: str = "",
    callback: object = None,
    output: str = "",
    **kwargs: Any,
) -> AnyIO:
    import pdf2zh.high_level
    from pdf2zh.doclayout import DocLayoutModel

    if not files:
        raise PDFValueError("Must provide files to work upon!")

    if output_type == "text" and outfile != "-":
        for override, alttype in OUTPUT_TYPES:
            if outfile.endswith(override):
                output_type = alttype

    outfp: AnyIO = sys.stdout
    model = DocLayoutModel.load_available()

    for file in files:
        if file.startswith("http://") or file.startswith("https://"):
            print("Online files detected, downloading...")
            try:
                r = requests.get(file, allow_redirects=True)
                if r.status_code == 200:
                    if not os.path.exists("./pdf2zh_files"):
                        print("Making a temporary dir for downloading PDF files...")
                        os.mkdir(os.path.dirname("./pdf2zh_files"))
                    with open("./pdf2zh_files/tmp_download.pdf", "wb") as f:
                        print(f"Writing the file: {file}...")
                        f.write(r.content)
                    file = "./pdf2zh_files/tmp_download.pdf"
                else:
                    r.raise_for_status()
            except Exception as e:
                raise PDFValueError(
                    f"Errors occur in downloading the PDF file. Please check the link(s).\nError:\n{e}"
                )
        filename = os.path.splitext(os.path.basename(file))[0]

        doc_en = pymupdf.open(file)
        page_count = doc_en.page_count
        font_list = ["china-ss", "tiro"]
        font_id = {}
        for page in doc_en:
            for font in font_list:
                font_id[font] = page.insert_font(font)
        xreflen = doc_en.xref_length()
        for xref in range(1, xreflen):
            for label in ["Resources/", ""]:  # 可能是基于 xobj 的 res
                try:  # xref 读写可能出错
                    font_res = doc_en.xref_get_key(xref, f"{label}Font")
                    if font_res[0] == "dict":
                        for font in font_list:
                            font_exist = doc_en.xref_get_key(
                                xref, f"{label}Font/{font}"
                            )
                            if font_exist[0] == "null":
                                doc_en.xref_set_key(
                                    xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
                                )
                except Exception:
                    pass
        doc_en.save(Path(output) / f"{filename}-en.pdf")

        with open(Path(output) / f"{filename}-en.pdf", "rb") as fp:
            obj_patch: dict = pdf2zh.high_level.extract_text_to_fp(fp, **locals())

        for obj_id, ops_new in obj_patch.items():
            # ops_old=doc_en.xref_stream(obj_id)
            # print(obj_id)
            # print(ops_old)
            # print(ops_new.encode())
            doc_en.update_stream(obj_id, ops_new.encode())

        doc_zh = doc_en
        doc_dual = pymupdf.open(Path(output) / f"{filename}-en.pdf")
        doc_dual.insert_file(doc_zh)
        for id in range(page_count):
            doc_dual.move_page(page_count + id, id * 2 + 1)
        doc_zh.save(Path(output) / f"{filename}-zh.pdf", deflate=1)
        doc_dual.save(Path(output) / f"{filename}-dual.pdf", deflate=1)
        doc_zh.close()
        doc_dual.close()
        os.remove(Path(output) / f"{filename}-en.pdf")

    return


def create_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)
    parser.add_argument(
        "files",
        type=str,
        default=None,
        nargs="*",
        help="One or more paths to PDF files.",
    )
    parser.add_argument(
        "--version",
        "-v",
        action="version",
        version=f"pdf2zh v{__version__}",
    )
    parser.add_argument(
        "--debug",
        "-d",
        default=False,
        action="store_true",
        help="Use debug logging level.",
    )
    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    parse_params.add_argument(
        "--pages",
        "-p",
        type=str,
        help="The list of page numbers to parse.",
    )
    parse_params.add_argument(
        "--password",
        "-P",
        type=str,
        default="",
        help="The password to use for decrypting PDF file.",
    )
    parse_params.add_argument(
        "--vfont",
        "-f",
        type=str,
        default="",
        help="The regex to math font name of formula.",
    )
    parse_params.add_argument(
        "--vchar",
        "-c",
        type=str,
        default="",
        help="The regex to math character of formula.",
    )
    parse_params.add_argument(
        "--lang-in",
        "-li",
        type=str,
        default="auto",
        help="The code of source language.",
    )
    parse_params.add_argument(
        "--lang-out",
        "-lo",
        type=str,
        default="auto",
        help="The code of target language.",
    )
    parse_params.add_argument(
        "--service",
        "-s",
        type=str,
        default="google",
        help="The service to use for translation.",
    )
    parse_params.add_argument(
        "--output",
        "-o",
        type=str,
        default="",
        help="Output directory for files.",
    )
    parse_params.add_argument(
        "--thread",
        "-t",
        type=int,
        default=4,
        help="The number of threads to execute translation.",
    )
    parse_params.add_argument(
        "--interactive",
        "-i",
        action="store_true",
        help="Interact with GUI.",
    )
    parse_params.add_argument(
        "--share",
        action="store_true",
        help="Enable Gradio Share",
    )

    return parser


def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    parsed_args = create_parser().parse_args(args=args)

    if parsed_args.pages:
        pages = []
        for p in parsed_args.pages.split(","):
            if "-" in p:
                start, end = p.split("-")
                pages.extend(range(int(start) - 1, int(end)))
            else:
                pages.append(int(p) - 1)
        parsed_args.pages = pages

    return parsed_args


def main(args: Optional[List[str]] = None) -> int:
    parsed_args = parse_args(args)

    missing_files = check_files(parsed_args.files)
    if missing_files:
        print("The following files do not exist:", file=sys.stderr)
        for file in missing_files:
            print(f"  {file}", file=sys.stderr)
        return -1
    if parsed_args.interactive:
        from pdf2zh.gui import setup_gui

        setup_gui(parsed_args.share)
        return 0

    setup_log()
    extract_text(**vars(parsed_args))
    return 0


if __name__ == "__main__":
    sys.exit(main())
    sys.exit(main())