File size: 6,091 Bytes
91e1678
 
 
 
 
 
 
 
 
 
 
 
 
4a4fa23
 
91e1678
 
 
 
 
 
 
4a4fa23
91e1678
4a4fa23
 
91e1678
4a4fa23
91e1678
4a4fa23
 
 
 
 
 
 
 
 
 
 
 
91e1678
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a4fa23
 
 
 
 
 
91e1678
 
 
 
 
 
 
 
 
 
 
 
 
 
4a4fa23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91e1678
4a4fa23
 
 
 
91e1678
 
 
 
 
 
 
 
4a4fa23
 
 
 
 
 
 
 
 
 
 
 
9d793d0
4a4fa23
9d793d0
 
4a4fa23
 
 
 
 
 
 
91e1678
 
 
 
 
 
4a4fa23
 
 
 
91e1678
 
 
4a4fa23
 
 
 
 
91e1678
 
 
 
4a4fa23
 
91e1678
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import argparse
import html
import os
import sys

import pymupdf
from deep_translator import (
    GoogleTranslator,
    ChatGptTranslator,
)

# Map of supported translators.
# Key: the name accepted by the ``--translator`` CLI option and by the
# ``translator_name`` argument of ``translate_pdf``.
# Value: the deep_translator class that is instantiated with
# ``source=...`` / ``target=...`` to perform the translation.
TRANSLATORS = {
    'google': GoogleTranslator,
    'chatgpt': ChatGptTranslator,
}

def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Text",
                 translator_name: str = "google", text_color: str = "darkred", keep_original: bool = True):
    """
    Translate a PDF file from source language to target language.

    Every text block of every page is translated and written back into the
    block's bounding box on an Optional Content (OCG) layer. The result is
    saved next to the input file as ``<input name>-<target_lang>.pdf``.

    Args:
        input_file: Path to input PDF file
        source_lang: Source language code (e.g. 'en', 'fr')
        target_lang: Target language code (e.g. 'ko', 'ja')
        layer: Name of the OCG layer (default: "Text")
        translator_name: Name of the translator to use (default: "google")
        text_color: Color of translated text (default: "darkred"); unknown
            names fall back to "darkred"
        keep_original: Whether to keep original text visible (default: True).
            When True, the white cover over the original text belongs to the
            translation layer, so hiding that layer reveals the original.
            When False, the original text is covered permanently and a copy
            of it is placed on a separate "Original" layer that is off by
            default.

    Raises:
        ValueError: If ``translator_name`` is not a key of ``TRANSLATORS``.
    """
    # Fail fast on an unknown translator, before the input file is opened.
    if translator_name not in TRANSLATORS:
        raise ValueError(f"Unsupported translator: {translator_name}. Available translators: {', '.join(TRANSLATORS.keys())}")

    # Configure the translator
    translator = TRANSLATORS[translator_name](source=source_lang, target=target_lang)

    # Define colors
    WHITE = pymupdf.pdfcolor["white"]

    # Color mapping
    COLOR_MAP = {
        "darkred": (0.8, 0, 0),
        "black": (0, 0, 0),
        "blue": (0, 0, 0.8),
        "darkgreen": (0, 0.5, 0),
        "purple": (0.5, 0, 0.5),
    }

    # Get RGB color values, default to darkred if color not found
    rgb_color = COLOR_MAP.get(text_color.lower(), COLOR_MAP["darkred"])

    # The stylesheets do not depend on the block, so build them once
    # instead of once per text block.
    red, green, blue = (int(channel * 255) for channel in rgb_color)
    translated_css = f"* {{font-family: sans-serif; color: rgb({red}, {green}, {blue});}}"
    original_css = "* {font-family: sans-serif;}"

    # This flag ensures that text will be dehyphenated after extraction.
    textflags = pymupdf.TEXT_DEHYPHENATE

    # Generate output filename
    output_file = _translated_output_path(input_file, target_lang)

    # Open the document; the context manager closes it even if translation
    # or saving fails part-way through.
    with pymupdf.open(input_file) as doc:
        # Define an Optional Content layer for translation
        ocg_trans = doc.add_ocg(layer, on=True)

        # If not keeping original, create a layer for original text and hide it
        ocg_orig = None if keep_original else doc.add_ocg("Original", on=False)

        # Iterate over all pages
        for page in doc:
            # Extract text grouped like lines in a paragraph.
            blocks = page.get_text("blocks", flags=textflags)

            # Every block of text is contained in a rectangle ("bbox")
            for block in blocks:
                bbox = block[:4]  # area containing the text
                text = block[4]  # the text of this block

                # Nothing to translate in a whitespace-only block.
                if not text.strip():
                    continue

                # Invoke the actual translation
                translated = translator.translate(text)

                # If the translator yields nothing, leave the block as it is
                # rather than blanking it out or passing None to the renderer.
                if not translated:
                    continue

                if keep_original:
                    # Cover the original text only in translation layer
                    page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg_trans)
                else:
                    # Clear original text area in base layer FIRST; content is
                    # painted in insertion order, so drawing the cover after
                    # the copy below would hide the "Original" layer for good.
                    page.draw_rect(bbox, color=None, fill=WHITE)
                    # Move original text to hidden layer
                    page.insert_htmlbox(
                        bbox,
                        _as_html_text(text),
                        css=original_css,
                        oc=ocg_orig
                    )

                # Write the translated text in specified color
                page.insert_htmlbox(
                    bbox,
                    _as_html_text(translated),
                    css=translated_css,
                    oc=ocg_trans
                )

        doc.subset_fonts()
        doc.ez_save(output_file)

    print(f"Translated PDF saved as: {output_file}")


def _translated_output_path(input_file: str, target_lang: str) -> str:
    """Return the output path: the input path with its extension replaced by ``-<target_lang>.pdf``."""
    # os.path.splitext only looks at the last path component, so a dot in a
    # directory name (e.g. "my.dir/file") does not truncate the path.
    stem, _extension = os.path.splitext(input_file)
    return f"{stem}-{target_lang}.pdf"


def _as_html_text(text: str) -> str:
    """Escape ``&``, ``<`` and ``>`` so plain text is not parsed as HTML markup by insert_htmlbox."""
    return html.escape(text, quote=False)

def main(argv=None):
    """
    Command-line entry point: parse the arguments and translate one PDF.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When None, argparse reads them from ``sys.argv``.

    Exits with status 1 and prints the error to stderr if the translation
    fails.

    can be invoked like this:
    ```
    # Basic usage
    python translator_cli.py --source english --target zh-CN input.pdf

    # With custom color and hiding original text
    python translator_cli.py --source english --target zh-CN --color blue --no-original input.pdf

    # Using ChatGPT translator
    export OPENAI_API_KEY=sk-proj-xxxx
    export OPENAI_API_BASE=https://api.xxxx.com/v1
    # ... or, for a local llm api, use this base URL instead:
    export OPENAI_API_BASE=http://localhost:8080/v1
    export OPENAI_MODEL=default_model

    python translator_cli.py --source english --translator chatgpt --target zh-CN input.pdf

    # do not keep original text as an optional layer:
    python translator_cli.py --source english --translator chatgpt --target zh-CN --no-original input.pdf

    ```

    The translated content is an optional content layer in the new PDF file.
    The optional layer can be hidden in Acrobat PDF Reader and Foxit Reader.
    """

    parser = argparse.ArgumentParser(description='Translate PDF documents.')
    parser.add_argument('input_file', help='Input PDF file path')
    parser.add_argument('--source', '-s', default='en',
                       help='Source language code (default: en)')
    parser.add_argument('--target', '-t', default='zh-CN',
                       help='Target language code (default: zh-CN)')
    parser.add_argument('--layer', '-l', default='Text',
                       help='Name of the OCG layer (default: Text)')
    parser.add_argument('--translator', '-tr', default='google',
                       choices=list(TRANSLATORS.keys()),
                       help='Translator to use (default: google)')
    parser.add_argument('--color', '-c', default='darkred',
                       choices=['darkred', 'black', 'blue', 'darkgreen', 'purple'],
                       help='Color of translated text (default: darkred)')
    parser.add_argument('--no-original', action='store_true',
                       help='Do not keep original text in base layer (default: False)')

    args = parser.parse_args(argv)

    # Top-level boundary: report any failure as a one-line message and a
    # non-zero exit status instead of a traceback.
    try:
        translate_pdf(
            args.input_file,
            args.source,
            args.target,
            layer=args.layer,
            translator_name=args.translator,
            text_color=args.color,
            # The CLI flag is negative, the function parameter is positive.
            keep_original=not args.no_original,
        )
    except Exception as e:
        # Errors go to stderr so they do not mix with normal output; use
        # sys.exit because the bare exit() helper is only injected by `site`.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()