davideuler
committed on
Commit
·
4a4fa23
1
Parent(s):
91e1678
Default the model to default_model for mlx/gguf models, and add the new (translated) text as an optional layer; add a --no-original option that removes the original content from the PDF and saves only the translated content
Browse files- deep_translator/chatgpt.py +4 -1
- docs/README.rst +2 -2
- translator_cli.py +75 -18
deep_translator/chatgpt.py
CHANGED
@@ -58,8 +58,10 @@ class ChatGptTranslator(BaseTranslator):
|
|
58 |
prompt = f"Translate the text below into {self.target}.\n"
|
59 |
prompt += f'Text: "{text}"'
|
60 |
|
|
|
|
|
61 |
response = client.chat.completions.create(
|
62 |
-
model=self.model,
|
63 |
messages=[
|
64 |
{
|
65 |
"role": "user",
|
@@ -67,6 +69,7 @@ class ChatGptTranslator(BaseTranslator):
|
|
67 |
}
|
68 |
],
|
69 |
)
|
|
|
70 |
|
71 |
return response.choices[0].message.content
|
72 |
|
|
|
58 |
prompt = f"Translate the text below into {self.target}.\n"
|
59 |
prompt += f'Text: "{text}"'
|
60 |
|
61 |
+
# if model is empty (for mlx_lm.server, the model should be default_model)
|
62 |
+
# export OPENAI_MODEL=default_model
|
63 |
response = client.chat.completions.create(
|
64 |
+
model=self.model if self.model else "default_model",
|
65 |
messages=[
|
66 |
{
|
67 |
"role": "user",
|
|
|
69 |
}
|
70 |
],
|
71 |
)
|
72 |
+
|
73 |
|
74 |
return response.choices[0].message.content
|
75 |
|
docs/README.rst
CHANGED
@@ -184,7 +184,7 @@ or even directly from terminal:
|
|
184 |
|
185 |
or shorter
|
186 |
|
187 |
-
$ dt -tg de -txt "hello world"
|
188 |
|
189 |
|
190 |
=====
|
@@ -594,7 +594,7 @@ ChatGpt Translator
|
|
594 |
|
595 |
You can provide your api key, api base as an argument or you can export it as an env var
|
596 |
e.g.
|
597 |
-
|
598 |
`export OPENAI_API_KEY="your_key"`
|
599 |
|
600 |
`export OPENAI_API_BASE=https://api.openai.com/v1`
|
|
|
184 |
|
185 |
or shorter
|
186 |
|
187 |
+
$ dt --translator chatgpt -tg de -txt "hello world"
|
188 |
|
189 |
|
190 |
=====
|
|
|
594 |
|
595 |
You can provide your api key, api base as an argument or you can export it as an env var
|
596 |
e.g.
|
597 |
+
|
598 |
`export OPENAI_API_KEY="your_key"`
|
599 |
|
600 |
`export OPENAI_API_BASE=https://api.openai.com/v1`
|
translator_cli.py
CHANGED
@@ -11,7 +11,8 @@ TRANSLATORS = {
|
|
11 |
'chatgpt': ChatGptTranslator,
|
12 |
}
|
13 |
|
14 |
-
def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "
|
|
|
15 |
"""
|
16 |
Translate a PDF file from source language to target language
|
17 |
|
@@ -19,11 +20,25 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
|
|
19 |
input_file: Path to input PDF file
|
20 |
source_lang: Source language code (e.g. 'en', 'fr')
|
21 |
target_lang: Target language code (e.g. 'ko', 'ja')
|
22 |
-
layer: Name of the OCG layer (default: "
|
23 |
translator_name: Name of the translator to use (default: "google")
|
|
|
|
|
24 |
"""
|
25 |
-
# Define
|
26 |
WHITE = pymupdf.pdfcolor["white"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
# This flag ensures that text will be dehyphenated after extraction.
|
29 |
textflags = pymupdf.TEXT_DEHYPHENATE
|
@@ -43,9 +58,12 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
|
|
43 |
# Open the document
|
44 |
doc = pymupdf.open(input_file)
|
45 |
|
46 |
-
# Define an Optional Content layer
|
47 |
-
|
48 |
-
|
|
|
|
|
|
|
49 |
|
50 |
# Iterate over all pages
|
51 |
for page in doc:
|
@@ -60,12 +78,26 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
|
|
60 |
# Invoke the actual translation
|
61 |
translated = translator.translate(text)
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
page.insert_htmlbox(
|
68 |
-
bbox,
|
|
|
|
|
|
|
69 |
)
|
70 |
|
71 |
doc.subset_fonts()
|
@@ -74,26 +106,51 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
|
|
74 |
|
75 |
def main():
|
76 |
"""
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
"""
|
80 |
|
81 |
parser = argparse.ArgumentParser(description='Translate PDF documents.')
|
82 |
parser.add_argument('input_file', help='Input PDF file path')
|
83 |
parser.add_argument('--source', '-s', default='en',
|
84 |
help='Source language code (default: en)')
|
85 |
-
parser.add_argument('--target', '-t', default='
|
86 |
-
help='Target language code (default:
|
87 |
-
parser.add_argument('--layer', '-l', default='
|
88 |
-
help='Name of the OCG layer (default:
|
89 |
parser.add_argument('--translator', '-tr', default='google',
|
90 |
choices=list(TRANSLATORS.keys()),
|
91 |
help='Translator to use (default: google)')
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
args = parser.parse_args()
|
94 |
|
95 |
try:
|
96 |
-
translate_pdf(args.input_file, args.source, args.target, args.layer,
|
|
|
97 |
except Exception as e:
|
98 |
print(f"Error: {str(e)}")
|
99 |
exit(1)
|
|
|
11 |
'chatgpt': ChatGptTranslator,
|
12 |
}
|
13 |
|
14 |
+
def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Text",
|
15 |
+
translator_name: str = "google", text_color: str = "darkred", keep_original: bool = True):
|
16 |
"""
|
17 |
Translate a PDF file from source language to target language
|
18 |
|
|
|
20 |
input_file: Path to input PDF file
|
21 |
source_lang: Source language code (e.g. 'en', 'fr')
|
22 |
target_lang: Target language code (e.g. 'ko', 'ja')
|
23 |
+
layer: Name of the OCG layer (default: "Text")
|
24 |
translator_name: Name of the translator to use (default: "google")
|
25 |
+
text_color: Color of translated text (default: "darkred")
|
26 |
+
keep_original: Whether to keep original text visible (default: True)
|
27 |
"""
|
28 |
+
# Define colors
|
29 |
WHITE = pymupdf.pdfcolor["white"]
|
30 |
+
|
31 |
+
# Color mapping
|
32 |
+
COLOR_MAP = {
|
33 |
+
"darkred": (0.8, 0, 0),
|
34 |
+
"black": (0, 0, 0),
|
35 |
+
"blue": (0, 0, 0.8),
|
36 |
+
"darkgreen": (0, 0.5, 0),
|
37 |
+
"purple": (0.5, 0, 0.5),
|
38 |
+
}
|
39 |
+
|
40 |
+
# Get RGB color values, default to darkred if color not found
|
41 |
+
rgb_color = COLOR_MAP.get(text_color.lower(), COLOR_MAP["darkred"])
|
42 |
|
43 |
# This flag ensures that text will be dehyphenated after extraction.
|
44 |
textflags = pymupdf.TEXT_DEHYPHENATE
|
|
|
58 |
# Open the document
|
59 |
doc = pymupdf.open(input_file)
|
60 |
|
61 |
+
# Define an Optional Content layer for translation
|
62 |
+
ocg_trans = doc.add_ocg(layer, on=True)
|
63 |
+
|
64 |
+
# If not keeping original, create a layer for original text and hide it
|
65 |
+
if not keep_original:
|
66 |
+
ocg_orig = doc.add_ocg("Original", on=False)
|
67 |
|
68 |
# Iterate over all pages
|
69 |
for page in doc:
|
|
|
78 |
# Invoke the actual translation
|
79 |
translated = translator.translate(text)
|
80 |
|
81 |
+
if not keep_original:
|
82 |
+
# Move original text to hidden layer
|
83 |
+
page.insert_htmlbox(
|
84 |
+
bbox,
|
85 |
+
text,
|
86 |
+
css="* {font-family: sans-serif;}",
|
87 |
+
oc=ocg_orig
|
88 |
+
)
|
89 |
+
# Clear original text area in base layer
|
90 |
+
page.draw_rect(bbox, color=None, fill=WHITE)
|
91 |
+
else:
|
92 |
+
# Cover the original text only in translation layer
|
93 |
+
page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg_trans)
|
94 |
+
|
95 |
+
# Write the translated text in specified color
|
96 |
page.insert_htmlbox(
|
97 |
+
bbox,
|
98 |
+
translated,
|
99 |
+
css=f"* {{font-family: sans-serif; color: rgb({int(rgb_color[0]*255)}, {int(rgb_color[1]*255)}, {int(rgb_color[2]*255)});}}",
|
100 |
+
oc=ocg_trans
|
101 |
)
|
102 |
|
103 |
doc.subset_fonts()
|
|
|
106 |
|
107 |
def main():
    """
    Command-line entry point: parse the arguments and translate one PDF file.

    Can be invoked like this:
    ```
    # Basic usage
    python translator_cli.py --source english --target zh-CN input.pdf

    # With custom color and hiding original text
    python translator_cli.py --source english --target zh-CN --color blue --no-original input.pdf

    # Using ChatGPT translator
    export OPENAI_API_KEY=sk-proj-xxxx
    export OPENAI_API_BASE=https://api.xxxx.com/v1
    # ...or, for a local LLM API:
    export OPENAI_API_BASE=http://localhost:8080/v1
    python translator_cli.py --source english --translator chatgpt --target zh-CN input.pdf

    # Do not keep the original text visible:
    python translator_cli.py --source english --translator chatgpt --target zh-CN --no-original input.pdf
    ```

    The translated content is an optional content layer in the new PDF file.
    The optional layer can be hidden in Acrobat PDF Reader and Foxit Reader.

    Any exception raised by ``translate_pdf`` is reported as ``Error: <message>``
    on stderr and the process exits with status 1.
    """
    # Imported locally so this function does not depend on the module's
    # import block. `sys.exit` replaces the bare `exit()` helper, which is
    # injected by the `site` module for interactive use and is not guaranteed
    # to be defined (e.g. under `python -S`).
    import sys

    parser = argparse.ArgumentParser(description='Translate PDF documents.')
    parser.add_argument('input_file', help='Input PDF file path')
    parser.add_argument('--source', '-s', default='en',
                        help='Source language code (default: en)')
    parser.add_argument('--target', '-t', default='zh-CN',
                        help='Target language code (default: zh-CN)')
    parser.add_argument('--layer', '-l', default='Text',
                        help='Name of the OCG layer (default: Text)')
    parser.add_argument('--translator', '-tr', default='google',
                        choices=list(TRANSLATORS.keys()),
                        help='Translator to use (default: google)')
    parser.add_argument('--color', '-c', default='darkred',
                        choices=['darkred', 'black', 'blue', 'darkgreen', 'purple'],
                        help='Color of translated text (default: darkred)')
    parser.add_argument('--no-original', action='store_true',
                        help='Do not keep original text in base layer (default: False)')

    args = parser.parse_args()

    try:
        # `--no-original` is a negative flag; translate_pdf takes the positive
        # `keep_original`, hence the inversion.
        translate_pdf(args.input_file, args.source, args.target, args.layer,
                      args.translator, args.color, not args.no_original)
    except Exception as e:
        # Top-level boundary: report the failure on stderr (so it is not mixed
        # with regular output) and signal it through the exit status.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
|