davideuler committed on
Commit
4a4fa23
·
1 Parent(s): 91e1678

Default the model to "default_model" for mlx/gguf models when no model is set, and add the translated text as an optional content layer; add a --no-original option that removes the original content from the PDF and saves only the translated content

Browse files
Files changed (3) hide show
  1. deep_translator/chatgpt.py +4 -1
  2. docs/README.rst +2 -2
  3. translator_cli.py +75 -18
deep_translator/chatgpt.py CHANGED
@@ -58,8 +58,10 @@ class ChatGptTranslator(BaseTranslator):
58
  prompt = f"Translate the text below into {self.target}.\n"
59
  prompt += f'Text: "{text}"'
60
 
 
 
61
  response = client.chat.completions.create(
62
- model=self.model,
63
  messages=[
64
  {
65
  "role": "user",
@@ -67,6 +69,7 @@ class ChatGptTranslator(BaseTranslator):
67
  }
68
  ],
69
  )
 
70
 
71
  return response.choices[0].message.content
72
 
 
58
  prompt = f"Translate the text below into {self.target}.\n"
59
  prompt += f'Text: "{text}"'
60
 
61
+ # if model is empty (for mlx_lm.server, the model should be default_model)
62
+ # export OPENAI_MODEL=default_model
63
  response = client.chat.completions.create(
64
+ model=self.model if self.model else "default_model",
65
  messages=[
66
  {
67
  "role": "user",
 
69
  }
70
  ],
71
  )
72
+
73
 
74
  return response.choices[0].message.content
75
 
docs/README.rst CHANGED
@@ -184,7 +184,7 @@ or even directly from terminal:
184
 
185
  or shorter
186
 
187
- $ dt -tg de -txt "hello world"
188
 
189
 
190
  =====
@@ -594,7 +594,7 @@ ChatGpt Translator
594
 
595
  You can provide your API key and API base as arguments, or you can export them as environment variables
596
  e.g.
597
-
598
  `export OPENAI_API_KEY="your_key"`
599
 
600
  `export OPENAI_API_BASE=https://api.openai.com/v1`
 
184
 
185
  or shorter
186
 
187
+ $ dt --translator chatgpt -tg de -txt "hello world"
188
 
189
 
190
  =====
 
594
 
595
  You can provide your API key and API base as arguments, or you can export them as environment variables
596
  e.g.
597
+
598
  `export OPENAI_API_KEY="your_key"`
599
 
600
  `export OPENAI_API_BASE=https://api.openai.com/v1`
translator_cli.py CHANGED
@@ -11,7 +11,8 @@ TRANSLATORS = {
11
  'chatgpt': ChatGptTranslator,
12
  }
13
 
14
- def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Korean", translator_name: str = "google"):
 
15
  """
16
  Translate a PDF file from source language to target language
17
 
@@ -19,11 +20,25 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
19
  input_file: Path to input PDF file
20
  source_lang: Source language code (e.g. 'en', 'fr')
21
  target_lang: Target language code (e.g. 'ko', 'ja')
22
- layer: Name of the OCG layer (default: "Korean")
23
  translator_name: Name of the translator to use (default: "google")
 
 
24
  """
25
- # Define color "white"
26
  WHITE = pymupdf.pdfcolor["white"]
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  # This flag ensures that text will be dehyphenated after extraction.
29
  textflags = pymupdf.TEXT_DEHYPHENATE
@@ -43,9 +58,12 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
43
  # Open the document
44
  doc = pymupdf.open(input_file)
45
 
46
- # Define an Optional Content layer in the document.
47
- # Activate it by default.
48
- ocg_xref = doc.add_ocg(layer, on=True)
 
 
 
49
 
50
  # Iterate over all pages
51
  for page in doc:
@@ -60,12 +78,26 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
60
  # Invoke the actual translation
61
  translated = translator.translate(text)
62
 
63
- # Cover the source text with a white rectangle.
64
- page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg_xref)
65
-
66
- # Write the translated text into the original rectangle
 
 
 
 
 
 
 
 
 
 
 
67
  page.insert_htmlbox(
68
- bbox, translated, css="* {font-family: sans-serif;}", oc=ocg_xref
 
 
 
69
  )
70
 
71
  doc.subset_fonts()
@@ -74,26 +106,51 @@ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: st
74
 
75
  def main():
76
  """
77
- can be invoked like this:
78
- python translator_cli.py --source english --target zh-CN "/Users/david/Downloads/Level_up_coding_by_ai.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  """
80
 
81
  parser = argparse.ArgumentParser(description='Translate PDF documents.')
82
  parser.add_argument('input_file', help='Input PDF file path')
83
  parser.add_argument('--source', '-s', default='en',
84
  help='Source language code (default: en)')
85
- parser.add_argument('--target', '-t', default='ko',
86
- help='Target language code (default: ko)')
87
- parser.add_argument('--layer', '-l', default='Korean',
88
- help='Name of the OCG layer (default: Korean)')
89
  parser.add_argument('--translator', '-tr', default='google',
90
  choices=list(TRANSLATORS.keys()),
91
  help='Translator to use (default: google)')
 
 
 
 
 
92
 
93
  args = parser.parse_args()
94
 
95
  try:
96
- translate_pdf(args.input_file, args.source, args.target, args.layer, args.translator)
 
97
  except Exception as e:
98
  print(f"Error: {str(e)}")
99
  exit(1)
 
11
  'chatgpt': ChatGptTranslator,
12
  }
13
 
14
+ def translate_pdf(input_file: str, source_lang: str, target_lang: str, layer: str = "Text",
15
+ translator_name: str = "google", text_color: str = "darkred", keep_original: bool = True):
16
  """
17
  Translate a PDF file from source language to target language
18
 
 
20
  input_file: Path to input PDF file
21
  source_lang: Source language code (e.g. 'en', 'fr')
22
  target_lang: Target language code (e.g. 'ko', 'ja')
23
+ layer: Name of the OCG layer (default: "Text")
24
  translator_name: Name of the translator to use (default: "google")
25
+ text_color: Color of translated text (default: "darkred")
26
+ keep_original: Whether to keep original text visible (default: True)
27
  """
28
+ # Define colors
29
  WHITE = pymupdf.pdfcolor["white"]
30
+
31
+ # Color mapping
32
+ COLOR_MAP = {
33
+ "darkred": (0.8, 0, 0),
34
+ "black": (0, 0, 0),
35
+ "blue": (0, 0, 0.8),
36
+ "darkgreen": (0, 0.5, 0),
37
+ "purple": (0.5, 0, 0.5),
38
+ }
39
+
40
+ # Get RGB color values, default to darkred if color not found
41
+ rgb_color = COLOR_MAP.get(text_color.lower(), COLOR_MAP["darkred"])
42
 
43
  # This flag ensures that text will be dehyphenated after extraction.
44
  textflags = pymupdf.TEXT_DEHYPHENATE
 
58
  # Open the document
59
  doc = pymupdf.open(input_file)
60
 
61
+ # Define an Optional Content layer for translation
62
+ ocg_trans = doc.add_ocg(layer, on=True)
63
+
64
+ # If not keeping original, create a layer for original text and hide it
65
+ if not keep_original:
66
+ ocg_orig = doc.add_ocg("Original", on=False)
67
 
68
  # Iterate over all pages
69
  for page in doc:
 
78
  # Invoke the actual translation
79
  translated = translator.translate(text)
80
 
81
+ if not keep_original:
82
+ # Move original text to hidden layer
83
+ page.insert_htmlbox(
84
+ bbox,
85
+ text,
86
+ css="* {font-family: sans-serif;}",
87
+ oc=ocg_orig
88
+ )
89
+ # Clear original text area in base layer
90
+ page.draw_rect(bbox, color=None, fill=WHITE)
91
+ else:
92
+ # Cover the original text only in translation layer
93
+ page.draw_rect(bbox, color=None, fill=WHITE, oc=ocg_trans)
94
+
95
+ # Write the translated text in specified color
96
  page.insert_htmlbox(
97
+ bbox,
98
+ translated,
99
+ css=f"* {{font-family: sans-serif; color: rgb({int(rgb_color[0]*255)}, {int(rgb_color[1]*255)}, {int(rgb_color[2]*255)});}}",
100
+ oc=ocg_trans
101
  )
102
 
103
  doc.subset_fonts()
 
106
 
107
  def main():
108
  """
109
+ can be invoked like this:
110
+ ```
111
+ # Basic usage
112
+ python translator_cli.py --source english --target zh-CN input.pdf
113
+
114
+ # With custom color and hiding original text
115
+ python translator_cli.py --source english --target zh-CN --color blue --no-original input.pdf
116
+
117
+ # Using ChatGPT translator
118
+ export OPENAI_API_KEY=sk-proj-xxxx
119
+ export OPENAI_API_BASE=https://api.xxxx.com/v1
120
+ export OPENAI_API_BASE=http://localhost:8080/v1 # for local llm api
121
+ python translator_cli.py --source english --translator chatgpt --target zh-CN input.pdf
122
+
123
+ # do not keep original text as an optional layer:
124
+ python translator_cli.py --source english --translator chatgpt --target zh-CN --no-original input.pdf
125
+
126
+ ```
127
+
128
+ The translated content is an optional content layer in the new PDF file.
129
+ The optional layer can be hidden in Acrobat PDF Reader and Foxit Reader.
130
  """
131
 
132
  parser = argparse.ArgumentParser(description='Translate PDF documents.')
133
  parser.add_argument('input_file', help='Input PDF file path')
134
  parser.add_argument('--source', '-s', default='en',
135
  help='Source language code (default: en)')
136
+ parser.add_argument('--target', '-t', default='zh-CN',
137
+ help='Target language code (default: zh-CN)')
138
+ parser.add_argument('--layer', '-l', default='Text',
139
+ help='Name of the OCG layer (default: Text)')
140
  parser.add_argument('--translator', '-tr', default='google',
141
  choices=list(TRANSLATORS.keys()),
142
  help='Translator to use (default: google)')
143
+ parser.add_argument('--color', '-c', default='darkred',
144
+ choices=['darkred', 'black', 'blue', 'darkgreen', 'purple'],
145
+ help='Color of translated text (default: darkred)')
146
+ parser.add_argument('--no-original', action='store_true',
147
+ help='Do not keep original text in base layer (default: False)')
148
 
149
  args = parser.parse_args()
150
 
151
  try:
152
+ translate_pdf(args.input_file, args.source, args.target, args.layer,
153
+ args.translator, args.color, not args.no_original)
154
  except Exception as e:
155
  print(f"Error: {str(e)}")
156
  exit(1)