import hashlib
import hmac
import html
import logging
import os
import re
import time
from datetime import timezone, datetime
from json import dumps, loads
import unicodedata
import deepl
import ollama
import openai
import requests
from azure.ai.translation.text import TextTranslationClient
from azure.core.credentials import AzureKeyCredential
def remove_control_characters(s):
    """Return ``s`` without characters in a Unicode "Other" (``C*``) category.

    A character is dropped when ``unicodedata.category`` reports a category
    beginning with ``"C"`` (for example control ``Cc`` and format ``Cf``
    characters). Note that this also removes newlines and tabs.
    """
    kept = []
    for char in s:
        if unicodedata.category(char).startswith("C"):
            continue
        kept.append(char)
    return "".join(kept)
class BaseTranslator:
    """Shared configuration holder for the translator classes.

    Stores the service name, the target and source language codes and the
    model name. Subclasses are expected to override :meth:`translate`.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Record the service name, language codes and model on the instance."""
        self.service, self.lang_out, self.lang_in, self.model = (
            service,
            lang_out,
            lang_in,
            model,
        )

    def translate(self, text) -> str:
        """Translate ``text``; this base stub does nothing and returns ``None``."""

    def __str__(self):
        """Return ``"<service> <lang_out> <lang_in>"``."""
        return "{} {} {}".format(self.service, self.lang_out, self.lang_in)
class GoogleTranslator(BaseTranslator):
    """Translator that extracts the result from the Google Translate mobile page."""

    def __init__(self, service, lang_out, lang_in, model):
        """Set up the HTTP session and endpoint.

        A ``lang_out`` of ``"auto"`` is mapped to ``"zh-CN"`` and a ``lang_in``
        of ``"auto"`` to ``"en"``.
        """
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate ``text`` and return it with control characters removed.

        Only the first 5000 characters of ``text`` are sent. An HTTP 400
        response yields the sentinel string ``"IRREPARABLE TRANSLATION ERROR"``
        instead of an exception.

        Raises:
            ValueError: if no translation could be extracted from the page.
                When the response status is not 200 the message carries the
                HTTP status code so the real failure is visible.
        """
        text = text[:5000]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
        )
        status = response.status_code
        if status == 400:
            result = "IRREPARABLE TRANSLATION ERROR"
        else:
            matches = re.findall(
                r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
            )
            if not matches:
                if status != 200:
                    # Report the transport failure rather than a misleading
                    # "empty result"; same message style as the other
                    # HTTP-based translators in this module.
                    raise ValueError("HTTP error: " + str(status))
                raise ValueError("Empty translation result")
            result = html.unescape(matches[0])
        return remove_control_characters(result)
class TencentTranslator(BaseTranslator):
    """Translator backed by the Tencent Cloud TMT ``TextTranslate`` action.

    Each request is signed with the TC3-HMAC-SHA256 scheme using the
    credentials read from ``TENCENT_SECRET_ID`` and ``TENCENT_SECRET_KEY``.
    """

    _HOST = "tmt.tencentcloudapi.com"
    _ALGORITHM = "TC3-HMAC-SHA256"
    _SIGNED_HEADERS = "content-type;host;x-tc-action"

    def sign(self, key, msg):
        """Return the raw HMAC-SHA256 digest of ``msg`` (str) keyed by ``key`` (bytes)."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        """Read the credentials from the environment and create the session.

        A ``lang_out`` of ``"auto"`` is mapped to ``"zh"`` and a ``lang_in`` of
        ``"auto"`` to ``"en"``.

        Raises:
            ValueError: if ``TENCENT_SECRET_ID`` or ``TENCENT_SECRET_KEY`` is
                unset or empty.
        """
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.secret_id = os.getenv("TENCENT_SECRET_ID")
        self.secret_key = os.getenv("TENCENT_SECRET_KEY")
        # os.getenv returns None for a missing variable instead of raising
        # KeyError, so the credentials have to be validated explicitly.
        for name, value in (
            ("TENCENT_SECRET_ID", self.secret_id),
            ("TENCENT_SECRET_KEY", self.secret_key),
        ):
            if not value:
                raise ValueError(
                    f"The environment variable '{name}' is required but not set."
                )
        self.session = requests.Session()
        self.base_link = self._HOST

    def _authorization(self, payload, timestamp):
        """Build the ``Authorization`` header value for ``payload`` at ``timestamp``.

        ``payload`` is the JSON request body as a string and ``timestamp`` the
        integer Unix time that is also sent in ``X-TC-Timestamp``.
        """
        hashed_payload = hashlib.sha256(payload.encode("utf-8")).hexdigest()
        canonical_headers = (
            "content-type:application/json; charset=utf-8\n"
            f"host:{self._HOST}\n"
            "x-tc-action:texttranslate\n"
        )
        # Method, URI, (empty) query string, headers, signed headers, body hash.
        canonical_request = "\n".join(
            ["POST", "/", "", canonical_headers, self._SIGNED_HEADERS, hashed_payload]
        )
        hashed_canonical_request = hashlib.sha256(
            canonical_request.encode("utf-8")
        ).hexdigest()

        # The credential scope uses the UTC date of the request timestamp.
        date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        string_to_sign = "\n".join(
            [
                self._ALGORITHM,
                str(timestamp),
                credential_scope,
                hashed_canonical_request,
            ]
        )

        # Derive the signing key: secret -> date -> service -> "tc3_request".
        secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signature = hmac.new(
            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
        ).hexdigest()

        return (
            f"{self._ALGORITHM} "
            f"Credential={self.secret_id}/{credential_scope}, "
            f"SignedHeaders={self._SIGNED_HEADERS}, "
            f"Signature={signature}"
        )

    def translate(self, text):
        """Translate ``text`` and return the ``TargetText`` of the response.

        Only the first 5000 characters of ``text`` are sent. When the response
        JSON lacks ``Response.TargetText`` an empty string is returned
        (best effort) rather than raising.

        Raises:
            ValueError: if the HTTP status code is not 200.
        """
        text = text[:5000]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        payload = dumps(data)
        timestamp = int(time.time())
        self.headers = {
            "Authorization": self._authorization(payload, timestamp),
            "Content-Type": "application/json; charset=utf-8",
            "Host": self._HOST,
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21",
        }
        response = self.session.post(
            "https://" + self.base_link,
            # Send exactly the bytes that were hashed for the signature instead
            # of letting the HTTP library re-serialize the dict.
            data=payload.encode("utf-8"),
            headers=self.headers,
        )
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        body = loads(response.text)
        try:
            return body["Response"]["TargetText"]
        except KeyError:
            # Presumably an API-level error payload; deliberately treated as
            # an empty translation instead of an exception.
            return ""
class DeepLXTranslator(BaseTranslator):
    """Translator that talks to a DeepLX-compatible HTTP server."""

    def __init__(self, service, lang_out, lang_in, model):
        """Build the endpoint URL from the environment and create the session.

        ``DEEPLX_SERVER_URL`` selects the server (default
        ``https://api.deeplx.org``); when ``DEEPLX_AUTH_KEY`` is set it is
        inserted as a path segment before ``/translate``. Both variables are
        optional. A ``lang_out`` of ``"auto"`` is mapped to ``"zh"`` and a
        ``lang_in`` of ``"auto"`` to ``"en"``.
        """
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # os.getenv never raises KeyError, so no exception handling is needed.
        auth_key = os.getenv("DEEPLX_AUTH_KEY")
        server_url = os.getenv("DEEPLX_SERVER_URL") or "https://api.deeplx.org"
        self.session = requests.Session()
        server_url = str(server_url).rstrip("/")
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate ``text`` and return the ``data`` field of the JSON response.

        Only the first 5000 characters of ``text`` are sent. The request body
        carries ``target_lang`` and ``text`` only; ``lang_in`` is not sent.

        Raises:
            ValueError: if the HTTP status code is not 200, or if the response
                JSON has no ``data`` key.
        """
        text = text[:5000]  # cap the request at 5000 characters
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        payload = loads(response.text)
        try:
            return payload["data"]
        except KeyError as err:
            raise ValueError("No valid key in DeepLX's response") from err
class DeepLTranslator(BaseTranslator):
    """Translator that delegates to the ``deepl`` client library."""

    def __init__(self, service, lang_out, lang_in, model):
        """Create the ``deepl.Translator`` client from environment settings.

        ``DEEPL_AUTH_KEY`` and ``DEEPL_SERVER_URL`` are read with
        ``os.getenv`` and passed straight to the client. A ``lang_out`` of
        ``"auto"`` is mapped to ``"ZH"`` and a ``lang_in`` of ``"auto"`` to
        ``"EN"``.
        """
        if lang_out == "auto":
            lang_out = "ZH"
        if lang_in == "auto":
            lang_in = "EN"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.client = deepl.Translator(
            os.getenv("DEEPL_AUTH_KEY"),
            server_url=os.getenv("DEEPL_SERVER_URL"),
        )

    def translate(self, text):
        """Translate ``text`` from ``lang_in`` to ``lang_out`` and return the text."""
        translation = self.client.translate_text(
            text,
            target_lang=self.lang_out,
            source_lang=self.lang_in,
        )
        return translation.text
class OllamaTranslator(BaseTranslator):
    """Translator that prompts a chat model through the ``ollama`` client."""

    def __init__(self, service, lang_out, lang_in, model):
        """Create the Ollama client and the fixed sampling options.

        A ``lang_out`` of ``"auto"`` is mapped to ``"zh-CN"`` and a ``lang_in``
        of ``"auto"`` to ``"en"``.
        """
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers, so temperature is 0.
        self.options = {"temperature": 0}
        # NOTE(review): presumably the client is configured through the
        # OLLAMA_HOST environment variable - confirm against the ollama docs.
        self.client = ollama.Client()

    def translate(self, text):
        """Ask the model to translate ``text`` and return the stripped reply."""
        system_message = {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        }
        user_message = {
            "role": "user",
            "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
        }
        reply = self.client.chat(
            model=self.model,
            options=self.options,
            messages=[system_message, user_message],
        )
        content = reply["message"]["content"]
        return content.strip()
class OpenAITranslator(BaseTranslator):
    """Translator that prompts a chat model through the ``openai`` client."""

    def __init__(self, service, lang_out, lang_in, model):
        """Create the OpenAI client and the fixed sampling options.

        A ``lang_out`` of ``"auto"`` is mapped to ``"zh-CN"`` and a ``lang_in``
        of ``"auto"`` to ``"en"``.
        """
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers, so temperature is 0.
        self.options = {"temperature": 0}
        # NOTE(review): presumably the client is configured through the
        # OPENAI_BASE_URL and OPENAI_API_KEY environment variables - confirm.
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        """Ask the model to translate ``text`` and return the stripped reply."""
        system_message = {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        }
        user_message = {
            "role": "user",
            "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
        }
        completion = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[system_message, user_message],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
class AzureTranslator(BaseTranslator):
    """Translator that uses the Azure ``TextTranslationClient``."""

    def __init__(self, service, lang_out, lang_in, model):
        """Create the Azure client from required environment variables.

        Reads ``AZURE_APIKEY``, ``AZURE_ENDPOINT`` and ``AZURE_REGION``. A
        ``lang_out`` of ``"auto"`` is mapped to ``"zh-Hans"`` and a ``lang_in``
        of ``"auto"`` to ``"en"``. Also raises the level of the Azure HTTP
        logging policy logger to WARNING.

        Raises:
            ValueError: if any of the three environment variables is missing.
        """
        if lang_out == "auto":
            lang_out = "zh-Hans"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)

        settings = {}
        for name in ("AZURE_APIKEY", "AZURE_ENDPOINT", "AZURE_REGION"):
            try:
                settings[name] = os.environ[name]
            except KeyError as err:
                missing_var = err.args[0]
                raise ValueError(
                    f"The environment variable '{missing_var}' is required but not set."
                ) from err

        credential = AzureKeyCredential(settings["AZURE_APIKEY"])
        self.client = TextTranslationClient(
            endpoint=settings["AZURE_ENDPOINT"],
            credential=credential,
            region=settings["AZURE_REGION"],
        )

        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        logging.getLogger(
            "azure.core.pipeline.policies.http_logging_policy"
        ).setLevel(logging.WARNING)

    def translate(self, text) -> str:
        """Translate ``text`` and return the first translation of the first item."""
        results = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )
        return results[0].translations[0].text