"""Translation backends sharing the small ``BaseTranslator`` interface."""

import hashlib
import hmac
import html
import logging
import os
import re
import time
import unicodedata
from datetime import timezone, datetime
from json import dumps, loads

import deepl
import ollama
import openai
import requests
from azure.ai.translation.text import TextTranslationClient
from azure.core.credentials import AzureKeyCredential

logger = logging.getLogger(__name__)

# User-Agent header sent by the plain-HTTP backends (Google, DeepLX).
_USER_AGENT = "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501

# Upper bound on the number of characters sent per request by the
# HTTP backends (Google, Tencent, DeepLX).
_MAX_TEXT_LENGTH = 5000

_TENCENT_HOST = "tmt.tencentcloudapi.com"


def remove_control_characters(s):
    """Return *s* without characters whose Unicode category starts with "C".

    That covers the control, format, surrogate, private-use and unassigned
    categories (Cc, Cf, Cs, Co, Cn).
    """
    return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")


def _chat_messages(lang_out, text):
    """Build the system/user chat messages shared by the LLM backends."""
    return [
        {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        },
        {
            "role": "user",
            "content": (
                "Translate the following markdown source text to "
                f"{lang_out}. Keep the formula notation $v*$ unchanged. "
                "Output translation directly without any additional text."
                f"\nSource Text: {text}\nTranslated Text:"
            ),
        },
    ]


class BaseTranslator:
    """Common state and interface for all translation backends."""

    def __init__(self, service, lang_out, lang_in, model):
        """Store the service name, target/source languages and model name."""
        self.service = service
        self.lang_out = lang_out
        self.lang_in = lang_in
        self.model = model

    def translate(self, text) -> str:
        """Translate *text*; subclasses override this stub."""
        ...

    def __str__(self):
        return f"{self.service} {self.lang_out} {self.lang_in}"


class GoogleTranslator(BaseTranslator):
    """Backend that scrapes the mobile Google Translate page."""

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages to zh-CN / en and open an HTTP session."""
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        self.headers = {"User-Agent": _USER_AGENT}

    def translate(self, text):
        """Return the translation of *text* with control characters removed.

        Raises:
            ValueError: if the response contains no translation element
                (and the status code is not 400).
        """
        text = text[:_MAX_TEXT_LENGTH]  # google translate max length
        response = self.session.get(
            self.base_link,
            params={"tl": self.lang_out, "sl": self.lang_in, "q": text},
            headers=self.headers,
        )
        matches = re.findall(
            r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
        )
        if response.status_code == 400:
            # A 400 is reported in-band as a fixed marker string.
            result = "IRREPARABLE TRANSLATION ERROR"
        elif not matches:
            raise ValueError("Empty translation result")
        else:
            result = html.unescape(matches[0])
        return remove_control_characters(result)


class TencentTranslator(BaseTranslator):
    """Backend for the Tencent Cloud TextTranslate API (TC3-HMAC-SHA256)."""

    def sign(self, key, msg):
        """Return the raw HMAC-SHA256 digest of the str *msg* under bytes *key*."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages and load credentials from the environment.

        Raises:
            ValueError: if TENCENT_SECRET_ID or TENCENT_SECRET_KEY is unset
                or empty.
        """
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        self.secret_id = os.getenv("TENCENT_SECRET_ID")
        self.secret_key = os.getenv("TENCENT_SECRET_KEY")
        # os.getenv never raises KeyError, so missing credentials have to be
        # detected explicitly; otherwise the string "None" would be signed.
        for var_name, value in (
            ("TENCENT_SECRET_ID", self.secret_id),
            ("TENCENT_SECRET_KEY", self.secret_key),
        ):
            if not value:
                raise ValueError(
                    f"The environment variable '{var_name}' is required but not set."
                )
        self.session = requests.Session()
        self.base_link = _TENCENT_HOST

    def translate(self, text):
        """Translate *text* through a signed TextTranslate request.

        Returns:
            The "TargetText" field of the response, or "" when the response
            does not contain it.

        Raises:
            ValueError: if the HTTP status code is not 200.
        """
        text = text[:_MAX_TEXT_LENGTH]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        payload = dumps(data)
        hashed_request_payload = hashlib.sha256(payload.encode("utf-8")).hexdigest()

        signed_headers = "content-type;host;x-tc-action"
        canonical_headers = (
            "content-type:application/json; charset=utf-8\n"
            "host:tmt.tencentcloudapi.com\n"
            "x-tc-action:texttranslate\n"
        )
        # Method, URI, (empty) query string, headers, signed header names,
        # payload hash -- one per line.
        canonical_request = "\n".join(
            ["POST", "/", "", canonical_headers, signed_headers, hashed_request_payload]
        )

        timestamp = int(time.time())
        date = datetime.fromtimestamp(timestamp, timezone.utc).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        hashed_canonical_request = hashlib.sha256(
            canonical_request.encode("utf-8")
        ).hexdigest()
        algorithm = "TC3-HMAC-SHA256"
        string_to_sign = "\n".join(
            [algorithm, str(timestamp), credential_scope, hashed_canonical_request]
        )

        # Derive the signing key: secret -> date -> service -> "tc3_request".
        secret_date = self.sign(("TC3" + str(self.secret_key)).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signature = hmac.new(
            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
        ).hexdigest()

        authorization = (
            f"{algorithm} Credential={self.secret_id}/{credential_scope}, "
            f"SignedHeaders={signed_headers}, Signature={signature}"
        )
        self.headers = {
            "Authorization": authorization,
            "Content-Type": "application/json; charset=utf-8",
            "Host": "tmt.tencentcloudapi.com",
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21",
        }

        # Send exactly the bytes that were hashed above, so the signature
        # cannot drift from the body through a second JSON serialization.
        response = self.session.post(
            "https://" + self.base_link,
            data=payload.encode("utf-8"),
            headers=self.headers,
        )

        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        result = loads(response.text)

        # 2. Result test: a response without a translation is deliberately
        # reported as an empty string rather than an exception.
        try:
            return result["Response"]["TargetText"]
        except KeyError:
            logger.warning("Tencent response contains no 'TargetText' field")
            return ""


class DeepLXTranslator(BaseTranslator):
    """Backend for a DeepLX-compatible HTTP endpoint."""

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages and build the endpoint URL.

        DEEPLX_SERVER_URL overrides the default server when set and
        non-empty; DEEPLX_AUTH_KEY, when set, is inserted into the URL path.
        """
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        auth_key = os.getenv("DEEPLX_AUTH_KEY")
        server_url = os.getenv("DEEPLX_SERVER_URL") or "https://api.deeplx.org"
        self.session = requests.Session()
        server_url = str(server_url).rstrip("/")
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {"User-Agent": _USER_AGENT}

    def translate(self, text):
        """Return the "data" field of the endpoint's JSON response.

        The value is returned as-is, including when it is empty.

        Raises:
            ValueError: if the HTTP status code is not 200 or the response
                has no "data" key.
        """
        # Same 5000-character cap as the other HTTP backends in this module.
        text = text[:_MAX_TEXT_LENGTH]
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )

        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        result = loads(response.text)

        # 2. Result test
        try:
            return result["data"]
        except KeyError as e:
            raise ValueError("No valid key in DeepLX's response") from e


class DeepLTranslator(BaseTranslator):
    """Backend using the ``deepl`` client library."""

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages to ZH / EN and create the DeepL client.

        Reads DEEPL_AUTH_KEY and DEEPL_SERVER_URL from the environment and
        passes them to ``deepl.Translator`` unchanged.
        """
        lang_out = "ZH" if lang_out == "auto" else lang_out
        lang_in = "EN" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # NOTE(review): the session is not used inside this class; kept in
        # case external code reads the attribute.
        self.session = requests.Session()
        auth_key = os.getenv("DEEPL_AUTH_KEY")
        server_url = os.getenv("DEEPL_SERVER_URL")
        self.client = deepl.Translator(auth_key, server_url=server_url)

    def translate(self, text):
        """Return the ``text`` attribute of the client's translation result."""
        response = self.client.translate_text(
            text, target_lang=self.lang_out, source_lang=self.lang_in
        )
        return response.text


class OllamaTranslator(BaseTranslator):
    """Backend that prompts a chat model through the ``ollama`` client."""

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages to zh-CN / en and create the client."""
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers.
        self.options = {"temperature": 0}
        # Presumably configured through OLLAMA_HOST -- confirm with the
        # ollama client documentation.
        self.client = ollama.Client()

    def translate(self, text):
        """Return the stripped content of the model's chat reply."""
        response = self.client.chat(
            model=self.model,
            options=self.options,
            messages=_chat_messages(self.lang_out, text),
        )
        return response["message"]["content"].strip()


class OpenAITranslator(BaseTranslator):
    """Backend that prompts a chat model through the ``openai`` client."""

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages to zh-CN / en and create the client."""
        lang_out = "zh-CN" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers.
        self.options = {"temperature": 0}
        # Presumably configured through OPENAI_BASE_URL / OPENAI_API_KEY --
        # confirm with the openai client documentation.
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        """Return the stripped content of the first completion choice."""
        response = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=_chat_messages(self.lang_out, text),
        )
        return response.choices[0].message.content.strip()


class AzureTranslator(BaseTranslator):
    """Backend using the Azure AI Text Translation client."""

    def __init__(self, service, lang_out, lang_in, model):
        """Resolve "auto" languages to zh-Hans / en and create the client.

        Raises:
            ValueError: if AZURE_APIKEY, AZURE_ENDPOINT or AZURE_REGION is
                not set.
        """
        lang_out = "zh-Hans" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)

        try:
            api_key = os.environ["AZURE_APIKEY"]
            endpoint = os.environ["AZURE_ENDPOINT"]
            region = os.environ["AZURE_REGION"]
        except KeyError as e:
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        credential = AzureKeyCredential(api_key)
        self.client = TextTranslationClient(
            endpoint=endpoint, credential=credential, region=region
        )

        # Raise the level of the Azure HTTP logging policy logger, see
        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        azure_logger = logging.getLogger(
            "azure.core.pipeline.policies.http_logging_policy"
        )
        azure_logger.setLevel(logging.WARNING)

    def translate(self, text) -> str:
        """Return the text of the first translation of the first result."""
        response = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )
        return response[0].translations[0].text