Check if the settings file with the languages available and able to be loaded

In [2]:
import json
from pathlib import Path

def isSettingsFileAvailable():
 current_dir = Path.cwd()
 file_path =current_dir.parent /'settings.json'
 try:
 if file_path.exists() and file_path.is_file():
 with file_path.open('r') as file:
 settings = json.load(file)
 return settings
 else:
 return "Settings file is not found"
 except Exception as err:
 return "Issue reading the settings file"
 finally:
 if "file" in locals() and not file.closed:
 file.close()

If the settings file is present ---> validate the ISO code passed to API is a valid one 

In [3]:
value = isSettingsFileAvailable()
print(value)

{'language_supported': ['en', 'zh-CN', 'zh-TW', 'ms', 'ja', 'kr']}


Provide a logging mechanism to handle any errors during the translation process

In [4]:
import logging
import json

# Configure logging
logging.basicConfig(level=logging.ERROR,
 format='%(asctime)s %(levelname)s %(message)s',
 handlers=[
 logging.FileHandler("../logs/translation_error.log"),
 logging.StreamHandler()
 ])

logger = logging.getLogger()

def log_error(error_message):
 try:
 log_entry = {
 "error_message": error_message
 }
 logger.error(json.dumps(log_entry))
 except json.JSONDecodeError as json_err:
 logger.error(f"Failed to serialize error message as JSON: {error_message}")
 logger.error(f"JSON serialization error details: {json_err}")
 except Exception as ex:
 logger.error(f"An error occurred while logging: {str(ex)}")

Check if the target language is within the translation list, if yes can proceed with that 

In [7]:
def isTargetLanguageSupported(target_langcode):
 try:
 settings_config = isSettingsFileAvailable()
 language_config = settings_config.get('language_supported','')
 if language_config and target_langcode.lower() in language_config:
 return True
 else:
 log_error(f"Language ---{target_langcode}--- provided is not supported as per settings")
 return False 
 except Exception as ex:
 log_error(str(ex))
 return False

In [8]:
print(isTargetLanguageSupported('zh-CN'))

2024-06-25 12:13:45,428 ERROR {"error_message": "Language ---zh-CN--- provided is not supported as per settings"}


False


After this basic check ups, lets start with the actual translation process

In [6]:
%pip install -q deep_translator 

In [9]:
from deep_translator import GoogleTranslator

In [10]:
def translate_text_usingGoogleTranslator(text, language):
 try:
 isLanguageSupported = isTargetLanguageSupported(language)
 if isLanguageSupported:
 translated_text = GoogleTranslator(source='auto', target=language).translate(text)
 return translated_text
 else:
 return False
 except Exception as ex:
 log_error(str(ex))
 return False

In [12]:
print(translate_text_usingGoogleTranslator('Machine learning.','zh-CN'))

2024-06-25 12:14:23,295 ERROR {"error_message": "Language ---zh-CN--- provided is not supported as per settings"}


False


Calculate the BLEU score - THIs WILL BE CALCULATED BETWEEN TRANSLATED TEXT and a REFERENCE TEXT(GENERATED BY MS translator)

Step 1- Populate the reference text which is from MS translator

In [10]:
#rc1 is the release candidate version from the google translate 

%pip install -q googletrans==4.0.0-rc1

Once the source language is there, use MS mymemory provider to populate the reference text

In [11]:
from translate import Translator

def translate_text_usingMyMemory(text, from_lang, to_lang):
 translator = Translator(provider='mymemory', from_lang= from_lang, to_lang=to_lang)
 return translator.translate(text)

In [12]:
translate_text_usingMyMemory('i am good','en', 'zh') 

'我很好'

Auto-detect the language ---- IF NEEDED

In [8]:
from googletrans import Translator

def detect_language_with_googletrans(text):
 translator = Translator()
 detection = translator.detect(text)
 return detection.lang

# Example usage
text = "naunsa ka dili man ko maayo"
detected_language = detect_language_with_googletrans(text)
print(f"Detected language: {detected_language}")

Detected language: ceb


Perform metrics evaluation on how well the translation is used.. Will use BLEU score for that

In [93]:
#nltk - Natural language toolkit is the library to process for different words
#jieba - used for tokenization in Chinese language ONLY as the concept of tokenization works a bit different there 
%pip install -q nltk jieba

BLEU score calculation for Chinese words

In [13]:
import jieba
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_score_usingjieba(reference_text, candidate_text):
 # Use jieba to tokenize the sentences
 reference_tokens = list(jieba.cut(reference_text))
 candidate_tokens = list(jieba.cut(candidate_text))

 # Wrap the reference tokens in a nested list
 reference = [reference_tokens]
 candidate = candidate_tokens

 # Calculate BLEU score with smoothing
 bleu_score = sentence_bleu(reference, candidate, smoothing_function=SmoothingFunction().method6)
 print(bleu_score)

In [14]:
calculate_bleu_score_usingjieba('我很好','我很好')

Building prefix dict from the default dictionary ...
2024-06-25 09:36:09,429 DEBUG Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\soumya\AppData\Local\Temp\jieba.cache
2024-06-25 09:36:09,558 DEBUG Loading model from cache C:\Users\soumya\AppData\Local\Temp\jieba.cache
Loading model cost 0.820 seconds.
2024-06-25 09:36:10,361 DEBUG Loading model cost 0.820 seconds.
Prefix dict has been built successfully.
2024-06-25 09:36:10,362 DEBUG Prefix dict has been built successfully.


1.0


Calculate BLEU score for other languages such as english, malay etc. 
Tokenizer used here can be word net tokenizer

In [15]:
from nltk.tokenize import word_tokenize

def calculate_bleu_score_usingnltk(reference_text, candidate_text):
 reference_tokens = word_tokenize(reference_text.lower())
 candidate_tokens = word_tokenize(candidate_text.lower())

 print(reference_tokens)
 print(candidate_tokens)

 # Calculate BLEU score with smoothing
 bleu_score = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method2)
 print(bleu_score)

In [16]:
calculate_bleu_score_usingnltk("saya baik",'saya baik')

['saya', 'baik']
['saya', 'baik']
0.7071067811865476


Questions: 

1) I have configured the supported languages in settings file ? 
2) The request will be based on text/per language ?