{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check that the settings file listing the supported languages is available and can be loaded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "def isSettingsFileAvailable():\n",
    "    \"\"\"Load ../settings.json and return it as a dict, or None when unavailable.\"\"\"\n",
    "    file_path = Path.cwd().parent / 'settings.json'\n",
    "    try:\n",
    "        if file_path.is_file():\n",
    "            # 'with' closes the file automatically -- no manual close/finally needed\n",
    "            with file_path.open('r') as file:\n",
    "                return json.load(file)\n",
    "        print('Settings file is not found')\n",
    "        return None\n",
    "    except (OSError, json.JSONDecodeError) as err:\n",
    "        # Return a consistent type (None) instead of an error string, so callers\n",
    "        # can simply truth-test the result instead of crashing on str.get(...)\n",
    "        print(f'Issue reading the settings file: {err}')\n",
    "        return None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If the settings file is present, validate that the ISO language code passed to the API is a supported one."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'language_supported': ['en', 'zh-CN', 'zh-TW', 'ms', 'ja', 'kr']}\n"
     ]
    }
   ],
   "source": [
    "value = isSettingsFileAvailable()\n",
    "print(value)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Provide a logging mechanism to handle any errors during the translation process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "# Ensure the log directory exists before attaching the file handler,\n",
    "# otherwise FileHandler raises FileNotFoundError on a fresh checkout.\n",
    "Path('../logs').mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# Configure logging\n",
    "logging.basicConfig(level=logging.ERROR,\n",
    "                    format='%(asctime)s %(levelname)s %(message)s',\n",
    "                    handlers=[\n",
    "                        logging.FileHandler('../logs/translation_error.log'),\n",
    "                        logging.StreamHandler()\n",
    "                    ])\n",
    "\n",
    "logger = logging.getLogger()\n",
    "\n",
    "def log_error(error_message):\n",
    "    \"\"\"Log an error message as a JSON payload; fall back to plain text on failure.\"\"\"\n",
    "    try:\n",
    "        logger.error(json.dumps({'error_message': error_message}))\n",
    "    except (TypeError, ValueError) as json_err:\n",
    "        # json.dumps raises TypeError/ValueError for unserializable input;\n",
    "        # json.JSONDecodeError is only raised when *parsing* JSON, so catching\n",
    "        # it here would never fire.\n",
    "        logger.error(f'Failed to serialize error message as JSON: {error_message}')\n",
    "        logger.error(f'JSON serialization error details: {json_err}')\n",
    "    except Exception as ex:\n",
    "        logger.error(f'An error occurred while logging: {str(ex)}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check whether the target language is in the supported-language list; if yes, the translation can proceed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def isTargetLanguageSupported(target_langcode):\n",
    "    \"\"\"Return True when target_langcode appears (case-insensitively) in the\n",
    "    settings' supported-language list; log and return False otherwise.\"\"\"\n",
    "    try:\n",
    "        settings_config = isSettingsFileAvailable()\n",
    "        if not settings_config:\n",
    "            log_error('Settings could not be loaded; cannot validate language')\n",
    "            return False\n",
    "        language_config = settings_config.get('language_supported', [])\n",
    "        # Lowercase BOTH sides of the comparison: the settings list contains\n",
    "        # mixed-case codes such as 'zh-CN', so lowering only the input would\n",
    "        # wrongly reject valid codes ('zh-cn' not in ['zh-CN', ...]).\n",
    "        supported = {code.lower() for code in language_config}\n",
    "        if target_langcode.lower() in supported:\n",
    "            return True\n",
    "        log_error(f'Language ---{target_langcode}--- provided is not supported as per settings')\n",
    "        return False\n",
    "    except Exception as ex:\n",
    "        log_error(str(ex))\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(isTargetLanguageSupported('zh-CN'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After these basic checks, start the actual translation process."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -q deep_translator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from deep_translator import GoogleTranslator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate_text_usingGoogleTranslator(text, language):\n",
    "    \"\"\"Translate `text` into `language` via Google Translate.\n",
    "\n",
    "    Returns the translated string, or False when the language is unsupported\n",
    "    or the translation call fails (failures are logged).\"\"\"\n",
    "    try:\n",
    "        if not isTargetLanguageSupported(language):\n",
    "            return False\n",
    "        return GoogleTranslator(source='auto', target=language).translate(text)\n",
    "    except Exception as ex:\n",
    "        log_error(str(ex))\n",
    "        return False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(translate_text_usingGoogleTranslator('Machine learning.', 'zh-CN'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the BLEU score - THIS WILL BE CALCULATED BETWEEN THE TRANSLATED TEXT and a REFERENCE TEXT (generated by the MyMemory translator)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Step 1 - Populate the reference text from the MyMemory translator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rc1 is the release candidate version from google translate;\n",
    "# the `translate` package provides the MyMemory provider used below.\n",
    "%pip install -q googletrans==4.0.0-rc1 translate"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once the source language is known, use the MyMemory provider to populate the reference text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from translate import Translator\n",
    "\n",
    "def translate_text_usingMyMemory(text, from_lang, to_lang):\n",
    "    \"\"\"Produce a reference translation of `text` using the MyMemory provider.\"\"\"\n",
    "    translator = Translator(provider='mymemory', from_lang=from_lang, to_lang=to_lang)\n",
    "    return translator.translate(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'我很好'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "translate_text_usingMyMemory('i am good', 'en', 'zh')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Auto-detect the language — if needed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Detected language: ceb\n"
     ]
    }
   ],
   "source": [
    "from googletrans import Translator\n",
    "\n",
    "def detect_language_with_googletrans(text):\n",
    "    \"\"\"Return the ISO language code googletrans detects for `text`.\"\"\"\n",
    "    translator = Translator()\n",
    "    detection = translator.detect(text)\n",
    "    return detection.lang\n",
    "\n",
    "# Example usage\n",
    "text = 'naunsa ka dili man ko maayo'\n",
    "detected_language = detect_language_with_googletrans(text)\n",
    "print(f'Detected language: {detected_language}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Perform a metrics evaluation of how good the translation is, using the BLEU score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# nltk - Natural Language Toolkit, used to process and tokenize words\n",
    "# jieba - used for tokenization of Chinese ONLY, as tokenization works differently there\n",
    "%pip install -q nltk jieba"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "BLEU score calculation for Chinese text."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jieba\n",
    "from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction\n",
    "\n",
    "def calculate_bleu_score_usingjieba(reference_text, candidate_text):\n",
    "    \"\"\"Print and return the smoothed BLEU score between two Chinese sentences.\"\"\"\n",
    "    # jieba segments Chinese text, which has no whitespace word boundaries\n",
    "    reference_tokens = list(jieba.cut(reference_text))\n",
    "    candidate_tokens = list(jieba.cut(candidate_text))\n",
    "\n",
    "    # sentence_bleu expects a *list* of reference token lists\n",
    "    bleu_score = sentence_bleu([reference_tokens], candidate_tokens,\n",
    "                               smoothing_function=SmoothingFunction().method6)\n",
    "    print(bleu_score)\n",
    "    return bleu_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0\n"
     ]
    }
   ],
   "source": [
    "calculate_bleu_score_usingjieba('我很好', '我很好')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculate the BLEU score for other languages such as English, Malay etc.\n",
    "The tokenizer used here is NLTK's word tokenizer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "def calculate_bleu_score_usingnltk(reference_text, candidate_text):\n",
    "    \"\"\"Print and return the smoothed BLEU score between two sentences.\n",
    "\n",
    "    NOTE: word_tokenize requires the NLTK 'punkt' data (nltk.download('punkt')).\"\"\"\n",
    "    reference_tokens = word_tokenize(reference_text.lower())\n",
    "    candidate_tokens = word_tokenize(candidate_text.lower())\n",
    "\n",
    "    print(reference_tokens)\n",
    "    print(candidate_tokens)\n",
    "\n",
    "    bleu_score = sentence_bleu([reference_tokens], candidate_tokens,\n",
    "                               smoothing_function=SmoothingFunction().method2)\n",
    "    print(bleu_score)\n",
    "    return bleu_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['saya', 'baik']\n",
      "['saya', 'baik']\n",
      "0.7071067811865476\n"
     ]
    }
   ],
   "source": [
    "calculate_bleu_score_usingnltk('saya baik', 'saya baik')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Questions:\n",
    "\n",
    "1) Should the supported languages be configured in the settings file?\n",
    "2) Will the requests be made per text / per language?"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}