File size: 18,367 Bytes

2d8da09

# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This script would interpolate two arpa N-gram language models (LMs), 
culculate perplexity of resulted LM, and make binary KenLM from it.

Minimun usage example to interpolate two N-gram language models with weights:
alpha * ngram_a + beta * ngram_b = 2 * ngram_a + 1 * ngram_b

python3 ngram_merge.py  --kenlm_bin_path /workspace/nemo/decoders/kenlm/build/bin \
                    --arpa_a /path/ngram_a.kenlm.tmp.arpa \
                    --alpha 2 \
                    --arpa_b /path/ngram_b.kenlm.tmp.arpa \
                    --beta 1 \
                    --out_path /path/out


Merge two N-gram language models and calculate its perplexity with test_file.
python3 ngram_merge.py  --kenlm_bin_path /workspace/nemo/decoders/kenlm/build/bin \
                    --ngram_bin_path /workspace/nemo/decoders/ngram-1.3.14/src/bin \
                    --arpa_a /path/ngram_a.kenlm.tmp.arpa \
                    --alpha 0.5 \
                    --arpa_b /path/ngram_b.kenlm.tmp.arpa \
                    --beta 0.5 \
                    --out_path /path/out \
                    --nemo_model_file /path/to/model_tokenizer.nemo \
                    --test_file /path/to/test_manifest.json \
                    --force
"""

import argparse
import os
import subprocess
import sys
from typing import Tuple

import kenlm_utils
import torch

import nemo.collections.asr as nemo_asr
from nemo.collections.asr.modules.rnnt import RNNTDecoder
from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET
from nemo.utils import logging


class NgramMerge:
    def __init__(self, ngram_bin_path):
        self.ngram_bin_path = ngram_bin_path

    def ngrammerge(self, arpa_a: str, alpha: float, arpa_b: str, beta: float, arpa_c: str, force: bool) -> str:
        """
        Merge two ARPA n-gram language models using the ngrammerge command-line tool and output the result in ARPA format.
        
        Args:
            arpa_a (str): Path to the first input ARPA file.
            alpha (float): Interpolation weight for the first model.
            arpa_b (str): Path to the second input ARPA file.
            beta (float): Interpolation weight for the second model.
            arpa_c (str): Path to the output ARPA file.
            force (bool): Whether to overwrite existing output files.
        
        Returns:
            str: Path to the output ARPA file in mod format.
        """
        mod_a = arpa_a + ".mod"
        mod_b = arpa_b + ".mod"
        mod_c = arpa_c + ".mod"
        if os.path.isfile(mod_c) and not force:
            logging.info("File " + mod_c + " exists. Skipping.")
        else:
            sh_args = [
                os.path.join(self.ngram_bin_path, "ngrammerge"),
                "--alpha=" + str(alpha),
                "--beta=" + str(beta),
                "--normalize",
                # "--use_smoothing",
                mod_a,
                mod_b,
                mod_c,
            ]
            logging.info(
                "\n"
                + str(subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,))
                + "\n",
            )
        return mod_c

    def arpa2mod(self, arpa_path: str, force: bool):
        """
        This function reads an ARPA n-gram model and converts it to a binary format. The binary model is saved to the same directory as the ARPA model with a ".mod" extension. If the binary model file already exists and force argument is False, then the function skips conversion and returns a message. Otherwise, it executes the command to create a binary model using the subprocess.run method.

        Parameters:
            arpa_path (string): The file path to the ARPA n-gram model.
            force (bool): If True, the function will convert the ARPA model to binary even if the binary file already exists. If False and the binary file exists, the function will skip the conversion.
        Returns:
            If the binary model file already exists and force argument is False, returns a message indicating that the file exists and the conversion is skipped.
            Otherwise, returns a subprocess.CompletedProcess object, which contains information about the executed command. The subprocess's output and error streams are redirected to stdout and stderr, respectively.
        """
        mod_path = arpa_path + ".mod"
        if os.path.isfile(mod_path) and not force:
            return "File " + mod_path + " exists. Skipping."
        else:
            sh_args = [
                os.path.join(self.ngram_bin_path, "ngramread"),
                "--ARPA",
                arpa_path,
                mod_path,
            ]
            return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)

    def merge(
        self, arpa_a: str, alpha: float, arpa_b: str, beta: float, out_path: str, force: bool
    ) -> Tuple[str, str]:
        """
        Merges two ARPA language models using the ngrammerge tool.

        Args:
            arpa_a (str): Path to the first ARPA language model file.
            alpha (float): Interpolation weight for the first model.
            arpa_b (str): Path to the second ARPA language model file.
            beta (float): Interpolation weight for the second model.
            out_path (str): Path to the output directory for the merged ARPA model.
            force (bool): Whether to force overwrite of existing files.

        Returns:
            Tuple[str, str]: A tuple containing the path to the merged binary language model file and the path to the 
            merged ARPA language model file.
        """
        logging.info("\n" + str(self.arpa2mod(arpa_a, force)) + "\n")

        logging.info("\n" + str(self.arpa2mod(arpa_b, force)) + "\n")
        arpa_c = os.path.join(out_path, f"{os.path.split(arpa_a)[1]}-{alpha}-{os.path.split(arpa_b)[1]}-{beta}.arpa",)
        mod_c = self.ngrammerge(arpa_a, alpha, arpa_b, beta, arpa_c, force)
        return mod_c, arpa_c

    def perplexity(self, ngram_mod: str, test_far: str) -> str:
        """
        Calculates perplexity of a given ngram model on a test file.

        Args:
            ngram_mod (str): The path to the ngram model file.
            test_far (str): The path to the test file.

        Returns:
            str: A string representation of the perplexity calculated.

        Raises:
            AssertionError: If the subprocess to calculate perplexity returns a non-zero exit code.

        Example:
            >>> perplexity("/path/to/ngram_model", "/path/to/test_file")
            'Perplexity: 123.45'
        """
        sh_args = [
            os.path.join(self.ngram_bin_path, "ngramperplexity"),
            "--v=1",
            ngram_mod,
            test_far,
        ]
        ps = subprocess.Popen(sh_args, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = ps.communicate()
        exit_code = ps.wait()
        command = " ".join(sh_args)
        assert (
            exit_code == 0
        ), f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}"
        perplexity_out = "\n".join(stdout.split("\n")[-6:-1])
        return perplexity_out

    def make_arpa(self, ngram_mod: str, ngram_arpa: str, force: bool):
        """
        Converts an ngram model in binary format to ARPA format.

        Args:
        - ngram_mod (str): The path to the ngram model in binary format.
        - ngram_arpa (str): The desired path for the ARPA format output file.
        - force (bool): If True, the ARPA format file will be generated even if it already exists.

        Returns:
        - Tuple[bytes, bytes]

        Raises:
        - AssertionError: If the shell command execution returns a non-zero exit code.
        - FileNotFoundError: If the binary ngram model file does not exist.
        """
        if os.path.isfile(ngram_arpa) and not force:
            logging.info("File " + ngram_arpa + " exists. Skipping.")
            return None
        else:
            sh_args = [
                os.path.join(self.ngram_bin_path, "ngramprint"),
                "--ARPA",
                ngram_mod,
                ngram_arpa,
            ]
            return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)

    def test_perplexity(self, mod_c: str, symbols: str, test_txt: str, nemo_model_file: str, tmp_path: str) -> str:
        """
        Tests the perplexity of a given ngram model on a test file.

        Args:
            mod_c (str): The path to the ngram model file.
            symbols (str): The path to the symbol table file.
            test_txt (str): The path to the test text file.
            nemo_model_file (str): The path to the NeMo model file.
            tmp_path (str): The path to the temporary directory where the test far file will be created.
            force (bool): If True, overwrites any existing far file.

        Returns:
            str: A string representation of the perplexity calculated.

        Example:
            >>> test_perplexity("/path/to/ngram_model", "/path/to/symbol_table", "/path/to/test_file", "/path/to/tokenizer_model", "/path/to/tmp_dir", True)
            'Perplexity: 123.45'
        """

        test_far = farcompile(symbols, test_txt, tmp_path, nemo_model_file)
        res_p = self.perplexity(mod_c, test_far)
        return res_p


def farcompile(symbols: str, text_file: str, tmp_path: str, nemo_model_file: str) -> str:
    """
    Compiles a text file into a FAR file using the given symbol table or tokenizer.

    Args:
        symbols (str): The path to the symbol table file.
        text_file (str): The path to the text file to compile.
        tmp_path (str): The path to the temporary directory where the test far file will be created.
        nemo_model_file (str): The path to the NeMo model file (.nemo).
        force (bool): If True, overwrites any existing FAR file.

    Returns:
        test_far (str): The path to the resulting FAR file.

    Example:
        >>> farcompile("/path/to/symbol_table", "/path/to/text_file", "/path/to/far_file", "/path/to/tokenizer_model", "/path/to/nemo_model", True)
    """
    test_far = os.path.join(tmp_path, os.path.split(text_file)[1] + ".far")

    sh_args = [
        "farcompilestrings",
        "--generate_keys=10",
        "--fst_type=compact",
        "--symbols=" + symbols,
        "--keep_symbols",
        ">",
        test_far,
    ]

    tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(nemo_model_file)

    ps = subprocess.Popen(" ".join(sh_args), shell=True, stdin=subprocess.PIPE, stdout=sys.stdout, stderr=sys.stderr,)

    kenlm_utils.iter_files(
        source_path=[text_file],
        dest_path=ps.stdin,
        tokenizer=tokenizer,
        encoding_level=encoding_level,
        is_aggregate_tokenizer=is_aggregate_tokenizer,
        verbose=1,
    )
    stdout, stderr = ps.communicate()

    exit_code = ps.returncode

    command = " ".join(sh_args)
    assert exit_code == 0, f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}"
    return test_far


def make_kenlm(kenlm_bin_path: str, ngram_arpa: str, force: bool):
    """
    Builds a language model from an ARPA format file using the KenLM toolkit.

    Args:
    - kenlm_bin_path (str): The path to the KenLM toolkit binary.
    - ngram_arpa (str): The path to the ARPA format file.
    - force (bool): If True, the KenLM language model will be generated even if it already exists.

    Raises:
    - AssertionError: If the shell command execution returns a non-zero exit code.
    - FileNotFoundError: If the KenLM binary or ARPA format file does not exist.
    """
    ngram_kenlm = ngram_arpa + ".kenlm"
    if os.path.isfile(ngram_kenlm) and not force:
        logging.info("File " + ngram_kenlm + " exists. Skipping.")
        return None
    else:
        sh_args = [os.path.join(kenlm_bin_path, "build_binary"), "trie", "-i", ngram_arpa, ngram_kenlm]
        return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)


def make_symbol_list(nemo_model_file, symbols, force):
    """
    Function: make_symbol_list

    Create a symbol table for the input tokenizer model file.

    Args:
        nemo_model_file (str): Path to the NeMo model file.
        symbols (str): Path to the file where symbol list will be saved.
        force (bool): Flag to force creation of symbol list even if it already exists.
    
    Returns:
        None

    Raises:
        None
    """
    if os.path.isfile(symbols) and not force:
        logging.info("File " + symbols + " exists. Skipping.")
    else:
        if nemo_model_file.endswith('.nemo'):
            asr_model = nemo_asr.models.ASRModel.restore_from(nemo_model_file, map_location=torch.device('cpu'))
        else:
            logging.warning(
                "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
            )
            asr_model = nemo_asr.models.ASRModel.from_pretrained(nemo_model_file, map_location=torch.device('cpu'))

        if isinstance(asr_model.decoder, RNNTDecoder):
            vocab_size = asr_model.decoder.blank_idx
        else:
            vocab_size = len(asr_model.decoder.vocabulary)

        vocab = [chr(idx + DEFAULT_TOKEN_OFFSET) for idx in range(vocab_size)]
        with open(symbols, "w", encoding="utf-8") as f:
            for i, v in enumerate(vocab):
                f.write(v + " " + str(i) + "\n")


def main(
    kenlm_bin_path: str,
    ngram_bin_path: str,
    arpa_a: str,
    alpha: float,
    arpa_b: str,
    beta: float,
    out_path: str,
    test_file: str,
    symbols: str,
    nemo_model_file: str,
    force: bool,
) -> None:
    """
    Entry point function for merging ARPA format language models, testing perplexity, creating symbol list, 
    and making ARPA and Kenlm models.

    Args:
    - kenlm_bin_path (str): The path to the Kenlm binary.
    - arpa_a (str): The path to the first ARPA format language model.
    - alpha (float): The weight given to the first language model during merging.
    - arpa_b (str): The path to the second ARPA format language model.
    - beta (float): The weight given to the second language model during merging.
    - out_path (str): The path where the output files will be saved.
    - test_file (str): The path to the file on which perplexity needs to be calculated.
    - symbols (str): The path to the file where symbol list for the tokenizer model will be saved.
    - nemo_model_file (str): The path to the NeMo model file.
    - force (bool): If True, overwrite existing files, otherwise skip the operations.

    Returns:
    - None
    """
    nm = NgramMerge(ngram_bin_path)
    mod_c, arpa_c = nm.merge(arpa_a, alpha, arpa_b, beta, out_path, force)

    if test_file and nemo_model_file:
        if not symbols:
            symbols = os.path.join(out_path, os.path.split(nemo_model_file)[1] + ".syms")
            make_symbol_list(nemo_model_file, symbols, force)
        for test_f in test_file.split(","):
            test_p = nm.test_perplexity(mod_c, symbols, test_f, nemo_model_file, out_path)
            logging.info("Perplexity summary " + test_f + " : " + test_p)

    logging.info("Making ARPA and Kenlm model " + arpa_c)
    out = nm.make_arpa(mod_c, arpa_c, force)
    if out:
        logging.info("\n" + str(out) + "\n")

    out = make_kenlm(kenlm_bin_path, arpa_c, force)
    if out:
        logging.info("\n" + str(out) + "\n")


def _parse_args():
    parser = argparse.ArgumentParser(
        description="Interpolate ARPA N-gram language models and make KenLM binary model to be used with beam search decoder of ASR models."
    )
    parser.add_argument(
        "--kenlm_bin_path", required=True, type=str, help="The path to the bin folder of KenLM library.",
    )  # Use /workspace/nemo/decoders/kenlm/build/bin if installed it with scripts/asr_language_modeling/ngram_lm/install_beamsearch_decoders.sh
    parser.add_argument(
        "--ngram_bin_path", required=True, type=str, help="The path to the bin folder of OpenGrm Ngram library.",
    )  # Use /workspace/nemo/decoders/ngram-1.3.14/src/bin if installed it with scripts/installers/install_opengrm.sh
    parser.add_argument("--arpa_a", required=True, type=str, help="Path to the arpa_a")
    parser.add_argument("--alpha", required=True, type=float, help="Weight of arpa_a")
    parser.add_argument("--arpa_b", required=True, type=str, help="Path to the arpa_b")
    parser.add_argument("--beta", required=True, type=float, help="Weight of arpa_b")
    parser.add_argument(
        "--out_path", required=True, type=str, help="Path to write tmp and resulted files.",
    )
    parser.add_argument(
        "--test_file",
        required=False,
        type=str,
        default=None,
        help="Path to test file to count perplexity if provided.",
    )
    parser.add_argument(
        "--symbols",
        required=False,
        type=str,
        default=None,
        help="Path to symbols (.syms) file . Could be calculated if it is not provided. Use as: --symbols /path/to/earnest.syms",
    )
    parser.add_argument(
        "--nemo_model_file",
        required=False,
        type=str,
        default=None,
        help="The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model",
    )
    parser.add_argument("--force", "-f", action="store_true", help="Whether to recompile and rewrite all files")
    return parser.parse_args()


if __name__ == "__main__":
    main(**vars(_parse_args()))