|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
This script would interpolate two arpa N-gram language models (LMs), |
|
calculate the perplexity of the resulting LM, and build a binary KenLM model from it.
|
|
|
Minimum usage example to interpolate two N-gram language models with weights:
|
alpha * ngram_a + beta * ngram_b = 2 * ngram_a + 1 * ngram_b |
|
|
|
python3 ngram_merge.py --kenlm_bin_path /workspace/nemo/decoders/kenlm/build/bin \ |
|
--arpa_a /path/ngram_a.kenlm.tmp.arpa \ |
|
--alpha 2 \ |
|
--arpa_b /path/ngram_b.kenlm.tmp.arpa \ |
|
--beta 1 \ |
|
--out_path /path/out |
|
|
|
|
|
Merge two N-gram language models and calculate its perplexity with test_file. |
|
python3 ngram_merge.py --kenlm_bin_path /workspace/nemo/decoders/kenlm/build/bin \ |
|
--ngram_bin_path /workspace/nemo/decoders/ngram-1.3.14/src/bin \ |
|
--arpa_a /path/ngram_a.kenlm.tmp.arpa \ |
|
--alpha 0.5 \ |
|
--arpa_b /path/ngram_b.kenlm.tmp.arpa \ |
|
--beta 0.5 \ |
|
--out_path /path/out \ |
|
--nemo_model_file /path/to/model_tokenizer.nemo \ |
|
--test_file /path/to/test_manifest.json \ |
|
--force |
|
""" |
|
|
|
import argparse |
|
import os |
|
import subprocess |
|
import sys |
|
from typing import Tuple |
|
|
|
import kenlm_utils |
|
import torch |
|
|
|
import nemo.collections.asr as nemo_asr |
|
from nemo.collections.asr.modules.rnnt import RNNTDecoder |
|
from nemo.collections.asr.parts.submodules.ctc_beam_decoding import DEFAULT_TOKEN_OFFSET |
|
from nemo.utils import logging |
|
|
|
|
|
class NgramMerge:
    """
    Wrapper around the OpenGrm NGram command-line tools (ngramread, ngrammerge,
    ngramprint, ngramperplexity) used to interpolate two ARPA language models
    and to evaluate the merged model.
    """

    def __init__(self, ngram_bin_path):
        # Path to the bin folder of the OpenGrm Ngram library.
        self.ngram_bin_path = ngram_bin_path

    def ngrammerge(self, arpa_a: str, alpha: float, arpa_b: str, beta: float, arpa_c: str, force: bool) -> str:
        """
        Merge two ARPA n-gram language models using the ngrammerge command-line tool.

        Both inputs must already have been converted to OpenFst binary format
        (see ``arpa2mod``), i.e. ``<arpa>.mod`` files are expected to exist.

        Args:
            arpa_a (str): Path to the first input ARPA file.
            alpha (float): Interpolation weight for the first model.
            arpa_b (str): Path to the second input ARPA file.
            beta (float): Interpolation weight for the second model.
            arpa_c (str): Path to the output ARPA file; the merged binary model
                is written next to it with a ".mod" suffix.
            force (bool): Whether to overwrite an existing output file.

        Returns:
            str: Path to the merged model in binary (.mod) format.
        """
        mod_a = arpa_a + ".mod"
        mod_b = arpa_b + ".mod"
        mod_c = arpa_c + ".mod"
        if os.path.isfile(mod_c) and not force:
            logging.info("File " + mod_c + " exists. Skipping.")
        else:
            sh_args = [
                os.path.join(self.ngram_bin_path, "ngrammerge"),
                "--alpha=" + str(alpha),
                "--beta=" + str(beta),
                "--normalize",
                mod_a,
                mod_b,
                mod_c,
            ]
            # Stream tool output straight to this process's stdout/stderr and
            # log the CompletedProcess summary afterwards.
            logging.info(
                "\n"
                + str(subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,))
                + "\n",
            )
        return mod_c

    def arpa2mod(self, arpa_path: str, force: bool):
        """
        Convert an ARPA n-gram model to OpenFst binary format with ngramread.

        The binary model is saved next to the ARPA model with a ".mod"
        extension. If the binary file already exists and ``force`` is False,
        the conversion is skipped.

        Args:
            arpa_path (str): The file path to the ARPA n-gram model.
            force (bool): If True, convert even if the binary file already exists.

        Returns:
            str: A skip message, if the binary file exists and ``force`` is False.
            subprocess.CompletedProcess: Otherwise, the result of the ngramread
                invocation (its output streams are forwarded to stdout/stderr).
        """
        mod_path = arpa_path + ".mod"
        if os.path.isfile(mod_path) and not force:
            return "File " + mod_path + " exists. Skipping."
        else:
            sh_args = [
                os.path.join(self.ngram_bin_path, "ngramread"),
                "--ARPA",
                arpa_path,
                mod_path,
            ]
            return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)

    def merge(
        self, arpa_a: str, alpha: float, arpa_b: str, beta: float, out_path: str, force: bool
    ) -> Tuple[str, str]:
        """
        Convert both ARPA models to binary format and merge them with ngrammerge.

        Args:
            arpa_a (str): Path to the first ARPA language model file.
            alpha (float): Interpolation weight for the first model.
            arpa_b (str): Path to the second ARPA language model file.
            beta (float): Interpolation weight for the second model.
            out_path (str): Path to the output directory for the merged model.
            force (bool): Whether to force overwrite of existing files.

        Returns:
            Tuple[str, str]: A tuple containing the path to the merged binary
                language model file and the path to the merged ARPA language
                model file (the ARPA file itself is produced later by
                ``make_arpa``).
        """
        logging.info("\n" + str(self.arpa2mod(arpa_a, force)) + "\n")
        logging.info("\n" + str(self.arpa2mod(arpa_b, force)) + "\n")
        # Output name encodes both inputs and their weights, e.g. "a.arpa-0.5-b.arpa-0.5.arpa".
        arpa_c = os.path.join(out_path, f"{os.path.split(arpa_a)[1]}-{alpha}-{os.path.split(arpa_b)[1]}-{beta}.arpa",)
        mod_c = self.ngrammerge(arpa_a, alpha, arpa_b, beta, arpa_c, force)
        return mod_c, arpa_c

    def perplexity(self, ngram_mod: str, test_far: str) -> str:
        """
        Calculates perplexity of a given ngram model on a test FAR file.

        Args:
            ngram_mod (str): The path to the ngram model file (binary .mod format).
            test_far (str): The path to the test FAR file.

        Returns:
            str: A string representation of the perplexity calculated.

        Raises:
            AssertionError: If the subprocess to calculate perplexity returns
                a non-zero exit code.

        Example:
            >>> self.perplexity("/path/to/ngram_model", "/path/to/test_file.far")
            'Perplexity: 123.45'
        """
        sh_args = [
            os.path.join(self.ngram_bin_path, "ngramperplexity"),
            "--v=1",
            ngram_mod,
            test_far,
        ]
        ps = subprocess.Popen(sh_args, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = ps.communicate()
        # communicate() already waits for process termination; read the code
        # directly instead of a redundant wait().
        exit_code = ps.returncode
        command = " ".join(sh_args)
        assert (
            exit_code == 0
        ), f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}"
        # The perplexity summary is printed in the last few lines of the tool's output.
        perplexity_out = "\n".join(stdout.split("\n")[-6:-1])
        return perplexity_out

    def make_arpa(self, ngram_mod: str, ngram_arpa: str, force: bool):
        """
        Converts an ngram model in binary format to ARPA format with ngramprint.

        Args:
            ngram_mod (str): The path to the ngram model in binary format.
            ngram_arpa (str): The desired path for the ARPA format output file.
            force (bool): If True, regenerate the ARPA file even if it already exists.

        Returns:
            None if the output already exists and ``force`` is False, otherwise
            the subprocess.CompletedProcess of the ngramprint invocation.
        """
        if os.path.isfile(ngram_arpa) and not force:
            logging.info("File " + ngram_arpa + " exists. Skipping.")
            return None
        else:
            sh_args = [
                os.path.join(self.ngram_bin_path, "ngramprint"),
                "--ARPA",
                ngram_mod,
                ngram_arpa,
            ]
            return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)

    def test_perplexity(self, mod_c: str, symbols: str, test_txt: str, nemo_model_file: str, tmp_path: str) -> str:
        """
        Tests the perplexity of a given ngram model on a test file.

        The test text is first tokenized and compiled into a FAR file
        (see ``farcompile``), which is then scored with ``perplexity``.

        Args:
            mod_c (str): The path to the ngram model file (binary .mod format).
            symbols (str): The path to the symbol table file.
            test_txt (str): The path to the test text file.
            nemo_model_file (str): The path to the NeMo model file used for tokenization.
            tmp_path (str): The path to the temporary directory where the test FAR file will be created.

        Returns:
            str: A string representation of the perplexity calculated.

        Example:
            >>> self.test_perplexity("/path/to/ngram_model", "/path/to/symbol_table", "/path/to/test_file", "/path/to/model.nemo", "/path/to/tmp_dir")
            'Perplexity: 123.45'
        """
        test_far = farcompile(symbols, test_txt, tmp_path, nemo_model_file)
        res_p = self.perplexity(mod_c, test_far)
        return res_p
|
|
|
|
|
def farcompile(symbols: str, text_file: str, tmp_path: str, nemo_model_file: str) -> str:
    """
    Compiles a text file into a FAR file using the given symbol table and tokenizer.

    The text file is tokenized with the model's tokenizer and streamed into
    ``farcompilestrings`` via its stdin; the resulting FAR is written to
    ``<tmp_path>/<text_file_name>.far``.

    Args:
        symbols (str): The path to the symbol table file.
        text_file (str): The path to the text file to compile.
        tmp_path (str): The path to the temporary directory where the test FAR file will be created.
        nemo_model_file (str): The path to the NeMo model file (.nemo).

    Returns:
        test_far (str): The path to the resulting FAR file.

    Raises:
        AssertionError: If farcompilestrings returns a non-zero exit code.

    Example:
        >>> farcompile("/path/to/symbol_table", "/path/to/text_file", "/path/to/tmp_dir", "/path/to/model.nemo")
    """
    test_far = os.path.join(tmp_path, os.path.split(text_file)[1] + ".far")

    sh_args = [
        "farcompilestrings",
        "--generate_keys=10",
        "--fst_type=compact",
        "--symbols=" + symbols,
        "--keep_symbols",
    ]

    tokenizer, encoding_level, is_aggregate_tokenizer = kenlm_utils.setup_tokenizer(nemo_model_file)

    # Write the FAR directly to the output file instead of a shell ">"
    # redirection: this avoids shell=True, which would break on paths that
    # contain spaces or shell metacharacters (and is a shell-injection hazard).
    with open(test_far, "wb") as far_file:
        ps = subprocess.Popen(sh_args, stdin=subprocess.PIPE, stdout=far_file, stderr=sys.stderr,)

        # Tokenize the corpus and stream it into farcompilestrings' stdin.
        kenlm_utils.iter_files(
            source_path=[text_file],
            dest_path=ps.stdin,
            tokenizer=tokenizer,
            encoding_level=encoding_level,
            is_aggregate_tokenizer=is_aggregate_tokenizer,
            verbose=1,
        )
        # communicate() closes stdin and waits for the process to finish.
        stdout, stderr = ps.communicate()

    exit_code = ps.returncode
    command = " ".join(sh_args)
    assert exit_code == 0, f"Exit_code must be 0.\n bash command: {command} \n stdout: {stdout} \n stderr: {stderr}"
    return test_far
|
|
|
|
|
def make_kenlm(kenlm_bin_path: str, ngram_arpa: str, force: bool):
    """
    Build a binary KenLM language model (trie data structure) from an ARPA file.

    Args:
        kenlm_bin_path (str): The path to the bin folder of the KenLM toolkit.
        ngram_arpa (str): The path to the input ARPA format file; the binary
            model is written next to it with a ".kenlm" suffix.
        force (bool): If True, rebuild the KenLM model even if it already exists.

    Returns:
        None if the output already exists and ``force`` is False, otherwise the
        subprocess.CompletedProcess of the build_binary invocation.

    Raises:
        AssertionError: If the shell command execution returns a non-zero exit code.
        FileNotFoundError: If the KenLM binary or ARPA format file does not exist.
    """
    ngram_kenlm = ngram_arpa + ".kenlm"
    # Skip rebuilding unless explicitly forced.
    if os.path.isfile(ngram_kenlm) and not force:
        logging.info("File " + ngram_kenlm + " exists. Skipping.")
        return None
    sh_args = [os.path.join(kenlm_bin_path, "build_binary"), "trie", "-i", ngram_arpa, ngram_kenlm]
    return subprocess.run(sh_args, capture_output=False, text=True, stdout=sys.stdout, stderr=sys.stderr,)
|
|
|
|
|
def make_symbol_list(nemo_model_file, symbols, force):
    """
    Create a symbol table file for the tokenizer of a NeMo ASR model.

    Each output line has the form "<symbol> <index>", where the symbol is the
    character chr(index + DEFAULT_TOKEN_OFFSET).

    Args:
        nemo_model_file (str): Path to the NeMo model file (or a pretrained model name).
        symbols (str): Path to the file where the symbol list will be saved.
        force (bool): Flag to force creation of the symbol list even if it already exists.

    Returns:
        None
    """
    # Skip regeneration unless explicitly forced.
    if os.path.isfile(symbols) and not force:
        logging.info("File " + symbols + " exists. Skipping.")
        return

    if nemo_model_file.endswith('.nemo'):
        asr_model = nemo_asr.models.ASRModel.restore_from(nemo_model_file, map_location=torch.device('cpu'))
    else:
        logging.warning(
            "nemo_model_file does not end with .nemo, therefore trying to load a pretrained model with this name."
        )
        asr_model = nemo_asr.models.ASRModel.from_pretrained(nemo_model_file, map_location=torch.device('cpu'))

    # RNNT decoders reserve the blank index at the end of the vocabulary;
    # CTC-style decoders expose the vocabulary directly.
    if isinstance(asr_model.decoder, RNNTDecoder):
        vocab_size = asr_model.decoder.blank_idx
    else:
        vocab_size = len(asr_model.decoder.vocabulary)

    with open(symbols, "w", encoding="utf-8") as out_file:
        for token_id in range(vocab_size):
            out_file.write(chr(token_id + DEFAULT_TOKEN_OFFSET) + " " + str(token_id) + "\n")
|
|
|
|
|
def main(
    kenlm_bin_path: str,
    ngram_bin_path: str,
    arpa_a: str,
    alpha: float,
    arpa_b: str,
    beta: float,
    out_path: str,
    test_file: str,
    symbols: str,
    nemo_model_file: str,
    force: bool,
) -> None:
    """
    Entry point: merge two ARPA language models, optionally evaluate perplexity
    on test files, and produce the merged ARPA and KenLM binary models.

    Args:
        kenlm_bin_path (str): The path to the KenLM binary folder.
        ngram_bin_path (str): The path to the OpenGrm Ngram binary folder.
        arpa_a (str): The path to the first ARPA format language model.
        alpha (float): The weight given to the first language model during merging.
        arpa_b (str): The path to the second ARPA format language model.
        beta (float): The weight given to the second language model during merging.
        out_path (str): The path where the output files will be saved.
        test_file (str): Comma-separated file path(s) on which perplexity is calculated, if provided.
        symbols (str): The path to the symbol list file (created from the model if not provided).
        nemo_model_file (str): The path to the NeMo model file.
        force (bool): If True, overwrite existing files, otherwise skip the operations.

    Returns:
        None
    """
    merger = NgramMerge(ngram_bin_path)
    mod_c, arpa_c = merger.merge(arpa_a, alpha, arpa_b, beta, out_path, force)

    # Perplexity evaluation is optional and requires a tokenizer model.
    if test_file and nemo_model_file:
        if not symbols:
            # Derive a symbol table from the model's tokenizer.
            symbols = os.path.join(out_path, os.path.split(nemo_model_file)[1] + ".syms")
            make_symbol_list(nemo_model_file, symbols, force)
        for test_f in test_file.split(","):
            test_p = merger.test_perplexity(mod_c, symbols, test_f, nemo_model_file, out_path)
            logging.info("Perplexity summary " + test_f + " : " + test_p)

    logging.info("Making ARPA and Kenlm model " + arpa_c)
    arpa_result = merger.make_arpa(mod_c, arpa_c, force)
    if arpa_result:
        logging.info("\n" + str(arpa_result) + "\n")

    kenlm_result = make_kenlm(kenlm_bin_path, arpa_c, force)
    if kenlm_result:
        logging.info("\n" + str(kenlm_result) + "\n")
|
|
|
def _parse_args(): |
|
parser = argparse.ArgumentParser( |
|
description="Interpolate ARPA N-gram language models and make KenLM binary model to be used with beam search decoder of ASR models." |
|
) |
|
parser.add_argument( |
|
"--kenlm_bin_path", required=True, type=str, help="The path to the bin folder of KenLM library.", |
|
) |
|
parser.add_argument( |
|
"--ngram_bin_path", required=True, type=str, help="The path to the bin folder of OpenGrm Ngram library.", |
|
) |
|
parser.add_argument("--arpa_a", required=True, type=str, help="Path to the arpa_a") |
|
parser.add_argument("--alpha", required=True, type=float, help="Weight of arpa_a") |
|
parser.add_argument("--arpa_b", required=True, type=str, help="Path to the arpa_b") |
|
parser.add_argument("--beta", required=True, type=float, help="Weight of arpa_b") |
|
parser.add_argument( |
|
"--out_path", required=True, type=str, help="Path to write tmp and resulted files.", |
|
) |
|
parser.add_argument( |
|
"--test_file", |
|
required=False, |
|
type=str, |
|
default=None, |
|
help="Path to test file to count perplexity if provided.", |
|
) |
|
parser.add_argument( |
|
"--symbols", |
|
required=False, |
|
type=str, |
|
default=None, |
|
help="Path to symbols (.syms) file . Could be calculated if it is not provided. Use as: --symbols /path/to/earnest.syms", |
|
) |
|
parser.add_argument( |
|
"--nemo_model_file", |
|
required=False, |
|
type=str, |
|
default=None, |
|
help="The path to '.nemo' file of the ASR model, or name of a pretrained NeMo model", |
|
) |
|
parser.add_argument("--force", "-f", action="store_true", help="Whether to recompile and rewrite all files") |
|
return parser.parse_args() |
|
|
|
|
|
# Script entry point: parse the CLI arguments and forward them as keyword
# arguments to the merge/evaluation pipeline.
if __name__ == "__main__":
    main(**vars(_parse_args()))
|
|