""" |
|
This script provides functionality to create confidence-based ensembles
|
from a collection of pretrained models. |
|
|
|
For more details, see the paper https://arxiv.org/abs/2306.15824

or the tutorial in tutorials/asr/Confidence_Ensembles.ipynb.
|
|
|
You would typically use this script by providing a YAML config file or overriding

the default options from the command line.
|
|
|
Usage examples: |
|
|
|
1. Building an ensemble of two monolingual models with default settings (no confidence tuning). |
|
|
|
python build_ensemble.py --config-path=. --config-name=ensemble_config.yaml |
|
ensemble.0.model=stt_it_conformer_ctc_large |
|
ensemble.0.training_manifest=<path to the Italian data of 100+ utterances (no transcription required)> |
|
ensemble.1.model=stt_es_conformer_ctc_large |
|
ensemble.1.training_manifest=<path to the Spanish data of 100+ utterances (no transcription required)> |
|
output_path=<path to the desired location of the .nemo checkpoint> |
|
|
|
You can have more than 2 models and can control transcription settings (e.g., batch size) |
|
with ``transcription.<any argument of examples/asr/transcribe_speech.py>`` parameters. |
|
|
|
2. If you want improved results, you can enable tuning of the confidence and logistic regression (LR) parameters.
|
E.g. |
|
|
|
python build_ensemble.py |
|
<all arguments like in the previous example> |
|
ensemble.0.dev_manifest=<path to the dev data that's required for tuning> |
|
... |
|
# IMPORTANT: see the note below if you use > 2 models! |
|
ensemble.N.dev_manifest=<path to the dev data that's required for tuning> |
|
tune_confidence=True # to allow confidence tuning. LR is tuned by default |
|
|
|
As with any tuning, it is recommended to have a reasonably large validation set for each model;

otherwise you might overfit to the validation data.
|
|
|
Note that if you use more than 2 models, you will need to modify ensemble_config.yaml

or create a new config that lists the additional models. While it's theoretically possible to

fully override such parameters from the command line, Hydra is very unfriendly for such

use cases, so it's strongly recommended to create a new config instead.
|
|
|
3. If you want to precisely control the tuning grid search, you can do that with
|
|
|
python build_ensemble.py |
|
<all arguments as in the previous examples> |
|
tune_confidence_config.confidence_type='[entropy_renyi_exp,entropy_tsallis_exp]' # only tune over this set |
|
tune_confidence_config.alpha='[0.1,0.5,1.0]' # only tune over this set |
|
|
|
You can check the dataclasses in this file for the full list of supported |
|
arguments and their default values. |
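
After the ensemble is built, the resulting .nemo checkpoint can be restored and used like a
regular NeMo ASR model. A minimal sketch (the exact transcribe() signature may differ between
NeMo versions; see the tutorial above for the exact usage):

    from nemo.collections.asr.models.confidence_ensemble import ConfidenceEnsembleModel

    ensemble = ConfidenceEnsembleModel.restore_from("<path to the .nemo checkpoint>")
    transcriptions = ensemble.transcribe(["<path to an audio file>"])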
|
""" |
|
|
|
import atexit |
|
|
|
|
|
import logging |
|
import os |
|
import random |
|
import sys |
|
import tempfile |
|
from copy import deepcopy |
|
from dataclasses import dataclass, field |
|
from pathlib import Path |
|
from typing import Dict, List, Optional, Tuple |
|
|
|
import joblib |
|
import numpy as np |
|
import pytorch_lightning as pl |
|
from omegaconf import MISSING, DictConfig, OmegaConf |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.metrics import confusion_matrix |
|
from sklearn.pipeline import Pipeline, make_pipeline |
|
from sklearn.preprocessing import StandardScaler |
|
from tqdm import tqdm |
|
|
|
from nemo.collections.asr.models.confidence_ensemble import ( |
|
ConfidenceEnsembleModel, |
|
ConfidenceSpec, |
|
compute_confidence, |
|
get_filtered_logprobs, |
|
) |
|
from nemo.collections.asr.parts.utils.asr_confidence_utils import ( |
|
ConfidenceConfig, |
|
ConfidenceMethodConfig, |
|
get_confidence_aggregation_bank, |
|
get_confidence_measure_bank, |
|
) |
|
from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis |
|
from nemo.core.config import hydra_runner |
|
|
|
LOG = logging.getLogger(__file__) |
|
|
|
|
|
try: |
|
sys.path.append(str(Path(__file__).parents[2] / "examples" / "asr")) |
|
import transcribe_speech |
|
except ImportError: |
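    # When this script is run from inside the NeMo repo, the sys.path tweak above makes the
    # import work; otherwise transcribe_speech.py has to be placed next to this script
    # (hence the message below).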
|
print( |
|
"Current script depends on 'examples/asr/transcribe_speech.py', but can't find it. " |
|
"If it's not present, download it from the NeMo github manually and put inside this folder." |
|
) |
|
|
|
|
|
@dataclass |
|
class EnsembleConfig: |
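
    # 'model' is either a pretrained model name (e.g. stt_it_conformer_ctc_large) or a path to a
    # .nemo checkpoint (main() picks pretrained_name vs model_path based on the ".nemo" suffix).
    # 'training_manifest' does not need transcriptions; it is only used to fit the model-selection
    # block and is subsampled to at most 'max_training_samples' utterances.
    # 'dev_manifest' is optional and only needed when tuning confidence/LR parameters.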
|
|
|
model: str = MISSING |
|
|
|
training_manifest: str = MISSING |
|
|
|
|
|
max_training_samples: int = 1000 |
|
|
|
dev_manifest: Optional[str] = None |
|
|
|
|
|
@dataclass |
|
class TuneConfidenceConfig: |
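
    # Each field below defines one axis of the confidence tuning grid; the full grid is the
    # Cartesian product of all axes (see get_grid_size()).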
|
|
|
exclude_blank: Tuple[bool] = (True, False) |
|
|
|
aggregation: Tuple[str] = ("mean", "min", "max") |
|
|
|
|
|
confidence_type: Tuple[str] = ( |
|
"entropy_renyi_exp", |
|
"entropy_renyi_lin", |
|
"entropy_tsallis_exp", |
|
"entropy_tsallis_lin", |
|
"entropy_gibbs_lin", |
|
"entropy_gibbs_exp", |
|
) |
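    # "max_prob" can also be included in confidence_type; it contributes a single grid point
    # since alpha does not apply to it (see get_grid_size() and compute_all_confidences()).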
|
|
alpha: Tuple[float] = (0.25, 0.33, 0.5, 1.0) |
|
|
|
def get_grid_size(self) -> int: |
|
"""Returns the total number of points in the search space.""" |
|
if "max_prob" in self.confidence_type: |
|
return ( |
|
len(self.exclude_blank) |
|
* len(self.aggregation) |
|
* ((len(self.confidence_type) - 1) * len(self.alpha) + 1) |
|
) |
|
return len(self.exclude_blank) * len(self.aggregation) * len(self.confidence_type) * len(self.alpha) |
|
|
|
|
|
@dataclass |
|
class TuneLogisticRegressionConfig: |
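
    # The LR tuning grid: C_num_points values of the inverse regularization strength C,
    # log-spaced between C_min and C_max (C=10000.0 is always appended as well, see
    # train_model_selection()), crossed with the multi_class and class_weight options below.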
|
|
|
|
|
C_num_points: int = 10 |
|
C_min: float = 0.0001 |
|
C_max: float = 10.0 |
|
|
|
|
|
multi_class: Tuple[str] = ("ovr", "multinomial") |
|
|
|
|
|
class_weight: Tuple = (None, "balanced") |
|
|
|
|
|
max_iter: int = 1000 |
|
|
|
|
|
@dataclass |
|
class BuildEnsembleConfig: |
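
    # 'output_path' is where the final .nemo ensemble checkpoint is saved; 'ensemble' holds one
    # EnsembleConfig entry per model; 'random_seed' is applied via pl.seed_everything in main().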
|
|
|
output_path: str = MISSING |
|
|
|
|
|
ensemble: List[EnsembleConfig] = MISSING |
|
|
|
random_seed: int = 0 |
|
|
|
|
|
confidence: ConfidenceConfig = field( |
|
default_factory=lambda: ConfidenceConfig( |
|
|
|
preserve_frame_confidence=True, |
|
exclude_blank=True, |
|
aggregation="mean", |
|
method_cfg=ConfidenceMethodConfig(name="entropy", entropy_type="renyi", alpha=0.25, entropy_norm="lin",), |
|
) |
|
) |
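    # Decoding temperature applied to both CTC and RNNT decoding during transcription;
    # the confidence config above is the default measure used when tune_confidence is False.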
|
temperature: float = 1.0 |
|
|
|
|
|
|
|
|
|
transcription: transcribe_speech.TranscriptionConfig = field( |
|
default_factory=lambda: transcribe_speech.TranscriptionConfig() |
|
) |
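    # Any field of transcribe_speech.TranscriptionConfig can be overridden from the command line
    # as transcription.<field> (e.g., transcription.batch_size).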
|
|
|
|
|
|
|
tune_confidence: bool = False |
|
|
|
|
|
|
|
tune_confidence_config: TuneConfidenceConfig = field(default_factory=lambda: TuneConfidenceConfig()) |
|
|
|
|
|
|
|
tune_logistic_regression: bool = True |
|
tune_logistic_regression_config: TuneLogisticRegressionConfig = field( |
|
default_factory=lambda: TuneLogisticRegressionConfig() |
|
) |
|
|
|
def __post_init__(self): |
|
"""Checking that if any dev data is provided, all are provided. |
|
|
|
Will also auto-set tune_logistic_regression to False if no dev data |
|
is available. |
|
|
|
If tune_confidence is set to True (user choice) and no dev data is |
|
provided, will raise an error. |
|
""" |
|
num_dev_data = 0 |
|
for ensemble_cfg in self.ensemble: |
|
num_dev_data += ensemble_cfg.dev_manifest is not None |
|
if num_dev_data == 0: |
|
if self.tune_confidence: |
|
raise ValueError("tune_confidence is set to True, but no dev data is provided") |
|
LOG.info("Setting tune_logistic_regression = False since no dev data is provided") |
|
self.tune_logistic_regression = False |
|
return |
|
|
|
if num_dev_data < len(self.ensemble): |
|
raise ValueError( |
|
"Some ensemble configs specify dev data, but some don't. Either all have to specify it or none!" |
|
) |
|
|
|
|
|
def calculate_score(features: np.ndarray, labels: np.ndarray, pipe: Pipeline) -> Tuple[float, np.ndarray]: |
|
"""Score is always calculated as mean of the per-class scores. |
|
|
|
This is done to account for possible class imbalances. |
|
|
|
Args: |
|
features: numpy array of features of shape [N x D], where N is the |
|
number of objects (typically a total number of utterances in |
|
all datasets) and D is the total number of confidence scores |
|
used to train the model (typically = number of models). |
|
        labels: numpy array of shape [N] containing ground-truth model indices.
|
pipe: classification pipeline (currently, standardization + logistic |
|
regression). |
|
|
|
Returns: |
|
tuple: score value in [0, 1] and full classification confusion matrix. |
|
""" |
|
predictions = pipe.predict(features) |
|
conf_m = confusion_matrix(labels, predictions) |
|
score = np.diag(conf_m).sum() / conf_m.sum() |
|
return score, conf_m |
|
|
|
|
|
def train_model_selection( |
|
training_features: np.ndarray, |
|
training_labels: np.ndarray, |
|
dev_features: Optional[np.ndarray] = None, |
|
dev_labels: Optional[np.ndarray] = None, |
|
tune_lr: bool = False, |
|
tune_lr_cfg: Optional[TuneLogisticRegressionConfig] = None, |
|
verbose: bool = False, |
|
) -> Tuple[Pipeline, float]: |
|
"""Trains model selection block with an (optional) tuning of the parameters. |
|
|
|
Returns a pipeline consisting of feature standardization and logistic |
|
regression. If tune_lr is set to True, dev features/labels will be used |
|
to tune the hyperparameters of the logistic regression with the grid |
|
search that's defined via ``tune_lr_cfg``. |
|
|
|
If no tuning is requested, uses the following parameters:: |
|
|
|
best_pipe = make_pipeline( |
|
StandardScaler(), |
|
LogisticRegression( |
|
multi_class="multinomial", |
|
C=10000.0, |
|
max_iter=1000, |
|
class_weight="balanced", |
|
), |
|
) |
|
|
|
Args: |
|
training_features: numpy array of features of shape [N x D], where N is |
|
the number of objects (typically a total number of utterances in |
|
all training datasets) and D is the total number of confidence |
|
scores used to train the model (typically = number of models). |
|
        training_labels: numpy array of shape [N] containing ground-truth
|
model indices. |
|
dev_features: same as training, but for the validation subset. |
|
dev_labels: same as training, but for the validation subset. |
|
tune_lr: controls whether tuning of LR hyperparameters is performed. |
|
If set to True, it's required to also provide dev features/labels. |
|
tune_lr_cfg: specifies what values of LR hyperparameters to try. |
|
verbose: if True, will output final training/dev scores. |
|
|
|
Returns: |
|
tuple: trained model selection pipeline, best score (or -1 if no tuning |
|
was done). |
|
""" |
|
if not tune_lr: |
|
|
|
best_pipe = make_pipeline( |
|
StandardScaler(), |
|
LogisticRegression(multi_class="multinomial", C=10000.0, max_iter=1000, class_weight="balanced"), |
|
) |
|
max_score = -1 |
|
else: |
|
C_pms = np.append( |
|
np.exp(np.linspace(np.log(tune_lr_cfg.C_min), np.log(tune_lr_cfg.C_max), tune_lr_cfg.C_num_points)), |
|
10000.0, |
|
) |
|
max_score = 0 |
|
best_pipe = None |
|
for class_weight in tune_lr_cfg.class_weight: |
|
for multi_class in tune_lr_cfg.multi_class: |
|
for C in C_pms: |
|
pipe = make_pipeline( |
|
StandardScaler(), |
|
LogisticRegression( |
|
multi_class=multi_class, C=C, max_iter=tune_lr_cfg.max_iter, class_weight=class_weight |
|
), |
|
) |
|
pipe.fit(training_features, training_labels) |
|
score, confusion = calculate_score(dev_features, dev_labels, pipe) |
|
if score > max_score: |
|
max_score = score |
|
best_pipe = pipe |
|
|
|
best_pipe.fit(training_features, training_labels) |
|
if verbose: |
|
accuracy, confusion = calculate_score(training_features, training_labels, best_pipe) |
|
LOG.info("Training fit accuracy: %.4f", accuracy * 100.0) |
|
LOG.info("Training confusion matrix:\n%s", str(confusion)) |
|
if dev_features is not None and verbose: |
|
accuracy, confusion = calculate_score(dev_features, dev_labels, best_pipe) |
|
LOG.info("Dev fit accuracy: %.4f", accuracy * 100.0) |
|
LOG.info("Dev confusion matrix:\n%s", str(confusion)) |
|
|
|
return best_pipe, max_score |
|
|
|
|
|
def subsample_manifest(manifest_file: str, max_samples: int) -> str: |
|
"""Will save a subsampled version of the manifest to the same folder. |
|
|
|
Have to save to the same folder to support relative paths. |
|
|
|
Args: |
|
manifest_file: path to the manifest file that needs subsampling. |
|
max_samples: how many samples to retain. Will randomly select that |
|
many lines from the manifest. |
|
|
|
Returns: |
|
str: the path to the subsampled manifest file. |
|
""" |
|
with open(manifest_file, "rt", encoding="utf-8") as fin: |
|
lines = fin.readlines() |
|
if max_samples < len(lines): |
|
lines = random.sample(lines, max_samples) |
|
output_file = manifest_file + "-subsampled" |
|
with open(output_file, "wt", encoding="utf-8") as fout: |
|
fout.write("".join(lines)) |
|
return output_file |
|
|
|
|
|
def cleanup_subsampled_manifests(subsampled_manifests: List[str]): |
|
"""Removes all generated subsamples manifests.""" |
|
for manifest in subsampled_manifests: |
|
os.remove(manifest) |
|
|
|
|
|
def compute_all_confidences( |
|
hypothesis: Hypothesis, tune_confidence_cfg: TuneConfidenceConfig |
|
) -> Dict[ConfidenceSpec, float]: |
|
"""Computes a set of confidence scores from a given hypothesis. |
|
|
|
Works with the output of both CTC and Transducer decoding. |
|
|
|
Args: |
|
hypothesis: generated hypothesis as returned from the transcribe |
|
method of the ASR model. |
|
tune_confidence_cfg: config specifying what confidence scores to |
|
compute. |
|
|
|
Returns: |
|
        dict: dictionary with confidence spec -> confidence score mapping.
|
""" |
|
conf_values = {} |
|
|
|
for exclude_blank in tune_confidence_cfg.exclude_blank: |
|
filtered_logprobs = get_filtered_logprobs(hypothesis, exclude_blank) |
|
vocab_size = filtered_logprobs.shape[1] |
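        # Measures from the confidence bank take the vocabulary size (v) and a temperature (t);
        # alpha acts as the temperature for the entropy-based measures, while max_prob ignores it.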
|
for aggregation in tune_confidence_cfg.aggregation: |
|
aggr_func = get_confidence_aggregation_bank()[aggregation] |
|
for conf_type in tune_confidence_cfg.confidence_type: |
|
conf_func = get_confidence_measure_bank()[conf_type] |
|
if conf_type == "max_prob": |
|
conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=1.0)).cpu().item() |
|
conf_values[ConfidenceSpec(exclude_blank, aggregation, conf_type, 1.0)] = conf_value |
|
else: |
|
for alpha in tune_confidence_cfg.alpha: |
|
conf_value = aggr_func(conf_func(filtered_logprobs, v=vocab_size, t=alpha)).cpu().item() |
|
conf_values[ConfidenceSpec(exclude_blank, aggregation, conf_type, alpha)] = conf_value |
|
|
|
return conf_values |
|
|
|
|
|
def find_best_confidence( |
|
train_confidences: List[List[Dict[ConfidenceSpec, float]]], |
|
train_labels: List[int], |
|
dev_confidences: List[List[Dict[ConfidenceSpec, float]]], |
|
dev_labels: List[int], |
|
tune_lr: bool, |
|
    tune_lr_config: TuneLogisticRegressionConfig,
|
) -> Tuple[ConfidenceConfig, Pipeline]: |
|
"""Finds the best confidence configuration for model selection. |
|
|
|
Will loop over all values in the confidence dictionary and fit the LR |
|
    model (optionally tuning its hyperparameters). The best-performing confidence (on the
|
dev set) will be used for the final LR model. |
|
|
|
Args: |
|
train_confidences: this is an object of type |
|
``List[List[Dict[ConfidenceSpec, float]]]``. The shape of this |
|
object is [M, N, S], where |
|
M: number of models |
|
N: number of utterances in all training sets |
|
S: number of confidence scores to try |
|
|
|
This argument will be used to construct np.array objects for each |
|
of the confidence scores with the shape [M, N] |
|
|
|
train_labels: ground-truth labels of the correct model for each data |
|
            point. This is a list of size [N].
|
dev_confidences: same as training, but for the validation subset. |
|
dev_labels: same as training, but for the validation subset. |
|
tune_lr: controls whether tuning of LR hyperparameters is performed. |
|
        tune_lr_config: specifies what values of LR hyperparameters to try.
|
|
|
Returns: |
|
tuple: best confidence config, best model selection pipeline |
|
""" |
|
max_score = 0 |
|
best_pipe = None |
|
best_conf_spec = None |
|
LOG.info("Evaluation all confidences. Total grid size: %d", len(train_confidences[0][0].keys())) |
|
for conf_spec in tqdm(train_confidences[0][0].keys()): |
|
cur_train_confidences = [] |
|
for model_confs in train_confidences: |
|
cur_train_confidences.append([]) |
|
for model_conf in model_confs: |
|
cur_train_confidences[-1].append(model_conf[conf_spec]) |
|
cur_dev_confidences = [] |
|
for model_confs in dev_confidences: |
|
cur_dev_confidences.append([]) |
|
for model_conf in model_confs: |
|
cur_dev_confidences[-1].append(model_conf[conf_spec]) |
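
        # zip(*...) transposes the per-model lists into per-utterance rows, so the feature
        # matrices below have shape [num_utterances, num_models].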
|
|
|
training_features = np.array(list(zip(*cur_train_confidences))) |
|
training_labels = np.array(train_labels) |
|
dev_features = np.array(list(zip(*cur_dev_confidences))) |
|
dev_labels = np.array(dev_labels) |
|
pipe, score = train_model_selection( |
|
training_features, training_labels, dev_features, dev_labels, tune_lr, tune_lr_config, |
|
) |
|
if max_score < score: |
|
max_score = score |
|
best_pipe = pipe |
|
best_conf_spec = conf_spec |
|
LOG.info("Found better parameters: %s. New score: %.4f", str(conf_spec), max_score) |
|
|
|
return best_conf_spec.to_confidence_config(), best_pipe |
|
|
|
|
|
@hydra_runner(config_name="BuildEnsembleConfig", schema=BuildEnsembleConfig) |
|
def main(cfg: BuildEnsembleConfig): |
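
    # Reduce PyTorch Lightning / NeMo logging verbosity, since this script launches many
    # transcription runs that would otherwise flood stdout.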
|
|
|
logging.getLogger('pytorch_lightning').setLevel(logging.CRITICAL) |
|
logging.getLogger('nemo_logger').setLevel(logging.CRITICAL) |
|
LOG.info(f'Build ensemble config:\n{OmegaConf.to_yaml(cfg)}') |
|
|
|
|
|
    cfg = BuildEnsembleConfig(**cfg)  # rebuild the dataclass so that __post_init__ validation runs
|
|
|
pl.seed_everything(cfg.random_seed) |
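    # The global seed is already set above, so the transcription script should not re-seed;
    # transcriptions and frame alignments are needed to compute confidence scores, and the
    # decoding temperature is applied to both CTC and RNNT decoders.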
|
cfg.transcription.random_seed = None |
|
cfg.transcription.return_transcriptions = True |
|
cfg.transcription.preserve_alignment = True |
|
cfg.transcription.ctc_decoding.temperature = cfg.temperature |
|
cfg.transcription.rnnt_decoding.temperature = cfg.temperature |
|
|
|
|
|
train_confidences = [] |
|
dev_confidences = [] |
|
train_labels = [] |
|
dev_labels = [] |
|
|
|
|
|
|
|
|
|
subsampled_manifests = [] |
|
atexit.register(cleanup_subsampled_manifests, subsampled_manifests) |
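
    # Every model transcribes every model's (subsampled) training manifest. The ground-truth
    # label of an utterance is the index of the dataset/model it came from, so labels are
    # collected only once, on the first model's pass (model_idx == 0).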
|
|
for model_idx, model_cfg in enumerate(cfg.ensemble): |
|
train_model_confidences = [] |
|
dev_model_confidences = [] |
|
for data_idx, data_cfg in enumerate(cfg.ensemble): |
|
if model_idx == 0: |
|
subsampled_manifests.append( |
|
subsample_manifest(data_cfg.training_manifest, data_cfg.max_training_samples) |
|
) |
|
subsampled_manifest = subsampled_manifests[data_idx] |
|
|
|
if model_cfg.model.endswith(".nemo"): |
|
cfg.transcription.model_path = model_cfg.model |
|
else: |
|
cfg.transcription.pretrained_name = model_cfg.model |
|
|
|
cfg.transcription.dataset_manifest = subsampled_manifest |
|
|
|
|
|
with tempfile.NamedTemporaryFile() as output_file: |
|
cfg.transcription.output_filename = output_file.name |
|
LOG.info("Transcribing training dataset %d with model %d", data_idx, model_idx) |
|
transcriptions = transcribe_speech.main(deepcopy(cfg.transcription)) |
|
LOG.info("Generating confidence scores") |
|
|
|
for transcription in tqdm(transcriptions): |
|
if cfg.tune_confidence: |
|
train_model_confidences.append( |
|
compute_all_confidences(transcription, cfg.tune_confidence_config) |
|
) |
|
else: |
|
train_model_confidences.append(compute_confidence(transcription, cfg.confidence)) |
|
if model_idx == 0: |
|
train_labels.append(data_idx) |
|
|
|
|
|
if data_cfg.dev_manifest is not None: |
|
cfg.transcription.dataset_manifest = data_cfg.dev_manifest |
|
with tempfile.NamedTemporaryFile() as output_file: |
|
cfg.transcription.output_filename = output_file.name |
|
LOG.info("Transcribing dev dataset %d with model %d", data_idx, model_idx) |
|
transcriptions = transcribe_speech.main(deepcopy(cfg.transcription)) |
|
LOG.info("Generating confidence scores") |
|
for transcription in tqdm(transcriptions): |
|
if cfg.tune_confidence: |
|
dev_model_confidences.append( |
|
compute_all_confidences(transcription, cfg.tune_confidence_config) |
|
) |
|
else: |
|
dev_model_confidences.append(compute_confidence(transcription, cfg.confidence)) |
|
if model_idx == 0: |
|
dev_labels.append(data_idx) |
|
|
|
train_confidences.append(train_model_confidences) |
|
if dev_model_confidences: |
|
dev_confidences.append(dev_model_confidences) |
|
|
|
if cfg.tune_confidence: |
|
best_confidence, model_selection_block = find_best_confidence( |
|
train_confidences, |
|
train_labels, |
|
dev_confidences, |
|
dev_labels, |
|
cfg.tune_logistic_regression, |
|
cfg.tune_logistic_regression_config, |
|
) |
|
else: |
|
best_confidence = cfg.confidence |
|
|
|
training_features = np.array(list(zip(*train_confidences))) |
|
training_labels = np.array(train_labels) |
|
if dev_confidences: |
|
dev_features = np.array(list(zip(*dev_confidences))) |
|
dev_labels = np.array(dev_labels) |
|
else: |
|
dev_features = None |
|
dev_labels = None |
|
model_selection_block, _ = train_model_selection( |
|
training_features, |
|
training_labels, |
|
dev_features, |
|
dev_labels, |
|
cfg.tune_logistic_regression, |
|
cfg.tune_logistic_regression_config, |
|
verbose=True, |
|
) |
|
|
|
with tempfile.TemporaryDirectory() as tmpdir: |
|
model_selection_block_path = os.path.join(tmpdir, 'model_selection_block.pkl') |
|
joblib.dump(model_selection_block, model_selection_block_path) |
|
|
|
|
|
ensemble_model = ConfidenceEnsembleModel( |
|
cfg=DictConfig( |
|
{ |
|
'model_selection_block': model_selection_block_path, |
|
'confidence': best_confidence, |
|
'temperature': cfg.temperature, |
|
'load_models': [model_cfg.model for model_cfg in cfg.ensemble], |
|
} |
|
), |
|
trainer=None, |
|
) |
|
ensemble_model.save_to(cfg.output_path) |
|
|
|
|
|
if __name__ == '__main__': |
|
main() |
|
|