# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import logging
import os
import time

import numpy as np
from sklearn.cluster import MiniBatchKMeans

import joblib
from examples.textless_nlp.gslm.speech2unit.pretrained.utils import (
    get_and_dump_features,
    get_features,
)


def get_logger():
    log_format = "[%(asctime)s] [%(levelname)s]: %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
    logger = logging.getLogger(__name__)
    return logger


def get_parser():
    parser = argparse.ArgumentParser(
        description="Learn K-means clustering over acoustic features."
    )

    # Features arguments
    parser.add_argument(
        "--in_features_path", type=str, default=None, help="Features file path"
    )
    parser.add_argument(
        "--feature_type",
        type=str,
        choices=["logmel", "hubert", "w2v2", "cpc"],
        default=None,
        help="Acoustic feature type",
    )
    parser.add_argument(
        "--manifest_path",
        type=str,
        default=None,
        help="Manifest file containing the root dir and file names",
    )
    parser.add_argument(
        "--out_features_path",
        type=str,
        default=None,
        help="Features file path to write to",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        help="Pretrained acoustic model checkpoint",
    )
    parser.add_argument(
        "--layer",
        type=int,
        help="The layer of the pretrained model to extract features from",
        default=-1,
    )
    parser.add_argument(
        "--sample_pct",
        type=float,
        help="Percent data to use for K-means training",
        default=0.1,
    )

    # K-means arguments
    parser.add_argument(
        "--num_clusters", type=int, help="Number of clusters", default=50
    )
    parser.add_argument("--init", default="k-means++")
    parser.add_argument(
        "--max_iter",
        type=int,
        help="Maximum number of iterations for K-means training",
        default=150,
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        help="Batch size for K-means training",
        default=10000,
    )
    parser.add_argument("--tol", default=0.0, type=float)
    parser.add_argument("--max_no_improvement", default=100, type=int)
    parser.add_argument("--n_init", default=20, type=int)
    parser.add_argument("--reassignment_ratio", default=0.5, type=float)
    parser.add_argument(
        "--out_kmeans_model_path",
        type=str,
        required=True,
        help="Path to save K-means model",
    )

    # Leftovers
    parser.add_argument(
        "--seed",
        type=int,
        help="Random seed to use for K-means training",
        default=1369,
    )

    return parser


def get_kmeans_model(
    n_clusters,
    init,
    max_iter,
    batch_size,
    tol,
    max_no_improvement,
    n_init,
    reassignment_ratio,
    random_state,
):
    return MiniBatchKMeans(
        n_clusters=n_clusters,
        init=init,
        max_iter=max_iter,
        batch_size=batch_size,
        tol=tol,
        max_no_improvement=max_no_improvement,
        n_init=n_init,
        reassignment_ratio=reassignment_ratio,
        random_state=random_state,
        verbose=1,
        compute_labels=True,
        init_size=None,
    )


def train_kmeans(kmeans_model, features_batch):
    start_time = time.time()
    kmeans_model.fit(features_batch)
    # Elapsed wall-clock time in (fractional) minutes.
    time_taken = round((time.time() - start_time) / 60, 2)
    return kmeans_model, time_taken


def main(args, logger):
    # Features loading/extraction for K-means
    if args.in_features_path:
        # Feature loading
        logger.info(f"Loading features from {args.in_features_path}...")
        features_batch = np.load(args.in_features_path, allow_pickle=True)
    else:
        # Feature extraction
        logger.info(f"Extracting {args.feature_type} acoustic features...")
        features_batch = (
            get_features(
                feature_type=args.feature_type,
                checkpoint_path=args.checkpoint_path,
                layer=args.layer,
                manifest_path=args.manifest_path,
                sample_pct=args.sample_pct,
                flatten=True,
            )
            if not args.out_features_path
            else get_and_dump_features(
                feature_type=args.feature_type,
                checkpoint_path=args.checkpoint_path,
                layer=args.layer,
                manifest_path=args.manifest_path,
                sample_pct=args.sample_pct,
                flatten=True,
                out_features_path=args.out_features_path,
            )
        )
        if args.out_features_path:
            logger.info(
                f"Saved extracted features at {args.out_features_path}"
            )
    logger.info(f"Features shape = {features_batch.shape}\n")

    # Learn and save K-means model
    kmeans_model = get_kmeans_model(
        n_clusters=args.num_clusters,
        init=args.init,
        max_iter=args.max_iter,
        batch_size=args.batch_size,
        tol=args.tol,
        max_no_improvement=args.max_no_improvement,
        n_init=args.n_init,
        reassignment_ratio=args.reassignment_ratio,
        random_state=args.seed,
    )
    logger.info("Starting k-means training...")
    kmeans_model, time_taken = train_kmeans(
        kmeans_model=kmeans_model, features_batch=features_batch
    )
    logger.info(f"...done k-means training in {time_taken} minutes")
    # MiniBatchKMeans.score returns the negative inertia, so negate and
    # normalize by the number of frames to report mean inertia per sample.
    inertia = -kmeans_model.score(features_batch) / len(features_batch)
    logger.info(f"Mean inertia per sample: {round(inertia, 2)}\n")

    logger.info(f"Saving k-means model to {args.out_kmeans_model_path}")
    out_dir = os.path.dirname(args.out_kmeans_model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(kmeans_model, args.out_kmeans_model_path)


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    logger = get_logger()
    logger.info(args)
    main(args, logger)
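

# Usage notes (illustrative only; every path and flag value below is a
# placeholder, not a prescribed setting).
#
# A typical invocation extracts features with a pretrained acoustic model and
# fits the quantizer in one step, e.g.:
#
#     python <path/to/this/script> \
#         --feature_type hubert \
#         --checkpoint_path /path/to/acoustic_model.pt \
#         --layer 6 \
#         --manifest_path /path/to/manifest.tsv \
#         --num_clusters 100 \
#         --out_kmeans_model_path /path/to/kmeans.bin
#
# A minimal sketch of applying the saved model downstream, assuming a
# (num_frames, feature_dim) feature matrix for one utterance is stored as a
# NumPy array:
#
#     import joblib
#     import numpy as np
#
#     kmeans_model = joblib.load("/path/to/kmeans.bin")
#     features = np.load("/path/to/features.npy")
#     # MiniBatchKMeans.predict returns one cluster id per frame; this id
#     # sequence is the discrete unit representation of the utterance.
#     units = kmeans_model.predict(features)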