import argparse
import json
import os

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

from Utility.storage_config import MODELS_DIR


def approximate_and_inject_language_embeddings(model_path, df, iso_lookup, min_n_langs=5, max_n_langs=25, threshold_percentile=50):
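    """Approximate a language embedding for every target language in `df` by averaging
    the embeddings of its closest supervised languages, then save a copy of the
    checkpoint with the injected embeddings."""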
    model = torch.load(model_path, map_location="cpu")
    lang_embs = model["model"]["encoder.language_embedding.weight"]

    # Each closest language contributes at least two columns:
    # its identifier and one distance value.
    features_per_closest_lang = 2

    # Infer the distance type and the number of closest languages from the dataset's columns.
    if "combined_dist_0" in df.columns:
        # Combined datasets additionally contain one column per constituent distance.
        if "map_dist_0" in df.columns:
            features_per_closest_lang += 1
        if "asp_dist_0" in df.columns:
            features_per_closest_lang += 1
        if "tree_dist_0" in df.columns:
            features_per_closest_lang += 1
        # Integer division absorbs the single extra target_lang column.
        n_closest = len(df.columns) // features_per_closest_lang
        distance_type = "combined"
    else:
        n_closest = len(df.columns) // features_per_closest_lang
        if "map_dist_0" in df.columns:
            distance_type = "map"
        elif "tree_dist_0" in df.columns:
            distance_type = "tree"
        elif "asp_dist_0" in df.columns:
            distance_type = "asp"
        elif "learned_dist_0" in df.columns:
            distance_type = "learned"
        else:
            distance_type = "random"
    closest_lang_columns = [f"closest_lang_{i}" for i in range(n_closest)][:max_n_langs]
    closest_dist_columns = [f"{distance_type}_dist_{i}" for i in range(n_closest)][:max_n_langs]
    assert df[closest_dist_columns[-1]].isna().sum() == 0, "the furthest used distance column must not contain NaNs"
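
    # Each target keeps at least `min_n_langs` closest languages; beyond that, a language
    # is only used if its distance is below the percentile cutoff computed over the
    # furthest used distance column.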
    threshold = np.percentile(df[closest_dist_columns[-1]], threshold_percentile)
    print(f"threshold: {threshold:.4f}")
    for row in tqdm(df.itertuples(), total=df.shape[0], desc="Approximating language embeddings"):
        avg_emb = torch.zeros(lang_embs.shape[1])  # use the checkpoint's embedding dimension instead of hard-coding it
        dists = [getattr(row, d) for i, d in enumerate(closest_dist_columns) if i < min_n_langs or getattr(row, d) < threshold]
        langs = [getattr(row, l) for l in closest_lang_columns[:len(dists)]]

        # Unweighted mean of the selected languages' embeddings; the distances only
        # determine how many languages are included, not their weights.
        for lang in langs:
            avg_emb += lang_embs[iso_lookup[-1][str(lang)]]
        avg_emb /= len(langs)
        lang_embs[iso_lookup[-1][str(row.target_lang)]] = avg_emb
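
    # Write the updated embedding table back into the checkpoint and save it under a new name.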
    model["model"]["encoder.language_embedding.weight"] = lang_embs
    modified_model_path = os.path.splitext(model_path)[0] + "_zeroshot_lang_embs.pt"
    torch.save(model, modified_model_path)
    print(f"Replaced unsupervised language embeddings with zero-shot approximations.\nSaved modified model to {modified_model_path}")


if __name__ == "__main__":
    default_model_path = os.path.join(MODELS_DIR, "ToucanTTS_Meta", "best.pt")
    default_csv_path = "distance_datasets/dataset_learned_top30.csv"
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default=default_model_path,
                        help="path of the model whose language embeddings should be modified")
    parser.add_argument("--dataset_path", type=str, default=default_csv_path,
                        help="path to the distance dataset CSV")
    parser.add_argument("--min_n_langs", type=int, default=5,
                        help="minimum number of languages used for averaging")
    parser.add_argument("--max_n_langs", type=int, default=25,
                        help="maximum number of languages used for averaging")
    parser.add_argument("--threshold_percentile", type=int, default=50,
                        help="percentile of the furthest used distance column used as cutoff "
                             "(languages at or beyond the threshold are excluded from averaging)")
    args = parser.parse_args()
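
    # The lookup file is expected to hold a list of mappings; its last entry maps
    # ISO language codes to embedding indices (see iso_lookup[-1] in the function above).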
    ISO_LOOKUP_PATH = "iso_lookup.json"
    with open(ISO_LOOKUP_PATH, "r") as f:
        iso_lookup = json.load(f)

    # The distance dataset is pipe-separated.
    distance_df = pd.read_csv(args.dataset_path, sep="|")
    approximate_and_inject_language_embeddings(model_path=args.model_path,
                                               df=distance_df,
                                               iso_lookup=iso_lookup,
                                               min_n_langs=args.min_n_langs,
                                               max_n_langs=args.max_n_langs,
                                               threshold_percentile=args.threshold_percentile)
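
# Example invocation (the script filename is illustrative; the argument values match the defaults above):
#   python run_zero_shot_lang_emb_injection.py \
#       --dataset_path distance_datasets/dataset_learned_top30.csv \
#       --min_n_langs 5 --max_n_langs 25 --threshold_percentile 50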