|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import os |
|
import os.path as osp |
|
import math |
|
import numpy as np |
|
import tqdm |
|
import torch |
|
from shutil import copyfile |
|
|
|
from npy_append_array import NpyAppendArray |
|
|
|
|
|
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the PCA-transform script.

    Arguments defined:
        source        positional; directory with features.
        --split       required; which split to read.
        --save-dir    required; where to save the output.
        --pca-path    required; PCA location prefix ("_A.npy" and "_b.npy"
                      are appended by the caller).
        --batch-size  int, default 2048000; number of feature rows per batch.
        --unfiltered  flag; process the "_unfiltered" version of the split.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="transforms features via a given pca and stored them in target dir"
    )

    parser.add_argument('source', help='directory with features')
    parser.add_argument('--split', help='which split to read', required=True)
    parser.add_argument('--save-dir', help='where to save the output', required=True)
    parser.add_argument('--pca-path', type=str, help='pca location. will append _A.npy and _b.npy', required=True)
    parser.add_argument('--batch-size', type=int, default=2048000, help='batch size')
    parser.add_argument('--unfiltered', action='store_true', help='process the unfiltered version')

    return parser
|
|
|
|
|
def main():
    """Apply a saved PCA projection to one split's features and store the result.

    Reads ``<source>/<split>[_unfiltered].npy``, computes ``x @ A + b`` batch by
    batch using the matrices loaded from ``<pca-path>_A.npy`` and
    ``<pca-path>_b.npy``, and appends each transformed batch to
    ``<save-dir>/<split>.npy``. The split's ``.tsv`` and ``.lengths`` files
    (and ``.phn`` / ``.wrd`` when present) are copied next to the output.

    The computation runs on CUDA when available and falls back to the CPU
    otherwise.
    """
    parser = get_parser()
    args = parser.parse_args()

    # A zero batch size would divide by zero below and a negative one would
    # silently produce no batches, so reject both up front.
    if args.batch_size <= 0:
        parser.error("--batch-size must be a positive integer")

    source_path = osp.join(args.source, args.split)
    # Only the features and the lengths file use the "_unfiltered" name; the
    # manifest and label files are always read from the plain split name.
    data_path = source_path + "_unfiltered" if args.unfiltered else source_path

    print(f"data path: {data_path}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Memory-map the features so that only the slice of the current batch is
    # read when it is indexed in the loop below.
    features = np.load(data_path + ".npy", mmap_mode="r")
    pca_A = torch.from_numpy(np.load(args.pca_path + "_A.npy")).to(device)
    pca_b = torch.from_numpy(np.load(args.pca_path + "_b.npy")).to(device)

    os.makedirs(args.save_dir, exist_ok=True)
    save_path = osp.join(args.save_dir, args.split)

    copyfile(source_path + ".tsv", save_path + ".tsv")
    copyfile(data_path + ".lengths", save_path + ".lengths")

    # Label files are optional; copy whichever ones exist.
    for ext in (".phn", ".wrd"):
        if osp.exists(source_path + ext):
            copyfile(source_path + ext, save_path + ext)

    # Remove any previous output first: the writer below appends to the file.
    if osp.exists(save_path + ".npy"):
        os.remove(save_path + ".npy")

    batches = math.ceil(features.shape[0] / args.batch_size)

    # The context manager closes the output file deterministically, even if a
    # batch fails part-way through.
    with NpyAppendArray(save_path + ".npy") as npaa, torch.no_grad():
        for b in tqdm.trange(batches):
            start = b * args.batch_size
            end = start + args.batch_size
            x = torch.from_numpy(features[start:end]).to(device)
            x = torch.matmul(x, pca_A) + pca_b
            npaa.append(x.cpu().numpy())
|
|
|
|
|
# Run the transformation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|