import os

import numpy as np
import torch
import torch.nn.functional as F
import yaml
from datasets import load_dataset
from tqdm import tqdm

from data2vec_feature_reader import Data2vecFeatureReader
from repcodec.RepCodec import RepCodec

cache_dir = "./../../../cache"

# Load all LibriSpeech splits from the Hugging Face Hub.
dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)
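
# Split names depend on the dataset config (the default "all" config yields
# e.g. "train.clean.100", "validation.clean", "test.other"); printing the
# DatasetDict is a quick way to see what was actually loaded.
print(dataset)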

# Build the RepCodec tokenizer for 1024-dim input features (data2vec large, layer 18).
config = "./../repcodec/configs/repcodec_dim1024.yaml"
with open(config) as fp:
    conf = yaml.load(fp, Loader=yaml.FullLoader)

model = RepCodec(**conf)
model.load_state_dict(
    torch.load("./../../models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"]
)
model.quantizer.initial()
model.eval()
model.to("cuda:0")
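
# Sanity check (a minimal sketch, not part of the original pipeline): push a dummy
# feature tensor through the tokenizer to confirm the (batch, dim, time) interface
# before processing real audio. The 1024 matches the dim1024 config loaded above.
with torch.no_grad():
    dummy = torch.randn(1, 1024, 100, device="cuda:0")
    probe = model.projector(model.encoder(dummy))
    _, probe_idx = model.quantizer.codebook.forward_index(probe.transpose(2, 1))
print("probe token tensor shape:", tuple(probe_idx.shape))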

# Feature reader: extracts layer-18 activations from the pretrained data2vec model,
# processing at most `max_chunk` waveform samples per forward pass.
reader = Data2vecFeatureReader("./../../models/vox_pretrained.pt", 18, device="cuda:0", max_chunk=1600000)

# np.savez fails if the target directory is missing, so create it up front.
os.makedirs("./tkns", exist_ok=True)

for split in dataset.keys():
    tokens = []

    for sample_idx in tqdm(range(len(dataset[split]))):
        sample = dataset[split][sample_idx]
        x = sample["audio"]["array"]

        with torch.no_grad():
            x = torch.from_numpy(x).float().to(reader.device)
            # Match the waveform normalization used during data2vec pretraining.
            if reader.task.cfg.normalize:
                x = F.layer_norm(x, x.shape)
            x = x.view(1, -1)

            # Extract data2vec features chunk by chunk to bound GPU memory.
            feat = []
            for start in range(0, x.size(1), reader.max_chunk):
                x_chunk = x[:, start: start + reader.max_chunk]
                res = reader.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    layer=reader.layer,
                )
                feat.append(res["x"])

            # (batch, time, dim) -> (batch, dim, time), as the RepCodec encoder expects.
            features = torch.cat(feat, 1).permute(0, 2, 1)

            # Encode, project, and quantize the features into discrete token ids.
            x = model.encoder(features)
            z = model.projector(x)
            _, codes = model.quantizer.codebook.forward_index(z.transpose(2, 1))
            tkn = codes.detach().cpu().numpy()[0]

        tokens.append(tkn)

    # One .npz per split; each positional array is one utterance's token sequence.
    np.savez(f"./tkns/{split}.npz", *tokens)
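
# Quick read-back check (a minimal sketch): np.savez stores its positional
# arguments under keys "arr_0", "arr_1", ...; `split` still names the last
# split written above.
saved = np.load(f"./tkns/{split}.npz")
print(len(saved.files), "utterances in last split; first sequence head:", saved["arr_0"][:10])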