darshanmakwana's picture
Upload folder using huggingface_hub
2cddd11 verified
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
# Load the LibriSpeech ASR dataset from the Hugging Face Hub, reusing the
# local cache directory when the files are already there. The returned
# mapping is walked split by split further down in this script.
cache_dir = "./../../../cache"
dataset = load_dataset(
    "openslr/librispeech_asr",
    cache_dir=cache_dir,
    trust_remote_code=True,
)
from repcodec.RepCodec import RepCodec
import torch
import yaml
# Build the RepCodec model from its YAML config, restore the pretrained
# weights and put the model on the GPU in inference mode.
config = "./../repcodec/configs/repcodec_dim1024.yaml"
with open(config) as cfg_file:
    # NOTE(review): FullLoader semantics are kept as-is; if the config only
    # holds plain scalars/lists/maps, yaml.safe_load would do — confirm.
    conf = yaml.full_load(cfg_file)
model = RepCodec(**conf)

# The codec weights sit under ["model"]["repcodec"] inside the checkpoint.
repcodec_state = torch.load(
    "./../../models/data2vec_large_l18.pkl",
    map_location="cuda:0",
)["model"]["repcodec"]
model.load_state_dict(repcodec_state)
del repcodec_state  # do not keep an extra reference to the checkpoint tensors

model.quantizer.initial()
model.eval()
model.to("cuda:0")
from data2vec_feature_reader import Data2vecFeatureReader
# Feature extractor wrapping the pretrained data2vec checkpoint.
# NOTE(review): the second positional argument (18) is presumably the layer
# whose activations are read (the loop below passes reader.layer to
# extract_features) — confirm against Data2vecFeatureReader's signature.
reader = Data2vecFeatureReader(
    "./../../models/vox_pretrained.pt",
    18,
    device="cuda:0",
    max_chunk=1_600_000,
)
import torch.nn.functional as F
import numpy as np
import os

# Tokens are written to ./tkns/<split>.npz. Create the directory up front so
# a missing folder is detected immediately instead of by np.savez after a
# whole split has already been encoded.
os.makedirs("./tkns", exist_ok=True)


def _extract_tokens(waveform):
    """Return the RepCodec codebook indices for a single utterance.

    Args:
        waveform: NumPy array of audio samples, as found in
            ``sample["audio"]["array"]``.

    Returns:
        NumPy array of indices produced by the quantizer codebook, with the
        leading axis (size 1 here, since one utterance is processed at a
        time) removed.
    """
    with torch.no_grad():
        signal = torch.from_numpy(waveform).float().to(reader.device)
        if reader.task.cfg.normalize:
            # Normalise over the full utterance when the reader's task
            # config asks for it.
            signal = F.layer_norm(signal, signal.shape)
        signal = signal.view(1, -1)  # -> (1, num_samples)

        # Run the feature model on slices of at most reader.max_chunk
        # samples and stitch the per-slice outputs back together on axis 1.
        chunk_feats = []
        for start in range(0, signal.size(1), reader.max_chunk):
            out = reader.model.extract_features(
                source=signal[:, start:start + reader.max_chunk],
                padding_mask=None,
                mask=False,
                layer=reader.layer,
            )
            chunk_feats.append(out["x"])
        # Swap the last two axes before feeding the codec encoder
        # (presumably (1, frames, dim) -> (1, dim, frames) — TODO confirm).
        features = torch.cat(chunk_feats, 1).permute(0, 2, 1)

        latent = model.projector(model.encoder(features))
        _, codes = model.quantizer.codebook.forward_index(latent.transpose(2, 1))
        return codes.detach().cpu().numpy()[0]


for split, split_data in dataset.items():
    # One token array per utterance, in dataset order.
    tokens = [
        _extract_tokens(sample["audio"]["array"])
        for sample in tqdm(split_data)
    ]
    # Positional arrays are stored under the keys arr_0, arr_1, ... so the
    # i-th key corresponds to the i-th row of the split.
    np.savez(f"./tkns/{split}.npz", *tokens)