In [1]:
from data2vec_feature_reader import Data2vecFeatureReader

reader = Data2vecFeatureReader("./../../models/vox_pretrained.pt", 18, device="cuda:0", max_chunk=1600000)

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

cache_dir = "./../../../cache"

dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)

Loading dataset shards:   0%|          | 0/45 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/60 [00:00<?, ?it/s]

In [3]:
from repcodec.RepCodec import RepCodec
import torch
import yaml

config = "./../repcodec/configs/repcodec_dim1024.yaml"
with open(config) as fp:
    conf = yaml.load(fp, Loader=yaml.FullLoader)

model = RepCodec(**conf)
model.load_state_dict(torch.load("./../../models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"])
model.quantizer.initial()
model.eval()

RepCodec(
  (encoder): Encoder(
    (conv): Conv1d(
      (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    )
    (conv_blocks): ModuleList(
      (0-1): 2 x EncoderBlock(
        (res_units): ModuleList(
          (0-1): 2 x ResidualUnit(
            (activation): ELU(alpha=1.0)
            (conv1): Conv1d(
              (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
            )
            (conv2): Conv1d1x1(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)
          )
        )
        (conv): Conv1d(
          (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
        )
      )
    )
  )
  (decoder): Decoder(
    (conv1): Conv1d(
      (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
    )
    (conv_blocks): ModuleList(
      (0-1): 2 x DecoderBlock(
        (conv): Conv1d(
          (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), paddi

In [22]:
import torch.nn.functional as F

sample = dataset["train.clean.100"][1]

x = sample["audio"]["array"]

with torch.no_grad():
    x = torch.from_numpy(x).float().to(reader.device)
    if reader.task.cfg.normalize:
        x = F.layer_norm(x, x.shape)
    x = x.view(1, -1)

    feat = []
    for start in range(0, x.size(1), reader.max_chunk):
        x_chunk = x[:, start: start + reader.max_chunk]
        res = reader.model.extract_features(
            source=x_chunk,
            padding_mask=None,
            mask=False,
            layer=reader.layer,
        )
        feat_chunk = res["x"]
        feat.append(feat_chunk)
        
    features = torch.cat(feat, 1).permute(0, 2, 1)

    x = model.encoder(features)
    z = model.projector(x)
    _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
    tokens = idx.cpu().data.numpy().tolist()[0]

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [14]:
features.shape

torch.Size([1, 804, 1024])

In [8]:
feat.shape

torch.Size([726, 1024])

In [None]:
import logging
import os
import sys

import tqdm
from npy_append_array import NpyAppendArray

def get_shard_range(tot, nshard, rank):
    assert rank < nshard and rank >= 0, f"invaid rank/nshard {rank}/{nshard}"
    start = round(tot / nshard * rank)
    end = round(tot / nshard * (rank + 1))
    assert start < end, f"start={start}, end={end}"
    logger.info(
        f"rank {rank} of {nshard}, process {end-start} "
        f"({start}-{end}) out of {tot}"
    )
    return start, end

def get_path_iterator(tsv, nshard, rank):
    with open(tsv, "r") as f:
        root = f.readline().rstrip()
        lines = [line.rstrip() for line in f]
        start, end = get_shard_range(len(lines), nshard, rank)
        lines = lines[start:end]
        def iterate():
            for line in lines:
                subpath, nsample = line.split("\t")
                yield f"{root}/{subpath}", int(nsample)
    return iterate, len(lines)

def dump_feature(reader, generator, num, nshard, rank, feat_dir):
    iterator = generator()

    feat_path = f"{feat_dir}/{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{rank}_{nshard}.len"

    os.makedirs(feat_dir, exist_ok=True)
    if os.path.exists(feat_path):
        os.remove(feat_path)

    feat_f = NpyAppendArray(feat_path)
    with open(leng_path, "w") as leng_f:
        for path, nsample in tqdm.tqdm(iterator, total=num):
            feat = reader.get_feats(path, nsample)
            feat_f.append(feat.cpu().numpy())
            leng_f.write(f"{len(feat)}\n")
    logger.info("finished successfully")

generator, num = get_path_iterator(tsv_path, nshard, rank)
dump_feature(reader, generator, num, nshard, rank, feat_dir)