In [None]:
from repcodec.RepCodec import RepCodec
import torch
import yaml

# Build RepCodec from its YAML config and restore the pretrained weights.
config = "repcodec/configs/repcodec_dim1024.yaml"
with open(config) as fp:
    conf = yaml.load(fp, Loader=yaml.FullLoader)

model = RepCodec(**conf)
# Map the checkpoint tensors to the CPU. The cells in this notebook feed the
# model CPU tensors and never move it to a GPU, so mapping to "cuda:0" only
# made this cell fail on machines without CUDA.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load("./../models/data2vec_large_l18.pkl", map_location="cpu")["model"]["repcodec"])
model.quantizer.initial()
model.eval()

In [None]:
# Dummy input laid out as (batch size, hidden dim, sequence length).
random_features = torch.randn(1, 1024, 100)
with torch.no_grad():
    x = model.encoder(random_features)
    z = model.projector(x)
    # Swap the last two axes before the codebook lookup
    # (presumably forward_index expects the hidden dim last -- TODO confirm).
    _, idx = model.quantizer.codebook.forward_index(z.transpose(1, 2))
    # Keep the indices of the first (and only) batch element as a Python list.
    tokens = idx.cpu().tolist()[0]

## Dump Representations

In [None]:
python3 examples/dump_feature.py --model_type data2vec --tsv_path "./files/train.clean.100.tsv" --ckpt_path "./../models/vox_pretrained.pt" --layer 18 --feat_dir "./features/train.clean.100"

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

# Directory passed to `load_dataset` as its cache location.
cache_dir = "./../../cache"

# NOTE(review): trust_remote_code=True allows `datasets` to execute the loading
# script that ships with the dataset repository -- confirm the source is trusted.
dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)

# Disabled manifest-generation loop, kept for reference. When enabled it writes
# one ./files/{split}.tsv per split with a "file_path" column and a
# "num_frames" column (sample count rescaled to a 16000 Hz rate).
# NOTE(review): presumably these TSVs are what dump_feature.py receives via
# --tsv_path -- confirm before deleting this block.
# for split in dataset.keys():
#     data = dataset[split]
#     num_frames = []
#     for idx in tqdm(range(len(data))):
#         audio = data[idx]["audio"]
#         num_frames.append(int(len(audio["array"]) * 16000 // audio["sampling_rate"]))
        
#     df = pd.DataFrame.from_dict({
#         "file_path": list(data["file"]),
#         "num_frames": num_frames
#     })
#     df.to_csv(f"./files/{split}.tsv", sep="\t", index=False)

In [None]:
# Display the first example of the "train.clean.100" split.
dataset["train.clean.100"][0]

## Prepare the Dataset

In [None]:
from datasets import Dataset, load_dataset
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import string

# Directory passed to `load_dataset` as its cache location.
cache_dir = "./../../cache"

# Load the dataset fresh in this cell: `dataset` is rebound further down after
# columns are removed, so starting from a new load keeps the cell re-runnable.
# NOTE(review): trust_remote_code=True allows `datasets` to execute the loading
# script that ships with the dataset repository -- confirm the source is trusted.
dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)

def process(text):
    """Normalize a transcript string.

    The text is lower-cased, every character in ``string.punctuation`` except
    the apostrophe is removed, and leading/trailing space characters are
    trimmed.

    Args:
        text: Raw transcript.

    Returns:
        The normalized transcript. It is the empty string when nothing but
        spaces and punctuation was passed in.
    """
    # Lower case every letter
    text = text.lower()

    # Remove punctuation (apostrophes are kept)
    punctuation_to_remove = string.punctuation.replace("'", "")
    translation_table = str.maketrans('', '', punctuation_to_remove)
    text = text.translate(translation_table)

    # Trim spaces from both ends. The previous hand-written loop indexed
    # text[0] / text[-1] and raised IndexError once the string was (or became)
    # empty; strip(' ') trims the same character and is safe on "".
    return text.strip(' ')

# Drop the listed columns before reading the transcripts.
dataset = dataset.remove_columns(["audio", "speaker_id", "chapter_id"])

# Maps split name -> Dataset with "text" and "audio_tokens" columns.
tokenized_ds = defaultdict(lambda: [])

for split in dataset.keys():

    texts = []
    tokens = []
    # Read the text column once per split instead of fetching a whole row with
    # dataset[split][idx] on every iteration.
    split_texts = dataset[split]["text"]

    # np.load on an .npz keeps the archive open; the context manager closes it
    # once all arrays of this split have been copied out.
    with np.load(f"./examples/tkns/{split}.npz") as tkns:
        # NOTE(review): pairs the idx-th array in the archive with the idx-th
        # dataset row -- assumes both are stored in the same order; confirm
        # against the script that wrote the .npz files.
        for idx, key in enumerate(tqdm(tkns.files)):
            tokens.append(list(tkns[key]))
            texts.append(process(split_texts[idx]))

    tokenized_ds[split] = Dataset.from_dict({
        "text": texts,
        "audio_tokens": tokens
    })

In [None]:
from datasets import dataset_dict, DatasetDict

# Wrap the per-split Dataset objects collected in `tokenized_ds` in a DatasetDict.
tds = DatasetDict(tokenized_ds)

In [None]:
# Persist the tokenized splits under "librispeech_tokenized.hf" (path is relative
# to the notebook's working directory).
tds.save_to_disk("librispeech_tokenized.hf")

In [2]:
from datasets import load_dataset

# `load_dataset` fails on the directory written by `save_to_disk` (see the
# ValueError in this cell's output); `DatasetDict.load_from_disk` is the call
# that reads it. The error is caught and printed so the failed attempt stays
# documented without aborting a Restart & Run All.
try:
    dataset = load_dataset("./librispeech_tokenized.hf")
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Couldn't infer the same data file format for all splits. Got {NamedSplit('train'): ('arrow', {}), NamedSplit('validation'): ('json', {}), NamedSplit('test'): ('json', {})}

In [8]:
from datasets import dataset_dict, DatasetDict, Dataset

# Read back the directory written by `save_to_disk`; `load_from_disk` is its
# matching reader (unlike `load_dataset`, which raised the ValueError above).
dataset = DatasetDict.load_from_disk("./librispeech_tokenized.hf")

In [13]:
# Number of audio tokens stored for the first "train.clean.100" example.
len(dataset["train.clean.100"][0]["audio_tokens"])

726