{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-md",
   "metadata": {},
   "source": [
    "# data2vec + RepCodec tokenization\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. extracts data2vec features (layer 18) for one utterance,\n",
    "2. maps them to RepCodec codebook indices (`tokens`),\n",
    "3. dumps data2vec features for one shard of a `.tsv` manifest to `.npy` / `.len` files.\n",
    "\n",
    "It is meant to run top to bottom on a fresh kernel (Restart Kernel -> Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "import logging\n",
    "import os\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import tqdm\n",
    "import yaml\n",
    "from npy_append_array import NpyAppendArray\n",
    "\n",
    "# A notebook has no parent package, so `from .RepCodec import RepCodec` raises\n",
    "# \"attempted relative import with no known parent package\". Import the package\n",
    "# by its absolute name instead.\n",
    "# NOTE(review): assumes the `repcodec` package sits one directory above this\n",
    "# notebook (the config below is read from ../repcodec/configs) - confirm.\n",
    "REPO_ROOT = Path(\"..\").resolve()\n",
    "if str(REPO_ROOT) not in sys.path:\n",
    "    sys.path.insert(0, str(REPO_ROOT))\n",
    "\n",
    "from data2vec_feature_reader import Data2vecFeatureReader\n",
    "from repcodec.RepCodec import RepCodec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "DEVICE = \"cuda:0\"\n",
    "LAYER = 18\n",
    "MAX_CHUNK = 1_600_000\n",
    "\n",
    "DATA2VEC_CKPT = \"./../../models/vox_pretrained.pt\"\n",
    "REPCODEC_CONFIG = \"./../repcodec/configs/repcodec_dim1024.yaml\"\n",
    "REPCODEC_CKPT = \"./../../models/data2vec_large_l18.pkl\"\n",
    "\n",
    "# Settings for the sharded feature dump at the end of the notebook.\n",
    "# NOTE(review): the notebook used tsv_path / nshard / rank / feat_dir without\n",
    "# ever defining them. The two paths are placeholders and must be set for your\n",
    "# data; nshard=1 / rank=0 processes the whole manifest as a single shard.\n",
    "tsv_path = \"./manifest/train.tsv\"\n",
    "nshard = 1\n",
    "rank = 0\n",
    "feat_dir = \"./features\"\n",
    "\n",
    "# `logger` is used by the dump helpers below but was never created.\n",
    "logging.basicConfig(\n",
    "    format=\"%(asctime)s | %(levelname)s | %(name)s | %(message)s\",\n",
    "    datefmt=\"%Y-%m-%d %H:%M:%S\",\n",
    "    level=os.environ.get(\"LOGLEVEL\", \"INFO\").upper(),\n",
    "    stream=sys.stdout,\n",
    ")\n",
    "logger = logging.getLogger(\"dump_feature\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "reader-md",
   "metadata": {},
   "source": [
    "## Load the data2vec feature reader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72bf1b45-66fd-450d-8d5c-bec9e0b3d08f",
   "metadata": {},
   "outputs": [],
   "source": [
    "reader = Data2vecFeatureReader(DATA2VEC_CKPT, LAYER, device=DEVICE, max_chunk=MAX_CHUNK)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dataset-md",
   "metadata": {},
   "source": [
    "## Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84a9d238-048a-4772-a47b-5aadc50f36df",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# NOTE(review): the source of this cell could not be recovered from the\n",
    "# notebook as reviewed (only its \"Loading dataset shards\" progress output\n",
    "# survived). The call below is reconstructed from how `dataset` is used later:\n",
    "#     dataset[\"train.clean.100\"][1][\"audio\"][\"array\"]\n",
    "# Confirm the dataset name / config before relying on it.\n",
    "dataset = load_dataset(\"librispeech_asr\", \"all\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "repcodec-md",
   "metadata": {},
   "source": [
    "## Load the RepCodec model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "repcodec-load-model",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(REPCODEC_CONFIG) as fp:\n",
    "    # The config is a trusted local file; prefer yaml.safe_load if that ever changes.\n",
    "    conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
    "\n",
    "model = RepCodec(**conf)\n",
    "# torch.load unpickles the checkpoint, so only load files you trust.\n",
    "checkpoint = torch.load(REPCODEC_CKPT, map_location=DEVICE)\n",
    "model.load_state_dict(checkpoint[\"model\"][\"repcodec\"])\n",
    "model.quantizer.initial()\n",
    "model.eval()\n",
    "model.to(DEVICE);"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "tokenize-md",
   "metadata": {},
   "source": [
    "## Tokenize one utterance\n",
    "\n",
    "`extract_features` runs the waveform through the data2vec reader in chunks of `reader.max_chunk` samples; `tokenize` maps the resulting features to RepCodec codebook indices."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9a1731e-052c-4af0-a29c-b171a988b300",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_features(reader, waveform):\n",
    "    \"\"\"Extract data2vec features for an in-memory waveform.\n",
    "\n",
    "    Args:\n",
    "        reader: feature reader exposing `device`, `task.cfg.normalize`,\n",
    "            `max_chunk`, `layer` and `model.extract_features`.\n",
    "        waveform: NumPy array of audio samples; it is flattened to one row.\n",
    "\n",
    "    Returns:\n",
    "        The per-chunk `res[\"x\"]` outputs concatenated along dim 1 and permuted\n",
    "        with (0, 2, 1) - presumably (1, feature_dim, frames); TODO confirm.\n",
    "    \"\"\"\n",
    "    with torch.no_grad():\n",
    "        wav = torch.from_numpy(waveform).float().to(reader.device)\n",
    "        if reader.task.cfg.normalize:\n",
    "            wav = F.layer_norm(wav, wav.shape)\n",
    "        wav = wav.view(1, -1)\n",
    "\n",
    "        chunks = []\n",
    "        # Feed the model at most `max_chunk` samples at a time.\n",
    "        for start in range(0, wav.size(1), reader.max_chunk):\n",
    "            res = reader.model.extract_features(\n",
    "                source=wav[:, start:start + reader.max_chunk],\n",
    "                padding_mask=None,\n",
    "                mask=False,\n",
    "                layer=reader.layer,\n",
    "            )\n",
    "            chunks.append(res[\"x\"])\n",
    "\n",
    "        return torch.cat(chunks, 1).permute(0, 2, 1)\n",
    "\n",
    "\n",
    "def tokenize(model, features):\n",
    "    \"\"\"Map features to RepCodec codebook indices.\n",
    "\n",
    "    Runs `model.encoder`, then `model.projector`, then looks up codebook\n",
    "    indices with `model.quantizer.codebook.forward_index`.\n",
    "\n",
    "    Returns:\n",
    "        A Python list holding the indices of the first (only) batch item.\n",
    "    \"\"\"\n",
    "    with torch.no_grad():\n",
    "        hidden = model.encoder(features)\n",
    "        projected = model.projector(hidden)\n",
    "        _, idx = model.quantizer.codebook.forward_index(projected.transpose(2, 1))\n",
    "    return idx.cpu().numpy().tolist()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "tokenize-sample",
   "metadata": {},
   "outputs": [],
   "source": [
    "sample = dataset[\"train.clean.100\"][1]\n",
    "\n",
    "features = extract_features(reader, sample[\"audio\"][\"array\"])\n",
    "tokens = tokenize(model, features)\n",
    "\n",
    "len(tokens), tokens[:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dump-md",
   "metadata": {},
   "source": [
    "## Dump features for one shard of a manifest\n",
    "\n",
    "The manifest is a `.tsv` file whose first line is the audio root directory and whose remaining lines are `subpath<TAB>nsample`. Shard `rank` of `nshard` is written to `{feat_dir}/{rank}_{nshard}.npy` (features) and `{feat_dir}/{rank}_{nshard}.len` (one length per utterance)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1810e6dc-2ece-4aca-a29a-e1933b8ce82a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_shard_range(tot, nshard, rank):\n",
    "    \"\"\"Return the half-open `(start, end)` range of items owned by one shard.\n",
    "\n",
    "    Args:\n",
    "        tot: total number of items.\n",
    "        nshard: number of shards.\n",
    "        rank: index of this shard, `0 <= rank < nshard`.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if `rank` is out of range or the shard would be empty.\n",
    "    \"\"\"\n",
    "    assert rank < nshard and rank >= 0, f\"invalid rank/nshard {rank}/{nshard}\"\n",
    "    start = round(tot / nshard * rank)\n",
    "    end = round(tot / nshard * (rank + 1))\n",
    "    assert start < end, f\"start={start}, end={end}\"\n",
    "    logger.info(\n",
    "        f\"rank {rank} of {nshard}, process {end-start} \"\n",
    "        f\"({start}-{end}) out of {tot}\"\n",
    "    )\n",
    "    return start, end\n",
    "\n",
    "\n",
    "def get_path_iterator(tsv, nshard, rank):\n",
    "    \"\"\"Read a manifest and return `(iterate, count)` for one shard.\n",
    "\n",
    "    `iterate()` yields `(\"{root}/{subpath}\", nsample)` for each manifest line\n",
    "    in the shard; `count` is the number of lines in the shard.\n",
    "    \"\"\"\n",
    "    with open(tsv, \"r\") as f:\n",
    "        root = f.readline().rstrip()\n",
    "        # Skip blank lines (e.g. a trailing newline) that would break the split below.\n",
    "        lines = [line.rstrip() for line in f if line.strip()]\n",
    "    start, end = get_shard_range(len(lines), nshard, rank)\n",
    "    lines = lines[start:end]\n",
    "\n",
    "    def iterate():\n",
    "        for line in lines:\n",
    "            subpath, nsample = line.split(\"\\t\")\n",
    "            yield f\"{root}/{subpath}\", int(nsample)\n",
    "\n",
    "    return iterate, len(lines)\n",
    "\n",
    "\n",
    "def dump_feature(reader, generator, num, nshard, rank, feat_dir):\n",
    "    \"\"\"Write the features and lengths of every utterance in a shard.\n",
    "\n",
    "    Args:\n",
    "        reader: object whose `get_feats(path, nsample)` returns a tensor.\n",
    "        generator: zero-argument callable yielding `(path, nsample)` pairs.\n",
    "        num: number of utterances (progress-bar total only).\n",
    "        nshard, rank: used to name the output files.\n",
    "        feat_dir: output directory, created if missing.\n",
    "    \"\"\"\n",
    "    iterator = generator()\n",
    "\n",
    "    feat_path = f\"{feat_dir}/{rank}_{nshard}.npy\"\n",
    "    leng_path = f\"{feat_dir}/{rank}_{nshard}.len\"\n",
    "\n",
    "    os.makedirs(feat_dir, exist_ok=True)\n",
    "    # Start from a clean file: NpyAppendArray appends to an existing one.\n",
    "    if os.path.exists(feat_path):\n",
    "        os.remove(feat_path)\n",
    "\n",
    "    # Both files are closed (and the .npy flushed) even if extraction fails midway.\n",
    "    with NpyAppendArray(feat_path) as feat_f, open(leng_path, \"w\") as leng_f:\n",
    "        for path, nsample in tqdm.tqdm(iterator, total=num):\n",
    "            feat = reader.get_feats(path, nsample)\n",
    "            feat_f.append(feat.cpu().numpy())\n",
    "            leng_f.write(f\"{len(feat)}\\n\")\n",
    "    logger.info(\"finished successfully\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dump-run",
   "metadata": {},
   "outputs": [],
   "source": [
    "generator, num = get_path_iterator(tsv_path, nshard, rank)\n",
    "dump_feature(reader, generator, num, nshard, rank, feat_dir)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}