darshanmakwana committed on
Commit 2cddd11 · verified · 1 Parent(s): 03d39ff

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +7 -0
  2. prompting/.ipynb_checkpoints/dataset_generation-checkpoint.ipynb +332 -0
  3. prompting/.ipynb_checkpoints/generate_rare_words-checkpoint.py +62 -0
  4. prompting/.ipynb_checkpoints/generate_transcripts-checkpoint.py +60 -0
  5. prompting/.ipynb_checkpoints/get_error_word_count-checkpoint.py +106 -0
  6. prompting/.ipynb_checkpoints/model-checkpoint.py +53 -0
  7. prompting/.ipynb_checkpoints/train_clean_100_error-checkpoint.json +3 -0
  8. prompting/.ipynb_checkpoints/train_lora-checkpoint.py +137 -0
  9. prompting/.ipynb_checkpoints/train_phi-checkpoint.py +86 -0
  10. prompting/.ipynb_checkpoints/training-checkpoint.ipynb +278 -0
  11. prompting/RepCodec/.gitignore +160 -0
  12. prompting/RepCodec/.ipynb_checkpoints/tinker-checkpoint.ipynb +267 -0
  13. prompting/RepCodec/LICENSE +428 -0
  14. prompting/RepCodec/README.md +273 -0
  15. prompting/RepCodec/dataloader/__init__.py +2 -0
  16. prompting/RepCodec/dataloader/collater.py +22 -0
  17. prompting/RepCodec/dataloader/dataset.py +90 -0
  18. prompting/RepCodec/examples/.ipynb_checkpoints/Untitled-checkpoint.ipynb +334 -0
  19. prompting/RepCodec/examples/.ipynb_checkpoints/data2vec_audio-checkpoint.py +541 -0
  20. prompting/RepCodec/examples/.ipynb_checkpoints/data2vec_feature_reader-checkpoint.py +87 -0
  21. prompting/RepCodec/examples/.ipynb_checkpoints/dump_feature-checkpoint.py +142 -0
  22. prompting/RepCodec/examples/.ipynb_checkpoints/feature_utils-checkpoint.py +70 -0
  23. prompting/RepCodec/examples/.ipynb_checkpoints/some_run-checkpoint.py +66 -0
  24. prompting/RepCodec/examples/Untitled.ipynb +214 -0
  25. prompting/RepCodec/examples/__pycache__/data2vec_audio.cpython-38.pyc +0 -0
  26. prompting/RepCodec/examples/__pycache__/data2vec_feature_reader.cpython-38.pyc +0 -0
  27. prompting/RepCodec/examples/__pycache__/feature_utils.cpython-38.pyc +0 -0
  28. prompting/RepCodec/examples/__pycache__/hubert_feature_reader.cpython-38.pyc +0 -0
  29. prompting/RepCodec/examples/__pycache__/tokenize.cpython-38.pyc +0 -0
  30. prompting/RepCodec/examples/data2vec_audio.py +541 -0
  31. prompting/RepCodec/examples/data2vec_feature_reader.py +87 -0
  32. prompting/RepCodec/examples/dump_feature.py +142 -0
  33. prompting/RepCodec/examples/feature_utils.py +70 -0
  34. prompting/RepCodec/examples/hubert_feature_reader.py +64 -0
  35. prompting/RepCodec/examples/some_run.py +66 -0
  36. prompting/RepCodec/examples/tkns/test.clean.npz +3 -0
  37. prompting/RepCodec/examples/tkns/test.other.npz +3 -0
  38. prompting/RepCodec/examples/tkns/train.clean.100.npz +3 -0
  39. prompting/RepCodec/examples/tkns/train.clean.360.npz +3 -0
  40. prompting/RepCodec/examples/tkns/train.other.500.npz +3 -0
  41. prompting/RepCodec/examples/tkns/validation.clean.npz +3 -0
  42. prompting/RepCodec/examples/tkns/validation.other.npz +3 -0
  43. prompting/RepCodec/examples/tokens/data2vec_base_l6_dev-clean.tokens +0 -0
  44. prompting/RepCodec/examples/tokens/data2vec_large_l18_dev-clean.tokens +0 -0
  45. prompting/RepCodec/examples/tokens/hubert_base_l9_dev-clean.tokens +0 -0
  46. prompting/RepCodec/examples/tokens/hubert_large_l18_dev-clean.tokens +0 -0
  47. prompting/RepCodec/examples/tokens/whisper_large_l32_dev-clean.tokens +0 -0
  48. prompting/RepCodec/examples/tokens/whisper_medium_l24_dev-clean.tokens +0 -0
  49. prompting/RepCodec/examples/whisper_feature_reader.py +110 -0
  50. prompting/RepCodec/examples/whisper_model.py +58 -0
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ prompting/.ipynb_checkpoints/train_clean_100_error-checkpoint.json filter=lfs diff=lfs merge=lfs -text
37
+ prompting/train_clean_100_error.json filter=lfs diff=lfs merge=lfs -text
38
+ prompting/train_data/train.clean.360.json filter=lfs diff=lfs merge=lfs -text
39
+ prompting/train_data/train.other.500.json filter=lfs diff=lfs merge=lfs -text
40
+ prompting/transcripts/train.clean.360.txt filter=lfs diff=lfs merge=lfs -text
41
+ prompting/transcripts/train.other.500.txt filter=lfs diff=lfs merge=lfs -text
42
+ prompting/wandb/run-20240615_114519-wfpe2teb/run-wfpe2teb.wandb filter=lfs diff=lfs merge=lfs -text
prompting/.ipynb_checkpoints/dataset_generation-checkpoint.ipynb ADDED
@@ -0,0 +1,332 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "59efc3d7-a57f-43cc-8aa3-34bb57de0251",
6
+ "metadata": {},
7
+ "source": [
8
+ "## Librispeech"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "327243de-fd0f-449d-998a-63282a1c67a2",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from datasets import load_dataset\n",
19
+ "\n",
20
+ "cache_dir = \"./../cache\"\n",
21
+ "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir)"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "id": "456889e1-f8cc-440b-bf6b-f6fbfafc367d",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "from torchmetrics import WordErrorRate, CharErrorRate\n",
32
+ "from edit_distance import SequenceMatcher\n",
33
+ "from tqdm import tqdm\n",
34
+ "import jiwer\n",
35
+ "\n",
36
+ "def correct_text(text):\n",
37
+ " transforms = jiwer.Compose(\n",
38
+ " [\n",
39
+ " jiwer.ExpandCommonEnglishContractions(),\n",
40
+ " jiwer.ToLowerCase(),\n",
41
+ " jiwer.RemoveMultipleSpaces(),\n",
42
+ " jiwer.Strip(),\n",
43
+ " jiwer.RemovePunctuation(),\n",
44
+ " jiwer.ReduceToListOfListOfWords(),\n",
45
+ " ]\n",
46
+ " )\n",
47
+ " return transforms(text)\n",
48
+ "\n",
49
+ "def align_gt_asr(gt, asr):\n",
50
+ "\n",
51
+ " sm = SequenceMatcher(a=gt, b=asr)\n",
52
+ " best_path = []\n",
53
+ " opcodes = sm.get_opcodes()\n",
54
+ "\n",
55
+ " for tag, i1, i2, j1, j2 in opcodes:\n",
56
+ "\n",
57
+ " if tag == \"delete\":\n",
58
+ " for i in range(i1, i2):\n",
59
+ " best_path.append([gt[i], \"\"])\n",
60
+ "\n",
61
+ " if tag == \"replace\" or tag == \"equal\":\n",
62
+ " for i, j in zip(range(i1, i2), range(j1, j2)):\n",
63
+ " best_path.append([gt[i], asr[j]])\n",
64
+ "\n",
65
+ " if tag == \"insert\":\n",
66
+ " for j in range(j1, j2):\n",
67
+ " best_path.append([\"\", asr[j]])\n",
68
+ "\n",
69
+ " return best_path\n",
70
+ "\n",
71
+ "import string\n",
72
+ "def process(text):\n",
73
+ "\n",
74
+ " # Lower case every letter\n",
75
+ " text = text.lower()\n",
76
+ "\n",
77
+ " # Remove punctuation\n",
78
+ " punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n",
79
+ " translation_table = str.maketrans('', '', punctuation_to_remove)\n",
80
+ " text = text.translate(translation_table)\n",
81
+ "\n",
82
+ " # Remove whitespaces from front and behind\n",
83
+ " while text[0] == ' ' or text[-1] == ' ':\n",
84
+ " if text[0] == ' ':\n",
85
+ " text = text[1:]\n",
86
+ " if text[-1] == ' ':\n",
87
+ " text = text[:-1]\n",
88
+ " \n",
89
+ " return text"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "id": "3bc907b0-2ebe-46ac-b6a1-02919e69af88",
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "from tqdm import tqdm\n",
100
+ "\n",
101
+ "gens = []\n",
102
+ "texts = []\n",
103
+ "\n",
104
+ "unmatches = []\n",
105
+ "\n",
106
+ "for split in [\"validation.clean\"]:\n",
107
+ " data = dataset[split]\n",
108
+ " with open(f\"./transcripts/{split}.txt\", \"r\") as f:\n",
109
+ " for idx, line in enumerate(tqdm(f)):\n",
110
+ " preds = process(line.rstrip())\n",
111
+ " text = data[idx][\"text\"]\n",
112
+ "\n",
113
+ " path = align_gt_asr(correct_text(text)[0], correct_text(preds)[0])\n",
114
+ " un = 0\n",
115
+ " for a, b in path:\n",
116
+ " if a!=b:\n",
117
+ " un+=1\n",
118
+ " \n",
119
+ " unmatches.append(un)\n",
120
+ "\n",
121
+ " # texts.append(process(text))\n",
122
+ " # gens.append(preds)"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "cac10009-1b47-4e2f-a232-f71b23ee983e",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "import numpy as np\n",
133
+ "\n",
134
+ "np.count_nonzero(unmatches)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "id": "afdc9f74-c2cf-4d52-8563-1bd827f6d900",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "def align_gt_asr(gt, asr):\n",
145
+ "\n",
146
+ " sm = SequenceMatcher(a=gt, b=asr)\n",
147
+ " best_path = []\n",
148
+ " opcodes = sm.get_opcodes()\n",
149
+ " \n",
150
+ " for tag, i1, i2, j1, j2 in opcodes:\n",
151
+ " \n",
152
+ " if tag == \"delete\":\n",
153
+ " for i in range(i1, i2):\n",
154
+ " best_path.append([gt[i], \"\"])\n",
155
+ " \n",
156
+ " if tag == \"replace\" or tag == \"equal\":\n",
157
+ " for i, j in zip(range(i1, i2), range(j1, j2)):\n",
158
+ " best_path.append([gt[i], asr[j]])\n",
159
+ " \n",
160
+ " if tag == \"insert\":\n",
161
+ " for j in range(j1, j2):\n",
162
+ " best_path.append([\"\", asr[j]])\n",
163
+ " \n",
164
+ " return best_path\n",
165
+ "\n",
166
+ "# align_gt_asr(correct_text(text), correct_text(preds))"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "id": "3cdfd3d9-6c22-4ccd-a22b-df8e79fc20b0",
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "correct_text(text)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "2c33f46a-f3dd-435f-81e3-e7b10ae03470",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "correct_text([\"hello\", \"hey\"])"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "2cfab12a-2b2c-4c00-bd80-ab571c012f29",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "## Transcript of whisper small WER\n",
197
+ "## validation.clean 4.62\n",
198
+ "## validation.other 8.11\n",
199
+ "## test.clean 4.22\n",
200
+ "## test.other 8.56\n"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "id": "24cb2d8f-9ce2-42f2-bbf0-522106078aac",
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
211
+ "from datasets import load_dataset\n",
212
+ "import numpy as np\n",
213
+ "import torch\n",
214
+ "\n",
215
+ "device = \"cuda:0\"\n",
216
+ "dtype = torch.float16\n",
217
+ "cache_dir = \"./../cache\"\n",
218
+ "model_id = \"openai/whisper-small\"\n",
219
+ "\n",
220
+ "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\", cache_dir=cache_dir)\n",
221
+ "model = WhisperForConditionalGeneration.from_pretrained(model_id, cache_dir=cache_dir, attn_implementation=\"sdpa\").to(device).to(dtype).eval()"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "markdown",
226
+ "id": "d5fa6f8e-43f2-44ce-b719-2d8fde4067ce",
227
+ "metadata": {},
228
+ "source": [
229
+ "## Biasing List"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": null,
235
+ "id": "3cc0f934-d208-445e-aecd-31df73be6986",
236
+ "metadata": {},
237
+ "outputs": [],
238
+ "source": [
239
+ "import sys, os\n",
240
+ "import json\n",
241
+ "import string\n",
242
+ "from tqdm import tqdm\n",
243
+ "def process(text):\n",
244
+ "\n",
245
+ " # Lower case every letter\n",
246
+ " text = text.lower()\n",
247
+ "\n",
248
+ " # Remove punctuation\n",
249
+ " punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n",
250
+ " translation_table = str.maketrans('', '', punctuation_to_remove)\n",
251
+ " text = text.translate(translation_table)\n",
252
+ "\n",
253
+ " # Remove whitespaces from front and behind\n",
254
+ " while text[0] == ' ' or text[-1] == ' ':\n",
255
+ " if text[0] == ' ':\n",
256
+ " text = text[1:]\n",
257
+ " if text[-1] == ' ':\n",
258
+ " text = text[:-1]\n",
259
+ " \n",
260
+ " return text\n",
261
+ "\n",
262
+ "split_name = \"train.clean.100\"\n",
263
+ "\n",
264
+ "with open(\"./blist/all_rare_words.txt\") as fin:\n",
265
+ " rarewords = [process(word.strip()) for word in fin]\n",
266
+ "\n",
267
+ "with open(f\"./transcripts/{split_name}.txt\") as fin:\n",
268
+ " transcripts = [line.strip() for line in fin]\n",
269
+ "\n",
270
+ "from datasets import load_dataset\n",
271
+ "\n",
272
+ "cache_dir = \"./../cache\"\n",
273
+ "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)\n",
274
+ "\n",
275
+ "train_data = []\n",
276
+ "\n",
277
+ "pbar = tqdm(dataset[split_name])\n",
278
+ "for idx, sample in enumerate(pbar):\n",
279
+ " \n",
280
+ " text = process(sample[\"text\"])\n",
281
+ " transcript = transcripts[idx]\n",
282
+ " \n",
283
+ " bwords = []\n",
284
+ " for word in text.split():\n",
285
+ " if word in rarewords and word not in transcript:\n",
286
+ " bwords.append(word)\n",
287
+ " \n",
288
+ " if len(bwords) > 0:\n",
289
+ " train_data.append({\n",
290
+ " \"split\": split_name,\n",
291
+ " \"idx\": idx,\n",
292
+ " \"text\": text,\n",
293
+ " \"transcript\": transcript,\n",
294
+ " \"b_words\": bwords,\n",
295
+ " })\n",
296
+ " pbar.set_description(f\"Len of train data: {len(train_data)}\")"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": null,
302
+ "id": "cac9a909-e1ce-426a-bda3-b65ba3985d06",
303
+ "metadata": {},
304
+ "outputs": [],
305
+ "source": [
306
+ "with open(f\"./train_data/{split_name}.json\", \"w\") as fout:\n",
307
+ " json.dump(train_data, fout, indent=4)"
308
+ ]
309
+ }
310
+ ],
311
+ "metadata": {
312
+ "kernelspec": {
313
+ "display_name": "Python 3 (ipykernel)",
314
+ "language": "python",
315
+ "name": "python3"
316
+ },
317
+ "language_info": {
318
+ "codemirror_mode": {
319
+ "name": "ipython",
320
+ "version": 3
321
+ },
322
+ "file_extension": ".py",
323
+ "mimetype": "text/x-python",
324
+ "name": "python",
325
+ "nbconvert_exporter": "python",
326
+ "pygments_lexer": "ipython3",
327
+ "version": "3.8.10"
328
+ }
329
+ },
330
+ "nbformat": 4,
331
+ "nbformat_minor": 5
332
+ }
prompting/.ipynb_checkpoints/generate_rare_words-checkpoint.py ADDED
@@ -0,0 +1,62 @@
1
+ import sys, os
2
+ import json
3
+ import string
4
+ from tqdm import tqdm
5
+
6
+ def process(text):
7
+
8
+ # Lower case every letter
9
+ text = text.lower()
10
+
11
+ # Remove punctuation
12
+ punctuation_to_remove = string.punctuation.replace("'", "")
13
+ translation_table = str.maketrans('', '', punctuation_to_remove)
14
+ text = text.translate(translation_table)
15
+
16
+ # Remove whitespaces from front and behind
17
+ while text[0] == ' ' or text[-1] == ' ':
18
+ if text[0] == ' ':
19
+ text = text[1:]
20
+ if text[-1] == ' ':
21
+ text = text[:-1]
22
+
23
+ return text
24
+
25
+ split_name = "train.other.500"
26
+
27
+ with open("./blist/all_rare_words.txt") as fin:
28
+ rarewords = [process(word.strip()) for word in fin]
29
+
30
+ with open(f"./transcripts/{split_name}.txt") as fin:
31
+ transcripts = [line.strip() for line in fin]
32
+
33
+ from datasets import load_dataset
34
+
35
+ cache_dir = "./../cache"
36
+ dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)
37
+
38
+ train_data = []
39
+
40
+ pbar = tqdm(dataset[split_name])
41
+ for idx, sample in enumerate(pbar):
42
+
43
+ text = process(sample["text"])
44
+ transcript = transcripts[idx]
45
+
46
+ bwords = []
47
+ for word in text.split():
48
+ if word in rarewords and word not in transcript:
49
+ bwords.append(word)
50
+
51
+ if len(bwords) > 0:
52
+ train_data.append({
53
+ "split": split_name,
54
+ "idx": idx,
55
+ "text": text,
56
+ "transcript": transcript,
57
+ "b_words": bwords,
58
+ })
59
+ pbar.set_description(f"Len of train data: {len(train_data)}")
60
+
61
+ with open(f"./train_data/{split_name}.json", "w") as fout:
62
+ json.dump(train_data, fout, indent=4)
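The script above emits one JSON record per utterance whose reference text contains rare words missing from the Whisper transcript. A minimal sketch of how that output could be inspected and turned into a biasing prompt downstream (the file path and field names come from the script above; the prompt format itself is only an illustrative assumption, not part of this commit):

    import json

    # Load the biasing data written by the script above (path assumed from split_name).
    with open("./train_data/train.other.500.json") as fin:
        train_data = json.load(fin)

    # Each record carries the split name, utterance index, reference text,
    # Whisper transcript, and the rare words the transcript failed to produce.
    sample = train_data[0]
    print(sample["idx"], sample["b_words"])

    # Hypothetical biasing prompt built from the missed rare words of one utterance.
    biasing_prompt = "Pay attention to the following words: " + ", ".join(sample["b_words"])
    print(biasing_prompt)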
prompting/.ipynb_checkpoints/generate_transcripts-checkpoint.py ADDED
@@ -0,0 +1,60 @@
1
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
2
+ from datasets import load_dataset
3
+ from tqdm import tqdm
4
+ from math import ceil
5
+ from model import generate, flush
6
+ import numpy as np
7
+ import os
8
+ import torch
9
+ import string
10
+
11
+ def process(text):
12
+
13
+ # Lower case every letter
14
+ text = text.lower()
15
+
16
+ # Remove punctuation
17
+ punctuation_to_remove = string.punctuation.replace("'", "")
18
+ translation_table = str.maketrans('', '', punctuation_to_remove)
19
+ text = text.translate(translation_table)
20
+
21
+ # Remove whitespaces from front and behind
22
+ while text[0] == ' ' or text[-1] == ' ':
23
+ if text[0] == ' ':
24
+ text = text[1:]
25
+ if text[-1] == ' ':
26
+ text = text[:-1]
27
+
28
+ return text
29
+
30
+ device = "cuda:0"
31
+ dtype = torch.float16
32
+ cache_dir = "./../cache"
33
+ model_id = "openai/whisper-small"
34
+ batch_size = 250
35
+ out_dir = "./transcripts"
36
+
37
+ dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)
38
+
39
+ processor = WhisperProcessor.from_pretrained(model_id, cache_dir=cache_dir)
40
+ model = WhisperForConditionalGeneration.from_pretrained(model_id, cache_dir=cache_dir, attn_implementation="sdpa").to(device).to(dtype).eval()
41
+
42
+ for split in dataset.keys():
43
+
44
+ data = dataset[split]
45
+
46
+ os.makedirs(out_dir, exist_ok=True)
47
+
48
+ for idx in tqdm(range(ceil(len(data)/batch_size))):
49
+
50
+ audios = data[idx * batch_size: (idx + 1) * batch_size]["audio"]
51
+
52
+ arrays = [a["array"] for a in audios]
53
+
54
+ transcripts = generate(arrays, model, processor)
55
+
56
+ with open(os.path.join(out_dir, f"{split}.txt"), "a") as disk:
57
+ disk.writelines([process(text) + "\n" for text in transcripts])
58
+ disk.close()
59
+
60
+ flush()
prompting/.ipynb_checkpoints/get_error_word_count-checkpoint.py ADDED
@@ -0,0 +1,106 @@
1
+ import sys, os
2
+ # from normalizers.english import EnglishTextNormalizer
3
+
4
+ error_words_freqs = {}
5
+ infile = sys.argv[1]
6
+ # setname = sys.argv[2]
7
+ insert_error = 0
8
+ insert_rare = 0
9
+ freqlist_test = {}
10
+ # eng_norm = EnglishTextNormalizer()
11
+
12
+ freqlist = {}
13
+ with open("./blist/word_freq.txt") as fin:
14
+ for line in fin:
15
+ word, freq = line.split()
16
+ freqlist[word.upper()] = int(freq)
17
+
18
+ with open("./blist/all_rare_words.txt") as fin:
19
+ rareset = set()
20
+ for line in fin:
21
+ rareset.add(line.strip().upper())
22
+
23
+ project_set = set()
24
+ with open(infile) as fin:
25
+ lines = fin.readlines()
26
+ for i, line in enumerate(lines):
27
+ if line.startswith('id:'):
28
+ project = line.strip(')\n').split('-')[-3:]
29
+ project = '-'.join(project)
30
+ if "REF:" in line:
31
+ nextline = lines[i+1].split()
32
+ for j, word in enumerate(line.split()):
33
+ if '*' in word:
34
+ insert_error += 1
35
+ if nextline[j].upper() in rareset:
36
+ insert_rare += 1
37
+ line = line.replace('*', '')
38
+ line = line.replace('%BCACK', '')
39
+ for word in line.split()[1:]:
40
+ if not word.startswith('('):
41
+ if word.upper() not in freqlist_test:
42
+ freqlist_test[word.upper()] = 1
43
+ else:
44
+ freqlist_test[word.upper()] += 1
45
+
46
+ if word != word.lower() and word.upper() in error_words_freqs:
47
+ error_words_freqs[word.upper()] += 1
48
+ elif word != word.lower() and word.upper() not in error_words_freqs:
49
+ error_words_freqs[word.upper()] = 1
50
+ elif word == word.lower() and word.upper() not in error_words_freqs:
51
+ error_words_freqs[word.upper()] = 0
52
+ print(len(error_words_freqs.keys()))
53
+ print(insert_rare)
54
+
55
+ commonwords = []
56
+ rarewords = []
57
+ oovwords = []
58
+ common_freq = 0
59
+ rare_freq = 0
60
+ oov_freq = 0
61
+ common_error = 0
62
+ rare_error = 0
63
+ oov_error = 0
64
+ partial_error = 0
65
+ partial_freq = 0
66
+ very_common_error = 0
67
+ very_common_words = 0
68
+ words_error_freq = {}
69
+ words_total_freq = {}
70
+ for word, error in error_words_freqs.items():
71
+ if word in rareset:
72
+ rarewords.append(word)
73
+ rare_freq += freqlist_test[word]
74
+ rare_error += error
75
+ elif word not in freqlist:
76
+ oovwords.append(word)
77
+ oov_freq += freqlist_test[word] if word in freqlist_test else 1
78
+ oov_error += error
79
+ else:
80
+ if freqlist[word] <= 10 and freqlist[word] >= 3:
81
+ if freqlist[word] not in words_error_freq:
82
+ words_error_freq[freqlist[word]] = error
83
+ words_total_freq[freqlist[word]] = freqlist_test[word]
84
+ else:
85
+ words_error_freq[freqlist[word]] += error
86
+ words_total_freq[freqlist[word]] += freqlist_test[word]
87
+ if freqlist[word] <= 10 and freqlist[word] >= 3:
88
+ very_common_error += error
89
+ very_common_words += freqlist_test[word]
90
+ commonwords.append(word)
91
+ common_freq += freqlist_test[word]
92
+ common_error += error
93
+
94
+ total_words = common_freq + rare_freq + oov_freq
95
+ total_errors = common_error+rare_error+oov_error + insert_error
96
+ WER = total_errors / total_words
97
+ print('='*89)
98
+ print('Common words error freq: {} / {} = {}'.format(common_error, common_freq, common_error/common_freq))
99
+ print('Rare words error freq: {} / {} = {}'.format(rare_error+insert_rare, rare_freq, (rare_error + insert_rare)/rare_freq))
100
+ print('OOV words error freq: {} / {} = {}'.format(oov_error, oov_freq, oov_error/max(oov_freq, 1)))
101
+ print('WER estimate: {} / {} = {}'.format(total_errors, total_words, WER))
102
+ # print('Partial word count: {} / {}'.format(partial_error, partial_freq))
103
+ print('Insert error: {} / {} = {}'.format(insert_error - insert_rare, total_words, (insert_error - insert_rare)/total_words))
104
+ print('Insertion + OOV error {}'.format((insert_error + oov_error - insert_rare) / total_words))
105
+ # print('Very common words error freq: {} / {} = {}'.format(very_common_error, very_common_words, very_common_error/very_common_words))
106
+ print('='*89)
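For context, this script appears to parse sclite/NIST-style per-utterance alignment output: an "id:" line names the utterance, the "REF:" line marks errors in upper case and insertions with runs of '*', and the hypothesis follows on the next line. A hypothetical fragment in that spirit, plus the counting it would trigger (illustrative assumption only; no such file ships in this commit):

    # Hypothetical sclite-style alignment fragment (not part of this commit).
    sample = """id: (unknown-1089-134686-0000)
    REF:  he hoped there would be STEW   ** for dinner
    HYP:  he hoped there would be stewed TO for dinner
    """

    lines = sample.splitlines()
    for i, line in enumerate(lines):
        if line.strip().startswith("id:"):
            print("utterance:", line.strip())
        if "REF:" in line:
            hyp = lines[i + 1].split()
            # Words kept in upper case on the REF line are substitution/deletion errors;
            # '*' padding marks an insertion in the hypothesis.
            errors = [w for w in line.split()[1:] if "*" not in w and w != w.lower()]
            insertions = sum("*" in w for w in line.split())
            print("error words:", errors, "| insertions:", insertions, "| hyp:", " ".join(hyp[1:]))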
prompting/.ipynb_checkpoints/model-checkpoint.py ADDED
@@ -0,0 +1,53 @@
1
+ import torch
2
+ import numpy as np
3
+ import gc
4
+ from typing import List
5
+
6
+ def flush():
7
+ gc.collect()
8
+ torch.cuda.empty_cache()
9
+
10
+ @torch.no_grad()
11
+ def generate(arrays, model, processor, max_new_tokens = 444) -> List[str]:
12
+ """
13
+ arrays: a list of audio arrays
14
+ model: the whisper model to use
15
+ processor: the wisper processor to use
16
+ """
17
+
18
+ inputs = processor(arrays, sampling_rate=16000, return_tensors="pt").input_features
19
+
20
+ # Cache the encoder hidden states
21
+ encoder_hidden_states = model.model.encoder(inputs.to(model.device).to(model.dtype)).last_hidden_state
22
+
23
+ decoder_ids = torch.tensor([[50258, 50259, 50359, 50363] for _ in range(inputs.shape[0])]).to(model.device)
24
+
25
+ # Tensor to keep track of which samples have reached the end of text token
26
+ inference_continues = torch.ones(inputs.shape[0], dtype=torch.bool).to(model.device)
27
+
28
+ while inference_continues.any() and max_new_tokens > 0:
29
+
30
+ last_hidden_state = model.model.decoder(input_ids = decoder_ids, encoder_hidden_states = encoder_hidden_states).last_hidden_state
31
+
32
+ # A small optimization to only project the hidden states of the last token
33
+ last_token_hidden_state = last_hidden_state[:, -1, :]
34
+ logits = model.proj_out(last_token_hidden_state)
35
+
36
+ # Greedy Sampling
37
+ probas = torch.softmax(logits, dim=-1)
38
+ pred_idx = torch.argmax(probas, dim=-1, keepdim=True)
39
+
40
+ # Fill the samples where inference has stopped with <|end of text|> token
41
+ pred_idx[~inference_continues, :] = 50257
42
+
43
+ decoder_ids = torch.cat((decoder_ids, pred_idx), dim=-1)
44
+
45
+ # Check if any sample has reached the end of text token
46
+ reached_end_of_text = (pred_idx.squeeze(-1) == 50257)
47
+ inference_continues &= ~reached_end_of_text
48
+
49
+ max_new_tokens -= 1
50
+
51
+ transcripts = processor.batch_decode(decoder_ids, skip_special_tokens=True)
52
+
53
+ return transcripts
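A minimal sketch of how generate() and flush() above might be driven for one batch. For reference, the hard-coded decoder IDs 50258/50259/50359/50363 correspond to Whisper's <|startoftranscript|>, <|en|>, <|transcribe|> and <|notimestamps|> tokens, and 50257 to <|endoftext|>, in the multilingual whisper-small vocabulary; the silent dummy audio below is only a placeholder (assumption, not part of the commit):

    import numpy as np
    import torch
    from transformers import WhisperProcessor, WhisperForConditionalGeneration
    from model import generate, flush  # the helpers defined above

    device, dtype = "cuda:0", torch.float16
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")
    model = (
        WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
        .to(device).to(dtype).eval()
    )

    # Two 2-second clips of silence at 16 kHz stand in for real audio arrays.
    arrays = [np.zeros(32000, dtype=np.float32) for _ in range(2)]

    transcripts = generate(arrays, model, processor)
    print(transcripts)

    flush()  # release cached CUDA memory between batches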
prompting/.ipynb_checkpoints/train_clean_100_error-checkpoint.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71787a0de524627122b64b419a50d551ca4a9ddb5ac3888c2e54990850cde26e
3
+ size 12684279
prompting/.ipynb_checkpoints/train_lora-checkpoint.py ADDED
@@ -0,0 +1,137 @@
1
+ from datasets import load_dataset
2
+ from peft import LoraConfig, prepare_model_for_kbit_training, TaskType
3
+ from transformers import (
4
+ AutoModelForCausalLM,
5
+ AutoTokenizer,
6
+ BitsAndBytesConfig,
7
+ TrainingArguments,
8
+ set_seed,
9
+ pipeline
10
+ )
11
+ from trl import SFTTrainer, SFTConfig
12
+ from random import randrange
13
+ import torch
14
+ import wandb
15
+
16
+ cache_dir = "./../cache"
17
+ model_id = "microsoft/Phi-3-mini-4k-instruct"
18
+ new_model = "python-phi-3-mini-4k-instruct"
19
+ username = "ellipticaloranges"
20
+ device_map = {"": 0}
21
+ hf_model_repo = username + "/" + new_model
22
+
23
+ ## ------------------------LoRA Configs------------------------------------------------------
24
+
25
+ lora_r = 16
26
+ lora_alpha = 16
27
+ lora_dropout = 0.05
28
+ target_modules= ['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"]
29
+
30
+ ## ------------------------------------------------------------------------------------------
31
+
32
+ dataset_name = "flytech/python-codes-25k"
33
+ dataset_split= "train"
34
+
35
+ dataset = load_dataset(dataset_name, split=dataset_split, cache_dir=cache_dir)
36
+ print(f"Dataset size: {len(dataset)}")
37
+
38
+
39
+ tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=cache_dir, trust_remote_code=True, add_eos_token=True, use_fast=True)
40
+ # The padding token is set to the unknown token.
41
+ tokenizer.pad_token = tokenizer.unk_token
42
+ # The ID of the padding token is set to the ID of the unknown token.
43
+ tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
44
+ # ValueError: You are attempting to perform batched generation with padding_side='right' this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to call `tokenizer.padding_side = 'left'` before tokenizing the input.
45
+ tokenizer.padding_side = 'left'
46
+
47
+
48
+ def create_message_column(row):
49
+ messages = []
50
+ user = {
51
+ "content": f"{row['instruction']}",
52
+ "role": "user"
53
+ }
54
+ messages.append(user)
55
+ assistant = {
56
+ "content": f"{row['input']}\n{row['output']}",
57
+ "role": "assistant"
58
+ }
59
+ messages.append(assistant)
60
+ return {"messages": messages}
61
+
62
+ def format_dataset_chatml(row):
63
+ return {"text": tokenizer.apply_chat_template(row["messages"], add_generation_prompt=False, tokenize=False)}
64
+
65
+ dataset_chatml = dataset.map(create_message_column)
66
+ dataset_chatml = dataset_chatml.map(format_dataset_chatml)
67
+ dataset_chatml = dataset_chatml.train_test_split(test_size=0.05, seed=1234)
68
+
69
+ # print("Max Seq Length", max(map(lambda x: len(tokenizer.encode(x["text"])), dataset)))
70
+
71
+ if torch.cuda.is_bf16_supported():
72
+ compute_dtype = torch.bfloat16
73
+ attn_implementation = 'flash_attention_2'
74
+ else:
75
+ compute_dtype = torch.float16
76
+ attn_implementation = 'sdpa'
77
+
78
+ print(f"Using {compute_dtype} with {attn_implementation} implementation")
79
+
80
+ model = AutoModelForCausalLM.from_pretrained(
81
+ model_id,
82
+ torch_dtype = compute_dtype,
83
+ trust_remote_code = True,
84
+ device_map = device_map,
85
+ attn_implementation = attn_implementation,
86
+ cache_dir = cache_dir
87
+ )
88
+
89
+ args = SFTConfig(
90
+ output_dir="./phi-3-mini-LoRA",
91
+ eval_strategy="steps",
92
+ do_eval=True,
93
+ optim="adamw_torch",
94
+ per_device_train_batch_size=8,
95
+ gradient_accumulation_steps=4,
96
+ per_device_eval_batch_size=8,
97
+ log_level="debug",
98
+ save_strategy="epoch",
99
+ logging_steps=10,
100
+ learning_rate=1e-4,
101
+ fp16 = not torch.cuda.is_bf16_supported(),
102
+ bf16 = torch.cuda.is_bf16_supported(),
103
+ eval_steps=100,
104
+ dataset_text_field="text",
105
+ max_seq_length=512,
106
+ num_train_epochs=3,
107
+ warmup_ratio=0.1,
108
+ lr_scheduler_type="linear",
109
+ report_to="wandb",
110
+ seed=42,
111
+ )
112
+
113
+ peft_config = LoraConfig(
114
+ r=lora_r,
115
+ lora_alpha=lora_alpha,
116
+ lora_dropout=lora_dropout,
117
+ task_type=TaskType.CAUSAL_LM,
118
+ target_modules=target_modules,
119
+ )
120
+
121
+ model.add_adapter(peft_config)
122
+
123
+ wandb.init(project = "Phi 3", name = "python-phi-3-lora")
124
+
125
+ trainer = SFTTrainer(
126
+ model=model,
127
+ train_dataset=dataset_chatml['train'],
128
+ eval_dataset=dataset_chatml['test'],
129
+ peft_config=peft_config,
130
+ tokenizer=tokenizer,
131
+ args=args,
132
+ )
133
+
134
+ trainer.train()
135
+
136
+ # Save the model to the `output_dir` after training
137
+ model.save_pretrained("./out/")
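After training, the LoRA weights saved to ./out/ could be reattached to the base model for inference roughly as follows, using the standard PEFT API (a hedged sketch: it assumes ./out/ holds the LoRA adapter, and the prompt and generation settings are illustrative, not from this commit):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base_id = "microsoft/Phi-3-mini-4k-instruct"
    tokenizer = AutoTokenizer.from_pretrained(base_id, trust_remote_code=True)
    base = AutoModelForCausalLM.from_pretrained(
        base_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
    )

    # Attach the saved LoRA adapter and merge it for plain inference.
    model = PeftModel.from_pretrained(base, "./out/")
    model = model.merge_and_unload()

    messages = [{"role": "user", "content": "Write a bubble sort in Python."}]
    inputs = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output = model.generate(inputs, max_new_tokens=200)
    print(tokenizer.decode(output[0], skip_special_tokens=True))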
prompting/.ipynb_checkpoints/train_phi-checkpoint.py ADDED
@@ -0,0 +1,86 @@
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
2
+ from huggingface_hub import ModelCard, ModelCardData, HfApi
3
+ from datasets import load_dataset
4
+ from jinja2 import Template
5
+ from trl import SFTTrainer
6
+ import yaml
7
+ import torch
8
+
9
+ # Model Configs
10
+ MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
11
+ NEW_MODEL_NAME = "opus-phi-3-mini-4k-instruct"
12
+ CACHE_DIR = "./../cache"
13
+
14
+ # Dataset Configs
15
+ DATASET_NAME = ""
16
+ SPLIT = "train"
17
+
18
+ # the maximum length of the sequences that the model will handle
19
+ MAX_SEQ_LENGTH = 4096
20
+ num_train_epochs = 1
21
+ license = "apache-2.0"
22
+ username = "darshanmakwana412"
23
+ learning_rate = 1.41e-5
24
+ per_device_train_batch_size = 4
25
+ gradient_accumulation_steps = 1
26
+
27
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
28
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)
29
+ dataset = load_dataset(DATASET_NAME, split=SPLIT)
30
+
31
+ # EOS Token is used to mark the end of a sentence
32
+ EOS_TOKEN=tokenizer.eos_token_id
33
+
34
+ def formatting_prompts_func(examples):
35
+ # Extract the conversations from the examples.
36
+ convos = examples["conversations"]
37
+ # Initialize an empty list to store the formatted texts.
38
+ texts = []
39
+ # Define a dictionary to map the 'from' field in the conversation to a prefix.
40
+ mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
41
+ # Define a dictionary to map the 'from' field in the conversation to a suffix.
42
+ end_mapper = {"system": "", "human": "", "gpt": ""}
43
+ # Iterate over each conversation.
44
+ for convo in convos:
45
+ # Format the conversation by joining each turn with its corresponding prefix and suffix.
46
+ # Append the EOS token to the end of the conversation.
47
+ text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
48
+ texts.append(f"{text}{EOS_TOKEN}")
49
+ # Return the formatted texts.
50
+ return {"text": texts}
51
+
52
+ dataset = dataset.map(formatting_prompts_func, batched=True)
53
+
54
+ args = TrainingArguments(
55
+ evaluation_strategy="steps",
56
+ per_device_train_batch_size=per_device_train_batch_size,
57
+ gradient_accumulation_steps=gradient_accumulation_steps,
58
+ gradient_checkpointing=True,
59
+ learning_rate=learning_rate,
60
+ fp16 = not torch.cuda.is_bf16_supported(),
61
+ bf16 = torch.cuda.is_bf16_supported(),
62
+ max_steps=-1,
63
+ num_train_epochs=num_train_epochs,
64
+ save_strategy="epoch",
65
+ logging_steps=10,
66
+ output_dir=NEW_MODEL_NAME,
67
+ optim="paged_adamw_32bit",
68
+ lr_scheduler_type="linear"
69
+ )
70
+
71
+ trainer = SFTTrainer(
72
+ model=model,
73
+ args=args,
74
+ train_dataset=dataset,
75
+ dataset_text_field="text",
76
+ max_seq_length=MAX_SEQ_LENGTH,
77
+ formatting_func=formatting_prompts_func
78
+ )
79
+
80
+ import gc
81
+ import os
82
+
83
+ gc.collect()
84
+ torch.cuda.empty_cache()
85
+
86
+ trainer.train()
prompting/.ipynb_checkpoints/training-checkpoint.ipynb ADDED
@@ -0,0 +1,278 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "12315053-0630-4d3d-8028-02035c2dbf14",
6
+ "metadata": {
7
+ "jp-MarkdownHeadingCollapsed": true
8
+ },
9
+ "source": [
10
+ "## Slide Speech Dataset"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "f8eed0bf-d822-4091-8762-df6582095ab4",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "\"\"\"\n",
21
+ "Dir Structure:\n",
22
+ " - data\n",
23
+ " - info\n",
24
+ " - test\n",
25
+ " - val\n",
26
+ "\"\"\""
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "id": "e999f437-d756-492d-b873-6ee656279b53",
32
+ "metadata": {
33
+ "jp-MarkdownHeadingCollapsed": true
34
+ },
35
+ "source": [
36
+ "## Phi 3 Tinkering"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": null,
42
+ "id": "3f5f1033-9118-4106-a7fb-6c3b527fe075",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "\"\"\"\n",
47
+ "Prompt template for Phi-3\n",
48
+ "<|system|>\n",
49
+ "You are a python developer.<|end|>\n",
50
+ "<|user|>\n",
51
+ "Help me generate a bubble sort algorithm<|end|>\n",
52
+ "<|assistant|>\n",
53
+ "\"\"\""
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": null,
59
+ "id": "1e9e81b0-ae3d-46f1-97ff-138984d07a28",
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "import torch\n",
64
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
65
+ "\n",
66
+ "cache_dir = \"./../cache\"\n",
67
+ "model_id = \"microsoft/Phi-3-mini-4k-instruct\"\n",
68
+ "device = \"cuda:0\"\n",
69
+ "dtype = torch.float16\n",
70
+ "\n",
71
+ "model = AutoModelForCausalLM.from_pretrained(\n",
72
+ " model_id,\n",
73
+ " device_map = device,\n",
74
+ " torch_dtype = dtype,\n",
75
+ " trust_remote_code=True,\n",
76
+ " cache_dir = cache_dir,\n",
77
+ " attn_implementation = \"flash_attention_2\"\n",
78
+ ")\n",
79
+ "tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir = cache_dir)\n",
80
+ "\n",
81
+ "pipe = pipeline(\n",
82
+ " \"text-generation\",\n",
83
+ " model = model,\n",
84
+ " tokenizer = tokenizer\n",
85
+ ")"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "93b9fb26-4661-4a35-ad62-b87834f577bc",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "messages = [\n",
96
+ " {\"role\": \"system\", \"content\": \"You are a python developer\"},\n",
97
+ " {\"role\": \"user\", \"content\": \"Help me generate a bubble sort algorithm\"}\n",
98
+ "]\n",
99
+ "\n",
100
+ "generation_args = {\n",
101
+ " \"max_new_tokens\": 600,\n",
102
+ " \"return_full_text\": False,\n",
103
+ " \"temperature\": 1.0,\n",
104
+ " \"do_sample\": True\n",
105
+ "}\n",
106
+ "\n",
107
+ "output = pipe(messages, **generation_args)\n",
108
+ "print(output[0][\"generated_text\"])"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "markdown",
113
+ "id": "62db724a-9e20-422d-a5b3-dd55cae55cc7",
114
+ "metadata": {
115
+ "jp-MarkdownHeadingCollapsed": true
116
+ },
117
+ "source": [
118
+ "## Training Phi 3"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "2036e6b5-c794-4668-9a79-8a53a2736cfa",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig\n",
129
+ "from huggingface_hub import ModelCard, ModelCardData, HfApi\n",
130
+ "from datasets import load_dataset\n",
131
+ "from jinja2 import Template\n",
132
+ "from trl import SFTTrainer\n",
133
+ "import yaml\n",
134
+ "import torch"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "id": "2dcff92e-ab3e-407a-8595-31ffae5f7acd",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "# Model Configs\n",
145
+ "MODEL_ID = \"microsoft/Phi-3-mini-4k-instruct\"\n",
146
+ "NEW_MODEL_NAME = \"opus-phi-3-mini-4k-instruct\"\n",
147
+ "CACHE_DIR = \"./../cache\"\n",
148
+ "\n",
149
+ "# Dataset Configs\n",
150
+ "DATASET_NAME = \"\"\n",
151
+ "SPLIT = \"train\"\n",
152
+ "\n",
153
+ "# the maximum length of the sequences that the model will handle\n",
154
+ "MAX_SEQ_LENGTH = 4096\n",
155
+ "num_train_epochs = 1\n",
156
+ "license = \"apache-2.0\"\n",
157
+ "username = \"darshanmakwana412\"\n",
158
+ "learning_rate = 1.41e-5\n",
159
+ "per_device_train_batch_size = 4\n",
160
+ "gradient_accumulation_steps = 1\n",
161
+ "\n",
162
+ "# If bd16 is supported use bf16 otherwise use f16\n",
163
+ "if torch.cuda.is_bf16_supported():\n",
164
+ " compute_dtype = torch.bfloat16\n",
165
+ "else:\n",
166
+ " compute_dtype = torch.float16"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "id": "0fbcf3c6-dd94-4133-a805-910d57c9f974",
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)\n",
177
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=CACHE_DIR, trust_remote_code=True)\n",
178
+ "# dataset = load_dataset(DATASET_NAME, split=SPLIT)\n",
179
+ "\n",
180
+ "# EOS Token is used to mark the end of a sentence\n",
181
+ "EOS_TOKEN=tokenizer.eos_token_id"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": null,
187
+ "id": "e31954ea-1838-4992-9132-32e59c42a128",
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "def formatting_prompts_func(examples):\n",
192
+ " # Extract the conversations from the examples.\n",
193
+ " convos = examples[\"conversations\"]\n",
194
+ " # Initialize an empty list to store the formatted texts.\n",
195
+ " texts = []\n",
196
+ " # Define a dictionary to map the 'from' field in the conversation to a prefix.\n",
197
+ " mapper = {\"system\": \"system\\n\", \"human\": \"\\nuser\\n\", \"gpt\": \"\\nassistant\\n\"}\n",
198
+ " # Define a dictionary to map the 'from' field in the conversation to a suffix.\n",
199
+ " end_mapper = {\"system\": \"\", \"human\": \"\", \"gpt\": \"\"}\n",
200
+ " # Iterate over each conversation.\n",
201
+ " for convo in convos:\n",
202
+ " # Format the conversation by joining each turn with its corresponding prefix and suffix.\n",
203
+ " # Append the EOS token to the end of the conversation.\n",
204
+ " text = \"\".join(f\"{mapper[(turn := x['from'])]} {x['value']}\\n{end_mapper[turn]}\" for x in convo)\n",
205
+ " texts.append(f\"{text}{EOS_TOKEN}\")\n",
206
+ " # Return the formatted texts.\n",
207
+ " return {\"text\": texts}\n",
208
+ "\n",
209
+ "dataset = dataset.map(formatting_prompts_func, batched=True)"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": null,
215
+ "id": "3086c2a4-7cca-461e-894c-376046089fab",
216
+ "metadata": {},
217
+ "outputs": [],
218
+ "source": [
219
+ "args = TrainingArguments(\n",
220
+ " evaluation_strategy=\"steps\",\n",
221
+ " per_device_train_batch_size=7,\n",
222
+ " gradient_accumulation_steps=4,\n",
223
+ " gradient_checkpointing=True,\n",
224
+ " learning_rate=1e-4,\n",
225
+ " fp16 = not torch.cuda.is_bf16_supported(),\n",
226
+ " bf16 = torch.cuda.is_bf16_supported(),\n",
227
+ " max_steps=-1,\n",
228
+ " num_train_epochs=3,\n",
229
+ " save_strategy=\"epoch\",\n",
230
+ " logging_steps=10,\n",
231
+ " output_dir=NEW_MODEL_NAME,\n",
232
+ " optim=\"paged_adamw_32bit\",\n",
233
+ " lr_scheduler_type=\"linear\"\n",
234
+ ")\n",
235
+ "\n",
236
+ "trainer = SFTTrainer(\n",
237
+ " model=model,\n",
238
+ " args=args,\n",
239
+ " train_dataset=dataset,\n",
240
+ " dataset_text_field=\"text\",\n",
241
+ " max_seq_length=128,\n",
242
+ " formatting_func=formatting_prompts_func\n",
243
+ ")\n",
244
+ "\n",
245
+ "device = \"cuda:0\"\n",
246
+ "\n",
247
+ "import gc\n",
248
+ "import os\n",
249
+ "\n",
250
+ "gc.collect()\n",
251
+ "torch.cuda.empty_cache()\n",
252
+ "\n",
253
+ "trainer.train()"
254
+ ]
255
+ }
256
+ ],
257
+ "metadata": {
258
+ "kernelspec": {
259
+ "display_name": "Python 3 (ipykernel)",
260
+ "language": "python",
261
+ "name": "python3"
262
+ },
263
+ "language_info": {
264
+ "codemirror_mode": {
265
+ "name": "ipython",
266
+ "version": 3
267
+ },
268
+ "file_extension": ".py",
269
+ "mimetype": "text/x-python",
270
+ "name": "python",
271
+ "nbconvert_exporter": "python",
272
+ "pygments_lexer": "ipython3",
273
+ "version": "3.8.10"
274
+ }
275
+ },
276
+ "nbformat": 4,
277
+ "nbformat_minor": 5
278
+ }
prompting/RepCodec/.gitignore ADDED
@@ -0,0 +1,160 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
prompting/RepCodec/.ipynb_checkpoints/tinker-checkpoint.ipynb ADDED
@@ -0,0 +1,267 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "997bed07-1181-4562-962a-cb8aa18e1d16",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from repcodec.RepCodec import RepCodec\n",
11
+ "import torch\n",
12
+ "import yaml\n",
13
+ "\n",
14
+ "config = \"repcodec/configs/repcodec_dim1024.yaml\"\n",
15
+ "with open(config) as fp:\n",
16
+ " conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
17
+ "\n",
18
+ "model = RepCodec(**conf)\n",
19
+ "model.load_state_dict(torch.load(\"./../models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n",
20
+ "model.quantizer.initial()\n",
21
+ "model.eval()"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "id": "5c5516f1-3565-4080-8612-d5ce52ea2a4d",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "# input shape: (batch size, hidden dim, sequence length)\n",
32
+ "random_features = torch.randn(size=(1, 1024, 100))\n",
33
+ "with torch.no_grad():\n",
34
+ " x = model.encoder(random_features)\n",
35
+ " z = model.projector(x)\n",
36
+ " _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
37
+ " tokens = idx.cpu().data.numpy().tolist()[0]"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "markdown",
42
+ "id": "439ecea7-f0d4-4a61-80c2-729138beee32",
43
+ "metadata": {},
44
+ "source": [
45
+ "## Dump Representations"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "6efa1891-0810-4cfb-9552-764297209e99",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "python3 examples/dump_feature.py --model_type data2vec --tsv_path \"./files/train.clean.100.tsv\" --ckpt_path \"./../models/vox_pretrained.pt\" --layer 18 --feat_dir \"./features/train.clean.100\""
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "id": "cbd1c550-0606-4217-ac65-55ae92843f19",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "from datasets import load_dataset\n",
66
+ "from tqdm import tqdm\n",
67
+ "import pandas as pd\n",
68
+ "\n",
69
+ "cache_dir = \"./../../cache\"\n",
70
+ "\n",
71
+ "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)\n",
72
+ "\n",
73
+ "# for split in dataset.keys():\n",
74
+ "# data = dataset[split]\n",
75
+ "# num_frames = []\n",
76
+ "# for idx in tqdm(range(len(data))):\n",
77
+ "# audio = data[idx][\"audio\"]\n",
78
+ "# num_frames.append(int(len(audio[\"array\"]) * 16000 // audio[\"sampling_rate\"]))\n",
79
+ " \n",
80
+ "# df = pd.DataFrame.from_dict({\n",
81
+ "# \"file_path\": list(data[\"file\"]),\n",
82
+ "# \"num_frames\": num_frames\n",
83
+ "# })\n",
84
+ "# df.to_csv(f\"./files/{split}.tsv\", sep=\"\\t\", index=False)"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": null,
90
+ "id": "5b4af1af-5726-4899-8272-dfe867cb48a8",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "dataset[\"train.clean.100\"][0]"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "markdown",
99
+ "id": "ae6a0ef4-8c0a-4f6e-9a81-a9c3350e1266",
100
+ "metadata": {},
101
+ "source": [
102
+ "## Prepare the Dataset"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "id": "b1247988-5eaa-492a-a3ab-2b11505126a6",
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "from datasets import Dataset, load_dataset\n",
113
+ "from collections import defaultdict\n",
114
+ "from tqdm import tqdm\n",
115
+ "import numpy as np\n",
116
+ "import string\n",
117
+ "\n",
118
+ "cache_dir = \"./../../cache\"\n",
119
+ "\n",
120
+ "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)\n",
121
+ "\n",
122
+ "def process(text):\n",
123
+ "\n",
124
+ " # Lower case every letter\n",
125
+ " text = text.lower()\n",
126
+ "\n",
127
+ " # Remove punctuation\n",
128
+ " punctuation_to_remove = string.punctuation.replace(\"'\", \"\")\n",
129
+ " translation_table = str.maketrans('', '', punctuation_to_remove)\n",
130
+ " text = text.translate(translation_table)\n",
131
+ "\n",
132
+ " # Remove whitespaces from front and behind\n",
133
+ " while text[0] == ' ' or text[-1] == ' ':\n",
134
+ " if text[0] == ' ':\n",
135
+ " text = text[1:]\n",
136
+ " if text[-1] == ' ':\n",
137
+ " text = text[:-1]\n",
138
+ " \n",
139
+ " return text\n",
140
+ "\n",
141
+ "dataset = dataset.remove_columns([\"audio\", \"speaker_id\", \"chapter_id\"])\n",
142
+ "\n",
143
+ "tokenized_ds = defaultdict(lambda: [])\n",
144
+ "\n",
145
+ "for split in dataset.keys():\n",
146
+ "\n",
147
+ " texts = []\n",
148
+ " tokens = []\n",
149
+ " tkns = np.load(f\"./examples/tkns/{split}.npz\")\n",
150
+ "\n",
151
+ " for idx, key in enumerate(tqdm(tkns.files)):\n",
152
+ " tokens.append(list(tkns[key]))\n",
153
+ " texts.append(process(dataset[split][idx][\"text\"]))\n",
154
+ "\n",
155
+ " tokenized_ds[split] = Dataset.from_dict({\n",
156
+ " \"text\": texts,\n",
157
+ " \"audio_tokens\": tokens\n",
158
+ " })"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "id": "bfc82444-3081-4138-aa06-6fb0b7cbc6c3",
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "from datasets import dataset_dict, DatasetDict\n",
169
+ "\n",
170
+ "tds = DatasetDict(tokenized_ds)"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "id": "006171b9-d479-4462-9642-d126f77edfc2",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "tds.save_to_disk(\"librispeech_tokenized.hf\")"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": 2,
186
+ "id": "12970376-4f6f-4926-a954-29c32043b64c",
187
+ "metadata": {},
188
+ "outputs": [
189
+ {
190
+ "ename": "ValueError",
191
+ "evalue": "Couldn't infer the same data file format for all splits. Got {NamedSplit('train'): ('arrow', {}), NamedSplit('validation'): ('json', {}), NamedSplit('test'): ('json', {})}",
192
+ "output_type": "error",
193
+ "traceback": [
194
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
195
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
196
+ "Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[0;32m----> 3\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./librispeech_tokenized.hf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
197
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/datasets/load.py:2594\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)\u001b[0m\n\u001b[1;32m 2589\u001b[0m verification_mode \u001b[38;5;241m=\u001b[39m VerificationMode(\n\u001b[1;32m 2590\u001b[0m (verification_mode \u001b[38;5;129;01mor\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mBASIC_CHECKS) \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m save_infos \u001b[38;5;28;01melse\u001b[39;00m VerificationMode\u001b[38;5;241m.\u001b[39mALL_CHECKS\n\u001b[1;32m 2591\u001b[0m )\n\u001b[1;32m 2593\u001b[0m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[0;32m-> 2594\u001b[0m builder_instance \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2595\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2596\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2597\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2598\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2599\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2600\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2601\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2602\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2603\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2604\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2605\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2606\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2607\u001b[0m \u001b[43m \u001b[49m\u001b[43m_require_default_config_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 2608\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2609\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2611\u001b[0m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[1;32m 2612\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n",
198
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/datasets/load.py:2266\u001b[0m, in \u001b[0;36mload_dataset_builder\u001b[0;34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, token, use_auth_token, storage_options, trust_remote_code, _require_default_config_name, **config_kwargs)\u001b[0m\n\u001b[1;32m 2264\u001b[0m download_config \u001b[38;5;241m=\u001b[39m download_config\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m download_config \u001b[38;5;28;01melse\u001b[39;00m DownloadConfig()\n\u001b[1;32m 2265\u001b[0m download_config\u001b[38;5;241m.\u001b[39mstorage_options\u001b[38;5;241m.\u001b[39mupdate(storage_options)\n\u001b[0;32m-> 2266\u001b[0m dataset_module \u001b[38;5;241m=\u001b[39m \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2267\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2268\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2269\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2270\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2271\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2272\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2273\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2274\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrust_remote_code\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2275\u001b[0m \u001b[43m \u001b[49m\u001b[43m_require_default_config_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_require_default_config_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2276\u001b[0m \u001b[43m \u001b[49m\u001b[43m_require_custom_configs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mbool\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2277\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2278\u001b[0m \u001b[38;5;66;03m# Get dataset builder class from the processing script\u001b[39;00m\n\u001b[1;32m 2279\u001b[0m builder_kwargs \u001b[38;5;241m=\u001b[39m dataset_module\u001b[38;5;241m.\u001b[39mbuilder_kwargs\n",
199
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/datasets/load.py:1825\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, cache_dir, trust_remote_code, _require_default_config_name, _require_custom_configs, **download_kwargs)\u001b[0m\n\u001b[1;32m 1818\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m LocalDatasetModuleFactoryWithScript(\n\u001b[1;32m 1819\u001b[0m combined_path,\n\u001b[1;32m 1820\u001b[0m download_mode\u001b[38;5;241m=\u001b[39mdownload_mode,\n\u001b[1;32m 1821\u001b[0m dynamic_modules_path\u001b[38;5;241m=\u001b[39mdynamic_modules_path,\n\u001b[1;32m 1822\u001b[0m trust_remote_code\u001b[38;5;241m=\u001b[39mtrust_remote_code,\n\u001b[1;32m 1823\u001b[0m )\u001b[38;5;241m.\u001b[39mget_module()\n\u001b[1;32m 1824\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39misdir(path):\n\u001b[0;32m-> 1825\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mLocalDatasetModuleFactoryWithoutScript\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1826\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\n\u001b[1;32m 1827\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_module\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1828\u001b[0m \u001b[38;5;66;03m# Try remotely\u001b[39;00m\n\u001b[1;32m 1829\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_relative_path(path) \u001b[38;5;129;01mand\u001b[39;00m path\u001b[38;5;241m.\u001b[39mcount(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m:\n",
200
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/datasets/load.py:1040\u001b[0m, in \u001b[0;36mLocalDatasetModuleFactoryWithoutScript.get_module\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1034\u001b[0m patterns \u001b[38;5;241m=\u001b[39m get_data_patterns(base_path)\n\u001b[1;32m 1035\u001b[0m data_files \u001b[38;5;241m=\u001b[39m DataFilesDict\u001b[38;5;241m.\u001b[39mfrom_patterns(\n\u001b[1;32m 1036\u001b[0m patterns,\n\u001b[1;32m 1037\u001b[0m base_path\u001b[38;5;241m=\u001b[39mbase_path,\n\u001b[1;32m 1038\u001b[0m allowed_extensions\u001b[38;5;241m=\u001b[39mALL_ALLOWED_EXTENSIONS,\n\u001b[1;32m 1039\u001b[0m )\n\u001b[0;32m-> 1040\u001b[0m module_name, default_builder_kwargs \u001b[38;5;241m=\u001b[39m \u001b[43minfer_module_for_data_files\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1041\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1042\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1043\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1044\u001b[0m data_files \u001b[38;5;241m=\u001b[39m data_files\u001b[38;5;241m.\u001b[39mfilter_extensions(_MODULE_TO_EXTENSIONS[module_name])\n\u001b[1;32m 1045\u001b[0m \u001b[38;5;66;03m# Collect metadata files if the module supports them\u001b[39;00m\n",
201
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/datasets/load.py:596\u001b[0m, in \u001b[0;36minfer_module_for_data_files\u001b[0;34m(data_files, path, download_config)\u001b[0m\n\u001b[1;32m 594\u001b[0m module_name, default_builder_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28miter\u001b[39m(split_modules\u001b[38;5;241m.\u001b[39mvalues()))\n\u001b[1;32m 595\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28many\u001b[39m((module_name, default_builder_kwargs) \u001b[38;5;241m!=\u001b[39m split_module \u001b[38;5;28;01mfor\u001b[39;00m split_module \u001b[38;5;129;01min\u001b[39;00m split_modules\u001b[38;5;241m.\u001b[39mvalues()):\n\u001b[0;32m--> 596\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt infer the same data file format for all splits. Got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msplit_modules\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 597\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m module_name:\n\u001b[1;32m 598\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m DataFilesNotFoundError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo (supported) data files found\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m (\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m path \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n",
202
+ "\u001b[0;31mValueError\u001b[0m: Couldn't infer the same data file format for all splits. Got {NamedSplit('train'): ('arrow', {}), NamedSplit('validation'): ('json', {}), NamedSplit('test'): ('json', {})}"
203
+ ]
204
+ }
205
+ ],
206
+ "source": [
207
+ "from datasets import load_dataset\n",
208
+ "\n",
209
+ "dataset = load_dataset(\"./librispeech_tokenized.hf\")"
210
+ ]
211
+ },
212
+ {
213
+ "cell_type": "code",
214
+ "execution_count": 8,
215
+ "id": "b3ba0d58-b788-43b5-87a7-726aaa12dbbd",
216
+ "metadata": {},
217
+ "outputs": [],
218
+ "source": [
219
+ "from datasets import dataset_dict, DatasetDict, Dataset\n",
220
+ "\n",
221
+ "dataset = DatasetDict.load_from_disk(\"./librispeech_tokenized.hf\")"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": 13,
227
+ "id": "b7239186-73ae-407a-b9f6-b5a16f3a7ddc",
228
+ "metadata": {},
229
+ "outputs": [
230
+ {
231
+ "data": {
232
+ "text/plain": [
233
+ "726"
234
+ ]
235
+ },
236
+ "execution_count": 13,
237
+ "metadata": {},
238
+ "output_type": "execute_result"
239
+ }
240
+ ],
241
+ "source": [
242
+ "len(dataset[\"train.clean.100\"][0][\"audio_tokens\"])"
243
+ ]
244
+ }
245
+ ],
246
+ "metadata": {
247
+ "kernelspec": {
248
+ "display_name": "Python 3 (ipykernel)",
249
+ "language": "python",
250
+ "name": "python3"
251
+ },
252
+ "language_info": {
253
+ "codemirror_mode": {
254
+ "name": "ipython",
255
+ "version": 3
256
+ },
257
+ "file_extension": ".py",
258
+ "mimetype": "text/x-python",
259
+ "name": "python",
260
+ "nbconvert_exporter": "python",
261
+ "pygments_lexer": "ipython3",
262
+ "version": "3.8.10"
263
+ }
264
+ },
265
+ "nbformat": 4,
266
+ "nbformat_minor": 5
267
+ }
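The traceback in the notebook above comes from `load_dataset`, which scans the directory for raw data files and tries to infer a single file format per split; here the `train` split resolved to Arrow while `validation` and `test` resolved to JSON, so no common format could be inferred. The later cell switches to `DatasetDict.load_from_disk`, which is the matching reader for datasets written with `save_to_disk`. A minimal sketch of that working path, reusing only the notebook's own directory and field names:

```python
from datasets import DatasetDict

# load_from_disk reads the metadata written by save_to_disk, so it does not need
# to infer a per-split file format the way load_dataset does.
dataset = DatasetDict.load_from_disk("./librispeech_tokenized.hf")

# Same inspection as in the notebook: length of one utterance's token sequence.
print(len(dataset["train.clean.100"][0]["audio_tokens"]))
```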
prompting/RepCodec/LICENSE ADDED
@@ -0,0 +1,428 @@
1
+ MIT License
2
+
3
+ Copyright (c) ByteDance, Inc. and its affiliates.
4
+ Copyright (c) Chutong Meng
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+ Attribution-NonCommercial 4.0 International
31
+
32
+ =======================================================================
33
+
34
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
35
+ does not provide legal services or legal advice. Distribution of
36
+ Creative Commons public licenses does not create a lawyer-client or
37
+ other relationship. Creative Commons makes its licenses and related
38
+ information available on an "as-is" basis. Creative Commons gives no
39
+ warranties regarding its licenses, any material licensed under their
40
+ terms and conditions, or any related information. Creative Commons
41
+ disclaims all liability for damages resulting from their use to the
42
+ fullest extent possible.
43
+
44
+ Using Creative Commons Public Licenses
45
+
46
+ Creative Commons public licenses provide a standard set of terms and
47
+ conditions that creators and other rights holders may use to share
48
+ original works of authorship and other material subject to copyright
49
+ and certain other rights specified in the public license below. The
50
+ following considerations are for informational purposes only, are not
51
+ exhaustive, and do not form part of our licenses.
52
+
53
+ Considerations for licensors: Our public licenses are
54
+ intended for use by those authorized to give the public
55
+ permission to use material in ways otherwise restricted by
56
+ copyright and certain other rights. Our licenses are
57
+ irrevocable. Licensors should read and understand the terms
58
+ and conditions of the license they choose before applying it.
59
+ Licensors should also secure all rights necessary before
60
+ applying our licenses so that the public can reuse the
61
+ material as expected. Licensors should clearly mark any
62
+ material not subject to the license. This includes other CC-
63
+ licensed material, or material used under an exception or
64
+ limitation to copyright. More considerations for licensors:
65
+ wiki.creativecommons.org/Considerations_for_licensors
66
+
67
+ Considerations for the public: By using one of our public
68
+ licenses, a licensor grants the public permission to use the
69
+ licensed material under specified terms and conditions. If
70
+ the licensor's permission is not necessary for any reason--for
71
+ example, because of any applicable exception or limitation to
72
+ copyright--then that use is not regulated by the license. Our
73
+ licenses grant only permissions under copyright and certain
74
+ other rights that a licensor has authority to grant. Use of
75
+ the licensed material may still be restricted for other
76
+ reasons, including because others have copyright or other
77
+ rights in the material. A licensor may make special requests,
78
+ such as asking that all changes be marked or described.
79
+ Although not required by our licenses, you are encouraged to
80
+ respect those requests where reasonable. More_considerations
81
+ for the public:
82
+ wiki.creativecommons.org/Considerations_for_licensees
83
+
84
+ =======================================================================
85
+
86
+ Creative Commons Attribution-NonCommercial 4.0 International Public
87
+ License
88
+
89
+ By exercising the Licensed Rights (defined below), You accept and agree
90
+ to be bound by the terms and conditions of this Creative Commons
91
+ Attribution-NonCommercial 4.0 International Public License ("Public
92
+ License"). To the extent this Public License may be interpreted as a
93
+ contract, You are granted the Licensed Rights in consideration of Your
94
+ acceptance of these terms and conditions, and the Licensor grants You
95
+ such rights in consideration of benefits the Licensor receives from
96
+ making the Licensed Material available under these terms and
97
+ conditions.
98
+
99
+ Section 1 -- Definitions.
100
+
101
+ a. Adapted Material means material subject to Copyright and Similar
102
+ Rights that is derived from or based upon the Licensed Material
103
+ and in which the Licensed Material is translated, altered,
104
+ arranged, transformed, or otherwise modified in a manner requiring
105
+ permission under the Copyright and Similar Rights held by the
106
+ Licensor. For purposes of this Public License, where the Licensed
107
+ Material is a musical work, performance, or sound recording,
108
+ Adapted Material is always produced where the Licensed Material is
109
+ synched in timed relation with a moving image.
110
+
111
+ b. Adapter's License means the license You apply to Your Copyright
112
+ and Similar Rights in Your contributions to Adapted Material in
113
+ accordance with the terms and conditions of this Public License.
114
+
115
+ c. Copyright and Similar Rights means copyright and/or similar rights
116
+ closely related to copyright including, without limitation,
117
+ performance, broadcast, sound recording, and Sui Generis Database
118
+ Rights, without regard to how the rights are labeled or
119
+ categorized. For purposes of this Public License, the rights
120
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
121
+ Rights.
122
+ d. Effective Technological Measures means those measures that, in the
123
+ absence of proper authority, may not be circumvented under laws
124
+ fulfilling obligations under Article 11 of the WIPO Copyright
125
+ Treaty adopted on December 20, 1996, and/or similar international
126
+ agreements.
127
+
128
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
129
+ any other exception or limitation to Copyright and Similar Rights
130
+ that applies to Your use of the Licensed Material.
131
+
132
+ f. Licensed Material means the artistic or literary work, database,
133
+ or other material to which the Licensor applied this Public
134
+ License.
135
+
136
+ g. Licensed Rights means the rights granted to You subject to the
137
+ terms and conditions of this Public License, which are limited to
138
+ all Copyright and Similar Rights that apply to Your use of the
139
+ Licensed Material and that the Licensor has authority to license.
140
+
141
+ h. Licensor means the individual(s) or entity(ies) granting rights
142
+ under this Public License.
143
+
144
+ i. NonCommercial means not primarily intended for or directed towards
145
+ commercial advantage or monetary compensation. For purposes of
146
+ this Public License, the exchange of the Licensed Material for
147
+ other material subject to Copyright and Similar Rights by digital
148
+ file-sharing or similar means is NonCommercial provided there is
149
+ no payment of monetary compensation in connection with the
150
+ exchange.
151
+
152
+ j. Share means to provide material to the public by any means or
153
+ process that requires permission under the Licensed Rights, such
154
+ as reproduction, public display, public performance, distribution,
155
+ dissemination, communication, or importation, and to make material
156
+ available to the public including in ways that members of the
157
+ public may access the material from a place and at a time
158
+ individually chosen by them.
159
+
160
+ k. Sui Generis Database Rights means rights other than copyright
161
+ resulting from Directive 96/9/EC of the European Parliament and of
162
+ the Council of 11 March 1996 on the legal protection of databases,
163
+ as amended and/or succeeded, as well as other essentially
164
+ equivalent rights anywhere in the world.
165
+
166
+ l. You means the individual or entity exercising the Licensed Rights
167
+ under this Public License. Your has a corresponding meaning.
168
+
169
+ Section 2 -- Scope.
170
+
171
+ a. License grant.
172
+
173
+ 1. Subject to the terms and conditions of this Public License,
174
+ the Licensor hereby grants You a worldwide, royalty-free,
175
+ non-sublicensable, non-exclusive, irrevocable license to
176
+ exercise the Licensed Rights in the Licensed Material to:
177
+
178
+ a. reproduce and Share the Licensed Material, in whole or
179
+ in part, for NonCommercial purposes only; and
180
+
181
+ b. produce, reproduce, and Share Adapted Material for
182
+ NonCommercial purposes only.
183
+
184
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
185
+ Exceptions and Limitations apply to Your use, this Public
186
+ License does not apply, and You do not need to comply with
187
+ its terms and conditions.
188
+
189
+ 3. Term. The term of this Public License is specified in Section
190
+ 6(a).
191
+
192
+ 4. Media and formats; technical modifications allowed. The
193
+ Licensor authorizes You to exercise the Licensed Rights in
194
+ all media and formats whether now known or hereafter created,
195
+ and to make technical modifications necessary to do so. The
196
+ Licensor waives and/or agrees not to assert any right or
197
+ authority to forbid You from making technical modifications
198
+ necessary to exercise the Licensed Rights, including
199
+ technical modifications necessary to circumvent Effective
200
+ Technological Measures. For purposes of this Public License,
201
+ simply making modifications authorized by this Section 2(a)
202
+ (4) never produces Adapted Material.
203
+
204
+ 5. Downstream recipients.
205
+
206
+ a. Offer from the Licensor -- Licensed Material. Every
207
+ recipient of the Licensed Material automatically
208
+ receives an offer from the Licensor to exercise the
209
+ Licensed Rights under the terms and conditions of this
210
+ Public License.
211
+
212
+ b. No downstream restrictions. You may not offer or impose
213
+ any additional or different terms or conditions on, or
214
+ apply any Effective Technological Measures to, the
215
+ Licensed Material if doing so restricts exercise of the
216
+ Licensed Rights by any recipient of the Licensed
217
+ Material.
218
+
219
+ 6. No endorsement. Nothing in this Public License constitutes or
220
+ may be construed as permission to assert or imply that You
221
+ are, or that Your use of the Licensed Material is, connected
222
+ with, or sponsored, endorsed, or granted official status by,
223
+ the Licensor or others designated to receive attribution as
224
+ provided in Section 3(a)(1)(A)(i).
225
+
226
+ b. Other rights.
227
+
228
+ 1. Moral rights, such as the right of integrity, are not
229
+ licensed under this Public License, nor are publicity,
230
+ privacy, and/or other similar personality rights; however, to
231
+ the extent possible, the Licensor waives and/or agrees not to
232
+ assert any such rights held by the Licensor to the limited
233
+ extent necessary to allow You to exercise the Licensed
234
+ Rights, but not otherwise.
235
+
236
+ 2. Patent and trademark rights are not licensed under this
237
+ Public License.
238
+
239
+ 3. To the extent possible, the Licensor waives any right to
240
+ collect royalties from You for the exercise of the Licensed
241
+ Rights, whether directly or through a collecting society
242
+ under any voluntary or waivable statutory or compulsory
243
+ licensing scheme. In all other cases the Licensor expressly
244
+ reserves any right to collect such royalties, including when
245
+ the Licensed Material is used other than for NonCommercial
246
+ purposes.
247
+
248
+ Section 3 -- License Conditions.
249
+
250
+ Your exercise of the Licensed Rights is expressly made subject to the
251
+ following conditions.
252
+
253
+ a. Attribution.
254
+
255
+ 1. If You Share the Licensed Material (including in modified
256
+ form), You must:
257
+
258
+ a. retain the following if it is supplied by the Licensor
259
+ with the Licensed Material:
260
+
261
+ i. identification of the creator(s) of the Licensed
262
+ Material and any others designated to receive
263
+ attribution, in any reasonable manner requested by
264
+ the Licensor (including by pseudonym if
265
+ designated);
266
+
267
+ ii. a copyright notice;
268
+
269
+ iii. a notice that refers to this Public License;
270
+
271
+ iv. a notice that refers to the disclaimer of
272
+ warranties;
273
+
274
+ v. a URI or hyperlink to the Licensed Material to the
275
+ extent reasonably practicable;
276
+
277
+ b. indicate if You modified the Licensed Material and
278
+ retain an indication of any previous modifications; and
279
+
280
+ c. indicate the Licensed Material is licensed under this
281
+ Public License, and include the text of, or the URI or
282
+ hyperlink to, this Public License.
283
+
284
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
285
+ reasonable manner based on the medium, means, and context in
286
+ which You Share the Licensed Material. For example, it may be
287
+ reasonable to satisfy the conditions by providing a URI or
288
+ hyperlink to a resource that includes the required
289
+ information.
290
+
291
+ 3. If requested by the Licensor, You must remove any of the
292
+ information required by Section 3(a)(1)(A) to the extent
293
+ reasonably practicable.
294
+
295
+ 4. If You Share Adapted Material You produce, the Adapter's
296
+ License You apply must not prevent recipients of the Adapted
297
+ Material from complying with this Public License.
298
+
299
+ Section 4 -- Sui Generis Database Rights.
300
+
301
+ Where the Licensed Rights include Sui Generis Database Rights that
302
+ apply to Your use of the Licensed Material:
303
+
304
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
305
+ to extract, reuse, reproduce, and Share all or a substantial
306
+ portion of the contents of the database for NonCommercial purposes
307
+ only;
308
+
309
+ b. if You include all or a substantial portion of the database
310
+ contents in a database in which You have Sui Generis Database
311
+ Rights, then the database in which You have Sui Generis Database
312
+ Rights (but not its individual contents) is Adapted Material; and
313
+
314
+ c. You must comply with the conditions in Section 3(a) if You Share
315
+ all or a substantial portion of the contents of the database.
316
+
317
+ For the avoidance of doubt, this Section 4 supplements and does not
318
+ replace Your obligations under this Public License where the Licensed
319
+ Rights include other Copyright and Similar Rights.
320
+
321
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
322
+
323
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
324
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
325
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
326
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
327
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
328
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
329
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
330
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
331
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
332
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
333
+
334
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
335
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
336
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
337
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
338
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
339
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
340
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
341
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
342
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
343
+
344
+ c. The disclaimer of warranties and limitation of liability provided
345
+ above shall be interpreted in a manner that, to the extent
346
+ possible, most closely approximates an absolute disclaimer and
347
+ waiver of all liability.
348
+
349
+ Section 6 -- Term and Termination.
350
+
351
+ a. This Public License applies for the term of the Copyright and
352
+ Similar Rights licensed here. However, if You fail to comply with
353
+ this Public License, then Your rights under this Public License
354
+ terminate automatically.
355
+
356
+ b. Where Your right to use the Licensed Material has terminated under
357
+ Section 6(a), it reinstates:
358
+
359
+ 1. automatically as of the date the violation is cured, provided
360
+ it is cured within 30 days of Your discovery of the
361
+ violation; or
362
+
363
+ 2. upon express reinstatement by the Licensor.
364
+
365
+ For the avoidance of doubt, this Section 6(b) does not affect any
366
+ right the Licensor may have to seek remedies for Your violations
367
+ of this Public License.
368
+
369
+ c. For the avoidance of doubt, the Licensor may also offer the
370
+ Licensed Material under separate terms or conditions or stop
371
+ distributing the Licensed Material at any time; however, doing so
372
+ will not terminate this Public License.
373
+
374
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
375
+ License.
376
+
377
+ Section 7 -- Other Terms and Conditions.
378
+
379
+ a. The Licensor shall not be bound by any additional or different
380
+ terms or conditions communicated by You unless expressly agreed.
381
+
382
+ b. Any arrangements, understandings, or agreements regarding the
383
+ Licensed Material not stated herein are separate from and
384
+ independent of the terms and conditions of this Public License.
385
+
386
+ Section 8 -- Interpretation.
387
+
388
+ a. For the avoidance of doubt, this Public License does not, and
389
+ shall not be interpreted to, reduce, limit, restrict, or impose
390
+ conditions on any use of the Licensed Material that could lawfully
391
+ be made without permission under this Public License.
392
+
393
+ b. To the extent possible, if any provision of this Public License is
394
+ deemed unenforceable, it shall be automatically reformed to the
395
+ minimum extent necessary to make it enforceable. If the provision
396
+ cannot be reformed, it shall be severed from this Public License
397
+ without affecting the enforceability of the remaining terms and
398
+ conditions.
399
+
400
+ c. No term or condition of this Public License will be waived and no
401
+ failure to comply consented to unless expressly agreed to by the
402
+ Licensor.
403
+
404
+ d. Nothing in this Public License constitutes or may be interpreted
405
+ as a limitation upon, or waiver of, any privileges and immunities
406
+ that apply to the Licensor or You, including from the legal
407
+ processes of any jurisdiction or authority.
408
+
409
+ =======================================================================
410
+
411
+ Creative Commons is not a party to its public
412
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
413
+ its public licenses to material it publishes and in those instances
414
+ will be considered the “Licensor.” The text of the Creative Commons
415
+ public licenses is dedicated to the public domain under the CC0 Public
416
+ Domain Dedication. Except for the limited purpose of indicating that
417
+ material is shared under a Creative Commons public license or as
418
+ otherwise permitted by the Creative Commons policies published at
419
+ creativecommons.org/policies, Creative Commons does not authorize the
420
+ use of the trademark "Creative Commons" or any other trademark or logo
421
+ of Creative Commons without its prior written consent including,
422
+ without limitation, in connection with any unauthorized modifications
423
+ to any of its public licenses or any other arrangements,
424
+ understandings, or agreements concerning use of licensed material. For
425
+ the avoidance of doubt, this paragraph does not form part of the
426
+ public licenses.
427
+
428
+ Creative Commons may be contacted at creativecommons.org.
prompting/RepCodec/README.md ADDED
@@ -0,0 +1,273 @@
1
+ # RepCodec: A Speech Representation Codec for Speech Tokenization
2
+
3
+ > [**RepCodec: A Speech Representation Codec for Speech Tokenization**](https://arxiv.org/abs/2309.00169)
4
+
5
+ ## Introduction
6
+
7
+ **RepCodec** is a speech tokenization method for converting a speech waveform into a sequence of discrete semantic
8
+ tokens.
9
+ The main idea is to train a representation codec which learns a vector quantization codebook through reconstructing the
10
+ input speech representations from speech encoders like HuBERT or data2vec.
11
+ Extensive experiments show that RepCodec significantly outperforms the widely used k-means clustering approach in both
12
+ speech understanding and generation.
13
+ Also, RepCodec generalizes well across various speech encoders and languages.
14
+
15
+ <img src="images/RepCodec.png" alt="se" width="1000" />
16
+
17
+ ## RepCodec Models
18
+
19
+ | Feature Type | Speech Data | RepCodec Model |
20
+ |-----------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------|----------------------------------------------------------------------------------------------------------|
21
+ | [HuBERT base](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models) layer 9 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [hubert_base_l9](https://drive.google.com/file/d/1XD0HKl607FFjri2-VJT7lHQeSpxsCCFO/view?usp=sharing) |
22
+ | [HuBERT large](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models) layer 18 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [hubert_large_l18](https://drive.google.com/file/d/1mTbm5GeJ7gp_5L3QLP-JGXdf8RnRw5n6/view?usp=sharing) |
23
+ | [data2vec base](https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/README.md#speech-2) layer 6 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [data2vec_base_l6](https://drive.google.com/file/d/1d8sf3Ko_fYM9zlaiwxK_4xusLRKV5EMd/view?usp=sharing) |
24
+ | [data2vec large](https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/README.md#speech-2) layer 18 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [data2vec_large_l18](https://drive.google.com/file/d/1nuRIHaejT-uVi4cluftbT8o_JZqar5SU/view?usp=sharing) |
25
+ | [Whisper medium](https://github.com/openai/whisper/tree/main#available-models-and-languages) layer 24 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [whisper_medium_l24](https://drive.google.com/file/d/1V6YJSA2V4iywXrecJAN0oqsa3aHowexZ/view?usp=sharing) |
26
+ | [Whisper large-v2](https://github.com/openai/whisper/tree/main#available-models-and-languages) layer 32 | [Librispeech](http://www.openslr.org/12) train-clean-100 | [whisper_large_l32](https://drive.google.com/file/d/1k_X7ZMPg8iOeDrIJe70v6CHfFygzufXC/view?usp=sharing) |
27
+
28
+ ## Speech Tokenization Using Pre-Trained Models
29
+
30
+ ### Installation
31
+
32
+ Please first install RepCodec by
33
+
34
+ ```
35
+ git clone https://github.com/mct10/RepCodec.git
36
+ cd RepCodec
37
+ pip install .
38
+ ```
39
+
40
+ We used Python 3.9.18 and PyTorch 1.12.1 to test the usage, but the code should be compatible with other recent Python
41
+ and PyTorch versions.
42
+
43
+ ### Representation Preparation
44
+
45
+ We adapt the `dump_hubert_feature.py` script
46
+ from [fairseq](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert/simple_kmeans#hubert-feature)
47
+ to support dumping representations from **data2vec**, **HuBERT**, or **Whisper** encoders.
48
+
49
+ If you use our script (`examples/dump_feature.py`), please also install the following packages:
50
+
51
+ ```
52
+ pip install npy_append_array soundfile
53
+ ```
54
+
55
+ Additionally, if you want to dump representations from
56
+
57
+ - **data2vec** or **HuBERT**: please
58
+ follow [fairseq's instruction](https://github.com/facebookresearch/fairseq#requirements-and-installation) to install
59
+ the latest fairseq.
60
+
61
+ - **Whisper**: please follow [Whisper's instructions](https://github.com/openai/whisper/tree/main#setup) to install the
62
+ latest
63
+ Whisper.
64
+
65
+ Then, you can follow the given examples to dump representations:
66
+
67
+ ```
68
+ # Example 1: dump from HuBERT base layer 9
69
+ # (for data2vec, simply change "model_type" to data2vec and "ckpt_path" to the path of data2vec model)
70
+
71
+ layer=9
72
+
73
+ python3 examples/dump_feature.py \
74
+ --model_type hubert \
75
+ --tsv_path /path/to/tsv/file \
76
+ --ckpt_path /path/to/HuBERT/model \
77
+ --layer ${layer} \
78
+ --feat_dir /dir/to/save/representations
79
+
80
+
81
+ # Example 2: dump from Whisper medium layer 24
82
+
83
+ layer=24
84
+
85
+ python3 examples/dump_feature.py \
86
+ --model_type whisper \
87
+ --tsv_path /path/to/tsv/file \
88
+ --whisper_root /directory/to/save/whisper/model \
89
+ --whisper_name medium \
90
+ --layer ${layer} \
91
+ --feat_dir /dir/to/save/representations
92
+ ```
93
+
94
+ Explanations about the args:
95
+
96
+ - **model_type:** choose from `data2vec`, `hubert`, and `whisper`.
97
+
98
+ - **tsv_path:** path of the tsv file.
99
+ Should have the format of
100
+
101
+ ```
102
+ /dir/to/dataset
103
+ path_of_utterance_1 number_of_frames
104
+ path_of_utterance_2 number_of_frames
105
+ ```
106
+
107
+ You can follow [this script](https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/wav2vec_manifest.py)
108
+ to generate the tsv file.
109
+
110
+ For example, by running
111
+
112
+ ```
113
+ python wav2vec_manifest.py \
114
+ /dir/to/LibriSpeech/dev-clean \
115
+ --dest /dir/to/manifest \
116
+ --ext flac \
117
+ --valid-percent 0
118
+ ```
119
+
120
+ you can obtain the `dev-clean.tsv` in `/dir/to/manifest` for LibriSpeech. (By default, the output file name
121
+ is `train.tsv`. Remember to rename the file.)
122
+
123
+ It should be similar to:
124
+
125
+ ```
126
+ /dir/to/LibriSpeech/dev-clean
127
+ 2277/149896/2277-149896-0026.flac 78720
128
+ 2277/149896/2277-149896-0005.flac 89600
129
+ 2277/149896/2277-149896-0033.flac 45520
130
+ ```
131
+
132
+ - **ckpt_path**:
133
+ must be provided for data2vec and HuBERT.
134
+ You need to download the model
135
+ from [data2vec website](https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/README.md#speech-2)
136
+ or [HuBERT website](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert#pre-trained-and-fine-tuned-asr-models)
137
+ yourself.
138
+ `--ckpt_path` is the path of the data2vec/HuBERT model.
139
+ - **whisper_root** and **whisper_name**:
140
+ **both** `--whisper_root` and `--whisper_name` must be provided for Whisper.
141
+ If there is no corresponding model in `--whisper_root`, the script will download it for you.
142
+
143
+ - **layer**:
144
+ the Transformer encoder layer of the model from which the representations should be extracted.
145
+ It is **1-based**.
146
+ For example, if layer=9, then the outputs from the 9<sup>th</sup> Transformer encoder layer are dumped.
147
+ Range: [1, number of Transformer encoder layers]
148
+
149
+ - **feat_dir**: The output representations will be saved to `${feat_dir}/0_1.npy`
150
+ and `${feat_dir}/0_1.len`.
151
+
152
+ For other useful functionalities (e.g., sharding), please check the argument list in `examples/dump_feature.py`.
153
+
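
As a reference for the `--tsv_path` format described above, here is a minimal sketch of building such a manifest directly in Python. The fairseq `wav2vec_manifest.py` script linked above remains the reference implementation; the paths below are placeholders, and `soundfile` is the package already listed in the extra requirements.

```python
# Minimal sketch of a fairseq-style tsv manifest: root directory on the first
# line, then "<relative path>\t<number of frames>" per utterance.
# Paths here are placeholders.
import os
import soundfile as sf

root = "/dir/to/LibriSpeech/dev-clean"
with open("/dir/to/manifest/dev-clean.tsv", "w") as out:
    print(root, file=out)
    for dirpath, _, filenames in os.walk(root):
        for name in sorted(filenames):
            if not name.endswith(".flac"):
                continue
            path = os.path.join(dirpath, name)
            n_frames = sf.info(path).frames  # number of audio samples
            print(f"{os.path.relpath(path, root)}\t{n_frames}", file=out)
```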
154
+ ### Command Line Usage
155
+
156
+ We expect to have `${feat_dir}/0_1.npy` and `${feat_dir}/0_1.len` in the provided
157
+ directory `/dir/to/representations`.
158
+
159
+ Also, the tsv file should be the **same** as the one used in [Representation Preparation](#representation-preparation).
160
+
161
+ ```
162
+ repcodec /dir/to/representations \
163
+ --model /path/to/repcodec/model \
164
+ --tsv_path /path/to/tsv/file \
165
+ [--model_config_path /path/to/train/config] \
166
+ [--use_gpu] \
167
+ [--out_dir /path/to/output]
168
+ ```
169
+
170
+ If you trained the model yourself following [Training New RepCodec Models](#training-new-repcodec-models),
171
+ please provide the training config file using `--model_config_path`.
172
+ If you use one of the models we provide [here](#repcodec-models), you do not need to provide it.
173
+
174
+ This command will tokenize the representations and the output discrete tokens will be saved to `${out_dir}/tokens`.
175
+ The tokens are in the same order as the provided tsv file.
176
+
177
+ An example of the output file:
178
+
179
+ ```
180
+ /dir/to/LibriSpeech/dev-clean
181
+ 2277/149896/2277-149896-0026.flac 696 696 198 198 198 498 ...
182
+ 2277/149896/2277-149896-0005.flac 696 696 198 198 198 907 ...
183
+ 2277/149896/2277-149896-0033.flac 696 696 198 198 198 696 ...
184
+ ```
185
+
186
+ Under `examples/tokens`, we provide some token files as references. They are obtained from LibriSpeech dev-clean subset
187
+ using the 6 types of representations and corresponding [RepCodec Models](#repcodec-models).
188
+ Your results should be very similar to ours.
189
+
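
If you want to sanity-check your output against those reference token files, a rough sketch is given below. Both files follow the format shown above (a root directory on the first line, then one `<utterance> <token> <token> ...` line per utterance); the reference file name is a placeholder.

```python
# Compare two token files and report the fraction of matching tokens.
# File names are placeholders; "/path/to/output/tokens" is the output of the repcodec command.
def load_tokens(path):
    with open(path) as fp:
        lines = fp.read().strip().split("\n")
    _root, utterances = lines[0], lines[1:]  # first line is the dataset root dir
    return {line.split()[0]: line.split()[1:] for line in utterances}

reference = load_tokens("examples/tokens/reference_dev-clean.tokens")  # placeholder name
produced = load_tokens("/path/to/output/tokens")

matches, total = 0, 0
for utt, ref_tokens in reference.items():
    out_tokens = produced.get(utt, [])
    matches += sum(r == o for r, o in zip(ref_tokens, out_tokens))
    total += len(ref_tokens)
print(f"token match rate: {matches / total:.4f}")
```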
190
+ ### Python Usage
191
+
192
+ ```python
193
+ import torch
194
+ import yaml
195
+
196
+ from repcodec.RepCodec import RepCodec
197
+
198
+ # for feature types of HuBERT base & data2vec base, please use repcodec_dim768.yaml;
199
+ # for feature types of HuBERT large & data2vec large & Whisper medium, please use repcodec_dim1024.yaml;
200
+ # for feature types of Whisper large-v2, please use repcodec_dim1280.yaml
201
+ config = "repcodec/configs/repcodec_dim768.yaml"
202
+ with open(config) as fp:
203
+ conf = yaml.load(fp, Loader=yaml.FullLoader)
204
+
205
+ model = RepCodec(**conf)
206
+ model.load_state_dict(torch.load("./hubert_base_l9.pkl", map_location="cpu")["model"]["repcodec"])
207
+ model.quantizer.initial()
208
+ model.eval()
209
+
210
+ # input shape: (batch size, hidden dim, sequence length)
211
+ random_features = torch.randn(size=(1, 768, 100))
212
+ with torch.no_grad():
213
+ x = model.encoder(random_features)
214
+ z = model.projector(x)
215
+ _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
216
+ tokens = idx.cpu().data.numpy().tolist()[0]
217
+ ```
218
+
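
To connect the snippet above with the feature dumping step, the following sketch tokenizes every utterance in a dumped `0_1.npy`/`0_1.len` pair (the layout described in [Representation Preparation](#representation-preparation)). It reuses the `model` built in the snippet above; the directory path is a placeholder, and the 768-dimensional features are assumed as in that snippet.

```python
# Sketch of tokenizing a dumped feature file with the model built above.
# ${feat_dir}/0_1.npy holds (total_frames, 768) features; 0_1.len holds one
# per-utterance frame count per line. Paths are placeholders.
import numpy as np
import torch

feats = np.load("/dir/to/representations/0_1.npy")
with open("/dir/to/representations/0_1.len") as fp:
    lens = [int(l) for l in fp.read().strip().split("\n")]

all_tokens = []
offset = 0
for n in lens:
    utt = torch.from_numpy(feats[offset:offset + n]).float()  # (T, 768)
    x = utt.T.unsqueeze(0)                                     # (1, 768, T)
    with torch.no_grad():
        z = model.projector(model.encoder(x))
        _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
    all_tokens.append(idx.cpu().data.numpy().tolist()[0])
    offset += n
```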
219
+ ## Training New RepCodec Models
220
+
221
+ We use a config file to set up all the training configurations, e.g., data, model architecture,
222
+ optimizer, scheduler.
223
+ We provide an example [here](./train_configs/ex_dim768_mse.yaml).
224
+
225
+ Please first install required packages following [Installation](#installation)
226
+ and prepare the representations following [Representation Preparation](#representation-preparation).
227
+
228
+ The input data directory is expected to have the following structure
229
+ ```
230
+ /dir/to/representations/
231
+ train_set_name/
232
+ 0_1.npy
233
+ 0_1.len
234
+ valid_set_name/
235
+ 0_1.npy
236
+ 0_1.len
237
+ test_set_name/
238
+ 0_1.npy
239
+ 0_1.len
240
+ ```
241
+
242
+ The names of subsets should be the same as the fields in the config file.
243
+
244
+ Then, you can run training by
245
+ ```
246
+ python train.py \
247
+ -c /path/to/config/file \
248
+ --tag $tag \
249
+ --exp_root exp
250
+ ```
251
+
252
+ `tag` is the name of the output folder.
253
+ All outputs will be saved to `exp_root/tag/`.
254
+
255
+ ## Acknowledgement
256
+
257
+ Our implementation is based on [facebookresearch/AudioDec](https://github.com/facebookresearch/AudioDec).
258
+ We thank them for open-sourcing their code!
259
+
260
+ ## Citation
261
+
262
+ If you find our work useful, please cite the following article.
263
+
264
+ ```
265
+ @misc{huang2023repcodec,
266
+ title={RepCodec: A Speech Representation Codec for Speech Tokenization},
267
+ author={Zhichao Huang and Chutong Meng and Tom Ko},
268
+ year={2023},
269
+ eprint={2309.00169},
270
+ archivePrefix={arXiv},
271
+ primaryClass={eess.AS}
272
+ }
273
+ ```
prompting/RepCodec/dataloader/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .collater import *
2
+ from .dataset import *
prompting/RepCodec/dataloader/collater.py ADDED
@@ -0,0 +1,22 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the CC BY-NC license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on AudioDec (https://github.com/facebookresearch/AudioDec)
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ class ReprCollater(object):
13
+ def __call__(self, batch):
14
+ xs = []
15
+ for b in batch:
16
+ if b is not None:
17
+ xs.append(b)
18
+
19
+ x_batch = np.stack(xs, axis=0)
20
+ x_batch = torch.tensor(x_batch, dtype=torch.float).transpose(1, 2) # (B, T, C) -> (B, C, T)
21
+
22
+ return x_batch
prompting/RepCodec/dataloader/dataset.py ADDED
@@ -0,0 +1,90 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the CC BY-NC license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on AudioDec (https://github.com/facebookresearch/AudioDec)
7
+
8
+ import glob
9
+ import logging
10
+ import os
11
+ from typing import List
12
+
13
+ import numpy as np
14
+ from torch.utils.data import Dataset
15
+
16
+ logging.basicConfig(
17
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
18
+ datefmt="%Y-%m-%d %H:%M:%S",
19
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
20
+ )
21
+ logger = logging.getLogger("dataset")
22
+
23
+
24
+ class ReprDataset(Dataset):
25
+ def __init__(
26
+ self,
27
+ data_dir: str,
28
+ batch_len: int,
29
+ ):
30
+ self.batch_len = batch_len
31
+
32
+ self.blocks = self._load_blocks(data_dir)
33
+ self.offsets = self._load_offsets(data_dir)
34
+ assert len(self.blocks) == len(self.offsets)
35
+ # check len
36
+ for i in range(len(self.blocks)):
37
+ assert self.blocks[i].shape[0] == self.offsets[i][-1]
38
+
39
+ self.n_examples = np.cumsum([0] + [offset.shape[0] - 1 for offset in self.offsets])
40
+
41
+ def __len__(self):
42
+ return self.n_examples[-1]
43
+
44
+ def __getitem__(self, idx):
45
+ # find which block
46
+ block_id = -1
47
+ for n in range(len(self.n_examples) - 1):
48
+ if self.n_examples[n] <= idx < self.n_examples[n + 1]:
49
+ block_id = n
50
+ break
51
+ assert 0 <= block_id < len(self.blocks), f"Failed to find {idx}"
52
+ block_offset = idx - self.n_examples[block_id]
53
+ start = self.offsets[block_id][block_offset]
54
+ end = self.offsets[block_id][block_offset + 1]
55
+
56
+ # randomly choose a slice
57
+ if end - start < self.batch_len:
58
+ return None
59
+ elif end - start == self.batch_len:
60
+ return self.blocks[block_id][start:end]
61
+ else:
62
+ start_offset = np.random.randint(low=start, high=end - self.batch_len)
63
+ return self.blocks[block_id][start_offset:start_offset + self.batch_len]
64
+
65
+ @staticmethod
66
+ def _load_blocks(feat_dir: str) -> List[np.ndarray]:
67
+ # e.g., 0_2.npy, 1_2.npy
68
+ file_names = glob.glob(os.path.join(feat_dir, "*.npy"), recursive=False)
69
+ # sort by index
70
+ file_names = sorted(file_names, key=lambda x: int(os.path.basename(x).split("_")[0]))
71
+ logger.info(f"Found following blocks: {file_names}")
72
+ blocks = [np.load(name, mmap_mode="r") for name in file_names]
73
+ return blocks
74
+
75
+ @staticmethod
76
+ def _load_offsets(feat_dir: str):
77
+ def load_lens(file_name: str):
78
+ with open(file_name, mode="r") as fp:
79
+ res = fp.read().strip().split("\n")
80
+ # for easy use. [res[i], res[i+1]) denotes the range for ith element
81
+ res = [0] + [int(r) for r in res]
82
+ return np.cumsum(res, dtype=int)
83
+
84
+ # e.g., 0_2.len, 1_2.len
85
+ file_names = glob.glob(os.path.join(feat_dir, "*.len"), recursive=False)
86
+ file_names = sorted(file_names, key=lambda x: int(os.path.basename(x).split("_")[0]))
87
+ file_lens = []
88
+ for name in file_names:
89
+ file_lens.append(load_lens(name))
90
+ return file_lens
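
`ReprDataset` and `ReprCollater` are designed to be used together; a sketch of wiring them into a standard PyTorch `DataLoader` is shown below. It assumes you run from the RepCodec repo root so the `dataloader` package is importable; the feature directory follows the README layout (`0_1.npy` / `0_1.len`), and `batch_len`/`batch_size` are placeholder values.

```python
# Sketch: feed dumped representations into training batches.
from torch.utils.data import DataLoader

from dataloader import ReprCollater, ReprDataset

dataset = ReprDataset("/dir/to/representations/train_set_name", batch_len=96)
loader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=ReprCollater(),  # stacks slices and returns (B, C, T) float tensors
)

for batch in loader:
    # Utterances shorter than batch_len are returned as None by the dataset and
    # dropped by the collater, so a batch may contain fewer than 16 items.
    print(batch.shape)  # e.g. torch.Size([16, 1024, 96]) for 1024-dim features
    break
```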
prompting/RepCodec/examples/.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,334 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "72bf1b45-66fd-450d-8d5c-bec9e0b3d08f",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from data2vec_feature_reader import Data2vecFeatureReader\n",
11
+ "\n",
12
+ "reader = Data2vecFeatureReader(\"./../../models/vox_pretrained.pt\", 18, device=\"cuda:0\", max_chunk=1600000)"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 2,
18
+ "id": "84a9d238-048a-4772-a47b-5aadc50f36df",
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "data": {
23
+ "application/vnd.jupyter.widget-view+json": {
24
+ "model_id": "490421d1c2f54cca9855f1a5397185f8",
25
+ "version_major": 2,
26
+ "version_minor": 0
27
+ },
28
+ "text/plain": [
29
+ "Loading dataset shards: 0%| | 0/45 [00:00<?, ?it/s]"
30
+ ]
31
+ },
32
+ "metadata": {},
33
+ "output_type": "display_data"
34
+ },
35
+ {
36
+ "data": {
37
+ "application/vnd.jupyter.widget-view+json": {
38
+ "model_id": "be44942581b34d5388b0264e7b40d472",
39
+ "version_major": 2,
40
+ "version_minor": 0
41
+ },
42
+ "text/plain": [
43
+ "Loading dataset shards: 0%| | 0/60 [00:00<?, ?it/s]"
44
+ ]
45
+ },
46
+ "metadata": {},
47
+ "output_type": "display_data"
48
+ }
49
+ ],
50
+ "source": [
51
+ "from datasets import load_dataset\n",
52
+ "from tqdm import tqdm\n",
53
+ "import pandas as pd\n",
54
+ "\n",
55
+ "cache_dir = \"./../../../cache\"\n",
56
+ "\n",
57
+ "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 3,
63
+ "id": "cffd49ca-3524-4ac4-8ba5-bc4fcc9e0f53",
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "data": {
68
+ "text/plain": [
69
+ "RepCodec(\n",
70
+ " (encoder): Encoder(\n",
71
+ " (conv): Conv1d(\n",
72
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)\n",
73
+ " )\n",
74
+ " (conv_blocks): ModuleList(\n",
75
+ " (0-1): 2 x EncoderBlock(\n",
76
+ " (res_units): ModuleList(\n",
77
+ " (0-1): 2 x ResidualUnit(\n",
78
+ " (activation): ELU(alpha=1.0)\n",
79
+ " (conv1): Conv1d(\n",
80
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)\n",
81
+ " )\n",
82
+ " (conv2): Conv1d1x1(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)\n",
83
+ " )\n",
84
+ " )\n",
85
+ " (conv): Conv1d(\n",
86
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
87
+ " )\n",
88
+ " )\n",
89
+ " )\n",
90
+ " )\n",
91
+ " (decoder): Decoder(\n",
92
+ " (conv1): Conv1d(\n",
93
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)\n",
94
+ " )\n",
95
+ " (conv_blocks): ModuleList(\n",
96
+ " (0-1): 2 x DecoderBlock(\n",
97
+ " (conv): Conv1d(\n",
98
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,))\n",
99
+ " )\n",
100
+ " (res_units): ModuleList(\n",
101
+ " (0-1): 2 x ResidualUnit(\n",
102
+ " (activation): ELU(alpha=1.0)\n",
103
+ " (conv1): Conv1d(\n",
104
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)\n",
105
+ " )\n",
106
+ " (conv2): Conv1d1x1(1024, 1024, kernel_size=(1,), stride=(1,), bias=False)\n",
107
+ " )\n",
108
+ " )\n",
109
+ " )\n",
110
+ " )\n",
111
+ " (conv2): Conv1d(\n",
112
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)\n",
113
+ " )\n",
114
+ " )\n",
115
+ " (projector): Projector(\n",
116
+ " (project): Conv1d(\n",
117
+ " (conv): Conv1d(1024, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)\n",
118
+ " )\n",
119
+ " )\n",
120
+ " (quantizer): Quantizer(\n",
121
+ " (codebook): ResidualVQ(\n",
122
+ " (layers): ModuleList(\n",
123
+ " (0): VectorQuantize()\n",
124
+ " )\n",
125
+ " )\n",
126
+ " )\n",
127
+ ")"
128
+ ]
129
+ },
130
+ "execution_count": 3,
131
+ "metadata": {},
132
+ "output_type": "execute_result"
133
+ }
134
+ ],
135
+ "source": [
136
+ "from repcodec.RepCodec import RepCodec\n",
137
+ "import torch\n",
138
+ "import yaml\n",
139
+ "\n",
140
+ "config = \"./../repcodec/configs/repcodec_dim1024.yaml\"\n",
141
+ "with open(config) as fp:\n",
142
+ " conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
143
+ "\n",
144
+ "model = RepCodec(**conf)\n",
145
+ "model.load_state_dict(torch.load(\"./../../models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n",
146
+ "model.quantizer.initial()\n",
147
+ "model.eval()"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "code",
152
+ "execution_count": 22,
153
+ "id": "a9a1731e-052c-4af0-a29c-b171a988b300",
154
+ "metadata": {},
155
+ "outputs": [
156
+ {
157
+ "ename": "RuntimeError",
158
+ "evalue": "Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same",
159
+ "output_type": "error",
160
+ "traceback": [
161
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
162
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
163
+ "Cell \u001b[0;32mIn[22], line 27\u001b[0m\n\u001b[1;32m 23\u001b[0m feat\u001b[38;5;241m.\u001b[39mappend(feat_chunk)\n\u001b[1;32m 25\u001b[0m features \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mcat(feat, \u001b[38;5;241m1\u001b[39m)\u001b[38;5;241m.\u001b[39mpermute(\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m---> 27\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat32\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 28\u001b[0m z \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mprojector(x)\n\u001b[1;32m 29\u001b[0m _, idx \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mquantizer\u001b[38;5;241m.\u001b[39mcodebook\u001b[38;5;241m.\u001b[39mforward_index(z\u001b[38;5;241m.\u001b[39mtranspose(\u001b[38;5;241m2\u001b[39m, \u001b[38;5;241m1\u001b[39m))\n",
164
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
165
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
166
+ "File \u001b[0;32m/jupyter_workspace/users/Darshan/RepCodec/repcodec/modules/encoder.py:86\u001b[0m, in \u001b[0;36mEncoder.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[0;32m---> 86\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnum_blocks):\n\u001b[1;32m 88\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconv_blocks[i](x)\n",
167
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
168
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
169
+ "File \u001b[0;32m/jupyter_workspace/users/Darshan/RepCodec/repcodec/layers/conv_layer.py:55\u001b[0m, in \u001b[0;36mConv1d.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[1;32m 49\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m x (Tensor): Float tensor variable with the shape (B, C, T).\u001b[39;00m\n\u001b[1;32m 52\u001b[0m \u001b[38;5;124;03m Returns:\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;124;03m Tensor: Float tensor variable with the shape (B, C, T).\u001b[39;00m\n\u001b[1;32m 54\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 55\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m x\n",
170
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1532\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1530\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1531\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1532\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
171
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1541\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1536\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1537\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1538\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1539\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1540\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1541\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1543\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1544\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
172
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:310\u001b[0m, in \u001b[0;36mConv1d.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_conv_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
173
+ "File \u001b[0;32m/usr/local/lib/python3.8/dist-packages/torch/nn/modules/conv.py:306\u001b[0m, in \u001b[0;36mConv1d._conv_forward\u001b[0;34m(self, input, weight, bias)\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mzeros\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mconv1d(F\u001b[38;5;241m.\u001b[39mpad(\u001b[38;5;28minput\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reversed_padding_repeated_twice, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode),\n\u001b[1;32m 304\u001b[0m weight, bias, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstride,\n\u001b[1;32m 305\u001b[0m _single(\u001b[38;5;241m0\u001b[39m), \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdilation, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroups)\n\u001b[0;32m--> 306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv1d\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 307\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdilation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m\u001b[43m)\u001b[49m\n",
174
+ "\u001b[0;31mRuntimeError\u001b[0m: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same"
175
+ ]
176
+ }
177
+ ],
178
+ "source": [
179
+ "import torch.nn.functional as F\n",
180
+ "\n",
181
+ "sample = dataset[\"train.clean.100\"][1]\n",
182
+ "\n",
183
+ "x = sample[\"audio\"][\"array\"]\n",
184
+ "\n",
185
+ "with torch.no_grad():\n",
186
+ " x = torch.from_numpy(x).float().to(reader.device)\n",
187
+ " if reader.task.cfg.normalize:\n",
188
+ " x = F.layer_norm(x, x.shape)\n",
189
+ " x = x.view(1, -1)\n",
190
+ "\n",
191
+ " feat = []\n",
192
+ " for start in range(0, x.size(1), reader.max_chunk):\n",
193
+ " x_chunk = x[:, start: start + reader.max_chunk]\n",
194
+ " res = reader.model.extract_features(\n",
195
+ " source=x_chunk,\n",
196
+ " padding_mask=None,\n",
197
+ " mask=False,\n",
198
+ " layer=reader.layer,\n",
199
+ " )\n",
200
+ " feat_chunk = res[\"x\"]\n",
201
+ " feat.append(feat_chunk)\n",
202
+ " \n",
203
+ " features = torch.cat(feat, 1).permute(0, 2, 1)\n",
204
+ "\n",
205
+ " x = model.encoder(features)\n",
206
+ " z = model.projector(x)\n",
207
+ " _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
208
+ " tokens = idx.cpu().data.numpy().tolist()[0]"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": 14,
214
+ "id": "d51709a9-6fb3-450b-a517-005367095663",
215
+ "metadata": {},
216
+ "outputs": [
217
+ {
218
+ "data": {
219
+ "text/plain": [
220
+ "torch.Size([1, 804, 1024])"
221
+ ]
222
+ },
223
+ "execution_count": 14,
224
+ "metadata": {},
225
+ "output_type": "execute_result"
226
+ }
227
+ ],
228
+ "source": [
229
+ "features.shape"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 8,
235
+ "id": "dfc977d7-f27c-40d7-b545-fbdf26728cbe",
236
+ "metadata": {},
237
+ "outputs": [
238
+ {
239
+ "data": {
240
+ "text/plain": [
241
+ "torch.Size([726, 1024])"
242
+ ]
243
+ },
244
+ "execution_count": 8,
245
+ "metadata": {},
246
+ "output_type": "execute_result"
247
+ }
248
+ ],
249
+ "source": [
250
+ "feat.shape"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "id": "1810e6dc-2ece-4aca-a29a-e1933b8ce82a",
257
+ "metadata": {},
258
+ "outputs": [],
259
+ "source": [
260
+ "import logging\n",
261
+ "import os\n",
262
+ "import sys\n",
263
+ "\n",
264
+ "import tqdm\n",
265
+ "from npy_append_array import NpyAppendArray\n",
266
+ "\n",
267
+ "def get_shard_range(tot, nshard, rank):\n",
268
+ " assert rank < nshard and rank >= 0, f\"invaid rank/nshard {rank}/{nshard}\"\n",
269
+ " start = round(tot / nshard * rank)\n",
270
+ " end = round(tot / nshard * (rank + 1))\n",
271
+ " assert start < end, f\"start={start}, end={end}\"\n",
272
+ " logger.info(\n",
273
+ " f\"rank {rank} of {nshard}, process {end-start} \"\n",
274
+ " f\"({start}-{end}) out of {tot}\"\n",
275
+ " )\n",
276
+ " return start, end\n",
277
+ "\n",
278
+ "def get_path_iterator(tsv, nshard, rank):\n",
279
+ " with open(tsv, \"r\") as f:\n",
280
+ " root = f.readline().rstrip()\n",
281
+ " lines = [line.rstrip() for line in f]\n",
282
+ " start, end = get_shard_range(len(lines), nshard, rank)\n",
283
+ " lines = lines[start:end]\n",
284
+ " def iterate():\n",
285
+ " for line in lines:\n",
286
+ " subpath, nsample = line.split(\"\\t\")\n",
287
+ " yield f\"{root}/{subpath}\", int(nsample)\n",
288
+ " return iterate, len(lines)\n",
289
+ "\n",
290
+ "def dump_feature(reader, generator, num, nshard, rank, feat_dir):\n",
291
+ " iterator = generator()\n",
292
+ "\n",
293
+ " feat_path = f\"{feat_dir}/{rank}_{nshard}.npy\"\n",
294
+ " leng_path = f\"{feat_dir}/{rank}_{nshard}.len\"\n",
295
+ "\n",
296
+ " os.makedirs(feat_dir, exist_ok=True)\n",
297
+ " if os.path.exists(feat_path):\n",
298
+ " os.remove(feat_path)\n",
299
+ "\n",
300
+ " feat_f = NpyAppendArray(feat_path)\n",
301
+ " with open(leng_path, \"w\") as leng_f:\n",
302
+ " for path, nsample in tqdm.tqdm(iterator, total=num):\n",
303
+ " feat = reader.get_feats(path, nsample)\n",
304
+ " feat_f.append(feat.cpu().numpy())\n",
305
+ " leng_f.write(f\"{len(feat)}\\n\")\n",
306
+ " logger.info(\"finished successfully\")\n",
307
+ "\n",
308
+ "generator, num = get_path_iterator(tsv_path, nshard, rank)\n",
309
+ "dump_feature(reader, generator, num, nshard, rank, feat_dir)"
310
+ ]
311
+ }
312
+ ],
313
+ "metadata": {
314
+ "kernelspec": {
315
+ "display_name": "Python 3 (ipykernel)",
316
+ "language": "python",
317
+ "name": "python3"
318
+ },
319
+ "language_info": {
320
+ "codemirror_mode": {
321
+ "name": "ipython",
322
+ "version": 3
323
+ },
324
+ "file_extension": ".py",
325
+ "mimetype": "text/x-python",
326
+ "name": "python",
327
+ "nbconvert_exporter": "python",
328
+ "pygments_lexer": "ipython3",
329
+ "version": "3.8.10"
330
+ }
331
+ },
332
+ "nbformat": 4,
333
+ "nbformat_minor": 5
334
+ }
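The RuntimeError recorded in this notebook (cell In[22]) is a device mismatch: the data2vec features are produced on reader.device ("cuda:0"), but the loading cell never moves the RepCodec weights off the CPU. A minimal sketch of the corrected loading cell, mirroring some_run.py from this same upload (config and checkpoint paths are taken from the notebook, not re-verified here):

    from repcodec.RepCodec import RepCodec
    import torch
    import yaml

    config = "./../repcodec/configs/repcodec_dim1024.yaml"
    with open(config) as fp:
        conf = yaml.load(fp, Loader=yaml.FullLoader)

    model = RepCodec(**conf)
    model.load_state_dict(
        torch.load("./../../models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"]
    )
    model.quantizer.initial()
    model.eval()
    model.to("cuda:0")  # keep encoder/projector/quantizer on the same device as the incoming features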
prompting/RepCodec/examples/.ipynb_checkpoints/data2vec_audio-checkpoint.py ADDED
@@ -0,0 +1,541 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ # ref: https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py
9
+
10
+ import logging
11
+ import math
12
+ from dataclasses import dataclass, field
13
+ from typing import Optional
14
+
15
+ from omegaconf import II
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ import torch.distributed as dist
21
+
22
+ from fairseq.modules import EMAModule, EMAModuleConfig
23
+ from fairseq.data.data_utils import compute_mask_indices
24
+ from fairseq.models import BaseFairseqModel, register_model
25
+ from fairseq.models.wav2vec import (
26
+ ConvFeatureExtractionModel,
27
+ Wav2Vec2Config,
28
+ TransformerEncoder,
29
+ )
30
+ from fairseq.modules import (
31
+ GradMultiply,
32
+ LayerNorm,
33
+ )
34
+ from fairseq.utils import index_put
35
+
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ @dataclass
41
+ class Data2VecAudioConfig(Wav2Vec2Config):
42
+
43
+ loss_beta: float = field(
44
+ default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}
45
+ )
46
+ loss_scale: Optional[float] = field(
47
+ default=None,
48
+ metadata={
49
+ "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
50
+ },
51
+ )
52
+ average_top_k_layers: int = field(
53
+ default=8, metadata={"help": "how many layers to average"}
54
+ )
55
+
56
+ layer_norm_target_layer: bool = False
57
+ instance_norm_target_layer: bool = False
58
+ instance_norm_targets: bool = False
59
+ layer_norm_targets: bool = False
60
+ batch_norm_target_layer: bool = False
61
+ group_norm_target_layer: bool = False
62
+
63
+ ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"})
64
+ ema_end_decay: float = field(
65
+ default=0.9999, metadata={"help": "final ema decay rate"}
66
+ )
67
+
68
+ # when to finish annealing ema decay rate
69
+ ema_anneal_end_step: int = II("optimization.max_update")
70
+
71
+ ema_transformer_only: bool = field(
72
+ default=True,
73
+ metadata={"help": "whether to momentum update only the transformer"},
74
+ )
75
+ ema_layers_only: bool = field(
76
+ default=True,
77
+ metadata={"help": "whether to momentum update only the transformer layers"},
78
+ )
79
+
80
+ max_update: int = II("optimization.max_update")
81
+
82
+ min_target_var: float = field(
83
+ default=0.1, metadata={"help": "stop training if target var falls below this"}
84
+ )
85
+ min_pred_var: float = field(
86
+ default=0.01,
87
+ metadata={"help": "stop training if prediction var falls below this"},
88
+ )
89
+
90
+
91
+ def get_annealed_rate(start, end, curr_step, total_steps):
92
+ r = end - start
93
+ pct_remaining = 1 - curr_step / total_steps
94
+ return end - r * pct_remaining
95
+
96
+
97
+ @register_model("data2vec_audio", dataclass=Data2VecAudioConfig)
98
+ class Data2VecAudioModel(BaseFairseqModel):
99
+ def __init__(self, cfg: Data2VecAudioConfig):
100
+ super().__init__()
101
+ self.cfg = cfg
102
+
103
+ feature_enc_layers = eval(cfg.conv_feature_layers)
104
+ self.extractor_embed = feature_enc_layers[-1][0]
105
+
106
+ self.ema = None
107
+ self.embed = cfg.encoder_embed_dim
108
+
109
+ self.average_top_k_layers = cfg.average_top_k_layers
110
+ self.loss_beta = cfg.loss_beta
111
+ self.loss_scale = cfg.loss_scale
112
+
113
+ self.feature_extractor = ConvFeatureExtractionModel(
114
+ conv_layers=feature_enc_layers,
115
+ dropout=0.0,
116
+ mode=cfg.extractor_mode,
117
+ conv_bias=cfg.conv_bias,
118
+ )
119
+
120
+ self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim)
121
+
122
+ self.mask_prob = cfg.mask_prob
123
+ self.mask_selection = cfg.mask_selection
124
+ self.mask_other = cfg.mask_other
125
+ self.mask_length = cfg.mask_length
126
+ self.no_mask_overlap = cfg.no_mask_overlap
127
+ self.mask_min_space = cfg.mask_min_space
128
+
129
+ self.mask_channel_prob = cfg.mask_channel_prob
130
+ self.mask_channel_before = cfg.mask_channel_before
131
+ self.mask_channel_selection = cfg.mask_channel_selection
132
+ self.mask_channel_other = cfg.mask_channel_other
133
+ self.mask_channel_length = cfg.mask_channel_length
134
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
135
+ self.mask_channel_min_space = cfg.mask_channel_min_space
136
+
137
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
138
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
139
+
140
+ self.feature_grad_mult = cfg.feature_grad_mult
141
+
142
+ self.mask_emb = nn.Parameter(
143
+ torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
144
+ )
145
+
146
+ self.encoder = TransformerEncoder(cfg)
147
+ self.layer_norm = LayerNorm(self.extractor_embed)
148
+
149
+ self.final_proj = nn.Linear(self.embed, self.embed)
150
+
151
+ self.num_updates = 0
152
+
153
+ def make_ema_teacher(self):
154
+ ema_config = EMAModuleConfig(
155
+ ema_decay=self.cfg.ema_decay,
156
+ ema_fp32=True,
157
+ )
158
+ skip_keys = set()
159
+ if self.cfg.ema_layers_only:
160
+ self.cfg.ema_transformer_only = True
161
+ for k, _ in self.encoder.pos_conv.named_parameters():
162
+ skip_keys.add(f"pos_conv.{k}")
163
+
164
+ self.ema = EMAModule(
165
+ self.encoder if self.cfg.ema_transformer_only else self,
166
+ ema_config,
167
+ skip_keys=skip_keys,
168
+ )
169
+
170
+ def set_num_updates(self, num_updates):
171
+ super().set_num_updates(num_updates)
172
+
173
+ if self.ema is None and self.final_proj is not None:
174
+ logger.info(f"making ema teacher")
175
+ self.make_ema_teacher()
176
+ elif self.training and self.ema is not None:
177
+ if self.cfg.ema_decay != self.cfg.ema_end_decay:
178
+ if num_updates >= self.cfg.ema_anneal_end_step:
179
+ decay = self.cfg.ema_end_decay
180
+ else:
181
+ decay = get_annealed_rate(
182
+ self.cfg.ema_decay,
183
+ self.cfg.ema_end_decay,
184
+ num_updates,
185
+ self.cfg.ema_anneal_end_step,
186
+ )
187
+ self.ema.set_decay(decay)
188
+ if self.ema.get_decay() < 1:
189
+ self.ema.step(self.encoder if self.cfg.ema_transformer_only else self)
190
+
191
+ self.num_updates = num_updates
192
+
193
+ def state_dict(self, destination=None, prefix="", keep_vars=False):
194
+ state = super().state_dict(destination, prefix, keep_vars)
195
+
196
+ if self.ema is not None:
197
+ state[prefix + "_ema"] = self.ema.fp32_params
198
+
199
+ return state
200
+
201
+ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
202
+ if self.ema is not None:
203
+ k = prefix + "_ema"
204
+ assert k in state_dict
205
+ self.ema.restore(state_dict[k], True)
206
+ del state_dict[k]
207
+ return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
208
+
209
+ @classmethod
210
+ def build_model(cls, cfg: Data2VecAudioConfig, task=None):
211
+ """Build a new model instance."""
212
+
213
+ return cls(cfg)
214
+
215
+ def apply_mask(
216
+ self,
217
+ x,
218
+ padding_mask,
219
+ mask_indices=None,
220
+ mask_channel_indices=None,
221
+ ):
222
+ B, T, C = x.shape
223
+
224
+ if self.mask_channel_prob > 0 and self.mask_channel_before:
225
+ mask_channel_indices = compute_mask_indices(
226
+ (B, C),
227
+ None,
228
+ self.mask_channel_prob,
229
+ self.mask_channel_length,
230
+ self.mask_channel_selection,
231
+ self.mask_channel_other,
232
+ no_overlap=self.no_mask_channel_overlap,
233
+ min_space=self.mask_channel_min_space,
234
+ )
235
+ mask_channel_indices = (
236
+ torch.from_numpy(mask_channel_indices)
237
+ .to(x.device)
238
+ .unsqueeze(1)
239
+ .expand(-1, T, -1)
240
+ )
241
+ x[mask_channel_indices] = 0
242
+
243
+ if self.mask_prob > 0:
244
+ if mask_indices is None:
245
+ mask_indices = compute_mask_indices(
246
+ (B, T),
247
+ padding_mask,
248
+ self.mask_prob,
249
+ self.mask_length,
250
+ self.mask_selection,
251
+ self.mask_other,
252
+ min_masks=1,
253
+ no_overlap=self.no_mask_overlap,
254
+ min_space=self.mask_min_space,
255
+ require_same_masks=self.cfg.require_same_masks,
256
+ mask_dropout=self.cfg.mask_dropout,
257
+ )
258
+ mask_indices = torch.from_numpy(mask_indices).to(x.device)
259
+ x = index_put(x, mask_indices, self.mask_emb)
260
+ else:
261
+ mask_indices = None
262
+
263
+ if self.mask_channel_prob > 0 and not self.mask_channel_before:
264
+ if mask_channel_indices is None:
265
+ mask_channel_indices = compute_mask_indices(
266
+ (B, C),
267
+ None,
268
+ self.mask_channel_prob,
269
+ self.mask_channel_length,
270
+ self.mask_channel_selection,
271
+ self.mask_channel_other,
272
+ no_overlap=self.no_mask_channel_overlap,
273
+ min_space=self.mask_channel_min_space,
274
+ )
275
+ mask_channel_indices = (
276
+ torch.from_numpy(mask_channel_indices)
277
+ .to(x.device)
278
+ .unsqueeze(1)
279
+ .expand(-1, T, -1)
280
+ )
281
+ x = index_put(x, mask_channel_indices, 0)
282
+
283
+ return x, mask_indices
284
+
285
+ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
286
+ """
287
+ Computes the output length of the convolutional layers
288
+ """
289
+
290
+ def _conv_out_length(input_length, kernel_size, stride):
291
+ return torch.floor((input_length - kernel_size) / stride + 1)
292
+
293
+ conv_cfg_list = eval(self.cfg.conv_feature_layers)
294
+
295
+ for i in range(len(conv_cfg_list)):
296
+ input_lengths = _conv_out_length(
297
+ input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]
298
+ )
299
+
300
+ return input_lengths.to(torch.long)
301
+
302
+ def forward(
303
+ self,
304
+ source,
305
+ padding_mask=None,
306
+ mask=True,
307
+ features_only=False,
308
+ layer=None,
309
+ mask_indices=None,
310
+ mask_channel_indices=None,
311
+ padding_count=None,
312
+ ):
313
+ features = source
314
+
315
+ if self.feature_grad_mult > 0:
316
+ features = self.feature_extractor(features)
317
+ if self.feature_grad_mult != 1.0:
318
+ features = GradMultiply.apply(features, self.feature_grad_mult)
319
+ else:
320
+ with torch.no_grad():
321
+ features = self.feature_extractor(features)
322
+
323
+ features = features.transpose(1, 2)
324
+
325
+ features = self.layer_norm(features)
326
+
327
+ orig_padding_mask = padding_mask
328
+
329
+ if padding_mask is not None and padding_mask.any():
330
+ input_lengths = (1 - padding_mask.long()).sum(-1)
331
+ # apply conv formula to get real output_lengths
332
+ output_lengths = self._get_feat_extract_output_lengths(input_lengths)
333
+
334
+ padding_mask = torch.zeros(
335
+ features.shape[:2], dtype=features.dtype, device=features.device
336
+ )
337
+
338
+ # these two operations makes sure that all values
339
+ # before the output lengths indices are attended to
340
+ padding_mask[
341
+ (
342
+ torch.arange(padding_mask.shape[0], device=padding_mask.device),
343
+ output_lengths - 1,
344
+ )
345
+ ] = 1
346
+ padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
347
+ else:
348
+ padding_mask = None
349
+
350
+ if self.post_extract_proj is not None:
351
+ features = self.post_extract_proj(features)
352
+
353
+ pre_encoder_features = None
354
+ if self.cfg.ema_transformer_only:
355
+ pre_encoder_features = features.clone()
356
+
357
+ features = self.dropout_input(features)
358
+
359
+ if mask:
360
+ x, mask_indices = self.apply_mask(
361
+ features,
362
+ padding_mask,
363
+ mask_indices=mask_indices,
364
+ mask_channel_indices=mask_channel_indices,
365
+ )
366
+ else:
367
+ x = features
368
+ mask_indices = None
369
+
370
+ x, layer_results = self.encoder(
371
+ x,
372
+ padding_mask=padding_mask,
373
+ layer=layer,
374
+ )
375
+
376
+ if features_only:
377
+ return {
378
+ "x": x,
379
+ "padding_mask": padding_mask,
380
+ "layer_results": layer_results,
381
+ }
382
+
383
+ result = {
384
+ "losses": {},
385
+ }
386
+
387
+ with torch.no_grad():
388
+ self.ema.model.eval()
389
+
390
+ if self.cfg.ema_transformer_only:
391
+ y, layer_results = self.ema.model.extract_features(
392
+ pre_encoder_features,
393
+ padding_mask=padding_mask,
394
+ min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
395
+ )
396
+ y = {
397
+ "x": y,
398
+ "padding_mask": padding_mask,
399
+ "layer_results": layer_results,
400
+ }
401
+ else:
402
+ y = self.ema.model.extract_features(
403
+ source=source,
404
+ padding_mask=orig_padding_mask,
405
+ mask=False,
406
+ )
407
+
408
+ target_layer_results = [l[2] for l in y["layer_results"]]
409
+
410
+ permuted = False
411
+ if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
412
+ target_layer_results = [
413
+ tl.permute(1, 2, 0) for tl in target_layer_results # TBC -> BCT
414
+ ]
415
+ permuted = True
416
+
417
+ if self.cfg.batch_norm_target_layer:
418
+ target_layer_results = [
419
+ F.batch_norm(
420
+ tl.float(), running_mean=None, running_var=None, training=True
421
+ )
422
+ for tl in target_layer_results
423
+ ]
424
+
425
+ if self.cfg.instance_norm_target_layer:
426
+ target_layer_results = [
427
+ F.instance_norm(tl.float()) for tl in target_layer_results
428
+ ]
429
+
430
+ if permuted:
431
+ target_layer_results = [
432
+ tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC
433
+ ]
434
+
435
+ if self.cfg.group_norm_target_layer:
436
+ target_layer_results = [
437
+ F.layer_norm(tl.float(), tl.shape[-2:])
438
+ for tl in target_layer_results
439
+ ]
440
+
441
+ if self.cfg.layer_norm_target_layer:
442
+ target_layer_results = [
443
+ F.layer_norm(tl.float(), tl.shape[-1:])
444
+ for tl in target_layer_results
445
+ ]
446
+
447
+ y = sum(target_layer_results) / len(target_layer_results)
448
+
449
+ if self.cfg.layer_norm_targets:
450
+ y = F.layer_norm(y.float(), y.shape[-1:])
451
+
452
+ if self.cfg.instance_norm_targets:
453
+ y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)
454
+
455
+ if not permuted:
456
+ y = y.transpose(0, 1)
457
+
458
+ y = y[mask_indices]
459
+
460
+ x = x[mask_indices]
461
+ x = self.final_proj(x)
462
+
463
+ sz = x.size(-1)
464
+
465
+ if self.loss_beta == 0:
466
+ loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
467
+ else:
468
+ loss = F.smooth_l1_loss(
469
+ x.float(), y.float(), reduction="none", beta=self.loss_beta
470
+ ).sum(dim=-1)
471
+
472
+ if self.loss_scale is not None:
473
+ scale = self.loss_scale
474
+ else:
475
+ scale = 1 / math.sqrt(sz)
476
+
477
+ result["losses"]["regression"] = loss.sum() * scale
478
+
479
+ if "sample_size" not in result:
480
+ result["sample_size"] = loss.numel()
481
+
482
+ with torch.no_grad():
483
+ result["target_var"] = self.compute_var(y)
484
+ result["pred_var"] = self.compute_var(x.float())
485
+
486
+ if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
487
+ logger.error(
488
+ f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
489
+ )
490
+ raise Exception(
491
+ f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
492
+ )
493
+ if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
494
+ logger.error(
495
+ f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
496
+ )
497
+ raise Exception(
498
+ f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
499
+ )
500
+
501
+ if self.ema is not None:
502
+ result["ema_decay"] = self.ema.get_decay() * 1000
503
+
504
+ return result
505
+
506
+ @staticmethod
507
+ def compute_var(y):
508
+ y = y.view(-1, y.size(-1))
509
+ if dist.is_initialized():
510
+ zc = torch.tensor(y.size(0)).cuda()
511
+ zs = y.sum(dim=0)
512
+ zss = (y ** 2).sum(dim=0)
513
+
514
+ dist.all_reduce(zc)
515
+ dist.all_reduce(zs)
516
+ dist.all_reduce(zss)
517
+
518
+ var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
519
+ return torch.sqrt(var + 1e-6).mean()
520
+ else:
521
+ return torch.sqrt(y.var(dim=0) + 1e-6).mean()
522
+
523
+ def extract_features(
524
+ self, source, padding_mask, mask=False, layer=None
525
+ ):
526
+ res = self.forward(
527
+ source,
528
+ padding_mask,
529
+ mask=mask,
530
+ features_only=True,
531
+ layer=layer,
532
+ )
533
+ return res
534
+
535
+ def remove_pretraining_modules(self, last_layer=None):
536
+ self.final_proj = None
537
+ self.ema = None
538
+ if last_layer is not None:
539
+ self.encoder.layers = nn.ModuleList(
540
+ l for i, l in enumerate(self.encoder.layers) if i <= last_layer
541
+ )
prompting/RepCodec/examples/.ipynb_checkpoints/data2vec_feature_reader-checkpoint.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ import logging
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from fairseq import tasks
13
+ from fairseq.checkpoint_utils import load_checkpoint_to_cpu
14
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
15
+ from omegaconf import OmegaConf
16
+
17
+ from data2vec_audio import Data2VecAudioModel
18
+
19
+ logger = logging.getLogger("dump_feature")
20
+
21
+
22
+ class Data2vecFeatureReader(object):
23
+ def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
24
+ state = load_checkpoint_to_cpu(ckpt_path)
25
+ cfg = state["cfg"]
26
+ # load task
27
+ task = tasks.setup_task(cfg.task, from_checkpoint=True)
28
+ task.load_state_dict(state["task_state"])
29
+ # load model config
30
+ if "layer_type" not in cfg.model:
31
+ # fix a missing key
32
+ model_config = {k: v for k, v in cfg.model.items()}
33
+ model_config["layer_type"] = "transformer"
34
+ model_config = OmegaConf.create(model_config)
35
+ else:
36
+ model_config = cfg.model
37
+
38
+ # fix param name in the state
39
+ state["model"]["final_proj.weight"] = state["model"].pop("final_proj.0.weight")
40
+ state["model"]["final_proj.bias"] = state["model"].pop("final_proj.0.bias")
41
+ del state["model"]["_ema"]
42
+
43
+ # load model
44
+ model = Data2VecAudioModel.build_model(model_config)
45
+ model.load_state_dict(
46
+ state["model"], strict=True, model_cfg=model_config
47
+ )
48
+
49
+ self.device = device
50
+ logger.info(f"device = {self.device}")
51
+
52
+ self.model = model.eval().to(self.device)
53
+ self.task = task
54
+ self.layer = layer - 1  # the layer argument is 1-based; store it as a 0-based index
55
+ self.max_chunk = max_chunk
56
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
57
+ logger.info(f" max_chunk = {self.max_chunk}")
58
+
59
+ def read_audio(self, path, ref_len=None):
60
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
61
+ if wav.ndim == 2:
62
+ wav = wav.mean(-1)
63
+ assert wav.ndim == 1, wav.ndim
64
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
65
+ logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
66
+ return wav
67
+
68
+ def get_feats(self, path, ref_len=None):
69
+ x = self.read_audio(path, ref_len=ref_len)
70
+ with torch.no_grad():
71
+ x = torch.from_numpy(x).float().to(self.device)
72
+ if self.task.cfg.normalize:
73
+ x = F.layer_norm(x, x.shape)
74
+ x = x.view(1, -1)
75
+
76
+ feat = []
77
+ for start in range(0, x.size(1), self.max_chunk):
78
+ x_chunk = x[:, start: start + self.max_chunk]
79
+ res = self.model.extract_features(
80
+ source=x_chunk,
81
+ padding_mask=None,
82
+ mask=False,
83
+ layer=self.layer,
84
+ )
85
+ feat_chunk = res["x"]
86
+ feat.append(feat_chunk)
87
+ return torch.cat(feat, 1).squeeze(0)
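For reference, a hedged sketch of how this reader is used elsewhere in this upload (the checkpoint path, layer, and chunk size match the notebooks; the audio path is a placeholder):

    from data2vec_feature_reader import Data2vecFeatureReader

    reader = Data2vecFeatureReader("./../../models/vox_pretrained.pt", 18, device="cuda:0", max_chunk=1600000)
    feat = reader.get_feats("/path/to/utterance.flac")  # placeholder path; yields a (T, 1024) tensor for the large model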
prompting/RepCodec/examples/.ipynb_checkpoints/dump_feature-checkpoint.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ import logging
9
+ import os
10
+ import sys
11
+
12
+ from feature_utils import get_path_iterator, dump_feature
13
+
14
+ logging.basicConfig(
15
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
16
+ datefmt="%Y-%m-%d %H:%M:%S",
17
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
18
+ stream=sys.stdout,
19
+ )
20
+ logger = logging.getLogger("dump_feature")
21
+
22
+
23
+ def main(
24
+ model_type: str,
25
+ tsv_path: str,
26
+ ckpt_path: str,
27
+ whisper_root: str,
28
+ whisper_name: str,
29
+ layer: int,
30
+ nshard: int,
31
+ rank: int,
32
+ feat_dir: str,
33
+ max_chunk: int,
34
+ use_cpu: bool = False
35
+ ):
36
+ device = "cpu" if use_cpu else "cuda"
37
+
38
+ # some checks
39
+ if model_type in ["hubert", "data2vec"]:
40
+ assert ckpt_path and os.path.exists(ckpt_path)
41
+ elif model_type in ["whisper"]:
42
+ assert whisper_name and whisper_root
43
+ else:
44
+ raise ValueError(f"Unsupported model type {model_type}")
45
+
46
+ reader = None
47
+ if model_type == "hubert":
48
+ from hubert_feature_reader import HubertFeatureReader
49
+ reader = HubertFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
50
+ elif model_type == "data2vec":
51
+ from data2vec_feature_reader import Data2vecFeatureReader
52
+ reader = Data2vecFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
53
+ elif model_type == "whisper":
54
+ from whisper_feature_reader import WhisperFeatureReader
55
+ reader = WhisperFeatureReader(whisper_root, whisper_name, layer, device=device)
56
+
57
+ assert reader is not None
58
+
59
+ generator, num = get_path_iterator(tsv_path, nshard, rank)
60
+ dump_feature(reader, generator, num, nshard, rank, feat_dir)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ import argparse
65
+
66
+ parser = argparse.ArgumentParser()
67
+ parser.add_argument(
68
+ "--model_type",
69
+ required=True,
70
+ type=str,
71
+ choices=["data2vec", "hubert", "whisper"],
72
+ help="the type of the speech encoder."
73
+ )
74
+ parser.add_argument(
75
+ "--tsv_path",
76
+ required=True,
77
+ type=str,
78
+ help="the path to the tsv file."
79
+ )
80
+ parser.add_argument(
81
+ "--ckpt_path",
82
+ required=False,
83
+ type=str,
84
+ default=None,
85
+ help="path to the speech model. must provide for HuBERT and data2vec"
86
+ )
87
+ parser.add_argument(
88
+ "--whisper_root",
89
+ required=False,
90
+ type=str,
91
+ default=None,
92
+ help="root dir to download/store whisper model. must provide for whisper model."
93
+ )
94
+ parser.add_argument(
95
+ "--whisper_name",
96
+ required=False,
97
+ type=str,
98
+ default=None,
99
+ help="name of whisper model. e.g., large-v2. must provide for whisper model."
100
+ )
101
+ parser.add_argument(
102
+ "--layer",
103
+ required=True,
104
+ type=int,
105
+ help="which layer of the model. this is 1-based."
106
+ )
107
+ parser.add_argument(
108
+ "--feat_dir",
109
+ required=True,
110
+ type=str,
111
+ help="the output dir to save the representations."
112
+ )
113
+ parser.add_argument(
114
+ "--nshard",
115
+ required=False,
116
+ type=int,
117
+ default=1,
118
+ help="total number of shards."
119
+ )
120
+ parser.add_argument(
121
+ "--rank",
122
+ required=False,
123
+ type=int,
124
+ default=0,
125
+ help="shard id of this process."
126
+ )
127
+ parser.add_argument(
128
+ "--max_chunk",
129
+ type=int,
130
+ default=1600000,
131
+ help="max number of frames of each batch."
132
+ )
133
+ parser.add_argument(
134
+ "--use_cpu",
135
+ default=False,
136
+ action="store_true",
137
+ help="whether use cpu instead of gpu."
138
+ )
139
+ args = parser.parse_args()
140
+ logger.info(args)
141
+
142
+ main(**vars(args))
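Since main() takes plain keyword arguments, the same feature dump can be driven from Python instead of the CLI; a hedged sketch with placeholder paths (only the checkpoint path and layer come from this upload):

    from dump_feature import main

    main(
        model_type="data2vec",
        tsv_path="/path/to/train.tsv",               # placeholder manifest
        ckpt_path="./../../models/vox_pretrained.pt",
        whisper_root=None,
        whisper_name=None,
        layer=18,                                    # 1-based layer index, as in the readers above
        nshard=1,
        rank=0,
        feat_dir="/path/to/features",                # placeholder output directory
        max_chunk=1600000,
        use_cpu=False,
    )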
prompting/RepCodec/examples/.ipynb_checkpoints/feature_utils-checkpoint.py ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ # ref: https://github.com/facebookresearch/fairseq/blob/main/examples/hubert/simple_kmeans/feature_utils.py
9
+
10
+ import logging
11
+ import os
12
+ import sys
13
+
14
+ import tqdm
15
+ from npy_append_array import NpyAppendArray
16
+
17
+
18
+ logging.basicConfig(
19
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
20
+ datefmt="%Y-%m-%d %H:%M:%S",
21
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
22
+ stream=sys.stdout,
23
+ )
24
+ logger = logging.getLogger("feature_utils")
25
+
26
+
27
+ def get_shard_range(tot, nshard, rank):
28
+ assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
29
+ start = round(tot / nshard * rank)
30
+ end = round(tot / nshard * (rank + 1))
31
+ assert start < end, f"start={start}, end={end}"
32
+ logger.info(
33
+ f"rank {rank} of {nshard}, process {end-start} "
34
+ f"({start}-{end}) out of {tot}"
35
+ )
36
+ return start, end
37
+
38
+
39
+ def get_path_iterator(tsv, nshard, rank):
40
+ with open(tsv, "r") as f:
41
+ root = f.readline().rstrip()
42
+ lines = [line.rstrip() for line in f]
43
+ start, end = get_shard_range(len(lines), nshard, rank)
44
+ lines = lines[start:end]
45
+ def iterate():
46
+ for line in lines:
47
+ subpath, nsample = line.split("\t")
48
+ yield f"{subpath}", int(nsample)
49
+ return iterate, len(lines)
50
+
51
+
52
+ def dump_feature(reader, generator, num, nshard, rank, feat_dir):
53
+ iterator = generator()
54
+
55
+ feat_path = f"{feat_dir}/{rank}_{nshard}.npy"
56
+ leng_path = f"{feat_dir}/{rank}_{nshard}.len"
57
+
58
+ os.makedirs(feat_dir, exist_ok=True)
59
+ if os.path.exists(feat_path):
60
+ os.remove(feat_path)
61
+
62
+ feat_f = NpyAppendArray(feat_path)
63
+ with open(leng_path, "w") as leng_f:
64
+ for path, nsample in tqdm.tqdm(iterator, total=num):
65
+ feat = reader.get_feats(path, nsample)
66
+ feat_f.append(feat.cpu().numpy())
67
+ leng_f.write(f"{len(feat)}\n")
68
+ logger.info("finished successfully")
69
+
70
+
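get_path_iterator reads a fairseq-style manifest: the first line is a root directory, and every following line is a tab-separated subpath and sample count. Note that, unlike the notebook copies of this helper elsewhere in this upload, this version yields the subpath without prepending the root, so the listed paths must resolve on their own. A hedged illustration of the layout (paths and counts are placeholders only):

    /data/LibriSpeech/train-clean-100
    19/198/19-198-0000.flac	236160
    19/198/19-198-0001.flac	161280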
prompting/RepCodec/examples/.ipynb_checkpoints/some_run-checkpoint.py ADDED
@@ -0,0 +1,66 @@
1
+ from datasets import load_dataset
2
+ from tqdm import tqdm
3
+ import pandas as pd
4
+
5
+ cache_dir = "./../../../cache"
6
+
7
+ dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)
8
+
9
+ from repcodec.RepCodec import RepCodec
10
+ import torch
11
+ import yaml
12
+
13
+ config = "./../repcodec/configs/repcodec_dim1024.yaml"
14
+ with open(config) as fp:
15
+ conf = yaml.load(fp, Loader=yaml.FullLoader)
16
+
17
+ model = RepCodec(**conf)
18
+ model.load_state_dict(torch.load("./../../models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"])
19
+ model.quantizer.initial()
20
+ model.eval()
21
+ model.to("cuda:0")
22
+
23
+ from data2vec_feature_reader import Data2vecFeatureReader
24
+
25
+ reader = Data2vecFeatureReader("./../../models/vox_pretrained.pt", 18, device="cuda:0", max_chunk=1600000)
26
+
27
+ import torch.nn.functional as F
28
+ import numpy as np
29
+
30
+ for split in dataset.keys():
31
+
32
+ tokens = []
33
+
34
+ for idx in tqdm(range(len(dataset[split]))):
35
+
36
+ sample = dataset[split][idx]
37
+
38
+ x = sample["audio"]["array"]
39
+
40
+ with torch.no_grad():
41
+ x = torch.from_numpy(x).float().to(reader.device)
42
+ if reader.task.cfg.normalize:
43
+ x = F.layer_norm(x, x.shape)
44
+ x = x.view(1, -1)
45
+
46
+ feat = []
47
+ for start in range(0, x.size(1), reader.max_chunk):
48
+ x_chunk = x[:, start: start + reader.max_chunk]
49
+ res = reader.model.extract_features(
50
+ source=x_chunk,
51
+ padding_mask=None,
52
+ mask=False,
53
+ layer=reader.layer,
54
+ )
55
+ feat_chunk = res["x"]
56
+ feat.append(feat_chunk)
57
+
58
+ features = torch.cat(feat, 1).permute(0, 2, 1)
59
+
60
+ x = model.encoder(features)
61
+ z = model.projector(x)
62
+ _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
63
+ tkn = idx.detach().cpu().data.numpy()[0]
64
+
65
+ tokens.append(tkn)
66
+ np.savez(f"./tkns/{split}.npz", *tokens)
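Because np.savez receives the token arrays positionally, each utterance is stored under the default keys arr_0, arr_1, ...; a minimal sketch of reading one split back in its original order (the file name matches the tkns/ files included in this upload):

    import numpy as np

    data = np.load("./tkns/train.clean.100.npz")
    tokens = [data[f"arr_{i}"] for i in range(len(data.files))]  # one token array per utterance
    print(len(tokens), tokens[0][:10])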
prompting/RepCodec/examples/Untitled.ipynb ADDED
@@ -0,0 +1,214 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "72bf1b45-66fd-450d-8d5c-bec9e0b3d08f",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from data2vec_feature_reader import Data2vecFeatureReader\n",
11
+ "\n",
12
+ "reader = Data2vecFeatureReader(\"./../../models/vox_pretrained.pt\", 18, device=\"cuda:0\", max_chunk=1600000)"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 2,
18
+ "id": "84a9d238-048a-4772-a47b-5aadc50f36df",
19
+ "metadata": {},
20
+ "outputs": [
21
+ {
22
+ "data": {
23
+ "application/vnd.jupyter.widget-view+json": {
24
+ "model_id": "fb01bc434d964db08fde7f9f2c90ea3c",
25
+ "version_major": 2,
26
+ "version_minor": 0
27
+ },
28
+ "text/plain": [
29
+ "Loading dataset shards: 0%| | 0/45 [00:00<?, ?it/s]"
30
+ ]
31
+ },
32
+ "metadata": {},
33
+ "output_type": "display_data"
34
+ },
35
+ {
36
+ "data": {
37
+ "application/vnd.jupyter.widget-view+json": {
38
+ "model_id": "d4adc62013644ed0b16056aa217448a9",
39
+ "version_major": 2,
40
+ "version_minor": 0
41
+ },
42
+ "text/plain": [
43
+ "Loading dataset shards: 0%| | 0/60 [00:00<?, ?it/s]"
44
+ ]
45
+ },
46
+ "metadata": {},
47
+ "output_type": "display_data"
48
+ }
49
+ ],
50
+ "source": [
51
+ "from datasets import load_dataset\n",
52
+ "from tqdm import tqdm\n",
53
+ "import pandas as pd\n",
54
+ "\n",
55
+ "cache_dir = \"./../../../cache\"\n",
56
+ "\n",
57
+ "dataset = load_dataset(\"openslr/librispeech_asr\", cache_dir=cache_dir, trust_remote_code=True)"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 5,
63
+ "id": "cffd49ca-3524-4ac4-8ba5-bc4fcc9e0f53",
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "ename": "ImportError",
68
+ "evalue": "attempted relative import with no known parent package",
69
+ "output_type": "error",
70
+ "traceback": [
71
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
72
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
73
+ "Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mRepCodec\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RepCodec\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01myaml\u001b[39;00m\n",
74
+ "\u001b[0;31mImportError\u001b[0m: attempted relative import with no known parent package"
75
+ ]
76
+ }
77
+ ],
78
+ "source": [
79
+ "from .RepCodec import RepCodec\n",
80
+ "import torch\n",
81
+ "import yaml\n",
82
+ "\n",
83
+ "config = \"./../repcodec/configs/repcodec_dim1024.yaml\"\n",
84
+ "with open(config) as fp:\n",
85
+ " conf = yaml.load(fp, Loader=yaml.FullLoader)\n",
86
+ "\n",
87
+ "model = RepCodec(**conf)\n",
88
+ "model.load_state_dict(torch.load(\"./../../models/data2vec_large_l18.pkl\", map_location=\"cuda:0\")[\"model\"][\"repcodec\"])\n",
89
+ "model.quantizer.initial()\n",
90
+ "model.eval()\n",
91
+ "model.to(\"cuda:0\")"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "a9a1731e-052c-4af0-a29c-b171a988b300",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "import torch.nn.functional as F\n",
102
+ "\n",
103
+ "sample = dataset[\"train.clean.100\"][1]\n",
104
+ "\n",
105
+ "x = sample[\"audio\"][\"array\"]\n",
106
+ "\n",
107
+ "with torch.no_grad():\n",
108
+ " x = torch.from_numpy(x).float().to(reader.device)\n",
109
+ " if reader.task.cfg.normalize:\n",
110
+ " x = F.layer_norm(x, x.shape)\n",
111
+ " x = x.view(1, -1)\n",
112
+ "\n",
113
+ " feat = []\n",
114
+ " for start in range(0, x.size(1), reader.max_chunk):\n",
115
+ " x_chunk = x[:, start: start + reader.max_chunk]\n",
116
+ " res = reader.model.extract_features(\n",
117
+ " source=x_chunk,\n",
118
+ " padding_mask=None,\n",
119
+ " mask=False,\n",
120
+ " layer=reader.layer,\n",
121
+ " )\n",
122
+ " feat_chunk = res[\"x\"]\n",
123
+ " feat.append(feat_chunk)\n",
124
+ " \n",
125
+ " features = torch.cat(feat, 1).permute(0, 2, 1)\n",
126
+ "\n",
127
+ " x = model.encoder(features)\n",
128
+ " z = model.projector(x)\n",
129
+ " _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))\n",
130
+ " tokens = idx.cpu().data.numpy().tolist()[0]"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "1810e6dc-2ece-4aca-a29a-e1933b8ce82a",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "import logging\n",
141
+ "import os\n",
142
+ "import sys\n",
143
+ "\n",
144
+ "import tqdm\n",
145
+ "from npy_append_array import NpyAppendArray\n",
146
+ "\n",
147
+ "def get_shard_range(tot, nshard, rank):\n",
148
+ " assert rank < nshard and rank >= 0, f\"invaid rank/nshard {rank}/{nshard}\"\n",
149
+ " start = round(tot / nshard * rank)\n",
150
+ " end = round(tot / nshard * (rank + 1))\n",
151
+ " assert start < end, f\"start={start}, end={end}\"\n",
152
+ " logger.info(\n",
153
+ " f\"rank {rank} of {nshard}, process {end-start} \"\n",
154
+ " f\"({start}-{end}) out of {tot}\"\n",
155
+ " )\n",
156
+ " return start, end\n",
157
+ "\n",
158
+ "def get_path_iterator(tsv, nshard, rank):\n",
159
+ " with open(tsv, \"r\") as f:\n",
160
+ " root = f.readline().rstrip()\n",
161
+ " lines = [line.rstrip() for line in f]\n",
162
+ " start, end = get_shard_range(len(lines), nshard, rank)\n",
163
+ " lines = lines[start:end]\n",
164
+ " def iterate():\n",
165
+ " for line in lines:\n",
166
+ " subpath, nsample = line.split(\"\\t\")\n",
167
+ " yield f\"{root}/{subpath}\", int(nsample)\n",
168
+ " return iterate, len(lines)\n",
169
+ "\n",
170
+ "def dump_feature(reader, generator, num, nshard, rank, feat_dir):\n",
171
+ " iterator = generator()\n",
172
+ "\n",
173
+ " feat_path = f\"{feat_dir}/{rank}_{nshard}.npy\"\n",
174
+ " leng_path = f\"{feat_dir}/{rank}_{nshard}.len\"\n",
175
+ "\n",
176
+ " os.makedirs(feat_dir, exist_ok=True)\n",
177
+ " if os.path.exists(feat_path):\n",
178
+ " os.remove(feat_path)\n",
179
+ "\n",
180
+ " feat_f = NpyAppendArray(feat_path)\n",
181
+ " with open(leng_path, \"w\") as leng_f:\n",
182
+ " for path, nsample in tqdm.tqdm(iterator, total=num):\n",
183
+ " feat = reader.get_feats(path, nsample)\n",
184
+ " feat_f.append(feat.cpu().numpy())\n",
185
+ " leng_f.write(f\"{len(feat)}\\n\")\n",
186
+ " logger.info(\"finished successfully\")\n",
187
+ "\n",
188
+ "generator, num = get_path_iterator(tsv_path, nshard, rank)\n",
189
+ "dump_feature(reader, generator, num, nshard, rank, feat_dir)"
190
+ ]
191
+ }
192
+ ],
193
+ "metadata": {
194
+ "kernelspec": {
195
+ "display_name": "Python 3 (ipykernel)",
196
+ "language": "python",
197
+ "name": "python3"
198
+ },
199
+ "language_info": {
200
+ "codemirror_mode": {
201
+ "name": "ipython",
202
+ "version": 3
203
+ },
204
+ "file_extension": ".py",
205
+ "mimetype": "text/x-python",
206
+ "name": "python",
207
+ "nbconvert_exporter": "python",
208
+ "pygments_lexer": "ipython3",
209
+ "version": "3.8.10"
210
+ }
211
+ },
212
+ "nbformat": 4,
213
+ "nbformat_minor": 5
214
+ }
prompting/RepCodec/examples/__pycache__/data2vec_audio.cpython-38.pyc ADDED
Binary file (12.2 kB).
 
prompting/RepCodec/examples/__pycache__/data2vec_feature_reader.cpython-38.pyc ADDED
Binary file (3 kB).
 
prompting/RepCodec/examples/__pycache__/feature_utils.cpython-38.pyc ADDED
Binary file (2.23 kB).
 
prompting/RepCodec/examples/__pycache__/hubert_feature_reader.cpython-38.pyc ADDED
Binary file (2.22 kB).
 
prompting/RepCodec/examples/__pycache__/tokenize.cpython-38.pyc ADDED
Binary file (1.89 kB).
 
prompting/RepCodec/examples/data2vec_audio.py ADDED
@@ -0,0 +1,541 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ # ref: https://github.com/facebookresearch/fairseq/blob/main/examples/data2vec/models/data2vec_audio.py
9
+
10
+ import logging
11
+ import math
12
+ from dataclasses import dataclass, field
13
+ from typing import Optional
14
+
15
+ from omegaconf import II
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ import torch.distributed as dist
21
+
22
+ from fairseq.modules import EMAModule, EMAModuleConfig
23
+ from fairseq.data.data_utils import compute_mask_indices
24
+ from fairseq.models import BaseFairseqModel, register_model
25
+ from fairseq.models.wav2vec import (
26
+ ConvFeatureExtractionModel,
27
+ Wav2Vec2Config,
28
+ TransformerEncoder,
29
+ )
30
+ from fairseq.modules import (
31
+ GradMultiply,
32
+ LayerNorm,
33
+ )
34
+ from fairseq.utils import index_put
35
+
36
+
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ @dataclass
41
+ class Data2VecAudioConfig(Wav2Vec2Config):
42
+
43
+ loss_beta: float = field(
44
+ default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"}
45
+ )
46
+ loss_scale: Optional[float] = field(
47
+ default=None,
48
+ metadata={
49
+ "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)"
50
+ },
51
+ )
52
+ average_top_k_layers: int = field(
53
+ default=8, metadata={"help": "how many layers to average"}
54
+ )
55
+
56
+ layer_norm_target_layer: bool = False
57
+ instance_norm_target_layer: bool = False
58
+ instance_norm_targets: bool = False
59
+ layer_norm_targets: bool = False
60
+ batch_norm_target_layer: bool = False
61
+ group_norm_target_layer: bool = False
62
+
63
+ ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"})
64
+ ema_end_decay: float = field(
65
+ default=0.9999, metadata={"help": "final ema decay rate"}
66
+ )
67
+
68
+ # when to finish annealing ema decay rate
69
+ ema_anneal_end_step: int = II("optimization.max_update")
70
+
71
+ ema_transformer_only: bool = field(
72
+ default=True,
73
+ metadata={"help": "whether to momentum update only the transformer"},
74
+ )
75
+ ema_layers_only: bool = field(
76
+ default=True,
77
+ metadata={"help": "whether to momentum update only the transformer layers"},
78
+ )
79
+
80
+ max_update: int = II("optimization.max_update")
81
+
82
+ min_target_var: float = field(
83
+ default=0.1, metadata={"help": "stop training if target var falls below this"}
84
+ )
85
+ min_pred_var: float = field(
86
+ default=0.01,
87
+ metadata={"help": "stop training if prediction var falls below this"},
88
+ )
89
+
90
+
91
+ def get_annealed_rate(start, end, curr_step, total_steps):
92
+ r = end - start
93
+ pct_remaining = 1 - curr_step / total_steps
94
+ return end - r * pct_remaining
95
+
96
+
97
+ @register_model("data2vec_audio", dataclass=Data2VecAudioConfig)
98
+ class Data2VecAudioModel(BaseFairseqModel):
99
+ def __init__(self, cfg: Data2VecAudioConfig):
100
+ super().__init__()
101
+ self.cfg = cfg
102
+
103
+ feature_enc_layers = eval(cfg.conv_feature_layers)
104
+ self.extractor_embed = feature_enc_layers[-1][0]
105
+
106
+ self.ema = None
107
+ self.embed = cfg.encoder_embed_dim
108
+
109
+ self.average_top_k_layers = cfg.average_top_k_layers
110
+ self.loss_beta = cfg.loss_beta
111
+ self.loss_scale = cfg.loss_scale
112
+
113
+ self.feature_extractor = ConvFeatureExtractionModel(
114
+ conv_layers=feature_enc_layers,
115
+ dropout=0.0,
116
+ mode=cfg.extractor_mode,
117
+ conv_bias=cfg.conv_bias,
118
+ )
119
+
120
+ self.post_extract_proj = nn.Linear(self.extractor_embed, cfg.encoder_embed_dim)
121
+
122
+ self.mask_prob = cfg.mask_prob
123
+ self.mask_selection = cfg.mask_selection
124
+ self.mask_other = cfg.mask_other
125
+ self.mask_length = cfg.mask_length
126
+ self.no_mask_overlap = cfg.no_mask_overlap
127
+ self.mask_min_space = cfg.mask_min_space
128
+
129
+ self.mask_channel_prob = cfg.mask_channel_prob
130
+ self.mask_channel_before = cfg.mask_channel_before
131
+ self.mask_channel_selection = cfg.mask_channel_selection
132
+ self.mask_channel_other = cfg.mask_channel_other
133
+ self.mask_channel_length = cfg.mask_channel_length
134
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
135
+ self.mask_channel_min_space = cfg.mask_channel_min_space
136
+
137
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
138
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
139
+
140
+ self.feature_grad_mult = cfg.feature_grad_mult
141
+
142
+ self.mask_emb = nn.Parameter(
143
+ torch.FloatTensor(cfg.encoder_embed_dim).uniform_()
144
+ )
145
+
146
+ self.encoder = TransformerEncoder(cfg)
147
+ self.layer_norm = LayerNorm(self.extractor_embed)
148
+
149
+ self.final_proj = nn.Linear(self.embed, self.embed)
150
+
151
+ self.num_updates = 0
152
+
153
+ def make_ema_teacher(self):
154
+ ema_config = EMAModuleConfig(
155
+ ema_decay=self.cfg.ema_decay,
156
+ ema_fp32=True,
157
+ )
158
+ skip_keys = set()
159
+ if self.cfg.ema_layers_only:
160
+ self.cfg.ema_transformer_only = True
161
+ for k, _ in self.encoder.pos_conv.named_parameters():
162
+ skip_keys.add(f"pos_conv.{k}")
163
+
164
+ self.ema = EMAModule(
165
+ self.encoder if self.cfg.ema_transformer_only else self,
166
+ ema_config,
167
+ skip_keys=skip_keys,
168
+ )
169
+
170
+ def set_num_updates(self, num_updates):
171
+ super().set_num_updates(num_updates)
172
+
173
+ if self.ema is None and self.final_proj is not None:
174
+ logger.info("making ema teacher")
175
+ self.make_ema_teacher()
176
+ elif self.training and self.ema is not None:
177
+ if self.cfg.ema_decay != self.cfg.ema_end_decay:
178
+ if num_updates >= self.cfg.ema_anneal_end_step:
179
+ decay = self.cfg.ema_end_decay
180
+ else:
181
+ decay = get_annealed_rate(
182
+ self.cfg.ema_decay,
183
+ self.cfg.ema_end_decay,
184
+ num_updates,
185
+ self.cfg.ema_anneal_end_step,
186
+ )
187
+ self.ema.set_decay(decay)
188
+ if self.ema.get_decay() < 1:
189
+ self.ema.step(self.encoder if self.cfg.ema_transformer_only else self)
190
+
191
+ self.num_updates = num_updates
192
+
193
+ def state_dict(self, destination=None, prefix="", keep_vars=False):
194
+ state = super().state_dict(destination, prefix, keep_vars)
195
+
196
+ if self.ema is not None:
197
+ state[prefix + "_ema"] = self.ema.fp32_params
198
+
199
+ return state
200
+
201
+ def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
202
+ if self.ema is not None:
203
+ k = prefix + "_ema"
204
+ assert k in state_dict
205
+ self.ema.restore(state_dict[k], True)
206
+ del state_dict[k]
207
+ return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
208
+
209
+ @classmethod
210
+ def build_model(cls, cfg: Data2VecAudioConfig, task=None):
211
+ """Build a new model instance."""
212
+
213
+ return cls(cfg)
214
+
215
+ def apply_mask(
216
+ self,
217
+ x,
218
+ padding_mask,
219
+ mask_indices=None,
220
+ mask_channel_indices=None,
221
+ ):
222
+ B, T, C = x.shape
223
+
224
+ if self.mask_channel_prob > 0 and self.mask_channel_before:
225
+ mask_channel_indices = compute_mask_indices(
226
+ (B, C),
227
+ None,
228
+ self.mask_channel_prob,
229
+ self.mask_channel_length,
230
+ self.mask_channel_selection,
231
+ self.mask_channel_other,
232
+ no_overlap=self.no_mask_channel_overlap,
233
+ min_space=self.mask_channel_min_space,
234
+ )
235
+ mask_channel_indices = (
236
+ torch.from_numpy(mask_channel_indices)
237
+ .to(x.device)
238
+ .unsqueeze(1)
239
+ .expand(-1, T, -1)
240
+ )
241
+ x[mask_channel_indices] = 0
242
+
243
+ if self.mask_prob > 0:
244
+ if mask_indices is None:
245
+ mask_indices = compute_mask_indices(
246
+ (B, T),
247
+ padding_mask,
248
+ self.mask_prob,
249
+ self.mask_length,
250
+ self.mask_selection,
251
+ self.mask_other,
252
+ min_masks=1,
253
+ no_overlap=self.no_mask_overlap,
254
+ min_space=self.mask_min_space,
255
+ require_same_masks=self.cfg.require_same_masks,
256
+ mask_dropout=self.cfg.mask_dropout,
257
+ )
258
+ mask_indices = torch.from_numpy(mask_indices).to(x.device)
259
+ x = index_put(x, mask_indices, self.mask_emb)
260
+ else:
261
+ mask_indices = None
262
+
263
+ if self.mask_channel_prob > 0 and not self.mask_channel_before:
264
+ if mask_channel_indices is None:
265
+ mask_channel_indices = compute_mask_indices(
266
+ (B, C),
267
+ None,
268
+ self.mask_channel_prob,
269
+ self.mask_channel_length,
270
+ self.mask_channel_selection,
271
+ self.mask_channel_other,
272
+ no_overlap=self.no_mask_channel_overlap,
273
+ min_space=self.mask_channel_min_space,
274
+ )
275
+ mask_channel_indices = (
276
+ torch.from_numpy(mask_channel_indices)
277
+ .to(x.device)
278
+ .unsqueeze(1)
279
+ .expand(-1, T, -1)
280
+ )
281
+ x = index_put(x, mask_channel_indices, 0)
282
+
283
+ return x, mask_indices
284
+
285
+ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
286
+ """
287
+ Computes the output length of the convolutional layers
288
+ """
289
+
290
+ def _conv_out_length(input_length, kernel_size, stride):
291
+ return torch.floor((input_length - kernel_size) / stride + 1)
292
+
293
+ conv_cfg_list = eval(self.cfg.conv_feature_layers)
294
+
295
+ for i in range(len(conv_cfg_list)):
296
+ input_lengths = _conv_out_length(
297
+ input_lengths, conv_cfg_list[i][1], conv_cfg_list[i][2]
298
+ )
299
+
300
+ return input_lengths.to(torch.long)
301
+
302
+ def forward(
303
+ self,
304
+ source,
305
+ padding_mask=None,
306
+ mask=True,
307
+ features_only=False,
308
+ layer=None,
309
+ mask_indices=None,
310
+ mask_channel_indices=None,
311
+ padding_count=None,
312
+ ):
313
+ features = source
314
+
315
+ if self.feature_grad_mult > 0:
316
+ features = self.feature_extractor(features)
317
+ if self.feature_grad_mult != 1.0:
318
+ features = GradMultiply.apply(features, self.feature_grad_mult)
319
+ else:
320
+ with torch.no_grad():
321
+ features = self.feature_extractor(features)
322
+
323
+ features = features.transpose(1, 2)
324
+
325
+ features = self.layer_norm(features)
326
+
327
+ orig_padding_mask = padding_mask
328
+
329
+ if padding_mask is not None and padding_mask.any():
330
+ input_lengths = (1 - padding_mask.long()).sum(-1)
331
+ # apply conv formula to get real output_lengths
332
+ output_lengths = self._get_feat_extract_output_lengths(input_lengths)
333
+
334
+ padding_mask = torch.zeros(
335
+ features.shape[:2], dtype=features.dtype, device=features.device
336
+ )
337
+
338
+ # these two operations make sure that all values
339
+ # before the output lengths indices are attended to
340
+ padding_mask[
341
+ (
342
+ torch.arange(padding_mask.shape[0], device=padding_mask.device),
343
+ output_lengths - 1,
344
+ )
345
+ ] = 1
346
+ padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool()
347
+ else:
348
+ padding_mask = None
349
+
350
+ if self.post_extract_proj is not None:
351
+ features = self.post_extract_proj(features)
352
+
353
+ pre_encoder_features = None
354
+ if self.cfg.ema_transformer_only:
355
+ pre_encoder_features = features.clone()
356
+
357
+ features = self.dropout_input(features)
358
+
359
+ if mask:
360
+ x, mask_indices = self.apply_mask(
361
+ features,
362
+ padding_mask,
363
+ mask_indices=mask_indices,
364
+ mask_channel_indices=mask_channel_indices,
365
+ )
366
+ else:
367
+ x = features
368
+ mask_indices = None
369
+
370
+ x, layer_results = self.encoder(
371
+ x,
372
+ padding_mask=padding_mask,
373
+ layer=layer,
374
+ )
375
+
376
+ if features_only:
377
+ return {
378
+ "x": x,
379
+ "padding_mask": padding_mask,
380
+ "layer_results": layer_results,
381
+ }
382
+
383
+ result = {
384
+ "losses": {},
385
+ }
386
+
387
+ with torch.no_grad():
388
+ self.ema.model.eval()
389
+
390
+ if self.cfg.ema_transformer_only:
391
+ y, layer_results = self.ema.model.extract_features(
392
+ pre_encoder_features,
393
+ padding_mask=padding_mask,
394
+ min_layer=self.cfg.encoder_layers - self.average_top_k_layers,
395
+ )
396
+ y = {
397
+ "x": y,
398
+ "padding_mask": padding_mask,
399
+ "layer_results": layer_results,
400
+ }
401
+ else:
402
+ y = self.ema.model.extract_features(
403
+ source=source,
404
+ padding_mask=orig_padding_mask,
405
+ mask=False,
406
+ )
407
+
408
+ target_layer_results = [l[2] for l in y["layer_results"]]
409
+
410
+ permuted = False
411
+ if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer:
412
+ target_layer_results = [
413
+ tl.permute(1, 2, 0) for tl in target_layer_results # TBC -> BCT
414
+ ]
415
+ permuted = True
416
+
417
+ if self.cfg.batch_norm_target_layer:
418
+ target_layer_results = [
419
+ F.batch_norm(
420
+ tl.float(), running_mean=None, running_var=None, training=True
421
+ )
422
+ for tl in target_layer_results
423
+ ]
424
+
425
+ if self.cfg.instance_norm_target_layer:
426
+ target_layer_results = [
427
+ F.instance_norm(tl.float()) for tl in target_layer_results
428
+ ]
429
+
430
+ if permuted:
431
+ target_layer_results = [
432
+ tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC
433
+ ]
434
+
435
+ if self.cfg.group_norm_target_layer:
436
+ target_layer_results = [
437
+ F.layer_norm(tl.float(), tl.shape[-2:])
438
+ for tl in target_layer_results
439
+ ]
440
+
441
+ if self.cfg.layer_norm_target_layer:
442
+ target_layer_results = [
443
+ F.layer_norm(tl.float(), tl.shape[-1:])
444
+ for tl in target_layer_results
445
+ ]
446
+
447
+ y = sum(target_layer_results) / len(target_layer_results)
448
+
449
+ if self.cfg.layer_norm_targets:
450
+ y = F.layer_norm(y.float(), y.shape[-1:])
451
+
452
+ if self.cfg.instance_norm_targets:
453
+ y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2)
454
+
455
+ if not permuted:
456
+ y = y.transpose(0, 1)
457
+
458
+ y = y[mask_indices]
459
+
460
+ x = x[mask_indices]
461
+ x = self.final_proj(x)
462
+
463
+ sz = x.size(-1)
464
+
465
+ if self.loss_beta == 0:
466
+ loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1)
467
+ else:
468
+ loss = F.smooth_l1_loss(
469
+ x.float(), y.float(), reduction="none", beta=self.loss_beta
470
+ ).sum(dim=-1)
471
+
472
+ if self.loss_scale is not None:
473
+ scale = self.loss_scale
474
+ else:
475
+ scale = 1 / math.sqrt(sz)
476
+
477
+ result["losses"]["regression"] = loss.sum() * scale
478
+
479
+ if "sample_size" not in result:
480
+ result["sample_size"] = loss.numel()
481
+
482
+ with torch.no_grad():
483
+ result["target_var"] = self.compute_var(y)
484
+ result["pred_var"] = self.compute_var(x.float())
485
+
486
+ if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var:
487
+ logger.error(
488
+ f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
489
+ )
490
+ raise Exception(
491
+ f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting"
492
+ )
493
+ if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var:
494
+ logger.error(
495
+ f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
496
+ )
497
+ raise Exception(
498
+ f"pred var is {result['pred_var'].item()} < {self.cfg.min_pred_var}, exiting"
499
+ )
500
+
501
+ if self.ema is not None:
502
+ result["ema_decay"] = self.ema.get_decay() * 1000
503
+
504
+ return result
505
+
506
+ @staticmethod
507
+ def compute_var(y):
508
+ y = y.view(-1, y.size(-1))
509
+ if dist.is_initialized():
510
+ zc = torch.tensor(y.size(0)).cuda()
511
+ zs = y.sum(dim=0)
512
+ zss = (y ** 2).sum(dim=0)
513
+
514
+ dist.all_reduce(zc)
515
+ dist.all_reduce(zs)
516
+ dist.all_reduce(zss)
517
+
518
+ var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
519
+ return torch.sqrt(var + 1e-6).mean()
520
+ else:
521
+ return torch.sqrt(y.var(dim=0) + 1e-6).mean()
522
+
523
+ def extract_features(
524
+ self, source, padding_mask, mask=False, layer=None
525
+ ):
526
+ res = self.forward(
527
+ source,
528
+ padding_mask,
529
+ mask=mask,
530
+ features_only=True,
531
+ layer=layer,
532
+ )
533
+ return res
534
+
535
+ def remove_pretraining_modules(self, last_layer=None):
536
+ self.final_proj = None
537
+ self.ema = None
538
+ if last_layer is not None:
539
+ self.encoder.layers = nn.ModuleList(
540
+ l for i, l in enumerate(self.encoder.layers) if i <= last_layer
541
+ )
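
The teacher that produces the regression targets is an EMA copy of the encoder whose decay is annealed linearly from ema_decay to ema_end_decay over ema_anneal_end_step updates (see set_num_updates above). An illustrative check of get_annealed_rate with the default decay values; the 100k-step horizon is made up:

# Copy of the helper defined above, exercised with the default decay values.
def get_annealed_rate(start, end, curr_step, total_steps):
    r = end - start
    pct_remaining = 1 - curr_step / total_steps
    return end - r * pct_remaining

for step in (0, 50_000, 100_000):
    print(step, get_annealed_rate(0.999, 0.9999, step, 100_000))
# 0 -> 0.999, 50000 -> 0.99945, 100000 -> 0.9999
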
prompting/RepCodec/examples/data2vec_feature_reader.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ import logging
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from fairseq import tasks
13
+ from fairseq.checkpoint_utils import load_checkpoint_to_cpu
14
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
15
+ from omegaconf import OmegaConf
16
+
17
+ from data2vec_audio import Data2VecAudioModel
18
+
19
+ logger = logging.getLogger("dump_feature")
20
+
21
+
22
+ class Data2vecFeatureReader(object):
23
+ def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
24
+ state = load_checkpoint_to_cpu(ckpt_path)
25
+ cfg = state["cfg"]
26
+ # load task
27
+ task = tasks.setup_task(cfg.task, from_checkpoint=True)
28
+ task.load_state_dict(state["task_state"])
29
+ # load model config
30
+ if "layer_type" not in cfg.model:
31
+ # fix a missing key
32
+ model_config = {k: v for k, v in cfg.model.items()}
33
+ model_config["layer_type"] = "transformer"
34
+ model_config = OmegaConf.create(model_config)
35
+ else:
36
+ model_config = cfg.model
37
+
38
+ # fix param name in the state
39
+ state["model"]["final_proj.weight"] = state["model"].pop("final_proj.0.weight")
40
+ state["model"]["final_proj.bias"] = state["model"].pop("final_proj.0.bias")
41
+ del state["model"]["_ema"]
42
+
43
+ # load model
44
+ model = Data2VecAudioModel.build_model(model_config)
45
+ model.load_state_dict(
46
+ state["model"], strict=True, model_cfg=model_config
47
+ )
48
+
49
+ self.device = device
50
+ logger.info(f"device = {self.device}")
51
+
52
+ self.model = model.eval().to(self.device)
53
+ self.task = task
54
+ self.layer = layer - 1  # convert the 1-based layer argument to a 0-based index
55
+ self.max_chunk = max_chunk
56
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
57
+ logger.info(f" max_chunk = {self.max_chunk}")
58
+
59
+ def read_audio(self, path, ref_len=None):
60
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
61
+ if wav.ndim == 2:
62
+ wav = wav.mean(-1)
63
+ assert wav.ndim == 1, wav.ndim
64
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
65
+ logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
66
+ return wav
67
+
68
+ def get_feats(self, path, ref_len=None):
69
+ x = self.read_audio(path, ref_len=ref_len)
70
+ with torch.no_grad():
71
+ x = torch.from_numpy(x).float().to(self.device)
72
+ if self.task.cfg.normalize:
73
+ x = F.layer_norm(x, x.shape)
74
+ x = x.view(1, -1)
75
+
76
+ feat = []
77
+ for start in range(0, x.size(1), self.max_chunk):
78
+ x_chunk = x[:, start: start + self.max_chunk]
79
+ res = self.model.extract_features(
80
+ source=x_chunk,
81
+ padding_mask=None,
82
+ mask=False,
83
+ layer=self.layer,
84
+ )
85
+ feat_chunk = res["x"]
86
+ feat.append(feat_chunk)
87
+ return torch.cat(feat, 1).squeeze(0)
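
A minimal usage sketch, assuming a local copy of the data2vec vox_pretrained.pt checkpoint; the paths are placeholders, and some_run.py below uses the same reader with layer 18, which is what the data2vec_large_l18 RepCodec tokenizer expects:

from data2vec_feature_reader import Data2vecFeatureReader

# Placeholder paths; layer is given 1-based and shifted to 0-based internally.
reader = Data2vecFeatureReader("/path/to/vox_pretrained.pt", layer=18,
                               device="cuda:0", max_chunk=1600000)
feat = reader.get_feats("/path/to/utterance.flac")  # torch.Tensor of shape (T, D)
print(feat.shape)
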
prompting/RepCodec/examples/dump_feature.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ import logging
9
+ import os
10
+ import sys
11
+
12
+ from feature_utils import get_path_iterator, dump_feature
13
+
14
+ logging.basicConfig(
15
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
16
+ datefmt="%Y-%m-%d %H:%M:%S",
17
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
18
+ stream=sys.stdout,
19
+ )
20
+ logger = logging.getLogger("dump_feature")
21
+
22
+
23
+ def main(
24
+ model_type: str,
25
+ tsv_path: str,
26
+ ckpt_path: str,
27
+ whisper_root: str,
28
+ whisper_name: str,
29
+ layer: int,
30
+ nshard: int,
31
+ rank: int,
32
+ feat_dir: str,
33
+ max_chunk: int,
34
+ use_cpu: bool = False
35
+ ):
36
+ device = "cpu" if use_cpu else "cuda"
37
+
38
+ # some checks
39
+ if model_type in ["hubert", "data2vec"]:
40
+ assert ckpt_path and os.path.exists(ckpt_path)
41
+ elif model_type in ["whisper"]:
42
+ assert whisper_name and whisper_root
43
+ else:
44
+ raise ValueError(f"Unsupported model type {model_type}")
45
+
46
+ reader = None
47
+ if model_type == "hubert":
48
+ from hubert_feature_reader import HubertFeatureReader
49
+ reader = HubertFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
50
+ elif model_type == "data2vec":
51
+ from data2vec_feature_reader import Data2vecFeatureReader
52
+ reader = Data2vecFeatureReader(ckpt_path, layer, device=device, max_chunk=max_chunk)
53
+ elif model_type == "whisper":
54
+ from whisper_feature_reader import WhisperFeatureReader
55
+ reader = WhisperFeatureReader(whisper_root, whisper_name, layer, device=device)
56
+
57
+ assert reader is not None
58
+
59
+ generator, num = get_path_iterator(tsv_path, nshard, rank)
60
+ dump_feature(reader, generator, num, nshard, rank, feat_dir)
61
+
62
+
63
+ if __name__ == "__main__":
64
+ import argparse
65
+
66
+ parser = argparse.ArgumentParser()
67
+ parser.add_argument(
68
+ "--model_type",
69
+ required=True,
70
+ type=str,
71
+ choices=["data2vec", "hubert", "whisper"],
72
+ help="the type of the speech encoder."
73
+ )
74
+ parser.add_argument(
75
+ "--tsv_path",
76
+ required=True,
77
+ type=str,
78
+ help="the path to the tsv file."
79
+ )
80
+ parser.add_argument(
81
+ "--ckpt_path",
82
+ required=False,
83
+ type=str,
84
+ default=None,
85
+ help="path to the speech model. must provide for HuBERT and data2vec"
86
+ )
87
+ parser.add_argument(
88
+ "--whisper_root",
89
+ required=False,
90
+ type=str,
91
+ default=None,
92
+ help="root dir to download/store whisper model. must provide for whisper model."
93
+ )
94
+ parser.add_argument(
95
+ "--whisper_name",
96
+ required=False,
97
+ type=str,
98
+ default=None,
99
+ help="name of whisper model. e.g., large-v2. must provide for whisper model."
100
+ )
101
+ parser.add_argument(
102
+ "--layer",
103
+ required=True,
104
+ type=int,
105
+ help="which layer of the model. this is 1-based."
106
+ )
107
+ parser.add_argument(
108
+ "--feat_dir",
109
+ required=True,
110
+ type=str,
111
+ help="the output dir to save the representations."
112
+ )
113
+ parser.add_argument(
114
+ "--nshard",
115
+ required=False,
116
+ type=int,
117
+ default=1,
118
+ help="total number of shards."
119
+ )
120
+ parser.add_argument(
121
+ "--rank",
122
+ required=False,
123
+ type=int,
124
+ default=0,
125
+ help="shard id of this process."
126
+ )
127
+ parser.add_argument(
128
+ "--max_chunk",
129
+ type=int,
130
+ default=1600000,
131
+ help="max number of audio samples fed to the encoder per chunk."
132
+ )
133
+ parser.add_argument(
134
+ "--use_cpu",
135
+ default=False,
136
+ action="store_true",
137
+ help="whether to use CPU instead of GPU."
138
+ )
139
+ args = parser.parse_args()
140
+ logger.info(args)
141
+
142
+ main(**vars(args))
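
The script is normally driven through the CLI flags defined above (--model_type, --tsv_path, --ckpt_path, --layer, --feat_dir, --nshard, --rank, --max_chunk), but main can also be called directly. A sketch for the data2vec case with placeholder paths:

from dump_feature import main

main(
    model_type="data2vec",
    tsv_path="/path/to/dev-clean.tsv",       # manifest; see feature_utils.py below
    ckpt_path="/path/to/vox_pretrained.pt",  # required for data2vec and hubert
    whisper_root=None,
    whisper_name=None,
    layer=18,                                # 1-based
    nshard=1,
    rank=0,
    feat_dir="/path/to/features",
    max_chunk=1600000,
    use_cpu=False,
)
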
prompting/RepCodec/examples/feature_utils.py ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ # ref: https://github.com/facebookresearch/fairseq/blob/main/examples/hubert/simple_kmeans/feature_utils.py
9
+
10
+ import logging
11
+ import os
12
+ import sys
13
+
14
+ import tqdm
15
+ from npy_append_array import NpyAppendArray
16
+
17
+
18
+ logging.basicConfig(
19
+ format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
20
+ datefmt="%Y-%m-%d %H:%M:%S",
21
+ level=os.environ.get("LOGLEVEL", "INFO").upper(),
22
+ stream=sys.stdout,
23
+ )
24
+ logger = logging.getLogger("feature_utils")
25
+
26
+
27
+ def get_shard_range(tot, nshard, rank):
28
+ assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
29
+ start = round(tot / nshard * rank)
30
+ end = round(tot / nshard * (rank + 1))
31
+ assert start < end, f"start={start}, end={end}"
32
+ logger.info(
33
+ f"rank {rank} of {nshard}, process {end-start} "
34
+ f"({start}-{end}) out of {tot}"
35
+ )
36
+ return start, end
37
+
38
+
39
+ def get_path_iterator(tsv, nshard, rank):
40
+ with open(tsv, "r") as f:
41
+ root = f.readline().rstrip()
42
+ lines = [line.rstrip() for line in f]
43
+ start, end = get_shard_range(len(lines), nshard, rank)
44
+ lines = lines[start:end]
45
+ def iterate():
46
+ for line in lines:
47
+ subpath, nsample = line.split("\t")
48
+ yield f"{subpath}", int(nsample)
49
+ return iterate, len(lines)
50
+
51
+
52
+ def dump_feature(reader, generator, num, nshard, rank, feat_dir):
53
+ iterator = generator()
54
+
55
+ feat_path = f"{feat_dir}/{rank}_{nshard}.npy"
56
+ leng_path = f"{feat_dir}/{rank}_{nshard}.len"
57
+
58
+ os.makedirs(feat_dir, exist_ok=True)
59
+ if os.path.exists(feat_path):
60
+ os.remove(feat_path)
61
+
62
+ feat_f = NpyAppendArray(feat_path)
63
+ with open(leng_path, "w") as leng_f:
64
+ for path, nsample in tqdm.tqdm(iterator, total=num):
65
+ feat = reader.get_feats(path, nsample)
66
+ feat_f.append(feat.cpu().numpy())
67
+ leng_f.write(f"{len(feat)}\n")
68
+ logger.info("finished successfully")
69
+
70
+
prompting/RepCodec/examples/hubert_feature_reader.py ADDED
@@ -0,0 +1,64 @@
 
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq)
7
+
8
+ import logging
9
+
10
+ import fairseq
11
+ import torch
12
+ import torch.nn.functional as F
13
+
14
+ from fairseq.data.audio.audio_utils import get_features_or_waveform
15
+
16
+ logger = logging.getLogger("dump_feature")
17
+
18
+
19
+ class HubertFeatureReader(object):
20
+ def __init__(self, ckpt_path: str, layer: int, device: str, max_chunk=1600000):
21
+ (
22
+ model,
23
+ cfg,
24
+ task,
25
+ ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
26
+
27
+ self.device = device
28
+ logger.info(f"device = {self.device}")
29
+
30
+ self.model = model[0].eval().to(self.device)
31
+ self.task = task
32
+ self.layer = layer
33
+ self.max_chunk = max_chunk
34
+ logger.info(f"TASK CONFIG:\n{self.task.cfg}")
35
+ logger.info(f" max_chunk = {self.max_chunk}")
36
+
37
+ def read_audio(self, path, ref_len=None):
38
+ wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate)
39
+ if wav.ndim == 2:
40
+ wav = wav.mean(-1)
41
+ assert wav.ndim == 1, wav.ndim
42
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
43
+ logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
44
+ return wav
45
+
46
+ def get_feats(self, path, ref_len=None):
47
+ x = self.read_audio(path, ref_len=ref_len)
48
+ with torch.no_grad():
49
+ x = torch.from_numpy(x).float().to(self.device)
50
+ if self.task.cfg.normalize:
51
+ x = F.layer_norm(x, x.shape)
52
+ x = x.view(1, -1)
53
+
54
+ feat = []
55
+ for start in range(0, x.size(1), self.max_chunk):
56
+ x_chunk = x[:, start: start + self.max_chunk]
57
+ feat_chunk, _ = self.model.extract_features(
58
+ source=x_chunk,
59
+ padding_mask=None,
60
+ mask=False,
61
+ output_layer=self.layer,
62
+ )
63
+ feat.append(feat_chunk)
64
+ return torch.cat(feat, 1).squeeze(0)
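
Usage mirrors the data2vec reader, except that layer is handed straight to HuBERT's output_layer (1-based in fairseq), so no index shift is applied here. A sketch with placeholder paths:

from hubert_feature_reader import HubertFeatureReader

reader = HubertFeatureReader("/path/to/hubert_large_ll60k.pt", layer=18, device="cuda:0")
feat = reader.get_feats("/path/to/utterance.flac")  # torch.Tensor of shape (T, D)
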
prompting/RepCodec/examples/some_run.py ADDED
@@ -0,0 +1,66 @@
1
+ from datasets import load_dataset
2
+ from tqdm import tqdm
3
+ import pandas as pd
4
+
5
+ cache_dir = "./../../../cache"
6
+
7
+ dataset = load_dataset("openslr/librispeech_asr", cache_dir=cache_dir, trust_remote_code=True)
8
+
9
+ from repcodec.RepCodec import RepCodec
10
+ import torch
11
+ import yaml
12
+
13
+ config = "./../repcodec/configs/repcodec_dim1024.yaml"
14
+ with open(config) as fp:
15
+ conf = yaml.load(fp, Loader=yaml.FullLoader)
16
+
17
+ model = RepCodec(**conf)
18
+ model.load_state_dict(torch.load("./../../models/data2vec_large_l18.pkl", map_location="cuda:0")["model"]["repcodec"])
19
+ model.quantizer.initial()
20
+ model.eval()
21
+ model.to("cuda:0")
22
+
23
+ from data2vec_feature_reader import Data2vecFeatureReader
24
+
25
+ reader = Data2vecFeatureReader("./../../models/vox_pretrained.pt", 18, device="cuda:0", max_chunk=1600000)
26
+
27
+ import torch.nn.functional as F
28
+ import numpy as np
29
+
30
+ for split in dataset.keys():
31
+
32
+ tokens = []
33
+
34
+ for idx in tqdm(range(len(dataset[split]))):
35
+
36
+ sample = dataset[split][idx]
37
+
38
+ x = sample["audio"]["array"]
39
+
40
+ with torch.no_grad():
41
+ x = torch.from_numpy(x).float().to(reader.device)
42
+ if reader.task.cfg.normalize:
43
+ x = F.layer_norm(x, x.shape)
44
+ x = x.view(1, -1)
45
+
46
+ feat = []
47
+ for start in range(0, x.size(1), reader.max_chunk):
48
+ x_chunk = x[:, start: start + reader.max_chunk]
49
+ res = reader.model.extract_features(
50
+ source=x_chunk,
51
+ padding_mask=None,
52
+ mask=False,
53
+ layer=reader.layer,
54
+ )
55
+ feat_chunk = res["x"]
56
+ feat.append(feat_chunk)
57
+
58
+ features = torch.cat(feat, 1).permute(0, 2, 1)
59
+
60
+ x = model.encoder(features)
61
+ z = model.projector(x)
62
+ _, idx = model.quantizer.codebook.forward_index(z.transpose(2, 1))
63
+ tkn = idx.detach().cpu().data.numpy()[0]
64
+
65
+ tokens.append(tkn)
66
+ np.savez(f"./tkns/{split}.npz", *tokens)
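
Because np.savez is called with positional arguments, the per-utterance token arrays are stored under the keys arr_0, arr_1, ... in dataset order. A sketch for loading them back:

import numpy as np

data = np.load("./tkns/test.clean.npz")
tokens = [data[k] for k in sorted(data.files, key=lambda k: int(k.split("_")[1]))]
print(len(tokens), tokens[0][:10])  # one 1-D array of RepCodec indices per utterance
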
prompting/RepCodec/examples/tkns/test.clean.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b2ee3dbd59f84e1903c251bdce1e05a8ed6d1f30bf492ab368d4873dc0e713b
3
+ size 8415186
prompting/RepCodec/examples/tkns/test.other.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:188625227fc71069d6607920a733b76b0c025f6d75bd698abf87bbcfd6089d2c
3
+ size 8403370
prompting/RepCodec/examples/tkns/train.clean.100.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5966bdb4154a433ffe605d4cb152fdda7349ab6456747f0e2275f154739c004
3
+ size 151819656
prompting/RepCodec/examples/tkns/train.clean.360.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ed187075b7d515f3a1162df967548bf0d0c384aa31419ac12e7e5b95616ab2c
3
+ size 549056222
prompting/RepCodec/examples/tkns/train.other.500.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14441ffb0f6e0ca5edd1772ddf8fbc6065f8b3e70796b4244dd62a5c286467a7
3
+ size 751972686
prompting/RepCodec/examples/tkns/validation.clean.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5bb0560fa0169e387353d30f13a72eb40704a6518c0b8f72bdfbccada9a8660
3
+ size 8412602
prompting/RepCodec/examples/tkns/validation.other.npz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:314083f9013d459db663631f237f4ae68860cf25a3137dd7c9d42377e7efb8c1
3
+ size 8067914
prompting/RepCodec/examples/tokens/data2vec_base_l6_dev-clean.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompting/RepCodec/examples/tokens/data2vec_large_l18_dev-clean.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompting/RepCodec/examples/tokens/hubert_base_l9_dev-clean.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompting/RepCodec/examples/tokens/hubert_large_l18_dev-clean.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompting/RepCodec/examples/tokens/whisper_large_l32_dev-clean.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompting/RepCodec/examples/tokens/whisper_medium_l24_dev-clean.tokens ADDED
The diff for this file is too large to render. See raw diff
 
prompting/RepCodec/examples/whisper_feature_reader.py ADDED
@@ -0,0 +1,110 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq) and
7
+ # Whisper (https://github.com/openai/whisper/)
8
+
9
+ import io
10
+ import logging
11
+ import os
12
+ from typing import Optional, Union
13
+
14
+ import soundfile as sf
15
+ import torch
16
+ from whisper import _MODELS, _download, _ALIGNMENT_HEADS, available_models
17
+ from whisper.audio import log_mel_spectrogram
18
+ from whisper.model import ModelDimensions
19
+
20
+ from whisper_model import Whisper_
21
+
22
+ logger = logging.getLogger("dump_feature")
23
+
24
+
25
+ def load_model(
26
+ name: str,
27
+ device: Optional[Union[str, torch.device]] = None,
28
+ download_root: str = None,
29
+ in_memory: bool = False,
30
+ ) -> Whisper_:
31
+ """
32
+ Reference: https://github.com/openai/whisper/blob/main/whisper/__init__.py#L97
33
+ But we will load a `Whisper_` model for feature extraction.
34
+
35
+ Parameters
36
+ ----------
37
+ name : str
38
+ one of the official model names listed by `whisper.available_models()`, or
39
+ path to a model checkpoint containing the model dimensions and the model state_dict.
40
+ device : Union[str, torch.device]
41
+ the PyTorch device to put the model into
42
+ download_root: str
43
+ path to download the model files; by default, it uses "~/.cache/whisper"
44
+ in_memory: bool
45
+ whether to preload the model weights into host memory
46
+
47
+ Returns
48
+ -------
49
+ model : Whisper
50
+ The Whisper ASR model instance
51
+ """
52
+
53
+ if device is None:
54
+ device = "cuda" if torch.cuda.is_available() else "cpu"
55
+ if download_root is None:
56
+ default = os.path.join(os.path.expanduser("~"), ".cache")
57
+ download_root = os.path.join(os.getenv("XDG_CACHE_HOME", default), "whisper")
58
+
59
+ if name in _MODELS:
60
+ checkpoint_file = _download(_MODELS[name], download_root, in_memory)
61
+ alignment_heads = _ALIGNMENT_HEADS[name]
62
+ elif os.path.isfile(name):
63
+ checkpoint_file = open(name, "rb").read() if in_memory else name
64
+ alignment_heads = None
65
+ else:
66
+ raise RuntimeError(
67
+ f"Model {name} not found; available models = {available_models()}"
68
+ )
69
+
70
+ with (
71
+ io.BytesIO(checkpoint_file) if in_memory else open(checkpoint_file, "rb")
72
+ ) as fp:
73
+ checkpoint = torch.load(fp, map_location=device)
74
+ del checkpoint_file
75
+
76
+ dims = ModelDimensions(**checkpoint["dims"])
77
+ model = Whisper_(dims)
78
+ model.load_state_dict(checkpoint["model_state_dict"])
79
+
80
+ if alignment_heads is not None:
81
+ model.set_alignment_heads(alignment_heads)
82
+
83
+ return model.to(device)
84
+
85
+
86
+ class WhisperFeatureReader(object):
87
+ def __init__(self, root, ckpt, layer, device):
88
+ self.device = device
89
+ logger.info(f"device = {self.device}")
90
+
91
+ self.model: Whisper_ = load_model(name=ckpt, device=self.device, download_root=root).eval()
92
+ self.model.decoder = None # to save some memory by deleting the decoder
93
+ self.layer = layer # one-based
94
+
95
+ def read_audio(self, path, ref_len=None):
96
+ wav, sample_rate = sf.read(path)
97
+ assert sample_rate == 16000, sample_rate
98
+ if ref_len is not None and abs(ref_len - len(wav)) > 160:
99
+ logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
100
+ return wav
101
+
102
+ def get_feats(self, path, ref_len=None):
103
+ wav = self.read_audio(path, ref_len)
104
+ audio_length = len(wav)
105
+ with torch.no_grad():
106
+ mel = log_mel_spectrogram(torch.from_numpy(wav).float().to(self.device))
107
+ hidden = self.model.extract_features(mel.unsqueeze(0), target_layer=self.layer)
108
+ feature_length = audio_length // 320
109
+ hidden = hidden[0, :feature_length]
110
+ return hidden.contiguous()
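
A minimal usage sketch; the named Whisper checkpoint is downloaded into whisper_root if it is not already cached, layer is 1-based (the token files above use layer 32 of "large" and layer 24 of "medium"), and read_audio expects 16 kHz audio:

from whisper_feature_reader import WhisperFeatureReader

# Placeholder paths; "large" is one of whisper.available_models().
reader = WhisperFeatureReader(root="/path/to/whisper_cache", ckpt="large",
                              layer=32, device="cuda:0")
feat = reader.get_feats("/path/to/utterance_16k.wav")  # (T, D), one frame per 20 ms
print(feat.shape)
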
prompting/RepCodec/examples/whisper_model.py ADDED
@@ -0,0 +1,58 @@
1
+ # Copyright (c) ByteDance, Inc. and its affiliates.
2
+ # Copyright (c) Chutong Meng
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Based on fairseq (https://github.com/facebookresearch/fairseq) and
7
+ # Whisper (https://github.com/openai/whisper/)
8
+
9
+ from typing import Optional
10
+
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from torch import Tensor
14
+ from whisper.model import AudioEncoder, sinusoids, Whisper, ModelDimensions
15
+
16
+
17
+ class AudioEncoder_(AudioEncoder):
18
+ def __init__(self, *args, **kwargs):
19
+ super(AudioEncoder_, self).__init__(*args, **kwargs)
20
+
21
+ def extract_feature(self, x: Tensor, target_layer: Optional[int] = None):
22
+ """
23
+ x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
24
+ the mel spectrogram of the audio
25
+ """
26
+ x = F.gelu(self.conv1(x))
27
+ x = F.gelu(self.conv2(x))
28
+ x = x.permute(0, 2, 1)
29
+
30
+ length_x = x.shape[1]
31
+ if length_x > self.positional_embedding.shape[0]:
32
+ self.register_buffer("positional_embedding", sinusoids(length_x, self.positional_embedding.shape[1]))
33
+ self.positional_embedding = self.positional_embedding.to(x.device)
34
+ x = (x + self.positional_embedding[:length_x, :]).to(x.dtype)
35
+
36
+ if target_layer is None:
37
+ target_layer = len(self.blocks)
38
+
39
+ for block in self.blocks[:target_layer]:
40
+ x = block(x)
41
+
42
+ return x
43
+
44
+
45
+ class Whisper_(Whisper):
46
+ def __init__(self, dims: ModelDimensions):
47
+ super(Whisper_, self).__init__(dims)
48
+ # replace audio encoder with our audio encoder
49
+ self.encoder = AudioEncoder_(
50
+ self.dims.n_mels,
51
+ self.dims.n_audio_ctx,
52
+ self.dims.n_audio_state,
53
+ self.dims.n_audio_head,
54
+ self.dims.n_audio_layer,
55
+ )
56
+
57
+ def extract_features(self, mel: torch.Tensor, target_layer: Optional[int] = None):
58
+ return self.encoder.extract_feature(mel, target_layer)