# ============================================================================
# Cell 1 — train a SentencePiece-BPE tokenizer over LibriSpeech transcripts.
# ============================================================================
from tokenizers import SentencePieceBPETokenizer
import transformers
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from tqdm import tqdm

cache_dir = "./cache"

dataset = DatasetDict.load_from_disk("./../librispeech_tokenized.hf")

# Collect the transcript text from every split so the BPE model sees the
# full corpus vocabulary.
text = []
for split in dataset.keys():
    text += list(dataset[split]["text"])

# Longest combined audio+text sequence observed in the corpus (see the
# length-scan cell below, which includes the two transcript delimiters).
model_max_length = 1877

# 1024 discrete audio-codec tokens plus transcript delimiters and padding.
special_tokens = [f"<|audio:{idx}|>" for idx in range(1024)] + [
    "<|startoftranscript|>",
    "<|endoftranscript|>",
    "<|padding|>",
]

bpe_tokenizer = SentencePieceBPETokenizer()
bpe_tokenizer.train_from_iterator(
    text,
    vocab_size=5000 + len(special_tokens),
    min_frequency=2,
    show_progress=True,
    special_tokens=special_tokens,
)

# Wrap as a Hugging Face fast tokenizer.
# BUG FIX: `special_tokens=` is not a recognized PreTrainedTokenizerFast
# constructor argument (it was silently ignored); the supported way to
# register extra special tokens is `additional_special_tokens`.
tokenizer = transformers.PreTrainedTokenizerFast(
    tokenizer_object=bpe_tokenizer,
    model_max_length=model_max_length,
    additional_special_tokens=special_tokens,
)

# Registering the pad *token* is sufficient — `pad_token_id` is resolved
# from the vocabulary automatically, so no manual id assignment is needed.
tokenizer.pad_token = "<|padding|>"

tokenizer.save_pretrained("./tokenizer")

# ============================================================================
# Cell 2 — tokenize the dataset with the trained tokenizer and save to disk.
# ============================================================================
max_length = 1877

dataset = DatasetDict.load_from_disk("./../librispeech_tokenized.hf")

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")


def tokenize(row):
    """Build the training prompt for one example and tokenize it.

    The prompt layout is:
        <|audio:t0|><|audio:t1|>...<|startoftranscript|>TEXT<|endoftranscript|>

    Parameters
    ----------
    row : dict with keys "audio_tokens" (sequence of ints) and "text" (str).

    Returns
    -------
    A `BatchEncoding` with `input_ids` (and `attention_mask`) padded to
    `max_length`.
    """
    prompt = (
        "".join(f"<|audio:{token}|>" for token in row["audio_tokens"])
        + "<|startoftranscript|>"
        + row["text"]
        + "<|endoftranscript|>"
    )
    # FIX: also truncate — padding alone lets over-length sequences through,
    # which would overflow the model's context.
    return tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


dataset = dataset.map(tokenize, remove_columns=["text", "audio_tokens"])

dataset.save_to_disk("tokenized_librispeech")

# ============================================================================
# Cell 3 — reload artifacts for inspection (standalone entry point).
# ============================================================================
max_length = 1877

dataset = DatasetDict.load_from_disk("./../librispeech_tokenized.hf")

tokenizer = AutoTokenizer.from_pretrained("./tokenizer")

# ============================================================================
# Cell 4 — sanity check: a special token must encode to a single id.
# ============================================================================
tokenizer.encode("<|startoftranscript|>")

# ============================================================================
# Cell 5 — inspect the padding token id.
# ============================================================================
tokenizer.pad_token_id

# ============================================================================
# Cell 6 — scan every split for the longest combined sequence; this is how
# `model_max_length` above was chosen.
# ============================================================================
lens = []

for split in dataset.keys():
    for idx in tqdm(range(len(dataset[split]))):
        sample = dataset[split][idx]
        # FIX: +2 accounts for the <|startoftranscript|> and
        # <|endoftranscript|> delimiters added by `tokenize`; the original
        # scan omitted them and so undercounted every sequence by two.
        total_len = (
            len(tokenizer.encode(sample["text"]))
            + len(sample["audio_tokens"])
            + 2
        )
        lens.append(total_len)

# ============================================================================
# Cell 7 — longest observed sequence.
# ============================================================================
max(lens)
# ============================================================================
# Cell 8 — shortest combined sequence observed in the length scan above.
# ============================================================================
min(lens)

# ============================================================================
# Cell 9 — sanity check: an audio special token must encode to a single id.
# ============================================================================
tokenizer.encode("<|audio:0|>")