diff --git "a/01-Transformers/02-LLMs.ipynb" "b/01-Transformers/02-LLMs.ipynb" new file mode 100644--- /dev/null +++ "b/01-Transformers/02-LLMs.ipynb" @@ -0,0 +1,3703 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "36b63b15-fd1a-45fd-a52e-8833415d3320", + "metadata": { + "id": "36b63b15-fd1a-45fd-a52e-8833415d3320" + }, + "source": [ + "# Understanding LLMs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "463fb902-a6e5-4ebe-bdce-30e436f9d3b4", + "metadata": { + "id": "463fb902-a6e5-4ebe-bdce-30e436f9d3b4" + }, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "import torch\n", + "import transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d799ad8-f1b9-46da-943a-0c6adc9b7f43", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3d799ad8-f1b9-46da-943a-0c6adc9b7f43", + "outputId": "6c305f99-942c-4f5d-b857-681b100d7eb1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "4.42.4\n" + ] + } + ], + "source": [ + "print(transformers.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7cc2c97-9fbb-4d98-85c0-537b0f30ff66", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b7cc2c97-9fbb-4d98-85c0-537b0f30ff66", + "outputId": "cd448b79-5d1f-42ce-fcdb-a4971f0b1878" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "2.3.1+cu121\n" + ] + } + ], + "source": [ + "print(torch.__version__)" + ] + }, + { + "cell_type": "markdown", + "id": "6486b253-05f2-4e94-950e-181a9a27c7d9", + "metadata": { + "id": "6486b253-05f2-4e94-950e-181a9a27c7d9" + }, + "source": [ + "## Tokenizing Text\n", + "\n", + "### Why Tokenization?\n", + "\n", + "Tokenization transforms text into a format that models can comprehend. There are several methods for tokenizing text, each with its pros and cons:\n", + "\n", + "1. **Character-Based Tokenization**:\n", + " - **Method**: Splitting the text into individual characters and assigning each a unique numerical ID.\n", + " - **Pros**: Works well for languages like Chinese, where each character carries significant information.\n", + " - **Cons**: Creates a small vocabulary but requires many tokens to represent a string. This can affect performance and accuracy since individual characters carry minimal information.\n", + "\n", + "2. **Word-Based Tokenization**:\n", + " - **Method**: Splitting the text into individual words.\n", + " - **Pros**: Captures more meaning per token.\n", + " - **Cons**: Results in a large vocabulary with many unknown words (e.g., typos, slang) and different word forms (e.g., \"run\", \"runs\", \"running\").\n", + "\n", + "### Modern Tokenization Strategies\n", + "\n", + "Modern approaches balance character and word tokenization by splitting text into subwords. 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed917b48-6a6c-4b98-ad37-9eeb2c902b83",
+   "metadata": {
+    "editable": true,
+    "tags": [],
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 526
+    },
+    "id": "ed917b48-6a6c-4b98-ad37-9eeb2c902b83",
+    "outputId": "9206e05c-cdbc-43e2-e8e8-0f534d4fa97d"
+   },
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
"/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/26.0 [00:00