{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Q-bj6K7Qv4ft" }, "source": [ "# Fine-Tuning a Generative Pretrained Transformer (`GPT`)\n", "\n", "1. Install required libraries." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OFWjSb_xDWja", "outputId": "ff2273de-a277-46a2-a70c-26f8aee40a09" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting transformers\n", " Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m65.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting datasets\n", " Downloading datasets-2.13.1-py3-none-any.whl (486 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m53.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting codecarbon\n", " Downloading codecarbon-2.2.4-py3-none-any.whl (176 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.0/176.0 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n", "Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n", " Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m27.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.1)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n", "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n", " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m111.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m81.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n", "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n", "Collecting dill<0.3.7,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m16.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n", "Collecting xxhash (from datasets)\n", " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting multiprocess (from datasets)\n", " Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m18.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.4)\n", "Collecting arrow (from codecarbon)\n", " Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting pynvml (from codecarbon)\n", " Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from codecarbon) (5.9.5)\n", "Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon) (9.0.0)\n", "Collecting fuzzywuzzy (from codecarbon)\n", " Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from codecarbon) (8.1.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.2)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.3)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.5.7)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow->codecarbon) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7.0->arrow->codecarbon) (1.16.0)\n", "Installing collected packages: tokenizers, safetensors, fuzzywuzzy, xxhash, pynvml, dill, multiprocess, huggingface-hub, arrow, transformers, codecarbon, datasets\n", "Successfully installed arrow-1.2.3 codecarbon-2.2.4 datasets-2.13.1 dill-0.3.6 fuzzywuzzy-0.18.0 huggingface-hub-0.15.1 multiprocess-0.70.14 pynvml-11.5.0 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2 xxhash-3.2.0\n" ] } ], "source": [ "!pip install transformers datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "pY6M4fSb8SY6" }, "source": [ "2. Load the data from the hub." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 702, "referenced_widgets": [ "d9e48cf8d69b404cb0528b4a5bb9c23f", "4d6d94b6584a4e7aa9b586fe5f63c705", "9c056246d50b4c40afe381254e14f343", "c6f1278b534a4b7983adcb776ce7b4fd", "9a9aa9595c294dcb893b003551852839", "dc3519c4fcb149f4b212dac78dd3e981", "30cd4d5828a84d70a0eb63230b943de8", "2dd26b8f83324717aa5ef039a4410675", "783b23c5e00e4279b20386b6af6aefd5", "b0ede4e791c0403dbc90bb02eb44d389", "fecfd7ecba6547219f36de89e154f8c4", "cd3d68f82a984432b60e4935a579fb64", "3d69341decff4c0194a5df523f2e5ef7", "d06879a3e92e490ca28c2193973c2fbb", "c1e8f36806354b8aa90285c545111df3", "0c1d8e086a46485d9b3fac20504ad7e2", "0ce91c595730407990bf0e723abb52ca", "1eceb7c2191b4028b9b59155bddb629f", "2c823bac025d47e7bbf1482f366cec75", "95d9b5c626ce424aa3e007953708db9c", "bf4b344c06ce453599b77c4e5583cf7d", "61267d4dc5bc456e9a7b27244cd528eb", "c2b43552429842dcbb5c4180d5692f1d", "2fb2ebf6a251413f85d31c479b5c8ebe", "06a59990a8e849cb8c904fd847de5482", "276a0d8085d04a6e977a399598b5c3ec", "8ea02ee1bc014685903911c508008ee8", "769248f0fde6491189c436065a408711", "8c33149721aa46038daadd9ec17cf5f1", "e366208c46f64a5db9da8dc482e3af77", "99efe6ebfd1e4ce1adc14e00cea36e36", "84dec9e3ffa0475692f66136978d49f6", "0d17dcbf836144509f99e1e38b860693", "70936aca40034f1585b1c0ce23283493", "596bbbf672c24467b484471a24c5f000", "ecd7c0f82f0647a88612439a615e95b2", "2bda638a3089418189e40b43336e2ec5", "790a23cb2bf1485ba1fdc3936ecaf782", "645d4bd2764546cb959cf1811153d9d0", "9ed860ed51ea4dd39507fa144e1bdadf", "d32ab3e25fb84ead9c05ad253fe46e93", "88a7da23d7f5414d85f77c0fb797072d", "956a841201544355b72bd0875f852727", "259421baf1da43858a0412c14b848640", "170ee76891de4791839232d5c725a823", "61f4d3bdadd647b38ba22fa10052f24d", "89ecb77da42042ceb338ab2830e8169c", "cc77332985674b8faead20cc86838760", "7f2a103614744b95aaa2552b15963276", "275832c1ff5e4ab7a70c5239e9b5050f", "67684be3b71d46d5828bc69781423274", "6521292a25f241d5b563971bc16a6fcd", "5ce298a07c9b4ce2b2c803ac7e75f829", "f74a7011a3014fffa37d92129bdc7421", "7be1756853ef46e09646866c301627c4", "7b22b84aca3644319f731852b1e3b040", "e14bf2addc1047c284701011ce97b249", "0535fdc113dc4d3d89bef23302be1e63", "03107de13b4c40abb408a36a67b26596", "690f765ca74e4cf1a263bba1943302bb", "83624f679b464ffa93096a3d29546c70", "3cec8cf19ee44491aa7d40f31879112b", "0baa95fe9bff4f47bdc8ec447b295587", "799f8ec3f5ae4d938d04bdea16b964f7", "9a22e71845484a2db8ac6f826846220e", "32671f3bd3904184a877513c7a70e29c", "75c121690ee740ad8dccc8655819bb60", "5d8735309625479aa44e21d8aa74f2ea", "1ac30d68d0e14d158fd7112863a4514e", "73c40bd736f546a29f2e9c4cfc5c1962", "11ca46a9caf342bf8c2207a28153499e", "1200df7ca4904df29def632aab9d6cf1", "2b4e6f056871491b8255a1bcc88aafaf", "64c3c307d7ff4588ab62c2adfa4fff75", "573e8db169e3442b9ee3364b0fd730e7", "23305514b6ad4c9ca32453ff3ddbeb12", "f6436b699a294b908f42bf07523accef", "20d3fe94880d42d9933eed798bc7e834", "310f80a796ca4fcaadbe56b06ffd356e", "525ed7d03ad34c6b92c41ab5f856477a", "267d4e2077fa441fab2210e85559c530", "0993950d12b04a65bceedbd79593dda3", "9047d2b6465d4d3296227805f276b7c9", "6a8c9b8936924429b8651bc812bda649", "63fc04394fce446b9660f19f065d1a24", "2b05e9f1a0ab47568cff646264cadba2", "26bf84b4726040d7a936c46a8e119fe6", "028b1ad7812149fdbadd9472555eae63", "01956825f60b45a1afc34045d153055b", "622d696b4068401b93f5b2bae0074c62", "458a951aa25e489fa544507fb625043d", "436bf1ac107e44c7a62b483c8fee5a39", "2375650cbab7417a9c729fa04d45f63e", "8a8a637905b34ef6943dd908b32fa0ba", "0d03b45e3d2e4c849fbb888551257f7e", "f112ddc798d74a318cb05a66fa305761", "20e0d95da7fd4042b7e00695e05bfd69", "7612cc9b1ee04257980ef1fa8d644142", "ad3fa15bbb8d495cb3a7247e75b6a839", "32aa48ecce9f4463812e99f39cc9240c", "fb5c8a3a5f224a90ba8219a7d761fbfb", "874341edbb69484896c249e121095740", "60fbf73caded446a8992087db6bac051", "a7550b737bfd4f04a7ccaea9e434650b", "ac72fc383d104facb2c499f91d361afe", "f724182dc2cf41c1a809a44b7c035751", "b4ee35b41ee3499ba7a022f0074c912d", "0e8cc859a0184905a6df4068d934cd5d", "2ef17c1437a04819825095ca10bc0839", "eaa8e8c2186d45a0ac68fd485ead5f8d", "d7e3d3b78ca448738afd7990838fd58d", "4a5fd86c98e04def91cd943881f9da63", "a32809d1f9874f5796e1a521ebacdc8b", "aaa34ad82fcf4edb805054e8b9d68e29", "3e23df5be01941ffb825915245b46c7c", "0bd061df19f240308df7697aea1811ea", "25e95ecfb5894eb497c6b0448754c3c2", "1aa95d5717694f909f993fda65750ae7", "3ea2901b3ab746b0bf3bd7e07f74fb51", "c9c7c2d9fc8d4457b91bb2000bea2f29", "71024cc40f9b439ea112bf7afdd65270" ] }, "id": "RNH_RDozXSqn", "outputId": "18a1e859-c165-4573-8a28-a8fa3b0df10d" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d9e48cf8d69b404cb0528b4a5bb9c23f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading readme: 0%| | 0.00/5.11k [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-432bcf84fc201cdd/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cd3d68f82a984432b60e4935a579fb64", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c2b43552429842dcbb5c4180d5692f1d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/1.98M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "70936aca40034f1585b1c0ce23283493", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/14.4M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "170ee76891de4791839232d5c725a823", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/15.4M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7b22b84aca3644319f731852b1e3b040", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading data: 0%| | 0.00/2.13M [00:00, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "75c121690ee740ad8dccc8655819bb60", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Extracting data files: 0%| | 0/4 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "20d3fe94880d42d9933eed798bc7e834", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_english split: 0%| | 0/10012 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "01956825f60b45a1afc34045d153055b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_english split: 0%| | 0/53134 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "32aa48ecce9f4463812e99f39cc9240c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_instruct_portuguese split: 0%| | 0/52874 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d7e3d3b78ca448738afd7990838fd58d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Generating aira_portuguese split: 0%| | 0/10021 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-432bcf84fc201cdd/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" ] }, { "data": { "text/html": [ "\n", "
\n", " | prompt | \n", "completion | \n", "
---|---|---|
0 | \n", "Como posso curar minha insônia. | \n", "Uma maneira possível de curar a insônia é mant... | \n", "
1 | \n", "Beber muita água faz mal? | \n", "De acordo com a Mayo Clinic, beber muita água ... | \n", "
2 | \n", "Dê-me uma lista dos melhores filmes de golfe q... | \n", "-Caddyshack\\n- Caneca de lata\\n- O melhor jogo... | \n", "
3 | \n", "Qual é a principal causa do aquecimento global. | \n", "A principal causa do aquecimento global é o au... | \n", "
4 | \n", "O que posso fazer para reduzir meus resíduos d... | \n", "Compre itens com embalagens mínimas e evite pl... | \n", "
... | \n", "... | \n", "... | \n", "
52869 | \n", "Você poderia elucidar a ideia de otimização de... | \n", "A otimização Mesa refere-se à circunstância em... | \n", "
52870 | \n", "Quais bancos têm as melhores taxas de juros no... | \n", "As melhores taxas de juros atualmente variam d... | \n", "
52871 | \n", "Em quais ações devo investir. | \n", "Depende de seus objetivos de investimento e to... | \n", "
52872 | \n", "Como fazer filmes? | \n", "Para fazer filmes, você precisa ter algumas co... | \n", "
52873 | \n", "Quais são os ingredientes para um coquetel Moj... | \n", "Os ingredientes para um coquetel Mojito são ru... | \n", "
52874 rows × 2 columns
\n", "