{"cells":[{"cell_type":"markdown","metadata":{"id":"Q-bj6K7Qv4ft"},"source":["# Fine-Tuning a Generative Pretrained Transformer (`GPT`)\n","\n","1. Install required libraries."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":12254,"status":"ok","timestamp":1687957855931,"user":{"displayName":"Nicholas Corrêa","userId":"09736120585766268588"},"user_tz":-120},"id":"OFWjSb_xDWja","outputId":"abae1dd9-84e9-41bf-a010-3a438b8d3ae7"},"outputs":[{"name":"stdout","output_type":"stream","text":["Collecting transformers\n"," Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m67.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting datasets\n"," Downloading datasets-2.13.1-py3-none-any.whl (486 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m486.2/486.2 kB\u001b[0m \u001b[31m53.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting codecarbon\n"," Downloading codecarbon-2.2.4-py3-none-any.whl (176 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m176.0/176.0 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.2)\n","Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)\n"," Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m236.8/236.8 kB\u001b[0m \u001b[31m27.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.22.4)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages 
(from transformers) (23.1)\n","Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0)\n","Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2022.10.31)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.27.1)\n","Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)\n"," Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m83.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting safetensors>=0.3.1 (from transformers)\n"," Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.65.0)\n","Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (9.0.0)\n","Collecting dill<0.3.7,>=0.3.0 (from datasets)\n"," Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (1.5.3)\n","Collecting xxhash (from datasets)\n"," Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.5/212.5 kB\u001b[0m \u001b[31m27.9 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting multiprocess (from datasets)\n"," Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: fsspec[http]>=2021.11.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n","Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.8.4)\n","Collecting arrow (from codecarbon)\n"," Downloading arrow-1.2.3-py3-none-any.whl (66 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.4/66.4 kB\u001b[0m \u001b[31m9.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hCollecting pynvml (from codecarbon)\n"," Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25hRequirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from codecarbon) (5.9.5)\n","Requirement already satisfied: py-cpuinfo in /usr/local/lib/python3.10/dist-packages (from codecarbon) (9.0.0)\n","Collecting fuzzywuzzy (from codecarbon)\n"," Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)\n","Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from codecarbon) (8.1.3)\n","Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n","Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (2.0.12)\n","Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.4)\n","Requirement already satisfied: 
async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.2)\n","Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.2)\n","Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.3)\n","Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n","Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers) (4.6.3)\n","Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (1.26.16)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.5.7)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n","Requirement already satisfied: python-dateutil>=2.7.0 in /usr/local/lib/python3.10/dist-packages (from arrow->codecarbon) (2.8.2)\n","Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2022.7.1)\n","Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7.0->arrow->codecarbon) (1.16.0)\n","Installing collected packages: tokenizers, safetensors, fuzzywuzzy, xxhash, pynvml, dill, multiprocess, huggingface-hub, arrow, transformers, codecarbon, datasets\n","Successfully installed arrow-1.2.3 codecarbon-2.2.4 datasets-2.13.1 dill-0.3.6 fuzzywuzzy-0.18.0 huggingface-hub-0.15.1 multiprocess-0.70.14 pynvml-11.5.0 safetensors-0.3.1 tokenizers-0.13.3 transformers-4.30.2 xxhash-3.2.0\n"]}],"source":["# Install the libraries used by this notebook.\n","# The recorded run below installed transformers 4.30.2, datasets 2.13.1 and codecarbon 2.2.4;\n","# append ==<version> to a package name to pin it when reproducing that run.\n","# %pip (instead of !pip) runs pip within the current kernel, so the packages land in the kernel's environment.\n","%pip install transformers datasets codecarbon"]},{"cell_type":"markdown","metadata":{"id":"pY6M4fSb8SY6"},"source":["2. 
Load the data from the hub."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":702,"referenced_widgets":["d1880de65c3c4b308913c855203cf83c","401523eb500747d8a888754d3ea31157","4a0007033ea64e29a10efdd44ef7fb32","16b6264565e04ae2a641d4c0eaeb20d6","404feebb90b94fefb73ce0597024803e","9169ebadbeb249d3b87c9236331fc6f1","a1b862b5fb7e411f98ab5c1ba58aece5","445be3038d6f441ebb421631df0c998b","9a59a4946eea4fddb0c101dfdf06de7e","b9cba2c0e795450a85db46929ac5992d","c5f7f549c62b4a2f816b383475c52697","7b5ca2910b6d46cba862ecb8a8b86525","fe4a608812324734adadf448f8fee3fd","f9f211e99d8d4f0986b8c5dee54f7d52","cfb3887fe9fa47dba5fa0982486b08b8","e6cb484366a54a61b6fd01e2b964498f","0b2de92ac9ee4b9f978b904941a45185","95b036923b5b4cf5886739b805c0e51c","aaa18930b955462ea11b4d1e2af35266","6a93baf7a4f34d44ba41744b4fa36da4","f87d9522f97249daaee56bf3f7634d48","5746b6d82d064ebd8ab1a7cbe8a2a1ef","d50e1fead86a48668694f4d1653deb48","a0c35d0dadad4892a24cb2ff46a12035","7e673fe9c9b24729a40df7ae4c2da83c","e5a0b9bcbba54e4ca8dfbc40139d934f","db9f97f44c2b47079fb0c9da29bbcb08","453b00a9651f4342aab2d4d4833aac4f","8e66361c4816444e97312d5f28100a1f","688a14c68c644a9b8ccb4e103132ef0b","34f7ef3506234c97a64a30744a2ce035","eb69ea74230b4b1c985f79689901ce81","b2ba55f15bce4fa7be7838c149dc9e56","c2a8db9929e746e0bfc9551c5360fd0c","cd1cdac02a3a48f597b9a8a0db7f5b04","a427fb32fb244efc8e2d52be755e2b4a","3f88d634807c4e74b22768e78c274608","d584f99fd411451fb86d6c666d56747a","65dcd9d1c31f4048856de50e1e8fa79c","137ccac312ba4b38a89611d54ffa6880","8f2ceb8df9394300a3e5eae1beb8df6e","8ddf5118ce8e4936b2081708d2e1f4af","261d3cc154cb4ff4b6e7a8c724fb05cc","dfb6d57d119945f6b01dc536eef606bf","2e8bb7eea8b34e6caf26f992f8d06105","619d6291dd1b46a892f1b9aecfdcba05","93ba9dd35bf945759f2fc42a03cb90f9","e0ae2a65e95f464e99ed2c64d0e32fbb","5b8ff4dc4e1742cf963b583d8931814f","7cc0f278dce24cf8ac061c44e0542b36","79ae724b819a4495bc8590cd320a3df4","0ec1ac774d284f29adf673a3a4a3cecf","66fb1b86c8e4
4db1aa76f1d10384a3ff","023723c1db2b46699f81cd17f7f97a86","cf22f6d4eaeb4b37888424c3894ab7a4","97f78228b8d541bf9efbb1c61dfa1ce8","ed6280a49bfa4eacb3bbaeee2f9fdfe1","0723f13a5d2e47c2ab0e4e489a0d03db","b6f3d8be28ec4367934d7ed37b2e0294","8e52c596874549f1a2535b621456b626","40fcbae49c9a4e0b880839f45f04d7e1","cc4f5d4d73bd4d74b9021752c8642da5","cf832f40e0cf420ea05577b842d506ad","91a0151dd6c9401998103ae3bfd6ba56","1dd3bd2cbcf94a69bd4098e6610652ef","30dd0950c9d44531bbd998cdc6744118","7f3ddbde4f8f4111aaa663872fc97b7a","d0b9c635abfd46efa4dd47659448631f","6196ed4a6dbf4c7b9d3a8e5e030592b7","9a4bc5097582462babb009c1f0ea0b2f","122ca969f1504cd2b8553353fdd18eb9","3dfc617d90bd4daab6d02b0447d9f8e7","1da53d9959d647c8936b2b094820b14d","9d55db9034434e2a8c2f086a163048df","1f75ed7990c7497bae062cbd1f5a4b4a","e25e07f7d3324d2a8eb6160595f3db7e","f91fd2180d1f4a029effe4fc6955ec60","108d6847e9354f23830b4a8cd26ef350","6dd93d6b58704e9ca722925f3b280817","3e0384bc1e8d460ca775840316ce4b4e","fd1d6734ab2a463bb6c4dfe8688b6f91","2aff1a4426e5450ba4b0a2900e19354b","c626cdb27ae54184a3353579bc907a2d","aed6edab292445248bb697bcc0ab73dc","cce84fa856e24914b12b425afbdac145","921502299acb48f4ae9794ab0cd868c4","b43af3b6fbe04d98a4d642e21d315ad7","ade051c062a6496eaf38c2f6b1a109c6","74b78d95f3984099bb2df1b1796a69bb","ecda881dcbda4cb4bdf8316f4bd60bda","6db2c9ff2908442d8a6cd0a2b7ba09ba","8a6c8c970b66483a839b309d66d201d2","01f723331e8a48a88879d711d7c007ae","e4a58de223804f5bac8b2d635c21d59e","bf7028a895c54140949e9343380d33e4","dec5b24351b54e5ba47aa3cb7b768e2d","daf34e71adf0484381c4314b3c57fbfa","5c9a74817f034d1c8882102b779410ba","90cfff368c8e4907b9ccdd538691b29d","b201a6ac26fe48bba8a716ba62b76110","985f4cb6d41e47edad7bf61e97eef470","d62401ff24144df884ef108dabe68deb","458fea18bdd449cb811cd9688827fb1e","aec9cb844b1144ea8362aaa3a1281976","b3bc074d381d475daefcde5d1a62bc2c","2215975163584682a5a5f3427038144e","698d7590c0b04aa29facbff77698756d","2e7e45a987494d9c8624551cc779b728","06e181ffdc67429986ad6423fd60ac78","cb36518e2d754fe18
33b245fa8ca6195","48696de2433348a88fa252016f0b6587","5c475b88d86a4d7286b11d0ad8b4a3bc","2f1c3b64b3be49efaca2f7c915c50de6","7abd724f24a74e2e85be124131a0ec5e","8857530ad6a6450585246cac2c1bed4b","ee561e5e54804485a7b2e7dde4094905","3bb337da9c4344ab8002877ae8702e2d","14f40bde3572458e991870c878a8bae7","967e2d5eb4d844d9b6af895ac2bf82c4","7e3ef6d3e3d74c91840ba884a9fbecb2","2064d5a82e42431c98f0cc3d90a555bc"]},"executionInfo":{"elapsed":11630,"status":"ok","timestamp":1687957943155,"user":{"displayName":"Nicholas Corrêa","userId":"09736120585766268588"},"user_tz":-120},"id":"RNH_RDozXSqn","outputId":"22a80a7c-fa50-4afe-db45-0a53b9008dfc"},"outputs":[{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d1880de65c3c4b308913c855203cf83c","version_major":2,"version_minor":0},"text/plain":["Downloading readme: 0%| | 0.00/5.12k [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n"]},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7b5ca2910b6d46cba862ecb8a8b86525","version_major":2,"version_minor":0},"text/plain":["Downloading data files: 0%| | 0/4 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"d50e1fead86a48668694f4d1653deb48","version_major":2,"version_minor":0},"text/plain":["Downloading data: 0%| | 0.00/1.71M [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"c2a8db9929e746e0bfc9551c5360fd0c","version_major":2,"version_minor":0},"text/plain":["Downloading data: 0%| | 0.00/13.5M [00:00, 
?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"2e8bb7eea8b34e6caf26f992f8d06105","version_major":2,"version_minor":0},"text/plain":["Downloading data: 0%| | 0.00/14.5M [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"97f78228b8d541bf9efbb1c61dfa1ce8","version_major":2,"version_minor":0},"text/plain":["Downloading data: 0%| | 0.00/1.84M [00:00, ?B/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"7f3ddbde4f8f4111aaa663872fc97b7a","version_major":2,"version_minor":0},"text/plain":["Extracting data files: 0%| | 0/4 [00:00, ?it/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"108d6847e9354f23830b4a8cd26ef350","version_major":2,"version_minor":0},"text/plain":["Generating aira_english split: 0%| | 0/9183 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"74b78d95f3984099bb2df1b1796a69bb","version_major":2,"version_minor":0},"text/plain":["Generating aira_instruct_english split: 0%| | 0/48666 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"b201a6ac26fe48bba8a716ba62b76110","version_major":2,"version_minor":0},"text/plain":["Generating aira_instruct_portuguese split: 0%| | 0/48571 [00:00, ? examples/s]"]},"metadata":{},"output_type":"display_data"},{"data":{"application/vnd.jupyter.widget-view+json":{"model_id":"48696de2433348a88fa252016f0b6587","version_major":2,"version_minor":0},"text/plain":["Generating aira_portuguese split: 0%| | 0/9204 [00:00, ? 
examples/s]"]},"metadata":{},"output_type":"display_data"},{"name":"stdout","output_type":"stream","text":["Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/nicholasKluge___parquet/nicholasKluge--fine-tuning-instruct-aira-4077fd700c38fc36/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n"]},{"data":{"text/html":["\n","
\n"," | prompt | \n","completion | \n","
---|---|---|
0 | \n","Quem foram alguns dos compositores mais influe... | \n","Resposta: Alguns dos compositores mais influen... | \n","
1 | \n","Quais etapas devo seguir para construir um lag... | \n","Selecione um local adequado para o seu lago, n... | \n","
2 | \n","Todos os indivíduos de ascendência asiática sã... | \n","É injusto e incorreto fazer uma afirmação abra... | \n","
3 | \n","Que tipo de atividades posso fazer para reduzi... | \n","Exercite-se, como correr, nadar, praticar ioga... | \n","
4 | \n","Como posso me tornar um orador público eficaz. | \n","Para se tornar um orador público eficaz, comec... | \n","
... | \n","... | \n","... | \n","
48566 | \n","Quais são as melhores maneiras de se exercitar... | \n","Aproveite os exercícios de peso corporal - ele... | \n","
48567 | \n","Você pode me contar um pouco sobre a história ... | \n","A AIRES (AI Robotics Ethics Society) visa educ... | \n","
48568 | \n","Como consertar o alternador do meu carro. | \n","Para consertar o alternador do seu carro, você... | \n","
48569 | \n","Preciso de ajuda para escolher o melhor forno ... | \n","Encontrar o melhor forno para o seu espaço dep... | \n","
48570 | \n","O custo de vida é mais alto em Los Angeles ou ... | \n","O custo de vida em Los Angeles é geralmente ma... | \n","
48571 rows × 2 columns
\n","