{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"fsEL5wE__Lhx"},"outputs":[],"source":["#this is for companies act\n","!pip install transformers\n","!python -m transformers.models.auto.modeling_auto get gpt2\n","!pip install torch\n","!pip install accelerate -U\n","!pip install transformers[torch]\n","!pip install datasets"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":739},"id":"1rH-Fc9E_VqP","outputId":"08c67862-3f70-4734-b4eb-f14147e9dc3c"},"outputs":[{"metadata":{"tags":null},"name":"stdout","output_type":"stream","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]},{"metadata":{"tags":null},"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.10/dist-packages/transformers/data/datasets/language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n","  warnings.warn(\n"]},{"data":{"text/html":["\n","    <div>\n","      \n","      <progress value='4356' max='5040' style='width:300px; height:20px; vertical-align: middle;'></progress>\n","      [4356/5040 45:42 < 07:10, 1.59 it/s, Epoch 13.83/16]\n","    </div>\n","    <table border=\"1\" class=\"dataframe\">\n","  <thead>\n"," <tr style=\"text-align: left;\">\n","      <th>Step</th>\n","      <th>Training Loss</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <td>500</td>\n","      <td>2.710600</td>\n","    </tr>\n","    <tr>\n","      <td>1000</td>\n","      <td>2.345700</td>\n","    </tr>\n","    <tr>\n","      <td>1500</td>\n","      <td>2.148000</td>\n","    </tr>\n","    <tr>\n","      <td>2000</td>\n","      <td>2.009000</td>\n","    </tr>\n","    <tr>\n","      <td>2500</td>\n","      <td>1.905100</td>\n","    </tr>\n","    <tr>\n","      <td>3000</td>\n","      <td>1.803900</td>\n","    </tr>\n","    <tr>\n","      <td>3500</td>\n","      <td>1.749700</td>\n","    </tr>\n","    <tr>\n","      <td>4000</td>\n","      <td>1.684100</td>\n","    </tr>\n","  </tbody>\n","</table><p>"],"text/plain":["<IPython.core.display.HTML object>"]},"metadata":{},"output_type":"display_data"},{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.HTML object>"],"text/html":["\n","    <div>\n","      \n","      <progress value='4569' max='5040' style='width:300px; height:20px; vertical-align: middle;'></progress>\n","      [4569/5040 47:56 < 04:56, 1.59 it/s, Epoch 14.50/16]\n","    </div>\n","    <table border=\"1\" class=\"dataframe\">\n","  <thead>\n"," <tr style=\"text-align: left;\">\n","      <th>Step</th>\n","      <th>Training Loss</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <td>500</td>\n","      <td>2.710600</td>\n","    </tr>\n","    <tr>\n","      <td>1000</td>\n","      <td>2.345700</td>\n","    </tr>\n","    <tr>\n","      <td>1500</td>\n","      <td>2.148000</td>\n","    </tr>\n","    <tr>\n","      <td>2000</td>\n","      <td>2.009000</td>\n","    </tr>\n","    <tr>\n","      <td>2500</td>\n","      <td>1.905100</td>\n","    </tr>\n","    <tr>\n","      <td>3000</td>\n","      <td>1.803900</td>\n","    </tr>\n","    <tr>\n","      <td>3500</td>\n","      <td>1.749700</td>\n","    </tr>\n","    <tr>\n","      <td>4000</td>\n","  
,"source":["\n","import torch\n","\n","from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments\n","\n","\n","# Step 1: Mount Google Drive\n","from google.colab import drive\n","drive.mount('/content/drive')\n","\n","# Step 2: Point to the custom dataset file on Google Drive\n","dataset_path = \"/content/drive/My Drive/claw_module.txt\"\n","\n","# Load the pre-trained GPT-2 model and tokenizer\n","model_name = \"gpt2\"  # You can choose another GPT-2 variant if needed\n","model = GPT2LMHeadModel.from_pretrained(model_name)\n","tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n","\n","# Create a TextDataset from the custom corpus\n","dataset = TextDataset(\n","    tokenizer=tokenizer,\n","    file_path=dataset_path,\n","    block_size=500,  # Adjust the block size as needed\n",")\n","\n","# Create a DataCollator for causal language modeling (no masking)\n","data_collator = DataCollatorForLanguageModeling(\n","    tokenizer=tokenizer, mlm=False\n",")\n","\n","# Define the training arguments\n","training_args = TrainingArguments(\n","    output_dir=\"/content/drive/My Drive/model/\",\n","    num_train_epochs=16,  # Adjust the number of training epochs\n","    per_device_train_batch_size=4,  # Adjust the batch size\n","    save_steps=10_000,  # Save a checkpoint every N steps\n","    save_total_limit=2,  # Limit the total number of checkpoints saved\n",")\n","\n","# Create a Trainer instance\n","trainer = Trainer(\n","    model=model,\n","    args=training_args,\n","    data_collator=data_collator,\n","    train_dataset=dataset,\n",")\n","\n","# Fine-tune the model\n","trainer.train()\n","\n","# Save the final model\n","trainer.save_model()\n","\n","# Optionally, save the tokenizer separately\n","tokenizer.save_pretrained(\"/content/drive/My Drive/model/tokan\")"]}
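,{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# NOTE: illustrative sketch only, not part of the original run.\n","# The FutureWarning above says TextDataset is deprecated and that preprocessing\n","# should move to the 🤗 Datasets library (already installed in the first cell).\n","# One possible migration, reusing dataset_path, tokenizer and the block size (500)\n","# assumed from the training cell above:\n","from datasets import load_dataset\n","\n","block_size = 500\n","raw = load_dataset(\"text\", data_files={\"train\": dataset_path})\n","\n","def tokenize(batch):\n","    return tokenizer(batch[\"text\"])\n","\n","tokenized = raw[\"train\"].map(tokenize, batched=True, remove_columns=[\"text\"])\n","\n","def group_texts(examples):\n","    # Concatenate all token ids and split them into fixed-size blocks.\n","    concatenated = sum(examples[\"input_ids\"], [])\n","    total_length = (len(concatenated) // block_size) * block_size\n","    return {\"input_ids\": [concatenated[i : i + block_size] for i in range(0, total_length, block_size)]}\n","\n","lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)\n","\n","# lm_dataset could then replace the TextDataset as Trainer's train_dataset; GPT-2 has no\n","# pad token, so set tokenizer.pad_token = tokenizer.eos_token before the collator pads\n","# these dict-style examples."]}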
,{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":10446,"status":"ok","timestamp":1696296469018,"user":{"displayName":"Ansh Gupta","userId":"10344052996671114780"},"user_tz":-330},"id":"YQdmw-98BLiz","outputId":"d41019b1-cfcb-4213-aa4a-6fc6a0b0904e"},"outputs":[{"output_type":"stream","name":"stdout","text":["what is company?\n","\n","(a)\n","\n","The company is a company,\n","\n","(b)\n","\n","The company is a company,\n","\n","(c)\n","\n","The company is a company,\n","\n","(d)\n","\n","\n"]}],"source":["\n","from transformers import GPT2LMHeadModel, GPT2Tokenizer\n","\n","# Load the fine-tuned model and tokenizer saved by the training cell\n","model_path = \"/content/drive/My Drive/model/\"\n","tokenizer_path = \"/content/drive/My Drive/model/tokan\"\n","\n","model = GPT2LMHeadModel.from_pretrained(model_path)\n","tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)\n","\n","# Input text for generation\n","input_text = \"what is company?\"\n","\n","# Tokenize the input text\n","input_ids = tokenizer.encode(input_text, return_tensors=\"pt\")\n","\n","# Generate text (50256 is GPT-2's end-of-text token id, used here for padding)\n","output = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=50256)\n","\n","# Decode and print the generated text\n","generated_text = tokenizer.decode(output[0], skip_special_tokens=True)\n","print(generated_text)"]}],"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyOkhD/fa3kYTIycqSsBpIP+"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}