{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"fsEL5wE__Lhx"},"outputs":[],"source":["#this is for companies act\n","!pip install transformers\n","!python -m transformers.models.auto.modeling_auto get gpt2\n","!pip install torch\n","!pip install accelerate -U\n","!pip install transformers[torch]\n","!pip install datasets"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":739},"id":"1rH-Fc9E_VqP","outputId":"08c67862-3f70-4734-b4eb-f14147e9dc3c"},"outputs":[{"metadata":{"tags":null},"name":"stdout","output_type":"stream","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]},{"metadata":{"tags":null},"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.10/dist-packages/transformers/data/datasets/language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n"," warnings.warn(\n"]},{"data":{"text/html":["\n","
| Step | Training Loss |
|-----:|--------------:|
|  500 | 2.710600 |
| 1000 | 2.345700 |
| 1500 | 2.148000 |
| 2000 | 2.009000 |
| 2500 | 1.905100 |
| 3000 | 1.803900 |
| 3500 | 1.749700 |
| 4000 | 1.684100 |
| 4500 | 1.651500 |
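The `FutureWarning` above notes that `TextDataset` is slated for removal and points to the 🤗 Datasets library for preprocessing. A minimal sketch of an equivalent pipeline, assuming the same `claw_module.txt` file and 500-token blocks (the `group_texts` helper is illustrative, not a library function):

```python
from datasets import load_dataset
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
block_size = 500  # same block size as the TextDataset above

# Load the raw text file as a dataset (one example per line)
raw = load_dataset("text", data_files={"train": "/content/drive/My Drive/claw_module.txt"})

def tokenize(batch):
    return tokenizer(batch["text"])

tokenized = raw.map(tokenize, batched=True, remove_columns=["text"])

def group_texts(batch):
    # Concatenate all token ids, then cut the stream into fixed-size blocks
    ids = [tok for seq in batch["input_ids"] for tok in seq]
    total = (len(ids) // block_size) * block_size
    return {"input_ids": [ids[i : i + block_size] for i in range(0, total, block_size)]}

lm_dataset = tokenized.map(
    group_texts, batched=True, remove_columns=tokenized["train"].column_names
)

# lm_dataset["train"] can then replace `dataset` in the Trainer above;
# DataCollatorForLanguageModeling(mlm=False) still builds the labels.
```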
"],"text/plain":[" "]},"metadata":{}}],"source":["\n","import torch\n","\n","from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments\n","\n","\n","# Step 1: Mount Google Drive\n","from google.colab import drive\n","drive.mount('/content/drive')\n","\n","# Step 2: Download the Data File\n","import gdown\n","\n","\n","# Define your custom dataset file path\n","dataset_path = \"/content/drive/My Drive/claw_module.txt\"\n","# Load the pre-trained GPT-2 model and tokenizer\n","model_name = \"gpt2\" # You can choose another GPT-2 variant if needed\n","model = GPT2LMHeadModel.from_pretrained(model_name)\n","tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n","\n","# Create a configuration for training\n","config = GPT2Config(\n"," vocab_size=tokenizer.vocab_size,\n"," bos_token_id=tokenizer.bos_token_id,\n"," eos_token_id=tokenizer.eos_token_id,\n",")\n","\n","# Create a TextDataset using your custom dataset\n","dataset = TextDataset(\n"," tokenizer=tokenizer,\n"," file_path=dataset_path,\n"," block_size=500, # Adjust the block size as needed\n",")\n","\n","# Create a DataCollator for language modeling\n","data_collator = DataCollatorForLanguageModeling(\n"," tokenizer=tokenizer, mlm=False\n",")\n","\n","# Define the training arguments\n","training_args = TrainingArguments(\n"," output_dir=\"/content/drive/My Drive/model/\",\n"," num_train_epochs=16, # Adjust the number of training epochs\n"," per_device_train_batch_size=4, # Adjust the batch size\n"," save_steps=10_000, # Save a checkpoint every N steps\n"," save_total_limit=2, # Limit the total number of checkpoints saved\n",")\n","\n","# Create a Trainer instance\n","trainer = Trainer(\n"," model=model,\n"," args=training_args,\n"," data_collator=data_collator,\n"," train_dataset=dataset,\n",")\n","\n","# Fine-tune the model\n","trainer.train()\n","\n","# Save the final model\n","trainer.save_model()\n","\n","# Optionally, you can save the tokenizer separately\n","tokenizer.save_pretrained(\"/content/drive/My Drive/model/tokan\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":10446,"status":"ok","timestamp":1696296469018,"user":{"displayName":"Ansh Gupta","userId":"10344052996671114780"},"user_tz":-330},"id":"YQdmw-98BLiz","outputId":"d41019b1-cfcb-4213-aa4a-6fc6a0b0904e"},"outputs":[{"output_type":"stream","name":"stdout","text":["what is company?\n","\n","(a)\n","\n","The company is a company,\n","\n","(b)\n","\n","The company is a company,\n","\n","(c)\n","\n","The company is a company,\n","\n","(d)\n","\n","\n"]}],"source":["\n","import torch\n","from transformers import GPT2LMHeadModel, GPT2Tokenizer\n","\n","# Load the trained model and tokenizer\n","model_name = \"gpt2\" # Use the same model name you used for training\n","model_path = \"/content/drive/My Drive/model/\"\n","tokenizer_path = \"/content/drive/My Drive/model/tokan\"\n","\n","model = GPT2LMHeadModel.from_pretrained(model_path)\n","tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)\n","\n","# Input text for generation\n","input_text = \"what is company?\"\n","\n","# Tokenize the input text\n","input_ids = tokenizer.encode(input_text, return_tensors=\"pt\")\n","\n","# Generate text\n","output = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=50256)\n","\n","# Decode and print the generated text\n","generated_text = tokenizer.decode(output[0], 
Output:

```
what is company?

(a)

The company is a company,

(b)

The company is a company,

(c)

The company is a company,

(d)
```
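The greedy decoding used above (the default when `generate` is called without sampling flags) tends to collapse into loops like "The company is a company,". Sampling usually gives more varied text; the parameter values below are illustrative starting points, not tuned settings:

```python
# Sampling-based generation; all values here are starting points to tune
output = model.generate(
    input_ids,
    max_length=100,
    do_sample=True,          # sample from the distribution instead of greedy argmax
    top_k=50,                # consider only the 50 most likely next tokens
    top_p=0.95,              # nucleus sampling over the top 95% of probability mass
    temperature=0.8,         # values below 1.0 sharpen the distribution
    repetition_penalty=1.2,  # penalize tokens that have already been generated
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```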
\n"," \n"," \n"," \n"," Step \n"," Training Loss \n"," \n"," \n"," 500 \n"," 2.710600 \n"," \n"," \n"," 1000 \n"," 2.345700 \n"," \n"," \n"," 1500 \n"," 2.148000 \n"," \n"," \n"," 2000 \n"," 2.009000 \n"," \n"," \n"," 2500 \n"," 1.905100 \n"," \n"," \n"," 3000 \n"," 1.803900 \n"," \n"," \n"," 3500 \n"," 1.749700 \n"," \n"," \n"," 4000 \n"," 1.684100 \n"," \n"," \n"," \n","4500 \n"," 1.651500 \n","