{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "aa178322-0de1-46e3-bdaa-935d448cafda", "metadata": {}, "outputs": [], "source": [ "#SFT \n", "from unsloth import FastLanguageModel\n", "import torch\n", "max_seq_length = 2048*4 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n", "datapath = 'readsy/stories/'\n", "pairpath = 'readsy/pairs/readsy_story_pairs0407.csv'\n", "mode='m3'\n", "split_by = 'genre'\n", "model_name = 'model/gemma/gemma-2b/'\n", "lease_likes = 10\n", "suffix = 'vast'\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-1] + '_sft' + mode + split_by + str(lease_likes) + suffix\n" ] }, { "cell_type": "code", "execution_count": null, "id": "280c81eb-4879-41d9-aea4-1dffc2edf836", "metadata": {}, "outputs": [], "source": [ "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " use_gradient_checkpointing = \"unsloth\",\n", " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5989150b-1ad0-4168-8a28-d0379045ddd7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the total number of pairs is 29618\n", "the number of effective pairs is 23244\n", "Index(['prompt_id', 'prompt', 'story_id', 'story_title', 'story_author',\n", " 'story_url', 'link', 'genre', 'is_sensitive', 'categories', 'likes',\n", " 'story_text', 'posted_date', 'comments'],\n", " dtype='object')\n", "{'Horror': 1887, 'Middle School': 1770, 'Character': 1474, 'Thriller and Suspense': 1104, 'Adults': 1090, 'Fluff': 1070, 'Kids': 1063, 'Dialogue': 978, 'Mystery': 920, 'Science Fiction': 849, 'Teens': 824, 'Romance': 806, 'Angst': 802, 'Dramatic': 729, 'Summer': 715, 'Adventure': 697, 'High School': 639, 'Fiction': 585, 'Novel': 510, 'Dark': 505, 'Sad': 481, 'Winter': 432, 'Fantasy': 417, 'Narrative': 403, \"Valentine's Day\": 362, 'Spring': 304, 'Nonfiction': 283, 'Dystopian': 237, 'Short Story': 223, 'Funny': 219, 'Halloween': 208, 'Fall': 206, 'Holiday': 158, 'Historical Fiction': 118, 'Christmas': 89, 'Vampire': 54, 'Thanksgiving': 33}\n", "the genre of test set is ['Horror']\n", "the percentage of test set is 0.08118224057821373 where total is 23244\n" ] } ], "source": [ "from dataloader import StoryPairDataset\n", "SPdataloader = StoryPairDataset(datapath,\n", " pairpath,\n", " tokenizer,\n", " task='sft',\n", " used_dataset_size=-1,\n", " train_test_split=0.1,\n", " split_by=split_by,\n", " max_len=4096,\n", " mode= mode,\n", " max_time_window=3600,\n", " least_likes= lease_likes,\n", " margin=False)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ec67afee-86b1-4c91-b3ad-013db3e36bf5", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "3d804ea0-5619-49a8-87b7-1e6149589865", "metadata": {}, "outputs": [], "source": [ "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = SPdataloader.dataset[\"train\"],\n", " eval_dataset = SPdataloader.dataset[\"test\"],\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " dataset_num_proc = 1,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 1,\n", " gradient_accumulation_steps = 2,\n", " warmup_steps = 5,\n", " num_train_epochs = 1,\n", " learning_rate = 1e-4,\n", " fp16 = not torch.cuda.is_bf16_supported(),\n", " bf16 = torch.cuda.is_bf16_supported(),\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"cosine\",\n", " seed = 3407,\n", " output_dir = save_path,\n", " ),\n", ")\n", "trainer.train()\n", "#save the model AND the tokenizer\n", "trainer.save_model(save_path)\n", "#trainer.save_tokenizer(save_path)\n", "print('model saved at', save_path)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2f85bcda-a568-4d4e-b2e1-4f06972df5d3", "metadata": {}, "outputs": [], "source": [ "#SFT \n", "from unsloth import FastLanguageModel\n", "import torch\n", "max_seq_length = 2048*4 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n", "datapath = 'readsy/stories/'\n", "pairpath = 'readsy/pairs/readsy_story_pairs0407.csv'\n", "mode='m3'\n", "split_by = 'time'\n", "model_name = 'model/gemma/gemma-2b/'\n", "lease_likes = 10\n", "suffix = 'vast'\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-1] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " use_gradient_checkpointing = \"unsloth\",\n", " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")\n", "from dataloader import StoryPairDataset\n", "SPdataloader = StoryPairDataset(datapath,\n", " pairpath,\n", " tokenizer,\n", " task='sft',\n", " used_dataset_size=-1,\n", " train_test_split=0.1,\n", " split_by=split_by,\n", " max_len=4096,\n", " mode= mode,\n", " max_time_window=3600,\n", " least_likes= lease_likes,\n", " margin=False)\n", "\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = SPdataloader.dataset[\"train\"],\n", " eval_dataset = SPdataloader.dataset[\"test\"],\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " dataset_num_proc = 1,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 1,\n", " gradient_accumulation_steps = 2,\n", " warmup_steps = 5,\n", " num_train_epochs = 1,\n", " learning_rate = 1e-4,\n", " fp16 = not torch.cuda.is_bf16_supported(),\n", " bf16 = torch.cuda.is_bf16_supported(),\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"cosine\",\n", " seed = 3407,\n", " output_dir = save_path,\n", " ),\n", ")\n", "trainer.train()\n", "#save the model AND the tokenizer\n", "trainer.save_model(save_path)\n", "#trainer.save_tokenizer(save_path)\n", "print('model saved at', save_path)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "758d7c03-a9f3-415b-a12e-4f508332cb22", "metadata": {}, "outputs": [], "source": [ "#SFT \n", "from unsloth import FastLanguageModel\n", "import torch\n", "max_seq_length = 2048*4 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n", "datapath = 'readsy/stories/'\n", "pairpath = 'readsy/pairs/readsy_story_pairs0407.csv'\n", "mode='m3'\n", "split_by = 'random'\n", "model_name = 'model/gemma/gemma-2b/'\n", "lease_likes = 10\n", "suffix = 'vast'\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-1] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " use_gradient_checkpointing = \"unsloth\",\n", " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")\n", "from dataloader import StoryPairDataset\n", "SPdataloader = StoryPairDataset(datapath,\n", " pairpath,\n", " tokenizer,\n", " task='sft',\n", " used_dataset_size=-1,\n", " train_test_split=0.1,\n", " split_by=split_by,\n", " max_len=4096,\n", " mode= mode,\n", " max_time_window=3600,\n", " least_likes= lease_likes,\n", " margin=False)\n", "\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = SPdataloader.dataset[\"train\"],\n", " eval_dataset = SPdataloader.dataset[\"test\"],\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " dataset_num_proc = 1,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 1,\n", " gradient_accumulation_steps = 2,\n", " warmup_steps = 5,\n", " num_train_epochs = 1,\n", " learning_rate = 1e-4,\n", " fp16 = not torch.cuda.is_bf16_supported(),\n", " bf16 = torch.cuda.is_bf16_supported(),\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"cosine\",\n", " seed = 3407,\n", " output_dir = save_path,\n", " ),\n", ")\n", "trainer.train()\n", "#save the model AND the tokenizer\n", "trainer.save_model(save_path)\n", "#trainer.save_tokenizer(save_path)\n", "print('model saved at', save_path)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ebc6ed22-6469-4385-a44c-700084cc43cc", "metadata": {}, "outputs": [], "source": [ "#SFT \n", "from unsloth import FastLanguageModel\n", "import torch\n", "max_seq_length = 2048*4 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n", "datapath = 'readsy/stories/'\n", "pairpath = 'readsy/pairs/readsy_story_pairs0407.csv'\n", "mode='m2'\n", "split_by = 'time'\n", "model_name = 'model/gemma/gemma-2b/'\n", "lease_likes = 10\n", "suffix = 'vast'\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-1] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " use_gradient_checkpointing = \"unsloth\",\n", " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")\n", "from dataloader import StoryPairDataset\n", "SPdataloader = StoryPairDataset(datapath,\n", " pairpath,\n", " tokenizer,\n", " task='sft',\n", " used_dataset_size=-1,\n", " train_test_split=0.1,\n", " split_by=split_by,\n", " max_len=4096,\n", " mode= mode,\n", " max_time_window=3600,\n", " least_likes= lease_likes,\n", " margin=False)\n", "\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = SPdataloader.dataset[\"train\"],\n", " eval_dataset = SPdataloader.dataset[\"test\"],\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " dataset_num_proc = 1,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 1,\n", " gradient_accumulation_steps = 2,\n", " warmup_steps = 5,\n", " num_train_epochs = 1,\n", " learning_rate = 1e-4,\n", " fp16 = not torch.cuda.is_bf16_supported(),\n", " bf16 = torch.cuda.is_bf16_supported(),\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"cosine\",\n", " seed = 3407,\n", " output_dir = save_path,\n", " ),\n", ")\n", "trainer.train()\n", "#save the model AND the tokenizer\n", "trainer.save_model(save_path)\n", "#trainer.save_tokenizer(save_path)\n", "print('model saved at', save_path)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "82c79072-5128-4bc2-844c-ae001616a402", "metadata": {}, "outputs": [], "source": [ "#SFT \n", "from unsloth import FastLanguageModel\n", "import torch\n", "max_seq_length = 2048*4 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n", "datapath = 'readsy/stories/'\n", "pairpath = 'readsy/pairs/readsy_story_pairs0407.csv'\n", "mode='m2'\n", "split_by = 'random'\n", "model_name = 'model/gemma/gemma-2b/'\n", "lease_likes = 10\n", "suffix = 'vast'\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-1] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " use_gradient_checkpointing = \"unsloth\",\n", " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")\n", "from dataloader import StoryPairDataset\n", "SPdataloader = StoryPairDataset(datapath,\n", " pairpath,\n", " tokenizer,\n", " task='sft',\n", " used_dataset_size=-1,\n", " train_test_split=0.1,\n", " split_by=split_by,\n", " max_len=4096,\n", " mode= mode,\n", " max_time_window=3600,\n", " least_likes= lease_likes,\n", " margin=False)\n", "\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = SPdataloader.dataset[\"train\"],\n", " eval_dataset = SPdataloader.dataset[\"test\"],\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " dataset_num_proc = 1,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 1,\n", " gradient_accumulation_steps = 2,\n", " warmup_steps = 5,\n", " num_train_epochs = 1,\n", " learning_rate = 1e-4,\n", " fp16 = not torch.cuda.is_bf16_supported(),\n", " bf16 = torch.cuda.is_bf16_supported(),\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"cosine\",\n", " seed = 3407,\n", " output_dir = save_path,\n", " ),\n", ")\n", "trainer.train()\n", "#save the model AND the tokenizer\n", "trainer.save_model(save_path)\n", "#trainer.save_tokenizer(save_path)\n", "print('model saved at', save_path)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "69f5a65f-8ddc-42a2-873f-432cb363386d", "metadata": {}, "outputs": [], "source": [ "#SFT \n", "from unsloth import FastLanguageModel\n", "import torch\n", "max_seq_length = 2048*4 # Choose any! We auto support RoPE Scaling internally!\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n", "datapath = 'readsy/stories/'\n", "pairpath = 'readsy/pairs/readsy_story_pairs0407.csv'\n", "mode='m2'\n", "split_by = 'genre'\n", "model_name = 'model/gemma/gemma-2b/'\n", "lease_likes = 10\n", "suffix = 'vast'\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-1] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")\n", "model = FastLanguageModel.get_peft_model(\n", " model,\n", " use_gradient_checkpointing = \"unsloth\",\n", " r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")\n", "from dataloader import StoryPairDataset\n", "SPdataloader = StoryPairDataset(datapath,\n", " pairpath,\n", " tokenizer,\n", " task='sft',\n", " used_dataset_size=-1,\n", " train_test_split=0.1,\n", " split_by=split_by,\n", " max_len=4096,\n", " mode= mode,\n", " max_time_window=3600,\n", " least_likes= lease_likes,\n", " margin=False)\n", "\n", "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "from trl import SFTTrainer\n", "from transformers import TrainingArguments\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = SPdataloader.dataset[\"train\"],\n", " eval_dataset = SPdataloader.dataset[\"test\"],\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " dataset_num_proc = 1,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 1,\n", " gradient_accumulation_steps = 2,\n", " warmup_steps = 5,\n", " num_train_epochs = 1,\n", " learning_rate = 1e-4,\n", " fp16 = not torch.cuda.is_bf16_supported(),\n", " bf16 = torch.cuda.is_bf16_supported(),\n", " logging_steps = 1,\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"cosine\",\n", " seed = 3407,\n", " output_dir = save_path,\n", " ),\n", ")\n", "trainer.train()\n", "#save the model AND the tokenizer\n", "trainer.save_model(save_path)\n", "#trainer.save_tokenizer(save_path)\n", "print('model saved at', save_path)\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "357116f9-e206-4a77-acf6-43835d2b83bf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Prompt: Write a story about discovering a lost manuscript. It can be from a famous (or infamous) author, or an unknown one.\n", "inputs: <|im_start|>user\n", "Write a story about discovering a lost manuscript. It can be from a famous (or infamous) author, or an unknown one.<|im_end|>\n", "<|im_start|>assistant\n", "\n", "inputs encoded: tensor([[ 2, 2, 235322, 235371, 571, 235298, 2997, 73786, 1645,\n", " 108, 5559, 476, 3904, 1105, 59551, 476, 5501, 28086,\n", " 235265, 1165, 798, 614, 774, 476, 10964, 591, 483,\n", " 76100, 235275, 3426, 235269, 689, 671, 12417, 974, 35606,\n", " 235371, 571, 235298, 615, 73786, 108, 235322, 235371, 571,\n", " 235298, 2997, 73786, 105776, 108]])\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[10], line 23\u001b[0m\n\u001b[1;32m 21\u001b[0m prompt \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mWrite a story about discovering a lost manuscript. It can be from a famous (or infamous) author, or an unknown one.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPrompt:\u001b[39m\u001b[38;5;124m\"\u001b[39m, prompt)\n\u001b[0;32m---> 23\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwritten by the model:\u001b[39m\u001b[38;5;124m'\u001b[39m, model_path) \n\u001b[1;32m 25\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGenerated story:\u001b[39m\u001b[38;5;124m\"\u001b[39m, outputs)\n", "Cell \u001b[0;32mIn[10], line 14\u001b[0m, in \u001b[0;36mgenerate\u001b[0;34m(model, tokenizer, prompt, max_length)\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# Move inputs to GPU\u001b[39;00m\n\u001b[1;32m 12\u001b[0m inputs \u001b[38;5;241m=\u001b[39m inputs\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 14\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmax_length\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmin_new_tokens\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m500\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m#decode the outputs\u001b[39;00m\n\u001b[1;32m 16\u001b[0m outputs \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mdecode(outputs[\u001b[38;5;241m0\u001b[39m], skip_special_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/peft/peft_model.py:1491\u001b[0m, in \u001b[0;36mPeftModelForCausalLM.generate\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1489\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_enable_peft_forward_hooks(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 1490\u001b[0m kwargs \u001b[38;5;241m=\u001b[39m {k: v \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m kwargs\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m k \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mspecial_peft_forward_args}\n\u001b[0;32m-> 1491\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbase_model\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1492\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1493\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbase_model\u001b[38;5;241m.\u001b[39mgenerate(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py:1758\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)\u001b[0m\n\u001b[1;32m 1750\u001b[0m input_ids, model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_expand_inputs_for_generation(\n\u001b[1;32m 1751\u001b[0m input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[1;32m 1752\u001b[0m expand_size\u001b[38;5;241m=\u001b[39mgeneration_config\u001b[38;5;241m.\u001b[39mnum_return_sequences,\n\u001b[1;32m 1753\u001b[0m is_encoder_decoder\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_encoder_decoder,\n\u001b[1;32m 1754\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs,\n\u001b[1;32m 1755\u001b[0m )\n\u001b[1;32m 1757\u001b[0m \u001b[38;5;66;03m# 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)\u001b[39;00m\n\u001b[0;32m-> 1758\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sample\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1759\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1760\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogits_processor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_processor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1761\u001b[0m \u001b[43m \u001b[49m\u001b[43mlogits_warper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_warper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1762\u001b[0m \u001b[43m \u001b[49m\u001b[43mstopping_criteria\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_stopping_criteria\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1763\u001b[0m \u001b[43m \u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1764\u001b[0m \u001b[43m \u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1765\u001b[0m \u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstreamer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1766\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1767\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1769\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m generation_mode \u001b[38;5;129;01min\u001b[39;00m (GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SAMPLE, GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SEARCH):\n\u001b[1;32m 1770\u001b[0m \u001b[38;5;66;03m# 11. prepare logits warper\u001b[39;00m\n\u001b[1;32m 1771\u001b[0m prepared_logits_warper \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 1772\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_logits_warper(generation_config) \u001b[38;5;28;01mif\u001b[39;00m generation_config\u001b[38;5;241m.\u001b[39mdo_sample \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1773\u001b[0m )\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py:2392\u001b[0m, in \u001b[0;36mGenerationMixin._sample\u001b[0;34m(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, logits_warper, **model_kwargs)\u001b[0m\n\u001b[1;32m 2389\u001b[0m unfinished_sequences \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mones(batch_size, dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mlong, device\u001b[38;5;241m=\u001b[39minput_ids\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 2390\u001b[0m model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_initial_cache_position(input_ids, model_kwargs)\n\u001b[0;32m-> 2392\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_has_unfinished_sequences\u001b[49m\u001b[43m(\u001b[49m\u001b[43mthis_peer_finished\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 2393\u001b[0m \u001b[38;5;66;03m# prepare model inputs\u001b[39;00m\n\u001b[1;32m 2394\u001b[0m model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprepare_inputs_for_generation(input_ids, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs)\n\u001b[1;32m 2396\u001b[0m \u001b[38;5;66;03m# forward pass to get next token\u001b[39;00m\n", "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/transformers/generation/utils.py:1922\u001b[0m, in \u001b[0;36mGenerationMixin._has_unfinished_sequences\u001b[0;34m(self, this_peer_finished, synced_gpus, device)\u001b[0m\n\u001b[1;32m 1920\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m this_peer_finished_flag\u001b[38;5;241m.\u001b[39mitem() \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0.0\u001b[39m:\n\u001b[1;32m 1921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m-> 1922\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m this_peer_finished:\n\u001b[1;32m 1923\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 1924\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "\n", "\n", "\n", "def generate(model, tokenizer, prompt, max_length=1024*4):\n", " chat = [\n", " {\"role\":\"user\", \"content\":prompt},\n", " ]\n", " inputs = tokenizer.apply_chat_template(chat, tokenize = False, add_generation_prompt = True)\n", " #add bos token\n", " inputs = tokenizer.bos_token + inputs\n", " print(\"inputs:\", inputs)\n", " inputs = tokenizer.encode(inputs, add_special_tokens=True, return_tensors=\"pt\")\n", " print(\"inputs encoded:\", inputs)\n", " # Move inputs to GPU\n", " inputs = inputs.to(\"cuda\")\n", " \n", " outputs = model.generate(input_ids=inputs, max_new_tokens = max_length, min_new_tokens = 500)\n", " #decode the outputs\n", " outputs = tokenizer.decode(outputs[0], skip_special_tokens=False)\n", " return outputs\n", "\n", "\n", "\n", "prompt = \"Write a story about discovering a lost manuscript. It can be from a famous (or infamous) author, or an unknown one.\"\n", "print(\"Prompt:\", prompt)\n", "outputs = generate(model, tokenizer, prompt)\n", "print('written by the model:', model_path) \n", "print(\"Generated story:\", outputs)\n", "print(\"Length of the generated story:\", len(outputs.split()))" ] }, { "cell_type": "code", "execution_count": 11, "id": "20c32f2e-0da4-446c-a722-74ebef7eb508", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'model/SFTmodels/gemma-2b_sftm3genre10vast'" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "save_path = 'model/SFTmodels/' +model_name.split('/')[-2] + '_sft' + mode + split_by + str(lease_likes) + suffix\n", "save_path" ] }, { "cell_type": "code", "execution_count": 14, "id": "859e0d8d-e677-4fca-981c-bca2590f2250", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "478d07be-fbfc-4ce1-841a-9345ff2a1cbd", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 5 }