daresearch committed on
Commit d501a8a · verified · 1 Parent(s): e45b114

Update finetune_script.py

Files changed (1)
  1. finetune_script.py +137 -111
finetune_script.py CHANGED
@@ -1,112 +1,138 @@
- from accelerate import Accelerator
- from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
- from transformers import DataCollatorForLanguageModeling
- from datasets import Dataset
- from trl import SFTTrainer
  import os
- import pandas as pd
-
- # Initialize Accelerator for distributed training
- accelerator = Accelerator()
-
- if accelerator.is_local_main_process:
-     print(f"Running on {accelerator.device}")
-
- # ================================
- # Load Model and Tokenizer
- # ================================
- model_name = "unsloth/Meta-Llama-3.1-70B-bnb-4bit"
-
- # Use AutoTokenizer and AutoModelForCausalLM
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     load_in_4bit=True,
-     device_map="auto",  # Automatically shard across GPUs
-     torch_dtype="auto"  # Auto-adjust to fp16/bf16 based on mixed_precision
- )
-
- # ================================
- # LoRA (Low-Rank Adaptation)
- # ================================
- from peft import LoraConfig, get_peft_model
-
- # Apply LoRA for fine-tuning
- lora_config = LoraConfig(
-     r=16,
-     lora_alpha=16,
-     lora_dropout=0.1,
-     bias="none",
-     target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
- )
- model = get_peft_model(model, lora_config)
-
- # ================================
- # Load and Prepare Data
- # ================================
- # Load CSVs
- train_df = pd.read_csv("train.csv").fillna("")
- valid_df = pd.read_csv("valid.csv").fillna("")
-
- EOS_TOKEN = tokenizer.eos_token
-
-
- def format_prompts(df):
-     prompts = []
-     for _, row in df.iterrows():
-         inst, inp, out = row["instruction"], row["input"], row["output"]
-         prompt = f"### Instruction:\n{inst}\n\n### Input:\n{inp}\n\n### Response:\n{out}{EOS_TOKEN}"
-         prompts.append(prompt)
-     return prompts
-
-
- train_texts = format_prompts(train_df)
- valid_texts = format_prompts(valid_df)
-
- # Convert to Hugging Face datasets
- train_dataset = Dataset.from_dict({"text": train_texts})
- valid_dataset = Dataset.from_dict({"text": valid_texts})
-
- # Data collator
- data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
-
- # ================================
- # Training Arguments
- # ================================
- training_args = TrainingArguments(
-     output_dir="/workspace/outputs",
-     per_device_train_batch_size=1,
-     gradient_accumulation_steps=16,
-     evaluation_strategy="steps",
-     eval_steps=100,
-     save_steps=100,
-     save_total_limit=2,
-     logging_steps=10,
-     learning_rate=2e-4,
-     max_steps=500,
-     fp16=True,  # Use mixed precision for efficiency
-     report_to="none",
- )
-
- # ================================
- # Initialize Trainer
- # ================================
- trainer = SFTTrainer(
-     model=model,
-     tokenizer=tokenizer,
-     train_dataset=train_dataset,
-     eval_dataset=valid_dataset,
-     args=training_args,
-     data_collator=data_collator,
- )
-
- # ================================
- # Start Training
- # ================================
- if accelerator.is_local_main_process:
-     print("Starting training...")
-
- trainer.train()
-
- if accelerator.is_local_main_process:
-     print("Training completed successfully.")
+ # 0.2 Import Dependencies
  import os
+ import torch
+ from transformers import TextStreamer, TrainingArguments
+ from datasets import load_dataset
+ from trl import SFTTrainer
+ from unsloth import FastLanguageModel, is_bfloat16_supported
+
+ # 0.3 Import notebook_launcher from Accelerate
+ from accelerate import notebook_launcher
+
+ def train():
+     # 1. Configuration
+     max_seq_length = 2048
+     dtype = None
+     load_in_4bit = True
+
+     # Example Alpaca-style prompt template
+     alpaca_prompt = """Below is an instruction that describes a task.
+ Write a response that appropriately completes the request.
+ ### Instruction:
+ {}
+ ### Input:
+ {}
+ ### Response:
+ {}
+ """
+
+     instruction = """This assistant is trained to code executive ranks ..."""
+     input = "In 2015 the company ..."
+     huggingface_model_name = "daresearch/Llama-3.1-70B-bnb-4bit-Exec-Labeling"
+
+     # 2. Before Training
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name="unsloth/Meta-Llama-3.1-70B-bnb-4bit",
+         max_seq_length=max_seq_length,
+         dtype=dtype,
+         load_in_4bit=load_in_4bit,
+         token=os.getenv("HF_TOKEN"),
+     )
+
+     # Quick inference test (optional)
+     FastLanguageModel.for_inference(model)
+     inputs = tokenizer([alpaca_prompt.format(instruction, input, "")], return_tensors="pt").to("cuda")
+     text_streamer = TextStreamer(tokenizer)
+     _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=1000)
+
+     # 3. Load and Format Data
+     EOS_TOKEN = tokenizer.eos_token
+     def formatting_prompts_func(examples):
+         instructions = examples["instruction"]
+         inputs = examples["input"]
+         outputs = examples["output"]
+         texts = []
+         for i, inp, out in zip(instructions, inputs, outputs):
+             text = alpaca_prompt.format(i, inp, out) + EOS_TOKEN
+             texts.append(text)
+         return {"text": texts}
+
+     train_dataset = load_dataset("csv", data_files="train.csv", split="train")
+     valid_dataset = load_dataset("csv", data_files="valid.csv", split="train")
+
+     train_dataset = train_dataset.map(formatting_prompts_func, batched=True)
+     valid_dataset = valid_dataset.map(formatting_prompts_func, batched=True)
+
+     # 4. Prepare LoRA Model
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=16,
+         target_modules=[
+             "q_proj", "k_proj", "v_proj", "o_proj",
+             "gate_proj", "up_proj", "down_proj",
+         ],
+         lora_alpha=16,
+         lora_dropout=0,
+         bias="none",
+         use_gradient_checkpointing="unsloth",
+         random_state=3407,
+         use_rslora=False,
+         loftq_config=None,
+     )
+
+     # 5. Training
+     trainer = SFTTrainer(
+         model=model,
+         tokenizer=tokenizer,
+         train_dataset=train_dataset,
+         eval_dataset=valid_dataset,
+         dataset_text_field="text",
+         max_seq_length=max_seq_length,
+         dataset_num_proc=2,
+         packing=False,
+         args=TrainingArguments(
+             per_device_train_batch_size=2,
+             gradient_accumulation_steps=4,
+             warmup_steps=5,
+             max_steps=100,
+             learning_rate=2e-4,
+             fp16=not is_bfloat16_supported(),
+             bf16=is_bfloat16_supported(),
+             logging_steps=1,
+             evaluation_strategy="steps",
+             eval_steps=10,
+             optim="adamw_8bit",
+             weight_decay=0.01,
+             lr_scheduler_type="linear",
+             seed=3407,
+             output_dir="outputs",
+         ),
+     )
+
+     trainer_stats = trainer.train()
+     print("Training complete.")
+
+     # 6. Evaluation
+     eval_stats = trainer.evaluate(eval_dataset=valid_dataset)
+     print(f"Validation Loss: {eval_stats['eval_loss']}")
+     if "eval_accuracy" in eval_stats:
+         print(f"Validation Accuracy: {eval_stats['eval_accuracy']}")
+
+     # 7. Saving & Uploading
+     FastLanguageModel.for_inference(model)
+     model.save_pretrained("lora_model")
+     tokenizer.save_pretrained("lora_model")
+     model.push_to_hub(huggingface_model_name, token=os.getenv("HF_TOKEN"))
+     tokenizer.push_to_hub(huggingface_model_name, token=os.getenv("HF_TOKEN"))
+
+     # Merge LoRA weights to create a 16-bit model
+     model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
+     model.push_to_hub_merged(
+         huggingface_model_name,
+         tokenizer,
+         save_method="merged_16bit",
+         token=os.getenv("HF_TOKEN"),
+     )
+
+ # 0.4 Launch training inside this same script/notebook using multiple GPUs
+ notebook_launcher(train, num_processes=4)  # Adjust num_processes for your number of GPUs
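
After training, the artifacts pushed above can be loaded back for labeling. Below is a minimal inference sketch, assuming the fine-tuned weights are available under daresearch/Llama-3.1-70B-bnb-4bit-Exec-Labeling (the repo name used in the script), unsloth is installed, and a CUDA GPU is present; the instruction and input strings are the truncated placeholders from the script and would be replaced with real text.

    import os
    from transformers import TextStreamer
    from unsloth import FastLanguageModel

    # Load the fine-tuned model pushed by the script above (4-bit load keeps memory manageable).
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="daresearch/Llama-3.1-70B-bnb-4bit-Exec-Labeling",
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
        token=os.getenv("HF_TOKEN"),
    )
    FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference path

    # Same Alpaca-style template used for training; leave the response slot empty.
    prompt = """Below is an instruction that describes a task.
    Write a response that appropriately completes the request.
    ### Instruction:
    {}
    ### Input:
    {}
    ### Response:
    {}
    """.format("This assistant is trained to code executive ranks ...", "In 2015 the company ...", "")

    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    _ = model.generate(**inputs, streamer=TextStreamer(tokenizer), max_new_tokens=256)

Note that notebook_launcher targets notebook sessions; when the file is run as a plain script, launching it with accelerate launch finetune_script.py and calling train() directly is the usual alternative.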