Cylanoid committed
Commit 9a84d4a · Parent: 9396938

Updated training script for the latest changes

Files changed (1)
  1. train_llama.py +38 -89
train_llama.py CHANGED
@@ -1,4 +1,5 @@
 from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
+from transformers import BitsAndBytesConfig
 import datasets
 import torch
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
@@ -14,122 +15,70 @@ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 MODEL_ID = "meta-llama/Llama-2-7b-hf"
 tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
 
-# Add padding token if it doesn't exist
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
 
-# Load the model with optimizations for A100 GPU
+# Quantization config
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+# Load model with FlashAttention 2
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch.bfloat16,  # Better for A100 GPUs
+    torch_dtype=torch.bfloat16,  # Matches A100
     device_map="auto",
-    use_flash_attention_2=True,  # Flash Attention for faster training
-    load_in_8bit=True  # Quantization for memory efficiency
+    quantization_config=quantization_config,
+    attn_implementation="flash_attention_2"
 )
 
-# Prepare the model for training with LoRA (more memory-efficient)
+# Prepare for LoRA
 model = prepare_model_for_kbit_training(model)
-
-# LoRA configuration
 peft_config = LoraConfig(
-    r=16,  # Rank
-    lora_alpha=32,  # Alpha
-    lora_dropout=0.05,  # Dropout
-    bias="none",
-    task_type="CAUSAL_LM",
-    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Attention modules for Llama
+    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
 )
-
 model = get_peft_model(model, peft_config)
-model.print_trainable_parameters()  # Print percentage of trainable parameters
+model.print_trainable_parameters()
 
-# Load the dataset with field="training_pairs"
+# Load dataset
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
-
-# Verify the dataset structure
 print("First example from dataset:", dataset["train"][0])
 
-# Define instruction template for formatting inputs
-def format_instruction(example):
-    # Adapt this template based on your specific use case and dataset format
-    return f"""<s>[INST] {example['input']} [/INST] {example['output']}</s>"""
-
-# Tokenization function
+# Tokenization (dynamic padding)
 def tokenize_data(example):
-    formatted_text = format_instruction(example)
-
-    # Tokenize with appropriate padding and truncation
-    inputs = tokenizer(
-        formatted_text,
-        padding="max_length",
-        truncation=True,
-        max_length=2048,  # Llama 2 context length
-        return_tensors="pt"
-    )
-
-    # Create labels (for causal language modeling, labels are the same as input_ids)
-    inputs["labels"] = inputs["input_ids"].clone()
-
-    # Keep tensors as-is
-    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
-    return inputs
-
-# Map without forcing Arrow schema
-tokenized_dataset = dataset["train"].map(
-    tokenize_data,
-    batched=False,
-    remove_columns=dataset["train"].column_names
-)
-
-# Debug: Print the first tokenized example
+    formatted_text = f"{example['input']} {example['output']}"
+    inputs = tokenizer(formatted_text, truncation=True, max_length=2048, return_tensors="pt")
+    input_ids = inputs["input_ids"].squeeze(0)
+    labels = inputs["input_ids"].clone().squeeze(0)
+    input_len = len(tokenizer(example['input'])["input_ids"])
+    labels[:input_len] = -100
+    return {"input_ids": input_ids, "labels": labels, "attention_mask": inputs["attention_mask"].squeeze(0)}
+
+tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
 print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
 
-# Custom data collator
+# Data collator
 def custom_data_collator(features):
-    batch = {}
-
-    # Stack tensors
-    batch["input_ids"] = torch.stack([f["input_ids"] for f in features])
-    batch["attention_mask"] = torch.stack([f["attention_mask"] for f in features])
-    batch["labels"] = torch.stack([f["labels"] for f in features])
-
-    return batch
+    return {
+        "input_ids": torch.stack([f["input_ids"] for f in features]),
+        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
+        "labels": torch.stack([f["labels"] for f in features])
+    }
 
-# Initialize accelerator for distributed training
+# Accelerator and training
 accelerator = Accelerator()
-
-# Training setup
 training_args = TrainingArguments(
-    output_dir="./fine_tuned_llama2",
-    per_device_train_batch_size=4,  # Larger batch size for A100
-    gradient_accumulation_steps=8,  # Accumulate gradients to increase effective batch size
-    eval_strategy="no",
-    save_strategy="steps",
-    save_steps=100,
-    save_total_limit=3,
-    num_train_epochs=3,
-    learning_rate=2e-5,
-    weight_decay=0.01,
-    logging_dir="./logs",
-    logging_steps=10,
-    bf16=True,  # Use bfloat16 for A100 GPUs
-    gradient_checkpointing=True,  # Memory optimization
-    optim="adamw_torch",
-    warmup_steps=100,
+    output_dir="./fine_tuned_llama2", per_device_train_batch_size=4, gradient_accumulation_steps=4,
+    eval_strategy="steps", eval_steps=50, save_strategy="steps", save_steps=100, save_total_limit=3,
+    num_train_epochs=3, learning_rate=2e-5, weight_decay=0.01, logging_dir="./logs", logging_steps=10,
+    bf16=True, gradient_checkpointing=True, optim="adamw_torch", warmup_steps=100
 )
-
 trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_dataset,
-    data_collator=custom_data_collator,
+    model=model, args=training_args,
+    train_dataset=tokenized_dataset.select(range(90)),
+    eval_dataset=tokenized_dataset.select(range(90, 112)),
+    data_collator=custom_data_collator
 )
-
-# Start fine-tuning
 trainer.train()
-
-# Save the fine-tuned model and tokenizer
 model.save_pretrained("./fine_tuned_llama2")
 tokenizer.save_pretrained("./fine_tuned_llama2")
-
 print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2")