Cylanoid committed on
Commit c1c1cb3 · 1 Parent(s): 13a5da0

oh boy oh noooo

Files changed (1)
  1. train_llama.py  +21 -12
train_llama.py CHANGED
@@ -2,6 +2,7 @@ from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArgu
 from transformers import BitsAndBytesConfig
 import datasets
 import torch
+from torch.nn.utils.rnn import pad_sequence
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from accelerate import Accelerator
 
@@ -43,15 +44,15 @@ model.print_trainable_parameters()
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
 print("First example from dataset:", dataset["train"][0])
 
-# Tokenization with tensors
+# Tokenization with lists (no tensors)
 def tokenize_data(example):
     formatted_text = f"{example['input']} {example['output']}"
-    inputs = tokenizer(formatted_text, truncation=True, max_length=2048, return_tensors="pt")
-    input_ids = inputs["input_ids"].squeeze(0)
-    labels = inputs["input_ids"].clone().squeeze(0)
+    inputs = tokenizer(formatted_text, truncation=True, max_length=2048)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    labels = input_ids.copy()
     input_len = len(tokenizer(example['input'])["input_ids"])
-    labels[:input_len] = -100
-    attention_mask = inputs["attention_mask"].squeeze(0)
+    labels[:input_len] = [-100] * input_len
     return {
         "input_ids": input_ids,
         "labels": labels,
@@ -59,16 +60,24 @@ def tokenize_data(example):
     }
 
 tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
-# Fix print to handle potential list or tensor
+# Print first example (lists with lengths)
 first_example = tokenized_dataset[0]
-print("First tokenized example:", {k: (type(v), v.shape if hasattr(v, 'shape') else len(v)) for k, v in first_example.items()})
+print("First tokenized example:", {k: (type(v), len(v)) for k, v in first_example.items()})
 
-# Data collator
+# Data collator: convert lists to tensors and pad
 def custom_data_collator(features):
+    input_ids = [torch.tensor(f["input_ids"]) for f in features]
+    attention_mask = [torch.tensor(f["attention_mask"]) for f in features]
+    labels = [torch.tensor(f["labels"]) for f in features]
+
+    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
+    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
+    labels = pad_sequence(labels, batch_first=True, padding_value=-100)
+
     return {
-        "input_ids": torch.stack([f["input_ids"] for f in features]),
-        "attention_mask": torch.stack([f["attention_mask"] for f in features]),
-        "labels": torch.stack([f["labels"] for f in features])
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "labels": labels
    }
 
 # Accelerator and training
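
Note: the hunk ends at the "# Accelerator and training" context line, so the diff does not show how custom_data_collator is handed to the trainer, and it assumes tokenizer.pad_token_id is set somewhere earlier in the script (LLaMA tokenizers ship without a pad token by default). A rough sketch of how the new pieces could be wired up, continuing from the objects the script already defines (model, tokenizer, tokenized_dataset, custom_data_collator); the output path and hyperparameters below are placeholders, not values from this commit:

# Hypothetical continuation sketch -- not part of this commit.
from transformers import Trainer, TrainingArguments

# custom_data_collator pads input_ids with tokenizer.pad_token_id, so a pad
# token must exist; reusing EOS is a common fallback for LLaMA tokenizers.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

training_args = TrainingArguments(
    output_dir="./llama_fraud_lora",   # placeholder path
    per_device_train_batch_size=1,     # placeholder hyperparameters
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=custom_data_collator,  # batches are padded here, not in tokenize_data
)
trainer.train()

Padding labels with -100 in the collator matches the prompt masking in tokenize_data: positions labeled -100 are ignored by the cross-entropy loss, so neither the prompt tokens nor the padding contribute to the training signal.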