Spaces:

Cylanoid
/

Nursing-Home-Fraud-Detection-using-Llama

Paused

App Files Files Community

Cylanoid committed on Mar 8

Commit

9a84d4a

1 Parent(s): 9396938

updated training script for last changes

Browse files

Files changed (1) hide show

train_llama.py +38 -89

train_llama.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
 import datasets
 import torch
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
@@ -14,122 +15,70 @@ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 MODEL_ID = "meta-llama/Llama-2-7b-hf"
 tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
-# Add padding token if it doesn't exist
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-# Load the model with optimizations for A100 GPU
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch.bfloat16,  # Better for A100 GPUs
     device_map="auto",
-    use_flash_attention_2=True,  # Flash Attention for faster training
-    load_in_8bit=True  # Quantization for memory efficiency
 )
-# Prepare the model for training with LoRA (more memory-efficient)
 model = prepare_model_for_kbit_training(model)
-# LoRA configuration
 peft_config = LoraConfig(
-    r=16,               # Rank
-    lora_alpha=32,      # Alpha
-    lora_dropout=0.05,  # Dropout
-    bias="none",
-    task_type="CAUSAL_LM",
-    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Attention modules for Llama
 )
 model = get_peft_model(model, peft_config)
-model.print_trainable_parameters()  # Print percentage of trainable parameters
-# Load the dataset with field="training_pairs"
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
-# Verify the dataset structure
 print("First example from dataset:", dataset["train"][0])
-# Define instruction template for formatting inputs
-def format_instruction(example):
-    # Adapt this template based on your specific use case and dataset format
-    return f"""<s>[INST] {example['input']} [/INST] {example['output']}</s>"""
-# Tokenization function
 def tokenize_data(example):
-    formatted_text = format_instruction(example)
-    # Tokenize with appropriate padding and truncation
-    inputs = tokenizer(
-        formatted_text,
-        padding="max_length",
-        truncation=True,
-        max_length=2048,  # Llama 2 context length
-        return_tensors="pt"
-    )
-    # Create labels (for causal language modeling, labels are the same as input_ids)
-    inputs["labels"] = inputs["input_ids"].clone()
-    # Keep tensors as-is
-    inputs = {k: v.squeeze(0) for k, v in inputs.items()}
-    return inputs
-# Map without forcing Arrow schema
-tokenized_dataset = dataset["train"].map(
-    tokenize_data,
-    batched=False,
-    remove_columns=dataset["train"].column_names
-)
-# Debug: Print the first tokenized example
 print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
-# Custom data collator
 def custom_data_collator(features):
-    batch = {}
-    # Stack tensors
-    batch["input_ids"] = torch.stack([f["input_ids"] for f in features])
-    batch["attention_mask"] = torch.stack([f["attention_mask"] for f in features])
-    batch["labels"] = torch.stack([f["labels"] for f in features])
-    return batch
-# Initialize accelerator for distributed training
 accelerator = Accelerator()
-# Training setup
 training_args = TrainingArguments(
-    output_dir="./fine_tuned_llama2",
-    per_device_train_batch_size=4,  # Larger batch size for A100
-    gradient_accumulation_steps=8,  # Accumulate gradients to increase effective batch size
-    eval_strategy="no",
-    save_strategy="steps",
-    save_steps=100,
-    save_total_limit=3,
-    num_train_epochs=3,
-    learning_rate=2e-5,
-    weight_decay=0.01,
-    logging_dir="./logs",
-    logging_steps=10,
-    bf16=True,  # Use bfloat16 for A100 GPUs
-    gradient_checkpointing=True,  # Memory optimization
-    optim="adamw_torch",
-    warmup_steps=100,
 )
 trainer = Trainer(
-    model=model,
-    args=training_args,
-    train_dataset=tokenized_dataset,
-    data_collator=custom_data_collator,
 )
-# Start fine-tuning
 trainer.train()
-# Save the fine-tuned model and tokenizer
 model.save_pretrained("./fine_tuned_llama2")
 tokenizer.save_pretrained("./fine_tuned_llama2")
 print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2")

 from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
+from transformers import BitsAndBytesConfig
 import datasets
 import torch
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 # Setup: load the tokenizer, load the base model quantized to 8 bits, wrap it
 # with LoRA adapters, and read the training pairs from a local JSON file.
 # Checkpoint name shared by the tokenizer and the model below.
 MODEL_ID = "meta-llama/Llama-2-7b-hf"
 tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID)
 # Register a '[PAD]' token only when the tokenizer does not define one.
 # NOTE(review): nothing visible in this script resizes the model embeddings
 # after the vocabulary grows -- confirm the '[PAD]' id never reaches the
 # model, or call model.resize_token_embeddings(len(tokenizer)) after loading.
 if tokenizer.pad_token is None:
     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+# Quantization config
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+# Load model with FlashAttention 2
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
+    torch_dtype=torch.bfloat16,  # Matches A100
     device_map="auto",
+    quantization_config=quantization_config,
+    attn_implementation="flash_attention_2"
 )
+# Prepare for LoRA
 model = prepare_model_for_kbit_training(model)
 # LoRA adapters are attached to the four attention projection modules listed
 # in target_modules; bias="none" and task_type="CAUSAL_LM" as passed below.
 peft_config = LoraConfig(
+    r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM",
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
 )
 model = get_peft_model(model, peft_config)
+model.print_trainable_parameters()
+# Load dataset
 # Records are read from the "training_pairs" field of the JSON file; the
 # loaded examples are accessed through the "train" split.
 dataset = datasets.load_dataset("json", data_files="final_combined_fraud_data.json", field="training_pairs")
 print("First example from dataset:", dataset["train"][0])
+# Tokenization (dynamic padding)
 def tokenize_data(example):
     """Tokenize one input/output pair and mask the prompt part of the labels.

     The example's 'input' and 'output' strings are joined with a single
     space and tokenized together (truncated to 2048 tokens, no padding).
     Labels start as a copy of the token ids; the leading positions that
     correspond to the prompt are overwritten with -100.

     Args:
         example: Mapping with string fields 'input' and 'output'.

     Returns:
         Dict with 1-D tensors under "input_ids", "labels" and
         "attention_mask".
     """
     prompt = example['input']
     encoded = tokenizer(
         f"{prompt} {example['output']}",
         truncation=True,
         max_length=2048,
         return_tensors="pt",
     )
     # The tokenizer returns a leading batch axis of size 1; drop it.
     token_ids = encoded["input_ids"].squeeze(0)
     attention = encoded["attention_mask"].squeeze(0)
     # Prompt length is measured by tokenizing the prompt on its own.
     prompt_len = len(tokenizer(prompt)["input_ids"])
     targets = token_ids.clone()
     targets[:prompt_len] = -100
     return {"input_ids": token_ids, "labels": targets, "attention_mask": attention}
 # Tokenize the "train" split one example at a time (batched=False) and drop
 # all original columns so only the fields returned by tokenize_data remain.
+tokenized_dataset = dataset["train"].map(tokenize_data, batched=False, remove_columns=dataset["train"].column_names)
 # Debug aid: for each field of the first tokenized example, show its Python
 # type plus the tensor shape, or the literal "list" when it is not a tensor.
 print("First tokenized example:", {k: (type(v), v.shape if isinstance(v, torch.Tensor) else "list") for k, v in tokenized_dataset[0].items()})
+# Data collator
 def custom_data_collator(features, pad_token_id=0):
     """Collate tokenized examples into one right-padded batch of tensors.

     Each feature is a mapping whose "input_ids", "attention_mask" and
     "labels" entries are 1-D sequences, given either as tensors or as plain
     lists of ints. Sequences shorter than the longest one in the batch are
     padded on the right, so examples of different lengths can be batched
     (a bare torch.stack rejects both unequal lengths and plain lists).

     Args:
         features: List of per-example mappings as described above.
         pad_token_id: Value written into padded "input_ids" positions. Those
             positions also get attention_mask 0 and label -100, so it only
             needs to be a valid vocabulary index. Defaults to 0.

     Returns:
         Dict with "input_ids", "attention_mask" and "labels", each a tensor
         of shape (len(features), longest sequence length in the batch).
     """
     # Per-field padding value: 0 keeps padded positions out of attention, and
     # -100 is the default ignore index of PyTorch's cross-entropy loss.
     fill_values = {"input_ids": pad_token_id, "attention_mask": 0, "labels": -100}
     batch = {}
     for key, fill in fill_values.items():
         # as_tensor accepts tensors and plain lists alike.
         rows = [torch.as_tensor(feature[key]) for feature in features]
         batch[key] = torch.nn.utils.rnn.pad_sequence(
             rows, batch_first=True, padding_value=fill
         )
     return batch
+# Accelerator and training
 # Training run: configure the Trainer, hold out the tail of the tokenized
 # data for evaluation, fine-tune, then save the model and tokenizer.
 # The Accelerator object is created here but not referenced again below.
 accelerator = Accelerator()
 # NOTE(review): with roughly 90 training examples and 4 x 4 = 16 examples per
 # optimizer step per device, three epochs look like only a few dozen steps at
 # most, so eval_steps=50, save_steps=100 and warmup_steps=100 may never be
 # reached -- confirm against the real dataset size before relying on them.
 training_args = TrainingArguments(
     output_dir="./fine_tuned_llama2",
     per_device_train_batch_size=4,
     gradient_accumulation_steps=4,
     eval_strategy="steps",
     eval_steps=50,
     save_strategy="steps",
     save_steps=100,
     save_total_limit=3,
     num_train_epochs=3,
     learning_rate=2e-5,
     weight_decay=0.01,
     logging_dir="./logs",
     logging_steps=10,
     bf16=True,
     gradient_checkpointing=True,
     optim="adamw_torch",
     warmup_steps=100,
 )
 # Derive the split from the dataset size instead of hard-coding indices:
 # the last fifth (rounded down, at least one example) is used for evaluation.
 # For 112 examples this is exactly the previous 0-89 / 90-111 split, but it
 # no longer fails on smaller datasets or ignores examples past index 111.
 num_examples = len(tokenized_dataset)
 if num_examples < 2:
     raise ValueError(
         f"Need at least 2 tokenized examples to split into train/eval, got {num_examples}"
     )
 num_eval = max(1, num_examples // 5)
 num_train = num_examples - num_eval
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_dataset.select(range(num_train)),
     eval_dataset=tokenized_dataset.select(range(num_train, num_examples)),
     data_collator=custom_data_collator,
 )
 trainer.train()
 # Persist the fine-tuned model and the tokenizer side by side.
 model.save_pretrained("./fine_tuned_llama2")
 tokenizer.save_pretrained("./fine_tuned_llama2")
 print("Training complete. Model and tokenizer saved to ./fine_tuned_llama2")