import gradio as gr
import pandas as pd
import torch
import os
import gc
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import logging
# Set environment variables for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# [Your existing load_data, prepare_dataset, and tokenize_data functions]
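
# --- Minimal illustrative sketches of these helpers (assumptions: a CSV file with a
# single 'text' column; swap in your own implementations if your data differs) ---
def load_data(file_path):
    """Read the raw training data from a CSV file into a pandas DataFrame."""
    return pd.read_csv(file_path)


def prepare_dataset(df):
    """Wrap the DataFrame in a Hugging Face Dataset and carve out a small test split."""
    dataset = Dataset.from_pandas(df)
    return dataset.train_test_split(test_size=0.1)


def tokenize_data(dataset, tokenizer, max_length=512):
    """Tokenize the 'text' column and copy input_ids into labels for causal LM training."""
    def _tokenize(batch):
        tokens = tokenizer(
            batch["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens

    return dataset.map(_tokenize, batched=True, remove_columns=dataset["train"].column_names)
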
def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
"""
Fine-tune a model with ultra aggressive memory optimizations for small GPUs
"""
logger.info(f"Using model: {model_id}")
# Force CUDA garbage collection
if torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect()
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# ============ MEMORY OPTIMIZATION 1: MICRO BATCH SIZE ============
# Use batch size of 1 since we have only ~15GB GPU
actual_batch_size = 1
logger.info(f"Using micro batch size: {actual_batch_size} for ~15GB GPU")
    # ============ MEMORY OPTIMIZATION 2: 4-BIT QUANTIZATION ============
    # 4-bit is more memory efficient than 8-bit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    # Load model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,
        torch_dtype=torch.float16,
        # ============ MEMORY OPTIMIZATION 3: MODEL LOADING OPTIONS ============
        max_memory={0: "10GB"},    # Limit GPU memory usage
        offload_folder="offload",  # Set offload folder
        offload_state_dict=True,   # Offload state dict to CPU
    )
logger.info(f"Model parameters: {model.num_parameters():,}")
# Prepare model for training
model = prepare_model_for_kbit_training(model)
# Enable gradient checkpointing
model.gradient_checkpointing_enable()
logger.info("Gradient checkpointing enabled")
# ============ MEMORY OPTIMIZATION 4: MINIMAL LORA CONFIG ============
# Use absolute minimum LoRA configuration
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
r=2, # Minimal rank
lora_alpha=8, # Reduced alpha
lora_dropout=0.05, # Reduced dropout
target_modules=["q_proj", "v_proj"], # Only query and value projections
)
logger.info("Using minimal LoRA parameters: r=2, target=q_proj,v_proj only")
# Apply LoRA adapters
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
    # Define training arguments with extreme memory optimization
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: MICRO BATCH + HUGE ACCUMULATION ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=16,  # Accumulate gradients over many steps
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION ============
        fp16=True,
        # ============ MEMORY OPTIMIZATION 7: GRADIENT CHECKPOINTING ============
        gradient_checkpointing=True,
        # ============ MEMORY OPTIMIZATION 8: MINIMAL EVAL AND LOGGING ============
        logging_steps=50,
        save_strategy="no",        # Don't save checkpoints during training
        evaluation_strategy="no",  # Skip evaluation to save memory
        # ============ MEMORY OPTIMIZATION 9: DEEPSPEED OFFLOADING ============
        deepspeed={
            "zero_optimization": {
                "stage": 2,
                "offload_optimizer": {
                    "device": "cpu",
                    "pin_memory": True,
                },
                "allgather_partitions": True,
                "allgather_bucket_size": 5e8,
                "reduce_scatter": True,
                "reduce_bucket_size": 5e8,
                "overlap_comm": True,
                "contiguous_gradients": True,
            },
            "fp16": {
                "enabled": True,
            },
        },
        # Other parameters
        learning_rate=1e-4,  # Reduced learning rate
        weight_decay=0.01,
        warmup_ratio=0.03,
        optim="adamw_hf",  # HF's AdamW implementation
        report_to="none",
    )
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        tokenizer=tokenizer,  # Lets the Trainer pad batches and save the tokenizer with the model
    )

    # Final memory cleanup before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        logger.info("CUDA cache cleared before training")

    # Start training
    logger.info("Starting training with ultra memory-efficient settings...")
    trainer.train()

    # Save the model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")

    return model, tokenizer
# [Rest of your Gradio interface code]
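
# --- Minimal Gradio wiring (illustrative sketch; the input fields, defaults, and the
# run_finetuning helper below are assumptions, not part of the original interface) ---
def run_finetuning(model_id, data_path, output_dir, epochs):
    """Load and tokenize the data, then launch fine-tuning; returns a status string."""
    df = load_data(data_path)
    dataset = prepare_dataset(df)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenized = tokenize_data(dataset, tokenizer)
    finetune_model(model_id, tokenized, output_dir, int(epochs))
    return f"Training finished. LoRA adapters saved to {output_dir}"


with gr.Blocks() as demo:
    gr.Markdown("## LoRA fine-tuning with aggressive memory optimizations")
    model_id_box = gr.Textbox(label="Base model ID")
    data_path_box = gr.Textbox(label="Path to training CSV (with a 'text' column)")
    output_dir_box = gr.Textbox(label="Output directory", value="finetuned-model")
    epochs_box = gr.Number(label="Epochs", value=1)
    status_box = gr.Textbox(label="Status")
    gr.Button("Start fine-tuning").click(
        run_finetuning,
        inputs=[model_id_box, data_path_box, output_dir_box, epochs_box],
        outputs=status_box,
    )

if __name__ == "__main__":
    demo.launch()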