import gradio as gr
import pandas as pd
import torch
import os
import gc
import logging
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

# Set environment variables for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# [Your existing load_data, prepare_dataset, and tokenize_data functions]
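# The helpers above are not shown in this Space. The sketch below is only an
# illustration of the shape finetune_model expects: a DatasetDict-like object
# whose "train" split is already tokenized. The CSV input, the "text" column
# name, and max_length are assumptions, not the original implementation.
def _example_prepare_dataset(csv_path, tokenizer, max_length=512):
    # Load a CSV with a "text" column into a Hugging Face Dataset
    df = pd.read_csv(csv_path)
    dataset = Dataset.from_pandas(df)

    def _tokenize(batch):
        # Truncate to a fixed length; labels are added later by the data collator
        return tokenizer(batch["text"], truncation=True, max_length=max_length)

    tokenized = dataset.map(_tokenize, batched=True, remove_columns=dataset.column_names)
    # Return a dict with a "train" split to match train_data["train"] below
    return tokenized.train_test_split(test_size=0.1)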
def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
    """
    Fine-tune a model with ultra-aggressive memory optimizations for small GPUs.
    """
    logger.info(f"Using model: {model_id}")

    # Force CUDA garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # ============ MEMORY OPTIMIZATION 1: MICRO BATCH SIZE ============
    # Override any requested batch_size: use a micro batch of 1 on a ~15GB GPU
    actual_batch_size = 1
    logger.info(f"Using micro batch size: {actual_batch_size} for ~15GB GPU")
    # ============ MEMORY OPTIMIZATION 2: 4-bit QUANTIZATION ============
    # 4-bit is more memory efficient than 8-bit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    # Load model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,
        torch_dtype=torch.float16,
        # ============ MEMORY OPTIMIZATION 3: MODEL LOADING OPTIONS ============
        max_memory={0: "10GB"},      # Limit GPU 0 memory usage
        offload_folder="offload",    # Folder for weights offloaded to disk
        offload_state_dict=True,     # Offload the state dict to CPU while loading
    )
    logger.info(f"Model parameters: {model.num_parameters():,}")
    # Prepare model for training
    model = prepare_model_for_kbit_training(model)

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled")
    # ============ MEMORY OPTIMIZATION 4: MINIMAL LORA CONFIG ============
    # Use the absolute minimum LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=2,                                   # Minimal rank
        lora_alpha=8,                          # Reduced alpha
        lora_dropout=0.05,                     # Reduced dropout
        target_modules=["q_proj", "v_proj"],   # Only query and value projections
    )
    logger.info("Using minimal LoRA parameters: r=2, target=q_proj,v_proj only")

    # Apply LoRA adapters
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    # Define training arguments with extreme memory optimization
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: MICRO BATCH + HUGE ACCUMULATION ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=16,  # Accumulate gradients (effective batch size = 1 x 16 = 16)
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION ============
        fp16=True,
        # ============ MEMORY OPTIMIZATION 7: GRADIENT CHECKPOINTING ============
        gradient_checkpointing=True,
        # ============ MEMORY OPTIMIZATION 8: MINIMAL EVAL AND LOGGING ============
        logging_steps=50,
        save_strategy="no",         # Don't save checkpoints during training
        evaluation_strategy="no",   # Skip evaluation to save memory
        # ============ MEMORY OPTIMIZATION 9: PAGED 8-BIT OPTIMIZER ============
        # The original DeepSpeed ZeRO-2 CPU-offload config was dropped here: the
        # Trainer's DeepSpeed integration does not mix well with a model loaded via
        # device_map="auto" plus 4-bit quantization on a single GPU, and it also
        # requires the deepspeed package. A paged 8-bit AdamW (from bitsandbytes,
        # which 4-bit loading already requires) keeps optimizer-state memory small.
        optim="paged_adamw_8bit",
        # Other parameters
        learning_rate=1e-4,  # Reduced learning rate
        weight_decay=0.01,
        warmup_ratio=0.03,
        report_to="none",
    )
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        # If tokenize_data does not already add a "labels" column, a causal-LM
        # collator is needed so the model returns a loss; drop this line if
        # labels are created during tokenization.
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        tokenizer=tokenizer,  # Used for padding and saved alongside the model
    )
    # Final memory cleanup before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        logger.info("CUDA cache cleared before training")

    # Start training
    logger.info("Starting training with ultra memory-efficient settings...")
    trainer.train()

    # Save the model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")

    return model, tokenizer
# [Rest of your Gradio interface code]
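# The Gradio wiring is not included above. The snippet below is only an
# illustrative sketch of how finetune_model could be hooked up to a simple
# interface; the CSV-path input, the fixed output directory, the component
# labels, and the _example_prepare_dataset helper are assumptions, not the
# original interface.
def _example_launch():
    def run_finetune(model_id, train_csv_path, epochs):
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        train_data = _example_prepare_dataset(train_csv_path, tokenizer)
        finetune_model(model_id, train_data, output_dir="finetuned-model", epochs=int(epochs))
        return "Training finished; adapters saved to ./finetuned-model"

    demo = gr.Interface(
        fn=run_finetune,
        inputs=[
            gr.Textbox(label="Base model ID"),
            gr.Textbox(label="Path to training CSV (with a 'text' column)"),
            gr.Number(value=1, label="Epochs"),
        ],
        outputs=gr.Textbox(label="Status"),
        title="Memory-efficient LoRA fine-tuning",
    )
    demo.launch()

# In the real Space the existing interface code would be used instead of
# calling _example_launch().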