import gradio as gr
import pandas as pd
import torch
import os
import gc
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import logging
# Set environment variables for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# [Your existing load_data, prepare_dataset, and tokenize_data functions]
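# ---------------------------------------------------------------------------
# The helpers referenced above are omitted here. finetune_model() below expects
# train_data["train"] to be an already tokenized datasets.Dataset that includes
# labels for the causal LM loss. Purely as an illustration (hypothetical helper
# name, and a "text" column is assumed), a minimal tokenization step could look
# like this:
def _example_tokenize(df: pd.DataFrame, tokenizer, max_length: int = 512):
    """Illustrative sketch: turn a DataFrame with a 'text' column into a tokenized DatasetDict."""
    dataset = Dataset.from_pandas(df)

    def _tok(batch):
        enc = tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # For causal LM fine-tuning the labels are simply the input ids
        enc["labels"] = [ids.copy() for ids in enc["input_ids"]]
        return enc

    tokenized = dataset.map(_tok, batched=True, remove_columns=dataset.column_names)
    # train_test_split returns a DatasetDict with "train"/"test" keys, which is the
    # shape finetune_model() indexes into below
    return tokenized.train_test_split(test_size=0.1)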
def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
    """
    Fine-tune a model with ultra-aggressive memory optimizations for small GPUs.
    """
    logger.info(f"Using model: {model_id}")
    # Force CUDA garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # ============ MEMORY OPTIMIZATION 1: MICRO BATCH SIZE ============
    # Force a batch size of 1 for the ~15GB GPU (the batch_size argument is ignored)
    actual_batch_size = 1
    logger.info(f"Using micro batch size: {actual_batch_size} for ~15GB GPU")
    # ============ MEMORY OPTIMIZATION 2: 4-bit QUANTIZATION ============
    # 4-bit is more memory efficient than 8-bit
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    # Load model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,
        torch_dtype=torch.float16,
        # ============ MEMORY OPTIMIZATION 3: MODEL LOADING OPTIONS ============
        max_memory={0: "10GB"},  # Limit memory usage
        offload_folder="offload",  # Set offload folder
        offload_state_dict=True,  # Offload state dict to CPU
    )
    logger.info(f"Model parameters: {model.num_parameters():,}")

    # Prepare model for training
    model = prepare_model_for_kbit_training(model)

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled")
    # ============ MEMORY OPTIMIZATION 4: MINIMAL LORA CONFIG ============
    # Use absolute minimum LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=2,  # Minimal rank
        lora_alpha=8,  # Reduced alpha
        lora_dropout=0.05,  # Reduced dropout
        target_modules=["q_proj", "v_proj"],  # Only query and value projections
    )
    logger.info("Using minimal LoRA parameters: r=2, target=q_proj,v_proj only")

    # Apply LoRA adapters
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    # Define training arguments with extreme memory optimization
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: MICRO BATCH + HUGE ACCUMULATION ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=16,  # Accumulate gradients over 16 steps (effective batch size 16)
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION ============
        fp16=True,
        # ============ MEMORY OPTIMIZATION 7: GRADIENT CHECKPOINTING ============
        gradient_checkpointing=True,
        # ============ MEMORY OPTIMIZATION 8: MINIMAL EVAL AND LOGGING ============
        logging_steps=50,
        save_strategy="no",  # Don't save checkpoints during training
        evaluation_strategy="no",  # Skip evaluation to save memory
        # ============ MEMORY OPTIMIZATION 9: DEEPSPEED OFFLOADING ============
        deepspeed={
            "zero_optimization": {
                "stage": 2,
                "offload_optimizer": {
                    "device": "cpu",
                    "pin_memory": True
                },
                "allgather_partitions": True,
                "allgather_bucket_size": 5e8,
                "reduce_scatter": True,
                "reduce_bucket_size": 5e8,
                "overlap_comm": True,
                "contiguous_gradients": True,
            },
            "fp16": {
                "enabled": True
            }
        },
        # Other parameters
        learning_rate=1e-4,  # Reduced learning rate
        weight_decay=0.01,
        warmup_ratio=0.03,
        optim="adamw_hf",  # AdamW; optimizer states are offloaded to CPU via the DeepSpeed config above
        report_to="none",
    )
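    # Note: the inline DeepSpeed config above requires the `deepspeed` package to be
    # installed in the Space; Trainer will refuse to run without it. ZeRO stage 2 with
    # `offload_optimizer` keeps optimizer states in CPU RAM rather than GPU memory.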
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        tokenizer=tokenizer,  # Used for padding during batch collation; the dataset must already be tokenized
    )
    # Final memory cleanup before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        logger.info("CUDA cache cleared before training")

    # Start training
    logger.info("Starting training with ultra memory-efficient settings...")
    trainer.train()

    # Save the model (for a PEFT model this saves only the LoRA adapter weights)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")

    return model, tokenizer
# [Rest of your Gradio interface code]
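# The actual interface is omitted above. Purely as a rough illustration (hypothetical
# helper names, a CSV upload with a "text" column, and an arbitrary output directory
# are assumed), wiring finetune_model into Gradio could look something like this:
#
# def _demo_train(csv_file, model_id, epochs):
#     df = pd.read_csv(csv_file)  # gr.File yields a file path (or file object) depending on Gradio version
#     tok = AutoTokenizer.from_pretrained(model_id)
#     if tok.pad_token is None:
#         tok.pad_token = tok.eos_token
#     data = _example_tokenize(df, tok)  # sketch defined near the top of this file
#     finetune_model(model_id, data, output_dir="finetuned_model", epochs=int(epochs))
#     return "Training finished; adapter saved to ./finetuned_model"
#
# demo = gr.Interface(
#     fn=_demo_train,
#     inputs=[
#         gr.File(label="Training data (CSV)"),
#         gr.Textbox(label="Base model ID"),
#         gr.Number(value=1, label="Epochs"),
#     ],
#     outputs=gr.Textbox(label="Status"),
# )
# demo.launch()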