import gradio as gr
import pandas as pd
import torch
import os
import gc
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import logging

# Set environment variables for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# [Your existing load_data, prepare_dataset, and tokenize_data functions]

def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
    """
    Fine-tune a model with ultra-aggressive memory optimizations for small GPUs.
    """
    logger.info(f"Using model: {model_id}")

    # Force CUDA garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # ============ MEMORY OPTIMIZATION 1: MICRO BATCH SIZE ============
    # Use a batch size of 1 since we only have ~15 GB of GPU memory
    actual_batch_size = 1
    logger.info(f"Using micro batch size: {actual_batch_size} for ~15GB GPU")

    # ============ MEMORY OPTIMIZATION 2: 4-bit QUANTIZATION ============
    # 4-bit is more memory efficient than 8-bit
    from transformers import BitsAndBytesConfig
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # Load model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,
        torch_dtype=torch.float16,
        # ============ MEMORY OPTIMIZATION 3: MODEL LOADING OPTIONS ============
        max_memory={0: "10GB"},     # Limit GPU memory usage
        offload_folder="offload",   # Folder for weights offloaded to disk
        offload_state_dict=True,    # Offload state dict to CPU while loading
    )
    logger.info(f"Model parameters: {model.num_parameters():,}")

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled")

    # ============ MEMORY OPTIMIZATION 4: MINIMAL LORA CONFIG ============
    # Use the absolute minimum LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=2,                # Minimal rank
        lora_alpha=8,       # Reduced alpha
        lora_dropout=0.05,  # Reduced dropout
        target_modules=["q_proj", "v_proj"],  # Only query and value projections
    )
    logger.info("Using minimal LoRA parameters: r=2, target=q_proj,v_proj only")

    # Apply LoRA adapters
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Define training arguments with extreme memory optimization
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: MICRO BATCH + HUGE ACCUMULATION ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=16,  # Accumulate gradients over many steps
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION ============
        fp16=True,
        # ============ MEMORY OPTIMIZATION 7: GRADIENT CHECKPOINTING ============
        gradient_checkpointing=True,
        # ============ MEMORY OPTIMIZATION 8: MINIMAL EVAL AND LOGGING ============
        logging_steps=50,
        save_strategy="no",        # Don't save checkpoints during training
        evaluation_strategy="no",  # Skip evaluation to save memory
        # ============ MEMORY OPTIMIZATION 9: DEEPSPEED OFFLOADING ============
        deepspeed={
            "zero_optimization": {
                "stage": 2,
                "offload_optimizer": {
                    "device": "cpu",
                    "pin_memory": True
                },
                "allgather_partitions": True,
                "allgather_bucket_size": 5e8,
                "reduce_scatter": True,
                "reduce_bucket_size": 5e8,
                "overlap_comm": True,
                "contiguous_gradients": True,
            },
            "fp16": {
                "enabled": True
            }
        },
        # Other parameters
        learning_rate=1e-4,  # Reduced learning rate
        weight_decay=0.01,
        warmup_ratio=0.03,
        optim="adamw_hf",    # HF's implementation is more memory efficient
        report_to="none",
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        tokenizer=tokenizer,  # Used for padding (data collation) and saved with the model
    )

    # Final memory cleanup before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
        logger.info("CUDA cache cleared before training")

    # Start training
    logger.info("Starting training with ultra memory-efficient settings...")
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")

    return model, tokenizer

# [Rest of your Gradio interface code]
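
# ----------------------------------------------------------------------------
# Example invocation (a minimal sketch, not part of the original script).
# The model id, sample texts, and hyperparameters below are placeholder
# assumptions; in the real app the tokenized DatasetDict would come from the
# load_data / prepare_dataset / tokenize_data helpers referenced above.
# It also assumes bitsandbytes, deepspeed, and a CUDA GPU are available.
if __name__ == "__main__":
    from datasets import DatasetDict

    demo_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # placeholder model

    # Build a tiny dummy dataset with the columns the Trainer expects:
    # input_ids, attention_mask, and labels.
    demo_tokenizer = AutoTokenizer.from_pretrained(demo_model_id)
    if demo_tokenizer.pad_token is None:
        demo_tokenizer.pad_token = demo_tokenizer.eos_token
    encoded = demo_tokenizer(
        ["Hello world.", "A second training example."],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    # For causal LM fine-tuning, labels are a copy of the input ids.
    encoded["labels"] = [ids.copy() for ids in encoded["input_ids"]]
    demo_data = DatasetDict({"train": Dataset.from_dict(dict(encoded))})

    finetune_model(
        model_id=demo_model_id,
        train_data=demo_data,      # expects a DatasetDict with a "train" split
        output_dir="finetuned_lora",
        epochs=1,
    )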