import gradio as gr
import pandas as pd
import torch
import os
import gc
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import logging

# Set environment variables for memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# [Your existing load_data, prepare_dataset, and tokenize_data functions]
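
# --- Illustrative sketch (assumption): the original load_data, prepare_dataset and
# --- tokenize_data are not shown in this snippet, so the placeholders below only
# --- mirror how finetune_model() consumes them (a DatasetDict with a pre-tokenized
# --- "train" split). The column name "text" and max_length=512 are assumptions.

def load_data(csv_path):
    """Load a CSV of training examples into a pandas DataFrame."""
    return pd.read_csv(csv_path)


def prepare_dataset(df):
    """Wrap the DataFrame in a Hugging Face Dataset and split off a small eval set."""
    dataset = Dataset.from_pandas(df)
    return dataset.train_test_split(test_size=0.1)


def tokenize_data(dataset_dict, tokenizer, max_length=512):
    """Tokenize the "text" column and copy input_ids to labels for causal-LM loss."""
    def tokenize_fn(batch):
        tokens = tokenizer(
            batch["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",  # uniform length, so the default collator needs no extra padding
        )
        tokens["labels"] = tokens["input_ids"].copy()
        return tokens
    return dataset_dict.map(tokenize_fn, batched=True, remove_columns=["text"])
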

def finetune_model(model_id, train_data, output_dir, epochs, batch_size=None):
    """
    Fine-tune a model with ultra-aggressive memory optimizations for small (~15GB) GPUs.
    """
    logger.info(f"Using model: {model_id}")
    
    # Force CUDA garbage collection
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
    
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    # ============ MEMORY OPTIMIZATION 1: MICRO BATCH SIZE ============
    # Force a micro batch size of 1 (the batch_size argument is intentionally ignored)
    # since we only have ~15GB of GPU memory
    actual_batch_size = 1
    logger.info(f"Using micro batch size: {actual_batch_size} for ~15GB GPU")
    
    # ============ MEMORY OPTIMIZATION 2: 4-bit QUANTIZATION ============
    # 4-bit is more memory efficient than 8-bit
    from transformers import BitsAndBytesConfig
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
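    # nf4 ("normal float 4") + double quantization quantizes the quantization constants
    # themselves (the QLoRA recipe), saving roughly another 0.4 bits per parameter.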
    
    # Load model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False,
        torch_dtype=torch.float16,
        # ============ MEMORY OPTIMIZATION 3: MODEL LOADING OPTIONS ============
        max_memory={0: "10GB"},  # Limit memory usage
        offload_folder="offload",  # Set offload folder
        offload_state_dict=True,  # Offload state dict to CPU
    )
    
    logger.info(f"Model parameters: {model.num_parameters():,}")
    
    # Prepare model for training
    model = prepare_model_for_kbit_training(model)
    
    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
    logger.info("Gradient checkpointing enabled")
    
    # ============ MEMORY OPTIMIZATION 4: MINIMAL LORA CONFIG ============
    # Use absolute minimum LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=2,               # Minimal rank
        lora_alpha=8,      # Reduced alpha
        lora_dropout=0.05, # Reduced dropout
        target_modules=["q_proj", "v_proj"],  # Only query and value projections
    )
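    # Note: "q_proj"/"v_proj" match LLaMA/Mistral-style attention blocks; other
    # architectures expose different module names (e.g. GPT-2 uses "c_attn").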
    logger.info("Using minimal LoRA parameters: r=2, target=q_proj,v_proj only")
    
    # Apply LoRA adapters
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    
    # Define training arguments with extreme memory optimization
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        # ============ MEMORY OPTIMIZATION 5: MICRO BATCH + HUGE ACCUMULATION ============
        per_device_train_batch_size=actual_batch_size,
        per_device_eval_batch_size=actual_batch_size,
        gradient_accumulation_steps=16,  # Accumulate gradients over many steps
        # ============ MEMORY OPTIMIZATION 6: MIXED PRECISION ============
        fp16=True,
        # ============ MEMORY OPTIMIZATION 7: GRADIENT CHECKPOINTING ============
        gradient_checkpointing=True,
        # ============ MEMORY OPTIMIZATION 8: MINIMAL EVAL AND LOGGING ============
        logging_steps=50,
        save_strategy="no",       # Don't save checkpoints during training
        evaluation_strategy="no", # Skip evaluation to save memory
        # ============ MEMORY OPTIMIZATION 9: DEEPSPEED OFFLOADING ============
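        # NOTE (assumption about your setup): a DeepSpeed config only takes effect when the
        # script is launched via `deepspeed` or `accelerate launch`, and combining ZeRO
        # offload with a 4-bit bitsandbytes model loaded through device_map="auto" may not
        # be supported by every transformers/DeepSpeed version; verify before relying on it.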
        deepspeed={
            "zero_optimization": {
                "stage": 2,
                "offload_optimizer": {
                    "device": "cpu",
                    "pin_memory": True
                },
                "allgather_partitions": True,
                "allgather_bucket_size": 5e8,
                "reduce_scatter": True,
                "reduce_bucket_size": 5e8,
                "overlap_comm": True,
                "contiguous_gradients": True,
            },
            "fp16": {
                "enabled": True
            }
        },
        # Other parameters
        learning_rate=1e-4,  # Reduced learning rate
        weight_decay=0.01,
        warmup_ratio=0.03,
        optim="adamw_hf",  # HF's implementation is more memory efficient
        report_to="none",
    )
    
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data["train"],
        tokenizer=tokenizer,  # used by the default data collator and saved alongside the adapter
    )
    
    # Final memory cleanup before training
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        gc.collect()
    logger.info("CUDA cache cleared before training")
    
    # Start training
    logger.info("Starting training with ultra memory-efficient settings...")
    trainer.train()
    
    # Save the model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    logger.info(f"Model saved to {output_dir}")
    
    return model, tokenizer

# [Rest of your Gradio interface code]
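
# --- Illustrative sketch (assumption): the original Gradio interface is not shown, so the
# --- block below is only a minimal stand-in that wires the (also sketched) data helpers
# --- into finetune_model(). Field names, labels and defaults here are assumptions.

def run_finetuning(model_id, csv_file, output_dir, epochs):
    """Glue callback: load and tokenize the CSV, then launch fine-tuning."""
    # gr.File returns a path string in newer Gradio versions and a tempfile wrapper in older ones
    csv_path = csv_file if isinstance(csv_file, str) else csv_file.name
    df = load_data(csv_path)
    splits = prepare_dataset(df)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenized = tokenize_data(splits, tokenizer)
    finetune_model(model_id, tokenized, output_dir, int(epochs))
    return f"Training finished; adapter saved to {output_dir}"


with gr.Blocks() as demo:
    gr.Markdown("## LoRA fine-tuning (memory-optimized)")
    model_id_box = gr.Textbox(label="Model ID (Hugging Face Hub)", placeholder="e.g. a 7B causal LM repo id")
    csv_box = gr.File(label="Training CSV")
    output_box = gr.Textbox(label="Output directory", value="./finetuned-adapter")
    epochs_box = gr.Number(label="Epochs", value=1)
    status_box = gr.Textbox(label="Status")
    start_btn = gr.Button("Start fine-tuning")
    start_btn.click(
        run_finetuning,
        inputs=[model_id_box, csv_box, output_box, epochs_box],
        outputs=status_box,
    )

if __name__ == "__main__":
    demo.launch()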