# Copyright (c) 2021 Microsoft
# 2023 Alan ([email protected])
# -----------------------------------------------------------------------------
# Licensed under the MIT License (MIT). See LICENSE in the repo root for
# license information.
# -----------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from typing import List
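# The layers below add Low-Rank Adaptation (LoRA) to standard torch modules:
# the pre-trained weight W is kept frozen and a trainable low-rank update
# delta_W = lora_B @ lora_A, scaled by lora_alpha / r, is either added on the
# fly in forward() or merged into W for inference (merge_weights / merged).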
class LoRALayer:
    def __init__(
        self,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool,
    ):
        self.r = r
        self.lora_alpha = lora_alpha
        # Optional dropout
        if lora_dropout > 0.:
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_dropout = self.identity
        # Mark the weight as unmerged
        self.merged = False
        self.merge_weights = merge_weights

    def identity(self, x):
        return x
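# Note: LoRALayer is a plain mixin (not an nn.Module); it only stores the
# shared hyperparameters (rank r, lora_alpha, optional dropout) and the merge
# state used by the concrete layers below.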
class Embedding(nn.Embedding, LoRALayer):
    # LoRA implemented in an embedding layer
    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 r: int = 0,
                 lora_alpha: int = 1,
                 merge_weights: bool = True,
                 **kwargs):
        nn.Embedding.__init__(self, num_embeddings, embedding_dim, **kwargs)
        LoRALayer.__init__(self,
                           r=r,
                           lora_alpha=lora_alpha,
                           lora_dropout=0.,
                           merge_weights=merge_weights)
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(
                self.weight.new_zeros((r, num_embeddings)))
            self.lora_B = nn.Parameter(
                self.weight.new_zeros((embedding_dim, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()

    def reset_parameters(self):
        nn.Embedding.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # initialize A to zero and B the same way as the default for
            # nn.Embedding (normal distribution)
            nn.init.zeros_(self.lora_A)
            nn.init.normal_(self.lora_B)

    def train(self, mode: bool = True):
        nn.Embedding.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    temp = (self.lora_B @ self.lora_A).transpose(0, 1)
                    self.weight.data -= temp * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    temp = (self.lora_B @ self.lora_A).transpose(0, 1)
                    self.weight.data += temp * self.scaling
                self.merged = True

    def forward(self, x: torch.Tensor):
        if self.r > 0 and not self.merged:
            result = nn.Embedding.forward(self, x)
            after_A = F.embedding(x, self.lora_A.transpose(0, 1),
                                  self.padding_idx, self.max_norm,
                                  self.norm_type, self.scale_grad_by_freq,
                                  self.sparse)
            result += (after_A @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return nn.Embedding.forward(self, x)
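# Illustrative usage sketch (assumed example, not part of the original code):
#   emb = Embedding(num_embeddings=30000, embedding_dim=768, r=4, lora_alpha=8)
# Only emb.lora_A / emb.lora_B receive gradients; calling emb.eval() merges
# (lora_B @ lora_A).T into the frozen embedding matrix, emb.train() undoes it.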
class Linear(nn.Linear, LoRALayer):
    # LoRA implemented in a dense layer
    def __init__(
            self,
            in_features: int,
            out_features: int,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.,
            # Set fan_in_fan_out to True if the layer to replace stores
            # weight like (fan_in, fan_out)
            fan_in_fan_out: bool = False,
            merge_weights: bool = True,
            **kwargs):
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoRALayer.__init__(self,
                           r=r,
                           lora_alpha=lora_alpha,
                           lora_dropout=lora_dropout,
                           merge_weights=merge_weights)
        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
            self.lora_B = nn.Parameter(self.weight.new_zeros(
                (out_features, r)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def T(self, w):
        return w.transpose(0, 1) if self.fan_in_fan_out else w

    def train(self, mode: bool = True):
        nn.Linear.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0:
                    temp = self.T(self.lora_B @ self.lora_A)
                    self.weight.data -= temp * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0:
                    temp = self.T(self.lora_B @ self.lora_A)
                    self.weight.data += temp * self.scaling
                self.merged = True

    def forward(self, x: torch.Tensor):
        if self.r > 0 and not self.merged:
            result = F.linear(x, self.T(self.weight), bias=self.bias)
            result += (self.lora_dropout(x) @ self.lora_A.transpose(0, 1)
                       @ self.lora_B.transpose(0, 1)) * self.scaling
            return result
        else:
            return F.linear(x, self.T(self.weight), bias=self.bias)
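# Illustrative usage sketch (assumed example, not part of the original code):
#   layer = Linear(in_features=768, out_features=768, r=8, lora_alpha=16,
#                  lora_dropout=0.05)
# layer.weight stays frozen; training updates only lora_A and lora_B.
# layer.eval() merges the low-rank update into layer.weight, layer.train()
# subtracts it again so the adapter can keep training.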
class MergedLinear(nn.Linear, LoRALayer):
    # LoRA implemented in a dense layer
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 r: int = 0,
                 lora_alpha: int = 1,
                 lora_dropout: float = 0.,
                 enable_lora: List[bool] = None,
                 fan_in_fan_out: bool = False,
                 merge_weights: bool = True,
                 **kwargs):
        if enable_lora is None:
            enable_lora = [False]
        nn.Linear.__init__(self, in_features, out_features, **kwargs)
        LoRALayer.__init__(self,
                           r=r,
                           lora_alpha=lora_alpha,
                           lora_dropout=lora_dropout,
                           merge_weights=merge_weights)
        assert out_features % len(enable_lora) == 0, \
            'The length of enable_lora must divide out_features'
        self.enable_lora = enable_lora
        self.fan_in_fan_out = fan_in_fan_out
        # Actual trainable parameters
        if r > 0 and any(enable_lora):
            self.lora_A = nn.Parameter(
                self.weight.new_zeros((r * sum(enable_lora), in_features)))
            self.lora_B = nn.Parameter(
                self.weight.new_zeros(
                    (out_features // len(enable_lora) * sum(enable_lora), r)))
            # weights for Conv1D with groups=sum(enable_lora)
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.weight.requires_grad = False
            # Compute the indices
            self.lora_ind = self.weight.new_zeros(
                (out_features, ), dtype=torch.bool).view(len(enable_lora), -1)
            self.lora_ind[enable_lora, :] = True
            self.lora_ind = self.lora_ind.view(-1)
        self.reset_parameters()
        if fan_in_fan_out:
            self.weight.data = self.weight.data.transpose(0, 1)

    def reset_parameters(self):
        nn.Linear.reset_parameters(self)
        if hasattr(self, 'lora_A'):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def zero_pad(self, x):
        result = x.new_zeros((len(self.lora_ind), *x.size()[1:]))
        result[self.lora_ind] = x
        return result

    def T(self, w):
        return w.transpose(0, 1) if self.fan_in_fan_out else w

    def merge_AB(self):
        delta_w = F.conv1d(self.lora_A.unsqueeze(0),
                           self.lora_B.unsqueeze(-1),
                           groups=sum(self.enable_lora)).squeeze(0)
        # Scatter the per-group rows back to the full out_features dimension
        # for the groups that have LoRA enabled.
        return self.T(self.zero_pad(delta_w))

    def train(self, mode: bool = True):
        nn.Linear.train(self, mode)
        if mode:
            if self.merge_weights and self.merged:
                # Make sure that the weights are not merged
                if self.r > 0 and any(self.enable_lora):
                    self.weight.data -= self.merge_AB() * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                # Merge the weights and mark it
                if self.r > 0 and any(self.enable_lora):
                    self.weight.data += self.merge_AB() * self.scaling
                self.merged = True

    def forward(self, x: torch.Tensor):
        if self.merged:
            return F.linear(x, self.T(self.weight), bias=self.bias)
        else:
            result = F.linear(x, self.T(self.weight), bias=self.bias)
            if self.r > 0 and any(self.enable_lora):
                temp = self.T(self.merge_AB().T)
                result += self.lora_dropout(x) @ temp * self.scaling
            return result
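# Illustrative usage sketch (assumed example, not part of the original code):
# MergedLinear targets fused projections whose output splits into equal
# groups, e.g. a packed QKV projection where only Q and V get LoRA updates:
#   qkv = MergedLinear(768, 3 * 768, r=8, enable_lora=[True, False, True])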
class ConvLoRA(nn.Module, LoRALayer):
    def __init__(self,
                 conv_module,
                 in_channels,
                 out_channels,
                 kernel_size,
                 r=0,
                 lora_alpha=1,
                 lora_dropout=0.,
                 merge_weights=True,
                 **kwargs):
        super(ConvLoRA, self).__init__()
        self.conv = conv_module(in_channels, out_channels, kernel_size,
                                **kwargs)
        LoRALayer.__init__(self,
                           r=r,
                           lora_alpha=lora_alpha,
                           lora_dropout=lora_dropout,
                           merge_weights=merge_weights)
        assert isinstance(kernel_size, int)
        # Actual trainable parameters
        if r > 0:
            self.lora_A = nn.Parameter(
                self.conv.weight.new_zeros(
                    (r * kernel_size, in_channels * kernel_size)))
            self.lora_B = nn.Parameter(
                self.conv.weight.new_zeros(
                    (out_channels // self.conv.groups * kernel_size,
                     r * kernel_size)))
            self.scaling = self.lora_alpha / self.r
            # Freezing the pre-trained weight matrix
            self.conv.weight.requires_grad = False
        self.reset_parameters()
        self.merged = False

    def reset_parameters(self):
        self.conv.reset_parameters()
        if hasattr(self, 'lora_A'):
            # initialize A the same way as the default for nn.Linear and B to zero
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)

    def train(self, mode=True):
        super(ConvLoRA, self).train(mode)
        if mode:
            if self.merge_weights and self.merged:
                if self.r > 0:
                    # Make sure that the weights are not merged
                    self.conv.weight.data -= (self.lora_B @ self.lora_A).view(
                        self.conv.weight.shape) * self.scaling
                self.merged = False
        else:
            if self.merge_weights and not self.merged:
                if self.r > 0:
                    # Merge the weights and mark it
                    self.conv.weight.data += (self.lora_B @ self.lora_A).view(
                        self.conv.weight.shape) * self.scaling
                self.merged = True

    def forward(self, x):
        if self.r > 0 and not self.merged:
            return self.conv._conv_forward(
                x, self.conv.weight +
                (self.lora_B @ self.lora_A).view(self.conv.weight.shape) *
                self.scaling, self.conv.bias)
        return self.conv(x)
class Conv2d(ConvLoRA):
    def __init__(self, *args, **kwargs):
        super(Conv2d, self).__init__(nn.Conv2d, *args, **kwargs)


class Conv1d(ConvLoRA):
    def __init__(self, *args, **kwargs):
        super(Conv1d, self).__init__(nn.Conv1d, *args, **kwargs)


# Can extend to other convolution types in the same way
class Conv3d(ConvLoRA):
    def __init__(self, *args, **kwargs):
        super(Conv3d, self).__init__(nn.Conv3d, *args, **kwargs)
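# Minimal self-check (illustrative sketch, not part of the original library):
# verifies that the merged (eval-time) weights reproduce the unmerged
# (train-time) forward pass of a LoRA Linear layer.
if __name__ == "__main__":
    torch.manual_seed(0)
    layer = Linear(16, 32, r=4, lora_alpha=8, lora_dropout=0.)
    # lora_B starts at zero, so give it non-trivial values for a meaningful check
    nn.init.normal_(layer.lora_B)
    x = torch.randn(2, 16)
    layer.train()
    out_unmerged = layer(x)
    layer.eval()  # merges (lora_B @ lora_A) * scaling into the frozen weight
    out_merged = layer(x)
    assert torch.allclose(out_unmerged, out_merged, atol=1e-5)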