# Copyright (C) 2024 Charles O. Goddard
#
# This software is free software: you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This software is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see http://www.gnu.org/licenses/.

from typing import Any, Dict, List, Optional

import torch

from mergekit.architecture import WeightInfo
from mergekit.common import ImmutableMap, ModelReference, rectify_embed_sizes
from mergekit.graph import Task
from mergekit.io.tasks import GatherTensors
from mergekit.merge_methods.base import ConfigParameterDef, MergeMethod


class ModelStockMergeTask(Task[torch.Tensor]):
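    """Computes the Model Stock merge for a single parameter tensor.

    Model Stock ("Model Stock: All we need is just a few fine-tuned models",
    Jang et al., 2024, arXiv:2403.19522) interpolates between the average of
    the fine-tuned weights and the pre-trained weights w_0, with a ratio
    determined by the angle between the fine-tuned models' offsets from w_0.
    """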
    gather_tensors: GatherTensors
    base_model: ModelReference
    parameter_name: str
    filter_wise: bool = False

    def uses_accelerator(self) -> bool:
        return True

    def arguments(self) -> Dict[str, Task]:
        return {"tensors": self.gather_tensors}

    def execute(self, tensors: Dict[ModelReference, torch.Tensor]) -> torch.Tensor:
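        # degenerate case: only the base model provides this tensor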
        if len(tensors) == 1 and self.base_model in tensors:
            return tensors[self.base_model]
        if len(tensors) < 3:
            raise ValueError(
                "ModelStockMerge requires at least 3 models "
                "(the base model plus two or more others)"
            )

        w_0, ws = self.get_rectified_weights(tensors)
        out_shape = w_0.shape

        if self.filter_wise:
            if w_0.dim() == 1:
                # bias (or other single-vector) parameters should be treated as row vectors
                w_0 = w_0.unsqueeze(0)
                ws = [w.unsqueeze(0) for w in ws]
        else:
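            # compute a single angle per tensor over the flattened weights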
            w_0 = w_0.view(-1)
            ws = [w.view(-1) for w in ws]

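        # offsets of each fine-tuned model from the base (pre-trained) weights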
        offsets = [w - w_0 for w in ws]

        # there remains the question of how to come up with a value for theta.
        # in the two-vector case we can compute the exact angle between the two
        # offsets, but the paper doesn't say what to do with more than two
        # models - it keeps using a single theta value without elaborating on
        # how to calculate it. as an approximation, average the cosines of all
        # pairwise angles.

        cos_thetas = []
        for i, w_0_offset in enumerate(offsets):
            for j in range(i + 1, len(offsets)):
                w_1_offset = offsets[j]

                norm_product = torch.norm(w_0_offset, dim=-1) * torch.norm(
                    w_1_offset, dim=-1
                )
                cos_theta = (
                    (w_0_offset * w_1_offset).sum(dim=-1) / norm_product.clamp(min=1e-6)
                ).clamp(-1, 1)
                cos_thetas.append(cos_theta)

        cos_theta = torch.stack(cos_thetas).mean(dim=0).unsqueeze(-1)
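        # interpolation ratio from the paper:
        #   t = N*cos(theta) / (1 + (N-1)*cos(theta))
        # as cos(theta) -> 1 (the fine-tuned models agree), t -> 1 and the
        # result approaches the plain average; as cos(theta) -> 0 it falls
        # back to w_0.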
        N = len(ws)
        t = (N * cos_theta) / (1 + (N - 1) * cos_theta)

        w_avg = sum(ws) / len(ws)
        w_h = t * w_avg + (1 - t) * w_0

        return w_h.reshape(out_shape)

    def get_rectified_weights(self, tensors: Dict[ModelReference, torch.Tensor]):
        if self.base_model not in tensors:
            raise ValueError("Base model tensor not found")

        all_weights = [tensors[self.base_model]] + [
            tensors[k] for k in tensors if k != self.base_model
        ]
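        # reconcile mismatched vocab dimensions (e.g. embeddings / lm_head)
        # so that all tensors share a common shape before merging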
        rectify_embed_sizes(self.parameter_name, all_weights)
        w_0 = all_weights[0]
        ws = all_weights[1:]
        return w_0, ws


class ModelStockMerge(MergeMethod):
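    """Merge method that produces a ModelStockMergeTask per output tensor.

    The optional filter_wise parameter (default False) computes a separate
    angle for each output row instead of a single angle per tensor.
    """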
    def parameters(self) -> List[ConfigParameterDef]:
        return [
            ConfigParameterDef(name="filter_wise", required=False, default_value=False)
        ]

    def make_task(
        self,
        *,
        output_weight: WeightInfo,
        tensors: GatherTensors,
        base_model: Optional[ModelReference],
        parameters: ImmutableMap[str, Any],
        **_kwargs,
    ) -> Task:
        return ModelStockMergeTask(
            gather_tensors=tensors,
            base_model=base_model,
            parameter_name=output_weight.name,
            filter_wise=parameters["filter_wise"],
        )
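

if __name__ == "__main__":
    # A minimal numerical sketch of the interpolation performed above, on toy
    # tensors. Illustrative only; this is not part of mergekit's API, and real
    # merges run through the Task graph rather than this snippet.
    torch.manual_seed(0)
    w_0 = torch.zeros(4)  # stand-in for pre-trained weights
    ws = [w_0 + torch.randn(4) for _ in range(3)]  # three "fine-tuned" models
    offsets = [w - w_0 for w in ws]

    # average cosine over all pairwise angles between offsets
    cos_thetas = []
    for i in range(len(offsets)):
        for j in range(i + 1, len(offsets)):
            num = (offsets[i] * offsets[j]).sum()
            den = (offsets[i].norm() * offsets[j].norm()).clamp(min=1e-6)
            cos_thetas.append((num / den).clamp(-1, 1))
    cos_theta = torch.stack(cos_thetas).mean()

    # t = N*cos(theta) / (1 + (N-1)*cos(theta)), then interpolate
    N = len(ws)
    t = (N * cos_theta) / (1 + (N - 1) * cos_theta)
    w_h = t * (sum(ws) / N) + (1 - t) * w_0
    print(f"t = {t.item():.4f}")
    print(f"w_h = {w_h}")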