OpenGVLab
/

InternVL-14B-224px

@@ -114,13 +114,29 @@ class CrossAttention(nn.Module):
             k_bias = self.k_bias
             v_bias = self.v_bias
         q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
         q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)  # (B, N_head, N_q, dim)
         k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
         k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
         v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
         v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
         q = q * self.scale

             k_bias = self.k_bias
             v_bias = self.v_bias
+        # F.linear reads self.q/k/v weights directly instead of calling the submodules, so invoke accelerate's forward hooks by hand to let it load the actual weights
+        # see https://github.com/huggingface/accelerate/blob/1f7a79b428749f45187ec69485f2c966fe21926e/src/accelerate/hooks.py#L163
+        simulate_hooks = hasattr(self.q, '_hf_hook')
+        if simulate_hooks:
+            self.q._hf_hook.pre_forward(self.q, x)
         q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
+        if simulate_hooks:
+            self.q._hf_hook.post_forward(self.q, x)
         q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)  # (B, N_head, N_q, dim)
+        if simulate_hooks:
+            self.k._hf_hook.pre_forward(self.k, k)
         k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
+        if simulate_hooks:
+            self.k._hf_hook.post_forward(self.k, k)
         k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
+        if simulate_hooks:
+            self.v._hf_hook.pre_forward(self.v, v)
         v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
+        if simulate_hooks:
+            self.v._hf_hook.post_forward(self.v, v)
         v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
         q = q * self.scale