Upload ./modeling_hunyuan.py with huggingface_hub
Browse files · modeling_hunyuan.py (+3 additions, -2 deletions)
modeling_hunyuan.py
CHANGED
@@ -74,7 +74,8 @@ _CONFIG_FOR_DOC = "HunYuanConfig"
|
|
74 |
def topkgating(logits: Tensor, topk: int):
|
75 |
logits = logits.float()
|
76 |
gates = F.softmax(logits, dim=1)
|
77 |
-
expert_capacity = topk * gates.shape[0]
|
|
|
78 |
num_experts = int(gates.shape[1])
|
79 |
# Top-k router probability and corresponding expert indices for each token.
|
80 |
# Shape: [tokens_per_group, num_selected_experts].
|
@@ -1546,7 +1547,7 @@ class HunYuanMoEV1ForCausalLM(HunYuanPreTrainedModel):
|
|
1546 |
if isinstance(past_key_values, Cache):
|
1547 |
cache_length = past_key_values.get_seq_length()
|
1548 |
past_length = past_key_values.seen_tokens
|
1549 |
-
max_cache_length = past_key_values.
|
1550 |
else:
|
1551 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
1552 |
max_cache_length = None
|
|
|
74 |
def topkgating(logits: Tensor, topk: int):
|
75 |
logits = logits.float()
|
76 |
gates = F.softmax(logits, dim=1)
|
77 |
+
# expert_capacity = topk * gates.shape[0]
|
78 |
+
expert_capacity = max(topk, topk * gates.shape[0] // gates.shape[1])
|
79 |
num_experts = int(gates.shape[1])
|
80 |
# Top-k router probability and corresponding expert indices for each token.
|
81 |
# Shape: [tokens_per_group, num_selected_experts].
|
|
|
1547 |
if isinstance(past_key_values, Cache):
|
1548 |
cache_length = past_key_values.get_seq_length()
|
1549 |
past_length = past_key_values.seen_tokens
|
1550 |
+
max_cache_length = past_key_values.get_max_cache_shape()
|
1551 |
else:
|
1552 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
1553 |
max_cache_length = None
|