manaestras committed (verified)
Commit 03c370c · Parent(s): 0226108

Upload modeling_hunyuan.py with huggingface_hub

Files changed (1):
  modeling_hunyuan.py (+3 -3)
modeling_hunyuan.py CHANGED
@@ -74,7 +74,7 @@ _CONFIG_FOR_DOC = "HunYuanConfig"
 def topkgating(logits: Tensor, topk: int):
     logits = logits.float()
     gates = F.softmax(logits, dim=1)
-    # expert_capacity = topk * gates.shape[0]
+    # expert_capacity = topk * gates.shape[0]
     expert_capacity = max(topk, topk * gates.shape[0] // gates.shape[1])
     num_experts = int(gates.shape[1])
     # Top-k router probability and corresponding expert indices for each token.
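For context on the first hunk: `expert_capacity` bounds how many tokens each expert may receive per batch. A minimal sketch of the capacity arithmetic, assuming the shapes implied by the code above (`gates` is `[num_tokens, num_experts]`); the helper name below is hypothetical and not part of the model file:

```python
import torch
import torch.nn.functional as F
from torch import Tensor

def expert_capacity_sketch(logits: Tensor, topk: int) -> int:
    # logits: [num_tokens, num_experts], as in topkgating above.
    gates = F.softmax(logits.float(), dim=1)
    num_tokens, num_experts = gates.shape
    # Each token is routed to `topk` experts, so topk * num_tokens assignments
    # are spread over num_experts slots; the floored average is the per-expert
    # capacity, clamped below by `topk` so small batches still fit.
    return max(topk, topk * num_tokens // num_experts)

# Example: 8 tokens, 4 experts, top-2 routing -> 4 slots per expert.
cap = expert_capacity_sketch(torch.randn(8, 4), topk=2)
assert cap == 4
```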
@@ -1417,7 +1417,7 @@ class HunYuanModel(HunYuanPreTrainedModel):
     )
 
 
-class HunYuanForCausalLM(HunYuanPreTrainedModel):
+class HunYuanMoEV1ForCausalLM(HunYuanPreTrainedModel):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config: HunYuanConfig):
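The second hunk renames the causal-LM class. For a remote-code checkpoint like this one, the class is normally resolved through the `auto_map` entry in `config.json`, so user code usually loads it via `AutoModelForCausalLM` rather than importing the class by name. A minimal loading sketch, assuming a hypothetical local path `./hunyuan-checkpoint` whose `config.json` maps to `HunYuanMoEV1ForCausalLM`:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical local path; the auto_map entry in config.json decides which
# class inside modeling_hunyuan.py gets instantiated, so callers do not need
# to reference HunYuanMoEV1ForCausalLM directly.
model_path = "./hunyuan-checkpoint"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
```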
@@ -1547,7 +1547,7 @@ class HunYuanForCausalLM(HunYuanPreTrainedModel):
         if isinstance(past_key_values, Cache):
             cache_length = past_key_values.get_seq_length()
             past_length = past_key_values.seen_tokens
-            max_cache_length = past_key_values.get_max_length()
+            max_cache_length = past_key_values.get_max_cache_shape()
         else:
             cache_length = past_length = past_key_values[0][0].shape[2]
             max_cache_length = None
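The third hunk tracks a transformers API change: newer `Cache` objects expose `get_max_cache_shape()` where older releases had `get_max_length()`. A version-tolerant sketch of the same lookup, assuming only that `past_key_values` is a transformers `Cache`; the helper name is not part of the model file:

```python
from transformers.cache_utils import Cache, DynamicCache

def max_cache_length_compat(past_key_values: Cache):
    # Prefer the newer accessor, fall back to the older one if this
    # transformers release still provides it.
    if hasattr(past_key_values, "get_max_cache_shape"):
        return past_key_values.get_max_cache_shape()
    return past_key_values.get_max_length()

# Both accessors return None for an unbounded DynamicCache.
print(max_cache_length_compat(DynamicCache()))
```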
 