import math

import torch


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeat key-value hidden states along the key-value head dimension.

    Args:
        hidden_states (torch.Tensor): Input tensor with shape either
            (batch, num_key_value_heads, seqlen, head_dim) or
            (num_layers, batch, num_key_value_heads, seqlen, head_dim).
        n_rep (int): Number of repetitions for the key-value heads.

    Returns:
        torch.Tensor: The repeated tensor with shape either
            (batch, num_attention_heads, seqlen, head_dim) or
            (num_layers, batch, num_attention_heads, seqlen, head_dim),
            where num_attention_heads = num_key_value_heads * n_rep.
    """
    if hidden_states.dim() == 4:
        # (batch, num_key_value_heads, seqlen, head_dim)
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        # Insert a repetition axis after the head axis, broadcast it to
        # n_rep copies, then fold it back into the head dimension.
        hidden_states = hidden_states.unsqueeze(2).expand(
            batch, num_key_value_heads, n_rep, slen, head_dim
        )
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
    elif hidden_states.dim() == 5:
        # (num_layers, batch, num_key_value_heads, seqlen, head_dim)
        num_layers, batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states.unsqueeze(3).expand(
            num_layers, batch, num_key_value_heads, n_rep, slen, head_dim
        )
        return hidden_states.reshape(
            num_layers, batch, num_key_value_heads * n_rep, slen, head_dim
        )
    else:
        raise ValueError("Input tensor must have 4 or 5 dimensions.")


def calculate_tokens_suggest_compression_ratio(text, tokenizer, model):
    """
    Tokenize the text and return:
      - token_count: the number of tokens in the input text.
      - suggestions: a list of 6 candidate compression ratios.
      - tokenized: a dictionary containing 'input_ids' and 'attention_mask'.

    The suggestions are chosen so that compressing the token count by these
    ratios would (in the worst case) bring the count within the model's
    maximum context length (model.config.max_position_embeddings, e.g. 128k).
    If the text already fits within the context, the default suggestions
    [1, 2, 4, 8, 16, 32] are returned. If the text is too long, six values
    are generated in logarithmic space between the minimum required ratio
    and 32 (or a proportionally higher upper bound when the required ratio
    itself exceeds 32).
    """
    tokenized = tokenizer(text, return_tensors="pt", truncation=False)
    token_ids = tokenized["input_ids"][0]
    token_count = token_ids.size(0)

    max_context = model.config.max_position_embeddings
    if token_count <= max_context:
        required_ratio = 1.0
    else:
        required_ratio = token_count / max_context

    if required_ratio <= 1.0:
        # The text already fits in the context: offer the default ladder.
        suggestions = [1, 2, 4, 8, 16, 32]
    else:
        lower_bound = required_ratio
        # Keep 32 as the upper bound unless the required ratio already
        # exceeds it, in which case scale the upper bound up proportionally.
        upper_bound = 32 if required_ratio < 32 else required_ratio * 32
        # Six points evenly spaced in log space between the bounds.
        suggestions = [
            round(
                math.exp(
                    math.log(lower_bound)
                    + i * (math.log(upper_bound) - math.log(lower_bound)) / (6 - 1)
                ),
                2,
            )
            for i in range(6)
        ]

    return token_count, suggestions, tokenized


def update_retrieval_context(token_count, compression_ratio):
    """Report how many tokens remain after applying the compression ratio."""
    retrieval_tokens = int(token_count / compression_ratio)
    return f"Retrieval context tokens (after compression): {retrieval_tokens}"
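

# Minimal usage sketch for repeat_kv (the shapes below are illustrative, not
# taken from the source): repeating 2 key-value heads 4 times should yield
# 8 attention heads in both the 4-D and 5-D layouts.
if __name__ == "__main__":
    kv = torch.randn(1, 2, 10, 64)  # (batch, num_key_value_heads, seqlen, head_dim)
    assert repeat_kv(kv, n_rep=4).shape == (1, 8, 10, 64)

    kv5 = torch.randn(3, 1, 2, 10, 64)  # with a leading num_layers dimension
    assert repeat_kv(kv5, n_rep=4).shape == (3, 1, 8, 10, 64)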
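
    # Worked example of the suggestion math (assumed numbers, for illustration
    # only): 500_000 tokens against a 131_072-token context gives
    # required_ratio ≈ 3.81, so six log-spaced ratios between 3.81 and 32
    # would be suggested.
    lo, hi = 500_000 / 131_072, 32.0
    demo = [
        round(math.exp(math.log(lo) + i * (math.log(hi) - math.log(lo)) / 5), 2)
        for i in range(6)
    ]
    print(demo)  # [3.81, 5.84, 8.93, 13.67, 20.91, 32.0]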