import torch


class PoetModelInterface(torch.nn.Module):
    """PyTorch model interface. Abstract class for all Poet model types.

    Args:
        torch (_type_): Child of torch.nn.Module for integration with torch and huggingface
    """

    def __init__(self, *args, **kwargs) -> None:
        """Constructor. As a child class, it needs to construct its parent."""
        super().__init__(*args, **kwargs)

    def forward(self, input_ids=None, labels=None, attention_mask=None, *args, **kwargs):
        """Compute model output and model loss.

        Args:
            input_ids (_type_, optional): Model inputs. Defaults to None.
            labels (_type_, optional): Language Model labels. Defaults to None.
            attention_mask (_type_, optional): Attention mask where padding starts. Defaults to None.

        Raises:
            NotImplementedError: Abstract class
        """
        raise NotImplementedError()

    def generate_forced(self, *args, **kwargs):
        """Generate model output with restrictions on inputs and past generation.

        Raises:
            NotImplementedError: Abstract class
        """
        raise NotImplementedError()

    @staticmethod
    def rhyme_like(rhyme: str):
        """DEPRECATED: Check whether a string is in rhyme format.

        Args:
            rhyme (str): String with possible rhyme

        Returns:
            bool: True if the string looks like a rhyme scheme
        """
        return rhyme.isupper() and len(rhyme) in [4, 6]

    def save_LM(self, LM_path):
        """Save the raw LM.

        Args:
            LM_path (str): Where to store the LM

        Raises:
            NotImplementedError: Abstract class
        """
        raise NotImplementedError()
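
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original interface): a minimal example
# of how a concrete model might implement PoetModelInterface. The class name
# _ExamplePoetModel and the "gpt2" checkpoint are placeholders; the wrapped LM
# is stored under `self.model`, mirroring the attribute that ModelManipulation
# below expects. `generate_forced` is left unimplemented for brevity.
class _ExamplePoetModel(PoetModelInterface):
    def __init__(self, pretrained_model: str = "gpt2", *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from transformers import AutoModelForCausalLM
        self.model = AutoModelForCausalLM.from_pretrained(pretrained_model)

    def forward(self, input_ids=None, labels=None, attention_mask=None, *args, **kwargs):
        # Delegate to the wrapped LM; Hugging Face computes the LM loss
        # internally when `labels` are provided.
        return self.model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)

    def save_LM(self, LM_path):
        # Store only the raw underlying LM, as the interface requires.
        self.model.save_pretrained(LM_path)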

from transformers import GPT2Config, GPT2Model

from .poet_utils import StropheParams


class ContextModule(torch.nn.Module):
    """Module for understanding poet context.

    Args:
        torch (_type_): Child of torch.nn.Module for integration with torch and huggingface
    """

    def __init__(self, block_count, input_size, n_embd, output_size, *args, **kwargs) -> None:
        """Construct the underlying small LM for context.

        Args:
            block_count (_type_): LM number of GPT2Block blocks
            input_size (_type_): LM size of input
            n_embd (_type_): LM size of hidden layers
            output_size (_type_): LM size of output
        """
        super().__init__(*args, **kwargs)
        self.config = GPT2Config(n_positions=input_size, n_head=(n_embd // (768 // 12)), n_embd=n_embd,
                                 n_layer=block_count, output_hidden_states=True, output_attentions=True)
        self.context_model = GPT2Model(self.config)
        self.linear_downscale = torch.nn.Linear(n_embd, output_size)
        self.input_size = input_size
        self.n_embd = n_embd
        self.output_size = output_size
        # Context is injected from outside
        self.context_ids = None
        self.context_attention_mask = None

    def forward(self, hidden_states, layer_past=None, *args, **kwargs):
        """Compute the context LM output. Data are injected from outside.

        Args:
            hidden_states (_type_): Current hidden states
            layer_past (_type_, optional): Past layer outputs. Defaults to None.

        Returns:
            _type_: GPT2Block structured output (hidden states, layer past, attention, keys)
        """
        down = torch.zeros_like(hidden_states)
        model_output = None
        # Sometimes there might be no context
        if self.context_ids is not None:
            model_output = self.context_model.forward(input_ids=self.context_ids,
                                                      attention_mask=self.context_attention_mask)
            # Take only the class (first) token as the context summary and downscale it
            down = self.linear_downscale.forward(
                model_output["hidden_states"][-1][:, 0, :].view(-1, self.n_embd))[:, None, :]
        return (hidden_states + down,
                down[None, :, :, :],
                (None if model_output is None else model_output["attentions"], None))
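
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): how ContextModule is
# meant to be driven. The context token ids are injected from outside before
# the forward pass, which then mimics the (hidden_states, present, attentions)
# tuple of a GPT2Block. All sizes below are arbitrary placeholder values.
def _context_module_demo():
    module = ContextModule(block_count=2, input_size=64, n_embd=768, output_size=768)
    # Inject a (batch=1, seq_len=8) context from outside, as the host model would.
    module.context_ids = torch.randint(0, module.config.vocab_size, (1, 8))
    module.context_attention_mask = torch.ones_like(module.context_ids)
    # Hidden states coming from the host GPT-2 block: (batch, seq_len, n_embd)
    hidden_states = torch.zeros(1, 16, 768)
    new_hidden, down, _ = module.forward(hidden_states)
    return new_hidden.shape, down.shape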

class PoetTypeModule(torch.nn.Module):
    """Module to classify poet type.

    Args:
        torch (_type_): Child of torch.nn.Module for integration with torch and huggingface
    """

    def __init__(self, block_count, input_size, n_embd, output_size, *args, **kwargs) -> None:
        """Construct the LM for poet classification from inputs.

        Args:
            block_count (_type_): LM number of GPT2Block blocks
            input_size (_type_): LM size of input
            n_embd (_type_): LM size of hidden layers
            output_size (_type_): LM size of output
        """
        super().__init__(*args, **kwargs)
        self.config = GPT2Config(n_positions=input_size, n_head=(n_embd // (768 // 12)), n_embd=n_embd,
                                 n_layer=block_count, output_hidden_states=True, output_attentions=True)
        self.type_model = GPT2Model(self.config)
        self.type_predict = torch.nn.Linear(n_embd, len(StropheParams.YEAR))
        self.softmax = torch.nn.Softmax(dim=-1)
        self.linear_scale = torch.nn.Linear(len(StropheParams.YEAR), output_size)
        self.input_size = input_size
        self.n_embd = n_embd
        self.output_size = output_size
        # Context and labels are injected from outside
        self.context_ids = None
        self.context_attention_mask = None
        self.type_labels = None
        # Store for the module's own loss
        self.indiv_loss = None

    def forward(self, hidden_states, layer_past=None, *args, **kwargs):
        """Compute the classification LM output and loss.

        Args:
            hidden_states (_type_): Current hidden states
            layer_past (_type_, optional): Past layer outputs. Defaults to None.

        Returns:
            _type_: GPT2Block structured output (hidden states, layer past, attention, keys)
        """
        type_prob = torch.zeros((hidden_states.shape[0], len(StropheParams.YEAR))).to(
            "cuda" if torch.cuda.is_available() else "cpu")
        model_output = None
        # Sometimes there might be no context
        if self.context_ids is not None:
            model_output = self.type_model.forward(input_ids=self.context_ids,
                                                   attention_mask=self.context_attention_mask)
            # Only the class (first) token is taken
            poet_type = self.type_predict.forward(model_output["hidden_states"][-1][:, 0, :].view(-1, self.n_embd))
            type_prob = self.softmax.forward(poet_type)
        # If type labels are present, inject the true labels to future blocks
        if self.type_labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            self.indiv_loss = loss_fct(type_prob, self.type_labels)
            type_prob = (self.type_labels.type(torch.FloatTensor)).to("cuda" if torch.cuda.is_available() else "cpu")
        linear_up = self.linear_scale.forward(type_prob)
        return (hidden_states + linear_up[:, None, :],
                linear_up[None, :, None, :],
                (None if model_output is None else model_output["attentions"], None))
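
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): rough usage of
# PoetTypeModule. Context ids and (optionally) one-hot year labels are
# injected from outside; the block adds an upscaled type signal to the hidden
# states and stashes its own classification loss in `indiv_loss`. The sizes
# and the one-hot label below are placeholder values.
def _poet_type_module_demo():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    module = PoetTypeModule(block_count=2, input_size=64, n_embd=768, output_size=768).to(device)
    module.context_ids = torch.randint(0, module.config.vocab_size, (1, 8), device=device)
    module.context_attention_mask = torch.ones_like(module.context_ids)
    # One-hot "year bucket" label over the StropheParams.YEAR classes
    labels = torch.zeros(1, len(StropheParams.YEAR), device=device)
    labels[0, 0] = 1.0
    module.type_labels = labels
    hidden_states = torch.zeros(1, 16, 768, device=device)
    new_hidden, _, _ = module.forward(hidden_states)
    return new_hidden.shape, module.indiv_loss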

from transformers import PreTrainedTokenizerBase


class ModelManipulation:
    """Static class with methods for manipulating LMs.

    Code inspired by the article: Fine-tuning the English GPT-2 in any language with Hugging Face
    Link: https://github.com/piegu/fastai-projects/blob/master/finetuning-English-GPT2-any-language-Portuguese-HuggingFace-fastaiv2.ipynb
    """

    @staticmethod
    def exchange_embedding(poet_model: PoetModelInterface, new_tokenizer: PreTrainedTokenizerBase,
                           old_tokenizer: PreTrainedTokenizerBase, mirror_imbed: bool = False):
        """Exchange embedding matrices for GPT-2 models.

        Args:
            poet_model (PoetModelInterface): Model to manipulate
            new_tokenizer (PreTrainedTokenizerBase): New tokenization
            old_tokenizer (PreTrainedTokenizerBase): Old tokenization
            mirror_imbed (bool, optional): Currently unused. Defaults to False.
        """
        # Get old embeddings
        if hasattr(poet_model.model, "transformer"):
            old_embed_in = poet_model.model.transformer.get_input_embeddings().weight.clone().detach()
        else:
            old_embed_in = poet_model.model.get_input_embeddings().weight.clone().detach()
        old_mean_in = old_embed_in.mean(0)
        # Generate new embeddings based on the new tokenization
        new_embd_in = old_embed_in.new_zeros(new_tokenizer.vocab_size, old_embed_in.size(1))
        old_vocab = old_tokenizer.get_vocab()
        vocab_hit = 0
        # Keep as much from the old embeddings as possible
        for w, idx_new in new_tokenizer.get_vocab().items():
            idx_old = old_vocab.get(w, -1)
            if idx_old >= 0:
                new_embd_in[idx_new] = old_embed_in[idx_old]
                vocab_hit += 1
            else:
                new_embd_in[idx_new] = old_mean_in
        print(f"Vocab hit rate: {vocab_hit}/{old_tokenizer.vocab_size}")
        # Exchange embeddings and decoding
        new_embd_layer_in = torch.nn.Embedding(new_tokenizer.vocab_size, old_embed_in.size(1))
        new_embd_layer_in.weight.data = new_embd_in
        if hasattr(poet_model.model, "transformer"):
            poet_model.model.transformer.set_input_embeddings(new_embd_layer_in)
        else:
            poet_model.model.set_input_embeddings(new_embd_layer_in)
        new_decoder = torch.nn.Linear(old_embed_in.size(1), new_tokenizer.vocab_size, bias=False)
        if hasattr(poet_model.model, "transformer"):
            new_decoder.weight = poet_model.model.transformer.wte.weight
        else:
            new_decoder.weight = poet_model.model.base_model.embeddings.weight
        if hasattr(poet_model.model, "lm_head"):
            poet_model.model.lm_head = new_decoder
        else:
            poet_model.model.head = new_decoder
        # Update the LM config to reflect the possible change in vocab size
        poet_model.model.config.vocab_size = new_tokenizer.vocab_size

    @staticmethod
    def exchange_embedding_roberta(metre_model, new_tokenizer: PreTrainedTokenizerBase,
                                   old_tokenizer: PreTrainedTokenizerBase):
        """Exchange embedding matrices for RoBERTa models.

        Args:
            metre_model: Model to manipulate
            new_tokenizer (PreTrainedTokenizerBase): New tokenization
            old_tokenizer (PreTrainedTokenizerBase): Old tokenization
        """
        # Get old embeddings
        old_embed = metre_model.model.get_input_embeddings().weight.clone().detach()
        old_mean = old_embed.mean(0)
        # Generate new embeddings based on the new tokenization
        new_embd = old_embed.new_zeros(new_tokenizer.vocab_size, old_embed.size(1))
        old_vocab = old_tokenizer.get_vocab()
        vocab_hit = 0
        # Keep as much from the old embeddings as possible
        for w, idx_new in new_tokenizer.get_vocab().items():
            idx_old = old_vocab.get(w, -1)
            if idx_old >= 0:
                new_embd[idx_new] = old_embed[idx_old]
                vocab_hit += 1
            else:
                new_embd[idx_new] = old_mean
        print(f"Vocab hit rate: {vocab_hit}/{old_tokenizer.vocab_size}")
        # Exchange embeddings and decoding
        new_embd_layer = torch.nn.Embedding(new_tokenizer.vocab_size, old_embed.size(1))
        new_embd_layer.weight.data = new_embd
        metre_model.model.set_input_embeddings(new_embd_layer)
        new_decoder = torch.nn.Linear(old_embed.size(1), new_tokenizer.vocab_size)
        new_decoder.weight = metre_model.model.roberta.embeddings.word_embeddings.weight
        metre_model.model.lm_head.decoder = new_decoder
        # Update the LM config to reflect the possible change in vocab size
        metre_model.model.config.vocab_size = new_tokenizer.vocab_size
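
# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): swapping a GPT-2
# based poet model onto a new tokenizer with ModelManipulation. The "gpt2"
# checkpoint and the "./new-tokenizer" path are placeholders; `poet_model` is
# assumed to be any PoetModelInterface subclass exposing a `.model` attribute.
def _exchange_embedding_demo(poet_model: PoetModelInterface,
                             new_tokenizer_path: str = "./new-tokenizer"):
    from transformers import AutoTokenizer
    old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
    new_tokenizer = AutoTokenizer.from_pretrained(new_tokenizer_path)
    # Rows for tokens shared by both vocabularies are copied over; tokens new
    # to the vocabulary start from the mean of the old embedding matrix.
    ModelManipulation.exchange_embedding(poet_model, new_tokenizer, old_tokenizer)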