import torch
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
from typing import Union, List
from pathlib import Path
import dotenv
import os
import sys

sys.path.insert(0, "./")
from src.utils import full_path
from tqdm import tqdm

# Load environment variables (e.g. the Hugging Face access token) from the local .env file.
dotenv.load_dotenv("./models/.env")
hf = os.getenv("huggingface_token")


def check_model_in_cache(model_name: str):
    """Maps a short model name to its locally cached checkpoint directory."""
    if model_name in ["LLaMA3", "llama3"]:
        return str(full_path("/data/shared/llama3-8b/Meta-Llama-3-8B_shard_size_1GB"))
    if model_name in ["Mistral", "mistral"]:
        return str(full_path("/data/shared/mistral-7b-v03/Mistral-7B-v0.3_shard_size_1GB"))
    if model_name in ["olmo", "OLMo"]:
        return str(full_path("/data/shared/olmo/OLMo-7B_shard_size_2GB"))
    raise ValueError(f"Model '{model_name}' not found in local cache.")


def mean_pooling(model_output, attention_mask):
    """
    Mean-pools token embeddings over the sequence dimension, ignoring padding.

    Args:
        model_output (torch.Tensor): Token embeddings of shape (batch, seq_len, hidden_dim),
            e.g. the model's last hidden state.
        attention_mask (torch.Tensor): Attention mask of shape (batch, seq_len); padding
            positions are 0 and are excluded from the average.

    Returns:
        torch.Tensor: Sentence embeddings of shape (batch, hidden_dim).
    """
    token_embeddings = model_output  # already the per-token embeddings (last hidden state)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class LLMEmbeddings:
    def __init__(self, model_name: str, device: torch.device = None):
        """
        Initializes any Hugging Face LLM.

        Args:
            model_name (str): Short name of a locally cached model (e.g. "llama3"),
                or a path / Hugging Face repo ID.
            device (torch.device): Device to load the model on (CPU/GPU).
        """
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Resolve the model from the local cache; fall back to treating the name
        # as a path or Hugging Face repo ID.
        try:
            model_dir = check_model_in_cache(model_name)
        except ValueError:
            model_dir = model_name

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

        # Load model configuration to determine the model type
        config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
        self.model_type = config.architectures[0] if config.architectures else ""

        # Automatically choose between AutoModelForCausalLM and AutoModel
        if "CausalLM" in self.model_type:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_dir, trust_remote_code=True, torch_dtype=torch.float16
            ).to(self.device)
        else:
            self.model = AutoModel.from_pretrained(
                model_dir, trust_remote_code=True, torch_dtype=torch.float16
            ).to(self.device)

        # Ensure a padding token is set (fixes issues in tokenization)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model.eval()

    def encode(self, text: Union[str, List[str]]):
        """Encodes input sentences into embeddings."""
        inputs = self.tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=1024,
            return_token_type_ids=False
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)

        # Mean-pool the last hidden state over non-padding tokens.
        embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
        return embeddings

    def encode_batch(self, text: Union[str, List[str]], batch_size: int = 32):
        """Encodes input sentences into embeddings using batching."""
        # If a single string is provided, wrap it in a list.
        if isinstance(text, str):
            text = [text]

        embeddings_list = []

        # Process the text in batches
        for i in tqdm(range(0, len(text), batch_size), desc="Processing Batches"):
            batch_text = text[i:i + batch_size]
            inputs = self.tokenizer(
                batch_text, return_tensors="pt", padding=True, truncation=True, max_length=1024,
                return_token_type_ids=False
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)

            # Keep the batch dimension (no squeeze) so concatenation works even when
            # the final batch contains a single sentence.
            batch_embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"])
            embeddings_list.append(batch_embeddings)

        # Concatenate embeddings from all batches along the batch dimension.
        embeddings = torch.cat(embeddings_list, dim=0)
        return embeddings


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load any Hugging Face LLM (e.g., LLaMA, Mistral, Falcon, GPT)
    llm = LLMEmbeddings(model_name="llama3", device=device)

    # Encode text into embeddings
    embedding = llm.encode("Hugging Face models are powerful!")
    print(embedding.shape)
    print("Done!!")
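
    # Minimal sketch of batched encoding; the sentences below are illustrative placeholders.
    # encode_batch splits the list into chunks of `batch_size` and concatenates the
    # mean-pooled embeddings into a single (num_sentences, hidden_dim) tensor.
    sentences = [
        "Sentence embeddings via mean pooling.",
        "Batching keeps GPU memory usage bounded.",
        "Only non-padding tokens contribute to the pooled embedding.",
    ]
    batch_embeddings = llm.encode_batch(sentences, batch_size=2)
    print(batch_embeddings.shape)  # expected: (3, hidden_dim)

    # Example downstream use: cosine similarity between the first two sentence embeddings.
    sim = torch.nn.functional.cosine_similarity(batch_embeddings[0], batch_embeddings[1], dim=0)
    print(f"Cosine similarity: {sim.item():.4f}")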