import logging
from typing import Dict, List, Tuple, Union

import spacy

# from ipa.common.utils import load_spacy
from overrides import overrides
from spacy.cli.download import download as spacy_download
from spacy.tokens import Doc

from relik.common.log import get_logger
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer

logger = get_logger(level=logging.DEBUG)

# Spacy and Stanza stuff

LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}


def load_spacy(
    language: str,
    pos_tags: bool = False,
    lemma: bool = False,
    parse: bool = False,
    split_on_spaces: bool = False,
) -> spacy.Language:
    """
    Download and load spacy model.

    Args:
        language (:obj:`str`):
            Language of the text to tokenize.
        pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with spacy model.
        lemma (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with spacy model.
        parse (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with spacy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.

    Returns:
        :obj:`spacy.Language`: The spacy model loaded.
    """
    exclude = ["vectors", "textcat", "ner"]
    if not pos_tags:
        exclude.append("tagger")
    if not lemma:
        exclude.append("lemmatizer")
    if not parse:
        exclude.append("parser")

    # check if the model is already loaded
    # if so, there is no need to reload it
    spacy_params = (language, pos_tags, lemma, parse, split_on_spaces)
    if spacy_params not in LOADED_SPACY_MODELS:
        try:
            spacy_tagger = spacy.load(language, exclude=exclude)
        except OSError:
            logger.warning(
                "Spacy model '%s' not found. Downloading and installing.", language
            )
            spacy_download(language)
            spacy_tagger = spacy.load(language, exclude=exclude)

        # if everything is disabled, return only the tokenizer
        # for faster tokenization
        # TODO: is it really faster?
        # if len(exclude) >= 6:
        #     spacy_tagger = spacy_tagger.tokenizer
        LOADED_SPACY_MODELS[spacy_params] = spacy_tagger

    return LOADED_SPACY_MODELS[spacy_params]


class SpacyTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that uses SpaCy to tokenize and preprocess the text.
    It returns :obj:`Word` objects.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with spacy model.
        return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with spacy model.
        return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with spacy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.
        use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will load the SpaCy model on GPU.
    """

    def __init__(
        self,
        language: str = "en",
        return_pos_tags: bool = False,
        return_lemmas: bool = False,
        return_deps: bool = False,
        split_on_spaces: bool = False,
        use_gpu: bool = False,
    ):
        super(SpacyTokenizer, self).__init__()
        if language not in SPACY_LANGUAGE_MAPPER:
            raise ValueError(
                f"`{language}` language not supported. The supported "
                f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
            )
        if use_gpu:
            # load the model on GPU
            # if the GPU is not available or not correctly configured,
            # it will raise an error
            spacy.require_gpu()
        self.spacy = load_spacy(
            SPACY_LANGUAGE_MAPPER[language],
            return_pos_tags,
            return_lemmas,
            return_deps,
            split_on_spaces,
        )
        self.split_on_spaces = split_on_spaces

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[Word], List[List[Word]]]:
        """
        Tokenize the input into single words using SpaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings,
                or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized into single words.

        Example::

            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
            >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
            >>> spacy_tokenizer("Mary sold the car to John.")
        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)
        return tokenized

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if self.split_on_spaces:
            if isinstance(text, str):
                text = text.split(" ")
            spaces = [True] * len(text)
            text = Doc(self.spacy.vocab, words=text, spaces=spaces)
        return self._clean_tokens(self.spacy(text))

    @overrides
    def tokenize_batch(
        self, texts: Union[List[str], List[List[str]]]
    ) -> List[List[Word]]:
        if self.split_on_spaces:
            if isinstance(texts[0], str):
                texts = [text.split(" ") for text in texts]
            spaces = [[True] * len(text) for text in texts]
            texts = [
                Doc(self.spacy.vocab, words=text, spaces=space)
                for text, space in zip(texts, spaces)
            ]
        return [self._clean_tokens(tokens) for tokens in self.spacy.pipe(texts)]

    @staticmethod
    def _clean_tokens(tokens: Doc) -> List[Word]:
        """
        Converts SpaCy tokens to :obj:`Word`.

        Args:
            tokens (:obj:`spacy.tokens.Doc`):
                Tokens from the SpaCy model.

        Returns:
            :obj:`List[Word]`: The SpaCy model output converted into :obj:`Word` objects.
        """
        words = [
            Word(
                token.text,
                token.i,
                token.idx,
                token.idx + len(token),
                token.lemma_,
                token.pos_,
                token.dep_,
                token.head.i,
            )
            for token in tokens
        ]
        return words


class WhitespaceSpacyTokenizer:
    """Simple white space tokenizer for SpaCy."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        if isinstance(text, str):
            words = text.split(" ")
        elif isinstance(text, list):
            words = text
        else:
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
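

# Minimal usage sketch, not part of the library itself: it assumes "en" is a key
# of SPACY_LANGUAGE_MAPPER and that the mapped SpaCy model can be downloaded on
# first use. How each `Word` prints depends on its definition in
# relik.inference.data.objects.
if __name__ == "__main__":
    # single sentence, with POS tags and lemmas enabled
    tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
    words = tokenizer("Mary sold the car to John.")
    print(words)

    # batch of sentences: a list of lists of `Word` objects is returned
    batch = tokenizer(["Mary sold the car to John.", "John bought the car."])
    for sentence_words in batch:
        print(sentence_words)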