import os
import re
from typing import Dict, Tuple
from warnings import filterwarnings

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import RobertaTokenizer

from newsclassifier.config.config import Cfg, logger

filterwarnings("ignore")


def load_dataset(filepath: str, print_i: int = 0) -> pd.DataFrame:
    """Load data from source into a Pandas DataFrame.

    Args:
        filepath (str): File location.
        print_i (int): Number of instances to print (0 prints nothing).

    Returns:
        pd.DataFrame: Pandas DataFrame of the data.
    """
    logger.info("Loading Data.")
    df = pd.read_csv(filepath)
    if print_i:
        print(df.head(print_i), "\n")
    return df


def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Separate out the "Headlines" instances and select the relevant features.

    Args:
        df: Original dataframe.

    Returns:
        df: New dataframe with the selected features.
        headlines_df: Dataframe containing the "Headlines" category instances.
    """
    logger.info("Preparing Data.")
    headlines_df = pd.DataFrame(columns=["Text", "Category"])  # fallback if selection fails
    try:
        # rename() returns a copy, avoiding SettingWithCopyWarning from an in-place rename on a slice.
        df = df[["Title", "Category"]].rename(columns={"Title": "Text"})
        headlines_df = df[df["Category"] == "Headlines"].reset_index(drop=True)
        df = df[df["Category"] != "Headlines"].reset_index(drop=True)
    except Exception as e:
        logger.error(e)
    return df, headlines_df


def clean_text(text: str) -> str:
    """Clean text (lowercasing, punctuation removal, blank-space removal)."""
    logger.info("Cleaning input text.")
    text = text.lower()  # lowercase first, since the stopwords are in lower case

    # remove stopwords
    stp_pattern = re.compile(r"\b(" + r"|".join(Cfg.STOPWORDS) + r")\b\s*")
    text = stp_pattern.sub("", text)

    # custom cleaning
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # replace non-alphanumeric runs with a space
    text = re.sub(" +", " ", text)  # collapse repeated spaces
    return text.strip()  # remove leading/trailing whitespace


def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict]:
    """Preprocess the data.

    Args:
        df: Dataframe on which the preprocessing steps need to be performed.

    Returns:
        df: Preprocessed data.
        headlines_df: Dataframe containing the "Headlines" category instances.
        class_to_index: Class-label-to-index mapping.
        index_to_class: Index-to-class-label mapping.
    """
    df, headlines_df = prepare_data(df)
    cats = df["Category"].unique().tolist()
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {v: k for k, v in class_to_index.items()}
    df["Text"] = df["Text"].apply(clean_text)  # clean text
    df = df[["Text", "Category"]]
    try:
        df["Category"] = df["Category"].map(class_to_index)  # label encoding
    except Exception as e:
        logger.error(e)
    return df, headlines_df, class_to_index, index_to_class
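
# Illustrative sketch, not part of the original pipeline: a quick look at what
# `clean_text` produces. The sample headline is made up, and the exact output
# depends on the stopword list in Cfg.STOPWORDS.
def _clean_text_demo() -> None:
    sample = "The U.S. Economy Grew 2.1% in Q3, Analysts Say!"
    print(clean_text(sample))  # e.g. "u s economy grew 2 1 q3 analysts say"
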
def data_split(df: pd.DataFrame, split_size: float = 0.2, stratify_on_target: bool = True, save_dfs: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split data into train and test sets.

    Args:
        df (pd.DataFrame): Data to be split.
        split_size (float): Fraction of the data held out for the test set.
        stratify_on_target (bool): Whether to stratify the split on the target.
        save_dfs (bool): Whether to save the dataset splits in artifacts.

    Returns:
        Train and test splits.
    """
    logger.info("Splitting Data.")
    train_ds, test_ds = pd.DataFrame(), pd.DataFrame()  # fallback if the split fails
    try:
        stra = df["Category"] if stratify_on_target else None
        train, test = train_test_split(df, test_size=split_size, random_state=42, stratify=stra)
        train_ds = train.reset_index(drop=True)
        test_ds = test.reset_index(drop=True)
        if save_dfs:
            logger.info("Saving and storing data splits.")
            os.makedirs(Cfg.preprocessed_data_path, exist_ok=True)
            train_ds.to_csv(os.path.join(Cfg.preprocessed_data_path, "train.csv"), index=False)
            test_ds.to_csv(os.path.join(Cfg.preprocessed_data_path, "test.csv"), index=False)
    except Exception as e:
        logger.error(e)
    return train_ds, test_ds


def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
    """Tokenize and prepare the input text using the provided tokenizer.

    Args:
        tokenizer (RobertaTokenizer): The RoBERTa tokenizer used to encode the input.
        text (str): The input text to be tokenized.

    Returns:
        inputs (dict): A dictionary containing the tokenized input with keys such as
            'input_ids' and 'attention_mask'.
    """
    logger.info("Tokenizing input text.")
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=Cfg.add_special_tokens,
        max_length=Cfg.max_len,
        padding="max_length" if Cfg.pad_to_max_length else False,  # pad_to_max_length is deprecated
        truncation=Cfg.truncation,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class NewsDataset(Dataset):
    def __init__(self, ds: pd.DataFrame):
        self.texts = ds["Text"].values
        self.labels = ds["Category"].values
        # Load the tokenizer once here rather than on every __getitem__ call.
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, item: int):
        inputs = prepare_input(self.tokenizer, self.texts[item])
        # Integer class indices, as expected by losses such as CrossEntropyLoss.
        label = torch.tensor(self.labels[item], dtype=torch.long)
        return inputs, label


def collate(inputs: Dict) -> Dict:
    """Trim a batch of inputs to the batch's longest real sequence length.

    Args:
        inputs (dict): A dictionary of batched input tensors padded to Cfg.max_len.

    Returns:
        inputs (dict): The same dictionary with each tensor trimmed to the length of
            the longest non-padding sequence in the batch.
    """
    # Sequence lengths come from the attention mask (1 for real tokens, 0 for
    # padding), not from summing the token ids themselves.
    max_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = v[:, :max_len]
    return inputs


if __name__ == "__main__":
    df = load_dataset(Cfg.dataset_loc)
    df, headlines_df, class_to_index, index_to_class = preprocess(df)
    print(df)
    print(class_to_index)
    train_ds, val_ds = data_split(df, save_dfs=True)

    dataset = NewsDataset(df)
    print(dataset[0])
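
# Illustrative sketch, not part of the original module: batching NewsDataset with a
# DataLoader. Because `prepare_input` pads every sample to Cfg.max_len, the default
# collation can stack the per-sample input dicts, and `collate` then trims the batch
# to its longest real sequence. The batch size of 8 is an arbitrary choice.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=8, shuffle=True)
    batch_inputs, batch_labels = next(iter(loader))
    batch_inputs = collate(batch_inputs)
    print({k: tuple(v.shape) for k, v in batch_inputs.items()}, tuple(batch_labels.shape))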