import argparse
import copy
import glob
import logging
import os
import random

import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForMultipleChoice,
    AutoModelForPreTraining,
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

from ..data import load_dataset
from ..data.examples import *

logger = logging.getLogger(__name__)

MODEL_MODES = {
    'base': AutoModel,
    'sequence-classification': AutoModelForSequenceClassification,
    'question-answering': AutoModelForQuestionAnswering,
    'pretraining': AutoModelForPreTraining,
    'token-classification': AutoModelForTokenClassification,
    'language-modeling': AutoModelWithLMHead,
    'multiple-choice': AutoModelForMultipleChoice,
}


def get_model_class(model_type, mode):
    return MODEL_MODES[mode]


def set_seed(hparams):
    random.seed(hparams['seed'])
    np.random.seed(hparams['seed'])
    torch.manual_seed(hparams['seed'])
    if hparams['n_gpu'] > 0:
        torch.cuda.manual_seed_all(hparams['seed'])


class BaseModule(pl.LightningModule):
    """
    The base module has four components: config, tokenizer, transformer model, and dataset.

    Loading a dataset:
    1. Load instances of the dataset in the form of `Examples`
    2. Convert all examples into features (may require the tokenizer)
    3. Create a tensor dataset and loader from the converted features
    """

    def __init__(self, hparams):
        super().__init__()
        hparams['mode'] = self.mode
        hparams['output_mode'] = self.output_mode
        hparams['example_type'] = self.example_type
        hparams['dev_lang'] = hparams['train_lang']
        self.hparams = hparams  # must be set after super().__init__()

        self.dataset = load_dataset(hparams['dataset'], hparams['data_dir'])
        if self.output_mode == 'classification':
            self.labels = self.dataset.get_labels(hparams['train_lang'])

        # set up the config object
        config_name = hparams['config_name'] or hparams['model_name_or_path']
        args = {}
        if self.output_mode == 'classification':
            hparams['num_labels'] = len(self.dataset.get_labels(hparams['train_lang']))
            args = {'num_labels': hparams['num_labels']}
        self.config = AutoConfig.from_pretrained(
            config_name,
            **args,
            cache_dir=hparams['cache_dir'],
        )

        # set up the tokenizer object
        tok_name = hparams['tokenizer_name'] or hparams['model_name_or_path']
        self.tokenizer = AutoTokenizer.from_pretrained(
            tok_name,
            config=self.config,
            cache_dir=hparams['cache_dir'],
        )

        # set up the transformer model
        model_class = get_model_class(self.config.model_type, hparams['mode'])
        self.model = model_class.from_pretrained(
            hparams['model_name_or_path'],
            config=self.config,
            cache_dir=hparams['cache_dir'],
        )

    def forward(self, **inputs):
        return self.model(**inputs)

    def prepare_data(self):
        """Cache feature files on disk for every mode at the onset."""
        for mode in self.dataset.modes():
            cached_features_file = self._feature_file(mode)
            if not os.path.exists(cached_features_file) or self.hparams['overwrite_cache']:
                self.load_features(mode)

    def load_features(self, mode):
        """Load examples and convert them into features."""
        if mode in ('train', 'dev', 'test'):
            lang = self.hparams['{}_lang'.format(mode)]
        else:
            lang = self.hparams['test_lang']
        examples = self.dataset.get_examples(lang, mode)
        cached_features_file = self._feature_file(mode)
        if os.path.exists(cached_features_file) and not self.hparams['overwrite_cache']:
            features = torch.load(cached_features_file)
        else:
            features = self.convert_examples_to_features(examples)
            torch.save(features, cached_features_file)
        return features
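    # Note on caching: features are serialized with torch.save and keyed by
    # _feature_file() (defined below), which builds a name of the form
    # 'cached_{lang}_{mode}_{model}_{max_seq_length}'. For example, an assumed
    # setup with train_lang='en', model_name_or_path='bert-base-cased', and
    # max_seq_length=128 would cache train features as
    # '<data_dir>/cached_en_train_bert-base-cased_128' (illustrative values only).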
    def convert_examples_to_features(self, examples):
        if self.hparams['example_type'] == 'multiple-choice':
            features = convert_multiple_choice_examples_to_features(
                examples,
                self.tokenizer,
                max_length=self.hparams['max_seq_length'],
                label_list=self.labels,
            )
        elif self.hparams['example_type'] == 'text':
            features = convert_text_examples_to_features(
                examples,
                self.tokenizer,
                max_length=self.hparams['max_seq_length'],
                label_list=self.labels,
                output_mode=self.output_mode,
            )
        elif self.hparams['example_type'] == 'tokens':
            features = convert_tokens_examples_to_features(
                examples,
                self.labels,
                self.hparams['max_seq_length'],
                self.tokenizer,
                cls_token_at_end=bool(self.config.model_type in ["xlnet"]),
                cls_token=self.tokenizer.cls_token,
                cls_token_segment_id=2 if self.config.model_type in ["xlnet"] else 0,
                sep_token=self.tokenizer.sep_token,
                sep_token_extra=bool(self.config.model_type in ["roberta"]),
                pad_on_left=bool(self.config.model_type in ["xlnet"]),
                pad_token=self.tokenizer.pad_token_id,
                pad_token_segment_id=self.tokenizer.pad_token_type_id,
                pad_token_label_id=self.pad_token_label_id,
            )
        return features

    def make_loader(self, features, batch_size):
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids or 0 for f in features], dtype=torch.long)
        # all_candidates = torch.tensor([f.candidates for f in features], dtype=torch.long)
        if self.hparams['output_mode'] == 'classification':
            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
        elif self.hparams['output_mode'] == 'regression':
            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
        return DataLoader(
            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
            # TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_candidates),
            batch_size=batch_size,
        )
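    # Batches produced by make_loader follow the TensorDataset column order above:
    # batch[0] = input_ids, batch[1] = attention_mask, batch[2] = token_type_ids,
    # batch[3] = labels. training_step and validation_step below index into
    # batches by these positions.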
    def train_dataloader(self):
        train_batch_size = self.hparams['train_batch_size']
        train_features = self.load_features('train')
        dataloader = self.make_loader(train_features, train_batch_size)

        t_total = (
            (len(dataloader.dataset) // (train_batch_size * max(1, self.hparams['n_gpu'])))
            // self.hparams['gradient_accumulation_steps']
            * float(self.hparams['num_train_epochs'])
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt,
            num_warmup_steps=self.hparams['warmup_steps'],
            num_training_steps=t_total,
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        dev_features = self.load_features('dev')
        return self.make_loader(dev_features, self.hparams['eval_batch_size'])

    def test_dataloader(self):
        test_features = self.load_features('test')
        return self.make_loader(test_features, self.hparams['eval_batch_size'])

    def training_step(self, batch, batch_idx):
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
        if self.config.model_type != 'distilbert':
            inputs['token_type_ids'] = (
                batch[2] if self.config.model_type in ['bert', 'xlnet', 'albert'] else None
            )  # XLM and RoBERTa don't use token_type_ids

        outputs = self(**inputs)
        loss = outputs[0]
        tensorboard_logs = {'loss': loss, 'rate': self.lr_scheduler.get_last_lr()[-1]}
        return {'loss': loss, 'log': tensorboard_logs}

    def validation_step(self, batch, batch_nb):
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
        # XLM and RoBERTa don't use token_type_ids
        inputs['token_type_ids'] = None
        if self.config.model_type in ['bert', 'xlnet', 'albert']:
            inputs['token_type_ids'] = batch[2]

        outputs = self(**inputs)
        tmp_eval_loss, logits = outputs[:2]
        preds = logits.detach().cpu().numpy()
        out_label_ids = inputs['labels'].detach().cpu().numpy()
        return {'val_loss': tmp_eval_loss.detach().cpu(), 'pred': preds, 'target': out_label_ids}

    def test_step(self, batch, batch_nb):
        return self.validation_step(batch, batch_nb)

    def _feature_file(self, mode):
        if mode in ('train', 'dev', 'test'):
            lang = self.hparams['{}_lang'.format(mode)]
        else:
            lang = self.hparams['test_lang']
        return os.path.join(
            self.hparams['data_dir'],
            'cached_{}_{}_{}_{}'.format(
                lang,
                mode,
                list(filter(None, self.hparams['model_name_or_path'].split('/'))).pop(),
                str(self.hparams['max_seq_length']),
            ),
        )

    def is_logger(self):
        return self.trainer.global_rank <= 0

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)."""
        model = self.model
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                'weight_decay': self.hparams['weight_decay'],
            },
            {
                'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.hparams['learning_rate'],
            eps=self.hparams['adam_epsilon'],
        )
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
        if self.trainer.use_tpu:
            import torch_xla.core.xla_model as xm
            xm.optimizer_step(optimizer)
        else:
            optimizer.step()
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        avg_loss = getattr(self.trainer, 'avg_loss', 0.0)
        return {'loss': '{:.3f}'.format(avg_loss), 'lr': self.lr_scheduler.get_last_lr()[-1]}

    def run_module(self):
        trainer = create_trainer(self, self.hparams)
        hparams_copy = copy.deepcopy(self.hparams)

        if self.hparams['do_train']:
            checkpoint_glob = os.path.join(self.hparams['output_dir'], 'checkpointepoch=*.ckpt')
            checkpoints = sorted(glob.glob(checkpoint_glob, recursive=True))
            if len(checkpoints) == 0:
                trainer.fit(self)
                checkpoints = sorted(glob.glob(checkpoint_glob, recursive=True))
            self.trained_model = self.load_from_checkpoint(checkpoints[-1])
            self.trained_model.hparams = hparams_copy

        # Optionally, predict on the dev set and write to output_dir
        if self.hparams['do_predict']:
            trainer.test(self.trained_model)
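# A minimal sketch of a concrete subclass, for illustration only ('MyTextClassifier'
# and the attribute values are assumptions, not part of this module). BaseModule
# expects subclasses to define `mode`, `output_mode`, and `example_type` (and
# `pad_token_label_id` when example_type == 'tokens'):
#
#     class MyTextClassifier(BaseModule):
#         mode = 'sequence-classification'   # picks the model class from MODEL_MODES
#         output_mode = 'classification'     # 'classification' or 'regression' labels
#         example_type = 'text'              # routes convert_examples_to_features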
# Fixes the __temp_weight_ddp_end.ckpt bug
# See https://github.com/PyTorchLightning/pytorch-lightning/issues/1142
class MonkeyPatchedTrainer(pl.Trainer):
    def load_spawn_weights(self, original_model):
        pass


pl.Trainer = MonkeyPatchedTrainer


class LoggingCallback(pl.Callback):
    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info("***** Validation results *****")
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log results
            for key in sorted(metrics):
                if key not in ["log", "progress_bar"]:
                    logger.info("{} = {}\n".format(key, str(metrics[key])))

    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
        logger.info("***** Test results *****")
        print(trainer.callback_metrics)
        if pl_module.is_logger():
            metrics = trainer.callback_metrics
            # Log and save results to file
            output_dir = pl_module.hparams['output_dir']
            test_lang = pl_module.hparams['test_lang']
            output_test_results_file = os.path.join(output_dir, 'test_results_{}.txt'.format(test_lang))
            with open(output_test_results_file, "w") as writer:
                for key in sorted(metrics):
                    if key not in ["log", "progress_bar"]:
                        logger.info("{} = {}\n".format(key, str(metrics[key])))
                        writer.write("{} = {}\n".format(key, str(metrics[key])))


def create_trainer(model, hparams):
    # init model
    set_seed(hparams)

    # if os.path.exists(hparams['output_dir']) and os.listdir(hparams['output_dir']) and hparams['do_train']:
    #     raise ValueError('Output directory ({}) already exists and is not empty.'.format(hparams['output_dir']))

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath=hparams['output_dir'],
        prefix='checkpoint',
        monitor='val_loss',
        mode='min',
        save_top_k=5,
    )

    train_params = dict(
        accumulate_grad_batches=hparams['gradient_accumulation_steps'],
        gpus=hparams['n_gpu'],
        max_epochs=hparams['num_train_epochs'],
        early_stop_callback=False,
        gradient_clip_val=hparams['max_grad_norm'],
        checkpoint_callback=checkpoint_callback,
        callbacks=[LoggingCallback()],
    )

    if hparams['fp16']:
        train_params['use_amp'] = hparams['fp16']
        train_params['amp_level'] = hparams['fp16_opt_level']

    if hparams['n_tpu_cores'] > 0:
        train_params['tpu_cores'] = hparams['n_tpu_cores']
        train_params['gpus'] = 0

    if hparams['n_gpu'] > 1:
        train_params['distributed_backend'] = 'ddp'

    return pl.Trainer(**train_params)
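# Example end-to-end usage (a sketch, not part of the module API; 'MyTextClassifier'
# is the hypothetical subclass sketched above, and the hparams keys listed are the
# ones this file actually reads -- the values shown are placeholders):
#
#     hparams = {
#         'seed': 42, 'n_gpu': 1, 'n_tpu_cores': 0, 'fp16': False,
#         'dataset': 'my_dataset', 'data_dir': './data', 'output_dir': './out',
#         'model_name_or_path': 'bert-base-cased', 'config_name': '', 'tokenizer_name': '',
#         'cache_dir': None, 'overwrite_cache': False, 'max_seq_length': 128,
#         'train_lang': 'en', 'test_lang': 'en',
#         'train_batch_size': 32, 'eval_batch_size': 32,
#         'learning_rate': 2e-5, 'adam_epsilon': 1e-8, 'weight_decay': 0.0,
#         'warmup_steps': 0, 'num_train_epochs': 3,
#         'gradient_accumulation_steps': 1, 'max_grad_norm': 1.0,
#         'do_train': True, 'do_predict': True,
#     }
#     MyTextClassifier(hparams).run_module()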