import settings
import captum
import numpy as np
import torch
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from utils import get_args
from utils import CTCLabelConverter, AttnLabelConverter, Averager, TokenLabelConverter
import string
import time
import sys
from dataset import hierarchical_dataset, AlignCollate
import validators
from model import Model, STRScore
from PIL import Image
from lime.wrappers.scikit_image import SegmentationAlgorithm
from captum._utils.models.linear_model import SkLearnLinearModel, SkLearnRidge
import random
import os
from skimage.color import gray2rgb
import pickle
from train_shap_corr import getPredAndConf
import re
from captum_test import acquire_average_auc, saveAttrData
import copy
from skimage.color import gray2rgb
from matplotlib import pyplot as plt
from torchvision import transforms

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from captum.attr import (
    GradientShap,
    DeepLift,
    DeepLiftShap,
    IntegratedGradients,
    LayerConductance,
    NeuronConductance,
    NoiseTunnel,
    Saliency,
    InputXGradient,
    GuidedBackprop,
    Deconvolution,
    GuidedGradCam,
    FeatureAblation,
    ShapleyValueSampling,
    Lime,
    KernelShap
)

from captum.metrics import (
    infidelity,
    sensitivity_max
)

from captum.attr._utils.visualization import visualize_image_attr

### Acquire pixelwise attributions and replace them with ranked numbers averaged
### across segmentation with the largest contribution having the largest number
### and the smallest set to 1, which is the minimum number.
### attr - original attribution
### segm - image segmentations
def rankedAttributionsBySegm(attr, segm):
    """Replace pixel attributions by the rank of the segment they belong to.

    Segments are ordered by their mean attribution (computed on ``attr[0,0]``);
    the segment with the smallest mean receives rank 1 and the one with the
    largest mean receives rank N, where N is the number of segments.

    attr - original attribution tensor; only ``attr[0,0]`` is ranked
    segm - image segmentation ids, used to build a boolean mask per segment
    Returns a clone of ``attr`` whose ``[0,0]`` plane holds the ranks.
    """
    _, meanBySegm = averageSegmentsOut(attr[0, 0], segm)
    # Ascending by mean: the position in this list (1-based) is the rank.
    ascending = sorted(meanBySegm.items(), key=lambda pair: pair[1])
    ranked = torch.clone(attr)
    rankPlane = ranked[0, 0]  # view into `ranked`, so writes land in the clone
    for rank, (segId, _) in enumerate(ascending, start=1):
        rankPlane[segm == segId] = rank
    return ranked

### Returns the mean of each segment, written back into a tensor with the same shape as the input.
### This function can only process one attribution image at a time.
def averageSegmentsOut(attr, segments):
    """Average the attributions inside every segment.

    Handles a single attribution image per call.

    attr - attribution tensor, indexed with the boolean mask ``segments == id``
    segments - segment ids; every unique value is treated as one segment
    Returns a tuple ``(averaged, means)``: ``averaged`` is a clone of ``attr``
    in which each pixel holds the mean of its segment, ``means`` maps each
    segment id to that mean as a Python float.
    """
    means = {}
    averaged = torch.clone(attr)
    for segId in np.unique(segments):
        mask = segments == segId
        segMean = attr[mask].mean()
        # Move to host memory before converting to a plain float.
        hostValue = segMean.detach().cpu().numpy()
        means[segId] = float(hostValue)
        averaged[mask] = segMean
    return averaged, means

### Output and save segmentations for one dataset only
def outputSegmOnly(opt, targetDataset="CUTE80", segmRootDir=None):
    """Segment every image of one evaluation dataset and pickle the result.

    For each sample a file ``<index>.pkl`` is written into ``segmRootDir``
    containing a dict with keys ``'segdata'`` (quickshift segmentation output)
    and ``'label'`` (the sample's label).

    opt - parsed options; reads ``eval_data``, ``imgH``, ``imgW``, ``PAD`` and
          ``workers`` and sets ``opt.eval = True``
    targetDataset - dataset folder name under ``opt.eval_data``, e.g. one of
          ['IIIT5k_3000', 'SVT', 'IC03_867', 'IC13_1015', 'IC15_2077', 'SVTP', 'CUTE80']
    segmRootDir - output directory; defaults to the original hard-coded
          location for ``targetDataset``
    """
    if segmRootDir is None:
        segmRootDir = "/home/uclpc1/Documents/STR/datasets/segmentations/224X224/{}/".format(targetDataset)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(segmRootDir, exist_ok=True)

    opt.eval = True

    ### Taken from LIME
    segmentation_fn = SegmentationAlgorithm('quickshift', kernel_size=4,
                                            max_dist=200, ratio=0.2,
                                            random_seed=random.randint(0, 1000))

    eval_data_path = os.path.join(opt.eval_data, targetDataset)
    AlignCollate_evaluation = AlignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio_with_pad=opt.PAD, opt=opt)
    dataset, _ = hierarchical_dataset(root=eval_data_path, opt=opt)
    # batch_size must stay 1: only the first element of each batch is segmented.
    evaluation_loader = torch.utils.data.DataLoader(
        dataset, batch_size=1,
        shuffle=False,
        num_workers=int(opt.workers),
        collate_fn=AlignCollate_evaluation, pin_memory=True)
    for i, (image_tensors, labels) in enumerate(evaluation_loader):
        img_numpy = image_tensors.cpu().detach().numpy()[0]
        if img_numpy.shape[0] == 1:
            # Single-channel image: expand to RGB before segmenting.
            img_numpy = gray2rgb(img_numpy[0])
        imgDataDict = {
            'segdata': segmentation_fn(img_numpy),
            'label': labels[0],
        }
        outputPickleFile = os.path.join(segmRootDir, "{}.pkl".format(i))
        with open(outputPickleFile, 'wb') as f:
            pickle.dump(imgDataDict, f)

def acquireSelectivityHit(origImg, attributions, segmentations, model, converter, labels, scoring, opt=None):
    """Hide segments one by one (most important first) and record the model's response.

    Segments are ordered by their mean attribution, largest first. After each
    additional segment is zeroed out in a clone of ``origImg`` the prediction
    and confidence are obtained through ``getPredAndConf``.

    origImg - input image tensor; it is cloned, the original is not modified
    attributions - attribution tensor; ``attributions[0,0]`` is averaged per segment
    segmentations - segment ids used to build the per-segment masks
    model, converter, scoring - forwarded to ``getPredAndConf``
    labels - ground-truth text (converted with ``str``)
    opt - options object; when omitted the module-level ``opt`` is used, which
          only exists when this file is run as a script
    Returns ``(n_correct, confidenceList)``: entry ``i`` describes the image
    with ``i + 1`` segments removed (1/0 correctness, confidence * 100).
    Raises NameError when ``opt`` is neither passed nor defined at module level.
    """
    if opt is None:
        # Backward compatible fallback: the original body read a global `opt`.
        try:
            opt = globals()["opt"]
        except KeyError:
            raise NameError("name 'opt' is not defined; pass opt=... to acquireSelectivityHit") from None

    _, sortedDict = averageSegmentsOut(attributions[0, 0], segmentations)
    # Ascending stable sort followed by a reversal: largest mean first.
    sortedKeys = sorted(sortedDict, key=sortedDict.get)[::-1]

    alphanumeric_case_insensitve = '0123456789abcdefghijklmnopqrstuvwxyz'
    out_of_alphanumeric_case_insensitve = f"[^{alphanumeric_case_insensitve}]"

    n_correct = []
    confidenceList = []  # First index is one feature removed, second index two features removed, and so on...
    clonedImg = torch.clone(origImg)
    gt = str(labels)
    for currentSegmentToHide in sortedKeys:
        # Removal is cumulative: earlier segments stay hidden.
        clonedImg[0, 0][segmentations == currentSegmentToHide] = 0.0
        pred, confScore = getPredAndConf(opt, model, scoring, clonedImg, converter, np.array([gt]))
        # To evaluate 'case sensitive model' with alphanumeric and case insensitive setting.
        if opt.sensitive and opt.data_filtering_off:
            # NOTE(review): gt is normalised in place, so getPredAndConf sees the raw
            # label on the first step and the normalised one afterwards - confirm intent.
            pred = re.sub(out_of_alphanumeric_case_insensitve, '', pred.lower())
            gt = re.sub(out_of_alphanumeric_case_insensitve, '', gt.lower())
        n_correct.append(1 if pred == gt else 0)
        confidenceList.append(confScore[0][0] * 100)
    return n_correct, confidenceList

### Once you have the selectivity_eval_results.pkl file, pass its path to this function to scan the "_acc" entries of every result.
def acquire_selectivity_auc(opt, pkl_filename=None):
    """Report which pickled selectivity results never contain a correct prediction.

    The pickle is expected to hold an iterable of dicts. For every key that
    contains ``"_acc"``, the index of the dict is printed when the value stored
    under that key does not contain ``1``.

    opt - unused, kept for interface compatibility
    pkl_filename - path of the results pickle; defaults to the original
          hard-coded CUTE80 file
    Returns None.
    """
    if pkl_filename is None:
        pkl_filename = "/home/goo/str/str_vit_dataexplain_lambda/metrics_sensitivity_eval_results_CUTE80.pkl" # VITSTR

    # SECURITY NOTE: pickle.load can execute arbitrary code; only open result
    # files produced by a trusted run of this project.
    with open(pkl_filename, 'rb') as f:
        selectivity_data = pickle.load(f)

    for resDictIdx, resDict in enumerate(selectivity_data):
        for keyStr in resDict:
            # Need to check if network correctly predicted the image
            if "_acc" in keyStr and 1 not in resDict[keyStr]:
                # Print the index itself (the original printed the literal text).
                print("resDictIdx: ", resDictIdx)

# Single directory STRExp explanations output demo
def _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outputPath):
    """Rank attributions per segment and save them blended over the image.

    attributions - attribution tensor produced by a Captum method
    segmDataNP - segmentation ids used for the per-segment ranking
    origImgNP - RGB numpy version of the original image (heat-map background)
    outputPath - file the figure is saved to
    Returns True when a figure was written, False when ``attributions``
    contains NaN and the figure is skipped.
    """
    if torch.isnan(attributions).any():
        return False
    rankedAttr = rankedAttributionsBySegm(attributions, segmDataNP)
    rankedAttr = gray2rgb(rankedAttr.detach().cpu().numpy()[0][0])
    mplotfig, _ = visualize_image_attr(rankedAttr, origImgNP, method='blended_heat_map', cmap='RdYlGn')
    mplotfig.savefig(outputPath)
    # Release the figure explicitly; many figures are created per image.
    mplotfig.clear()
    plt.close(mplotfig)
    return True


# Single directory STRExp explanations output demo
def sampleDemo(opt):
    """Write attribution heat-maps for every image found under ``demo_image/``.

    The file name without extension is used as the ground-truth label. For each
    image the following figures are saved into ``demo_image_output/``:
    ``_shapley_l`` (per-character Shapley sampling, averaged), ``_shapley``
    (whole-output Shapley sampling), ``_shapley_gl`` (average of both), and the
    baselines ``_intgrad``, ``_gradshap``, ``_deeplift``, ``_saliency``,
    ``_inpxgrad``, ``_guidedbp``, ``_deconv``, ``_featablt``, ``_lime`` and
    ``_kernelshap``. A figure is skipped when its attributions contain NaN.

    opt - parsed options; ``num_class``, ``eval`` and (when ``opt.rgb`` is set)
          ``input_channel`` are overwritten
    """
    demoImgDir = "demo_image/"
    outputDir = "demo_image_output/"

    os.makedirs(outputDir, exist_ok=True)

    segmentation_fn = SegmentationAlgorithm('quickshift', kernel_size=4,
                                            max_dist=200, ratio=0.2,
                                            random_seed=random.randint(0, 1000))

    # --- model configuration ---
    if opt.Transformer:
        converter = TokenLabelConverter(opt)
    elif 'CTC' in opt.Prediction:
        converter = CTCLabelConverter(opt.character)
    else:
        converter = AttnLabelConverter(opt.character)
    opt.num_class = len(converter.character)

    if opt.rgb:
        opt.input_channel = 3
    model_obj = Model(opt)

    print('model input parameters', opt.imgH, opt.imgW, opt.num_fiducial, opt.input_channel, opt.output_channel,
          opt.hidden_size, opt.num_class, opt.batch_max_length, opt.Transformation, opt.FeatureExtraction,
          opt.SequenceModeling, opt.Prediction)
    # NOTE(review): no checkpoint is loaded into `model` inside this function -
    # confirm Model(opt) restores the trained weights itself.
    model = torch.nn.DataParallel(model_obj).to(device)

    modelCopy = copy.deepcopy(model)

    # --- evaluation pipelines ---
    # Per-character pipeline: the scorer is created with enableSingleCharAttrAve=True.
    scoring_singlechar = STRScore(opt=opt, converter=converter, device=device, enableSingleCharAttrAve=True)
    super_pixel_model_singlechar = torch.nn.Sequential(
        modelCopy,
        scoring_singlechar
    ).to(device)
    modelCopy.eval()
    scoring_singlechar.eval()
    super_pixel_model_singlechar.eval()

    # Whole-output pipeline with the default scorer.
    scoring = STRScore(opt=opt, converter=converter, device=device)
    super_pixel_model = torch.nn.Sequential(
        model,
        scoring
    ).to(device)
    model.eval()
    scoring.eval()
    super_pixel_model.eval()

    opt.eval = True
    ### Single char averaging: character positions are shifted by this offset
    charOffset = 1
    for path, subdirs, files in os.walk(demoImgDir):
        for name in files:
            nameNoExt = name.split('.')[0]
            labels = nameNoExt  # e.g. RONALDO
            # Join with the directory os.walk is currently visiting so that
            # images inside sub-directories resolve correctly.
            pilImg = Image.open(os.path.join(path, name))

            if settings.MODEL=="vitstr":
                pilImg = pilImg.resize((224, 224))

            orig_img_tensors = transforms.ToTensor()(pilImg)
            # Collapse the channels into one grayscale plane and add batch/channel dims.
            orig_img_tensors = torch.mean(orig_img_tensors, dim=0).unsqueeze(0).unsqueeze(0)
            # NOTE(review): this rescaling looks written for inputs in [-1, 1];
            # confirm the value range expected by the segmentation step.
            image_tensors = ((torch.clone(orig_img_tensors) + 1.0) / 2.0) * 255.0
            img_numpy = image_tensors.cpu().detach().numpy()[0] ### Need to set batch size to 1 only
            if img_numpy.shape[0] == 1:
                img_numpy = gray2rgb(img_numpy[0])
            segmDataNP = segmentation_fn(img_numpy)

            target = converter.encode([labels])
            segmTensor = torch.from_numpy(segmDataNP).unsqueeze(0).unsqueeze(0).to(device)

            img1 = orig_img_tensors.to(device)
            img1.requires_grad = True
            origImgNP = gray2rgb(torch.clone(orig_img_tensors).detach().cpu().numpy()[0][0])

            outPrefix = outputDir + nameNoExt

            ### Local explanations only
            svsLocal = ShapleyValueSampling(super_pixel_model_singlechar)
            collectedAttributions = []
            for charIdx in range(len(labels)):
                scoring_singlechar.setSingleCharOutput(charIdx + charOffset)
                gtClassNum = target[0][charIdx + charOffset]
                collectedAttributions.append(
                    svsLocal.attribute(img1, target=gtClassNum, feature_mask=segmTensor))
            aveAttributions = torch.mean(torch.cat(collectedAttributions, dim=0), dim=0).unsqueeze(0)
            _saveRankedAttrHeatmap(aveAttributions, segmDataNP, origImgNP, outPrefix + '_shapley_l.png')

            ### Shapley Value Sampling (global)
            attributions = ShapleyValueSampling(super_pixel_model).attribute(
                img1, target=0, feature_mask=segmTensor)
            if _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_shapley.png'):
                # Only NaN-free global attributions take part in the combined average.
                collectedAttributions.append(attributions)

            ### Global + Local context
            aveAttributions = torch.mean(torch.cat(collectedAttributions, dim=0), dim=0).unsqueeze(0)
            _saveRankedAttrHeatmap(aveAttributions, segmDataNP, origImgNP, outPrefix + '_shapley_gl.png')

            ### BASELINE Evaluations

            ### Integrated Gradients
            attributions = IntegratedGradients(super_pixel_model).attribute(img1, target=0)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_intgrad.png')

            ### Gradient SHAP using zero-background
            # The baseline takes the shape of the input instead of a fixed 224x224,
            # because the image is only resized when settings.MODEL is "vitstr".
            baseline_dist = torch.zeros_like(img1)
            attributions = GradientShap(super_pixel_model).attribute(img1, baselines=baseline_dist, target=0)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_gradshap.png')

            ### DeepLift (no explicit baseline passed)
            attributions = DeepLift(super_pixel_model).attribute(img1, target=0)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_deeplift.png')

            ### Saliency
            attributions = Saliency(super_pixel_model).attribute(img1, target=0) ### target=class0
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_saliency.png')

            ### InputXGradient
            attributions = InputXGradient(super_pixel_model).attribute(img1, target=0)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_inpxgrad.png')

            ### GuidedBackprop
            attributions = GuidedBackprop(super_pixel_model).attribute(img1, target=0)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_guidedbp.png')

            ### Deconvolution
            attributions = Deconvolution(super_pixel_model).attribute(img1, target=0)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_deconv.png')

            ### Feature ablator
            attributions = FeatureAblation(super_pixel_model).attribute(
                img1, target=0, feature_mask=segmTensor)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_featablt.png')

            ## LIME
            interpretable_model = SkLearnRidge(alpha=1, fit_intercept=True) ### This is the default used by LIME
            attributions = Lime(super_pixel_model, interpretable_model=interpretable_model).attribute(
                img1, target=0, feature_mask=segmTensor)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_lime.png')

            ### KernelSHAP
            attributions = KernelShap(super_pixel_model).attribute(
                img1, target=0, feature_mask=segmTensor)
            _saveRankedAttrHeatmap(attributions, segmDataNP, origImgNP, outPrefix + '_kernelshap.png')

if __name__ == "__main__":
    # Parse the options in evaluation (non-training) mode.
    opt = get_args(is_train=False)

    # cuDNN flags and the number of visible GPUs.
    cudnn.benchmark = True
    cudnn.deterministic = True
    opt.num_gpu = torch.cuda.device_count()

    # Vocab / character number configuration: case-sensitive runs use the
    # 94 printable characters, same as the ASTER setting.
    if opt.sensitive:
        opt.character = string.printable[:-6]

    # Entry points left disabled in the original script:
    # deleteInf(), combineBestDataXAI(opt), acquire_average_auc(opt),
    # acquireSingleCharAttrAve(opt)
    sampleDemo(opt)