import torch
from tqdm import tqdm
from PIL import Image
from io import BytesIO
import base64
import numpy as np
import time
import json
import os
import cv2
from coco_metric import compute_cider
import random
import pickle

# Default locations of the RefCOCOg REG validation data and its CIDEr labels.
_DEFAULT_REG_PKL = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/others/refcocog_reg_val_data.pkl"
_DEFAULT_REG_ANNOTATIONS = "/gpfs/u/home/LMCG/LMCGljnn/scratch/code/multimodal2/open_flamingo/eval/task/others/refcocog_reg_val_label.json"


def _load_reg_samples(pkl_file):
    """Flatten the pickled REG dataset into one record per (image, region).

    The pickle must hold a mapping with the keys ``"data"``,
    ``"uniq_id_to_text"``, ``"uniq_id_to_image"`` and
    ``"uniq_id_to_image_id"``. ``data[image_id][region]`` is a sequence of
    unique ids that all refer to the same region.

    Returns:
        A list of ``[uniq_id, image_id, texts, region, image]`` records, where
        ``uniq_id`` is the first id of the region and ``texts`` holds the text
        of every id of that region.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at a
    # trusted, locally produced file.
    with open(pkl_file, "rb") as f:
        dataset = pickle.load(f)
    data = dataset["data"]
    uniq_id_to_text = dataset["uniq_id_to_text"]
    uniq_id_to_image = dataset["uniq_id_to_image"]
    uniq_id_to_image_id = dataset["uniq_id_to_image_id"]

    samples = []
    for image_id in data:
        regions = data[image_id]
        for region in regions:
            uniq_ids = regions[region]
            first_id = uniq_ids[0]
            samples.append([
                first_id,
                uniq_id_to_image_id[first_id],
                [uniq_id_to_text[r] for r in uniq_ids],
                region,
                uniq_id_to_image[first_id],
            ])
    return samples


def evaluate_reg(
    model,
    tokenizer,
    image_processor,
    vis_embed_size=None,
    rank=0,
    world_size=1,
    id=0,
    pkl_file=None,
    annotations_path=None,
):
    """Generate a caption for every annotated region and score them with CIDEr.

    Samples are sharded round-robin over ranks (``index % world_size == rank``).
    Every rank writes its predictions to ``reg_<lang_encoder>_<rank>_<id>.json``
    in the current working directory; after a barrier, rank 0 merges the
    per-rank files, deletes them, computes CIDEr via ``compute_cider`` and
    stores the merged predictions under ``eval_results/``.

    Args:
        model: Model exposing ``lang_encoder``, ``generate``, ``expr_name`` and
            ``step_num``. It is switched to eval mode and moved to CUDA.
        tokenizer: Tokenizer that knows the ``<|#image#|>`` token and provides
            ``bos_token`` / ``pad_token``.
        image_processor: Callable turning an image into a tensor.
        vis_embed_size: Number of pad tokens placed after ``<|#image#|>`` in
            the prompt. Required despite the ``None`` default.
        rank: Index of this process.
        world_size: Total number of processes; ``torch.distributed`` barriers
            are used only when it is greater than 1.
        id: Run identifier embedded in the temporary file names.
        pkl_file: Path to the pickled dataset; defaults to the RefCOCOg REG
            validation pickle.
        annotations_path: Path to the CIDEr annotation JSON; defaults to the
            RefCOCOg REG validation labels.

    Returns:
        The CIDEr score on rank 0, ``None`` on every other rank.

    Raises:
        TypeError: If ``vis_embed_size`` is ``None``.
    """
    if vis_embed_size is None:
        # Fail early with a clear message instead of `str * None` deep in the loop.
        raise TypeError("evaluate_reg() requires an integer vis_embed_size to build the prompt")
    if pkl_file is None:
        pkl_file = _DEFAULT_REG_PKL
    if annotations_path is None:
        annotations_path = _DEFAULT_REG_ANNOTATIONS

    lang_encoder_name = model.lang_encoder.__class__.__name__.lower()
    # Required below to locate the image slot; let tokenizer errors surface
    # here rather than as a NameError later on.
    media_token_id = tokenizer("<|#image#|>", add_special_tokens=False)["input_ids"][-1]

    model.eval().cuda()
    if world_size > 1:
        torch.distributed.barrier()

    samples = _load_reg_samples(pkl_file)
    print("total data:", len(samples))

    # The prompt does not depend on the sample, so build and tokenize it once.
    prompt = [f"{tokenizer.bos_token}<|#image#|>{tokenizer.pad_token*vis_embed_size}<|#endofimage#|><|#object#|><|#previsual#|><|#prebox#|><|#object#|>"]
    encodings = tokenizer(
        prompt,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        max_length=2000,
    )
    prompt_ids = encodings["input_ids"]
    prompt_mask = encodings["attention_mask"]
    # Position right after each <|#image#|> token, one single-element list per match.
    image_start_index_list = [
        [pos] for pos in ((prompt_ids == media_token_id).nonzero(as_tuple=True)[-1] + 1).tolist()
    ]
    image_nums = [1] * len(prompt_ids)
    prompt_len = len(prompt_ids[0])

    this_tot = 0
    predictions = []
    pbar = tqdm(samples, disable=(rank != 0))
    for ii, (_uniq_id, image_id, text, region_coord, image) in enumerate(pbar):
        if ii % world_size != rank:
            continue
        width = image.width
        height = image.height
        image = image.resize((224, 224))
        # Rescale the box from original pixel coordinates to the 224x224 image.
        gt_box = np.array(region_coord) / np.array([width, height, width, height]) * 224
        batch_images = image_processor(image).unsqueeze(0).unsqueeze(1).unsqueeze(0).cuda()
        # Fresh device copies per sample, as in the per-iteration original.
        input_ids = prompt_ids.cuda()
        attention_mask = prompt_mask.cuda()
        # Box normalised to [0, 0.99] relative to the resized image.
        added_bbox_list = [(torch.tensor(gt_box).cuda() / 224).clamp(0, 0.99).unsqueeze(0)]

        # Comma form enters BOTH context managers; `a and b` would enter only `b`.
        with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.float16):
            outputs = model.generate(
                batch_images,
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=25,
                min_length=5,
                num_beams=8,
                length_penalty=0,
                image_start_index_list=image_start_index_list,
                image_nums=image_nums,
                added_bbox_list=added_bbox_list,
            )
        # Drop the prompt tokens and keep only the generated continuation.
        outputs = outputs[:, prompt_len:]
        new_prediction = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].strip().lower()
        this_tot += 1
        if rank == 0 and this_tot % 10 == 0:
            tqdm.write(f"answer: {text}\nmodel output: {new_prediction}")
        predictions.append(
            {"image_id": image_id, "caption": new_prediction}
        )

    results_path = f"reg_{lang_encoder_name}_{rank}_{id}.json"
    # Close (and thereby flush) the shard before other ranks may read it.
    with open(results_path, "w") as f:
        json.dump(predictions, f)
    print("save to", results_path)
    del predictions
    # NOTE(review): presumably gives a shared filesystem time to settle before
    # the merge — confirm whether this delay is still needed.
    time.sleep(5)
    if world_size > 1:
        torch.distributed.barrier()
    if rank != 0:
        return None

    print(f"evaluate on rank {rank}. world size is {world_size}")
    predictions = []
    for rank_i in range(world_size):
        part_results_path = f"reg_{lang_encoder_name}_{rank_i}_{id}.json"
        print("load", part_results_path)
        with open(part_results_path) as f:
            predictions.extend(json.load(f))
        os.remove(part_results_path)
    print("num:", len(predictions))
    results_path = f"reg_{lang_encoder_name}_{id}_result.json"
    with open(results_path, "w") as f:
        json.dump(predictions, f, indent=2)

    metrics = compute_cider(
        result_path=results_path,
        annotations_path=annotations_path,
    )
    os.makedirs("eval_results", exist_ok=True)
    cider = metrics["CIDEr"]
    print("cider", cider)
    with open(os.path.join("eval_results", f"reg_{model.expr_name}_{model.step_num}_{int(time.time())}_{cider}"), "w") as f:
        f.write(json.dumps(predictions, indent=2))
    # delete the temporary file
    os.remove(results_path)
    return cider


if __name__ == "__main__":
    # Ad-hoc sanity check: load the COCO Karpathy test annotations and show
    # their top-level keys. The leftover pdb breakpoint was dropped so the
    # script no longer hangs when run non-interactively.
    with open("/gpfs/u/home/LMCG/LMCGljnn/scratch/.cache/lavis/coco_gt/coco_karpathy_test_gt.json") as f:
        anno = json.load(f)
    print(anno.keys())