import torch
import json
from torch import Tensor as T
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer

def get_tokenizer(model_checkpoint):
    """
    Load the tokenizer associated with a model checkpoint.

    Args:
        model_checkpoint: Identifier forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for that checkpoint.
    """
    return AutoTokenizer.from_pretrained(model_checkpoint)

def query_trans(text, tokenizer):
    """
    Prepare a question string for encoding; currently returns it unchanged.

    ``tokenizer`` is accepted but not used by the active code path.
    """
    # Disabled variant kept for reference: prefix the text with the Vietnamese
    # marker "câu hỏi" ("question") followed by tokenizer.sep_token and a space.
    transformed = text
    return transformed

def context_trans(text, tokenizer):
    """
    Prepare a passage string for encoding; currently returns it unchanged.

    ``tokenizer`` is accepted but not used by the active code path.
    """
    # Disabled variant kept for reference: prefix the text with the Vietnamese
    # marker "đoạn văn" ("passage") followed by tokenizer.sep_token and a space.
    transformed = text
    return transformed

def build_dpr_traindata(corpus, df, tokenizer, q_len, ctx_len, batch_size, no_hard, shuffle=False, all_data=False):
    """
    Build a DataLoader of tokenized (question, positive[, hard negatives])
    samples for biencoder training.

    Args:
        corpus: Container indexed by passage id (``corpus[j]`` yields the
            passage text).
        df: Table providing the columns "tokenized_question" and
            "best_ans_id", plus "neg_ids" when ``no_hard != 0``. Each id cell
            must parse as a JSON list after ``str()`` (e.g. ``"[3, 17]"``).
        tokenizer: Object exposing ``batch_encode_plus`` that returns
            "input_ids" and "attention_mask" tensors.
        q_len: Length every question is padded/truncated to.
        ctx_len: Length every passage is padded/truncated to.
        batch_size: Batch size of the returned loader.
        no_hard: Number of hard negatives per sample; 0 disables negatives.
        shuffle: Forwarded to ``DataLoader``.
        all_data: If true, emit one sample per (question, positive) pair;
            otherwise only the first listed positive of each row is used.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` whose items are
        ``(q_ids, q_mask, p_ids, p_mask)``, followed by ``(n_ids, n_mask)``
        viewed as ``(no_hard, ctx_len)`` per sample when ``no_hard != 0``.

    Raises:
        ValueError: If a row that contributes samples does not supply exactly
            ``no_hard`` negatives.
        IndexError: If ``all_data`` is false and a row lists no positive ids.
    """
    use_hard = no_hard != 0
    tokenized_questions = [query_trans(x, tokenizer) for x in df["tokenized_question"].tolist()]
    ans_ids = df["best_ans_id"].tolist()
    # "neg_ids" is only read when hard negatives are requested, so the column
    # stays optional otherwise.
    neg_ids = df["neg_ids"].tolist() if use_hard else [None] * len(ans_ids)

    questions = []
    positives = []
    negatives = []
    for row, (question, raw_pos, raw_neg) in enumerate(zip(tokenized_questions, ans_ids, neg_ids)):
        positive_ids = json.loads(str(raw_pos))
        poss = [context_trans(corpus[j], tokenizer) for j in positive_ids]
        negs = []
        if use_hard:
            negative_ids = json.loads(str(raw_neg))[:no_hard]
            negs = [context_trans(corpus[j], tokenizer) for j in negative_ids]

        selected = poss if all_data else [poss[0]]

        # The flat negatives list is later viewed as (-1, no_hard, ctx_len).
        # A short row would either break that view or, worse, shift every
        # following negative onto the wrong question, so fail early instead.
        if use_hard and selected and len(negs) != no_hard:
            raise ValueError(
                f"row {row}: expected {no_hard} hard negatives but got {len(negs)}"
            )

        for pos in selected:
            questions.append(question)
            positives.append(pos)
            negatives.extend(negs)

    Q = tokenizer.batch_encode_plus(questions, padding='max_length', truncation=True, max_length=q_len, return_tensors='pt')
    P = tokenizer.batch_encode_plus(positives, padding='max_length', truncation=True, max_length=ctx_len, return_tensors='pt')
    if use_hard:
        N = tokenizer.batch_encode_plus(negatives, padding='max_length', truncation=True, max_length=ctx_len, return_tensors='pt')
        # Group the flat negatives back into no_hard passages per sample.
        N_ids = N['input_ids'].view(-1, no_hard, ctx_len)
        N_attn = N['attention_mask'].view(-1, no_hard, ctx_len)
        data_tensor = TensorDataset(Q['input_ids'], Q['attention_mask'], P['input_ids'], P['attention_mask'], N_ids, N_attn)
    else:
        data_tensor = TensorDataset(Q['input_ids'], Q['attention_mask'], P['input_ids'], P['attention_mask'])
    return DataLoader(data_tensor, batch_size=batch_size, shuffle=shuffle)