import re
import math
import json
import pandas as pd
import string 

# common phrases in legal documents
re_thuchientheo = re.compile(
    r"((((được\s)?thực hiện theo qu[iy] định tại\s|hướng dẫn tại\s|theo qu[iy] định tại\s|(được\s)?thực hiện theo\s|theo qu[iy] định tại\s|theo nội dung qu[yi] định tại\s|quy[iy] định tại|theo\s)(các\s)?)?|tại\s(các\s)?)(khoản(\ssố)?\s(\d+\,\s)*\d+|điều(\ssố)?\s(\d+\,\s)*\d+|điểm\s(([a-z]|đ)\,\s)*([a-z]|đ)\b|chương(\ssố)?\s(\d+\,\s)*\d+)((\s|\,\s|\s\,\s|\svà\s)(khoản(\ssố)?\s(\d+\,\s)*\d+|điều(\ssố)?\s(\d+\,\s)*\d+|điểm\s(([a-z]|đ)\,\s)*([a-z]|đ)\b|chương(\ssố)?\s(\d+\,\s)*\d+))*(\s(điều này|thông tư này|nghị quyết này|quyết định này|nghị định này|văn bản này|quyết định này))?"
)
re_thongtuso = re.compile(
    r"(thông tư liên tịch|thông tư|nghị quyết|quyết định|nghị định|văn bản|Thông tư liên tịch|Thông tư|Nghị quyết|Nghị định|Văn bản|Quyết định)\s(số\s)?(([a-z0-9]|đ|\-)+\/([a-z0-9]|đ|\-|\/)*)"
)
re_ngay = re.compile(r"ngày\s\d+\/\d+\/\d+\b|ngày\s\d+tháng\d+năm\d+")
re_thang_nam = re.compile(r"tháng\s\d+\/\d+|tháng\s\d+|năm\s\d+")
re_chuong = re.compile(
    r"chương\s(III|II|IV|IX|VIII|VII|VI|XIII|XII|XI|XIV|XIX|XVIII|XVII|XVI|XV|XX|V|X|I|XXIII|XXII|XXI|XXIV|XXVIII|XXVII|XXVI|XXV|XXIX|XXX)\b"
)

# common end phrases in questions
END_PHRASES = [
    "có đúng không",
    "đúng không",
    "được không",
    "hay không",
    "được hiểu thế nào",
    "được quy định cụ thể là gì",
    "được quy định như thế nào",
    "được quy định thế nào",
    "được quy định như nào",
    "trong trường hợp như nào",
    "trong trường hợp như thế nào",
    "trong trường hợp nào",
    "trong những trường hợp nào",
    "được hiểu như thế nào",
    "được hiểu như nào",
    "như thế nào",
    "thế nào",
    "như nào",
    "là gì",
    "là ai",
    "là bao nhiêu",
    "bao nhiêu",
    "trước bao lâu",
    "là bao lâu",
    "bao lâu",
    "bao gồm gì",
    "không",
    "bao gồm những gì",
    "vào thời điểm nào",
    "gồm những giấy tờ gì",
    "những yêu cầu nào",
]

# punctuations, characters, stop-words 
punc = """!"#$%&'()*+,-./:;<=>?@[\]^`{|}~"""  # noqa: W605
table = str.maketrans("", "", punc)

punctuation = [x for x in string.punctuation]
number = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
chars = ["a", "b", "c", "d", "đ", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]
stop_word = number + chars + ["của", "và", "các", "có", "được", "theo", "tại", "trong", "về", 
            "hoặc", "người",  "này", "khoản", "cho", "không", "từ", "phải", 
            "ngày", "việc", "sau",  "để",  "đến", "bộ",  "với", "là", "năm", 
            "khi", "số", "trên", "khác", "đã", "thì", "thuộc", "điểm", "đồng",
            "do", "một", "bị", "vào", "lại", "ở", "nếu", "làm", "đây", 
            "như", "đó", "mà", "nơi", "”", "“"]
bm25_removed = punctuation + stop_word

# defining sub-functions

def remove_dieu_number(text):
    '''
    This funtion removes the common legal phrases out from texts
    '''
    text = re_thuchientheo.sub(" ", text)
    text = re_thongtuso.sub(" ", text)
    text = re_ngay.sub(" ", text)
    text = re_thang_nam.sub(" ", text)
    text = re_chuong.sub(" ", text)
    return " ".join(text.split())


def remove_other_number_by_zero(text):
    '''
    This funtion replaces numeric characters in texts into 0 for easier handling
    '''
    for digit in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
        text = text.replace(digit, "0")
    return text


def remove_punct(text):
    '''
    This funtion replaces punctuations in texts for easier handling
    '''
    text = text.replace(";", ",").replace(":", ".").replace("“", " ").replace("”", " ")
    text = "".join(
        [
            c
            if c.isalpha() or c.isdigit() or c in [" ", ",", "(", ")", ".", "/", "-"]
            else " "
            for c in text
        ]
    )
    text = " ".join(text.split())
    return text

def lower_or_keep(text):
    "This funtion lower words but not for abbreviations"
    lst = text.split(" ")
    newlst = [x if x.isupper() else x.lower() for x in lst]
    return " ".join(newlst)

def preprocess_all_title(article_title):
    """
    Preprocess titles of documents
    """
    article_title = lower_or_keep(article_title)
    lst = article_title.split()
    new_lst = []
    for i in range(len(lst)):
        if lst[i] == 'số' and i ==  len(lst)-1:
            new_lst.append(lst[i])
        elif lst[i] == 'số' and "/" in lst[i+1]:
            pass
        elif "/" in lst[i]:
            pass
        else:
            new_lst.append(lst[i])
    article_title = " ".join(new_lst)
    article_title = remove_dieu_number(article_title)
    #article_title = remove_other_number_by_zero(article_title)
    article_title = remove_punct(article_title)
    article_title = article_title.replace("về", "")
    if "do" in article_title and "ban hành" in article_title:
        idx = article_title.rfind("do")
        article_title = article_title[:(idx-1)]
    
    re_head = re.compile(r"(thông tư liên tịch|thông tư|nghị quyết|quyết định|nghị định|văn bản)\s(quy định|hướng dẫn)?")
    article_title = re_head.sub(" ", article_title)
    article_title = article_title.replace("   ", " ")
    article_title = article_title.replace("  ", " ")
    return article_title.strip()

def preprocess_article_title(article_title):
    """
    Preprocess titles of documents
    """
    article_title = lower_or_keep(article_title)
    article_title = " ".join(article_title.split()[2:])  # Dieu 1.
    article_title = remove_dieu_number(article_title)
    #article_title = remove_other_number_by_zero(article_title)
    article_title = remove_punct(article_title)
    return article_title
    
def preprocess_khoan(khoan):
    """
    Perprocess parts in a legal documents
    """
    khoan = lower_or_keep(khoan)
    khoan = khoan.replace("\xa0", "")
    matched = re.match(r"^\d+\.(\d+\.?)?\s", khoan)  # 1. 2.2. 2.2
    if matched is not None:
        khoan = khoan[matched.span()[1]:].strip()

    else:
        matched2 = re.match(r"^[\wđ]\)\s", khoan)
        if matched2 is not None:
            khoan = khoan[matched2.span()[1]:].strip()

    khoan = remove_dieu_number(khoan)
    #khoan = khoan.replace("đ)","")
    khoan = re.sub(r"[\wđ]\) ","", khoan)
    khoan = re.sub(r"[\wđ]\. ","", khoan)
    khoan = re.sub(r"\d+\.\d+\.\d+\. ", "", khoan)
    khoan = re.sub(r"\d+\.\d+\. ", "", khoan)
    khoan = re.sub(r"\d+\. ", "", khoan)
    #khoan = re.sub(r"[0-9]\. ", "", khoan)
    #khoan = remove_other_number_by_zero(khoan)
    khoan = remove_punct(khoan)
    khoan = khoan.replace(". .", ".")
    khoan = khoan.replace("..", ".")
    khoan = khoan.replace(", ,", ",")
    khoan = khoan.replace(",,", ",")
    khoan = khoan.strip()
    return " ".join(khoan.split())


def preprocess_question(q, remove_end_phrase=True):
    """
    Preprocess questions
    """
    q = lower_or_keep(q)
    q = remove_dieu_number(q)
    q = "".join([c if c.isalpha() or c.isdigit() or c == " " else " " for c in q])
    q = remove_punct(q)
    if remove_end_phrase:
        for phrase in END_PHRASES:
            if q.endswith(phrase):
                q = q[: -len(phrase)]
                break

    return q.strip()

'''def tokenise(text, segmenter):
    """
    Segment the texts with vncorenlp-segemnter
    """
    result = segmenter.tokenize(text)
    rlt = ""
    for i in range(len(result)-1):
        rlt += " ".join(result[i])
        rlt += " "
    rlt += " ".join(result[len(result)-1])
    return rlt
'''
def tokenise(text, f):
    """
    Segment the texts with pyvi tokenizer
    """
    return f(text)
    
def remove_stopword(w):
    "Remove stopwords in texts"
    return w not in stop_word

def bm25_process(text, f):
    """
    Processing texts for bm25: remove all puntuations, lower all words
    """
    text = tokenise(text, f)
    words = text.lower().split(" ")
    result = [w for w in words if w not in bm25_removed]
    stripped = " ".join(result)
    result = " ".join(stripped.split(" "))
    return result

def length(sentence):
    "Return the length in words of sentences"
    return len(sentence.split())

def build_corpus(f, corpus_file, law_dict, scorpus_ids, head = False):
    """
    Build a corpus-dataframe
    """
    law_ids = []
    text_ids = []
    article_ids = []
    titles = []
    texts = []
    processed_texts = []
    tokenized_texts = []
    bm25texts = []
    lengths = []
    ids = []
    sub_ids = []
    count = 0

    with open (corpus_file, 'r') as input:
        data = json.load(input)
    
    for law in data:
        for article in law['articles']:
            ids.append(count)
            law_ids.append(law['law_id'])
            article_ids.append(article['article_id'])
            text_id = law['law_id'] + "_" + article['article_id']
            text_ids.append(text_id)
            
            titles.append(article['title'])
            texts.append(article['text'])
            
            title = preprocess_article_title(article["title"])
            head = preprocess_all_title(law_dict[law['law_id']])
            
            cac_khoan = article["text"].split("\n")
            khoan_clean = []
            for khoan in cac_khoan:
                khoan = preprocess_khoan(khoan)
                khoan_clean.append(khoan.strip())
            article_text = " ".join(khoan_clean)
            if head:
                processed_text = head + ". " + title + ". " + article_text
            else:
                processed_text = title + ". " + article_text + ". " + head + "."
            processed_texts.append(processed_text)
            start_sub_id = scorpus_ids.index(count)
            try:
                end_sub_id = scorpus_ids.index(count+1)
                sub_ids.append([i for i in range(start_sub_id, end_sub_id)])
            except:
                sub_ids.append([i for i in range(start_sub_id, len(scorpus_ids))])
            
            try: 
                tokenized_text = tokenise(processed_text, f)
                tokenized_texts.append(tokenized_text)
                lengths.append(length(tokenized_text))
            except:
                tokenized_text = tokenise(processed_text[:50000], f)
                tokenized_texts.append(tokenized_text)
                lengths.append(length(tokenized_text))
            bm25texts.append(bm25_process(processed_text, f))
            count += 1
      
    df = pd.DataFrame()
    df["id"] = ids
    df["law_id"] = law_ids
    df["article_id"] = article_ids
    df["text_id"] = text_ids
    df["title"] = titles
    df["text"] = texts
    df["processed_text"] = processed_texts
    df["sub_id"] = sub_ids
    df["tokenized_text"] = tokenized_texts
    df["bm25text"] = bm25texts
    df["len"] = lengths
    
    return df

def create_sliding_window(tokenized_text, size=200, overlap=64):
    """
    Create list of windows for a text
    """
    sentences = tokenized_text.split(".")
    words = tokenized_text.split(" ")
    title = sentences[0]
    words = [w for w in words if len(w) >0]
    actual_size = size - overlap
    
    windows = []
    n_windows = math.ceil(len(words)/actual_size)
    for i in range(n_windows):
        windows.append(" ".join(words[i*actual_size:i*actual_size + size]))
    for i in range(1, n_windows):
        if not windows[i].startswith("."):
            windows[i] = title + ". " + windows[i]
        else:
            windows[i] = title + windows[i]
    return windows

def build_short_corpus(f, corpus_file, law_dict, head=False, size=200, overlap=64):
    """
    Build a corpus-dataframe
    """
    ids = []
    law_ids = []
    text_ids = []
    article_ids = []
    titles = []
    texts = []
    processed_texts = []
    sub_ids = []
    tokenized_texts = []
    bm25texts = []
    lengths = []

    with open (corpus_file, 'r') as input:
        data = json.load(input)
    idx = 0
    sub_idx = 0
    for law in data:
        for article in law['articles']:
            text_id = law['law_id'] + "_" + article['article_id']
            title = preprocess_article_title(article["title"])
            head = preprocess_all_title(law_dict[law['law_id']])
            cac_khoan = article["text"].split("\n")
            khoan_clean = []
            for khoan in cac_khoan:
                khoan = preprocess_khoan(khoan)
                khoan_clean.append(khoan.strip())
            article_text = " ".join(khoan_clean)
            if head:
                processed_text = head + ". " + title + ". " + article_text
            else:
                processed_text = title + ". " + article_text + ". " + head + "."
            try: 
                tokenized_text = tokenise(processed_text, f)
                tokenized_len = length(tokenized_text)
                if tokenized_len <= size + 10:
                    ids.append(idx)
                    law_ids.append(law['law_id'])
                    article_ids.append(article['article_id'])
                    text_ids.append(text_id)
                    titles.append(article['title'])
                    texts.append(article['text'])
                    processed_texts.append(processed_text)
                    sub_ids.append(sub_idx)
                    tokenized_texts.append(tokenized_text)
                    lengths.append(tokenized_len)
                    bm25texts.append(bm25_process(processed_text, f))
                    sub_idx +=1
                else:
                    windows = create_sliding_window(tokenized_text, size=224, overlap=64)
                    for window in windows:
                        ids.append(idx)
                        law_ids.append(law['law_id'])
                        article_ids.append(article['article_id'])
                        text_ids.append(text_id)
                        titles.append(article['title'])
                        texts.append(article['text'])
                        processed_texts.append(processed_text)
                        sub_ids.append(sub_idx)
                        tokenized_texts.append(window)
                        lengths.append(length(window))
                        bm25texts.append(bm25_process(window, f))  
                        sub_idx +=1
            except:
                actual_size = 50000 - overlap
                big_windows = []
                n_big_windows = math.ceil(len(processed_text)/actual_size)
                for i in range(n_big_windows):
                    big_windows.append("".join(processed_text[i*actual_size:i*actual_size + size]))
                for big_window in big_windows:
                    tokenized_text = tokenise(big_window, f)
                    tokenized_len = length(tokenized_text)
                    if tokenized_len > size + 10:
                        windows = create_sliding_window(tokenized_text, size=224, overlap=64)
                        for window in windows:
                            ids.append(idx)
                            law_ids.append(law['law_id'])
                            article_ids.append(article['article_id'])
                            text_ids.append(text_id)
                            titles.append(article['title'])
                            texts.append(article['text'])
                            processed_texts.append(processed_text)
                            sub_ids.append(sub_idx)
                            tokenized_texts.append(window)
                            lengths.append(length(window))
                            bm25texts.append(bm25_process(window, f))  
                            sub_idx +=1
                    else:
                        ids.append(idx)
                        law_ids.append(law['law_id'])
                        article_ids.append(article['article_id'])
                        text_ids.append(text_id)
                        titles.append(article['title'])
                        texts.append(article['text'])
                        processed_texts.append(processed_text)
                        sub_ids.append(sub_idx)
                        tokenized_texts.append(tokenized_text)
                        lengths.append(tokenized_len)
                        bm25texts.append(bm25_process(processed_text, f))
                        sub_idx +=1
    
            idx += 1
                 
    df = pd.DataFrame()
    df["id"] = ids
    df["law_id"] = law_ids
    df["article_id"] = article_ids
    df["text_id"] = text_ids
    df["title"] = titles
    df["text"] = texts
    df["processed_text"] = processed_texts
    df["sub_id"] = sub_ids
    df["tokenized_text"] = tokenized_texts
    df["bm25text"] = bm25texts
    df["len"] = lengths
    
    return df

def build_qa(f, df, qa_file, split = False):
    """
    Build a question-answer dataframe
    """
    text_ids = df["text_id"].tolist()
    titles = df["title"].tolist()
    texts = df["text"].tolist()
    lengths = df["len"].tolist()
    sub_ids = df["sub_id"].tolist()
    q_texts = []
    q_processed_texts = []
    q_tokenized_texts = []
    q_bm25texts = []
    q_lens = []
    no_ans = []
    ans_ids = []
    ans_text_ids = []
    ans_titles = []
    ans_texts = []
    ans_lens = []
    ans_sub_ids = []
    with open (qa_file, 'r') as input:
        data = json.load(input)
    
    if not split:
        for item in data['items']:
            question = item["question"]
            q_texts.append(question)
            q_processed_text = preprocess_question(question, remove_end_phrase=False)
            q_processed_texts.append(q_processed_text)
            q_tokenized_text = tokenise(q_processed_text, f)
            q_tokenized_texts.append(q_tokenized_text)
            q_bm25texts.append(bm25_process(q_processed_text, f))
            q_lens.append(length(q_tokenized_text))
            ans_text_id = ""
            ans_id = ""
            ans_title = ""
            ans_text = ""
            ans_len = ""
            ans_count = 0
            ans_sub_id = []
            for i in range(len(item['relevant_articles'])):
                ans_count += 1
                atext_id = item['relevant_articles'][i]['law_id'] + "_"  + item['relevant_articles'][i]['article_id']
                a_id = text_ids.index(atext_id)
                ans_text_id += atext_id
                ans_id += str(a_id)
                ans_title += titles[a_id]
                ans_text += texts[a_id]
                ans_len += str(lengths[a_id])
                sub_id = sub_ids[a_id]
                ans_sub_id += sub_id
                
                if i < len(item["relevant_articles"]) - 1:
                    ans_text_id += ", "
                    ans_id += ", "
                    ans_title += ", "
                    ans_text += ", "
                    ans_len += ", "
                    
            no_ans.append(ans_count)
            ans_text_ids.append(ans_text_id)
            ans_ids.append(ans_id)
            ans_titles.append(ans_title)
            ans_texts.append(ans_text)
            ans_lens.append(ans_len)
            ans_sub_ids.append(ans_sub_id)
    else:
        for item in data['items']:
            question = item["question"]
            for article in item['relevant_articles']:
                q_texts.append(question)
                q_processed_text = preprocess_question(question, remove_end_phrase=False)
                q_processed_texts.append(q_processed_text)
                q_tokenized_text = tokenise(q_processed_text, f)
                q_tokenized_texts.append(q_tokenized_text)
                q_bm25texts.append(bm25_process(q_processed_text, f))
                q_lens.append(length(q_tokenized_text))           
                ans_text_id = article['law_id'] + "_"  + article['article_id']
                ans_text_ids.append(ans_text_id)
                a_id = text_ids.index(ans_text_id)
                ans_ids.append(a_id)
                ans_titles.append(titles[a_id])
                ans_texts.append(texts[a_id])
                ans_lens.append(lengths[a_id])
                ans_sub_ids.append(sub_ids[a_id])
                
    
    df = pd.DataFrame()
    df["question"] = q_texts
    df["processed_question"] = q_processed_texts
    df["tokenized_question"] = q_tokenized_texts
    df["bm25_question"] = q_bm25texts
    df["ques_len"] = q_lens
    if not split:
        df['no_ans'] = no_ans
    df["ans_text_id"] = ans_text_ids
    df["ans_id"] = ans_ids
    df["ans_title"] = ans_titles
    df["ans_text"] = ans_texts
    df["ans_len"] = ans_lens
    df["ans_sub_id"] = ans_sub_ids
    
    return df

def build_biencoder_data(dqa_split, bm25, set_ques, no_hneg, no_search):
    """
    Build train, val, test, dataframe used for biencoder training
    """
    qa_ids = []
    neg_ids = []
    search_ids = []
    q_texts = dqa_split['question'].tolist()
    q_bm25texts = dqa_split['bm25_question'].tolist()
    count = 0
    ans_ids = dqa_split['ans_id'].tolist()
    ids = [i for i in range(bm25.corpus_size)]
    for i in range(len(q_texts)):
        if q_texts[i] in set_ques:
            qa_ids.append(i)
            q_bm25 = q_bm25texts[i].split(" ")
            bm25_ids = bm25.get_top_n(q_bm25, ids, n=no_search)
            if ans_ids[i] in bm25_ids:
                count += 1
        
            neg = bm25_ids[:(no_hneg+1)]
            if ans_ids[i] in neg:
                neg.remove(ans_ids[i])
                
            neg = neg[:no_hneg]
            neg_ids.append(neg)
            search_ids.append(bm25_ids)
    print(count/len(qa_ids))   
    df = dqa_split.loc[qa_ids]
    df['neg_ids'] = neg_ids
    df['search_ids'] = search_ids
    return df

def build_short_data(df, dcorpus, limited_length = 234):
    """
    Build short data
    """
    ids = [i for i in range(len(df)) if dcorpus['len'][df['ans_id'][i]] <= limited_length]
    dshort = df.loc[ids].copy(deep= True).reset_index(drop=True)
    return dshort

def build_general_data(dqa, bm25, set_ques, no_hneg, no_search):
    """
    Build general train, test, val dataframe
    """
    qa_ids = []
    neg_ids = []
    search_ids = []
    q_texts = dqa['question'].tolist()
    q_bm25texts = dqa['bm25_question'].tolist()
    ans_ids = dqa['ans_id'].tolist()
    ids = [i for i in range(bm25.corpus_size)]
    count = 0
    
    for i in range(len(q_texts)):
        if q_texts[i] in set_ques:
            qa_ids.append(i)
            q_bm25 = q_bm25texts[i].split(" ")
            ans_id = [int(x) for x in ans_ids[i].split(", ")]
            bm25_ids = bm25.get_top_n(q_bm25, ids, n= no_search)
            search_ids.append(bm25_ids)
            
            for a_id in ans_id:
                if a_id in bm25_ids:
                    bm25_ids.remove(a_id)
            neg_id = bm25_ids[:no_hneg]
            neg_ids.append(neg_id)
            if len(bm25_ids) == (no_search - len(ans_id)):
                count += 1      
        
    df = dqa.loc[qa_ids]
    df['neg_ids'] = neg_ids
    df['search_ids'] = search_ids
    print(count/len(qa_ids))
    return df