# viLegal_bi/src/bi/preprocess.py
import re
import math
import json
import pandas as pd
import string
# common phrases in legal documents
re_thuchientheo = re.compile(
r"((((được\s)?thực hiện theo qu[iy] định tại\s|hướng dẫn tại\s|theo qu[iy] định tại\s|(được\s)?thực hiện theo\s|theo qu[iy] định tại\s|theo nội dung qu[yi] định tại\s|quy[iy] định tại|theo\s)(các\s)?)?|tại\s(các\s)?)(khoản(\ssố)?\s(\d+\,\s)*\d+|điều(\ssố)?\s(\d+\,\s)*\d+|điểm\s(([a-z]|đ)\,\s)*([a-z]|đ)\b|chương(\ssố)?\s(\d+\,\s)*\d+)((\s|\,\s|\s\,\s|\svà\s)(khoản(\ssố)?\s(\d+\,\s)*\d+|điều(\ssố)?\s(\d+\,\s)*\d+|điểm\s(([a-z]|đ)\,\s)*([a-z]|đ)\b|chương(\ssố)?\s(\d+\,\s)*\d+))*(\s(điều này|thông tư này|nghị quyết này|quyết định này|nghị định này|văn bản này|quyết định này))?"
)
re_thongtuso = re.compile(
r"(thông tư liên tịch|thông tư|nghị quyết|quyết định|nghị định|văn bản|Thông tư liên tịch|Thông tư|Nghị quyết|Nghị định|Văn bản|Quyết định)\s(số\s)?(([a-z0-9]|đ|\-)+\/([a-z0-9]|đ|\-|\/)*)"
)
re_ngay = re.compile(r"ngày\s\d+\/\d+\/\d+\b|ngày\s\d+tháng\d+năm\d+")
re_thang_nam = re.compile(r"tháng\s\d+\/\d+|tháng\s\d+|năm\s\d+")
re_chuong = re.compile(
r"chương\s(III|II|IV|IX|VIII|VII|VI|XIII|XII|XI|XIV|XIX|XVIII|XVII|XVI|XV|XX|V|X|I|XXIII|XXII|XXI|XXIV|XXVIII|XXVII|XXVI|XXV|XXIX|XXX)\b"
)
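# Illustrative examples (invented for documentation, not drawn from any corpus) of
# strings each pattern above matches:
#   re_thuchientheo: "theo quy định tại khoản 1 và điều 5 nghị định này"
#   re_thongtuso:    "nghị định số 01/2021/nđ-cp"
#   re_ngay:         "ngày 15/3/2021"
#   re_chuong:       "chương IV"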
# common end phrases in questions
END_PHRASES = [
"có đúng không",
"đúng không",
"được không",
"hay không",
"được hiểu thế nào",
"được quy định cụ thể là gì",
"được quy định như thế nào",
"được quy định thế nào",
"được quy định như nào",
"trong trường hợp như nào",
"trong trường hợp như thế nào",
"trong trường hợp nào",
"trong những trường hợp nào",
"được hiểu như thế nào",
"được hiểu như nào",
"như thế nào",
"thế nào",
"như nào",
"là gì",
"là ai",
"là bao nhiêu",
"bao nhiêu",
"trước bao lâu",
"là bao lâu",
"bao lâu",
"bao gồm gì",
"không",
"bao gồm những gì",
"vào thời điểm nào",
"gồm những giấy tờ gì",
"những yêu cầu nào",
]
# punctuations, characters, stop-words
punc = """!"#$%&'()*+,-./:;<=>?@[\]^`{|}~""" # noqa: W605
table = str.maketrans("", "", punc)
punctuation = [x for x in string.punctuation]
number = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
chars = ["a", "b", "c", "d", "đ", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]
stop_word = number + chars + ["của", "và", "các", "có", "được", "theo", "tại", "trong", "về",
"hoặc", "người", "này", "khoản", "cho", "không", "từ", "phải",
"ngày", "việc", "sau", "để", "đến", "bộ", "với", "là", "năm",
"khi", "số", "trên", "khác", "đã", "thì", "thuộc", "điểm", "đồng",
"do", "một", "bị", "vào", "lại", "ở", "nếu", "làm", "đây",
"như", "đó", "mà", "nơi", "”", "“"]
bm25_removed = punctuation + stop_word
# defining sub-functions
def remove_dieu_number(text):
    '''
    This function removes common legal citation phrases from texts
    '''
text = re_thuchientheo.sub(" ", text)
text = re_thongtuso.sub(" ", text)
text = re_ngay.sub(" ", text)
text = re_thang_nam.sub(" ", text)
text = re_chuong.sub(" ", text)
return " ".join(text.split())
def remove_other_number_by_zero(text):
    '''
    This function replaces every digit in the text with 0 for easier handling
    '''
for digit in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
text = text.replace(digit, "0")
return text
def remove_punct(text):
    '''
    This function normalizes some punctuation marks and strips the rest for easier handling
    '''
text = text.replace(";", ",").replace(":", ".").replace("“", " ").replace("”", " ")
text = "".join(
[
c
if c.isalpha() or c.isdigit() or c in [" ", ",", "(", ")", ".", "/", "-"]
else " "
for c in text
]
)
text = " ".join(text.split())
return text
def lower_or_keep(text):
"This funtion lower words but not for abbreviations"
lst = text.split(" ")
newlst = [x if x.isupper() else x.lower() for x in lst]
return " ".join(newlst)
def preprocess_all_title(article_title):
"""
Preprocess titles of documents
"""
article_title = lower_or_keep(article_title)
    lst = article_title.split()
    new_lst = []
    # drop document numbers such as "số 01/2021/..." from the title
    for i in range(len(lst)):
if lst[i] == 'số' and i == len(lst)-1:
new_lst.append(lst[i])
elif lst[i] == 'số' and "/" in lst[i+1]:
pass
elif "/" in lst[i]:
pass
else:
new_lst.append(lst[i])
article_title = " ".join(new_lst)
article_title = remove_dieu_number(article_title)
#article_title = remove_other_number_by_zero(article_title)
article_title = remove_punct(article_title)
    article_title = article_title.replace("về", "")  # drop the connective "về" ("on/regarding")
    if "do" in article_title and "ban hành" in article_title:
        # drop the trailing "do ... ban hành" clause naming the issuing agency
        idx = article_title.rfind("do")
        article_title = article_title[:(idx-1)]
re_head = re.compile(r"(thông tư liên tịch|thông tư|nghị quyết|quyết định|nghị định|văn bản)\s(quy định|hướng dẫn)?")
article_title = re_head.sub(" ", article_title)
    article_title = " ".join(article_title.split())  # collapse repeated whitespace
return article_title.strip()
def preprocess_article_title(article_title):
"""
Preprocess titles of documents
"""
article_title = lower_or_keep(article_title)
article_title = " ".join(article_title.split()[2:]) # Dieu 1.
article_title = remove_dieu_number(article_title)
#article_title = remove_other_number_by_zero(article_title)
article_title = remove_punct(article_title)
return article_title
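# Hedged example for preprocess_article_title (invented input; expected output, not verified by a run):
#   preprocess_article_title("Điều 5. Phạm vi điều chỉnh") -> "phạm vi điều chỉnh"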
def preprocess_khoan(khoan):
"""
Perprocess parts in a legal documents
"""
khoan = lower_or_keep(khoan)
khoan = khoan.replace("\xa0", "")
matched = re.match(r"^\d+\.(\d+\.?)?\s", khoan) # 1. 2.2. 2.2
if matched is not None:
khoan = khoan[matched.span()[1]:].strip()
else:
matched2 = re.match(r"^[\wđ]\)\s", khoan)
if matched2 is not None:
khoan = khoan[matched2.span()[1]:].strip()
khoan = remove_dieu_number(khoan)
#khoan = khoan.replace("đ)","")
    # strip remaining list markers such as "a) ", "b. ", "1.2.3. ", "1.2. " and "1. "
    khoan = re.sub(r"[\wđ]\) ", "", khoan)
    khoan = re.sub(r"[\wđ]\. ", "", khoan)
    khoan = re.sub(r"\d+\.\d+\.\d+\. ", "", khoan)
    khoan = re.sub(r"\d+\.\d+\. ", "", khoan)
    khoan = re.sub(r"\d+\. ", "", khoan)
#khoan = re.sub(r"[0-9]\. ", "", khoan)
#khoan = remove_other_number_by_zero(khoan)
khoan = remove_punct(khoan)
khoan = khoan.replace(". .", ".")
khoan = khoan.replace("..", ".")
khoan = khoan.replace(", ,", ",")
khoan = khoan.replace(",,", ",")
khoan = khoan.strip()
return " ".join(khoan.split())
def preprocess_question(q, remove_end_phrase=True):
"""
Preprocess questions
"""
q = lower_or_keep(q)
q = remove_dieu_number(q)
q = "".join([c if c.isalpha() or c.isdigit() or c == " " else " " for c in q])
q = remove_punct(q)
if remove_end_phrase:
for phrase in END_PHRASES:
if q.endswith(phrase):
q = q[: -len(phrase)]
break
return q.strip()
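# Hedged example for preprocess_question (invented input; expected output, not verified by a run):
#   preprocess_question("Mức phạt tiền tối đa là bao nhiêu")
#   -> "mức phạt tiền tối đa"   # the trailing END_PHRASES entry "là bao nhiêu" is stripped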
'''def tokenise(text, segmenter):
"""
    Segment the texts with the VnCoreNLP segmenter
"""
result = segmenter.tokenize(text)
rlt = ""
for i in range(len(result)-1):
rlt += " ".join(result[i])
rlt += " "
rlt += " ".join(result[len(result)-1])
return rlt
'''
def tokenise(text, f):
"""
    Segment the text with the provided tokenizer f (e.g. pyvi's ViTokenizer.tokenize)
"""
return f(text)
def remove_stopword(w):
"Remove stopwords in texts"
return w not in stop_word
def bm25_process(text, f):
"""
Processing texts for bm25: remove all puntuations, lower all words
"""
text = tokenise(text, f)
words = text.lower().split(" ")
result = [w for w in words if w not in bm25_removed]
stripped = " ".join(result)
result = " ".join(stripped.split(" "))
return result
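# Hedged usage sketch for bm25_process, assuming pyvi is installed; the exact output depends
# on pyvi's word segmentation, so no concrete result is asserted here:
#   from pyvi import ViTokenizer
#   bm25_process(processed_text, ViTokenizer.tokenize)
#   -> a lowercased, word-segmented string with punctuation and stopwords removed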
def length(sentence):
"Return the length in words of sentences"
return len(sentence.split())
def build_corpus(f, corpus_file, law_dict, scorpus_ids, head=False):
    """
    Build a corpus dataframe with one row per article
    """
law_ids = []
text_ids = []
article_ids = []
titles = []
texts = []
processed_texts = []
tokenized_texts = []
bm25texts = []
lengths = []
ids = []
sub_ids = []
count = 0
    with open(corpus_file, "r", encoding="utf-8") as fin:
        data = json.load(fin)
for law in data:
for article in law['articles']:
ids.append(count)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_id = law['law_id'] + "_" + article['article_id']
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
title = preprocess_article_title(article["title"])
                law_title = preprocess_all_title(law_dict[law['law_id']])  # law-level heading (kept separate from the head flag)
cac_khoan = article["text"].split("\n")
khoan_clean = []
for khoan in cac_khoan:
khoan = preprocess_khoan(khoan)
khoan_clean.append(khoan.strip())
article_text = " ".join(khoan_clean)
                if head:
                    processed_text = law_title + ". " + title + ". " + article_text
                else:
                    processed_text = title + ". " + article_text + ". " + law_title + "."
processed_texts.append(processed_text)
                start_sub_id = scorpus_ids.index(count)
                try:
                    end_sub_id = scorpus_ids.index(count+1)
                    sub_ids.append([i for i in range(start_sub_id, end_sub_id)])
                except ValueError:  # last article: count+1 has no rows in the short corpus
                    sub_ids.append([i for i in range(start_sub_id, len(scorpus_ids))])
                try:
                    tokenized_text = tokenise(processed_text, f)
                except Exception:  # very long texts can crash the tokenizer, so truncate first
                    processed_text = processed_text[:50000]
                    tokenized_text = tokenise(processed_text, f)
                tokenized_texts.append(tokenized_text)
                lengths.append(length(tokenized_text))
bm25texts.append(bm25_process(processed_text, f))
count += 1
df = pd.DataFrame()
df["id"] = ids
df["law_id"] = law_ids
df["article_id"] = article_ids
df["text_id"] = text_ids
df["title"] = titles
df["text"] = texts
df["processed_text"] = processed_texts
df["sub_id"] = sub_ids
df["tokenized_text"] = tokenized_texts
df["bm25text"] = bm25texts
df["len"] = lengths
return df
def create_sliding_window(tokenized_text, size=200, overlap=64):
"""
Create list of windows for a text
"""
sentences = tokenized_text.split(".")
words = tokenized_text.split(" ")
    title = sentences[0]
    words = [w for w in words if len(w) > 0]
    actual_size = size - overlap  # stride between consecutive windows
windows = []
n_windows = math.ceil(len(words)/actual_size)
for i in range(n_windows):
windows.append(" ".join(words[i*actual_size:i*actual_size + size]))
for i in range(1, n_windows):
if not windows[i].startswith("."):
windows[i] = title + ". " + windows[i]
else:
windows[i] = title + windows[i]
return windows
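# Worked example for create_sliding_window: with the defaults size=200 and overlap=64 the
# stride is 200 - 64 = 136 words, so a 500-word tokenized text yields ceil(500/136) = 4
# windows starting at words 0, 136, 272 and 408, each up to 200 words long; every window
# after the first gets the first sentence (the article title) prepended for context.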
def build_short_corpus(f, corpus_file, law_dict, head=False, size=200, overlap=64):
"""
Build a corpus-dataframe
"""
ids = []
law_ids = []
text_ids = []
article_ids = []
titles = []
texts = []
processed_texts = []
sub_ids = []
tokenized_texts = []
bm25texts = []
lengths = []
    with open(corpus_file, "r", encoding="utf-8") as fin:
        data = json.load(fin)
idx = 0
sub_idx = 0
for law in data:
for article in law['articles']:
text_id = law['law_id'] + "_" + article['article_id']
title = preprocess_article_title(article["title"])
                law_title = preprocess_all_title(law_dict[law['law_id']])  # law-level heading (kept separate from the head flag)
cac_khoan = article["text"].split("\n")
khoan_clean = []
for khoan in cac_khoan:
khoan = preprocess_khoan(khoan)
khoan_clean.append(khoan.strip())
article_text = " ".join(khoan_clean)
                if head:
                    processed_text = law_title + ". " + title + ". " + article_text
                else:
                    processed_text = title + ". " + article_text + ". " + law_title + "."
try:
tokenized_text = tokenise(processed_text, f)
tokenized_len = length(tokenized_text)
if tokenized_len <= size + 10:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(tokenized_text)
lengths.append(tokenized_len)
bm25texts.append(bm25_process(processed_text, f))
sub_idx +=1
else:
windows = create_sliding_window(tokenized_text, size=224, overlap=64)
for window in windows:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(window)
lengths.append(length(window))
bm25texts.append(bm25_process(window, f))
sub_idx +=1
                except Exception:  # the tokenizer failed on an extremely long text; chunk by characters first
                    actual_size = 50000 - overlap
                    big_windows = []
                    n_big_windows = math.ceil(len(processed_text)/actual_size)
                    for i in range(n_big_windows):
                        # character-level chunks of at most 50000 characters, overlapping by `overlap` characters
                        big_windows.append(processed_text[i*actual_size:i*actual_size + 50000])
for big_window in big_windows:
tokenized_text = tokenise(big_window, f)
tokenized_len = length(tokenized_text)
if tokenized_len > size + 10:
windows = create_sliding_window(tokenized_text, size=224, overlap=64)
for window in windows:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(window)
lengths.append(length(window))
bm25texts.append(bm25_process(window, f))
sub_idx +=1
else:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(tokenized_text)
lengths.append(tokenized_len)
bm25texts.append(bm25_process(processed_text, f))
sub_idx +=1
idx += 1
df = pd.DataFrame()
df["id"] = ids
df["law_id"] = law_ids
df["article_id"] = article_ids
df["text_id"] = text_ids
df["title"] = titles
df["text"] = texts
df["processed_text"] = processed_texts
df["sub_id"] = sub_ids
df["tokenized_text"] = tokenized_texts
df["bm25text"] = bm25texts
df["len"] = lengths
return df
def build_qa(f, df, qa_file, split=False):
"""
Build a question-answer dataframe
"""
text_ids = df["text_id"].tolist()
titles = df["title"].tolist()
texts = df["text"].tolist()
lengths = df["len"].tolist()
sub_ids = df["sub_id"].tolist()
q_texts = []
q_processed_texts = []
q_tokenized_texts = []
q_bm25texts = []
q_lens = []
no_ans = []
ans_ids = []
ans_text_ids = []
ans_titles = []
ans_texts = []
ans_lens = []
ans_sub_ids = []
    with open(qa_file, "r", encoding="utf-8") as fin:
        data = json.load(fin)
if not split:
for item in data['items']:
question = item["question"]
q_texts.append(question)
q_processed_text = preprocess_question(question, remove_end_phrase=False)
q_processed_texts.append(q_processed_text)
q_tokenized_text = tokenise(q_processed_text, f)
q_tokenized_texts.append(q_tokenized_text)
q_bm25texts.append(bm25_process(q_processed_text, f))
q_lens.append(length(q_tokenized_text))
ans_text_id = ""
ans_id = ""
ans_title = ""
ans_text = ""
ans_len = ""
ans_count = 0
ans_sub_id = []
for i in range(len(item['relevant_articles'])):
ans_count += 1
atext_id = item['relevant_articles'][i]['law_id'] + "_" + item['relevant_articles'][i]['article_id']
a_id = text_ids.index(atext_id)
ans_text_id += atext_id
ans_id += str(a_id)
ans_title += titles[a_id]
ans_text += texts[a_id]
ans_len += str(lengths[a_id])
sub_id = sub_ids[a_id]
ans_sub_id += sub_id
if i < len(item["relevant_articles"]) - 1:
ans_text_id += ", "
ans_id += ", "
ans_title += ", "
ans_text += ", "
ans_len += ", "
no_ans.append(ans_count)
ans_text_ids.append(ans_text_id)
ans_ids.append(ans_id)
ans_titles.append(ans_title)
ans_texts.append(ans_text)
ans_lens.append(ans_len)
ans_sub_ids.append(ans_sub_id)
else:
for item in data['items']:
question = item["question"]
for article in item['relevant_articles']:
q_texts.append(question)
q_processed_text = preprocess_question(question, remove_end_phrase=False)
q_processed_texts.append(q_processed_text)
q_tokenized_text = tokenise(q_processed_text, f)
q_tokenized_texts.append(q_tokenized_text)
q_bm25texts.append(bm25_process(q_processed_text, f))
q_lens.append(length(q_tokenized_text))
ans_text_id = article['law_id'] + "_" + article['article_id']
ans_text_ids.append(ans_text_id)
a_id = text_ids.index(ans_text_id)
ans_ids.append(a_id)
ans_titles.append(titles[a_id])
ans_texts.append(texts[a_id])
ans_lens.append(lengths[a_id])
ans_sub_ids.append(sub_ids[a_id])
df = pd.DataFrame()
df["question"] = q_texts
df["processed_question"] = q_processed_texts
df["tokenized_question"] = q_tokenized_texts
df["bm25_question"] = q_bm25texts
df["ques_len"] = q_lens
if not split:
df['no_ans'] = no_ans
df["ans_text_id"] = ans_text_ids
df["ans_id"] = ans_ids
df["ans_title"] = ans_titles
df["ans_text"] = ans_texts
df["ans_len"] = ans_lens
df["ans_sub_id"] = ans_sub_ids
return df
def build_biencoder_data(dqa_split, bm25, set_ques, no_hneg, no_search):
"""
Build train, val, test, dataframe used for biencoder training
"""
qa_ids = []
neg_ids = []
search_ids = []
q_texts = dqa_split['question'].tolist()
q_bm25texts = dqa_split['bm25_question'].tolist()
count = 0
ans_ids = dqa_split['ans_id'].tolist()
ids = [i for i in range(bm25.corpus_size)]
for i in range(len(q_texts)):
if q_texts[i] in set_ques:
qa_ids.append(i)
q_bm25 = q_bm25texts[i].split(" ")
bm25_ids = bm25.get_top_n(q_bm25, ids, n=no_search)
if ans_ids[i] in bm25_ids:
count += 1
neg = bm25_ids[:(no_hneg+1)]
if ans_ids[i] in neg:
neg.remove(ans_ids[i])
neg = neg[:no_hneg]
neg_ids.append(neg)
search_ids.append(bm25_ids)
    print(count/len(qa_ids))  # fraction of rows whose gold article appears in the BM25 top-no_search
    df = dqa_split.loc[qa_ids].copy()
    df['neg_ids'] = neg_ids
    df['search_ids'] = search_ids
return df
def build_short_data(df, dcorpus, limited_length=234):
    """
    Keep only QA rows whose answer article is at most limited_length tokens long
    """
    ids = [i for i in range(len(df)) if dcorpus['len'][df['ans_id'][i]] <= limited_length]
    dshort = df.loc[ids].copy(deep=True).reset_index(drop=True)
return dshort
def build_general_data(dqa, bm25, set_ques, no_hneg, no_search):
"""
Build general train, test, val dataframe
"""
qa_ids = []
neg_ids = []
search_ids = []
q_texts = dqa['question'].tolist()
q_bm25texts = dqa['bm25_question'].tolist()
ans_ids = dqa['ans_id'].tolist()
ids = [i for i in range(bm25.corpus_size)]
count = 0
for i in range(len(q_texts)):
if q_texts[i] in set_ques:
qa_ids.append(i)
q_bm25 = q_bm25texts[i].split(" ")
ans_id = [int(x) for x in ans_ids[i].split(", ")]
            bm25_ids = bm25.get_top_n(q_bm25, ids, n=no_search)
search_ids.append(bm25_ids)
for a_id in ans_id:
if a_id in bm25_ids:
bm25_ids.remove(a_id)
neg_id = bm25_ids[:no_hneg]
neg_ids.append(neg_id)
if len(bm25_ids) == (no_search - len(ans_id)):
count += 1
    df = dqa.loc[qa_ids].copy()
    df['neg_ids'] = neg_ids
    df['search_ids'] = search_ids
    print(count/len(qa_ids))  # fraction of questions whose gold articles all appear in the BM25 top-no_search
return df
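# Hedged end-to-end sketch of how these builders might be wired together. Assumptions not
# fixed by this file: the JSON paths and the law-title mapping are hypothetical, the
# tokenizer is pyvi's ViTokenizer.tokenize, the BM25 object is rank_bm25.BM25Okapi (whose
# get_top_n/corpus_size match the attributes used above), and no_hneg/no_search are
# illustrative values.
if __name__ == "__main__":
    from pyvi import ViTokenizer
    from rank_bm25 import BM25Okapi

    # law_id -> full law title (assumed layout of a hypothetical file)
    with open("law_titles.json", "r", encoding="utf-8") as fin:
        law_titles = json.load(fin)

    # Build the windowed corpus first: its "id" column maps every window back to its
    # parent article and is what build_corpus expects as scorpus_ids.
    dshort = build_short_corpus(ViTokenizer.tokenize, "legal_corpus.json", law_titles, head=True)
    dcorpus = build_corpus(ViTokenizer.tokenize, "legal_corpus.json", law_titles,
                           dshort["id"].tolist(), head=True)

    # BM25 index over the article-level bm25text column, then QA pairs (one gold article
    # per row) and bi-encoder training data with BM25 hard negatives.
    bm25 = BM25Okapi([t.split(" ") for t in dcorpus["bm25text"]])
    dqa = build_qa(ViTokenizer.tokenize, dcorpus, "train_qa.json", split=True)
    dtrain = build_biencoder_data(dqa, bm25, set(dqa["question"]), no_hneg=7, no_search=100)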