# viLegal_bi/src/bi/preprocess.py
import re
import math
import json
import pandas as pd
import string
# common phrases in legal documents
re_thuchientheo = re.compile(
r"((((được\s)?thực hiện theo qu[iy] định tại\s|hướng dẫn tại\s|theo qu[iy] định tại\s|(được\s)?thực hiện theo\s|theo qu[iy] định tại\s|theo nội dung qu[yi] định tại\s|quy[iy] định tại|theo\s)(các\s)?)?|tại\s(các\s)?)(khoản(\ssố)?\s(\d+\,\s)*\d+|điều(\ssố)?\s(\d+\,\s)*\d+|điểm\s(([a-z]|đ)\,\s)*([a-z]|đ)\b|chương(\ssố)?\s(\d+\,\s)*\d+)((\s|\,\s|\s\,\s|\svà\s)(khoản(\ssố)?\s(\d+\,\s)*\d+|điều(\ssố)?\s(\d+\,\s)*\d+|điểm\s(([a-z]|đ)\,\s)*([a-z]|đ)\b|chương(\ssố)?\s(\d+\,\s)*\d+))*(\s(điều này|thông tư này|nghị quyết này|quyết định này|nghị định này|văn bản này|quyết định này))?"
)
re_thongtuso = re.compile(
r"(thông tư liên tịch|thông tư|nghị quyết|quyết định|nghị định|văn bản|Thông tư liên tịch|Thông tư|Nghị quyết|Nghị định|Văn bản|Quyết định)\s(số\s)?(([a-z0-9]|đ|\-)+\/([a-z0-9]|đ|\-|\/)*)"
)
re_ngay = re.compile(r"ngày\s\d+\/\d+\/\d+\b|ngày\s\d+tháng\d+năm\d+")
re_thang_nam = re.compile(r"tháng\s\d+\/\d+|tháng\s\d+|năm\s\d+")
re_chuong = re.compile(
r"chương\s(III|II|IV|IX|VIII|VII|VI|XIII|XII|XI|XIV|XIX|XVIII|XVII|XVI|XV|XX|V|X|I|XXIII|XXII|XXI|XXIV|XXVIII|XXVII|XXVI|XXV|XXIX|XXX)\b"
)
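# Illustrative examples (invented for documentation, not drawn from any corpus) of
# strings each pattern above matches:
#   re_thuchientheo: "theo quy định tại khoản 1 và điều 5 nghị định này"
#   re_thongtuso:    "nghị định số 01/2021/nđ-cp"
#   re_ngay:         "ngày 15/3/2021"
#   re_chuong:       "chương IV"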
# common end phrases in questions
END_PHRASES = [
"có đúng không",
"đúng không",
"được không",
"hay không",
"được hiểu thế nào",
"được quy định cụ thể là gì",
"được quy định như thế nào",
"được quy định thế nào",
"được quy định như nào",
"trong trường hợp như nào",
"trong trường hợp như thế nào",
"trong trường hợp nào",
"trong những trường hợp nào",
"được hiểu như thế nào",
"được hiểu như nào",
"như thế nào",
"thế nào",
"như nào",
"là gì",
"là ai",
"là bao nhiêu",
"bao nhiêu",
"trước bao lâu",
"là bao lâu",
"bao lâu",
"bao gồm gì",
"không",
"bao gồm những gì",
"vào thời điểm nào",
"gồm những giấy tờ gì",
"những yêu cầu nào",
]
# punctuations, characters, stop-words
punc = """!"#$%&'()*+,-./:;<=>?@[\]^`{|}~""" # noqa: W605
table = str.maketrans("", "", punc)
punctuation = [x for x in string.punctuation]
number = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
chars = ["a", "b", "c", "d", "đ", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]
stop_word = number + chars + ["của", "và", "các", "có", "được", "theo", "tại", "trong", "về",
"hoặc", "người", "này", "khoản", "cho", "không", "từ", "phải",
"ngày", "việc", "sau", "để", "đến", "bộ", "với", "là", "năm",
"khi", "số", "trên", "khác", "đã", "thì", "thuộc", "điểm", "đồng",
"do", "một", "bị", "vào", "lại", "ở", "nếu", "làm", "đây",
"như", "đó", "mà", "nơi", "”", "“"]
bm25_removed = punctuation + stop_word
# defining sub-functions
def remove_dieu_number(text):
    '''
    This function removes common legal citation phrases from texts
    '''
text = re_thuchientheo.sub(" ", text)
text = re_thongtuso.sub(" ", text)
text = re_ngay.sub(" ", text)
text = re_thang_nam.sub(" ", text)
text = re_chuong.sub(" ", text)
return " ".join(text.split())
def remove_other_number_by_zero(text):
    '''
    This function replaces every digit in the text with 0 for easier handling
    '''
for digit in ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]:
text = text.replace(digit, "0")
return text
def remove_punct(text):
    '''
    This function normalizes some punctuation marks and strips the rest for easier handling
    '''
text = text.replace(";", ",").replace(":", ".").replace("“", " ").replace("”", " ")
text = "".join(
[
c
if c.isalpha() or c.isdigit() or c in [" ", ",", "(", ")", ".", "/", "-"]
else " "
for c in text
]
)
text = " ".join(text.split())
return text
def lower_or_keep(text):
"This funtion lower words but not for abbreviations"
lst = text.split(" ")
newlst = [x if x.isupper() else x.lower() for x in lst]
return " ".join(newlst)
def preprocess_all_title(article_title):
"""
Preprocess titles of documents
"""
article_title = lower_or_keep(article_title)
    lst = article_title.split()
    new_lst = []
    # drop document numbers such as "số 01/2021/..." from the title
    for i in range(len(lst)):
if lst[i] == 'số' and i == len(lst)-1:
new_lst.append(lst[i])
elif lst[i] == 'số' and "/" in lst[i+1]:
pass
elif "/" in lst[i]:
pass
else:
new_lst.append(lst[i])
article_title = " ".join(new_lst)
article_title = remove_dieu_number(article_title)
#article_title = remove_other_number_by_zero(article_title)
article_title = remove_punct(article_title)
    article_title = article_title.replace("về", "")  # drop the connective "về" ("on/regarding")
    if "do" in article_title and "ban hành" in article_title:
        # drop the trailing "do ... ban hành" clause naming the issuing agency
        idx = article_title.rfind("do")
        article_title = article_title[:(idx-1)]
re_head = re.compile(r"(thông tư liên tịch|thông tư|nghị quyết|quyết định|nghị định|văn bản)\s(quy định|hướng dẫn)?")
article_title = re_head.sub(" ", article_title)
    article_title = " ".join(article_title.split())  # collapse repeated whitespace
return article_title.strip()
def preprocess_article_title(article_title):
"""
Preprocess titles of documents
"""
article_title = lower_or_keep(article_title)
article_title = " ".join(article_title.split()[2:]) # Dieu 1.
article_title = remove_dieu_number(article_title)
#article_title = remove_other_number_by_zero(article_title)
article_title = remove_punct(article_title)
return article_title
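# Hedged example for preprocess_article_title (invented input; expected output, not verified by a run):
#   preprocess_article_title("Điều 5. Phạm vi điều chỉnh") -> "phạm vi điều chỉnh"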
def preprocess_khoan(khoan):
"""
Perprocess parts in a legal documents
"""
khoan = lower_or_keep(khoan)
khoan = khoan.replace("\xa0", "")
matched = re.match(r"^\d+\.(\d+\.?)?\s", khoan) # 1. 2.2. 2.2
if matched is not None:
khoan = khoan[matched.span()[1]:].strip()
else:
matched2 = re.match(r"^[\wđ]\)\s", khoan)
if matched2 is not None:
khoan = khoan[matched2.span()[1]:].strip()
khoan = remove_dieu_number(khoan)
#khoan = khoan.replace("đ)","")
    # strip remaining list markers such as "a) ", "b. ", "1.2.3. ", "1.2. " and "1. "
    khoan = re.sub(r"[\wđ]\) ", "", khoan)
    khoan = re.sub(r"[\wđ]\. ", "", khoan)
    khoan = re.sub(r"\d+\.\d+\.\d+\. ", "", khoan)
    khoan = re.sub(r"\d+\.\d+\. ", "", khoan)
    khoan = re.sub(r"\d+\. ", "", khoan)
#khoan = re.sub(r"[0-9]\. ", "", khoan)
#khoan = remove_other_number_by_zero(khoan)
khoan = remove_punct(khoan)
khoan = khoan.replace(". .", ".")
khoan = khoan.replace("..", ".")
khoan = khoan.replace(", ,", ",")
khoan = khoan.replace(",,", ",")
khoan = khoan.strip()
return " ".join(khoan.split())
def preprocess_question(q, remove_end_phrase=True):
"""
Preprocess questions
"""
q = lower_or_keep(q)
q = remove_dieu_number(q)
q = "".join([c if c.isalpha() or c.isdigit() or c == " " else " " for c in q])
q = remove_punct(q)
if remove_end_phrase:
for phrase in END_PHRASES:
if q.endswith(phrase):
q = q[: -len(phrase)]
break
return q.strip()
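# Hedged example for preprocess_question (invented input; expected output, not verified by a run):
#   preprocess_question("Mức phạt tiền tối đa là bao nhiêu")
#   -> "mức phạt tiền tối đa"   # the trailing END_PHRASES entry "là bao nhiêu" is stripped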
'''def tokenise(text, segmenter):
"""
    Segment the texts with the VnCoreNLP segmenter
"""
result = segmenter.tokenize(text)
rlt = ""
for i in range(len(result)-1):
rlt += " ".join(result[i])
rlt += " "
rlt += " ".join(result[len(result)-1])
return rlt
'''
def tokenise(text, f):
"""
    Segment the text with the provided tokenizer f (e.g. pyvi's ViTokenizer.tokenize)
"""
return f(text)
def remove_stopword(w):
"Remove stopwords in texts"
return w not in stop_word
def bm25_process(text, f):
"""
Processing texts for bm25: remove all puntuations, lower all words
"""
text = tokenise(text, f)
words = text.lower().split(" ")
result = [w for w in words if w not in bm25_removed]
stripped = " ".join(result)
result = " ".join(stripped.split(" "))
return result
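# Hedged usage sketch for bm25_process, assuming pyvi is installed; the exact output depends
# on pyvi's word segmentation, so no concrete result is asserted here:
#   from pyvi import ViTokenizer
#   bm25_process(processed_text, ViTokenizer.tokenize)
#   -> a lowercased, word-segmented string with punctuation and stopwords removed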
def length(sentence):
"Return the length in words of sentences"
return len(sentence.split())
def build_corpus(f, corpus_file, law_dict, scorpus_ids, head=False):
    """
    Build a corpus dataframe with one row per article
    """
law_ids = []
text_ids = []
article_ids = []
titles = []
texts = []
processed_texts = []
tokenized_texts = []
bm25texts = []
lengths = []
ids = []
sub_ids = []
count = 0
    with open(corpus_file, "r", encoding="utf-8") as fin:
        data = json.load(fin)
for law in data:
for article in law['articles']:
ids.append(count)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_id = law['law_id'] + "_" + article['article_id']
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
title = preprocess_article_title(article["title"])
                law_title = preprocess_all_title(law_dict[law['law_id']])  # law-level heading (kept separate from the head flag)
cac_khoan = article["text"].split("\n")
khoan_clean = []
for khoan in cac_khoan:
khoan = preprocess_khoan(khoan)
khoan_clean.append(khoan.strip())
article_text = " ".join(khoan_clean)
                if head:
                    processed_text = law_title + ". " + title + ". " + article_text
                else:
                    processed_text = title + ". " + article_text + ". " + law_title + "."
processed_texts.append(processed_text)
                start_sub_id = scorpus_ids.index(count)
                try:
                    end_sub_id = scorpus_ids.index(count+1)
                    sub_ids.append([i for i in range(start_sub_id, end_sub_id)])
                except ValueError:  # last article: count+1 has no rows in the short corpus
                    sub_ids.append([i for i in range(start_sub_id, len(scorpus_ids))])
                try:
                    tokenized_text = tokenise(processed_text, f)
                except Exception:  # very long texts can crash the tokenizer, so truncate first
                    processed_text = processed_text[:50000]
                    tokenized_text = tokenise(processed_text, f)
                tokenized_texts.append(tokenized_text)
                lengths.append(length(tokenized_text))
bm25texts.append(bm25_process(processed_text, f))
count += 1
df = pd.DataFrame()
df["id"] = ids
df["law_id"] = law_ids
df["article_id"] = article_ids
df["text_id"] = text_ids
df["title"] = titles
df["text"] = texts
df["processed_text"] = processed_texts
df["sub_id"] = sub_ids
df["tokenized_text"] = tokenized_texts
df["bm25text"] = bm25texts
df["len"] = lengths
return df
def create_sliding_window(tokenized_text, size=200, overlap=64):
"""
Create list of windows for a text
"""
sentences = tokenized_text.split(".")
words = tokenized_text.split(" ")
    title = sentences[0]
    words = [w for w in words if len(w) > 0]
    actual_size = size - overlap  # stride between consecutive windows
windows = []
n_windows = math.ceil(len(words)/actual_size)
for i in range(n_windows):
windows.append(" ".join(words[i*actual_size:i*actual_size + size]))
for i in range(1, n_windows):
if not windows[i].startswith("."):
windows[i] = title + ". " + windows[i]
else:
windows[i] = title + windows[i]
return windows
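# Worked example for create_sliding_window: with the defaults size=200 and overlap=64 the
# stride is 200 - 64 = 136 words, so a 500-word tokenized text yields ceil(500/136) = 4
# windows starting at words 0, 136, 272 and 408, each up to 200 words long; every window
# after the first gets the first sentence (the article title) prepended for context.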
def build_short_corpus(f, corpus_file, law_dict, head=False, size=200, overlap=64):
"""
Build a corpus-dataframe
"""
ids = []
law_ids = []
text_ids = []
article_ids = []
titles = []
texts = []
processed_texts = []
sub_ids = []
tokenized_texts = []
bm25texts = []
lengths = []
    with open(corpus_file, "r", encoding="utf-8") as fin:
        data = json.load(fin)
idx = 0
sub_idx = 0
for law in data:
for article in law['articles']:
text_id = law['law_id'] + "_" + article['article_id']
title = preprocess_article_title(article["title"])
                law_title = preprocess_all_title(law_dict[law['law_id']])  # law-level heading (kept separate from the head flag)
cac_khoan = article["text"].split("\n")
khoan_clean = []
for khoan in cac_khoan:
khoan = preprocess_khoan(khoan)
khoan_clean.append(khoan.strip())
article_text = " ".join(khoan_clean)
                if head:
                    processed_text = law_title + ". " + title + ". " + article_text
                else:
                    processed_text = title + ". " + article_text + ". " + law_title + "."
try:
tokenized_text = tokenise(processed_text, f)
tokenized_len = length(tokenized_text)
if tokenized_len <= size + 10:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(tokenized_text)
lengths.append(tokenized_len)
bm25texts.append(bm25_process(processed_text, f))
sub_idx +=1
else:
windows = create_sliding_window(tokenized_text, size=224, overlap=64)
for window in windows:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(window)
lengths.append(length(window))
bm25texts.append(bm25_process(window, f))
sub_idx +=1
                except Exception:  # the tokenizer failed on an extremely long text; chunk by characters first
                    actual_size = 50000 - overlap
                    big_windows = []
                    n_big_windows = math.ceil(len(processed_text)/actual_size)
                    for i in range(n_big_windows):
                        # character-level chunks of at most 50000 characters, overlapping by `overlap` characters
                        big_windows.append(processed_text[i*actual_size:i*actual_size + 50000])
for big_window in big_windows:
tokenized_text = tokenise(big_window, f)
tokenized_len = length(tokenized_text)
if tokenized_len > size + 10:
windows = create_sliding_window(tokenized_text, size=224, overlap=64)
for window in windows:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(window)
lengths.append(length(window))
bm25texts.append(bm25_process(window, f))
sub_idx +=1
else:
ids.append(idx)
law_ids.append(law['law_id'])
article_ids.append(article['article_id'])
text_ids.append(text_id)
titles.append(article['title'])
texts.append(article['text'])
processed_texts.append(processed_text)
sub_ids.append(sub_idx)
tokenized_texts.append(tokenized_text)
lengths.append(tokenized_len)
bm25texts.append(bm25_process(processed_text, f))
sub_idx +=1
idx += 1
df = pd.DataFrame()
df["id"] = ids
df["law_id"] = law_ids
df["article_id"] = article_ids
df["text_id"] = text_ids
df["title"] = titles
df["text"] = texts
df["processed_text"] = processed_texts
df["sub_id"] = sub_ids
df["tokenized_text"] = tokenized_texts
df["bm25text"] = bm25texts
df["len"] = lengths
return df
def build_qa(f, df, qa_file, split=False):
"""
Build a question-answer dataframe
"""
text_ids = df["text_id"].tolist()
titles = df["title"].tolist()
texts = df["text"].tolist()
lengths = df["len"].tolist()
sub_ids = df["sub_id"].tolist()
q_texts = []
q_processed_texts = []
q_tokenized_texts = []
q_bm25texts = []
q_lens = []
no_ans = []
ans_ids = []
ans_text_ids = []
ans_titles = []
ans_texts = []
ans_lens = []
ans_sub_ids = []
    with open(qa_file, "r", encoding="utf-8") as fin:
        data = json.load(fin)
if not split:
for item in data['items']:
question = item["question"]
q_texts.append(question)
q_processed_text = preprocess_question(question, remove_end_phrase=False)
q_processed_texts.append(q_processed_text)
q_tokenized_text = tokenise(q_processed_text, f)
q_tokenized_texts.append(q_tokenized_text)
q_bm25texts.append(bm25_process(q_processed_text, f))
q_lens.append(length(q_tokenized_text))
ans_text_id = ""
ans_id = ""
ans_title = ""
ans_text = ""
ans_len = ""
ans_count = 0
ans_sub_id = []
for i in range(len(item['relevant_articles'])):
ans_count += 1
atext_id = item['relevant_articles'][i]['law_id'] + "_" + item['relevant_articles'][i]['article_id']
a_id = text_ids.index(atext_id)
ans_text_id += atext_id
ans_id += str(a_id)
ans_title += titles[a_id]
ans_text += texts[a_id]
ans_len += str(lengths[a_id])
sub_id = sub_ids[a_id]
ans_sub_id += sub_id
if i < len(item["relevant_articles"]) - 1:
ans_text_id += ", "
ans_id += ", "
ans_title += ", "
ans_text += ", "
ans_len += ", "
no_ans.append(ans_count)
ans_text_ids.append(ans_text_id)
ans_ids.append(ans_id)
ans_titles.append(ans_title)
ans_texts.append(ans_text)
ans_lens.append(ans_len)
ans_sub_ids.append(ans_sub_id)
else:
for item in data['items']:
question = item["question"]
for article in item['relevant_articles']:
q_texts.append(question)
q_processed_text = preprocess_question(question, remove_end_phrase=False)
q_processed_texts.append(q_processed_text)
q_tokenized_text = tokenise(q_processed_text, f)
q_tokenized_texts.append(q_tokenized_text)
q_bm25texts.append(bm25_process(q_processed_text, f))
q_lens.append(length(q_tokenized_text))
ans_text_id = article['law_id'] + "_" + article['article_id']
ans_text_ids.append(ans_text_id)
a_id = text_ids.index(ans_text_id)
ans_ids.append(a_id)
ans_titles.append(titles[a_id])
ans_texts.append(texts[a_id])
ans_lens.append(lengths[a_id])
ans_sub_ids.append(sub_ids[a_id])
df = pd.DataFrame()
df["question"] = q_texts
df["processed_question"] = q_processed_texts
df["tokenized_question"] = q_tokenized_texts
df["bm25_question"] = q_bm25texts
df["ques_len"] = q_lens
if not split:
df['no_ans'] = no_ans
df["ans_text_id"] = ans_text_ids
df["ans_id"] = ans_ids
df["ans_title"] = ans_titles
df["ans_text"] = ans_texts
df["ans_len"] = ans_lens
df["ans_sub_id"] = ans_sub_ids
return df
def build_biencoder_data(dqa_split, bm25, set_ques, no_hneg, no_search):
"""
Build train, val, test, dataframe used for biencoder training
"""
qa_ids = []
neg_ids = []
search_ids = []
q_texts = dqa_split['question'].tolist()
q_bm25texts = dqa_split['bm25_question'].tolist()
count = 0
ans_ids = dqa_split['ans_id'].tolist()
ids = [i for i in range(bm25.corpus_size)]
for i in range(len(q_texts)):
if q_texts[i] in set_ques:
qa_ids.append(i)
q_bm25 = q_bm25texts[i].split(" ")
bm25_ids = bm25.get_top_n(q_bm25, ids, n=no_search)
if ans_ids[i] in bm25_ids:
count += 1
neg = bm25_ids[:(no_hneg+1)]
if ans_ids[i] in neg:
neg.remove(ans_ids[i])
neg = neg[:no_hneg]
neg_ids.append(neg)
search_ids.append(bm25_ids)
    print(count/len(qa_ids))  # fraction of rows whose gold article appears in the BM25 top-no_search
    df = dqa_split.loc[qa_ids].copy()
    df['neg_ids'] = neg_ids
    df['search_ids'] = search_ids
return df
def build_short_data(df, dcorpus, limited_length=234):
    """
    Keep only QA rows whose answer article is at most limited_length tokens long
    """
    ids = [i for i in range(len(df)) if dcorpus['len'][df['ans_id'][i]] <= limited_length]
    dshort = df.loc[ids].copy(deep=True).reset_index(drop=True)
return dshort
def build_general_data(dqa, bm25, set_ques, no_hneg, no_search):
"""
Build general train, test, val dataframe
"""
qa_ids = []
neg_ids = []
search_ids = []
q_texts = dqa['question'].tolist()
q_bm25texts = dqa['bm25_question'].tolist()
ans_ids = dqa['ans_id'].tolist()
ids = [i for i in range(bm25.corpus_size)]
count = 0
for i in range(len(q_texts)):
if q_texts[i] in set_ques:
qa_ids.append(i)
q_bm25 = q_bm25texts[i].split(" ")
ans_id = [int(x) for x in ans_ids[i].split(", ")]
            bm25_ids = bm25.get_top_n(q_bm25, ids, n=no_search)
search_ids.append(bm25_ids)
for a_id in ans_id:
if a_id in bm25_ids:
bm25_ids.remove(a_id)
neg_id = bm25_ids[:no_hneg]
neg_ids.append(neg_id)
if len(bm25_ids) == (no_search - len(ans_id)):
count += 1
    df = dqa.loc[qa_ids].copy()
    df['neg_ids'] = neg_ids
    df['search_ids'] = search_ids
    print(count/len(qa_ids))  # fraction of questions whose gold articles all appear in the BM25 top-no_search
return df
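# Hedged end-to-end sketch of how these builders might be wired together. Assumptions not
# fixed by this file: the JSON paths and the law-title mapping are hypothetical, the
# tokenizer is pyvi's ViTokenizer.tokenize, the BM25 object is rank_bm25.BM25Okapi (whose
# get_top_n/corpus_size match the attributes used above), and no_hneg/no_search are
# illustrative values.
if __name__ == "__main__":
    from pyvi import ViTokenizer
    from rank_bm25 import BM25Okapi

    # law_id -> full law title (assumed layout of a hypothetical file)
    with open("law_titles.json", "r", encoding="utf-8") as fin:
        law_titles = json.load(fin)

    # Build the windowed corpus first: its "id" column maps every window back to its
    # parent article and is what build_corpus expects as scorpus_ids.
    dshort = build_short_corpus(ViTokenizer.tokenize, "legal_corpus.json", law_titles, head=True)
    dcorpus = build_corpus(ViTokenizer.tokenize, "legal_corpus.json", law_titles,
                           dshort["id"].tolist(), head=True)

    # BM25 index over the article-level bm25text column, then QA pairs (one gold article
    # per row) and bi-encoder training data with BM25 hard negatives.
    bm25 = BM25Okapi([t.split(" ") for t in dcorpus["bm25text"]])
    dqa = build_qa(ViTokenizer.tokenize, dcorpus, "train_qa.json", split=True)
    dtrain = build_biencoder_data(dqa, bm25, set(dqa["question"]), no_hneg=7, no_search=100)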