import re
import subprocess
import sys

import pandas as pd

# Install eyecite on the fly if it is not already available, then import the
# helpers used below.
try:
    import eyecite
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
finally:
    from eyecite import find, clean


def full_case(citation, text):
    """Strip a full case citation and its surrounding metadata from the text."""
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.year:
        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
        text = re.sub(pattern, '', text)
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.plaintiff:
        text = text.replace(f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", "")
    publisher_date = " ".join(i for i in (citation.metadata.court, citation.metadata.year) if i)
    if publisher_date:
        text = text.replace(publisher_date, "")
    if citation.metadata.extra:
        text = text.replace(citation.metadata.extra, "")
    return text


def supra_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text


def short_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.year:
        # Remove the parenthetical containing the year, e.g. "(9th Cir. 2005)".
        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
        text = re.sub(pattern, '', text)
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text


def id_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    return text


def unknown_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text


def full_law_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text


def full_journal_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.year:
        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
        text = re.sub(pattern, '', text)
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text


# Custom cleanup steps passed to eyecite's clean_text below: collapse runs of
# repeated commas or periods left behind once citations are removed.
def all_commas(text: str) -> str:
    return re.sub(r",+", ",", text)


def all_dots(text: str) -> str:
    return re.sub(r"\.+", ".", text)


# Map each eyecite citation class name to the handler that strips it.
functions_dict = {
    'FullCaseCitation': full_case,
    'SupraCitation': supra_case,
    'ShortCaseCitation': short_case,
    'IdCitation': id_case,
    'UnknownCitation': unknown_case,
    'FullLawCitation': full_law_case,
    'FullJournalCitation': full_journal_case,
}


def remove_citations(input_text):
    # Normalize the raw opinion text, find every citation, and dispatch each
    # one to the handler for its citation type; any class not listed in
    # functions_dict falls back to the generic unknown_case handler.
    plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])
    found_citations = find.get_citations(plain_text)
    for citation in found_citations:
        handler = functions_dict.get(citation.__class__.__name__, unknown_case)
        plain_text = handler(citation, plain_text)

    # Collapse the whitespace and repeated punctuation left behind.
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'underscores', 'all_whitespace', all_commas, all_dots])
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'all_whitespace'])
    # Drop leftover section markers such as "*2 I".
    pattern = r"\*?\d*\s*I+\n"
    plain_text = re.sub(pattern, '', plain_text)
    # Remove whitespace-plus-punctuation leftovers such as " ," and " .".
    pattern = r"\s[,.]"
    plain_text = re.sub(pattern, '', plain_text)
    return plain_text
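

# A minimal usage sketch (not part of the original module): the sentence below
# is invented for illustration, and the exact output depends on how eyecite
# parses the citation.
def _demo_remove_citations():
    raw = (
        "That approach was rejected in Lujan v. Defenders of Wildlife, "
        "504 U.S. 555, 560 (1992) (discussing standing), and later reaffirmed."
    )
    return remove_citations(raw)

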
def split_text(text):
    # Split the text into consecutive chunks of 420 words each.
    chunk_size = 420
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(' '.join(words[i:i + chunk_size]))
    return chunks
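

# A quick sketch of the chunking behaviour (illustrative only): 1,000 words
# come back as chunks of 420, 420 and 160 words.
def _demo_split_text():
    sample = "word " * 1000
    return [len(chunk.split()) for chunk in split_text(sample)]

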
def chunk_text_to_paragraphs(text):
    paragraphs = text.split("\n")
    paragraphs = [p.strip() for p in paragraphs]
    return paragraphs


def split_data(data, id2label, label2id):
    # Build one row per text chunk, carrying over the opinion's metadata and
    # the integer label that corresponds to its author.
    data_dict = {'author_name': [],
                 'label': [],
                 'category': [],
                 'case_name': [],
                 'url': [],
                 'text': []}
    opinions_split = pd.DataFrame(data_dict)
    opinions_split['label'] = opinions_split['label'].astype(int)
    for index, row in data.iterrows():
        chunks = split_text(row['clean_text'])
        for chunk in chunks:
            # Skip fragments that are too short to be useful examples.
            if len(chunk) < 1000:
                continue
            tmp = pd.DataFrame({'author_name': row['author_name'],
                                'label': [label2id[row['author_name']]],
                                'category': row['category'],
                                'case_name': row['case_name'],
                                'url': [row['absolute_url']],
                                'text': [chunk]})
            opinions_split = pd.concat([opinions_split, tmp])
    return opinions_split
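

# A minimal sketch of how split_data might be called; the author name, label
# maps and one-row DataFrame below are made up for illustration.
def _demo_split_data():
    authors = ["author_a"]
    id2label = dict(enumerate(authors))
    label2id = {name: idx for idx, name in id2label.items()}
    df = pd.DataFrame({
        "author_name": authors,
        "category": ["majority"],
        "case_name": ["Example v. Example"],
        "absolute_url": ["/opinion/0/example/"],
        "clean_text": ["lorem ipsum dolor sit amet " * 200],
    })
    return split_data(df, id2label, label2id)

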
def chunk_data(data):
    # Chunk a single piece of text; every kept chunk gets the constant label 200.
    data_dict = {'label': [], 'text': []}
    opinions_split = pd.DataFrame(data_dict)
    chunks = split_text(data)
    for chunk in chunks:
        if len(chunk) < 1000:
            continue
        tmp = pd.DataFrame({'label': [200], 'text': [chunk]})
        opinions_split = pd.concat([opinions_split, tmp])
    return opinions_split
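

# Illustrative only: run the whole pipeline on a single made-up opinion text,
# removing citations first and then chunking the cleaned result.
def _demo_chunk_pipeline():
    raw = (
        "See Lujan v. Defenders of Wildlife, 504 U.S. 555, 560 (1992) "
        "(discussing standing). " + "lorem ipsum dolor sit amet " * 200
    )
    cleaned = remove_citations(raw)
    return chunk_data(cleaned)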