"""Strip legal citations from court-opinion text (via eyecite) and chunk
the cleaned text into fixed-size word windows for downstream modeling."""

import re
import subprocess
import sys

import pandas as pd

# Bootstrap eyecite on first run (e.g. a fresh notebook runtime), then import
# the pieces we actually use. The `finally` runs in both branches.
try:
    import eyecite  # noqa: F401
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "eyecite"])
finally:
    from eyecite import find, clean


# --- shared metadata-stripping helpers -------------------------------------
# Each helper removes one optional metadata component of a citation from the
# surrounding text and returns the (possibly unchanged) text.


def _strip_parenthetical(citation, text):
    # Remove an explanatory parenthetical, e.g. "(holding that ...)".
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text


def _strip_pin_cite(citation, text):
    # Remove the pin cite (specific page reference) if present.
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    return text


def _strip_year_parenthetical(citation, text):
    # Remove any parenthetical that ends with the citation year,
    # e.g. "(9th Cir. 2004)".
    if citation.metadata.year:
        pattern = r"\([^)]*{}\)".format(citation.metadata.year)
        text = re.sub(pattern, "", text)
    return text


def _strip_antecedent(citation, text):
    # Remove the guessed antecedent case name, e.g. "Smith" in "Smith, supra".
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text


# --- per-citation-type handlers --------------------------------------------
# Each handler removes the citation's matched text plus whatever metadata
# that citation type carries. Signatures: (citation, text) -> text.


def full_case(citation, text):
    """Remove a full case citation and all of its metadata from *text*."""
    text = text.replace(citation.matched_text(), "")
    text = _strip_year_parenthetical(citation, text)
    text = _strip_pin_cite(citation, text)
    text = _strip_parenthetical(citation, text)
    if citation.metadata.plaintiff:
        text = text.replace(
            f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", ""
        )
    # Court and year may also appear together outside the matched span.
    publisher_date = " ".join(
        i for i in (citation.metadata.court, citation.metadata.year) if i
    )
    if publisher_date:
        text = text.replace(publisher_date, "")
    if citation.metadata.extra:
        text = text.replace(citation.metadata.extra, "")
    return text


def supra_case(citation, text):
    """Remove a "supra" citation and its metadata from *text*."""
    text = text.replace(citation.matched_text(), "")
    text = _strip_pin_cite(citation, text)
    text = _strip_parenthetical(citation, text)
    return _strip_antecedent(citation, text)


def short_case(citation, text):
    """Remove a short-form case citation and its metadata from *text*."""
    text = text.replace(citation.matched_text(), "")
    text = _strip_parenthetical(citation, text)
    # Bug fix: the original built this year pattern but never applied it.
    text = _strip_year_parenthetical(citation, text)
    return _strip_antecedent(citation, text)


def id_case(citation, text):
    """Remove an "Id." citation and its metadata from *text*."""
    text = text.replace(citation.matched_text(), "")
    text = _strip_parenthetical(citation, text)
    return _strip_pin_cite(citation, text)


def unknown_case(citation, text):
    """Remove an unrecognized citation (matched text + parenthetical)."""
    text = text.replace(citation.matched_text(), "")
    return _strip_parenthetical(citation, text)


def full_law_case(citation, text):
    """Remove a statute/law citation (matched text + parenthetical)."""
    text = text.replace(citation.matched_text(), "")
    return _strip_parenthetical(citation, text)


def full_journal_case(citation, text):
    """Remove a journal-article citation and its metadata from *text*."""
    text = text.replace(citation.matched_text(), "")
    text = _strip_year_parenthetical(citation, text)
    text = _strip_pin_cite(citation, text)
    return _strip_parenthetical(citation, text)


def all_commas(text: str) -> str:
    """Collapse runs of commas left behind by citation removal."""
    return re.sub(r",+", ",", text)


def all_dots(text: str) -> str:
    """Collapse runs of periods left behind by citation removal."""
    return re.sub(r"\.+", ".", text)


# Dispatch table: eyecite citation class name -> handler above.
functions_dict = {
    "FullCaseCitation": full_case,
    "SupraCitation": supra_case,
    "ShortCaseCitation": short_case,
    "IdCitation": id_case,
    "UnknownCitation": unknown_case,
    "FullLawCitation": full_law_case,
    "FullJournalCitation": full_journal_case,
}


def remove_citations(input_text):
    """Return *input_text* with all legal citations stripped and the
    remaining whitespace/punctuation artifacts cleaned up."""
    # Normalize the raw text before citation detection.
    plain_text = clean.clean_text(
        input_text, ["html", "inline_whitespace", "underscores"]
    )
    # Remove every detected citation with the handler for its type.
    # Unlisted citation classes fall back to the generic handler instead
    # of raising KeyError.
    for citation in find.get_citations(plain_text):
        handler = functions_dict.get(type(citation).__name__, unknown_case)
        plain_text = handler(citation, plain_text)
    # Clean up the holes the removals left behind.
    plain_text = clean.clean_text(
        plain_text,
        ["inline_whitespace", "underscores", "all_whitespace", all_commas, all_dots],
    )
    plain_text = clean.clean_text(plain_text, ["inline_whitespace", "all_whitespace"])
    # Drop section markers like "*123 II\n" left over from pagination.
    plain_text = re.sub(r"\*?\d*\s*I+\n", "", plain_text)
    # Drop orphaned punctuation preceded by whitespace (", ." artifacts).
    plain_text = re.sub(r"\s[,.]", "", plain_text)
    return plain_text


def split_text(text, chunk_size=420):
    """Split *text* into consecutive, non-overlapping chunks of at most
    *chunk_size* whitespace-separated words.

    Bug fix: the original stepped by 420 words but sliced 430, so
    consecutive chunks shared 10 words; step and width now agree.
    """
    words = text.split()
    return [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]


# @title
def chunk_text_to_paragraphs(text):
    """Split *text* on newlines and strip each resulting paragraph."""
    paragraphs = text.split("\n")
    return [p.strip() for p in paragraphs]


# @title
def split_data(data, id2label, label2id):
    """Explode each row of *data* into one row per text chunk.

    Chunks shorter than 1000 characters are dropped. ``id2label`` is
    accepted for signature compatibility but not used here.
    Returns a DataFrame with columns author_name, label, category,
    case_name, url, text.
    """
    data_dict = {
        "author_name": [],
        "label": [],
        "category": [],
        "case_name": [],
        "url": [],
        "text": [],
    }
    seed = pd.DataFrame(data_dict)
    seed["label"] = seed["label"].astype(int)
    # Collect per-chunk frames and concatenate once at the end —
    # pd.concat inside the loop is quadratic.
    frames = [seed]
    for _, row in data.iterrows():
        for chunk in split_text(row["clean_text"]):
            if len(chunk) < 1000:
                continue  # skip fragments too short to be informative
            frames.append(
                pd.DataFrame(
                    {
                        "author_name": row["author_name"],
                        "label": [label2id[row["author_name"]]],
                        "category": row["category"],
                        "case_name": row["case_name"],
                        "url": [row["absolute_url"]],
                        "text": [chunk],
                    }
                )
            )
    return pd.concat(frames)


def chunk_data(data):
    """Chunk a raw text string into a DataFrame of (label, text) rows.

    Every chunk gets the sentinel label 200 (unknown author); no minimum
    length filter is applied.
    """
    frames = [pd.DataFrame({"text": []})]
    for chunk in split_text(data):
        frames.append(pd.DataFrame({"label": [200], "text": [chunk]}))
    return pd.concat(frames)