British

Sleeping

File size: 5,916 Bytes

81d4aee

import subprocess
import sys
import re
import pandas as pd 

# Make sure the third-party ``eyecite`` package is importable. If the first
# import fails, it is installed with pip before the names used below are
# imported.
try:
    import eyecite
except ImportError:
    # Install through ``sys.executable -m pip`` so the package lands in the
    # environment of the interpreter that is running this module.
    subprocess.check_call([sys.executable, "-m", "pip", "install", 'eyecite'])
finally:
    # Runs on every path (package already present, or just installed).
    # NOTE(review): it also runs when the pip call above raised, in which
    # case this import fails with ImportError -- confirm that is acceptable.
    from eyecite import find, clean

# @title
def full_case(citation, text):
    """Strip a full case citation and its associated fragments from *text*.

    The citation's matched text is removed first, then any parenthesised
    group that ends with the citation year (for example ``(Cal. 1999)``),
    and finally every literal occurrence of the pin cite, the parenthetical,
    the ``plaintiff v. defendant`` caption, the ``court year`` string and the
    extra text, whenever the corresponding metadata field is set.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``year``, ``pin_cite``, ``parenthetical``,
            ``plaintiff``, ``defendant``, ``court`` and ``extra`` fields.
        text: The text to remove the citation from.

    Returns:
        The text with all of the fragments above removed.
    """
    meta = citation.metadata
    remaining = text.replace(citation.matched_text(), "")

    if meta.year:
        # Drops "( ... <year>)" groups: an opening parenthesis, any run of
        # characters other than ")", the year, and the closing parenthesis.
        remaining = re.sub(r'\([^)]*{}\)'.format(meta.year), '', remaining)

    court_and_year = " ".join(part for part in (meta.court, meta.year) if part)

    # Literal fragments, listed in the order in which they are removed.
    fragments = (
        meta.pin_cite,
        f"({meta.parenthetical})" if meta.parenthetical else None,
        f"{meta.plaintiff} v. {meta.defendant}" if meta.plaintiff else None,
        court_and_year,
        meta.extra,
    )
    for fragment in fragments:
        if fragment:
            remaining = remaining.replace(fragment, "")
    return remaining

def supra_case(citation, text):
    """Strip a supra citation and its associated fragments from *text*.

    Removes the citation's matched text, then every literal occurrence of
    the pin cite, the parenthetical (wrapped in parentheses) and the guessed
    antecedent, whenever the corresponding metadata field is set.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``pin_cite``, ``parenthetical`` and
            ``antecedent_guess`` fields.
        text: The text to remove the citation from.

    Returns:
        The text with the citation fragments removed.
    """
    meta = citation.metadata
    cleaned = text.replace(citation.matched_text(), "")

    # Literal fragments, listed in the order in which they are removed.
    removable = (
        meta.pin_cite,
        f"({meta.parenthetical})" if meta.parenthetical else None,
        meta.antecedent_guess,
    )
    for piece in removable:
        if piece:
            cleaned = cleaned.replace(piece, "")
    return cleaned

def short_case(citation, text):
    """Strip a short-form case citation and its fragments from *text*.

    Removes the citation's matched text, the parenthetical (wrapped in
    parentheses), any parenthesised group that ends with the citation year
    (for example ``(Cal. 1999)``) and the guessed antecedent, whenever the
    corresponding metadata field is set.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical``, ``year`` and
            ``antecedent_guess`` fields.
        text: The text to remove the citation from.

    Returns:
        The text with the citation fragments removed.
    """
    metadata = citation.metadata
    text = text.replace(citation.matched_text(), "")
    if metadata.parenthetical:
        text = text.replace(f"({metadata.parenthetical})", "")
    if metadata.year:
        # Actually apply the year pattern (it used to be built and then
        # discarded), matching what full_case and full_journal_case do.
        # The year is escaped so it is always treated as literal text.
        pattern = r'\([^)]*{}\)'.format(re.escape(str(metadata.year)))
        text = re.sub(pattern, '', text)
    if metadata.antecedent_guess:
        text = text.replace(metadata.antecedent_guess, "")
    return text

def id_case(citation, text):
    """Strip an "id." citation and its associated fragments from *text*.

    Removes the citation's matched text, then the parenthetical (wrapped in
    parentheses) and the pin cite, whenever the corresponding metadata field
    is set.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical`` and ``pin_cite`` fields.
        text: The text to remove the citation from.

    Returns:
        The text with the citation fragments removed.
    """
    meta = citation.metadata
    result = text.replace(citation.matched_text(), "")

    # Literal fragments, listed in the order in which they are removed.
    for fragment in (
        f"({meta.parenthetical})" if meta.parenthetical else None,
        meta.pin_cite,
    ):
        if fragment:
            result = result.replace(fragment, "")
    return result

def unknown_case(citation, text):
    """Strip an unrecognised citation and its parenthetical from *text*.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with a ``parenthetical`` field.
        text: The text to remove the citation from.

    Returns:
        The text without the matched citation text and, when a parenthetical
        is set, without its parenthesised form.
    """
    without_cite = text.replace(citation.matched_text(), "")
    note = citation.metadata.parenthetical
    if not note:
        return without_cite
    return without_cite.replace(f"({note})", "")

def full_law_case(citation, text):
    """Strip a full law citation and its parenthetical from *text*.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with a ``parenthetical`` field.
        text: The text to remove the citation from.

    Returns:
        The text without the matched citation text and, when a parenthetical
        is set, without its parenthesised form.
    """
    stripped = text.replace(citation.matched_text(), "")
    parenthetical = citation.metadata.parenthetical
    return stripped.replace(f"({parenthetical})", "") if parenthetical else stripped

def full_journal_case(citation, text):
    """Strip a full journal citation and its fragments from *text*.

    Removes the citation's matched text, any parenthesised group that ends
    with the citation year (for example ``(2001)``), the pin cite and the
    parenthetical (wrapped in parentheses), whenever the corresponding
    metadata field is set.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``year``, ``pin_cite`` and ``parenthetical``
            fields.
        text: The text to remove the citation from.

    Returns:
        The text with the citation fragments removed.
    """
    meta = citation.metadata
    output = text.replace(citation.matched_text(), "")

    if meta.year:
        # Drops "( ... <year>)" groups: an opening parenthesis, any run of
        # characters other than ")", the year, and the closing parenthesis.
        output = re.sub(r'\([^)]*{}\)'.format(meta.year), '', output)

    # Literal fragments, listed in the order in which they are removed.
    for fragment in (
        meta.pin_cite,
        f"({meta.parenthetical})" if meta.parenthetical else None,
    ):
        if fragment:
            output = output.replace(fragment, "")
    return output

def all_commas(text: str) -> str:
    """Collapse every run of consecutive commas in *text* into one comma."""
    comma_run = re.compile(r"\,+")
    return comma_run.sub(",", text)

def all_dots(text: str) -> str:
    """Collapse every run of consecutive periods in *text* into one period."""
    dot_run = re.compile(r"\.+")
    return dot_run.sub(".", text)

# Dispatch table used by remove_citations: maps the class name of a found
# citation object (``citation.__class__.__name__``) to the function that
# strips that kind of citation from the text. Every handler takes
# ``(citation, text)`` and returns the updated text.
functions_dict = {
    'FullCaseCitation': full_case,
    'SupraCitation': supra_case,
    'ShortCaseCitation': short_case,
    'IdCitation': id_case,
    'UnknownCitation': unknown_case,
    'FullLawCitation': full_law_case,
    'FullJournalCitation': full_journal_case,
}

# @title
def remove_citations(input_text):
    """Return *input_text* with legal citations and leftover noise removed.

    The text is cleaned with eyecite, every citation eyecite finds is
    stripped by the handler registered for its class in ``functions_dict``,
    and the result is cleaned again to tidy whitespace and punctuation left
    behind by the removals.

    Args:
        input_text: Raw text, possibly containing HTML markup.

    Returns:
        The cleaned text with citations removed.
    """
    # First pass: eyecite's 'html', 'inline_whitespace' and 'underscores'
    # cleaning steps, so citations are searched for in plain text.
    plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])

    # Strip every citation found in the cleaned text.
    for citation in find.get_citations(plain_text):
        handler = functions_dict.get(citation.__class__.__name__)
        if handler is None:
            # A citation class without a registered handler (e.g. one added
            # by a newer eyecite release) used to raise KeyError here; fall
            # back to removing just the text the citation matched.
            plain_text = plain_text.replace(citation.matched_text(), "")
        else:
            plain_text = handler(citation, plain_text)

    # Second pass: whitespace/underscore cleanup plus the local helpers that
    # collapse repeated commas and periods left behind by the removals.
    plain_text = clean.clean_text(
        plain_text,
        ['inline_whitespace', 'underscores', 'all_whitespace', all_commas, all_dots],
    )
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'all_whitespace'])

    # Drop an optional "*", optional digits, optional whitespace and a run of
    # capital "I" characters that ends at a newline.
    # NOTE(review): 'all_whitespace' above presumably collapses newlines as
    # well; if so this pattern can no longer match -- TODO confirm.
    plain_text = re.sub(r"\*?\d*\s*I+\n", '', plain_text)

    # Remove a whitespace character together with the comma or period that
    # directly follows it (both characters are deleted).
    plain_text = re.sub(r"\s[,.]", '', plain_text)
    return plain_text

def split_text(text, chunk_size=430, stride=420):
    """Split *text* into whitespace-delimited word chunks.

    A new chunk starts every ``stride`` words and holds up to ``chunk_size``
    words, joined with single spaces. With the defaults, consecutive chunks
    overlap by 10 words (430 - 420).

    Args:
        text: The text to split; words are separated on any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        stride: Number of words between the starts of consecutive chunks.
            Must be positive.

    Returns:
        A list of chunk strings, empty when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]


# @title
def chunk_text_to_paragraphs(text):
    """Split *text* on newline characters and strip each resulting piece.

    The split is on every single ``"\\n"``, so a blank line in the input
    yields an empty string in the output.

    Args:
        text: The text to split.

    Returns:
        A list with one whitespace-stripped string per line of *text*.
    """
    return list(map(str.strip, text.split("\n")))

# @title
def split_data(data, id2label, label2id):
    """Explode each row of *data* into one output row per text chunk.

    Each row's ``clean_text`` is split with ``split_text``; chunks shorter
    than 1000 characters are skipped. Every kept chunk becomes one output
    row carrying the source row's author, category, case name and URL.

    Args:
        data: DataFrame with ``clean_text``, ``author_name``, ``category``,
            ``case_name`` and ``absolute_url`` columns.
        id2label: Unused; kept so existing callers keep working.
        label2id: Mapping from author name to the integer label stored in
            the ``label`` column.

    Returns:
        A DataFrame with the columns ``author_name``, ``label``,
        ``category``, ``case_name``, ``url`` and ``text``. It has no rows
        when no chunk reaches the minimum length.
    """
    data_dict = {'author_name': [],
                 'label': [],
                 'category': [],
                 'case_name': [],
                 'url': [],
                 'text': []}
    opinions_split = pd.DataFrame(data_dict)
    opinions_split['label'] = opinions_split['label'].astype(int)

    # Collect the per-chunk frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated rows every time.
    frames = [opinions_split]
    for _, row in data.iterrows():
        for chunk in split_text(row['clean_text']):
            if len(chunk) < 1000:
                continue
            frames.append(pd.DataFrame({
                'author_name': row['author_name'],
                'label': [label2id[row['author_name']]],
                'category': row['category'],
                'case_name': row['case_name'],
                'url': [row['absolute_url']],
                'text': [chunk],
            }))

    if len(frames) == 1:
        # Nothing was kept: hand back the empty, correctly-typed frame.
        return opinions_split
    return pd.concat(frames)

def chunk_data(data):
    """Split the text *data* into chunks and return them as a DataFrame.

    The text is split with ``split_text``; chunks shorter than 1000
    characters are skipped. Every kept chunk becomes one row whose ``label``
    is the constant 200.

    Args:
        data: The text to chunk.

    Returns:
        A DataFrame with one row per kept chunk, holding ``text`` and
        ``label`` columns. When no chunk is kept, the result is an empty
        DataFrame with only a ``text`` column.
    """
    opinions_split = pd.DataFrame({'text': []})

    # Collect the per-chunk frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated rows every time.
    frames = [opinions_split]
    for chunk in split_text(data):
        if len(chunk) < 1000:
            continue
        # NOTE(review): 200 looks like a placeholder label; its meaning is
        # not visible in this file -- confirm against the caller.
        frames.append(pd.DataFrame({'label': [200], 'text': [chunk]}))

    if len(frames) == 1:
        # Nothing was kept: hand back the empty frame unchanged.
        return opinions_split
    return pd.concat(frames)