Spaces:

raminass
/

SCOTUS

Sleeping

App Files Files Community

raminass committed on Sep 29, 2023

Commit

81d4aee

1 Parent(s): ba25d86

Upload 4 files

Browse files

Files changed (4) hide show

utils/__init__.py +17 -0
utils/cleaning.py +166 -0
utils/id2label.json +15 -0
utils/label2id.json +16 -0

utils/__init__.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from .cleaning import remove_citations, split_data, split_text, chunk_data
+from IPython.display import display, HTML
+import pandas as pd
+import numpy as np
+import json
+with open('utils/id2label.json', 'r') as j:
+     id2label = json.loads(j.read())
+with open('utils/label2id.json', 'r') as j:
+     label2id = json.loads(j.read())
+def find_case_by_name(df, name):
+  return display(HTML(df[df['case_name'].str.contains(name)].iloc[:,:-1].to_html(render_links=True, escape=False)))
+def head_df(df):
+  return display(HTML(df.iloc[:,:-1].head().to_html(render_links=True, escape=False)))

utils/cleaning.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import subprocess
+import sys
+import re
+import pandas as pd
+try:
+    import eyecite
+except ImportError:
+    subprocess.check_call([sys.executable, "-m", "pip", "install", 'eyecite'])
+finally:
+    from eyecite import find, clean
+# @title
+def full_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.year:
+      pattern = r'\([^)]*{}\)'.format(citation.metadata.year)  # Matches any word that ends with "year"
+      text = re.sub(pattern, '', text)
+    if citation.metadata.pin_cite:
+      text = text.replace(citation.metadata.pin_cite, "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    if citation.metadata.plaintiff:
+      text = text.replace(f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", "")
+    publisher_date = " ".join(i for i in (citation.metadata.court, citation.metadata.year) if i)
+    if publisher_date:
+      text = text.replace(f"{publisher_date}", "")
+    if citation.metadata.extra:
+      text = text.replace(citation.metadata.extra, "")
+    return text
+def supra_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.pin_cite:
+      text = text.replace(citation.metadata.pin_cite, "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    if citation.metadata.antecedent_guess:
+      text = text.replace(citation.metadata.antecedent_guess, "")
+    return text
+def short_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    if citation.metadata.year:
+      pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
+    if citation.metadata.antecedent_guess:
+      text = text.replace(citation.metadata.antecedent_guess, "")
+    return text
+def id_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    if citation.metadata.pin_cite:
+      text = text.replace(citation.metadata.pin_cite, "")
+    return text
+def unknown_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    return text
+def full_law_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    return text
+def full_journal_case(citation, text):
+    text = text.replace(citation.matched_text(), "")
+    if citation.metadata.year:
+      pattern = r'\([^)]*{}\)'.format(citation.metadata.year)  # Matches any word that ends with "year"
+      text = re.sub(pattern, '', text)
+    if citation.metadata.pin_cite:
+      text = text.replace(citation.metadata.pin_cite, "")
+    if citation.metadata.parenthetical:
+      text = text.replace(f"({citation.metadata.parenthetical})", "")
+    return text
+def all_commas(text: str) -> str:
+    return re.sub(r"\,+", ",", text)
+def all_dots(text: str) -> str:
+    return re.sub(r"\.+", ".", text)
+functions_dict = {
+    'FullCaseCitation': full_case,
+    'SupraCitation': supra_case,
+    'ShortCaseCitation': short_case,
+    'IdCitation': id_case,
+    'UnknownCitation': unknown_case,
+    'FullLawCitation': full_law_case,
+    'FullJournalCitation': full_journal_case,
+}
+# @title
+def remove_citations(input_text):
+  #clean text
+  plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])
+  #remove citations
+  found_citations = find.get_citations(plain_text)
+  for citation in found_citations:
+    plain_text = functions_dict[citation.__class__.__name__](citation, plain_text)
+  #clean text
+  plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'underscores','all_whitespace', all_commas, all_dots])
+  plain_text = clean.clean_text(plain_text, ['inline_whitespace','all_whitespace'])
+  pattern = r"\*?\d*\s*I+\n"
+  plain_text = re.sub(pattern, '', plain_text)
+  pattern = r"\s[,.]"
+  plain_text = re.sub(pattern, '', plain_text)
+  return plain_text
+def split_text(text):
+    words = text.split()
+    chunks = []
+    for i in range(0, len(words), 420):
+        chunks.append(' '.join(words[i:i+430]))
+    return chunks
+# @title
+def chunk_text_to_paragraphs(text):
+    paragraphs = text.split("\n")  # Split by empty line
+    # Remove leading and trailing whitespace from each paragraph
+    paragraphs = [p.strip() for p in paragraphs]
+    return paragraphs
+# @title
+def split_data(data, id2label, label2id):
+  data_dict = {'author_name': [],
+              'label': [],
+              'category': [],
+              'case_name': [],
+              'url': [],
+              'text': []}
+  opinions_split = pd.DataFrame(data_dict)
+  opinions_split['label'] = opinions_split['label'].astype(int)
+  for index, row in data.iterrows():
+      # chunks = chunk_text_to_paragraphs(row['text'])
+      chunks = split_text(row['clean_text'])
+      for chunk in chunks:
+        if len(chunk)<1000:
+          continue
+        tmp = pd.DataFrame({'author_name': row['author_name'],'label': [label2id[row['author_name']]],
+                              'category': row['category'],'case_name': row['case_name'],
+                              'url': [row['absolute_url']], 'text': [chunk]})
+        opinions_split = pd.concat([opinions_split, tmp])
+  return opinions_split
+def chunk_data(data):
+  data_dict = {'text': []}
+  opinions_split = pd.DataFrame(data_dict)
+  chunks = split_text(data)
+  for chunk in chunks:
+    if len(chunk)<1000:
+      continue
+    tmp = pd.DataFrame({'label': [200],'text': [chunk]})
+    opinions_split = pd.concat([opinions_split, tmp])
+  return opinions_split

utils/id2label.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+   "0":"Justice Breyer",
+   "1":"Justice Ginsburg",
+   "2":"Justice Kennedy",
+   "3":"Justice O'Connor",
+   "4":"Justice Rehnquist",
+   "5":"Justice Scalia",
+   "6":"Justice Souter",
+   "7":"Justice Stevens",
+   "8":"Justice Thomas",
+   "9":"Justice Kagan",
+   "10":"Justice Alito",
+   "11":"Justice Sotomayor",
+   "12":"Justice Roberts"
+}

utils/label2id.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+   "Justice Breyer":0,
+   "Justice Ginsburg":1,
+   "Justice Kennedy":2,
+   "Justice O'Connor":3,
+   "Justice Rehnquist":4,
+   "Justice Scalia":5,
+   "Justice Souter":6,
+   "Justice Stevens":7,
+   "Justice Thomas":8,
+   "Justice Kagan":9,
+   "Justice Alito":10,
+   "Justice Sotomayor":11,
+   "Justice Roberts":12,
+   "per_curiam":100
+}