Upload 4 files

- utils/__init__.py +17 -0
- utils/cleaning.py +166 -0
- utils/id2label.json +15 -0
- utils/label2id.json +16 -0
utils/__init__.py
ADDED

from .cleaning import remove_citations, split_data, split_text, chunk_data
from IPython.display import display, HTML
import pandas as pd
import numpy as np
import json

# Load the id<->label mappings that ship alongside this package.
with open('utils/id2label.json', 'r') as j:
    id2label = json.loads(j.read())

with open('utils/label2id.json', 'r') as j:
    label2id = json.loads(j.read())

# Display every row whose case_name contains `name` as an HTML table,
# dropping the last column from the rendered output.
def find_case_by_name(df, name):
    return display(HTML(df[df['case_name'].str.contains(name)].iloc[:, :-1].to_html(render_links=True, escape=False)))

# Display the first five rows as an HTML table, again dropping the last column.
def head_df(df):
    return display(HTML(df.iloc[:, :-1].head().to_html(render_links=True, escape=False)))
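A minimal usage sketch (the DataFrame below is invented for illustration; the helpers only assume a case_name column plus a trailing column that is dropped from the display):

import pandas as pd
from utils import find_case_by_name, head_df

# Hypothetical frame shaped like the opinion data these helpers expect.
df = pd.DataFrame({
    'case_name': ['Miranda v. Arizona', 'Roe v. Wade'],
    'url': ['https://example.org/miranda', 'https://example.org/roe'],
    'text': ['...', '...'],
})

head_df(df)                       # first rows rendered as an HTML table
find_case_by_name(df, 'Miranda')  # rows whose case_name contains 'Miranda'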
utils/cleaning.py
ADDED

import subprocess
import sys
import re
import pandas as pd

# Install eyecite on the fly if it is not already available.
try:
    import eyecite
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", 'eyecite'])
finally:
    from eyecite import find, clean

def full_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.year:
        # Remove any parenthetical that ends with the citation year, e.g. "(5th Cir. 1994)".
        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
        text = re.sub(pattern, '', text)
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.plaintiff:
        text = text.replace(f"{citation.metadata.plaintiff} v. {citation.metadata.defendant}", "")
    publisher_date = " ".join(i for i in (citation.metadata.court, citation.metadata.year) if i)
    if publisher_date:
        text = text.replace(f"{publisher_date}", "")
    if citation.metadata.extra:
        text = text.replace(citation.metadata.extra, "")
    return text

def supra_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text

def short_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.year:
        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
        text = re.sub(pattern, '', text)  # was missing: the pattern was built but never applied
    if citation.metadata.antecedent_guess:
        text = text.replace(citation.metadata.antecedent_guess, "")
    return text

def id_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    return text

def unknown_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text

def full_law_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text

def full_journal_case(citation, text):
    text = text.replace(citation.matched_text(), "")
    if citation.metadata.year:
        # Remove any parenthetical that ends with the citation year.
        pattern = r'\([^)]*{}\)'.format(citation.metadata.year)
        text = re.sub(pattern, '', text)
    if citation.metadata.pin_cite:
        text = text.replace(citation.metadata.pin_cite, "")
    if citation.metadata.parenthetical:
        text = text.replace(f"({citation.metadata.parenthetical})", "")
    return text

# Collapse runs of commas/periods left behind after citation removal.
def all_commas(text: str) -> str:
    return re.sub(r"\,+", ",", text)

def all_dots(text: str) -> str:
    return re.sub(r"\.+", ".", text)

# Dispatch table: eyecite citation class name -> handler above.
functions_dict = {
    'FullCaseCitation': full_case,
    'SupraCitation': supra_case,
    'ShortCaseCitation': short_case,
    'IdCitation': id_case,
    'UnknownCitation': unknown_case,
    'FullLawCitation': full_law_case,
    'FullJournalCitation': full_journal_case,
}

def remove_citations(input_text):
    # Normalize the raw text.
    plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])
    # Find every citation and strip it with the handler for its class.
    found_citations = find.get_citations(plain_text)
    for citation in found_citations:
        plain_text = functions_dict[citation.__class__.__name__](citation, plain_text)
    # Clean up the whitespace and punctuation left behind.
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'underscores', 'all_whitespace', all_commas, all_dots])
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'all_whitespace'])
    pattern = r"\*?\d*\s*I+\n"  # roman-numeral section markers such as "*123 II"
    plain_text = re.sub(pattern, '', plain_text)
    pattern = r"\s[,.]"  # a space followed by a comma or period (both are dropped)
    plain_text = re.sub(pattern, '', plain_text)
    return plain_text
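A quick sketch of remove_citations in action (the snippet is invented; any text eyecite can parse works the same way):

# Example: the citation span, pin cite, year parenthetical, and
# "plaintiff v. defendant" string are each stripped in turn.
raw = "As we held in Miranda v. Arizona, 384 U.S. 436, 444 (1966), the rule applies."
print(remove_citations(raw))
# -> roughly "As we held in the rule applies." after punctuation cleanup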
def split_text(text):
    # Windows of up to 430 words taken every 420 words, so consecutive
    # chunks overlap by about 10 words.
    words = text.split()
    chunks = []
    for i in range(0, len(words), 420):
        chunks.append(' '.join(words[i:i+430]))
    return chunks


def chunk_text_to_paragraphs(text):
    paragraphs = text.split("\n")  # split on newlines

    # Remove leading and trailing whitespace from each paragraph
    paragraphs = [p.strip() for p in paragraphs]

    return paragraphs

def split_data(data, id2label, label2id):

    data_dict = {'author_name': [],
                 'label': [],
                 'category': [],
                 'case_name': [],
                 'url': [],
                 'text': []}
    opinions_split = pd.DataFrame(data_dict)
    opinions_split['label'] = opinions_split['label'].astype(int)
    for index, row in data.iterrows():
        # chunks = chunk_text_to_paragraphs(row['text'])
        chunks = split_text(row['clean_text'])
        for chunk in chunks:
            if len(chunk) < 1000:  # skip fragments shorter than 1,000 characters
                continue
            tmp = pd.DataFrame({'author_name': row['author_name'], 'label': [label2id[row['author_name']]],
                                'category': row['category'], 'case_name': row['case_name'],
                                'url': [row['absolute_url']], 'text': [chunk]})
            opinions_split = pd.concat([opinions_split, tmp])
    return opinions_split

def chunk_data(data):
    # Same chunking as split_data, but for a single unlabeled text;
    # 200 serves as a placeholder label.
    data_dict = {'label': [], 'text': []}
    opinions_split = pd.DataFrame(data_dict)
    chunks = split_text(data)
    for chunk in chunks:
        if len(chunk) < 1000:
            continue
        tmp = pd.DataFrame({'label': [200], 'text': [chunk]})
        opinions_split = pd.concat([opinions_split, tmp])
    return opinions_split
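A short sketch of the chunking path (the input frame is invented, but its columns match what split_data reads above):

import pandas as pd
from utils.cleaning import split_data

# Hypothetical input: one row per opinion, already cleaned.
data = pd.DataFrame({
    'author_name': ['Justice Breyer'],
    'category': ['majority'],
    'case_name': ['Example v. Example'],
    'absolute_url': ['https://example.org/opinion'],
    'clean_text': ['word ' * 5000],  # long enough to yield several chunks
})

label2id = {'Justice Breyer': 0}
chunked = split_data(data, id2label={}, label2id=label2id)
print(chunked.shape)  # one row per chunk of >= 1,000 characters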
utils/id2label.json
ADDED

{
    "0": "Justice Breyer",
    "1": "Justice Ginsburg",
    "2": "Justice Kennedy",
    "3": "Justice O'Connor",
    "4": "Justice Rehnquist",
    "5": "Justice Scalia",
    "6": "Justice Souter",
    "7": "Justice Stevens",
    "8": "Justice Thomas",
    "9": "Justice Kagan",
    "10": "Justice Alito",
    "11": "Justice Sotomayor",
    "12": "Justice Roberts"
}
utils/label2id.json
ADDED

{
    "Justice Breyer": 0,
    "Justice Ginsburg": 1,
    "Justice Kennedy": 2,
    "Justice O'Connor": 3,
    "Justice Rehnquist": 4,
    "Justice Scalia": 5,
    "Justice Souter": 6,
    "Justice Stevens": 7,
    "Justice Thomas": 8,
    "Justice Kagan": 9,
    "Justice Alito": 10,
    "Justice Sotomayor": 11,
    "Justice Roberts": 12,
    "per_curiam": 100
}
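A small sanity check of the two mappings (note that label2id carries an extra per_curiam entry, 100, with no counterpart in id2label, and that JSON object keys load as strings):

import json

with open('utils/id2label.json') as f:
    id2label = json.load(f)
with open('utils/label2id.json') as f:
    label2id = json.load(f)

# Round-trip every justice; id2label keys are strings after json.load.
for label_id, name in id2label.items():
    assert label2id[name] == int(label_id)

print(label2id['per_curiam'])  # 100 -- present only in label2id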