raminass committed on
Commit
81d4aee
1 Parent(s): ba25d86

Upload 4 files

Browse files
Files changed (4) hide show
  1. utils/__init__.py +17 -0
  2. utils/cleaning.py +166 -0
  3. utils/id2label.json +15 -0
  4. utils/label2id.json +16 -0
utils/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .cleaning import remove_citations, split_data, split_text, chunk_data
2
+ from IPython.display import display, HTML
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+
7
from pathlib import Path

# Resolve the mapping files next to this module so that importing the package
# works from any working directory, not only from the project root.
_UTILS_DIR = Path(__file__).resolve().parent

# id -> label name mapping, loaded once at import time.
with open(_UTILS_DIR / 'id2label.json', 'r', encoding='utf-8') as j:
    id2label = json.load(j)

# label name -> id mapping, loaded once at import time.
with open(_UTILS_DIR / 'label2id.json', 'r', encoding='utf-8') as j:
    label2id = json.load(j)
12
+
13
def find_case_by_name(df, name, regex=True):
    """Display, as an HTML table, the rows of ``df`` whose case name matches.

    Args:
        df: DataFrame with a string ``case_name`` column. Its last column is
            left out of the rendered table.
        name: Pattern searched for inside ``case_name``.
        regex: When True (the default, matching the previous behaviour)
            ``name`` is treated as a regular expression; pass False to match
            it literally, e.g. for names containing ``.`` or ``(``.

    Returns:
        Whatever ``display`` returns.
    """
    # na=False: rows with a missing case name count as "no match" instead of
    # producing NaN in the mask, which boolean indexing rejects.
    matches = df['case_name'].str.contains(name, regex=regex, na=False)
    table = df[matches].iloc[:, :-1].to_html(render_links=True, escape=False)
    return display(HTML(table))
15
+
16
def head_df(df):
    """Display the first rows of ``df`` (without its last column) as HTML."""
    preview = df.iloc[:, :-1].head()
    markup = preview.to_html(render_links=True, escape=False)
    return display(HTML(markup))
utils/cleaning.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import sys
3
+ import re
4
+ import pandas as pd
5
+
6
+ try:
7
+ import eyecite
8
+ except ImportError:
9
+ subprocess.check_call([sys.executable, "-m", "pip", "install", 'eyecite'])
10
+ finally:
11
+ from eyecite import find, clean
12
+
13
+ # @title
14
def full_case(citation, text):
    """Remove a full case citation and its metadata fragments from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``year``, ``pin_cite``, ``parenthetical``,
            ``plaintiff``, ``defendant``, ``court`` and ``extra`` fields
            (presumably an eyecite ``FullCaseCitation`` - confirm against
            the caller).
        text: String to strip.

    Returns:
        ``text`` with every occurrence of each present fragment removed.
    """
    meta = citation.metadata
    text = text.replace(citation.matched_text(), "")
    if meta.year:
        # Drop any parenthesised group that ends with the year,
        # e.g. "(2d Cir. 1999)".
        pattern = r'\([^)]*{}\)'.format(re.escape(str(meta.year)))
        text = re.sub(pattern, '', text)
    if meta.pin_cite:
        # str.replace removes every occurrence of the pin cite string.
        text = text.replace(meta.pin_cite, "")
    if meta.parenthetical:
        text = text.replace(f"({meta.parenthetical})", "")
    if meta.plaintiff and meta.defendant:
        # Build the "X v. Y" caption only when both parties are known; a
        # missing defendant would otherwise be formatted as the text "None".
        text = text.replace(f"{meta.plaintiff} v. {meta.defendant}", "")
    publisher_date = " ".join(i for i in (meta.court, meta.year) if i)
    if publisher_date:
        text = text.replace(publisher_date, "")
    if meta.extra:
        text = text.replace(meta.extra, "")
    return text
31
+
32
def supra_case(citation, text):
    """Remove a supra citation and its metadata fragments from ``text``.

    The matched citation text is removed first, followed by the pin cite,
    the parenthetical (with its surrounding parentheses) and the antecedent
    guess, each only when present.
    """
    meta = citation.metadata
    fragments = [citation.matched_text()]
    if meta.pin_cite:
        fragments.append(meta.pin_cite)
    if meta.parenthetical:
        fragments.append(f"({meta.parenthetical})")
    if meta.antecedent_guess:
        fragments.append(meta.antecedent_guess)
    for fragment in fragments:
        text = text.replace(fragment, "")
    return text
41
+
42
def short_case(citation, text):
    """Remove a short-form case citation and its metadata from ``text``.

    Args:
        citation: Object exposing ``matched_text()`` and a ``metadata``
            attribute with ``parenthetical``, ``year`` and
            ``antecedent_guess`` fields.
        text: String to strip.

    Returns:
        ``text`` with every occurrence of each present fragment removed.
    """
    meta = citation.metadata
    text = text.replace(citation.matched_text(), "")
    if meta.parenthetical:
        text = text.replace(f"({meta.parenthetical})", "")
    if meta.year:
        # Drop any parenthesised group that ends with the year. The pattern
        # was previously built but never applied.
        pattern = r'\([^)]*{}\)'.format(meta.year)
        text = re.sub(pattern, '', text)
    if meta.antecedent_guess:
        text = text.replace(meta.antecedent_guess, "")
    return text
51
+
52
def id_case(citation, text):
    """Remove an "Id." citation, its parenthetical and its pin cite from ``text``."""
    remaining = text.replace(citation.matched_text(), "")
    meta = citation.metadata
    # Parenthetical goes first, then the pin cite.
    for present, fragment in (
        (meta.parenthetical, f"({meta.parenthetical})"),
        (meta.pin_cite, meta.pin_cite),
    ):
        if present:
            remaining = remaining.replace(fragment, "")
    return remaining
59
+
60
def unknown_case(citation, text):
    """Remove an unknown citation and, when present, its parenthetical."""
    stripped = text.replace(citation.matched_text(), "")
    note = citation.metadata.parenthetical
    return stripped.replace(f"({note})", "") if note else stripped
65
+
66
def full_law_case(citation, text):
    """Remove a full law citation and, when present, its parenthetical."""
    without_cite = text.replace(citation.matched_text(), "")
    parenthetical = citation.metadata.parenthetical
    if not parenthetical:
        return without_cite
    return without_cite.replace(f"({parenthetical})", "")
71
+
72
def full_journal_case(citation, text):
    """Remove a full journal citation and its metadata fragments from ``text``.

    After the matched citation text, this removes any parenthesised group
    ending with the year, then the pin cite, then the parenthetical, each
    only when present.
    """
    details = citation.metadata
    cleaned = text.replace(citation.matched_text(), "")
    if details.year:
        # Matches a parenthesised group that ends with the year.
        year_group = r'\([^)]*{}\)'.format(details.year)
        cleaned = re.sub(year_group, '', cleaned)
    if details.pin_cite:
        cleaned = cleaned.replace(details.pin_cite, "")
    if details.parenthetical:
        cleaned = cleaned.replace(f"({details.parenthetical})", "")
    return cleaned
82
+
83
def all_commas(text: str) -> str:
    """Collapse every run of consecutive commas in ``text`` into one comma."""
    comma_run = re.compile(r"\,+")
    return comma_run.sub(",", text)
85
+
86
def all_dots(text: str) -> str:
    """Collapse every run of consecutive periods in ``text`` into one period."""
    dot_run = re.compile(r"\.+")
    return dot_run.sub(".", text)
88
+
89
# Dispatch table: citation class name -> function that strips that kind of
# citation from a text. Looked up by ``citation.__class__.__name__``.
functions_dict = dict(
    FullCaseCitation=full_case,
    SupraCitation=supra_case,
    ShortCaseCitation=short_case,
    IdCitation=id_case,
    UnknownCitation=unknown_case,
    FullLawCitation=full_law_case,
    FullJournalCitation=full_journal_case,
)
98
+
99
+ # @title
100
def remove_citations(input_text):
    """Return ``input_text`` with legal citations and leftover punctuation removed.

    Steps:
      1. Clean the input with eyecite's ``html``, ``inline_whitespace`` and
         ``underscores`` cleaners.
      2. Find citations and strip each one with the handler registered for
         its class in ``functions_dict``.
      3. Normalise whitespace and collapse repeated commas and periods.
      4. Drop lines matching the ``I+`` heading pattern and any comma or
         period that is preceded by whitespace.

    Args:
        input_text: Raw opinion text, possibly containing HTML.

    Returns:
        The cleaned text.
    """
    # clean text
    plain_text = clean.clean_text(input_text, ['html', 'inline_whitespace', 'underscores'])
    # remove citations
    for citation in find.get_citations(plain_text):
        handler = functions_dict.get(type(citation).__name__)
        if handler is None:
            # Citation class without a registered handler: remove just the
            # matched text instead of failing with a KeyError.
            plain_text = plain_text.replace(citation.matched_text(), "")
        else:
            plain_text = handler(citation, plain_text)
    # clean text
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'underscores', 'all_whitespace', all_commas, all_dots])
    plain_text = clean.clean_text(plain_text, ['inline_whitespace', 'all_whitespace'])
    # NOTE(review): 'all_whitespace' appears to collapse newlines already, in
    # which case this newline-terminated pattern would never match - confirm.
    pattern = r"\*?\d*\s*I+\n"
    plain_text = re.sub(pattern, '', plain_text)
    # Removes the whitespace character together with the comma/period after it.
    pattern = r"\s[,.]"
    plain_text = re.sub(pattern, '', plain_text)
    return plain_text
115
+
116
def split_text(text, chunk_size=430, stride=420):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: String to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (default 430).
        stride: Number of words between the starts of consecutive chunks
            (default 420, so neighbouring chunks share 10 words).

    Returns:
        List of chunks, each the words of one window joined by single
        spaces. Empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0 or stride <= 0:
        raise ValueError("chunk_size and stride must be positive")
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), stride)
    ]
122
+
123
+
124
+ # @title
125
def chunk_text_to_paragraphs(text):
    """Split ``text`` on newlines and strip surrounding whitespace from each piece.

    Every newline starts a new piece, so blank lines come back as empty
    strings.
    """
    return [line.strip() for line in text.split("\n")]
132
+
133
+ # @title
134
def split_data(data, id2label, label2id):
    """Explode each opinion in ``data`` into labelled text chunks.

    Args:
        data: DataFrame iterated with ``iterrows()``; each row must provide
            ``author_name``, ``category``, ``case_name``, ``absolute_url``
            and ``clean_text``.
        id2label: Unused; kept so existing callers keep working.
        label2id: Mapping from ``author_name`` to its label.

    Returns:
        DataFrame with columns ``author_name``, ``label``, ``category``,
        ``case_name``, ``url`` and ``text``, holding one row per chunk of
        at least 1000 characters.

    Raises:
        KeyError: If a row that yields a kept chunk has an ``author_name``
            missing from ``label2id``.
    """
    columns = ['author_name', 'label', 'category', 'case_name', 'url', 'text']

    # Collect rows first and build the frame once: concatenating inside the
    # loop copies the accumulated frame on every iteration.
    rows = []
    for _, row in data.iterrows():
        for chunk in split_text(row['clean_text']):
            if len(chunk) < 1000:
                continue
            rows.append({
                'author_name': row['author_name'],
                'label': label2id[row['author_name']],
                'category': row['category'],
                'case_name': row['case_name'],
                'url': row['absolute_url'],
                'text': chunk,
            })

    if not rows:
        empty = pd.DataFrame({column: [] for column in columns})
        empty['label'] = empty['label'].astype(int)
        return empty

    # Every row keeps index label 0, matching the frame that the previous
    # repeated pd.concat of one-row frames produced.
    return pd.DataFrame(rows, columns=columns, index=[0] * len(rows))
155
+
156
def chunk_data(data):
    """Split one text into chunks and return them as a labelled DataFrame.

    Args:
        data: Text to split with ``split_text``.

    Returns:
        DataFrame with columns ``text`` and ``label``, one row per chunk of
        at least 1000 characters. ``label`` is always 200 (presumably a
        placeholder for text without a known author - confirm). Both
        columns are present even when no chunk qualifies.
    """
    kept = [chunk for chunk in split_text(data) if len(chunk) >= 1000]
    # Every row keeps index label 0, matching the frame that the previous
    # repeated pd.concat of one-row frames produced.
    frame = pd.DataFrame(
        {'text': kept, 'label': [200] * len(kept)},
        index=[0] * len(kept) if kept else None,
    )
    # Pin the dtype so it does not depend on whether any chunk was kept.
    frame['label'] = frame['label'].astype('int64')
    return frame
utils/id2label.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0":"Justice Breyer",
3
+ "1":"Justice Ginsburg",
4
+ "2":"Justice Kennedy",
5
+ "3":"Justice O'Connor",
6
+ "4":"Justice Rehnquist",
7
+ "5":"Justice Scalia",
8
+ "6":"Justice Souter",
9
+ "7":"Justice Stevens",
10
+ "8":"Justice Thomas",
11
+ "9":"Justice Kagan",
12
+ "10":"Justice Alito",
13
+ "11":"Justice Sotomayor",
14
+ "12":"Justice Roberts"
15
+ }
utils/label2id.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Justice Breyer":0,
3
+ "Justice Ginsburg":1,
4
+ "Justice Kennedy":2,
5
+ "Justice O'Connor":3,
6
+ "Justice Rehnquist":4,
7
+ "Justice Scalia":5,
8
+ "Justice Souter":6,
9
+ "Justice Stevens":7,
10
+ "Justice Thomas":8,
11
+ "Justice Kagan":9,
12
+ "Justice Alito":10,
13
+ "Justice Sotomayor":11,
14
+ "Justice Roberts":12,
15
+ "per_curiam":100
16
+ }