arosyihuddin committed
Commit de92ab7 · 1 Parent(s): ecfd12f
app.py CHANGED
@@ -3,6 +3,9 @@ from gradio_pdf import PDF
 from src.bert import *
 from src.legalNER import *
 import gradio as gr
+from pathlib import Path
+
+dir_ = Path(__file__).parent
 
 ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
 indolem = 'indolem/indobert-base-uncased'
@@ -12,7 +15,22 @@ model_indonlu = BertModel(indonlu, len(ids_to_labels))
 tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
 tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
 
-def predict(doc : str, model : str) -> str:
+def text_extraction(text, model, progress=gr.Progress()):
+    if model == 'IndoBERT (IndoLEM)':
+        use_model = model_indolem
+        use_tokenizer = tokenizer_indolem
+
+    else:
+        use_model = model_indonlu
+        use_tokenizer = tokenizer_indonlu
+
+    legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
+    entitas = legalner.predict(text)
+    new_text = legalner.tokenizer_decode
+
+    return {"text": new_text, "entities": entitas}
+
+def pdf_extraction(doc, model, progress=gr.Progress()):
     if model == 'IndoBERT (IndoLEM)':
         use_model = model_indolem
         use_tokenizer = tokenizer_indolem
@@ -21,19 +39,64 @@ def predict(doc : str, model : str) -> str:
         use_model = model_indonlu
         use_tokenizer = tokenizer_indonlu
 
-    ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
+    legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
 
-    return ner.display()
+    return legalner.predict(doc)
+
+
+with gr.Blocks() as ner:
+    gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
+    gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
+    # Input Text
+    with gr.Row():
+        with gr.Column(scale=2):
+            text = gr.Textbox(label="Text")
+            model_text = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
+            button_text = gr.Button(value="Predict", variant='primary')
+            gr.ClearButton(text, value='Reset')
+        with gr.Column(scale=3):
+            output_text = gr.HighlightedText(label="Output Text")
+
+    button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
+
+    gr.Markdown("## Contoh Inputan Potongan Kalimat")
+    gr.Examples(
+        examples=[
+            ["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
+            ["MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoNLU)"],
+            ["PUTUSAN Nomor 77/Pid.B/2023/PN Jkt.Pst DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Jakarta Pusat yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa : 1. Nama lengkap : Arif Bin Santung", "IndoBERT (IndoLEM)"]
+        ],
+        inputs=[text, model_text],
+        outputs=output_text,
+        fn=text_extraction,
+    )
 
-iface = gr.Interface(
-    fn=predict,
-    inputs=[PDF(label="Document"),
-            gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')],
-    outputs="textbox",
-    title="Legal NER",
-    description="Upload File PDF Putusan Pidana",
-    allow_flagging='never'
+    gr.Markdown("## Ekstrak Entitas pada Dokumen Putusan Hukum")
+    # Input PDF
+    with gr.Row():
+        with gr.Column(scale=2):
+            doc = PDF(label="Document")
+            model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
+            button_pdf = gr.Button(value="Extract", variant='primary')
+            gr.ClearButton(doc, value="Reset")
+
+        with gr.Column(scale=3):
+            output_pdf = gr.Textbox(label="Output PDF")
+
+    button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
+
+    gr.Examples(
+        examples=[[str(dir_ / "data/165_Pdt.P_2023_PN_Bkl.pdf")],
+                  [str(dir_ / 'data/162_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/164_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/167_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/168_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/169_Pid.Sus_2023_PN_Bkl.pdf')],
+        ],
+        inputs=[doc],
+        outputs=output_pdf,
+        fn=pdf_extraction,
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    ner.launch()
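
Note: `text_extraction` returns the dictionary shape that `gr.HighlightedText` accepts, while `pdf_extraction` returns a plain summary string. An illustrative highlighted-text payload (hand-written values, not actual model output):

    {
        "text": "menyatakan terdakwa agnes tri ahadi bersalah",
        "entities": [
            {"entity": "B_DEFN", "word": "agnes", "start": 20, "end": 25},
            {"entity": "I_DEFN", "word": "tri", "start": 26, "end": 29},
            {"entity": "I_DEFN", "word": "ahadi", "start": 30, "end": 35}
        ]
    }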
src/__pycache__/helper.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/helper.cpython-310.pyc and b/src/__pycache__/helper.cpython-310.pyc differ
 
src/__pycache__/legalNER.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/legalNER.cpython-310.pyc and b/src/__pycache__/legalNER.cpython-310.pyc differ
 
src/helper.py CHANGED
@@ -32,4 +32,19 @@ def read_pdf(pdf):
         return pdf_text.strip()
 
     except requests.exceptions.RequestException as e:
-        print("Error:", e)
+        print("Error:", e)
+
+def token_decode(input_ids_conv):
+    result = ''
+    temp = ''
+    for i, word in enumerate(input_ids_conv):
+        if word not in ['[CLS]', '[SEP]', '[PAD]']:
+            if temp != '' and '##' not in word:
+                result += ' ' + temp
+            if '##' in word:
+                temp += word.replace('##', '')
+            else:
+                temp = word
+        if i == len(input_ids_conv)-1:
+            result += ' ' + temp
+    return result.strip()
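
Note: the new `token_decode` helper reverses WordPiece tokenization by skipping the special tokens and gluing `##` continuation pieces back onto the preceding word. A quick sanity check (hypothetical token list):

    tokens = ['[CLS]', 'ter', '##dak', '##wa', 'agnes', 'tri', '[SEP]', '[PAD]']
    token_decode(tokens)  # -> 'terdakwa agnes tri'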
src/legalNER.py CHANGED
@@ -3,14 +3,16 @@ import gradio as gr
 import torch
 
 class LegalNER():
-    def __init__(self, model, tokenizer, pdf_file, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
+    def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
         self.model = model
         self.tokenizer = tokenizer
-        self.pdf = pdf_file
         self.check_point = check_point
         self.label_all_tokens = label_all_tokens
-        self.ids_to_labels = ids_to_labels
         self.prediction_label = ''
+        self.data_token = ''
+        self.ids_to_labels = ids_to_labels
+        self.label_extraction = []
+        self.tokenizer_decode = ''
         self.label_convert = {'B_VERN' : 'Nomor Putusan',
                               'B_DEFN' : 'Nama Terdakwa',
                               'B_CRIA' : 'Tindak Pidana',
@@ -26,16 +28,12 @@ class LegalNER():
                               }
 
     def align_word_ids(self, texts):
-
         tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
-
         word_ids = tokenized_inputs.word_ids()
-
         previous_word_idx = None
         label_ids = []
 
         for word_idx in word_ids:
-
             if word_idx is None:
                 label_ids.append(-100)
 
@@ -53,13 +51,13 @@ class LegalNER():
 
         return label_ids
 
-    def labelToText(self, data_token):
+    def labelToText(self):
         prev_tag = 'O'
         result = {}
         temp = ''
 
         # Merge all tokens into a single sentence according to their labels
-        for i, word in enumerate(data_token):
+        for i, word in enumerate(self.data_token):
             if self.prediction_label[i] != 'O':
                 if prev_tag == 'O' and temp != '':
                     temp = ''
@@ -77,11 +75,11 @@ class LegalNER():
             prev_tag = self.prediction_label[i]
 
         return result
-
-    def labelConverter(self, entity):
+
+    def dis_pdf_prediction(self):
         # Keep the best prediction for each entity
         entity_result = {}
-        for i in entity:
+        for i in self.label_extraction:
             if len(list(i.keys())) > 1:
                 for y in i.items():
                     if y[0] not in entity_result:
@@ -92,8 +90,8 @@ class LegalNER():
             else:
                 if tuple(i.items())[0] not in entity_result:
                     entity_result[tuple(i.items())[0][0]] = tuple(i.items())[0][1]
-
-        # Convert the result into a string
+
+        # Convert the entity extraction results into a numbered display string
         result = ''
         for i, (label, data) in enumerate(entity_result.items()):
             if label in ['B_PENA', 'B_ARTV', 'B_PROS']:
@@ -102,15 +100,55 @@ class LegalNER():
                 result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
             elif label in ['B_ADVO', 'B_REGI']:
                 result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
-            else:
+            else:
                 result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
-
+
+        return result
+
+    def dis_text_prediction(self):
+        result = []
+        temp_result = {}
+        count_huruf = 0
+        temp_word = ''
+        temp_label = ''
+        temp_count_huruf = 0
+        prev_word = ''
+        for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
+            if label != 'O':
+                if temp_word != '' and '##' not in word:
+                    temp_result['entity'] = temp_label
+                    temp_result['word'] = temp_word
+                    temp_result['start'] = temp_count_huruf
+                    temp_result['end'] = temp_count_huruf + (len(temp_word))
+                    result.append(temp_result)
+                    temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+
+                if '##' in word:
+                    temp_word += word.replace('##', '')
+
+                else:
+                    temp_label = label
+                    temp_word = word
+                    temp_count_huruf = count_huruf
+
+            if i == len(self.data_token)-1:
+                temp_result['entity'] = temp_label
+                temp_result['word'] = temp_word
+                temp_result['start'] = temp_count_huruf
+                temp_result['end'] = temp_count_huruf + (len(temp_word))
+                result.append(temp_result)
+                temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+
+            if '##' in word:
+                count_huruf += len(word)-2
+
+            else:
+                count_huruf += len(word)+1
+
         return result
 
-    def display(self, progress=gr.Progress()):
-        file_pdf = read_pdf(self.pdf)
-        sentence_file = file_pdf.split(';')
-
+    def fit_transform(self, texts, progress=gr.Progress()):
         use_cuda = torch.cuda.is_available()
         device = torch.device("cuda" if use_cuda else "cpu")
         if use_cuda:
@@ -121,8 +159,7 @@ class LegalNER():
         model_weights = torch.load(file_check_point, map_location=torch.device(device))
         self.model.load_state_dict(model_weights)
 
-        label_extraction = []
-        for text in progress.tqdm(sentence_file, desc="Ekstraksi Entitas"):
+        for text in progress.tqdm(texts, desc="Ekstraksi Entitas"):
            toknize = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            input_ids = toknize['input_ids'].to(device)
            mask = toknize['attention_mask'].to(device)
@@ -135,10 +172,20 @@ class LegalNER():
 
            input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
            data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+           self.tokenizer_decode = token_decode(input_ids_conv)
+           self.data_token = data_token
            self.prediction_label = prediction_label
-           labelConv = self.labelToText(data_token)
+           labelConv = self.labelToText()
 
            if labelConv:
-               label_extraction.append(labelConv)
-
-        return self.labelConverter(label_extraction)
+               self.label_extraction.append(labelConv)
+
+    def predict(self, doc):
+        if '.pdf' not in doc:
+            self.fit_transform([doc.strip()])
+            return self.dis_text_prediction()
+        else:
+            file_pdf = read_pdf(doc)
+            sentence_file = file_pdf.split(';')
+            self.fit_transform(sentence_file)
+            return self.dis_pdf_prediction()
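
Note: a minimal usage sketch of the new `predict` entry point, assuming the `model_indolem`, `tokenizer_indolem`, and `ids_to_labels` objects set up in app.py (input values are illustrative):

    legalner = LegalNER(model_indolem, tokenizer_indolem, ids_to_labels, 'IndoBERT (IndoLEM)')

    # Plain text: fit_transform() runs once on the stripped string and
    # dis_text_prediction() returns character-offset entity dicts.
    entities = legalner.predict("Menyatakan terdakwa AGNES TRI AHADI bersalah ...")

    # PDF path: read_pdf() extracts the text, which is split on ';' into
    # sentences; dis_pdf_prediction() returns a numbered summary string.
    summary = legalner.predict("data/165_Pdt.P_2023_PN_Bkl.pdf")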