arosyihuddin committed
Commit de92ab7 · 1 Parent(s): ecfd12f
app.py CHANGED
@@ -3,6 +3,9 @@ from gradio_pdf import PDF
 from src.bert import *
 from src.legalNER import *
 import gradio as gr
+from pathlib import Path
+
+dir_ = Path(__file__).parent
 
 ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
 indolem = 'indolem/indobert-base-uncased'
@@ -12,7 +15,22 @@ model_indonlu = BertModel(indonlu, len(ids_to_labels))
 tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
 tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
 
-def predict(doc : str, model : str) -> str:
+def text_extraction(text, model, progress=gr.Progress()):
+    if model == 'IndoBERT (IndoLEM)':
+        use_model = model_indolem
+        use_tokenizer = tokenizer_indolem
+
+    else:
+        use_model = model_indonlu
+        use_tokenizer = tokenizer_indonlu
+
+    legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
+    entitas = legalner.predict(text)
+    new_text = legalner.tokenizer_decode
+
+    return {"text": new_text, "entities": entitas}
+
+def pdf_extraction(doc, model, progress=gr.Progress()):
     if model == 'IndoBERT (IndoLEM)':
         use_model = model_indolem
         use_tokenizer = tokenizer_indolem
@@ -21,19 +39,64 @@ def predict(doc : str, model : str) -> str:
         use_model = model_indonlu
         use_tokenizer = tokenizer_indonlu
 
-    ner = LegalNER(use_model, use_tokenizer, doc, ids_to_labels, model)
+    legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
 
-    return ner.display()
+    return legalner.predict(doc)
+
+
+with gr.Blocks() as ner:
+    gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
+    gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
+    # Input Text
+    with gr.Row():
+        with gr.Column(scale=2):
+            text = gr.Textbox(label="Text")
+            model_text = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
+            button_text = gr.Button(value="Predict", variant='primary')
+            gr.ClearButton(text, value='Reset')
+        with gr.Column(scale=3):
+            output_text = gr.HighlightedText(label="Output Text")
+
+    button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
+
+    gr.Markdown("## Contoh Inputan Potongan Kalimat")
+    gr.Examples(
+        examples=[
+            ["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
+            ["MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoNLU)"],
+            ["PUTUSAN Nomor 77/Pid.B/2023/PN Jkt.Pst DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Jakarta Pusat yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa : 1. Nama lengkap : Arif Bin Santung", "IndoBERT (IndoLEM)"]
+        ],
+        inputs=[text, model_text],
+        outputs=output_text,
+        fn=text_extraction,
+    )
 
-iface = gr.Interface(
-    fn=predict,
-    inputs=[PDF(label="Document"),
-            gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')],
-    outputs="textbox",
-    title="Legal NER",
-    description="Upload File PDF Putusan Pidana",
-    allow_flagging='never'
+    gr.Markdown("## Ekstrak Entitas pada Dokumen Putusan Hukum")
+    # Input PDF
+    with gr.Row():
+        with gr.Column(scale=2):
+            doc = PDF(label="Document")
+            model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
+            button_pdf = gr.Button(value="Extract", variant='primary')
+            gr.ClearButton(doc, value="Reset")
+
+        with gr.Column(scale=3):
+            output_pdf = gr.Textbox(label="Output PDF")
+
+    button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
+
+    gr.Examples(
+        examples=[[str(dir_ / "data/165_Pdt.P_2023_PN_Bkl.pdf")],
+                  [str(dir_ / 'data/162_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/164_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/167_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/168_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/169_Pid.Sus_2023_PN_Bkl.pdf')],
+        ],
+        inputs=[doc],
+        outputs=output_pdf,
+        fn=pdf_extraction,
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    ner.launch()
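
Note: `text_extraction` returns the dictionary shape that `gr.HighlightedText` accepts, while `pdf_extraction` returns a plain summary string. An illustrative highlighted-text payload (hand-written values, not actual model output):

    {
        "text": "menyatakan terdakwa agnes tri ahadi bersalah",
        "entities": [
            {"entity": "B_DEFN", "word": "agnes", "start": 20, "end": 25},
            {"entity": "I_DEFN", "word": "tri", "start": 26, "end": 29},
            {"entity": "I_DEFN", "word": "ahadi", "start": 30, "end": 35}
        ]
    }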
src/__pycache__/helper.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/helper.cpython-310.pyc and b/src/__pycache__/helper.cpython-310.pyc differ
 
src/__pycache__/legalNER.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/legalNER.cpython-310.pyc and b/src/__pycache__/legalNER.cpython-310.pyc differ
 
src/helper.py CHANGED
@@ -32,4 +32,19 @@ def read_pdf(pdf):
         return pdf_text.strip()
 
     except requests.exceptions.RequestException as e:
-        print("Error:", e)
+        print("Error:", e)
+
+def token_decode(input_ids_conv):
+    result = ''
+    temp = ''
+    for i, word in enumerate(input_ids_conv):
+        if word not in ['[CLS]', '[SEP]', '[PAD]']:
+            if temp != '' and '##' not in word:
+                result += ' ' + temp
+            if '##' in word:
+                temp += word.replace('##', '')
+            else:
+                temp = word
+        if i == len(input_ids_conv)-1:
+            result += ' ' + temp
+    return result.strip()
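
Note: the new `token_decode` helper reverses WordPiece tokenization by skipping the special tokens and gluing `##` continuation pieces back onto the preceding word. A quick sanity check (hypothetical token list):

    tokens = ['[CLS]', 'ter', '##dak', '##wa', 'agnes', 'tri', '[SEP]', '[PAD]']
    token_decode(tokens)  # -> 'terdakwa agnes tri'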
src/legalNER.py CHANGED
@@ -3,14 +3,16 @@ import gradio as gr
 import torch
 
 class LegalNER():
-    def __init__(self, model, tokenizer, pdf_file, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
+    def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
         self.model = model
         self.tokenizer = tokenizer
-        self.pdf = pdf_file
         self.check_point = check_point
         self.label_all_tokens = label_all_tokens
-        self.ids_to_labels = ids_to_labels
         self.prediction_label = ''
+        self.data_token = ''
+        self.ids_to_labels = ids_to_labels
+        self.label_extraction = []
+        self.tokenizer_decode = ''
         self.label_convert = {'B_VERN' : 'Nomor Putusan',
                               'B_DEFN' : 'Nama Terdakwa',
                               'B_CRIA' : 'Tindak Pidana',
@@ -26,16 +28,12 @@ class LegalNER():
                               }
 
     def align_word_ids(self, texts):
-
         tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
-
         word_ids = tokenized_inputs.word_ids()
-
         previous_word_idx = None
         label_ids = []
 
         for word_idx in word_ids:
-
             if word_idx is None:
                 label_ids.append(-100)
 
@@ -53,13 +51,13 @@ class LegalNER():
 
         return label_ids
 
-    def labelToText(self, data_token):
+    def labelToText(self):
         prev_tag = 'O'
         result = {}
         temp = ''
 
         # Merge all tokens into a single sentence according to their labels
-        for i, word in enumerate(data_token):
+        for i, word in enumerate(self.data_token):
             if self.prediction_label[i] != 'O':
                 if prev_tag == 'O' and temp != '':
                     temp = ''
@@ -77,11 +75,11 @@ class LegalNER():
             prev_tag = self.prediction_label[i]
 
         return result
-
-    def labelConverter(self, entity):
+
+    def dis_pdf_prediction(self):
         # Keep the best prediction for each entity
         entity_result = {}
-        for i in entity:
+        for i in self.label_extraction:
             if len(list(i.keys())) > 1:
                 for y in i.items():
                     if y[0] not in entity_result:
@@ -92,8 +90,8 @@ class LegalNER():
             else:
                 if tuple(i.items())[0] not in entity_result:
                     entity_result[tuple(i.items())[0][0]] = tuple(i.items())[0][1]
-
-        # Convert the result into a string
+
+        # Convert the entity extraction results into a numbered display string
         result = ''
         for i, (label, data) in enumerate(entity_result.items()):
             if label in ['B_PENA', 'B_ARTV', 'B_PROS']:
@@ -102,15 +100,55 @@ class LegalNER():
                 result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
             elif label in ['B_ADVO', 'B_REGI']:
                 result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
-            else:
+            else:
                 result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
-
+
+        return result
+
+    def dis_text_prediction(self):
+        result = []
+        temp_result = {}
+        count_huruf = 0
+        temp_word = ''
+        temp_label = ''
+        temp_count_huruf = 0
+        prev_word = ''
+        for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
+            if label != 'O':
+                if temp_word != '' and '##' not in word:
+                    temp_result['entity'] = temp_label
+                    temp_result['word'] = temp_word
+                    temp_result['start'] = temp_count_huruf
+                    temp_result['end'] = temp_count_huruf + (len(temp_word))
+                    result.append(temp_result)
+                    temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+
+                if '##' in word:
+                    temp_word += word.replace('##', '')
+
+                else:
+                    temp_label = label
+                    temp_word = word
+                    temp_count_huruf = count_huruf
+
+            if i == len(self.data_token)-1:
+                temp_result['entity'] = temp_label
+                temp_result['word'] = temp_word
+                temp_result['start'] = temp_count_huruf
+                temp_result['end'] = temp_count_huruf + (len(temp_word))
+                result.append(temp_result)
+                temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+
+            if '##' in word:
+                count_huruf += len(word)-2
+
+            else:
+                count_huruf += len(word)+1
+
         return result
 
-    def display(self, progress=gr.Progress()):
-        file_pdf = read_pdf(self.pdf)
-        sentence_file = file_pdf.split(';')
-
+    def fit_transform(self, texts, progress=gr.Progress()):
         use_cuda = torch.cuda.is_available()
         device = torch.device("cuda" if use_cuda else "cpu")
         if use_cuda:
@@ -121,8 +159,7 @@ class LegalNER():
         model_weights = torch.load(file_check_point, map_location=torch.device(device))
         self.model.load_state_dict(model_weights)
 
-        label_extraction = []
-        for text in progress.tqdm(sentence_file, desc="Ekstraksi Entitas"):
+        for text in progress.tqdm(texts, desc="Ekstraksi Entitas"):
            toknize = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            input_ids = toknize['input_ids'].to(device)
            mask = toknize['attention_mask'].to(device)
@@ -135,10 +172,20 @@ class LegalNER():
 
            input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
            data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+           self.tokenizer_decode = token_decode(input_ids_conv)
+           self.data_token = data_token
            self.prediction_label = prediction_label
-           labelConv = self.labelToText(data_token)
+           labelConv = self.labelToText()
 
            if labelConv:
-               label_extraction.append(labelConv)
-
-        return self.labelConverter(label_extraction)
+               self.label_extraction.append(labelConv)
+
+    def predict(self, doc):
+        if '.pdf' not in doc:
+            self.fit_transform([doc.strip()])
+            return self.dis_text_prediction()
+        else:
+            file_pdf = read_pdf(doc)
+            sentence_file = file_pdf.split(';')
+            self.fit_transform(sentence_file)
+            return self.dis_pdf_prediction()
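
Note: a minimal usage sketch of the new `predict` entry point, assuming the `model_indolem`, `tokenizer_indolem`, and `ids_to_labels` objects set up in app.py (input values are illustrative):

    legalner = LegalNER(model_indolem, tokenizer_indolem, ids_to_labels, 'IndoBERT (IndoLEM)')

    # Plain text: fit_transform() runs once on the stripped string and
    # dis_text_prediction() returns character-offset entity dicts.
    entities = legalner.predict("Menyatakan terdakwa AGNES TRI AHADI bersalah ...")

    # PDF path: read_pdf() extracts the text, which is split on ';' into
    # sentences; dis_pdf_prediction() returns a numbered summary string.
    summary = legalner.predict("data/165_Pdt.P_2023_PN_Bkl.pdf")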