Spaces: Sleeping
Commit de92ab7 · Parent(s): ecfd12f
update UI

Changed files:
- app.py +75 -12
- src/__pycache__/helper.cpython-310.pyc +0 -0
- src/__pycache__/legalNER.cpython-310.pyc +0 -0
- src/helper.py +16 -1
- src/legalNER.py +73 -26
app.py
CHANGED
@@ -3,6 +3,9 @@ from gradio_pdf import PDF
 from src.bert import *
 from src.legalNER import *
 import gradio as gr
+from pathlib import Path
+
+dir_ = Path(__file__).parent
 
 ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
 indolem = 'indolem/indobert-base-uncased'
@@ -12,7 +15,22 @@ model_indonlu = BertModel(indonlu, len(ids_to_labels))
 tokenizer_indolem = BertTokenizerFast.from_pretrained(indolem)
 tokenizer_indonlu = BertTokenizerFast.from_pretrained(indonlu)
 
-def predict(doc : str, model : str) -> str:
+def text_extraction(text, model, progress=gr.Progress()):
+    if model == 'IndoBERT (IndoLEM)':
+        use_model = model_indolem
+        use_tokenizer = tokenizer_indolem
+
+    else:
+        use_model = model_indonlu
+        use_tokenizer = tokenizer_indonlu
+
+    legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
+    entitas = legalner.predict(text)
+    new_text = legalner.tokenizer_decode
+
+    return {"text": new_text, "entities": entitas}
+
+def pdf_extraction(doc, model, progress=gr.Progress()):
     if model == 'IndoBERT (IndoLEM)':
         use_model = model_indolem
         use_tokenizer = tokenizer_indolem
@@ -21,19 +39,64 @@ def predict(doc : str, model : str) -> str:
         use_model = model_indonlu
         use_tokenizer = tokenizer_indonlu
 
-
+    legalner = LegalNER(use_model, use_tokenizer, ids_to_labels, model)
 
-    return
+    return legalner.predict(doc)
+
+
+with gr.Blocks() as ner:
+    gr.Markdown("# Sistem Ekstraksi Informasi Dokumen Putusan Hukum")
+    gr.Markdown("## Uji Coba Model dengan Potongan Kalimat")
+    # Input Text
+    with gr.Row():
+        with gr.Column(scale=2):
+            text = gr.Textbox(label="Text")
+            model_text = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
+            button_text = gr.Button(value="Predict", variant='primary')
+            gr.ClearButton(text, value='Reset')
+        with gr.Column(scale=3):
+            output_text = gr.HighlightedText(label="Output Text")
+
+    button_text.click(fn=text_extraction, inputs=[text, model_text], outputs=output_text, api_name="text")
+
+    gr.Markdown("## Contoh Inputan Potongan Kalimat")
+    gr.Examples(
+        examples=[
+            ["PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM . DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA . MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoLEM)"],
+            ["MENUNTUT : 1 Menyatakan terdakwa AGNES TRI AHADI Als AGNES telah terbukti secara sah dan meyakinkan bersalah melakukan tindak pidana Narkotika memiliki , menyimpan , menguasai , atau menyediakan Narkotika golongan I bukan tanaman sebagaimana didakwakan dalam dakwaan kedua yaitu melanggar ketentuan unsure pasal 112 ayat ( 1 ) UURI No . 35 tahun 2009 tentang Narkotika ;", "IndoBERT (IndoNLU)"],
+            ["PUTUSAN Nomor 77/Pid.B/2023/PN Jkt.Pst DEMI KEADILAN BERDASARKAN KETUHANAN YANG MAHA ESA Pengadilan Negeri Jakarta Pusat yang mengadili perkara pidana dengan acara pemeriksaan biasa dalam tingkat pertama menjatuhkan putusan sebagai berikut dalam perkara Terdakwa : 1. Nama lengkap : Arif Bin Santung", "IndoBERT (IndoLEM)"]
+        ],
+        inputs=[text, model_text],
+        outputs=output_text,
+        fn=text_extraction,
+    )
 
-
-
-
-
-
-
-
-
+    gr.Markdown("## Ekstrak Entitas pada Dokumen Putusan Hukum")
+    # Input PDF
+    with gr.Row():
+        with gr.Column(scale=2):
+            doc = PDF(label="Document")
+            model_pdf = gr.Dropdown(['IndoBERT (IndoLEM)', 'IndoBERT (IndoNLU)'], label='Model', value='IndoBERT (IndoLEM)', info='Pilih Model yang ingin digunakan *Default : IndoBERT (IndoLEM)')
+            button_pdf = gr.Button(value="Extract", variant='primary')
+            gr.ClearButton(doc, value="Reset")
+
+        with gr.Column(scale=3):
+            output_pdf = gr.Textbox(label="Output PDF")
+
+    button_pdf.click(fn=pdf_extraction, inputs=[doc, model_pdf], outputs=output_pdf, api_name="pdf")
+
+    gr.Examples(
+        examples=[[str(dir_ / "data/165_Pdt.P_2023_PN_Bkl.pdf")],
+                  [str(dir_ / 'data/162_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/164_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/167_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/168_Pid.Sus_2023_PN_Bkl.pdf')],
+                  [str(dir_ / 'data/169_Pid.Sus_2023_PN_Bkl.pdf')],
+        ],
+        inputs=[doc],
+        outputs=output_pdf,
+        fn=pdf_extraction,
     )
 
 if __name__ == "__main__":
-
+    ner.launch()
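
The text path wires text_extraction() into gr.HighlightedText, which takes a dict holding the decoded sentence plus character-offset entity spans. A minimal sketch of that contract; the sentence and span below are made up, not model output:

# Hypothetical return value of text_extraction(); offsets index into "text",
# with "end" exclusive, as produced by LegalNER.dis_text_prediction().
example_value = {
    "text": "putusan nomor 187 / pid . sus / 2014",
    "entities": [{"entity": "B_VERN", "start": 14, "end": 17}],  # made-up span
}

span = example_value["entities"][0]
assert example_value["text"][span["start"]:span["end"]] == "187"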
src/__pycache__/helper.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/helper.cpython-310.pyc and b/src/__pycache__/helper.cpython-310.pyc differ
src/__pycache__/legalNER.cpython-310.pyc
CHANGED
Binary files a/src/__pycache__/legalNER.cpython-310.pyc and b/src/__pycache__/legalNER.cpython-310.pyc differ
src/helper.py
CHANGED
@@ -32,4 +32,19 @@ def read_pdf(pdf):
         return pdf_text.strip()
 
     except requests.exceptions.RequestException as e:
-        print("Error:", e)
+        print("Error:", e)
+
+def token_decode(input_ids_conv):
+    result = ''
+    temp = ''
+    for i, word in enumerate(input_ids_conv):
+        if word not in ['[CLS]', '[SEP]', '[PAD]']:
+            if temp != '' and '##' not in word:
+                result += ' ' + temp
+            if '##' in word:
+                temp += word.replace('##', '')
+            else:
+                temp = word
+            if i == len(input_ids_conv)-1:
+                result += ' ' + temp
+    return result.strip()
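
For reference, a small illustration of what the new token_decode() helper produces; the token list here is hypothetical, not taken from the app's tokenizer output:

from src.helper import token_decode

# '##' marks a WordPiece continuation; pieces are merged back into whole words,
# and '[CLS]' / '[SEP]' / '[PAD]' markers are skipped.
tokens = ['put', '##usan', 'nomor', '187']
print(token_decode(tokens))  # 'putusan nomor 187'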
src/legalNER.py
CHANGED
@@ -3,14 +3,16 @@ import gradio as gr
 import torch
 
 class LegalNER():
-    def __init__(self, model, tokenizer,
+    def __init__(self, model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)', label_all_tokens=True):
         self.model = model
         self.tokenizer = tokenizer
-        self.pdf = pdf_file
         self.check_point = check_point
         self.label_all_tokens = label_all_tokens
-        self.ids_to_labels = ids_to_labels
         self.prediction_label = ''
+        self.data_token = ''
+        self.ids_to_labels = ids_to_labels
+        self.label_extraction = []
+        self.tokenizer_decode = ''
         self.label_convert = {'B_VERN' : 'Nomor Putusan',
                               'B_DEFN' : 'Nama Terdakwa',
                               'B_CRIA' : 'Tindak Pidana',
@@ -26,16 +28,12 @@ class LegalNER():
                               }
 
     def align_word_ids(self, texts):
-
         tokenized_inputs = self.tokenizer(texts, padding='max_length', max_length=512, truncation=True)
-
         word_ids = tokenized_inputs.word_ids()
-
         previous_word_idx = None
         label_ids = []
 
         for word_idx in word_ids:
-
             if word_idx is None:
                 label_ids.append(-100)
 
@@ -53,13 +51,13 @@ class LegalNER():
 
         return label_ids
 
-    def labelToText(self
+    def labelToText(self):
         prev_tag = 'O'
         result = {}
         temp = ''
 
         # Menganggabungkan semua token menjadi satu kalimat sesuai dengan labelnya
-        for i, word in enumerate(data_token):
+        for i, word in enumerate(self.data_token):
             if self.prediction_label[i] != 'O':
                 if prev_tag == 'O' and temp != '':
                     temp = ''
@@ -77,11 +75,11 @@ class LegalNER():
             prev_tag = self.prediction_label[i]
 
         return result
-
-    def
+
+    def dis_pdf_prediction(self):
         # Memilih prediksi entitas yang paling bagus
         entity_result = {}
-        for i in
+        for i in self.label_extraction:
             if len(list(i.keys())) > 1:
                 for y in i.items():
                     if y[0] not in entity_result:
@@ -92,8 +90,8 @@ class LegalNER():
             else:
                 if tuple(i.items())[0] not in entity_result:
                     entity_result[tuple(i.items())[0][0]] = tuple(i.items())[0][1]
-
-        # Mengkonversi hasil dalam bentuk
+
+        # Mengkonversi hasil ekstraski entitas dalam bentuk List
         result = ''
         for i, (label, data) in enumerate(entity_result.items()):
             if label in ['B_PENA', 'B_ARTV', 'B_PROS']:
@@ -102,15 +100,55 @@ class LegalNER():
                 result += f'{i+1}. {self.label_convert[label]}\t\t\t = {data.capitalize()}\n'
             elif label in ['B_ADVO', 'B_REGI']:
                 result += f'{i+1}. {self.label_convert[label]}\t\t\t\t\t = {data.capitalize()}\n'
-            else:
+            else:
                 result += f'{i+1}. {self.label_convert[label]}\t\t = {data.capitalize()}\n'
-
+
+        return result
+
+    def dis_text_prediction(self):
+        result = []
+        temp_result = {}
+        count_huruf = 0
+        temp_word = ''
+        temp_label = ''
+        temp_label = ''
+        temp_count_huruf = 0
+        prev_word = ''
+        for i, (word, label) in enumerate(zip(self.data_token, self.prediction_label)):
+            if label != 'O':
+                if temp_word != '' and '##' not in word:
+                    temp_result['entity'] = temp_label
+                    temp_result['word'] = temp_word
+                    temp_result['start'] = temp_count_huruf
+                    temp_result['end'] = temp_count_huruf + (len(temp_word))
+                    result.append(temp_result)
+                    temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+
+                if '##' in word:
+                    temp_word += word.replace('##', '')
+
+                else:
+                    temp_label = label
+                    temp_word = word
+                    temp_count_huruf = count_huruf
+
+                if i == len(self.data_token)-1:
+                    temp_result['entity'] = temp_label
+                    temp_result['word'] = temp_word
+                    temp_result['start'] = temp_count_huruf
+                    temp_result['end'] = temp_count_huruf + (len(temp_word))
+                    result.append(temp_result)
+                    temp_word, temp_label, temp_count_huruf, temp_result = '', '', 0, {}
+
+            if '##' in word:
+                count_huruf += len(word)-2
+
+            else:
+                count_huruf += len(word)+1
+
         return result
 
-    def
-        file_pdf = read_pdf(self.pdf)
-        sentence_file = file_pdf.split(';')
-
+    def fit_transform(self, texts, progress=gr.Progress()):
         use_cuda = torch.cuda.is_available()
         device = torch.device("cuda" if use_cuda else "cpu")
         if use_cuda:
@@ -121,8 +159,7 @@ class LegalNER():
         model_weights = torch.load(file_check_point, map_location=torch.device(device))
         self.model.load_state_dict(model_weights)
 
-
-        for text in progress.tqdm(sentence_file, desc="Ekstraksi Entitas"):
+        for text in progress.tqdm(texts, desc="Ekstraksi Entitas"):
            toknize = self.tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
            input_ids = toknize['input_ids'].to(device)
            mask = toknize['attention_mask'].to(device)
@@ -135,10 +172,20 @@ class LegalNER():
 
            input_ids_conv = self.tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
            data_token = [word for word in input_ids_conv if word not in ['[CLS]', '[SEP]', '[PAD]']]
+           self.tokenizer_decode = token_decode(input_ids_conv)
+           self.data_token = data_token
            self.prediction_label = prediction_label
-           labelConv = self.labelToText(
+           labelConv = self.labelToText()
 
            if labelConv:
-               label_extraction.append(labelConv)
-
-
+               self.label_extraction.append(labelConv)
+
+    def predict(self, doc):
+        if '.pdf' not in doc:
+            self.fit_transform([doc.strip()])
+            return self.dis_text_prediction()
+        else:
+            file_pdf = read_pdf(doc)
+            sentence_file = file_pdf.split(';')
+            self.fit_transform(sentence_file)
+            return self.dis_pdf_prediction()
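
Taken together, the refactor moves PDF handling out of the constructor: predict() now routes plain text through dis_text_prediction() (character-offset spans) and .pdf paths through read_pdf(), a split on ';', and dis_pdf_prediction() (a numbered summary string). A minimal usage sketch under that reading, assuming the src package and the fine-tuned checkpoint files loaded by fit_transform() are available as in the Space:

from src.bert import *       # exposes BertModel and BertTokenizerFast, as used in app.py
from src.legalNER import *   # exposes LegalNER

ids_to_labels = {0: 'B_ADVO', 1: 'B_ARTV', 2: 'B_CRIA', 3: 'B_DEFN', 4: 'B_JUDG', 5: 'B_JUDP', 6: 'B_PENA', 7: 'B_PROS', 8: 'B_PUNI', 9: 'B_REGI', 10: 'B_TIMV', 11: 'B_VERN', 12: 'I_ADVO', 13: 'I_ARTV', 14: 'I_CRIA', 15: 'I_DEFN', 16: 'I_JUDG', 17: 'I_JUDP', 18: 'I_PENA', 19: 'I_PROS', 20: 'I_PUNI', 21: 'I_REGI', 22: 'I_TIMV', 23: 'I_VERN', 24: 'O'}
indolem = 'indolem/indobert-base-uncased'

model = BertModel(indolem, len(ids_to_labels))
tokenizer = BertTokenizerFast.from_pretrained(indolem)
ner = LegalNER(model, tokenizer, ids_to_labels, check_point='IndoBERT (IndoLEM)')

# Plain sentence -> list of {"entity", "word", "start", "end"} spans for HighlightedText
spans = ner.predict("PUTUSAN . NOMOR : 187 / Pid . Sus / 2014 / PN . JKT . TIM .")

# Path to a decision document -> numbered per-entity summary string for the Textbox output
summary = ner.predict("data/165_Pdt.P_2023_PN_Bkl.pdf")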