gradio-LegalNER / src /pdf_predict.py
arosyihuddin's picture
add files
a450bc7
raw
history blame
1.88 kB
from tqdm import tqdm
import torch
from read_file import *
from align_word_ids import *
from convertTotext import *
def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
    """Extract named entities from a PDF, one ';'-separated segment at a time.

    The text returned by ``read_pdf(file_path)`` is split on ';'. Each segment
    is tokenized (padded/truncated to 512 tokens), passed through ``model``,
    and the per-token predictions are handed to ``convertTotext``.

    Args:
        model: Token-classification module. It is moved to the selected
            device, gets the checkpoint weights loaded into it, and is put in
            evaluation mode (the passed-in object is modified in place).
        tokenizer: Tokenizer called as ``tokenizer(text, ...)`` that also
            provides ``convert_ids_to_tokens``.
        file_path: Path of the PDF, forwarded to ``read_pdf``.
        ids_to_labels: Mapping indexed by predicted class id that yields the
            label for that class.
        check_point: ``'IndoBERT (IndoLEM)'`` selects
            ``model/IndoLEM/model_fold_4.pth``; any other value selects
            ``model/IndoNLU/model_fold_4.pth``.

    Returns:
        A list with the truthy ``convertTotext`` result of every segment, in
        document order. Segments whose result is falsy are left out.
    """
    file_pdf = read_pdf(file_path)
    sentence_file = file_pdf.split(';')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Keep the model on the same device the inputs are sent to below.
    model = model.to(device)

    if check_point == 'IndoBERT (IndoLEM)':
        file_check_point = 'model/IndoLEM/model_fold_4.pth'
    else:
        file_check_point = 'model/IndoNLU/model_fold_4.pth'
    model_weights = torch.load(file_check_point, map_location=device)
    model.load_state_dict(model_weights)

    # Inference only: disable dropout / batch-norm updates so that repeated
    # runs over the same document give the same predictions.
    model.eval()

    # Tokens that never belong to the extracted text.
    special_tokens = ('[CLS]', '[SEP]', '[PAD]')

    label_extraction = []
    # No gradients are needed for prediction; skipping autograd bookkeeping
    # keeps memory flat across the 512-token forward passes.
    with torch.no_grad():
        for text in tqdm(sentence_file, desc="Prediction Sentence"):
            toknize = tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            input_ids = toknize['input_ids'].to(device)
            mask = toknize['attention_mask'].to(device)

            logits = model(input_ids, mask, None)

            # Positions marked -100 by align_word_ids are dropped before the
            # argmax. NOTE(review): presumably these are special/padding
            # positions - confirm against align_word_ids.
            label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
            logits_clean = logits[0][label_ids != -100]

            predictions = logits_clean.argmax(dim=1).tolist()
            prediction_label = [ids_to_labels[i] for i in predictions]

            input_ids_conv = tokenizer.convert_ids_to_tokens(toknize['input_ids'][0])
            data_token = [word for word in input_ids_conv if word not in special_tokens]

            nerExtraction = convertTotext(data_token, prediction_label)
            if nerExtraction:
                label_extraction.append(nerExtraction)

    return label_extraction