Spaces:

arosyihuddin
/

gradio-LegalNER

Sleeping

App Files Files Community

gradio-LegalNER / src /pdf_predict.py

arosyihuddin

add files

a450bc7 about 1 year ago

raw

history blame

1.88 kB

	from tqdm import tqdm
	import torch
	from read_file import *
	from align_word_ids import *
	from convertTotext import *

	def pdf_predict(model, tokenizer, file_path, ids_to_labels, check_point='IndoBERT (IndoLEM)'):
	    """Run token-level prediction over a PDF and collect the extracted entities.

	    The PDF text returned by ``read_pdf`` is split on ``';'`` and each segment
	    is tokenized (padded/truncated to 512 tokens), passed through ``model``,
	    and converted to an extraction result with ``convertTotext``.

	    Args:
	        model: Network to run; its weights are overwritten from the selected
	            checkpoint and it is moved to the GPU when one is available.
	        tokenizer: Tokenizer called on each segment; must also provide
	            ``convert_ids_to_tokens``.
	        file_path: Path of the PDF, forwarded to ``read_pdf``.
	        ids_to_labels: Mapping from predicted class index to label.
	        check_point: ``'IndoBERT (IndoLEM)'`` selects the IndoLEM fold-4
	            weights; any other value selects the IndoNLU fold-4 weights.

	    Returns:
	        A list with one entry per segment whose ``convertTotext`` result was
	        truthy, in document order.

	    Side effects:
	        ``model`` is moved to the chosen device, reloaded from disk and put
	        in evaluation mode.
	    """
	    file_pdf = read_pdf(file_path)
	    sentence_file = file_pdf.split(';')

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    # Keep the model on the same device the inputs are sent to below.
	    model = model.to(device)

	    if check_point == 'IndoBERT (IndoLEM)':
	        file_check_point = 'model/IndoLEM/model_fold_4.pth'
	    else:
	        file_check_point = 'model/IndoNLU/model_fold_4.pth'

	    model_weights = torch.load(file_check_point, map_location=device)
	    model.load_state_dict(model_weights)
	    # Inference only: disable dropout so predictions are deterministic.
	    model.eval()

	    # Built once; these tokens are dropped before pairing tokens with labels.
	    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

	    label_extraction = []
	    # No gradients are needed for prediction; skipping them avoids building
	    # an autograd graph for every 512-token segment.
	    with torch.no_grad():
	        for text in tqdm(sentence_file, desc="Prediction Sentence"):
	            encoding = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
	            input_ids = encoding['input_ids'].to(device)
	            mask = encoding['attention_mask'].to(device)

	            logits = model(input_ids, mask, None)
	            # Positions marked -100 by align_word_ids are excluded from the
	            # prediction (presumably special/padding/sub-word tokens --
	            # confirm against align_word_ids).
	            label_ids = torch.Tensor(align_word_ids(text, tokenizer, True)).unsqueeze(0).to(device)
	            logits_clean = logits[0][label_ids != -100]
	            predictions = logits_clean.argmax(dim=1).tolist()
	            prediction_label = [ids_to_labels[i] for i in predictions]

	            tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
	            data_token = [token for token in tokens if token not in special_tokens]
	            nerExtraction = convertTotext(data_token, prediction_label)

	            # Keep only segments that produced a non-empty extraction.
	            if nerExtraction:
	                label_extraction.append(nerExtraction)

	    return label_extraction