"""Template-scoring inference and evaluation helpers.

For every tag in ``tags`` a small set of candidate target sentences is built
(one per polarity, or one per detection label), each candidate is scored by
the model, and the best-scoring candidate gives the prediction for that tag.
"""
import torch
import numpy as np
from bartpho.preprocess import tokenize, normalize

# English tag name -> phrase inserted into the target templates.
tag_dict = {
    "RESTAURANT#GENERAL": "chung về nhà_hàng",
    "RESTAURANT#PRICES": "giá của nhà_hàng",
    "RESTAURANT#MISCELLANEOUS": "tổng_quát về nhà_hàng",
    "FOOD#PRICES": "giá đồ ăn",
    "FOOD#QUALITY": "chất_lượng đồ ăn",
    "FOOD#STYLE&OPTIONS": "phong_cách và lựa_chọn đồ ăn",
    "DRINKS#PRICES": "giá đồ uống",
    "DRINKS#QUALITY": "chất_lượng đồ uống",
    "DRINKS#STYLE&OPTIONS": "phong_cách và lựa_chọn đồ uống",
    "AMBIENCE#GENERAL": "bầu không_khí",
    "SERVICE#GENERAL": "dịch_vụ",
    "LOCATION#GENERAL": "vị_trí",
}

# English polarity name -> phrase inserted into the target templates.
polarity_dict = {
    "không có": "không có",
    "positive": "tích_cực",
    "neutral": "trung_lập",
    "negative": "tiêu_cực"
}

# Index 0 is the "none" class; the functions below compare predictions to 0.
polarity_list = ["không có", "tích_cực", "trung_lập", "tiêu_cực"]

# `tags` and `eng_tags` are index-aligned, as are `polarity_list` and
# `eng_polarity`; predictions are indices into these lists.
tags = ["chung về nhà_hàng",
        "giá của nhà_hàng",
        "tổng_quát về nhà_hàng",
        "giá đồ ăn",
        "chất_lượng đồ ăn",
        "phong_cách và lựa_chọn đồ ăn",
        "giá đồ uống",
        "chất_lượng đồ uống",
        "phong_cách và lựa_chọn đồ uống",
        "bầu không_khí",
        "dịch_vụ",
        "vị_trí"]

eng_tags = ["RESTAURANT#GENERAL",
            "RESTAURANT#PRICES",
            "RESTAURANT#MISCELLANEOUS",
            "FOOD#PRICES",
            "FOOD#QUALITY",
            "FOOD#STYLE&OPTIONS",
            "DRINKS#PRICES",
            "DRINKS#QUALITY",
            "DRINKS#STYLE&OPTIONS",
            "AMBIENCE#GENERAL",
            "SERVICE#GENERAL",
            "LOCATION#GENERAL"]

eng_polarity = ["không có", "positive", "neutral", "negative"]

# Index 0 ('không') is treated as "tag not mentioned" by predict_detect.
detect_labels = ['không', 'có']

no_polarity = len(polarity_list)
no_tag = len(tags)


def _prepare_model(model, device):
    """Move *model* to *device*, switch it to eval mode and disable its cache."""
    model.to(device)
    model.eval()
    model.config.use_cache = False


def _preprocess(text, model_tokenize):
    """Run the project's normalize + tokenize pipeline on raw *text*."""
    return tokenize(normalize(text), model_tokenize)


def _load_tokenizer(tokenizer, tokenizer_name):
    """Return *tokenizer*, loading ``tokenizer_name`` when none was supplied."""
    if not tokenizer:
        # Imported lazily: the name was referenced but never imported, so the
        # default-tokenizer path used to fail with NameError.
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    return tokenizer


def _best_target(model, tokenizer, text, target_list, device):
    """Score every sentence in *target_list* against *text*; return the best index.

    The text is repeated once per candidate and fed as ``input_ids``; the
    candidates are fed as ``decoder_input_ids``. A candidate's score is the
    product of the softmax probabilities that decoder position ``n`` assigns
    to the candidate's token ``n + 1``, for ``n`` in ``0 .. L - 3`` where
    ``L`` is the padded candidate length.

    Returns the ``np.argmax`` over the candidate scores.
    """
    n_targets = len(target_list)
    input_ids = tokenizer([text] * n_targets, return_tensors='pt')['input_ids']
    output_ids = tokenizer(target_list, return_tensors='pt',
                           padding=True, truncation=True)['input_ids']

    with torch.no_grad():
        output = model(input_ids=input_ids.to(device),
                       decoder_input_ids=output_ids.to(device))[0]
    probs = output.softmax(dim=-1).to('cpu').numpy()
    token_ids = output_ids.tolist()

    # NOTE(review): candidates are padded to a common length and the product
    # runs over that padded length, so pad positions of shorter candidates
    # contribute to their score. Kept as-is; confirm this matches training.
    score_list = []
    for m in range(n_targets):
        score = 1
        for n in range(probs[m].shape[0] - 2):
            score *= probs[m][n][token_ids[m][n + 1]]
        score_list.append(score)
    return np.argmax(score_list)


def _print_result(predicts):
    """Print ``{english tag: english polarity}`` for every non-zero prediction."""
    result = {}
    for i in range(no_tag):
        if predicts[i] != 0:
            result[eng_tags[i]] = eng_polarity[predicts[i]]
    print(result)


def predict(model, text, tokenizer, model_tokenize=None, device='cuda',
            processed=True, printout=False):
    """Predict one polarity index per tag in a single stage.

    Args:
        model: seq2seq model called with ``input_ids`` / ``decoder_input_ids``.
        text: input sentence; preprocessed first when ``processed`` is False.
        tokenizer: tokenizer used for both the input and the target templates.
        model_tokenize: forwarded to ``tokenize`` when preprocessing.
        device: device the model and tensors are moved to.
        processed: whether ``text`` has already been normalized and tokenized.
        printout: when True, print the non-zero predictions as a dict.

    Returns:
        A list of ``no_tag`` indices into ``polarity_list`` (0 is the first
        entry, "không có").
    """
    _prepare_model(model, device)
    if not processed:
        text = _preprocess(text, model_tokenize)

    predicts = []
    for tag in tags:
        target_list = ["Nhận_xét " + tag.lower() + " " + polarity.lower() + " ."
                       for polarity in polarity_list]
        predicts.append(_best_target(model, tokenizer, text, target_list, device))

    if printout:
        _print_result(predicts)
    return predicts


def _pair_scores(gold, guessed):
    """Return ``(precision, recall, f1)`` of *guessed* against *gold*.

    All three are 0 when the two lists share no element.
    """
    common = [item for item in gold if item in guessed]
    if not common:
        return 0, 0, 0
    precision = len(common) / len(guessed)
    recall = len(common) / len(gold)
    return precision, recall, 2 * precision * recall / (precision + recall)


def _evaluate(df, predict_one, printout):
    """Run ``predict_one(text)`` on every row of *df* and aggregate the metrics.

    Rows are read as ``df['text'][i]`` and ``df[tag][i]`` for ``i`` in
    ``range(len(df))``. Precision, recall and F1 are computed per row and
    averaged over rows; the two accuracies are averaged over
    ``len(df) * no_tag`` (row, tag) pairs.

    Returns:
        ``(acc_detect, pre_detect, rec_detect, f1_detect,
        acc, pre_absa, rec_absa, f1_absa)``.
    """
    count_acc = count_detect = 0
    f1_detect = pre_detect = rec_detect = 0
    f1_absa = pre_absa = rec_absa = 0
    total_f1 = len(df)
    total = total_f1 * no_tag

    for i in range(total_f1):
        text = df['text'][i]
        labels = [df[x][i] for x in eng_tags]
        predicts = predict_one(text)

        # Detection: which tags are non-zero, regardless of polarity.
        labels_detect = [k for k in range(no_tag) if labels[k] != 0]
        predicts_detect = [k for k in range(no_tag) if predicts[k] != 0]
        precision, recall, f1 = _pair_scores(labels_detect, predicts_detect)
        pre_detect += precision
        rec_detect += recall
        f1_detect += f1

        # Tag + polarity: compared through their "index-value" string form.
        labels_absa = [str(k) + '-' + str(labels[k])
                       for k in range(no_tag) if labels[k] != 0]
        predicts_absa = [str(k) + '-' + str(predicts[k])
                         for k in range(no_tag) if predicts[k] != 0]
        precision, recall, f1 = _pair_scores(labels_absa, predicts_absa)
        pre_absa += precision
        rec_absa += recall
        f1_absa += f1

        for j in range(no_tag):
            if labels[j] == predicts[j]:
                count_acc += 1
                count_detect += 1
            elif labels[j] != 0 and predicts[j] != 0:
                # Polarity differs but both sides are non-zero.
                count_detect += 1

    acc_detect = count_detect / total
    pre_detect = pre_detect / total_f1
    rec_detect = rec_detect / total_f1
    f1_detect = f1_detect / total_f1
    acc = count_acc / total
    pre_absa = pre_absa / total_f1
    rec_absa = rec_absa / total_f1
    f1_absa = f1_absa / total_f1

    if printout:
        # NOTE(review): the values are fractions in [0, 1] although the
        # messages append '%'. Output format kept unchanged.
        print(f"Detect acc: {acc_detect:.4f}%")
        print(f"Detect precision: {pre_detect:.4f}%")
        print(f"Detect recall: {rec_detect:.4f}%")
        print(f"Detect f1: {f1_detect:.4f}%")
        print()
        print(f"Absa acc: {acc:.4f}%")
        print(f"Absa precision: {pre_absa:.4f}%")
        print(f"Absa recall: {rec_absa:.4f}%")
        print(f"Absa f1: {f1_absa:.4f}%")

    return acc_detect, pre_detect, rec_detect, f1_detect, acc, pre_absa, rec_absa, f1_absa


def predict_df(model, df, tokenizer=None, model_tokenize=None,
               tokenizer_name='vinai/bartpho-word-base', device='cuda',
               processed=True, printout=True):
    """Evaluate the single-stage ``predict`` on every row of *df*.

    Args:
        model: model forwarded to ``predict``.
        df: table with a ``'text'`` column and one column per ``eng_tags``
            entry holding the gold polarity index.
        tokenizer: tokenizer to use; loaded from ``tokenizer_name`` if falsy.
        model_tokenize: forwarded to ``predict``.
        tokenizer_name: pretrained tokenizer name used when none is supplied.
        device: device forwarded to ``predict``.
        processed: whether the texts are already normalized and tokenized.
        printout: when True, print the aggregated metrics.

    Returns:
        ``(acc_detect, pre_detect, rec_detect, f1_detect,
        acc, pre_absa, rec_absa, f1_absa)``.
    """
    _prepare_model(model, device)
    tokenizer = _load_tokenizer(tokenizer, tokenizer_name)

    def predict_one(text):
        return predict(model, text, tokenizer, model_tokenize, device, processed)

    return _evaluate(df, predict_one, printout)


def predict_detect(model, text, tokenizer, model_tokenize=None, device='cuda',
                   processed=True, printout=False):
    """Predict one polarity index per tag in two stages.

    Stage one scores the two ``detect_labels`` templates for every tag.
    Stage two runs only for tags whose stage-one result is not index 0, and
    scores the polarity templates with "không có" left out.

    Args:
        model: seq2seq model called with ``input_ids`` / ``decoder_input_ids``.
        text: input sentence; preprocessed first when ``processed`` is False.
        tokenizer: tokenizer used for both the input and the target templates.
        model_tokenize: forwarded to ``tokenize`` when preprocessing.
        device: device the model and tensors are moved to.
        processed: whether ``text`` has already been normalized and tokenized.
        printout: when True, print the non-zero predictions as a dict.

    Returns:
        A list of ``no_tag`` indices into ``polarity_list``; 0 for tags that
        stage one rejected.
    """
    _prepare_model(model, device)
    if not processed:
        text = _preprocess(text, model_tokenize)

    detect_predicts = []
    for tag in tags:
        target_list = [tag.lower() + " " + detect_label.lower() + " được nhận_xét ."
                       for detect_label in detect_labels]
        detect_predicts.append(_best_target(model, tokenizer, text, target_list, device))

    predicts = []
    for i in range(no_tag):
        if detect_predicts[i] == 0:
            predicts.append(0)
            continue
        target_list = ["Nhận_xét " + tags[i].lower() + " " + polarity.lower() + " ."
                       for polarity in polarity_list if polarity != "không có"]
        # +1 maps the index among the remaining polarities back into
        # polarity_list, whose entry 0 was filtered out above.
        predicts.append(_best_target(model, tokenizer, text, target_list, device) + 1)

    if printout:
        _print_result(predicts)
    return predicts


def predict_df_detect(model, df, tokenizer=None, model_tokenize=None,
                      tokenizer_name='vinai/bartpho-word-base', device='cuda',
                      printout=True, processed=True):
    """Evaluate the two-stage ``predict_detect`` on every row of *df*.

    Args:
        model: model forwarded to ``predict_detect``.
        df: table with a ``'text'`` column and one column per ``eng_tags``
            entry holding the gold polarity index.
        tokenizer: tokenizer to use; loaded from ``tokenizer_name`` if falsy.
        model_tokenize: forwarded to ``predict_detect``.
        tokenizer_name: pretrained tokenizer name used when none is supplied.
        device: device forwarded to ``predict_detect``.
        printout: when True, print the aggregated metrics.
        processed: whether the texts are already normalized and tokenized.
            Added last so existing positional calls keep working.

    Returns:
        ``(acc_detect, pre_detect, rec_detect, f1_detect,
        acc, pre_absa, rec_absa, f1_absa)``.
    """
    _prepare_model(model, device)
    tokenizer = _load_tokenizer(tokenizer, tokenizer_name)

    def predict_one(text):
        # Keyword arguments: the previous positional call referenced an
        # undefined `processed`, swapped it with `device`, and went to the
        # single-stage predictor.
        return predict_detect(model, text, tokenizer, model_tokenize,
                              device=device, processed=processed)

    return _evaluate(df, predict_one, printout)