Spaces:

hHoai
/

demo-asbvn

Runtime error

App Files Files Community

hHoai committed 19 days ago

Commit

68c0ef8

verified ·

1 Parent(s): 939d153

Update bartpho/utils.py

Browse files

Files changed (1) hide show

bartpho/utils.py +272 -270

bartpho/utils.py CHANGED Viewed

@@ -1,270 +1,272 @@
-import torch
-import numpy as np
-from bartpho.preprocess import tokenize, normalize
-tag_dict = {
-    "RESTAURANT#GENERAL": "chung về nhà_hàng",
-    "RESTAURANT#PRICES": "giá của nhà_hàng",
-    "RESTAURANT#MISCELLANEOUS": "tổng_quát về nhà_hàng",
-    "FOOD#PRICES": "giá đồ ăn",
-    "FOOD#QUALITY": "chất_lượng đồ ăn",
-    "FOOD#STYLE&OPTIONS": "phong_cách và lựa_chọn đồ ăn",
-    "DRINKS#PRICES": "giá đồ uống",
-    "DRINKS#QUALITY": "chất_lượng đồ uống",
-    "DRINKS#STYLE&OPTIONS": "phong_cách và lựa_chọn đồ uống",
-    "AMBIENCE#GENERAL": "bầu không_khí",
-    "SERVICE#GENERAL": "dịch_vụ",
-    "LOCATION#GENERAL": "vị_trí",
-}
-polarity_dict = {
-    "không có": "không có",
-    "positive": "tích_cực",
-    "neutral": "trung_lập",
-    "negative": "tiêu_cực"
-}
-polarity_list = ["không có", "tích_cực", "trung_lập", "tiêu_cực"]
-tags = ["chung về nhà_hàng", "giá của nhà_hàng", "tổng_quát về nhà_hàng", "giá đồ ăn",
-        "chất_lượng đồ ăn", "phong_cách và lựa_chọn đồ ăn", "giá đồ uống", "chất_lượng đồ uống",
-        "phong_cách và lựa_chọn đồ uống", "bầu không_khí", "dịch_vụ", "vị_trí"]
-eng_tags = ["RESTAURANT#GENERAL", "RESTAURANT#PRICES", "RESTAURANT#MISCELLANEOUS", "FOOD#PRICES",
-            "FOOD#QUALITY", "FOOD#STYLE&OPTIONS", "DRINKS#PRICES", "DRINKS#QUALITY",
-            "DRINKS#STYLE&OPTIONS", "AMBIENCE#GENERAL", "SERVICE#GENERAL", "LOCATION#GENERAL"]
-eng_polarity = ["không có", "positive", "neutral", "negative"]
-detect_labels = ['không', 'có']
-no_polarity = len(polarity_list)
-no_tag = len(tags)
-def predict(model, text, tokenizer, model_tokenize=None, device='cuda', processed=True, printout=False):
-    predicts = []
-    model.to(device)
-    model.eval()
-    model.config.use_cache = False
-    if not processed:
-        text = normalize(text)
-        text = tokenize(text, model_tokenize)
-    for i in range(no_tag):
-        tag = tags[i]
-        score_list = []
-        input_ids = tokenizer([text] * no_polarity, return_tensors='pt')['input_ids']
-        target_list = ["Nhận_xét " + tag.lower() + " " + polarity.lower() + " ." for polarity in polarity_list]
-        output_ids = tokenizer(target_list, return_tensors='pt', padding=True, truncation=True)['input_ids']
-        with torch.no_grad():
-            output = model(input_ids=input_ids.to(device), decoder_input_ids=output_ids.to(device))[0]
-            logits = output.softmax(dim=-1).to('cpu').numpy()
-        for m in range(no_polarity):
-            score = 1
-            for n in range(logits[m].shape[0] - 2):
-                score *= logits[m][n][output_ids[m][n+1]]
-            score_list.append(score)
-        predict = np.argmax(score_list)
-        predicts.append(predict)
-    if printout:
-        result = {}
-        for i in range(no_tag):
-            if predicts[i] != 0:
-                result[eng_tags[i]] = eng_polarity[predicts[i]]
-        print(result)
-    return predicts
-def predict_df(model, df, tokenizer=None, model_tokenize=None, tokenizer_name='vinai/bartpho-word-base',
-               device='cuda', processed=True, printout=True):
-    model.eval()
-    model.to(device)
-    model.config.use_cache = False
-    if not tokenizer:
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    count_acc = count_detect = f1_detect = f1_absa = pre_detect = rec_detect = pre_absa = rec_absa = 0
-    total_f1 = len(df)
-    total = len(df) * no_tag
-    for i in range(total_f1):
-        text = df['text'][i]
-        labels = [df[x][i] for x in eng_tags]
-        predicts = predict(model, text, tokenizer, model_tokenize, device, processed)
-        labels_detect = [i for i in range(no_tag) if labels[i] != 0]
-        predicts_detect = [i for i in range(no_tag) if predicts[i] != 0]
-        common_detect = [x for x in labels_detect if x in predicts_detect]
-        if common_detect:
-            precision_detect = len(common_detect) / len(predicts_detect)
-            recall_detect = len(common_detect) / len(labels_detect)
-            f1_detect += (2 * precision_detect * recall_detect / (precision_detect + recall_detect))
-            pre_detect += precision_detect
-            rec_detect += recall_detect
-            labels_absa = [str(i) + '-' + str(labels[i]) for i in range(no_tag) if labels[i] != 0]
-            predicts_absa = [str(i) + '-' + str(predicts[i]) for i in range(no_tag) if predicts[i] != 0]
-            common_absa = [x for x in labels_absa if x in predicts_absa]
-            if common_absa:
-                precision_absa = len(common_absa) / len(predicts_absa)
-                recall_absa = len(common_absa) / len(labels_absa)
-                f1_absa += (2 * precision_absa * recall_absa / (precision_absa + recall_absa))
-                pre_absa += precision_absa
-                rec_absa += recall_absa
-        for j in range(no_tag):
-            if labels[j] == predicts[j]:
-                count_acc += 1
-                count_detect += 1
-            else:
-                if labels[j] != 0 and predicts[j] != 0:
-                    count_detect += 1
-    acc_detect = count_detect / total
-    pre_detect = pre_detect / total_f1
-    rec_detect = rec_detect / total_f1
-    f1_detect = f1_detect / total_f1
-    acc = count_acc / total
-    pre_absa = pre_absa / total_f1
-    rec_absa = rec_absa / total_f1
-    f1_absa = f1_absa / total_f1
-    if printout:
-        print(f"Detect acc: {acc_detect:.4f}%")
-        print(f"Detect precision: {pre_detect:.4f}%")
-        print(f"Detect recall: {rec_detect:.4f}%")
-        print(f"Detect f1: {f1_detect:.4f}%")
-        print()
-        print(f"Absa acc: {acc:.4f}%")
-        print(f"Absa precision: {pre_absa:.4f}%")
-        print(f"Absa recall: {rec_absa:.4f}%")
-        print(f"Absa f1: {f1_absa:.4f}%")
-    return acc_detect, pre_detect, rec_detect, f1_detect, acc, pre_absa, rec_absa, f1_absa
-def predict_detect(model, text, tokenizer, model_tokenize=None, device='cuda', processed=True, printout=False):
-    detect_predicts = []
-    model.to(device)
-    model.eval()
-    model.config.use_cache = False
-    if not processed:
-        text = normalize(text)
-        text = tokenize(text, model_tokenize)
-    for i in range(no_tag):
-        tag = tags[i]
-        detect_score_list = []
-        input_ids = tokenizer([text] * 2, return_tensors='pt')['input_ids']
-        target_list = [tag.lower() + " " + detect_label.lower() + " được nhận_xét ." for detect_label in detect_labels]
-        output_ids = tokenizer(target_list, return_tensors='pt', padding=True, truncation=True)['input_ids']
-        with torch.no_grad():
-            output = model(input_ids=input_ids.to(device), decoder_input_ids=output_ids.to(device))[0]
-            logits = output.softmax(dim=-1).to('cpu').numpy()
-        for m in range(2):
-            detect_score = 1
-            for n in range(logits[m].shape[0] - 2):
-                detect_score *= logits[m][n][output_ids[m][n+1]]
-            detect_score_list.append(detect_score)
-        detect_predict = np.argmax(detect_score_list)
-        detect_predicts.append(detect_predict)
-    predicts = []
-    for i in range(no_tag):
-        if detect_predicts[i] == 0:
-            predicts.append(0)
-        else:
-            tag = tags[i]
-            score_list = []
-            input_ids = tokenizer([text] * (no_polarity - 1), return_tensors='pt')['input_ids']
-            target_list = ["Nhận_xét " + tag.lower() + " " + polarity.lower() + " ." for polarity in polarity_list if polarity != "không có"]
-            output_ids = tokenizer(target_list, return_tensors='pt', padding=True, truncation=True)['input_ids']
-            with torch.no_grad():
-                output = model(input_ids=input_ids.to(device), decoder_input_ids=output_ids.to(device))[0]
-                logits = output.softmax(dim=-1).to('cpu').numpy()
-            for m in range(no_polarity - 1):
-                score = 1
-                for n in range(logits[m].shape[0] - 2):
-                    score *= logits[m][n][output_ids[m][n + 1]]
-                score_list.append(score)
-            predict = np.argmax(score_list) + 1
-            predicts.append(predict)
-    if printout:
-        result = {}
-        for i in range(no_tag):
-            if predicts[i] != 0:
-                result[eng_tags[i]] = eng_polarity[predicts[i]]
-        print(result)
-    return predicts
-def predict_df_detect(model, df, tokenizer=None, model_tokenize=None, tokenizer_name='vinai/bartpho-word-base',
-                      device='cuda', printout=True):
-    model.eval()
-    model.to(device)
-    model.config.use_cache = False
-    if not tokenizer:
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-    count_acc = count_detect = f1_detect = f1_absa = pre_detect = rec_detect = pre_absa = rec_absa = 0
-    total_f1 = len(df)
-    total = len(df) * no_tag
-    for i in range(total_f1):
-        text = df['text'][i]
-        labels = [df[x][i] for x in eng_tags]
-        predicts = predict(model, text, tokenizer, model_tokenize, processed, device)
-        labels_detect = [i for i in range(no_tag) if labels[i] != 0]
-        predicts_detect = [i for i in range(no_tag) if predicts[i] != 0]
-        common_detect = [x for x in labels_detect if x in predicts_detect]
-        if common_detect:
-            precision_detect = len(common_detect) / len(predicts_detect)
-            recall_detect = len(common_detect) / len(labels_detect)
-            f1_detect += (2 * precision_detect * recall_detect / (precision_detect + recall_detect))
-            pre_detect += precision_detect
-            rec_detect += recall_detect
-            labels_absa = [str(i) + '-' + str(labels[i]) for i in range(no_tag) if labels[i] != 0]
-            predicts_absa = [str(i) + '-' + str(predicts[i]) for i in range(no_tag) if predicts[i] != 0]
-            common_absa = [x for x in labels_absa if x in predicts_absa]
-            if common_absa:
-                precision_absa = len(common_absa) / len(predicts_absa)
-                recall_absa = len(common_absa) / len(labels_absa)
-                f1_absa += (2 * precision_absa * recall_absa / (precision_absa + recall_absa))
-                pre_absa += precision_absa
-                rec_absa += recall_absa
-        for j in range(no_tag):
-            if labels[j] == predicts[j]:
-                count_acc += 1
-                count_detect += 1
-            else:
-                if labels[j] != 0 and predicts[j] != 0:
-                    count_detect += 1
-    acc_detect = count_detect / total
-    pre_detect = pre_detect / total_f1
-    rec_detect = rec_detect / total_f1
-    f1_detect = f1_detect / total_f1
-    acc = count_acc / total
-    pre_absa = pre_absa / total_f1
-    rec_absa = rec_absa / total_f1
-    f1_absa = f1_absa / total_f1
-    if printout:
-        print(f"Detect acc: {acc_detect:.4f}%")
-        print(f"Detect precision: {pre_detect:.4f}%")
-        print(f"Detect recall: {rec_detect:.4f}%")
-        print(f"Detect f1: {f1_detect:.4f}%")
-        print()
-        print(f"Absa acc: {acc:.4f}%")
-        print(f"Absa precision: {pre_absa:.4f}%")
-        print(f"Absa recall: {rec_absa:.4f}%")
-        print(f"Absa f1: {f1_absa:.4f}%")
-    return acc_detect, pre_detect, rec_detect, f1_detect, acc, pre_absa, rec_absa, f1_absa

+import torch
+import numpy as np
+from bartpho.preprocess import tokenize, normalize
+# English aspect label -> Vietnamese word-segmented phrase used inside the
+# generation templates. The English keys double as DataFrame column names
+# in predict_df / predict_df_detect.
+tag_dict = {
+    "RESTAURANT#GENERAL": "chung về nhà_hàng",
+    "RESTAURANT#PRICES": "giá của nhà_hàng",
+    "RESTAURANT#MISCELLANEOUS": "tổng_quát về nhà_hàng",
+    "FOOD#PRICES": "giá đồ ăn",
+    "FOOD#QUALITY": "chất_lượng đồ ăn",
+    "FOOD#STYLE&OPTIONS": "phong_cách và lựa_chọn đồ ăn",
+    "DRINKS#PRICES": "giá đồ uống",
+    "DRINKS#QUALITY": "chất_lượng đồ uống",
+    "DRINKS#STYLE&OPTIONS": "phong_cách và lựa_chọn đồ uống",
+    "AMBIENCE#GENERAL": "bầu không_khí",
+    "SERVICE#GENERAL": "dịch_vụ",
+    "LOCATION#GENERAL": "vị_trí",
+}
+# English polarity name -> Vietnamese template phrase. The first entry
+# ("không có") is class 0, which the predict* functions leave out of their
+# printed result.
+polarity_dict = {
+    "không có": "không có",
+    "positive": "tích_cực",
+    "neutral": "trung_lập",
+    "negative": "tiêu_cực"
+}
+# Parallel lists taken from the dicts above (dicts keep insertion order):
+# position k of an English list matches position k of its Vietnamese
+# partner, and those positions are the class ids returned by predict*.
+eng_tags = list(tag_dict)
+tags = list(tag_dict.values())
+eng_polarity = list(polarity_dict)
+polarity_list = list(polarity_dict.values())
+# The two words slotted into the detection template in predict_detect;
+# index 0 there is treated as "skip the polarity stage for this tag".
+detect_labels = ['không', 'có']
+no_polarity = len(polarity_list)
+no_tag = len(tags)
+def predict(model, text, tokenizer, model_tokenize=None, processed=True, printout=False):
+    """Choose, for every aspect tag, the polarity whose template scores highest.
+
+    For each tag, one sentence "Nhận_xét <tag> <polarity> ." per entry of
+    polarity_list is given to the model as decoder input next to `text`.
+    A sentence's score is the product of the softmax probabilities read off
+    for its tokens; the arg-max over the sentences is the prediction.
+
+    Args:
+        model: called as model(input_ids=..., decoder_input_ids=...); moved to
+            the CPU and switched to eval mode here, with config.use_cache off.
+        text: the review text. When `processed` is falsy it is first passed
+            through normalize() and then tokenize().
+        tokenizer: callable returning a mapping that has an 'input_ids' entry.
+        model_tokenize: forwarded to tokenize(); unused when `processed` is truthy.
+        processed: whether `text` is already normalized and tokenized.
+        printout: when truthy, print a {english_tag: english_polarity} dict of
+            the tags whose predicted class is not 0.
+
+    Returns:
+        A list with one entry per tag (same order as `tags`), each an index
+        into polarity_list as produced by np.argmax.
+    """
+    device = 'cpu'
+    model.to(device)
+    model.eval()
+    model.config.use_cache = False
+    if not processed:
+        text = tokenize(normalize(text), model_tokenize)
+    predicts = []
+    for tag in tags:
+        source_ids = tokenizer([text] * no_polarity, return_tensors='pt')['input_ids']
+        candidates = []
+        for polarity in polarity_list:
+            candidates.append("Nhận_xét " + tag.lower() + " " + polarity.lower() + " .")
+        target_ids = tokenizer(candidates, return_tensors='pt', padding=True, truncation=True)['input_ids']
+        with torch.no_grad():
+            raw = model(input_ids=source_ids.to(device), decoder_input_ids=target_ids.to(device))[0]
+            probs = raw.softmax(dim=-1).to('cpu').numpy()
+        scores = []
+        for row in range(no_polarity):
+            running = 1
+            # Output position `step` is read as the distribution for target
+            # token `step + 1`; the last two positions of each (padded) row
+            # are not used.
+            for step in range(probs[row].shape[0] - 2):
+                running *= probs[row][step][target_ids[row][step + 1]]
+            scores.append(running)
+        predicts.append(np.argmax(scores))
+    if printout:
+        # Class 0 is omitted from the printed summary.
+        print({eng_tags[pos]: eng_polarity[cls] for pos, cls in enumerate(predicts) if cls != 0})
+    return predicts
+def predict_df(model, df, tokenizer=None, model_tokenize=None, tokenizer_name='vinai/bartpho-word-base', processed=True, printout=True):
+    """Run predict() on every row of `df` and compute detection / ABSA metrics.
+
+    Args:
+        model: forwarded to predict(); moved to the CPU and put in eval mode.
+        df: table indexed as df[column][row]. Needs a 'text' column and one
+            column per entry of eng_tags holding the gold class id (0 is
+            treated as "aspect absent"). Rows are addressed by the integers
+            0 .. len(df) - 1.
+        tokenizer: forwarded to predict(); when falsy one is loaded from
+            `tokenizer_name`.
+        model_tokenize: forwarded to predict().
+        tokenizer_name: name passed to AutoTokenizer.from_pretrained.
+        processed: forwarded to predict(); whether the texts are already
+            normalized and tokenized.
+        printout: print the eight metrics when truthy.
+
+    Returns:
+        (acc_detect, pre_detect, rec_detect, f1_detect,
+         acc, pre_absa, rec_absa, f1_absa). Accuracies are over all
+        row/tag pairs; precision, recall and F1 are per-row values summed
+        and divided by the number of rows. All are fractions, not
+        percentages.
+
+    Raises:
+        ZeroDivisionError: if `df` has no rows.
+    """
+    model.eval()
+    device = 'cpu'
+    model.to(device)
+    model.config.use_cache = False
+    if not tokenizer:
+        # NOTE(review): AutoTokenizer has no import in the visible part of
+        # this module (presumably transformers.AutoTokenizer); confirm it is
+        # in scope, otherwise this fallback raises NameError.
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    count_acc = count_detect = f1_detect = f1_absa = pre_detect = rec_detect = pre_absa = rec_absa = 0
+    total_f1 = len(df)
+    total = len(df) * no_tag
+    for i in range(total_f1):
+        text = df['text'][i]
+        labels = [df[x][i] for x in eng_tags]
+        # predict() has no `device` parameter. Passing device positionally
+        # bound 'cpu' to `processed` and `processed` to `printout`, so pass
+        # `processed` by keyword instead.
+        predicts = predict(model, text, tokenizer, model_tokenize, processed=processed)
+        # Detection: which tags are present at all (class id != 0).
+        labels_detect = [t for t in range(no_tag) if labels[t] != 0]
+        predicts_detect = [t for t in range(no_tag) if predicts[t] != 0]
+        common_detect = [x for x in labels_detect if x in predicts_detect]
+        # A row with no overlap adds 0 to every per-row sum but still counts
+        # in the denominator (total_f1).
+        if common_detect:
+            precision_detect = len(common_detect) / len(predicts_detect)
+            recall_detect = len(common_detect) / len(labels_detect)
+            f1_detect += (2 * precision_detect * recall_detect / (precision_detect + recall_detect))
+            pre_detect += precision_detect
+            rec_detect += recall_detect
+            # ABSA: tag and polarity must both match, encoded as "<tag>-<class>".
+            labels_absa = [str(t) + '-' + str(labels[t]) for t in range(no_tag) if labels[t] != 0]
+            predicts_absa = [str(t) + '-' + str(predicts[t]) for t in range(no_tag) if predicts[t] != 0]
+            common_absa = [x for x in labels_absa if x in predicts_absa]
+            if common_absa:
+                precision_absa = len(common_absa) / len(predicts_absa)
+                recall_absa = len(common_absa) / len(labels_absa)
+                f1_absa += (2 * precision_absa * recall_absa / (precision_absa + recall_absa))
+                pre_absa += precision_absa
+                rec_absa += recall_absa
+        for j in range(no_tag):
+            if labels[j] == predicts[j]:
+                count_acc += 1
+                count_detect += 1
+            elif labels[j] != 0 and predicts[j] != 0:
+                # Wrong polarity, but the tag was correctly detected as present.
+                count_detect += 1
+    acc_detect = count_detect / total
+    pre_detect = pre_detect / total_f1
+    rec_detect = rec_detect / total_f1
+    f1_detect = f1_detect / total_f1
+    acc = count_acc / total
+    pre_absa = pre_absa / total_f1
+    rec_absa = rec_absa / total_f1
+    f1_absa = f1_absa / total_f1
+    if printout:
+        # NOTE(review): the values are fractions although the text appends '%'.
+        print(f"Detect acc: {acc_detect:.4f}%")
+        print(f"Detect precision: {pre_detect:.4f}%")
+        print(f"Detect recall: {rec_detect:.4f}%")
+        print(f"Detect f1: {f1_detect:.4f}%")
+        print()
+        print(f"Absa acc: {acc:.4f}%")
+        print(f"Absa precision: {pre_absa:.4f}%")
+        print(f"Absa recall: {rec_absa:.4f}%")
+        print(f"Absa f1: {f1_absa:.4f}%")
+    return acc_detect, pre_detect, rec_detect, f1_detect, acc, pre_absa, rec_absa, f1_absa
+def predict_detect(model, text, tokenizer, model_tokenize=None, processed=True, printout=False):
+    """Two-stage prediction: detect each aspect first, then pick its polarity.
+
+    Stage 1 scores the two sentences "<tag> <không|có> được nhận_xét ." for
+    every tag. Stage 2 runs only for tags where stage 1 chose index 1 and
+    scores "Nhận_xét <tag> <polarity> ." for every polarity except
+    "không có". Sentences are scored by the product of the softmax
+    probabilities read off for their tokens.
+
+    Args:
+        model: called as model(input_ids=..., decoder_input_ids=...); moved to
+            the CPU and switched to eval mode here, with config.use_cache off.
+        text: the review text. When `processed` is falsy it is first passed
+            through normalize() and then tokenize().
+        tokenizer: callable returning a mapping that has an 'input_ids' entry.
+        model_tokenize: forwarded to tokenize(); unused when `processed` is truthy.
+        processed: whether `text` is already normalized and tokenized.
+        printout: when truthy, print a {english_tag: english_polarity} dict of
+            the tags whose predicted class is not 0.
+
+    Returns:
+        A list with one entry per tag (same order as `tags`): 0 when stage 1
+        rejected the tag, otherwise the stage-2 arg-max plus 1, i.e. an
+        index into polarity_list.
+    """
+    device = 'cpu'
+    model.to(device)
+    model.eval()
+    model.config.use_cache = False
+    if not processed:
+        text = tokenize(normalize(text), model_tokenize)
+
+    def best_template(templates, count):
+        # Index of the template whose token-probability product is largest.
+        source_ids = tokenizer([text] * count, return_tensors='pt')['input_ids']
+        target_ids = tokenizer(templates, return_tensors='pt', padding=True, truncation=True)['input_ids']
+        with torch.no_grad():
+            raw = model(input_ids=source_ids.to(device), decoder_input_ids=target_ids.to(device))[0]
+            probs = raw.softmax(dim=-1).to('cpu').numpy()
+        scores = []
+        for row in range(count):
+            running = 1
+            # Output position `step` is read as the distribution for target
+            # token `step + 1`; the last two positions of each row are unused.
+            for step in range(probs[row].shape[0] - 2):
+                running *= probs[row][step][target_ids[row][step + 1]]
+            scores.append(running)
+        return np.argmax(scores)
+
+    # Stage 1: one yes/no decision per tag.
+    detect_predicts = []
+    for tag in tags:
+        templates = [tag.lower() + " " + word.lower() + " được nhận_xét ." for word in detect_labels]
+        detect_predicts.append(best_template(templates, 2))
+
+    # Stage 2: polarity only for the tags that stage 1 accepted.
+    predicts = []
+    for tag, detected in zip(tags, detect_predicts):
+        if detected == 0:
+            predicts.append(0)
+            continue
+        templates = ["Nhận_xét " + tag.lower() + " " + polarity.lower() + " ."
+                     for polarity in polarity_list if polarity != "không có"]
+        # +1 shifts the result past the skipped "không có" slot of polarity_list.
+        predicts.append(best_template(templates, no_polarity - 1) + 1)
+
+    if printout:
+        print({eng_tags[pos]: eng_polarity[cls] for pos, cls in enumerate(predicts) if cls != 0})
+    return predicts
+def predict_df_detect(model, df, tokenizer=None, model_tokenize=None, tokenizer_name='vinai/bartpho-word-base', printout=True, processed=True):
+    """Run predict_detect() on every row of `df` and compute detection / ABSA metrics.
+
+    Args:
+        model: forwarded to predict_detect(); moved to the CPU and put in
+            eval mode.
+        df: table indexed as df[column][row]. Needs a 'text' column and one
+            column per entry of eng_tags holding the gold class id (0 is
+            treated as "aspect absent"). Rows are addressed by the integers
+            0 .. len(df) - 1.
+        tokenizer: forwarded to predict_detect(); when falsy one is loaded
+            from `tokenizer_name`.
+        model_tokenize: forwarded to predict_detect().
+        tokenizer_name: name passed to AutoTokenizer.from_pretrained.
+        printout: print the eight metrics when truthy.
+        processed: forwarded to predict_detect(); whether the texts are
+            already normalized and tokenized. Added as the last parameter
+            so existing positional calls keep their meaning.
+
+    Returns:
+        (acc_detect, pre_detect, rec_detect, f1_detect,
+         acc, pre_absa, rec_absa, f1_absa). Accuracies are over all
+        row/tag pairs; precision, recall and F1 are per-row values summed
+        and divided by the number of rows. All are fractions, not
+        percentages.
+
+    Raises:
+        ZeroDivisionError: if `df` has no rows.
+    """
+    model.eval()
+    device = 'cpu'
+    model.to(device)
+    model.config.use_cache = False
+    if not tokenizer:
+        # NOTE(review): AutoTokenizer has no import in the visible part of
+        # this module (presumably transformers.AutoTokenizer); confirm it is
+        # in scope, otherwise this fallback raises NameError.
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    count_acc = count_detect = f1_detect = f1_absa = pre_detect = rec_detect = pre_absa = rec_absa = 0
+    total_f1 = len(df)
+    total = len(df) * no_tag
+    for i in range(total_f1):
+        text = df['text'][i]
+        labels = [df[x][i] for x in eng_tags]
+        # Use the two-stage predictor this function is named after. The old
+        # call went to predict() with an undefined `processed` name and a
+        # `device` argument that predict() does not accept.
+        predicts = predict_detect(model, text, tokenizer, model_tokenize, processed=processed)
+        # Detection: which tags are present at all (class id != 0).
+        labels_detect = [t for t in range(no_tag) if labels[t] != 0]
+        predicts_detect = [t for t in range(no_tag) if predicts[t] != 0]
+        common_detect = [x for x in labels_detect if x in predicts_detect]
+        # A row with no overlap adds 0 to every per-row sum but still counts
+        # in the denominator (total_f1).
+        if common_detect:
+            precision_detect = len(common_detect) / len(predicts_detect)
+            recall_detect = len(common_detect) / len(labels_detect)
+            f1_detect += (2 * precision_detect * recall_detect / (precision_detect + recall_detect))
+            pre_detect += precision_detect
+            rec_detect += recall_detect
+            # ABSA: tag and polarity must both match, encoded as "<tag>-<class>".
+            labels_absa = [str(t) + '-' + str(labels[t]) for t in range(no_tag) if labels[t] != 0]
+            predicts_absa = [str(t) + '-' + str(predicts[t]) for t in range(no_tag) if predicts[t] != 0]
+            common_absa = [x for x in labels_absa if x in predicts_absa]
+            if common_absa:
+                precision_absa = len(common_absa) / len(predicts_absa)
+                recall_absa = len(common_absa) / len(labels_absa)
+                f1_absa += (2 * precision_absa * recall_absa / (precision_absa + recall_absa))
+                pre_absa += precision_absa
+                rec_absa += recall_absa
+        for j in range(no_tag):
+            if labels[j] == predicts[j]:
+                count_acc += 1
+                count_detect += 1
+            elif labels[j] != 0 and predicts[j] != 0:
+                # Wrong polarity, but the tag was correctly detected as present.
+                count_detect += 1
+    acc_detect = count_detect / total
+    pre_detect = pre_detect / total_f1
+    rec_detect = rec_detect / total_f1
+    f1_detect = f1_detect / total_f1
+    acc = count_acc / total
+    pre_absa = pre_absa / total_f1
+    rec_absa = rec_absa / total_f1
+    f1_absa = f1_absa / total_f1
+    if printout:
+        # NOTE(review): the values are fractions although the text appends '%'.
+        print(f"Detect acc: {acc_detect:.4f}%")
+        print(f"Detect precision: {pre_detect:.4f}%")
+        print(f"Detect recall: {rec_detect:.4f}%")
+        print(f"Detect f1: {f1_detect:.4f}%")
+        print()
+        print(f"Absa acc: {acc:.4f}%")
+        print(f"Absa precision: {pre_absa:.4f}%")
+        print(f"Absa recall: {rec_absa:.4f}%")
+        print(f"Absa f1: {f1_absa:.4f}%")
+    return acc_detect, pre_detect, rec_detect, f1_detect, acc, pre_absa, rec_absa, f1_absa