File size: 6,504 Bytes
4f08d2c 76a5b51 4f08d2c 76a5b51 4f08d2c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 |
import argparse
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import torch
import utils
from metrics import *
import sys
sys.path.insert(0,"./")
from Models.SentenceTransformersModel import SentenceTransformerModels
from Models.llm_embeddings import LLMEmbeddings
from main_args import get_args
from metrics import CosineMetric
def read_pertubed_data(filename, task, lang="en"):
    """Load a perturbed-dataset CSV into a DataFrame.

    Parameters
    ----------
    filename : str
        Path to the perturbed-dataset CSV file.
    task : str
        Task name; unused here, kept for interface compatibility with callers.
    lang : str, optional
        Language code; unused here, kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    FileNotFoundError
        If *filename* does not exist on disk.
    """
    if not os.path.exists(filename):
        # Bug fix: the original message was a literal placeholder and never
        # interpolated the missing path, making the error useless.
        raise FileNotFoundError(f"File {filename} not found.")
    return pd.read_csv(filename)
def run(args_model, dataset_name, target_lang, args_task, default_gpu="cuda", metric="cosine", save=False, batch_size=2):
    """Embed perturbed sentence variants and score their cosine similarity.

    Loads the perturbed CSV for (dataset, task, language), embeds all
    sentence columns in one batched pass, then adds per-row similarity
    (and difference) columns to the DataFrame.

    Parameters
    ----------
    args_model : str
        Embedding model identifier passed to ``LLMEmbeddings``.
    dataset_name : str
        Base name of the perturbed dataset file.
    target_lang : str
        Language code used in the data/result paths (e.g. "en").
    args_task : str
        Task name; aliases such as "Anto"/"Antonym" are accepted.
    default_gpu : str, optional
        Torch device string for the embedding model.
    metric : str, optional
        Unused here; kept for interface compatibility — TODO confirm intent.
    save : bool, optional
        When True, write the augmented DataFrame under ``./Results/``.
    batch_size : int, optional
        Batch size for ``encode_batch``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with similarity columns added.
    """
    # Map every accepted spelling onto the canonical task name so that e.g.
    # "Antonym" takes the same metric path as "anto". Previously aliased
    # spellings embedded sentences but silently computed no metrics.
    task_aliases = {
        "anto": {"Anto", "anto", "Antonym"},
        "jumbling": {"jumbling", "Jumbling", "jumb"},
        "syn": {"Syn", "syn", "Synonym"},
        "paraphrase": {"paraphrase", "Paraphrase", "para"},
    }
    canonical_task = next((k for k, v in task_aliases.items() if args_task in v), args_task)

    model = LLMEmbeddings(args_model, device=default_gpu)
    # NOTE: the on-disk path keeps the caller-supplied spelling of the task,
    # matching the original behavior.
    pertubed_data_path = f"./data/perturbed_dataset/{target_lang}/{args_task}/{dataset_name}_{args_task}_perturbed_{target_lang}.csv"
    data = read_pertubed_data(pertubed_data_path, args_task)

    print(f"\n*** Model {args_model} on {dataset_name} dataset for {args_task} task ***\n")

    # Columns to embed per task, in the row-interleaved order assumed by the
    # strided slicing below (stride == len(cols)).
    task_columns = {
        "anto": ["original_sentence", "paraphrased_sentence", "perturb_n1"],
        "jumbling": ["original_sentence", "paraphrased_sentence", "perturb_n1", "perturb_n2", "perturb_n3"],
        "syn": ["original_sentence", "perturb_n1", "perturb_n2", "perturb_n3"],
        "paraphrase": ["original_sentence", "paraphrased_sentence"],
    }
    sentences = []
    cols = task_columns.get(canonical_task)
    if cols is not None:
        for _, row in data[cols].iterrows():
            sentences.extend(row.values)

    # Batch process embeddings.
    embeddings = model.encode_batch(sentences, batch_size=batch_size)
    # Normalize to CPU numpy arrays regardless of backend.
    if args_model == "chatgpt":
        # chatgpt returns a list of per-sentence tensors/arrays.
        embeddings = [emb.cpu().numpy() if isinstance(emb, torch.Tensor) else emb for emb in embeddings]
        embeddings = np.array(embeddings)
    else:
        # Other models are assumed to return a single tensor/array.
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.cpu().numpy()

    # Sentences were interleaved per row, so slicing with stride len(cols)
    # recovers one column's embeddings at a time.
    if canonical_task == "anto":
        emb_org = embeddings[0::3]   # start at 0, step by 3
        emb_para = embeddings[1::3]  # start at 1, step by 3
        emb_anto = embeddings[2::3]  # start at 2, step by 3
        _, sim_para = utils.similarity_between_sent(emb_org, emb_para)
        _, sim_anto = utils.similarity_between_sent(emb_org, emb_anto)
        data["sim_org_para"] = sim_para
        data["sim_org_anto"] = sim_anto
        data["diff_org_para"] = np.array(sim_para) - np.array(sim_anto)
        print(f"""The summary for Antonym Criteria for {args_model} \n {data.describe()} """)
    elif canonical_task == "jumbling":
        emb_org = embeddings[0::5]   # start at 0, step by 5 (comment fixed; was "step by 3")
        emb_para = embeddings[1::5]
        emb_n1 = embeddings[2::5]
        emb_n2 = embeddings[3::5]
        emb_n3 = embeddings[4::5]
        # Compute similarity against the original for each perturbation level.
        _, sim_para = utils.similarity_between_sent(emb_org, emb_para)
        _, sim_n1 = utils.similarity_between_sent(emb_org, emb_n1)
        _, sim_n2 = utils.similarity_between_sent(emb_org, emb_n2)
        _, sim_n3 = utils.similarity_between_sent(emb_org, emb_n3)
        data["sim_org_para"] = sim_para
        data["sim_org_n1"] = sim_n1
        data["sim_org_n2"] = sim_n2
        data["sim_org_n3"] = sim_n3
        data["diff_org_para"] = sim_para - sim_para  # Zero as per original
        data["diff_org_n1"] = sim_para - sim_n1
        data["diff_org_n2"] = sim_para - sim_n2
        data["diff_org_n3"] = sim_para - sim_n3
        print(f"""The summary for Jumbling Criteria for {args_model} \n {data.describe()} """)
    elif canonical_task == "syn":
        emb_org = embeddings[0::4]  # start at 0, step by 4 (comment fixed; was "step by 3")
        emb_s1 = embeddings[1::4]
        emb_s2 = embeddings[2::4]
        emb_s3 = embeddings[3::4]
        _, sim_s1 = utils.similarity_between_sent(emb_org, emb_s1)
        _, sim_s2 = utils.similarity_between_sent(emb_org, emb_s2)
        _, sim_s3 = utils.similarity_between_sent(emb_org, emb_s3)
        data["sim_org_s1"] = sim_s1
        data["sim_org_s2"] = sim_s2
        data["sim_org_s3"] = sim_s3
        print(f"""The summary for Synonym Criteria for {args_model} \n {data.describe()} """)
    elif canonical_task == "paraphrase":
        emb_s1 = embeddings[0::2]  # start at 0, step by 2
        emb_s2 = embeddings[1::2]
        # Bug fix: similarity_between_sent returns (mean, sims) — previously
        # the whole tuple was stored in the "sim" column.
        _, sim = utils.similarity_between_sent(emb_s1, emb_s2)
        data["sim"] = sim
        print(f"""The summary for Paraphrase Criteria for {args_model} \n {data.describe()} """)

    if save:
        path = f"./Results/{target_lang}/{args_task}/{dataset_name}_{args_model}_{args_task}_metric.csv"
        # Robustness: create the results directory if it does not exist yet.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        data.to_csv(path)
        # Bug fix: missing f-prefix printed the literal "{path}".
        print(f"Data saved at path : {path} ")
    return data
if __name__ == "__main__":
    # A live trace function means we are running under a debugger; in that
    # case use a hard-coded config instead of parsing CLI arguments.
    debugging = sys.gettrace() is not None
    if debugging:
        # sentence-transformers/all-MiniLM-L6-v2
        config = {
            "args_model": "llama3",
            "dataset_name": "mrpc",
            "args_task": "syn",
            "default_gpu": "cuda:2",
            "save": False,
            "target_lang": "en",
        }
    else:
        cli = get_args()
        config = {
            "args_model": cli.model_name,
            "dataset_name": cli.perturbed_dataset,
            "args_task": cli.task,
            "default_gpu": cli.gpu,
            "save": cli.save,
            "target_lang": cli.target_lang,
            "metric": cli.metric,
            "batch_size": 2,
        }
    run(**config)
|