|
import argparse |
|
import numpy as np |
|
import os |
|
import pandas as pd |
|
from tqdm import tqdm |
|
import torch |
|
import utils |
|
from metrics import * |
|
import sys |
|
sys.path.insert(0,"./") |
|
from Models.SentenceTransformersModel import SentenceTransformerModels |
|
from Models.llm_embeddings import LLMEmbeddings |
|
from main_args import get_args |
|
from metrics import CosineMetric |
|
|
|
|
|
def read_pertubed_data(filename, task, lang="en"):
    """Load a perturbed-dataset CSV file into a DataFrame.

    Args:
        filename: Path of the CSV file to read.
        task: Not used by this function; kept for the existing call signature.
        lang: Not used by this function; defaults to ``"en"``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(filename)``.

    Raises:
        FileNotFoundError: If nothing exists at ``filename``.
    """
    csv_is_present = os.path.exists(filename)
    if csv_is_present:
        return pd.read_csv(filename)
    raise FileNotFoundError(f"File {filename} not found.")
|
|
|
|
|
# Every accepted spelling of a task name, mapped to its canonical key.
_TASK_ALIASES = {
    "Anto": "anto",
    "anto": "anto",
    "Antonym": "anto",
    "jumbling": "jumbling",
    "Jumbling": "jumbling",
    "jumb": "jumbling",
    "Syn": "syn",
    "syn": "syn",
    "Synonym": "syn",
    "paraphrase": "paraphrase",
    "Paraphrase": "paraphrase",
    "para": "paraphrase",
}

# CSV columns encoded for each canonical task, in the order they are
# interleaved in the flat sentence list.
_TASK_COLUMNS = {
    "anto": ["original_sentence", "paraphrased_sentence", "perturb_n1"],
    "jumbling": [
        "original_sentence",
        "paraphrased_sentence",
        "perturb_n1",
        "perturb_n2",
        "perturb_n3",
    ],
    "syn": ["original_sentence", "perturb_n1", "perturb_n2", "perturb_n3"],
    "paraphrase": ["original_sentence", "paraphrased_sentence"],
}

# Word used for each canonical task in the printed summary line.
_TASK_LABELS = {
    "anto": "Antonym",
    "jumbling": "Jumbling",
    "syn": "Synonym",
    "paraphrase": "Paraphrase",
}


def _embeddings_to_numpy(embeddings, per_item):
    """Convert torch tensors returned by the encoder to NumPy arrays.

    With ``per_item`` set, each element is converted on its own and the
    results are stacked with ``np.array``; otherwise only a single
    ``torch.Tensor`` is converted and anything else is returned unchanged.
    """
    # detach() lets tensors that still track gradients be converted as well.
    if per_item:
        return np.array([
            emb.detach().cpu().numpy() if isinstance(emb, torch.Tensor) else emb
            for emb in embeddings
        ])
    if isinstance(embeddings, torch.Tensor):
        return embeddings.detach().cpu().numpy()
    return embeddings


def _pair_similarity(emb_a, emb_b):
    """Return only the per-pair part of ``utils.similarity_between_sent``."""
    # The helper returns a (mean, per-pair values) pair; the mean is not used.
    _, sims = utils.similarity_between_sent(emb_a, emb_b)
    return sims


def _add_similarity_columns(data, task, embeddings):
    """Write the similarity / difference columns for ``task`` into ``data``."""
    stride = len(_TASK_COLUMNS[task])
    # Sentences were flattened row by row, so slot i of every row sits at
    # embeddings[i::stride].
    slots = [embeddings[offset::stride] for offset in range(stride)]
    emb_org = slots[0]

    if task == "anto":
        sim_para = _pair_similarity(emb_org, slots[1])
        sim_anto = _pair_similarity(emb_org, slots[2])
        data["sim_org_para"] = sim_para
        data["sim_org_anto"] = sim_anto
        data["diff_org_para"] = np.asarray(sim_para) - np.asarray(sim_anto)

    elif task == "jumbling":
        sim_para = _pair_similarity(emb_org, slots[1])
        sims_perturbed = [_pair_similarity(emb_org, slot) for slot in slots[2:]]

        data["sim_org_para"] = sim_para
        for idx, sims in enumerate(sims_perturbed, start=1):
            data[f"sim_org_n{idx}"] = sims

        # asarray keeps the subtraction valid even if the helper returns lists.
        para = np.asarray(sim_para)
        # NOTE(review): para - para is zero for every finite value; the column
        # is kept so the output schema stays as it was -- confirm the intent.
        data["diff_org_para"] = para - para
        for idx, sims in enumerate(sims_perturbed, start=1):
            data[f"diff_org_n{idx}"] = para - np.asarray(sims)

    elif task == "syn":
        for idx, slot in enumerate(slots[1:], start=1):
            data[f"sim_org_s{idx}"] = _pair_similarity(emb_org, slot)

    elif task == "paraphrase":
        # Store only the per-pair values, not the (mean, values) pair.
        data["sim"] = _pair_similarity(emb_org, slots[1])


def run(args_model, dataset_name, target_lang, args_task, default_gpu="cuda",
        metric="cosine", save=False, batch_size=2):
    """Embed a perturbed dataset and attach pairwise similarity columns.

    Builds an ``LLMEmbeddings`` model, reads the perturbed CSV for the given
    dataset / task / language, encodes the task's sentence columns, adds the
    similarity (and, for some tasks, difference) columns to the DataFrame,
    prints ``data.describe()`` and optionally writes the result to CSV.

    Args:
        args_model: Model identifier passed to ``LLMEmbeddings``; also used in
            the output file name. ``"chatgpt"`` selects per-item conversion of
            the returned embeddings.
        dataset_name: Dataset name used in the input and output file names.
        target_lang: Language code used in the input and output paths.
        args_task: Task name or alias (e.g. ``"anto"``, ``"Antonym"``,
            ``"jumb"``, ``"Syn"``, ``"para"``). Used verbatim in file paths.
        default_gpu: Device string passed to ``LLMEmbeddings``.
        metric: Accepted for interface compatibility; not read by this
            function.
        save: When true, write the DataFrame under ``./Results/``.
        batch_size: Batch size forwarded to ``model.encode_batch``.

    Returns:
        The DataFrame read from disk, with the new columns added.

    Raises:
        FileNotFoundError: If the perturbed CSV does not exist.
        ValueError: If ``args_task`` is not a recognised task name or alias.
    """
    model = LLMEmbeddings(args_model, device=default_gpu)

    pertubed_data_path = (
        f"./data/perturbed_dataset/{target_lang}/{args_task}/"
        f"{dataset_name}_{args_task}_perturbed_{target_lang}.csv"
    )
    data = read_pertubed_data(pertubed_data_path, args_task)

    # Resolve aliases once so that sentence collection and scoring agree on
    # which task is being run.
    task = _TASK_ALIASES.get(args_task)
    if task is None:
        raise ValueError(
            f"Unsupported task {args_task!r}; "
            f"expected one of {sorted(_TASK_ALIASES)}."
        )

    print(f"\n*** Model {args_model} on {dataset_name} dataset for {args_task} task ***\n")

    # Row-major flatten: row0 col0, row0 col1, ..., row1 col0, ...
    sentences = data[_TASK_COLUMNS[task]].to_numpy().ravel().tolist()

    # Assumes encode_batch returns one embedding per sentence, in input
    # order -- TODO confirm against LLMEmbeddings.
    embeddings = model.encode_batch(sentences, batch_size=batch_size)
    embeddings = _embeddings_to_numpy(embeddings, per_item=(args_model == "chatgpt"))

    _add_similarity_columns(data, task, embeddings)
    print(f"""The summary for {_TASK_LABELS[task]} Criteria for {args_model} \n {data.describe()} """)

    if save:
        path = f"./Results/{target_lang}/{args_task}/{dataset_name}_{args_model}_{args_task}_metric.csv"
        # Create the results directory on first use so to_csv cannot fail on it.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        data.to_csv(path)
        print(f"Data saved at path : {path} ")

    return data
|
|
|
if __name__ == "__main__":
    # sys.gettrace() is None unless a trace function (e.g. a debugger) is
    # installed; in that case a fixed configuration replaces the CLI arguments.
    tracer_active = sys.gettrace() is not None

    if tracer_active:
        # Fixed settings for traced runs; metric and batch_size fall back to
        # the defaults declared by run().
        config = dict(
            args_model="llama3",
            dataset_name="mrpc",
            args_task="syn",
            default_gpu="cuda:2",
            save=False,
            target_lang="en",
        )
    else:
        cli_args = get_args()
        config = dict(
            args_model=cli_args.model_name,
            dataset_name=cli_args.perturbed_dataset,
            args_task=cli_args.task,
            default_gpu=cli_args.gpu,
            save=cli_args.save,
            target_lang=cli_args.target_lang,
            metric=cli_args.metric,
            batch_size=2,
        )

    run(**config)
|
|