Upload folder using huggingface_hub
- .gitattributes +2 -0
- .gitignore +5 -0
- Models/.env +0 -0
- Models/MultilingualTranslationModel.py +159 -0
- Models/SentenceTransformersModel.py +24 -0
- Models/llm_embeddings.py +135 -0
- README.md +111 -0
- data/__init__.py +0 -0
- data/original_datasets/En/Readme.txt +29 -0
- data/original_datasets/En/mrpc.csv +0 -0
- data/original_datasets/En/paw_wiki.tsv +3 -0
- data/original_datasets/En/qoura.csv +3 -0
- requiremnets.txt +14 -0
- src/MultilingualTranslation/args_parser.py +53 -0
- src/MultilingualTranslation/translation.py +104 -0
- src/SentencePerturbation/perturbation_args.py +62 -0
- src/SentencePerturbation/sentence_perturbation.py +207 -0
- src/SentencePerturbation/word_replacer.py +102 -0
- src/__init__.py +4 -0
- src/adjustment_factor.py +22 -0
- src/evaluate.py +167 -0
- src/main_args.py +69 -0
- src/metrics.py +147 -0
- src/utils.py +80 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/original_datasets/En/paw_wiki.tsv filter=lfs diff=lfs merge=lfs -text
+data/original_datasets/En/qoura.csv filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,5 @@
+/data/perturbed_dataset/
+/.python-version
+/src/__pycache__
+*.pyc
+src/*.ipynb
Models/.env
ADDED
File without changes
Models/MultilingualTranslationModel.py
ADDED
@@ -0,0 +1,159 @@
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+import os
+from tqdm import tqdm
+import pandas as pd
+import time
+import sys
+from datasets import load_dataset
+from src.utils import read_data
+
+class NLLBTranslator:
+    def __init__(self, model_name="facebook/nllb-200-3.3B"):
+        """
+        Initialize the NLLB model and tokenizer for translation.
+        """
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
+
+    def _get_nllb_code(self, language: str) -> str:
+        """
+        Maps common language names to NLLB language codes.
+
+        Args:
+            language (str): Common language name (case-insensitive)
+
+        Returns:
+            str: NLLB language code, or None if the language is not found
+
+        Examples:
+            >>> self._get_nllb_code("english")
+            'eng_Latn'
+            >>> self._get_nllb_code("Chinese")
+            'zho_Hans'
+        """
+        language_mapping = {
+            # English variations
+            "english": "eng_Latn",
+            "eng": "eng_Latn",
+            "en": "eng_Latn",
+
+            # Hindi variations
+            "hindi": "hin_Deva",
+            "hi": "hin_Deva",
+
+            # French variations
+            "french": "fra_Latn",
+            "fr": "fra_Latn",
+
+            # Korean variations
+            "korean": "kor_Hang",
+            "ko": "kor_Hang",
+
+            # Spanish variations
+            "spanish": "spa_Latn",
+            "es": "spa_Latn",
+
+            # Chinese variations (defaulting to Simplified)
+            "chinese": "zho_Hans",
+            "chinese simplified": "zho_Hans",
+            "chinese traditional": "zho_Hant",
+            "mandarin": "zho_Hans",
+            "zh-cn": "zho_Hans",
+
+            # Japanese variations
+            "japanese": "jpn_Jpan",
+            "jpn": "jpn_Jpan",
+            "ja": "jpn_Jpan",
+
+            # German variations
+            "german": "deu_Latn",
+            "de": "deu_Latn"
+        }
+
+        # Convert input to lowercase for case-insensitive matching
+        normalized_input = language.lower().strip()
+
+        # Return the code if found, None otherwise
+        return language_mapping.get(normalized_input)
+
+    def add_language_code(self, name_code_dict, language, code):
+        # TODO: Add this functionality to _get_nllb_code
+        """
+        Adds a language code to the dictionary if it is not already present.
+
+        Args:
+            name_code_dict (dict): Dictionary of language names to codes
+            language (str): Language name
+            code (str): Language code
+
+        Returns:
+            dict: Updated dictionary
+        """
+        # Normalize the language name
+        normalized_language = language.lower().strip()
+
+        # Add the language code if not already present
+        if normalized_language not in name_code_dict:
+            name_code_dict[normalized_language] = code
+
+        return name_code_dict
+
+    def translate(self, text, source_lang="eng_Latn", target_lang="fra_Latn", batch_size=None):
+        """
+        Translate text from the source language to the target language.
+
+        Args:
+            text (str): Text to translate
+            source_lang (str): Source language name or NLLB code
+            target_lang (str): Target language name or NLLB code
+
+        Returns:
+            str: Translated text (a list of strings for batched input)
+        """
+        # Map language names to NLLB codes (keep the value if it is already an NLLB code)
+        source_lang = self._get_nllb_code(source_lang) or source_lang
+        target_lang = self._get_nllb_code(target_lang) or target_lang
+
+        # Tokenize the input text with the source language set on the tokenizer
+        self.tokenizer.src_lang = source_lang
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True).to(self.device)
+
+        # Force the decoder to start with the target language token
+        forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(target_lang)
+
+        # Generate translation
+        translated_tokens = self.model.generate(
+            **inputs,
+            max_length=256,
+            num_beams=5,
+            temperature=0.5,
+            do_sample=True,
+            forced_bos_token_id=forced_bos_token_id,
+        )
+
+        # Decode the translation
+        if translated_tokens.shape[0] == 1:  # single sentence
+            translation = self.tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
+        else:
+            translation = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
+
+        return translation
+
+def main():
+    # Set up the model and tokenizer
+    print("Loading model and tokenizer...")
+    translator = NLLBTranslator()
+
+    # Example translations
+    texts = [
+        "Hello, how are you?",
+        "This is a test of the NLLB translation model.",
+        "Machine learning is fascinating."
+    ]
+    print("\nTranslating texts from English to French:")
+    translation = translator.translate(texts, target_lang="fr", batch_size=2)
+    print(translation)
+
+if __name__ == "__main__":
+    main()
Models/SentenceTransformersModel.py
ADDED
@@ -0,0 +1,24 @@
+import abc
+import warnings
+from pathlib import Path
+from typing import List, Union
+
+import torch
+from numpy.typing import NDArray
+from sentence_transformers import SentenceTransformer
+
+
+class SentenceTransformerModels():
+
+    def __init__(self, model_id: str, device: torch.device = None):
+        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = SentenceTransformer(model_id).eval()
+
+    def encode(self, sentences: List[str], batch_size: int = 32) -> NDArray:
+        with torch.no_grad():
+            embeddings = self.model.encode(
+                sentences, batch_size=batch_size, device=self.device
+            )
+        if isinstance(embeddings, torch.Tensor):
+            return embeddings.cpu().numpy()
+        return embeddings
Models/llm_embeddings.py
ADDED
@@ -0,0 +1,135 @@
+import torch
+from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from typing import Union, List
+from pathlib import Path
+import dotenv
+import os
+import sys
+sys.path.insert(0, "./")
+from src.utils import full_path
+from tqdm import tqdm
+
+
+# Load the Hugging Face token from the local .env file
+dotenv.load_dotenv("./Models/.env")
+hf = os.getenv("huggingface_token")
+
+def check_model_in_cache(model_name: str):
+    if model_name in ["LLaMA3", "llama3"]:
+        return str(full_path("/data/shared/llama3-8b/Meta-Llama-3-8B_shard_size_1GB"))
+
+    if model_name in ["Mistral", "mistral"]:
+        return str(full_path("/data/shared/mistral-7b-v03/Mistral-7B-v0.3_shard_size_1GB"))
+
+    if model_name in ["olmo", "OLMo"]:
+        return str(full_path("/data/shared/olmo/OLMo-7B_shard_size_2GB"))
+
+def mean_pooling(model_output, attention_mask):
+    """
+    Mean-pools token embeddings into a sentence embedding, ignoring padded positions.
+
+    Args:
+        model_output: Token embeddings of shape (batch, seq_len, hidden_dim).
+        attention_mask: Attention mask of shape (batch, seq_len).
+
+    Returns:
+        torch.Tensor: Sentence embeddings of shape (batch, hidden_dim).
+    """
+    token_embeddings = model_output  # last hidden state: all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+class LLMEmbeddings:
+    def __init__(self, model_name: str, device: torch.device = None):
+        """
+        Initializes any Hugging Face LLM.
+
+        Args:
+            model_name (str): Path or Hugging Face repo ID for the model.
+            device (torch.device): Device to load the model on (CPU/GPU).
+        """
+        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        # Resolve a local cached path if available, otherwise treat the name as a repo ID
+        model_dir = check_model_in_cache(model_name) or model_name
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+
+        # Load model configuration to determine model type
+        config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+        self.model_type = config.architectures[0] if config.architectures else ""
+
+        # Automatically choose between AutoModelForCausalLM and AutoModel
+        if "CausalLM" in self.model_type:
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_dir, trust_remote_code=True, torch_dtype=torch.float16
+            ).to(self.device)
+        else:
+            self.model = AutoModel.from_pretrained(
+                model_dir, trust_remote_code=True, torch_dtype=torch.float16
+            ).to(self.device)
+
+        # Ensure padding token is set (fixes issues in tokenization)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        self.model.eval()
+
+    def encode(self, text: Union[str, List[str]]):
+        """Encodes input sentences into embeddings."""
+        inputs = self.tokenizer(
+            text, return_tensors="pt", padding=True, truncation=True, max_length=1024, return_token_type_ids=False
+        ).to(self.device)
+
+        with torch.no_grad():
+            outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
+
+        embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
+        return embeddings
+
+    def encode_batch(self, text: Union[str, List[str]], batch_size: int = 32):
+        """Encodes input sentences into embeddings using batching."""
+        # If a single string is provided, wrap it in a list.
+        if isinstance(text, str):
+            text = [text]
+
+        embeddings_list = []
+        # Process the text in batches
+        for i in tqdm(range(0, len(text), batch_size), desc="Processing Batches"):
+            batch_text = text[i:i+batch_size]
+            inputs = self.tokenizer(
+                batch_text,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=1024,
+                return_token_type_ids=False
+            ).to(self.device)
+
+            with torch.no_grad():
+                outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
+
+            batch_embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
+            embeddings_list.append(batch_embeddings)
+
+        # Concatenate embeddings from all batches along the batch dimension.
+        embeddings = torch.cat(embeddings_list, dim=0)
+        return embeddings
+
+
+
+if __name__ == "__main__":
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # Load any Hugging Face LLM (e.g., LLaMA, Mistral, Falcon, GPT)
+    llm = LLMEmbeddings(model_name="llama3", device=device)
+
+    # Encode text into embeddings
+    embedding = llm.encode("Hugging Face models are powerful!")
+    print(embedding.shape)
+    print("Done!!")
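The `mean_pooling` helper above averages token embeddings while masking out padding. A tiny worked example with made-up tensors (not real model output) shows how the attention mask removes the padded position from the average:

```python
import torch

# Toy batch: 1 sentence, 3 token positions, 2-dim embeddings; the last position is padding.
token_embeddings = torch.tensor([[[1.0, 2.0], [3.0, 4.0], [100.0, 100.0]]])
attention_mask = torch.tensor([[1, 1, 0]])

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
pooled = (token_embeddings * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
print(pooled)  # tensor([[2., 3.]]) -- the padded position does not affect the mean
```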
README.md
ADDED
@@ -0,0 +1,111 @@
+# ALIGN-SIM: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings
+
+ALIGN-SIM is a novel, task-free test bed for evaluating and interpreting sentence embeddings based on five intuitive semantic alignment criteria. It provides an alternative evaluation paradigm to popular task-specific benchmarks, offering deeper insights into whether sentence embeddings truly capture human-like semantic similarity.
+
+## Overview
+
+Sentence embeddings are central to many NLP applications such as translation, question answering, and text classification. However, evaluating these dense vector representations in a way that reflects human semantic understanding remains challenging. ALIGN-SIM addresses this challenge by introducing a framework based on five semantic alignment criteria:
+
+- **Semantic Distinction:** Measures the ability of an encoder to differentiate between semantically similar sentence pairs and unrelated (random) sentence pairs.
+- **Synonym Replacement:** Tests if minor lexical changes (using synonyms) preserve the semantic similarity of the original sentence.
+- **Antonym Replacement (Paraphrase vs. Antonym):** Compares how closely a paraphrase aligns with the original sentence compared to a sentence where a key word is replaced with its antonym.
+- **Paraphrase without Negation:** Evaluates whether removing negation (and rephrasing) preserves the semantic meaning.
+- **Sentence Jumbling:** Assesses the sensitivity of the embeddings to changes in word order, ensuring that a jumbled sentence is distinctly represented.
+
+ALIGN-SIM has been used to rigorously evaluate 13 sentence embedding models—including both classical encoders (e.g., SBERT, USE, SimCSE) and modern LLM-induced embeddings (e.g., GPT-3, LLaMA, Bloom)—across multiple datasets (QQP, PAWS-WIKI, MRPC, and AFIN).
+
+
+## Features
+
+- **Task-Free Evaluation:** Evaluate sentence embeddings without relying on task-specific training data.
+- **Comprehensive Semantic Criteria:** Assess embedding quality using five human-intuitive semantic alignment tests.
+- **Multiple Datasets:** Benchmark on diverse datasets to ensure robustness.
+- **Comparative Analysis:** Provides insights into both classical sentence encoders and LLM-induced embeddings.
+- **Extensive Experimental Results:** Detailed analysis demonstrating that high performance on task-specific benchmarks (e.g., SentEval) does not necessarily imply semantic alignment with human expectations.
+
+## Installation
+
+### Requirements
+
+- Python 3.7 or higher
+- [PyTorch](https://pytorch.org/)
+- [Hugging Face Transformers](https://huggingface.co/transformers/)
+- [SentenceTransformers](https://www.sbert.net/)
+- Other dependencies as listed in `requirements.txt` (e.g., NumPy, SciPy, scikit-learn)
+
+### Setup
+
+Clone the repository and install dependencies:
+
+```bash
+git clone https://github.com/yourusername/ALIGNSIM.git
+cd ALIGN-SIM
+pip install -r requirements.txt
+```
+
+# Usage
+
+## Creating Sentence Perturbation Dataset
+A dataset is available for English and six other languages (fr, es, de, zh, ja, ko). If you want to work with a different dataset, run the command below; otherwise, skip this step:
+
+```bash
+python src/SentencePerturbation/sentence_perturbation.py \
+    --dataset_name mrpc \
+    --task anto \
+    --target_lang en \
+    --output_dir ./data/perturbed_dataset/ \
+    --save True \
+    --sample_size 3500
+```
+
+## Evaluating Sentence Encoders
+
+Run the evaluation script to test a sentence encoder against the five semantic alignment criteria. You can use any Hugging Face model for evaluation. For example, to evaluate LLaMA 3 on the QQP dataset:
+
+```bash
+python src/evaluate.py --model llama3 \
+    --dataset qqp \
+    --task antonym \
+    --gpu auto \
+    --batch_size 16 \
+    --metric cosine \
+    --save True
+```
+The script supports different models (e.g., sbert, use, simcse, gpt3-ada, llama2, etc.) and datasets (e.g., qqp, paws_wiki, mrpc, afin). We evaluated models with two metrics: **Cosine Similarity** and **Normalized Euclidean Distance (NED)**.
+
+
+[# Viewing Results
+
+Evaluation results—such as similarity scores, normalized distances, and histograms—are saved in the `Results/` directory. Use the provided Jupyter notebook `src/PlotAndTables.ipynb` to explore and visualize the performance of different models across the evaluation criteria.]: #
+
+
+# Citation
+
+If you use ALIGN-SIM in your research, please cite our work:
+
+```bibtex
+@inproceedings{mahajan-etal-2024-align,
+    title = "{ALIGN}-{SIM}: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings through Semantic Similarity Alignment",
+    author = "Mahajan, Yash and
+      Bansal, Naman and
+      Blanco, Eduardo and
+      Karmaker, Santu",
+    editor = "Al-Onaizan, Yaser and
+      Bansal, Mohit and
+      Chen, Yun-Nung",
+    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
+    month = nov,
+    year = "2024",
+    address = "Miami, Florida, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.findings-emnlp.436/",
+    doi = "10.18653/v1/2024.findings-emnlp.436",
+    pages = "7393--7428",
+}
+```
+
+# Acknowledgments
+
+This work has been partially supported by NSF Standard Grant Award #2302974 and AFOSR Cooperative Agreement Award #FA9550-23-1-0426. We also acknowledge the support from the Auburn University College of Engineering and the Department of CSSE.
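To make the criteria concrete, here is a small sketch (not part of the repository) of how the antonym-replacement check can be scored with cosine similarity; the checkpoint and sentences are illustrative placeholders. A well-aligned encoder should place the paraphrase closer to the original than the antonym-perturbed sentence:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")  # example checkpoint
original = "The movie was surprisingly good."
paraphrase = "The film turned out to be surprisingly good."
antonym = "The movie was surprisingly bad."

emb = model.encode([original, paraphrase, antonym], convert_to_tensor=True)
sim_para = util.cos_sim(emb[0], emb[1]).item()
sim_anto = util.cos_sim(emb[0], emb[2]).item()
# The criterion expects sim_para > sim_anto; the gap is what ALIGN-SIM aggregates over a dataset.
print(sim_para, sim_anto, sim_para - sim_anto)
```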
data/__init__.py
ADDED
File without changes
data/original_datasets/En/Readme.txt
ADDED
@@ -0,0 +1,29 @@
+Dataset download links.
+Note that the perturbed data is created with
+the help of these datasets and the code.
+
+To download the datasets, please follow the links:
+
+1) PAWS-Wiki: https://github.com/google-research-datasets/paws
+   Open the link, scroll to the PAWS-Wiki section, and download
+   PAWS-Wiki Labeled (Final).
+
+2) QQP: https://huggingface.co/datasets/glue/viewer/qqp/train
+   Visit the Hugging Face link and download the QQP paraphrasing
+   train dataset.
+
+3) MRPC: https://huggingface.co/datasets/glue/viewer/mrpc/train
+   To download the Microsoft Research Paraphrase Corpus (MRPC)
+   dataset, visit the link and download the dataset (train
+   version).
+   Alternative:
+   You can use the dataset provided in the zip file. Just unzip the data file
+   and use the data.
+
+Perturbed Data Generation:
+We used the above datasets to create sentence perturbations for hypothesis
+testing. We took the first column (i.e. sentence1 or question1) as our original
+sentence and produced a sentence perturbation for these sentences using
+the WordNet toolkit. The code is provided in the zip file.
+Check: src/SentencePerturbation/word_replacer.py
+
data/original_datasets/En/mrpc.csv
ADDED
The diff for this file is too large to render.
See raw diff
data/original_datasets/En/paw_wiki.tsv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28cea292e3fe964ca951cc3fd08e8edcdc31ea7118eafc7b9725b5702e78f50c
+size 11734851
data/original_datasets/En/qoura.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f1c8f35afbb89a22437a9df147e5bef13ee2d7a7f7698b5c7e1d4b958992717
+size 58317136
requiremnets.txt
ADDED
@@ -0,0 +1,14 @@
+datasets==3.0.1
+sentence-transformers==3.0.0
+scikit-learn==1.4.2
+nltk==3.8.1
+numpy==1.26.4
+matplotlib==3.8.4
+matplotlib-inline==0.1.7
+pandas==2.2.2
+torch==2.4.0
+tqdm==4.66.5
+transformers==4.45.2
+python-dotenv==1.0.1
+
+
src/MultilingualTranslation/args_parser.py
ADDED
@@ -0,0 +1,53 @@
+# argument_parser.py
+from argparse import ArgumentParser
+from typing import List
+
+def get_args():
+    """
+    Parses command-line arguments for ALIGN-Multilingual.
+
+    Returns:
+        argparse.Namespace: Parsed arguments.
+    """
+    parser = ArgumentParser(description="ALIGN-Multilingual Argument Parser")
+
+    parser.add_argument(
+        "--dataset_name",
+        dest="dataset_name",
+        type=str,
+        default="mrpc",
+        choices=["mrpc", "qqp"],
+        help="Name of the dataset to use.",
+    )
+
+    # parser.add_argument(
+    #     "--language",
+    #     type=str,
+    #     default="fr",
+    #     help="Target language for translation.",
+    # )
+
+    parser.add_argument(
+        "--model_name",
+        dest="model_name",
+        type=str,
+        default="facebook/nllb-200-3.3B",
+        help="Translation model name.",
+    )
+
+    parser.add_argument(
+        "--batch_size",
+        dest="batch_size",
+        type=int,
+        default=16,
+        help="Batch size for translation.",
+    )
+
+    parser.add_argument(
+        "--save",
+        dest="save",
+        type=bool,
+        help="Whether to save the translated dataset to a file.",
+    )
+
+    return parser.parse_args()
src/MultilingualTranslation/translation.py
ADDED
@@ -0,0 +1,104 @@
+import pandas as pd
+from googletrans import Translator  # Import the googletrans module
+import os
+from tqdm import tqdm
+import sys
+sys.path.insert(0, "/home/yash/EMNLP-2024/ALIGN-Multilingual/")
+from Models.MultilingualTranslationModel import NLLBTranslator
+from args_parser import get_args
+from src.utils import read_data
+
+# TODO: Perturbation does not support Multilingual at the moment
+
+def translate_dataset(dataset_name, model_name, target_lang, batch_size=16, sample_size=1000, save=False):
+    """
+    Translates a dataset in batches using the NLLB model.
+
+    Args:
+        dataset_name (str): Name of the dataset.
+        model_name (str): Model name used for translation.
+        target_lang (str): Target language for translation.
+        batch_size (int): Number of sentences to process in each batch.
+        sample_size (int): Number of rows to process.
+        save (bool): Whether to save the translated dataset to CSV.
+
+    Returns:
+        pd.DataFrame: Translated dataset.
+    """
+
+    # Check if the translated dataset already exists, else create it
+    translated_file_path = f"/home/yash/EMNLP-2024/ALIGN-Multilingual/data/{dataset_name}_{target_lang}.csv"
+
+    if os.path.exists(translated_file_path):
+        translated_dataset = pd.read_csv(translated_file_path)
+        print("Dataset exists and loaded successfully")
+        return translated_dataset
+
+    # Original dataset
+    # data = pd.read_csv("/home/yash/EMNLP-2024/data/paw_wiki.tsv", sep='\t')
+    data = read_data(dataset_name)
+    # Size of dataset
+    print(f"Size of dataset: {len(data)}")
+
+    print("Original dataset loaded successfully")
+
+    model = NLLBTranslator(model_name=model_name)
+    print("NLLB model loaded successfully")
+
+    print("Creating the dataset ....")
+    translated_dataset = pd.DataFrame(columns=['sentence1', 'sentence2', 'label'])
+
+    for i in tqdm(range(0, len(data), batch_size)):
+        batch_sentences1 = data.loc[i:i+batch_size-1, 'sentence1'].tolist()
+        batch_sentences2 = data.loc[i:i+batch_size-1, 'sentence2'].tolist()
+        batch_labels = data.loc[i:i+batch_size-1, 'label'].tolist()
+
+        translated_batch1 = model.translate(batch_sentences1, source_lang="en", target_lang=target_lang)
+        translated_batch2 = model.translate(batch_sentences2, source_lang="en", target_lang=target_lang)
+
+        # Append translated sentences and labels to DataFrame
+        batch_df = pd.DataFrame({
+            'sentence1': translated_batch1,
+            'sentence2': translated_batch2,
+            'label': batch_labels
+        })
+
+        translated_dataset = pd.concat([translated_dataset, batch_df], ignore_index=True)
+
+    if save:
+        translated_dataset.to_csv(translated_file_path, index=False)
+        print(f"Translated dataset saved to {translated_file_path}")
+    return translated_dataset
+
+
+
+if __name__ == "__main__":
+    languages = ['fr', 'es', "de", "zh-CN", "ja", "ko"]
+
+    # Parse command-line arguments
+    args = get_args()
+
+    for language in languages:
+        print(f"Translating to {language} ....")
+        config = {
+            "dataset_name": args.dataset_name,
+            "model_name": args.model_name,
+            "target_lang": language,
+            "batch_size": args.batch_size,
+            "save": args.save
+        }
+        translated_dataset_lang = translate_dataset(**config)
+
+    # For Testing
+    # for language in languages:
+    #     print(f"Translating to {language} ....")
+    #     config = {
+    #         "dataset_name": "qqp",
+    #         "model_name": "nllb",
+    #         "target_lang": language,
+    #         "batch_size": 3,
+    #         "save": True
+    #     }
+    #     translated_dataset_lang = translate_dataset(**config)
+    print("Done")
src/SentencePerturbation/perturbation_args.py
ADDED
@@ -0,0 +1,62 @@
+from argparse import ArgumentParser
+from typing import List
+
+def get_args():
+    """
+    Parses command-line arguments for ALIGN-SIM sentence perturbation.
+
+    Returns:
+        argparse.Namespace: Parsed arguments.
+    """
+    parser = ArgumentParser(description="ALIGN-SentencePerturbation Argument Parser")
+
+    parser.add_argument(
+        "--dataset_name",
+        dest="dataset_name",
+        type=str,
+        default="mrpc",
+        choices=["mrpc", "qqp", "paws"],
+        help="Name of the dataset to use.",
+    )
+
+    parser.add_argument(
+        "--task",
+        dest="task",
+        type=str,
+        default="syn",
+        choices=["syn", "anto", "jumb", "jumbling", "paraphrase", "para"],
+        help="Perturbation task to perform.",
+    )
+
+    parser.add_argument(
+        "--target_lang",
+        dest="target_lang",
+        type=str,
+        default="en",
+        help="Target language for translation.",
+    )
+
+    parser.add_argument(
+        "--output_dir",
+        dest="output_dir",
+        type=str,
+        default="./data/perturbed_dataset/",
+        help="Output directory for the perturbed dataset.",
+    )
+
+    parser.add_argument(
+        "--save",
+        dest="save",
+        type=bool,
+        help="Whether to save the perturbed dataset to a file.",
+    )
+
+    parser.add_argument(
+        "--sample_size",
+        dest="sample_size",
+        type=int,
+        default=None,
+        help="Number of rows to process.",
+    )
+
+    return parser.parse_args()
src/SentencePerturbation/sentence_perturbation.py
ADDED
@@ -0,0 +1,207 @@
+from absl import logging
+
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pandas as pd
+import re
+import sys
+sys.path.insert(0, "/home/yash/ALIGN-SIM/src")
+from utils import mkdir_p, full_path, read_data
+from SentencePerturbation.word_replacer import WordReplacer, WordSwapping
+import random
+from perturbation_args import get_args
+
+
+
+def perturb_sentences(dataset_name: str, task: str, target_lang: str = "en", output_dir: str = "./data/perturbed_dataset/", sample_size: int = 3500, save: bool = False) -> None:
+    """
+    Creates a perturbed version of a paraphrase dataset for the requested task.
+
+    Args:
+        dataset_name (str): One of ["MRPC", "PAWS", "QQP"].
+        task (str): One of ["Synonym", "Antonym", "Jumbling", "Paraphrase"].
+        target_lang (str, optional): Language code of the dataset. Defaults to "en".
+        output_dir (str, optional): Where to write the perturbed CSV. Defaults to "./data/perturbed_dataset/".
+        sample_size (int, optional): Number of rows to sample. Defaults to 3500.
+        save (bool, optional): Whether to write the result to disk. Defaults to False.
+    """
+
+    print("--------------------------------------")
+
+    output_csv = full_path(os.path.join(output_dir, target_lang, task, f"{dataset_name}_{task}_perturbed_{target_lang}.csv"))
+    if os.path.exists(output_csv):
+        print(f"File already exists at: {output_csv}")
+        return
+
+    # TODO: make it compatible with other language datasets
+    print("Loading dataset...")
+    data = read_data(dataset_name)
+    if "Unnamed: 0" in data.columns:
+        data.drop("Unnamed: 0", axis=1, inplace=True)
+
+    if "idx" in data.columns:
+        data.drop("idx", axis=1, inplace=True)
+
+    print(f"Loaded {dataset_name} dataset")
+
+    print("--------------------------------------")
+
+
+    # Initialize WordReplacer
+    replacer = WordReplacer()
+    # Set seed
+    random.seed(42)
+
+    # Create a new dataframe to store perturbed sentences
+    # Sample sentences
+    perturbed_data = pd.DataFrame(columns=["original_sentence"])
+    # sample_data, pos_pairs, balance_dataset = sampling(data, sample_size)
+
+
+    if task in ["Syn", "syn", "Synonym"]:
+        print("Creating Synonym perturbed data...")
+        sample_data = sampling(data, task, sample_size)
+        perturbed_data["original_sentence"] = sample_data.sentence1
+        perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 1, "synonyms"))
+        perturbed_data["perturb_n2"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 2, "synonyms"))
+        perturbed_data["perturb_n3"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 3, "synonyms"))
+
+        assert perturbed_data.shape[1] == 4, "Perturbed data size mismatch"
+
+    if task in ["paraphrase", "Paraphrase", "para"]:
+        print("Creating Paraphrase perturbed data...")
+        # Shuffle the negative samples;
+        # we also want an equal number of positive and negative samples
+        perturbed_data = sampling(data, task, sample_size)  # balanced data
+        perturbed_data["original_sentence"] = perturbed_data.sentence1
+        perturbed_data["paraphrased_sentence"] = perturbed_data.sentence2
+        assert perturbed_data.shape[1] == 3, "Perturbed data size mismatch"  # original_sentence, paraphrased, label
+
+    if task in ["Anto", "anto", "Antonym"]:
+        print("Creating Antonym perturbed data...")
+        pos_pairs = sampling(data, task, sample_size)
+        # Apply antonym replacement
+        perturbed_data["original_sentence"] = pos_pairs.sentence1
+        perturbed_data["paraphrased_sentence"] = pos_pairs.sentence2
+        perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 1, "antonyms"))
+        assert perturbed_data.shape[1] == 3, "Perturbed data size mismatch"
+
+    # Apply jumbling
+    if task in ["jumbling", "Jumbling", "jumb"]:
+        print("Creating Jumbling perturbed data...")
+        pos_pairs = sampling(data, task, sample_size)
+        perturbed_data["original_sentence"] = pos_pairs.sentence1
+        perturbed_data["paraphrased_sentence"] = pos_pairs.sentence2
+        perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 1))
+        perturbed_data["perturb_n2"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 2))
+        perturbed_data["perturb_n3"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 3))
+
+        assert perturbed_data.shape[1] == 5, "Perturbed data size mismatch"
+    # Save to CSV
+    if save:
+        perturbed_data.to_csv(mkdir_p(output_csv), index=False)
+        print("--------------------------------------")
+        print(f"Saved at: {output_csv}")
+        print("--------------------------------------")
+
+
+
+def sampling(data: pd.DataFrame, task: str, sample_size: int, random_state: int = 42):
+    """
+    Combines two sampling strategies:
+
+    1. sampled_data: Samples from the dataset by first taking all positive pairs and then,
+       if needed, filling the remainder with negative pairs.
+    2. balanced_data: Constructs a dataset with roughly equal positive and negative pairs,
+       adjusting the numbers if one group is underrepresented.
+
+    Returns:
+        sampled_data (pd.DataFrame): Dataset sampled by filling negatives if positives are insufficient.
+        positive_data (pd.DataFrame): All positive samples (label == 1).
+        balanced_data (pd.DataFrame): Dataset balanced between positive and negative pairs.
+    """
+    # Split the data into positive and negative pairs
+    positive_data = data[data["label"] == 1]
+    negative_data = data[data["label"] == 0]
+
+    if task in ["Anto", "anto", "Antonym", "jumbling", "Jumbling", "jumb"]:
+        return positive_data
+
+    # ----- Sampling positive pairs, but also checking if we satisfy the sample size -----
+    if sample_size is None or sample_size > len(positive_data):
+        # If no sample size is provided or it exceeds the available data,
+        # return a copy of the entire dataset.
+        sampled_data = positive_data.copy()
+    else:
+        # Otherwise, randomly sample the specified number of rows.
+        sampled_data = positive_data.sample(n=sample_size, random_state=random_state)
+
+
+    if task in ["Syn", "syn", "Synonym"]:
+        return sampled_data
+
+    # ----- Sampling for the Paraphrase criterion -----
+    # Shuffle negative pairs first
+    negative_data = negative_data.reset_index(drop=True)
+    shuffled_sentence2 = negative_data["sentence2"].sample(frac=1, random_state=random_state).reset_index(drop=True)
+    negative_data["sentence2"] = shuffled_sentence2
+
+    # Determine ideal sample size per group (half of total sample size)
+    if sample_size is None:
+        pos_sample_size = len(positive_data)
+        neg_sample_size = len(negative_data)
+    else:
+        # Determine ideal sample size per group (half of total sample size)
+        half_size = sample_size // 2
+        pos_available = len(positive_data)
+        neg_available = len(negative_data)
+        pos_sample_size = min(half_size, pos_available)
+        neg_sample_size = min(half_size, neg_available)
+
+        # If there is a remainder, add extra samples from the group with more available data.
+        total_sampled = pos_sample_size + neg_sample_size
+        remainder = sample_size - total_sampled
+        if remainder > 0:
+            if (pos_available - pos_sample_size) >= (neg_available - neg_sample_size):
+                pos_sample_size += remainder
+            else:
+                neg_sample_size += remainder
+
+    # Sample from each group
+    sampled_positive = positive_data.sample(n=pos_sample_size, random_state=random_state)
+    sampled_negative = negative_data.sample(n=neg_sample_size, random_state=random_state)
+    # Add a 'label' column
+    sampled_positive["label"] = 1
+    sampled_negative["label"] = 0
+    # Combine and shuffle the resulting dataset
+    balanced_data = pd.concat([sampled_positive, sampled_negative]).sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+    if task in ["paraphrase", "Paraphrase", "para"]:
+        return balanced_data
+    # return sampled_data, positive_data, balanced_data
+
+
+
+if __name__ == "__main__":
+
+    # For testing under a debugger
+    if sys.gettrace() is not None:
+        config = {
+            "dataset_name": "mrpc",
+            "task": "syn",
+            "target_lang": "en",
+            "output_dir": "./data/perturbed_dataset/",
+            "save": True
+        }
+    else:
+        args = get_args()
+        config = {
+            "dataset_name": args.dataset_name,
+            "task": args.task,
+            "target_lang": args.target_lang,
+            "output_dir": args.output_dir,
+            "save": args.save,
+            "sample_size": args.sample_size
+        }
+    perturb_sentences(**config)
src/SentencePerturbation/word_replacer.py
ADDED
@@ -0,0 +1,102 @@
+import types
+import nltk
+from nltk.corpus import stopwords
+from nltk.corpus import wordnet
+# Both corpora are needed: WordNet for synonyms/antonyms, stopwords for filtering candidates
+nltk.download('wordnet')
+nltk.download('stopwords')
+import pandas as pd
+import random
+
+
+class WordReplacer(object):
+
+    def get_antonyms(self, word, pos=None):
+        antonyms = set()
+        for syn in wordnet.synsets(word, pos=pos):
+            for lemma in syn.lemmas():
+                for antonym in lemma.antonyms():
+                    antonyms.add(antonym.name())
+        if word in antonyms:
+            antonyms.remove(word)
+        return list(antonyms)
+
+    def get_synonyms(self, word):
+        """
+        Get synonyms of a word
+        """
+        synonyms = set()
+
+        for syn in wordnet.synsets(word):
+            for l in syn.lemmas():
+                synonym = l.name().replace("_", " ").replace("-", " ").lower()
+                synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
+                synonyms.add(synonym)
+        if word in synonyms:
+            synonyms.remove(word)
+        return list(synonyms)
+
+
+    def sentence_replacement(self, words, n, types=""):
+        words = words.split()
+        types = types.lower()
+        new_words = words.copy()
+        random_word_list = list(set([word for word in words if word not in stopwords.words("english")]))
+        random.shuffle(random_word_list)
+        num_replaced = 0
+        if types == "antonyms":
+            for random_word in random_word_list:
+                antonyms = self.get_antonyms(random_word)
+
+                if len(antonyms) >= 1:
+                    antonyms = random.choice(list(antonyms))
+                    new_words = [antonyms if word == random_word else word for word in new_words]
+                    num_replaced += 1
+
+                if num_replaced >= n:
+                    break
+
+        if types == "synonyms":
+            for random_word in random_word_list:
+                synonyms = self.get_synonyms(random_word)
+
+                if len(synonyms) >= 1:
+                    synonyms = random.choice(list(synonyms))
+                    new_words = [synonyms if word == random_word else word for word in new_words]
+                    num_replaced += 1
+
+                if num_replaced >= n:
+                    break
+        sentence = " ".join(new_words)
+        return sentence
+
+class WordSwapping(object):
+
+    @staticmethod
+    def swap_word(new_words):
+        random_idx_1 = random.randint(0, len(new_words)-1)
+        random_idx_2 = random_idx_1
+        counter = 0
+        while random_idx_2 == random_idx_1:
+            random_idx_2 = random.randint(0, len(new_words)-1)
+            counter += 1
+
+            if counter > 3:
+                return new_words
+
+        new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
+        return new_words
+
+    @staticmethod
+    def random_swap(words, n):
+        words = words.split()
+        new_words = words.copy()
+        for _ in range(n):
+            new_words = WordSwapping.swap_word(new_words)
+        sentence = ' '.join(new_words)
+        return sentence
+
+# if __name__ == "__main__":
+#     replace = WordReplacer()
+#     temp1 = ["i am testing", "this is second sent"]
+#     print([replace.sentence_replacement(i, n=1, types="synonyms") for i in temp1])
+
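A short usage sketch of the two perturbation helpers above (not part of the commit), assuming the repository root is on `PYTHONPATH` so the package-style import resolves:

```python
from src.SentencePerturbation.word_replacer import WordReplacer, WordSwapping

replacer = WordReplacer()
sentence = "the quick brown fox jumps over the lazy dog"

# Replace up to one non-stopword with a WordNet synonym or antonym.
print(replacer.sentence_replacement(sentence, 1, types="synonyms"))
print(replacer.sentence_replacement(sentence, 1, types="antonyms"))

# Swap two randomly chosen word positions, twice.
print(WordSwapping.random_swap(sentence, 2))
```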
src/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from src.utils import mkdir_p, full_path, read_data
+
+# Re-export the shared helpers for `from src import ...` style imports
+__all__ = ["mkdir_p", "full_path", "read_data"]
src/adjustment_factor.py
ADDED
@@ -0,0 +1,22 @@
+import numpy as np
+
+def compute_alpha_model(rnd_similarities):
+    """
+    Computes alpha_model as per the formula:
+
+        α_model = 1 - (1 / (n * |D|)) * sum(sim(RND-Pairs))
+
+    Args:
+        rnd_similarities (array-like): A 2D array of shape (n, |D|)
+                                       where each entry [i][j] is the similarity
+                                       of the j-th random pair in the i-th sample.
+
+    Returns:
+        float: The computed alpha_model value.
+    """
+    rnd_similarities = np.array(rnd_similarities)
+    n, D_size = rnd_similarities.shape
+    alpha_model = 1 - (1 / (n * D_size)) * rnd_similarities.sum()
+    return alpha_model
+
+
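A quick sanity check of the α_model formula above with made-up similarity values: with n = 2 samples and |D| = 3 random pairs whose mean similarity is 0.2, the adjustment factor comes out to 0.8.

```python
import numpy as np
from src.adjustment_factor import compute_alpha_model

# Toy input: 2 samples x 3 random-pair similarities (invented numbers).
rnd_similarities = np.array([[0.1, 0.2, 0.3],
                             [0.3, 0.2, 0.1]])
print(compute_alpha_model(rnd_similarities))  # 1 - mean = 0.8
```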
src/evaluate.py
ADDED
@@ -0,0 +1,167 @@
+import argparse
+import numpy as np
+import os
+import pandas as pd
+from tqdm import tqdm
+import torch
+import utils
+from metrics import *
+import sys
+sys.path.insert(0, "./")
+from Models.SentenceTransformersModel import SentenceTransformerModels
+from Models.llm_embeddings import LLMEmbeddings
+from main_args import get_args
+from metrics import CosineMetric
+
+
+def read_perturbed_data(filename, task, lang="en"):
+    # path = f"./data/perturbed_dataset/{lang}/{task}/{filename}.csv"
+    if not os.path.exists(filename):
+        raise FileNotFoundError(f"File {filename} not found.")
+    return pd.read_csv(filename)
+
+def compute_metrics(emb1, emb2, metric="cosine"):
+    """Compute all metrics between two sets of embeddings."""
+    # sim = utils.cosine_similarity(emb1, emb2)
+    # ned = compute_ned_distance(emb1, emb2)
+    # ed = np.linalg.norm(emb1 - emb2, axis=1)
+    # dotp = np.sum(emb1 * emb2, axis=1)
+    if metric == "cosine":
+        sim = CosineMetric().compute(emb1, emb2)
+        return sim
+
+def run(args_model, dataset_name, target_lang, args_task, default_gpu="cuda", metric="cosine", save=False, batch_size=2):
+    model = LLMEmbeddings(args_model, device=default_gpu)
+
+    perturbed_data_path = f"./data/perturbed_dataset/{target_lang}/{args_task}/{dataset_name}_{args_task}_perturbed_{target_lang}.csv"  # check if path exists
+
+    data = read_perturbed_data(perturbed_data_path, args_task)
+    # dataset_name = dataset_name.split(".")[0] if args_task == "paraphrase" else dataset_name.split("_")[0]
+
+    print(f"\n*** Model {args_model} on {dataset_name} dataset for {args_task} task ***\n")
+
+    # Collect all sentences based on task
+    sentences = []
+    if args_task in ["Anto", "anto", "Antonym"]:
+        cols = ["original_sentence", "paraphrased_sentence", "perturb_n1"]
+        for _, row in data[cols].iterrows():
+            sentences.extend(row.values)
+    elif args_task in ["jumbling", "Jumbling", "jumb"]:
+        cols = ["original_sentence", "paraphrased_sentence", "perturb_n1", "perturb_n2", "perturb_n3"]
+        for _, row in data[cols].iterrows():
+            sentences.extend(row.values)
+    elif args_task in ["Syn", "syn", "Synonym"]:
+        cols = ["original_sentence", "perturb_n1", "perturb_n2", "perturb_n3"]
+        for _, row in data[cols].iterrows():
+            sentences.extend(row.values)
+    elif args_task in ["paraphrase", "Paraphrase", "para"]:
+        cols = ["original_sentence", "paraphrased_sentence"]
+        for _, row in data[cols].iterrows():
+            sentences.extend(row.values)
+
+    # Batch process embeddings
+    embeddings = model.encode_batch(sentences, batch_size=batch_size)
+    if args_model != "chatgpt":
+        embeddings = [emb.cpu().numpy() for emb in embeddings]
+        embeddings = np.array(embeddings)
+
+    # Process embeddings based on task
+    if args_task == "anto":
+        emb_org = embeddings[0::3]   # start at 0, step by 3
+        emb_para = embeddings[1::3]  # start at 1, step by 3
+        emb_anto = embeddings[2::3]  # start at 2, step by 3
+
+        mean_para, sim_para = utils.similarity_between_sent(emb_org, emb_para)
+        mean_anto, sim_anto = utils.similarity_between_sent(emb_org, emb_anto)
+        data["sim_org_para"] = sim_para
+        data["sim_org_anto"] = sim_anto
+        data["diff_org_para"] = np.array(sim_para) - np.array(sim_anto)
+
+        print(f"""The summary for the Antonym criterion for {args_model} \n {data.describe()} """)
+
+
+    elif args_task == "jumbling":
+
+        emb_org = embeddings[0::5]   # start at 0, step by 5
+        emb_para = embeddings[1::5]  # start at 1, step by 5
+        emb_n1 = embeddings[2::5]    # start at 2, step by 5
+        emb_n2 = embeddings[3::5]
+        emb_n3 = embeddings[4::5]
+
+        # Compute metrics for each perturbation
+        mean_para, sim_para = utils.similarity_between_sent(emb_org, emb_para)
+        mean_n1, sim_n1 = utils.similarity_between_sent(emb_org, emb_n1)
+        mean_n2, sim_n2 = utils.similarity_between_sent(emb_org, emb_n2)
+        mean_n3, sim_n3 = utils.similarity_between_sent(emb_org, emb_n3)
+
+        data["sim_org_para"] = sim_para
+        data["sim_org_n1"] = sim_n1
+        data["sim_org_n2"] = sim_n2
+        data["sim_org_n3"] = sim_n3
+
+        data["diff_org_para"] = np.array(sim_para) - np.array(sim_para)  # Zero by construction
+        data["diff_org_n1"] = np.array(sim_para) - np.array(sim_n1)
+        data["diff_org_n2"] = np.array(sim_para) - np.array(sim_n2)
+        data["diff_org_n3"] = np.array(sim_para) - np.array(sim_n3)
+
+        print(f"""The summary for the Jumbling criterion for {args_model} \n {data.describe()} """)
+
+
+    elif args_task == "syn":
+
+        emb_org = embeddings[0::4]  # start at 0, step by 4
+        emb_s1 = embeddings[1::4]   # start at 1, step by 4
+        emb_s2 = embeddings[2::4]   # start at 2, step by 4
+        emb_s3 = embeddings[3::4]
+
+        _, sim_s1 = utils.similarity_between_sent(emb_org, emb_s1)
+        _, sim_s2 = utils.similarity_between_sent(emb_org, emb_s2)
+        _, sim_s3 = utils.similarity_between_sent(emb_org, emb_s3)
+
+        data["sim_org_s1"] = sim_s1
+        data["sim_org_s2"] = sim_s2
+        data["sim_org_s3"] = sim_s3
+
+        print(f"""The summary for the Synonym criterion for {args_model} \n {data.describe()} """)
+
+    elif args_task == "paraphrase":
+        emb_s1 = embeddings[0::2]  # start at 0, step by 2
+        emb_s2 = embeddings[1::2]
+        _, sim_pair = utils.similarity_between_sent(emb_s1, emb_s2)
+        data["sim"] = sim_pair
+
+        print(f"""The summary for the Paraphrase criterion for {args_model} \n {data.describe()} """)
+
+    if save:
+        path = f"./Results/{target_lang}/{args_task}/{dataset_name}_{args_model}_{args_task}_metric.csv"
+        data.to_csv(path)
+        print(f"Data saved at path: {path}")
+    return data
+
+if __name__ == "__main__":
+    if sys.gettrace() is None:
+        parser = get_args()
+        config = {
+            "args_model": parser.model_name,
+            "dataset_name": parser.perturb_dataset,
+            "args_task": parser.task,
+            "default_gpu": parser.gpu,
+            "save": parser.save,
+            "target_lang": parser.target_lang,
+            "metric": parser.metric,
+            "batch_size": parser.batch_size
+        }
+    else:
+        config = {
+            "args_model": "llama3",
+            "dataset_name": "mrpc",
+            "args_task": "syn",
+            "default_gpu": "cuda:2",
+            "save": False,
+            "target_lang": "en"
+        }
+    run(**config)
+
+
+# file_path = "/home/yash/ALIGN-SIM/data/perturbed_dataset/en/anto/mrpc_anto_perturbed_en.csv"
+# run("llama3","mrpc_anto_perturbed_en", "anto", "cuda:2", False)
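The evaluation script flattens each row's sentences into one interleaved list before encoding and then recovers the per-column embeddings with strided slices. A toy example with plain strings (standing in for embeddings) shows why `[0::3]`, `[1::3]`, `[2::3]` line up with the original columns for the antonym task:

```python
# Two rows of (original, paraphrase, antonym) flattened row by row, as in run() above.
flat = ["orig_1", "para_1", "anto_1", "orig_2", "para_2", "anto_2"]

originals = flat[0::3]    # ['orig_1', 'orig_2']
paraphrases = flat[1::3]  # ['para_1', 'para_2']
antonyms = flat[2::3]     # ['anto_1', 'anto_2']
print(originals, paraphrases, antonyms)
```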
src/main_args.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from argparse import ArgumentParser


def get_args():
    """
    Parses command-line arguments for the main evaluation script.
    Returns:
        argparse.Namespace: Parsed arguments.
    """

    parser = ArgumentParser()
    parser.add_argument(
        "--perturb_dataset",
        dest="perturb_dataset",
        required=True,
        help="Name of the CSV file"
    )
    parser.add_argument(
        "--task",
        dest="task",
        required=True,
        choices=["anto", "jumbling", "syn", "paraphrase"],
        help="Task to perform: anto/jumbling/syn/paraphrase",
    )
    parser.add_argument(
        "--M",
        dest="model_name",
        required=True,
        help="LLM Model"
    )

    parser.add_argument(
        "--target_lang",
        dest="target_lang",
        required=True,
        default="en",  # note: never used while required=True
        help="Language for translation"
    )

    parser.add_argument(
        "--save",
        dest="save",
        action="store_true",
        help="Save the results in a CSV file",
    )

    parser.add_argument(
        "--gpu",
        dest="gpu",
        default="auto",
        help="GPU to run the model on"
    )

    parser.add_argument(
        "--batch_size",
        dest="batch_size",
        type=int,
        default=16,
        help="Batch size for translation"
    )

    parser.add_argument(
        "--metric",
        dest="metric",
        type=str,
        default="cosine",
        choices=["cosine", "ned", "both"],
        help="Metric to use for comparison",
    )
    return parser.parse_args()
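A hedged example of consuming these flags; the `python -m src.evaluate` invocation and the `src.main_args` import path are assumptions (they rely on the repo root being on PYTHONPATH via `src/__init__.py`), and the flag values are placeholders.

# --- usage sketch (import/invocation paths are assumptions; values are placeholders) ---
# e.g.  python -m src.evaluate --perturb_dataset mrpc --task syn --M llama3 \
#                              --target_lang en --metric cosine --save
from src.main_args import get_args

args = get_args()
print(args.model_name, args.task, args.perturb_dataset, args.metric, args.save)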
src/metrics.py
ADDED
@@ -0,0 +1,147 @@
from abc import ABC, abstractmethod
import numpy as np

# Optional: import torch if available for type checking
try:
    import torch
except ImportError:
    torch = None

def to_numpy(arr) -> np.ndarray:
    """
    Converts the input array (which can be a numpy array, torch tensor, or list) to a numpy array.
    """
    # Check for torch.Tensor if torch is available
    if torch is not None and isinstance(arr, torch.Tensor):
        # Detach and move to CPU if needed, then convert to numpy
        return arr.detach().cpu().numpy()
    # If it's already a numpy array, return as is
    if isinstance(arr, np.ndarray):
        return arr
    # Otherwise, try converting to a numpy array
    return np.array(arr)

class Metric(ABC):
    """
    Abstract base class for evaluation metrics.
    Subclasses must implement the compute method.
    """
    @abstractmethod
    def compute(self, vector1, vector2) -> float:
        """
        Compute the metric between two vectors.

        Args:
            vector1: The first vector (numpy array, torch tensor, list, etc.).
            vector2: The second vector (numpy array, torch tensor, list, etc.).

        Returns:
            float: The computed metric value.
        """
        pass

class CosineMetric(Metric):
    """
    Implementation of the cosine similarity metric.
    """
    def compute(self, vector1, vector2) -> float:
        # Convert inputs to numpy arrays
        vec1 = to_numpy(vector1)
        vec2 = to_numpy(vector2)

        dot_product = np.dot(vec1, vec2)
        norm1 = np.linalg.norm(vec1)
        norm2 = np.linalg.norm(vec2)
        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot_product / (norm1 * norm2)

class NEDMetric(Metric):
    """
    Implementation of a normalized Euclidean distance metric.
    """
    def compute(self, vector1, vector2) -> float:
        # Convert inputs to numpy arrays
        vec1 = to_numpy(vector1)
        vec2 = to_numpy(vector2)

        euclidean_distance = np.linalg.norm(vec1 - vec2)
        norm_sum = np.linalg.norm(vec1) + np.linalg.norm(vec2)
        if norm_sum == 0:
            return 0.0
        return euclidean_distance / norm_sum

class EuclideanMetric(Metric):
    def compute(self, vector1, vector2) -> float:
        # Note: unlike the metrics above, this expects 2-D batches and returns one distance per row.
        return np.linalg.norm(vector1 - vector2, axis=1)

def dot_product(x, y):
    return np.dot(x, y.T)

def compute_ned_distance(x, y):
    # Variance-based normalized Euclidean distance between two vectors.
    return 0.5 * np.var(x - y) / (np.var(x) + np.var(y))

def batch_NED(batch_u, batch_v):
    batch_u = np.array(batch_u)
    batch_v = np.array(batch_v)

    # Ensure batch_u and batch_v have the same number of elements
    assert batch_u.shape[0] == batch_v.shape[0], "The batch sizes of u and v must be the same."

    scores = []

    for u, v in zip(batch_u, batch_v):
        u = np.array(u)
        v = np.array(v)

        u_mean = np.mean(u)
        v_mean = np.mean(v)

        u_centered = u - u_mean
        v_centered = v - v_mean

        numerator = np.linalg.norm(u_centered - v_centered, ord=2)**2
        denominator = np.linalg.norm(u_centered, ord=2)**2 + np.linalg.norm(v_centered, ord=2)**2

        ned_score = 0.5 * numerator / denominator
        scores.append(ned_score)

    return np.array(scores)


def NED2(u, v):
    u = np.array(u)
    v = np.array(v)

    u_mean = np.mean(u)
    v_mean = np.mean(v)

    u_centered = u - u_mean
    v_centered = v - v_mean

    numerator = np.linalg.norm(u_centered - v_centered, ord=2)**2
    denominator = np.linalg.norm(u_centered, ord=2)**2 + np.linalg.norm(v_centered, ord=2)**2

    return 0.5 * numerator / denominator

# --- Example Usage ---
if __name__ == "__main__":
    # Example inputs: a numpy array and a torch tensor (if torch is available)
    vec_np = np.array([1.0, 2.0, 3.0])
    if torch is not None:
        vec_torch = torch.tensor([4.0, 5.0, 6.0])
    else:
        vec_torch = [4.0, 5.0, 6.0]  # fallback list

    cosine = CosineMetric()
    ned = NEDMetric()

    print("Cosine Similarity:", cosine.compute(vec_np, vec_torch))
    print("Normalized Euclidean Distance:", ned.compute(vec_np, vec_torch))

    # x = [20,30,2]
    # y = [1.0,2.0,3.0]
    # print("Dot Product: ", dot_product(x, y))
    # print(euclidean_distance(x, y))
    # # print(NED(x, y))
    # print("Done")
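To illustrate how the NED helpers above relate: batch_NED simply applies the NED2 formula row by row, as this small check shows. The `src.metrics` import path is an assumption (repo root on PYTHONPATH); the random batches are placeholders.

# --- usage sketch (import path is an assumption; data is random) ---
import numpy as np
from src.metrics import NED2, batch_NED

u_batch = np.random.rand(4, 8)
v_batch = np.random.rand(4, 8)

per_pair = batch_NED(u_batch, v_batch)                      # one score per row
row_wise = [NED2(u, v) for u, v in zip(u_batch, v_batch)]   # same formula, applied manually
assert np.allclose(per_pair, row_wise)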
src/utils.py
ADDED
@@ -0,0 +1,80 @@
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Union

def delete_file(file_pt: Path) -> None:
    try:
        file_pt.unlink()
    except FileNotFoundError:
        pass


def full_path(inp_dir_or_path: str) -> Path:
    """Returns the absolute, user-expanded path."""
    return Path(inp_dir_or_path).expanduser().resolve()


def mkdir_p(inp_dir_or_path: Union[str, Path]) -> Path:
    """Given a file/dir path, makes sure that all of its directories exist."""
    inp_dir_or_path = full_path(inp_dir_or_path)
    if inp_dir_or_path.suffix:  # file
        inp_dir_or_path.parent.mkdir(parents=True, exist_ok=True)
    else:  # dir
        inp_dir_or_path.mkdir(parents=True, exist_ok=True)
    return inp_dir_or_path

def similarity_between_sent(sent1_encoded, sent2_encoded):
    """Returns the average cosine similarity and the per-pair scores between two lists of sentence embeddings."""
    similarity_scores = []
    for i in range(len(sent1_encoded)):
        similarity_scores.append(cosine_similarity(
            sent1_encoded[i], sent2_encoded[i]))

    return np.mean(similarity_scores), similarity_scores


def cosine_similarity(a, b):
    """
    Takes 2 vectors a, b and returns the cosine similarity according
    to the definition of the dot product
    """
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

def load_data(path):
    if path.endswith(".csv"):
        data = pd.read_csv(path)
    else:
        data = pd.read_csv(path, sep="\t")

    if not isinstance(data, pd.DataFrame):
        raise ValueError("Data should be in pandas DataFrame format")
    return data

def read_data(dataset):
    if dataset == "mrpc":
        data = load_data("/home/yash/EMNLP-2024/data/mrpc.csv")
        data = data.copy()

    elif dataset == "qqp":
        data = load_data("/home/yash/EMNLP-2024/data/qoura.csv")
        data = data.copy().dropna()
        # handling irregularities in column names
        data.columns = data.columns.str.strip()
        data = data.rename(columns={"is_duplicate": "label", "question1": "sentence1", "question2": "sentence2"})

    elif dataset in ["paws", "paw", "wiki"]:
        path = "/home/yash/EMNLP-2024/data/paw_wiki.tsv"
        data = load_data(path)
        data = data.copy()

    else:
        raise ValueError("No dataset found.")

    return data
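As a closing note on similarity_between_sent: it returns a (mean, per-pair scores) tuple, which is why the callers in evaluate.py unpack it. A small self-contained check with hypothetical toy vectors; the `src.utils` import path is an assumption (repo root on PYTHONPATH).

# --- usage sketch with toy vectors (import path is an assumption) ---
import numpy as np
from src.utils import similarity_between_sent

emb_a = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
emb_b = [np.array([1.0, 0.0]), np.array([1.0, 1.0])]

avg, scores = similarity_between_sent(emb_a, emb_b)   # returns (mean, per-pair scores)
print(round(avg, 3))                                   # 0.854 = mean of cos=1.0 and cos~0.707
print([round(s, 3) for s in scores])                   # [1.0, 0.707]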