yzm0034 committed
Commit 4f08d2c · verified · 1 Parent(s): f9a9f31

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/original_datasets/En/paw_wiki.tsv filter=lfs diff=lfs merge=lfs -text
+ data/original_datasets/En/qoura.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ /data/perturbed_dataset/
+ /.python-version
+ /src/__pycache__
+ *.pyc
+ src/*.ipynb
Models/.env ADDED
File without changes
Models/MultilingualTranslationModel.py ADDED
@@ -0,0 +1,159 @@
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ import os
+ from tqdm import tqdm
+ import pandas as pd
+ import time
+ import sys
+ from datasets import load_dataset
+ from src.utils import read_data
+
+ class NLLBTranslator:
+     def __init__(self, model_name="facebook/nllb-200-3.3B"):
+         """
+         Initialize the NLLB model and tokenizer for translation.
+         """
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
+
+     def _get_nllb_code(self, language: str) -> str:
+         """
+         Maps common language names to NLLB language codes.
+
+         Args:
+             language (str): Common language name (case-insensitive)
+
+         Returns:
+             str: NLLB language code, or None if the language is not found
+
+         Examples:
+             >>> self._get_nllb_code("english")
+             'eng_Latn'
+             >>> self._get_nllb_code("Chinese")
+             'zho_Hans'
+         """
+         language_mapping = {
+             # English variations
+             "english": "eng_Latn",
+             "eng": "eng_Latn",
+             "en": "eng_Latn",
+
+             # Hindi variations
+             "hindi": "hin_Deva",
+             "hi": "hin_Deva",
+
+             # French variations
+             "french": "fra_Latn",
+             "fr": "fra_Latn",
+
+             # Korean variations
+             "korean": "kor_Hang",
+             "ko": "kor_Hang",
+
+             # Spanish variations
+             "spanish": "spa_Latn",
+             "es": "spa_Latn",
+
+             # Chinese variations (defaulting to Simplified)
+             "chinese": "zho_Hans",
+             "chinese simplified": "zho_Hans",
+             "chinese traditional": "zho_Hant",
+             "mandarin": "zho_Hans",
+             "zh-cn": "zho_Hans",
+
+             # Japanese variations
+             "japanese": "jpn_Jpan",
+             "jpn": "jpn_Jpan",
+             "ja": "jpn_Jpan",
+
+             # German variations
+             "german": "deu_Latn",
+             "de": "deu_Latn"
+         }
+
+         # Convert input to lowercase for case-insensitive matching
+         normalized_input = language.lower().strip()
+
+         # Return the code if found, None otherwise
+         return language_mapping.get(normalized_input)
+
+     def add_language_code(self, name_code_dict, language, code):
+         # TODO: Add this functionality to _get_nllb_code
+         """
+         Adds a language code to the dictionary if it is not already present.
+
+         Args:
+             name_code_dict (dict): Dictionary of language names to codes
+             language (str): Language name
+             code (str): Language code
+
+         Returns:
+             dict: Updated dictionary
+         """
+         # Normalize the language name
+         normalized_language = language.lower().strip()
+
+         # Add the language code if not already present
+         if normalized_language not in name_code_dict:
+             name_code_dict[normalized_language] = code
+
+         return name_code_dict
+
+     def translate(self, text, source_lang="eng_Latn", target_lang="fra_Latn", batch_size=None):
+         """
+         Translate text from the source language to the target language.
+
+         Args:
+             text (str or list[str]): Text to translate
+             source_lang (str): Source language name or code
+             target_lang (str): Target language name or code
+
+         Returns:
+             str or list[str]: Translated text
+         """
+         # Tokenize the input text
+         inputs = self.tokenizer(text, return_tensors="pt", padding=True).to(self.device)
+
+         # Map language names to NLLB language codes
+         source_lang = self._get_nllb_code(source_lang)
+         target_lang = self._get_nllb_code(target_lang)
+         # Force the decoder to start with the target language token
+         forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(target_lang)
+
+         # Generate the translation
+         translated_tokens = self.model.generate(
+             **inputs,
+             max_length=256,
+             num_beams=5,
+             temperature=0.5,
+             do_sample=True,
+             forced_bos_token_id=forced_bos_token_id,
+         )
+
+         # Decode the translation
+         if translated_tokens.shape[0] == 1:  # single sentence
+             translation = self.tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
+         else:
+             translation = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
+
+         return translation
+
+ def main():
+     # Set up the model and tokenizer
+     print("Loading model and tokenizer...")
+     translator = NLLBTranslator()
+
+     # Example translations
+     texts = [
+         "Hello, how are you?",
+         "This is a test of the NLLB translation model.",
+         "Machine learning is fascinating."
+     ]
+     print("\nTranslating texts from English to French:")
+     translation = translator.translate(texts, target_lang="fr", batch_size=2)
+
+ if __name__ == "__main__":
+     main()
Models/SentenceTransformersModel.py ADDED
@@ -0,0 +1,24 @@
+ import abc
+ import warnings
+ from pathlib import Path
+ from typing import List, Union
+
+ import torch
+ from numpy.typing import NDArray
+ from sentence_transformers import SentenceTransformer
+
+
+ class SentenceTransformerModels():
+
+     def __init__(self, model_id, device: torch.device = None):
+         self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model = SentenceTransformer(model_id).eval()
+
+     def encode(self, sentences: List[str], batch_size: int = 32) -> NDArray:
+         with torch.no_grad():
+             embeddings = self.model.encode(
+                 sentences, batch_size=batch_size, device=self.device
+             )
+         if isinstance(embeddings, torch.Tensor):
+             return embeddings.cpu().numpy()
+         return embeddings
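A minimal usage sketch for this wrapper; the checkpoint name here is only an assumed example, any SentenceTransformers model ID should work:

```python
from Models.SentenceTransformersModel import SentenceTransformerModels

# Hypothetical checkpoint; substitute any SentenceTransformers model ID.
encoder = SentenceTransformerModels("sentence-transformers/all-MiniLM-L6-v2")
embeddings = encoder.encode(["A quick brown fox.", "A fast auburn fox."], batch_size=2)
print(embeddings.shape)  # (2, embedding_dim)
```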
Models/llm_embeddings.py ADDED
@@ -0,0 +1,135 @@
+ import torch
+ from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ from typing import Union, List
+ from pathlib import Path
+ import dotenv
+ import os
+ import sys
+ sys.path.insert(0, "./")
+ from src.utils import full_path
+ from tqdm import tqdm
+
+
+ dotenv.load_dotenv("./Models/.env")
+ hf = os.getenv("huggingface_token")
+
+ def check_model_in_cache(model_name: str):
+     if model_name in ["LLaMA3", "llama3"]:
+         return str(full_path("/data/shared/llama3-8b/Meta-Llama-3-8B_shard_size_1GB"))
+
+     if model_name in ["Mistral", "mistral"]:
+         return str(full_path("/data/shared/mistral-7b-v03/Mistral-7B-v0.3_shard_size_1GB"))
+
+     if model_name in ["olmo", "OLMo"]:
+         return str(full_path("/data/shared/olmo/OLMo-7B_shard_size_2GB"))
+
+ def mean_pooling(model_output, attention_mask):
+     """
+     Mean-pools token embeddings into a single sentence embedding,
+     ignoring padded positions.
+
+     Args:
+         model_output (torch.Tensor): Token embeddings (e.g., the last hidden state).
+         attention_mask (torch.Tensor): Attention mask marking real (non-padding) tokens.
+
+     Returns:
+         torch.Tensor: Mean-pooled sentence embeddings.
+     """
+     token_embeddings = model_output  # the last hidden state: all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+ class LLMEmbeddings:
+     def __init__(self, model_name: str, device: torch.device = None):
+         """
+         Initializes any Hugging Face LLM.
+
+         Args:
+             model_name (str): Cached model alias, local path, or Hugging Face repo ID.
+             device (torch.device): Device to load the model on (CPU/GPU).
+         """
+         self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         # Resolve a locally cached path if available; otherwise use the name as a repo ID
+         model_dir = check_model_in_cache(model_name) or model_name
+
+         # Load tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+
+         # Load model configuration to determine model type
+         config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+         self.model_type = config.architectures[0] if config.architectures else ""
+
+         # Automatically choose between AutoModelForCausalLM and AutoModel
+         if "CausalLM" in self.model_type:
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 model_dir, trust_remote_code=True, torch_dtype=torch.float16
+             ).to(self.device)
+         else:
+             self.model = AutoModel.from_pretrained(
+                 model_dir, trust_remote_code=True, torch_dtype=torch.float16
+             ).to(self.device)
+
+         # Ensure padding token is set (fixes issues in tokenization)
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         self.model.eval()
+
+     def encode(self, text: Union[str, List[str]]):
+         """Encodes input sentences into embeddings."""
+         inputs = self.tokenizer(
+             text, return_tensors="pt", padding=True, truncation=True, max_length=1024, return_token_type_ids=False
+         ).to(self.device)
+
+         with torch.no_grad():
+             outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
+
+         embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
+         return embeddings
+
+     def encode_batch(self, text: Union[str, List[str]], batch_size: int = 32):
+         """Encodes input sentences into embeddings using batching."""
+         # If a single string is provided, wrap it in a list.
+         if isinstance(text, str):
+             text = [text]
+
+         embeddings_list = []
+         # Process the text in batches
+         for i in tqdm(range(0, len(text), batch_size), desc="Processing Batches"):
+             batch_text = text[i:i+batch_size]
+             inputs = self.tokenizer(
+                 batch_text,
+                 return_tensors="pt",
+                 padding=True,
+                 truncation=True,
+                 max_length=1024,
+                 return_token_type_ids=False
+             ).to(self.device)
+
+             with torch.no_grad():
+                 outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
+
+             batch_embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
+             embeddings_list.append(batch_embeddings)
+
+         # Concatenate embeddings from all batches along the batch dimension.
+         embeddings = torch.cat(embeddings_list, dim=0)
+         return embeddings
+
+
+ if __name__ == "__main__":
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     # Load any Hugging Face LLM (e.g., LLaMA, Mistral, Falcon, GPT)
+     llm = LLMEmbeddings(model_name="llama3", device=device)
+
+     # Encode text into embeddings
+     embedding = llm.encode("Hugging Face models are powerful!")
+     print(embedding.shape)
+     print("Done!!")
README.md ADDED
@@ -0,0 +1,111 @@
+ # ALIGN-SIM: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings
+
+ ALIGN-SIM is a novel, task-free test bed for evaluating and interpreting sentence embeddings based on five intuitive semantic alignment criteria. It provides an alternative evaluation paradigm to popular task-specific benchmarks, offering deeper insights into whether sentence embeddings truly capture human-like semantic similarity.
+
+ ## Overview
+
+ Sentence embeddings are central to many NLP applications such as translation, question answering, and text classification. However, evaluating these dense vector representations in a way that reflects human semantic understanding remains challenging. ALIGN-SIM addresses this challenge by introducing a framework based on five semantic alignment criteria:
+
+ - **Semantic Distinction:** Measures the ability of an encoder to differentiate between semantically similar sentence pairs and unrelated (random) sentence pairs.
+ - **Synonym Replacement:** Tests if minor lexical changes (using synonyms) preserve the semantic similarity of the original sentence.
+ - **Antonym Replacement (Paraphrase vs. Antonym):** Compares how closely a paraphrase aligns with the original sentence compared to a sentence where a key word is replaced with its antonym.
+ - **Paraphrase without Negation:** Evaluates whether removing negation (and rephrasing) preserves the semantic meaning.
+ - **Sentence Jumbling:** Assesses the sensitivity of the embeddings to changes in word order, ensuring that a jumbled sentence is distinctly represented.
+
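Informally, writing $e(\cdot)$ for a sentence encoder and $\operatorname{sim}(\cdot,\cdot)$ for the chosen similarity metric, the expectations behind these criteria can be sketched as follows (a loose paraphrase of the bullets above, not the paper's formal definitions):

$$
\begin{aligned}
&\text{Semantic Distinction:} && \operatorname{sim}\big(e(s),e(s_{\text{para}})\big) \gg \operatorname{sim}\big(e(s),e(s_{\text{rand}})\big)\\
&\text{Synonym Replacement:} && \operatorname{sim}\big(e(s),e(s_{\text{syn}})\big) \ \text{stays high}\\
&\text{Antonym Replacement:} && \operatorname{sim}\big(e(s),e(s_{\text{para}})\big) > \operatorname{sim}\big(e(s),e(s_{\text{anto}})\big)\\
&\text{Paraphrase w/o Negation:} && \operatorname{sim}\big(e(s),e(s_{\text{no-neg}})\big) \ \text{stays high}\\
&\text{Sentence Jumbling:} && \operatorname{sim}\big(e(s),e(s_{\text{jumbled}})\big) < \operatorname{sim}\big(e(s),e(s_{\text{para}})\big)
\end{aligned}
$$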
+ ALIGN-SIM has been used to rigorously evaluate 13 sentence embedding models—including both classical encoders (e.g., SBERT, USE, SimCSE) and modern LLM-induced embeddings (e.g., GPT-3, LLaMA, Bloom)—across multiple datasets (QQP, PAWS-WIKI, MRPC, and AFIN).
+
+
+ ## Features
+
+ - **Task-Free Evaluation:** Evaluate sentence embeddings without relying on task-specific training data.
+ - **Comprehensive Semantic Criteria:** Assess embedding quality using five human-intuitive semantic alignment tests.
+ - **Multiple Datasets:** Benchmark on diverse datasets to ensure robustness.
+ - **Comparative Analysis:** Provides insights into both classical sentence encoders and LLM-induced embeddings.
+ - **Extensive Experimental Results:** Detailed analysis demonstrating that high performance on task-specific benchmarks (e.g., SentEval) does not necessarily imply semantic alignment with human expectations.
+
+ ## Installation
+
+ ### Requirements
+
+ - Python 3.7 or higher
+ - [PyTorch](https://pytorch.org/)
+ - [Hugging Face Transformers](https://huggingface.co/transformers/)
+ - [SentenceTransformers](https://www.sbert.net/)
+ - Other dependencies as listed in `requirements.txt` (e.g., NumPy, SciPy, scikit-learn)
+
+ ### Setup
+
+ Clone the repository and install dependencies:
+
+ ```bash
+ git clone https://github.com/yourusername/ALIGNSIM.git
+ cd ALIGN-SIM
+ pip install -r requirements.txt
+ ```
+
+ # Usage
+
+ ## Creating Sentence Perturbation Dataset
+ A dataset is available for English and six other languages [fr, es, de, zh, ja, ko]. If you want to work with a different dataset, run the command below; otherwise, skip this step:
+
+
+ ```bash
+ python src/SentencePerturbation/sentence_perturbation.py \
+     --dataset_name mrpc \
+     --task anto \
+     --target_lang en \
+     --output_dir ./data/perturbed_dataset/ \
+     --save True \
+     --sample_size 3500
+ ```
+
+ ## Evaluating Sentence Encoders
+
+ Run the evaluation script to test a sentence encoder against the five semantic alignment criteria. You can use any Hugging Face model for evaluation. For example, to evaluate LLaMA-3 embeddings on the QQP dataset:
+
+ ```bash
+ python src/evaluate.py --model llama3 \
+     --dataset qqp \
+     --task antonym \
+     --gpu auto \
+     --batch_size 16 \
+     --metric cosine \
+     --save True
+ ```
+ The script supports different models (e.g., sbert, use, simcse, gpt3-ada, llama2, etc.) and datasets (e.g., qqp, paws_wiki, mrpc, afin). We evaluated models with two metrics: **Cosine Similarity** and **Normalized Euclidean Distance (NED)**.
+
+
78
+ [# Viewing Results
79
+
80
+ Evaluation results—such as similarity scores, normalized distances, and histograms—are saved in the `Results/`. Use the provided Jupyter notebooks in the `src/PlotAndTables.ipynb` folder to explore and visualize the performance of different models across the evaluation criteria.]: #
81
+
82
+
83
+ # Citation
84
+
85
+ If you use ALIGN-SIM in your research, please cite our work:
86
+
87
+ ```bibtex
88
+ @inproceedings{mahajan-etal-2024-align,
89
+ title = "{ALIGN}-{SIM}: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings through Semantic Similarity Alignment",
90
+ author = "Mahajan, Yash and
91
+ Bansal, Naman and
92
+ Blanco, Eduardo and
93
+ Karmaker, Santu",
94
+ editor = "Al-Onaizan, Yaser and
95
+ Bansal, Mohit and
96
+ Chen, Yun-Nung",
97
+ booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
98
+ month = nov,
99
+ year = "2024",
100
+ address = "Miami, Florida, USA",
101
+ publisher = "Association for Computational Linguistics",
102
+ url = "https://aclanthology.org/2024.findings-emnlp.436/",
103
+ doi = "10.18653/v1/2024.findings-emnlp.436",
104
+ pages = "7393--7428",
105
+ }
106
+ ```
107
+
108
+ # Acknowledgments
109
+
110
+ This work has been partially supported by NSF Standard Grant Award #2302974 and AFOSR Cooperative Agreement Award #FA9550-23-1-0426. We also acknowledge the support from Auburn University College of Engineering and the Department of CSSE.
111
+
data/__init__.py ADDED
File without changes
data/original_datasets/En/Readme.txt ADDED
@@ -0,0 +1,29 @@
+ Dataset download links.
+ Note: the perturbed data is created with
+ the help of these datasets and the code.
+
+ To download the datasets, please follow the links:
+
+ 1) PAWS-Wiki: https://github.com/google-research-datasets/paws
+    Open the link and scroll to the PAWS-Wiki section. Download
+    PAWS-Wiki Labeled (Final).
+
+ 2) QQP: https://huggingface.co/datasets/glue/viewer/qqp/train
+    Visit the Hugging Face link and download the QQP paraphrase
+    train split.
+
+ 3) MRPC: https://huggingface.co/datasets/glue/viewer/mrpc/train
+    To download the Microsoft Research Paraphrase Corpus (MRPC)
+    dataset, visit the link and download the dataset (train
+    split).
+ Alternative:
+ You can use the dataset provided in the zip file. Just unzip the data file
+ and use the data.
+
+ Perturbed Data Generation:
+ We used the above datasets to create sentence perturbations for hypothesis
+ testing. We took the first column (i.e., sentence1 or question1) as our original
+ sentence and produced sentence perturbations for these sentences using the
+ WordNet toolkit. The code is provided in the zip file.
+ Check: src/SentencePerturbation/word_replacer.py
+
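A minimal sketch of the synonym-replacement step described above, using `WordReplacer` from `src/SentencePerturbation/word_replacer.py` (the input sentence is made up):

```python
from src.SentencePerturbation.word_replacer import WordReplacer  # downloads WordNet on import

# Replace n non-stopword tokens with WordNet synonyms ("antonyms" works the same way).
# Also requires the NLTK 'stopwords' corpus in addition to 'wordnet'.
replacer = WordReplacer()
print(replacer.sentence_replacement("The committee approved the new budget", n=1, types="synonyms"))
```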
data/original_datasets/En/mrpc.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/original_datasets/En/paw_wiki.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28cea292e3fe964ca951cc3fd08e8edcdc31ea7118eafc7b9725b5702e78f50c
+ size 11734851
data/original_datasets/En/qoura.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f1c8f35afbb89a22437a9df147e5bef13ee2d7a7f7698b5c7e1d4b958992717
+ size 58317136
requiremnets.txt ADDED
@@ -0,0 +1,14 @@
+ datasets==3.0.1
+ sentence-transformers==3.0.0
+ scikit-learn==1.4.2
+ nltk==3.8.1
+ numpy==1.26.4
+ matplotlib==3.8.4
+ matplotlib-inline==0.1.7
+ pandas==2.2.2
+ torch==2.4.0
+ tqdm==4.66.5
+ transformers==4.45.2
+ python-dotenv==1.0.1
+
+
src/MultilingualTranslation/args_parser.py ADDED
@@ -0,0 +1,53 @@
+ # argument_parser.py
+ from argparse import ArgumentParser
+ from typing import List
+
+ def get_args():
+     """
+     Parses command-line arguments for ALIGN-Multilingual.
+
+     Returns:
+         argparse.Namespace: Parsed arguments.
+     """
+     parser = ArgumentParser(description="ALIGN-Multilingual Argument Parser")
+
+     parser.add_argument(
+         "--dataset_name",
+         dest="dataset_name",
+         type=str,
+         default="mrpc",
+         choices=["mrpc", "qqp"],
+         help="Name of the dataset to use.",
+     )
+
+     # parser.add_argument(
+     #     "--language",
+     #     type=str,
+     #     default="fr",
+     #     help="Target language for translation.",
+     # )
+
+     parser.add_argument(
+         "--model_name",
+         dest="model_name",
+         type=str,
+         default="facebook/nllb-200-3.3B",
+         help="Translation model name.",
+     )
+
+     parser.add_argument(
+         "--batch_size",
+         dest="batch_size",
+         type=int,
+         default=16,
+         help="Batch size for translation.",
+     )
+
+     parser.add_argument(
+         "--save",
+         dest="save",
+         type=bool,  # note: bool() treats any non-empty string (even "False") as True
+         help="Whether to save the translated dataset to a file.",
+     )
+
+     return parser.parse_args()
src/MultilingualTranslation/translation.py ADDED
@@ -0,0 +1,104 @@
+ import pandas as pd
+ from googletrans import Translator  # Import the googletrans module
+ import os
+ from tqdm import tqdm
+ import sys
+ sys.path.insert(0, "/home/yash/EMNLP-2024/ALIGN-Multilingual/")
+ from Models.MultilingualTranslationModel import NLLBTranslator
+ from args_parser import get_args
+ from src.utils import read_data
+
+ # TODO: Perturbation does not support Multilingual at the moment
+
+ def translate_dataset(dataset_name, model_name, target_lang, batch_size=16, sample_size=1000, save=False):
+     """
+     Translates a dataset in batches using the NLLB model.
+
+     Args:
+         dataset_name (str): Name of the dataset.
+         model_name (str): Model name used for translation.
+         target_lang (str): Target language for translation.
+         batch_size (int): Number of sentences to process in each batch.
+         sample_size (int): Number of rows to process.
+         save (bool): Whether to save the translated dataset to CSV.
+
+     Returns:
+         pd.DataFrame: Translated dataset.
+     """
+
+     # Check if the translated dataset already exists, otherwise create it
+     translated_file_path = f"/home/yash/EMNLP-2024/ALIGN-Multilingual/data/{dataset_name}_{target_lang}.csv"
+
+     # Original dataset
+     # data = pd.read_csv("/home/yash/EMNLP-2024/data/paw_wiki.tsv", sep='\t')
+     data = read_data(dataset_name)
+     # Size of the dataset
+     print(f"Size of dataset: {len(data)}")
+
+     print("Original dataset loaded successfully")
+
+     model = NLLBTranslator(model_name=model_name)
+     print("NLLB model loaded successfully")
+
+     if os.path.exists(translated_file_path):
+         translated_dataset = pd.read_csv(translated_file_path)
+         print("Dataset exists and loaded successfully")
+         return translated_dataset
+
+     print("Creating the dataset ....")
+     translated_dataset = pd.DataFrame(columns=['sentence1', 'sentence2', 'label'])
+
+     for i in tqdm(range(0, len(data), batch_size)):
+         batch_sentences1 = data.loc[i:i+batch_size-1, 'sentence1'].tolist()
+         batch_sentences2 = data.loc[i:i+batch_size-1, 'sentence2'].tolist()
+         batch_labels = data.loc[i:i+batch_size-1, 'label'].tolist()
+
+         translated_batch1 = model.translate(batch_sentences1, source_lang="en", target_lang=target_lang)
+         translated_batch2 = model.translate(batch_sentences2, source_lang="en", target_lang=target_lang)
+
+         # Append translated sentences and labels to the DataFrame
+         batch_df = pd.DataFrame({
+             'sentence1': translated_batch1,
+             'sentence2': translated_batch2,
+             'label': batch_labels
+         })
+
+         translated_dataset = pd.concat([translated_dataset, batch_df], ignore_index=True)
+
+     if save:
+         translated_dataset.to_csv(translated_file_path, index=False)
+         print(f"Translated dataset saved to {translated_file_path}")
+     return translated_dataset
+
+
+ if __name__ == "__main__":
+     languages = ['fr', 'es', "de", "zh-CN", "ja", "ko"]
+
+     # Parse command-line arguments
+     args = get_args()
+
+     for language in languages:
+         print(f"Translating to {language} ....")
+         config = {
+             "dataset_name": args.dataset_name,
+             "model_name": args.model_name,
+             "target_lang": language,
+             "batch_size": args.batch_size,
+             "save": args.save
+         }
+         translated_dataset_lang = translate_dataset(**config)
+
+     # For Testing
+     # for language in languages:
+     #     print(f"Translating to {language} ....")
+     #     config = {
+     #         "dataset_name": "qqp",
+     #         "model_name": "nllb",
+     #         "target_lang": language,
+     #         "batch_size": 3,
+     #         "save": True
+     #     }
+     #     translated_dataset_lang = translate_dataset(**config)
+     print("Done")
src/SentencePerturbation/perturbation_args.py ADDED
@@ -0,0 +1,62 @@
+ from argparse import ArgumentParser
+ from typing import List
+
+ def get_args():
+     """
+     Parses command-line arguments for ALIGN sentence perturbation.
+
+     Returns:
+         argparse.Namespace: Parsed arguments.
+     """
+     parser = ArgumentParser(description="ALIGN-SentencePerturbation Argument Parser")
+
+     parser.add_argument(
+         "--dataset_name",
+         dest="dataset_name",
+         type=str,
+         default="mrpc",
+         choices=["mrpc", "qqp", "paws"],
+         help="Name of the dataset to use.",
+     )
+
+     parser.add_argument(
+         "--task",
+         dest="task",
+         type=str,
+         default="syn",
+         choices=["syn", "anto", "jumb", "jumbling", "paraphrase", "para"],
+         help="Perturbation task to perform.",
+     )
+
+     parser.add_argument(
+         "--target_lang",
+         dest="target_lang",
+         type=str,
+         default="en",
+         help="Target language for translation.",
+     )
+
+     parser.add_argument(
+         "--output_dir",
+         dest="output_dir",
+         type=str,
+         default="./data/perturbed_dataset/",
+         help="Output directory for the perturbed dataset.",
+     )
+
+     parser.add_argument(
+         "--save",
+         dest="save",
+         type=bool,  # note: bool() treats any non-empty string (even "False") as True
+         help="Whether to save the perturbed dataset to a file.",
+     )
+
+     parser.add_argument(
+         "--sample_size",
+         dest="sample_size",
+         type=int,
+         default=None,
+         help="Number of rows to process.",
+     )
+
+     return parser.parse_args()
src/SentencePerturbation/sentence_perturbation.py ADDED
@@ -0,0 +1,207 @@
+ from absl import logging
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import os
+ import pandas as pd
+ import re
+ import sys
+ sys.path.insert(0, "/home/yash/ALIGN-SIM/src")
+ from utils import mkdir_p, full_path, read_data
+ from SentencePerturbation.word_replacer import WordReplacer, WordSwapping
+ import random
+ from perturbation_args import get_args
+
+
+
+ def perturb_sentences(dataset_name: str, task: str, target_lang: str = "en", output_dir: str = "./data/perturbed_dataset/", sample_size: int = 3500, save: bool = False) -> None:
+     """
+     Creates a perturbed version of a paraphrase dataset for the given task.
+
+     Args:
+         dataset_name (str): One of ["MRPC", "PAWS", "QQP"].
+         task (str): One of ["Synonym", "Antonym", "Jumbling", "Paraphrase"].
+         target_lang (str, optional): Language of the dataset. Defaults to "en".
+         output_dir (str, optional): Where to write the perturbed CSV. Defaults to "./data/perturbed_dataset/".
+         sample_size (int, optional): Number of sentence pairs to sample. Defaults to 3500.
+         save (bool, optional): Whether to write the result to disk. Defaults to False.
+     """
+
+     print("--------------------------------------")
+
+     output_csv = full_path(os.path.join(output_dir, target_lang, task, f"{dataset_name}_{task}_perturbed_{target_lang}.csv"))
+     if os.path.exists(output_csv):
+         print(f"File already exists at: {output_csv}")
+         return
+
+     # TODO: make it compatible with other language datasets
+     print("Loading dataset...")
+     data = read_data(dataset_name)
+     if "Unnamed: 0" in data.columns:
+         data.drop("Unnamed: 0", axis=1, inplace=True)
+
+     if "idx" in data.columns:
+         data.drop("idx", axis=1, inplace=True)
+
+     print(f"Loaded {dataset_name} dataset")
+
+     print("--------------------------------------")
+
+
+     # Initialize WordReplacer
+     replacer = WordReplacer()
+     # Set seed
+     random.seed(42)
+
+     # Create a new dataframe to store perturbed sentences
+     # Sample sentences
+     perturbed_data = pd.DataFrame(columns=["original_sentence"])
+     # sample_data, pos_pairs, balance_dataset = sampling(data, sample_size)
+
+
+     if task in ["Syn", "syn", "Synonym"]:
+         print("Creating Synonym perturbed data...")
+         sample_data = sampling(data, task, sample_size)
+         perturbed_data["original_sentence"] = sample_data.sentence1
+         perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 1, "synonyms"))
+         perturbed_data["perturb_n2"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 2, "synonyms"))
+         perturbed_data["perturb_n3"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 3, "synonyms"))
+
+         assert perturbed_data.shape[1] == 4, "Perturbed data size mismatch"
+
+     if task in ["paraphrase", "Paraphrase", "para"]:
+         print("Creating Paraphrase perturbed data...")
+         # Shuffling the negative samples;
+         # we also want an equal number of positive and negative samples
+         perturbed_data = sampling(data, task, sample_size)  # balanced data
+         perturbed_data["original_sentence"] = perturbed_data.sentence1
+         perturbed_data["paraphrased_sentence"] = perturbed_data.sentence2
+         assert perturbed_data.shape[1] == 3, "Perturbed data size mismatch"  # original_sentence, paraphrased, label
+
+     if task in ["Anto", "anto", "Antonym"]:
+         print("Creating Antonym perturbed data...")
+         pos_pairs = sampling(data, task, sample_size)
+         # Apply antonym replacement
+         perturbed_data["original_sentence"] = pos_pairs.sentence1
+         perturbed_data["paraphrased_sentence"] = pos_pairs.sentence2
+         perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 1, "antonyms"))
+         assert perturbed_data.shape[1] == 3, "Perturbed data size mismatch"
+
+     # Apply jumbling
+     if task in ["jumbling", "Jumbling", "jumb"]:
+         print("Creating Jumbling perturbed data...")
+         pos_pairs = sampling(data, task, sample_size)
+         perturbed_data["original_sentence"] = pos_pairs.sentence1
+         perturbed_data["paraphrased_sentence"] = pos_pairs.sentence2
+         perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 1))
+         perturbed_data["perturb_n2"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 2))
+         perturbed_data["perturb_n3"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 3))
+
+         assert perturbed_data.shape[1] == 5, "Perturbed data size mismatch"
+     # Save to CSV
+     if save:
+         perturbed_data.to_csv(mkdir_p(output_csv), index=False)
+         print("--------------------------------------")
+         print(f"Saved at: {output_csv}")
+         print("--------------------------------------")
+
+
+
+ def sampling(data: pd.DataFrame, task: str, sample_size: int, random_state: int = 42):
+     """
+     Combines two sampling strategies:
+
+     1. sampled_data: Samples from the dataset by first taking all positive pairs and then,
+        if needed, filling the remainder with negative pairs.
+     2. balanced_data: Constructs a dataset with roughly equal positive and negative pairs,
+        adjusting the numbers if one group is underrepresented.
+
+     Returns:
+         sampled_data (pd.DataFrame): Dataset sampled by filling negatives if positives are insufficient.
+         positive_data (pd.DataFrame): All positive samples (label == 1).
+         balanced_data (pd.DataFrame): Dataset balanced between positive and negative pairs.
+     """
+     # Split the data into positive and negative pairs
+     positive_data = data[data["label"] == 1]
+     negative_data = data[data["label"] == 0]
+
+     if task in ["Anto", "anto", "Antonym", "jumbling", "Jumbling", "jumb"]:
+         return positive_data
+
+     # ----- Sampling positive pairs, while checking that the sample size can be satisfied -----
+     if sample_size is None or sample_size > len(positive_data):
+         # If no sample size is provided or it exceeds the available data,
+         # return a copy of the entire dataset.
+         sampled_data = positive_data.copy()
+     else:
+         # Otherwise, randomly sample the specified number of rows.
+         sampled_data = positive_data.sample(n=sample_size, random_state=random_state)
+
+
+     if task in ["Syn", "syn", "Synonym"]:
+         return sampled_data
+
+     # ----- Sampling for the Paraphrase criterion -----
+     # Shuffle negative pairs first
+     negative_data = negative_data.reset_index(drop=True)
+     shuffled_sentence2 = negative_data["sentence2"].sample(frac=1, random_state=random_state).reset_index(drop=True)
+     negative_data["sentence2"] = shuffled_sentence2
+
+     # Determine the ideal sample size per group (half of the total sample size)
+     if sample_size is None:
+         pos_sample_size = len(positive_data)
+         neg_sample_size = len(negative_data)
+     else:
+         half_size = sample_size // 2
+         pos_available = len(positive_data)
+         neg_available = len(negative_data)
+         pos_sample_size = min(half_size, pos_available)
+         neg_sample_size = min(half_size, neg_available)
+
+         # If there is a remainder, add extra samples from the group with more available data.
+         total_sampled = pos_sample_size + neg_sample_size
+         remainder = sample_size - total_sampled
+         if remainder > 0:
+             if (pos_available - pos_sample_size) >= (neg_available - neg_sample_size):
+                 pos_sample_size += remainder
+             else:
+                 neg_sample_size += remainder
+
+     # Sample from each group
+     sampled_positive = positive_data.sample(n=pos_sample_size, random_state=random_state)
+     sampled_negative = negative_data.sample(n=neg_sample_size, random_state=random_state)
+     # Add a 'label' column
+     sampled_positive["label"] = 1
+     sampled_negative["label"] = 0
+     # Combine and shuffle the resulting dataset
+     balanced_data = pd.concat([sampled_positive, sampled_negative]).sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+     if task in ["paraphrase", "Paraphrase", "para"]:
+         return balanced_data
+     # return sampled_data, positive_data, balanced_data
+
+
+
+ if __name__ == "__main__":
+
+     # For testing under a debugger
+     if sys.gettrace() is not None:
+         config = {
+             "dataset_name": "mrpc",
+             "task": "syn",
+             "target_lang": "en",
+             "output_dir": "./data/perturbed_dataset/",
+             "save": True
+         }
+     else:
+         args = get_args()
+         config = {
+             "dataset_name": args.dataset_name,
+             "task": args.task,
+             "target_lang": args.target_lang,
+             "output_dir": args.output_dir,
+             "save": args.save,
+             "sample_size": args.sample_size
+         }
+     perturb_sentences(**config)
src/SentencePerturbation/word_replacer.py ADDED
@@ -0,0 +1,102 @@
+ import types
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.corpus import wordnet
+ nltk.download('wordnet')
+ nltk.download('stopwords')  # needed for stopwords.words("english") below
+ import pandas as pd
+ import random
+
+
+ class WordReplacer(object):
+
+     def get_antonyms(self, word, pos=None):
+         antonyms = set()
+         for syn in wordnet.synsets(word, pos=pos):
+             for lemma in syn.lemmas():
+                 for antonym in lemma.antonyms():
+                     antonyms.add(antonym.name())
+         if word in antonyms:
+             antonyms.remove(word)
+         return list(antonyms)
+
+     def get_synonyms(self, word):
+         """
+         Get synonyms of a word.
+         """
+         synonyms = set()
+
+         for syn in wordnet.synsets(word):
+             for l in syn.lemmas():
+                 synonym = l.name().replace("_", " ").replace("-", " ").lower()
+                 synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
+                 synonyms.add(synonym)
+         if word in synonyms:
+             synonyms.remove(word)
+         return list(synonyms)
+
+
+     def sentence_replacement(self, words, n, types=""):
+         words = words.split()
+         types = types.lower()
+         new_words = words.copy()
+         random_word_list = list(set([word for word in words if word not in stopwords.words("english")]))
+         random.shuffle(random_word_list)
+         num_replaced = 0
+         if types == "antonyms":
+             for random_word in random_word_list:
+                 antonyms = self.get_antonyms(random_word)
+
+                 if len(antonyms) >= 1:
+                     antonyms = random.choice(list(antonyms))
+                     new_words = [antonyms if word == random_word else word for word in new_words]
+                     num_replaced += 1
+
+                 if num_replaced >= n:
+                     break
+
+         if types == "synonyms":
+             for random_word in random_word_list:
+                 synonyms = self.get_synonyms(random_word)
+
+                 if len(synonyms) >= 1:
+                     synonyms = random.choice(list(synonyms))
+                     new_words = [synonyms if word == random_word else word for word in new_words]
+                     num_replaced += 1
+
+                 if num_replaced >= n:
+                     break
+         sentence = " ".join(new_words)
+         return sentence
+
+ class WordSwapping(object):
+
+     @staticmethod
+     def swap_word(new_words):
+         random_idx_1 = random.randint(0, len(new_words)-1)
+         random_idx_2 = random_idx_1
+         counter = 0
+         while random_idx_2 == random_idx_1:
+             random_idx_2 = random.randint(0, len(new_words)-1)
+             counter += 1
+
+             if counter > 3:
+                 return new_words
+
+         new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
+         return new_words
+
+     @staticmethod
+     def random_swap(words, n):
+         words = words.split()
+         new_words = words.copy()
+         for _ in range(n):
+             new_words = WordSwapping.swap_word(new_words)
+         sentence = ' '.join(new_words)
+         return sentence
+
+ # if __name__ == "__main__":
+ #     replace = WordReplacer()
+ #     temp1 = ["i am testing", "this is second sent"]
+ #     print([replace.sentence_replacement(i, n=1, types="synonyms") for i in temp1])
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from src.utils import mkdir_p, full_path, read_data
+
+ # Public API of the src package
+ __all__ = ["mkdir_p", "full_path", "read_data"]
src/adjustment_factor.py ADDED
@@ -0,0 +1,22 @@
+ import numpy as np
+
+ def compute_alpha_model(rnd_similarities):
+     """
+     Computes alpha_model as per the formula:
+
+         α_model = 1 - (1 / (n * |D|)) * sum(sim(RND-Pairs))
+
+     Args:
+         rnd_similarities (array-like): A 2D array of shape (n, |D|)
+             where each entry [i][j] is the similarity
+             of the j-th random pair in the i-th sample.
+
+     Returns:
+         float: The computed alpha_model value.
+     """
+     rnd_similarities = np.array(rnd_similarities)
+     n, D_size = rnd_similarities.shape
+     alpha_model = 1 - (1 / (n * D_size)) * rnd_similarities.sum()
+     return alpha_model
+
+
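A quick sanity check of `compute_alpha_model` on hypothetical random-pair similarities (the numbers are made up purely for illustration):

```python
import numpy as np

# Two samples (n=2) of three random-pair similarities each (|D|=3).
rnd_similarities = np.array([[0.12, 0.05, 0.20],
                             [0.08, 0.15, 0.10]])
alpha = compute_alpha_model(rnd_similarities)  # 1 - mean similarity of the random pairs
print(round(alpha, 3))  # 0.883
```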
src/evaluate.py ADDED
@@ -0,0 +1,167 @@
+ import argparse
+ import numpy as np
+ import os
+ import pandas as pd
+ from tqdm import tqdm
+ import torch
+ import utils
+ from metrics import *
+ import sys
+ sys.path.insert(0, "./")
+ from Models.SentenceTransformersModel import SentenceTransformerModels
+ from Models.llm_embeddings import LLMEmbeddings
+ from main_args import get_args
+ from metrics import CosineMetric
+
+
+ def read_pertubed_data(filename, task, lang="en"):
+     # path = f"./data/perturbed_dataset/{lang}/{task}/{filename}.csv"
+     if not os.path.exists(filename):
+         raise FileNotFoundError(f"File {filename} not found.")
+     return pd.read_csv(filename)
+
+ def compute_metrics(emb1, emb2, metric="cosine"):
+     """Compute all metrics between two sets of embeddings."""
+     # sim = utils.cosine_similarity(emb1, emb2)
+     # ned = compute_ned_distance(emb1, emb2)
+     # ed = np.linalg.norm(emb1 - emb2, axis=1)
+     # dotp = np.sum(emb1 * emb2, axis=1)
+     if metric == "cosine":
+         sim = CosineMetric().compute(emb1, emb2)
+         return sim
+
+ def run(args_model, dataset_name, target_lang, args_task, default_gpu="cuda", metric="cosine", save=False, batch_size=2):
+     model = LLMEmbeddings(args_model, device=default_gpu)
+
+     pertubed_data_path = f"./data/perturbed_dataset/{target_lang}/{args_task}/{dataset_name}_{args_task}_perturbed_{target_lang}.csv"  # check if path exists
+
+     data = read_pertubed_data(pertubed_data_path, args_task)
+     # dataset_name = dataset_name.split(".")[0] if args_task == "paraphrase" else dataset_name.split("_")[0]
+
+     print(f"\n*** Model {args_model} on {dataset_name} dataset for {args_task} task ***\n")
+
+     # Collect all sentences based on task
+     sentences = []
+     if args_task in ["Anto", "anto", "Antonym"]:
+         cols = ["original_sentence", "paraphrased_sentence", "perturb_n1"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+     elif args_task in ["jumbling", "Jumbling", "jumb"]:
+         cols = ["original_sentence", "paraphrased_sentence", "perturb_n1", "perturb_n2", "perturb_n3"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+     elif args_task in ["Syn", "syn", "Synonym"]:
+         cols = ["original_sentence", "perturb_n1", "perturb_n2", "perturb_n3"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+     elif args_task in ["paraphrase", "Paraphrase", "para"]:
+         cols = ["original_sentence", "paraphrased_sentence"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+
+     # Batch process embeddings
+     embeddings = model.encode_batch(sentences, batch_size=batch_size)
+     if args_model != "chatgpt":
+         embeddings = [emb.cpu().numpy() for emb in embeddings]
+         embeddings = np.array(embeddings)
+
+     # Process embeddings based on task
+     if args_task == "anto":
+         emb_org = embeddings[0::3]   # start at 0, step by 3
+         emb_para = embeddings[1::3]  # start at 1, step by 3
+         emb_anto = embeddings[2::3]  # start at 2, step by 3
+
+         mean_para, sim_para = utils.similarity_between_sent(emb_org, emb_para)
+         mean_anto, sim_anto = utils.similarity_between_sent(emb_org, emb_anto)
+         data["sim_org_para"] = sim_para
+         data["sim_org_anto"] = sim_anto
+         data["diff_org_para"] = np.array(sim_para) - np.array(sim_anto)
+
+         print(f"""The summary for Antonym Criteria for {args_model} \n {data.describe()} """)
+
+
+     elif args_task == "jumbling":
+
+         emb_org = embeddings[0::5]   # start at 0, step by 5
+         emb_para = embeddings[1::5]  # start at 1, step by 5
+         emb_n1 = embeddings[2::5]    # start at 2, step by 5
+         emb_n2 = embeddings[3::5]
+         emb_n3 = embeddings[4::5]
+
+         # Compute metrics for each perturbation
+         mean_para, sim_para = utils.similarity_between_sent(emb_org, emb_para)
+         mean_n1, sim_n1 = utils.similarity_between_sent(emb_org, emb_n1)
+         mean_n2, sim_n2 = utils.similarity_between_sent(emb_org, emb_n2)
+         mean_n3, sim_n3 = utils.similarity_between_sent(emb_org, emb_n3)
+
+         data["sim_org_para"] = sim_para
+         data["sim_org_n1"] = sim_n1
+         data["sim_org_n2"] = sim_n2
+         data["sim_org_n3"] = sim_n3
+
+         data["diff_org_para"] = np.array(sim_para) - np.array(sim_para)  # Zero as per original
+         data["diff_org_n1"] = np.array(sim_para) - np.array(sim_n1)
+         data["diff_org_n2"] = np.array(sim_para) - np.array(sim_n2)
+         data["diff_org_n3"] = np.array(sim_para) - np.array(sim_n3)
+
+         print(f"""The summary for Jumbling Criteria for {args_model} \n {data.describe()} """)
+
+
+     elif args_task == "syn":
+
+         emb_org = embeddings[0::4]  # start at 0, step by 4
+         emb_s1 = embeddings[1::4]   # start at 1, step by 4
+         emb_s2 = embeddings[2::4]   # start at 2, step by 4
+         emb_s3 = embeddings[3::4]
+
+         _, sim_s1 = utils.similarity_between_sent(emb_org, emb_s1)
+         _, sim_s2 = utils.similarity_between_sent(emb_org, emb_s2)
+         _, sim_s3 = utils.similarity_between_sent(emb_org, emb_s3)
+
+         data["sim_org_s1"] = sim_s1
+         data["sim_org_s2"] = sim_s2
+         data["sim_org_s3"] = sim_s3
+
+         print(f"""The summary for Synonym Criteria for {args_model} \n {data.describe()} """)
+
+     elif args_task == "paraphrase":
+         emb_s1 = embeddings[0::2]  # start at 0, step by 2
+         emb_s2 = embeddings[1::2]
+         _, sim_pairs = utils.similarity_between_sent(emb_s1, emb_s2)
+         data["sim"] = sim_pairs
+
+         print(f"""The summary for Paraphrase Criteria for {args_model} \n {data.describe()} """)
+
+     if save:
+         path = f"./Results/{target_lang}/{args_task}/{dataset_name}_{args_model}_{args_task}_metric.csv"
+         data.to_csv(path)
+         print(f"Data saved at path: {path}")
+     return data
+
+ if __name__ == "__main__":
+     if sys.gettrace() is None:
+         parser = get_args()
+         config = {
+             "args_model": parser.model_name,
+             "dataset_name": parser.perturb_dataset,
+             "args_task": parser.task,
+             "default_gpu": parser.gpu,
+             "save": parser.save,
+             "target_lang": parser.target_lang,
+             "metric": parser.metric,
+             "batch_size": 2
+         }
+     else:
+         config = {
+             "args_model": "llama3",
+             "dataset_name": "mrpc",
+             "args_task": "syn",
+             "default_gpu": "cuda:2",
+             "save": False,
+             "target_lang": "en"
+         }
+     run(**config)
+
+
+ # file_path = "/home/yash/ALIGN-SIM/data/perturbed_dataset/en/anto/mrpc_anto_perturbed_en.csv"
+ # run("llama3", "mrpc_anto_perturbed_en", "anto", "cuda:2", False)
src/main_args.py ADDED
@@ -0,0 +1,69 @@
+ from argparse import ArgumentParser
+
+
+ def get_args():
+     """
+     Parses command-line arguments for the evaluation script.
+     Returns:
+         argparse.Namespace: Parsed arguments.
+     """
+
+     parser = ArgumentParser()
+     parser.add_argument(
+         "--perturb_dataset",
+         dest="perturb_dataset",
+         required=True,
+         help="Name of the CSV file"
+     )
+     parser.add_argument(
+         "--task",
+         dest="task",
+         required=True,
+         choices=["anto", "jumbling", "syn", "paraphrase"],
+         help="Task to perform: anto/jumbling/syn/paraphrase",
+     )
+     parser.add_argument(
+         "--M",
+         dest="model_name",
+         required=True,
+         help="LLM Model")
+
+     parser.add_argument(
+         "--target_lang",
+         dest="target_lang",
+         required=True,
+         default="en",
+         help="Language for translation"
+     )
+
+     parser.add_argument(
+         "--save",
+         dest="save",
+         action="store_true",
+         help="Save the results in a CSV file",
+     )
+
+     parser.add_argument(
+         "--gpu",
+         dest="gpu",
+         default="auto",
+         help="GPU to run the model"
+     )
+
+     parser.add_argument(
+         "--batch_size",
+         dest="batch_size",
+         type=int,
+         default=16,
+         help="Batch size for translation"
+     )
+
+     parser.add_argument(
+         "--metric",
+         dest="metric",
+         type=str,
+         default="cosine",
+         choices=["cosine", "ned", "both"],
+         help="Metric to use for comparison",
+     )
+     return parser.parse_args()
src/metrics.py ADDED
@@ -0,0 +1,147 @@
+ from abc import ABC, abstractmethod
+ import numpy as np
+
+ # Optional: import torch if available for type checking
+ try:
+     import torch
+ except ImportError:
+     torch = None
+
+ def to_numpy(arr) -> np.ndarray:
+     """
+     Converts the input array (which can be a numpy array, torch tensor, or list) to a numpy array.
+     """
+     # Check for torch.Tensor if torch is available
+     if torch is not None and isinstance(arr, torch.Tensor):
+         # Detach and move to CPU if needed, then convert to numpy
+         return arr.detach().cpu().numpy()
+     # If it's already a numpy array, return as is
+     if isinstance(arr, np.ndarray):
+         return arr
+     # Otherwise, try converting to a numpy array
+     return np.array(arr)
+
+ class Metric(ABC):
+     """
+     Abstract base class for evaluation metrics.
+     Subclasses must implement the compute method.
+     """
+     @abstractmethod
+     def compute(self, vector1, vector2) -> float:
+         """
+         Compute the metric between two vectors.
+
+         Args:
+             vector1: The first vector (numpy array, torch tensor, list, etc.).
+             vector2: The second vector (numpy array, torch tensor, list, etc.).
+
+         Returns:
+             float: The computed metric value.
+         """
+         pass
+
+ class CosineMetric(Metric):
+     """
+     Implementation of the cosine similarity metric.
+     """
+     def compute(self, vector1, vector2) -> float:
+         # Convert inputs to numpy arrays
+         vec1 = to_numpy(vector1)
+         vec2 = to_numpy(vector2)
+
+         dot_product = np.dot(vec1, vec2)
+         norm1 = np.linalg.norm(vec1)
+         norm2 = np.linalg.norm(vec2)
+         if norm1 == 0 or norm2 == 0:
+             return 0.0
+         return dot_product / (norm1 * norm2)
+
+ class NEDMetric(Metric):
+     """
+     Implementation of a normalized Euclidean distance metric.
+     """
+     def compute(self, vector1, vector2) -> float:
+         # Convert inputs to numpy arrays
+         vec1 = to_numpy(vector1)
+         vec2 = to_numpy(vector2)
+
+         euclidean_distance = np.linalg.norm(vec1 - vec2)
+         norm_sum = np.linalg.norm(vec1) + np.linalg.norm(vec2)
+         if norm_sum == 0:
+             return 0.0
+         return euclidean_distance / norm_sum
+
+ class EuclideanMetric(Metric):
+     def compute(self, vector1, vector2) -> float:
+         return np.linalg.norm(vector1 - vector2, axis=1)
+
+ def dot_product(x, y):
+     return np.dot(x, y.T)
+
+ def compute_ned_distance(x, y):
+     return 0.5 * np.var(x - y) / (np.var(x) + np.var(y))
+
+ def batch_NED(batch_u, batch_v):
+     batch_u = np.array(batch_u)
+     batch_v = np.array(batch_v)
+
+     # Ensure batch_u and batch_v have the same number of elements
+     assert batch_u.shape[0] == batch_v.shape[0], "The batch sizes of u and v must be the same."
+
+     scores = []
+
+     for u, v in zip(batch_u, batch_v):
+         u = np.array(u)
+         v = np.array(v)
+
+         u_mean = np.mean(u)
+         v_mean = np.mean(v)
+
+         u_centered = u - u_mean
+         v_centered = v - v_mean
+
+         numerator = np.linalg.norm(u_centered - v_centered, ord=2)**2
+         denominator = np.linalg.norm(u_centered, ord=2)**2 + np.linalg.norm(v_centered, ord=2)**2
+
+         ned_score = 0.5 * numerator / denominator
+         scores.append(ned_score)
+
+     return np.array(scores)
+
+
+ def NED2(u, v):
+     u = np.array(u)
+     v = np.array(v)
+
+     u_mean = np.mean(u)
+     v_mean = np.mean(v)
+
+     u_centered = u - u_mean
+     v_centered = v - v_mean
+
+     numerator = np.linalg.norm(u_centered - v_centered, ord=2)**2
+     denominator = np.linalg.norm(u_centered, ord=2)**2 + np.linalg.norm(v_centered, ord=2)**2
+
+     return 0.5 * numerator / denominator
+
+ # --- Example Usage ---
+ if __name__ == "__main__":
+     # Example inputs: a numpy array and a torch tensor (if torch is available)
+     vec_np = np.array([1.0, 2.0, 3.0])
+     if torch is not None:
+         vec_torch = torch.tensor([4.0, 5.0, 6.0])
+     else:
+         vec_torch = [4.0, 5.0, 6.0]  # fallback list
+
+     cosine = CosineMetric()
+     ned = NEDMetric()
+
+     print("Cosine Similarity:", cosine.compute(vec_np, vec_torch))
+     print("Normalized Euclidean Distance:", ned.compute(vec_np, vec_torch))
+
+     # x = [20, 30, 2]
+     # y = [1.0, 2.0, 3.0]
+     # print("Dot Product: ", dot_product(x, y))
+     # print(euclidean_distance(x, y))
+     # # print(NED(x, y))
+     # print("Done")
src/utils.py ADDED
@@ -0,0 +1,80 @@
+ import numpy as np
+ import pandas as pd
+ import os
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+ from typing import Union
+
+ def delete_file(file_pt: Path) -> None:
+     try:
+         file_pt.unlink()
+     except FileNotFoundError:
+         pass
+
+
+ def full_path(inp_dir_or_path: str) -> Path:
+     """Returns the full path"""
+     return Path(inp_dir_or_path).expanduser().resolve()
+
+
+ def mkdir_p(inp_dir_or_path: Union[str, Path]) -> Path:
+     """Given a file/dir path, makes sure that all the directories exist"""
+     inp_dir_or_path = full_path(inp_dir_or_path)
+     if inp_dir_or_path.suffix:  # file
+         inp_dir_or_path.parent.mkdir(parents=True, exist_ok=True)
+     else:  # dir
+         inp_dir_or_path.mkdir(parents=True, exist_ok=True)
+     return inp_dir_or_path
+
+ def similarity_between_sent(sent1_encoded, sent2_encoded):
+     """Returns the average cosine similarity and the per-pair scores for two lists of sentence embeddings"""
+     similarity_scores = []
+     for i in range(len(sent1_encoded)):
+         similarity_scores.append(cosine_similarity(
+             sent1_encoded[i], sent2_encoded[i]))
+
+     return np.mean(similarity_scores), similarity_scores
+
+
+ def cosine_similarity(a, b):
+     """
+     Takes 2 vectors a, b and returns the cosine similarity according
+     to the definition of the dot product
+     """
+     dot_product = np.dot(a, b)
+     norm_a = np.linalg.norm(a)
+     norm_b = np.linalg.norm(b)
+     return dot_product / (norm_a * norm_b)
+
+ def load_data(path):
+     if path.endswith(".csv"):
+         data = pd.read_csv(path)
+     else:
+         data = pd.read_csv(path, sep="\t")
+
+     if not isinstance(data, pd.DataFrame):
+         raise ValueError("Data should be in pandas DataFrame format")
+     return data
+
+ def read_data(dataset):
+     if dataset == "mrpc":
+         data = load_data("/home/yash/EMNLP-2024/data/mrpc.csv")
+         data = data.copy()
+
+     elif dataset == "qqp":
+         data = load_data("/home/yash/EMNLP-2024/data/qoura.csv")
+         data = data.copy().dropna()
+         # Handle irregularities in column names
+         data.columns = data.columns.str.strip()
+         data = data.rename(columns={"is_duplicate": "label", 'question1': "sentence1", "question2": "sentence2"})
+
+     elif dataset in ["paws", "paw", "wiki"]:
+         path = "/home/yash/EMNLP-2024/data/paw_wiki.tsv"
+         data = load_data(path)
+         data = data.copy()
+
+     else:
+         raise ValueError("No dataset found.")
+
+     return data