yzm0034 committed
Commit 4f08d2c · verified · 1 Parent(s): f9a9f31

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ data/original_datasets/En/paw_wiki.tsv filter=lfs diff=lfs merge=lfs -text
+ data/original_datasets/En/qoura.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ /data/perturbed_dataset/
+ /.python-version
+ /src/__pycache__
+ *.pyc
+ src/*.ipynb
Models/.env ADDED
File without changes
Models/MultilingualTranslationModel.py ADDED
@@ -0,0 +1,159 @@
+ import torch
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+ import os
+ from tqdm import tqdm
+ import pandas as pd
+ import time
+ import sys
+ from datasets import load_dataset
+ from src.utils import read_data
+
+ class NLLBTranslator:
+     def __init__(self, model_name="facebook/nllb-200-3.3B"):
+         """
+         Initialize the NLLB model and tokenizer for translation.
+         """
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(self.device)
+
+     def _get_nllb_code(self, language: str) -> str:
+         """
+         Maps common language names to NLLB language codes.
+
+         Args:
+             language (str): Common language name (case-insensitive)
+
+         Returns:
+             str: NLLB language code, or None if the language is not found
+
+         Examples:
+             >>> self._get_nllb_code("english")
+             'eng_Latn'
+             >>> self._get_nllb_code("Chinese")
+             'zho_Hans'
+         """
+         language_mapping = {
+             # English variations
+             "english": "eng_Latn",
+             "eng": "eng_Latn",
+             "en": "eng_Latn",
+
+             # Hindi variations
+             "hindi": "hin_Deva",
+             "hi": "hin_Deva",
+
+             # French variations
+             "french": "fra_Latn",
+             "fr": "fra_Latn",
+
+             # Korean variations
+             "korean": "kor_Hang",
+             "ko": "kor_Hang",
+
+             # Spanish variations
+             "spanish": "spa_Latn",
+             "es": "spa_Latn",
+
+             # Chinese variations (defaulting to Simplified)
+             "chinese": "zho_Hans",
+             "chinese simplified": "zho_Hans",
+             "chinese traditional": "zho_Hant",
+             "mandarin": "zho_Hans",
+             "zh-cn": "zho_Hans",
+
+             # Japanese variations
+             "japanese": "jpn_Jpan",
+             "jpn": "jpn_Jpan",
+             "ja": "jpn_Jpan",
+
+             # German variations
+             "german": "deu_Latn",
+             "de": "deu_Latn"
+         }
+
+         # Convert input to lowercase for case-insensitive matching
+         normalized_input = language.lower().strip()
+
+         # Return the code if found, None otherwise
+         return language_mapping.get(normalized_input)
+
+     def add_language_code(self, name_code_dict, language, code):
+         # TODO: Add this functionality to _get_nllb_code
+         """
+         Adds a language code to the dictionary if it is not already present.
+
+         Args:
+             name_code_dict (dict): Dictionary of language names to codes
+             language (str): Language name
+             code (str): Language code
+
+         Returns:
+             dict: Updated dictionary
+         """
+         # Normalize the language name
+         normalized_language = language.lower().strip()
+
+         # Add the language code if not already present
+         if normalized_language not in name_code_dict:
+             name_code_dict[normalized_language] = code
+
+         return name_code_dict
+
+     def translate(self, text, source_lang="eng_Latn", target_lang="fra_Latn", batch_size=None):
+         """
+         Translate text from the source language to the target language.
+
+         Args:
+             text (str or list[str]): Text to translate
+             source_lang (str): Source language name or code
+             target_lang (str): Target language name or code
+
+         Returns:
+             str or list[str]: Translated text
+         """
+         # Tokenize the input text
+         inputs = self.tokenizer(text, return_tensors="pt", padding=True).to(self.device)
+
+         # Map language names to NLLB language codes
+         source_lang = self._get_nllb_code(source_lang)
+         target_lang = self._get_nllb_code(target_lang)
+         # Force the decoder to start with the target language token
+         forced_bos_token_id = self.tokenizer.convert_tokens_to_ids(target_lang)
+
+         # Generate the translation
+         translated_tokens = self.model.generate(
+             **inputs,
+             max_length=256,
+             num_beams=5,
+             temperature=0.5,
+             do_sample=True,
+             forced_bos_token_id=forced_bos_token_id,
+         )
+
+         # Decode the translation
+         if translated_tokens.shape[0] == 1:  # single sentence
+             translation = self.tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
+         else:
+             translation = self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
+
+         return translation
+
+ def main():
+     # Set up the model and tokenizer
+     print("Loading model and tokenizer...")
+     translator = NLLBTranslator()
+
+     # Example translations
+     texts = [
+         "Hello, how are you?",
+         "This is a test of the NLLB translation model.",
+         "Machine learning is fascinating."
+     ]
+     print("\nTranslating texts from English to French:")
+     translation = translator.translate(texts, target_lang="fr", batch_size=2)
+
+ if __name__ == "__main__":
+     main()
Models/SentenceTransformersModel.py ADDED
@@ -0,0 +1,24 @@
+ import abc
+ import warnings
+ from pathlib import Path
+ from typing import List, Union
+
+ import torch
+ from numpy.typing import NDArray
+ from sentence_transformers import SentenceTransformer
+
+
+ class SentenceTransformerModels():
+
+     def __init__(self, model_id, device: torch.device = None):
+         self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model = SentenceTransformer(model_id).eval()
+
+     def encode(self, sentences: List[str], batch_size: int = 32) -> NDArray:
+         with torch.no_grad():
+             embeddings = self.model.encode(
+                 sentences, batch_size=batch_size, device=self.device
+             )
+         if isinstance(embeddings, torch.Tensor):
+             return embeddings.cpu().numpy()
+         return embeddings
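A minimal usage sketch for this wrapper; the checkpoint name here is only an assumed example, any SentenceTransformers model ID should work:

```python
from Models.SentenceTransformersModel import SentenceTransformerModels

# Hypothetical checkpoint; substitute any SentenceTransformers model ID.
encoder = SentenceTransformerModels("sentence-transformers/all-MiniLM-L6-v2")
embeddings = encoder.encode(["A quick brown fox.", "A fast auburn fox."], batch_size=2)
print(embeddings.shape)  # (2, embedding_dim)
```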
Models/llm_embeddings.py ADDED
@@ -0,0 +1,135 @@
+ import torch
+ from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ from typing import Union, List
+ from pathlib import Path
+ import dotenv
+ import os
+ import sys
+ sys.path.insert(0, "./")
+ from src.utils import full_path
+ from tqdm import tqdm
+
+
+ dotenv.load_dotenv("./Models/.env")
+ hf = os.getenv("huggingface_token")
+
+ def check_model_in_cache(model_name: str):
+     if model_name in ["LLaMA3", "llama3"]:
+         return str(full_path("/data/shared/llama3-8b/Meta-Llama-3-8B_shard_size_1GB"))
+
+     if model_name in ["Mistral", "mistral"]:
+         return str(full_path("/data/shared/mistral-7b-v03/Mistral-7B-v0.3_shard_size_1GB"))
+
+     if model_name in ["olmo", "OLMo"]:
+         return str(full_path("/data/shared/olmo/OLMo-7B_shard_size_2GB"))
+
+ def mean_pooling(model_output, attention_mask):
+     """
+     Mean-pools token embeddings into a single sentence embedding,
+     ignoring padded positions.
+
+     Args:
+         model_output (torch.Tensor): Token embeddings (e.g., the last hidden state).
+         attention_mask (torch.Tensor): Attention mask marking real (non-padding) tokens.
+
+     Returns:
+         torch.Tensor: Mean-pooled sentence embeddings.
+     """
+     token_embeddings = model_output  # the last hidden state: all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+ class LLMEmbeddings:
+     def __init__(self, model_name: str, device: torch.device = None):
+         """
+         Initializes any Hugging Face LLM.
+
+         Args:
+             model_name (str): Cached model alias, local path, or Hugging Face repo ID.
+             device (torch.device): Device to load the model on (CPU/GPU).
+         """
+         self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+         # Resolve a locally cached path if available; otherwise use the name as a repo ID
+         model_dir = check_model_in_cache(model_name) or model_name
+
+         # Load tokenizer
+         self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+
+         # Load model configuration to determine model type
+         config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
+         self.model_type = config.architectures[0] if config.architectures else ""
+
+         # Automatically choose between AutoModelForCausalLM and AutoModel
+         if "CausalLM" in self.model_type:
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 model_dir, trust_remote_code=True, torch_dtype=torch.float16
+             ).to(self.device)
+         else:
+             self.model = AutoModel.from_pretrained(
+                 model_dir, trust_remote_code=True, torch_dtype=torch.float16
+             ).to(self.device)
+
+         # Ensure padding token is set (fixes issues in tokenization)
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+
+         self.model.eval()
+
+     def encode(self, text: Union[str, List[str]]):
+         """Encodes input sentences into embeddings."""
+         inputs = self.tokenizer(
+             text, return_tensors="pt", padding=True, truncation=True, max_length=1024, return_token_type_ids=False
+         ).to(self.device)
+
+         with torch.no_grad():
+             outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
+
+         embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
+         return embeddings
+
+     def encode_batch(self, text: Union[str, List[str]], batch_size: int = 32):
+         """Encodes input sentences into embeddings using batching."""
+         # If a single string is provided, wrap it in a list.
+         if isinstance(text, str):
+             text = [text]
+
+         embeddings_list = []
+         # Process the text in batches
+         for i in tqdm(range(0, len(text), batch_size), desc="Processing Batches"):
+             batch_text = text[i:i+batch_size]
+             inputs = self.tokenizer(
+                 batch_text,
+                 return_tensors="pt",
+                 padding=True,
+                 truncation=True,
+                 max_length=1024,
+                 return_token_type_ids=False
+             ).to(self.device)
+
+             with torch.no_grad():
+                 outputs = self.model(**inputs, output_hidden_states=True, use_cache=False)
+
+             batch_embeddings = mean_pooling(outputs.hidden_states[-1], inputs["attention_mask"]).squeeze()
+             embeddings_list.append(batch_embeddings)
+
+         # Concatenate embeddings from all batches along the batch dimension.
+         embeddings = torch.cat(embeddings_list, dim=0)
+         return embeddings
+
+
+ if __name__ == "__main__":
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     # Load any Hugging Face LLM (e.g., LLaMA, Mistral, Falcon, GPT)
+     llm = LLMEmbeddings(model_name="llama3", device=device)
+
+     # Encode text into embeddings
+     embedding = llm.encode("Hugging Face models are powerful!")
+     print(embedding.shape)
+     print("Done!!")
README.md ADDED
@@ -0,0 +1,111 @@
+ # ALIGN-SIM: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings
+
+ ALIGN-SIM is a novel, task-free test bed for evaluating and interpreting sentence embeddings based on five intuitive semantic alignment criteria. It provides an alternative evaluation paradigm to popular task-specific benchmarks, offering deeper insights into whether sentence embeddings truly capture human-like semantic similarity.
+
+ ## Overview
+
+ Sentence embeddings are central to many NLP applications such as translation, question answering, and text classification. However, evaluating these dense vector representations in a way that reflects human semantic understanding remains challenging. ALIGN-SIM addresses this challenge by introducing a framework based on five semantic alignment criteria:
+
+ - **Semantic Distinction:** Measures the ability of an encoder to differentiate between semantically similar sentence pairs and unrelated (random) sentence pairs.
+ - **Synonym Replacement:** Tests if minor lexical changes (using synonyms) preserve the semantic similarity of the original sentence.
+ - **Antonym Replacement (Paraphrase vs. Antonym):** Compares how closely a paraphrase aligns with the original sentence compared to a sentence where a key word is replaced with its antonym.
+ - **Paraphrase without Negation:** Evaluates whether removing negation (and rephrasing) preserves the semantic meaning.
+ - **Sentence Jumbling:** Assesses the sensitivity of the embeddings to changes in word order, ensuring that a jumbled sentence is distinctly represented.
+
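Informally, writing $e(\cdot)$ for a sentence encoder and $\operatorname{sim}(\cdot,\cdot)$ for the chosen similarity metric, the expectations behind these criteria can be sketched as follows (a loose paraphrase of the bullets above, not the paper's formal definitions):

$$
\begin{aligned}
&\text{Semantic Distinction:} && \operatorname{sim}\big(e(s),e(s_{\text{para}})\big) \gg \operatorname{sim}\big(e(s),e(s_{\text{rand}})\big)\\
&\text{Synonym Replacement:} && \operatorname{sim}\big(e(s),e(s_{\text{syn}})\big) \ \text{stays high}\\
&\text{Antonym Replacement:} && \operatorname{sim}\big(e(s),e(s_{\text{para}})\big) > \operatorname{sim}\big(e(s),e(s_{\text{anto}})\big)\\
&\text{Paraphrase w/o Negation:} && \operatorname{sim}\big(e(s),e(s_{\text{no-neg}})\big) \ \text{stays high}\\
&\text{Sentence Jumbling:} && \operatorname{sim}\big(e(s),e(s_{\text{jumbled}})\big) < \operatorname{sim}\big(e(s),e(s_{\text{para}})\big)
\end{aligned}
$$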
+ ALIGN-SIM has been used to rigorously evaluate 13 sentence embedding models—including both classical encoders (e.g., SBERT, USE, SimCSE) and modern LLM-induced embeddings (e.g., GPT-3, LLaMA, Bloom)—across multiple datasets (QQP, PAWS-WIKI, MRPC, and AFIN).
+
+
+ ## Features
+
+ - **Task-Free Evaluation:** Evaluate sentence embeddings without relying on task-specific training data.
+ - **Comprehensive Semantic Criteria:** Assess embedding quality using five human-intuitive semantic alignment tests.
+ - **Multiple Datasets:** Benchmark on diverse datasets to ensure robustness.
+ - **Comparative Analysis:** Provides insights into both classical sentence encoders and LLM-induced embeddings.
+ - **Extensive Experimental Results:** Detailed analysis demonstrating that high performance on task-specific benchmarks (e.g., SentEval) does not necessarily imply semantic alignment with human expectations.
+
+ ## Installation
+
+ ### Requirements
+
+ - Python 3.7 or higher
+ - [PyTorch](https://pytorch.org/)
+ - [Hugging Face Transformers](https://huggingface.co/transformers/)
+ - [SentenceTransformers](https://www.sbert.net/)
+ - Other dependencies as listed in `requirements.txt` (e.g., NumPy, SciPy, scikit-learn)
+
+ ### Setup
+
+ Clone the repository and install dependencies:
+
+ ```bash
+ git clone https://github.com/yourusername/ALIGNSIM.git
+ cd ALIGN-SIM
+ pip install -r requirements.txt
+ ```
+
+ # Usage
+
+ ## Creating Sentence Perturbation Dataset
+ A dataset is available for English and six other languages [fr, es, de, zh, ja, ko]. If you want to work with a different dataset, run the command below; otherwise, skip this step:
+
+
+ ```bash
+ python src/SentencePerturbation/sentence_perturbation.py \
+     --dataset_name mrpc \
+     --task anto \
+     --target_lang en \
+     --output_dir ./data/perturbed_dataset/ \
+     --save True \
+     --sample_size 3500
+ ```
+
+ ## Evaluating Sentence Encoders
+
+ Run the evaluation script to test a sentence encoder against the five semantic alignment criteria. You can use any Hugging Face model for evaluation. For example, to evaluate LLaMA-3 embeddings on the QQP dataset:
+
+ ```bash
+ python src/evaluate.py --model llama3 \
+     --dataset qqp \
+     --task antonym \
+     --gpu auto \
+     --batch_size 16 \
+     --metric cosine \
+     --save True
+ ```
+ The script supports different models (e.g., sbert, use, simcse, gpt3-ada, llama2, etc.) and datasets (e.g., qqp, paws_wiki, mrpc, afin). We evaluated models with two metrics: **Cosine Similarity** and **Normalized Euclidean Distance (NED)**.
+
+
78
+ [# Viewing Results
79
+
80
+ Evaluation results—such as similarity scores, normalized distances, and histograms—are saved in the `Results/`. Use the provided Jupyter notebooks in the `src/PlotAndTables.ipynb` folder to explore and visualize the performance of different models across the evaluation criteria.]: #
81
+
82
+
83
+ # Citation
84
+
85
+ If you use ALIGN-SIM in your research, please cite our work:
86
+
87
+ ```bibtex
88
+ @inproceedings{mahajan-etal-2024-align,
89
+ title = "{ALIGN}-{SIM}: A Task-Free Test Bed for Evaluating and Interpreting Sentence Embeddings through Semantic Similarity Alignment",
90
+ author = "Mahajan, Yash and
91
+ Bansal, Naman and
92
+ Blanco, Eduardo and
93
+ Karmaker, Santu",
94
+ editor = "Al-Onaizan, Yaser and
95
+ Bansal, Mohit and
96
+ Chen, Yun-Nung",
97
+ booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
98
+ month = nov,
99
+ year = "2024",
100
+ address = "Miami, Florida, USA",
101
+ publisher = "Association for Computational Linguistics",
102
+ url = "https://aclanthology.org/2024.findings-emnlp.436/",
103
+ doi = "10.18653/v1/2024.findings-emnlp.436",
104
+ pages = "7393--7428",
105
+ }
106
+ ```
107
+
108
+ # Acknowledgments
109
+
110
+ This work has been partially supported by NSF Standard Grant Award #2302974 and AFOSR Cooperative Agreement Award #FA9550-23-1-0426. We also acknowledge the support from Auburn University College of Engineering and the Department of CSSE.
111
+
data/__init__.py ADDED
File without changes
data/original_datasets/En/Readme.txt ADDED
@@ -0,0 +1,29 @@
+ Dataset download links.
+ Note: the perturbed data is created with
+ the help of these datasets and the code.
+
+ To download the datasets, please follow the links:
+
+ 1) PAWS-Wiki: https://github.com/google-research-datasets/paws
+    Open the link and scroll to the PAWS-Wiki section. Download
+    PAWS-Wiki Labeled (Final).
+
+ 2) QQP: https://huggingface.co/datasets/glue/viewer/qqp/train
+    Visit the Hugging Face link and download the QQP paraphrase
+    train split.
+
+ 3) MRPC: https://huggingface.co/datasets/glue/viewer/mrpc/train
+    To download the Microsoft Research Paraphrase Corpus (MRPC)
+    dataset, visit the link and download the dataset (train
+    split).
+ Alternative:
+ You can use the dataset provided in the zip file. Just unzip the data file
+ and use the data.
+
+ Perturbed Data Generation:
+ We used the above datasets to create sentence perturbations for hypothesis
+ testing. We took the first column (i.e., sentence1 or question1) as our original
+ sentence and produced sentence perturbations for these sentences using the
+ WordNet toolkit. The code is provided in the zip file.
+ Check: src/SentencePerturbation/word_replacer.py
+
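A minimal sketch of the synonym-replacement step described above, using `WordReplacer` from `src/SentencePerturbation/word_replacer.py` (the input sentence is made up):

```python
from src.SentencePerturbation.word_replacer import WordReplacer  # downloads WordNet on import

# Replace n non-stopword tokens with WordNet synonyms ("antonyms" works the same way).
# Also requires the NLTK 'stopwords' corpus in addition to 'wordnet'.
replacer = WordReplacer()
print(replacer.sentence_replacement("The committee approved the new budget", n=1, types="synonyms"))
```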
data/original_datasets/En/mrpc.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/original_datasets/En/paw_wiki.tsv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:28cea292e3fe964ca951cc3fd08e8edcdc31ea7118eafc7b9725b5702e78f50c
+ size 11734851
data/original_datasets/En/qoura.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6f1c8f35afbb89a22437a9df147e5bef13ee2d7a7f7698b5c7e1d4b958992717
+ size 58317136
requiremnets.txt ADDED
@@ -0,0 +1,14 @@
+ datasets==3.0.1
+ sentence-transformers==3.0.0
+ scikit-learn==1.4.2
+ nltk==3.8.1
+ numpy==1.26.4
+ matplotlib==3.8.4
+ matplotlib-inline==0.1.7
+ pandas==2.2.2
+ torch==2.4.0
+ tqdm==4.66.5
+ transformers==4.45.2
+ python-dotenv==1.0.1
+
+
src/MultilingualTranslation/args_parser.py ADDED
@@ -0,0 +1,53 @@
+ # argument_parser.py
+ from argparse import ArgumentParser
+ from typing import List
+
+ def get_args():
+     """
+     Parses command-line arguments for ALIGN-Multilingual.
+
+     Returns:
+         argparse.Namespace: Parsed arguments.
+     """
+     parser = ArgumentParser(description="ALIGN-Multilingual Argument Parser")
+
+     parser.add_argument(
+         "--dataset_name",
+         dest="dataset_name",
+         type=str,
+         default="mrpc",
+         choices=["mrpc", "qqp"],
+         help="Name of the dataset to use.",
+     )
+
+     # parser.add_argument(
+     #     "--language",
+     #     type=str,
+     #     default="fr",
+     #     help="Target language for translation.",
+     # )
+
+     parser.add_argument(
+         "--model_name",
+         dest="model_name",
+         type=str,
+         default="facebook/nllb-200-3.3B",
+         help="Translation model name.",
+     )
+
+     parser.add_argument(
+         "--batch_size",
+         dest="batch_size",
+         type=int,
+         default=16,
+         help="Batch size for translation.",
+     )
+
+     parser.add_argument(
+         "--save",
+         dest="save",
+         type=bool,  # note: bool() treats any non-empty string (even "False") as True
+         help="Whether to save the translated dataset to a file.",
+     )
+
+     return parser.parse_args()
src/MultilingualTranslation/translation.py ADDED
@@ -0,0 +1,104 @@
+ import pandas as pd
+ from googletrans import Translator  # Import the googletrans module
+ import os
+ from tqdm import tqdm
+ import sys
+ sys.path.insert(0, "/home/yash/EMNLP-2024/ALIGN-Multilingual/")
+ from Models.MultilingualTranslationModel import NLLBTranslator
+ from args_parser import get_args
+ from src.utils import read_data
+
+ # TODO: Perturbation does not support Multilingual at the moment
+
+ def translate_dataset(dataset_name, model_name, target_lang, batch_size=16, sample_size=1000, save=False):
+     """
+     Translates a dataset in batches using the NLLB model.
+
+     Args:
+         dataset_name (str): Name of the dataset.
+         model_name (str): Model name used for translation.
+         target_lang (str): Target language for translation.
+         batch_size (int): Number of sentences to process in each batch.
+         sample_size (int): Number of rows to process.
+         save (bool): Whether to save the translated dataset to CSV.
+
+     Returns:
+         pd.DataFrame: Translated dataset.
+     """
+
+     # Check if the translated dataset already exists, otherwise create it
+     translated_file_path = f"/home/yash/EMNLP-2024/ALIGN-Multilingual/data/{dataset_name}_{target_lang}.csv"
+
+     # Original dataset
+     # data = pd.read_csv("/home/yash/EMNLP-2024/data/paw_wiki.tsv", sep='\t')
+     data = read_data(dataset_name)
+     # Size of the dataset
+     print(f"Size of dataset: {len(data)}")
+
+     print("Original dataset loaded successfully")
+
+     model = NLLBTranslator(model_name=model_name)
+     print("NLLB model loaded successfully")
+
+     if os.path.exists(translated_file_path):
+         translated_dataset = pd.read_csv(translated_file_path)
+         print("Dataset exists and loaded successfully")
+         return translated_dataset
+
+     print("Creating the dataset ....")
+     translated_dataset = pd.DataFrame(columns=['sentence1', 'sentence2', 'label'])
+
+     for i in tqdm(range(0, len(data), batch_size)):
+         batch_sentences1 = data.loc[i:i+batch_size-1, 'sentence1'].tolist()
+         batch_sentences2 = data.loc[i:i+batch_size-1, 'sentence2'].tolist()
+         batch_labels = data.loc[i:i+batch_size-1, 'label'].tolist()
+
+         translated_batch1 = model.translate(batch_sentences1, source_lang="en", target_lang=target_lang)
+         translated_batch2 = model.translate(batch_sentences2, source_lang="en", target_lang=target_lang)
+
+         # Append translated sentences and labels to the DataFrame
+         batch_df = pd.DataFrame({
+             'sentence1': translated_batch1,
+             'sentence2': translated_batch2,
+             'label': batch_labels
+         })
+
+         translated_dataset = pd.concat([translated_dataset, batch_df], ignore_index=True)
+
+     if save:
+         translated_dataset.to_csv(translated_file_path, index=False)
+         print(f"Translated dataset saved to {translated_file_path}")
+     return translated_dataset
+
+
+ if __name__ == "__main__":
+     languages = ['fr', 'es', "de", "zh-CN", "ja", "ko"]
+
+     # Parse command-line arguments
+     args = get_args()
+
+     for language in languages:
+         print(f"Translating to {language} ....")
+         config = {
+             "dataset_name": args.dataset_name,
+             "model_name": args.model_name,
+             "target_lang": language,
+             "batch_size": args.batch_size,
+             "save": args.save
+         }
+         translated_dataset_lang = translate_dataset(**config)
+
+     # For Testing
+     # for language in languages:
+     #     print(f"Translating to {language} ....")
+     #     config = {
+     #         "dataset_name": "qqp",
+     #         "model_name": "nllb",
+     #         "target_lang": language,
+     #         "batch_size": 3,
+     #         "save": True
+     #     }
+     #     translated_dataset_lang = translate_dataset(**config)
+     print("Done")
src/SentencePerturbation/perturbation_args.py ADDED
@@ -0,0 +1,62 @@
+ from argparse import ArgumentParser
+ from typing import List
+
+ def get_args():
+     """
+     Parses command-line arguments for ALIGN sentence perturbation.
+
+     Returns:
+         argparse.Namespace: Parsed arguments.
+     """
+     parser = ArgumentParser(description="ALIGN-SentencePerturbation Argument Parser")
+
+     parser.add_argument(
+         "--dataset_name",
+         dest="dataset_name",
+         type=str,
+         default="mrpc",
+         choices=["mrpc", "qqp", "paws"],
+         help="Name of the dataset to use.",
+     )
+
+     parser.add_argument(
+         "--task",
+         dest="task",
+         type=str,
+         default="syn",
+         choices=["syn", "anto", "jumb", "jumbling", "paraphrase", "para"],
+         help="Perturbation task to perform.",
+     )
+
+     parser.add_argument(
+         "--target_lang",
+         dest="target_lang",
+         type=str,
+         default="en",
+         help="Target language for translation.",
+     )
+
+     parser.add_argument(
+         "--output_dir",
+         dest="output_dir",
+         type=str,
+         default="./data/perturbed_dataset/",
+         help="Output directory for the perturbed dataset.",
+     )
+
+     parser.add_argument(
+         "--save",
+         dest="save",
+         type=bool,  # note: bool() treats any non-empty string (even "False") as True
+         help="Whether to save the perturbed dataset to a file.",
+     )
+
+     parser.add_argument(
+         "--sample_size",
+         dest="sample_size",
+         type=int,
+         default=None,
+         help="Number of rows to process.",
+     )
+
+     return parser.parse_args()
src/SentencePerturbation/sentence_perturbation.py ADDED
@@ -0,0 +1,207 @@
+ from absl import logging
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import os
+ import pandas as pd
+ import re
+ import sys
+ sys.path.insert(0, "/home/yash/ALIGN-SIM/src")
+ from utils import mkdir_p, full_path, read_data
+ from SentencePerturbation.word_replacer import WordReplacer, WordSwapping
+ import random
+ from perturbation_args import get_args
+
+
+
+ def perturb_sentences(dataset_name: str, task: str, target_lang: str = "en", output_dir: str = "./data/perturbed_dataset/", sample_size: int = 3500, save: bool = False) -> None:
+     """
+     Creates a perturbed version of a paraphrase dataset for the given task.
+
+     Args:
+         dataset_name (str): One of ["MRPC", "PAWS", "QQP"].
+         task (str): One of ["Synonym", "Antonym", "Jumbling", "Paraphrase"].
+         target_lang (str, optional): Language of the dataset. Defaults to "en".
+         output_dir (str, optional): Where to write the perturbed CSV. Defaults to "./data/perturbed_dataset/".
+         sample_size (int, optional): Number of sentence pairs to sample. Defaults to 3500.
+         save (bool, optional): Whether to write the result to disk. Defaults to False.
+     """
+
+     print("--------------------------------------")
+
+     output_csv = full_path(os.path.join(output_dir, target_lang, task, f"{dataset_name}_{task}_perturbed_{target_lang}.csv"))
+     if os.path.exists(output_csv):
+         print(f"File already exists at: {output_csv}")
+         return
+
+     # TODO: make it compatible with other language datasets
+     print("Loading dataset...")
+     data = read_data(dataset_name)
+     if "Unnamed: 0" in data.columns:
+         data.drop("Unnamed: 0", axis=1, inplace=True)
+
+     if "idx" in data.columns:
+         data.drop("idx", axis=1, inplace=True)
+
+     print(f"Loaded {dataset_name} dataset")
+
+     print("--------------------------------------")
+
+
+     # Initialize WordReplacer
+     replacer = WordReplacer()
+     # Set seed
+     random.seed(42)
+
+     # Create a new dataframe to store perturbed sentences
+     # Sample sentences
+     perturbed_data = pd.DataFrame(columns=["original_sentence"])
+     # sample_data, pos_pairs, balance_dataset = sampling(data, sample_size)
+
+
+     if task in ["Syn", "syn", "Synonym"]:
+         print("Creating Synonym perturbed data...")
+         sample_data = sampling(data, task, sample_size)
+         perturbed_data["original_sentence"] = sample_data.sentence1
+         perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 1, "synonyms"))
+         perturbed_data["perturb_n2"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 2, "synonyms"))
+         perturbed_data["perturb_n3"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 3, "synonyms"))
+
+         assert perturbed_data.shape[1] == 4, "Perturbed data size mismatch"
+
+     if task in ["paraphrase", "Paraphrase", "para"]:
+         print("Creating Paraphrase perturbed data...")
+         # Shuffling the negative samples;
+         # we also want an equal number of positive and negative samples
+         perturbed_data = sampling(data, task, sample_size)  # balanced data
+         perturbed_data["original_sentence"] = perturbed_data.sentence1
+         perturbed_data["paraphrased_sentence"] = perturbed_data.sentence2
+         assert perturbed_data.shape[1] == 3, "Perturbed data size mismatch"  # original_sentence, paraphrased, label
+
+     if task in ["Anto", "anto", "Antonym"]:
+         print("Creating Antonym perturbed data...")
+         pos_pairs = sampling(data, task, sample_size)
+         # Apply antonym replacement
+         perturbed_data["original_sentence"] = pos_pairs.sentence1
+         perturbed_data["paraphrased_sentence"] = pos_pairs.sentence2
+         perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: replacer.sentence_replacement(x, 1, "antonyms"))
+         assert perturbed_data.shape[1] == 3, "Perturbed data size mismatch"
+
+     # Apply jumbling
+     if task in ["jumbling", "Jumbling", "jumb"]:
+         print("Creating Jumbling perturbed data...")
+         pos_pairs = sampling(data, task, sample_size)
+         perturbed_data["original_sentence"] = pos_pairs.sentence1
+         perturbed_data["paraphrased_sentence"] = pos_pairs.sentence2
+         perturbed_data["perturb_n1"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 1))
+         perturbed_data["perturb_n2"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 2))
+         perturbed_data["perturb_n3"] = perturbed_data["original_sentence"].apply(lambda x: WordSwapping.random_swap(x, 3))
+
+         assert perturbed_data.shape[1] == 5, "Perturbed data size mismatch"
+     # Save to CSV
+     if save:
+         perturbed_data.to_csv(mkdir_p(output_csv), index=False)
+         print("--------------------------------------")
+         print(f"Saved at: {output_csv}")
+         print("--------------------------------------")
+
+
+
+ def sampling(data: pd.DataFrame, task: str, sample_size: int, random_state: int = 42):
+     """
+     Combines two sampling strategies:
+
+     1. sampled_data: Samples from the dataset by first taking all positive pairs and then,
+        if needed, filling the remainder with negative pairs.
+     2. balanced_data: Constructs a dataset with roughly equal positive and negative pairs,
+        adjusting the numbers if one group is underrepresented.
+
+     Returns:
+         sampled_data (pd.DataFrame): Dataset sampled by filling negatives if positives are insufficient.
+         positive_data (pd.DataFrame): All positive samples (label == 1).
+         balanced_data (pd.DataFrame): Dataset balanced between positive and negative pairs.
+     """
+     # Split the data into positive and negative pairs
+     positive_data = data[data["label"] == 1]
+     negative_data = data[data["label"] == 0]
+
+     if task in ["Anto", "anto", "Antonym", "jumbling", "Jumbling", "jumb"]:
+         return positive_data
+
+     # ----- Sampling positive pairs, while checking that the sample size can be satisfied -----
+     if sample_size is None or sample_size > len(positive_data):
+         # If no sample size is provided or it exceeds the available data,
+         # return a copy of the entire dataset.
+         sampled_data = positive_data.copy()
+     else:
+         # Otherwise, randomly sample the specified number of rows.
+         sampled_data = positive_data.sample(n=sample_size, random_state=random_state)
+
+
+     if task in ["Syn", "syn", "Synonym"]:
+         return sampled_data
+
+     # ----- Sampling for the Paraphrase criterion -----
+     # Shuffle negative pairs first
+     negative_data = negative_data.reset_index(drop=True)
+     shuffled_sentence2 = negative_data["sentence2"].sample(frac=1, random_state=random_state).reset_index(drop=True)
+     negative_data["sentence2"] = shuffled_sentence2
+
+     # Determine the ideal sample size per group (half of the total sample size)
+     if sample_size is None:
+         pos_sample_size = len(positive_data)
+         neg_sample_size = len(negative_data)
+     else:
+         half_size = sample_size // 2
+         pos_available = len(positive_data)
+         neg_available = len(negative_data)
+         pos_sample_size = min(half_size, pos_available)
+         neg_sample_size = min(half_size, neg_available)
+
+         # If there is a remainder, add extra samples from the group with more available data.
+         total_sampled = pos_sample_size + neg_sample_size
+         remainder = sample_size - total_sampled
+         if remainder > 0:
+             if (pos_available - pos_sample_size) >= (neg_available - neg_sample_size):
+                 pos_sample_size += remainder
+             else:
+                 neg_sample_size += remainder
+
+     # Sample from each group
+     sampled_positive = positive_data.sample(n=pos_sample_size, random_state=random_state)
+     sampled_negative = negative_data.sample(n=neg_sample_size, random_state=random_state)
+     # Add a 'label' column
+     sampled_positive["label"] = 1
+     sampled_negative["label"] = 0
+     # Combine and shuffle the resulting dataset
+     balanced_data = pd.concat([sampled_positive, sampled_negative]).sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+     if task in ["paraphrase", "Paraphrase", "para"]:
+         return balanced_data
+     # return sampled_data, positive_data, balanced_data
+
+
+
+ if __name__ == "__main__":
+
+     # For testing under a debugger
+     if sys.gettrace() is not None:
+         config = {
+             "dataset_name": "mrpc",
+             "task": "syn",
+             "target_lang": "en",
+             "output_dir": "./data/perturbed_dataset/",
+             "save": True
+         }
+     else:
+         args = get_args()
+         config = {
+             "dataset_name": args.dataset_name,
+             "task": args.task,
+             "target_lang": args.target_lang,
+             "output_dir": args.output_dir,
+             "save": args.save,
+             "sample_size": args.sample_size
+         }
+     perturb_sentences(**config)
src/SentencePerturbation/word_replacer.py ADDED
@@ -0,0 +1,102 @@
+ import types
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.corpus import wordnet
+ nltk.download('wordnet')
+ nltk.download('stopwords')  # needed for stopwords.words("english") below
+ import pandas as pd
+ import random
+
+
+ class WordReplacer(object):
+
+     def get_antonyms(self, word, pos=None):
+         antonyms = set()
+         for syn in wordnet.synsets(word, pos=pos):
+             for lemma in syn.lemmas():
+                 for antonym in lemma.antonyms():
+                     antonyms.add(antonym.name())
+         if word in antonyms:
+             antonyms.remove(word)
+         return list(antonyms)
+
+     def get_synonyms(self, word):
+         """
+         Get synonyms of a word.
+         """
+         synonyms = set()
+
+         for syn in wordnet.synsets(word):
+             for l in syn.lemmas():
+                 synonym = l.name().replace("_", " ").replace("-", " ").lower()
+                 synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
+                 synonyms.add(synonym)
+         if word in synonyms:
+             synonyms.remove(word)
+         return list(synonyms)
+
+
+     def sentence_replacement(self, words, n, types=""):
+         words = words.split()
+         types = types.lower()
+         new_words = words.copy()
+         random_word_list = list(set([word for word in words if word not in stopwords.words("english")]))
+         random.shuffle(random_word_list)
+         num_replaced = 0
+         if types == "antonyms":
+             for random_word in random_word_list:
+                 antonyms = self.get_antonyms(random_word)
+
+                 if len(antonyms) >= 1:
+                     antonyms = random.choice(list(antonyms))
+                     new_words = [antonyms if word == random_word else word for word in new_words]
+                     num_replaced += 1
+
+                 if num_replaced >= n:
+                     break
+
+         if types == "synonyms":
+             for random_word in random_word_list:
+                 synonyms = self.get_synonyms(random_word)
+
+                 if len(synonyms) >= 1:
+                     synonyms = random.choice(list(synonyms))
+                     new_words = [synonyms if word == random_word else word for word in new_words]
+                     num_replaced += 1
+
+                 if num_replaced >= n:
+                     break
+         sentence = " ".join(new_words)
+         return sentence
+
+ class WordSwapping(object):
+
+     @staticmethod
+     def swap_word(new_words):
+         random_idx_1 = random.randint(0, len(new_words)-1)
+         random_idx_2 = random_idx_1
+         counter = 0
+         while random_idx_2 == random_idx_1:
+             random_idx_2 = random.randint(0, len(new_words)-1)
+             counter += 1
+
+             if counter > 3:
+                 return new_words
+
+         new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
+         return new_words
+
+     @staticmethod
+     def random_swap(words, n):
+         words = words.split()
+         new_words = words.copy()
+         for _ in range(n):
+             new_words = WordSwapping.swap_word(new_words)
+         sentence = ' '.join(new_words)
+         return sentence
+
+ # if __name__ == "__main__":
+ #     replace = WordReplacer()
+ #     temp1 = ["i am testing", "this is second sent"]
+ #     print([replace.sentence_replacement(i, n=1, types="synonyms") for i in temp1])
src/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from src.utils import mkdir_p, full_path, read_data
+
+ # Public API of the src package
+ __all__ = ["mkdir_p", "full_path", "read_data"]
src/adjustment_factor.py ADDED
@@ -0,0 +1,22 @@
+ import numpy as np
+
+ def compute_alpha_model(rnd_similarities):
+     """
+     Computes alpha_model as per the formula:
+
+         α_model = 1 - (1 / (n * |D|)) * sum(sim(RND-Pairs))
+
+     Args:
+         rnd_similarities (array-like): A 2D array of shape (n, |D|)
+             where each entry [i][j] is the similarity
+             of the j-th random pair in the i-th sample.
+
+     Returns:
+         float: The computed alpha_model value.
+     """
+     rnd_similarities = np.array(rnd_similarities)
+     n, D_size = rnd_similarities.shape
+     alpha_model = 1 - (1 / (n * D_size)) * rnd_similarities.sum()
+     return alpha_model
+
+
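A quick sanity check of `compute_alpha_model` on hypothetical random-pair similarities (the numbers are made up purely for illustration):

```python
import numpy as np

# Two samples (n=2) of three random-pair similarities each (|D|=3).
rnd_similarities = np.array([[0.12, 0.05, 0.20],
                             [0.08, 0.15, 0.10]])
alpha = compute_alpha_model(rnd_similarities)  # 1 - mean similarity of the random pairs
print(round(alpha, 3))  # 0.883
```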
src/evaluate.py ADDED
@@ -0,0 +1,167 @@
+ import argparse
+ import numpy as np
+ import os
+ import pandas as pd
+ from tqdm import tqdm
+ import torch
+ import utils
+ from metrics import *
+ import sys
+ sys.path.insert(0, "./")
+ from Models.SentenceTransformersModel import SentenceTransformerModels
+ from Models.llm_embeddings import LLMEmbeddings
+ from main_args import get_args
+ from metrics import CosineMetric
+
+
+ def read_pertubed_data(filename, task, lang="en"):
+     # path = f"./data/perturbed_dataset/{lang}/{task}/{filename}.csv"
+     if not os.path.exists(filename):
+         raise FileNotFoundError(f"File {filename} not found.")
+     return pd.read_csv(filename)
+
+ def compute_metrics(emb1, emb2, metric="cosine"):
+     """Compute all metrics between two sets of embeddings."""
+     # sim = utils.cosine_similarity(emb1, emb2)
+     # ned = compute_ned_distance(emb1, emb2)
+     # ed = np.linalg.norm(emb1 - emb2, axis=1)
+     # dotp = np.sum(emb1 * emb2, axis=1)
+     if metric == "cosine":
+         sim = CosineMetric().compute(emb1, emb2)
+         return sim
+
+ def run(args_model, dataset_name, target_lang, args_task, default_gpu="cuda", metric="cosine", save=False, batch_size=2):
+     model = LLMEmbeddings(args_model, device=default_gpu)
+
+     pertubed_data_path = f"./data/perturbed_dataset/{target_lang}/{args_task}/{dataset_name}_{args_task}_perturbed_{target_lang}.csv"  # check if path exists
+
+     data = read_pertubed_data(pertubed_data_path, args_task)
+     # dataset_name = dataset_name.split(".")[0] if args_task == "paraphrase" else dataset_name.split("_")[0]
+
+     print(f"\n*** Model {args_model} on {dataset_name} dataset for {args_task} task ***\n")
+
+     # Collect all sentences based on task
+     sentences = []
+     if args_task in ["Anto", "anto", "Antonym"]:
+         cols = ["original_sentence", "paraphrased_sentence", "perturb_n1"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+     elif args_task in ["jumbling", "Jumbling", "jumb"]:
+         cols = ["original_sentence", "paraphrased_sentence", "perturb_n1", "perturb_n2", "perturb_n3"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+     elif args_task in ["Syn", "syn", "Synonym"]:
+         cols = ["original_sentence", "perturb_n1", "perturb_n2", "perturb_n3"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+     elif args_task in ["paraphrase", "Paraphrase", "para"]:
+         cols = ["original_sentence", "paraphrased_sentence"]
+         for _, row in data[cols].iterrows():
+             sentences.extend(row.values)
+
+     # Batch process embeddings
+     embeddings = model.encode_batch(sentences, batch_size=batch_size)
+     if args_model != "chatgpt":
+         embeddings = [emb.cpu().numpy() for emb in embeddings]
+         embeddings = np.array(embeddings)
+
+     # Process embeddings based on task
+     if args_task == "anto":
+         emb_org = embeddings[0::3]   # start at 0, step by 3
+         emb_para = embeddings[1::3]  # start at 1, step by 3
+         emb_anto = embeddings[2::3]  # start at 2, step by 3
+
+         mean_para, sim_para = utils.similarity_between_sent(emb_org, emb_para)
+         mean_anto, sim_anto = utils.similarity_between_sent(emb_org, emb_anto)
+         data["sim_org_para"] = sim_para
+         data["sim_org_anto"] = sim_anto
+         data["diff_org_para"] = np.array(sim_para) - np.array(sim_anto)
+
+         print(f"""The summary for Antonym Criteria for {args_model} \n {data.describe()} """)
+
+
+     elif args_task == "jumbling":
+
+         emb_org = embeddings[0::5]   # start at 0, step by 5
+         emb_para = embeddings[1::5]  # start at 1, step by 5
+         emb_n1 = embeddings[2::5]    # start at 2, step by 5
+         emb_n2 = embeddings[3::5]
+         emb_n3 = embeddings[4::5]
+
+         # Compute metrics for each perturbation
+         mean_para, sim_para = utils.similarity_between_sent(emb_org, emb_para)
+         mean_n1, sim_n1 = utils.similarity_between_sent(emb_org, emb_n1)
+         mean_n2, sim_n2 = utils.similarity_between_sent(emb_org, emb_n2)
+         mean_n3, sim_n3 = utils.similarity_between_sent(emb_org, emb_n3)
+
+         data["sim_org_para"] = sim_para
+         data["sim_org_n1"] = sim_n1
+         data["sim_org_n2"] = sim_n2
+         data["sim_org_n3"] = sim_n3
+
+         data["diff_org_para"] = np.array(sim_para) - np.array(sim_para)  # Zero as per original
+         data["diff_org_n1"] = np.array(sim_para) - np.array(sim_n1)
+         data["diff_org_n2"] = np.array(sim_para) - np.array(sim_n2)
+         data["diff_org_n3"] = np.array(sim_para) - np.array(sim_n3)
+
+         print(f"""The summary for Jumbling Criteria for {args_model} \n {data.describe()} """)
+
+
+     elif args_task == "syn":
+
+         emb_org = embeddings[0::4]  # start at 0, step by 4
+         emb_s1 = embeddings[1::4]   # start at 1, step by 4
+         emb_s2 = embeddings[2::4]   # start at 2, step by 4
+         emb_s3 = embeddings[3::4]
+
+         _, sim_s1 = utils.similarity_between_sent(emb_org, emb_s1)
+         _, sim_s2 = utils.similarity_between_sent(emb_org, emb_s2)
+         _, sim_s3 = utils.similarity_between_sent(emb_org, emb_s3)
+
+         data["sim_org_s1"] = sim_s1
+         data["sim_org_s2"] = sim_s2
+         data["sim_org_s3"] = sim_s3
+
+         print(f"""The summary for Synonym Criteria for {args_model} \n {data.describe()} """)
+
+     elif args_task == "paraphrase":
+         emb_s1 = embeddings[0::2]  # start at 0, step by 2
+         emb_s2 = embeddings[1::2]
+         _, sim_pairs = utils.similarity_between_sent(emb_s1, emb_s2)
+         data["sim"] = sim_pairs
+
+         print(f"""The summary for Paraphrase Criteria for {args_model} \n {data.describe()} """)
+
+     if save:
+         path = f"./Results/{target_lang}/{args_task}/{dataset_name}_{args_model}_{args_task}_metric.csv"
+         data.to_csv(path)
+         print(f"Data saved at path: {path}")
+     return data
+
+ if __name__ == "__main__":
+     if sys.gettrace() is None:
+         parser = get_args()
+         config = {
+             "args_model": parser.model_name,
+             "dataset_name": parser.perturb_dataset,
+             "args_task": parser.task,
+             "default_gpu": parser.gpu,
+             "save": parser.save,
+             "target_lang": parser.target_lang,
+             "metric": parser.metric,
+             "batch_size": 2
+         }
+     else:
+         config = {
+             "args_model": "llama3",
+             "dataset_name": "mrpc",
+             "args_task": "syn",
+             "default_gpu": "cuda:2",
+             "save": False,
+             "target_lang": "en"
+         }
+     run(**config)
+
+
+ # file_path = "/home/yash/ALIGN-SIM/data/perturbed_dataset/en/anto/mrpc_anto_perturbed_en.csv"
+ # run("llama3", "mrpc_anto_perturbed_en", "anto", "cuda:2", False)
src/main_args.py ADDED
@@ -0,0 +1,69 @@
+ from argparse import ArgumentParser
+
+
+ def get_args():
+     """
+     Parses command-line arguments for the evaluation script.
+     Returns:
+         argparse.Namespace: Parsed arguments.
+     """
+
+     parser = ArgumentParser()
+     parser.add_argument(
+         "--perturb_dataset",
+         dest="perturb_dataset",
+         required=True,
+         help="Name of the CSV file"
+     )
+     parser.add_argument(
+         "--task",
+         dest="task",
+         required=True,
+         choices=["anto", "jumbling", "syn", "paraphrase"],
+         help="Task to perform: anto/jumbling/syn/paraphrase",
+     )
+     parser.add_argument(
+         "--M",
+         dest="model_name",
+         required=True,
+         help="LLM Model")
+
+     parser.add_argument(
+         "--target_lang",
+         dest="target_lang",
+         required=True,
+         default="en",
+         help="Language for translation"
+     )
+
+     parser.add_argument(
+         "--save",
+         dest="save",
+         action="store_true",
+         help="Save the results in a CSV file",
+     )
+
+     parser.add_argument(
+         "--gpu",
+         dest="gpu",
+         default="auto",
+         help="GPU to run the model"
+     )
+
+     parser.add_argument(
+         "--batch_size",
+         dest="batch_size",
+         type=int,
+         default=16,
+         help="Batch size for translation"
+     )
+
+     parser.add_argument(
+         "--metric",
+         dest="metric",
+         type=str,
+         default="cosine",
+         choices=["cosine", "ned", "both"],
+         help="Metric to use for comparison",
+     )
+     return parser.parse_args()
src/metrics.py ADDED
@@ -0,0 +1,147 @@
+ from abc import ABC, abstractmethod
+ import numpy as np
+
+ # Optional: import torch if available for type checking
+ try:
+     import torch
+ except ImportError:
+     torch = None
+
+ def to_numpy(arr) -> np.ndarray:
+     """
+     Converts the input array (which can be a numpy array, torch tensor, or list) to a numpy array.
+     """
+     # Check for torch.Tensor if torch is available
+     if torch is not None and isinstance(arr, torch.Tensor):
+         # Detach and move to CPU if needed, then convert to numpy
+         return arr.detach().cpu().numpy()
+     # If it's already a numpy array, return as is
+     if isinstance(arr, np.ndarray):
+         return arr
+     # Otherwise, try converting to a numpy array
+     return np.array(arr)
+
+ class Metric(ABC):
+     """
+     Abstract base class for evaluation metrics.
+     Subclasses must implement the compute method.
+     """
+     @abstractmethod
+     def compute(self, vector1, vector2) -> float:
+         """
+         Compute the metric between two vectors.
+
+         Args:
+             vector1: The first vector (numpy array, torch tensor, list, etc.).
+             vector2: The second vector (numpy array, torch tensor, list, etc.).
+
+         Returns:
+             float: The computed metric value.
+         """
+         pass
+
+ class CosineMetric(Metric):
+     """
+     Implementation of the cosine similarity metric.
+     """
+     def compute(self, vector1, vector2) -> float:
+         # Convert inputs to numpy arrays
+         vec1 = to_numpy(vector1)
+         vec2 = to_numpy(vector2)
+
+         dot_product = np.dot(vec1, vec2)
+         norm1 = np.linalg.norm(vec1)
+         norm2 = np.linalg.norm(vec2)
+         if norm1 == 0 or norm2 == 0:
+             return 0.0
+         return dot_product / (norm1 * norm2)
+
+ class NEDMetric(Metric):
+     """
+     Implementation of a normalized Euclidean distance metric.
+     """
+     def compute(self, vector1, vector2) -> float:
+         # Convert inputs to numpy arrays
+         vec1 = to_numpy(vector1)
+         vec2 = to_numpy(vector2)
+
+         euclidean_distance = np.linalg.norm(vec1 - vec2)
+         norm_sum = np.linalg.norm(vec1) + np.linalg.norm(vec2)
+         if norm_sum == 0:
+             return 0.0
+         return euclidean_distance / norm_sum
+
+ class EuclideanMetric(Metric):
+     def compute(self, vector1, vector2) -> float:
+         return np.linalg.norm(vector1 - vector2, axis=1)
+
+ def dot_product(x, y):
+     return np.dot(x, y.T)
+
+ def compute_ned_distance(x, y):
+     return 0.5 * np.var(x - y) / (np.var(x) + np.var(y))
+
+ def batch_NED(batch_u, batch_v):
+     batch_u = np.array(batch_u)
+     batch_v = np.array(batch_v)
+
+     # Ensure batch_u and batch_v have the same number of elements
+     assert batch_u.shape[0] == batch_v.shape[0], "The batch sizes of u and v must be the same."
+
+     scores = []
+
+     for u, v in zip(batch_u, batch_v):
+         u = np.array(u)
+         v = np.array(v)
+
+         u_mean = np.mean(u)
+         v_mean = np.mean(v)
+
+         u_centered = u - u_mean
+         v_centered = v - v_mean
+
+         numerator = np.linalg.norm(u_centered - v_centered, ord=2)**2
+         denominator = np.linalg.norm(u_centered, ord=2)**2 + np.linalg.norm(v_centered, ord=2)**2
+
+         ned_score = 0.5 * numerator / denominator
+         scores.append(ned_score)
+
+     return np.array(scores)
+
+
+ def NED2(u, v):
+     u = np.array(u)
+     v = np.array(v)
+
+     u_mean = np.mean(u)
+     v_mean = np.mean(v)
+
+     u_centered = u - u_mean
+     v_centered = v - v_mean
+
+     numerator = np.linalg.norm(u_centered - v_centered, ord=2)**2
+     denominator = np.linalg.norm(u_centered, ord=2)**2 + np.linalg.norm(v_centered, ord=2)**2
+
+     return 0.5 * numerator / denominator
+
+ # --- Example Usage ---
+ if __name__ == "__main__":
+     # Example inputs: a numpy array and a torch tensor (if torch is available)
+     vec_np = np.array([1.0, 2.0, 3.0])
+     if torch is not None:
+         vec_torch = torch.tensor([4.0, 5.0, 6.0])
+     else:
+         vec_torch = [4.0, 5.0, 6.0]  # fallback list
+
+     cosine = CosineMetric()
+     ned = NEDMetric()
+
+     print("Cosine Similarity:", cosine.compute(vec_np, vec_torch))
+     print("Normalized Euclidean Distance:", ned.compute(vec_np, vec_torch))
+
+     # x = [20, 30, 2]
+     # y = [1.0, 2.0, 3.0]
+     # print("Dot Product: ", dot_product(x, y))
+     # print(euclidean_distance(x, y))
+     # # print(NED(x, y))
+     # print("Done")
src/utils.py ADDED
@@ -0,0 +1,80 @@
+ import numpy as np
+ import pandas as pd
+ import os
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+ from typing import Union
+
+ def delete_file(file_pt: Path) -> None:
+     try:
+         file_pt.unlink()
+     except FileNotFoundError:
+         pass
+
+
+ def full_path(inp_dir_or_path: str) -> Path:
+     """Returns the full path"""
+     return Path(inp_dir_or_path).expanduser().resolve()
+
+
+ def mkdir_p(inp_dir_or_path: Union[str, Path]) -> Path:
+     """Given a file/dir path, makes sure that all the directories exist"""
+     inp_dir_or_path = full_path(inp_dir_or_path)
+     if inp_dir_or_path.suffix:  # file
+         inp_dir_or_path.parent.mkdir(parents=True, exist_ok=True)
+     else:  # dir
+         inp_dir_or_path.mkdir(parents=True, exist_ok=True)
+     return inp_dir_or_path
+
+ def similarity_between_sent(sent1_encoded, sent2_encoded):
+     """Returns the average cosine similarity and the per-pair scores for two lists of sentence embeddings"""
+     similarity_scores = []
+     for i in range(len(sent1_encoded)):
+         similarity_scores.append(cosine_similarity(
+             sent1_encoded[i], sent2_encoded[i]))
+
+     return np.mean(similarity_scores), similarity_scores
+
+
+ def cosine_similarity(a, b):
+     """
+     Takes 2 vectors a, b and returns the cosine similarity according
+     to the definition of the dot product
+     """
+     dot_product = np.dot(a, b)
+     norm_a = np.linalg.norm(a)
+     norm_b = np.linalg.norm(b)
+     return dot_product / (norm_a * norm_b)
+
+ def load_data(path):
+     if path.endswith(".csv"):
+         data = pd.read_csv(path)
+     else:
+         data = pd.read_csv(path, sep="\t")
+
+     if not isinstance(data, pd.DataFrame):
+         raise ValueError("Data should be in pandas DataFrame format")
+     return data
+
+ def read_data(dataset):
+     if dataset == "mrpc":
+         data = load_data("/home/yash/EMNLP-2024/data/mrpc.csv")
+         data = data.copy()
+
+     elif dataset == "qqp":
+         data = load_data("/home/yash/EMNLP-2024/data/qoura.csv")
+         data = data.copy().dropna()
+         # Handle irregularities in column names
+         data.columns = data.columns.str.strip()
+         data = data.rename(columns={"is_duplicate": "label", 'question1': "sentence1", "question2": "sentence2"})
+
+     elif dataset in ["paws", "paw", "wiki"]:
+         path = "/home/yash/EMNLP-2024/data/paw_wiki.tsv"
+         data = load_data(path)
+         data = data.copy()
+
+     else:
+         raise ValueError("No dataset found.")
+
+     return data