GlowCheese committed

Commit 7587354 · 1 Parent(s): 51eaae4

contrastive commit 1
base_bert.py CHANGED
@@ -5,244 +5,244 @@ from utils import *
5
 
6
 
7
  class BertPreTrainedModel(nn.Module):
8
- config_class = BertConfig
9
- base_model_prefix = "bert"
10
- _keys_to_ignore_on_load_missing = [r"position_ids"]
11
- _keys_to_ignore_on_load_unexpected = None
12
-
13
- def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
14
- super().__init__()
15
- self.config = config
16
- self.name_or_path = config.name_or_path
17
-
18
- def init_weights(self):
19
- # Initialize weights
20
- self.apply(self._init_weights)
21
-
22
- def _init_weights(self, module):
23
- """ Initialize the weights """
24
- if isinstance(module, (nn.Linear, nn.Embedding)):
25
- # Slightly different from the TF version which uses truncated_normal for initialization
26
- # cf https://github.com/pytorch/pytorch/pull/5617
27
- module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
28
- elif isinstance(module, nn.LayerNorm):
29
- module.bias.data.zero_()
30
- module.weight.data.fill_(1.0)
31
- if isinstance(module, nn.Linear) and module.bias is not None:
32
- module.bias.data.zero_()
33
-
34
- @property
35
- def dtype(self) -> dtype:
36
- return get_parameter_dtype(self)
37
-
38
- @classmethod
39
- def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
40
- config = kwargs.pop("config", None)
41
- state_dict = kwargs.pop("state_dict", None)
42
- cache_dir = kwargs.pop("cache_dir", None)
43
- force_download = kwargs.pop("force_download", False)
44
- resume_download = kwargs.pop("resume_download", False)
45
- proxies = kwargs.pop("proxies", None)
46
- output_loading_info = kwargs.pop("output_loading_info", False)
47
- local_files_only = kwargs.pop("local_files_only", False)
48
- use_auth_token = kwargs.pop("use_auth_token", None)
49
- revision = kwargs.pop("revision", None)
50
- mirror = kwargs.pop("mirror", None)
51
-
52
- # Load config if we don't provide a configuration
53
- if not isinstance(config, PretrainedConfig):
54
- config_path = config if config is not None else pretrained_model_name_or_path
55
- config, model_kwargs = cls.config_class.from_pretrained(
56
- config_path,
57
- *model_args,
58
- cache_dir=cache_dir,
59
- return_unused_kwargs=True,
60
- force_download=force_download,
61
- resume_download=resume_download,
62
- proxies=proxies,
63
- local_files_only=local_files_only,
64
- use_auth_token=use_auth_token,
65
- revision=revision,
66
- **kwargs,
67
- )
68
- else:
69
- model_kwargs = kwargs
70
-
71
- # Load model
72
- if pretrained_model_name_or_path is not None:
73
- pretrained_model_name_or_path = str(pretrained_model_name_or_path)
74
- if os.path.isdir(pretrained_model_name_or_path):
75
- # Load from a PyTorch checkpoint
76
- archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
77
- elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
78
- archive_file = pretrained_model_name_or_path
79
- else:
80
- archive_file = hf_bucket_url(
81
- pretrained_model_name_or_path,
82
- filename=WEIGHTS_NAME,
83
- revision=revision,
84
- mirror=mirror,
85
- )
86
- try:
87
- # Load from URL or cache if already cached
88
- resolved_archive_file = cached_path(
89
- archive_file,
90
- cache_dir=cache_dir,
91
- force_download=force_download,
92
- proxies=proxies,
93
- resume_download=resume_download,
94
- local_files_only=local_files_only,
95
- use_auth_token=use_auth_token,
96
- )
97
- except EnvironmentError as err:
98
- #logger.error(err)
99
- msg = (
100
- f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
101
- f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
102
- f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}.\n\n"
103
- )
104
- raise EnvironmentError(msg)
105
- else:
106
- resolved_archive_file = None
107
-
108
- config.name_or_path = pretrained_model_name_or_path
109
-
110
- # Instantiate model.
111
- model = cls(config, *model_args, **model_kwargs)
112
-
113
- if state_dict is None:
114
- try:
115
- state_dict = torch.load(resolved_archive_file, map_location="cpu", weights_only=True)
116
- except Exception:
117
- raise OSError(
118
- f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
119
- f"at '{resolved_archive_file}'"
120
- )
121
-
122
- missing_keys = []
123
- unexpected_keys = []
124
- error_msgs = []
125
-
126
- # Convert old format to new format if needed from a PyTorch state_dict
127
- old_keys = []
128
- new_keys = []
129
- m = {'embeddings.word_embeddings': 'word_embedding',
130
- 'embeddings.position_embeddings': 'pos_embedding',
131
- 'embeddings.token_type_embeddings': 'tk_type_embedding',
132
- 'embeddings.LayerNorm': 'embed_layer_norm',
133
- 'embeddings.dropout': 'embed_dropout',
134
- 'encoder.layer': 'bert_layers',
135
- 'pooler.dense': 'pooler_dense',
136
- 'pooler.activation': 'pooler_af',
137
- 'attention.self': "self_attention",
138
- 'attention.output.dense': 'attention_dense',
139
- 'attention.output.LayerNorm': 'attention_layer_norm',
140
- 'attention.output.dropout': 'attention_dropout',
141
- 'intermediate.dense': 'interm_dense',
142
- 'intermediate.intermediate_act_fn': 'interm_af',
143
- 'output.dense': 'out_dense',
144
- 'output.LayerNorm': 'out_layer_norm',
145
- 'output.dropout': 'out_dropout'}
146
-
147
- for key in state_dict.keys():
148
- new_key = None
149
- if "gamma" in key:
150
- new_key = key.replace("gamma", "weight")
151
- if "beta" in key:
152
- new_key = key.replace("beta", "bias")
153
- for x, y in m.items():
154
- if new_key is not None:
155
- _key = new_key
156
  else:
157
- _key = key
158
- if x in key:
159
- new_key = _key.replace(x, y)
160
- if new_key:
161
- old_keys.append(key)
162
- new_keys.append(new_key)
163
-
164
- for old_key, new_key in zip(old_keys, new_keys):
165
- # print(old_key, new_key)
166
- state_dict[new_key] = state_dict.pop(old_key)
167
-
168
- # copy state_dict so _load_from_state_dict can modify it
169
- metadata = getattr(state_dict, "_metadata", None)
170
- state_dict = state_dict.copy()
171
- if metadata is not None:
172
- state_dict._metadata = metadata
173
-
174
- your_bert_params = [f"bert.{x[0]}" for x in model.named_parameters()]
175
- for k in state_dict:
176
- if k not in your_bert_params and not k.startswith("cls."):
177
- possible_rename = [x for x in k.split(".")[1:-1] if x in m.values()]
178
- raise ValueError(f"{k} cannot be reloaded into your model; one/some of {possible_rename} we provided have been renamed")
179
-
180
- # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
181
- # so we need to apply the function recursively.
182
- def load(module: nn.Module, prefix=""):
183
- local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
184
- module._load_from_state_dict(
185
- state_dict,
186
- prefix,
187
- local_metadata,
188
- True,
189
- missing_keys,
190
- unexpected_keys,
191
- error_msgs,
192
- )
193
- for name, child in module._modules.items():
194
- if child is not None:
195
- load(child, prefix + name + ".")
196
-
197
- # Make sure we are able to load base models as well as derived models (with heads)
198
- start_prefix = ""
199
- model_to_load = model
200
- has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
201
- if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
202
- start_prefix = cls.base_model_prefix + "."
203
- if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
204
- model_to_load = getattr(model, cls.base_model_prefix)
205
- load(model_to_load, prefix=start_prefix)
206
-
207
- if model.__class__.__name__ != model_to_load.__class__.__name__:
208
- base_model_state_dict = model_to_load.state_dict().keys()
209
- head_model_state_dict_without_base_prefix = [
210
- key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
211
- ]
212
- missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict)
213
-
214
- # Some models may have keys that are not in the state by design, removing them before needlessly warning
215
- # the user.
216
- if cls._keys_to_ignore_on_load_missing is not None:
217
- for pat in cls._keys_to_ignore_on_load_missing:
218
- missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
219
-
220
- if cls._keys_to_ignore_on_load_unexpected is not None:
221
- for pat in cls._keys_to_ignore_on_load_unexpected:
222
- unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
223
-
224
- if len(error_msgs) > 0:
225
- raise RuntimeError(
226
- "Error(s) in loading state_dict for {}:\n\t{}".format(
227
- model.__class__.__name__, "\n\t".join(error_msgs)
228
- )
229
- )
230
-
231
- # Set model in evaluation mode to deactivate DropOut modules by default
232
- model.eval()
233
-
234
- if output_loading_info:
235
- loading_info = {
236
- "missing_keys": missing_keys,
237
- "unexpected_keys": unexpected_keys,
238
- "error_msgs": error_msgs,
239
- }
240
- return model, loading_info
241
-
242
- if hasattr(config, "xla_device") and config.xla_device and is_torch_tpu_available():
243
- import torch_xla.core.xla_model as xm
244
-
245
- model = xm.send_cpu_data_to_device(model, xm.xla_device())
246
- model.to(xm.xla_device())
247
-
248
- return model
5
 
6
 
7
  class BertPreTrainedModel(nn.Module):
8
+ config_class = BertConfig
9
+ base_model_prefix = "bert"
10
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
11
+ _keys_to_ignore_on_load_unexpected = None
12
+
13
+ def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
14
+ super().__init__()
15
+ self.config = config
16
+ self.name_or_path = config.name_or_path
17
+
18
+ def init_weights(self):
19
+ # Initialize weights
20
+ self.apply(self._init_weights)
21
+
22
+ def _init_weights(self, module):
23
+ """ Initialize the weights """
24
+ if isinstance(module, (nn.Linear, nn.Embedding)):
25
+ # Slightly different from the TF version which uses truncated_normal for initialization
26
+ # cf https://github.com/pytorch/pytorch/pull/5617
27
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
28
+ elif isinstance(module, nn.LayerNorm):
29
+ module.bias.data.zero_()
30
+ module.weight.data.fill_(1.0)
31
+ if isinstance(module, nn.Linear) and module.bias is not None:
32
+ module.bias.data.zero_()
33
+
34
+ @property
35
+ def dtype(self) -> dtype:
36
+ return get_parameter_dtype(self)
37
+
38
+ @classmethod
39
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
40
+ config = kwargs.pop("config", None)
41
+ state_dict = kwargs.pop("state_dict", None)
42
+ cache_dir = kwargs.pop("cache_dir", None)
43
+ force_download = kwargs.pop("force_download", False)
44
+ resume_download = kwargs.pop("resume_download", False)
45
+ proxies = kwargs.pop("proxies", None)
46
+ output_loading_info = kwargs.pop("output_loading_info", False)
47
+ local_files_only = kwargs.pop("local_files_only", False)
48
+ use_auth_token = kwargs.pop("use_auth_token", None)
49
+ revision = kwargs.pop("revision", None)
50
+ mirror = kwargs.pop("mirror", None)
51
+
52
+ # Load config if we don't provide a configuration
53
+ if not isinstance(config, PretrainedConfig):
54
+ config_path = config if config is not None else pretrained_model_name_or_path
55
+ config, model_kwargs = cls.config_class.from_pretrained(
56
+ config_path,
57
+ *model_args,
58
+ cache_dir=cache_dir,
59
+ return_unused_kwargs=True,
60
+ force_download=force_download,
61
+ resume_download=resume_download,
62
+ proxies=proxies,
63
+ local_files_only=local_files_only,
64
+ use_auth_token=use_auth_token,
65
+ revision=revision,
66
+ **kwargs,
67
+ )
68
  else:
69
+ model_kwargs = kwargs
70
+
71
+ # Load model
72
+ if pretrained_model_name_or_path is not None:
73
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
74
+ if os.path.isdir(pretrained_model_name_or_path):
75
+ # Load from a PyTorch checkpoint
76
+ archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
77
+ elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
78
+ archive_file = pretrained_model_name_or_path
79
+ else:
80
+ archive_file = hf_bucket_url(
81
+ pretrained_model_name_or_path,
82
+ filename=WEIGHTS_NAME,
83
+ revision=revision,
84
+ mirror=mirror,
85
+ )
86
+ try:
87
+ # Load from URL or cache if already cached
88
+ resolved_archive_file = cached_path(
89
+ archive_file,
90
+ cache_dir=cache_dir,
91
+ force_download=force_download,
92
+ proxies=proxies,
93
+ resume_download=resume_download,
94
+ local_files_only=local_files_only,
95
+ use_auth_token=use_auth_token,
96
+ )
97
+ except EnvironmentError as err:
98
+ #logger.error(err)
99
+ msg = (
100
+ f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
101
+ f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
102
+ f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}.\n\n"
103
+ )
104
+ raise EnvironmentError(msg)
105
+ else:
106
+ resolved_archive_file = None
107
+
108
+ config.name_or_path = pretrained_model_name_or_path
109
+
110
+ # Instantiate model.
111
+ model = cls(config, *model_args, **model_kwargs)
112
+
113
+ if state_dict is None:
114
+ try:
115
+ state_dict = torch.load(resolved_archive_file, map_location="cpu", weights_only=True)
116
+ except Exception:
117
+ raise OSError(
118
+ f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
119
+ f"at '{resolved_archive_file}'"
120
+ )
121
+
122
+ missing_keys = []
123
+ unexpected_keys = []
124
+ error_msgs = []
125
+
126
+ # Convert old format to new format if needed from a PyTorch state_dict
127
+ old_keys = []
128
+ new_keys = []
129
+ m = {'embeddings.word_embeddings': 'word_embedding',
130
+ 'embeddings.position_embeddings': 'pos_embedding',
131
+ 'embeddings.token_type_embeddings': 'tk_type_embedding',
132
+ 'embeddings.LayerNorm': 'embed_layer_norm',
133
+ 'embeddings.dropout': 'embed_dropout',
134
+ 'encoder.layer': 'bert_layers',
135
+ 'pooler.dense': 'pooler_dense',
136
+ 'pooler.activation': 'pooler_af',
137
+ 'attention.self': "self_attention",
138
+ 'attention.output.dense': 'attention_dense',
139
+ 'attention.output.LayerNorm': 'attention_layer_norm',
140
+ 'attention.output.dropout': 'attention_dropout',
141
+ 'intermediate.dense': 'interm_dense',
142
+ 'intermediate.intermediate_act_fn': 'interm_af',
143
+ 'output.dense': 'out_dense',
144
+ 'output.LayerNorm': 'out_layer_norm',
145
+ 'output.dropout': 'out_dropout'}
146
+
147
+ for key in state_dict.keys():
148
+ new_key = None
149
+ if "gamma" in key:
150
+ new_key = key.replace("gamma", "weight")
151
+ if "beta" in key:
152
+ new_key = key.replace("beta", "bias")
153
+ for x, y in m.items():
154
+ if new_key is not None:
155
+ _key = new_key
156
+ else:
157
+ _key = key
158
+ if x in key:
159
+ new_key = _key.replace(x, y)
160
+ if new_key:
161
+ old_keys.append(key)
162
+ new_keys.append(new_key)
163
+
164
+ for old_key, new_key in zip(old_keys, new_keys):
165
+ # print(old_key, new_key)
166
+ state_dict[new_key] = state_dict.pop(old_key)
167
+
168
+ # copy state_dict so _load_from_state_dict can modify it
169
+ metadata = getattr(state_dict, "_metadata", None)
170
+ state_dict = state_dict.copy()
171
+ if metadata is not None:
172
+ state_dict._metadata = metadata
173
+
174
+ your_bert_params = [f"bert.{x[0]}" for x in model.named_parameters()]
175
+ for k in state_dict:
176
+ if k not in your_bert_params and not k.startswith("cls."):
177
+ possible_rename = [x for x in k.split(".")[1:-1] if x in m.values()]
178
+ raise ValueError(f"{k} cannot be reloaded into your model; one/some of {possible_rename} we provided have been renamed")
179
+
180
+ # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
181
+ # so we need to apply the function recursively.
182
+ def load(module: nn.Module, prefix=""):
183
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
184
+ module._load_from_state_dict(
185
+ state_dict,
186
+ prefix,
187
+ local_metadata,
188
+ True,
189
+ missing_keys,
190
+ unexpected_keys,
191
+ error_msgs,
192
+ )
193
+ for name, child in module._modules.items():
194
+ if child is not None:
195
+ load(child, prefix + name + ".")
196
+
197
+ # Make sure we are able to load base models as well as derived models (with heads)
198
+ start_prefix = ""
199
+ model_to_load = model
200
+ has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
201
+ if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
202
+ start_prefix = cls.base_model_prefix + "."
203
+ if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
204
+ model_to_load = getattr(model, cls.base_model_prefix)
205
+ load(model_to_load, prefix=start_prefix)
206
+
207
+ if model.__class__.__name__ != model_to_load.__class__.__name__:
208
+ base_model_state_dict = model_to_load.state_dict().keys()
209
+ head_model_state_dict_without_base_prefix = [
210
+ key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
211
+ ]
212
+ missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict)
213
+
214
+ # Some models may have keys that are not in the state by design, removing them before needlessly warning
215
+ # the user.
216
+ if cls._keys_to_ignore_on_load_missing is not None:
217
+ for pat in cls._keys_to_ignore_on_load_missing:
218
+ missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
219
+
220
+ if cls._keys_to_ignore_on_load_unexpected is not None:
221
+ for pat in cls._keys_to_ignore_on_load_unexpected:
222
+ unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
223
+
224
+ if len(error_msgs) > 0:
225
+ raise RuntimeError(
226
+ "Error(s) in loading state_dict for {}:\n\t{}".format(
227
+ model.__class__.__name__, "\n\t".join(error_msgs)
228
+ )
229
+ )
230
+
231
+ # Set model in evaluation mode to deactivate DropOut modules by default
232
+ model.eval()
233
+
234
+ if output_loading_info:
235
+ loading_info = {
236
+ "missing_keys": missing_keys,
237
+ "unexpected_keys": unexpected_keys,
238
+ "error_msgs": error_msgs,
239
+ }
240
+ return model, loading_info
241
+
242
+ if hasattr(config, "xla_device") and config.xla_device and is_torch_tpu_available():
243
+ import torch_xla.core.xla_model as xm
244
+
245
+ model = xm.send_cpu_data_to_device(model, xm.xla_device())
246
+ model.to(xm.xla_device())
247
+
248
+ return model
classifier.py CHANGED
@@ -3,6 +3,7 @@ from types import SimpleNamespace
3
  import csv
4
 
5
  import torch
 
6
  import torch.nn.functional as F
7
  from torch.utils.data import Dataset, DataLoader
8
  from sklearn.metrics import f1_score, accuracy_score
@@ -10,7 +11,6 @@ from sklearn.metrics import f1_score, accuracy_score
10
  from tokenizer import BertTokenizer
11
  from bert import BertModel
12
  from optimizer import AdamW
13
- from tqdm import tqdm
14
 
15
 
16
  TQDM_DISABLE=False
@@ -34,10 +34,10 @@ class BertSentimentClassifier(torch.nn.Module):
34
  In the SST dataset, there are 5 sentiment categories (from 0 - "negative" to 4 - "positive").
35
  Thus, your forward() should return one logit for each of the 5 classes.
36
  '''
37
- def __init__(self, config):
38
  super(BertSentimentClassifier, self).__init__()
39
  self.num_labels = config.num_labels
40
- self.bert: BertModel = BertModel.from_pretrained('bert-base-uncased')
41
 
42
  # Pretrain mode does not require updating BERT parameters.
43
  assert config.fine_tune_mode in ["last-linear-layer", "full-model"]
@@ -59,26 +59,21 @@ class BertSentimentClassifier(torch.nn.Module):
59
  # the training loop currently uses F.cross_entropy as the loss function.
60
 
61
  # Get the embedding for each input token.
62
- embedding_output = self.bert.embed(input_ids=input_ids)
63
-
64
- # Feed to a transformer (BERT layers).
65
- sequence_output = self.bert.encode(embedding_output, attention_mask=attention_mask)
66
-
67
- # The final BERT contextualized embedding is the hidden state of [CLS] token (the first token).
68
- cls_token_output = sequence_output[:, 0, :] # The first token is [CLS]
69
 
70
  # Pass the [CLS] token representation through the classifier.
71
- logits = self.classifier(self.dropout(cls_token_output))
72
 
73
  return logits
74
 
75
 
 
76
 
77
  class SentimentDataset(Dataset):
78
  def __init__(self, dataset, args):
79
  self.dataset = dataset
80
  self.p = args
81
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
82
 
83
  def __len__(self):
84
  return len(self.dataset)
@@ -91,7 +86,7 @@ class SentimentDataset(Dataset):
91
  labels = [x[1] for x in data]
92
  sent_ids = [x[2] for x in data]
93
 
94
- encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
95
  token_ids = torch.LongTensor(encoding['input_ids'])
96
  attention_mask = torch.LongTensor(encoding['attention_mask'])
97
  labels = torch.LongTensor(labels)
@@ -99,15 +94,15 @@ class SentimentDataset(Dataset):
99
  return token_ids, attention_mask, labels, sents, sent_ids
100
 
101
  def collate_fn(self, all_data):
102
- token_ids, attention_mask, labels, sents, sent_ids= self.pad_data(all_data)
103
 
104
  batched_data = {
105
- 'token_ids': token_ids,
106
- 'attention_mask': attention_mask,
107
- 'labels': labels,
108
- 'sents': sents,
109
- 'sent_ids': sent_ids
110
- }
111
 
112
  return batched_data
113
 
@@ -116,7 +111,6 @@ class SentimentTestDataset(Dataset):
116
  def __init__(self, dataset, args):
117
  self.dataset = dataset
118
  self.p = args
119
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
120
 
121
  def __len__(self):
122
  return len(self.dataset)
@@ -128,7 +122,7 @@ class SentimentTestDataset(Dataset):
128
  sents = [x[0] for x in data]
129
  sent_ids = [x[1] for x in data]
130
 
131
- encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
132
  token_ids = torch.LongTensor(encoding['input_ids'])
133
  attention_mask = torch.LongTensor(encoding['attention_mask'])
134
 
@@ -138,34 +132,31 @@ class SentimentTestDataset(Dataset):
138
  token_ids, attention_mask, sents, sent_ids= self.pad_data(all_data)
139
 
140
  batched_data = {
141
- 'token_ids': token_ids,
142
- 'attention_mask': attention_mask,
143
- 'sents': sents,
144
- 'sent_ids': sent_ids
145
- }
146
 
147
  return batched_data
148
 
149
 
150
  # Load the data: a list of (sentence, label).
151
  def load_data(filename, flag='train'):
152
- num_labels = {}
153
  data = []
154
- if flag == 'test':
155
- with open(filename, 'r') as fp:
156
- for record in csv.DictReader(fp,delimiter = '\t'):
157
  sent = record['sentence'].lower().strip()
158
  sent_id = record['id'].lower().strip()
159
  data.append((sent,sent_id))
160
- else:
161
- with open(filename, 'r') as fp:
162
- for record in csv.DictReader(fp,delimiter = '\t'):
163
  sent = record['sentence'].lower().strip()
164
  sent_id = record['id'].lower().strip()
165
  label = int(record['sentiment'].strip())
166
- if label not in num_labels:
167
- num_labels[label] = len(num_labels)
168
- data.append((sent, label,sent_id))
169
  print(f"load {len(data)} data from {filename}")
170
 
171
  if flag == 'train':
@@ -253,9 +244,9 @@ def train(args):
253
  dev_dataset = SentimentDataset(dev_data, args)
254
 
255
  train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
256
- collate_fn=train_dataset.collate_fn)
257
  dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
258
- collate_fn=dev_dataset.collate_fn)
259
 
260
  # Init model.
261
  config = {'hidden_dropout_prob': args.hidden_dropout_prob,
@@ -311,7 +302,7 @@ def train(args):
311
  def test(args):
312
  with torch.no_grad():
313
  device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
314
- saved = torch.load(args.filepath)
315
  config = saved['model_config']
316
  model = BertSentimentClassifier(config)
317
  model.load_state_dict(saved['model'])
@@ -320,38 +311,44 @@ def test(args):
320
 
321
  dev_data = load_data(args.dev, 'valid')
322
  dev_dataset = SentimentDataset(dev_data, args)
323
- dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size, collate_fn=dev_dataset.collate_fn)
 
324
 
325
- test_data = load_data(args.test, 'test')
326
- test_dataset = SentimentTestDataset(test_data, args)
327
- test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=args.batch_size, collate_fn=test_dataset.collate_fn)
328
-
329
  dev_acc, dev_f1, dev_pred, dev_true, dev_sents, dev_sent_ids = model_eval(dev_dataloader, model, device)
330
  print('DONE DEV')
331
- test_pred, test_sents, test_sent_ids = model_test_eval(test_dataloader, model, device)
332
- print('DONE Test')
333
- with open(args.dev_out, "w+") as f:
334
- print(f"dev acc :: {dev_acc :.3f}")
335
- f.write(f"id \t Predicted_Sentiment \n")
336
- for p, s in zip(dev_sent_ids,dev_pred ):
337
- f.write(f"{p} , {s} \n")
338
-
339
- with open(args.test_out, "w+") as f:
340
- f.write(f"id \t Predicted_Sentiment \n")
341
- for p, s in zip(test_sent_ids,test_pred ):
342
- f.write(f"{p} , {s} \n")
343
 
344
 
345
  def get_args():
346
  parser = argparse.ArgumentParser()
347
  parser.add_argument("--seed", type=int, default=11711)
 
348
  parser.add_argument("--epochs", type=int, default=10)
349
  parser.add_argument("--fine-tune-mode", type=str,
350
  help='last-linear-layer: the BERT parameters are frozen and the task specific head parameters are updated; full-model: BERT parameters are updated as well',
351
  choices=('last-linear-layer', 'full-model'), default="last-linear-layer")
352
  parser.add_argument("--use_gpu", action='store_true')
353
 
354
- parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
 
355
  parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
356
  parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
357
  default=1e-3)
@@ -360,17 +357,21 @@ def get_args():
360
  return args
361
 
362
 
363
- if __name__ == "__main__":
364
  args = get_args()
365
  seed_everything(args.seed)
366
 
367
  print('Training Sentiment Classifier on SST...')
368
  config = SimpleNamespace(
369
  filepath='sst-classifier.pt',
370
  lr=args.lr,
 
371
  use_gpu=args.use_gpu,
372
  epochs=args.epochs,
373
- batch_size=args.batch_size,
374
  hidden_dropout_prob=args.hidden_dropout_prob,
375
  train='data/ids-sst-train.csv',
376
  dev='data/ids-sst-dev.csv',
@@ -389,9 +390,10 @@ if __name__ == "__main__":
389
  config = SimpleNamespace(
390
  filepath='cfimdb-classifier.pt',
391
  lr=args.lr,
 
392
  use_gpu=args.use_gpu,
393
  epochs=args.epochs,
394
- batch_size=8,
395
  hidden_dropout_prob=args.hidden_dropout_prob,
396
  train='data/ids-cfimdb-train.csv',
397
  dev='data/ids-cfimdb-dev.csv',
@@ -405,3 +407,7 @@ if __name__ == "__main__":
405
 
406
  print('Evaluating on cfimdb...')
407
  test(config)
3
  import csv
4
 
5
  import torch
6
+ from tqdm import tqdm
7
  import torch.nn.functional as F
8
  from torch.utils.data import Dataset, DataLoader
9
  from sklearn.metrics import f1_score, accuracy_score
 
11
  from tokenizer import BertTokenizer
12
  from bert import BertModel
13
  from optimizer import AdamW
 
14
 
15
 
16
  TQDM_DISABLE=False
 
34
  In the SST dataset, there are 5 sentiment categories (from 0 - "negative" to 4 - "positive").
35
  Thus, your forward() should return one logit for each of the 5 classes.
36
  '''
37
+ def __init__(self, config, bert_model = None):
38
  super(BertSentimentClassifier, self).__init__()
39
  self.num_labels = config.num_labels
40
+ self.bert: BertModel = bert_model or BertModel.from_pretrained('bert-base-uncased')
41
 
42
  # Pretrain mode does not require updating BERT parameters.
43
  assert config.fine_tune_mode in ["last-linear-layer", "full-model"]
 
59
  # the training loop currently uses F.cross_entropy as the loss function.
60
 
61
  # Get the embedding for each input token.
62
+ outputs = self.bert(input_ids, attention_mask)
63
+ pooler_output = outputs['pooler_output']
64
 
65
  # Pass the [CLS] token representation through the classifier.
66
+ logits = self.classifier(self.dropout(pooler_output))
67
 
68
  return logits
69
 
70
 
71
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
72
 
73
  class SentimentDataset(Dataset):
74
  def __init__(self, dataset, args):
75
  self.dataset = dataset
76
  self.p = args
 
77
 
78
  def __len__(self):
79
  return len(self.dataset)
 
86
  labels = [x[1] for x in data]
87
  sent_ids = [x[2] for x in data]
88
 
89
+ encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
90
  token_ids = torch.LongTensor(encoding['input_ids'])
91
  attention_mask = torch.LongTensor(encoding['attention_mask'])
92
  labels = torch.LongTensor(labels)
 
94
  return token_ids, attention_mask, labels, sents, sent_ids
95
 
96
  def collate_fn(self, all_data):
97
+ token_ids, attention_mask, labels, sents, sent_ids = self.pad_data(all_data)
98
 
99
  batched_data = {
100
+ 'token_ids': token_ids,
101
+ 'attention_mask': attention_mask,
102
+ 'labels': labels,
103
+ 'sents': sents,
104
+ 'sent_ids': sent_ids
105
+ }
106
 
107
  return batched_data
108
 
 
111
  def __init__(self, dataset, args):
112
  self.dataset = dataset
113
  self.p = args
 
114
 
115
  def __len__(self):
116
  return len(self.dataset)
 
122
  sents = [x[0] for x in data]
123
  sent_ids = [x[1] for x in data]
124
 
125
+ encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
126
  token_ids = torch.LongTensor(encoding['input_ids'])
127
  attention_mask = torch.LongTensor(encoding['attention_mask'])
128
 
 
132
  token_ids, attention_mask, sents, sent_ids= self.pad_data(all_data)
133
 
134
  batched_data = {
135
+ 'token_ids': token_ids,
136
+ 'attention_mask': attention_mask,
137
+ 'sents': sents,
138
+ 'sent_ids': sent_ids
139
+ }
140
 
141
  return batched_data
142
 
143
 
144
  # Load the data: a list of (sentence, label).
145
  def load_data(filename, flag='train'):
146
+ num_labels = set()
147
  data = []
148
+ with open(filename, 'r') as fp:
149
+ for record in csv.DictReader(fp, delimiter = '\t'):
150
+ if flag == 'test':
151
  sent = record['sentence'].lower().strip()
152
  sent_id = record['id'].lower().strip()
153
  data.append((sent,sent_id))
154
+ else:
155
  sent = record['sentence'].lower().strip()
156
  sent_id = record['id'].lower().strip()
157
  label = int(record['sentiment'].strip())
158
+ num_labels.add(label)
159
+ data.append((sent, label, sent_id))
 
160
  print(f"load {len(data)} data from {filename}")
161
 
162
  if flag == 'train':
 
244
  dev_dataset = SentimentDataset(dev_data, args)
245
 
246
  train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
247
+ num_workers=args.num_cpu_cores, collate_fn=train_dataset.collate_fn)
248
  dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
249
+ num_workers=args.num_cpu_cores, collate_fn=dev_dataset.collate_fn)
250
 
251
  # Init model.
252
  config = {'hidden_dropout_prob': args.hidden_dropout_prob,
 
302
  def test(args):
303
  with torch.no_grad():
304
  device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
305
+ saved = torch.load(args.filepath, weights_only=False)
306
  config = saved['model_config']
307
  model = BertSentimentClassifier(config)
308
  model.load_state_dict(saved['model'])
 
311
 
312
  dev_data = load_data(args.dev, 'valid')
313
  dev_dataset = SentimentDataset(dev_data, args)
314
+ dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
315
+ num_workers=args.num_cpu_cores, collate_fn=dev_dataset.collate_fn)
316
317
  dev_acc, dev_f1, dev_pred, dev_true, dev_sents, dev_sent_ids = model_eval(dev_dataloader, model, device)
318
  print('DONE DEV')
319
+ print(f"dev acc :: {dev_acc :.3f}")
320
+
321
+ # ---- SKIP RUNNING ON TEST DATASET ---- #
322
+ # test_data = load_data(args.test, 'test')
323
+ # test_dataset = SentimentTestDataset(test_data, args)
324
+ # test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=args.batch_size,
325
+ # num_workers=args.num_cpu_cores, collate_fn=test_dataset.collate_fn)
326
+ # test_pred, test_sents, test_sent_ids = model_test_eval(test_dataloader, model, device)
327
+ # print('DONE TEST')
328
+
329
+ # ---- SKIP SAVING PREDICTIONS ----
330
+ # with open(args.dev_out, "w+") as f:
331
+ # f.write(f"id \t Predicted_Sentiment \n")
332
+ # for p, s in zip(dev_sent_ids,dev_pred):
333
+ # f.write(f"{p} , {s} \n")
334
+ # with open(args.test_out, "w+") as f:
335
+ # f.write(f"id \t Predicted_Sentiment \n")
336
+ # for p, s in zip(test_sent_ids,test_pred ):
337
+ # f.write(f"{p} , {s} \n")
338
 
339
 
340
  def get_args():
341
  parser = argparse.ArgumentParser()
342
  parser.add_argument("--seed", type=int, default=11711)
343
+ parser.add_argument("--num-cpu-cores", type=int, default=4)
344
  parser.add_argument("--epochs", type=int, default=10)
345
  parser.add_argument("--fine-tune-mode", type=str,
346
  help='last-linear-layer: the BERT parameters are frozen and the task specific head parameters are updated; full-model: BERT parameters are updated as well',
347
  choices=('last-linear-layer', 'full-model'), default="last-linear-layer")
348
  parser.add_argument("--use_gpu", action='store_true')
349
 
350
+ parser.add_argument("--batch_size_sst", help='64 can fit a 12GB GPU', type=int, default=8)
351
+ parser.add_argument("--batch_size_cfimdb", help='8 can fit a 12GB GPU', type=int, default=8)
352
  parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
353
  parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
354
  default=1e-3)
 
357
  return args
358
 
359
 
360
+ def main():
361
  args = get_args()
362
  seed_everything(args.seed)
363
+ torch.set_num_threads(args.num_cpu_cores)
364
+
365
+ print(torch.get_num_threads())
366
 
367
  print('Training Sentiment Classifier on SST...')
368
  config = SimpleNamespace(
369
  filepath='sst-classifier.pt',
370
  lr=args.lr,
371
+ num_cpu_cores=args.num_cpu_cores,
372
  use_gpu=args.use_gpu,
373
  epochs=args.epochs,
374
+ batch_size=args.batch_size_sst,
375
  hidden_dropout_prob=args.hidden_dropout_prob,
376
  train='data/ids-sst-train.csv',
377
  dev='data/ids-sst-dev.csv',
 
390
  config = SimpleNamespace(
391
  filepath='cfimdb-classifier.pt',
392
  lr=args.lr,
393
+ num_cpu_cores=args.num_cpu_cores,
394
  use_gpu=args.use_gpu,
395
  epochs=args.epochs,
396
+ batch_size=args.batch_size_cfimdb,
397
  hidden_dropout_prob=args.hidden_dropout_prob,
398
  train='data/ids-cfimdb-train.csv',
399
  dev='data/ids-cfimdb-dev.csv',
 
407
 
408
  print('Evaluating on cfimdb...')
409
  test(config)
410
+
411
+
412
+ if __name__ == "__main__":
413
+ main()
data/{sts-test-student.csv → nli-dev.parquet} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dee455745b72e9ca3ff74e7c056bd73e34bad5b8d5641045a2c1e7e131866f47
3
- size 256677
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c267496435885e724abc71e53669fae59db875bfa13389eab8f9b0b2dfb2b32e
3
+ size 782233
data/{sts-train.csv → nli-test.parquet} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15d12efc2d656fffb1d61ac1f08ec4227f43925fd16f420c037cbd063699c21b
3
- size 928832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01688df43ae4c019a86144a0d2351146b124688a55f285071cccd156225a5fdf
3
+ size 810423
data/{quora-test-student.csv → nli-train.parquet} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4fa130f532cdde70287081aa04af13a4b12e3aa862e9162763d15fb46385497a
3
- size 13487951
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9aeca80b1bda983ee316f854ebc37af8341877fb932dd6a2c6aba978ad112a5
3
+ size 38396324
data/{sts-dev.csv → stsb-dev.parquet} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce3cad6f16062586ac7ba462c28b010a9be10c530fd5074165860d7b7ab4e93d
3
- size 132265
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c6e0e9881f1b398abe3e439a482f4686305c3784568c462f6bba58bdff03b0a
3
+ size 142187
data/{quora-dev.csv → stsb-test.parquet} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e9dc46b273a711d82a065f55e1754a9b92c10ad7345ebe0b0ebba61397dda4a
3
- size 6896912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8acbc291c50977d8655934952956016c3e049c2fe04f8a6c454c1bf6acc42ca1
3
+ size 108100
data/stsb-train.parquet ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eae324ff1eac2d0ba769851736eb7232eda64f370a16eb20e74a2c5f8f5fafe0
3
+ size 470612
data/{quora-train.csv → twitter-unsup.csv} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cd59e1ddb3a5b5d03f4a885c64e67aaf50122d9ab9ed7a476b5d2d6f7137ae8
3
- size 48270674
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a7af1ec5fc749ec8e5ea13c574aeb5c06254aa1c081e3421868079d5356b3f4
3
+ size 20895533
datasets.py DELETED
@@ -1,272 +0,0 @@
1
- #!/usr/bin/env python3
2
-
3
- '''
4
- This module contains our Dataset classes and functions that load the three datasets
5
- for training and evaluating multitask BERT.
6
-
7
- Feel free to edit code in this file if you wish to modify the way in which the data
8
- examples are preprocessed.
9
- '''
10
-
11
- import csv
12
-
13
- import torch
14
- from torch.utils.data import Dataset
15
- from tokenizer import BertTokenizer
16
-
17
-
18
- def preprocess_string(s):
19
- return ' '.join(s.lower()
20
- .replace('.', ' .')
21
- .replace('?', ' ?')
22
- .replace(',', ' ,')
23
- .replace('\'', ' \'')
24
- .split())
25
-
26
-
27
- class SentenceClassificationDataset(Dataset):
28
- def __init__(self, dataset, args):
29
- self.dataset = dataset
30
- self.p = args
31
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
32
-
33
- def __len__(self):
34
- return len(self.dataset)
35
-
36
- def __getitem__(self, idx):
37
- return self.dataset[idx]
38
-
39
- def pad_data(self, data):
40
-
41
- sents = [x[0] for x in data]
42
- labels = [x[1] for x in data]
43
- sent_ids = [x[2] for x in data]
44
-
45
- encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
46
- token_ids = torch.LongTensor(encoding['input_ids'])
47
- attention_mask = torch.LongTensor(encoding['attention_mask'])
48
- labels = torch.LongTensor(labels)
49
-
50
- return token_ids, attention_mask, labels, sents, sent_ids
51
-
52
- def collate_fn(self, all_data):
53
- token_ids, attention_mask, labels, sents, sent_ids= self.pad_data(all_data)
54
-
55
- batched_data = {
56
- 'token_ids': token_ids,
57
- 'attention_mask': attention_mask,
58
- 'labels': labels,
59
- 'sents': sents,
60
- 'sent_ids': sent_ids
61
- }
62
-
63
- return batched_data
64
-
65
-
66
- # Unlike SentenceClassificationDataset, we do not load labels in SentenceClassificationTestDataset.
67
- class SentenceClassificationTestDataset(Dataset):
68
- def __init__(self, dataset, args):
69
- self.dataset = dataset
70
- self.p = args
71
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
72
-
73
- def __len__(self):
74
- return len(self.dataset)
75
-
76
- def __getitem__(self, idx):
77
- return self.dataset[idx]
78
-
79
- def pad_data(self, data):
80
- sents = [x[0] for x in data]
81
- sent_ids = [x[1] for x in data]
82
-
83
- encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
84
- token_ids = torch.LongTensor(encoding['input_ids'])
85
- attention_mask = torch.LongTensor(encoding['attention_mask'])
86
-
87
- return token_ids, attention_mask, sents, sent_ids
88
-
89
- def collate_fn(self, all_data):
90
- token_ids, attention_mask, sents, sent_ids= self.pad_data(all_data)
91
-
92
- batched_data = {
93
- 'token_ids': token_ids,
94
- 'attention_mask': attention_mask,
95
- 'sents': sents,
96
- 'sent_ids': sent_ids
97
- }
98
-
99
- return batched_data
100
-
101
-
102
- class SentencePairDataset(Dataset):
103
- def __init__(self, dataset, args, isRegression=False):
104
- self.dataset = dataset
105
- self.p = args
106
- self.isRegression = isRegression
107
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
108
-
109
- def __len__(self):
110
- return len(self.dataset)
111
-
112
- def __getitem__(self, idx):
113
- return self.dataset[idx]
114
-
115
- def pad_data(self, data):
116
- sent1 = [x[0] for x in data]
117
- sent2 = [x[1] for x in data]
118
- labels = [x[2] for x in data]
119
- sent_ids = [x[3] for x in data]
120
-
121
- encoding1 = self.tokenizer(sent1, return_tensors='pt', padding=True, truncation=True)
122
- encoding2 = self.tokenizer(sent2, return_tensors='pt', padding=True, truncation=True)
123
-
124
- token_ids = torch.LongTensor(encoding1['input_ids'])
125
- attention_mask = torch.LongTensor(encoding1['attention_mask'])
126
- token_type_ids = torch.LongTensor(encoding1['token_type_ids'])
127
-
128
- token_ids2 = torch.LongTensor(encoding2['input_ids'])
129
- attention_mask2 = torch.LongTensor(encoding2['attention_mask'])
130
- token_type_ids2 = torch.LongTensor(encoding2['token_type_ids'])
131
- if self.isRegression:
132
- labels = torch.DoubleTensor(labels)
133
- else:
134
- labels = torch.LongTensor(labels)
135
-
136
- return (token_ids, token_type_ids, attention_mask,
137
- token_ids2, token_type_ids2, attention_mask2,
138
- labels,sent_ids)
139
-
140
- def collate_fn(self, all_data):
141
- (token_ids, token_type_ids, attention_mask,
142
- token_ids2, token_type_ids2, attention_mask2,
143
- labels, sent_ids) = self.pad_data(all_data)
144
-
145
- batched_data = {
146
- 'token_ids_1': token_ids,
147
- 'token_type_ids_1': token_type_ids,
148
- 'attention_mask_1': attention_mask,
149
- 'token_ids_2': token_ids2,
150
- 'token_type_ids_2': token_type_ids2,
151
- 'attention_mask_2': attention_mask2,
152
- 'labels': labels,
153
- 'sent_ids': sent_ids
154
- }
155
-
156
- return batched_data
157
-
158
-
159
- # Unlike SentencePairDataset, we do not load labels in SentencePairTestDataset.
160
- class SentencePairTestDataset(Dataset):
161
- def __init__(self, dataset, args):
162
- self.dataset = dataset
163
- self.p = args
164
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
165
-
166
- def __len__(self):
167
- return len(self.dataset)
168
-
169
- def __getitem__(self, idx):
170
- return self.dataset[idx]
171
-
172
- def pad_data(self, data):
173
- sent1 = [x[0] for x in data]
174
- sent2 = [x[1] for x in data]
175
- sent_ids = [x[2] for x in data]
176
-
177
- encoding1 = self.tokenizer(sent1, return_tensors='pt', padding=True, truncation=True)
178
- encoding2 = self.tokenizer(sent2, return_tensors='pt', padding=True, truncation=True)
179
-
180
- token_ids = torch.LongTensor(encoding1['input_ids'])
181
- attention_mask = torch.LongTensor(encoding1['attention_mask'])
182
- token_type_ids = torch.LongTensor(encoding1['token_type_ids'])
183
-
184
- token_ids2 = torch.LongTensor(encoding2['input_ids'])
185
- attention_mask2 = torch.LongTensor(encoding2['attention_mask'])
186
- token_type_ids2 = torch.LongTensor(encoding2['token_type_ids'])
187
-
188
-
189
- return (token_ids, token_type_ids, attention_mask,
190
- token_ids2, token_type_ids2, attention_mask2,
191
- sent_ids)
192
-
193
- def collate_fn(self, all_data):
194
- (token_ids, token_type_ids, attention_mask,
195
- token_ids2, token_type_ids2, attention_mask2,
196
- sent_ids) = self.pad_data(all_data)
197
-
198
- batched_data = {
199
- 'token_ids_1': token_ids,
200
- 'token_type_ids_1': token_type_ids,
201
- 'attention_mask_1': attention_mask,
202
- 'token_ids_2': token_ids2,
203
- 'token_type_ids_2': token_type_ids2,
204
- 'attention_mask_2': attention_mask2,
205
- 'sent_ids': sent_ids
206
- }
207
-
208
- return batched_data
209
-
210
-
211
- def load_multitask_data(sentiment_filename,paraphrase_filename,similarity_filename,split='train'):
212
- sentiment_data = []
213
- num_labels = {}
214
- if split == 'test':
215
- with open(sentiment_filename, 'r') as fp:
216
- for record in csv.DictReader(fp,delimiter = '\t'):
217
- sent = record['sentence'].lower().strip()
218
- sent_id = record['id'].lower().strip()
219
- sentiment_data.append((sent,sent_id))
220
- else:
221
- with open(sentiment_filename, 'r') as fp:
222
- for record in csv.DictReader(fp,delimiter = '\t'):
223
- sent = record['sentence'].lower().strip()
224
- sent_id = record['id'].lower().strip()
225
- label = int(record['sentiment'].strip())
226
- if label not in num_labels:
227
- num_labels[label] = len(num_labels)
228
- sentiment_data.append((sent, label,sent_id))
229
-
230
- print(f"Loaded {len(sentiment_data)} {split} examples from {sentiment_filename}")
231
-
232
- paraphrase_data = []
233
- if split == 'test':
234
- with open(paraphrase_filename, 'r') as fp:
235
- for record in csv.DictReader(fp,delimiter = '\t'):
236
- sent_id = record['id'].lower().strip()
237
- paraphrase_data.append((preprocess_string(record['sentence1']),
238
- preprocess_string(record['sentence2']),
239
- sent_id))
240
-
241
- else:
242
- with open(paraphrase_filename, 'r') as fp:
243
- for record in csv.DictReader(fp,delimiter = '\t'):
244
- try:
245
- sent_id = record['id'].lower().strip()
246
- paraphrase_data.append((preprocess_string(record['sentence1']),
247
- preprocess_string(record['sentence2']),
248
- int(float(record['is_duplicate'])),sent_id))
249
- except:
250
- pass
251
-
252
- print(f"Loaded {len(paraphrase_data)} {split} examples from {paraphrase_filename}")
253
-
254
- similarity_data = []
255
- if split == 'test':
256
- with open(similarity_filename, 'r') as fp:
257
- for record in csv.DictReader(fp,delimiter = '\t'):
258
- sent_id = record['id'].lower().strip()
259
- similarity_data.append((preprocess_string(record['sentence1']),
260
- preprocess_string(record['sentence2'])
261
- ,sent_id))
262
- else:
263
- with open(similarity_filename, 'r') as fp:
264
- for record in csv.DictReader(fp,delimiter = '\t'):
265
- sent_id = record['id'].lower().strip()
266
- similarity_data.append((preprocess_string(record['sentence1']),
267
- preprocess_string(record['sentence2']),
268
- float(record['similarity']),sent_id))
269
-
270
- print(f"Loaded {len(similarity_data)} {split} examples from {similarity_filename}")
271
-
272
- return sentiment_data, num_labels, paraphrase_data, similarity_data
justfile ADDED
@@ -0,0 +1 @@
1
+ python classifier.py --num-cpu-cores 8 --batch_size_sst 64 --batch_size_cfimdb 8
multitask_classifier.py DELETED
@@ -1,340 +0,0 @@
1
- '''
2
- Multitask BERT class, starter training code, evaluation, and test code.
3
-
4
- Of note are:
5
- * class MultitaskBERT: Your implementation of multitask BERT.
6
- * function train_multitask: Training procedure for MultitaskBERT. Starter code
7
- copies training procedure from `classifier.py` (single-task SST).
8
- * function test_multitask: Test procedure for MultitaskBERT. This function generates
9
- the required files for submission.
10
-
11
- Running `python multitask_classifier.py` trains and tests your MultitaskBERT and
12
- writes all required submission files.
13
- '''
14
-
15
- import random, numpy as np, argparse
16
- from types import SimpleNamespace
17
-
18
- import torch
19
- from torch import nn
20
- import torch.nn.functional as F
21
- from torch.utils.data import DataLoader
22
-
23
- from bert import BertModel
24
- from optimizer import AdamW
25
- from tqdm import tqdm
26
-
27
- from datasets import (
28
- SentenceClassificationDataset,
29
- SentenceClassificationTestDataset,
30
- SentencePairDataset,
31
- SentencePairTestDataset,
32
- load_multitask_data
33
- )
34
-
35
- from evaluation import model_eval_sst, model_eval_multitask, model_eval_test_multitask
36
-
37
-
38
- TQDM_DISABLE=False
39
-
40
-
41
- # Fix the random seed.
42
- def seed_everything(seed=11711):
43
- random.seed(seed)
44
- np.random.seed(seed)
45
- torch.manual_seed(seed)
46
- torch.cuda.manual_seed(seed)
47
- torch.cuda.manual_seed_all(seed)
48
- torch.backends.cudnn.benchmark = False
49
- torch.backends.cudnn.deterministic = True
50
-
51
-
52
- BERT_HIDDEN_SIZE = 768
53
- N_SENTIMENT_CLASSES = 5
54
-
55
-
56
- class MultitaskBERT(nn.Module):
57
- '''
58
- This module should use BERT for 3 tasks:
59
-
60
- - Sentiment classification (predict_sentiment)
61
- - Paraphrase detection (predict_paraphrase)
62
- - Semantic Textual Similarity (predict_similarity)
63
- '''
64
- def __init__(self, config):
65
- super(MultitaskBERT, self).__init__()
66
- self.bert = BertModel.from_pretrained('bert-base-uncased')
67
- # last-linear-layer mode does not require updating BERT paramters.
68
- assert config.fine_tune_mode in ["last-linear-layer", "full-model"]
69
- for param in self.bert.parameters():
70
- if config.fine_tune_mode == 'last-linear-layer':
71
- param.requires_grad = False
72
- elif config.fine_tune_mode == 'full-model':
73
- param.requires_grad = True
74
- # You will want to add layers here to perform the downstream tasks.
75
- ### TODO
76
- raise NotImplementedError
77
-
78
-
79
- def forward(self, input_ids, attention_mask):
80
- 'Takes a batch of sentences and produces embeddings for them.'
81
- # The final BERT embedding is the hidden state of [CLS] token (the first token)
82
- # Here, you can start by just returning the embeddings straight from BERT.
83
- # When thinking of improvements, you can later try modifying this
84
- # (e.g., by adding other layers).
85
- ### TODO
86
- raise NotImplementedError
87
-
88
-
89
- def predict_sentiment(self, input_ids, attention_mask):
90
- '''Given a batch of sentences, outputs logits for classifying sentiment.
91
- There are 5 sentiment classes:
92
- (0 - negative, 1- somewhat negative, 2- neutral, 3- somewhat positive, 4- positive)
93
- Thus, your output should contain 5 logits for each sentence.
94
- '''
95
- ### TODO
96
- raise NotImplementedError
97
-
98
-
99
- def predict_paraphrase(self,
100
- input_ids_1, attention_mask_1,
101
- input_ids_2, attention_mask_2):
102
- '''Given a batch of pairs of sentences, outputs a single logit for predicting whether they are paraphrases.
103
- Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
104
- during evaluation.
105
- '''
106
- ### TODO
107
- raise NotImplementedError
108
-
109
-
110
- def predict_similarity(self,
111
- input_ids_1, attention_mask_1,
112
- input_ids_2, attention_mask_2):
113
- '''Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
114
- Note that your output should be unnormalized (a logit).
115
- '''
116
- ### TODO
117
- raise NotImplementedError
118
-
119
-
120
-
121
-
122
- def save_model(model, optimizer, args, config, filepath):
123
- save_info = {
124
- 'model': model.state_dict(),
125
- 'optim': optimizer.state_dict(),
126
- 'args': args,
127
- 'model_config': config,
128
- 'system_rng': random.getstate(),
129
- 'numpy_rng': np.random.get_state(),
130
- 'torch_rng': torch.random.get_rng_state(),
131
- }
132
-
133
- torch.save(save_info, filepath)
134
- print(f"save the model to {filepath}")
135
-
136
-
137
- def train_multitask(args):
138
- '''Train MultitaskBERT.
139
-
140
- Currently only trains on SST dataset. The way you incorporate training examples
141
- from other datasets into the training procedure is up to you. To begin, take a
142
- look at test_multitask below to see how you can use the custom torch `Dataset`s
143
- in datasets.py to load in examples from the Quora and SemEval datasets.
144
- '''
145
-     device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
-     # Create the data and its corresponding datasets and dataloader.
-     sst_train_data, num_labels,para_train_data, sts_train_data = load_multitask_data(args.sst_train,args.para_train,args.sts_train, split ='train')
-     sst_dev_data, num_labels,para_dev_data, sts_dev_data = load_multitask_data(args.sst_dev,args.para_dev,args.sts_dev, split ='train')
-
-     sst_train_data = SentenceClassificationDataset(sst_train_data, args)
-     sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
-
-     sst_train_dataloader = DataLoader(sst_train_data, shuffle=True, batch_size=args.batch_size,
-                                       collate_fn=sst_train_data.collate_fn)
-     sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
-                                     collate_fn=sst_dev_data.collate_fn)
-
-     # Init model.
-     config = {'hidden_dropout_prob': args.hidden_dropout_prob,
-               'num_labels': num_labels,
-               'hidden_size': 768,
-               'data_dir': '.',
-               'fine_tune_mode': args.fine_tune_mode}
-
-     config = SimpleNamespace(**config)
-
-     model = MultitaskBERT(config)
-     model = model.to(device)
-
-     lr = args.lr
-     optimizer = AdamW(model.parameters(), lr=lr)
-     best_dev_acc = 0
-
-     # Run for the specified number of epochs.
-     for epoch in range(args.epochs):
-         model.train()
-         train_loss = 0
-         num_batches = 0
-         for batch in tqdm(sst_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
-             b_ids, b_mask, b_labels = (batch['token_ids'],
-                                        batch['attention_mask'], batch['labels'])
-
-             b_ids = b_ids.to(device)
-             b_mask = b_mask.to(device)
-             b_labels = b_labels.to(device)
-
-             optimizer.zero_grad()
-             logits = model.predict_sentiment(b_ids, b_mask)
-             loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size
-
-             loss.backward()
-             optimizer.step()
-
-             train_loss += loss.item()
-             num_batches += 1
-
-         train_loss = train_loss / (num_batches)
-
-         train_acc, train_f1, *_ = model_eval_sst(sst_train_dataloader, model, device)
-         dev_acc, dev_f1, *_ = model_eval_sst(sst_dev_dataloader, model, device)
-
-         if dev_acc > best_dev_acc:
-             best_dev_acc = dev_acc
-             save_model(model, optimizer, args, config, args.filepath)
-
-         print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")
-
-
- def test_multitask(args):
-     '''Test and save predictions on the dev and test sets of all three tasks.'''
-     with torch.no_grad():
-         device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
-         saved = torch.load(args.filepath)
-         config = saved['model_config']
-
-         model = MultitaskBERT(config)
-         model.load_state_dict(saved['model'])
-         model = model.to(device)
-         print(f"Loaded model to test from {args.filepath}")
-
-         sst_test_data, num_labels,para_test_data, sts_test_data = \
-             load_multitask_data(args.sst_test,args.para_test, args.sts_test, split='test')
-
-         sst_dev_data, num_labels,para_dev_data, sts_dev_data = \
-             load_multitask_data(args.sst_dev,args.para_dev,args.sts_dev,split='dev')
-
-         sst_test_data = SentenceClassificationTestDataset(sst_test_data, args)
-         sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
-
-         sst_test_dataloader = DataLoader(sst_test_data, shuffle=True, batch_size=args.batch_size,
-                                          collate_fn=sst_test_data.collate_fn)
-         sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
-                                         collate_fn=sst_dev_data.collate_fn)
-
-         para_test_data = SentencePairTestDataset(para_test_data, args)
-         para_dev_data = SentencePairDataset(para_dev_data, args)
-
-         para_test_dataloader = DataLoader(para_test_data, shuffle=True, batch_size=args.batch_size,
-                                           collate_fn=para_test_data.collate_fn)
-         para_dev_dataloader = DataLoader(para_dev_data, shuffle=False, batch_size=args.batch_size,
-                                          collate_fn=para_dev_data.collate_fn)
-
-         sts_test_data = SentencePairTestDataset(sts_test_data, args)
-         sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression=True)
-
-         sts_test_dataloader = DataLoader(sts_test_data, shuffle=True, batch_size=args.batch_size,
-                                          collate_fn=sts_test_data.collate_fn)
-         sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=False, batch_size=args.batch_size,
-                                         collate_fn=sts_dev_data.collate_fn)
-
-         dev_sentiment_accuracy,dev_sst_y_pred, dev_sst_sent_ids, \
-             dev_paraphrase_accuracy, dev_para_y_pred, dev_para_sent_ids, \
-             dev_sts_corr, dev_sts_y_pred, dev_sts_sent_ids = model_eval_multitask(sst_dev_dataloader,
-                                                                                   para_dev_dataloader,
-                                                                                   sts_dev_dataloader, model, device)
-
-         test_sst_y_pred, \
-             test_sst_sent_ids, test_para_y_pred, test_para_sent_ids, test_sts_y_pred, test_sts_sent_ids = \
-                 model_eval_test_multitask(sst_test_dataloader,
-                                           para_test_dataloader,
-                                           sts_test_dataloader, model, device)
-
-         with open(args.sst_dev_out, "w+") as f:
-             print(f"dev sentiment acc :: {dev_sentiment_accuracy :.3f}")
-             f.write(f"id \t Predicted_Sentiment \n")
-             for p, s in zip(dev_sst_sent_ids, dev_sst_y_pred):
-                 f.write(f"{p} , {s} \n")
-
-         with open(args.sst_test_out, "w+") as f:
-             f.write(f"id \t Predicted_Sentiment \n")
-             for p, s in zip(test_sst_sent_ids, test_sst_y_pred):
-                 f.write(f"{p} , {s} \n")
-
-         with open(args.para_dev_out, "w+") as f:
-             print(f"dev paraphrase acc :: {dev_paraphrase_accuracy :.3f}")
-             f.write(f"id \t Predicted_Is_Paraphrase \n")
-             for p, s in zip(dev_para_sent_ids, dev_para_y_pred):
-                 f.write(f"{p} , {s} \n")
-
-         with open(args.para_test_out, "w+") as f:
-             f.write(f"id \t Predicted_Is_Paraphrase \n")
-             for p, s in zip(test_para_sent_ids, test_para_y_pred):
-                 f.write(f"{p} , {s} \n")
-
-         with open(args.sts_dev_out, "w+") as f:
-             print(f"dev sts corr :: {dev_sts_corr :.3f}")
-             f.write(f"id \t Predicted_Similiary \n")
-             for p, s in zip(dev_sts_sent_ids, dev_sts_y_pred):
-                 f.write(f"{p} , {s} \n")
-
-         with open(args.sts_test_out, "w+") as f:
-             f.write(f"id \t Predicted_Similiary \n")
-             for p, s in zip(test_sts_sent_ids, test_sts_y_pred):
-                 f.write(f"{p} , {s} \n")
-
-
- def get_args():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--sst_train", type=str, default="data/ids-sst-train.csv")
-     parser.add_argument("--sst_dev", type=str, default="data/ids-sst-dev.csv")
-     parser.add_argument("--sst_test", type=str, default="data/ids-sst-test-student.csv")
-
-     parser.add_argument("--para_train", type=str, default="data/quora-train.csv")
-     parser.add_argument("--para_dev", type=str, default="data/quora-dev.csv")
-     parser.add_argument("--para_test", type=str, default="data/quora-test-student.csv")
-
-     parser.add_argument("--sts_train", type=str, default="data/sts-train.csv")
-     parser.add_argument("--sts_dev", type=str, default="data/sts-dev.csv")
-     parser.add_argument("--sts_test", type=str, default="data/sts-test-student.csv")
-
-     parser.add_argument("--seed", type=int, default=11711)
-     parser.add_argument("--epochs", type=int, default=10)
-     parser.add_argument("--fine-tune-mode", type=str,
-                         help='last-linear-layer: the BERT parameters are frozen and the task specific head parameters are updated; full-model: BERT parameters are updated as well',
-                         choices=('last-linear-layer', 'full-model'), default="last-linear-layer")
-     parser.add_argument("--use_gpu", action='store_true')
-
-     parser.add_argument("--sst_dev_out", type=str, default="predictions/sst-dev-output.csv")
-     parser.add_argument("--sst_test_out", type=str, default="predictions/sst-test-output.csv")
-
-     parser.add_argument("--para_dev_out", type=str, default="predictions/para-dev-output.csv")
-     parser.add_argument("--para_test_out", type=str, default="predictions/para-test-output.csv")
-
-     parser.add_argument("--sts_dev_out", type=str, default="predictions/sts-dev-output.csv")
-     parser.add_argument("--sts_test_out", type=str, default="predictions/sts-test-output.csv")
-
-     parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
-     parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
-     parser.add_argument("--lr", type=float, help="learning rate", default=1e-5)
-
-     args = parser.parse_args()
-     return args
-
-
- if __name__ == "__main__":
-     args = get_args()
-     args.filepath = f'{args.fine_tune_mode}-{args.epochs}-{args.lr}-multitask.pt' # Save path.
-     seed_everything(args.seed)  # Fix the seed for reproducibility.
-     train_multitask(args)
-     test_multitask(args)
 
prompt ADDED
@@ -0,0 +1,3 @@
+ I want to fine-tune minBERT with Unsupervised SimCSE to do sentiment analysis, but I do not yet know how to go about it. As I understand it, I would fine-tune minBERT with SimCSE to get better embeddings, and then feed those embeddings into a SentimentClassifier for classification. What is the right approach, though?
+ I have considered the two options below (there may be others I have not thought of). Please take a look!
+ 1. Fine-tune minBERT with SimCSE first, then fine-tune the SentimentClassifier: use the STS-B dataset or the Twitter Sentiment Dataset to fine-tune minBERT, then evaluate the
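Whichever option is chosen, the core ingredient is the unsupervised SimCSE objective itself: encode the same batch of sentences twice with dropout active, treat the two dropout-perturbed embeddings of each sentence as a positive pair, and use the other sentences in the batch as negatives. Below is a minimal sketch of that loss, not part of this commit; encode_fn, the 0.05 temperature, and the pooling it implies are illustrative assumptions.

import torch
import torch.nn.functional as F

def unsup_simcse_loss(encode_fn, sents, temperature=0.05):
    # encode_fn maps a batch of sentences to one (N, H) embedding per sentence.
    # Dropout must be active (model.train()) so the two passes give different "views".
    z1 = encode_fn(sents)                                                # first dropout view
    z2 = encode_fn(sents)                                                # second dropout view
    sim = F.cosine_similarity(z1.unsqueeze(1), z2.unsqueeze(0), dim=-1)  # (N, N) pairwise similarities
    labels = torch.arange(sim.size(0), device=sim.device)                # positive of i sits on the diagonal
    return F.cross_entropy(sim / temperature, labels)                    # InfoNCE with in-batch negatives

The sentiment stage that follows is then ordinary supervised fine-tuning: pass the (now better) sentence embeddings through the SentimentClassifier head and train it with cross-entropy on the SST labels.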
trainings/last-layer-w-dropout.txt CHANGED
@@ -1,4 +1,4 @@
- Training Sentiment Classifier on SST...
+ Training Sentiment Classifier on SST...
  load 8544 data from data/ids-sst-train.csv
  load 1101 data from data/ids-sst-dev.csv
  Epoch 0: train loss :: 1.458, train acc :: 0.460, dev acc :: 0.442
@@ -14,7 +14,7 @@ Epoch 9: train loss :: 1.227, train acc :: 0.509, dev acc :: 0.475
  Evaluating on SST...
  load model from sst-classifier.pt
  load 1101 data from data/ids-sst-dev.csv
- DONE DEV
+ DONE DEV
  DONE Test
  dev acc :: 0.475
  Training Sentiment Classifier on cfimdb...
@@ -33,6 +33,6 @@ Epoch 9: train loss :: 0.407, train acc :: 0.895, dev acc :: 0.873
  Evaluating on cfimdb...
  load model from cfimdb-classifier.pt
  load 245 data from data/ids-cfimdb-dev.csv
- DONE DEV
- DONE Test
+ DONE DEV
+ DONE Test
  dev acc :: 0.873
unsup_simcse.py ADDED
@@ -0,0 +1,204 @@
+ import csv
+ import torch
+ import random
+ import argparse
+ import numpy as np
+
+ from tqdm import tqdm
+ from types import SimpleNamespace
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.metrics import f1_score, accuracy_score
+
+ from bert import BertModel
+ from optimizer import AdamW
+ from classifier import seed_everything, tokenizer
+ from classifier import SentimentDataset, BertSentimentClassifier
+
+
+ TQDM_DISABLE = False
+
+
+ class TwitterDataset(Dataset):
+     def __init__(self, dataset, args):
+         self.dataset = dataset
+         self.p = args
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, sents):
+         encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+         token_ids = torch.LongTensor(encoding['input_ids'])
+         attention_mask = torch.LongTensor(encoding['attention_mask'])
+
+         return token_ids, attention_mask
+
+     def collate_fn(self, sents):
+         token_ids, attention_mask = self.pad_data(sents)
+
+         batched_data = {
+             'token_ids': token_ids,
+             'attention_mask': attention_mask,
+         }
+
+         return batched_data
+
+
+ def load_data(filename, flag='train'):
+     '''
+     - for Twitter dataset: list of sentences
+     - for SST/CFIMDB dataset: list of (sent, [label], sent_id)
+     '''
+     num_labels = set()
+     data = []
+     with open(filename, 'r') as fp:
+         for record in csv.DictReader(fp, delimiter=','):
+             if flag == 'twitter':
+                 sent = record['clean_text'].lower().strip()
+                 data.append(sent)
+             elif flag == 'test':
+                 sent = record['sentence'].lower().strip()
+                 sent_id = record['id'].lower().strip()
+                 data.append((sent, sent_id))
+             else:
+                 sent = record['sentence'].lower().strip()
+                 sent_id = record['id'].lower().strip()
+                 label = int(record['sentiment'].strip())
+                 num_labels.add(label)
+                 data.append((sent, label, sent_id))
+     print(f"load {len(data)} data from {filename}")
+
+     if flag == 'train':
+         return data, len(num_labels)
+     else:
+         return data
+
+
+ def save_model(model, optimizer, args, config, filepath):
+     save_info = {
+         'model': model.state_dict(),
+         'optim': optimizer.state_dict(),
+         'args': args,
+         'model_config': config,
+         'system_rng': random.getstate(),
+         'numpy_rng': np.random.get_state(),
+         'torch_rng': torch.random.get_rng_state(),
+     }
+
+     torch.save(save_info, filepath)
+     print(f"save the model to {filepath}")
+
+
+ def train(args):
+     '''
+     Training Pipeline
+     -----------------
+     1. Load the Twitter Sentiment and SST Dataset.
+     2. Determine batch_size (64) and number of batches (?).
+     3. Initialize SentimentClassifier (including bert).
+     4. Loop through 10 epochs.
+     5. Finetune minBERT with the SimCSE loss function.
+     6. Finetune the Classifier with the cross-entropy function.
+     7. Backpropagate using the Adam Optimizer for both.
+     8. Evaluate the model on the dev dataset.
+     9. If dev_acc > best_dev_acc: save_model(...)
+     '''
+
+     twitter_data = load_data(args.train_bert, 'twitter')
+     train_data, num_labels = load_data(args.train, 'train')
+     dev_data = load_data(args.dev, 'valid')
+
+     twitter_dataset = TwitterDataset(twitter_data, args)
+     train_dataset = SentimentDataset(train_data, args)
+     dev_dataset = SentimentDataset(dev_data, args)
+
+     twitter_dataloader = DataLoader(twitter_dataset, shuffle=True, batch_size=args.batch_size_cse,
+                                     num_workers=args.num_cpu_cores, collate_fn=twitter_dataset.collate_fn)
+     train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size_classifier,
+                                   num_workers=args.num_cpu_cores, collate_fn=train_dataset.collate_fn)
+     dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size_classifier,
+                                 num_workers=args.num_cpu_cores, collate_fn=dev_dataset.collate_fn)
+
+     config = SimpleNamespace(
+         hidden_dropout_prob=args.hidden_dropout_prob,
+         num_labels=num_labels,
+         hidden_size=768,
+         data_dir='.',
+         fine_tune_mode='full-model'
+     )
+
+     device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
+     model = BertSentimentClassifier(config)
+     model = model.to(device)
+
+     optimizer_cse = AdamW(model.bert.parameters(), lr=args.lr_cse)
+     optimizer_classifier = AdamW(model.parameters(), lr=args.lr_classifier)
+     best_dev_acc = 0
+
+     for epoch in range(args.epochs):
+         model.bert.train()
+         train_loss = num_batches = 0
+         for batch in tqdm(twitter_dataloader, f'train-twitter-{epoch}', leave=False, disable=TQDM_DISABLE):
+             b_ids, b_mask = batch['token_ids'], batch['attention_mask']
+             b_ids = b_ids.to(device)
+             b_mask = b_mask.to(device)
+
+             optimizer_cse.zero_grad()
+             logits = model.bert.embed(b_ids)
+             logits = model.bert.encode(logits, b_mask)
+             # NOTE: the SimCSE loss and optimizer step are not implemented yet in this commit
+             # (see the sketch after this file listing).
+
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--seed", type=int, default=11711)
+     parser.add_argument("--num-cpu-cores", type=int, default=4)
+     parser.add_argument("--epochs", type=int, default=10)
+     parser.add_argument("--use_gpu", action='store_true')
+     parser.add_argument("--batch_size_cse", help="'unsup': 64, 'sup': 512", type=int)
+     parser.add_argument("--batch_size_classifier", help="'sst': 64, 'cfimdb': 8", type=int)
+     parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
+     parser.add_argument("--lr_cse", type=float, default=2e-5)
+     parser.add_argument("--lr_classifier", type=float, default=1e-5)
+
+     args = parser.parse_args()
+     return args
+
+
+ if __name__ == "__main__":
+     args = get_args()
+     seed_everything(args.seed)
+     torch.set_num_threads(args.num_cpu_cores)
+
+     print('Finetuning minBERT with Unsupervised SimCSE...')
+     config = SimpleNamespace(
+         filepath='contrastive-nli.pt',
+         lr_cse=args.lr_cse,
+         lr_classifier=args.lr_classifier,
+         hidden_dropout_prob=args.hidden_dropout_prob,
+         num_cpu_cores=args.num_cpu_cores,
+         use_gpu=args.use_gpu,
+         epochs=args.epochs,
+         batch_size_cse=args.batch_size_cse,
+         batch_size_classifier=args.batch_size_classifier,
+         train_bert='data/twitter-unsup.csv',
+         train='data/ids-sst-train.csv',
+         dev='data/ids-sst-dev.csv',
+         test='data/ids-sst-test-student.csv',
+         dev_out='predictions/full-model-sst-dev-out.csv',
+         test_out='predictions/full-model-sst-test-out.csv'
+     )
+
+     train(config)
+
+     # model = BertModel.from_pretrained('bert-base-uncased')
+
+     # model.eval()
+
+     # s = set()
+     # for param in model.parameters():
+     #     s.add(param.requires_grad)
+
+     # print(s)
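As noted in the listing, train() stops right after encoding the Twitter batch. A minimal sketch of how that contrastive step could continue, reusing this file's names (model.bert.embed / model.bert.encode, optimizer_cse); the masked mean pooling over tokens and the 0.05 temperature are assumptions, not something this commit specifies.

import torch
import torch.nn.functional as F

def simcse_step(model, optimizer_cse, b_ids, b_mask, temperature=0.05):
    # Two forward passes of the same batch: dropout makes the embeddings differ,
    # which is exactly the positive-pair construction of unsupervised SimCSE.
    def embed_once():
        hidden = model.bert.encode(model.bert.embed(b_ids), b_mask)  # (N, T, H) token states
        mask = b_mask.unsqueeze(-1).float()
        return (hidden * mask).sum(dim=1) / mask.sum(dim=1)          # masked mean-pool -> (N, H)

    z1, z2 = embed_once(), embed_once()
    sim = F.cosine_similarity(z1.unsqueeze(1), z2.unsqueeze(0), dim=-1) / temperature  # (N, N)
    labels = torch.arange(sim.size(0), device=sim.device)             # positive = same index
    loss = F.cross_entropy(sim, labels)

    optimizer_cse.zero_grad()
    loss.backward()
    optimizer_cse.step()
    return loss.item()

Steps 6-9 of the docstring (classifier fine-tuning with cross-entropy on train_dataloader, evaluation on the dev set, and checkpointing via save_model when dev_acc improves) would then follow the same pattern as the existing single-task classifier loop.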