GlowCheese committed
Commit 9756d99 · 1 Parent(s): 23d93ea

First model version
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.csv filter=lfs diff=lfs merge=lfs -text
.python-version ADDED
@@ -0,0 +1 @@
+ 3.8.20
LICENSE ADDED
@@ -0,0 +1,203 @@
+ Copyright 2018- The Hugging Face team. All rights reserved.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,27 @@
+ # CS 224N Default Final Project - Multitask BERT
+
+ This is the default final project for the Stanford CS 224N class. Please refer to the project handout on the course website for detailed instructions and an overview of the codebase.
+
+ This project comprises two parts. In the first part, you will implement some important components of the BERT model to better understand its architecture.
+ In the second part, you will use the embeddings produced by your BERT model on three downstream tasks: sentiment classification, paraphrase detection, and semantic similarity. You will implement extensions to improve your model's performance on the three downstream tasks.
+
+ In broad strokes, Part 1 of this project targets:
+ * bert.py: Missing code blocks.
+ * classifier.py: Missing code blocks.
+ * optimizer.py: Missing code blocks.
+
+ And Part 2 targets:
+ * multitask_classifier.py: Missing code blocks.
+ * datasets.py: Possibly useful functions/classes for extensions.
+ * evaluation.py: Possibly useful functions/classes for extensions.
+
+ ## Setup instructions
+
+ Follow `setup.sh` to properly set up a conda environment and install dependencies.
+
+ ## Acknowledgement
+
+ The BERT implementation part of the project was adapted from the "minbert" assignment developed at Carnegie Mellon University's [CS11-711 Advanced NLP](http://phontron.com/class/anlp2021/index.html),
+ created by Shuyan Zhou, Zhengbao Jiang, Ritam Dutt, Brendon Boldt, Aditya Veerubhotla, and Graham Neubig.
+
+ Parts of the code are from the [`transformers`](https://github.com/huggingface/transformers) library ([Apache License 2.0](./LICENSE)).
__pycache__/base_bert.cpython-38.pyc ADDED
Binary file (7.19 kB)
__pycache__/bert.cpython-38.pyc ADDED
Binary file (6.3 kB)
__pycache__/config.cpython-38.pyc ADDED
Binary file (6.64 kB)
__pycache__/optimizer.cpython-38.pyc ADDED
Binary file (2.37 kB)
__pycache__/tokenizer.cpython-38.pyc ADDED
Binary file (76.3 kB)
__pycache__/utils.cpython-38.pyc ADDED
Binary file (9.09 kB)
base_bert.py ADDED
@@ -0,0 +1,248 @@
+ import re
+ from torch import device, dtype
+ from config import BertConfig, PretrainedConfig
+ from utils import *
+
+
+ class BertPreTrainedModel(nn.Module):
+     config_class = BertConfig
+     base_model_prefix = "bert"
+     _keys_to_ignore_on_load_missing = [r"position_ids"]
+     _keys_to_ignore_on_load_unexpected = None
+
+     def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
+         super().__init__()
+         self.config = config
+         self.name_or_path = config.name_or_path
+
+     def init_weights(self):
+         # Initialize weights.
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         """Initialize the weights."""
+         if isinstance(module, (nn.Linear, nn.Embedding)):
+             # Slightly different from the TF version, which uses truncated_normal for initialization.
+             # cf. https://github.com/pytorch/pytorch/pull/5617
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+         if isinstance(module, nn.Linear) and module.bias is not None:
+             module.bias.data.zero_()
+
+     @property
+     def dtype(self) -> dtype:
+         return get_parameter_dtype(self)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
+         config = kwargs.pop("config", None)
+         state_dict = kwargs.pop("state_dict", None)
+         cache_dir = kwargs.pop("cache_dir", None)
+         force_download = kwargs.pop("force_download", False)
+         resume_download = kwargs.pop("resume_download", False)
+         proxies = kwargs.pop("proxies", None)
+         output_loading_info = kwargs.pop("output_loading_info", False)
+         local_files_only = kwargs.pop("local_files_only", False)
+         use_auth_token = kwargs.pop("use_auth_token", None)
+         revision = kwargs.pop("revision", None)
+         mirror = kwargs.pop("mirror", None)
+
+         # Load the config if a configuration was not provided.
+         if not isinstance(config, PretrainedConfig):
+             config_path = config if config is not None else pretrained_model_name_or_path
+             config, model_kwargs = cls.config_class.from_pretrained(
+                 config_path,
+                 *model_args,
+                 cache_dir=cache_dir,
+                 return_unused_kwargs=True,
+                 force_download=force_download,
+                 resume_download=resume_download,
+                 proxies=proxies,
+                 local_files_only=local_files_only,
+                 use_auth_token=use_auth_token,
+                 revision=revision,
+                 **kwargs,
+             )
+         else:
+             model_kwargs = kwargs
+
+         # Locate the model weights.
+         if pretrained_model_name_or_path is not None:
+             pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+             if os.path.isdir(pretrained_model_name_or_path):
+                 # Load from a PyTorch checkpoint directory.
+                 archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+             elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+                 archive_file = pretrained_model_name_or_path
+             else:
+                 archive_file = hf_bucket_url(
+                     pretrained_model_name_or_path,
+                     filename=WEIGHTS_NAME,
+                     revision=revision,
+                     mirror=mirror,
+                 )
+             try:
+                 # Load from URL, or from the cache if already cached.
+                 resolved_archive_file = cached_path(
+                     archive_file,
+                     cache_dir=cache_dir,
+                     force_download=force_download,
+                     proxies=proxies,
+                     resume_download=resume_download,
+                     local_files_only=local_files_only,
+                     use_auth_token=use_auth_token,
+                 )
+             except EnvironmentError as err:
+                 msg = (
+                     f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
+                     f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
+                     f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named {WEIGHTS_NAME}.\n\n"
+                 )
+                 raise EnvironmentError(msg) from err
+         else:
+             resolved_archive_file = None
+
+         config.name_or_path = pretrained_model_name_or_path
+
+         # Instantiate the model.
+         model = cls(config, *model_args, **model_kwargs)
+
+         if state_dict is None:
+             try:
+                 state_dict = torch.load(resolved_archive_file, map_location="cpu", weights_only=True)
+             except Exception:
+                 raise OSError(
+                     f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
+                     f"at '{resolved_archive_file}'"
+                 )
+
+         missing_keys = []
+         unexpected_keys = []
+         error_msgs = []
+
+         # Convert old-format keys in a PyTorch state_dict to this codebase's naming scheme.
+         old_keys = []
+         new_keys = []
+         m = {'embeddings.word_embeddings': 'word_embedding',
+              'embeddings.position_embeddings': 'pos_embedding',
+              'embeddings.token_type_embeddings': 'tk_type_embedding',
+              'embeddings.LayerNorm': 'embed_layer_norm',
+              'embeddings.dropout': 'embed_dropout',
+              'encoder.layer': 'bert_layers',
+              'pooler.dense': 'pooler_dense',
+              'pooler.activation': 'pooler_af',
+              'attention.self': 'self_attention',
+              'attention.output.dense': 'attention_dense',
+              'attention.output.LayerNorm': 'attention_layer_norm',
+              'attention.output.dropout': 'attention_dropout',
+              'intermediate.dense': 'interm_dense',
+              'intermediate.intermediate_act_fn': 'interm_af',
+              'output.dense': 'out_dense',
+              'output.LayerNorm': 'out_layer_norm',
+              'output.dropout': 'out_dropout'}
+
+         for key in state_dict.keys():
+             new_key = None
+             if "gamma" in key:
+                 new_key = key.replace("gamma", "weight")
+             if "beta" in key:
+                 new_key = key.replace("beta", "bias")
+             for x, y in m.items():
+                 if new_key is not None:
+                     _key = new_key
+                 else:
+                     _key = key
+                 if x in key:
+                     new_key = _key.replace(x, y)
+             if new_key:
+                 old_keys.append(key)
+                 new_keys.append(new_key)
+
+         for old_key, new_key in zip(old_keys, new_keys):
+             state_dict[new_key] = state_dict.pop(old_key)
+
+         # Copy state_dict so _load_from_state_dict can modify it.
+         metadata = getattr(state_dict, "_metadata", None)
+         state_dict = state_dict.copy()
+         if metadata is not None:
+             state_dict._metadata = metadata
+
+         your_bert_params = [f"bert.{x[0]}" for x in model.named_parameters()]
+         for k in state_dict:
+             if k not in your_bert_params and not k.startswith("cls."):
+                 possible_rename = [x for x in k.split(".")[1:-1] if x in m.values()]
+                 raise ValueError(f"{k} cannot be reloaded into your model; one/some of {possible_rename} we provided have been renamed")
+
+         # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants,
+         # so we need to apply the function recursively.
+         def load(module: nn.Module, prefix=""):
+             local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+             module._load_from_state_dict(
+                 state_dict,
+                 prefix,
+                 local_metadata,
+                 True,
+                 missing_keys,
+                 unexpected_keys,
+                 error_msgs,
+             )
+             for name, child in module._modules.items():
+                 if child is not None:
+                     load(child, prefix + name + ".")
+
+         # Make sure we are able to load base models as well as derived models (with heads).
+         start_prefix = ""
+         model_to_load = model
+         has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
+         if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
+             start_prefix = cls.base_model_prefix + "."
+         if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
+             model_to_load = getattr(model, cls.base_model_prefix)
+         load(model_to_load, prefix=start_prefix)
+
+         if model.__class__.__name__ != model_to_load.__class__.__name__:
+             base_model_state_dict = model_to_load.state_dict().keys()
+             head_model_state_dict_without_base_prefix = [
+                 key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
+             ]
+             missing_keys.extend(set(head_model_state_dict_without_base_prefix) - set(base_model_state_dict))
+
+         # Some models may have keys that are not in the state dict by design; remove them
+         # before needlessly warning the user.
+         if cls._keys_to_ignore_on_load_missing is not None:
+             for pat in cls._keys_to_ignore_on_load_missing:
+                 missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
+
+         if cls._keys_to_ignore_on_load_unexpected is not None:
+             for pat in cls._keys_to_ignore_on_load_unexpected:
+                 unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+
+         if len(error_msgs) > 0:
+             raise RuntimeError(
+                 "Error(s) in loading state_dict for {}:\n\t{}".format(
+                     model.__class__.__name__, "\n\t".join(error_msgs)
+                 )
+             )
+
+         # Set the model in evaluation mode to deactivate dropout modules by default.
+         model.eval()
+
+         if output_loading_info:
+             loading_info = {
+                 "missing_keys": missing_keys,
+                 "unexpected_keys": unexpected_keys,
+                 "error_msgs": error_msgs,
+             }
+             return model, loading_info
+
+         if hasattr(config, "xla_device") and config.xla_device and is_torch_tpu_available():
+             import torch_xla.core.xla_model as xm
+
+             model = xm.send_cpu_data_to_device(model, xm.xla_device())
+             model.to(xm.xla_device())
+
+         return model
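The subtle part of `from_pretrained` above is the key-remapping loop: Hugging Face checkpoints use names such as `embeddings.word_embeddings` and the legacy `gamma`/`beta` LayerNorm names, which must be rewritten to this codebase's module names before `_load_from_state_dict` runs. Below is a minimal, self-contained sketch of that rewrite on a toy state dict; the sample keys are hypothetical stand-ins and the mapping `m` is abbreviated from the full table above.

```python
# Toy illustration of the key-remapping loop in BertPreTrainedModel.from_pretrained.
# The keys are made-up stand-ins; the values would normally be tensors.
m = {'embeddings.word_embeddings': 'word_embedding',
     'encoder.layer': 'bert_layers',
     'attention.self': 'self_attention'}

state_dict = {
    'bert.embeddings.word_embeddings.weight': 0,
    'bert.encoder.layer.0.attention.self.query.gamma': 1,
}

old_keys, new_keys = [], []
for key in state_dict.keys():
    new_key = None
    if 'gamma' in key:  # legacy LayerNorm naming: gamma -> weight, beta -> bias
        new_key = key.replace('gamma', 'weight')
    if 'beta' in key:
        new_key = key.replace('beta', 'bias')
    for x, y in m.items():
        _key = new_key if new_key is not None else key
        if x in key:
            new_key = _key.replace(x, y)
    if new_key:
        old_keys.append(key)
        new_keys.append(new_key)

for old_key, new_key in zip(old_keys, new_keys):
    state_dict[new_key] = state_dict.pop(old_key)

print(sorted(state_dict))
# ['bert.bert_layers.0.self_attention.query.weight', 'bert.word_embedding.weight']
```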
bert.py ADDED
@@ -0,0 +1,225 @@
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from base_bert import BertPreTrainedModel
+ from utils import *
+
+
+ class BertSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+
+         self.num_attention_heads = config.num_attention_heads
+         self.attention_head_size = config.hidden_size // config.num_attention_heads
+         self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+         # Initialize the linear transformation layers for key, value, query.
+         self.query = nn.Linear(config.hidden_size, self.all_head_size)
+         self.key = nn.Linear(config.hidden_size, self.all_head_size)
+         self.value = nn.Linear(config.hidden_size, self.all_head_size)
+         # This dropout is applied to normalized attention scores following the original
+         # implementation of the transformer. Although it is a bit unusual, we empirically
+         # observe that it yields better performance.
+         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+     def transform(self, x, linear_layer):
+         # The corresponding linear_layer of k, v, q is used to project the hidden_state (x).
+         bs, seq_len = x.shape[:2]
+         proj = linear_layer(x)
+         # Next, we need to produce multiple heads for the proj. This is done by splitting the
+         # hidden state into self.num_attention_heads heads, each of size self.attention_head_size.
+         proj = proj.view(bs, seq_len, self.num_attention_heads, self.attention_head_size)
+         # By a proper transpose, we get proj of size [bs, num_attention_heads, seq_len, attention_head_size].
+         proj = proj.transpose(1, 2)
+         return proj
+
+     def attention(self, key, query, value, attention_mask):
+         """
+         key, query, value: [batch_size, num_attention_heads, seq_len, attention_head_size]
+         attention_mask: [batch_size, 1, 1, seq_len], masks padding tokens in the input.
+         """
+         d_k = query.size(-1)  # attention_head_size
+         attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)
+         # attention_scores shape: [batch_size, num_attention_heads, seq_len, seq_len]
+
+         # Apply the additive attention mask: 0 for real tokens, a large negative value for
+         # padding tokens, so softmax assigns padding positions near-zero probability.
+         attention_scores = attention_scores + attention_mask
+
+         # Normalize the scores with softmax and apply dropout.
+         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+         attention_probs = self.dropout(attention_probs)
+
+         context = torch.matmul(attention_probs, value)
+         # context shape: [batch_size, num_attention_heads, seq_len, attention_head_size]
+
+         # Concatenate all attention heads to recover the original shape: [batch_size, seq_len, hidden_size].
+         context = context.transpose(1, 2).contiguous()
+         context = context.view(context.size(0), context.size(1), -1)
+
+         return context
+
+     def forward(self, hidden_states, attention_mask):
+         """
+         hidden_states: [bs, seq_len, hidden_state]
+         attention_mask: [bs, 1, 1, seq_len]
+         output: [bs, seq_len, hidden_state]
+         """
+         # First, generate the key, value, and query for each token for multi-head attention
+         # using self.transform (more details inside the function).
+         # The size of each *_layer is [bs, num_attention_heads, seq_len, attention_head_size].
+         key_layer = self.transform(hidden_states, self.key)
+         value_layer = self.transform(hidden_states, self.value)
+         query_layer = self.transform(hidden_states, self.query)
+         # Calculate the multi-head attention.
+         attn_value = self.attention(key_layer, query_layer, value_layer, attention_mask)
+         return attn_value
+
+
+ class BertLayer(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         # Multi-head attention.
+         self.self_attention = BertSelfAttention(config)
+         # Add-norm for multi-head attention.
+         self.attention_dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.attention_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.attention_dropout = nn.Dropout(config.hidden_dropout_prob)
+         # Feed forward.
+         self.interm_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+         self.interm_af = F.gelu
+         # Add-norm for feed forward.
+         self.out_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+         self.out_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.out_dropout = nn.Dropout(config.hidden_dropout_prob)
+
+     def add_norm(self, input, output, dense_layer, dropout, ln_layer):
+         transformed_output = dense_layer(output)          # Project the sublayer output with dense_layer.
+         transformed_output = dropout(transformed_output)  # Apply dropout.
+         added_output = input + transformed_output         # Residual connection: add the input back.
+         normalized_output = ln_layer(added_output)        # Apply layer normalization.
+         return normalized_output
+
+     def forward(self, hidden_states, attention_mask):
+         # 1. Multi-head attention.
+         attention_output = self.self_attention(hidden_states, attention_mask)
+
+         # 2. Add-norm after attention.
+         attention_output = self.add_norm(
+             hidden_states,
+             attention_output,
+             self.attention_dense,
+             self.attention_dropout,
+             self.attention_layer_norm
+         )
+
+         # 3. Feed-forward network.
+         intermediate_output = self.interm_af(self.interm_dense(attention_output))
+
+         # 4. Add-norm after feed-forward.
+         layer_output = self.add_norm(
+             attention_output,
+             intermediate_output,
+             self.out_dense,
+             self.out_dropout,
+             self.out_layer_norm
+         )
+
+         return layer_output
+
+
+ class BertModel(BertPreTrainedModel):
+     """
+     The BERT model returns the final embeddings for each token in a sentence.
+
+     The model consists of:
+     1. Embedding layers (used in self.embed).
+     2. A stack of n BERT layers (used in self.encode).
+     3. A linear transformation layer for the [CLS] token (used in self.forward, as given).
+     """
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+
+         # Embedding layers.
+         self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+         self.pos_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+         self.tk_type_embedding = nn.Embedding(config.type_vocab_size, config.hidden_size)
+         self.embed_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+         self.embed_dropout = nn.Dropout(config.hidden_dropout_prob)
+         # Register position_ids (1, max position embedding length) as a buffer because it is a constant.
+         position_ids = torch.arange(config.max_position_embeddings).unsqueeze(0)
+         self.register_buffer('position_ids', position_ids)
+
+         # BERT encoder.
+         self.bert_layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+
+         # [CLS] token transformations.
+         self.pooler_dense = nn.Linear(config.hidden_size, config.hidden_size)
+         self.pooler_af = nn.Tanh()
+
+         self.init_weights()
+
+     def embed(self, input_ids):
+         input_shape = input_ids.size()
+         seq_length = input_shape[1]
+
+         inputs_embeds = self.word_embedding(input_ids)
+
+         pos_ids = self.position_ids[:, :seq_length]
+         pos_embeds = self.pos_embedding(pos_ids)
+
+         # Since we are not considering token types, this embedding is just a placeholder.
+         tk_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)
+         tk_type_embeds = self.tk_type_embedding(tk_type_ids)
+
+         embeddings = inputs_embeds + pos_embeds + tk_type_embeds
+         embeddings = self.embed_layer_norm(embeddings)
+         embeddings = self.embed_dropout(embeddings)
+
+         return embeddings
+
+     def encode(self, hidden_states, attention_mask):
+         """
+         hidden_states: the output from the embedding layer [batch_size, seq_len, hidden_size]
+         attention_mask: [batch_size, seq_len]
+         """
+         # Get the extended attention mask for self-attention,
+         # of size [batch_size, 1, 1, seq_len].
+         # It distinguishes non-padding tokens (with a value of 0) from padding tokens
+         # (with a value of a large negative number).
+         extended_attention_mask: torch.Tensor = get_extended_attention_mask(attention_mask, self.dtype)
+
+         # Pass the hidden states through the encoder layers.
+         for i, layer_module in enumerate(self.bert_layers):
+             # Feed the encoding from the last bert_layer to the next.
+             hidden_states = layer_module(hidden_states, extended_attention_mask)
+
+         return hidden_states
+
+     def forward(self, input_ids, attention_mask):
+         """
+         input_ids: [batch_size, seq_len], where seq_len is the max length of the batch
+         attention_mask: same size as input_ids; 1 represents non-padding tokens, 0 represents padding tokens
+         """
+         # Get the embedding for each input token.
+         embedding_output = self.embed(input_ids=input_ids)
+
+         # Feed to a transformer (a stack of BertLayers).
+         sequence_output = self.encode(embedding_output, attention_mask=attention_mask)
+
+         # Get the [CLS] token hidden state.
+         first_tk = sequence_output[:, 0]
+         first_tk = self.pooler_dense(first_tk)
+         first_tk = self.pooler_af(first_tk)
+
+         return {'last_hidden_state': sequence_output, 'pooler_output': first_tk}
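For orientation, here is a minimal usage sketch of the finished model; it is not part of this commit and assumes the `bert-base-uncased` checkpoint is reachable and the repo's `tokenizer.py` is on the path.

```python
# Usage sketch for bert.py (illustrative, not part of the commit).
import torch
from bert import BertModel
from tokenizer import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # from_pretrained already calls eval(); made explicit here

encoding = tokenizer(['a great movie', 'it was fine'],
                     return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    out = model(encoding['input_ids'], encoding['attention_mask'])

print(out['last_hidden_state'].shape)  # [2, seq_len, 768]: one vector per token
print(out['pooler_output'].shape)      # [2, 768]: transformed [CLS] state
```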
cfimdb-classifier.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1c66df3c0ce0e4326519041f49707f102df5f680de5ded1b5125ba689a9d141
+ size 438045778
classifier.py ADDED
@@ -0,0 +1,406 @@
+ import random, numpy as np, argparse
+ from types import SimpleNamespace
+ import csv
+
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader
+ from sklearn.metrics import f1_score, accuracy_score
+
+ from tokenizer import BertTokenizer
+ from bert import BertModel
+ from optimizer import AdamW
+ from tqdm import tqdm
+
+
+ TQDM_DISABLE = False
+
+
+ # Fix the random seed.
+ def seed_everything(seed=11711):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.benchmark = False
+     torch.backends.cudnn.deterministic = True
+
+
+ class BertSentimentClassifier(torch.nn.Module):
+     '''
+     This module performs sentiment classification using BERT embeddings on the SST dataset.
+
+     In the SST dataset, there are 5 sentiment categories (from 0 - "negative" to 4 - "positive").
+     Thus, your forward() should return one logit for each of the 5 classes.
+     '''
+     def __init__(self, config):
+         super(BertSentimentClassifier, self).__init__()
+         self.num_labels = config.num_labels
+         self.bert: BertModel = BertModel.from_pretrained('bert-base-uncased')
+
+         # Pretrain mode does not require updating BERT parameters.
+         assert config.fine_tune_mode in ["last-linear-layer", "full-model"]
+         for param in self.bert.parameters():
+             if config.fine_tune_mode == 'last-linear-layer':
+                 param.requires_grad = False
+             elif config.fine_tune_mode == 'full-model':
+                 param.requires_grad = True
+
+         # Create any instance variables you need to classify the sentiment of BERT embeddings.
+         self.classifier = torch.nn.Linear(config.hidden_size, self.num_labels)
+
+     def forward(self, input_ids, attention_mask):
+         '''Takes a batch of sentences and returns logits for the sentiment classes.'''
+         # The final BERT contextualized embedding is the hidden state of the [CLS] token (the first token).
+         # HINT: You should consider what is an appropriate return value given that
+         # the training loop currently uses F.cross_entropy as the loss function.
+
+         # Get the embedding for each input token.
+         embedding_output = self.bert.embed(input_ids=input_ids)
+
+         # Feed to a transformer (BERT layers).
+         sequence_output = self.bert.encode(embedding_output, attention_mask=attention_mask)
+
+         # The final BERT contextualized embedding is the hidden state of the [CLS] token.
+         cls_token_output = sequence_output[:, 0, :]  # The first token is [CLS].
+
+         # Pass the [CLS] token representation through the classifier.
+         logits = self.classifier(cls_token_output)
+
+         return logits
+
+
+ class SentimentDataset(Dataset):
+     def __init__(self, dataset, args):
+         self.dataset = dataset
+         self.p = args
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, data):
+         sents = [x[0] for x in data]
+         labels = [x[1] for x in data]
+         sent_ids = [x[2] for x in data]
+
+         encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+         token_ids = torch.LongTensor(encoding['input_ids'])
+         attention_mask = torch.LongTensor(encoding['attention_mask'])
+         labels = torch.LongTensor(labels)
+
+         return token_ids, attention_mask, labels, sents, sent_ids
+
+     def collate_fn(self, all_data):
+         token_ids, attention_mask, labels, sents, sent_ids = self.pad_data(all_data)
+
+         batched_data = {
+             'token_ids': token_ids,
+             'attention_mask': attention_mask,
+             'labels': labels,
+             'sents': sents,
+             'sent_ids': sent_ids
+         }
+
+         return batched_data
+
+
+ class SentimentTestDataset(Dataset):
+     def __init__(self, dataset, args):
+         self.dataset = dataset
+         self.p = args
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, data):
+         sents = [x[0] for x in data]
+         sent_ids = [x[1] for x in data]
+
+         encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+         token_ids = torch.LongTensor(encoding['input_ids'])
+         attention_mask = torch.LongTensor(encoding['attention_mask'])
+
+         return token_ids, attention_mask, sents, sent_ids
+
+     def collate_fn(self, all_data):
+         token_ids, attention_mask, sents, sent_ids = self.pad_data(all_data)
+
+         batched_data = {
+             'token_ids': token_ids,
+             'attention_mask': attention_mask,
+             'sents': sents,
+             'sent_ids': sent_ids
+         }
+
+         return batched_data
+
+
+ # Load the data: a list of (sentence, label).
+ def load_data(filename, flag='train'):
+     num_labels = {}
+     data = []
+     if flag == 'test':
+         with open(filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent = record['sentence'].lower().strip()
+                 sent_id = record['id'].lower().strip()
+                 data.append((sent, sent_id))
+     else:
+         with open(filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent = record['sentence'].lower().strip()
+                 sent_id = record['id'].lower().strip()
+                 label = int(record['sentiment'].strip())
+                 if label not in num_labels:
+                     num_labels[label] = len(num_labels)
+                 data.append((sent, label, sent_id))
+     print(f"Loaded {len(data)} examples from {filename}")
+
+     if flag == 'train':
+         return data, len(num_labels)
+     else:
+         return data
+
+
+ # Evaluate the model on dev examples.
+ def model_eval(dataloader, model, device):
+     model.eval()  # Switch to eval mode, which turns off randomness like dropout.
+     y_true = []
+     y_pred = []
+     sents = []
+     sent_ids = []
+     for step, batch in enumerate(tqdm(dataloader, desc='eval', disable=TQDM_DISABLE)):
+         b_ids, b_mask, b_labels, b_sents, b_sent_ids = batch['token_ids'], batch['attention_mask'], \
+                                                        batch['labels'], batch['sents'], batch['sent_ids']
+
+         b_ids = b_ids.to(device)
+         b_mask = b_mask.to(device)
+
+         logits = model(b_ids, b_mask)
+         logits = logits.detach().cpu().numpy()
+         preds = np.argmax(logits, axis=1).flatten()
+
+         b_labels = b_labels.flatten()
+         y_true.extend(b_labels)
+         y_pred.extend(preds)
+         sents.extend(b_sents)
+         sent_ids.extend(b_sent_ids)
+
+     f1 = f1_score(y_true, y_pred, average='macro')
+     acc = accuracy_score(y_true, y_pred)
+
+     return acc, f1, y_pred, y_true, sents, sent_ids
+
+
+ # Evaluate the model on test examples.
+ def model_test_eval(dataloader, model, device):
+     model.eval()  # Switch to eval mode, which turns off randomness like dropout.
+     y_pred = []
+     sents = []
+     sent_ids = []
+     for step, batch in enumerate(tqdm(dataloader, desc='eval', disable=TQDM_DISABLE)):
+         b_ids, b_mask, b_sents, b_sent_ids = batch['token_ids'], batch['attention_mask'], \
+                                              batch['sents'], batch['sent_ids']
+
+         b_ids = b_ids.to(device)
+         b_mask = b_mask.to(device)
+
+         logits = model(b_ids, b_mask)
+         logits = logits.detach().cpu().numpy()
+         preds = np.argmax(logits, axis=1).flatten()
+
+         y_pred.extend(preds)
+         sents.extend(b_sents)
+         sent_ids.extend(b_sent_ids)
+
+     return y_pred, sents, sent_ids
+
+
+ def save_model(model, optimizer, args, config, filepath):
+     save_info = {
+         'model': model.state_dict(),
+         'optim': optimizer.state_dict(),
+         'args': args,
+         'model_config': config,
+         'system_rng': random.getstate(),
+         'numpy_rng': np.random.get_state(),
+         'torch_rng': torch.random.get_rng_state(),
+     }
+
+     torch.save(save_info, filepath)
+     print(f"Saved the model to {filepath}")
+
+
+ def train(args):
+     device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
+     # Create the data and its corresponding datasets and dataloaders.
+     train_data, num_labels = load_data(args.train, 'train')
+     dev_data = load_data(args.dev, 'valid')
+
+     train_dataset = SentimentDataset(train_data, args)
+     dev_dataset = SentimentDataset(dev_data, args)
+
+     train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
+                                   collate_fn=train_dataset.collate_fn)
+     dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
+                                 collate_fn=dev_dataset.collate_fn)
+
+     # Init model.
+     config = {'hidden_dropout_prob': args.hidden_dropout_prob,
+               'num_labels': num_labels,
+               'hidden_size': 768,
+               'data_dir': '.',
+               'fine_tune_mode': args.fine_tune_mode}
+
+     config = SimpleNamespace(**config)
+
+     model = BertSentimentClassifier(config)
+     model = model.to(device)
+
+     lr = args.lr
+     optimizer = AdamW(model.parameters(), lr=lr)
+     best_dev_acc = 0
+
+     # Run for the specified number of epochs.
+     for epoch in range(args.epochs):
+         model.train()
+         train_loss = 0
+         num_batches = 0
+         for batch in tqdm(train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
+             b_ids, b_mask, b_labels = (batch['token_ids'],
+                                        batch['attention_mask'], batch['labels'])
+
+             b_ids = b_ids.to(device)
+             b_mask = b_mask.to(device)
+             b_labels = b_labels.to(device)
+
+             optimizer.zero_grad()
+             logits = model(b_ids, b_mask)
+             loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size
+
+             loss.backward()
+             optimizer.step()
+
+             train_loss += loss.item()
+             num_batches += 1
+
+         train_loss = train_loss / num_batches
+
+         train_acc, train_f1, *_ = model_eval(train_dataloader, model, device)
+         dev_acc, dev_f1, *_ = model_eval(dev_dataloader, model, device)
+
+         if dev_acc > best_dev_acc:
+             best_dev_acc = dev_acc
+             save_model(model, optimizer, args, config, args.filepath)
+
+         print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")
+
+
+ def test(args):
+     with torch.no_grad():
+         device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
+         saved = torch.load(args.filepath)
+         config = saved['model_config']
+         model = BertSentimentClassifier(config)
+         model.load_state_dict(saved['model'])
+         model = model.to(device)
+         print(f"Loaded model from {args.filepath}")
+
+         dev_data = load_data(args.dev, 'valid')
+         dev_dataset = SentimentDataset(dev_data, args)
+         dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size, collate_fn=dev_dataset.collate_fn)
+
+         test_data = load_data(args.test, 'test')
+         test_dataset = SentimentTestDataset(test_data, args)
+         test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=args.batch_size, collate_fn=test_dataset.collate_fn)
+
+         dev_acc, dev_f1, dev_pred, dev_true, dev_sents, dev_sent_ids = model_eval(dev_dataloader, model, device)
+         print('DONE DEV')
+         test_pred, test_sents, test_sent_ids = model_test_eval(test_dataloader, model, device)
+         print('DONE Test')
+         with open(args.dev_out, "w+") as f:
+             print(f"dev acc :: {dev_acc :.3f}")
+             f.write("id \t Predicted_Sentiment \n")
+             for p, s in zip(dev_sent_ids, dev_pred):
+                 f.write(f"{p} , {s} \n")
+
+         with open(args.test_out, "w+") as f:
+             f.write("id \t Predicted_Sentiment \n")
+             for p, s in zip(test_sent_ids, test_pred):
+                 f.write(f"{p} , {s} \n")
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--seed", type=int, default=11711)
+     parser.add_argument("--epochs", type=int, default=10)
+     parser.add_argument("--fine-tune-mode", type=str,
+                         help='last-linear-layer: the BERT parameters are frozen and the task-specific head parameters are updated; full-model: BERT parameters are updated as well',
+                         choices=('last-linear-layer', 'full-model'), default="last-linear-layer")
+     parser.add_argument("--use_gpu", action='store_true')
+
+     parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
+     parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
+     parser.add_argument("--lr", type=float, help="learning rate, default lr for 'pretrain': 1e-3, 'finetune': 1e-5",
+                         default=1e-3)
+
+     args = parser.parse_args()
+     return args
+
+
+ if __name__ == "__main__":
+     args = get_args()
+     seed_everything(args.seed)
+
+     print('Training Sentiment Classifier on SST...')
+     config = SimpleNamespace(
+         filepath='sst-classifier.pt',
+         lr=args.lr,
+         use_gpu=args.use_gpu,
+         epochs=args.epochs,
+         batch_size=args.batch_size,
+         hidden_dropout_prob=args.hidden_dropout_prob,
+         train='data/ids-sst-train.csv',
+         dev='data/ids-sst-dev.csv',
+         test='data/ids-sst-test-student.csv',
+         fine_tune_mode=args.fine_tune_mode,
+         dev_out='predictions/' + args.fine_tune_mode + '-sst-dev-out.csv',
+         test_out='predictions/' + args.fine_tune_mode + '-sst-test-out.csv'
+     )
+
+     train(config)
+
+     print('Evaluating on SST...')
+     test(config)
+
+     print('Training Sentiment Classifier on cfimdb...')
+     config = SimpleNamespace(
+         filepath='cfimdb-classifier.pt',
+         lr=args.lr,
+         use_gpu=args.use_gpu,
+         epochs=args.epochs,
+         batch_size=8,
+         hidden_dropout_prob=args.hidden_dropout_prob,
+         train='data/ids-cfimdb-train.csv',
+         dev='data/ids-cfimdb-dev.csv',
+         test='data/ids-cfimdb-test-student.csv',
+         fine_tune_mode=args.fine_tune_mode,
+         dev_out='predictions/' + args.fine_tune_mode + '-cfimdb-dev-out.csv',
+         test_out='predictions/' + args.fine_tune_mode + '-cfimdb-test-out.csv'
+     )
+
+     train(config)
+
+     print('Evaluating on cfimdb...')
+     test(config)
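The classifier can also be driven directly for a quick sanity check. The sketch below mirrors the config fields that `train()` constructs, uses a made-up input sentence, and is not part of this commit; with an untrained classification head, the predicted class is of course arbitrary.

```python
# Hypothetical inference sketch for BertSentimentClassifier.
import torch
from types import SimpleNamespace
from classifier import BertSentimentClassifier
from tokenizer import BertTokenizer

config = SimpleNamespace(hidden_size=768, num_labels=5, data_dir='.',
                         hidden_dropout_prob=0.3,
                         fine_tune_mode='last-linear-layer')
model = BertSentimentClassifier(config)
model.eval()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer(['a touching and funny film'],
                return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    logits = model(enc['input_ids'], enc['attention_mask'])
print(logits.argmax(dim=1).item())  # predicted SST class in 0..4
```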
config.py ADDED
@@ -0,0 +1,222 @@
1
+ from typing import Union, Tuple, Dict, Any, Optional
2
+ import os
3
+ import json
4
+ from collections import OrderedDict
5
+ import torch
6
+ from utils import CONFIG_NAME, hf_bucket_url, cached_path, is_remote_url
7
+
8
+ class PretrainedConfig(object):
9
+ model_type: str = ""
10
+ is_composition: bool = False
11
+
12
+ def __init__(self, **kwargs):
13
+ # Attributes with defaults
14
+ self.return_dict = kwargs.pop("return_dict", True)
15
+ self.output_hidden_states = kwargs.pop("output_hidden_states", False)
16
+ self.output_attentions = kwargs.pop("output_attentions", False)
17
+ self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models
18
+ self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
19
+ self.pruned_heads = kwargs.pop("pruned_heads", {})
20
+ self.tie_word_embeddings = kwargs.pop(
21
+ "tie_word_embeddings", True
22
+ ) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models.
23
+
24
+ # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
25
+ self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
26
+ self.is_decoder = kwargs.pop("is_decoder", False)
27
+ self.add_cross_attention = kwargs.pop("add_cross_attention", False)
28
+ self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False)
29
+
30
+ # Parameters for sequence generation
31
+ self.max_length = kwargs.pop("max_length", 20)
32
+ self.min_length = kwargs.pop("min_length", 0)
33
+ self.do_sample = kwargs.pop("do_sample", False)
34
+ self.early_stopping = kwargs.pop("early_stopping", False)
35
+ self.num_beams = kwargs.pop("num_beams", 1)
36
+ self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
37
+ self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
38
+ self.temperature = kwargs.pop("temperature", 1.0)
39
+ self.top_k = kwargs.pop("top_k", 50)
40
+ self.top_p = kwargs.pop("top_p", 1.0)
41
+ self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
42
+ self.length_penalty = kwargs.pop("length_penalty", 1.0)
43
+ self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
44
+ self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
45
+ self.bad_words_ids = kwargs.pop("bad_words_ids", None)
46
+ self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
47
+ self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0)
48
+ self.output_scores = kwargs.pop("output_scores", False)
49
+ self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
50
+         self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
+         self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
+
+         # Fine-tuning task arguments
+         self.architectures = kwargs.pop("architectures", None)
+         self.finetuning_task = kwargs.pop("finetuning_task", None)
+         self.id2label = kwargs.pop("id2label", None)
+         self.label2id = kwargs.pop("label2id", None)
+         if self.id2label is not None:
+             kwargs.pop("num_labels", None)
+             # Keys are always strings in JSON, so convert ids to int here.
+             self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+         else:
+             self.num_labels = kwargs.pop("num_labels", 2)
+
+         # Tokenizer arguments
+         self.tokenizer_class = kwargs.pop("tokenizer_class", None)
+         self.prefix = kwargs.pop("prefix", None)
+         self.bos_token_id = kwargs.pop("bos_token_id", None)
+         self.pad_token_id = kwargs.pop("pad_token_id", None)
+         self.eos_token_id = kwargs.pop("eos_token_id", None)
+         self.sep_token_id = kwargs.pop("sep_token_id", None)
+
+         self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
+
+         # Task-specific arguments
+         self.task_specific_params = kwargs.pop("task_specific_params", None)
+
+         # TPU arguments
+         self.xla_device = kwargs.pop("xla_device", None)
+
+         # Name or path of the pretrained checkpoint
+         self._name_or_path = str(kwargs.pop("name_or_path", ""))
+
+         # Drop the transformers version info
+         kwargs.pop("transformers_version", None)
+
+         # Additional attributes without default values
+         for key, value in kwargs.items():
+             try:
+                 setattr(self, key, value)
+             except AttributeError as err:
+                 raise err
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+         return cls.from_dict(config_dict, **kwargs)
+
+     @classmethod
+     def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
+         with open(json_file, "r", encoding="utf-8") as reader:
+             text = reader.read()
+         return json.loads(text)
+
+     @classmethod
+     def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
+         return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+
+         config = cls(**config_dict)
+
+         if hasattr(config, "pruned_heads"):
+             config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
+
+         # Update config with kwargs if needed
+         to_remove = []
+         for key, value in kwargs.items():
+             if hasattr(config, key):
+                 setattr(config, key, value)
+                 to_remove.append(key)
+         for key in to_remove:
+             kwargs.pop(key, None)
+
+         if return_unused_kwargs:
+             return config, kwargs
+         else:
+             return config
+
+     @classmethod
+     def get_config_dict(
+         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+     ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+         cache_dir = kwargs.pop("cache_dir", None)
+         force_download = kwargs.pop("force_download", False)
+         resume_download = kwargs.pop("resume_download", False)
+         proxies = kwargs.pop("proxies", None)
+         use_auth_token = kwargs.pop("use_auth_token", None)
+         local_files_only = kwargs.pop("local_files_only", False)
+         revision = kwargs.pop("revision", None)
+
+         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+         if os.path.isdir(pretrained_model_name_or_path):
+             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+             config_file = pretrained_model_name_or_path
+         else:
+             config_file = hf_bucket_url(
+                 pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None
+             )
+
+         try:
+             # Load from URL or cache if already cached
+             resolved_config_file = cached_path(
+                 config_file,
+                 cache_dir=cache_dir,
+                 force_download=force_download,
+                 proxies=proxies,
+                 resume_download=resume_download,
+                 local_files_only=local_files_only,
+                 use_auth_token=use_auth_token,
+             )
+             # Load config dict
+             config_dict = cls._dict_from_json_file(resolved_config_file)
+
+         except EnvironmentError:
+             msg = (
+                 f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
+                 f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
+                 f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n"
+             )
+             raise EnvironmentError(msg)
+
+         except json.JSONDecodeError:
+             msg = (
+                 "Couldn't reach server at '{}' to download configuration file or "
+                 "configuration file is not a valid JSON file. "
+                 "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+             )
+             raise EnvironmentError(msg)
+
+         return config_dict, kwargs
+
+
+ class BertConfig(PretrainedConfig):
+     model_type = "bert"
+
+     def __init__(
+         self,
+         vocab_size=30522,
+         hidden_size=768,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         intermediate_size=3072,
+         hidden_act="gelu",
+         hidden_dropout_prob=0.1,
+         attention_probs_dropout_prob=0.1,
+         max_position_embeddings=512,
+         type_vocab_size=2,
+         initializer_range=0.02,
+         layer_norm_eps=1e-12,
+         pad_token_id=0,
+         gradient_checkpointing=False,
+         position_embedding_type="absolute",
+         use_cache=True,
+         **kwargs
+     ):
+         super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.hidden_act = hidden_act
+         self.intermediate_size = intermediate_size
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.max_position_embeddings = max_position_embeddings
+         self.type_vocab_size = type_vocab_size
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.gradient_checkpointing = gradient_checkpointing
+         self.position_embedding_type = position_embedding_type
+         self.use_cache = use_cache
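
For context, a minimal sketch of how the configuration class above is typically exercised. The `config` module name is hypothetical (wherever the classes above live); the printed values are the BertConfig defaults shown above:

    from config import BertConfig  # hypothetical module name for the classes above

    # Load a config from a local directory containing config.json, or from the Hub.
    config = BertConfig.from_pretrained('bert-base-uncased')
    print(config.hidden_size, config.num_hidden_layers)  # 768 12

    # from_dict hands back any kwargs it did not consume:
    config, unused = BertConfig.from_dict({'vocab_size': 30522},
                                          not_a_config_key=1,
                                          return_unused_kwargs=True)
    print(unused)  # {'not_a_config_key': 1}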
data/ids-cfimdb-dev.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3087f571b66860fe5d035b5a018d08202ad3fd3720e4821c04b2acf6c7ded559
+ size 249095
data/ids-cfimdb-test-student.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ae611548c9eac879e9ebb406cc9f8ae68ff12f78090e4965af5cbdfa06240f4
+ size 495595
data/ids-cfimdb-train.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:140fc513045a966109faed46a5c7a898767b96714d71bcb9c15f659129fadcea
+ size 1693182
data/ids-sst-dev.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a186ce94577635fbe10beaaddd50f16cccf6c30973221cefdf90deed2a584bfe
+ size 151384
data/ids-sst-test-student.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bdd5a767faa0c26782117e37767ece154c30d5d04fb8727d09c71e3850a55c7b
+ size 313202
data/ids-sst-train.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03b2b625c090f94a6afd59f114cde5282e2053aab0b101e87ed695d8a0c5b1df
+ size 1175139
data/quora-dev.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e9dc46b273a711d82a065f55e1754a9b92c10ad7345ebe0b0ebba61397dda4a
+ size 6896912
data/quora-test-student.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4fa130f532cdde70287081aa04af13a4b12e3aa862e9162763d15fb46385497a
+ size 13487951
data/quora-train.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cd59e1ddb3a5b5d03f4a885c64e67aaf50122d9ab9ed7a476b5d2d6f7137ae8
+ size 48270674
data/sts-dev.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce3cad6f16062586ac7ba462c28b010a9be10c530fd5074165860d7b7ab4e93d
+ size 132265
data/sts-test-student.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dee455745b72e9ca3ff74e7c056bd73e34bad5b8d5641045a2c1e7e131866f47
+ size 256677
data/sts-train.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15d12efc2d656fffb1d61ac1f08ec4227f43925fd16f420c037cbd063699c21b
+ size 928832
datasets.py ADDED
@@ -0,0 +1,272 @@
+ #!/usr/bin/env python3
+
+ '''
+ This module contains our Dataset classes and functions that load the three datasets
+ for training and evaluating multitask BERT.
+
+ Feel free to edit code in this file if you wish to modify the way in which the data
+ examples are preprocessed.
+ '''
+
+ import csv
+
+ import torch
+ from torch.utils.data import Dataset
+ from tokenizer import BertTokenizer
+
+
+ def preprocess_string(s):
+     return ' '.join(s.lower()
+                     .replace('.', ' .')
+                     .replace('?', ' ?')
+                     .replace(',', ' ,')
+                     .replace('\'', ' \'')
+                     .split())
+
+
+ class SentenceClassificationDataset(Dataset):
+     def __init__(self, dataset, args):
+         self.dataset = dataset
+         self.p = args
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, data):
+         sents = [x[0] for x in data]
+         labels = [x[1] for x in data]
+         sent_ids = [x[2] for x in data]
+
+         encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+         token_ids = torch.LongTensor(encoding['input_ids'])
+         attention_mask = torch.LongTensor(encoding['attention_mask'])
+         labels = torch.LongTensor(labels)
+
+         return token_ids, attention_mask, labels, sents, sent_ids
+
+     def collate_fn(self, all_data):
+         token_ids, attention_mask, labels, sents, sent_ids = self.pad_data(all_data)
+
+         batched_data = {
+             'token_ids': token_ids,
+             'attention_mask': attention_mask,
+             'labels': labels,
+             'sents': sents,
+             'sent_ids': sent_ids
+         }
+
+         return batched_data
+
+
+ # Unlike SentenceClassificationDataset, we do not load labels in SentenceClassificationTestDataset.
+ class SentenceClassificationTestDataset(Dataset):
+     def __init__(self, dataset, args):
+         self.dataset = dataset
+         self.p = args
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, data):
+         sents = [x[0] for x in data]
+         sent_ids = [x[1] for x in data]
+
+         encoding = self.tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+         token_ids = torch.LongTensor(encoding['input_ids'])
+         attention_mask = torch.LongTensor(encoding['attention_mask'])
+
+         return token_ids, attention_mask, sents, sent_ids
+
+     def collate_fn(self, all_data):
+         token_ids, attention_mask, sents, sent_ids = self.pad_data(all_data)
+
+         batched_data = {
+             'token_ids': token_ids,
+             'attention_mask': attention_mask,
+             'sents': sents,
+             'sent_ids': sent_ids
+         }
+
+         return batched_data
+
+
+ class SentencePairDataset(Dataset):
+     def __init__(self, dataset, args, isRegression=False):
+         self.dataset = dataset
+         self.p = args
+         self.isRegression = isRegression
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, data):
+         sent1 = [x[0] for x in data]
+         sent2 = [x[1] for x in data]
+         labels = [x[2] for x in data]
+         sent_ids = [x[3] for x in data]
+
+         encoding1 = self.tokenizer(sent1, return_tensors='pt', padding=True, truncation=True)
+         encoding2 = self.tokenizer(sent2, return_tensors='pt', padding=True, truncation=True)
+
+         token_ids = torch.LongTensor(encoding1['input_ids'])
+         attention_mask = torch.LongTensor(encoding1['attention_mask'])
+         token_type_ids = torch.LongTensor(encoding1['token_type_ids'])
+
+         token_ids2 = torch.LongTensor(encoding2['input_ids'])
+         attention_mask2 = torch.LongTensor(encoding2['attention_mask'])
+         token_type_ids2 = torch.LongTensor(encoding2['token_type_ids'])
+
+         if self.isRegression:
+             labels = torch.DoubleTensor(labels)
+         else:
+             labels = torch.LongTensor(labels)
+
+         return (token_ids, token_type_ids, attention_mask,
+                 token_ids2, token_type_ids2, attention_mask2,
+                 labels, sent_ids)
+
+     def collate_fn(self, all_data):
+         (token_ids, token_type_ids, attention_mask,
+          token_ids2, token_type_ids2, attention_mask2,
+          labels, sent_ids) = self.pad_data(all_data)
+
+         batched_data = {
+             'token_ids_1': token_ids,
+             'token_type_ids_1': token_type_ids,
+             'attention_mask_1': attention_mask,
+             'token_ids_2': token_ids2,
+             'token_type_ids_2': token_type_ids2,
+             'attention_mask_2': attention_mask2,
+             'labels': labels,
+             'sent_ids': sent_ids
+         }
+
+         return batched_data
+
+
+ # Unlike SentencePairDataset, we do not load labels in SentencePairTestDataset.
+ class SentencePairTestDataset(Dataset):
+     def __init__(self, dataset, args):
+         self.dataset = dataset
+         self.p = args
+         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         return self.dataset[idx]
+
+     def pad_data(self, data):
+         sent1 = [x[0] for x in data]
+         sent2 = [x[1] for x in data]
+         sent_ids = [x[2] for x in data]
+
+         encoding1 = self.tokenizer(sent1, return_tensors='pt', padding=True, truncation=True)
+         encoding2 = self.tokenizer(sent2, return_tensors='pt', padding=True, truncation=True)
+
+         token_ids = torch.LongTensor(encoding1['input_ids'])
+         attention_mask = torch.LongTensor(encoding1['attention_mask'])
+         token_type_ids = torch.LongTensor(encoding1['token_type_ids'])
+
+         token_ids2 = torch.LongTensor(encoding2['input_ids'])
+         attention_mask2 = torch.LongTensor(encoding2['attention_mask'])
+         token_type_ids2 = torch.LongTensor(encoding2['token_type_ids'])
+
+         return (token_ids, token_type_ids, attention_mask,
+                 token_ids2, token_type_ids2, attention_mask2,
+                 sent_ids)
+
+     def collate_fn(self, all_data):
+         (token_ids, token_type_ids, attention_mask,
+          token_ids2, token_type_ids2, attention_mask2,
+          sent_ids) = self.pad_data(all_data)
+
+         batched_data = {
+             'token_ids_1': token_ids,
+             'token_type_ids_1': token_type_ids,
+             'attention_mask_1': attention_mask,
+             'token_ids_2': token_ids2,
+             'token_type_ids_2': token_type_ids2,
+             'attention_mask_2': attention_mask2,
+             'sent_ids': sent_ids
+         }
+
+         return batched_data
+
+
+ def load_multitask_data(sentiment_filename, paraphrase_filename, similarity_filename, split='train'):
+     sentiment_data = []
+     num_labels = {}
+     if split == 'test':
+         with open(sentiment_filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent = record['sentence'].lower().strip()
+                 sent_id = record['id'].lower().strip()
+                 sentiment_data.append((sent, sent_id))
+     else:
+         with open(sentiment_filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent = record['sentence'].lower().strip()
+                 sent_id = record['id'].lower().strip()
+                 label = int(record['sentiment'].strip())
+                 if label not in num_labels:
+                     num_labels[label] = len(num_labels)
+                 sentiment_data.append((sent, label, sent_id))
+
+     print(f"Loaded {len(sentiment_data)} {split} examples from {sentiment_filename}")
+
+     paraphrase_data = []
+     if split == 'test':
+         with open(paraphrase_filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent_id = record['id'].lower().strip()
+                 paraphrase_data.append((preprocess_string(record['sentence1']),
+                                         preprocess_string(record['sentence2']),
+                                         sent_id))
+     else:
+         with open(paraphrase_filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 try:
+                     sent_id = record['id'].lower().strip()
+                     paraphrase_data.append((preprocess_string(record['sentence1']),
+                                             preprocess_string(record['sentence2']),
+                                             int(float(record['is_duplicate'])), sent_id))
+                 except Exception:
+                     # Skip malformed Quora rows (e.g., missing or non-numeric fields).
+                     pass
+
+     print(f"Loaded {len(paraphrase_data)} {split} examples from {paraphrase_filename}")
+
+     similarity_data = []
+     if split == 'test':
+         with open(similarity_filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent_id = record['id'].lower().strip()
+                 similarity_data.append((preprocess_string(record['sentence1']),
+                                         preprocess_string(record['sentence2']),
+                                         sent_id))
+     else:
+         with open(similarity_filename, 'r') as fp:
+             for record in csv.DictReader(fp, delimiter='\t'):
+                 sent_id = record['id'].lower().strip()
+                 similarity_data.append((preprocess_string(record['sentence1']),
+                                         preprocess_string(record['sentence2']),
+                                         float(record['similarity']), sent_id))
+
+     print(f"Loaded {len(similarity_data)} {split} examples from {similarity_filename}")
+
+     return sentiment_data, num_labels, paraphrase_data, similarity_data
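
As a point of reference, a minimal sketch of how these classes are wired to a PyTorch DataLoader; the one-field `args` namespace is a hypothetical stand-in for the script's argparse namespace (the datasets only store it):

    from types import SimpleNamespace
    from torch.utils.data import DataLoader
    from datasets import SentenceClassificationDataset, load_multitask_data

    args = SimpleNamespace(batch_size=8)  # hypothetical minimal args
    sst, num_labels, para, sts = load_multitask_data(
        'data/ids-sst-train.csv', 'data/quora-train.csv', 'data/sts-train.csv', split='train')

    train_data = SentenceClassificationDataset(sst, args)
    loader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size,
                        collate_fn=train_data.collate_fn)
    batch = next(iter(loader))
    print(batch['token_ids'].shape, batch['labels'].shape)  # (batch, seq_len), (batch,)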
evaluation.py ADDED
@@ -0,0 +1,205 @@
+ #!/usr/bin/env python3
+
+ '''
+ Multitask BERT evaluation functions.
+
+ When training your multitask model, you will find it useful to call
+ model_eval_multitask to evaluate your model on the 3 tasks' dev sets.
+ '''
+
+ import torch
+ from sklearn.metrics import f1_score, accuracy_score
+ from tqdm import tqdm
+ import numpy as np
+
+
+ TQDM_DISABLE = False
+
+
+ # Evaluate multitask model on SST only.
+ def model_eval_sst(dataloader, model, device):
+     model.eval()  # Switch to eval mode, which turns off randomness like dropout.
+     y_true = []
+     y_pred = []
+     sents = []
+     sent_ids = []
+     for step, batch in enumerate(tqdm(dataloader, desc='eval', disable=TQDM_DISABLE)):
+         b_ids, b_mask, b_labels, b_sents, b_sent_ids = batch['token_ids'], batch['attention_mask'], \
+                                                        batch['labels'], batch['sents'], batch['sent_ids']
+
+         b_ids = b_ids.to(device)
+         b_mask = b_mask.to(device)
+
+         logits = model.predict_sentiment(b_ids, b_mask)
+         logits = logits.detach().cpu().numpy()
+         preds = np.argmax(logits, axis=1).flatten()
+
+         b_labels = b_labels.flatten().numpy()
+         y_true.extend(b_labels)
+         y_pred.extend(preds)
+         sents.extend(b_sents)
+         sent_ids.extend(b_sent_ids)
+
+     f1 = f1_score(y_true, y_pred, average='macro')
+     acc = accuracy_score(y_true, y_pred)
+
+     return acc, f1, y_pred, y_true, sents, sent_ids
+
+
+ # Evaluate multitask model on dev sets.
+ def model_eval_multitask(sentiment_dataloader,
+                          paraphrase_dataloader,
+                          sts_dataloader,
+                          model, device):
+     model.eval()  # Switch to eval mode, which turns off randomness like dropout.
+
+     with torch.no_grad():
+         # Evaluate sentiment classification.
+         sst_y_true = []
+         sst_y_pred = []
+         sst_sent_ids = []
+         for step, batch in enumerate(tqdm(sentiment_dataloader, desc='eval', disable=TQDM_DISABLE)):
+             b_ids, b_mask, b_labels, b_sent_ids = batch['token_ids'], batch['attention_mask'], batch['labels'], batch['sent_ids']
+
+             b_ids = b_ids.to(device)
+             b_mask = b_mask.to(device)
+
+             logits = model.predict_sentiment(b_ids, b_mask)
+             y_hat = logits.argmax(dim=-1).flatten().cpu().numpy()
+             b_labels = b_labels.flatten().cpu().numpy()
+
+             sst_y_pred.extend(y_hat)
+             sst_y_true.extend(b_labels)
+             sst_sent_ids.extend(b_sent_ids)
+
+         sentiment_accuracy = np.mean(np.array(sst_y_pred) == np.array(sst_y_true))
+
+         # Evaluate paraphrase detection.
+         para_y_true = []
+         para_y_pred = []
+         para_sent_ids = []
+         for step, batch in enumerate(tqdm(paraphrase_dataloader, desc='eval', disable=TQDM_DISABLE)):
+             (b_ids1, b_mask1,
+              b_ids2, b_mask2,
+              b_labels, b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
+                                       batch['token_ids_2'], batch['attention_mask_2'],
+                                       batch['labels'], batch['sent_ids'])
+
+             b_ids1 = b_ids1.to(device)
+             b_mask1 = b_mask1.to(device)
+             b_ids2 = b_ids2.to(device)
+             b_mask2 = b_mask2.to(device)
+
+             logits = model.predict_paraphrase(b_ids1, b_mask1, b_ids2, b_mask2)
+             y_hat = logits.sigmoid().round().flatten().cpu().numpy()
+             b_labels = b_labels.flatten().cpu().numpy()
+
+             para_y_pred.extend(y_hat)
+             para_y_true.extend(b_labels)
+             para_sent_ids.extend(b_sent_ids)
+
+         paraphrase_accuracy = np.mean(np.array(para_y_pred) == np.array(para_y_true))
+
+         # Evaluate semantic textual similarity.
+         sts_y_true = []
+         sts_y_pred = []
+         sts_sent_ids = []
+         for step, batch in enumerate(tqdm(sts_dataloader, desc='eval', disable=TQDM_DISABLE)):
+             (b_ids1, b_mask1,
+              b_ids2, b_mask2,
+              b_labels, b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
+                                       batch['token_ids_2'], batch['attention_mask_2'],
+                                       batch['labels'], batch['sent_ids'])
+
+             b_ids1 = b_ids1.to(device)
+             b_mask1 = b_mask1.to(device)
+             b_ids2 = b_ids2.to(device)
+             b_mask2 = b_mask2.to(device)
+
+             logits = model.predict_similarity(b_ids1, b_mask1, b_ids2, b_mask2)
+             y_hat = logits.flatten().cpu().numpy()
+             b_labels = b_labels.flatten().cpu().numpy()
+
+             sts_y_pred.extend(y_hat)
+             sts_y_true.extend(b_labels)
+             sts_sent_ids.extend(b_sent_ids)
+
+         pearson_mat = np.corrcoef(sts_y_pred, sts_y_true)
+         sts_corr = pearson_mat[1][0]
+
+         print(f'Sentiment classification accuracy: {sentiment_accuracy:.3f}')
+         print(f'Paraphrase detection accuracy: {paraphrase_accuracy:.3f}')
+         print(f'Semantic Textual Similarity correlation: {sts_corr:.3f}')
+
+         return (sentiment_accuracy, sst_y_pred, sst_sent_ids,
+                 paraphrase_accuracy, para_y_pred, para_sent_ids,
+                 sts_corr, sts_y_pred, sts_sent_ids)
+
+
+ # Evaluate multitask model on test sets.
+ def model_eval_test_multitask(sentiment_dataloader,
+                               paraphrase_dataloader,
+                               sts_dataloader,
+                               model, device):
+     model.eval()  # Switch to eval mode, which turns off randomness like dropout.
+
+     with torch.no_grad():
+         # Evaluate sentiment classification.
+         sst_y_pred = []
+         sst_sent_ids = []
+         for step, batch in enumerate(tqdm(sentiment_dataloader, desc='eval', disable=TQDM_DISABLE)):
+             b_ids, b_mask, b_sent_ids = batch['token_ids'], batch['attention_mask'], batch['sent_ids']
+
+             b_ids = b_ids.to(device)
+             b_mask = b_mask.to(device)
+
+             logits = model.predict_sentiment(b_ids, b_mask)
+             y_hat = logits.argmax(dim=-1).flatten().cpu().numpy()
+
+             sst_y_pred.extend(y_hat)
+             sst_sent_ids.extend(b_sent_ids)
+
+         # Evaluate paraphrase detection.
+         para_y_pred = []
+         para_sent_ids = []
+         for step, batch in enumerate(tqdm(paraphrase_dataloader, desc='eval', disable=TQDM_DISABLE)):
+             (b_ids1, b_mask1,
+              b_ids2, b_mask2,
+              b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
+                             batch['token_ids_2'], batch['attention_mask_2'],
+                             batch['sent_ids'])
+
+             b_ids1 = b_ids1.to(device)
+             b_mask1 = b_mask1.to(device)
+             b_ids2 = b_ids2.to(device)
+             b_mask2 = b_mask2.to(device)
+
+             logits = model.predict_paraphrase(b_ids1, b_mask1, b_ids2, b_mask2)
+             y_hat = logits.sigmoid().round().flatten().cpu().numpy()
+
+             para_y_pred.extend(y_hat)
+             para_sent_ids.extend(b_sent_ids)
+
+         # Evaluate semantic textual similarity.
+         sts_y_pred = []
+         sts_sent_ids = []
+         for step, batch in enumerate(tqdm(sts_dataloader, desc='eval', disable=TQDM_DISABLE)):
+             (b_ids1, b_mask1,
+              b_ids2, b_mask2,
+              b_sent_ids) = (batch['token_ids_1'], batch['attention_mask_1'],
+                             batch['token_ids_2'], batch['attention_mask_2'],
+                             batch['sent_ids'])
+
+             b_ids1 = b_ids1.to(device)
+             b_mask1 = b_mask1.to(device)
+             b_ids2 = b_ids2.to(device)
+             b_mask2 = b_mask2.to(device)
+
+             logits = model.predict_similarity(b_ids1, b_mask1, b_ids2, b_mask2)
+             y_hat = logits.flatten().cpu().numpy()
+
+             sts_y_pred.extend(y_hat)
+             sts_sent_ids.extend(b_sent_ids)
+
+     return (sst_y_pred, sst_sent_ids,
+             para_y_pred, para_sent_ids,
+             sts_y_pred, sts_sent_ids)
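
The call suggested in the docstring looks like this in practice (dataloaders built exactly as in multitask_classifier.py; `model` and `device` assumed already constructed):

    (sst_acc, sst_pred, sst_ids,
     para_acc, para_pred, para_ids,
     sts_corr, sts_pred, sts_ids) = model_eval_multitask(
        sst_dev_dataloader, para_dev_dataloader, sts_dev_dataloader, model, device)
    print(f"dev: sst {sst_acc:.3f} | para {para_acc:.3f} | sts {sts_corr:.3f}")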
multitask_classifier.py ADDED
@@ -0,0 +1,340 @@
+ '''
+ Multitask BERT class, starter training code, evaluation, and test code.
+
+ Of note are:
+ * class MultitaskBERT: Your implementation of multitask BERT.
+ * function train_multitask: Training procedure for MultitaskBERT. Starter code
+     copies the training procedure from `classifier.py` (single-task SST).
+ * function test_multitask: Test procedure for MultitaskBERT. This function generates
+     the required files for submission.
+
+ Running `python multitask_classifier.py` trains and tests your MultitaskBERT and
+ writes all required submission files.
+ '''
+
+ import random, numpy as np, argparse
+ from types import SimpleNamespace
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+ from torch.utils.data import DataLoader
+
+ from bert import BertModel
+ from optimizer import AdamW
+ from tqdm import tqdm
+
+ from datasets import (
+     SentenceClassificationDataset,
+     SentenceClassificationTestDataset,
+     SentencePairDataset,
+     SentencePairTestDataset,
+     load_multitask_data
+ )
+
+ from evaluation import model_eval_sst, model_eval_multitask, model_eval_test_multitask
+
+
+ TQDM_DISABLE = False
+
+
+ # Fix the random seed.
+ def seed_everything(seed=11711):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.benchmark = False
+     torch.backends.cudnn.deterministic = True
+
+
+ BERT_HIDDEN_SIZE = 768
+ N_SENTIMENT_CLASSES = 5
+
+
+ class MultitaskBERT(nn.Module):
+     '''
+     This module should use BERT for 3 tasks:
+
+     - Sentiment classification (predict_sentiment)
+     - Paraphrase detection (predict_paraphrase)
+     - Semantic Textual Similarity (predict_similarity)
+     '''
+     def __init__(self, config):
+         super(MultitaskBERT, self).__init__()
+         self.bert = BertModel.from_pretrained('bert-base-uncased')
+         # last-linear-layer mode does not require updating BERT parameters.
+         assert config.fine_tune_mode in ["last-linear-layer", "full-model"]
+         for param in self.bert.parameters():
+             if config.fine_tune_mode == 'last-linear-layer':
+                 param.requires_grad = False
+             elif config.fine_tune_mode == 'full-model':
+                 param.requires_grad = True
+         # You will want to add layers here to perform the downstream tasks.
+         ### TODO
+         raise NotImplementedError
+
+
+     def forward(self, input_ids, attention_mask):
+         'Takes a batch of sentences and produces embeddings for them.'
+         # The final BERT embedding is the hidden state of the [CLS] token (the first token).
+         # Here, you can start by just returning the embeddings straight from BERT.
+         # When thinking of improvements, you can later try modifying this
+         # (e.g., by adding other layers).
+         ### TODO
+         raise NotImplementedError
+
+
+     def predict_sentiment(self, input_ids, attention_mask):
+         '''Given a batch of sentences, outputs logits for classifying sentiment.
+         There are 5 sentiment classes:
+         (0 - negative, 1 - somewhat negative, 2 - neutral, 3 - somewhat positive, 4 - positive)
+         Thus, your output should contain 5 logits for each sentence.
+         '''
+         ### TODO
+         raise NotImplementedError
+
+
+     def predict_paraphrase(self,
+                            input_ids_1, attention_mask_1,
+                            input_ids_2, attention_mask_2):
+         '''Given a batch of pairs of sentences, outputs a single logit for predicting whether they are paraphrases.
+         Note that your output should be unnormalized (a logit); it will be passed to the sigmoid function
+         during evaluation.
+         '''
+         ### TODO
+         raise NotImplementedError
+
+
+     def predict_similarity(self,
+                            input_ids_1, attention_mask_1,
+                            input_ids_2, attention_mask_2):
+         '''Given a batch of pairs of sentences, outputs a single logit corresponding to how similar they are.
+         Note that your output should be unnormalized (a logit).
+         '''
+         ### TODO
+         raise NotImplementedError
+
+
+ def save_model(model, optimizer, args, config, filepath):
+     save_info = {
+         'model': model.state_dict(),
+         'optim': optimizer.state_dict(),
+         'args': args,
+         'model_config': config,
+         'system_rng': random.getstate(),
+         'numpy_rng': np.random.get_state(),
+         'torch_rng': torch.random.get_rng_state(),
+     }
+
+     torch.save(save_info, filepath)
+     print(f"Saved the model to {filepath}")
+
+
+ def train_multitask(args):
+     '''Train MultitaskBERT.
+
+     Currently only trains on the SST dataset. The way you incorporate training examples
+     from other datasets into the training procedure is up to you. To begin, take a
+     look at test_multitask below to see how you can use the custom torch `Dataset`s
+     in datasets.py to load in examples from the Quora and SemEval datasets.
+     '''
+     device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
+     # Create the data and its corresponding datasets and dataloader.
+     sst_train_data, num_labels, para_train_data, sts_train_data = load_multitask_data(args.sst_train, args.para_train, args.sts_train, split='train')
+     sst_dev_data, num_labels, para_dev_data, sts_dev_data = load_multitask_data(args.sst_dev, args.para_dev, args.sts_dev, split='dev')
+
+     sst_train_data = SentenceClassificationDataset(sst_train_data, args)
+     sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
+
+     sst_train_dataloader = DataLoader(sst_train_data, shuffle=True, batch_size=args.batch_size,
+                                       collate_fn=sst_train_data.collate_fn)
+     sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
+                                     collate_fn=sst_dev_data.collate_fn)
+
+     # Init model.
+     config = {'hidden_dropout_prob': args.hidden_dropout_prob,
+               'num_labels': num_labels,
+               'hidden_size': 768,
+               'data_dir': '.',
+               'fine_tune_mode': args.fine_tune_mode}
+
+     config = SimpleNamespace(**config)
+
+     model = MultitaskBERT(config)
+     model = model.to(device)
+
+     lr = args.lr
+     optimizer = AdamW(model.parameters(), lr=lr)
+     best_dev_acc = 0
+
+     # Run for the specified number of epochs.
+     for epoch in range(args.epochs):
+         model.train()
+         train_loss = 0
+         num_batches = 0
+         for batch in tqdm(sst_train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE):
+             b_ids, b_mask, b_labels = (batch['token_ids'],
+                                        batch['attention_mask'], batch['labels'])
+
+             b_ids = b_ids.to(device)
+             b_mask = b_mask.to(device)
+             b_labels = b_labels.to(device)
+
+             optimizer.zero_grad()
+             logits = model.predict_sentiment(b_ids, b_mask)
+             loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size
+
+             loss.backward()
+             optimizer.step()
+
+             train_loss += loss.item()
+             num_batches += 1
+
+         train_loss = train_loss / num_batches
+
+         train_acc, train_f1, *_ = model_eval_sst(sst_train_dataloader, model, device)
+         dev_acc, dev_f1, *_ = model_eval_sst(sst_dev_dataloader, model, device)
+
+         if dev_acc > best_dev_acc:
+             best_dev_acc = dev_acc
+             save_model(model, optimizer, args, config, args.filepath)
+
+         print(f"Epoch {epoch}: train loss :: {train_loss:.3f}, train acc :: {train_acc:.3f}, dev acc :: {dev_acc:.3f}")
+
+
+ def test_multitask(args):
+     '''Test and save predictions on the dev and test sets of all three tasks.'''
+     with torch.no_grad():
+         device = torch.device('cuda') if args.use_gpu else torch.device('cpu')
+         saved = torch.load(args.filepath)
+         config = saved['model_config']
+
+         model = MultitaskBERT(config)
+         model.load_state_dict(saved['model'])
+         model = model.to(device)
+         print(f"Loaded model to test from {args.filepath}")
+
+         sst_test_data, num_labels, para_test_data, sts_test_data = \
+             load_multitask_data(args.sst_test, args.para_test, args.sts_test, split='test')
+
+         sst_dev_data, num_labels, para_dev_data, sts_dev_data = \
+             load_multitask_data(args.sst_dev, args.para_dev, args.sts_dev, split='dev')
+
+         sst_test_data = SentenceClassificationTestDataset(sst_test_data, args)
+         sst_dev_data = SentenceClassificationDataset(sst_dev_data, args)
+
+         sst_test_dataloader = DataLoader(sst_test_data, shuffle=True, batch_size=args.batch_size,
+                                          collate_fn=sst_test_data.collate_fn)
+         sst_dev_dataloader = DataLoader(sst_dev_data, shuffle=False, batch_size=args.batch_size,
+                                         collate_fn=sst_dev_data.collate_fn)
+
+         para_test_data = SentencePairTestDataset(para_test_data, args)
+         para_dev_data = SentencePairDataset(para_dev_data, args)
+
+         para_test_dataloader = DataLoader(para_test_data, shuffle=True, batch_size=args.batch_size,
+                                           collate_fn=para_test_data.collate_fn)
+         para_dev_dataloader = DataLoader(para_dev_data, shuffle=False, batch_size=args.batch_size,
+                                          collate_fn=para_dev_data.collate_fn)
+
+         sts_test_data = SentencePairTestDataset(sts_test_data, args)
+         sts_dev_data = SentencePairDataset(sts_dev_data, args, isRegression=True)
+
+         sts_test_dataloader = DataLoader(sts_test_data, shuffle=True, batch_size=args.batch_size,
+                                          collate_fn=sts_test_data.collate_fn)
+         sts_dev_dataloader = DataLoader(sts_dev_data, shuffle=False, batch_size=args.batch_size,
+                                         collate_fn=sts_dev_data.collate_fn)
+
+         dev_sentiment_accuracy, dev_sst_y_pred, dev_sst_sent_ids, \
+             dev_paraphrase_accuracy, dev_para_y_pred, dev_para_sent_ids, \
+             dev_sts_corr, dev_sts_y_pred, dev_sts_sent_ids = model_eval_multitask(sst_dev_dataloader,
+                                                                                   para_dev_dataloader,
+                                                                                   sts_dev_dataloader, model, device)
+
+         test_sst_y_pred, \
+             test_sst_sent_ids, test_para_y_pred, test_para_sent_ids, test_sts_y_pred, test_sts_sent_ids = \
+             model_eval_test_multitask(sst_test_dataloader,
+                                       para_test_dataloader,
+                                       sts_test_dataloader, model, device)
+
+         with open(args.sst_dev_out, "w+") as f:
+             print(f"dev sentiment acc :: {dev_sentiment_accuracy:.3f}")
+             f.write("id \t Predicted_Sentiment \n")
+             for p, s in zip(dev_sst_sent_ids, dev_sst_y_pred):
+                 f.write(f"{p} , {s} \n")
+
+         with open(args.sst_test_out, "w+") as f:
+             f.write("id \t Predicted_Sentiment \n")
+             for p, s in zip(test_sst_sent_ids, test_sst_y_pred):
+                 f.write(f"{p} , {s} \n")
+
+         with open(args.para_dev_out, "w+") as f:
+             print(f"dev paraphrase acc :: {dev_paraphrase_accuracy:.3f}")
+             f.write("id \t Predicted_Is_Paraphrase \n")
+             for p, s in zip(dev_para_sent_ids, dev_para_y_pred):
+                 f.write(f"{p} , {s} \n")
+
+         with open(args.para_test_out, "w+") as f:
+             f.write("id \t Predicted_Is_Paraphrase \n")
+             for p, s in zip(test_para_sent_ids, test_para_y_pred):
+                 f.write(f"{p} , {s} \n")
+
+         with open(args.sts_dev_out, "w+") as f:
+             print(f"dev sts corr :: {dev_sts_corr:.3f}")
+             f.write("id \t Predicted_Similarity \n")
+             for p, s in zip(dev_sts_sent_ids, dev_sts_y_pred):
+                 f.write(f"{p} , {s} \n")
+
+         with open(args.sts_test_out, "w+") as f:
+             f.write("id \t Predicted_Similarity \n")
+             for p, s in zip(test_sts_sent_ids, test_sts_y_pred):
+                 f.write(f"{p} , {s} \n")
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--sst_train", type=str, default="data/ids-sst-train.csv")
+     parser.add_argument("--sst_dev", type=str, default="data/ids-sst-dev.csv")
+     parser.add_argument("--sst_test", type=str, default="data/ids-sst-test-student.csv")
+
+     parser.add_argument("--para_train", type=str, default="data/quora-train.csv")
+     parser.add_argument("--para_dev", type=str, default="data/quora-dev.csv")
+     parser.add_argument("--para_test", type=str, default="data/quora-test-student.csv")
+
+     parser.add_argument("--sts_train", type=str, default="data/sts-train.csv")
+     parser.add_argument("--sts_dev", type=str, default="data/sts-dev.csv")
+     parser.add_argument("--sts_test", type=str, default="data/sts-test-student.csv")
+
+     parser.add_argument("--seed", type=int, default=11711)
+     parser.add_argument("--epochs", type=int, default=10)
+     parser.add_argument("--fine-tune-mode", type=str,
+                         help='last-linear-layer: the BERT parameters are frozen and the task-specific head parameters are updated; full-model: BERT parameters are updated as well',
+                         choices=('last-linear-layer', 'full-model'), default="last-linear-layer")
+     parser.add_argument("--use_gpu", action='store_true')
+
+     parser.add_argument("--sst_dev_out", type=str, default="predictions/sst-dev-output.csv")
+     parser.add_argument("--sst_test_out", type=str, default="predictions/sst-test-output.csv")
+
+     parser.add_argument("--para_dev_out", type=str, default="predictions/para-dev-output.csv")
+     parser.add_argument("--para_test_out", type=str, default="predictions/para-test-output.csv")
+
+     parser.add_argument("--sts_dev_out", type=str, default="predictions/sts-dev-output.csv")
+     parser.add_argument("--sts_test_out", type=str, default="predictions/sts-test-output.csv")
+
+     parser.add_argument("--batch_size", help='sst: 64, cfimdb: 8 can fit a 12GB GPU', type=int, default=8)
+     parser.add_argument("--hidden_dropout_prob", type=float, default=0.3)
+     parser.add_argument("--lr", type=float, help="learning rate", default=1e-5)
+
+     args = parser.parse_args()
+     return args
+
+
+ if __name__ == "__main__":
+     args = get_args()
+     args.filepath = f'{args.fine_tune_mode}-{args.epochs}-{args.lr}-multitask.pt'  # Save path.
+     seed_everything(args.seed)  # Fix the seed for reproducibility.
+     train_multitask(args)
+     test_multitask(args)
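
For reference, typical invocations given the flags defined in get_args above (the hyperparameter values here are only examples, not recommended settings):

    python multitask_classifier.py --fine-tune-mode last-linear-layer --epochs 10 --use_gpu
    python multitask_classifier.py --fine-tune-mode full-model --lr 1e-5 --batch_size 8 --use_gpu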
optimizer.py ADDED
@@ -0,0 +1,90 @@
+ from typing import Callable, Iterable, Tuple
+ import math
+
+ import torch
+ from torch.optim import Optimizer
+
+
+ class AdamW(Optimizer):
+     def __init__(
+         self,
+         params: Iterable[torch.nn.parameter.Parameter],
+         lr: float = 1e-3,
+         betas: Tuple[float, float] = (0.9, 0.999),
+         eps: float = 1e-6,
+         weight_decay: float = 0.0,
+         correct_bias: bool = True,
+     ):
+         if lr < 0.0:
+             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+         if not 0.0 <= betas[0] < 1.0:
+             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0)".format(betas[0]))
+         if not 0.0 <= betas[1] < 1.0:
+             raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0)".format(betas[1]))
+         if not 0.0 <= eps:
+             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
+         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+         super().__init__(params, defaults)
+
+     def step(self, closure: Callable = None):
+         loss = None
+         if closure is not None:
+             loss = closure()
+
+         for group in self.param_groups:
+             for p in group["params"]:
+                 if p.grad is None:
+                     continue
+                 grad = p.grad.data
+                 if grad.is_sparse:
+                     raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
+
+                 # Access state
+                 state = self.state[p]
+
+                 # Initialize state if not already done
+                 if len(state) == 0:
+                     state["step"] = 0
+                     state["exp_avg"] = torch.zeros_like(p.data)
+                     state["exp_avg_sq"] = torch.zeros_like(p.data)
+
+                 # Hyperparameters
+                 alpha = group["lr"]
+                 beta1, beta2 = group["betas"]
+                 eps = group["eps"]
+                 weight_decay = group["weight_decay"]
+                 correct_bias = group["correct_bias"]
+
+                 # Retrieve state variables
+                 exp_avg = state["exp_avg"]
+                 exp_avg_sq = state["exp_avg_sq"]
+                 step = state["step"]
+
+                 # Update step
+                 step += 1
+                 state["step"] = step
+
+                 # Update biased first and second moment estimates
+                 exp_avg.mul_(beta1).add_(grad, alpha=(1 - beta1))
+                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
+
+                 # Compute bias-corrected moments
+                 if correct_bias:
+                     bias_correction1 = 1 - beta1 ** step
+                     bias_correction2 = 1 - beta2 ** step
+                     exp_avg_corr = exp_avg / bias_correction1
+                     exp_avg_sq_corr = exp_avg_sq / bias_correction2
+                 else:
+                     exp_avg_corr = exp_avg
+                     exp_avg_sq_corr = exp_avg_sq
+
+                 # Update parameters
+                 denom = exp_avg_sq_corr.sqrt().add_(eps)
+                 step_size = alpha
+                 p.data.addcdiv_(exp_avg_corr, denom, value=-step_size)
+
+                 # Apply decoupled weight decay (AdamW): decay acts directly on the
+                 # weights rather than through the gradient.
+                 if weight_decay != 0:
+                     p.data.add_(p.data, alpha=-alpha * weight_decay)
+
+         return loss
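
Written out, the update implemented by the loop above is Adam with bias correction followed by a decoupled weight-decay step, for gradient g_t at step t:

    m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
    v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
    m_hat_t = m_t / (1 - beta1^t)
    v_hat_t = v_t / (1 - beta2^t)
    theta_t = theta_{t-1} - lr * m_hat_t / (sqrt(v_hat_t) + eps)
    theta_t = theta_t - lr * weight_decay * theta_t      # decoupled decay, not through the gradient

Note that this implementation applies the decay term after the Adam step; applying it before the step is the other common convention.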
optimizer_test.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:77b817e0dce16a9bc8d3a6bcb88035db68f7d783dc8a565737581fadd05db815
+ size 152
optimizer_test.py ADDED
@@ -0,0 +1,34 @@
+ import torch
+ import numpy as np
+ from optimizer import AdamW
+
+ seed = 0
+
+
+ def test_optimizer(opt_class) -> torch.Tensor:
+     rng = np.random.default_rng(seed)
+     torch.manual_seed(seed)
+     model = torch.nn.Linear(3, 2, bias=False)
+     opt = opt_class(
+         model.parameters(),
+         lr=1e-3,
+         weight_decay=1e-4,
+         correct_bias=True,
+     )
+     for _ in range(1000):
+         opt.zero_grad()
+         x = torch.FloatTensor(rng.uniform(size=[model.in_features]))
+         y_hat = model(x)
+         y = torch.Tensor([x[0] + x[1], -x[2]])
+         loss = ((y - y_hat) ** 2).sum()
+         loss.backward()
+         opt.step()
+     return model.weight.detach()
+
+
+ ref = torch.tensor(np.load("optimizer_test.npy"))
+ actual = test_optimizer(AdamW)
+ print(ref)
+ print(actual)
+ assert torch.allclose(ref, actual, atol=1e-6, rtol=1e-4)
+ print("Optimizer test passed!")
predictions/README ADDED
@@ -0,0 +1,2 @@
+ By default, `classifier.py` and `multitask_classifier.py` write your model predictions into this folder.
+ Before running prepare_submit.py, make sure that this directory has been populated!
predictions/last-linear-layer-cfimdb-dev-out.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c3f994587376345ea6a1e80a7946d5889259f6a427989c71e0b45de28ea4545d
+ size 7621
predictions/last-linear-layer-cfimdb-test-out.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ebedf210c8973e02648e96152e253daa2385b230a48da151812a58d80178536
+ size 15154
predictions/last-linear-layer-sst-dev-out.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22412dead5299ffb8fae45448f240cb135e3ad5dc04cea96975e893bdd719ba8
+ size 34157
predictions/last-linear-layer-sst-test-out.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3455d6637e5ecd118c31e48534d92298da3c865ed11ad93e2aadc09fcc743666
+ size 68536
prepare_submit.py ADDED
@@ -0,0 +1,18 @@
+ # Creates a zip file for submission on Gradescope.
+
+ import os
+ import zipfile
+
+ required_files = [p for p in os.listdir('.') if p.endswith('.py')] + \
+                  [f'predictions/{p}' for p in os.listdir('predictions')]
+
+ def main():
+     aid = 'cs224n_default_final_project_submission'
+     path = os.getcwd()
+     with zipfile.ZipFile(f"{aid}.zip", 'w') as zz:
+         for file in required_files:
+             zz.write(file, os.path.join(".", file))
+     print(f"Submission zip file created: {aid}.zip")
+
+ if __name__ == '__main__':
+     main()
sanity_check.data ADDED
Binary file (56.4 kB).
sanity_check.py ADDED
@@ -0,0 +1,19 @@
+ import torch
+ from bert import BertModel
+
+
+ sanity_data = torch.load("./sanity_check.data", weights_only=True)
+ sent_ids = torch.tensor([[101, 7592, 2088, 102, 0, 0, 0, 0],
+                          [101, 7592, 15756, 2897, 2005, 17953, 2361, 102]])
+ att_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]])
+
+ # Load model.
+ bert = BertModel.from_pretrained('bert-base-uncased')
+ outputs = bert(sent_ids, att_mask)
+ att_mask = att_mask.unsqueeze(-1)
+ outputs['last_hidden_state'] = outputs['last_hidden_state'] * att_mask
+ sanity_data['last_hidden_state'] = sanity_data['last_hidden_state'] * att_mask
+
+ for k in ['last_hidden_state', 'pooler_output']:
+     assert torch.allclose(outputs[k], sanity_data[k], atol=1e-5, rtol=1e-3)
+ print("Your BERT implementation is correct!")
setup.sh ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env bash
+
+ conda create -n cs224n_dfp python=3.8
+ conda activate cs224n_dfp
+
+ pip install torch torchvision torchaudio
+ pip install tqdm==4.58.0
+ pip install requests==2.25.1
+ pip install importlib-metadata==3.7.0
+ pip install filelock==3.0.12
+ pip install sklearn==0.0
+ pip install tokenizers==0.15
+ pip install explainaboard_client==0.0.7
sst-classifier.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62f6282ea608a997c1b43071cedcb1c4ba454b420305c7b15138aa9d7f70103d
+ size 438072793
tokenizer.py ADDED
The diff for this file is too large to render.
utils.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Optional, Union, Tuple, BinaryIO
2
+ import os
3
+ import sys
4
+ import json
5
+ import tempfile
6
+ import copy
7
+ from tqdm.auto import tqdm
8
+ from functools import partial
9
+ from urllib.parse import urlparse
10
+ from pathlib import Path
11
+ import requests
12
+ from hashlib import sha256
13
+ from filelock import FileLock
14
+ import importlib_metadata
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch import Tensor
18
+ import fnmatch
19
+
20
+ __version__ = "4.0.0"
21
+ _torch_version = importlib_metadata.version("torch")
22
+
23
+ hf_cache_home = os.path.expanduser(os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")))
24
+ default_cache_path = os.path.join(hf_cache_home, "transformers")
25
+ PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
26
+ PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
27
+ TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
28
+
29
+ PRESET_MIRROR_DICT = {
30
+ "tuna": "https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models",
31
+ "bfsu": "https://mirrors.bfsu.edu.cn/hugging-face-models",
32
+ }
33
+ HUGGINGFACE_CO_PREFIX = "https://huggingface.co/{model_id}/resolve/{revision}/{filename}"
34
+ WEIGHTS_NAME = "pytorch_model.bin"
35
+ CONFIG_NAME = "config.json"
36
+
37
+
38
+ def is_torch_available():
39
+ return True
40
+
41
+
42
+ def is_tf_available():
43
+ return False
44
+
45
+
46
+ def is_remote_url(url_or_filename):
47
+ parsed = urlparse(url_or_filename)
48
+ return parsed.scheme in ("http", "https")
49
+
50
+
51
+ def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None):
52
+ headers = copy.deepcopy(headers)
53
+ if resume_size > 0:
54
+ headers["Range"] = "bytes=%d-" % (resume_size,)
55
+ r = requests.get(url, stream=True, proxies=proxies, headers=headers)
56
+ r.raise_for_status()
57
+ content_length = r.headers.get("Content-Length")
58
+ total = resume_size + int(content_length) if content_length is not None else None
59
+ progress = tqdm(
60
+ unit="B",
61
+ unit_scale=True,
62
+ total=total,
63
+ initial=resume_size,
64
+ desc="Downloading",
65
+ disable=False,
66
+ )
67
+ for chunk in r.iter_content(chunk_size=1024):
68
+ if chunk: # filter out keep-alive new chunks
69
+ progress.update(len(chunk))
70
+ temp_file.write(chunk)
71
+ progress.close()
72
+
73
+
74
+ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
75
+ url_bytes = url.encode("utf-8")
76
+ filename = sha256(url_bytes).hexdigest()
77
+
78
+ if etag:
79
+ etag_bytes = etag.encode("utf-8")
80
+ filename += "." + sha256(etag_bytes).hexdigest()
81
+
82
+ if url.endswith(".h5"):
83
+ filename += ".h5"
84
+
85
+ return filename
86
+
87
+
88
+ def hf_bucket_url(
89
+ model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None
90
+ ) -> str:
91
+ if subfolder is not None:
92
+ filename = f"{subfolder}/{filename}"
93
+
94
+ if mirror:
95
+ endpoint = PRESET_MIRROR_DICT.get(mirror, mirror)
96
+ legacy_format = "/" not in model_id
97
+ if legacy_format:
98
+ return f"{endpoint}/{model_id}-{filename}"
99
+ else:
100
+ return f"{endpoint}/{model_id}/{filename}"
101
+
102
+ if revision is None:
103
+ revision = "main"
104
+ return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
105
+
106
+
107
+ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
108
+ ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
109
+ if is_torch_available():
110
+ ua += f"; torch/{_torch_version}"
111
+ if is_tf_available():
112
+ ua += f"; tensorflow/{_tf_version}"
113
+ if isinstance(user_agent, dict):
114
+ ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
115
+ elif isinstance(user_agent, str):
116
+ ua += "; " + user_agent
117
+ return ua
118
+
119
+
120
+ def get_from_cache(
121
+ url: str,
122
+ cache_dir=None,
123
+ force_download=False,
124
+ proxies=None,
125
+ etag_timeout=10,
126
+ resume_download=False,
127
+ user_agent: Union[Dict, str, None] = None,
128
+ use_auth_token: Union[bool, str, None] = None,
129
+ local_files_only=False,
130
+ ) -> Optional[str]:
131
+ if cache_dir is None:
132
+ cache_dir = TRANSFORMERS_CACHE
133
+ if isinstance(cache_dir, Path):
134
+ cache_dir = str(cache_dir)
135
+
136
+ os.makedirs(cache_dir, exist_ok=True)
137
+
138
+ headers = {"user-agent": http_user_agent(user_agent)}
139
+ if isinstance(use_auth_token, str):
140
+ headers["authorization"] = "Bearer {}".format(use_auth_token)
141
+ elif use_auth_token:
142
+ token = HfFolder.get_token()
143
+ if token is None:
144
+ raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
145
+ headers["authorization"] = "Bearer {}".format(token)
146
+
147
+ url_to_download = url
148
+ etag = None
149
+ if not local_files_only:
150
+ try:
151
+ r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
152
+ r.raise_for_status()
153
+ etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
154
+ # We favor a custom header indicating the etag of the linked resource, and
155
+ # we fallback to the regular etag header.
156
+ # If we don't have any of those, raise an error.
157
+ if etag is None:
158
+ raise OSError(
159
+ "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
160
+ )
161
+ # In case of a redirect,
162
+ # save an extra redirect on the request.get call,
163
+ # and ensure we download the exact atomic version even if it changed
164
+ # between the HEAD and the GET (unlikely, but hey).
165
+ if 300 <= r.status_code <= 399:
166
+ url_to_download = r.headers["Location"]
167
+ except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
168
+ # etag is already None
169
+ pass
170
+
171
+ filename = url_to_filename(url, etag)
172
+
173
+ # get cache path to put the file
174
+ cache_path = os.path.join(cache_dir, filename)
175
+
176
+ # etag is None == we don't have a connection or we passed local_files_only.
177
+ # try to get the last downloaded one
178
+ if etag is None:
179
+ if os.path.exists(cache_path):
180
+ return cache_path
181
+ else:
182
+ matching_files = [
183
+ file
184
+ for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
185
+ if not file.endswith(".json") and not file.endswith(".lock")
186
+ ]
187
+ if len(matching_files) > 0:
188
+ return os.path.join(cache_dir, matching_files[-1])
189
+ else:
190
+ # If files cannot be found and local_files_only=True,
191
+ # the models might've been found if local_files_only=False
192
+ # Notify the user about that
193
+ if local_files_only:
194
+ raise FileNotFoundError(
195
+ "Cannot find the requested files in the cached path and outgoing traffic has been"
196
+ " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
197
+ " to False."
198
+ )
199
+ else:
200
+ raise ValueError(
201
+ "Connection error, and we cannot find the requested files in the cached path."
202
+ " Please try again or make sure your Internet connection is on."
203
+ )
204
+
205
+ # From now on, etag is not None.
206
+ if os.path.exists(cache_path) and not force_download:
207
+ return cache_path
208
+
209
+ # Prevent parallel downloads of the same file with a lock.
210
+ lock_path = cache_path + ".lock"
211
+ with FileLock(lock_path):
212
+
213
+ # If the download just completed while the lock was activated.
214
+ if os.path.exists(cache_path) and not force_download:
215
+ # Even if returning early like here, the lock will be released.
216
+ return cache_path
217
+
218
+ if resume_download:
219
+ incomplete_path = cache_path + ".incomplete"
220
+
221
+ @contextmanager
222
+ def _resumable_file_manager() -> "io.BufferedWriter":
223
+ with open(incomplete_path, "ab") as f:
224
+ yield f
225
+
226
+ temp_file_manager = _resumable_file_manager
227
+ if os.path.exists(incomplete_path):
228
+ resume_size = os.stat(incomplete_path).st_size
229
+ else:
230
+ resume_size = 0
231
+ else:
232
+ temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
233
+ resume_size = 0
234
+
235
+ # Download to temporary file, then copy to cache dir once finished.
236
+ # Otherwise you get corrupt cache entries if the download gets interrupted.
237
+ with temp_file_manager() as temp_file:
238
+ http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers)
239
+
240
+ os.replace(temp_file.name, cache_path)
241
+
242
+ meta = {"url": url, "etag": etag}
243
+ meta_path = cache_path + ".json"
244
+ with open(meta_path, "w") as meta_file:
245
+ json.dump(meta, meta_file)
246
+
247
+ return cache_path
248
+
249
+
250
+ def cached_path(
251
+     url_or_filename,
252
+     cache_dir=None,
253
+     force_download=False,
254
+     proxies=None,
255
+     resume_download=False,
256
+     user_agent: Union[Dict, str, None] = None,
257
+     extract_compressed_file=False,
258
+     force_extract=False,
259
+     use_auth_token: Union[bool, str, None] = None,
260
+     local_files_only=False,
261
+ ) -> Optional[str]:
262
+     if cache_dir is None:
263
+         cache_dir = TRANSFORMERS_CACHE
264
+     if isinstance(url_or_filename, Path):
265
+         url_or_filename = str(url_or_filename)
266
+     if isinstance(cache_dir, Path):
267
+         cache_dir = str(cache_dir)
268
+
269
+     if is_remote_url(url_or_filename):
270
+         # URL, so get it from the cache (downloading if necessary)
271
+         output_path = get_from_cache(
272
+             url_or_filename,
273
+             cache_dir=cache_dir,
274
+             force_download=force_download,
275
+             proxies=proxies,
276
+             resume_download=resume_download,
277
+             user_agent=user_agent,
278
+             use_auth_token=use_auth_token,
279
+             local_files_only=local_files_only,
280
+         )
281
+     elif os.path.exists(url_or_filename):
282
+         # File, and it exists.
283
+         output_path = url_or_filename
284
+     elif urlparse(url_or_filename).scheme == "":
285
+         # File, but it doesn't exist.
286
+         raise EnvironmentError("file {} not found".format(url_or_filename))
287
+     else:
288
+         # Something unknown
289
+         raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
290
+
291
+     if extract_compressed_file:
292
+         if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
293
+             return output_path
294
+
295
+         # Path where we extract compressed archives
296
+         # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
297
+         output_dir, output_file = os.path.split(output_path)
298
+         output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
299
+         output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
300
+
301
+         if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
302
+             return output_path_extracted
303
+
304
+         # Prevent parallel extractions
305
+         lock_path = output_path + ".lock"
306
+         with FileLock(lock_path):
307
+             shutil.rmtree(output_path_extracted, ignore_errors=True)
308
+             os.makedirs(output_path_extracted)
309
+             if is_zipfile(output_path):
310
+                 with ZipFile(output_path, "r") as zip_file:
311
+                     zip_file.extractall(output_path_extracted)
312
+                     zip_file.close()
313
+             elif tarfile.is_tarfile(output_path):
314
+                 tar_file = tarfile.open(output_path)
315
+                 tar_file.extractall(output_path_extracted)
316
+                 tar_file.close()
317
+             else:
318
+                 raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
319
+
320
+         return output_path_extracted
321
+
322
+     return output_path
323
+
324
+
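A short usage sketch for `cached_path` (the URL and cache directory below are placeholders, not values used by this repo):

    # Remote URL: downloaded once, then served from the local cache.
    local_file = cached_path(
        "https://example.com/weights.bin",  # placeholder URL
        cache_dir="/tmp/demo_cache",
        resume_download=True,  # continue from a *.incomplete file if present
    )

    # Existing local paths pass through unchanged; missing ones raise EnvironmentError.
    assert cached_path(local_file) == local_file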
325
+ def get_parameter_dtype(parameter: nn.Module):
326
+     try:
327
+         return next(parameter.parameters()).dtype
328
+     except StopIteration:
329
+         # For nn.DataParallel compatibility in PyTorch 1.5
330
+
331
+         def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
332
+             tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
333
+             return tuples
334
+
335
+         gen = parameter._named_members(get_members_fn=find_tensor_attributes)
336
+         first_tuple = next(gen)
337
+         return first_tuple[1].dtype
338
+
339
+
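A small sanity check for `get_parameter_dtype` (illustrative only, assuming the imports already present in this file):

    layer = nn.Linear(4, 2)
    assert get_parameter_dtype(layer) == torch.float32
    assert get_parameter_dtype(layer.half()) == torch.float16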
340
+ def get_extended_attention_mask(attention_mask: Tensor, dtype) -> Tensor:
341
+     # attention_mask [batch_size, seq_length]
342
+     assert attention_mask.dim() == 2
343
+     # [batch_size, 1, 1, seq_length] for multi-head attention
344
+     extended_attention_mask = attention_mask[:, None, None, :]
345
+     extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
346
+     extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
347
+     return extended_attention_mask
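Concretely, a 0/1 padding mask becomes an additive attention bias: roughly 0.0 where attention is allowed and -10000.0 where it is masked, which the subsequent softmax drives to near-zero probability. An illustrative check:

    mask = torch.tensor([[1, 1, 0]])  # [batch=1, seq_len=3]; 0 marks padding
    ext = get_extended_attention_mask(mask, torch.float32)
    # ext.shape == (1, 1, 1, 3); values: [-0., -0., -10000.]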
zemo1.py ADDED
@@ -0,0 +1,53 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from tqdm import tqdm
4
+ import torch.optim as optim
5
+
6
+ # Step 1: Prepare sample data
7
+ # Synthetic data: each row is [study hours, leisure hours, sleep hours], grade average
8
+ data = [
9
+     [2, 1, 7, 6.0],
10
+     [3, 2, 6, 7.5],
11
+     [1, 3, 8, 5.5],
12
+     [4, 1, 6, 8.0],
13
+     [5, 0, 5, 9.0],
14
+     [6, 0, 6, 9.5]
15
+ ]
16
+
17
+ # Split the features and the target
18
+ X = torch.tensor([row[:3] for row in data], dtype=torch.float32)  # study, leisure, sleep hours
19
+ y = torch.tensor([[row[3]] for row in data], dtype=torch.float32)  # grade average
20
+
21
+ # Step 2: Build the model
22
+ class StudentGradeModel(nn.Module):
23
+     def __init__(self):
24
+         super(StudentGradeModel, self).__init__()
25
+         self.linear = nn.Linear(3, 1)  # 3 inputs, 1 output
26
+
27
+     def forward(self, x):
28
+         return self.linear(x)
29
+
30
+ model = StudentGradeModel()
31
+
32
+ # Step 3: Define the loss function and the optimizer
33
+ criterion = nn.MSELoss()
34
+ optimizer = optim.SGD(model.parameters(), lr=0.01)
35
+
36
+ # Step 4: Train the model
37
+ for epoch in tqdm(range(10000), desc="Training Epochs"):
38
+     optimizer.zero_grad()  # Clear the old gradients
39
+     output = model(X)  # Forward pass through the model
40
+     loss = criterion(output, y)  # Compute the loss
41
+     loss.backward()  # Backpropagate the gradients
42
+     optimizer.step()  # Update the weights
43
+
44
+     # Print the loss to monitor training
45
+     if (epoch + 1) % 1000 == 0:
46
+         tqdm.write(f'Epoch [{epoch + 1}/10000], Loss: {loss.item():.4f}')
47
+
48
+ # Step 5: Predict for a new student
49
+ model.eval()
50
+ with torch.no_grad():
51
+     test_input = torch.tensor([[4, 1, 6]], dtype=torch.float32)  # e.g. 4 study hours, 1 leisure hour, 6 sleep hours
52
+     prediction = model(test_input)
53
+     print("Predicted grade average:", prediction.item())
zemo2.py ADDED
@@ -0,0 +1,41 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ # Build the RNN model
5
+ class RNNModel(nn.Module):
6
+     def __init__(self, input_size, hidden_size, output_size):
7
+         super(RNNModel, self).__init__()
8
+         self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)  # Define the RNN layer
9
+         self.fc = nn.Linear(hidden_size, output_size)  # Fully connected layer that predicts the output
10
+
11
+     def forward(self, x):
12
+         out, _ = self.rnn(x)  # Get the outputs from the RNN
13
+         out = out[:, -1, :]  # Keep only the last time step's output (the data has several time steps)
14
+         out = self.fc(out)  # Predict the output
15
+         return out
16
+
17
+ # Initialize the model
18
+ input_size = 10  # Input feature size
19
+ hidden_size = 20  # Number of hidden units
20
+ output_size = 1  # Output size (e.g. regression)
21
+ model = RNNModel(input_size, hidden_size, output_size)
22
+
23
+ # Create synthetic data
24
+ X = torch.randn(32, 5, 10)  # 32 samples, 5 time steps, 10 features per step
25
+ y = torch.randn(32, 1)  # 32 samples, 1 output value per sample
26
+
27
+ # Loss function and optimizer
28
+ criterion = nn.MSELoss()
29
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
30
+
31
+ # Train the model
32
+ for epoch in range(100):
33
+     model.train()
34
+     optimizer.zero_grad()
35
+     output = model(X)  # Forward pass through the model
36
+     loss = criterion(output, y)  # Compute the loss
37
+     loss.backward()  # Backpropagate the gradients
38
+     optimizer.step()  # Update the weights
39
+
40
+     if (epoch + 1) % 10 == 0:
41
+         print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')
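zemo2.py stops after training; a short optional inference step that could follow the loop, reusing the names defined above (a sketch, not part of the committed file):

    # Predict on a single unseen sequence.
    model.eval()
    with torch.no_grad():
        sample = torch.randn(1, 5, 10)  # 1 sample, 5 time steps, 10 features
        print(model(sample))  # tensor of shape [1, 1]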
zemo3.py ADDED
@@ -0,0 +1,32 @@
1
+ import torch
2
+ from tokenizer import BertTokenizer
3
+ from torch import nn
4
+ from bert import BertModel
5
+
6
+ # Initialize the BERT tokenizer
7
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
8
+
9
+ # Example sentences
10
+ sentences = [
11
+     "She loves reading novels in her free time",
12
+     "An apple a day keeps the doctor away",
13
+     "If you can't explain it simply, you don't understand it well enough."
14
+ ]
15
+
16
+ # Tokenize and encode the sentences
17
+ encoding = tokenizer.batch_encode_plus(
18
+     sentences,
19
+     max_length=512,
20
+     padding='max_length',
21
+     truncation=True,
22
+     return_tensors='pt'
23
+ )
24
+
25
+ # Get the token IDs from the encoding
26
+ input_ids = encoding['input_ids']
27
+ attention_mask = encoding['attention_mask']
28
+
29
+ model = BertModel.from_pretrained('bert-base-uncased')
30
+
31
+ assert isinstance(model, BertModel)
32
+ print(model.embed(input_ids).size())
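For reference, the expected output, assuming `model.embed` returns one hidden vector per token and bert-base-uncased's hidden size of 768 (an assumption about this repo's BertModel, not verified here):

    # torch.Size([3, 512, 768])  -> [batch_size, max_length, hidden_size]
    # Note: attention_mask is computed above but not passed to embed(),
    # so padding positions are embedded as well.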