GlowCheese committed on Dec 3, 2024

Commit

a0b398e

1 Parent(s): 354d4d3

Transfer code from Kaggle

Browse files

Files changed (43) hide show

.python-version +1 -1
base_bert.py +247 -0
bert.py +221 -0
classifier.py +199 -0
classifier_utils.py +235 -0
config.py +222 -0
constants.py +36 -0
everything.py +20 -0
finetune-scripts/sup.py +12 -0
finetune-scripts/unsup.py +12 -0
finetuning.py +173 -0
minbert-data/amazon-polarity.parquet +3 -0
minbert-data/ids-cfimdb-dev.csv +3 -0
minbert-data/ids-cfimdb-test-student.csv +3 -0
minbert-data/ids-cfimdb-train.csv +3 -0
minbert-data/ids-sst-dev.csv +3 -0
minbert-data/ids-sst-test-student.csv +3 -0
minbert-data/ids-sst-train.csv +3 -0
minbert-data/nli-train.parquet +3 -0
minbert-data/optimizer_test.npy +3 -0
minbert-data/sanity_check.data +0 -0
minbert-data/stsb-dev.parquet +3 -0
minbert-model/sup-cse-bert.pth +3 -0
minbert-model/unsup-cse-bert.pth +3 -0
optimizer.py +90 -0
optimizer_test.py +36 -0
requirements.txt +9 -0
sanity_check.py +21 -0
tokenizer.py +0 -0
train-scripts/base_cfimdb_onfm.py +15 -0
train-scripts/base_cfimdb_onll.py +15 -0
train-scripts/base_sst_onfm.py +15 -0
train-scripts/base_sst_onll.py +15 -0
train-scripts/finetuned_bert.py +15 -0
train-scripts/sup_cfimdb_onfm.py +17 -0
train-scripts/sup_cfimdb_onll.py +17 -0
train-scripts/sup_sst_onfm.py +17 -0
train-scripts/sup_sst_onll.py +17 -0
train-scripts/unsup_cfimdb_onfm.py +17 -0
train-scripts/unsup_cfimdb_onll.py +17 -0
train-scripts/unsup_sst_onfm.py +17 -0
train-scripts/unsup_sst_onll.py +17 -0
utils.py +349 -0

.python-version CHANGED Viewed

	@@ -1 +1 @@
1	- 3.8.20


1	+ 3.10.15

base_bert.py ADDED Viewed

	@@ -0,0 +1,247 @@

+import re
+from torch import device, dtype, nn
+from config import BertConfig, PretrainedConfig
+from utils import *
+class BertPreTrainedModel(nn.Module):
+    config_class = BertConfig
+    base_model_prefix = "bert"
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+    _keys_to_ignore_on_load_unexpected = None
+    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
+        super().__init__()
+        self.config = config
+        self.name_or_path = config.name_or_path
+    def init_weights(self):
+        # Initialize weights
+        self.apply(self._init_weights)
+    def _init_weights(self, module):
+        """ Initialize the weights """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+    @property
+    def dtype(self) -> dtype:
+        return get_parameter_dtype(self)
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
+        config = kwargs.pop("config", None)
+        state_dict = kwargs.pop("state_dict", None)
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        output_loading_info = kwargs.pop("output_loading_info", False)
+        local_files_only = kwargs.pop("local_files_only", False)
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        revision = kwargs.pop("revision", None)
+        mirror = kwargs.pop("mirror", None)
+        # Load config if we don't provide a configuration
+        if not isinstance(config, PretrainedConfig):
+            config_path = config if config is not None else pretrained_model_name_or_path
+            config, model_kwargs = cls.config_class.from_pretrained(
+                config_path,
+                *model_args,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                **kwargs,
+            )
+        else:
+            model_kwargs = kwargs
+        # Load model
+        if pretrained_model_name_or_path is not None:
+            pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+            if os.path.isdir(pretrained_model_name_or_path):
+                # Load from a PyTorch checkpoint
+                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+                archive_file = pretrained_model_name_or_path
+            else:
+                archive_file = hf_bucket_url(
+                    pretrained_model_name_or_path,
+                    filename=WEIGHTS_NAME,
+                    revision=revision,
+                    mirror=mirror,
+                )
+            try:
+                # Load from URL or cache if already cached
+                resolved_archive_file = cached_path(
+                    archive_file,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    resume_download=resume_download,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                )
+            except EnvironmentError as err:
+                #logger.error(err)
+                msg = (
+                    f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
+                    f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
+                    f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}.\n\n"
+                )
+                raise EnvironmentError(msg)
+        else:
+            resolved_archive_file = None
+        config.name_or_path = pretrained_model_name_or_path
+        # Instantiate model.
+        model = cls(config, *model_args, **model_kwargs)
+        if state_dict is None:
+            try:
+                state_dict = torch.load(resolved_archive_file, map_location="cpu", weights_only=True)
+            except Exception:
+                raise OSError(
+                    f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' "
+                    f"at '{resolved_archive_file}'"
+                )
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # Convert old format to new format if needed from a PyTorch state_dict
+        old_keys = []
+        new_keys = []
+        m = {'embeddings.word_embeddings': 'word_embedding',
+                 'embeddings.position_embeddings': 'pos_embedding',
+                 'embeddings.token_type_embeddings': 'tk_type_embedding',
+                 'embeddings.LayerNorm': 'embed_layer_norm',
+                 'embeddings.dropout': 'embed_dropout',
+                 'encoder.layer': 'bert_layers',
+                 'pooler.dense': 'pooler_dense',
+                 'pooler.activation': 'pooler_af',
+                 'attention.self': "self_attention",
+                 'attention.output.dense': 'attention_dense',
+                 'attention.output.LayerNorm': 'attention_layer_norm',
+                 'attention.output.dropout': 'attention_dropout',
+                 'intermediate.dense': 'interm_dense',
+                 'intermediate.intermediate_act_fn': 'interm_af',
+                 'output.dense': 'out_dense',
+                 'output.LayerNorm': 'out_layer_norm',
+                 'output.dropout': 'out_dropout'}
+        for key in state_dict.keys():
+            new_key = None
+            if "gamma" in key:
+                new_key = key.replace("gamma", "weight")
+            if "beta" in key:
+                new_key = key.replace("beta", "bias")
+            for x, y in m.items():
+                if new_key is not None:
+                    _key = new_key
+                else:
+                    _key = key
+                if x in key:
+                    new_key = _key.replace(x, y)
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            # print(old_key, new_key)
+            state_dict[new_key] = state_dict.pop(old_key)
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, "_metadata", None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+        your_bert_params = [f"bert.{x[0]}" for x in model.named_parameters()]
+        for k in state_dict:
+            if k not in your_bert_params and not k.startswith("cls."):
+                possible_rename = [x for x in k.split(".")[1:-1] if x in m.values()]
+                raise ValueError(f"{k} cannot be reload to your model, one/some of {possible_rename} we provided have been renamed")
+        # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+        # so we need to apply the function recursively.
+        def load(module: nn.Module, prefix=""):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict,
+                prefix,
+                local_metadata,
+                True,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            )
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + ".")
+        # Make sure we are able to load base models as well as derived models (with heads)
+        start_prefix = ""
+        model_to_load = model
+        has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
+        if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
+            start_prefix = cls.base_model_prefix + "."
+        if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
+            model_to_load = getattr(model, cls.base_model_prefix)
+        load(model_to_load, prefix=start_prefix)
+        if model.__class__.__name__ != model_to_load.__class__.__name__:
+            base_model_state_dict = model_to_load.state_dict().keys()
+            head_model_state_dict_without_base_prefix = [
+                key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
+            ]
+            missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict)
+        # Some models may have keys that are not in the state by design, removing them before needlessly warning
+        # the user.
+        if cls._keys_to_ignore_on_load_missing is not None:
+            for pat in cls._keys_to_ignore_on_load_missing:
+                missing_keys = [k for k in missing_keys if re.search(pat, k) is None]
+        if cls._keys_to_ignore_on_load_unexpected is not None:
+            for pat in cls._keys_to_ignore_on_load_unexpected:
+                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
+        if len(error_msgs) > 0:
+            raise RuntimeError(
+                "Error(s) in loading state_dict for {}:\n\t{}".format(
+                    model.__class__.__name__, "\n\t".join(error_msgs)
+                )
+            )
+        # Set model in evaluation mode to deactivate DropOut modules by default
+        model.eval()
+        if output_loading_info:
+            loading_info = {
+                "missing_keys": missing_keys,
+                "unexpected_keys": unexpected_keys,
+                "error_msgs": error_msgs,
+            }
+            return model, loading_info
+        if hasattr(config, "xla_device") and config.xla_device and is_torch_tpu_available():
+            import torch_xla.core.xla_model as xm
+            model = xm.send_cpu_data_to_device(model, xm.xla_device())
+            model.to(xm.xla_device())
+        return model

bert.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class BertSelfAttention(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    self.num_attention_heads = config.num_attention_heads
+    self.attention_head_size = config.hidden_size // config.num_attention_heads
+    self.all_head_size = self.num_attention_heads * self.attention_head_size
+    # Initialize the linear transformation layers for key, value, query.
+    self.query = nn.Linear(config.hidden_size, self.all_head_size)
+    self.key = nn.Linear(config.hidden_size, self.all_head_size)
+    self.value = nn.Linear(config.hidden_size, self.all_head_size)
+    # This dropout is applied to normalized attention scores following the original
+    # implementation of transformer. Although it is a bit unusual, we empirically
+    # observe that it yields better performance.
+    self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+  def transform(self, x, linear_layer):
+    # The corresponding linear_layer of k, v, q are used to project the hidden_state (x).
+    bs, seq_len = x.shape[:2]
+    proj = linear_layer(x)
+    # Next, we need to produce multiple heads for the proj. This is done by spliting the
+    # hidden state to self.num_attention_heads, each of size self.attention_head_size.
+    proj = proj.view(bs, seq_len, self.num_attention_heads, self.attention_head_size)
+    # By proper transpose, we have proj of size [bs, num_attention_heads, seq_len, attention_head_size].
+    proj = proj.transpose(1, 2)
+    return proj
+  def attention(self, key, query, value, attention_mask):
+    """
+    key, query, value: [batch_size, num_attention_heads, seq_len, attention_head_size]
+    attention_mask: [batch_size, 1, 1, seq_len], masks padding tokens in the input.
+    """
+    d_k = query.size(-1)  # attention_head_size
+    attention_scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)
+    # attention_scores shape: [batch_size, num_attention_heads, seq_len, seq_len]
+    # Apply attention mask
+    attention_scores = attention_scores + attention_mask
+    # Normalize scores with softmax and apply dropout.
+    attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+    attention_probs = self.dropout(attention_probs)
+    context = torch.matmul(attention_probs, value)
+    # context shape: [batch_size, num_attention_heads, seq_len, attention_head_size]
+    # Concatenate all attention heads to recover original shape: [batch_size, seq_len, hidden_size]
+    context = context.transpose(1, 2).contiguous()
+    context = context.view(context.size(0), context.size(1), -1)
+    return context
+  def forward(self, hidden_states, attention_mask):
+    """
+    hidden_states: [bs, seq_len, hidden_state]
+    attention_mask: [bs, 1, 1, seq_len]
+    output: [bs, seq_len, hidden_state]
+    """
+    # First, we have to generate the key, value, query for each token for multi-head attention
+    # using self.transform (more details inside the function).
+    # Size of *_layer is [bs, num_attention_heads, seq_len, attention_head_size].
+    key_layer = self.transform(hidden_states, self.key)
+    value_layer = self.transform(hidden_states, self.value)
+    query_layer = self.transform(hidden_states, self.query)
+    # Calculate the multi-head attention.
+    attn_value = self.attention(key_layer, query_layer, value_layer, attention_mask)
+    return attn_value
+class BertLayer(nn.Module):
+  def __init__(self, config):
+    super().__init__()
+    # Multi-head attention.
+    self.self_attention = BertSelfAttention(config)
+    # Add-norm for multi-head attention.
+    self.attention_dense = nn.Linear(config.hidden_size, config.hidden_size)
+    self.attention_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+    self.attention_dropout = nn.Dropout(config.hidden_dropout_prob)
+    # Feed forward.
+    self.interm_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+    self.interm_af = F.gelu
+    # Add-norm for feed forward.
+    self.out_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+    self.out_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+    self.out_dropout = nn.Dropout(config.hidden_dropout_prob)
+  def add_norm(self, input, output, dense_layer, dropout, ln_layer):
+    transformed_output = dense_layer(output)  # Biến đổi output bằng dense_layer
+    transformed_output = dropout(transformed_output)  # Áp dụng dropout
+    added_output = input + transformed_output  # Kết hợp input và output
+    normalized_output = ln_layer(added_output)  # Áp dụng chuẩn hóa
+    return normalized_output
+  def forward(self, hidden_states, attention_mask):
+    # 1. Multi-head attention
+    attention_output = self.self_attention(hidden_states, attention_mask)
+    # 2. Add-norm after attention
+    attention_output = self.add_norm(
+      hidden_states,
+      attention_output,
+      self.attention_dense,
+      self.attention_dropout,
+      self.attention_layer_norm
+    )
+    # 3. Feed-forward network
+    intermediate_output = self.interm_af(self.interm_dense(attention_output))
+    # 4. Add-norm after feed-forward
+    layer_output = self.add_norm(
+      attention_output,
+      intermediate_output,
+      self.out_dense,
+      self.out_dropout,
+      self.out_layer_norm
+    )
+    return layer_output
+class BertModel(BertPreTrainedModel):
+  """
+  The BERT model returns the final embeddings for each token in a sentence.
+  The model consists of:
+  1. Embedding layers (used in self.embed).
+  2. A stack of n BERT layers (used in self.encode).
+  3. A linear transformation layer for the [CLS] token (used in self.forward, as given).
+  """
+  def __init__(self, config):
+    super().__init__(config)
+    self.config = config
+    # Embedding layers.
+    self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+    self.pos_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+    self.tk_type_embedding = nn.Embedding(config.type_vocab_size, config.hidden_size)
+    self.embed_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+    self.embed_dropout = nn.Dropout(config.hidden_dropout_prob)
+    # Register position_ids (1, len position emb) to buffer because it is a constant.
+    position_ids = torch.arange(config.max_position_embeddings).unsqueeze(0)
+    self.register_buffer('position_ids', position_ids)
+    # BERT encoder.
+    self.bert_layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+    # [CLS] token transformations.
+    self.pooler_dense = nn.Linear(config.hidden_size, config.hidden_size)
+    self.pooler_af = nn.Tanh()
+    self.init_weights()
+  def embed(self, input_ids):
+    input_shape = input_ids.size()
+    seq_length = input_shape[1]
+    inputs_embeds = self.word_embedding(input_ids)
+    pos_ids = self.position_ids[:, :seq_length]
+    pos_embeds = self.pos_embedding(pos_ids)
+    # Since we are not considering token type, this embedding is just a placeholder.
+    tk_type_ids = torch.zeros(input_shape, dtype=torch.long, device=input_ids.device)
+    tk_type_embeds = self.tk_type_embedding(tk_type_ids)
+    embeddings = inputs_embeds + pos_embeds + tk_type_embeds
+    embeddings = self.embed_layer_norm(embeddings)
+    embeddings = self.embed_dropout(embeddings)
+    return embeddings
+  def encode(self, hidden_states, attention_mask):
+    """
+    hidden_states: the output from the embedding layer [batch_size, seq_len, hidden_size]
+    attention_mask: [batch_size, seq_len]
+    """
+    # Get the extended attention mask for self-attention.
+    # Returns extended_attention_mask of size [batch_size, 1, 1, seq_len].
+    # Distinguishes between non-padding tokens (with a value of 0) and padding tokens
+    # (with a value of a large negative number).
+    extended_attention_mask: torch.Tensor = get_extended_attention_mask(attention_mask, self.dtype)
+    # Pass the hidden states through the encoder layers.
+    for i, layer_module in enumerate(self.bert_layers):
+      # Feed the encoding from the last bert_layer to the next.
+      hidden_states = layer_module(hidden_states, extended_attention_mask)
+    return hidden_states
+  def forward(self, input_ids, attention_mask):
+    """
+    input_ids: [batch_size, seq_len], seq_len is the max length of the batch
+    attention_mask: same size as input_ids, 1 represents non-padding tokens, 0 represents padding tokens
+    """
+    # Get the embedding for each input token.
+    embedding_output = self.embed(input_ids=input_ids)
+    # Feed to a transformer (a stack of BertLayers).
+    sequence_output = self.encode(embedding_output, attention_mask=attention_mask)
+    # Get cls token hidden state.
+    first_tk = sequence_output[:, 0]
+    first_tk = self.pooler_dense(first_tk)
+    first_tk = self.pooler_af(first_tk)
+    return {'last_hidden_state': sequence_output, 'pooler_output': first_tk}

classifier.py ADDED Viewed

	@@ -0,0 +1,199 @@

+from classifier_utils import *
+# Passed as `disable=` to every tqdm call in this module; True hides the progress bars.
+TQDM_DISABLE=True
+class BertSentimentClassifier(torch.nn.Module):
+    def __init__(self, config, custom_bert = None):
+        super(BertSentimentClassifier, self).__init__()
+        self.num_labels = config.num_labels
+        self.bert: BertModel = custom_bert or BertModel.from_pretrained('bert-base-uncased')
+        # Pretrain mode does not require updating BERT paramters.
+        assert config.fine_tune_mode in ["last-linear-layer", "full-model"]
+        for param in self.bert.parameters():
+            if config.fine_tune_mode == 'last-linear-layer':
+                param.requires_grad = False
+            elif config.fine_tune_mode == 'full-model':
+                param.requires_grad = True
+        # Classifier = Dropout + Linear
+        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = torch.nn.Linear(config.hidden_size, self.num_labels)
+    def forward(self, input_ids, attention_mask):
+        outputs = self.bert(input_ids, attention_mask)
+        pooler_output = outputs['pooler_output']
+        return self.classifier(self.dropout(pooler_output))
+# Evaluate the model on dev examples.
+def model_eval(dataloader, model: BertSentimentClassifier, device):
+    model.eval() # Switch to eval model, will turn off randomness like dropout.
+    y_true = []
+    y_pred = []
+    sents = []
+    sent_ids = []
+    for step, batch in enumerate(tqdm(dataloader, desc=f'eval', leave=False, disable=TQDM_DISABLE)):
+        b_labels, b_sents, b_sent_ids = batch['labels'], batch['sents'], batch['sent_ids']
+        b_ids = batch['token_ids'].to(device)
+        b_mask = batch['attention_mask'].to(device)
+        logits = model(b_ids, b_mask)
+        logits = logits.detach().cpu().numpy()
+        preds = np.argmax(logits, axis=1).flatten()
+        b_labels = b_labels.flatten()
+        y_true.extend(b_labels)
+        y_pred.extend(preds)
+        sents.extend(b_sents)
+        sent_ids.extend(b_sent_ids)
+    f1 = f1_score(y_true, y_pred, average='macro')
+    acc = accuracy_score(y_true, y_pred)
+    return acc, f1, y_pred, y_true, sents, sent_ids
+# Evaluate the model on test examples.
+def model_test_eval(dataloader, model, device):
+    model.eval() # Switch to eval model, will turn off randomness like dropout.
+    y_pred = []
+    sents = []
+    sent_ids = []
+    for step, batch in enumerate(tqdm(dataloader, desc=f'eval', leave=False, disable=TQDM_DISABLE)):
+        b_sents, b_sent_ids = batch['sents'], batch['sent_ids']
+        b_ids = batch['token_ids'].to(device)
+        b_mask = batch['attention_mask'].to(device)
+        logits = model(b_ids, b_mask)
+        logits = logits.detach().cpu().numpy()
+        preds = np.argmax(logits, axis=1).flatten()
+        y_pred.extend(preds)
+        sents.extend(b_sents)
+        sent_ids.extend(b_sent_ids)
+    return y_pred, sents, sent_ids
+def save_model(model, args, config, filepath):
+    save_info = {
+        'model': model.state_dict(),
+        'args': args,
+        'model_config': config,
+        'system_rng': random.getstate(),
+        'numpy_rng': np.random.get_state(),
+        'torch_rng': torch.random.get_rng_state(),
+    }
+    torch.save(save_info, filepath)
+    print(f"save the model to {filepath}")
+def train(args, custom_bert=None):
+    device = torch.device('cuda') if USE_GPU else torch.device('cpu')
+    # Create the data and its corresponding datasets and dataloader.
+    train_data, num_labels = load_data(args.train, 'train')
+    dev_data = load_data(args.dev, 'valid')
+    train_dataset = SentimentDataset(train_data)
+    dev_dataset = SentimentDataset(dev_data)
+    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
+                                  num_workers=NUM_CPU_CORES, collate_fn=train_dataset.collate_fn)
+    dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
+                                num_workers=NUM_CPU_CORES, collate_fn=dev_dataset.collate_fn)
+    # Init model.
+    config = {'hidden_dropout_prob': HIDDEN_DROPOUT_PROB,
+              'num_labels': num_labels,
+              'hidden_size': 768,
+              'data_dir': '.',
+              'fine_tune_mode': args.fine_tune_mode}
+    config = SimpleNamespace(**config)
+    model = BertSentimentClassifier(config, custom_bert)
+    model = model.to(device)
+    lr = args.lr
+    optimizer = AdamW(model.parameters(), lr=lr)
+    best_dev_acc = 0
+    # Run for the specified number of epochs.
+    for epoch in range(EPOCHS):
+        model.train()
+        train_loss = 0
+        num_batches = 0
+        for batch in tqdm(train_dataloader, desc=f'train-{epoch}', leave=False, disable=TQDM_DISABLE):
+            b_ids = batch['token_ids'].to(device)
+            b_mask = batch['attention_mask'].to(device)
+            b_labels = batch['labels'].to(device)
+            optimizer.zero_grad()
+            logits = model(b_ids, b_mask)
+            loss = F.cross_entropy(logits, b_labels.view(-1), reduction='sum') / args.batch_size
+            loss.backward()
+            optimizer.step()
+            train_loss += loss.item()
+            num_batches += 1
+        train_loss = train_loss / (num_batches)
+        train_acc, train_f1, *_  = model_eval(train_dataloader, model, device)
+        dev_acc, dev_f1, *_ = model_eval(dev_dataloader, model, device)
+        if dev_acc > best_dev_acc:
+            best_dev_acc = dev_acc
+            save_model(model, args, config, args.filepath)
+        print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")
+def test(args):
+    with torch.no_grad():
+        device = torch.device('cuda') if USE_GPU else torch.device('cpu')
+        saved = torch.load(args.filepath, weights_only=False)
+        config = saved['model_config']
+        model = BertSentimentClassifier(config)
+        model.load_state_dict(saved['model'])
+        model = model.to(device)
+        print(f"load model from {args.filepath}")
+        dev_data = load_data(args.dev, 'valid')
+        dev_dataset = SentimentDataset(dev_data)
+        dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
+                                    num_workers=NUM_CPU_CORES, collate_fn=dev_dataset.collate_fn)
+        dev_acc, dev_f1, dev_pred, dev_true, dev_sents, dev_sent_ids = model_eval(dev_dataloader, model, device)
+        print('DONE DEV')
+        print(f"dev acc :: {dev_acc :.3f}")
+def classifier_run(args, custom_bert=None):
+    seed_everything(SEED)
+    torch.set_num_threads(NUM_CPU_CORES)
+    print(f'Training Sentiment Classifier on {args.dataset}...')
+    config = SimpleNamespace(
+        filepath=f'/kaggle/working/{args.dataset}-classifier.pt',
+        lr=args.lr,
+        batch_size=args.batch_size,
+        fine_tune_mode=args.fine_tune_mode,
+        train=args.train, dev=args.dev, test=args.test,
+        dev_out  = f'/kaggle/working/predictions/{args.fine_tune_mode}-{args.dataset}-dev-out.csv',
+        test_out = f'/kaggle/working/predictions/{args.fine_tune_mode}-{args.dataset}-test-out.csv'
+    )
+    train(config, custom_bert)
+    print(f'Evaluating on {args.dataset}...')
+    test(config)

classifier_utils.py ADDED Viewed

	@@ -0,0 +1,235 @@

+from everything import *
+from bert import BertModel
+from optimizer import AdamW
+from tokenizer import BertTokenizer
+# Module-level tokenizer, created at import time and used by the pad_data methods below.
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+class SentimentDataset(Dataset):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        return self.dataset[idx]
+    def pad_data(self, data):
+        sents = [x[0] for x in data]
+        labels = [x[1] for x in data]
+        sent_ids = [x[2] for x in data]
+        encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+        token_ids = torch.LongTensor(encoding['input_ids'])
+        attention_mask = torch.LongTensor(encoding['attention_mask'])
+        labels = torch.LongTensor(labels)
+        return token_ids, attention_mask, labels, sents, sent_ids
+    def collate_fn(self, all_data):
+        token_ids, attention_mask, labels, sents, sent_ids = self.pad_data(all_data)
+        batched_data = {
+            'token_ids': token_ids,
+            'attention_mask': attention_mask,
+            'labels': labels,
+            'sents': sents,
+            'sent_ids': sent_ids
+        }
+        return batched_data
+class SentimentTestDataset(Dataset):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        return self.dataset[idx]
+    def pad_data(self, data):
+        sents = [x[0] for x in data]
+        sent_ids = [x[1] for x in data]
+        encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+        token_ids = torch.LongTensor(encoding['input_ids'])
+        attention_mask = torch.LongTensor(encoding['attention_mask'])
+        return token_ids, attention_mask, sents, sent_ids
+    def collate_fn(self, all_data):
+        token_ids, attention_mask, sents, sent_ids= self.pad_data(all_data)
+        batched_data = {
+            'token_ids': token_ids,
+            'attention_mask': attention_mask,
+            'sents': sents,
+            'sent_ids': sent_ids
+        }
+        return batched_data
+class AmazonDataset(Dataset):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        return self.dataset[idx]
+    def pad_data(self, data):
+        sents = [x[0] for x in data]
+        sent_ids = [x[1] for x in data]
+        encoding = tokenizer(sents, return_tensors='pt', padding=True, truncation=True)
+        token_ids = torch.LongTensor(encoding['input_ids'])
+        attension_mask = torch.LongTensor(encoding['attention_mask'])
+        return token_ids, attension_mask, sent_ids
+    def collate_fn(self, data):
+        token_ids, attention_mask, sent_ids = self.pad_data(data)
+        batched_data = {
+            'token_ids': token_ids,
+            'attention_mask': attention_mask,
+            'sent_ids': sent_ids
+        }
+        return batched_data
+class SemanticDataset(Dataset):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        return self.dataset[idx]
+    def pad_data(self, data):
+        sents1 = [x[0] for x in data]
+        sents2 = [x[1] for x in data]
+        score = [x[2] for x in data]
+        sent_ids = [x[3] for x in data]
+        encoding = tokenizer(sents1 + sents2, return_tensors='pt', padding=True, truncation=True)
+        token_ids = torch.LongTensor(encoding['input_ids'])
+        attension_mask = torch.LongTensor(encoding['attention_mask'])
+        return token_ids, attension_mask, score, sent_ids
+    def collate_fn(self, data):
+        token_ids, attention_mask, score, sent_ids = self.pad_data(data)
+        n = len(sent_ids)
+        batched_data = {
+            'token_ids_1': token_ids[:n],
+            'token_ids_2': token_ids[n:],
+            'attention_mask_1': attention_mask[:n],
+            'attention_mask_2': attention_mask[n:],
+            'score': score,
+            'sent_ids': sent_ids
+        }
+        return batched_data
+class InferenceDataset(Dataset):
+    def __init__(self, dataset):
+        self.dataset = dataset
+    def __len__(self):
+        return len(self.dataset)
+    def __getitem__(self, idx):
+        return self.dataset[idx]
+    def pad_data(self, data):
+        anchor = [x[0] for x in data]
+        positive = [x[1] for x in data]
+        negative = [x[2] for x in data]
+        sent_ids = [x[3] for x in data]
+        encoding = tokenizer(anchor + positive + negative, return_tensors='pt', padding=True, truncation=True)
+        token_ids = torch.LongTensor(encoding['input_ids'])
+        attension_mask = torch.LongTensor(encoding['attention_mask'])
+        return token_ids, attension_mask, sent_ids
+    def collate_fn(self, data):
+        token_ids, attention_mask, sent_ids = self.pad_data(data)
+        n = len(sent_ids)
+        batched_data = {
+            'anchor_ids': token_ids[:n],
+            'positive_ids': token_ids[n:2*n],
+            'negative_ids': token_ids[2*n:],
+            'anchor_masks': attention_mask[:n],
+            'positive_masks': attention_mask[n:2*n],
+            'negative_masks': attention_mask[2*n:],
+            'sent_ids': sent_ids
+        }
+        return batched_data
+def load_data(filename, flag='train'):
+    '''
+    - for amazon dataset: list of (sent, id)
+    - for nli dataset: list of (anchor, positive, negative, id)
+    - for stsb dataset: list of (sentence1, sentence2, score, id)
+    - for test dataset: list of (sent, id)
+    - for train dataset: list of (sent, label, id)
+    '''
+    if flag == 'amazon':
+        df = pd.read_parquet(filename)
+        data = list(zip(df['content'], df.index))
+    elif flag == 'nli':
+        df = pd.read_parquet(filename)
+        data = list(zip(df['anchor'], df['positive'], df['negative'], df.index))
+    elif flag == 'stsb':
+        df = pd.read_parquet(filename)
+        data = list(zip(df['sentence1'], df['sentence2'], df['score'], df.index))
+    else:
+        data, num_labels = [], set()
+        with open(filename, 'r') as fp:
+            if flag == 'test':
+                for record in csv.DictReader(fp, delimiter = '\t'):
+                    sent = record['sentence'].lower().strip()
+                    sent_id = record['id'].lower().strip()
+                    data.append((sent,sent_id))
+            else:
+                for record in csv.DictReader(fp, delimiter = '\t'):
+                    sent = record['sentence'].lower().strip()
+                    sent_id = record['id'].lower().strip()
+                    label = int(record['sentiment'].strip())
+                    num_labels.add(label)
+                    data.append((sent, label, sent_id))
+    print(f"load {len(data)} data from {filename}")
+    if flag == "train":
+        return data, len(num_labels)
+    else:
+        return data
+def seed_everything(seed=11711):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.benchmark = False
+    torch.backends.cudnn.deterministic = True

config.py ADDED Viewed

	@@ -0,0 +1,222 @@

+from typing import Union, Tuple, Dict, Any, Optional
+import os
+import json
+from collections import OrderedDict
+import torch
+from utils import CONFIG_NAME, hf_bucket_url, cached_path, is_remote_url
+class PretrainedConfig(object):
+  model_type: str = ""
+  is_composition: bool = False
+  def __init__(self, **kwargs):
+    # Attributes with defaults
+    self.return_dict = kwargs.pop("return_dict", True)
+    self.output_hidden_states = kwargs.pop("output_hidden_states", False)
+    self.output_attentions = kwargs.pop("output_attentions", False)
+    self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
+    self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
+    self.pruned_heads = kwargs.pop("pruned_heads", {})
+    self.tie_word_embeddings = kwargs.pop(
+      "tie_word_embeddings", True
+    )  # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models.
+    # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
+    self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
+    self.is_decoder = kwargs.pop("is_decoder", False)
+    self.add_cross_attention = kwargs.pop("add_cross_attention", False)
+    self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False)
+    # Parameters for sequence generation
+    self.max_length = kwargs.pop("max_length", 20)
+    self.min_length = kwargs.pop("min_length", 0)
+    self.do_sample = kwargs.pop("do_sample", False)
+    self.early_stopping = kwargs.pop("early_stopping", False)
+    self.num_beams = kwargs.pop("num_beams", 1)
+    self.num_beam_groups = kwargs.pop("num_beam_groups", 1)
+    self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0)
+    self.temperature = kwargs.pop("temperature", 1.0)
+    self.top_k = kwargs.pop("top_k", 50)
+    self.top_p = kwargs.pop("top_p", 1.0)
+    self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
+    self.length_penalty = kwargs.pop("length_penalty", 1.0)
+    self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
+    self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0)
+    self.bad_words_ids = kwargs.pop("bad_words_ids", None)
+    self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
+    self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0)
+    self.output_scores = kwargs.pop("output_scores", False)
+    self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False)
+    self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None)
+    self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None)
+    # Fine-tuning task arguments
+    self.architectures = kwargs.pop("architectures", None)
+    self.finetuning_task = kwargs.pop("finetuning_task", None)
+    self.id2label = kwargs.pop("id2label", None)
+    self.label2id = kwargs.pop("label2id", None)
+    if self.id2label is not None:
+      kwargs.pop("num_labels", None)
+      self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+      # Keys are always strings in JSON so convert ids to int here.
+    else:
+      self.num_labels = kwargs.pop("num_labels", 2)
+    # Tokenizer arguments
+    self.tokenizer_class = kwargs.pop("tokenizer_class", None)
+    self.prefix = kwargs.pop("prefix", None)
+    self.bos_token_id = kwargs.pop("bos_token_id", None)
+    self.pad_token_id = kwargs.pop("pad_token_id", None)
+    self.eos_token_id = kwargs.pop("eos_token_id", None)
+    self.sep_token_id = kwargs.pop("sep_token_id", None)
+    self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
+    # task specific arguments
+    self.task_specific_params = kwargs.pop("task_specific_params", None)
+    # TPU arguments
+    self.xla_device = kwargs.pop("xla_device", None)
+    # Name or path to the pretrained checkpoint
+    self._name_or_path = str(kwargs.pop("name_or_path", ""))
+    # Drop the transformers version info
+    kwargs.pop("transformers_version", None)
+    # Additional attributes without default values
+    for key, value in kwargs.items():
+      try:
+        setattr(self, key, value)
+      except AttributeError as err:
+        raise err
+  @classmethod
+  def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+    config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+    return cls.from_dict(config_dict, **kwargs)
+  @classmethod
+  def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
+    with open(json_file, "r", encoding="utf-8") as reader:
+      text = reader.read()
+    return json.loads(text)
+  @classmethod
+  def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
+    return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+    config = cls(**config_dict)
+    if hasattr(config, "pruned_heads"):
+      config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
+    # Update config with kwargs if needed
+    to_remove = []
+    for key, value in kwargs.items():
+      if hasattr(config, key):
+        setattr(config, key, value)
+        to_remove.append(key)
+    for key in to_remove:
+      kwargs.pop(key, None)
+    if return_unused_kwargs:
+      return config, kwargs
+    else:
+      return config
+  @classmethod
+  def get_config_dict(
+    cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+  ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+    cache_dir = kwargs.pop("cache_dir", None)
+    force_download = kwargs.pop("force_download", False)
+    resume_download = kwargs.pop("resume_download", False)
+    proxies = kwargs.pop("proxies", None)
+    use_auth_token = kwargs.pop("use_auth_token", None)
+    local_files_only = kwargs.pop("local_files_only", False)
+    revision = kwargs.pop("revision", None)
+    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
+    if os.path.isdir(pretrained_model_name_or_path):
+      config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+    elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+      config_file = pretrained_model_name_or_path
+    else:
+      config_file = hf_bucket_url(
+        pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None
+      )
+    try:
+      # Load from URL or cache if already cached
+      resolved_config_file = cached_path(
+        config_file,
+        cache_dir=cache_dir,
+        force_download=force_download,
+        proxies=proxies,
+        resume_download=resume_download,
+        local_files_only=local_files_only,
+        use_auth_token=use_auth_token,
+      )
+      # Load config dict
+      config_dict = cls._dict_from_json_file(resolved_config_file)
+    except EnvironmentError as err:
+      msg = (
+        f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
+        f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
+        f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n"
+      )
+      raise EnvironmentError(msg)
+    except json.JSONDecodeError:
+      msg = (
+        "Couldn't reach server at '{}' to download configuration file or "
+        "configuration file is not a valid JSON file. "
+        "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+      )
+      raise EnvironmentError(msg)
+    return config_dict, kwargs
+class BertConfig(PretrainedConfig):
+  model_type = "bert"
+  def __init__(
+    self,
+    vocab_size=30522,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    hidden_act="gelu",
+    hidden_dropout_prob=0.1,
+    attention_probs_dropout_prob=0.1,
+    max_position_embeddings=512,
+    type_vocab_size=2,
+    initializer_range=0.02,
+    layer_norm_eps=1e-12,
+    pad_token_id=0,
+    gradient_checkpointing=False,
+    position_embedding_type="absolute",
+    use_cache=True,
+    **kwargs
+  ):
+    super().__init__(pad_token_id=pad_token_id, **kwargs)
+    self.vocab_size = vocab_size
+    self.hidden_size = hidden_size
+    self.num_hidden_layers = num_hidden_layers
+    self.num_attention_heads = num_attention_heads
+    self.hidden_act = hidden_act
+    self.intermediate_size = intermediate_size
+    self.hidden_dropout_prob = hidden_dropout_prob
+    self.attention_probs_dropout_prob = attention_probs_dropout_prob
+    self.max_position_embeddings = max_position_embeddings
+    self.type_vocab_size = type_vocab_size
+    self.initializer_range = initializer_range
+    self.layer_norm_eps = layer_norm_eps
+    self.gradient_checkpointing = gradient_checkpointing
+    self.position_embedding_type = position_embedding_type
+    self.use_cache = use_cache

constants.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import os
+DATA_DIR = 'minbert-data'
+MODEL_DIR = 'minbert-model'
+# Pretrained weights
+SUP_BERT = os.path.join(MODEL_DIR, 'sup-cse-bert.pth')
+UNSUP_BERT = os.path.join(MODEL_DIR, 'unsup-cse-bert.pth')
+# CFIMDB dataset
+IDS_CFIMDB_DEV = os.path.join(DATA_DIR, 'ids-cfimdb-dev.csv')
+IDS_CFIMDB_TEST = os.path.join(DATA_DIR, 'ids-cfimdb-test-student.csv')
+IDS_CFIMDB_TRAIN = os.path.join(DATA_DIR, 'ids-cfimdb-train.csv')
+# SST dataset
+IDS_SST_DEV = os.path.join(DATA_DIR, 'ids-sst-dev.csv')
+IDS_SST_TEST = os.path.join(DATA_DIR, 'ids-sst-test-student.csv')
+IDS_SST_TRAIN = os.path.join(DATA_DIR, 'ids-sst-train.csv')
+# SimCSE train/dev dataset
+NLI_TRAIN = os.path.join(DATA_DIR, 'nli-train.parquet')
+AMAZON_POLARITY = os.path.join(DATA_DIR, 'amazon-polarity.parquet')
+STSB_DEV = os.path.join(DATA_DIR, 'stsb-dev.parquet')
+# Training-specific constants
+SEED=11711
+NUM_CPU_CORES=4
+EPOCHS=10
+USE_GPU=True
+BATCH_SIZE_CSE=8
+BATCH_SIZE_SST=64
+BATCH_SIZE_CFIMDB=8
+HIDDEN_DROPOUT_PROB=0.3

everything.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import csv
+import torch
+import random
+import numpy as np
+import pandas as pd
+import torch.nn.functional as F
+from tqdm import tqdm
+from torch import nn, Tensor
+from types import SimpleNamespace
+from scipy.stats import spearmanr
+from torch.utils.data import Dataset, DataLoader
+from sklearn.metrics import f1_score, accuracy_score
+from constants import *
+import random, numpy as np, argparse
+from types import SimpleNamespace
+import csv

finetune-scripts/sup.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from constants import *
+from types import SimpleNamespace
+from finetuning import finetune_bert
+ARGUMENTS = SimpleNamespace(
+    mode='sup',
+    filepath='/minbert-model/sup-cse-bert.pth',
+    batch_size_train=12,
+    batch_size_dev=64,
+    temp=0.05, lr=1e-5,
+)
+finetune_bert(ARGUMENTS)

finetune-scripts/unsup.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from constants import *
+from types import SimpleNamespace
+from finetuning import finetune_bert
+ARGUMENTS = SimpleNamespace(
+    mode='unsup',
+    filepath='/minbert-model/unsup-cse-bert.pth',
+    batch_size_train=8,
+    batch_size_dev=64,
+    temp=0.05, lr=1e-5,
+)
+finetune_bert(ARGUMENTS)

finetuning.py ADDED Viewed

	@@ -0,0 +1,173 @@

+from classifier_utils import *
+# Passed as `disable=` to the tqdm progress bars in this module; True turns them off.
+TQDM_DISABLE=True
+def unsup_contrastive_loss(embeds_1: Tensor, embeds_2: Tensor, temp=0.05):
+    '''
+    embeds_1: [batch_size, hidden_size]
+    embeds_2: [batch_size, hidden_size]
+    '''
+    # [batch_size, batch_size]
+    sim_matrix = F.cosine_similarity(embeds_1.unsqueeze(1), embeds_2.unsqueeze(0), dim=-1) / temp
+    # [batch_size]
+    positive_sim = torch.diagonal(sim_matrix)
+    # [batch_size]
+    nume = torch.exp(positive_sim)
+    # [batch_size]
+    deno = torch.exp(sim_matrix).sum(1)
+    # [batch_size]
+    loss_per_batch = -torch.log(nume / deno)
+    return loss_per_batch.sum()
+def sup_contrastive_loss(embeds_1: Tensor, embeds_2: Tensor, embeds_3: Tensor, temp=0.05):
+    '''
+    embeds_1: [batch_size, hidden_size]
+    embeds_2: [batch_size, hidden_size]
+    embeds_3: [batch_size, hidden_size]
+    '''
+    # [batch_size, batch_size]
+    pos_sim_matrix = F.cosine_similarity(embeds_1.unsqueeze(1), embeds_2.unsqueeze(0), dim=-1) / temp
+    neg_sim_matrix = F.cosine_similarity(embeds_1.unsqueeze(1), embeds_3.unsqueeze(0), dim=-1) / temp
+    # [batch_size]
+    positive_sim = torch.diagonal(pos_sim_matrix)
+    # [batch_size]
+    nume = torch.exp(positive_sim)
+    # [batch_size]
+    deno = (torch.exp(pos_sim_matrix) + torch.exp(neg_sim_matrix)).sum(1)
+    # [batch_size]
+    loss_per_batch = -torch.log(nume / deno)
+    return loss_per_batch.sum()
+def sts_eval(dataloader, model: BertModel, device):
+    model.eval()
+    y_true = []
+    y_pred = []
+    sent_ids = []
+    with torch.no_grad():
+        for batch in tqdm(dataloader, desc='eval', leave=False, disable=TQDM_DISABLE):
+            token_ids_1 = batch['token_ids_1'].to(device)
+            token_ids_2 = batch['token_ids_2'].to(device)
+            attention_mask_1 = batch['attention_mask_1'].to(device)
+            attention_mask_2 = batch['attention_mask_2'].to(device)
+            scores = batch['score']
+            b_sent_ids = batch['sent_ids']
+            logits_1 = model(token_ids_1, attention_mask_1)['pooler_output']
+            logits_2 = model(token_ids_2, attention_mask_2)['pooler_output']
+            sim = F.cosine_similarity(logits_1, logits_2)
+            y_true.extend(scores)
+            y_pred.extend(sim.cpu().tolist())
+            sent_ids.extend(b_sent_ids)
+    spearman_corr, _ = spearmanr(y_pred, y_true)
+    return spearman_corr, b_sent_ids
+def finetune_bert(args):
+    '''
+    SimCSE fine-tuning of BERT
+    --------------------------
+    `args` must provide: mode ('unsup' or 'sup'), filepath, batch_size_train,
+    batch_size_dev, temp and lr.
+    1. Load the training set (Amazon Polarity for 'unsup', NLI triplets for 'sup')
+       and the STS-B dev set.
+    2. Initialize BertModel from 'bert-base-uncased'.
+    3. Loop for EPOCHS epochs.
+    4. Compute each batch's SimCSE contrastive loss (summed over the batch).
+    5. Backpropagate and update with AdamW.
+    6. Evaluate on the dev set with sts_eval:
+        6.1. Embed both sentences of each pair ('pooler_output').
+        6.2. Compute their cosine similarity (-1 <= sim <= 1).
+        6.3. Spearman's correlation between computed and expected similarity.
+    7. When the correlation improves (dev_acc > best_dev_acc), save the
+       model's state_dict to args.filepath.
+    '''
+    assert args.mode in ['unsup', 'sup']
+    seed_everything(SEED)
+    torch.set_num_threads(NUM_CPU_CORES)
+    if args.mode == 'unsup':
+        train_dataset = AmazonDataset(load_data(AMAZON_POLARITY, 'amazon'))
+    else:
+        train_dataset = InferenceDataset(load_data(NLI_TRAIN, 'nli'))
+    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size_train,
+                                  num_workers=NUM_CPU_CORES, collate_fn=train_dataset.collate_fn)
+    sts_dataset = SemanticDataset(load_data(STSB_DEV, 'stsb'))
+    sts_dataloader = DataLoader(sts_dataset, shuffle=False, batch_size=args.batch_size_dev,
+                                num_workers=NUM_CPU_CORES, collate_fn=sts_dataset.collate_fn)
+    device = torch.device('cuda') if USE_GPU else torch.device('cpu')
+    model = BertModel.from_pretrained('bert-base-uncased')
+    model.to(device)
+    # A checkpoint is written only when the dev correlation exceeds this value.
+    best_dev_acc = 0
+    optimizer = AdamW(model.parameters(), lr=args.lr)
+    print(f'Finetuning minBERT with {args.mode}ervised method...')
+    for epoch in range(EPOCHS):
+        model.train()
+        train_loss = num_batches = 0
+        for batch in tqdm(train_dataloader, f'train-{epoch}', leave=False, disable=TQDM_DISABLE):
+            if args.mode == 'unsup':
+                b_ids = batch['token_ids'].to(device)
+                b_mask = batch['attention_mask'].to(device)
+                # Get different embeddings with different dropout masks
+                logits_1 = model(b_ids, b_mask)['pooler_output']
+                logits_2 = model(b_ids, b_mask)['pooler_output']
+                # SimCSE loss, summed over the batch
+                loss = unsup_contrastive_loss(logits_1, logits_2, args.temp)
+            else:
+                b_anchor_ids = batch['anchor_ids'].to(device)
+                b_positive_ids = batch['positive_ids'].to(device)
+                b_negative_ids = batch['negative_ids'].to(device)
+                b_anchor_masks = batch['anchor_masks'].to(device)
+                b_positive_masks = batch['positive_masks'].to(device)
+                b_negative_masks = batch['negative_masks'].to(device)
+                logits_1 = model(b_anchor_ids, b_anchor_masks)['pooler_output']
+                logits_2 = model(b_positive_ids, b_positive_masks)['pooler_output']
+                logits_3 = model(b_negative_ids, b_negative_masks)['pooler_output']
+                loss = sup_contrastive_loss(logits_1, logits_2, logits_3, args.temp)
+            # Back propagation
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            train_loss += loss.item()
+            num_batches += 1
+        # Average of the per-batch losses over this epoch
+        train_loss /= num_batches
+        # dev_acc is the Spearman correlation returned by sts_eval
+        dev_acc, _ = sts_eval(sts_dataloader, model, device)
+        if dev_acc > best_dev_acc:
+            best_dev_acc = dev_acc
+            torch.save(model.state_dict(), args.filepath)
+            print(f"save the model to {args.filepath}")
+        print(f"Epoch {epoch}: train loss :: {train_loss :.3f}, dev acc :: {dev_acc :.3f}")

minbert-data/amazon-polarity.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cbb5fc18093875baac0f49ae926f76aa4938e3e9b8114d9a1d95fa05810d8e4
+size 31246252

minbert-data/ids-cfimdb-dev.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3087f571b66860fe5d035b5a018d08202ad3fd3720e4821c04b2acf6c7ded559
+size 249095

minbert-data/ids-cfimdb-test-student.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ae611548c9eac879e9ebb406cc9f8ae68ff12f78090e4965af5cbdfa06240f4
+size 495595

minbert-data/ids-cfimdb-train.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:140fc513045a966109faed46a5c7a898767b96714d71bcb9c15f659129fadcea
+size 1693182

minbert-data/ids-sst-dev.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a186ce94577635fbe10beaaddd50f16cccf6c30973221cefdf90deed2a584bfe
+size 151384

minbert-data/ids-sst-test-student.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bdd5a767faa0c26782117e37767ece154c30d5d04fb8727d09c71e3850a55c7b
+size 313202

minbert-data/ids-sst-train.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:03b2b625c090f94a6afd59f114cde5282e2053aab0b101e87ed695d8a0c5b1df
+size 1175139

minbert-data/nli-train.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc8e9bffa6e24c1175be20aa2d064dae99501937d7dec52a341f23f75eaeaec8
+size 28964735

minbert-data/optimizer_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77b817e0dce16a9bc8d3a6bcb88035db68f7d783dc8a565737581fadd05db815
+size 152

minbert-data/sanity_check.data ADDED Viewed

Binary file (56.4 kB). View file

minbert-data/stsb-dev.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c6e0e9881f1b398abe3e439a482f4686305c3784568c462f6bba58bdff03b0a
+size 142187

minbert-model/sup-cse-bert.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ba7bedbe15db3ce7345fa9cba47dc281d4ddb34512fe745445468adbe6abd08
+size 438007966

minbert-model/unsup-cse-bert.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3a8cfeab46f5903b3297f9536c29516ec96c9bef525ecf13d33494dbd0edebd7
+size 438008374

optimizer.py ADDED Viewed

	@@ -0,0 +1,90 @@

+from typing import Callable, Iterable, Tuple
+import math
+import torch
+from torch.optim import Optimizer
+class AdamW(Optimizer):
+    def __init__(
+            self,
+            params: Iterable[torch.nn.parameter.Parameter],
+            lr: float = 1e-3,
+            betas: Tuple[float, float] = (0.9, 0.999),
+            eps: float = 1e-6,
+            weight_decay: float = 0.0,
+            correct_bias: bool = True,
+    ):
+        if lr < 0.0:
+            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, correct_bias=correct_bias)
+        super().__init__(params, defaults)
+    def step(self, closure: Callable = None):
+        loss = None
+        if closure is not None:
+            loss = closure()
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead")
+                # Access state
+                state = self.state[p]
+                # Initialize state if not already done
+                if len(state) == 0:
+                    state["step"] = 0
+                    state["exp_avg"] = torch.zeros_like(p.data)
+                    state["exp_avg_sq"] = torch.zeros_like(p.data)
+                # Hyperparameters
+                alpha = group["lr"]
+                beta1, beta2 = group["betas"]
+                eps = group["eps"]
+                weight_decay = group["weight_decay"]
+                correct_bias = group["correct_bias"]
+                # Retrieve state variables
+                exp_avg = state["exp_avg"]
+                exp_avg_sq = state["exp_avg_sq"]
+                step = state["step"]
+                # Update step
+                step += 1
+                state["step"] = step
+                # Update biased first and second moment estimates
+                exp_avg.mul_(beta1).add_(grad, alpha=(1 - beta1))
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=(1 - beta2))
+                # Compute bias-corrected moments
+                if correct_bias:
+                    bias_correction1 = 1 - beta1 ** step
+                    bias_correction2 = 1 - beta2 ** step
+                    exp_avg_corr = exp_avg / bias_correction1
+                    exp_avg_sq_corr = exp_avg_sq / bias_correction2
+                else:
+                    exp_avg_corr = exp_avg
+                    exp_avg_sq_corr = exp_avg_sq
+                # Update parameters
+                denom = exp_avg_sq_corr.sqrt().add_(eps)
+                step_size = alpha
+                p.data.addcdiv_(exp_avg_corr, denom, value=-step_size)
+                # Apply weight decay
+                if weight_decay != 0:
+                    p.data.add_(p.data, alpha=-alpha * weight_decay)
+        return loss

optimizer_test.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import os
+import torch
+import numpy as np
+from optimizer import AdamW
+from constants import DATA_DIR
+# Fixed seed shared by the NumPy generator and torch inside test_optimizer.
+seed = 0
+def test_optimizer(opt_class) -> torch.Tensor:
+    """Fit a 3->2 bias-free linear layer for 1000 steps with `opt_class` and return its weights.
+    `opt_class` is called with the model parameters plus `lr`, `weight_decay`
+    and `correct_bias` keyword arguments.
+    """
+    rng = np.random.default_rng(seed)
+    torch.manual_seed(seed)
+    model = torch.nn.Linear(3, 2, bias=False)
+    opt = opt_class(
+        model.parameters(),
+        lr=1e-3,
+        weight_decay=1e-4,
+        correct_bias=True,
+    )
+    for i in range(1000):
+        opt.zero_grad()
+        # One uniformly sampled input vector per step
+        x = torch.FloatTensor(rng.uniform(size=[model.in_features]))
+        y_hat = model(x)
+        # Regression target: [x0 + x1, -x2]
+        y = torch.Tensor([x[0] + x[1], -x[2]])
+        loss = ((y - y_hat) ** 2).sum()
+        loss.backward()
+        opt.step()
+    return model.weight.detach()
+# Compare the weights reached with AdamW against the stored reference array.
+ref = torch.tensor(np.load(os.path.join(DATA_DIR, "optimizer_test.npy")))
+actual = test_optimizer(AdamW)
+print(ref)
+print(actual)
+assert torch.allclose(ref, actual, atol=1e-6, rtol=1e-4)
+print("Optimizer test passed!")

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+torch==2.4.1
+tqdm==4.58.0
+requests==2.25.1
+importlib-metadata==3.7.0
+filelock==3.16.1
+sklearn==0.0
+tokenizers==0.15
+explainaboard_client==0.0.7
+pandas==2.2.3

sanity_check.py ADDED Viewed

	@@ -0,0 +1,21 @@

+import os
+import torch
+from bert import BertModel
+from constants import DATA_DIR
+# Stored reference outputs, indexed below by 'last_hidden_state' and 'pooler_output'.
+sanity_data = torch.load(os.path.join(DATA_DIR, "sanity_check.data"), weights_only=True)
+# Two fixed token-id sequences of length 8; the first ends in four 0 ids that
+# the attention mask below marks as padding.
+sent_ids = torch.tensor([[101, 7592, 2088, 102, 0, 0, 0, 0],
+                         [101, 7592, 15756, 2897, 2005, 17953, 2361, 102]])
+att_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0],[1, 1, 1, 1, 1, 1, 1, 1]])
+# Load model.
+bert = BertModel.from_pretrained('bert-base-uncased')
+outputs = bert(sent_ids, att_mask)
+# Zero the padded positions on both sides so they cannot affect the comparison.
+att_mask = att_mask.unsqueeze(-1)
+outputs['last_hidden_state'] = outputs['last_hidden_state'] * att_mask
+sanity_data['last_hidden_state'] = sanity_data['last_hidden_state'] * att_mask
+for k in ['last_hidden_state', 'pooler_output']:
+    assert torch.allclose(outputs[k], sanity_data[k], atol=1e-5, rtol=1e-3)
+print("Your BERT implementation is correct!")

tokenizer.py ADDED Viewed

The diff for this file is too large to render. See raw diff

train-scripts/base_cfimdb_onfm.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+# Run settings: CFIMDB train/dev/test files, batch size BATCH_SIZE_CFIMDB,
+# learning rate 1e-5, fine_tune_mode 'full-model'.
+ARGUMENTS = SimpleNamespace(
+    dataset='cfimdb',
+    batch_size=BATCH_SIZE_CFIMDB,
+    train=IDS_CFIMDB_TRAIN,
+    dev=IDS_CFIMDB_DEV,
+    test=IDS_CFIMDB_TEST,
+    lr=1e-5,
+    fine_tune_mode='full-model'
+)
+# Runs at import time with the settings above; no model argument is passed.
+classifier_run(ARGUMENTS)

train-scripts/base_cfimdb_onll.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+# Run settings: CFIMDB train/dev/test files, batch size BATCH_SIZE_CFIMDB,
+# learning rate 1e-3, fine_tune_mode 'last-linear-layer'.
+ARGUMENTS = SimpleNamespace(
+    dataset='cfimdb',
+    batch_size=BATCH_SIZE_CFIMDB,
+    train=IDS_CFIMDB_TRAIN,
+    dev=IDS_CFIMDB_DEV,
+    test=IDS_CFIMDB_TEST,
+    lr=1e-3,
+    fine_tune_mode='last-linear-layer'
+)
+# Runs at import time with the settings above; no model argument is passed.
+classifier_run(ARGUMENTS)

train-scripts/base_sst_onfm.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+# Run settings: SST train/dev/test files, batch size BATCH_SIZE_SST,
+# learning rate 1e-5, fine_tune_mode 'full-model'.
+ARGUMENTS = SimpleNamespace(
+    dataset='sst',
+    batch_size=BATCH_SIZE_SST,
+    train=IDS_SST_TRAIN,
+    dev=IDS_SST_DEV,
+    test=IDS_SST_TEST,
+    lr=1e-5,
+    fine_tune_mode='full-model'
+)
+# Runs at import time with the settings above; no model argument is passed.
+classifier_run(ARGUMENTS)

train-scripts/base_sst_onll.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+# Run settings: SST train/dev/test files, batch size BATCH_SIZE_SST,
+# learning rate 1e-3, fine_tune_mode 'last-linear-layer'.
+ARGUMENTS = SimpleNamespace(
+    dataset='sst',
+    batch_size=BATCH_SIZE_SST,
+    train=IDS_SST_TRAIN,
+    dev=IDS_SST_DEV,
+    test=IDS_SST_TEST,
+    lr=1e-3,
+    fine_tune_mode='last-linear-layer'
+)
+# Runs at import time with the settings above; no model argument is passed.
+classifier_run(ARGUMENTS)

train-scripts/finetuned_bert.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from everything import *
+from bert import BertModel
+def get_finetuned_bert(mode: str):
+    assert mode in ['sup', 'unsup']
+    bert = BertModel.from_pretrained('bert-base-uncased')
+    if mode == 'sup':
+        state_dict = torch.load(SUP_BERT, weights_only=True)
+    else:
+        state_dict = torch.load(UNSUP_BERT, weights_only=True)
+    device = torch.device('cuda') if USE_GPU else torch.device('cpu')
+    bert.load_state_dict(state_dict)
+    return bert.to(device)

train-scripts/sup_cfimdb_onfm.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='cfimdb',
+    batch_size=BATCH_SIZE_CFIMDB,
+    train=IDS_CFIMDB_TRAIN,
+    dev=IDS_CFIMDB_DEV,
+    test=IDS_CFIMDB_TEST,
+    lr=1e-5,
+    fine_tune_mode='full-model'
+)
+bert = get_finetuned_bert('sup')
+classifier_run(ARGUMENTS, bert)

train-scripts/sup_cfimdb_onll.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='cfimdb',
+    batch_size=BATCH_SIZE_CFIMDB,
+    train=IDS_CFIMDB_TRAIN,
+    dev=IDS_CFIMDB_DEV,
+    test=IDS_CFIMDB_TEST,
+    lr=1e-3,
+    fine_tune_mode='last-linear-layer'
+)
+bert = get_finetuned_bert('sup')
+classifier_run(ARGUMENTS, bert)

train-scripts/sup_sst_onfm.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='sst',
+    batch_size=BATCH_SIZE_SST,
+    train=IDS_SST_TRAIN,
+    dev=IDS_SST_DEV,
+    test=IDS_SST_TEST,
+    lr=1e-5,
+    fine_tune_mode='full-model'
+)
+bert = get_finetuned_bert('sup')
+classifier_run(ARGUMENTS, bert)

train-scripts/sup_sst_onll.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='sst',
+    batch_size=BATCH_SIZE_SST,
+    train=IDS_SST_TRAIN,
+    dev=IDS_SST_DEV,
+    test=IDS_SST_TEST,
+    lr=1e-3,
+    fine_tune_mode='last-linear-layer'
+)
+bert = get_finetuned_bert('sup')
+classifier_run(ARGUMENTS, bert)

train-scripts/unsup_cfimdb_onfm.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='cfimdb',
+    batch_size=BATCH_SIZE_CFIMDB,
+    train=IDS_CFIMDB_TRAIN,
+    dev=IDS_CFIMDB_DEV,
+    test=IDS_CFIMDB_TEST,
+    lr=1e-5,
+    fine_tune_mode='full-model'
+)
+bert = get_finetuned_bert('unsup')
+classifier_run(ARGUMENTS, bert)

train-scripts/unsup_cfimdb_onll.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='cfimdb',
+    batch_size=BATCH_SIZE_CFIMDB,
+    train=IDS_CFIMDB_TRAIN,
+    dev=IDS_CFIMDB_DEV,
+    test=IDS_CFIMDB_TEST,
+    lr=1e-3,
+    fine_tune_mode='last-linear-layer'
+)
+bert = get_finetuned_bert('unsup')
+classifier_run(ARGUMENTS, bert)

train-scripts/unsup_sst_onfm.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='sst',
+    batch_size=BATCH_SIZE_SST,
+    train=IDS_SST_TRAIN,
+    dev=IDS_SST_DEV,
+    test=IDS_SST_TEST,
+    lr=1e-5,
+    fine_tune_mode='full-model'
+)
+bert = get_finetuned_bert('unsup')
+classifier_run(ARGUMENTS, bert)

train-scripts/unsup_sst_onll.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from constants import *
+from types import SimpleNamespace
+from classifier import classifier_run
+from .finetuned_bert import get_finetuned_bert
+ARGUMENTS = SimpleNamespace(
+    dataset='sst',
+    batch_size=BATCH_SIZE_SST,
+    train=IDS_SST_TRAIN,
+    dev=IDS_SST_DEV,
+    test=IDS_SST_TEST,
+    lr=1e-3,
+    fine_tune_mode='last-linear-layer'
+)
+bert = get_finetuned_bert('unsup')
+classifier_run(ARGUMENTS, bert)

utils.py ADDED Viewed

	@@ -0,0 +1,349 @@

+from typing import Dict, List, Optional, Union, Tuple, BinaryIO
+import os
+import sys
+import json
+import shutil
+import tempfile
+import copy
+from tqdm.auto import tqdm
+from functools import partial
+from urllib.parse import urlparse
+from pathlib import Path
+import requests
+from hashlib import sha256
+from filelock import FileLock
+import importlib_metadata
+import torch
+import torch.nn as nn
+from torch import Tensor
+import fnmatch
+# Version string reported in the user-agent header built by http_user_agent.
+__version__ = "4.0.0"
+# Installed torch version, also reported in the user-agent header.
+_torch_version = importlib_metadata.version("torch")
+# Cache root: $HF_HOME if set, otherwise "huggingface" under $XDG_CACHE_HOME
+# (which itself defaults to ~/.cache); "~" is expanded.
+hf_cache_home = os.path.expanduser(os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")))
+default_cache_path = os.path.join(hf_cache_home, "transformers")
+# Each variable below falls back to the previous value when its own
+# environment variable is unset, so the order of these lines matters.
+PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
+PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
+# Default cache_dir used by get_from_cache and cached_path.
+TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
+# Short mirror names accepted by hf_bucket_url, mapped to their endpoints.
+PRESET_MIRROR_DICT = {
+    "tuna": "https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models",
+    "bfsu": "https://mirrors.bfsu.edu.cn/hugging-face-models",
+}
+# URL template filled in by hf_bucket_url when no mirror is given.
+HUGGINGFACE_CO_PREFIX = "https://huggingface.co/{model_id}/resolve/{revision}/{filename}"
+WEIGHTS_NAME = "pytorch_model.bin"
+CONFIG_NAME = "config.json"
+def is_torch_available():
+  """Always return True; this module imports torch unconditionally."""
+  return True
+def is_tf_available():
+  """Always return False, so http_user_agent never adds a tensorflow field."""
+  return False
+def is_remote_url(url_or_filename):
+  parsed = urlparse(url_or_filename)
+  return parsed.scheme in ("http", "https")
+def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None):
+  headers = copy.deepcopy(headers)
+  if resume_size > 0:
+    headers["Range"] = "bytes=%d-" % (resume_size,)
+  r = requests.get(url, stream=True, proxies=proxies, headers=headers)
+  r.raise_for_status()
+  content_length = r.headers.get("Content-Length")
+  total = resume_size + int(content_length) if content_length is not None else None
+  progress = tqdm(
+    unit="B",
+    unit_scale=True,
+    total=total,
+    initial=resume_size,
+    desc="Downloading",
+    disable=False,
+  )
+  for chunk in r.iter_content(chunk_size=1024):
+    if chunk:  # filter out keep-alive new chunks
+      progress.update(len(chunk))
+      temp_file.write(chunk)
+  progress.close()
+def url_to_filename(url: str, etag: Optional[str] = None) -> str:
+  url_bytes = url.encode("utf-8")
+  filename = sha256(url_bytes).hexdigest()
+  if etag:
+    etag_bytes = etag.encode("utf-8")
+    filename += "." + sha256(etag_bytes).hexdigest()
+  if url.endswith(".h5"):
+    filename += ".h5"
+  return filename
+def hf_bucket_url(
+  model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None
+) -> str:
+  if subfolder is not None:
+    filename = f"{subfolder}/{filename}"
+  if mirror:
+    endpoint = PRESET_MIRROR_DICT.get(mirror, mirror)
+    legacy_format = "/" not in model_id
+    if legacy_format:
+      return f"{endpoint}/{model_id}-{filename}"
+    else:
+      return f"{endpoint}/{model_id}/{filename}"
+  if revision is None:
+    revision = "main"
+  return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename)
+def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
+  ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
+  if is_torch_available():
+    ua += f"; torch/{_torch_version}"
+  if is_tf_available():
+    ua += f"; tensorflow/{_tf_version}"
+  if isinstance(user_agent, dict):
+    ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
+  elif isinstance(user_agent, str):
+    ua += "; " + user_agent
+  return ua
+def get_from_cache(
+  url: str,
+  cache_dir=None,
+  force_download=False,
+  proxies=None,
+  etag_timeout=10,
+  resume_download=False,
+  user_agent: Union[Dict, str, None] = None,
+  use_auth_token: Union[bool, str, None] = None,
+  local_files_only=False,
+) -> Optional[str]:
+  if cache_dir is None:
+    cache_dir = TRANSFORMERS_CACHE
+  if isinstance(cache_dir, Path):
+    cache_dir = str(cache_dir)
+  os.makedirs(cache_dir, exist_ok=True)
+  headers = {"user-agent": http_user_agent(user_agent)}
+  if isinstance(use_auth_token, str):
+    headers["authorization"] = "Bearer {}".format(use_auth_token)
+  elif use_auth_token:
+    token = HfFolder.get_token()
+    if token is None:
+      raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.")
+    headers["authorization"] = "Bearer {}".format(token)
+  url_to_download = url
+  etag = None
+  if not local_files_only:
+    try:
+      r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
+      r.raise_for_status()
+      etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
+      # We favor a custom header indicating the etag of the linked resource, and
+      # we fallback to the regular etag header.
+      # If we don't have any of those, raise an error.
+      if etag is None:
+        raise OSError(
+          "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+        )
+      # In case of a redirect,
+      # save an extra redirect on the request.get call,
+      # and ensure we download the exact atomic version even if it changed
+      # between the HEAD and the GET (unlikely, but hey).
+      if 300 <= r.status_code <= 399:
+        url_to_download = r.headers["Location"]
+    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
+      # etag is already None
+      pass
+  filename = url_to_filename(url, etag)
+  # get cache path to put the file
+  cache_path = os.path.join(cache_dir, filename)
+  # etag is None == we don't have a connection or we passed local_files_only.
+  # try to get the last downloaded one
+  if etag is None:
+    if os.path.exists(cache_path):
+      return cache_path
+    else:
+      matching_files = [
+        file
+        for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
+        if not file.endswith(".json") and not file.endswith(".lock")
+      ]
+      if len(matching_files) > 0:
+        return os.path.join(cache_dir, matching_files[-1])
+      else:
+        # If files cannot be found and local_files_only=True,
+        # the models might've been found if local_files_only=False
+        # Notify the user about that
+        if local_files_only:
+          raise FileNotFoundError(
+            "Cannot find the requested files in the cached path and outgoing traffic has been"
+            " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
+            " to False."
+          )
+        else:
+          raise ValueError(
+            "Connection error, and we cannot find the requested files in the cached path."
+            " Please try again or make sure your Internet connection is on."
+          )
+  # From now on, etag is not None.
+  if os.path.exists(cache_path) and not force_download:
+    return cache_path
+  # Prevent parallel downloads of the same file with a lock.
+  lock_path = cache_path + ".lock"
+  with FileLock(lock_path):
+    # If the download just completed while the lock was activated.
+    if os.path.exists(cache_path) and not force_download:
+      # Even if returning early like here, the lock will be released.
+      return cache_path
+    if resume_download:
+      incomplete_path = cache_path + ".incomplete"
+      @contextmanager
+      def _resumable_file_manager() -> "io.BufferedWriter":
+        with open(incomplete_path, "ab") as f:
+          yield f
+      temp_file_manager = _resumable_file_manager
+      if os.path.exists(incomplete_path):
+        resume_size = os.stat(incomplete_path).st_size
+      else:
+        resume_size = 0
+    else:
+      temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False)
+      resume_size = 0
+    # Download to temporary file, then copy to cache dir once finished.
+    # Otherwise you get corrupt cache entries if the download gets interrupted.
+    with temp_file_manager() as temp_file:
+      http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers)
+    os.replace(temp_file.name, cache_path)
+    meta = {"url": url, "etag": etag}
+    meta_path = cache_path + ".json"
+    with open(meta_path, "w") as meta_file:
+      json.dump(meta, meta_file)
+  return cache_path
+def cached_path(
+  url_or_filename,
+  cache_dir=None,
+  force_download=False,
+  proxies=None,
+  resume_download=False,
+  user_agent: Union[Dict, str, None] = None,
+  extract_compressed_file=False,
+  force_extract=False,
+  use_auth_token: Union[bool, str, None] = None,
+  local_files_only=False,
+) -> Optional[str]:
+  if cache_dir is None:
+    cache_dir = TRANSFORMERS_CACHE
+  if isinstance(url_or_filename, Path):
+    url_or_filename = str(url_or_filename)
+  if isinstance(cache_dir, Path):
+    cache_dir = str(cache_dir)
+  if is_remote_url(url_or_filename):
+    # URL, so get it from the cache (downloading if necessary)
+    output_path = get_from_cache(
+      url_or_filename,
+      cache_dir=cache_dir,
+      force_download=force_download,
+      proxies=proxies,
+      resume_download=resume_download,
+      user_agent=user_agent,
+      use_auth_token=use_auth_token,
+      local_files_only=local_files_only,
+    )
+  elif os.path.exists(url_or_filename):
+    # File, and it exists.
+    output_path = url_or_filename
+  elif urlparse(url_or_filename).scheme == "":
+    # File, but it doesn't exist.
+    raise EnvironmentError("file {} not found".format(url_or_filename))
+  else:
+    # Something unknown
+    raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
+  if extract_compressed_file:
+    if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
+      return output_path
+    # Path where we extract compressed archives
+    # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
+    output_dir, output_file = os.path.split(output_path)
+    output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
+    output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
+    if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
+      return output_path_extracted
+    # Prevent parallel extractions
+    lock_path = output_path + ".lock"
+    with FileLock(lock_path):
+      shutil.rmtree(output_path_extracted, ignore_errors=True)
+      os.makedirs(output_path_extracted)
+      if is_zipfile(output_path):
+        with ZipFile(output_path, "r") as zip_file:
+          zip_file.extractall(output_path_extracted)
+          zip_file.close()
+      elif tarfile.is_tarfile(output_path):
+        tar_file = tarfile.open(output_path)
+        tar_file.extractall(output_path_extracted)
+        tar_file.close()
+      else:
+        raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
+    return output_path_extracted
+  return output_path
+def get_parameter_dtype(parameter: Union[nn.Module]):
+  try:
+    return next(parameter.parameters()).dtype
+  except StopIteration:
+    # For nn.DataParallel compatibility in PyTorch 1.5
+    def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
+      tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+      return tuples
+    gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+    first_tuple = next(gen)
+    return first_tuple[1].dtype
+def get_extended_attention_mask(attention_mask: Tensor, dtype) -> Tensor:
+  # attention_mask [batch_size, seq_length]
+  assert attention_mask.dim() == 2
+  # [batch_size, 1, 1, seq_length] for multi-head attention
+  extended_attention_mask = attention_mask[:, None, None, :]
+  extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
+  extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+  return extended_attention_mask