# Tutorial url
# https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891

# Dependencies (an IPython shell command in the original notebook):
#   !pip install transformers datasets tokenizers

import math
import os
import random
from pathlib import Path

import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset


def _write_text_chunk(lines, index, out_dir='./data'):
    """Write ``lines`` joined by newlines to ``<out_dir>/text_<index>.txt`` (UTF-8)."""
    with open(f'{out_dir}/text_{index}.txt', 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines))


class BERTDataset(Dataset):
    """
    Dataset producing BERT pre-training samples from sentence pairs.

    Every item is a dict of tensors with the keys ``bert_input``,
    ``bert_label``, ``segment_label`` (each of length ``seq_len``) and
    ``is_next`` (scalar: 1 for the true pair, 0 for a random second sentence).

    The value 0 is used both as the "do not predict" label and as padding,
    i.e. the code relies on ``tokenizer.vocab['[PAD]']`` being 0.
    """

    def __init__(self, data_pair, tokenizer, seq_len=64):
        """
        :param data_pair: indexable collection of (sentence_1, sentence_2) strings
        :param tokenizer: callable returning ``{'input_ids': [...]}`` (with a
            leading and a trailing special token) and exposing a ``vocab``
            mapping that contains [PAD], [CLS], [SEP] and [MASK]
        :param seq_len: fixed length every sample is truncated / padded to
        """
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_lines = len(data_pair)
        self.lines = data_pair

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):
        vocab = self.tokenizer.vocab
        cls_id, sep_id, pad_id = vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

        # Step 1: sentence pair, either the real one or a negative sample
        t1, t2, is_next_label = self.get_sent(item)

        # Step 2: mask / corrupt random words of both sentences
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # Step 3: [CLS] s1 [SEP] s2 [SEP]; the special tokens get the PAD label
        t1 = [cls_id] + t1_random + [sep_id]
        t2 = t2_random + [sep_id]
        t1_label = [pad_id] + t1_label + [pad_id]
        t2_label = t2_label + [pad_id]

        # Step 4: join both sentences, truncate to seq_len, then pad up to it
        segment_label = ([1] * len(t1) + [2] * len(t2))[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [pad_id] * (self.seq_len - len(bert_input))
        bert_input.extend(padding)
        bert_label.extend(padding)
        segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}

    def random_word(self, sentence):
        """
        Tokenize ``sentence`` word by word and corrupt roughly 15% of the words.

        A selected word is replaced by [MASK] ids (80%), by random vocabulary
        ids (10%) or kept unchanged (10%); its original ids become the labels.
        Words that are not selected get label 0 for each of their sub-tokens.

        :param sentence: whitespace separated text
        :return: (input ids, label ids), two flat lists of equal length
        """
        vocab = self.tokenizer.vocab
        mask_id = vocab['[MASK]']
        vocab_size = len(vocab)

        output = []
        output_label = []

        for token in sentence.split():
            prob = random.random()

            # drop the special tokens the tokenizer puts around the word
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            if prob >= 0.15:
                # word is left untouched and is not a prediction target
                output.extend(token_id)
                output_label.extend([0] * len(token_id))
                continue

            # rescale to [0, 1) to pick the kind of corruption
            prob /= 0.15
            if prob < 0.8:
                output.extend([mask_id] * len(token_id))
            elif prob < 0.9:
                output.extend(random.randrange(vocab_size) for _ in token_id)
            else:
                output.extend(token_id)

            output_label.extend(token_id)

        return output, output_label

    def get_sent(self, index):
        '''return (sentence_1, sentence_2, is_next) with a 50% chance of a random sentence_2'''
        t1, t2 = self.get_corpus_line(index)

        # negative or positive pair, for next sentence prediction
        if random.random() > 0.5:
            return t1, t2, 1
        return t1, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''return sentence pair'''
        return self.lines[item][0], self.lines[item][1]

    def get_random_line(self):
        '''return the second sentence of a randomly chosen pair'''
        return self.lines[random.randrange(len(self.lines))][1]


class PositionalEmbedding(torch.nn.Module):
    """
    Fixed sinusoidal positional encoding:

        PE(pos, 2k)     = sin(pos / 10000^(2k / d_model))
        PE(pos, 2k + 1) = cos(pos / 10000^(2k / d_model))
    """

    def __init__(self, d_model, max_len=128):
        """
        :param d_model: embedding size
        :param max_len: number of positions to precompute
        """
        super().__init__()

        position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2, dtype=torch.float)
        # 1 / 10000^(2k / d_model), computed in log space
        div_term = torch.exp(even_dims * (-math.log(10000.0) / d_model))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # an odd d_model has one cosine column less than sine columns
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # A buffer follows the module across devices and is never trained;
        # persistent=False keeps it out of state_dict, as it was before.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """
        :param x: token ids of shape (batch_size, seq_len); only the length is used
        :return: encodings of shape (1, seq_len, d_model)
        """
        return self.pe[:, :x.size(1)]


class BERTEmbedding(torch.nn.Module):
    """
    BERT Embedding which is consisted with under features
        1. TokenEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        3. SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2)
        sum of all these features are output of BERTEmbedding
    """

    def __init__(self, vocab_size, embed_size, seq_len=64, dropout=0.1):
        """
        :param vocab_size: total vocab size
        :param embed_size: embedding size of token embedding
        :param seq_len: number of positions the positional table covers
        :param dropout: dropout rate
        """

        super().__init__()
        self.embed_size = embed_size
        # (m, seq_len) --> (m, seq_len, embed_size)
        # padding_idx is not updated during training, remains as fixed pad (0)
        self.token = torch.nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.segment = torch.nn.Embedding(3, embed_size, padding_idx=0)
        self.position = PositionalEmbedding(d_model=embed_size, max_len=seq_len)
        self.dropout = torch.nn.Dropout(p=dropout)

    def forward(self, sequence, segment_label):
        x = self.token(sequence) + self.position(sequence) + self.segment(segment_label)
        return self.dropout(x)


### attention layers
class MultiHeadedAttention(torch.nn.Module):
    """Scaled dot-product attention over ``heads`` parallel heads."""

    def __init__(self, heads, d_model, dropout=0.1):
        """
        :param heads: number of attention heads, must divide d_model
        :param d_model: model (embedding) size
        :param dropout: dropout rate applied to the attention weights
        """
        super(MultiHeadedAttention, self).__init__()

        assert d_model % heads == 0
        self.d_k = d_model // heads
        self.heads = heads
        self.dropout = torch.nn.Dropout(dropout)

        self.query = torch.nn.Linear(d_model, d_model)
        self.key = torch.nn.Linear(d_model, d_model)
        self.value = torch.nn.Linear(d_model, d_model)
        self.output_linear = torch.nn.Linear(d_model, d_model)

    def _split_heads(self, tensor):
        # (batch_size, max_len, d_model) --> (batch_size, h, max_len, d_k)
        return tensor.view(tensor.shape[0], -1, self.heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """
        query, key, value of shape: (batch_size, max_len, d_model)
        mask broadcastable to: (batch_size, h, max_len, max_len); positions
        where the mask equals 0 receive no attention. ``None`` disables masking.
        """
        query = self._split_heads(self.query(query))
        key = self._split_heads(self.key(key))
        value = self._split_heads(self.value(value))

        # (batch_size, h, max_len, d_k) x (batch_size, h, d_k, max_len)
        #   --> (batch_size, h, max_len, max_len)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # a very negative score becomes a ~0 weight after the softmax
            scores = scores.masked_fill(mask == 0, -1e9)

        weights = self.dropout(F.softmax(scores, dim=-1))

        # (batch_size, h, max_len, max_len) x (batch_size, h, max_len, d_k)
        #   --> (batch_size, h, max_len, d_k)
        context = torch.matmul(weights, value)

        # merge the heads back: (batch_size, max_len, d_model)
        context = context.transpose(1, 2).contiguous()
        context = context.view(context.shape[0], -1, self.heads * self.d_k)

        return self.output_linear(context)


class FeedForward(torch.nn.Module):
    "Implements FFN equation: Linear -> GELU -> Dropout -> Linear."

    def __init__(self, d_model, middle_dim=2048, dropout=0.1):
        super(FeedForward, self).__init__()

        self.fc1 = torch.nn.Linear(d_model, middle_dim)
        self.fc2 = torch.nn.Linear(middle_dim, d_model)
        self.dropout = torch.nn.Dropout(dropout)
        self.activation = torch.nn.GELU()

    def forward(self, x):
        out = self.activation(self.fc1(x))
        out = self.fc2(self.dropout(out))
        return out


class EncoderLayer(torch.nn.Module):
    """One transformer encoder block: self-attention and feed-forward, each
    followed by dropout, a residual connection and layer normalisation."""

    def __init__(
        self,
        d_model=768,
        heads=12,
        feed_forward_hidden=768 * 4,
        dropout=0.1
    ):
        super(EncoderLayer, self).__init__()
        # NOTE(review): a single LayerNorm is shared by both sub-layers; the
        # usual transformer block has one per sub-layer. Kept as is so the
        # parameter names and count do not change.
        self.layernorm = torch.nn.LayerNorm(d_model)
        # the dropout rate is forwarded so the sub-layers honour it as well
        self.self_multihead = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = FeedForward(
            d_model, middle_dim=feed_forward_hidden, dropout=dropout)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embeddings, mask):
        # embeddings: (batch_size, max_len, d_model)
        # mask: broadcastable to (batch_size, h, max_len, max_len)
        # result: (batch_size, max_len, d_model)
        interacted = self.dropout(self.self_multihead(embeddings, embeddings, embeddings, mask))
        # residual layer
        interacted = self.layernorm(interacted + embeddings)
        # bottleneck
        feed_forward_out = self.dropout(self.feed_forward(interacted))
        encoded = self.layernorm(feed_forward_out + interacted)
        return encoded


class BERT(torch.nn.Module):
    """
    BERT model : Bidirectional Encoder Representations from Transformers.
    """

    def __init__(self, vocab_size, d_model=768, n_layers=12, heads=12, dropout=0.1, seq_len=64):
        """
        :param vocab_size: vocab_size of total words
        :param d_model: BERT model hidden size
        :param n_layers: numbers of Transformer blocks(layers)
        :param heads: number of attention heads
        :param dropout: dropout rate
        :param seq_len: longest input sequence the positional table covers
        """

        super().__init__()
        self.d_model = d_model
        self.n_layers = n_layers
        self.heads = heads

        # paper noted they used 4 * hidden_size for ff_network_hidden_size
        self.feed_forward_hidden = d_model * 4

        # embedding for BERT, sum of positional, segment, token embeddings
        self.embedding = BERTEmbedding(
            vocab_size=vocab_size, embed_size=d_model, seq_len=seq_len, dropout=dropout)

        # multi-layers transformer blocks, deep network
        self.encoder_blocks = torch.nn.ModuleList(
            [EncoderLayer(d_model, heads, self.feed_forward_hidden, dropout)
             for _ in range(n_layers)])

    def forward(self, x, segment_info):
        """
        :param x: token ids, (batch_size, seq_len); id 0 is treated as padding
        :param segment_info: segment ids, (batch_size, seq_len)
        :return: hidden states, (batch_size, seq_len, d_model)
        """
        # attention mask for padded tokens, (batch_size, 1, 1, seq_len);
        # it broadcasts over heads and query positions inside the attention
        mask = (x > 0).unsqueeze(1).unsqueeze(2)

        # embedding the indexed sequence to sequence of vectors
        x = self.embedding(x, segment_info)

        # running over multiple transformer blocks
        for encoder in self.encoder_blocks:
            x = encoder(x, mask)
        return x


class NextSentencePrediction(torch.nn.Module):
    """
    2-class classification model : is_next, is_not_next
    """

    def __init__(self, hidden):
        """
        :param hidden: BERT model output size
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, 2)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # use only the first token which is the [CLS]
        return self.softmax(self.linear(x[:, 0]))


class MaskedLanguageModel(torch.nn.Module):
    """
    predicting origin token from masked input sequence
    n-class classification problem, n-class = vocab_size
    """

    def __init__(self, hidden, vocab_size):
        """
        :param hidden: output size of BERT model
        :param vocab_size: total vocab size
        """
        super().__init__()
        self.linear = torch.nn.Linear(hidden, vocab_size)
        self.softmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        return self.softmax(self.linear(x))


class BERTLM(torch.nn.Module):
    """
    BERT Language Model
    Next Sentence Prediction Model + Masked Language Model
    """

    def __init__(self, bert: BERT, vocab_size):
        """
        :param bert: BERT model which should be trained
        :param vocab_size: total vocab size for masked_lm
        """

        super().__init__()
        self.bert = bert
        self.next_sentence = NextSentencePrediction(self.bert.d_model)
        self.mask_lm = MaskedLanguageModel(self.bert.d_model, vocab_size)

    def forward(self, x, segment_label):
        """
        :return: (next-sentence log-probs of shape (batch_size, 2),
                  token log-probs of shape (batch_size, seq_len, vocab_size))
        """
        x = self.bert(x, segment_label)
        return self.next_sentence(x), self.mask_lm(x)


class ScheduledOptim():
    '''A simple wrapper class for learning rate scheduling

    lr = d_model^-0.5 * min(step^-0.5, step * n_warmup_steps^-1.5)
    '''

    def __init__(self, optimizer, d_model, n_warmup_steps):
        self._optimizer = optimizer
        self.n_warmup_steps = n_warmup_steps
        self.n_current_steps = 0
        self.init_lr = d_model ** -0.5

    def step_and_update_lr(self):
        "Set the learning rate for this step, then step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients by the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        # linear warm-up followed by inverse square root decay
        return min(
            self.n_current_steps ** -0.5,
            self.n_warmup_steps ** -1.5 * self.n_current_steps)

    def _update_learning_rate(self):
        ''' Learning rate scheduling per step '''

        self.n_current_steps += 1
        lr = self.init_lr * self._get_lr_scale()

        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr


class BERTTrainer:
    """Runs the joint next-sentence / masked-LM pre-training loop for a BERTLM."""

    def __init__(
        self,
        model,
        train_dataloader,
        test_dataloader=None,
        lr=1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        warmup_steps=10000,
        log_freq=10,
        device='cuda'
    ):
        """
        :param model: BERTLM instance; it is moved to ``device``
        :param train_dataloader: iterable of batches as produced by BERTDataset
        :param test_dataloader: optional iterable of evaluation batches
        :param lr: initial Adam learning rate (overwritten by the schedule on
            the first optimisation step)
        :param weight_decay: Adam weight decay
        :param betas: Adam betas
        :param warmup_steps: number of warm-up steps of the schedule
        :param log_freq: log every ``log_freq`` batches
        :param device: device the model and every batch are moved to
        """

        self.device = device
        # model and batches have to live on the same device
        self.model = model.to(device)
        self.train_data = train_dataloader
        self.test_data = test_dataloader

        # Setting the Adam optimizer with hyper-param
        self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        self.optim_schedule = ScheduledOptim(
            self.optim, self.model.bert.d_model, n_warmup_steps=warmup_steps
        )

        # Masked-LM loss: label 0 marks positions that must not be predicted
        self.criterion = torch.nn.NLLLoss(ignore_index=0)
        # Next-sentence loss: class 0 ("not next") is a real target here, so
        # it needs a criterion that does not ignore it
        self.nsp_criterion = torch.nn.NLLLoss()
        self.log_freq = log_freq
        print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()]))

    def train(self, epoch):
        self.iteration(epoch, self.train_data)

    def test(self, epoch):
        if self.test_data is None:
            raise ValueError("test() needs a test_dataloader, but none was given")
        self.iteration(epoch, self.test_data, train=False)

    def _compute_losses(self, data):
        """
        Forward one batch (already on the right device).

        :return: (next sentence loss, masked LM loss, next sentence log-probs)
        """
        next_sent_output, mask_lm_output = self.model(
            data["bert_input"], data["segment_label"])

        # NLL(negative log likelihood) loss of is_next classification result
        next_loss = self.nsp_criterion(next_sent_output, data["is_next"])

        # NLLLoss of predicting masked token word
        # transpose to (m, vocab_size, seq_len) vs (m, seq_len)
        labels = data["bert_label"]
        if labels.ne(0).any():
            mask_loss = self.criterion(mask_lm_output.transpose(1, 2), labels)
        else:
            # without a single target the mean over zero elements is NaN;
            # use a zero that is still attached to the graph
            mask_loss = mask_lm_output.sum() * 0.0

        return next_loss, mask_loss, next_sent_output

    def iteration(self, epoch, data_loader, train=True):
        """
        Run one pass over ``data_loader``.

        :param epoch: epoch number, used for logging only
        :param data_loader: sized iterable of batches (dicts of tensors)
        :param train: optimise the model if True, only evaluate otherwise
        """
        # imported here so the module can be imported without tqdm installed
        import tqdm

        avg_loss = 0.0
        total_correct = 0
        total_element = 0

        mode = "train" if train else "test"
        # switches dropout on for training and off for evaluation
        self.model.train(train)

        # progress bar
        data_iter = tqdm.tqdm(
            enumerate(data_loader),
            desc="EP_%s:%d" % (mode, epoch),
            total=len(data_loader),
            bar_format="{l_bar}{r_bar}"
        )

        for i, data in data_iter:

            # 0. batch_data will be sent into the device(GPU or cpu)
            data = {key: value.to(self.device) for key, value in data.items()}

            # 1. + 2. forward pass and both losses; no graph is built in test mode
            with torch.set_grad_enabled(train):
                next_loss, mask_loss, next_sent_output = self._compute_losses(data)
                # Adding next_loss and mask_loss : 3.4 Pre-training Procedure
                loss = next_loss + mask_loss

            # 3. backward and optimization only in train
            if train:
                self.optim_schedule.zero_grad()
                loss.backward()
                self.optim_schedule.step_and_update_lr()

            # next sentence prediction accuracy
            correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
            avg_loss += loss.item()
            total_correct += correct
            total_element += data["is_next"].nelement()

            post_fix = {
                "epoch": epoch,
                "iter": i,
                "avg_loss": avg_loss / (i + 1),
                "avg_acc": total_correct / total_element * 100,
                "loss": loss.item()
            }

            if i % self.log_freq == 0:
                data_iter.write(str(post_fix))

        # max(..., 1) keeps the summary printable for an empty data loader
        epoch_loss = avg_loss / max(len(data_loader), 1)
        epoch_acc = total_correct * 100.0 / max(total_element, 1)
        print(f"EP{epoch}, {mode}: avg_loss={epoch_loss}, total_acc={epoch_acc}")


if __name__ == "__main__":
    # Third-party packages that are only needed for the data preparation.
    import tqdm
    from tokenizers import BertWordPieceTokenizer
    from transformers import BertTokenizer

    # NOTE(review): `pairs` (sequence of sentence pairs) and `MAX_LEN` are
    # used below but not defined in this part of the file; they are presumably
    # created by an earlier step - confirm.
    print(pairs[0])

    # exist_ok lets the script be run again without failing on the directory
    os.makedirs('./data', exist_ok=True)
    text_data = []
    file_count = 0

    for sample in tqdm.tqdm([x[0] for x in pairs]):
        text_data.append(sample)

        # once we hit the 10K mark, save to file
        if len(text_data) == 10000:
            _write_text_chunk(text_data, file_count)
            text_data = []
            file_count += 1

    # the last, shorter chunk has to be written too, otherwise its sentences
    # are missing from the tokenizer's training corpus
    if text_data:
        _write_text_chunk(text_data, file_count)
        text_data = []
        file_count += 1

    paths = [str(x) for x in Path('./data').glob('**/*.txt')]

    ### training own tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=True
    )

    tokenizer.train(
        files=paths,
        vocab_size=30_000,
        min_frequency=5,
        limit_alphabet=1000,
        wordpieces_prefix='##',
        special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
    )

    os.makedirs('./bert-it-1', exist_ok=True)
    tokenizer.save_model('./bert-it-1', 'bert-it')
    # NOTE(review): transformers emits a FutureWarning for loading from a
    # single vocab file (deprecated, to be removed in v5).
    tokenizer = BertTokenizer.from_pretrained('./bert-it-1/bert-it-vocab.txt', local_files_only=True)

    train_data = BERTDataset(
        pairs, seq_len=MAX_LEN, tokenizer=tokenizer)
    train_loader = DataLoader(
        train_data, batch_size=32, shuffle=True, pin_memory=True)
    sample_data = next(iter(train_loader))

    print(train_data[random.randrange(len(train_data))])