{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"scrolled":true,"trusted":true},"outputs":[],"source":["import argparse\n","import glob\n","import logging\n","import os\n","import pickle\n","import random\n","import re\n","import shutil\n","from typing import Dict, List, Tuple\n","from copy import deepcopy\n","from multiprocessing import Pool\n","\n","import numpy as np\n","import torch\n","from torch.nn.utils.rnn import pad_sequence\n","from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler\n","from torch.utils.data.distributed import DistributedSampler\n","from tqdm import tqdm, trange\n","from transformers import AutoTokenizer, AutoModel\n","from transformers import (\n"," WEIGHTS_NAME,\n"," AdamW,\n"," BertConfig,\n"," BertForMaskedLM,\n"," BertTokenizer,\n"," CamembertConfig,\n"," CamembertForMaskedLM,\n"," CamembertTokenizer,\n"," DistilBertConfig,\n"," DistilBertForMaskedLM,\n"," DistilBertTokenizer,\n"," GPT2Config,\n"," GPT2LMHeadModel,\n"," GPT2Tokenizer,\n"," OpenAIGPTConfig,\n"," OpenAIGPTLMHeadModel,\n"," OpenAIGPTTokenizer,\n"," PreTrainedModel,\n"," PreTrainedTokenizer,\n"," RobertaConfig,\n"," RobertaForMaskedLM,\n"," RobertaTokenizer,\n"," get_linear_schedule_with_warmup,\n"," get_cosine_with_hard_restarts_schedule_with_warmup\n",")\n","\n","\n","try:\n"," from torch.utils.tensorboard import SummaryWriter\n","except ImportError:\n"," from tensorboardX import SummaryWriter\n","\n","\n","logger = logging.getLogger(__name__)\n","\n","DNATokenizer = AutoTokenizer.from_pretrained(\"zhihan1996/DNA_bert_6\", trust_remote_code=True)\n","\n","\n","MODEL_CLASSES = {\n"," \"gpt2\": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),\n"," \"openai-gpt\": (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),\n"," \"dna\": (BertConfig, BertForMaskedLM, DNATokenizer),\n"," \"bert\": (BertConfig, BertForMaskedLM, BertTokenizer),\n"," \"roberta\": (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),\n"," \"distilbert\": (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),\n"," \"camembert\": (CamembertConfig, CamembertForMaskedLM, CamembertTokenizer),\n","}\n","\n","MASK_LIST = {\n"," \"3\": [-1, 1],\n"," \"4\": [-1, 1, 2],\n"," \"5\": [-2, -1, 1, 2],\n"," \"6\": [-2, -1, 1, 2, 3]\n","}\n","\n","\n","class TextDataset(Dataset):\n"," def __init__(self, tokenizer: PreTrainedTokenizer, config, file_path: str, block_size=512):\n"," assert os.path.isfile(file_path)\n","\n","\n"," directory, filename = os.path.split(file_path)\n"," cached_features_file = os.path.join(\n"," directory, dna + \"_cached_lm_\" + str(block_size) + \"_\" + filename\n"," )\n","\n"," if os.path.exists(cached_features_file) and not config['overwrite_cache']:\n"," logger.info(\"Loading features from cached file %s\", cached_features_file)\n"," with open(cached_features_file, \"rb\") as handle:\n"," self.examples = pickle.load(handle)\n"," else:\n"," logger.info(\"Creating features from dataset file at %s\", directory)\n","\n"," self.examples = []\n"," with open(file_path, encoding=\"utf-8\") as f:\n"," text = f.read()\n","\n"," tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))\n","\n"," for i in range(0, len(tokenized_text) - block_size + 1, block_size): # Truncate in block of block_size\n"," self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size]))\n"," logger.info(\"Saving features into cached file %s\", cached_features_file)\n"," with open(cached_features_file, \"wb\") as handle:\n"," 
pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)\n","\n"," def __len__(self):\n"," return len(self.examples)\n","\n"," def __getitem__(self, item):\n"," return torch.tensor(self.examples[item], dtype=torch.long)\n","\n","def convert_line_to_example(tokenizer, lines, max_length, add_special_tokens=True):\n"," examples = tokenizer.batch_encode_plus(lines, add_special_tokens=add_special_tokens, max_length=max_length)[\"input_ids\"]\n"," return examples\n","\n","class LineByLineTextDataset(Dataset):\n"," def __init__(self, tokenizer: PreTrainedTokenizer, config, file_path: str, block_size=512):\n"," assert os.path.isfile(file_path)\n"," # Here, we do not cache the features, operating under the assumption\n"," # that we will soon use fast multithreaded tokenizers from the\n"," # `tokenizers` repo everywhere =)\n"," directory, filename = os.path.split(file_path)\n"," cached_features_file = os.path.join(\n"," '/kaggle/working/', 'dna' + \"_cached_lm_\" + str(block_size) + \"_\" + filename\n"," )\n","\n"," if os.path.exists(cached_features_file) and not config['overwrite_cache']:\n"," logger.info(\"Loading features from cached file %s\", cached_features_file)\n"," with open(cached_features_file, \"rb\") as handle:\n"," self.examples = pickle.load(handle)\n"," else:\n"," logger.info(\"Creating features from dataset file at %s\", file_path)\n","\n"," with open(file_path, encoding=\"utf-8\") as f:\n"," lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]\n"," \n"," if config['n_process'] == 1:\n"," self.examples = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size)[\"input_ids\"]\n"," else:\n"," n_proc = config['n_process']\n"," p = Pool(n_proc)\n"," indexes = [0]\n"," len_slice = int(len(lines)/n_proc)\n"," for i in range(1, n_proc+1):\n"," if i != n_proc:\n"," indexes.append(len_slice*(i))\n"," else:\n"," indexes.append(len(lines))\n"," results = []\n"," for i in range(n_proc):\n"," results.append(p.apply_async(convert_line_to_example,[tokenizer, lines[indexes[i]:indexes[i+1]], block_size,]))\n"," print(str(i) + \" start\")\n"," p.close() \n"," p.join()\n","\n"," self.examples = []\n"," for result in results:\n"," ids = result.get()\n"," self.examples.extend(ids)\n","\n"," logger.info(\"Saving features into cached file %s\", cached_features_file)\n"," with open(cached_features_file, \"wb\") as handle:\n"," pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)\n","\n"," def __len__(self):\n"," return len(self.examples)\n","\n"," def __getitem__(self, i):\n"," return torch.tensor(self.examples[i], dtype=torch.long)\n","\n","\n","def load_and_cache_examples(config, tokenizer, evaluate=False):\n"," file_path = r\"/kaggle/input/random-dna-sequences-for-transfomer-pretraining/6_12k.txt\" if evaluate else r'/kaggle/input/random-dna-sequences-for-transfomer-pretraining/6_12k.txt'\n"," if config['line_by_line']:\n"," return LineByLineTextDataset(tokenizer, config, file_path=file_path, block_size=config['block_size'])\n"," else:\n"," return TextDataset(tokenizer, config, file_path=file_path, block_size=config['block_size'])\n","\n","\n","def set_seed(config):\n"," random.seed(config['seed'])\n"," np.random.seed(config['seed'])\n"," torch.manual_seed(config['seed'])\n"," if config['n_gpu'] > 0:\n"," torch.cuda.manual_seed_all(config['seed'])\n","\n","\n","def _sorted_checkpoints(config, checkpoint_prefix=\"checkpoint\", use_mtime=False) -> List[str]:\n"," ordering_and_checkpoint_path = []\n"," st = 
r\"/kaggle/working/output\"\n"," \n"," glob_checkpoints = glob.glob(os.path.join(st, \"{}-*\".format(checkpoint_prefix)))\n","\n"," for path in glob_checkpoints:\n"," if use_mtime:\n"," ordering_and_checkpoint_path.append((os.path.getmtime(path), path))\n"," else:\n"," regex_match = re.match(\".*{}-([0-9]+)\".format(checkpoint_prefix), path)\n"," if regex_match and regex_match.groups():\n"," ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))\n","\n"," checkpoints_sorted = sorted(ordering_and_checkpoint_path)\n"," checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]\n"," return checkpoints_sorted\n","\n","\n","def _rotate_checkpoints(config, checkpoint_prefix=\"checkpoint\", use_mtime=False) -> None:\n"," if not config['save_total_limit']:\n"," return\n"," if config['save_total_limit'] <= 0:\n"," return\n","\n"," # Check if we should delete older checkpoint(s)\n"," checkpoints_sorted = _sorted_checkpoints(config, checkpoint_prefix, use_mtime)\n"," if len(checkpoints_sorted) <= config['save_total_limit']:\n"," return\n","\n"," number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - config['save_total_limit'])\n"," checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]\n"," for checkpoint in checkpoints_to_be_deleted:\n"," logger.info(\"Deleting older checkpoint [{}] due to config['save_total_limit']\".format(checkpoint))\n"," shutil.rmtree(checkpoint)\n","\n","\n","\n","\n","def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, config) -> Tuple[torch.Tensor, torch.Tensor]:\n"," \"\"\"Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.\"\"\"\n","\n"," mask_list = MASK_LIST['6']\n","\n"," if tokenizer.mask_token is None:\n"," raise ValueError(\n"," \"This tokenizer does not have a mask token which is necessary for masked language modeling. 
Remove the --mlm flag if you want to use this tokenizer.\"\n"," )\n","\n"," labels = inputs.clone()\n"," # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)\n"," probability_matrix = torch.full(labels.shape, config['mlm_probability'])\n"," special_tokens_mask = [\n"," tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()\n"," ]\n"," probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)\n"," if tokenizer.pad_token is not None:\n"," padding_mask = labels.eq(tokenizer.pad_token_id)\n"," probability_matrix.masked_fill_(padding_mask, value=0.0)\n","\n"," masked_indices = torch.bernoulli(probability_matrix).bool()\n","\n"," # Ensure masked_indices and probability_matrix are the same shape\n"," masks = deepcopy(masked_indices)\n"," for i, masked_index in enumerate(masks):\n"," # Ensure there are non-zero elements to avoid IndexError\n"," non_zero_indices = torch.where(probability_matrix[i] != 0)[0]\n"," if non_zero_indices.numel() == 0:\n"," # If no non-zero elements, skip this sequence\n"," continue\n","\n"," end = non_zero_indices.tolist()[-1]\n"," mask_centers = set(torch.where(masked_index == 1)[0].tolist())\n"," new_centers = deepcopy(mask_centers)\n"," for center in mask_centers:\n"," for mask_number in mask_list:\n"," current_index = center + mask_number\n"," if current_index <= end and current_index >= 1:\n"," new_centers.add(current_index)\n"," new_centers = list(new_centers)\n"," masked_indices[i][new_centers] = True\n","\n"," labels[~masked_indices] = -100 # We only compute loss on masked tokens\n","\n"," # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])\n"," indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices\n"," inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)\n","\n"," # 10% of the time, we replace masked input tokens with random word\n"," indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced\n"," random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)\n"," inputs[indices_random] = random_words[indices_random]\n","\n"," # The rest of the time (10% of the time) we keep the masked input tokens unchanged\n"," return inputs, labels\n","\n","import os\n","import torch\n","from torch.nn.utils.rnn import pad_sequence\n","from torch.utils.data import DataLoader, RandomSampler, DistributedSampler\n","from tqdm import tqdm, trange\n","from transformers import PreTrainedModel, PreTrainedTokenizer, AdamW, get_linear_schedule_with_warmup\n","from typing import List, Dict, Tuple\n","import wandb\n","import time\n","\n","def train(config, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:\n"," \"\"\" Train the model \"\"\"\n"," if config['local_rank'] in [-1, 0]:\n"," tb_writer = SummaryWriter()\n","\n"," config['train_batch_size'] = config['per_gpu_train_batch_size'] * max(1, config['n_gpu'])\n","\n"," def collate(examples: List[torch.Tensor]):\n"," if tokenizer._pad_token is None:\n"," return pad_sequence(examples, batch_first=True)\n"," return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)\n","\n"," train_sampler = RandomSampler(train_dataset) if config['local_rank'] == -1 else DistributedSampler(train_dataset)\n"," train_dataloader = DataLoader(\n"," train_dataset, 
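# each example is a 1-D LongTensor of token ids; collate pads a batch to its longest sequence (using the tokenizer's pad token when one is defined)\n","        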
sampler=train_sampler, batch_size=config['train_batch_size'], collate_fn=collate\n"," )\n","\n"," if config['max_steps'] > 0:\n"," t_total = config['max_steps']\n"," config['num_train_epochs'] = config['max_steps'] // (len(train_dataloader) // config['gradient_accumulation_steps']) + 1\n"," else:\n"," t_total = len(train_dataloader) // config['gradient_accumulation_steps'] * config['num_train_epochs']\n","\n"," # Prepare optimizer and schedule (linear warmup and decay)\n"," no_decay = [\"bias\", \"LayerNorm.weight\"]\n"," optimizer_grouped_parameters = [\n"," {\n"," \"params\": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],\n"," \"weight_decay\": config['weight_decay'],\n"," },\n"," {\"params\": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], \"weight_decay\": 0.0},\n"," ]\n"," optimizer = AdamW(optimizer_grouped_parameters, lr=config['learning_rate'], eps=config['adam_epsilon'], betas=(config['beta1'],config['beta2']))\n"," scheduler = get_linear_schedule_with_warmup(\n"," optimizer, num_warmup_steps=2000, num_training_steps=t_total\n"," )\n","\n"," # Train!\n"," logger.info(\"***** Running training *****\")\n"," logger.info(\" Num examples = %d\", len(train_dataset))\n"," logger.info(\" Num Epochs = %d\", config['num_train_epochs'])\n"," logger.info(\" Instantaneous batch size per GPU = %d\", config['per_gpu_train_batch_size'])\n"," logger.info(\n"," \" Total train batch size (w. parallel, distributed & accumulation) = %d\",\n"," config['train_batch_size']\n"," * config['gradient_accumulation_steps']\n"," * (torch.distributed.get_world_size() if config['local_rank'] != -1 else 1),\n"," )\n"," logger.info(\" Gradient Accumulation steps = %d\", config['gradient_accumulation_steps'])\n"," logger.info(\" Total optimization steps = %d\", t_total)\n","\n"," global_step = 0\n"," epochs_trained = 0\n"," steps_trained_in_current_epoch = 0\n","\n"," tr_loss, logging_loss = 0.0, 0.0\n","\n"," model_to_resize = model.module if hasattr(model, \"module\") else model # Take care of distributed/parallel training\n"," model_to_resize.resize_token_embeddings(len(tokenizer))\n","\n"," model.zero_grad()\n"," train_iterator = trange(\n"," epochs_trained, int(config['num_train_epochs']), desc=\"Epoch\", disable=config['local_rank'] not in [-1, 0]\n"," )\n"," set_seed(config) # Added here for reproducibility\n","\n"," for epoch in train_iterator:\n"," epoch_start_time = time.time()\n"," epoch_iterator = tqdm(train_dataloader, desc=\"Iteration\", disable=config['local_rank'] not in [-1, 0])\n"," for step, batch in enumerate(epoch_iterator):\n","\n"," # Skip past any already trained steps if resuming training\n"," if steps_trained_in_current_epoch > 0:\n"," steps_trained_in_current_epoch -= 1\n"," continue\n","\n"," inputs, labels = mask_tokens(batch, tokenizer, config) if config['mlm'] else (batch, batch)\n","\n"," inputs = inputs.to(config['device'])\n"," labels = labels.to(config['device'])\n"," model.train()\n"," outputs = model(inputs, labels=labels) if config['mlm'] else model(inputs, labels=labels)\n"," loss = outputs[0] # model outputs are always tuple in transformers (see doc)\n","\n"," if config['n_gpu'] > 1:\n"," loss = loss.mean() # mean() to average on multi-gpu parallel training\n"," if config['gradient_accumulation_steps'] > 1:\n"," loss = loss / config['gradient_accumulation_steps']\n","\n"," loss.backward()\n","\n"," tr_loss += loss.item()\n"," if (step + 1) % config['gradient_accumulation_steps'] == 0:\n"," 
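# An optimizer step is taken only every gradient_accumulation_steps mini-batches\n","                # (1 in the config below, i.e. every batch); the accumulated gradients are first\n","                # clipped to max_grad_norm, then the learning-rate schedule advances one step.\n","                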
torch.nn.utils.clip_grad_norm_(model.parameters(), config['max_grad_norm'])\n"," optimizer.step()\n"," scheduler.step() # Update learning rate schedule\n"," model.zero_grad()\n"," global_step += 1\n","\n"," # Log metrics to wandb\n"," wandb.log({\"learning_rate\": scheduler.get_last_lr()[0], \"loss\": loss.item(), \"global_step\": global_step})\n","\n"," if config['local_rank'] in [-1, 0] and config['logging_steps'] > 0 and global_step % config['logging_steps'] == 0:\n"," # Log metrics\n"," if (\n"," config['local_rank'] == -1 and config['evaluate_during_training']\n"," ): # Only evaluate when single GPU otherwise metrics may not average well\n"," results = evaluate(config, model, tokenizer)\n"," for key, value in results.items():\n"," tb_writer.add_scalar(\"eval_{}\".format(key), value, global_step)\n"," wandb.log({f\"eval_{key}\": value, \"global_step\": global_step})\n"," tb_writer.add_scalar(\"lr\", scheduler.get_lr()[0], global_step)\n"," tb_writer.add_scalar(\"loss\", (tr_loss - logging_loss) / config['logging_steps'], global_step)\n"," logging_loss = tr_loss\n","\n"," if config['local_rank'] in [-1, 0] and config['save_steps'] > 0 and global_step % config['save_steps'] == 0:\n"," checkpoint_prefix = \"checkpoint\"\n"," # Save model checkpoint\n"," st = r\"/kaggle/working/output\"\n"," output_dir = os.path.join(st, \"{}-{}\".format(checkpoint_prefix, global_step))\n"," os.makedirs(output_dir, exist_ok=True)\n"," model_to_save = (\n"," model.module if hasattr(model, \"module\") else model\n"," ) # Take care of distributed/parallel training\n"," model_to_save.save_pretrained(output_dir)\n"," tokenizer.save_pretrained(output_dir)\n","\n"," torch.save(config, os.path.join(output_dir, \"training_args.bin\"))\n"," logger.info(\"Saving model checkpoint to %s\", output_dir)\n","\n"," _rotate_checkpoints(config, checkpoint_prefix)\n","\n"," torch.save(optimizer.state_dict(), os.path.join(output_dir, \"optimizer.pt\"))\n"," torch.save(scheduler.state_dict(), os.path.join(output_dir, \"scheduler.pt\"))\n"," logger.info(\"Saving optimizer and scheduler states to %s\", output_dir)\n","\n"," if config['max_steps'] > 0 and global_step > config['max_steps']:\n"," epoch_iterator.close()\n"," break\n"," if config['max_steps'] > 0 and global_step > config['max_steps']:\n"," train_iterator.close()\n"," break\n"," epoch_end_time = time.time()\n"," epoch_time = epoch_end_time - epoch_start_time\n"," # Log epoch time\n"," output_dir = r\"/kaggle/working/output\"\n"," logging.info(f'Epoch {epoch + 1}: Time {epoch_time:.4f}s')\n"," log_dir = os.path.join(output_dir, 'training_logs')\n"," os.makedirs(log_dir, exist_ok=True)\n"," file = os.path.join(log_dir,'log.txt')\n"," with open(file, 'a') as f:\n"," f.write(f\"Epoch {epoch + 1}/{config['num_train_epochs']}:\\n\")\n"," f.write(f\" Epoch Time: {epoch_time}\\n\")\n","\n"," # Log epoch time to wandb\n"," wandb.log({\"epoch_time\": epoch_time, \"epoch\": epoch + 1})\n","\n"," if config['local_rank'] in [-1, 0]:\n"," tb_writer.close()\n","\n"," return global_step, tr_loss / global_step\n","\n","\n","import os\n","import torch\n","from torch.nn.utils.rnn import pad_sequence\n","from torch.utils.data import DataLoader, SequentialSampler\n","from tqdm import tqdm\n","from transformers import PreTrainedModel, PreTrainedTokenizer\n","from typing import List, Dict\n","import wandb\n","\n","def evaluate(config, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix=\"\") -> Dict:\n"," # Loop to handle MNLI double evaluation (matched, mis-matched)\n"," 
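# Evaluation applies the same span masking as training (mask_tokens with\n","    # mlm_probability) and reports perplexity = exp(mean masked-LM loss) over the\n","    # evaluation set returned by load_and_cache_examples.\n","    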
eval_output_dir = config['output_dir']\n","\n"," eval_dataset = load_and_cache_examples(config, tokenizer, evaluate=True)\n","\n"," if config['local_rank'] in [-1, 0]:\n"," os.makedirs(eval_output_dir, exist_ok=True)\n","\n"," config['eval_batch_size'] = config['per_gpu_eval_batch_size'] * max(1, config['n_gpu'])\n"," # Note that DistributedSampler samples randomly\n","\n"," def collate(examples: List[torch.Tensor]):\n"," if tokenizer._pad_token is None:\n"," return pad_sequence(examples, batch_first=True)\n"," return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)\n","\n"," eval_sampler = SequentialSampler(eval_dataset)\n"," eval_dataloader = DataLoader(\n"," eval_dataset, sampler=eval_sampler, batch_size=config['eval_batch_size'], collate_fn=collate\n"," )\n","\n"," # multi-gpu evaluate\n"," if config['n_gpu'] > 1 and not isinstance(model, torch.nn.DataParallel):\n"," model = torch.nn.DataParallel(model)\n","\n"," # Eval!\n"," logger.info(\"***** Running evaluation {} *****\".format(prefix))\n"," logger.info(\" Num examples = %d\", len(eval_dataset))\n"," logger.info(\" Batch size = %d\", config['eval_batch_size'])\n"," eval_loss = 0.0\n"," nb_eval_steps = 0\n"," model.eval()\n","\n"," for batch in tqdm(eval_dataloader, desc=\"Evaluating\"):\n"," inputs, labels = mask_tokens(batch, tokenizer, config) if config['mlm'] else (batch, batch)\n"," inputs = inputs.to(config['device'])\n"," labels = labels.to(config['device'])\n","\n"," with torch.no_grad():\n"," outputs = model(inputs, labels=labels) if config['mlm'] else model(inputs, labels=labels)\n"," lm_loss = outputs[0]\n"," eval_loss += lm_loss.mean().item()\n"," nb_eval_steps += 1\n","\n"," eval_loss = eval_loss / nb_eval_steps\n"," perplexity = torch.exp(torch.tensor(eval_loss))\n","\n"," result = {\"perplexity\": perplexity.item()}\n","\n"," # Log metrics to wandb\n"," wandb.log({\"eval perplexity\" : result})\n","\n"," output_eval_file = os.path.join(eval_output_dir, prefix, \"eval_results.txt\")\n"," with open(output_eval_file, \"a\") as writer:\n"," logger.info(\"***** Eval results {} *****\".format(prefix))\n"," for key in sorted(result.keys()):\n"," logger.info(\" %s = %s\", key, str(result[key]))\n"," writer.write(\"%s = %s\\n\" % (key, str(result[key])))\n","\n"," return result\n","\n","\n","import argparse\n","import os\n","import logging\n","import torch\n","import os\n","import logging\n","import torch\n","\n","def main(config):\n"," # Handle checkpoint continuation\n"," if config['should_continue']:\n"," sorted_checkpoints = _sorted_checkpoints(config)\n"," if len(sorted_checkpoints) == 0:\n"," raise ValueError(\"Used --should_continue but no checkpoint was found in --output_dir.\")\n"," else:\n"," config['model_name_or_path'] = sorted_checkpoints[-1]\n","\n"," output_dir = config.get('output_dir', './output')\n"," if (\n"," os.path.exists(output_dir)\n"," and os.listdir(output_dir)\n"," and config['do_train']\n"," and not config.get('overwrite_output_dir', False)\n"," ):\n"," raise ValueError(\n"," \"Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.\".format(\n"," output_dir\n"," )\n"," )\n","\n"," # Setup CUDA, GPU & distributed training\n"," if config.get('local_rank', -1) == -1 or config.get('no_cuda', False):\n"," device = torch.device(\"cuda:0\" if torch.cuda.is_available() and not config.get('no_cuda', False) else \"cpu\")\n"," config['n_gpu'] = torch.cuda.device_count()\n"," else:\n"," torch.cuda.set_device(config.get('local_rank', 0))\n"," device = torch.device(\"cuda\", config.get('local_rank', 0))\n"," torch.distributed.init_process_group(backend=\"nccl\")\n"," config['n_gpu'] = 1\n"," config['device'] = device\n","\n"," # Setup logging\n"," logging.basicConfig(\n"," format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n"," datefmt=\"%m/%d/%Y %H:%M:%S\",\n"," level=logging.INFO if config.get('local_rank', -1) in [-1, 0] else logging.WARN,\n"," filename = 'app.log'\n"," )\n"," logger = logging.getLogger(__name__)\n"," logger.warning(\n"," \"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s\",\n"," config.get('local_rank', -1),\n"," device,\n"," config['n_gpu'],\n"," bool(config.get('local_rank', -1) != -1),\n"," config.get('fp16', False),\n"," )\n","\n"," # Set seed\n"," set_seed(config)\n","\n"," # Load pretrained model and tokenizer\n"," if config.get('local_rank', -1) not in [-1, 0]:\n"," torch.distributed.barrier() # Barrier to make sure only the first process in distributed training download model & vocab\n","\n"," config_class, model_class, tokenizer_class = MODEL_CLASSES['dna']\n"," config_obj = config_class.from_pretrained('prajjwal1/bert-tiny', cache_dir=config.get('cache_dir', None))\n","\n"," tokenizer = tokenizer_class.from_pretrained('zhihan1996/DNA_bert_6', cache_dir=config.get('cache_dir', None))\n","\n"," if config.get('block_size', 512) <= 0:\n"," config['block_size'] = 512\n"," else:\n"," config['block_size'] = min(config['block_size'], 512)\n","\n"," if config.get('model_name_or_path'):\n","# model = model_class.from_pretrained(\n","# config['model_name_or_path'],\n","# from_tf=bool(\".ckpt\" in config['model_name_or_path']),\n","# config=config_obj,\n","# cache_dir=config.get('cache_dir', None),\n"," pass\n"," else:\n"," logger.info(\"Training new model from scratch\")\n"," model = model_class(config=config_obj)\n","\n"," model.to(config['device'])\n","\n"," if config.get('local_rank', -1) == 0:\n"," torch.distributed.barrier()\n","\n"," logger.info(\"Training/evaluation parameters %s\", config)\n","\n"," # Training\n"," if config.get('do_train', False):\n"," if config.get('local_rank', -1) not in [-1, 0]:\n"," torch.distributed.barrier()\n","\n"," train_dataset = load_and_cache_examples(config, tokenizer, evaluate=False)\n","\n"," if config.get('local_rank', -1) == 0:\n"," torch.distributed.barrier()\n","\n"," global_step, tr_loss = train(config, train_dataset, model, tokenizer)\n"," logger.info(\" global_step = %s, average loss = %s\", global_step, tr_loss)\n","\n"," # Save and reload model\n"," if config.get('do_train', False) and (config.get('local_rank', -1) == -1 or torch.distributed.get_rank() == 0):\n"," if config.get('local_rank', -1) in [-1, 0]:\n"," os.makedirs(output_dir, exist_ok=True)\n","\n"," logger.info(\"Saving model checkpoint to %s\", output_dir)\n"," model_to_save = (\n"," model.module if hasattr(model, \"module\") else model\n"," )\n"," model_to_save.save_pretrained(output_dir)\n"," tokenizer.save_pretrained(output_dir)\n"," torch.save(config, os.path.join(output_dir, 
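# here the plain config dict is saved in place of argparse training args\n","            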
\"training_args.bin\"))\n","\n"," model = model_class.from_pretrained(output_dir)\n"," tokenizer = tokenizer_class.from_pretrained(output_dir)\n"," model.to(config['device'])\n","\n"," # Evaluation\n"," results = {}\n"," if config.get('do_eval', False) and config.get('local_rank', -1) in [-1, 0]:\n"," checkpoints = [output_dir]\n"," if config.get('eval_all_checkpoints', False):\n"," checkpoints = list(\n"," os.path.dirname(c) for c in sorted(glob.glob(output_dir + \"/**/\" + WEIGHTS_NAME, recursive=True))\n"," )\n"," logging.getLogger(\"transformers.modeling_utils\").setLevel(logging.WARN)\n"," logger.info(\"Evaluate the following checkpoints: %s\", checkpoints)\n"," for checkpoint in checkpoints:\n"," global_step = checkpoint.split(\"-\")[-1] if len(checkpoints) > 1 else \"\"\n"," prefix = checkpoint.split(\"/\")[-1] if checkpoint.find(\"checkpoint\") != -1 else \"\"\n","\n"," model = model_class.from_pretrained(checkpoint)\n"," model.to(config['device'])\n"," result = evaluate(config, model, tokenizer, prefix=prefix)\n"," result = dict((k + \"_{}\".format(global_step), v) for k, v in result.items())\n"," results.update(result)\n","\n"," return results\n","\n","# Example configuration dictionary\n","config = {\n"," 'line_by_line': True,\n"," 'should_continue': False,#use if you have a checkpoint present or it will throw error\n"," 'mlm': True,\n"," 'mlm_probability': 0.15,\n"," 'config_name': None,\n"," 'tokenizer_name': None,\n"," 'cache_dir': None,\n"," 'block_size': 512,\n"," 'do_train': True,\n"," 'do_eval': True,\n"," 'evaluate_during_training': True,\n"," 'per_gpu_train_batch_size': 175,\n"," 'per_gpu_eval_batch_size': 25,\n"," 'gradient_accumulation_steps': 1,\n"," 'learning_rate': 4e-4,\n"," 'weight_decay': 0.01,\n"," 'adam_epsilon': 1e-6,\n"," 'beta1': 0.9,\n"," 'beta2': 0.98,\n"," 'max_grad_norm': 1.0,\n"," 'num_train_epochs': 2000,\n"," 'max_steps': -1,\n"," 'warmup_steps': 100,\n"," 'logging_steps': 200,\n"," 'save_steps': 1000,\n"," 'save_total_limit': 10,\n"," 'eval_all_checkpoints': False,\n"," 'no_cuda': False,\n"," 'overwrite_output_dir': True,\n"," 'overwrite_cache': False,\n"," 'seed': 42,\n"," 'n_process': 1,\n"," 'fp16': False,\n"," 'fp16_opt_level': 'O1',\n"," 'local_rank': -1,\n"," 'server_ip': '',\n"," 'server_port': '',\n"," 'output_dir': './output',\n"," 'device':'cuda'\n","}\n","\n","if __name__ == \"__main__\":\n"," main(config)\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"gpu","dataSources":[{"datasetId":5477436,"sourceId":9095316,"sourceType":"datasetVersion"}],"dockerImageVersionId":30732,"isGpuEnabled":true,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.13"}},"nbformat":4,"nbformat_minor":4}