Pradeep Kumar
committed
Delete create_xlnet_pretraining_data.py
create_xlnet_pretraining_data.py  +0  -721
create_xlnet_pretraining_data.py
DELETED
@@ -1,721 +0,0 @@
# Copyright 2024 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Create LM TF examples for XLNet."""

import dataclasses
import json
import math
import os

import random
from typing import Iterable, Mapping, List, Optional, Tuple
import unicodedata

# Import libraries

from absl import app
from absl import flags
from absl import logging

import numpy as np
import tensorflow as tf, tf_keras

from official.nlp.tools import tokenization

special_symbols = {
    "<unk>": 0,
    "<s>": 1,
    "</s>": 2,
    "<cls>": 3,
    "<sep>": 4,
    "<pad>": 5,
    "<mask>": 6,
    "<eod>": 7,
    "<eop>": 8,
}

FLAGS = flags.FLAGS

flags.DEFINE_integer("seq_length", 512,
                     help="Sequence length.")
flags.DEFINE_integer("reuse_length", 256,
                     help="Number of tokens that can be reused as memory. "
                     "Could be half of `seq_len`.")
flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")
flags.DEFINE_string(
    "save_dir", None,
    "Directory for saving processed data.")
flags.DEFINE_string("sp_model_file", "",
                    "The path to the model used by sentence piece tokenizer.")
flags.DEFINE_bool("use_eod_token", True,
                  "Whether or not to include EOD tokens.")
flags.DEFINE_bool("bi_data", True, "Whether or not to use bi-directional data.")
flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")
flags.DEFINE_integer("per_host_batch_size", 32, "Batch size per host.")
flags.DEFINE_integer("num_cores_per_host", 16,
                     "The number of (TPU) cores per host.")
flags.DEFINE_string("prefix", "", "Filename prefix.")
flags.DEFINE_string("suffix", "", "Filename suffix.")

flags.DEFINE_integer("task_id", None,
                     "The id of the current task.")
flags.DEFINE_integer("num_tasks", None,
                     "The total number of tasks.")
flags.DEFINE_integer("num_passes", 1, "The number of times to run the script.")


@dataclasses.dataclass
class TrainingInstance:
  """Representation of a single XLNet Pretraining instance."""
  data: Iterable[int]
  segment_ids: Iterable[int]
  boundary_indices: Iterable[int]
  label: int

  def to_feature(self) -> Mapping[str, tf.train.Feature]:
    feat = lambda x: tf.train.Feature(int64_list=tf.train.Int64List(value=x))
    return dict(
        input_word_ids=feat(self.data),
        input_type_ids=feat(self.segment_ids),
        boundary_indices=feat(self.boundary_indices),
        label=feat([self.label]))

  def to_example(self) -> tf.train.Example:
    return tf.train.Example(
        features=tf.train.Features(feature=self.to_feature()))

  def __str__(self):
    def seq_to_str(seq):
      return " ".join([str(x) for x in seq])

    s = ""
    s += "tokens: %s\n" % seq_to_str(self.data)
    s += "segment_ids: %s\n" % seq_to_str(self.segment_ids)
    s += "boundary_indices: %s\n" % seq_to_str(self.boundary_indices)
    s += "label: %s\n" % self.label
    s += "\n"
    return s

  def __repr__(self):
    return self.__str__()


def _preprocess_line(line: str, do_lower_case: bool = False) -> str:
  """Preprocesses an individual raw text line.

  This function will:
    - Remove extraneous spaces.
    - Replace `` with ", and '' with ".
    - Replace accents.
    - Apply lower casing.

  Args:
    line: The input line to preprocess.
    do_lower_case: Whether or not to lower case the text.

  Returns:
    The preprocessed line.

  """
  line = " ".join(line.split())
  line = line.replace("``", "\"").replace("''", "\"")

  # Replace accents.
  line = unicodedata.normalize("NFKD", line)
  line = "".join([c for c in line if not unicodedata.combining(c)])

  if do_lower_case:
    line = line.lower()
  return line


def preprocess_and_tokenize_input_files(
    input_files: Iterable[str],
    tokenizer: tokenization.FullSentencePieceTokenizer,
    use_eod: bool = True,
    do_lower_case: bool = False,
    log_example_freq: int = 100000) -> List[Tuple[np.array, np.array]]:
  """Preprocesses and encodes raw text from input files.

  This function preprocesses raw text and encodes it into tokens using a
  `SentencePieceModel` tokenization method. This also provides the sentence
  indicator for each token.

  Args:
    input_files: The list of input file names.
    tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
    use_eod: Whether or not to use an EOD indicator. If `False`, then EOD is
      not included.
    do_lower_case: Whether or not to apply lower casing during raw text
      preprocessing.
    log_example_freq: The optional field for how many lines to process before
      emitting an info log.

  Returns:
    The preprocessed list. Each entry in the list is a tuple consisting of
    the token IDs and the sentence IDs.

  """
  all_data = []
  eod_symbol = special_symbols["<eod>"]

  total_number_of_lines = 0

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    line_count = 0
    logging.info("Preprocessing %s", input_file)

    all_tokens = []
    all_sentence_ids = []

    sentence_id = True

    with tf.io.gfile.GFile(input_file, "rb") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break

        line_count += 1
        if line_count % log_example_freq == 0:
          logging.info("Loading line %d", line_count)

        line = line.strip()

        if not line:
          if use_eod:
            token_ids = [eod_symbol]
            sentence_id = not sentence_id
          else:
            continue
        else:
          preprocessed_line = _preprocess_line(
              line=line, do_lower_case=do_lower_case)
          token_ids = tokenization.encode_ids(
              sp_model=tokenizer.sp_model, text=preprocessed_line)

        all_tokens.extend(token_ids)
        all_sentence_ids.extend([sentence_id] * len(token_ids))
        sentence_id = not sentence_id
    logging.info("Finished processing %s. Number of lines: %d",
                 input_file, line_count)
    if line_count == 0:
      continue
    total_number_of_lines += line_count
    all_tokens = np.array(all_tokens, dtype=np.int64)
    all_sentence_ids = np.array(all_sentence_ids, dtype=bool)
    all_data.append((all_tokens, all_sentence_ids))

  logging.info("Completed text preprocessing. Total number of lines: %d",
               total_number_of_lines)
  return all_data


def _reshape_to_batch_dimensions(
    tokens: np.array,
    sentence_ids: np.array,
    per_host_batch_size: int) -> Tuple[np.array, np.array]:
  """Truncates and reshapes input data with a batch major dimension.

  Args:
    tokens: The input token ids. This should have the same shape as
      `sentence_ids`.
    sentence_ids: The input sentence ids. This should have the same shape as
      `token_ids`.
    per_host_batch_size: The target per-host batch size.

  Returns:
    The tuple of reshaped tokens and sentence_ids.
  """
  num_steps = len(tokens) // per_host_batch_size
  truncated_data_length = num_steps * per_host_batch_size

  logging.info("per_host_batch_size: %d", per_host_batch_size)
  logging.info("num_steps: %d", num_steps)
  def truncate_and_reshape(a):
    return a[:truncated_data_length].reshape((per_host_batch_size, num_steps))

  return (truncate_and_reshape(tokens), truncate_and_reshape(sentence_ids))


def _create_a_and_b_segments(
    tokens: np.array,
    sentence_ids: np.array,
    begin_index: int,
    total_length: int,
    no_cut_probability: float = 0.5):
  """Splits segments A and B from a single instance of tokens and sentence ids.

  Args:
    tokens: The 1D input token ids. This represents an individual entry within a
      batch.
    sentence_ids: The 1D input sentence ids. This represents an individual entry
      within a batch. This should be the same length as `tokens`.
    begin_index: The reference beginning index to split data.
    total_length: The target combined length of segments A and B.
    no_cut_probability: The probability of not cutting a segment despite
      a cut possibly existing.

  Returns:
    A tuple consisting of A data, B data, and label.

  """
  data_length = tokens.shape[0]
  if begin_index + total_length >= data_length:
    logging.info("[_create_segments]: begin_index %d + total_length %d >= "
                 "data_length %d", begin_index, total_length, data_length)
    return None

  end_index = begin_index + 1
  cut_indices = []

  # Identify all indices where sentence IDs change from one to the next.
  while end_index < data_length:
    if sentence_ids[end_index] != sentence_ids[end_index - 1]:
      if end_index - begin_index >= total_length:
        break
      cut_indices.append(end_index)
    end_index += 1

  a_begin = begin_index

  if not cut_indices or random.random() < no_cut_probability:
    # Segments A and B are contained within the same sentence.
    label = 0
    if not cut_indices:
      a_end = end_index
    else:
      a_end = random.choice(cut_indices)
    b_length = max(1, total_length - (a_end - a_begin))
    b_begin = random.randint(0, data_length - 1 - b_length)
    b_end = b_begin + b_length

    while b_begin > 0 and sentence_ids[b_begin - 1] == sentence_ids[b_begin]:
      b_begin -= 1
    while (b_end < data_length - 1 and
           sentence_ids[b_end - 1] == sentence_ids[b_end]):
      b_end += 1
  else:
    # Segments A and B are different sentences.
    label = 1
    a_end = random.choice(cut_indices)
    b_begin = a_end
    b_end = end_index

  while a_end - a_begin + b_end - b_begin > total_length:
    if a_end - a_begin > b_end - b_begin:
      # Delete only the right side for the LM objective.
      a_end -= 1
    else:
      b_end -= 1
  if a_end >= data_length or b_end >= data_length:
    logging.info("[_create_segments]: a_end %d or b_end %d >= data_length %d",
                 a_end, b_end, data_length)
    return None

  a_data = tokens[a_begin: a_end]
  b_data = tokens[b_begin: b_end]
  return a_data, b_data, label


def _is_functional_piece(piece: str) -> bool:
  return piece != "<unk>" and piece.startswith("<") and piece.endswith(">")


def _is_start_piece(piece: str) -> bool:
  special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~'))
  if (piece.startswith("▁") or piece in special_pieces):
    return True
  else:
    return False


def _get_boundary_indices(
    data: np.array,
    tokenizer: tokenization.FullSentencePieceTokenizer) -> np.array:
  """Gets the boundary indices of whole words."""
  seq_length = len(data)
  boundary_indices = []
  for index, piece in enumerate(tokenizer.convert_ids_to_tokens(data.tolist())):
    if _is_start_piece(piece) and not _is_functional_piece(piece):
      boundary_indices.append(index)
  boundary_indices.append(seq_length)
  return boundary_indices


def _convert_tokens_to_instances(
    tokens: np.array,
    sentence_ids: np.array,
    per_host_batch_size: int,
    seq_length: int,
    reuse_length: int,
    bi_data: bool,
    tokenizer: tokenization.FullSentencePieceTokenizer,
    num_cores_per_host: int = 0,
    logging_frequency: int = 500) -> List[TrainingInstance]:
  """Converts tokens and sentence IDs into individual training instances.

  The format of data in the XLNet pretraining task is very similar to the
  BERT pretraining task. Two segments A and B are randomly sampled, and the
  concatenation of A and B into a single sequence is used to perform
  language modeling.

  To create an XLNet Pretraining instance from a single long sequence, S:
  - Create a segment of length `reuse_length`. This first segment represents
    past tokens. During modeling, this segment is used to cache obtained
    content representations for the segment recurrence mechanism.
  - Similar to BERT, create a segment of length `seq_length` - `reuse_length`
    composed of A and B segments.
    For XLNet, the order is "A", "SEP", "B", "SEP", "CLS".

  Args:
    tokens: All tokens concatenated into a single list.
    sentence_ids: All sentence IDs concatenated into a single list.
    per_host_batch_size: The target batch size per host.
    seq_length: The max sequence length.
    reuse_length: The number of tokens to use from the previous segment.
    bi_data: Whether or not to use bidirectional data.
    tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
    num_cores_per_host: The number of cores per host. This is required if
      `bi_data` = `True`.
    logging_frequency: The frequency at which to log status updates.

  Returns:
    A list of `TrainingInstance` objects.
  """
  instances = []

  per_core_batch_size = (per_host_batch_size // num_cores_per_host
                         if bi_data else None)

  if bi_data:
    logging.info("Bi-directional data enabled.")
    assert per_host_batch_size % (2 * num_cores_per_host) == 0
    forward_tokens, forward_sentence_ids = _reshape_to_batch_dimensions(
        tokens=tokens,
        sentence_ids=sentence_ids,
        per_host_batch_size=per_host_batch_size // 2)
    forward_data_shape = (num_cores_per_host, 1, per_core_batch_size // 2, -1)

    forward_tokens = forward_tokens.reshape(forward_data_shape)
    forward_sentence_ids = forward_sentence_ids.reshape(forward_data_shape)

    backwards_tokens = forward_tokens[:, :, :, ::-1]
    backwards_sentence_ids = forward_sentence_ids[:, :, :, ::-1]

    tokens = np.concatenate([forward_tokens, backwards_tokens], 1).reshape(
        per_host_batch_size, -1)
    sentence_ids = np.concatenate(
        [forward_sentence_ids, backwards_sentence_ids]).reshape(
            per_host_batch_size, -1)
  else:
    logging.info("Bi-directional data disabled.")
    tokens, sentence_ids = _reshape_to_batch_dimensions(
        tokens=tokens,
        sentence_ids=sentence_ids,
        per_host_batch_size=per_host_batch_size)

  logging.info("Tokens shape: %s", tokens.shape)

  data_length = tokens.shape[1]
  sep = np.array([special_symbols["<sep>"]], dtype=np.int64)
  cls = np.array([special_symbols["<cls>"]], dtype=np.int64)
  # 2 sep, 1 cls
  num_special_tokens = 3

  data_index = 0
  batch_number = 0
  step_size = reuse_length if reuse_length else seq_length
  num_batches = math.ceil(data_length / step_size)

  while data_index + seq_length <= data_length:
    if batch_number % logging_frequency == 0:
      logging.info("Processing batch %d of %d", batch_number, num_batches)

    for batch_index in range(per_host_batch_size):
      previous_segment_tokens = tokens[
          batch_index, data_index: data_index + reuse_length]

      results = _create_a_and_b_segments(
          tokens=tokens[batch_index],
          sentence_ids=sentence_ids[batch_index],
          begin_index=data_index + reuse_length,
          total_length=seq_length - reuse_length - num_special_tokens)

      if results is None:
        logging.info("Stopping at data index: %d", data_index)
        break
      a_data, b_data, label = results

      data = np.concatenate(
          [previous_segment_tokens, a_data, sep, b_data, sep, cls])
      a_length = a_data.shape[0]
      b_length = b_data.shape[0]
      segment_ids = ([0] * (reuse_length + a_length) + [0]
                     + [1] * b_length + [1] + [2])
      boundary_indices = _get_boundary_indices(tokenizer=tokenizer,
                                               data=data)
      assert len(data) == seq_length
      assert len(segment_ids) == seq_length
      assert len(boundary_indices) > 0  # pylint: disable=g-explicit-length-test

      instances.append(TrainingInstance(
          data=data,
          segment_ids=segment_ids,
          boundary_indices=boundary_indices,
          label=label))
    batch_number += 1
    data_index += step_size
  return instances


def write_instances_to_tfrecord(
    instances: Iterable[TrainingInstance],
    save_path: str):
  """Writes instances to TFRecord."""
  record_writer = tf.io.TFRecordWriter(save_path)
  logging.info("Start writing to %s.", save_path)

  for i, instance in enumerate(instances):
    if i < 5:
      logging.info("Instance %d: %s", i, str(instance))
    record_writer.write(instance.to_example().SerializeToString())

  record_writer.close()
  logging.info("Done writing %s.", save_path)


def shuffle_and_combine_preprocessed_data(
    all_data: List[Tuple[np.array, np.array]]) -> Tuple[np.array, np.array]:
  """Shuffles and combines preprocessed token/sentence IDs from documents."""
  document_permutation = np.random.permutation(len(all_data))

  previous_sentence_id = None

  all_tokens, all_sentence_ids = [], []
  for document_index in document_permutation:
    tokens, sentence_ids = all_data[document_index]
    # pylint: disable=g-explicit-length-test
    if len(tokens) == 0:
      continue
    if (previous_sentence_id is not None and
        sentence_ids[0] == previous_sentence_id):
      sentence_ids = np.logical_not(sentence_ids)

    all_tokens.append(tokens)
    all_sentence_ids.append(sentence_ids)

    previous_sentence_id = sentence_ids[-1]

  return np.concatenate(all_tokens), np.concatenate(all_sentence_ids)


def get_tfrecord_name(
    per_host_batch_size: int,
    num_cores_per_host: int,
    seq_length: int,
    bi_data: bool,
    reuse_length: int,
    do_lower_case: bool,
    use_eod_token: bool,
    prefix: str = "",
    suffix: str = "",
    pass_id: int = 0,
    num_passes: int = 1,
    task_id: int = None,
    num_tasks: int = None) -> str:
  """Formats the resulting TFRecord name based on provided inputs."""
  components = []
  if prefix:
    components.append(prefix)
  components.append("seqlen-{}".format(seq_length))
  if reuse_length == 0:
    components.append("memless")
  else:
    components.append("reuse-{}".format(reuse_length))
  components.append("bs-{}".format(per_host_batch_size))
  components.append("cores-{}".format(num_cores_per_host))

  if do_lower_case:
    components.append("uncased")
  else:
    components.append("cased")
  if use_eod_token:
    components.append("eod")
  if bi_data:
    components.append("bi")
  else:
    components.append("uni")

  if suffix:
    components.append(suffix)

  s = "_".join(components) + ".tfrecord"
  if num_passes == 1 and task_id is None:
    return s

  if task_id is None:
    num_tasks = 1
    task_id = 0

  current_shard = task_id * num_passes + pass_id
  total_shards = num_tasks * num_passes
  return s + "-{}-of-{}".format(current_shard, total_shards)


def create_tfrecords(
    tokenizer: tokenization.FullSentencePieceTokenizer,
    input_file_or_files: str,
    use_eod_token: bool,
    do_lower_case: bool,
    per_host_batch_size: int,
    seq_length: int,
    reuse_length: int,
    bi_data: bool,
    num_cores_per_host: int,
    save_dir: str,
    prefix: str = "",
    suffix: str = "",
    num_tasks: Optional[int] = None,
    task_id: Optional[int] = None,
    num_passes: int = 1):
  """Runs the end-to-end preprocessing pipeline."""

  logging.info("Input configuration:")
  logging.info("input file(s): %s", input_file_or_files)
  logging.info("use_eod_token: %s", use_eod_token)
  logging.info("do_lower_case: %s", do_lower_case)
  logging.info("per_host_batch_size: %d", per_host_batch_size)
  logging.info("seq_length: %d", seq_length)
  logging.info("reuse_length: %d", reuse_length)
  logging.info("bi_data: %s", bi_data)
  logging.info("num_cores_per_host: %d", num_cores_per_host)
  logging.info("save_dir: %s", save_dir)
  if task_id is not None and num_tasks is not None:
    logging.info("task_id: %d", task_id)
    logging.info("num_tasks: %d", num_tasks)

  input_files = []
  for input_pattern in input_file_or_files.split(","):
    input_files.extend(tf.io.gfile.glob(input_pattern))

  logging.info("*** Reading from input files ***")
  for input_file in input_files:
    logging.info("  %s", input_file)

  logging.info("Shuffling the files with a fixed random seed.")
  np.random.shuffle(input_files)
  if num_tasks is not None:
    assert task_id is not None
    logging.info("Total number of input files: %d", len(input_files))
    logging.info("Splitting into %d shards of %d files each.",
                 num_tasks, len(input_files) // num_tasks)
    input_files = input_files[task_id::num_tasks]

  all_data = preprocess_and_tokenize_input_files(
      input_files=input_files,
      tokenizer=tokenizer,
      use_eod=use_eod_token,
      do_lower_case=do_lower_case)
  for pass_id in range(num_passes):
    logging.info("Beginning pass %d of %d", pass_id, num_passes)
    tokens, sentence_ids = shuffle_and_combine_preprocessed_data(all_data)

    assert len(tokens) == len(sentence_ids)

    filename = get_tfrecord_name(
        per_host_batch_size=per_host_batch_size,
        num_cores_per_host=num_cores_per_host,
        seq_length=seq_length,
        bi_data=bi_data,
        use_eod_token=use_eod_token,
        reuse_length=reuse_length,
        do_lower_case=do_lower_case,
        prefix=prefix,
        suffix=suffix,
        pass_id=pass_id,
        num_passes=num_passes,
        num_tasks=num_tasks,
        task_id=task_id)
    save_path = os.path.join(save_dir, filename)
    if os.path.exists(save_path):
      # If the path already exists, then we were probably preempted but
      # previously wrote this file.
      logging.info("%s already exists, skipping this batch.", save_path)
    else:
      instances = _convert_tokens_to_instances(
          tokenizer=tokenizer,
          tokens=tokens,
          sentence_ids=sentence_ids,
          per_host_batch_size=per_host_batch_size,
          seq_length=seq_length,
          reuse_length=reuse_length,
          bi_data=bi_data,
          num_cores_per_host=num_cores_per_host)
      write_instances_to_tfrecord(instances=instances, save_path=save_path)

  if task_id is None or task_id == 0:
    corpus_info = {
        "vocab_size": 32000,
        "per_host_batch_size": per_host_batch_size,
        "num_cores_per_host": num_cores_per_host,
        "seq_length": seq_length,
        "reuse_length": reuse_length,
        "do_lower_case": do_lower_case,
        "bi_data": bi_data,
        "use_eod_token": use_eod_token,
    }
    corpus_fname = os.path.basename(filename) + ".json"
    corpus_destination = os.path.join(save_dir, corpus_fname)
    logging.info("Saving corpus info to %s", corpus_destination)

    with tf.io.gfile.GFile(corpus_destination, "w") as fp:
      json.dump(corpus_info, fp)


def main(_):
  tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file)
  create_tfrecords(
      tokenizer=tokenizer,
      input_file_or_files=FLAGS.input_file,
      use_eod_token=FLAGS.use_eod_token,
      do_lower_case=FLAGS.do_lower_case,
      per_host_batch_size=FLAGS.per_host_batch_size,
      seq_length=FLAGS.seq_length,
      reuse_length=FLAGS.reuse_length,
      bi_data=FLAGS.bi_data,
      num_cores_per_host=FLAGS.num_cores_per_host,
      save_dir=FLAGS.save_dir,
      prefix=FLAGS.prefix,
      suffix=FLAGS.suffix,
      num_tasks=FLAGS.num_tasks,
      task_id=FLAGS.task_id,
      num_passes=FLAGS.num_passes)


if __name__ == "__main__":
  np.random.seed(0)
  logging.set_verbosity(logging.INFO)
  app.run(main)
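
For reference, below is a minimal sketch (not part of the deleted file) of how a TFRecord written by write_instances_to_tfrecord could be read back. The feature names and int64 dtypes mirror TrainingInstance.to_feature(); the file path and the SEQ_LENGTH value are placeholders and would need to match the flags used when the data was generated.

# Hypothetical sketch: parse one record produced by this script.
import tensorflow as tf

SEQ_LENGTH = 512  # placeholder; must match the --seq_length used at generation time

# Feature spec mirrors TrainingInstance.to_feature(): all features are int64 lists.
feature_spec = {
    "input_word_ids": tf.io.FixedLenFeature([SEQ_LENGTH], tf.int64),
    "input_type_ids": tf.io.FixedLenFeature([SEQ_LENGTH], tf.int64),
    # Boundary indices vary per example (one entry per whole-word start).
    "boundary_indices": tf.io.VarLenFeature(tf.int64),
    "label": tf.io.FixedLenFeature([1], tf.int64),
}

dataset = tf.data.TFRecordDataset("path/to/output.tfrecord")  # placeholder path
for serialized in dataset.take(1):
  example = tf.io.parse_single_example(serialized, feature_spec)
  print(example["input_word_ids"].shape, int(example["label"][0]))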