In [1]:
# Show the installed transformers release; the recorded outputs in this
# notebook were produced with the version displayed below.
import transformers
transformers.__version__

'4.10.0'

In [2]:
from transformers import T5Config, T5Model, load_tf_weights_in_t5

In [4]:
# One-time download and unpacking of the pretrained checkpoint archive. The
# three commands are left commented out, so a fresh environment has to run
# them once before the rest of the notebook works.
# NOTE(review): the archive is named t5-base-2021-07-28 but the directory used
# below is t5-base-v2 -- presumably the archive unpacks to that name; confirm.
# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/t5-base-2021-07-28.tar.gz
# !tar -zxf t5-base-2021-07-28.tar.gz
# !rm t5-base-2021-07-28.tar.gz
# List the checkpoint files that the weight-loading cell reads from.
!ls t5-base-v2

checkpoint                            model.ckpt-759900.index
model.ckpt-759900.data-00000-of-00002 model.ckpt-759900.meta
model.ckpt-759900.data-00001-of-00002 operative_config.gin


In [5]:
# Describe the architecture of the checkpoint being converted. Arguments are
# grouped by purpose; keyword order does not matter to T5Config.
config = T5Config(
    # Model dimensions.
    d_model=768,
    d_kv=64,
    d_ff=3072,
    num_heads=12,
    num_layers=12,
    # Vocabulary size and sequence-length settings.
    vocab_size=32128,
    n_positions=1024,
    inputs_length=1024,
    # Regularisation.
    dropout_rate=0.1,
    # Special token ids.
    decoder_start_token_id=0,
    eos_token_id=1,
    pad_token_id=0,
)
print(config)
# Store the configuration in the working directory so later cells can reload
# it with from_pretrained('./').
config.save_pretrained('./')

T5Config {
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "gradient_checkpointing": false,
  "initializer_factor": 1.0,
  "inputs_length": 1024,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 1024,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.10.0",
  "use_cache": true,
  "vocab_size": 32128
}



In [6]:
# Instantiate a T5Model from the config, then fill it with the weights stored
# in the TensorFlow checkpoint (path is the checkpoint prefix, without the
# .index/.data/.meta suffixes).
model = T5Model(config)
# The call is the cell's last expression, so its return value is displayed;
# that is where the module tree in the output below comes from.
load_tf_weights_in_t5(model, config, 't5-base-v2/model.ckpt-759900')

T5Model(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dropout(p=0.1, inplac

In [7]:
# Display the file-name constants exported by transformers; WEIGHTS_NAME is
# used in the next cell to name the saved state dict.
from transformers import CONFIG_NAME, WEIGHTS_NAME
CONFIG_NAME, WEIGHTS_NAME

('config.json', 'pytorch_model.bin')

In [8]:
import torch

# Write the converted parameters to the working directory under the file name
# held in WEIGHTS_NAME.
torch.save(model.state_dict(), f'./{WEIGHTS_NAME}')

In [9]:
from transformers import T5Config, T5Model, T5Tokenizer

In [10]:
# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.ms-en.model

In [11]:
# Build the tokenizer from the downloaded vocabulary model file and write its
# files next to the model; the saved file paths are displayed as the cell output.
tokenizer = T5Tokenizer(vocab_file='sp10m.cased.ms-en.model')
tokenizer.save_pretrained(save_directory='./')

('./tokenizer_config.json',
 './special_tokens_map.json',
 './spiece.model',
 './added_tokens.json')

In [12]:
# Reload the tokenizer from the files written to the working directory, which
# checks that the saved files are usable on their own.
# NOTE(review): `lower` is not among the arguments T5Tokenizer documents --
# confirm it has any effect here, or whether it can be dropped.
tokenizer = T5Tokenizer.from_pretrained('./', lower = False)

In [13]:
# Reload the configuration from the working directory rather than reusing the
# in-memory object, so the saved file is what drives the steps below.
config = T5Config.from_pretrained('./')

In [14]:
# Load the model directly from the saved state-dict file; the config is passed
# explicitly because the path points at the weights file, not a model directory.
model = T5Model.from_pretrained('./pytorch_model.bin', config = config)

In [15]:
# Re-save through the transformers API so the working directory holds a model
# that from_pretrained('./') can load (done in the cell that follows).
model.save_pretrained('./')

In [16]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [17]:
# Load the saved weights into the text-generation variant of T5 so that
# generate() can be used for the smoke tests below.
# NOTE(review): the weights were saved from a plain T5Model; presumably the
# output head is tied to the shared embedding so nothing is left randomly
# initialised -- confirm (the generations below do look sensible).
model = T5ForConditionalGeneration.from_pretrained('./')

In [18]:
# Smoke test 1: prompt with the 'soalan:' prefix (presumably the
# question-answering task prefix -- confirm against the training setup).
input_ids = tokenizer.encode(
    'soalan: siapakah perdana menteri malaysia?',
    return_tensors='pt',
)
outputs = model.generate(input_ids)
# The decoded text keeps special tokens such as <pad> and </s>.
tokenizer.decode(outputs[0])

'<pad> Mahathir Mohamad</s>'

In [19]:
# Smoke test 2: prompt with the 'terjemah Inggeris ke Melayu:' prefix
# (English-to-Malay translation).
# NOTE(review): the recorded output stops mid-sentence with no </s>;
# generate() is called without a length argument, so the library's default
# limit applies (20 tokens in transformers 4.10 -- confirm). Pass max_length
# to see the full translation.
input_ids = tokenizer.encode(
    'terjemah Inggeris ke Melayu: PETALING JAYA: Former prime minister Najib Razak has questioned whether the government knows how to manage the Covid-19 pandemic, outlining several seemingly contradictory announcements it has made.',
    return_tensors='pt',
)
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0])

'<pad> PETALING JAYA: Bekas perdana menteri, Najib Razak, mempersoalkan sama ada kerajaan tahu bagaimana menguruskan wabak'

In [20]:
# Smoke test 3: prompt with the 'terjemah Melayu ke Inggeris:' prefix
# (Malay-to-English translation).
# NOTE(review): the recorded output ends mid-sentence with no </s>, which
# suggests the default generation length limit was hit; pass max_length to
# generate() for the complete translation.
input_ids = tokenizer.encode(
    'terjemah Melayu ke Inggeris: PETALING JAYA: Pertemuan bekas Perdana Menteri, Datuk Seri Najib Tun Razak dan Timbalan Perdana Menteri, Datuk Seri Ismail Sabri Yaakob hari ini adalah bagi membincangkan isu berkaitan hala tuju dan dasar negara.',
    return_tensors='pt',
)
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0])

'<pad> PETALING JAYA: Former Prime Minister Najib Tun Razak and Deputy Prime Minister Ismail Sabri Yaakob today discussed'

In [21]:
# Smoke test 4: prompt with the 'grafik pengetahuan:' prefix (presumably the
# knowledge-graph task -- confirm against the training setup).
# NOTE(review): the recorded output has no closing </s> and looks cut off,
# which suggests the default generation length limit was hit.
input_ids = tokenizer.encode(
    'grafik pengetahuan: Keuskupan Agung Katolik Rom Maracaibo terletak di barat daya Keuskupan Katolik Rom Machiques.',
    return_tensors='pt',
)
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0])

'<pad> Roman Catholic Archdiocese of Maracaibo shares border with Roman Catholic Diocese'

In [22]:
# Delete the TensorFlow checkpoint directory now that the converted files
# have been saved to the working directory.
# NOTE(review): after this, re-running the notebook needs the download
# commands (currently commented out) to be re-enabled first.
!rm -rf t5-base-v2