File size: 4,602 Bytes

e9ab399

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "58d45708",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e0314358",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model.ckpt-320000.data-00000-of-00001  model.ckpt-320000.meta\r\n",
      "model.ckpt-320000.index\r\n"
     ]
    }
   ],
   "source": [
    "# One-time setup, left commented out: extract the checkpoint archive, then delete the archive.\n",
    "# !tar -zxf xlnet-large-2021-09-06.tar.gz\n",
    "# !rm xlnet-large-2021-09-06.tar.gz\n",
    "# List the contents of xlnet-large/ (the model.ckpt-320000.* files shown in this cell's output).\n",
    "!ls xlnet-large"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "59d2c8b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time download, left commented out: fetch the sp10m.cased.v9 .vocab and .model files\n",
    "# from the malaya repository's pretrained-model/xlnet/tokenizer directory.\n",
    "# The .model file is the one handed to XLNetTokenizer later in this notebook.\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/xlnet/tokenizer/sp10m.cased.v9.vocab\n",
    "# !wget https://raw.githubusercontent.com/huseinzol05/malaya/master/pretrained-model/xlnet/tokenizer/sp10m.cased.v9.model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f35e09f4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./tokenizer_config.json',\n",
       " './special_tokens_map.json',\n",
       " './spiece.model',\n",
       " './added_tokens.json')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the tokenizer directly from the sp10m.cased.v9.model file, with lower-casing disabled.\n",
    "tokenizer = XLNetTokenizer('sp10m.cased.v9.model', do_lower_case = False)\n",
    "# Write the tokenizer files into the current directory; the returned paths are the cell output.\n",
    "tokenizer.save_pretrained('./')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4438ff5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Architecture settings for the checkpoint. They are serialised to config.json,\n",
    "# the file the (commented-out) `transformers-cli convert` command takes via --config.\n",
    "config = dict(\n",
    "    d_head = 64,\n",
    "    d_inner = 4096,\n",
    "    d_model = 1024,\n",
    "    ff_activation = 'gelu',\n",
    "    n_head = 16,\n",
    "    n_layer = 20,\n",
    "    n_token = 32000,\n",
    "    untie_r = True,\n",
    ")\n",
    "\n",
    "# Serialise first, then write the text in one go.\n",
    "with open('config.json', 'w') as config_file:\n",
    "    config_file.write(json.dumps(config))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a265f23c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-time conversion, left commented out: convert the TF checkpoint xlnet-large/model.ckpt-320000\n",
    "# using config.json for the architecture, dumping the PyTorch output into the current directory.\n",
    "# !transformers-cli convert --model_type xlnet \\\n",
    "#   --tf_checkpoint xlnet-large/model.ckpt-320000 \\\n",
    "#   --config config.json \\\n",
    "#   --pytorch_dump_output ./"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "22b94055",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the architecture from config.json. XLNetConfig's first positional parameter is\n",
    "# vocab_size, so XLNetConfig('./config.json') would store the path string as the vocabulary\n",
    "# size instead of reading the file; from_json_file actually parses it.\n",
    "config = XLNetConfig.from_json_file('./config.json')\n",
    "\n",
    "# Pin the key hyper-parameters explicitly so they hold regardless of what config.json contains.\n",
    "config.vocab_size = 32000\n",
    "config.d_inner = 4096\n",
    "config.d_model = 1024\n",
    "config.n_head = 16\n",
    "config.n_layer = 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "17c6d447",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at ./ were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']\n",
      "- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "# Load the weights stored in the current directory into an XLNetModel, using the `config` object built earlier.\n",
    "# The recorded stderr reports that lm_loss.weight / lm_loss.bias from the checkpoint were not used.\n",
    "model = XLNetModel.from_pretrained('./', config = config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d0fc0138",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the tokenizer from the current directory (where tokenizer.save_pretrained('./') wrote its files),\n",
    "# again with lower-casing disabled.\n",
    "tokenizer = XLNetTokenizer.from_pretrained('./',do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ec2c0661",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the loaded model back into the current directory, the same directory it was loaded from.\n",
    "model.save_pretrained('./')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}