clarkchan committed
Commit 4130c63
1 Parent(s): 03723ee
Files changed (1)
  1. tutorial.ipynb +28 -12
tutorial.ipynb CHANGED
@@ -5,7 +5,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## tokenizer"
+    "## tokenizer\n",
+    "\n",
+    "The tokenizer converts an input string into tokens, or into a sequence of ids."
    ]
   },
   {
@@ -14,48 +16,62 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Load the pretrained tokenizer\n",
     "from transformers import AutoTokenizer\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"liam168/c2-roberta-base-finetuned-dianping-chinese\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
+      "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n",
+      "['测', '试', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
      ]
     }
    ],
    "source": [
+    "# Convert strings to tokens\n",
     "input = \"这是中英文test语句,mix中英文及标点符号\"\n",
-    "result = tokenizer([input],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
-    "a = tokenizer.tokenize(input)\n",
-    "print(a)\n"
+    "input_array = [\"测试\",\"mix中英文及标点符号\"]\n",
+    "print(tokenizer.tokenize(input))\n",
+    "print(tokenizer.tokenize(input_array))\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'input_ids': tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
-      "        9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
+      "tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
+      "        9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n",
+      "tensor([[ 101, 3844, 6407, 102, 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016,\n",
+      "        1384, 102]])\n",
+      "['[CLS]', '这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号', '[SEP]']\n",
+      "['[CLS]', '测', '试', '[SEP]', 'mix', '中', '英', '文', '及', '标', '点', '符', '号', '[SEP]']\n"
      ]
     }
    ],
    "source": [
-    "#print(result.input_ids)\n",
-    "print(result)\n",
-    "tokenizer.convert_ids_to_tokens(result.input_ids)"
+    "# Convert strings to an id sequence\n",
+    "ids = tokenizer(input,padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
+    "print(ids.input_ids)\n",
+    "idss = tokenizer([input_array],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
+    "print(idss.input_ids)\n",
+    "\n",
+    "# Decode the id sequence back into tokens\n",
+    "print(tokenizer.convert_ids_to_tokens(ids.input_ids[0]))\n",
+    "print(tokenizer.convert_ids_to_tokens(idss.input_ids[0]))"
    ]
   }
  ],
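
Taken together, the updated cells form a full string -> tokens -> ids -> tokens round trip. For reference, a minimal sketch of that flow, assuming transformers with a PyTorch backend is installed; the variable names (text, encoding, tokens_back, text_back) are illustrative, and the final decode step is an extra not present in the committed cells:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "liam168/c2-roberta-base-finetuned-dianping-chinese"
)

text = "这是中英文test语句,mix中英文及标点符号"

# String -> tokens; no [CLS]/[SEP] are added at this step.
tokens = tokenizer.tokenize(text)

# String -> ids; calling the tokenizer adds the special tokens and,
# with return_tensors="pt", returns PyTorch tensors.
encoding = tokenizer(text, padding=True, truncation=True,
                     max_length=512, return_tensors="pt")

# Ids -> tokens; [CLS] and [SEP] reappear here, as in the committed output.
tokens_back = tokenizer.convert_ids_to_tokens(encoding.input_ids[0])

# Ids -> string (not shown in the commit); skip_special_tokens drops [CLS]/[SEP].
text_back = tokenizer.decode(encoding.input_ids[0], skip_special_tokens=True)

Note that for a BERT-style Chinese vocabulary, decode joins tokens with spaces, so text_back need not match the input byte-for-byte.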
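
One subtlety the new outputs surface: tokenizer([input_array], ...) wraps the two-string list in an outer batch, and the fast tokenizer then reads the inner list as a single text pair. That is why the second tensor has one row decoding to [CLS] 测 试 [SEP] mix ... [SEP] with two [SEP] markers, and why tokenizer.tokenize(input_array) returns one merged token list. A small sketch of the contrast, assuming the tokenizer loaded above:

pair = ["测试", "mix中英文及标点符号"]

# A plain list is a batch: two sequences encoded independently
# and padded to a common length -> shape [2, seq_len].
as_batch = tokenizer(pair, padding=True, truncation=True,
                     max_length=512, return_tensors="pt")
print(as_batch.input_ids.shape)

# Wrapping the list once more yields a batch containing ONE text pair
# -> shape [1, seq_len], matching the committed output.
as_pair = tokenizer([pair], padding=True, truncation=True,
                    max_length=512, return_tensors="pt")
print(as_pair.input_ids.shape)
print(tokenizer.convert_ids_to_tokens(as_pair.input_ids[0]))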