clarkchan committed on
Commit 03723ee · 1 Parent(s): 31ff37c

add decode

Files changed (1)
  1. tutorial.ipynb +7 -20
tutorial.ipynb CHANGED
@@ -20,7 +20,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 14,
+ "execution_count": 2,
  "metadata": {},
  "outputs": [
  {
@@ -40,35 +40,22 @@
  },
  {
  "cell_type": "code",
- "execution_count": 24,
+ "execution_count": 8,
  "metadata": {},
  "outputs": [
  {
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
- " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n"
- ]
- },
- {
- "ename": "TypeError",
- "evalue": "argument 'ids': 'dict' object cannot be converted to 'Sequence'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[24], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mprint\u001b[39m(result\u001b[39m.\u001b[39minput_ids)\n\u001b[0;32m----> 2\u001b[0m ids \u001b[39m=\u001b[39m tokenizer\u001b[39m.\u001b[39;49mdecode(result)\n\u001b[1;32m 3\u001b[0m tokenizer\u001b[39m.\u001b[39mdecode(ids)\n",
- "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3471\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 3468\u001b[0m \u001b[39m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m 3469\u001b[0m token_ids \u001b[39m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3471\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_decode(\n\u001b[1;32m 3472\u001b[0m token_ids\u001b[39m=\u001b[39;49mtoken_ids,\n\u001b[1;32m 3473\u001b[0m skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens,\n\u001b[1;32m 3474\u001b[0m clean_up_tokenization_spaces\u001b[39m=\u001b[39;49mclean_up_tokenization_spaces,\n\u001b[1;32m 3475\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 3476\u001b[0m )\n",
- "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:551\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(token_ids, \u001b[39mint\u001b[39m):\n\u001b[1;32m 550\u001b[0m token_ids \u001b[39m=\u001b[39m [token_ids]\n\u001b[0;32m--> 551\u001b[0m text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_tokenizer\u001b[39m.\u001b[39;49mdecode(token_ids, skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens)\n\u001b[1;32m 553\u001b[0m \u001b[39mif\u001b[39;00m clean_up_tokenization_spaces:\n\u001b[1;32m 554\u001b[0m clean_text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclean_up_tokenization(text)\n",
- "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'dict' object cannot be converted to 'Sequence'"
+ "{'input_ids': tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
+ " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
  ]
  }
  ],
  "source": [
- "print(result.input_ids)\n",
- "ids = tokenizer.decode(result)\n",
- "tokenizer.decode(ids)"
+ "#print(result.input_ids)\n",
+ "print(result)\n",
+ "tokenizer.convert_ids_to_tokens(result.input_ids)"
  ]
  }
  ],
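The removed cell fails because result is the full tokenizer output (a dict-like BatchEncoding), which tokenizer.decode cannot convert to a sequence of ids; the replacement cell prints the whole BatchEncoding and converts the id tensor to tokens instead. A minimal sketch of that corrected decoding step, assuming a BERT-style Chinese checkpoint such as bert-base-chinese and a placeholder sentence (neither the checkpoint nor the original text appears in this diff):

# Sketch only: checkpoint name and sample text are assumptions, not from the notebook.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
result = tokenizer("一个中文例句", return_tensors="pt")  # placeholder sentence

# result is a BatchEncoding (dict-like), so tokenizer.decode(result) raises
# "TypeError: argument 'ids': 'dict' object cannot be converted to 'Sequence'".
# Decode the id tensor of the first (and only) sequence in the batch instead:
ids = result["input_ids"][0]
print(tokenizer.convert_ids_to_tokens(ids.tolist()))  # individual token strings
print(tokenizer.decode(ids))                          # text, including [CLS]/[SEP]

Passing result["input_ids"][0] (or its .tolist()) rather than the whole result is the key point; decode and convert_ids_to_tokens both expect a sequence of ids, not the full encoding dict.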