add decode
tutorial.ipynb CHANGED (+7 -20)
@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -40,35 +40,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
-      " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n"
-     ]
-    },
-    {
-     "ename": "TypeError",
-     "evalue": "argument 'ids': 'dict' object cannot be converted to 'Sequence'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[24], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mprint\u001b[39m(result\u001b[39m.\u001b[39minput_ids)\n\u001b[0;32m----> 2\u001b[0m ids \u001b[39m=\u001b[39m tokenizer\u001b[39m.\u001b[39;49mdecode(result)\n\u001b[1;32m 3\u001b[0m tokenizer\u001b[39m.\u001b[39mdecode(ids)\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3471\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 3468\u001b[0m \u001b[39m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m 3469\u001b[0m token_ids \u001b[39m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3471\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_decode(\n\u001b[1;32m 3472\u001b[0m token_ids\u001b[39m=\u001b[39;49mtoken_ids,\n\u001b[1;32m 3473\u001b[0m skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens,\n\u001b[1;32m 3474\u001b[0m clean_up_tokenization_spaces\u001b[39m=\u001b[39;49mclean_up_tokenization_spaces,\n\u001b[1;32m 3475\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 3476\u001b[0m )\n",
-      "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:551\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(token_ids, \u001b[39mint\u001b[39m):\n\u001b[1;32m 550\u001b[0m token_ids \u001b[39m=\u001b[39m [token_ids]\n\u001b[0;32m--> 551\u001b[0m text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_tokenizer\u001b[39m.\u001b[39;49mdecode(token_ids, skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens)\n\u001b[1;32m 553\u001b[0m \u001b[39mif\u001b[39;00m clean_up_tokenization_spaces:\n\u001b[1;32m 554\u001b[0m clean_text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclean_up_tokenization(text)\n",
-      "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'dict' object cannot be converted to 'Sequence'"
+      "{'input_ids': tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
+      " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
      ]
     }
    ],
   "source": [
-    "print(result.input_ids)\n",
-    "ids = tokenizer.decode(result)\n",
-    "tokenizer.decode(ids)"
+    "#print(result.input_ids)\n",
+    "print(result)\n",
+    "tokenizer.convert_ids_to_tokens(result.input_ids)"
   ]
  }
 ],
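
For readers following the tutorial, the removed traceback explains the change: the tokenizer call returns a BatchEncoding (a dict of input_ids, token_type_ids and attention_mask), and passing that whole object to tokenizer.decode() raises the TypeError shown above, because decode() expects a sequence of ids. The sketch below shows the corrected pattern the new cell relies on; the checkpoint name and the input sentence are assumptions, since the diff only shows the resulting ids.

# Minimal sketch, not the tutorial's exact cell: the checkpoint and the
# sentence are assumed; only the tokenized ids appear in the diff above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")  # assumed checkpoint
result = tokenizer("这是一个例子", return_tensors="pt")           # assumed input text

print(result)                                  # BatchEncoding: input_ids, token_type_ids, attention_mask
# tokenizer.decode(result)                     # would raise the TypeError shown in the removed output
print(tokenizer.convert_ids_to_tokens(result.input_ids[0]))  # ids -> token strings
print(tokenizer.decode(result.input_ids[0]))                 # ids -> text, including [CLS]/[SEP]

Indexing input_ids[0] drops the batch dimension, so decode() and convert_ids_to_tokens() receive a flat sequence of ids rather than the dict-like BatchEncoding that caused the original error.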