commit

tutorial.ipynb  ADDED  (+109 lines, -0)
## tokenizer

In [1]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("liam168/c2-roberta-base-finetuned-dianping-chinese")
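The call above loads the fast tokenizer bundled with the liam168/c2-roberta-base-finetuned-dianping-chinese checkpoint. As a rough sanity check (a sketch, assuming the cell above ran; the concrete class and token inventory depend on the checkpoint's config):

print(type(tokenizer))               # typically a BERT-style fast tokenizer class
print(tokenizer.vocab_size)          # size of the checkpoint's vocabulary
print(tokenizer.special_tokens_map)  # e.g. [CLS], [SEP], [PAD], [UNK], [MASK] for BERT-style vocabularies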
In [11]:
input = "这是中英文test语句,mix中英文及标点符号"  # a test sentence mixing Chinese, English, and punctuation
result = tokenizer([input], padding=True, truncation=True, max_length=512, return_tensors="pt")
a = tokenizer.tokenize([input])
print(a)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[11], line 3
      1 input = "这是中英文test语句,mix中英文及标点符号"
      2 result = tokenizer([input], padding=True, truncation=True, max_length=512, return_tensors="pt")
----> 3 a = tokenizer.tokenize([input])

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:320, in PreTrainedTokenizerFast.tokenize(self, text, pair, add_special_tokens, **kwargs)
--> 320 return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2702, in PreTrainedTokenizerBase.encode_plus(self, text, text_pair, add_special_tokens, padding, truncation, max_length, ...)
--> 2702 return self._encode_plus(text=text, text_pair=text_pair, ...)

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:502, in PreTrainedTokenizerFast._encode_plus(self, text, text_pair, add_special_tokens, padding_strategy, truncation_strategy, max_length, ...)
--> 502 batched_output = self._batch_encode_plus(batched_input, is_split_into_words=is_split_into_words, ...)

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:429, in PreTrainedTokenizerFast._batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, ...)
--> 429 encodings = self._tokenizer.encode_batch(batch_text_or_text_pairs, add_special_tokens=add_special_tokens, is_pretokenized=is_split_into_words)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
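The TypeError above comes from handing tokenize() a list: PreTrainedTokenizerFast.tokenize() expects a single string (optionally a text pair), while batching is done by calling the tokenizer itself on a list of strings. A minimal sketch of the working variants, reusing the tokenizer loaded earlier:

text = "这是中英文test语句,mix中英文及标点符号"

# tokenize() takes one string and returns the subword strings
tokens = tokenizer.tokenize(text)
print(tokens)

# batched encoding goes through the tokenizer's __call__, which accepts a list of strings
batch = tokenizer([text], padding=True, truncation=True, max_length=512, return_tensors="pt")
print(batch.input_ids.shape)  # (batch_size, sequence_length)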
In [12]:
print(result.input_ids)
tokenizer.decode(result.input_ids)

tensor([[  101,  6821,  3221,   704,  5739,  3152, 10060,  6427,  1368,  8024,
          9678,   704,  5739,  3152,  1350,  3403,  4157,  5016,  1384,   102]])

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[12], line 2
      1 print(result.input_ids)
----> 2 tokenizer.decode(result.input_ids)

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3471, in PreTrainedTokenizerBase.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
--> 3471 return self._decode(token_ids=token_ids, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs)

File ~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:551, in PreTrainedTokenizerFast._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
--> 551 text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

TypeError: argument 'ids': 'list' object cannot be interpreted as an integer
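This second TypeError is a shape problem rather than a tokenizer bug: result.input_ids is a batched 2-D tensor of shape (1, 20), while decode() expects a single sequence of token ids. A minimal sketch of the two usual fixes, reusing result from the cell above:

# decode one sequence by indexing away the batch dimension
print(tokenizer.decode(result.input_ids[0], skip_special_tokens=True))

# or decode every sequence in the batch at once
print(tokenizer.batch_decode(result.input_ids, skip_special_tokens=True))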
Notebook metadata: kernelspec "Python 3" (python3); language_info: Python 3.8.10 (ipython3); nbformat 4, nbformat_minor 2.