{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## tokenizer" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(\"liam168/c2-roberta-base-finetuned-dianping-chinese\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n" ] } ], "source": [ "input = \"这是中英文test语句,mix中英文及标点符号\"\n", "result = tokenizer([input],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n", "a = tokenizer.tokenize(input)\n", "print(a)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n", " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n" ] }, { "ename": "TypeError", "evalue": "argument 'ids': 'dict' object cannot be converted to 'Sequence'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[24], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mprint\u001b[39m(result\u001b[39m.\u001b[39minput_ids)\n\u001b[0;32m----> 2\u001b[0m ids \u001b[39m=\u001b[39m tokenizer\u001b[39m.\u001b[39;49mdecode(result)\n\u001b[1;32m 3\u001b[0m tokenizer\u001b[39m.\u001b[39mdecode(ids)\n", "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:3471\u001b[0m, in \u001b[0;36mPreTrainedTokenizerBase.decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 3468\u001b[0m \u001b[39m# Convert inputs to python lists\u001b[39;00m\n\u001b[1;32m 3469\u001b[0m token_ids \u001b[39m=\u001b[39m to_py_obj(token_ids)\n\u001b[0;32m-> 3471\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_decode(\n\u001b[1;32m 3472\u001b[0m token_ids\u001b[39m=\u001b[39;49mtoken_ids,\n\u001b[1;32m 3473\u001b[0m skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens,\n\u001b[1;32m 3474\u001b[0m clean_up_tokenization_spaces\u001b[39m=\u001b[39;49mclean_up_tokenization_spaces,\n\u001b[1;32m 3475\u001b[0m \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs,\n\u001b[1;32m 3476\u001b[0m )\n", "File \u001b[0;32m~/.local/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:551\u001b[0m, in \u001b[0;36mPreTrainedTokenizerFast._decode\u001b[0;34m(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(token_ids, \u001b[39mint\u001b[39m):\n\u001b[1;32m 550\u001b[0m token_ids \u001b[39m=\u001b[39m [token_ids]\n\u001b[0;32m--> 551\u001b[0m text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_tokenizer\u001b[39m.\u001b[39;49mdecode(token_ids, skip_special_tokens\u001b[39m=\u001b[39;49mskip_special_tokens)\n\u001b[1;32m 553\u001b[0m \u001b[39mif\u001b[39;00m clean_up_tokenization_spaces:\n\u001b[1;32m 554\u001b[0m clean_text \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclean_up_tokenization(text)\n", "\u001b[0;31mTypeError\u001b[0m: argument 'ids': 'dict' object cannot be converted to 'Sequence'" ] } ], "source": [ "print(result.input_ids)\n", "ids = tokenizer.decode(result)\n", "tokenizer.decode(ids)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" } } }, "nbformat": 4, "nbformat_minor": 2 }