{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## tokenizer\n",
    "\n",
    "tokenizer负责将输入的字符串,转换为token,或者转换为ids序列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tokenizer published with the pretrained checkpoint\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "MODEL_NAME = \"liam168/c2-roberta-base-finetuned-dianping-chinese\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n",
      "['测', '试', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
     ]
    }
   ],
   "source": [
    "# Convert text into tokens\n",
    "# `text` (not `input`) so the builtin input() stays usable in this kernel\n",
    "text = \"这是中英文test语句,mix中英文及标点符号\"\n",
    "sentence_pair = [\"测试\", \"mix中英文及标点符号\"]\n",
    "first_segment, second_segment = sentence_pair\n",
    "\n",
    "print(tokenizer.tokenize(text))\n",
    "# The two segments are tokenized as one sentence pair; tokenize() adds no\n",
    "# special tokens, so the result is simply both token lists back to back\n",
    "print(tokenizer.tokenize(first_segment, pair=second_segment))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
      " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n",
      "tensor([[ 101, 3844, 6407, 102, 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016,\n",
      " 1384, 102]])\n",
      "['[CLS]', '这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号', '[SEP]']\n",
      "['[CLS]', '测', '试', '[SEP]', 'mix', '中', '英', '文', '及', '标', '点', '符', '号', '[SEP]']\n"
     ]
    }
   ],
   "source": [
    "# Convert text into id sequences (PyTorch tensors)\n",
    "ENCODE_KWARGS = dict(padding=True, truncation=True, max_length=512, return_tensors=\"pt\")\n",
    "\n",
    "single_encoding = tokenizer(text, **ENCODE_KWARGS)\n",
    "print(single_encoding.input_ids)\n",
    "\n",
    "# Encode both segments as ONE sentence pair: [CLS] first [SEP] second [SEP].\n",
    "# This is a single row of ids, not a batch of two independent sentences.\n",
    "pair_encoding = tokenizer(first_segment, second_segment, **ENCODE_KWARGS)\n",
    "print(pair_encoding.input_ids)\n",
    "\n",
    "# Decode the id sequences back into tokens\n",
    "print(tokenizer.convert_ids_to_tokens(single_encoding.input_ids[0]))\n",
    "print(tokenizer.convert_ids_to_tokens(pair_encoding.input_ids[0]))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}