{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## tokenizer" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(\"liam168/c2-roberta-base-finetuned-dianping-chinese\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n" ] } ], "source": [ "input = \"这是中英文test语句,mix中英文及标点符号\"\n", "result = tokenizer([input],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n", "a = tokenizer.tokenize(input)\n", "print(a)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n", " 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n" ] } ], "source": [ "#print(result.input_ids)\n", "print(result)\n", "tokenizer.convert_ids_to_tokens(result.input_ids)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" } } }, "nbformat": 4, "nbformat_minor": 2 }