clarkchan committed
Commit 4130c63
1 Parent(s): 03723ee
Files changed (1)
  1. tutorial.ipynb +28 -12
tutorial.ipynb CHANGED
@@ -5,7 +5,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## tokenizer"
+    "## tokenizer\n",
+    "\n",
+    "The tokenizer converts an input string into tokens, or into a sequence of ids."
    ]
   },
   {
@@ -14,48 +16,62 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Load the pretrained tokenizer\n",
     "from transformers import AutoTokenizer\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"liam168/c2-roberta-base-finetuned-dianping-chinese\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
+      "['这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n",
+      "['测', '试', 'mix', '中', '英', '文', '及', '标', '点', '符', '号']\n"
      ]
     }
    ],
    "source": [
+    "# Convert strings to tokens\n",
     "input = \"这是中英文test语句,mix中英文及标点符号\"\n",
-    "result = tokenizer([input],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
-    "a = tokenizer.tokenize(input)\n",
-    "print(a)\n"
+    "input_array = [\"测试\",\"mix中英文及标点符号\"]\n",
+    "print(tokenizer.tokenize(input))\n",
+    "print(tokenizer.tokenize(input_array))\n",
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "{'input_ids': tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
-      "        9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
+      "tensor([[ 101, 6821, 3221, 704, 5739, 3152, 10060, 6427, 1368, 8024,\n",
+      "        9678, 704, 5739, 3152, 1350, 3403, 4157, 5016, 1384, 102]])\n",
+      "tensor([[ 101, 3844, 6407, 102, 9678, 704, 5739, 3152, 1350, 3403, 4157, 5016,\n",
+      "        1384, 102]])\n",
+      "['[CLS]', '这', '是', '中', '英', '文', 'test', '语', '句', ',', 'mix', '中', '英', '文', '及', '标', '点', '符', '号', '[SEP]']\n",
+      "['[CLS]', '测', '试', '[SEP]', 'mix', '中', '英', '文', '及', '标', '点', '符', '号', '[SEP]']\n"
      ]
     }
    ],
    "source": [
-    "#print(result.input_ids)\n",
-    "print(result)\n",
-    "tokenizer.convert_ids_to_tokens(result.input_ids)"
+    "# Convert strings to an id sequence\n",
+    "ids = tokenizer(input,padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
+    "print(ids.input_ids)\n",
+    "idss = tokenizer([input_array],padding=True,truncation=True,max_length=512,return_tensors=\"pt\")\n",
+    "print(idss.input_ids)\n",
+    "\n",
+    "# Decode the id sequence back into tokens\n",
+    "print(tokenizer.convert_ids_to_tokens(ids.input_ids[0]))\n",
+    "print(tokenizer.convert_ids_to_tokens(idss.input_ids[0]))"
    ]
   }
  ],
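
Taken together, the updated cells form a full string -> tokens -> ids -> tokens round trip. For reference, a minimal sketch of that flow, assuming transformers with a PyTorch backend is installed; the variable names (text, encoding, tokens_back, text_back) are illustrative, and the final decode step is an extra not present in the committed cells:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "liam168/c2-roberta-base-finetuned-dianping-chinese"
)

text = "这是中英文test语句,mix中英文及标点符号"

# String -> tokens; no [CLS]/[SEP] are added at this step.
tokens = tokenizer.tokenize(text)

# String -> ids; calling the tokenizer adds the special tokens and,
# with return_tensors="pt", returns PyTorch tensors.
encoding = tokenizer(text, padding=True, truncation=True,
                     max_length=512, return_tensors="pt")

# Ids -> tokens; [CLS] and [SEP] reappear here, as in the committed output.
tokens_back = tokenizer.convert_ids_to_tokens(encoding.input_ids[0])

# Ids -> string (not shown in the commit); skip_special_tokens drops [CLS]/[SEP].
text_back = tokenizer.decode(encoding.input_ids[0], skip_special_tokens=True)

Note that for a BERT-style Chinese vocabulary, decode joins tokens with spaces, so text_back need not match the input byte-for-byte.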
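
One subtlety the new outputs surface: tokenizer([input_array], ...) wraps the two-string list in an outer batch, and the fast tokenizer then reads the inner list as a single text pair. That is why the second tensor has one row decoding to [CLS] 测 试 [SEP] mix ... [SEP] with two [SEP] markers, and why tokenizer.tokenize(input_array) returns one merged token list. A small sketch of the contrast, assuming the tokenizer loaded above:

pair = ["测试", "mix中英文及标点符号"]

# A plain list is a batch: two sequences encoded independently
# and padded to a common length -> shape [2, seq_len].
as_batch = tokenizer(pair, padding=True, truncation=True,
                     max_length=512, return_tensors="pt")
print(as_batch.input_ids.shape)

# Wrapping the list once more yields a batch containing ONE text pair
# -> shape [1, seq_len], matching the committed output.
as_pair = tokenizer([pair], padding=True, truncation=True,
                    max_length=512, return_tensors="pt")
print(as_pair.input_ids.shape)
print(tokenizer.convert_ids_to_tokens(as_pair.input_ids[0]))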