marisming commited on
Commit
1df7ad4
·
verified ·
1 Parent(s): ca90249

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. 02-gpt2_bert/.ipynb_checkpoints/2-dna-gpt-checkpoint.ipynb +507 -27
  3. 02-gpt2_bert/.ipynb_checkpoints/3-dna-bert-checkpoint.ipynb +533 -32
  4. 02-gpt2_bert/.ipynb_checkpoints/5-multi-seq-gpt-checkpoint.ipynb +0 -0
  5. 02-gpt2_bert/1-dna-bpe.ipynb +1 -1
  6. 02-gpt2_bert/2-dna-gpt.ipynb +507 -27
  7. 02-gpt2_bert/3-dna-bert.ipynb +0 -0
  8. 02-gpt2_bert/4-gene-feature.ipynb +125 -161
  9. 02-gpt2_bert/5-multi-seq-gpt.ipynb +0 -0
  10. 02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/special_tokens_map-checkpoint.json +7 -0
  11. 02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer-checkpoint.json +0 -0
  12. 02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer_config-checkpoint.json +53 -0
  13. 02-gpt2_bert/gene_en_bpe.py +19 -0
  14. 02-gpt2_bert/gene_eng_dict.json +0 -0
  15. 02-gpt2_bert/gene_eng_dict/merges.txt +3 -0
  16. 02-gpt2_bert/gene_eng_dict/special_tokens_map.json +5 -0
  17. 02-gpt2_bert/gene_eng_dict/tokenizer.json +0 -0
  18. 02-gpt2_bert/gene_eng_dict/tokenizer_config.json +20 -0
  19. 02-gpt2_bert/gene_eng_dict/vocab.json +0 -0
  20. 03-gene-task/.ipynb_checkpoints/1-category-task-checkpoint.ipynb +808 -2
  21. 03-gene-task/.ipynb_checkpoints/3-multi-seq-task-checkpoint.ipynb +0 -0
  22. 03-gene-task/.ipynb_checkpoints/5-regression-task-checkpoint.ipynb +555 -1
  23. 03-gene-task/1-category-task.ipynb +780 -1
  24. 03-gene-task/2-structure-predict.ipynb +954 -1
  25. 03-gene-task/3-multi-seq-task.ipynb +0 -0
  26. 03-gene-task/4-fun-predict.ipynb +755 -1
  27. 03-gene-task/5-regression-task.ipynb +555 -1
  28. 03-gene-task/data/.ipynb_checkpoints/protein_stab-checkpoint.csv +0 -0
  29. 03-gene-task/data/dna_protein_full.json +3 -0
  30. 03-gene-task/data/protein_stab.csv +0 -0
  31. 03-gene-task/img/.ipynb_checkpoints/dataset-checkpoint.png +0 -0
  32. 03-gene-task/img/2_structure.png +0 -0
  33. 03-gene-task/img/dataset.png +0 -0
  34. 03-gene-task/img/ds_structure.png +0 -0
  35. 03-gene-task/img/function.png +0 -0
  36. 03-gene-task/img/gpt2-ft.png +0 -0
  37. 03-gene-task/img/pdb1.png +0 -0
  38. 03-gene-task/img/protein-structure-1-2.png +3 -0
  39. 03-gene-task/img/protein-structure-1.png +0 -0
  40. 03-gene-task/img/protein-structure-2.png +0 -0
  41. 03-gene-task/img/sequence.png +0 -0
  42. 04-gene-sft/.ipynb_checkpoints/1-finetue-intro-checkpoint.ipynb +254 -0
  43. 04-gene-sft/.ipynb_checkpoints/2-gpt2-instruction-ft-checkpoint.ipynb +498 -0
  44. 04-gene-sft/.ipynb_checkpoints/3-llama-expand-dict-checkpoint.ipynb +272 -0
  45. 04-gene-sft/.ipynb_checkpoints/4-deepspeed-intro-checkpoint.ipynb +593 -0
  46. 04-gene-sft/.ipynb_checkpoints/5-llama-continue-train-checkpoint.ipynb +41 -0
  47. 04-gene-sft/.ipynb_checkpoints/6-llama-instruction-ft-checkpoint.ipynb +206 -0
  48. 04-gene-sft/.ipynb_checkpoints/build_gene_bpe_seg-checkpoint.py +14 -0
  49. 04-gene-sft/.ipynb_checkpoints/deepspeed_pretrain_gpt2-checkpoint.py +114 -0
  50. 04-gene-sft/.ipynb_checkpoints/ds_zero2_no_offload-checkpoint.json +27 -0
.gitattributes CHANGED
@@ -35,4 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.psd filter=lfs diff=lfs merge=lfs -text
37
  *.txt filter=lfs diff=lfs merge=lfs -text
 
 
 
38
  img/gpt2_bridge.png filter=lfs diff=lfs merge=lfs -text
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  *.psd filter=lfs diff=lfs merge=lfs -text
37
  *.txt filter=lfs diff=lfs merge=lfs -text
38
+ 03-gene-task/data/dna_protein_full.json filter=lfs diff=lfs merge=lfs -text
39
+ 03-gene-task/img/protein-structure-1-2.png filter=lfs diff=lfs merge=lfs -text
40
+ 04-gene-sft/sft_data/train_data.json filter=lfs diff=lfs merge=lfs -text
41
  img/gpt2_bridge.png filter=lfs diff=lfs merge=lfs -text
02-gpt2_bert/.ipynb_checkpoints/2-dna-gpt-checkpoint.ipynb CHANGED
@@ -49,9 +49,9 @@
49
  "\n",
50
  "### 历史背景\n",
51
  "\n",
52
- "- **发布日期**:GPT-2 最初于 2019 2 月发布。OpenAI 在最初并没有一次性公开所有版本,而是逐步发布了不同规模的模型。\n",
53
  " \n",
54
- "- **开发动机**:GPT-2 是在 GPT-1 的基础上进行的重大改进。它引入了更多的参数和更大的训练数据集,显著提升了模型的能力。此外,GPT-2 还展示了强大的文本生成能力,甚至能够生成逼真的文章段落,这引发了关于 AI 伦理和社会影响的广泛讨论。\n",
55
  "\n",
56
  "- **伦理考虑**:由于 GPT-2 的强大生成能力,OpenAI 初始时对模型的发布采取了谨慎的态度,担心其可能被滥用(例如用于生成假新闻或恶意内容)。因此,他们选择了分阶段发布,并进行了广泛的伦理讨论和研究。\n",
57
  "\n",
@@ -73,6 +73,24 @@
73
  {
74
  "cell_type": "code",
75
  "execution_count": 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "id": "70581590-096f-45f8-b13b-b84e88615849",
77
  "metadata": {},
78
  "outputs": [],
@@ -96,7 +114,7 @@
96
  },
97
  {
98
  "cell_type": "code",
99
- "execution_count": 2,
100
  "id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
101
  "metadata": {},
102
  "outputs": [],
@@ -117,7 +135,7 @@
117
  },
118
  {
119
  "cell_type": "code",
120
- "execution_count": 3,
121
  "id": "87435829-f522-4820-a51d-11fa4afee6d7",
122
  "metadata": {},
123
  "outputs": [],
@@ -136,58 +154,53 @@
136
  ]
137
  },
138
  {
139
- "cell_type": "code",
140
- "execution_count": null,
141
- "id": "0a0adfdd-4be9-4027-a12d-3bf848be3012",
142
  "metadata": {},
143
- "outputs": [],
144
  "source": [
145
- "接着是训练数据集,"
 
 
 
 
146
  ]
147
  },
148
  {
149
  "cell_type": "code",
150
- "execution_count": 4,
151
  "id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
152
  "metadata": {},
153
  "outputs": [
154
  {
155
  "data": {
156
  "application/vnd.jupyter.widget-view+json": {
157
- "model_id": "4e8e73b13d42451bb7214b50bf1d8d47",
158
  "version_major": 2,
159
  "version_minor": 0
160
  },
161
  "text/plain": [
162
- "Generating train split: 0 examples [00:00, ? examples/s]"
163
  ]
164
  },
165
  "metadata": {},
166
  "output_type": "display_data"
167
  },
168
  {
169
- "data": {
170
- "application/vnd.jupyter.widget-view+json": {
171
- "model_id": "6a3e649a8ca14fc8adfc361c2f6eeb7e",
172
- "version_major": 2,
173
- "version_minor": 0
174
- },
175
- "text/plain": [
176
- "Map (num_proc=15): 0%| | 0/971635 [00:00<?, ? examples/s]"
177
- ]
178
- },
179
- "metadata": {},
180
- "output_type": "display_data"
181
  },
182
  {
183
  "data": {
184
  "application/vnd.jupyter.widget-view+json": {
185
- "model_id": "b0fb7862bca842518fa4e96901b93be4",
186
  "version_major": 2,
187
  "version_minor": 0
188
  },
189
  "text/plain": [
190
- "Map (num_proc=15): 0%| | 0/107960 [00:00<?, ? examples/s]"
191
  ]
192
  },
193
  "metadata": {},
@@ -197,6 +210,7 @@
197
  "source": [
198
  "# 1. load dna dataset\n",
199
  "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
 
200
  "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
201
  "\n",
202
  "# 2. tokenize\n",
@@ -212,6 +226,174 @@
212
  ")"
213
  ]
214
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  {
216
  "cell_type": "code",
217
  "execution_count": 5,
@@ -4656,10 +4838,308 @@
4656
  },
4657
  {
4658
  "cell_type": "code",
4659
- "execution_count": null,
4660
  "id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
4661
  "metadata": {},
4662
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4663
  "source": []
4664
  }
4665
  ],
 
49
  "\n",
50
  "### 历史背景\n",
51
  "\n",
52
+ "- **发布日期**:GPT(Generative Pre-trained Transformer)的第一个版本,即 GPT-1,是在 2018 年由 OpenAI 发布的。具体来说,关于 GPT-1 的研究论文《Improving Language Understanding by Generative Pre-Training》在 2018 年 6 月发布。\n",
53
  " \n",
54
+ "- **开发动机**:GPT-2 2019年发表,是在 GPT-1 的基础上进行的重大改进。它引入了更多的参数和更大的训练数据集,显著提升了模型的能力。此外,GPT-2 还展示了强大的文本生成能力,甚至能够生成逼真的文章段落,这引发了关于 AI 伦理和社会影响的广泛讨论。\n",
55
  "\n",
56
  "- **伦理考虑**:由于 GPT-2 的强大生成能力,OpenAI 初始时对模型的发布采取了谨慎的态度,担心其可能被滥用(例如用于生成假新闻或恶意内容)。因此,他们选择了分阶段发布,并进行了广泛的伦理讨论和研究。\n",
57
  "\n",
 
73
  {
74
  "cell_type": "code",
75
  "execution_count": 1,
76
+ "id": "83af3495-b1fd-4ea1-84d7-9224b7094c0f",
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "import subprocess\n",
81
+ "import os\n",
82
+ "# 设置环境变量, autodl一般区域\n",
83
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
84
+ "output = result.stdout\n",
85
+ "for line in output.splitlines():\n",
86
+ " if '=' in line:\n",
87
+ " var, value = line.split('=', 1)\n",
88
+ " os.environ[var] = value"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 2,
94
  "id": "70581590-096f-45f8-b13b-b84e88615849",
95
  "metadata": {},
96
  "outputs": [],
 
114
  },
115
  {
116
  "cell_type": "code",
117
+ "execution_count": 6,
118
  "id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
119
  "metadata": {},
120
  "outputs": [],
 
135
  },
136
  {
137
  "cell_type": "code",
138
+ "execution_count": 7,
139
  "id": "87435829-f522-4820-a51d-11fa4afee6d7",
140
  "metadata": {},
141
  "outputs": [],
 
154
  ]
155
  },
156
  {
157
+ "cell_type": "markdown",
158
+ "id": "05875e2f-32e7-485d-9399-99dc1e4bf71f",
 
159
  "metadata": {},
 
160
  "source": [
161
+ "## 训练数据\n",
162
+ "\n",
163
+ "接着是训练数据集,最重要的是构建模型的输入和输出。\n",
164
+ "\n",
165
+ "这里使用DataCollatorForLanguageModeling ,它是专为语言建模而设计(顾名思义)。除了堆叠和填充批次,它还负责创建语言模型标签——在因果语言建模中,输入也用作标签(只是移动了一个元素),并且这个数据整理器在训练期间即时创建它们,所以我们不需要复制 input_ids。"
166
  ]
167
  },
168
  {
169
  "cell_type": "code",
170
+ "execution_count": 9,
171
  "id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
172
  "metadata": {},
173
  "outputs": [
174
  {
175
  "data": {
176
  "application/vnd.jupyter.widget-view+json": {
177
+ "model_id": "3db6964a82794db7ac007c7aa513ad33",
178
  "version_major": 2,
179
  "version_minor": 0
180
  },
181
  "text/plain": [
182
+ "Map (num_proc=15): 0%| | 0/90 [00:00<?, ? examples/s]"
183
  ]
184
  },
185
  "metadata": {},
186
  "output_type": "display_data"
187
  },
188
  {
189
+ "name": "stderr",
190
+ "output_type": "stream",
191
+ "text": [
192
+ "num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.\n"
193
+ ]
 
 
 
 
 
 
 
194
  },
195
  {
196
  "data": {
197
  "application/vnd.jupyter.widget-view+json": {
198
+ "model_id": "ba2c0d0e766949c79e4db6e6bd881f06",
199
  "version_major": 2,
200
  "version_minor": 0
201
  },
202
  "text/plain": [
203
+ "Map (num_proc=10): 0%| | 0/10 [00:00<?, ? examples/s]"
204
  ]
205
  },
206
  "metadata": {},
 
210
  "source": [
211
  "# 1. load dna dataset\n",
212
  "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
213
+ "#dataset = raw_dataset[\"train\"].select(range(100)).train_test_split(test_size=0.1, shuffle=True)\n",
214
  "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
215
  "\n",
216
  "# 2. tokenize\n",
 
226
  ")"
227
  ]
228
  },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 10,
232
+ "id": "2eb1ff7a-f733-404b-a6ed-da82a677da3f",
233
+ "metadata": {},
234
+ "outputs": [
235
+ {
236
+ "name": "stdout",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "[{'input_ids': [20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978, 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412, 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65, 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84, 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137, 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419, 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468, 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65, 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138, 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003, 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772, 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079, 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269, 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614, 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}]\n"
240
+ ]
241
+ }
242
+ ],
243
+ "source": [
244
+ "samples = [tokenized_datasets[\"train\"][0]]\n",
245
+ "print(samples)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 11,
251
+ "id": "260283a4-5ceb-4ef6-be1b-a4765fb74b20",
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "name": "stdout",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "{'input_ids': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
259
+ " 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
260
+ " 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
261
+ " 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
262
+ " 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
263
+ " 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
264
+ " 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
265
+ " 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
266
+ " 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
267
+ " 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
268
+ " 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
269
+ " 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
270
+ " 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
271
+ " 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
272
+ " 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0,\n",
273
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
274
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
275
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
276
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
277
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
278
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
279
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
280
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
281
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
282
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
283
+ " 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
284
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
285
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
286
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
287
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
288
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
289
+ " 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
290
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
291
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
292
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
293
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
294
+ " 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
295
+ " 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
296
+ " 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
297
+ " 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
298
+ " 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
299
+ " 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
300
+ " 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
301
+ " 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
302
+ " 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
303
+ " 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
304
+ " 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
305
+ " 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
306
+ " 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
307
+ " 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, -100,\n",
308
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
309
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
310
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
311
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
312
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
313
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
314
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
315
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
316
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
317
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
318
+ " -100, -100, -100, -100, -100, -100]])}\n"
319
+ ]
320
+ }
321
+ ],
322
+ "source": [
323
+ "io_data = data_collator(samples)\n",
324
+ "print(io_data)"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "markdown",
329
+ "id": "80a84504-eaa3-43a9-ba13-3a2b73942c59",
330
+ "metadata": {},
331
+ "source": [
332
+ "这段代码展示了如何加载 DNA 数据集、对其进行分词处理,并为语言模型训练准备数据。让我们逐段解析代码,并特别关注 `DataCollatorForLanguageModeling` 函数。\n",
333
+ "\n",
334
+ "### 1. 加载 DNA 数据集\n",
335
+ "\n",
336
+ "```python\n",
337
+ "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
338
+ "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
339
+ "```\n",
340
+ "\n",
341
+ "- **`load_dataset`**:使用 Hugging Face 的 `datasets` 库加载文本文件作为数据集。这里指定的是一个本地的 DNA 序列文本文件 `dna_1g.txt`。\n",
342
+ "- **`train_test_split`**:将原始数据集分割为训练集和测试集,其中测试集占 10%(`test_size=0.1`),并随机打乱数据(`shuffle=True`)。\n",
343
+ "\n",
344
+ "### 2. 定义分词函数\n",
345
+ "\n",
346
+ "```python\n",
347
+ "def tokenize_function(examples):\n",
348
+ " return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
349
+ "```\n",
350
+ "\n",
351
+ "- **`tokenize_function`**:这是一个自定义的分词函数,用于对数据集中的每条记录进行分词处理。\n",
352
+ "- **参数解释**:\n",
353
+ " - `examples['text']`:获取数据集中每条记录的文本内容。\n",
354
+ " - `truncation=True`:确保所有输入序列被截断到 `max_length` 指定的最大长度。\n",
355
+ " - `padding='max_length'`:将所有输入序列填充到 `max_length` 指定的最大长度,以保证批次内所有序列具有相同的长度。\n",
356
+ " - `max_length`:指定最大序列长度,需要根据具体任务和模型要求设置。\n",
357
+ "\n",
358
+ "### 3. 对数据集应用分词函数\n",
359
+ "\n",
360
+ "```python\n",
361
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'], num_proc=15)\n",
362
+ "```\n",
363
+ "\n",
364
+ "- **`map`**:将 `tokenize_function` 应用到整个数据集上。`batched=True` 表示批量处理,可以显著提高处理速度。\n",
365
+ "- **`remove_columns=['text']`**:分词后不再需要原始文本列,因此将其移除。\n",
366
+ "- **`num_proc=15`**:指定使用的 CPU 核心数(或进程数),可以根据你的硬件资源调整。这有助于加速分词过程。\n",
367
+ "\n",
368
+ "### 4. 创建数据收集器\n",
369
+ "\n",
370
+ "```python\n",
371
+ "data_collator = DataCollatorForLanguageModeling(\n",
372
+ " tokenizer=tokenizer, mlm=False\n",
373
+ ")\n",
374
+ "```\n",
375
+ "\n",
376
+ "#### `DataCollatorForLanguageModeling` 函数详解\n",
377
+ "\n",
378
+ "`DataCollatorForLanguageModeling` 是 Hugging Face 提供的一个工具,用于在训练语言模型时动态地处理批次数据。它主要用于两种任务:\n",
379
+ "\n",
380
+ "- **Masked Language Modeling (MLM)**:遮蔽某些 token 并预测它们,常用于预训练模型(如 BERT)。\n",
381
+ "- **Causal Language Modeling (CLM)**:基于前文预测下一个 token,适用于生成式模型(如 GPT 系列)。\n",
382
+ "\n",
383
+ "在这个例子中,`mlm=False` 表明我们正在处理因果语言建模(CLM),即每个 token 只能依赖于其前面的 token 进行预测。这对于像 GPT 这样的生成模型非常适用。\n",
384
+ "\n",
385
+ "- **`tokenizer=tokenizer`**:指定用于编码和解码的分词器对象。\n",
386
+ "- **`mlm=False`**:关闭 MLM 模式,因为我们不需要遮蔽任何 token。对于因果语言建模,模型会尝试根据之前的上下文预测下一个 token。"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "markdown",
391
+ "id": "3fbe9480-c394-4bab-bdee-e80f21e0259a",
392
+ "metadata": {},
393
+ "source": [
394
+ "### 开始训练"
395
+ ]
396
+ },
397
  {
398
  "cell_type": "code",
399
  "execution_count": 5,
 
4838
  },
4839
  {
4840
  "cell_type": "code",
4841
+ "execution_count": 3,
4842
  "id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
4843
  "metadata": {},
4844
  "outputs": [],
4845
+ "source": [
4846
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dna_bpe_dict\")\n",
4847
+ "tokenizer.pad_token = tokenizer.eos_token"
4848
+ ]
4849
+ },
4850
+ {
4851
+ "cell_type": "code",
4852
+ "execution_count": 5,
4853
+ "id": "76f7c636-20c0-47a1-83c1-72e5ee101c0f",
4854
+ "metadata": {},
4855
+ "outputs": [],
4856
+ "source": [
4857
+ "from transformers import AutoTokenizer, AutoModel\n",
4858
+ "model = AutoModel.from_pretrained('dna_gpt2_v0')"
4859
+ ]
4860
+ },
4861
+ {
4862
+ "cell_type": "code",
4863
+ "execution_count": 6,
4864
+ "id": "c041ad1b-7fe4-4d00-a77e-8ab17f020600",
4865
+ "metadata": {},
4866
+ "outputs": [
4867
+ {
4868
+ "name": "stdout",
4869
+ "output_type": "stream",
4870
+ "text": [
4871
+ "[2024-12-30 20:29:16,315] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
4872
+ ]
4873
+ },
4874
+ {
4875
+ "name": "stderr",
4876
+ "output_type": "stream",
4877
+ "text": [
4878
+ "/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
4879
+ "collect2: error: ld returned 1 exit status\n",
4880
+ "/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
4881
+ "/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
4882
+ "/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
4883
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
4884
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
4885
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
4886
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
4887
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
4888
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
4889
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
4890
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4891
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
4892
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
4893
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
4894
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
4895
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4896
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
4897
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
4898
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
4899
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4900
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
4901
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
4902
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4903
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
4904
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
4905
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
4906
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
4907
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
4908
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
4909
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
4910
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
4911
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
4912
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
4913
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
4914
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
4915
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
4916
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
4917
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
4918
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
4919
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
4920
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
4921
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
4922
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4923
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
4924
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
4925
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4926
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
4927
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
4928
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4929
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
4930
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
4931
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
4932
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
4933
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
4934
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
4935
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
4936
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
4937
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
4938
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
4939
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
4940
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
4941
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
4942
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4943
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
4944
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4945
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4946
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
4947
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4948
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
4949
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
4950
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
4951
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
4952
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
4953
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
4954
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
4955
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4956
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
4957
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
4958
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
4959
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4960
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
4961
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
4962
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
4963
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
4964
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
4965
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
4966
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
4967
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
4968
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
4969
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
4970
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
4971
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
4972
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
4973
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4974
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
4975
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
4976
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
4977
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4978
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
4979
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
4980
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
4981
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
4982
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
4983
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
4984
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
4985
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
4986
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
4987
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
4988
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4989
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
4990
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
4991
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
4992
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4993
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
4994
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
4995
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
4996
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
4997
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
4998
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
4999
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
5000
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
5001
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
5002
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
5003
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
5004
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
5005
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
5006
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
5007
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
5008
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
5009
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
5010
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
5011
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
5012
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
5013
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
5014
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
5015
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
5016
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
5017
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
5018
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
5019
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
5020
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
5021
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
5022
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
5023
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
5024
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
5025
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
5026
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
5027
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
5028
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
5029
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
5030
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
5031
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
5032
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
5033
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
5034
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
5035
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
5036
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
5037
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
5038
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
5039
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
5040
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
5041
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
5042
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
5043
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
5044
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
5045
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
5046
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
5047
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
5048
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
5049
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
5050
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
5051
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
5052
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
5053
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
5054
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
5055
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
5056
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
5057
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
5058
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
5059
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
5060
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
5061
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
5062
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
5063
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
5064
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
5065
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
5066
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
5067
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
5068
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
5069
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
5070
+ "collect2: error: ld returned 1 exit status\n"
5071
+ ]
5072
+ },
5073
+ {
5074
+ "data": {
5075
+ "application/vnd.jupyter.widget-view+json": {
5076
+ "model_id": "857d0b6286fb4eaaafcb8911cef664dc",
5077
+ "version_major": 2,
5078
+ "version_minor": 0
5079
+ },
5080
+ "text/plain": [
5081
+ "model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
5082
+ ]
5083
+ },
5084
+ "metadata": {},
5085
+ "output_type": "display_data"
5086
+ },
5087
+ {
5088
+ "data": {
5089
+ "text/plain": [
5090
+ "CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', commit_message='Upload model', commit_description='', oid='e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
5091
+ ]
5092
+ },
5093
+ "execution_count": 6,
5094
+ "metadata": {},
5095
+ "output_type": "execute_result"
5096
+ }
5097
+ ],
5098
+ "source": [
5099
+ "model.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_***\")"
5100
+ ]
5101
+ },
5102
+ {
5103
+ "cell_type": "code",
5104
+ "execution_count": 7,
5105
+ "id": "8a28a45b-56ba-4328-8edf-4cd7ee9289c5",
5106
+ "metadata": {},
5107
+ "outputs": [
5108
+ {
5109
+ "data": {
5110
+ "application/vnd.jupyter.widget-view+json": {
5111
+ "model_id": "42c48d91578f41439d7b3ec26a6d566c",
5112
+ "version_major": 2,
5113
+ "version_minor": 0
5114
+ },
5115
+ "text/plain": [
5116
+ "README.md: 0%| | 0.00/5.17k [00:00<?, ?B/s]"
5117
+ ]
5118
+ },
5119
+ "metadata": {},
5120
+ "output_type": "display_data"
5121
+ },
5122
+ {
5123
+ "data": {
5124
+ "text/plain": [
5125
+ "CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/16138639cb17307b84421e443a1c67f4fe188121', commit_message='Upload tokenizer', commit_description='', oid='16138639cb17307b84421e443a1c67f4fe188121', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
5126
+ ]
5127
+ },
5128
+ "execution_count": 7,
5129
+ "metadata": {},
5130
+ "output_type": "execute_result"
5131
+ }
5132
+ ],
5133
+ "source": [
5134
+ "tokenizer.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_**\")"
5135
+ ]
5136
+ },
5137
+ {
5138
+ "cell_type": "code",
5139
+ "execution_count": null,
5140
+ "id": "ec5364cc-4386-4db8-a400-cd788657de84",
5141
+ "metadata": {},
5142
+ "outputs": [],
5143
  "source": []
5144
  }
5145
  ],
02-gpt2_bert/.ipynb_checkpoints/3-dna-bert-checkpoint.ipynb CHANGED
@@ -1,8 +1,102 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "id": "a3ec4b86-2029-4d50-9bbf-64b208249165",
7
  "metadata": {},
8
  "outputs": [],
@@ -10,12 +104,13 @@
10
  "from tokenizers import Tokenizer\n",
11
  "from tokenizers.models import WordPiece\n",
12
  "from tokenizers.trainers import WordPieceTrainer\n",
13
- "from tokenizers.pre_tokenizers import Whitespace"
 
14
  ]
15
  },
16
  {
17
  "cell_type": "code",
18
- "execution_count": null,
19
  "id": "47b3fc92-df22-4e4b-bdf9-671bda924c49",
20
  "metadata": {},
21
  "outputs": [],
@@ -29,11 +124,20 @@
29
  "execution_count": null,
30
  "id": "73f59aa6-8cce-4124-a3ee-7a5617b91ea7",
31
  "metadata": {},
32
- "outputs": [],
 
 
 
 
 
 
 
 
 
33
  "source": [
34
  "# 设置训练参数\n",
35
  "trainer = WordPieceTrainer(\n",
36
- " vocab_size=90000, # 词汇表大小\n",
37
  " min_frequency=2, # 最小词频\n",
38
  " special_tokens=[\n",
39
  " \"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"\n",
@@ -45,7 +149,7 @@
45
  },
46
  {
47
  "cell_type": "code",
48
- "execution_count": null,
49
  "id": "7a0ccd64-5172-4f40-9868-cdf02687ae10",
50
  "metadata": {},
51
  "outputs": [],
@@ -75,10 +179,23 @@
75
  },
76
  {
77
  "cell_type": "code",
78
- "execution_count": null,
79
  "id": "48e1f20b-cd1a-49fa-be2b-aba30a24e706",
80
  "metadata": {},
81
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  "source": [
83
  "new_tokenizer = Tokenizer.from_file(\"dna_wordpiece_dict.json\")\n",
84
  "\n",
@@ -95,7 +212,7 @@
95
  },
96
  {
97
  "cell_type": "code",
98
- "execution_count": null,
99
  "id": "c94dc601-86ec-421c-8638-c8d8b5078682",
100
  "metadata": {},
101
  "outputs": [],
@@ -112,7 +229,7 @@
112
  },
113
  {
114
  "cell_type": "code",
115
- "execution_count": null,
116
  "id": "b2658cd2-0ac5-483e-b04d-2716993770e3",
117
  "metadata": {},
118
  "outputs": [],
@@ -123,49 +240,103 @@
123
  },
124
  {
125
  "cell_type": "code",
126
- "execution_count": null,
127
- "id": "a7d0b7b8-b6dc-422a-9133-1d51ec40adbe",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  "metadata": {},
129
  "outputs": [],
130
  "source": [
131
- "max_length = 256 #最大���入长度\n",
132
- "\n",
133
- "# Building the config\n",
134
- "#config = BertConfig()\n",
135
  "\n",
 
 
136
  "\n",
137
  "# 构建配置\n",
138
- "config = AutoConfig.from_pretrained(\n",
139
- " \"bert-base-uncased\", # 或者其他预训练 BERT 模型名称,这里只是为了获取默认配置\n",
140
- " vocab_size=len(tokenizer),\n",
141
- " max_position_embeddings=max_length, # 对应于最大位置嵌入数\n",
142
- " pad_token_id=tokenizer.pad_token_id,\n",
143
- " bos_token_id=tokenizer.cls_token_id, # BERT 使用 [CLS] 作为句子开始标记\n",
144
- " eos_token_id=tokenizer.sep_token_id # BERT 使用 [SEP] 作为句子结束标记\n",
145
  ")\n",
146
  "\n",
147
- "\n",
148
  "# Building the model from the config\n",
149
- "model = AutoModelForMaskedLM.from_config(config)"
150
  ]
151
  },
152
  {
153
  "cell_type": "code",
154
- "execution_count": null,
155
  "id": "afc2cdd1-228e-4ee7-95f5-07718f00723d",
156
  "metadata": {},
157
  "outputs": [],
158
  "source": [
159
  "# 1. load dna dataset\n",
160
  "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
 
161
  "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
162
  "\n",
163
  "# 2. tokenize\n",
 
 
 
 
 
 
164
  "def tokenize_function(examples):\n",
165
  " return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
166
  "\n",
 
167
  "# 3. 对数据集应用分词函数\n",
168
- "tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'], num_proc=15) # 设置为你的 CPU 核心数或根据需要调整\n",
169
  "\n",
170
  "# 4. 创建一个数据收集器,用于动态填充和遮蔽,注意mlm=true\n",
171
  "data_collator = DataCollatorForLanguageModeling(\n",
@@ -175,13 +346,71 @@
175
  },
176
  {
177
  "cell_type": "code",
178
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  "id": "604491f9-2ee7-4722-aad6-02e98457b5ee",
180
  "metadata": {},
181
  "outputs": [],
182
  "source": [
183
  "run_path = \"bert_run\"\n",
184
- "train_epoches = 5\n",
185
  "batch_size = 10\n",
186
  "\n",
187
  "\n",
@@ -208,10 +437,182 @@
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": null,
212
  "id": "d91a8bfb-f3ff-4031-a0d7-ebedc200d65a",
213
  "metadata": {},
214
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  "source": [
216
  "trainer.train()\n",
217
  "trainer.save_model(\"dna_bert_v0\")"
@@ -219,7 +620,27 @@
219
  },
220
  {
221
  "cell_type": "code",
222
- "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  "id": "fc4ad6ad-6433-471f-8510-1ae46558d4ce",
224
  "metadata": {},
225
  "outputs": [],
@@ -227,6 +648,86 @@
227
  "#upload model\n",
228
  "#model.push_to_hub(\"dna_bert_v0\", organization=\"dnagpt\", use_auth_token=\"hf_*******\")"
229
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  }
231
  ],
232
  "metadata": {
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "6c42f2f6-2332-40c7-9b69-50a0f0c12901",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 2.3 从头训练dna bert大模型"
9
+ ]
10
+ },
11
+ {
12
+ "attachments": {
13
+ "6a042b8f-c47d-4f6d-b601-b80124836ec4.jpg": {
14
+ "image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcU\nFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgo\nKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAGaBDgDASIA\nAhEBAxEB/8QAHAABAAIDAQEBAAAAAAAAAAAAAAUGAwQHAgEI/8QAWhAAAQMCAQQLCgkLAQQKAwEA\nAAECAwQFEQYSIVUTFRYxNEFRkZPR0gcUU1RhcXJ0krIXIjI1UoGUorEIMzZCVnN1oaOkwSMkYmTC\nJTdDY4KDlbPh8ERF0/H/xAAbAQEBAAMBAQEAAAAAAAAAAAAAAwECBAUGB//EAD8RAQABAgEGCwYF\nBAIDAQEAAAABAgMRBBITITFRFBUyM0FSU4GRobEFNGFxctEGkqLB8CI1QuEjQ4LC8SRF/9oADAMB\nAAIRAxEAPwD9UgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAujfI+a9WuGT\nY5bjSMfvZrpmov4gSAMcE8NQzPgljlbyscjk/kZAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAGrUzPdJsEC4Owxe76KdZiqYpjGRllqIolwe9EXk4zyysgc7BHoi+XQY4omRp8VNPGq76\nnpzWvTByIqcinNOU69jbNbINBjnUj0xXGnXRp/UXqN86KK4rjGGoADYAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAACCuGU9BSzrTw7JWVW9sVO3PVPOu8RvX7diM65VhClqzXdnC3GKdBWUu9\n+m009kSNnFs0yIvMFuuUEWmWyxvam/sUyKvMc3GVrdVh9NX2dHArm+n80fdZgV2nyso9lSG4w1Fv\nmXinZg3nLBG9krEfG5r2LpRzVxRTos5Tav8AN1Y4ePgjdsXLPLjB6ABdEAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAqoiKqrgiGpdbhT2u3zVlW/Mhi\nbiq8a+RDk17ymyiyht9fV26N1JZoGrnuRcFcnJjxr5EA37ncbplzfZrXZ53U9qhXCSVqqmenKv8A\nhCZp+5hYoqfCd1TI/D40mejf5YH3uURxUmRj6pGYvc973qm+uG8hCuqcpsu3q2kTa2zq5Wq/H5Sf\ni78AK7c5GZHZQNTJi5vqE3pIt9PRXDQpa4u6Hd2RJJU5PTbFxvbnJ/gmbdYMnsjKPvqskjWVN+ef\nS5V5Gp1EnYcrbNfZFgo6hEm4opW5jnJ5EXfAxZLZY23KFVigV0NUiYrDJoX6uUspzXukZOtoEblD\nZ02CpgejpEZoRUx38C85O3FLtZaStbo2aNHKnIvGBIgByo1qqq4ImlQNe4V1NbqV1RWzMhhbvucu\nBBNyhr61M60WSpmi4pZ3JC13mRdP8jBZKfdHW7d16Z9G1ypb4HfJRqLhsqpxq7i5EwLZvAVtbxe6\nZM+tsD3R8a007ZFT6tCknZ7zRXeN60ki7JGuEkT0zXsXkVq6UJErmVlskaxLzamo260aK9Ebo74Y\nml0TuXFN7kXACwTytggklfjmsarlw5ETErNLlZPVU8c9NYLnJDImcx6Nbg5OXfJd9ZFcMnX1lOuM\nM9KsrF8itxQ18iv0TtXq7fwA1N0lZ+zt0
5m9Z6TKuGHBblb7hQMX/tJocWJ51THAseB8c1r2q1zU\nVqpgqKm+BF3K7upoIJqKiqLhHNpR1NgqInLvkfukrP2dunM3rPOTzEteUVztEOPeisbVwM4o85VR\nzU8mKY/WWfACpx5XzSVc1MywXRZ4ka57c1uhFxw4/Ipn3SVn7O3Tmb1n21fpte/3FP8A8xZMANW2\nVb62kbNLSzUrlVU2KXDOTmNoADzLIyKNz5XtYxqYq5y4IhAy5Y2ONyolc16JvujY5yJ9aIatXAmU\neUU9HUKq2u3o3ZIscEmlcmKI7lREw0eUssNNDDGkcMMbGImCNa1EQDUtd5t11RVoKuKZU32td8ZP\nOm+SBX8o8noayFaqga2lusKZ8FRGmC4ppzV5UXeVCQyeuG2llo6xW5rpY0c5vI7jTnAkAAAAAAAA\nAAAAAHieRIoXvX9VMSPp6iBiK188ezKuL/jJjibNcucsMXE52K+ZNJCVOTNHNK+RHysc5cVwXE5M\npq1xS2hNtkY75L2r5lPquRqYuVETylMS00y3B1HT3GVs7eJW6PNiZK6ypRw7LX3N6R45qYNVVVec\n5mVomqqZrVbLNEiKmCorkMttlR8KszkdmLgjsd9vEpXaXJmhliZKlRLKxyYouOGJN0cDKOeCKFFS\nNY1jRPNpT/JaxVhVhvYlJAA7moAAK1lNlrZcm62Gkuk8jJ5WZ7GsjV+KfURXwqZM+Hqvsz+ojMo2\ntf3bcnmvajm95TaFTE6P3rT+Ai9hAKY3up5LY/6lZNEnLJA9qfgWqz3i33mm74tdXDVRcbo3Y4ef\nkM8lDSSNVslNA5q8Sxopy/Lu0w5C3O35U5PolJEtQynrqWPRHKxy4Z2HKgHTbtcKe1W6orq16sp4\nGK97kTHBEKe3uq5Luajm1FS5q6UVKZ6ov8iR7prkf3PL45N5aRyoeu5zTwuyFsSuhjVVo48VVqfR\nAjfhUyZ8PVfZn9Q+FTJnw9V9mf1F171p/ARewg71p/ARewgES/Ki2R5MLf3yvbbUbnq9WKi4Y4b2\n/vlfTuq5MKiKlRUqi8aUz+oyd2ZrWdzG9tYiNRIm4IiYfrIT2S1NAuTdrVYYlVaaPTmJ9FAK98Ke\nTHh6r7M/qJG05f5M3WpSnpLrBs7tCMkxYq85Y+9afwEXsIV/KzIqzZR0EkNVRxRz5q7HPG1Gvjdx\nKioBJZQ32hsFAlZcXvbArkbixivXFfIhWfhUyZ8PVfZn9RrdyC4VdXabjZ7w9KmptFStNsj9Oc39\nVS/d60/gIvYQCkL3V8lkejFqqhHrvNWnfiv1YHr4VMmfD1X2Z/URWUcMSd2fJpiRsRq0kuKZqYKd\nK71p/ARewgEPkxlVa8pVnS1ySv2HDPz4nM3/ADk6eI4o48djYxmO/mpgewAAAAAAAAAAAAACkZWX\napqLgtuhiqmUbNE8kLFVz/InkM1rvdotcLYY6GppGcbnQrivlVS44JyHl7GORc9rVTyoeTPs+7pa\nr0XNc76ccI3Rrehwy1NuLU0ao3ThjO/Y0aC50VwbjSVEcnkRdPMbhR8oNrqqsdDY6WWW5tX87Srm\ntYv+8u8WawJcm0LW3jYlnTecxcVVPL5TTJ8qqruTaqjHDpjk/L4T4mUZLTboi5TOGPRO3/54N2pp\n4aqJYqiJksa77XpihWqi3VeTz3VVmV8tFjjLRuXHBONWkhlRbKq4Uf8AsFVJDOzSjUdg1/kUr9jt\n9JcM6B9bcKS5R6JIXy6UXlTHfQ58rqmq9FFNGFX+NWOHdH2navklMRamua8aemnDHx+8bF0tlfBc\nqOOppnYsem9xovIptFaydsdXZK6VG1KVFHNpXO0Oa7l8pZT2MkuXblqJvU5tXS8/KaLdFyYtTjT0\nAAOlAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHNu63\nPJU1V
ntDHK1lTLnPw49KJ/ksGVlDDb+59cqSmY1kUVKqIjU5CB7rkElPJabtExXJSy4PVOLSip+B\naat0WVOSFQ2gmYqVlO5jXLvNcqcf1gQfc4XDueuVf++IXJDKC4vyfprVk/QOmq2ZyPqJdEUeKrzn\n2zXZ2TGT9TY77SzUc6tk2KZUzo5FXiRyFp7mKNTI2hVERFVHKvl0qBTYKeS0ZQrV5dU81ZnL/pVS\nrnws/wDDxFiveRVoyhhSvs0zKWpd8ZssHyHL5UTeXyoT+WrUdkrc0ciL/oOXShRbNZqy0ZJwX6y3\nB8MjadZp6eX40ciImK4JxKBoXm85RWO1VVov9P31BMxY46lV/wA8f16S89y9ix5F0KK5FxznaOLF\nymzaJ4cr8k4pa6mYjalrkczfRFRVTFOYq3cymmteUF2yenermQqskfkwXBfxQDpZA5dzyQZJ3HYV\nzZZWJTtXkWRyMRfvE8VzuhJhkpVS4KqQSQ1C4cjJWOX+SKBPUdPHSUkNPC1GxRMSNiJxIiYIZQio\nqIqLii6QACoipgu8QVZbr1LVSPpr5sELlxbH3qx2anJiu+YdqsoP2i/s4+oCSnpIKGxT01JGkUEc\nL0Yxu8iYKa2RX6J2r1dv4GjYK6qr8mLi6vmbPNDJUwbIjEbnIxVRFwTRxEbkpk/Uz5N26Vt+ucSP\nhaqMY5uDfImgC9YnmSRkUbpJHtYxqYq5y4IiFd3NVf7RXb2mdk9NyUppXNW51ldcWtXFGVEyqz62\npgigYsm37aXy43qNFSke1tLTOX/tGtVVc9PIqrgnmLOeY2NjY1kbUaxqYI1EwREPQFbtX6bXz9xT\n/wDMWQrdq/Ta+fuKf/mLFK/Y4nvzXOzUVc1qYqvmQD0Ct7rG6lv32BxPUc/fVLHMkcsWe3HMlbmu\nb5FTiUDWtdtZb5K57ZFe6qnWd2KYYYoiYfyN8EVfb7SWhsbJc+asmXNgpIUzpZl/3W8nKq6E41Ay\nX+5MtVrmqX6XombExN9710Nan1njJehfbbBRUsuGysjTPw+kulf5qaFstFXWXBl1v6sWoZwekYuM\ndP5cf1n+XmLGAAAAAAAAAAAAAAac+muj8jFUyGObRXs8sa/iZDgv8ttCIr+9qSr2eKFnfb00v5DB\ns8dwRKe4Ma+NVxRU0YKeLqqrXSY+Q1E30wPNqu1RW9O3Yom3GMbVqhiZDE2ONqNY1MEROIxz6JKd\n3JKn80VP8mWPFWNx38EMVR8unTllb/lf8HoWuVDzZbwAPRaAAA5llG5rO7bk857kaneU2lVwOj99\nU/h4vbQ5Tl7aKS+d12w0Nwa59O+jlVyNerV0Lo0oWH4K8l/Fqn7S/rAucldSRtV0lVA1qb6ukRDm\n2W9WmXdxoMnLEiz0UVQyor6xqf6bGt3mIvGqrychI1PckyWnjViw1bfKlS/rIito7v3MIY66hrJb\nnk0x6NqaWZE2SBqrhntcm+iAW7umtRvc8vjU3kpHIg7nNRA3IWxI6aNFSjjxRXJ9Exd0SpirO5pd\n6mnej4ZaJZGOTjRURUKxkT3OMnbjklaaypp6hZ5qZj3qlQ9EVVTToRQOn99U/h4vbQd9U/h4vbQp\nXwVZL+LVP2l/WfU7leTCLj3tU/aX9YGTu0ae5le8PBN95CxZK/o1avVY/dQrXdijbD3LrxGxPiMh\na1EXkRzSJsOR+UE1koJYss7jDG+Bjmxtjbg1M1NCAdPNK8XOktFvmrK+ZkMEbVVVcuGPkTlUpa5F\nZRqipu4ufRtK7d8kb5k+7birljysgpl2R0FXnNkjRN9WJirVXzoBZe5FQVDaC6XmridDJd6t1SyN\nyYOazebj9RfiMyavNJf7JS3K3qve8zMURUwVq8aL5iTA5plH/wBdWTPqkp0s5plH/wBdWTPqkp0s\nAAAAAAAAAAAAAAAAAR19oqi4UDqelqe9lfoc9ExXDkQkQqoiYquCGl2
3TcomirZLe3XNFUVU7YU+\nhs99s0CRW+aimiT9R0eaq+dTZjyjkpZGxXuiko3LoSVPjRr9ZtV+VFtpZVhje+qqN7Y6dueuP1aD\nSnut1ronMZk690Lkw/2iRG4/UeFVoLH9GTXZxjowmuPLGY8XqxFy9/VlFuNfTjFM+eqfBZIpGTRt\nkic17HJijkXFFI68WSluma+VHR1DPkTxrg9v1kHkxS3i3XF8ctFsVulXFGJKjkiXyceBbzps1Rld\nr/low3xMecY+Tku0zkt3/irx3TE/ZW2w5R0HxYZ6aviTe2VM1/Oetsco10JaadF5Vm0FiIfKK7d4\nRMgpW7LcKj4sMSaV9JfIaXbegomrS1REfGJ8MYmVLd2b1cU6OmZn4YekxCLtlyvdblD3nOtPHFAm\nfNsSZ3mbivGbOVNJWUlruFfTXetjfGx0jGIjFa3yfJ3iSyctW1dDmyOz6mVc+aT6TlMeWf6K3T9w\n47cgtXLdrG9MzVOvXOOHw/nSjllyiu5hbiIiNWrp+KKvMNdbMnqmuiu9bJKkbcEkRioiuc1MdDd/\nSSKWesWPFl9r0eqaFVsapzZpgyyTOyJq2oqtxjjTFN9PjNPT8npZadWperm3FPCov+DtcjPYrs+W\nyT1NydGjqV8jJZGpg1yM33InmNTJa418lVJDdnfHqokradubhmMVcFj8qt+L7RCPqZ67J+is1DBB\n31LUPilZirWLHG7F6rhiqZ2hPrU3b4+800lDc6qkoo4aGTGR0MznO2JyZrkwVqcWn6gMtNea2nyl\nuCVkiPteztp2/Fw2B6tRUVV5FVcPOS+UVZNS7Xd7vzdlrIo36McWqulDQssEFdWZRwzNbLBNO1FT\nicisQjK6omglt1pr3q+ppq6B0Urv+3hzsEd6SbzvLgvGBJ0qVF8r7hstwqKWKmmWFkFOqNcmH6zl\nVFXTzHpzrpbG3CGSd9TTNpnTQVL2pnscmjMdhoXiVFw5T3DSW2/umqthkp6uKR0L3MkzJEVq4aVT\n/JoXB9VbZqm2Pq5KymqKKaVqy4LJErcE0u42rncfGgG5arbWVVspaiS93BHyxNe5ESPDFUx+ieqi\nWtlro7LQVb0dFEktVWPajnoiqua1EwwzlwXzIhhsWTtLJZaF7pq3F0DFXCpeifJTymW3Zluyvrqe\nVyolZTxPgc9fl7Gitc3HjVMUX61A9VVsuNvhdU2u41VTNGmctPVORzZcP1UVERWryKasldLe7ram\n0NfUUtJU0L6nCJG4qucxExxRd7FSyXCtgt9FNV1UiMghar3OXkT/AD5CiWe01Hfthppp6mjmS3Ty\nPSJ2a5qulY7NXzY4fUBNXdlwsdK2viuk9VHHIxskFQ1q57XORvxVREVF0nm03asXLG60lXJnUTpU\nipkwRNjc2Jj1bjx45yr9SmG+2yS1sprjJXVNbFBURZ8FS7Obg56Nzk/3kzkX6jG+mfPJlLLTp/tN\nNcGTxL/vNgiXD60xQCx5Q1r6K2PdT4d8yuSGFF+m5cE5t/6iq2mtuVzpcmYpblPG+ppZZJ5YmtRZ\nHNwRF0ouG+S9JVMvt7ppotNJRwpL5NlemhPOjcecrdjo466HJCKV0rW95zrjG9WLvt40AuMFoqIp\no3uvFfIjXIqsdmYO8i4NIrJ+mrrrblrJrzWxvfPO3MjRiNajZXtRExbyIhL0tipqaoZNHNVucxcU\nR87nJ9aKpX8krPJU2XZW3OvhR1TU/Ejkwa3/AF372gCXtFRWU18qLXV1PfjEhbPHM5qNeiKqorXY\naOLfNnKCWso2RV1IqyRQLjPAiY57ONU8qb5H5Pwut2UFfQSyOqXuiZO2ok0yORVVM1y8eCp/Mlr7\ncm22hWTM2WeRUjhhTfkeu81P/u8BHXG7OrnUtDZJkWepakr50TFIYuN3nXeROo2cnqueqmu7Z356\nQVroY9G81GMXDnVSBsNHJkpcEbW
q10F0eiumRMEinX/s/I1eLy+cl8leE3/+JP8A/bjAnwAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEJlurEyUuayMa9qQroVMTlWSdPlLaLRHeLLhUUcirslOi4\n7y4Lin+UOyXiibcbXVUb/kzRqznOe9zC8ttk9Tk5c3bDPHK7Yc/Qjl42+fjQCYs2VtlyngWgukTI\nZ3aHU9QmhV8i/wD1SEvEj8irhEzJ2u75jmfptb1V6pjxtw3i2ZRZGWm+O2WWJYKnw0PxXfXymBlJ\nk7kVSLU1D2tmVPzsq58si+TjAr2UVLlHe7JWVt0lS20McSvbSRri5+H0lJFz1j7kL3J4jhz6P8kR\nW3y+ZbukorDSrTWt3xZKiVN9PKv+ELBlXSss/c0qqN8qO2OnbEjt7OXFE0AZu5i9rMhqJz1RrW7I\nqqvF8dSv5Cu217oV7usOPezWrG13LiqYe6V/JxmUt7sEFntsPe9uRXZ9S7FqORVxVMePzIdUyUsM\nGT1pZSQLnv8AlSSYYK93KBMmCvpYq6hqKWdM6GeN0b05UVMF/EzgCvZHVsjqN1rrnf8ASNvwhkx3\n5Gp8mRPIqYfXiWEh73Y2XCWOqppn0dxh0R1MaacPouT9ZvkU02V+UdHhHV2iGvw3pqOdrMfOx+GH\n1KoFkI3KG5stNskqHJnyr8SGJPlSyLoa1PKqkc+732b4tHk7JE5d59XUxtannRiuU9WuxVDq9tzv\ntS2rr2IqQsY3Nhp8d/Mby/7y6QIjISGWmyJuMNQ/ZJo56psj+V2c7H+ZPZFfonavV2/gY7Naaijt\nFxppVjWSonqJGZqqqYPcqpjo8pu5OUUtusVDR1CtWWGJrHZq4pinIBIgAAAAK3av02vn7in/AOYs\nhEUNumgyjuVc9WbDURxMYiLpxbnY485LgAABAXy71KVjbVZmNluL25z3v+RTs+k7y8icZnsVgp7U\n6Soc99VcZk/1qubS9/kT6LeRqaCFt1NlHbaq4yR263VDqmodLsz6xzXK3eaipmLhghv9+ZUaotn2\n93/8wLGCud+ZUaotn293/wDMl7XJXS02dcqeGnnxX4kMqyNw4tKon4AbgAAAAAAAAAAAADUrEzZo\nH8WKtX6z2equNZadzW/K32+dDUjWWViPZI1EXiVm95N848op1xLaGtdaJ0ypLEmLkTBU5TUobfI+\nZrpmq1jVxXHjJbMn8Kz2P/kZk/hWex/8nFNmmas5005RXTTmwzmJfj1sDfoo56/gn4qecyfwrPY/\n+T3QIr3STu053xWr5E4/rXE67FONWLmluAA7moAAOZ5Qf9d+TvqUx0w5rlza8omZfWq+2G0suMdN\nTPic11QyLS5fKuJsbocvv2Lg/wDUY+sDoRU+6tVQUnc/vTqlUzX07o2ov6znaERPrIhb73QpPixZ\nI0MKr+tLXtcifUinikyNvV/udPX5d18M8NO/ZIbbSoqQtdxK5V+UB5uFNNSdwqSCpRUlZa/jIu+m\nKYlk7m/6B2H1OP3TNl1b6i55HXWgoI0kqZ6d0cbMUbivJiuhClZPV+XtmsdDbm5HwStpYWxI9bhG\nmdgmGOGIHUwc93Q5ffsXB/6jH1jdDl9+xcH/AKjH1gbndo/6s75+6b7yFiyV/Rq1eqx+6hWsrqO+\nZS9zKvpJbY2nvFQzNSlSdrkRUcn6+OG8harBTy0ljoKeobmzRQMY9uOOCo1EUDfPMzUdE9q6UVqo\np6CpiioBzbuGPVthu1Lj/p01ymYxPopnbx0ko3czsdysFTlBBcKbMp5611RTyo9qo9rvIi4p9ZeQ\nOaZR/wDXVkz6pKdLKPerFcanun2O7w06Ot9NTyRyy57UzXLvJhjipeAAAAAAAAAAAAAAAAANS6XC\nntlI6oq3o1ibycbl5EK6ymuOUa7LXvkoravyadi4PkTlcpgvarTZTxVN8Yr7amine1MWRu5XJylu\
nieyWNr4nNcxyYo5q4oqHjVVzll2q3XOFNOrN6Z+M/DdHT07npYcFt010RjVV09EfCPjva9vt9Jb4\nkjo4GRN/3U0r51No1bnRNr6R8LnvjVdLXsXBWrylFkdeLRVrDc7pVQU6r/p1KM2SNfPxoYyjKYyP\nCnR/074wiI+ezBrYyfheM5/9W6ccZ+W3F0Q8TSxwsV8r2sanG5cCr0tFXV7EdFlNsjF8C1uP4m5D\nknSOej6+oqa13/ev0cyFKb1+7GNu33zMYeWLWqxatzhcueETj54MdXlEtTKtLYoVq6hdCyYf6bPK\nqm5YbJ3lI+srZO+blL8uV36qcjeRCWpaaCliSOmiZExOJqYGUvZyOc+Lt+c6qNm6PlG/4y0uZTEU\nzbsxhE7d8/P4fAMNbSw1tLLTVLM+GVua9uOGKGYHe5GvWUVPWUTqSoZnwOREVuKpvKip+BsIiImC\nbwAEfRWahoqySqpoMyeTHF2cq764rgi72k3KiGOogkhmaj43tVrmrxopkAGnbrbS25r0pI8xH4Z2\nLlXHBME3/Ih8uFro7hPSTVcDZJaSTZYX4qisd9X4G6AIqusNHVVTqlFnp6l2h8lPK6NX8mdhoX6z\n1R2KhpWVCIySWSobmSyzSK9728mcunDyISYAx08MdNTxwwtzY42o1qY7yJvGC522lucCRVkSPai5\nzVRVRzHcStVNKL5UNsAQ0OTlEyeOWd9VVuiXOjbUzOkaxeVEXRj5VJJ1HA6uZWKz/aGRuia7FdDV\nVFVMPOiGcAYK+jgr6V9NVMz4X4KrcVTeVFTSnlRD5TUcFNLUyQszX1D0klXFVznZqNx5mobAA07Z\nbKO1xSR0MKRMkkWRyIqri5d9dJpS5NW18NHE1k0TaRrmQrDO9itRd9MUXFfrJkAQ8GT9LDMyVs9w\nVzHI5EdWyuRcOVFdgpI0NHBQ06QUrMyJHOfm4qulzlcu/wCVVM4A1+84O/u/Mz/aFj2JXYr8nHHD\nDznyWhp5a6Gskjzp4Wq2NyquDcd9UTex8psgDXuFHBcKOWlq40kglTBzV0fz4l8pGPyZoHTzTMfW\nxPmdnybFVysRzsETHBHYY4IhNgDStttht6PSGSpfn7+zTvl5s5VwN0AAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAeJZo4WK6WRjGpxuVEQD2VnKzI6hyicyaRz6erYmDZo99eTHlJOW/WqNcH19P9T8\nfwMS5S2dP/2EHOvUb6OvdLfR1blRZkxlhSxrDS5QtfCmhM/FXYedUUpveqWzKrDLpKqoZhi1+crm\nu5NPJ5EOwbprNrCDnXqNervWTtZHmVdTSTN3sHpj/gaKvdJo690tOPLXJmko2pT1cTImJg2ONmGH\nkRCo3Gtru6HdoKKhilp7NE/PfI5MMcOPz8iFiZSZEMnWZrKHO38FRcOYnIMoLFBGkcFZTRsTea1M\nE/AaKvdJo690pajpoqOlip6diMhiajGNTiRDMQ+6azawg516hums2sIOdeoaKvdJo690pgEPums2\nsIOdeobprNrCDnXqGir3SaOvdKYBD7prNrCDnXqG6azawg516hoq90mjr3SmAQ+6azawg516hums\n2sIOdeoaKvdJo690pgEPums2sIOdeobprNrCDnXqGir3SaOvdKYBD7prNrCDnXqG6azawg516hoq\n90mjr3SmAQ+6azawg516hums2sIOdeoaKvdJo690pgEPums2sIOdeobprNrCDnXqGir3SaOvdKYB\nD7prNrCDnXqG6azawg516hoq90mjr3SmAQ+6azawg516hums2sIOdeoaKvdJo690pgEPums2sIOd\neo9JlJaF3rhBzjRV7pNHXulLA06e50NSqJBVwPcvE16Y8xuGsxMbWsxMbQAGGAAAAAAAAA0Z071l\ndJp2B64uw/VXl8ymzPURQJjK9G+TjNR10hXFGRyP8zThynLcl
tf0Xa4id3T4KU266tcQ2EVFRFRc\nUXjBGLOiKqwRVEePEiYpzHzvh7tEqVKt5GtzTzuH5Ljy/Kfs30Ve5uyOWeRYIsf99ybyJyec3mNR\njUa1MERMEQjYq9kTEayllaiciHrbRPF5uY6rftPIqIwz/KfsxNm5PQkQR22ieLzcw20b4vNzG/G+\nR9fyn7MaCvckQR+2jfATcw20Z4CbmHG+R9fyn7Ggr3JAEftozwM3MNtGeBm5hxvkfaev2NBc3JAE\nftrH4GbmG2sfgZuYcb5H2keZoLm5IAj9tYvBTcw21i8FLzION8i7SPM0FzckAR+2sXgpeYbbQ+Dl\n5hxvkXaQaC5uSAI/baHwcvMNtofBy8w43yLtINBc3JAEfttD9CTmG20H0JOYcb5F2sGgubkgCP22\ng+jJzDbaD6MnMON8i7WDQXNyQBH7bQfRk5httB9GT2TPG+RdrBoLm5IA0Ntafkk9kbbU/JJ7I43y\nLtY8TQ3NzfBoba0//eeyem3OlVdL1TztU2j2rkc7LtPixobm5ug8RTRypjG9rk8ins7aa6a4zqZx\nhpMYbQAGzAAAAAA8Twx1ETop2NkjcmCtcmKKVt+T1XbnufYK1YY1XFaab40f1chZwq4b5zX8lt38\nJrjXGyY1THevayiu1jFM6p6J1x4Kwlyv8HxZ7O2ZfpQy6DzJW36sasUdniia7Qq1D0VOYkrhlNY7\nc5W112oYHpvtfO1HJ9WOJFP7ouSTF036i+pyr/gjxdcmMNJVh3euDM+0bFE66aYn5z6YsVtyLiSq\nWqub2PkVcdigbmMTmLgxrWMRrERGomCInEVL4SckNfUfO7qHwk5Ia+o+d3UdGS+z7eSxMWqMMdu+\nUL/tOMonG5cicPjC3AqPwk5Ia+o+d3UPhJyQ19R87uo6cyrchwmz148YW4FR+EnJDX1Hzu6h8JOS\nGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO7qHwk5Ia+o+d3UMyrccJs9ePGFuBUfhJyQ19R87uofCT\nkhr6j53dQzKtxwmz148YW4FR+EnJDX1Hzu6h8JOSGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO7qHw\nk5Ia+o+d3UMyrccJs9ePGFuBUfhJyQ19R87uofCTkhr6j53dQzKtxwmz148YW4FR+EnJDX1Hzu6h\n8JOSGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO7qHwk5Ia+o+d3UMyrccJs9ePGFuBUfhJyQ19R87u\nofCTkhr6j53dQzKtxwmz148YW4FR+EnJDX1Hzu6h8JOSGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO\n7qCd0jJFf/31Hzr1DMq3HCbPXjxhbgVqmy7yXqXI2K/W9VX6UyN/En6Srp6yJJaSeKeNd50b0ci/\nWhiaZja3puUV8mYlmABhuAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABHZQ3Day0VFSmGe1uDEX6S6EM\n0xNU4QzETVOEIy/X6WOqS3WmPZq52hV30Z5zXpsklqnJPe6uapmXSrEdg1PIbeRtr70t6Vc/xqup\n/wBR7nb+C6UQsJeq5o/6bfitVXo/6aPFCxZL2eNuHeTHeVyqpk3OWjxCHmU3q+uprfTOnrJmxRJx\nu4/InKQL8rWOXGltlfMzicrWsx9pUJ6S5PTLSK7k7JlIbnLR4hDzDc5aPEIeYjN1c2pK3pI+0N1c\n2pK3pI+0Zzrm+WcbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ\n8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXN\nqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w\n3OWjxCHmIzdXNqSt6SPtD
dXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqS\nt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3O\nWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6\nSPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWj\nxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8x8XJu0L/wDgQ/zI7dXNx2Wt9uPtGxR5\nVUUszIquOoonvXBq1DMGqvJnJoMZ9yOmWJquRtmXypyPtMyLmQuhdxKxy6CMliu2TC7LHK6utqfK\na75TELofHNR7Va5EVqpgqKbU36tlWuGYvVbKtcNa210FxpGVFM7OY7nReRTaKdQtWwZVLRtVUoq1\nM6NOJruQuJrcoimdWyWLlEUzq2SAAmmAAAaVfVOY5sFOmM7/AORuOVGtVy7yJiR9qZsjpap+lz3Y\nJ5EPOy65XVVRk1qcJrxxndEbe/ohW3ERE1z0MlNb2MXPn/1ZV31dvG41EamDURE8h9IWryjpY6h8\nFJFUV0zND0pmZyNXkVy6E5zpyfJbWTU5tunD175aVVzVrqlNAr239bxWCvw/eRJ/zDb+u1BXdJF2\ni+MNMYWEFe2/rtQV3SRdobf12oK7pIu0MYMYWEFe2/rtQV3SRdobf12oK7pIu0MYMYWEFe2/rtQV\n3SRdobf12oK7pIu0MYMYWEFe2/rtQV3SRdobf12oK7pIu0MYMYWEFe2/rtQV3SRdobf12oK7pIu0\nMYMYWEFe2/rtQV3SRdobf12oK7pIu0MYM6FhBXtv67UFd0kXaG39dqCu6SLtDGDOhYQV7b+u1BXd\nJF2ht/Xagruki7QxgzoWEFe2/rtQV3SRdobf12oK7pIu0MYM6FhGCchXtv67UFd0kXaG39dqCu6S\nLtDGDOjesOCciDBORCvbf12oK7pIu0Nv67UFd0kXaMajOjesGCciDNTkQr+39dqCu6SLtH3dFMzT\nU2W4xM43NRkmH1Ncqj+kzo3p/Nb9FOY8uhicnxo2L9RgttwpblBs1HKkjMcF4lavIqb6KbRibdFW\nqYbYy0J7czHPplWKRN7BdB6oapz3rBUJmzN/mbpoXaPBjahmiSNcceVDyspyeMixynJowiNdVMbJ\njp1dExtWoq0n9Ffc3weYXpJE16bzkxPR61NUVRFUbJQmMAAGwAACMykvlFk9aZrhcpEZDGm9xuXi\nRPKctpqfKvumOWpqKqSyZOOX/TjjxSSZvL5fwMt8jdl73Um2eRVdZrOiSTtRdD38i/8A3iOvRRsi\njbHG1GsaiI1qJgiIW5uPi8/CcrrnGcKI1fOenuUC29yLJSja3ZqSWrkTfdNKun6kwJVvc6ySamix\n0n15y/5LYV+65WW23zup0WSpnbocyBudmryKu8hG5fzIzq6sI+MumjI7XJoojwanweZJ6jo+Zesf\nB5knqOj9leswLl1HxWusX/xM6z5u6j1VWe3H1nJxrkvb0/mj7r8Wz2X6f9Nj4PMk9R0fsr1j4PMk\n9R0fsr1mvu6j1VWe3H1jd1Hqqs9uPrHGuS9vT+aPuzxbPZfp/wBNj4PMk9R0fsr1j4PMk9R0fsr1\nmvu6j1VWe3H1jd1Hqqs9uPrHGuS9vT+aPucWz2X6f9Nj4PMk9R0fsr1j4PMk9R0fsr1mvu6j1VWe\n3H1jd1Hqqs9uPrHGuS9vT+aPucWz2X6f9Nj4PMk9R0fsr1j4PMk9R0fsr1mvu6j1VWe3H1jd1Hqq\ns9uPrHGuS9vT+aPucWz2X6f9Nj4PMk9R0fsr1j4PMk9R0fsr1mvu6j1VWe3H1jd1Hqqs9uP
rHGuS\n9vT+aPucWz2X6f8ATY+DzJPUdH7K9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7n\nFs9l+n/TY+DzJPUdH7K9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7nFs9l+n/TY\n+DzJPUdH7K9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7nFs9l+n/TY+DzJPUdH7\nK9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7nFs9l+n/AE2Pg8yT1HR+yvWPg8yT\n1HR+yvWa+7qPVVZ7cfWN3Ueqqz24+sca5L29P5o+5xbPZfp/02Pg8yT1HR+yvWPg8yT1HR+yvWa+\n7qPVVZ7cfWN3Ueqqz24+sca5L29P5o+5xbPZfp/02Pg8yT1HR+yvWPg7yT1HR8y9Zr7uo9VVntx9\nZkjy6ps7/Wt9bG3lRGu/kimY9qZNM4Rep/NH3Yn2dhttfp/08VPcyyRnYrVs0LMeNjnNX8StXDuU\nOtb3VuRV2q7fWN0pE9+LH+T/AP3E6Xa7nSXSn2ahmbIzeVN5WryKnEbh203auiXLXkVir/HCfhqn\nyc6yEy5qqi5uyeyrgSjvcehrsMGz+bynRTn3diyZ21sLrrQIsd2tibPFKzQ5Wt0qn+fqLBkBf0yl\nyToLkuCTSMzZkTikbod/PT9ZmuImM6GtiuuiubFycemJ3x94WEAE3YAAAAAAAAAAAAAAAAAAAAAA\nAAAAAVTuhKr6Ohp0XRNUIi/h/ktZU8udNXY05alPxaWyfnIWsc5C1sajGNa1MERMEQ+gLvKRRUa6\nSrccoKl0vxoaNUiiau9nYYud5+I9mrS6a67L/wAbJ+CG0dduMKYd9qMKIAAbqAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAeJomTROjlaj2OTBWrvKewBKZF1Mj6Ooo5nq91JJmNc5dKsVMW4/gWIqmR3zpd\n08sXuqWs4qowmYedXGFUwqeXTdjktdUmh0c6JiWti5zUXlQqvdC+baVeSdpaIPzLPRQrXzdPepXz\ndPe9gAiiAADDWrm0kq/7qmO1phQxebE9XHgM3oi38Ch9FDzZ1+0I+FH/ALK/9Xejcq6mSKgjp6dy\nslqpEhRyb7UXfVPqPlHTRUdMyCnYjI2JgiJ+Jhyo4fZU/wCJX3VNw7q3LcAATTAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAARNWiW68UddB8VJ5EgnRN5yL8lV8qKWsqmUfBKbyVcPvoWstRsWo2BirG51\nLKi/RUynio/MSeippfjOtVRO6VKdsNe1OzqGPHi0G2aVn4CzzqbpzezZxyS1M9WPRvd5cgAO5MPM\nrsyNzuRFU9GGt0Uk3oL+AYnY5h3Dmd81GU1yfpfPXObj5EVTqpy38n7Tk7c15a551Ipe5cuTII//\nAD0oLLS4S2+xyLTuzZpnJE1yb6Y768xzmKNsTEaxNCfz8pde6P8AN9Cn/Ep7qlMPzv8AGd6vS27O\nP9OGPfjg+s9i0Rm1V9OOAAD4l7YAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADbsdY+23ulnjXBkr0h\nmTici6EX6lOrHG5dDoF5J4vfQ7Im8fqH4TvV3chwrnHNmYj5YRP7vl/a1EUX8Y6YeZo2zRPjkRHM\ne1WuReNFOW/k/vdDZ73bnKqpSV7mpj5dH/KdUOU9xDRdstmcTbkvvyH1dPIq7ngXtWUWp+r0dWAB\nJ2AAAAAAAAAAAAAAAAAAAAAAAAAAAFTy54dYvWU95pbCp5c8OsXrKe80tk/OR/Ohaxy4WwLvKAu8\npFFz+k4bdfXZP8G0atJw26+uyf4No66OTD0LfIgABu3AAAAAAAAYK6R0NDUSswz2RucmPKiKpV6S\n53WngtFTWVMNRDcGq3NSLMdG/Y1cipp0powLXPE2aGS
J+OY9qtXDkVMCIosm6OlfC7ZaqfYGKyFJ\npVckaKmC5qcS4aMTEtaomZ1K/acoa5bIlxqap0z+9klWFaTY2Zy6E+PjvYqStVWXSz0c1bWVNPVw\ntpnyrGjUY5r0RMEbyt06Taocm6WkpkpkqK2alSPYkgmmzmZvJhge6fJ6jjc5Z31FUixOga2okz0Y\nxd9qecxhLSKasEdV191tPe7qyphqEqo34I2LN2N6Mzkw5UMMN0utJT2yoq6mGoir2ORESLMdG/Y1\nei7+lNBLQZO0ka4yS1U+bGsUaTSq7Y2qmC5v1cZ8pMm6OnWNXS1U+xRujiSaVXJGipguanEuHGMJ\nM2pXqS+3J9gmr3VyvmbS7LmOo8xqO0bzsdJJZO3eqqrz3o+rbVxLTbM5ywbErHYoiInKhuQ5M00d\nE6kWrr30qx7FsT58Wo3yaCS2vgSvgq0zkmhhWBunQrVw3+XeERJFNXS3AAbKgAAAAAAAAAA28jvn\nW7/+V7qlrKpkd863f/yvdUtZx18qXn3OVKqd0P5spv37S0QfmWeihV+6H82U379paIPzLPRQpXzV\nPe3q5unvewARRAABrXLgM3onqg4FD6KHm5cBm9E9UHA4fRQ83/8Aof8Ah/7K/wDV3oXKj5wsvrC+\n6puGnlR84WX1hfdU3Dtr2uW5tAAaJgAAAAAAABS8pLzX0+VW19PWOpqdKJk/xKTZ3K5XuauOnQmC\nIXQhbnk9BXXTbBKutpqlYUgV1PNmZzEcrkRdHKqm1OHSzCs1GVVwoLhdIKpzHU0dOiU86x5uE2x5\n2Dk8vIb1Dcbxe5KtKKrgpEoookVXRZ2yyvjR646dDdKIStRkvb6qhrqWq2aZlY1qSufJi5VamCOR\neJfKeajJaikXGKarp86JsEuwzK3ZmNTBEdyrhox3zOMGpoxVt5utbVwUdVTUi0UUefgzZEllc3Hf\nx0NNG2Xq8X+sjbQVcNE3a9lSrHQ56LIr3NVMcd74pPVOTFFI9HU8lTR/6SQvSmlViPYiYIi/Vx75\njnyToXzskp5qykzadtNm08ysRY2qqoi86jGBWEyquVdVUzGVK0bXU2e9IqRZ8ZEerV49CaD1dspb\njSV13j2zbGlFFC6Ji0au2VXRo5c5f1dPMWWTJOiSaGSknraJYoUgTvaZWIrUXHTy6TbisNIxtwRz\nppFr2MjndI/FXI1mYn14GcaTGG/QyyTUVPLM1GyPja5yIuKIqpipnMNHTspKSGnjVysiYkbVcuK4\nImGlTMTYAAAAAAAAAABE5ScDp/W4PfQtZVMpOB0/rcHvoWsrRsWo2B4qPzEnoqezHUfmJPRUxe5u\nr5SpTthrWfgLPOpumlZ+AM86m6cvsz3O19Meje9y5AAdyYYK7gc/oO/AzmCu4HP6DvwDE7HM/wAn\n39Grj67IdTOWfk+/o1cfXZDqZS7y5cuQe70fJUO6P830PrKe6pTS5d0j5vofWU91Smn5r+M/erf0\n/vL672NzVXz/AGAAfHvYAABguEroaCpljwz44nObjyoiqVimvVWqQ4VLZ1kp3SuRYczMVG46OXSW\nqoibPTywvxzJGqx2HIqYGpNaqeWKmjdno2nYsbMHacFbm6fqPSyLKMnt0TTepxmfhG777sHNft3K\nqomicP8A6haa7Vbbc6pfOs0ixtzWOp8xqOcqIi48aaTauE9yt1LPI+phmRI85qqzNc12KIujjTSb\ncNlhZA6CSapmgVmZsckmKInk8p9bZafCXZ3zVCyR7FjK/FUbyIdVWVZJn4xEYY7M2NcasI6MMNfz\n+KMWr2bhjrw3tO/XOpo3SJA5qIlIsqYtx+Niif5NGS9VcTapI6ls+x0+y57ocxWOxRN7jQl0sVMs\nczZpKiZZY9iV0kmKo3kTkM9baqesejpc9F2NYviuwxaqov8AgzayrIrcU0VU44bZwj4d++Nsd5Va\nv1TNUTh3q/Ne6yK
Gs2KpZPsdOkiPdDmK1yuRMMOMzVFyr4aLZIJ1nlWWNiNkp9j310+cma+0U1bn\nbNnoqxbCua7D4uKLz6Dy20R/F2Woqpka9r2pJJjgrVxQ2jLcjzYnMjHHXGEa9nw+ezBjQX8ZjHz/\nAJ+6N20qa7v11HO2FkMLZURY85UXBcWr9aG9QzVaWN1VUTtllfDsrcGZqN+Ljh5TPFaaWKWtkja5\nq1aYSoi6N7DRyb5sMpY20KUiY7Ekex7+nDDDfOS/lOTzEUWqdWMdEY4Ya9e3b4q27VyJxqnXr6fB\nBR19wp4KOaeeOZlVE5UTY81WORiuTzpoMFFeKpLZ35LO6V2wI/Y1p8xucuH63GmKktT2Omhzc6Se\nXMYsbNkkV2YipguH1H2mssMMKQLPUy06R7HsUkmLc3DDeOqcqyLCcacdcf4xGMYzq+GrCOhKLN/G\nNfn8mtVVNwtlLLUVE0NQxIVdm5uaqP0YYcqaTxVVdfbnRJUTxz7PG/DBmbmPRud9aaDdis1M3P2Z\n0tQjo1iRJn52a1eJBHZqdqqskk8yoxY2bJJjmNXQuHWSpynJY5URO3H+mIx1asN2E7d7ebV3o9fH\nxRrbhcKWnpJaiaOZtVE5UwjzVY5GZyedNBrxXWt2nlq1qldI2nSTNdTZrUVcOPjJiCx00SIjnzy5\nsaxs2STOzGqmC4ch8ZY4W0rqZ1TVvgVmx5jpcURPIWjK8i6aemP8Y1xjP7YR0YtNDf3+bXs1xnnu\nTqd1QlRGkKPVyxbGrVx3vKTxrd5xd+R1KZ2yMj2JNOhU8psnlZXdt3a4qtxhq8+7CPJ12aKqKcKp\nxAAcqoAAMcu/D++i99DsqbxxqXfh/fxe+h2VN4/Svwd7lV9U+kPmvbPPx8v3kU5T3EvnzLn+JL78\nh1ZTlPcS+fMuf4kvvyH2NHIqfO3+ftd/o6sACTsAAAAAAAAAAAAAAAAAAAAAAAAAAAKnlzw6xesp\n7zS2FTy54dYvWU95pbJ+cj+dC1jlwtgXeAIoqDE1Y7pdo3aHJVK/DyORFQ2CQyjs9QtXtjbWJJKr\nc2aDHDZETeVF5UIF9yiiXNqoqqnem+2SB2KcyKdNuuMMJdlq5Tm4TLeBH7cUXhX9C/sjbii8K/oX\n9k3zo3q59O9IAj9uKLwr+hf2RtxReFf0L+yM6N5n070gCP24ovCv6F/ZG3FF4V/Qv7Izo3mfTvSA\nI/bii8K/oX9kbcUXhX9C/sjOjeZ9O9IAj9uKLwr+hf2RtxReFf0L+yM6N5n070gCP24ovCv6F/ZG\n3FF4V/Qv7Izo3mfTvSAI/bii8K/oX9kbcUXhX9C/sjOjeZ9O9IAj9uKLwr+hf2RtxReFf0L+yM6N\n5n070gCP24ovCv6F/ZG3FF4V/Qv7Izo3mfTvSAI/bii8K/oX9kbcUXhX9C/sjOjeZ9O9IAj9uKLw\nr+hf2RtxReFf0L+yM6N5n070gCP24ovCv6F/ZG3FF4V/Qv7Izo3mfTvSAI/bei4pHr/5L+ozQyVd\neux2ujme52jZpWLHGzyqq7/mQTXTHSxNymOlLZGJnV13lT5KyMZ9aN/+S1GhY7ay1UDKdrle/FXS\nSLvvcu+pvnJM4zi4KpxmZVTuh/NlN+/aWiD8yz0UKv3Q/mym/ftLRB+ZZ6KFa+ap71aubp73sAEU\nQAAa1y4DN5j1QcDh9FDzcuAzeY9UPA4fRQ82P7hP0f8Asr/1d6GyrTNntMy/JZUoir50VDbNq7UE\ndyoJKaVVajtLXJvtcm8qFeSvqrcmw3ilnxboSpgjWSN6cq4aWr5FO6uJnW5q6ZnXCXBEbo7Xx1D0\n88EnZG6S1eMu6CTsmmEp4SlwRG6S1eMu6CTsjdJavGXdBJ2RhJhKXBEbpLV4y7oJOyN0lq8Zd0En\nZGEmEpcERuktXjLugk7I3SWrxl3QSdkYSYSlwRG6S1eMu6CTsjdJavGXdBJ2RhJhK
XBEbpLV4y7o\nJOyN0lq8Zd0EnZGEmEpcERuktXjLugk7I3SWrxl3QSdkYSYSlwRG6S1eMu6CTsjdJavGXdBJ2RhJ\nhKXBEbpLV4y7oJOyN0lq8Zd0EnZGEmEpcERuktXjLugk7I3SWrxl3QSdkYSYSlwRG6S1eMu6CTsj\ndJavGXdBJ2RhJhKXBEbpLV4y7oJOyN0lq8Zd0EnZGEmEpcERuktXjLugk7ITKChfop0qah/E2Kne\nqrzogwkzZesofjR0USfKkq4sE8y4/wCC1FdtdBVVtwjuFyi73ZEi9706ri5FXfc7Dj8hYitMYQtT\nGEBjqPzEnoqZDHU8Hk9FTS9zdXylvTthrWfgDPOpumlZ+AM+s3Tm9me52vpj0b3eXIADuTDBXcDn\n9B34GcwV3A5/Qd+AYnY5n+T7+jVx9dkOpnLPyff0auPrsh1Mpd5cuXIPd6PkqfdGYq2mmk4mVDVX\n60VClHV7rQxXKgmpKhFzJG4YpvovEqHMrhabja5FZVU0ssSfJqIWq5rk8qJpRT4j8Veyr+VTRfsU\n52EYTEbX0nsrKrdqJt3JwxaoMK1DE30lRf3Tuod8x8knRO6j4rivLexq/LP2e3wqz148YZgYe+Y+\nSTondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2NX5Z+xwqz148YZg\nYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2NX5Z+xwqz1\n48YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2NX5Z+\nxwqz148YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2\nNX5Z+xwqz148YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0Tuo\ncWZb2NX5Z+xwqz148YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qPcb3S\nuzYIKiVy7yMhcq/gZj2XlszhFmr8s/YnKrEf5x4w9IxZamkibpc+ojRE/wDEi/4OxJvFIySycqUr\nI7hc49i2PTDAq4qir+s7y+Qu5+lfh7ILmQ5HmXdVUzjhu/mD5n2jlFN+9nUbI1CnKe4l8+Zc/wAS\nX35DqynKe4l8+Zc/xJffkPoqORU8W/z9rv8AR1YAEnYAAAAAAAAAAAAAAAAAAAAAAAAAAAVPLnh1\ni9ZT3mlsKnlzw6xesp7zS2T85H86FrHLhbAARRAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAVTuh/NlN+/aWiD8yz0UKv3Q/mym/ftLRB+ZZ6KFq+ap71qubp73sAEUQAAa1z4DL5j1Q8Dh9FD\nxc+Ay+Y90PBIvRQ82P7hP0R6yr/1d7OAD0kgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMdT\nweT0VMhjqeDyeipK/wA3V8pZp2w1rPwCM3TTtHAI/rNw5vZnudr6Y9G93lyAA7kwwV3A5/Qd+BnM\nFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD3ej5AAJusAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAFOU9xL58y5/iS+/IdWU5T3EvnzLn+JL78hWjkVOO/z9rv9HVgASdgAAAA\nAAAAAAAAAAAAAAAAAAAAAABU8ueHWL1lPeaWwqeXPDrF6ynvNLZPzkfzoWscuFsABFEAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABVO6H82U379paIPzLPRQq/dD+bKb9+0tEH5lnooWr5qnvW\nq5unvewARRAABq3TgMvmMlFwSH0UMd04DL5jJ
RcEi9FDzY/uE/RHrKv/AFd7MAD0kgAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAMdVweT0VMhjquDy+ipK/zVXylmnbDXtHAIzcNO0cAjNw5/Zvu\nlr6Y9G93lyAA7UwwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD\n3ej5AAJusAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOU9xL58y5/iS+/IdWU5T3EvnzLn+J\nL78hWjkVOO/z9rv9HVgASdgAAAAAAAAAAAAAAAAAAAAAAAAAABU8ueHWL1lPeaWwqeXPDrF6ynvN\nLZPzkfzoWscuFsABFEAAAAAAAAAAAAAAABVb13QcmLLlJS2G5XaCC51GGZC7ix3sV4sS0SSMjidJ\nI5rY2orlcq6ETlOTZadw2yZVd0Gnypqq6shla5jpqePDNlVnydK6U3kxOpVtFDWW6eimRVgmidC9\nEXD4qpgv8lAgMkcvcm8r6qspsn7pDWT0q4SMbii4Y4Ypjvp5Tdyvyps+SFpW5ZQVsdJS5yMRztKu\ncvEicalG7kvcXtHc3vNfcqGvq6yapbsTEmRESNmOOGjfXyk33XO5zQd0rJ6K2XCpnpHwTJNDPEiK\nrXYKioqLoVFRQLVYbxQX6009ytFTHVUVQ3OjlYuhTfK53PskqLIjJWjsVtkllhp0VVklX4z3KuKq\nvJ5ixgAAAAAAAAAAAAAAAAVTuh/NlN+/aWiD8yz0UKv3Q/mym/ftLRB+ZZ6KFq+ap71qubp73sAE\nUQAAat04DL5jJR8Ei9FDHdOAy+YyUfBYvRQ82n+4VfRHrKv/AF97MAD0kgAAAAAAAAAAAABHZQ3u\n35PWie53iqjpaKBMXyP3k8nnNfJLKe0ZW2htysFZHV0jnKzPbxOTfRU4lNLuj5HUWXeSlVYrjLLD\nFMqObLFhnMcm8unf8xodybue0Hc3ybfabfUzVSyzLPLNKiIrnKiJoRNCJgiAbmVeX+TWSdfR0V+u\nkNJU1S/6bHY729ivInlLO17XxpI1yKxUzkci6FTlOV91buKWfui3+hutfX1dLLA1I5Ww4KkrEXHD\nTvL5Tp9PSxU9DHSRoqQxxpG1Mf1UTACuWHug5MX7KCrstqu0FRcabHPib5NC4Lx4FqOS5B9w+yZH\nZdVeUtJXVk8smfsMEmCNiz1xXSml3kOtAAAAAAAAAAAAAAAAADFVcGl9FTKYqrg0voqSv81V8pbU\n7YYLRwCM3DTtPAIzcOf2b7pa+mPRtd5cgAO1MMFdwOf0HfgZzBXcDn9B34Bidjmf5Pv6NXH12Q6m\ncs/J9/Rq4+uyHUyl3ly5cg93o+QACbrAAAAAAAAAAAAAGhfbvQWK1VFyu1THTUVO3OkleuCIhpZH\n5VWbK+07Y5P1sdXSo9Y3OboVrk4lRd4w90DJOiy2yVrbFcpJYoKlE/1IlwcxyLiip9ZD9yLucUHc\n1sE9uoKqerkqJtmmnlREVy4YIiIm8iIBJZX5e5N5IVNJT5Q3SGjmqlwja7FVwxwxXDeTyllilZLE\nyWJ7XxvRHNci4oqLvKcv7rfcZtHdIu1vuFdX1dHPTN2J+woipJHjjhp3l8p0qgooaG209DAipTwR\nNhYiriua1ME/kgFds/dByYvOU1TYLddoJrpT458LePDfwXeXAtZyTI7uG2TJjuiVGVdLXVk0rnPf\nDTPwzYnPxzlxTSu+uB1sAAAAAAAAAAAAAAAAApynuJfPmXP8SX35DqynKe4l8+Zc/wASX35CtHIq\ncd/n7Xf6OrAAk7AAAAAAAAAAAAAAAAAAAAAAAAAAACp5c8OsXrKe80thU8ueHWL1lPeaWyfnI/nQ\ntY5cLYACKIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACqd0P5spv37S0QfmWeihV+6H82U\n379paIPzL
PRQtXzVPetVzdPe9gAiiAADUuvAZfMZaPgsXooYrrwGXzGaj4LF6KHm0/3Cr6I9ZVnm\no+bKAD0kgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMVXwaX0VMpiq+DS+ipHKOaq+UtqdsM\nNp4BGbZqWngEZtkPZ3ulr6Y9G13lyAA7UwwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8\nn39Grj67IdTKXeXLlyD3ej5AAJusAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOU9xL58y5/\niS+/IdWU5T3EvnzLn+JL78hWjkVOO/z9rv8AR1YAEnYAAAAAAAAAAAAAAAAAAAAAAAAAAAVPLnh1\ni9ZT3mlsKnlzw6xesp7zS2T85H86FrHLhbAARRAad0uVLbKfZquTNaq4NaiYucvIicakC/KqpeuN\nNZpnM4lmmbGq/VpMxEzsZimZ2QtQKnunuOpWfbW9k+7p7jqVn21vZM5lW5to6ty1gqm6e46lZ9tb\n2RunuOpWfbW9kZlW40dW5awVTdPcdSs+2t7I3T3HUrPtreyMyrcaOrctYKpunuOpWfbW9kbp7jqV\nn21vZGZVuNHVuWsFU3T3HUrPtreyN09x1Kz7a3sjMq3Gjq3LWCqbp7jqVn21vZG6e46lZ9tb2RmV\nbjR1blrBVN09x1Kz7a3sjdPcdSs+2t7IzKtxo6ty1gqm6e46lZ9tb2RunuOpWfbW9kZlW40dW5aw\nVTdPcdSs+2t7I3T3HUrPtreyMyrcaOrctYKpunuOpWfbW9kbp7jqVn21vZGZVuNHVuWsFU3T3HUr\nPtreyN09x1Kz7a3sjMq3Gjq3LWCqbp7jqVn21vZG6e46lZ9tb2RmVbjR1blrBVN09x47Kz6qxvZM\n1PlZEkjWXKjnokcuCSuVHx4+Vyb31oJpmOhiaKo2wsoPjXI5qOaqKi6UVD6atVU7ofzZTfv2log/\nMs9FCr90P5spv37S0QfmWeihavmqe9arm6e97ABFEAAGpdeAS+YzUnBYvRQw3XgEvmM1JwWL0UPN\np/uFX0R6yrPNx82UBVwTFd4gZ8o2vlfFa6OevVi4OkYqNjReTOXf+rE9JJPAru3N44rHH9dc3sjb\nm86ji+3N7JjOhjOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6D\nOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3\nN7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzey\nM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86j\ni+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I28uUemex\nyZnHsNSyRebQMYM6FiBo2q6U1zic6mc5HMXB8b0zXsXkVDeMshiq+Cy+iplMVXwWX0VI5RzVfyn0\nbU8qGG1cAi8xtmpauAReY2yPs73W19MejN3lyAA7GgYK7gc/oO/AzmCu4HP6DvwDE7HM/wAn39Gr\nj67IdTOWfk+/o1cfXZDqZS7y5cuQe70fIAKxdMsqKknfBSQy1srFwcseCMReTOX/ABic927RZpz7\nlURG+dTtpoqrnCmMZWcFHXLqfis/PVJ2T5u6qNTf3Sdk4uN8h7anxhbgl/qT4LyCjbuqjU390nZG\n7qo1N/dJ2RxvkPbU+MHBL/UnwXkFG3dVGpv7pOyN3VRqb+6TsjjfIe2p8YOCX+pPgvIKNu6qNTf3\nSdkbuqjU390nZHG+Q9tT4wcEv9SfBeQUbd1Uam/uk7I3dVGpv7pOyON8h7a
nxg4Jf6k+C8go27qo\n1N/dJ2Ru6qNTf3Sdkcb5D21PjBwS/wBSfBeQUbd1Uam/uk7I3dVGpv7pOyON8h7anxg4Jf6k+C8g\no27qo1N/dJ2Ru6qNTf3Sdkcb5D21PjBwS/1J8F5BRt3VRqb+6Tsjd1Uam/uk7I43yHtqfGDgl/qT\n4LyCjbuqjU390nZG7qo1N/dJ2RxvkPbU+MHBL/UnwXkFG3dVGpv7pOyN3VRqb+6TsjjfIe2p8YOC\nX+pPgvIKNu6qNTf3SdkbuqjU390nZHG+Q9tT4wcEv9SfBeQUbd1Uam/uk7J7jy6fnJs1ola3jWOd\nrl5sEMx7WyGZwi9T4wcEvx/hPguwI6zXmjvEKyUb1zm6HxuTBzV8qEid8TExjDnmMNUinKe4l8+Z\nc/xJffkOrKcp7iXz5lz/ABJffkLUcipx3+ftd/o6sACTsAAAAAAAAAAAAAAAAAAAAAAAAAAAKnlz\nw6xesp7zS2FTy54dYvWU95pbJ+cj+dC1jlwtgAXeUiiotdKtflDWTSLnMpXJBEnE1cMXL5zIatLw\n26r/AMbJ+CG0dduMKYd9qMKIAAbqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAeXsa9jmvRHNcmCovGe\ngBJZEzv72qqJ7lclLJmsVd/MVMUT6iyFUyO+dLv54vdUtZxVRhMvOrjCqYVTuh/NlN+/aWiD8yz0\nUKv3Q/mym/ftLRB+ZZ6KFa+ap71Kubp73sAEUQAAal24BIZqTg0XooYLtwCQz0vBovRQ82j+4VfR\nHrKs83HzRGVsz0oYaWJysdVytiVyb6N4/wCR7p4Y6eFkULUZGxMGtTiNfKjh9lT/AIlfdU3Dtr2u\nW5tAAaJgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIq4f7FdaCvi+K58qQTYfrtdvY+ZS1FUyj4JTe\ntw++hay1GxajYGKs4LL6KmUw1nBZfRUnlPM1/KfRWjlQxWrgMXmNs1bXwGLzG0R9n+62/pj0Zucu\nQAHY0DBXcDn9B34GcwV3A5/Qd+AYnY5n+T7+jVx9dkOpnLPyff0auPrsh1Mpd5cuXIPd6Pkr2XNb\nJR2J7YHKySd6Qo5N9EXf/kc9jY2NiMYmDU3kLr3SPm+h9ZT3VKYfnX4zu1aa3ax1YY9+OH7PrPYt\nEZlVXTiAA+Ke0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA2LTVOt96o6mNc1HSNikT6TXLhp+vA6y\nhxuX5UH7+L30OyJvH6f+ErtVzIcKp5NUxHywif3fMe16Ipv4x0wKcp7iXz5lz/El9+Q6spynuJfP\nmXP8SX35D6yjkVPn7/P2u/0dWABJ2AAAAAAAAAAAAAAAAAAAAAAAAAAAFTy54dYvWU95pbCp5c8O\nsXrKe80tk/OR/Ohaxy4WwLvKAu8pFFz+k4bdfXZP8G0atJw26+uyf4No66OTD0LfIgABu3AAAAAA\nAAa1yc5luqnsVWubE9UVOJc1Sn0ck1HBYZ4q+pmlrmq2eGWXPxTYnOz0TiwVELu9jZGOY9Ec1yYK\ni8aGjRWW20L1fSUNPC5W5qqxmC4chiYa1UzM6lLtFRXQZMJcca11R3oj0llqkkaqr+tmfzJOvctm\nopKmiuc89S6hklSGR2yJIqIi7InJhj9ZP0ditVE9H0lBTxORqtxa3DRyGSitNBRPkfSUkMTpEzXK\n1u+nJ5vIYwaRRMQrNcslrSidR3CoqVqoH7Jny5+ODMdkTk0mCKWehprNNBcKmaStiek0Ukufo2Jz\ns9OTBUTnLXTWa20qyLT0UEayNVrs1u+i76eYUdlttGrlpaGniVzVYqtZvtXfTzDAzJUqkfWMyUmr\nXPr2zLR56Svq85FVcNKN4lJXJ2WpZlAyB610UL6PZFjqpdk2R2cnxmrxYf5JuHJ60Qo5IrdTMRzc\nxURm+nIb/esGzxT7EzZomKxj8NLWrvo
nk0IMCKJhmABsqAAAAAAAAAADbyO+dbv/AOV7qlrKpkd8\n63f/AMr3VLWcdfKl59zlSqndD+bKb9+0tEH5lnooVfuh/NlN+/aWiD8yz0UKV81T3t6ubp73sAEU\nQAAad24BIZ6Xg0XooYLvwCQz0vBovRQ82j+4V/RHrKs83HzQeVHzhZfWF91TcNPKj5wsvrC+6puH\nbXtctzaAA0TAAAAAAAACiZTOqp8tXUsa1skDLeyXYqeqSFEcsjkzlx39CYF7I242K13KoSevoKeo\nmRuYj3txVG444ebSptTOBCh3G7XG01d8ldUTrbmQNgwc/OdBIseLXY+VdCqb9udJeJLhthc6mlSh\npoNi2OXMwR0SOWV30tOKcmguCWe3JTTU6UUGwTtRkrM3Q9qJgiL9Rjq7DaqxYVqaCnkWJqMZnM3m\npvJ5vIZzoZxVikc681ta2ru9RHFSU0ToZIn7Ej8W4rKqcf4GjZ5qi/VsbrjcKulzbVHNjFNsaZ6v\nemeqb2lERS719ktle+N1ZQwTOjTNarm7ycnm8h5rbBaa2ZktXbqaaRjEja5zMcGpvJ5hnQYufUtb\ncbtWUiTPrp07zzlSnqUgzlR6tR/lxRDJd5q1lwyg2Jbs9KOGBYlhqfiwqsWKq5P1tOlS+VuT1orn\nsfV26mlexmxtVzN5vInkNiC2UMDJ2Q0sTGztayVEb8trW5qIvLgmgznwYvVre6W20kkkiSvdExzn\nomCOVUTSbR4gijghjhhY1kUbUa1rd5qJvIeybAAAAAAAAAAAInKTgdP63B76FrKplJwOn9bg99C1\nlaNi1GwMNZwWX0VMxhreCS+ipPKeZr+U+itHKhjtfAYvMbRq2vgMXmNonkHutv6Y9GbnLkAB1tAw\nV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD3ej5Kh3R/m+h9ZT3\nVKaXLukfN9D6ynuqU0/Nfxn71b+n95fXexuaq+f7AAPj3sAAA1ro5zLZVvYqtc2F6oqcS5qlThnq\nYmUrkfWR7LSve500mcki5mPxeReMucjGyRuZI1HMcitci7yovEYX0dNIyJj4I3NiTNYip8lMMME+\nrQelkWW28nomiunHH7fdzX7FVyqKonDD7qvDLU09mfUotSyR0TE2SSdHomcqIrsOLfxNm6t2vpqh\nKa4TOesGfmOfnLocnx0Xi3yap7VQUyu2CkhZnNzVwbvpyHqnt1HTtkbDTRMbImDkRu+nJ5jqq9pW\nZrzoiduOGEa9mqcZnZh59CMZLXm4Y9H83ILKOrmY6Xved6IlCr/iO3lzkwXzmlVz1NMlWxjqyFve\nmejZZM5XOzkTFq8X/wAloitVDFFJHHSxNZImD0zflJyGaejpqhUWaFj1RqsTOTHQvF/IWvaVi1FN\nEUYxHy17OjZvKsluVzNWOtT62oqqaGtbG+sgTvVHo2WTOcq5yJi1eI2axlVFQtSnfVQTSTxsa6Sf\nZMcVLNUUVNU47PBHJizY/jJj8Xfw8xihtVBB+apImaUdobxpvKbx7Vs5sY064nHZjE7Pj8OmJY4J\nXjOvUgI6mS5uuivlnhfTwtVWMercx6IuJJUEboMnFmSaZ8slOkiukfnKi5vFyEmlJTo+Z6QsR8yY\nSLh8tPKe0hjSBIUY1IkbmZmGjDkOS/l9FcRTRThTjE4fKNfmrRk9VM41TjOtVmSS0lPb5IqyaR9T\nC/ZGPfnfqKucnJgpiopqqGx9+f7Vsy0zXZ8k+e1VVExXN+vEs1Pa6GmztgpYY85uaua3fTkPlNaa\nCmdnQUkMa4ZvxW8XIdU+07GExmzOuOiNeuZwnCfjh07EoyW5jGv+av50IeuctrpZZaSvllmWnV6M\ne7Px3vjpyYYnmtz7e+nSlrJpknhkz85+dvNxR6cmn8SdprdR0qvWnpoo1emDsG76cnmPMFsooM/Y\naWJ
mema7Bu+nJ5iVOX2o2xM7eiNerVjuzdsbe5vOT1zs1eOr/wCq8kktHT0L4auaR9TA9ZGPfnYY\nMxzk5NJgjdUMyelqVdVtlWmRySOqM5FVcN5OItFPa6GmztgpYY85uaua3fTkPEdmt0SOSOjhajm5\nqojd9OQtHtSx1Z2xOyNeuZw2/HDp2NOCXN6Lsr5mXhYXd8xxrAj8yofn5y476FjMewRbM2XY27K1\nuY12GlE5DIeVld+L9cVxGGp12bc26c2ZAAcqoAAMcu/D++i99DsqbxxqXfh/fxe+h2VN4/Svwd7l\nV9U+kPmvbPPx8v3kU5T3EvnzLn+JL78h1ZTlPcS+fMuf4kvvyH2NHIqfO3+ftd/o6sACTsAAAAAA\nAAAAAAAAAAAAAAAAAAAAAKnlzw6xesp7zS2FTy54dYvWU95pbJ+cj+dC1jlwtgAIoqEjFgvN2gdo\nds+yp5Uciaf5GcmsoLI6vlZV0UjYa6NM1FcmLZG/Rd1kA+K7QrmzWiZ68sEjXIvOqHRRcjDCXXau\n0xThLKDBjcdS1/3O0MbjqWv+52jfSU71dLRvZwYMbjqWv+52hjcdS1/3O0NJTvNLRvZwYMbjqWv+\n52hjcdS1/wBztDSU7zS0b2cGDG46lr/udoY3HUtf9ztDSU7zS0b2cGDG46lr/udoY3HUtf8Ac7Q0\nlO80tG9nBgxuOpa/7naGNx1LX/c7Q0lO80tG9nBgxuOpa/7naGNx1LX/AHO0NJTvNLRvZwYMbjqW\nv+52hjcdS1/3O0NJTvNLRvZwYMbjqWv+52hjcdS1/wBztDSU7zS0b2cGDG46lr/udoY3HUtf9ztD\nSU7zS0b2cGDG46lr/udoY3HUtf8Ac7Q0lO80tG9nBgxuOpa/7naGNx1LX/c7Q0lO80tG9nBgxuK7\n1lr+dnaM0NrvFeqRrA23wr8qWR6Peif7rU4/KqmJuUx0sTeojpb2RTc+e61CfIdK2NF5c1NP4lpN\na20MNuoo6ambmxsTDTvqvGq+U2TmmcZxcNU4ziqndD+bKb9+0tEH5lnooVfuh/NlN+/aWiD8yz0U\nK181T3q1c3T3vYAIogAA07vwCQ2Kbg8Xooa934BIbFLweP0UPMo/uFf0R6yrPNx80Hlcmx7W1K/I\nhqW5y8iKmBtm/W0sVbSyU9Q3OikTNchXUp7va02JsCXKmb8iRr0ZKicjkXQvnQ9CunHY5q6ZnYkw\nRe2Vam/Yrjj5Nj7Q2yrNRXL+n2jTNlpmylARe2VZqK5f0+0NsqzUVy/p9oZsmbKUBF7ZVmorl/T7\nQ2yrNRXL+n2hmyZspQEXtlWaiuX9PtDbKs1Fcv6faGbJmylARe2VZqK5f0+0NsqzUVy/p9oZsmbK\nUBF7ZVmorl/T7Q2yrNRXL+n2hmyZspQEXtlWaiuX9PtDbKs1Fcv6faGbJmylARe2VZqK5f0+0Nsq\nzUVy/p9oZsmbKUBF7ZVmorl/T7Q2yrNRXL+n2hmyZspQEXtlWaiuX9PtDbKs1Fcv6faGbJmylARe\n2VZqK5f0+0NsqzUVy/p9oZsmbKUBF7ZVmorl/T7Q2yrNRXL+n2hmyZspQEXtlWaiuX9PtH1K25Sa\nILHVI7iWaRjE50VRmyZsvN+/1HW+mbpfLVMVE8jVzlX+RaiEtFpmZVrX3SRklYrc1jI/kQt5Ex31\n8pNlKYwhWmMIDDW8El9FTMYa7gk3oqSyrma/lPopRyoY7XwGLzG0ats4DF5jaJ5B7rb+mPRm5y5A\nAdbQMFdwOf0HfgZzBXcDn9B34Bidjmf5Pv6NXH12Q6mcs/J9/Rq4+uyHUyl3ly5cg93o+Sq90WJX\nWWKZExSGdrneRN7/ACUg65VU8VVTSQVDEfFI1WuavGhz245J3OikclC1tbS/q4uRsjU5Fx0L5z47\n8Texr+XZl7J4xmnVMfD4PovZmWUWMaLmqJQoNtbVd037TVfUres+b
VXfVFX93tHx3EHtHsp8vu9j\njDJ+u1QbW1V31RV/d7Q2qu+qKv7vaHEHtHsp8meMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7R7Kf\nI4wyfrw1QbW1V31RV/d7Q2qu+qKv7vaHEHtHsp8jjDJ+vDVBtbVXfVFX93tDaq76oq/u9ocQe0ey\nnyOMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7R7KfI4wyfrw1QbW1V31RV/d7Q2qu+qKv7vaHEHtH\nsp8jjDJ+vDVBtbVXfVFX93tDaq76oq/u9ocQe0eynyOMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7\nR7KfI4wyfrw1QbW1V31RV/d7Q2qu+qKv7vaHEHtHsp8jjDJ+vDVBtbVXfVFX93tDaq76oq/u9ocQ\ne0eynyOMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7R7KfI4wyfrw1QbW1V31RV/d7R7jst5ldmstc\nrVXjke1qfiZj8P8AtGZw0U+X3Yn2hk0f5tOGJaiuooGJi6SoZh9S4r+B2BN4q+S2TLrdP37cHskr\nMMGNZ8mJPJyr5S0H6J7C9nV+zski1c5UzjL53L8pjKLudTs2CnKe4l8+Zc/xJffkOrKcp7iXz5lz\n/El9+Q96jkVPGv8AP2u/0dWABJ2AAAAAAAAAAAAAAAAAAAAAAAAAAAFTy54dYvWU95pbCp5c8OsX\nrKe80tk/OR/Ohaxy4WwAEUQHxyo1MXKiInGpHzXy1wuVstwpWOTiWRAJHAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJUEWmUNnXeudJ0qG9TVdP\nVNzqaeOVvKxyKBmAAFU7ofzZTfv2log/Ms9FCr90P5spv37S0QfmWeihavmqe9arm6e97ABFEAAG\nnd+ASfUbFNweP0UNa8cAf9Rs03B4/RQ8yj+4V/RT6yrPNR82QAwVVZTUrc6pniiTle5EPTSZxgRa\n5Q2dN+50nSofN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nwIrdFZtZ0nSoN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nwIrdFZtZ0nSoN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nwIrdFZtZ0nSoN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nIrdFZtZ0nSoZYb3a5nI2K4Ur3LxJIgEgAio5EVFxReNAAMFdwSb0VM5gruBzeipz5VzFfyn0bUcq\nHi2cBi8xtGtbOAxeY2TXIfdrf0x6M3OXIADqaBgruBz+g78DOYK7gc/oO/AMTscz/J9/Rq4+uyHU\nzln5Pv6NXH12Q6mUu8uXLkHu9HyADDU1dPStzqmeOJvK9yITdbMMCLXKGzouC3Oj6VD5uis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwGBFborNrOj6VBuis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwGBFborNrOj6VBuis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6
VAJXAYEVuis2s6PpUG6Kzazo+lQCVwGBFborNrOj6VBuis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwBFborNrOj6VDNBerZUPRs\nFfTSOXibIigb4CLimKbwAKcp7iXz5lz/ABJffkOrKcp7iXz5lz/El9+QrRyKnHf5+13+jqwAJOwA\nAAAAAAAAAAAAAAAAAAAAAAAAAAqeXPDrF6ynvNLYVPLnh1i9ZT3mlsn5yP50LWOXC2ABd4iio96q\n5LxcZ4M9zbfTP2NWNXDZXpv4+RDFHSU0bcI6eJqeRiGCzrnUsr133VEqr7am8ddFMRDvtURFMMew\nReCj9hBsEXgo/YQyA2UwY9gi8FH7CDYIvBR+whkAMGPYIvBR+wg2CLwUfsIZADBj2CLwUfsINgi8\nFH7CGQAwY9gi8FH7CDYIvBR+whkAMGPYIvBR+wg2CLwUfsIZADBj2CLwUfsINgi8FH7CGQAwY9gi\n8FH7CDYIvBR+whkAMGPYIvBR+wg2CLwUfsIZADBj2CLwUfsINgi8FH7CGQAwY9gi8FH7CDYIvBR+\nwhkAMGPYIvBR+wg2CLwUfsIZADBiWCFd+GP2ENd9EkT0noF71qm6Wvj0IvkcnGhugTETtYmmJjCV\nlsFx20tkVQ5uZJpZI3kcmhSRKzkOv+jcmcTap2HMilmOOYwl50xhOCqd0P5spv37S0QfmWeihV+6\nH82U379paIPzLPRQrXzVPerVzdPe9gAiiAADSvHAH/UbNNweP0UNa8cAf50Nmn/MR+ih5lv+4V/T\nT6yrPNx82lf691utzpImo6d7kjiavG5d4h6SzwNXZq1Eq6x2l8sqZ2nkROJDYyp01lmYu8tTp+pq\nm4d9c9DluT0MKUtOm9Tw9GnUO9afwEPRp1GYE02HvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9\nafwEPRp1GYAYe9afwEPRp1DvWn8BD0adRmAGHvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9afw\nEPRp1GYAYe9afwEPRp1DvWn8BD0adRmAGHvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9afwEPR\np1GYAYe9afwEPRp1DvWn8BD0adRmAGHvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9afwEPRp1G\nYAYe9afwEPRp1GOa30czVbLSwOReJY0NoARdC59kucFO17nW6qdmNa5cdifvoiLyKWkquUWimpHJ\nvtq4cPrdgWotTOML0TjAYK7gc3oqZzBX8Dm9FSOV8xX8p9FKOVDzbeAw+Y2TWtvAYfMbJrkPu1v6\nY9GbnLkAB1NAwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD3ej\n5I3KK5pabTNVZuc9MGsbyuXQhy6fPq5lqK56zzu0q5+lE8iJxIXbujuXa2jbxOqW48ylLPgvxfl1\n61VRk9urCJjGcOnXg+o9j2KKoquVRjOx52Nn0GcyDY2fQbzIegfDaWvfL3cync87Gz6DeZBsbPoN\n5kPQGlr3yZlO552Nn0G8yDY2fQbzIegNLXvkzKdzzsbPoN5kGxs+g3mQ9AaWvfJmU7nnY2fQbzIN\njZ9BvMh6A0te+TMp3POxs+g3mQbGz6DeZD0Bpa98mZTuedjZ9BvMg2Nn0G8yHoDS175Mync87Gz6\nDeZBsbPoN5kPQGlr3yZlO552Nn0G8yDY2fQbzIegNLXvkzKdzzsbPoN5kGxs+g3mQ9AaWvfJmU7n\nnY2fQbzINjZ9BvMh6A0te+TMp3POxs+g3mQbGz6DeZD0Bpa98mZTuedjZ9BvMh5dBE5MHRsX6jID\nMXrlM4xVPixNFM6phY8ibxNT3BlsqJHSQSoqwq5cVaqac3HkwL8cktrlbe7U5N/vlqc6KdbTeP1
X\n8O5ZcyvIaa7s4zEzGPyfKe0bNNm/NNOwU5T3EvnzLn+JL78h1ZTlPcS+fMuf4kvvyH0NHIqeLf5+\n13+jqwAJOwAAAAAAAAAAAAAAAAAAAAAAAAAAAqeXPDrF6ynvNLYVPLnh1i9ZT3mlsn5yP50LWOXC\n2Bd4Bd4ii51ZeBO/fzf+443zQsvAnfv5v/ccb52U8mHo0cmAAGzYAAAAAAABjqZUgp5ZnIqpGxXq\niceCYkHRZSbMtKtTQTU0dWxXQSOejkeqNV2bo3lwRSarIlno54WqiOkjcxFXeTFMCuUljuT4rbBc\nJaJtPQNVY0gz1dI/MVqK5VRMETFV0GJaVY46megylWpt6V89BJT0Sw7NsiytcuHEmamnEysv6xo9\n9xoZ6SFIHVDZHKjkzW76LhvL5CIteSlRDau8KmntcaLBsTqmnV6yqqbyqioib6ElNa7rc6aWlulT\nTxUrqd0ObTYqsjlwweucmjDDQiGNbWJrwe2ZQqxWd+0E1KksbpYVc5Fz8ExwXDeXA80uUuelOtVQ\nTUzKmN0kD1ejkfg3OzdG8uCGKe0XO47Clyko2tpo3JHsGd/qPVuajnYpoTyJiY4LFcpoKCC4S0TI\naFjtiSDPVXvVisRXKqJgiYroQazGpmhymlfa33B9skZSpDsyO2Zqq5OJMENy3Xl9TcGUdVRSUs0k\nOzx5z0cjm4oi729vkDSZL1sVnloFprRE59PsK1ETpM9y8q4oTFusEdtvEVXRNjihWmWGdiK5Vc7F\nFaqY/WNZE16sU8ADZUAAAAAAAAAAG/kP8i6etr7rSzlYyH+RdPW191pZziq2y86vlSqndD+bKb9+\n0tEH5lnooVfuh/NlN+/aWiD8yz0UK181T3qVc3T3vYAIogAA0rxwB/nQ2af8xH6KGteOAv8AOhtU\n/wCYj9FDzLf9wr+mn1lWebj5ygcqPnCy+sL7qm4aeVHzhZfWF91TcO6va5bm0ABomAAAAAAAAEBd\ncoJaS9LbKW3SVcyU7ahzklaxEarlam/5UJ8qOUWTlVX5R7ZQ01rq4lpG0+x1qvTNVHudnJmovLgZ\npw6SGw3K6mStuFHPTyRVNHT98KxXIqPTNxVqLyoen5Tulc9tuts9YsMLJqjNejdjz25yNTHfdhpw\nIy5ZHVNfSXJdmpqatnRi074s5WxKjM1zVxTHNXeNqmsd4tTqhbRNQu77iiSXZ85FikYxGZ7cE+Mi\noiaFw0obYUs6m3NlMr5XMt1uqKxY4WzTIioxY0cmKNwXfdhxGFcrNnqGR2u3T1zXUratXMe1qoxy\nqmGC8eKKG2e8W+qqJrZVUlQ+riY2Z9XnNVJGphnpmpguPJoNSiybutmq432aWgkZ3iykc6qV6Kjk\ne5yuRGpp0u3sRqNTZ3XpUTwx2y3y1eyQJPisrY81McMFx48UPVTlVNDPXsbaJ5GUDI31D2zNTMzm\nZ2CcuCEQ7Iqqp6mnfDFa7gxlPsbkrc9vx1crlciNRcExU258i2V0t4nr46bZquKJsGxOfhCrI81e\nTRjgqb+gz/SalvpJ2VVLDURY7HKxHtx38FTEymvb45YaCmiqXNdOyNrXubvK5EwVUNgmwAAAAAAA\nAAACJyk4HT+twe+hayqZScDp/W4PfQtZWjYtRsDBX8Dm9FTOYLhwKb0VJZZzFfyn0Vo5UPNt4DD6\nJsmtbeAw+ibJrkXu1v6Y9GbnKkAB1NAwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39\nGrj67IdTKXeXLlyD3ej5Kh3R/m+h9ZT3VKaXLuj/ADfQ+sp7qlNPzX8Z+9W/p/eX13sbmqvn+wAD\n497AAAMVVMlPTTTORVSNivVE48ExIll9VGxOqKOSJs0SyxrnouciNzsPJoJSuidUUVRCxUR0kbmI\nq7yKqKhCbnUijpu9dijlbA6GZcVwfi3DR9ek9LIqclmidPt
6Nu757/hPdtc1+b0VRo9n+21FelWk\ndVVFI+GBGI9HbIjldjvJgnGuIlvL6eOV1XRSxKyPZE0oqOTHDDHiXTvGlTWGdKJ1NJFQwLmNRJoM\n5XK5qoqKqKiaMUNupoLhXwTx1k0ETHRZjWQ4uRXY45y4pjxbx1VW8iivozcdeudmrDDXOPTj+yUV\nX83pxw+G3yZ7nd46BXI+J71bAs+heJFRMP5mCa+rTpKlVRyRSNi2Zrc9FzkxRN/i3zXq7RXXCOoW\nrfTMkdT7BGkauVN/FVVVTyCsydRVlSi2KJk1PsUjXKulcUVF4/KLVrIKYppuTr6dvw6cfn0T3bWK\nqsomZmmNTPPfu9mTd9UckcjItmRuei5zcUTf4t8+1N8dSUqz1dFJGxHtYmEiOVcV8hrV2TmcyoZQ\nbFE2anSJ7XK7BXI5FRePRvntlnncxjHQUEDWzMkXYVcucjV0ouKG0Uez5pir469cxOGr4/PrdzGd\nlOMx/PT7NqovCMdOlNTuqEhjbKqteiIrVRVx/kZqO4vntz6yWmfCxGbI1Fciq5uGPEaVFZJKV9zR\nszXQ1EeZC1ccWJguhfJpJCKkeyzMo1c3ZEgSLO4scMMTlvxklMRTb164168cMNfw2q0Temcavjq1\ndzTgvmc2J1RSSwNmjWSJyuRyOwTHDRvLgKa9rJSd9TUj4aZYtlR6yNcuCpiiYIYI7TXSw00VXJTN\njpo3NYkWcqucrVaiqq7yaeIw0dgmZQd6TQ0EabCkazQq5XqqYYKqKmG+h1TayDCdcY4xsmdmM7Ne\n3DDfrSivKMY/m7+dCQS8LG1762klpo2xrKjlVHIqJxaN5fIfEvKxqiVdJJAr43SR4uRc7BMVTyLg\neJrfX18EsFfPDHE6JWIkGK5zuJy4pxYbx5nt1fXOYtc+mbsLHIxIs5c56twzlx3k072klTbyT/LC\nNuOEzq1asN+M7duHwbzVe6MfCO/H9nuG+YtY6opJYGyxrJEquRUfgmOGjeXA8tvkneD6ySheyFIt\nlRdlaqryJh9ZiZaK6aGnirJKZrKaJzY0izlVzlbm4qq8Rrw2GpbbJKRYbdGr4diWWNX5zt7f0eQt\nFrIOnDbGOudmM7Ne7De0z8o+P87t/wAkvRXJ09Z3tPTPp5Vj2RuLkcip9RIkTRWhlFcknpUYyJ0O\nZI3FVVXIuhfxJY8rK9DnxNjZh5+M+suuzn5v/JtAAcqoAAMtB882r1pn+TribyHI6D55tXrTP8nX\nE3kP0/8ACX9v/wDKf2fL+1/eO6BTlPcS+fMuf4kvvyHVlOU9xL58y5/iS+/IfWUcip4F/n7Xf6Or\nAAk7AAAAAAAAAAAAAAAAAAAAAAAAAAACp5c8OsXrKe80thU8u9E9mfxNqU/FpbJ+cj+dC1jlwtgX\neAIoud2dMKSRq77Z5UX21N4xXWB1muk6yoqUNU9ZWS8THrvtXk5UPbXsemLXNVPIp10TE0w77VUT\nTD0D5inKMU5TdR9B8xTlGKcoH0HzFOUYpygfQfMU5RinKB9B8xTlGKcoH0HzFOUYpygfQfMU5Rin\nKB9B8xTlGKcoH0HzFOUYpygfQfMU5RinKB9B8xTlGKcoH0HzFOUYpygfQfMU5UNaprGRqkcKbNUv\n0Rws0ucv+POYmcNpMxGuU1kOn+lcncS1TsPZQsxGZN251stUcMqo6dyrJKqb2culSTOOZxl5szjO\nKqd0P5spv37S0QfmWeihV+6DpoKNvGtQ0tEKYRMTyIVr5qnvVq5unvewARRAABpXjgL/ADobVP8A\nmI/RQ1bxwB/nQ2afTBH6KHmW/wC4V/TT6yrPNx80FlTorbM7iSpX3VNw+5R0MldbsKfBKmF6SxY8\nbk4vrI+33SCrbmudsVQ3RJDJocxeTA7646XLcjpb4PmcnKgxTlQmm+g+YpyoMU5UA+g+YpyoMU5U\nA+g+YpyoMU5UA+g+Ypy
oMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+\nYpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoeZJY42q6SRjUTjVcAI3KPTS0reNauHD2kLUV\nSnXbu606wIq0FI/ZHS8Uj95ETlRC1lqIwhaiMIDXuHApvRU2DXuK4UM3okMsnDJ7nyn0Wt8qHy28\nBh9E2TXt6YUUPomwMi1ZPb+mPQucqQAHS0DDXcDn9B34GYxVaY0sycrF/AMTscx/J9/Rq4+uyHUz\nlvcA+LYrvHxsrnoqHUil3ly5Mg93oVHujp/0bRLxJUp+ClMOmZT2xbtZ5qeNUSZMHxqu9nJpQ5ir\nlZI6KdqwzsXB8b9Cop+f/jHJLtVdvKKYxpiMJ+GvF9V7GvUxFVuZ17XoDFOUaOU+Gwl7uMAGjlGj\nlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlG\njlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlPLntamLnIieVTMUzOqIMYbFuRXXq1Im/3\n01fxOtpvIc7yJtklbdI7g9itpKfHY1VPzj10Yp5EOiH6t+HMluZLkNNN2MJmZnD5vk/aV2m7fmad\nkahTlPcS+e8uP4kvvyHVjlPcN+PXZYzJvSXFcF/8T1/yfRUcip4l/n7Xf6OrAAm7AAAAAAAAAAAA\nAAAAAAAAAAAAAAACsd0GFz7MydnyqeVr/q3uos5hrKeOrpZaeZMY5Gq1yec3t1ZlUVN7dWZVFRRV\nDaqjhnjXFsjEcn1oZim5P1z7FWOs90dmxo7GnmXeVF4i5IqKiKi4opm5RmT8Ohm5RmT8HiWJk0bo\n5WNex2hWuTFFIiTJezyOxWiY30VVv4KTQJpoLcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AILcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AILcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AILcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AIJMk7Mi8E/qO6yQoLVQ2/FaOmiicu+5E0r9ZugAAa\nN3udNa6R09S9Ew+S1N9y8iGYiZnCGYiZnCEBlcvfV6tNAzS7ZNkcnIhbUTBMCq5K0c9ZWzXq4NzZ\nJdELF/VaWore1YUR0K3dWFEdAACKIAANe4M2Sjlam/hifLbJslFEvImCmyulNJFxuW31To38HkXF\nq8inl5VVwbKacoq5MxmzO7XjE/Lo71qIz6JpjbtShpV9poa9yOq6aOR6aEcqaU+s3UVFTFFxQHqb\nUUGuSto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6V/W\nTgAg9yto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6V/\nWTgAg9yto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6V\n/WTgAg9yto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6\nV/WTgAg9yto8Wd0r+s9x5M2hjse82u9NyuT+akyAPMUbImIyNrWMTQjWpgiHoAAaV3fm0T043KjU\nN1VRExXQhGKu2Fa1G8HhXFV5VPN9p3P+GbFHLr1RHz2z8ojWrZj+rOnZDfpm5lPG3kahk
APQopii\nmKY6E5nGcQAGzAHIioqLvKAByfuTP2qy0yrsUy5r9n74javG1V4udDrByzuo2eutF6o8s7DGslRS\n/Fq4k/7SPl5tHMXjJPKW3ZT2qOttsyOxT/UjVfjxu40chW5Gd/XDhySrRTOT1bY2fGJTZpXC1UNw\nwWspYpVTeVzdKfWboJO5ALkhZFXgTfad1jchZPEk9t3WT4NcyncznSgNyFk8ST23dY3IWTxJPbd1\nk+BmU7jOlAbkLJ4kntu6xuQsniSe27rJ8DMp3GdKA3IWTxJPbd1jchZPEk9t3WT4GZTuM6UBuQsn\niSe27rG5CyeJJ7busnwMyncZ0oDchZPEk9t3WNyFk8ST23dZPgZlO4zpQG5CyeJJ7busbkLJ4knt\nu6yfAzKdxnSgNyFk8ST23dY3IWTxJPbd1k+BmU7jOlAbkLJ4kntu6xuQsniSe27rJ8DMp3GdKA3I\nWTxJPbd1jchZPEk9t3WT4GZTuM6UBuQsniSe27rG5CyeJJ7busnwMyncZ0oDchZPEk9t3WNyFk8S\nT23dZPgZlO4zpQG5CyeJJ7busywZL2aB6OZQxKqb2di78SaAiimNkGMvjGtY1GsRGtTQiImCIfQf\nHORjVc5URqaVVeI2Yad6r47XaKyunVEjp4nSLj5ExwKB3AaOSHI2aumRc+uqny4rxomj8UUisvr5\nNlzdoskMmH59Or0dX1bdLGtRd7HjRP5rgdWtFvp7Ta6Wgo2ZlPTxpGxPIifiVmMyjCdsuGirT5Rn\n08mmJjvnb4NsAEncAAAAAAAAAAAAAAAAAAAAAAAAAAAAANG7WululPsVUzHD5Lk32r5CvNtl/tHx\nbdVMqqdN6OXfQt4KU3aqYw2wpTcmmMNsKlt1lBFols+evK1T4uUN61HLzqW4YG2lp6kebbSU9WPN\nUd0V51HLzjdFedRy85bsBgNJR1I8zSUdX1VHdFedRy843RXnUcvOW7AYDSUdSPM0lHV9VR3RXnUc\nvON0V51HLzluwGA0lHUjzNJR1fVUd0V51HLzjdFedRy85bsBgNJR1I8zSUdX1VHdFedRy843RXnU\ncvOW7AYDSUdSPM0lHV9VR3RXnUcvON0V51HLzluwGA0lHUjzNJR1fVUd0V51HLzjdFedRy85bsBg\nNJR1I8zSUdX1VHdFedRy843RXnUcvOW7AYDSUdSPM0lHV9VR3RXnUcvON0V51HLzluwGA0lHUjzN\nJR1fVUd0V51HLzjdFedRy85bsBgNJR1I8zSUdX1VHdFedRy843RXnUcvOW7AYDSUdSPM0lHV9VR3\nRXnUcvON0V51HLzluwGA0lHUjzNJR1fVUd0V51HLzn1MoL2u9Y3/AFqW3AYDSUdSPM0lPVjzVJa3\nKer+LDRQ0yL+s9d4zW/Jhz6lKu9VC1k6aUYvyULOBN6cMKYwYm9OGFMYCIiIiImCJxAAikAAAAAB\n4miZNGrJG4tU9g1qppriaaoxiWYnDXCN72qqVf8AZpEfH9Bx978qm6H0ir5iRB53F029Vi7VTG7V\nMecSppceVESj+/5eOkkG2EnikpIAzwTKu3n8tJn0dVH7YSeKyjbF/ispIAcFyrt5/LBn0dXzR+2L\n/FZRti7xWUkAOC5X2/6YM+jq+aP2yd4rKNsneKy8xIAcFyvt/wBMGfR1fNHbZL4rLzDbJfFZeYkQ\nOC5X2/6YM+jq+aO2zXxaXmG2f/DS8xIgcFyzt/0wZ9HV80dtn/w0vMNs/wDh5eYkQODZZ2/6YM+j\nq+aP2zTxeXmPm2aeLy8xIjAcGyzt/wBMGfR1fNH7Zp4vLzDbNPAS8xIYDAcGyzt/0x9zPt9XzR+2\nbfAS8w20b4CXmJDAYDg2Wdv+mPuZ9vq+aP20b4GXmG2jPAy8xIYIME5BwfLe3j8sfczrfV80ftoz\nwMvMNtGeCl5iQwTkGCcg4PlvbR+WPuZ1vq+aP20j8
FLzDbSPwUvMSGCcgwTkHB8t7aPy/wCzOt9X\nzR+2kfgpeY+LcnO0RU0rl8qEjgnIBwbLJ23/AApj7yZ9HV80asVXWfn12KL6Kb6m/DEyGNGRpg1D\n2C2T5FRYqm5MzVVPTO3/AFHya1XJqjDZAADsaAAAAAD45qOarXIitVMFReM5nf8AuZvhuT7rkZcH\n2iucuLok/NPXzcR00G1Nc07Eb1ii9GFcOTNvHdMtXxKuzUlya3RskTkRV/mely+y0ZofkVMq+Ryn\nVxgnIhvpI6aYQ4LXHJuz5T+zk3wg5Y/sTU86j4Qcsf2JqedTrOCciDBORDGfT1Tg13tZ8I+zk3wg\n5Y/sTU86j4Qcsf2JqedTrOCciDBORBn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTk\nQZ9PVODXe1nwj7OTfCDlj+xNTzqPhByx/Ymp51Os4JyIME5EGfT1Tg13tZ8I+zk3wg5Y/sTU86j4\nQcsf2JqedTrOCciDBORBn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTkQZ9PVODXe1\nnwj7OTfCDlj+xNTzqPhByx/Ymp51Os4JyIME5EGfT1Tg13tZ8I+zk3wg5Y/sTU86j4Qcsf2JqedT\nrOCciDBORBn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTkQZ9PVODXe1nwj7OTfCDl\nj+xNTzqPhByx/Ymp51Os4JyIME5EGfT1Tg13tZ8I+zk3wg5Y/sTU86j4Qcsf2JqedTrOCciDBORB\nn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTkQZ9PVODXe1nwj7OTfCDlj+xNTzqfU7\noGWK72RNR9aqdYwTkQYJyIM+nqnBrvaz4R9nKFyyy+qvi0mSCROXeWR+j+amKTJvL3KzBmUVzhtd\nvd8uClX4zk5Fw6zrmAM6TDZEHA5q5yuZjds9EJknkxbMl7elLa4UbjpfI7S968qqTYBOZmZxl10U\nU0Rm0xhAADDYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAB/9k=\n"
15
+ }
16
+ },
17
+ "cell_type": "markdown",
18
+ "id": "7ee017c2-79bf-4890-ba8a-61c52ca4810b",
19
+ "metadata": {},
20
+ "source": [
21
+ "### BERT大模型简介\n",
22
+ "\n",
23
+ "BERT(Bidirectional Encoder Representations from Transformers)是一个在自然语言处理领域具有里程碑意义的预训练模型,由Google AI在2018年提出。它基于Transformer架构中的编码器部分构建,并采用了双向训练方法来理解文本中词语的上下文信息。\n",
24
+ "\n",
25
+ "### BERT的主要特点:\n",
26
+ "\n",
27
+ "- **双向性**:与传统的单向语言模型不同,BERT能够同时利用左右两侧的上下文信息来进行预测,这使得它在理解语义方面更为强大。\n",
28
+ "- **预训练和微调**:BERT的训练分为两个阶段。首先是在大规模无标签语料库上进行预训练,然后针对特定任务使用少量标注数据进行微调。这种迁移学习的方法显著提高了下游任务的表现。\n",
29
+ "- **Masked Language Model (MLM)**:作为预训练的一部分,BERT随机遮蔽输入序列中的一些词,并尝试根据上下文恢复这些被遮蔽的词。这种方法增强了模型对句子内部结构的理解。\n",
30
+ "- **Next Sentence Prediction (NSP)**:另一个预训练任务是判断给定的两个句子是否连续出现,帮助BERT更好地捕捉句子间的逻辑关系。\n",
31
+ "- **输入表示**:BERT的输入由三部分组成:\n",
32
+ "\n",
33
+ " Token Embeddings:将词语映射为向量。\n",
34
+ "\n",
35
+ " Segment Embeddings:用于区分句子对(如问答任务中的问题和答案)。\n",
36
+ "\n",
37
+ " Position Embeddings:表示词语在序列中的位置。\n",
38
+ "\n",
39
+ "**特殊标记**:\n",
40
+ "\n",
41
+ " [CLS]:用于分类任务的输出。\n",
42
+ "\n",
43
+ " [SEP]:用于分隔句子对。\n",
44
+ "\n",
45
+ " [MASK]:用于掩码语言模型任务。\n",
46
+ "\n",
47
+ "### 模型配置\n",
48
+ "\n",
49
+ "BERT有两种主要变体:\n",
50
+ "- **BERT Base**:12层(或称作块/层),每层有12个自注意力头,总参数量约为1.1亿。\n",
51
+ "- **BERT Large**:24层,每层有16个自注意力头,参数量增加到约3.4亿,理论上具备更强的表达能力。\n",
52
+ "\n",
53
+ "### 网络结构\n",
54
+ "BERT Base的基本网络结构和GPT2的区别如下所示:\n",
55
+ "\n",
56
+ "![bert.jpg](attachment:6a042b8f-c47d-4f6d-b601-b80124836ec4.jpg)"
57
+ ]
58
+ },
59
  {
60
  "cell_type": "code",
61
+ "execution_count": 31,
62
+ "id": "602ad045-0a4c-4f48-afd2-04d09c0c0f71",
63
+ "metadata": {},
64
+ "outputs": [
65
+ {
66
+ "data": {
67
+ "text/plain": [
68
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
69
+ ]
70
+ },
71
+ "execution_count": 31,
72
+ "metadata": {},
73
+ "output_type": "execute_result"
74
+ }
75
+ ],
76
+ "source": [
77
+ "import subprocess\n",
78
+ "import os\n",
79
+ "# 设置环境变量, autodl一般区域\n",
80
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
81
+ "output = result.stdout\n",
82
+ "for line in output.splitlines():\n",
83
+ " if '=' in line:\n",
84
+ " var, value = line.split('=', 1)\n",
85
+ " os.environ[var] = value\n",
86
+ "\"\"\"\n",
87
+ "import os\n",
88
+ "\n",
89
+ "# 设置环境变量, autodl专区 其他idc\n",
90
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
91
+ "\n",
92
+ "# 打印环境变量以确认设置成功\n",
93
+ "print(os.environ.get('HF_ENDPOINT'))\n",
94
+ "\"\"\""
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 32,
100
  "id": "a3ec4b86-2029-4d50-9bbf-64b208249165",
101
  "metadata": {},
102
  "outputs": [],
 
104
  "from tokenizers import Tokenizer\n",
105
  "from tokenizers.models import WordPiece\n",
106
  "from tokenizers.trainers import WordPieceTrainer\n",
107
+ "from tokenizers.pre_tokenizers import Whitespace\n",
108
+ "from transformers import PreTrainedTokenizerFast,AutoModelForMaskedLM"
109
  ]
110
  },
111
  {
112
  "cell_type": "code",
113
+ "execution_count": 16,
114
  "id": "47b3fc92-df22-4e4b-bdf9-671bda924c49",
115
  "metadata": {},
116
  "outputs": [],
 
124
  "execution_count": null,
125
  "id": "73f59aa6-8cce-4124-a3ee-7a5617b91ea7",
126
  "metadata": {},
127
+ "outputs": [
128
+ {
129
+ "name": "stdout",
130
+ "output_type": "stream",
131
+ "text": [
132
+ "\n",
133
+ "\n"
134
+ ]
135
+ }
136
+ ],
137
  "source": [
138
  "# 设置训练参数\n",
139
  "trainer = WordPieceTrainer(\n",
140
+ " vocab_size=30000, # 词汇表大小\n",
141
  " min_frequency=2, # 最小词频\n",
142
  " special_tokens=[\n",
143
  " \"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"\n",
 
149
  },
150
  {
151
  "cell_type": "code",
152
+ "execution_count": 4,
153
  "id": "7a0ccd64-5172-4f40-9868-cdf02687ae10",
154
  "metadata": {},
155
  "outputs": [],
 
179
  },
180
  {
181
  "cell_type": "code",
182
+ "execution_count": 17,
183
  "id": "48e1f20b-cd1a-49fa-be2b-aba30a24e706",
184
  "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "data": {
188
+ "text/plain": [
189
+ "('dna_wordpiece_dict/tokenizer_config.json',\n",
190
+ " 'dna_wordpiece_dict/special_tokens_map.json',\n",
191
+ " 'dna_wordpiece_dict/tokenizer.json')"
192
+ ]
193
+ },
194
+ "execution_count": 17,
195
+ "metadata": {},
196
+ "output_type": "execute_result"
197
+ }
198
+ ],
199
  "source": [
200
  "new_tokenizer = Tokenizer.from_file(\"dna_wordpiece_dict.json\")\n",
201
  "\n",
 
212
  },
213
  {
214
  "cell_type": "code",
215
+ "execution_count": 33,
216
  "id": "c94dc601-86ec-421c-8638-c8d8b5078682",
217
  "metadata": {},
218
  "outputs": [],
 
229
  },
230
  {
231
  "cell_type": "code",
232
+ "execution_count": 34,
233
  "id": "b2658cd2-0ac5-483e-b04d-2716993770e3",
234
  "metadata": {},
235
  "outputs": [],
 
240
  },
241
  {
242
  "cell_type": "code",
243
+ "execution_count": 19,
244
+ "id": "20b35091-791e-4a6f-8f2d-fda39348daa3",
245
+ "metadata": {},
246
+ "outputs": [
247
+ {
248
+ "data": {
249
+ "text/plain": [
250
+ "{'input_ids': [5, 761, 12283], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}"
251
+ ]
252
+ },
253
+ "execution_count": 19,
254
+ "metadata": {},
255
+ "output_type": "execute_result"
256
+ }
257
+ ],
258
+ "source": [
259
+ "tokenizer(\"ATCGGATCG\")"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 35,
265
+ "id": "3a485c0a-c1a7-4b1c-b16e-3086593fd328",
266
+ "metadata": {},
267
+ "outputs": [
268
+ {
269
+ "data": {
270
+ "text/plain": [
271
+ "PreTrainedTokenizerFast(name_or_path='dna_wordpiece_dict', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n",
272
+ "\t0: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
273
+ "\t1: AddedToken(\"[UNK]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
274
+ "\t2: AddedToken(\"[CLS]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
275
+ "\t3: AddedToken(\"[SEP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
276
+ "\t4: AddedToken(\"[MASK]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
277
+ "}\n",
278
+ ")"
279
+ ]
280
+ },
281
+ "execution_count": 35,
282
+ "metadata": {},
283
+ "output_type": "execute_result"
284
+ }
285
+ ],
286
+ "source": [
287
+ "tokenizer"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 36,
293
+ "id": "a48642d6-69d1-4fff-903a-b0711ab7691b",
294
  "metadata": {},
295
  "outputs": [],
296
  "source": [
297
+ "from transformers import BertConfig,BertForMaskedLM\n",
 
 
 
298
  "\n",
299
+ "# 设置最大输入长度\n",
300
+ "max_length = 128 # 最大输入长度\n",
301
  "\n",
302
  "# 构建配置\n",
303
+ "config = BertConfig(\n",
304
+ " vocab_size=len(tokenizer), # 词汇表大小,与分词器一致\n",
305
+ " max_position_embeddings=max_length, # 最大位置嵌入数,与 max_length 一致\n",
306
+ " pad_token_id=tokenizer.pad_token_id, # 填充标记的 ID\n",
307
+ " bos_token_id=tokenizer.cls_token_id, # 句子开始标记的 ID(BERT 使用 [CLS])\n",
308
+ " eos_token_id=tokenizer.sep_token_id, # 句子结束标记的 ID(BERT 使用 [SEP])\n",
 
309
  ")\n",
310
  "\n",
 
311
  "# Building the model from the config\n",
312
+ "model = BertForMaskedLM(config)"
313
  ]
314
  },
315
  {
316
  "cell_type": "code",
317
+ "execution_count": 37,
318
  "id": "afc2cdd1-228e-4ee7-95f5-07718f00723d",
319
  "metadata": {},
320
  "outputs": [],
321
  "source": [
322
  "# 1. load dna dataset\n",
323
  "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
324
+ "#dataset = raw_dataset[\"train\"].select(range(1000)).train_test_split(test_size=0.1, shuffle=True)\n",
325
  "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
326
  "\n",
327
  "# 2. tokenize\n",
328
+ "# def tokenize_function(examples):\n",
329
+ "# return tokenizer(examples['text'][:100], truncation=True, padding='max_length', max_length=max_length)\n",
330
+ "\n",
331
+ "# 2. tokenize, 必须设置最大长度\n",
332
+ "#默认是100,设置成1000就行了。否则如果字符串长超过100,就是有bug,只生成1个unk了\n",
333
+ "tokenizer._tokenizer.model.max_input_chars_per_word = 10000\n",
334
  "def tokenize_function(examples):\n",
335
  " return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
336
  "\n",
337
+ "\n",
338
  "# 3. 对数据集应用分词函数\n",
339
+ "tokenized_datasets = dataset.map(tokenize_function, batched=False, remove_columns=['text'], num_proc=15) # 设置为你的 CPU 核心数或根据需要调整\n",
340
  "\n",
341
  "# 4. 创建一个数据收集器,用于动态填充和遮蔽,注意mlm=true\n",
342
  "data_collator = DataCollatorForLanguageModeling(\n",
 
346
  },
347
  {
348
  "cell_type": "code",
349
+ "execution_count": 38,
350
+ "id": "9345610a-631e-49bb-a10c-bc0646a694fa",
351
+ "metadata": {},
352
+ "outputs": [
353
+ {
354
+ "data": {
355
+ "text/plain": [
356
+ "{'text': 'GAATCATATTTCTCTTAGTAATGCTTTCTATTCAAAATATTAAATGGAAATTTTAAAAAGAATTTTAGAGCTAATTCTAAACAATCTCTTTTACATTCATTCAACTGCCATTTTACTTTTCTGAAGTTGCTACCTTATTTTCTCTACCAAATATCATTATTTTAATTCATTGATTACATACACAGCTAGTTCTAAAGTTAAAAAAAGTACTTACTAAGCACTTATTAAACAATAAAATATGTTATTCACATTTGAAGCATAAGTAACCAATAAGTAACCAATTGTAACCAATGGTTACGTTAGCATAAATAGGATAAATATTTTAATGTATTAGCATTAGTTTTCTGATAGCACCCTGATCTGCTTGCACAAGAACCCAGGGACGGTTTTACTAGTACGCCAAATAAGCTGCAGCCAAGGGCTCTTATAATATTTAGAGGAACCTATTTTAAAACATGGAACTTTTTTCTTTAAAGTTCTATAAAATTTGATTATTTATGAGTGATAAAGGCCTTAAAAATTTATTCTGCTTTGGCTCCCTGAAATGTTACAAATGTTTTAGGTACTAAAGTGTTATTTATATGAATGCCATATACACTCATAAATTATAACTAAGATTGACTAAATAGCTAATGAATTCCCACAATTTGCCATAAAACATTAACCGGCTTGCCTTACCGTACCAAGTTTAAACTATTTAAAGTTAAAAATTTGGCTTGGGCTAAAGTCACCCTTAGTGCCAAAGTATTATTAATGTGAATCAGCATTTTTTGTTTTGAATCGAATCTTGAATTATAGCAGGTCCTGATTCGCGTTTAAATCATATTTCAAATCAATGTTTTTAAGTCTAAATTTAAGCTTTTTTTGTCTTGTTTTTTTGTCCTGTTTTTTTTTGTCCTGTTTTTGGCCCTGTGAATCAGCATTTTTTGTTTTGAATCGAATCTTGAATCGAAGTAGGTCTTGATTCGCGTTTTTAATCACATTTCAAATCAAAATTTTT'}"
357
+ ]
358
+ },
359
+ "execution_count": 38,
360
+ "metadata": {},
361
+ "output_type": "execute_result"
362
+ }
363
+ ],
364
+ "source": [
365
+ "dataset[\"train\"][0]"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 24,
371
+ "id": "8dfc0ff6-2b11-4020-98aa-c8b2b83b9bbc",
372
+ "metadata": {},
373
+ "outputs": [
374
+ {
375
+ "data": {
376
+ "text/plain": [
377
+ "['GAA',\n",
378
+ " '##TCATATT',\n",
379
+ " '##TCTCTTA',\n",
380
+ " '##GTAATG',\n",
381
+ " '##CTTTCTATT',\n",
382
+ " '##CAAAATATTA',\n",
383
+ " '##AA',\n",
384
+ " '##TGGAA',\n",
385
+ " '##A',\n",
386
+ " '##TTTTAAAAA',\n",
387
+ " '##GAATTTTA',\n",
388
+ " '##GAGCTAA',\n",
389
+ " '##TT',\n",
390
+ " '##CTAAACAA',\n",
391
+ " '##TCTCTTTTA',\n",
392
+ " '##CATT',\n",
393
+ " '##CAT']"
394
+ ]
395
+ },
396
+ "execution_count": 24,
397
+ "metadata": {},
398
+ "output_type": "execute_result"
399
+ }
400
+ ],
401
+ "source": [
402
+ "tokenizer.tokenize(dataset[\"train\"][0][\"text\"][:100])"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": 39,
408
  "id": "604491f9-2ee7-4722-aad6-02e98457b5ee",
409
  "metadata": {},
410
  "outputs": [],
411
  "source": [
412
  "run_path = \"bert_run\"\n",
413
+ "train_epoches = 200\n",
414
  "batch_size = 10\n",
415
  "\n",
416
  "\n",
 
437
  },
438
  {
439
  "cell_type": "code",
440
+ "execution_count": 26,
441
  "id": "d91a8bfb-f3ff-4031-a0d7-ebedc200d65a",
442
  "metadata": {},
443
+ "outputs": [
444
+ {
445
+ "data": {
446
+ "text/html": [
447
+ "\n",
448
+ " <div>\n",
449
+ " \n",
450
+ " <progress value='18000' max='18000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
451
+ " [18000/18000 13:45, Epoch 200/200]\n",
452
+ " </div>\n",
453
+ " <table border=\"1\" class=\"dataframe\">\n",
454
+ " <thead>\n",
455
+ " <tr style=\"text-align: left;\">\n",
456
+ " <th>Step</th>\n",
457
+ " <th>Training Loss</th>\n",
458
+ " </tr>\n",
459
+ " </thead>\n",
460
+ " <tbody>\n",
461
+ " <tr>\n",
462
+ " <td>500</td>\n",
463
+ " <td>9.029000</td>\n",
464
+ " </tr>\n",
465
+ " <tr>\n",
466
+ " <td>1000</td>\n",
467
+ " <td>8.534200</td>\n",
468
+ " </tr>\n",
469
+ " <tr>\n",
470
+ " <td>1500</td>\n",
471
+ " <td>8.344000</td>\n",
472
+ " </tr>\n",
473
+ " <tr>\n",
474
+ " <td>2000</td>\n",
475
+ " <td>8.243700</td>\n",
476
+ " </tr>\n",
477
+ " <tr>\n",
478
+ " <td>2500</td>\n",
479
+ " <td>8.190700</td>\n",
480
+ " </tr>\n",
481
+ " <tr>\n",
482
+ " <td>3000</td>\n",
483
+ " <td>8.172000</td>\n",
484
+ " </tr>\n",
485
+ " <tr>\n",
486
+ " <td>3500</td>\n",
487
+ " <td>8.167900</td>\n",
488
+ " </tr>\n",
489
+ " <tr>\n",
490
+ " <td>4000</td>\n",
491
+ " <td>8.123100</td>\n",
492
+ " </tr>\n",
493
+ " <tr>\n",
494
+ " <td>4500</td>\n",
495
+ " <td>8.081900</td>\n",
496
+ " </tr>\n",
497
+ " <tr>\n",
498
+ " <td>5000</td>\n",
499
+ " <td>8.115100</td>\n",
500
+ " </tr>\n",
501
+ " <tr>\n",
502
+ " <td>5500</td>\n",
503
+ " <td>8.094800</td>\n",
504
+ " </tr>\n",
505
+ " <tr>\n",
506
+ " <td>6000</td>\n",
507
+ " <td>8.088200</td>\n",
508
+ " </tr>\n",
509
+ " <tr>\n",
510
+ " <td>6500</td>\n",
511
+ " <td>8.089600</td>\n",
512
+ " </tr>\n",
513
+ " <tr>\n",
514
+ " <td>7000</td>\n",
515
+ " <td>8.068900</td>\n",
516
+ " </tr>\n",
517
+ " <tr>\n",
518
+ " <td>7500</td>\n",
519
+ " <td>8.067000</td>\n",
520
+ " </tr>\n",
521
+ " <tr>\n",
522
+ " <td>8000</td>\n",
523
+ " <td>8.066400</td>\n",
524
+ " </tr>\n",
525
+ " <tr>\n",
526
+ " <td>8500</td>\n",
527
+ " <td>8.036600</td>\n",
528
+ " </tr>\n",
529
+ " <tr>\n",
530
+ " <td>9000</td>\n",
531
+ " <td>8.057600</td>\n",
532
+ " </tr>\n",
533
+ " <tr>\n",
534
+ " <td>9500</td>\n",
535
+ " <td>8.057800</td>\n",
536
+ " </tr>\n",
537
+ " <tr>\n",
538
+ " <td>10000</td>\n",
539
+ " <td>8.069700</td>\n",
540
+ " </tr>\n",
541
+ " <tr>\n",
542
+ " <td>10500</td>\n",
543
+ " <td>8.032500</td>\n",
544
+ " </tr>\n",
545
+ " <tr>\n",
546
+ " <td>11000</td>\n",
547
+ " <td>8.042600</td>\n",
548
+ " </tr>\n",
549
+ " <tr>\n",
550
+ " <td>11500</td>\n",
551
+ " <td>8.037500</td>\n",
552
+ " </tr>\n",
553
+ " <tr>\n",
554
+ " <td>12000</td>\n",
555
+ " <td>8.068900</td>\n",
556
+ " </tr>\n",
557
+ " <tr>\n",
558
+ " <td>12500</td>\n",
559
+ " <td>8.047800</td>\n",
560
+ " </tr>\n",
561
+ " <tr>\n",
562
+ " <td>13000</td>\n",
563
+ " <td>8.055800</td>\n",
564
+ " </tr>\n",
565
+ " <tr>\n",
566
+ " <td>13500</td>\n",
567
+ " <td>8.050900</td>\n",
568
+ " </tr>\n",
569
+ " <tr>\n",
570
+ " <td>14000</td>\n",
571
+ " <td>8.054800</td>\n",
572
+ " </tr>\n",
573
+ " <tr>\n",
574
+ " <td>14500</td>\n",
575
+ " <td>8.026000</td>\n",
576
+ " </tr>\n",
577
+ " <tr>\n",
578
+ " <td>15000</td>\n",
579
+ " <td>8.050300</td>\n",
580
+ " </tr>\n",
581
+ " <tr>\n",
582
+ " <td>15500</td>\n",
583
+ " <td>8.054800</td>\n",
584
+ " </tr>\n",
585
+ " <tr>\n",
586
+ " <td>16000</td>\n",
587
+ " <td>8.059600</td>\n",
588
+ " </tr>\n",
589
+ " <tr>\n",
590
+ " <td>16500</td>\n",
591
+ " <td>8.042800</td>\n",
592
+ " </tr>\n",
593
+ " <tr>\n",
594
+ " <td>17000</td>\n",
595
+ " <td>8.024000</td>\n",
596
+ " </tr>\n",
597
+ " <tr>\n",
598
+ " <td>17500</td>\n",
599
+ " <td>8.030000</td>\n",
600
+ " </tr>\n",
601
+ " <tr>\n",
602
+ " <td>18000</td>\n",
603
+ " <td>8.050600</td>\n",
604
+ " </tr>\n",
605
+ " </tbody>\n",
606
+ "</table><p>"
607
+ ],
608
+ "text/plain": [
609
+ "<IPython.core.display.HTML object>"
610
+ ]
611
+ },
612
+ "metadata": {},
613
+ "output_type": "display_data"
614
+ }
615
+ ],
616
  "source": [
617
  "trainer.train()\n",
618
  "trainer.save_model(\"dna_bert_v0\")"
 
620
  },
621
  {
622
  "cell_type": "code",
623
+ "execution_count": 28,
624
+ "id": "438f877b-63ca-473f-aa34-2a3291a52c18",
625
+ "metadata": {},
626
+ "outputs": [
627
+ {
628
+ "name": "stdout",
629
+ "output_type": "stream",
630
+ "text": [
631
+ "Perplexity: 145488.27\n"
632
+ ]
633
+ }
634
+ ],
635
+ "source": [
636
+ "import math\n",
637
+ "eval_results = trainer.evaluate()\n",
638
+ "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": 29,
644
  "id": "fc4ad6ad-6433-471f-8510-1ae46558d4ce",
645
  "metadata": {},
646
  "outputs": [],
 
648
  "#upload model\n",
649
  "#model.push_to_hub(\"dna_bert_v0\", organization=\"dnagpt\", use_auth_token=\"hf_*******\")"
650
  ]
651
+ },
652
+ {
653
+ "cell_type": "code",
654
+ "execution_count": 30,
655
+ "id": "bb01748e-4835-4014-bcb5-360931b26c99",
656
+ "metadata": {},
657
+ "outputs": [
658
+ {
659
+ "name": "stderr",
660
+ "output_type": "stream",
661
+ "text": [
662
+ "Some weights of BertModel were not initialized from the model checkpoint at dna_bert_v0 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
663
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
664
+ ]
665
+ },
666
+ {
667
+ "data": {
668
+ "text/plain": [
669
+ "BertModel(\n",
670
+ " (embeddings): BertEmbeddings(\n",
671
+ " (word_embeddings): Embedding(30000, 768, padding_idx=0)\n",
672
+ " (position_embeddings): Embedding(128, 768)\n",
673
+ " (token_type_embeddings): Embedding(2, 768)\n",
674
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
675
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
676
+ " )\n",
677
+ " (encoder): BertEncoder(\n",
678
+ " (layer): ModuleList(\n",
679
+ " (0-11): 12 x BertLayer(\n",
680
+ " (attention): BertAttention(\n",
681
+ " (self): BertSdpaSelfAttention(\n",
682
+ " (query): Linear(in_features=768, out_features=768, bias=True)\n",
683
+ " (key): Linear(in_features=768, out_features=768, bias=True)\n",
684
+ " (value): Linear(in_features=768, out_features=768, bias=True)\n",
685
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
686
+ " )\n",
687
+ " (output): BertSelfOutput(\n",
688
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
689
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
690
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
691
+ " )\n",
692
+ " )\n",
693
+ " (intermediate): BertIntermediate(\n",
694
+ " (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
695
+ " (intermediate_act_fn): GELUActivation()\n",
696
+ " )\n",
697
+ " (output): BertOutput(\n",
698
+ " (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
699
+ " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
700
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
701
+ " )\n",
702
+ " )\n",
703
+ " )\n",
704
+ " )\n",
705
+ " (pooler): BertPooler(\n",
706
+ " (dense): Linear(in_features=768, out_features=768, bias=True)\n",
707
+ " (activation): Tanh()\n",
708
+ " )\n",
709
+ ")"
710
+ ]
711
+ },
712
+ "execution_count": 30,
713
+ "metadata": {},
714
+ "output_type": "execute_result"
715
+ }
716
+ ],
717
+ "source": [
718
+ "from transformers import AutoTokenizer, AutoModel\n",
719
+ "import torch\n",
720
+ "model = AutoModel.from_pretrained('dna_bert_v0')\n",
721
+ "model"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "code",
726
+ "execution_count": null,
727
+ "id": "894a6afa-070c-40af-b3f1-cd45bf541c53",
728
+ "metadata": {},
729
+ "outputs": [],
730
+ "source": []
731
  }
732
  ],
733
  "metadata": {
02-gpt2_bert/.ipynb_checkpoints/5-multi-seq-gpt-checkpoint.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
02-gpt2_bert/1-dna-bpe.ipynb CHANGED
@@ -284,7 +284,7 @@
284
  "id": "c24f10dc-1117-4493-9333-5ed6d898f44a",
285
  "metadata": {},
286
  "source": [
287
- "### 训练DNA BPE分词器\n",
288
  "\n",
289
  "以上方法展示了如何对 DNA 和蛋白质序列进行“分词”,以提取有用的特征。选择哪种方法取决于具体的任务需求和数据特性。对于简单的分类或回归任务,K-mer 分解或滑动窗口可能是足够的;而对于更复杂的任务,如序列标注或结构预测,基于词汇表的方法或嵌入表示可能会提供更好的性能。\n",
290
  "\n",
 
284
  "id": "c24f10dc-1117-4493-9333-5ed6d898f44a",
285
  "metadata": {},
286
  "source": [
287
+ "### **训练DNA BPE分词器**\n",
288
  "\n",
289
  "以上方法展示了如何对 DNA 和蛋白质序列进行“分词”,以提取有用的特征。选择哪种方法取决于具体的任务需求和数据特性。对于简单的分类或回归任务,K-mer 分解或滑动窗口可能是足够的;而对于更复杂的任务,如序列标注或结构预测,基于词汇表的方法或嵌入表示可能会提供更好的性能。\n",
290
  "\n",
02-gpt2_bert/2-dna-gpt.ipynb CHANGED
@@ -49,9 +49,9 @@
49
  "\n",
50
  "### 历史背景\n",
51
  "\n",
52
- "- **发布日期**:GPT-2 最初于 2019 2 月发布。OpenAI 在最初并没有一次性公开所有版本,而是逐步发布了不同规模的模型。\n",
53
  " \n",
54
- "- **开发动机**:GPT-2 是在 GPT-1 的基础上进行的重大改进。它引入了更多的参数和更大的训练数据集,显著提升了模型的能力。此外,GPT-2 还展示了强大的文本生成能力,甚至能够生成逼真的文章段落,这引发了关于 AI 伦理和社会影响的广泛讨论。\n",
55
  "\n",
56
  "- **伦理考虑**:由于 GPT-2 的强大生成能力,OpenAI 初始时对模型的发布采取了谨慎的态度,担心其可能被滥用(例如用于生成假新闻或恶意内容)。因此,他们选择了分阶段发布,并进行了广泛的伦理讨论和研究。\n",
57
  "\n",
@@ -73,6 +73,24 @@
73
  {
74
  "cell_type": "code",
75
  "execution_count": 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "id": "70581590-096f-45f8-b13b-b84e88615849",
77
  "metadata": {},
78
  "outputs": [],
@@ -96,7 +114,7 @@
96
  },
97
  {
98
  "cell_type": "code",
99
- "execution_count": 2,
100
  "id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
101
  "metadata": {},
102
  "outputs": [],
@@ -117,7 +135,7 @@
117
  },
118
  {
119
  "cell_type": "code",
120
- "execution_count": 3,
121
  "id": "87435829-f522-4820-a51d-11fa4afee6d7",
122
  "metadata": {},
123
  "outputs": [],
@@ -136,58 +154,53 @@
136
  ]
137
  },
138
  {
139
- "cell_type": "code",
140
- "execution_count": null,
141
- "id": "0a0adfdd-4be9-4027-a12d-3bf848be3012",
142
  "metadata": {},
143
- "outputs": [],
144
  "source": [
145
- "接着是训练数据集,"
 
 
 
 
146
  ]
147
  },
148
  {
149
  "cell_type": "code",
150
- "execution_count": 4,
151
  "id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
152
  "metadata": {},
153
  "outputs": [
154
  {
155
  "data": {
156
  "application/vnd.jupyter.widget-view+json": {
157
- "model_id": "4e8e73b13d42451bb7214b50bf1d8d47",
158
  "version_major": 2,
159
  "version_minor": 0
160
  },
161
  "text/plain": [
162
- "Generating train split: 0 examples [00:00, ? examples/s]"
163
  ]
164
  },
165
  "metadata": {},
166
  "output_type": "display_data"
167
  },
168
  {
169
- "data": {
170
- "application/vnd.jupyter.widget-view+json": {
171
- "model_id": "6a3e649a8ca14fc8adfc361c2f6eeb7e",
172
- "version_major": 2,
173
- "version_minor": 0
174
- },
175
- "text/plain": [
176
- "Map (num_proc=15): 0%| | 0/971635 [00:00<?, ? examples/s]"
177
- ]
178
- },
179
- "metadata": {},
180
- "output_type": "display_data"
181
  },
182
  {
183
  "data": {
184
  "application/vnd.jupyter.widget-view+json": {
185
- "model_id": "b0fb7862bca842518fa4e96901b93be4",
186
  "version_major": 2,
187
  "version_minor": 0
188
  },
189
  "text/plain": [
190
- "Map (num_proc=15): 0%| | 0/107960 [00:00<?, ? examples/s]"
191
  ]
192
  },
193
  "metadata": {},
@@ -197,6 +210,7 @@
197
  "source": [
198
  "# 1. load dna dataset\n",
199
  "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
 
200
  "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
201
  "\n",
202
  "# 2. tokenize\n",
@@ -212,6 +226,174 @@
212
  ")"
213
  ]
214
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  {
216
  "cell_type": "code",
217
  "execution_count": 5,
@@ -4656,10 +4838,308 @@
4656
  },
4657
  {
4658
  "cell_type": "code",
4659
- "execution_count": null,
4660
  "id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
4661
  "metadata": {},
4662
  "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4663
  "source": []
4664
  }
4665
  ],
 
49
  "\n",
50
  "### 历史背景\n",
51
  "\n",
52
+ "- **发布日期**:GPT(Generative Pre-trained Transformer)的第一个版本,即 GPT-1,是在 2018 年由 OpenAI 发布的。具体来说,关于 GPT-1 的研究论文《Improving Language Understanding by Generative Pre-Training》在 2018 年 6 月发布。\n",
53
  " \n",
54
+ "- **开发动机**:GPT-2 2019年发表,是在 GPT-1 的基础上进行的重大改进。它引入了更多的参数和更大的训练数据集,显著提升了模型的能力。此外,GPT-2 还展示了强大的文本生成能力,甚至能够生成逼真的文章段落,这引发了关于 AI 伦理和社会影响的广泛讨论。\n",
55
  "\n",
56
  "- **伦理考虑**:由于 GPT-2 的强大生成能力,OpenAI 初始时对模型的发布采取了谨慎的态度,担心其可能被滥用(例如用于生成假新闻或恶意内容)。因此,他们选择了分阶段发布,并进行了广泛的伦理讨论和研究。\n",
57
  "\n",
 
73
  {
74
  "cell_type": "code",
75
  "execution_count": 1,
76
+ "id": "83af3495-b1fd-4ea1-84d7-9224b7094c0f",
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "import subprocess\n",
81
+ "import os\n",
82
+ "# 设置环境变量, autodl一般区域\n",
83
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
84
+ "output = result.stdout\n",
85
+ "for line in output.splitlines():\n",
86
+ " if '=' in line:\n",
87
+ " var, value = line.split('=', 1)\n",
88
+ " os.environ[var] = value"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 2,
94
  "id": "70581590-096f-45f8-b13b-b84e88615849",
95
  "metadata": {},
96
  "outputs": [],
 
114
  },
115
  {
116
  "cell_type": "code",
117
+ "execution_count": 6,
118
  "id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
119
  "metadata": {},
120
  "outputs": [],
 
135
  },
136
  {
137
  "cell_type": "code",
138
+ "execution_count": 7,
139
  "id": "87435829-f522-4820-a51d-11fa4afee6d7",
140
  "metadata": {},
141
  "outputs": [],
 
154
  ]
155
  },
156
  {
157
+ "cell_type": "markdown",
158
+ "id": "05875e2f-32e7-485d-9399-99dc1e4bf71f",
 
159
  "metadata": {},
 
160
  "source": [
161
+ "## 训练数据\n",
162
+ "\n",
163
+ "接着是训练数据集,最重要的是构建模型的输入和输出。\n",
164
+ "\n",
165
+ "这里使用DataCollatorForLanguageModeling ,它是专为语言建模而设计(顾名思义)。除了堆叠和填充批次,它还负责创建语言模型标签——在因果语言建模中,输入也用作标签(只是移动了一个元素),并且这个数据整理器在训练期间即时创建它们,所以我们不需要复制 input_ids。"
166
  ]
167
  },
168
  {
169
  "cell_type": "code",
170
+ "execution_count": 9,
171
  "id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
172
  "metadata": {},
173
  "outputs": [
174
  {
175
  "data": {
176
  "application/vnd.jupyter.widget-view+json": {
177
+ "model_id": "3db6964a82794db7ac007c7aa513ad33",
178
  "version_major": 2,
179
  "version_minor": 0
180
  },
181
  "text/plain": [
182
+ "Map (num_proc=15): 0%| | 0/90 [00:00<?, ? examples/s]"
183
  ]
184
  },
185
  "metadata": {},
186
  "output_type": "display_data"
187
  },
188
  {
189
+ "name": "stderr",
190
+ "output_type": "stream",
191
+ "text": [
192
+ "num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.\n"
193
+ ]
 
 
 
 
 
 
 
194
  },
195
  {
196
  "data": {
197
  "application/vnd.jupyter.widget-view+json": {
198
+ "model_id": "ba2c0d0e766949c79e4db6e6bd881f06",
199
  "version_major": 2,
200
  "version_minor": 0
201
  },
202
  "text/plain": [
203
+ "Map (num_proc=10): 0%| | 0/10 [00:00<?, ? examples/s]"
204
  ]
205
  },
206
  "metadata": {},
 
210
  "source": [
211
  "# 1. load dna dataset\n",
212
  "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
213
+ "#dataset = raw_dataset[\"train\"].select(range(100)).train_test_split(test_size=0.1, shuffle=True)\n",
214
  "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
215
  "\n",
216
  "# 2. tokenize\n",
 
226
  ")"
227
  ]
228
  },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 10,
232
+ "id": "2eb1ff7a-f733-404b-a6ed-da82a677da3f",
233
+ "metadata": {},
234
+ "outputs": [
235
+ {
236
+ "name": "stdout",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "[{'input_ids': [20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978, 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412, 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65, 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84, 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137, 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419, 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468, 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65, 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138, 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003, 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772, 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079, 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269, 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614, 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}]\n"
240
+ ]
241
+ }
242
+ ],
243
+ "source": [
244
+ "samples = [tokenized_datasets[\"train\"][0]]\n",
245
+ "print(samples)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": 11,
251
+ "id": "260283a4-5ceb-4ef6-be1b-a4765fb74b20",
252
+ "metadata": {},
253
+ "outputs": [
254
+ {
255
+ "name": "stdout",
256
+ "output_type": "stream",
257
+ "text": [
258
+ "{'input_ids': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
259
+ " 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
260
+ " 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
261
+ " 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
262
+ " 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
263
+ " 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
264
+ " 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
265
+ " 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
266
+ " 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
267
+ " 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
268
+ " 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
269
+ " 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
270
+ " 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
271
+ " 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
272
+ " 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0,\n",
273
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
274
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
275
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
276
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
277
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
278
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
279
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
280
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
281
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
282
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
283
+ " 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
284
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
285
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
286
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
287
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
288
+ " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
289
+ " 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
290
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
291
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
292
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
293
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
294
+ " 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
295
+ " 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
296
+ " 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
297
+ " 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
298
+ " 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
299
+ " 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
300
+ " 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
301
+ " 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
302
+ " 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
303
+ " 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
304
+ " 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
305
+ " 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
306
+ " 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
307
+ " 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, -100,\n",
308
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
309
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
310
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
311
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
312
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
313
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
314
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
315
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
316
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
317
+ " -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
318
+ " -100, -100, -100, -100, -100, -100]])}\n"
319
+ ]
320
+ }
321
+ ],
322
+ "source": [
323
+ "io_data = data_collator(samples)\n",
324
+ "print(io_data)"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "markdown",
329
+ "id": "80a84504-eaa3-43a9-ba13-3a2b73942c59",
330
+ "metadata": {},
331
+ "source": [
332
+ "这段代码展示了如何加载 DNA 数据集、对其进行分词处理,并为语言模型训练准备数据。让我们逐段解析代码,并特别关注 `DataCollatorForLanguageModeling` 函数。\n",
333
+ "\n",
334
+ "### 1. 加载 DNA 数据集\n",
335
+ "\n",
336
+ "```python\n",
337
+ "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
338
+ "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
339
+ "```\n",
340
+ "\n",
341
+ "- **`load_dataset`**:使用 Hugging Face 的 `datasets` 库加载文本文件作为数据集。这里指定的是一个本地的 DNA 序列文本文件 `dna_1g.txt`。\n",
342
+ "- **`train_test_split`**:将原始数据集分割为训练集和测试集,其中测试集占 10%(`test_size=0.1`),并随机打乱数据(`shuffle=True`)。\n",
343
+ "\n",
344
+ "### 2. 定义分词函数\n",
345
+ "\n",
346
+ "```python\n",
347
+ "def tokenize_function(examples):\n",
348
+ " return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
349
+ "```\n",
350
+ "\n",
351
+ "- **`tokenize_function`**:这是一个自定义的分词函数,用于对数据集中的每条记录进行分词处理。\n",
352
+ "- **参数解释**:\n",
353
+ " - `examples['text']`:获取数据集中每条记录的文本内容。\n",
354
+ " - `truncation=True`:确保所有输入序列被截断到 `max_length` 指定的最大长度。\n",
355
+ " - `padding='max_length'`:将所有输入序列填充到 `max_length` 指定的最大长度,以保证批次内所有序列具有相同的长度。\n",
356
+ " - `max_length`:指定最大序列长度,需要根据具体任务和模型要求设置。\n",
357
+ "\n",
358
+ "### 3. 对数据集应用分词函数\n",
359
+ "\n",
360
+ "```python\n",
361
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'], num_proc=15)\n",
362
+ "```\n",
363
+ "\n",
364
+ "- **`map`**:将 `tokenize_function` 应用到整个数据集上。`batched=True` 表示批量处理,可以显著提高处理速度。\n",
365
+ "- **`remove_columns=['text']`**:分词后不再需要原始文本列,因此将其移除。\n",
366
+ "- **`num_proc=15`**:指定使用的 CPU 核心数(或进程数),可以根据你的硬件资源调整。这有助于加速分词过程。\n",
367
+ "\n",
368
+ "### 4. 创建数据收集器\n",
369
+ "\n",
370
+ "```python\n",
371
+ "data_collator = DataCollatorForLanguageModeling(\n",
372
+ " tokenizer=tokenizer, mlm=False\n",
373
+ ")\n",
374
+ "```\n",
375
+ "\n",
376
+ "#### `DataCollatorForLanguageModeling` 函数详解\n",
377
+ "\n",
378
+ "`DataCollatorForLanguageModeling` 是 Hugging Face 提供的一个工具,用于在训练语言模型时动态地处理批次数据。它主要用于两种任务:\n",
379
+ "\n",
380
+ "- **Masked Language Modeling (MLM)**:遮蔽某些 token 并预测它们,常用于预训练模型(如 BERT)。\n",
381
+ "- **Causal Language Modeling (CLM)**:基于前文预测下一个 token,适用于生成式模型(如 GPT 系列)。\n",
382
+ "\n",
383
+ "在这个例子中,`mlm=False` 表明我们正在处理因果语言建模(CLM),即每个 token 只能依赖于其前面的 token 进行预测。这对于像 GPT 这样的生成模型非常适用。\n",
384
+ "\n",
385
+ "- **`tokenizer=tokenizer`**:指定用于编码和解码的分词器对象。\n",
386
+ "- **`mlm=False`**:关闭 MLM 模式,因为我们不需要遮蔽任何 token。对于因果语言建模,模型会尝试根据之前的上下文预测下一个 token。"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "markdown",
391
+ "id": "3fbe9480-c394-4bab-bdee-e80f21e0259a",
392
+ "metadata": {},
393
+ "source": [
394
+ "### 开始训练"
395
+ ]
396
+ },
397
  {
398
  "cell_type": "code",
399
  "execution_count": 5,
 
4838
  },
4839
  {
4840
  "cell_type": "code",
4841
+ "execution_count": 3,
4842
  "id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
4843
  "metadata": {},
4844
  "outputs": [],
4845
+ "source": [
4846
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dna_bpe_dict\")\n",
4847
+ "tokenizer.pad_token = tokenizer.eos_token"
4848
+ ]
4849
+ },
4850
+ {
4851
+ "cell_type": "code",
4852
+ "execution_count": 5,
4853
+ "id": "76f7c636-20c0-47a1-83c1-72e5ee101c0f",
4854
+ "metadata": {},
4855
+ "outputs": [],
4856
+ "source": [
4857
+ "from transformers import AutoTokenizer, AutoModel\n",
4858
+ "model = AutoModel.from_pretrained('dna_gpt2_v0')"
4859
+ ]
4860
+ },
4861
+ {
4862
+ "cell_type": "code",
4863
+ "execution_count": 6,
4864
+ "id": "c041ad1b-7fe4-4d00-a77e-8ab17f020600",
4865
+ "metadata": {},
4866
+ "outputs": [
4867
+ {
4868
+ "name": "stdout",
4869
+ "output_type": "stream",
4870
+ "text": [
4871
+ "[2024-12-30 20:29:16,315] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
4872
+ ]
4873
+ },
4874
+ {
4875
+ "name": "stderr",
4876
+ "output_type": "stream",
4877
+ "text": [
4878
+ "/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
4879
+ "collect2: error: ld returned 1 exit status\n",
4880
+ "/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
4881
+ "/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
4882
+ "/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
4883
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
4884
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
4885
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
4886
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
4887
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
4888
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
4889
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
4890
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4891
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
4892
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
4893
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
4894
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
4895
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4896
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
4897
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
4898
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
4899
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4900
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
4901
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
4902
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4903
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
4904
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
4905
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
4906
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
4907
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
4908
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
4909
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
4910
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
4911
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
4912
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
4913
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
4914
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
4915
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
4916
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
4917
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
4918
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
4919
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
4920
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
4921
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
4922
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4923
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
4924
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
4925
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4926
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
4927
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
4928
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4929
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
4930
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
4931
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
4932
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
4933
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
4934
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
4935
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
4936
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
4937
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
4938
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
4939
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
4940
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
4941
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
4942
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4943
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
4944
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4945
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4946
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
4947
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4948
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
4949
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
4950
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
4951
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
4952
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
4953
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
4954
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
4955
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
4956
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
4957
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
4958
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
4959
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4960
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
4961
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
4962
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
4963
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
4964
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
4965
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
4966
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
4967
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
4968
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
4969
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
4970
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
4971
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
4972
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
4973
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4974
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
4975
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
4976
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
4977
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
4978
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
4979
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
4980
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
4981
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
4982
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
4983
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
4984
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
4985
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
4986
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
4987
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
4988
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4989
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
4990
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
4991
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
4992
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
4993
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
4994
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
4995
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
4996
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
4997
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
4998
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
4999
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
5000
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
5001
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
5002
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
5003
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
5004
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
5005
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
5006
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
5007
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
5008
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
5009
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
5010
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
5011
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
5012
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
5013
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
5014
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
5015
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
5016
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
5017
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
5018
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
5019
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
5020
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
5021
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
5022
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
5023
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
5024
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
5025
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
5026
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
5027
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
5028
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
5029
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
5030
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
5031
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
5032
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
5033
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
5034
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
5035
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
5036
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
5037
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
5038
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
5039
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
5040
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
5041
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
5042
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
5043
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
5044
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
5045
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
5046
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
5047
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
5048
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
5049
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
5050
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
5051
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
5052
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
5053
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
5054
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
5055
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
5056
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
5057
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
5058
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
5059
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
5060
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
5061
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
5062
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
5063
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
5064
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
5065
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
5066
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
5067
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
5068
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
5069
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
5070
+ "collect2: error: ld returned 1 exit status\n"
5071
+ ]
5072
+ },
5073
+ {
5074
+ "data": {
5075
+ "application/vnd.jupyter.widget-view+json": {
5076
+ "model_id": "857d0b6286fb4eaaafcb8911cef664dc",
5077
+ "version_major": 2,
5078
+ "version_minor": 0
5079
+ },
5080
+ "text/plain": [
5081
+ "model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
5082
+ ]
5083
+ },
5084
+ "metadata": {},
5085
+ "output_type": "display_data"
5086
+ },
5087
+ {
5088
+ "data": {
5089
+ "text/plain": [
5090
+ "CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', commit_message='Upload model', commit_description='', oid='e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
5091
+ ]
5092
+ },
5093
+ "execution_count": 6,
5094
+ "metadata": {},
5095
+ "output_type": "execute_result"
5096
+ }
5097
+ ],
5098
+ "source": [
5099
+ "model.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_***\")"
5100
+ ]
5101
+ },
5102
+ {
5103
+ "cell_type": "code",
5104
+ "execution_count": 7,
5105
+ "id": "8a28a45b-56ba-4328-8edf-4cd7ee9289c5",
5106
+ "metadata": {},
5107
+ "outputs": [
5108
+ {
5109
+ "data": {
5110
+ "application/vnd.jupyter.widget-view+json": {
5111
+ "model_id": "42c48d91578f41439d7b3ec26a6d566c",
5112
+ "version_major": 2,
5113
+ "version_minor": 0
5114
+ },
5115
+ "text/plain": [
5116
+ "README.md: 0%| | 0.00/5.17k [00:00<?, ?B/s]"
5117
+ ]
5118
+ },
5119
+ "metadata": {},
5120
+ "output_type": "display_data"
5121
+ },
5122
+ {
5123
+ "data": {
5124
+ "text/plain": [
5125
+ "CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/16138639cb17307b84421e443a1c67f4fe188121', commit_message='Upload tokenizer', commit_description='', oid='16138639cb17307b84421e443a1c67f4fe188121', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
5126
+ ]
5127
+ },
5128
+ "execution_count": 7,
5129
+ "metadata": {},
5130
+ "output_type": "execute_result"
5131
+ }
5132
+ ],
5133
+ "source": [
5134
+ "tokenizer.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_**\")"
5135
+ ]
5136
+ },
5137
+ {
5138
+ "cell_type": "code",
5139
+ "execution_count": null,
5140
+ "id": "ec5364cc-4386-4db8-a400-cd788657de84",
5141
+ "metadata": {},
5142
+ "outputs": [],
5143
  "source": []
5144
  }
5145
  ],
02-gpt2_bert/3-dna-bert.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
02-gpt2_bert/4-gene-feature.ipynb CHANGED
@@ -153,7 +153,7 @@
153
  },
154
  {
155
  "cell_type": "code",
156
- "execution_count": 45,
157
  "id": "f1ca177c-a80f-48a1-b2f9-16c13b3350db",
158
  "metadata": {},
159
  "outputs": [
@@ -163,7 +163,7 @@
163
  "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
164
  ]
165
  },
166
- "execution_count": 45,
167
  "metadata": {},
168
  "output_type": "execute_result"
169
  }
@@ -193,10 +193,18 @@
193
  },
194
  {
195
  "cell_type": "code",
196
- "execution_count": 46,
197
  "id": "2295739c-e80a-47be-9400-88bfab4b0bb6",
198
  "metadata": {},
199
  "outputs": [
 
 
 
 
 
 
 
 
200
  {
201
  "data": {
202
  "text/plain": [
@@ -208,7 +216,7 @@
208
  "})"
209
  ]
210
  },
211
- "execution_count": 46,
212
  "metadata": {},
213
  "output_type": "execute_result"
214
  }
@@ -229,7 +237,7 @@
229
  },
230
  {
231
  "cell_type": "code",
232
- "execution_count": 47,
233
  "id": "9a47a1b1-21f2-4d71-801c-50f88e326ed3",
234
  "metadata": {},
235
  "outputs": [
@@ -240,7 +248,7 @@
240
  " 'label': 0}"
241
  ]
242
  },
243
- "execution_count": 47,
244
  "metadata": {},
245
  "output_type": "execute_result"
246
  }
@@ -259,7 +267,7 @@
259
  },
260
  {
261
  "cell_type": "code",
262
- "execution_count": 52,
263
  "id": "4010d991-056a-43ce-8cca-30eeec8678f5",
264
  "metadata": {},
265
  "outputs": [],
@@ -267,198 +275,154 @@
267
  "import numpy as np\n",
268
  "from sklearn.model_selection import train_test_split\n",
269
  "from sklearn.linear_model import LogisticRegression\n",
270
- "from sklearn.datasets import load_iris\n",
271
  "from sklearn.metrics import accuracy_score\n",
 
 
272
  "\n",
 
 
 
 
273
  "\n",
274
  "def get_gpt2_feature(sequence):\n",
275
- " return \n",
276
- "\n",
277
- "# 加载数据集\n",
278
- "data = load_iris()\n",
279
- "X = data.data[data.target < 2] # 只选择前两个类别\n",
280
- "y = data.target[data.target < 2]\n",
281
- "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  "X = []\n",
283
  "Y = []\n",
284
  "\n",
285
- "for item in dna_data[\"train\"]:\n",
 
286
  " sequence = item[\"sequence\"]\n",
287
  " label = item[\"label\"]\n",
288
  " x_v = get_gpt2_feature(sequence)\n",
289
  " y_v = label\n",
290
  " X.append(x_v)\n",
291
- " Y.append(y_v)"
292
  ]
293
  },
294
  {
295
  "cell_type": "code",
296
- "execution_count": 49,
297
- "id": "8af0effa-b2b6-4e49-9256-cead146d848c",
298
  "metadata": {},
299
- "outputs": [
300
- {
301
- "data": {
302
- "text/plain": [
303
- "array([[5.1, 3.5, 1.4, 0.2],\n",
304
- " [4.9, 3. , 1.4, 0.2],\n",
305
- " [4.7, 3.2, 1.3, 0.2],\n",
306
- " [4.6, 3.1, 1.5, 0.2],\n",
307
- " [5. , 3.6, 1.4, 0.2],\n",
308
- " [5.4, 3.9, 1.7, 0.4],\n",
309
- " [4.6, 3.4, 1.4, 0.3],\n",
310
- " [5. , 3.4, 1.5, 0.2],\n",
311
- " [4.4, 2.9, 1.4, 0.2],\n",
312
- " [4.9, 3.1, 1.5, 0.1],\n",
313
- " [5.4, 3.7, 1.5, 0.2],\n",
314
- " [4.8, 3.4, 1.6, 0.2],\n",
315
- " [4.8, 3. , 1.4, 0.1],\n",
316
- " [4.3, 3. , 1.1, 0.1],\n",
317
- " [5.8, 4. , 1.2, 0.2],\n",
318
- " [5.7, 4.4, 1.5, 0.4],\n",
319
- " [5.4, 3.9, 1.3, 0.4],\n",
320
- " [5.1, 3.5, 1.4, 0.3],\n",
321
- " [5.7, 3.8, 1.7, 0.3],\n",
322
- " [5.1, 3.8, 1.5, 0.3],\n",
323
- " [5.4, 3.4, 1.7, 0.2],\n",
324
- " [5.1, 3.7, 1.5, 0.4],\n",
325
- " [4.6, 3.6, 1. , 0.2],\n",
326
- " [5.1, 3.3, 1.7, 0.5],\n",
327
- " [4.8, 3.4, 1.9, 0.2],\n",
328
- " [5. , 3. , 1.6, 0.2],\n",
329
- " [5. , 3.4, 1.6, 0.4],\n",
330
- " [5.2, 3.5, 1.5, 0.2],\n",
331
- " [5.2, 3.4, 1.4, 0.2],\n",
332
- " [4.7, 3.2, 1.6, 0.2],\n",
333
- " [4.8, 3.1, 1.6, 0.2],\n",
334
- " [5.4, 3.4, 1.5, 0.4],\n",
335
- " [5.2, 4.1, 1.5, 0.1],\n",
336
- " [5.5, 4.2, 1.4, 0.2],\n",
337
- " [4.9, 3.1, 1.5, 0.2],\n",
338
- " [5. , 3.2, 1.2, 0.2],\n",
339
- " [5.5, 3.5, 1.3, 0.2],\n",
340
- " [4.9, 3.6, 1.4, 0.1],\n",
341
- " [4.4, 3. , 1.3, 0.2],\n",
342
- " [5.1, 3.4, 1.5, 0.2],\n",
343
- " [5. , 3.5, 1.3, 0.3],\n",
344
- " [4.5, 2.3, 1.3, 0.3],\n",
345
- " [4.4, 3.2, 1.3, 0.2],\n",
346
- " [5. , 3.5, 1.6, 0.6],\n",
347
- " [5.1, 3.8, 1.9, 0.4],\n",
348
- " [4.8, 3. , 1.4, 0.3],\n",
349
- " [5.1, 3.8, 1.6, 0.2],\n",
350
- " [4.6, 3.2, 1.4, 0.2],\n",
351
- " [5.3, 3.7, 1.5, 0.2],\n",
352
- " [5. , 3.3, 1.4, 0.2],\n",
353
- " [7. , 3.2, 4.7, 1.4],\n",
354
- " [6.4, 3.2, 4.5, 1.5],\n",
355
- " [6.9, 3.1, 4.9, 1.5],\n",
356
- " [5.5, 2.3, 4. , 1.3],\n",
357
- " [6.5, 2.8, 4.6, 1.5],\n",
358
- " [5.7, 2.8, 4.5, 1.3],\n",
359
- " [6.3, 3.3, 4.7, 1.6],\n",
360
- " [4.9, 2.4, 3.3, 1. ],\n",
361
- " [6.6, 2.9, 4.6, 1.3],\n",
362
- " [5.2, 2.7, 3.9, 1.4],\n",
363
- " [5. , 2. , 3.5, 1. ],\n",
364
- " [5.9, 3. , 4.2, 1.5],\n",
365
- " [6. , 2.2, 4. , 1. ],\n",
366
- " [6.1, 2.9, 4.7, 1.4],\n",
367
- " [5.6, 2.9, 3.6, 1.3],\n",
368
- " [6.7, 3.1, 4.4, 1.4],\n",
369
- " [5.6, 3. , 4.5, 1.5],\n",
370
- " [5.8, 2.7, 4.1, 1. ],\n",
371
- " [6.2, 2.2, 4.5, 1.5],\n",
372
- " [5.6, 2.5, 3.9, 1.1],\n",
373
- " [5.9, 3.2, 4.8, 1.8],\n",
374
- " [6.1, 2.8, 4. , 1.3],\n",
375
- " [6.3, 2.5, 4.9, 1.5],\n",
376
- " [6.1, 2.8, 4.7, 1.2],\n",
377
- " [6.4, 2.9, 4.3, 1.3],\n",
378
- " [6.6, 3. , 4.4, 1.4],\n",
379
- " [6.8, 2.8, 4.8, 1.4],\n",
380
- " [6.7, 3. , 5. , 1.7],\n",
381
- " [6. , 2.9, 4.5, 1.5],\n",
382
- " [5.7, 2.6, 3.5, 1. ],\n",
383
- " [5.5, 2.4, 3.8, 1.1],\n",
384
- " [5.5, 2.4, 3.7, 1. ],\n",
385
- " [5.8, 2.7, 3.9, 1.2],\n",
386
- " [6. , 2.7, 5.1, 1.6],\n",
387
- " [5.4, 3. , 4.5, 1.5],\n",
388
- " [6. , 3.4, 4.5, 1.6],\n",
389
- " [6.7, 3.1, 4.7, 1.5],\n",
390
- " [6.3, 2.3, 4.4, 1.3],\n",
391
- " [5.6, 3. , 4.1, 1.3],\n",
392
- " [5.5, 2.5, 4. , 1.3],\n",
393
- " [5.5, 2.6, 4.4, 1.2],\n",
394
- " [6.1, 3. , 4.6, 1.4],\n",
395
- " [5.8, 2.6, 4. , 1.2],\n",
396
- " [5. , 2.3, 3.3, 1. ],\n",
397
- " [5.6, 2.7, 4.2, 1.3],\n",
398
- " [5.7, 3. , 4.2, 1.2],\n",
399
- " [5.7, 2.9, 4.2, 1.3],\n",
400
- " [6.2, 2.9, 4.3, 1.3],\n",
401
- " [5.1, 2.5, 3. , 1.1],\n",
402
- " [5.7, 2.8, 4.1, 1.3]])"
403
- ]
404
- },
405
- "execution_count": 49,
406
- "metadata": {},
407
- "output_type": "execute_result"
408
- }
409
- ],
410
  "source": [
411
- "X"
412
  ]
413
  },
414
  {
415
  "cell_type": "code",
416
- "execution_count": 51,
417
- "id": "868a3cab-e991-4990-9ec5-3e632a41a599",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  "metadata": {},
419
  "outputs": [
420
  {
421
- "data": {
422
- "text/plain": [
423
- "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
424
- " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
425
- " 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
426
- " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
427
- " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])"
428
- ]
429
- },
430
- "execution_count": 51,
431
- "metadata": {},
432
- "output_type": "execute_result"
433
  }
434
  ],
435
  "source": [
436
- "y"
 
 
437
  ]
438
  },
439
  {
440
  "cell_type": "code",
441
- "execution_count": null,
442
- "id": "5ab0c188-6476-43c4-b361-a2bfe0ec7a8a",
443
  "metadata": {},
444
  "outputs": [],
445
  "source": [
446
- "# 将数据分为训练集和测试集\n",
447
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
448
- "\n",
449
- "# 创建逻辑回归模型\n",
450
- "model = LogisticRegression()\n",
451
- "\n",
452
- "# 训练模型\n",
453
- "model.fit(X_train, y_train)\n",
454
- "\n",
455
  "# 在测试集上进行预测\n",
456
- "y_pred = model.predict(X_test)\n",
457
- "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
458
  "# 计算准确率\n",
459
  "accuracy = accuracy_score(y_test, y_pred)\n",
460
- "print(f\"Accuracy: {accuracy * 100:.2f}%\")\n",
461
- "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  "# 输出部分预测结果与真实标签对比\n",
463
  "for i in range(5):\n",
464
  " print(f\"True: {y_test[i]}, Predicted: {y_pred[i]}\")"
 
153
  },
154
  {
155
  "cell_type": "code",
156
+ "execution_count": 2,
157
  "id": "f1ca177c-a80f-48a1-b2f9-16c13b3350db",
158
  "metadata": {},
159
  "outputs": [
 
163
  "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
164
  ]
165
  },
166
+ "execution_count": 2,
167
  "metadata": {},
168
  "output_type": "execute_result"
169
  }
 
193
  },
194
  {
195
  "cell_type": "code",
196
+ "execution_count": 3,
197
  "id": "2295739c-e80a-47be-9400-88bfab4b0bb6",
198
  "metadata": {},
199
  "outputs": [
200
+ {
201
+ "name": "stderr",
202
+ "output_type": "stream",
203
+ "text": [
204
+ "Using the latest cached version of the dataset since dnagpt/dna_core_promoter couldn't be found on the Hugging Face Hub\n",
205
+ "Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/dnagpt___dna_core_promoter/default/0.0.0/809065798bf4928f67397ddba23e4aa9cc5ac3ed (last modified on Fri Dec 27 16:05:19 2024).\n"
206
+ ]
207
+ },
208
  {
209
  "data": {
210
  "text/plain": [
 
216
  "})"
217
  ]
218
  },
219
+ "execution_count": 3,
220
  "metadata": {},
221
  "output_type": "execute_result"
222
  }
 
237
  },
238
  {
239
  "cell_type": "code",
240
+ "execution_count": 4,
241
  "id": "9a47a1b1-21f2-4d71-801c-50f88e326ed3",
242
  "metadata": {},
243
  "outputs": [
 
248
  " 'label': 0}"
249
  ]
250
  },
251
+ "execution_count": 4,
252
  "metadata": {},
253
  "output_type": "execute_result"
254
  }
 
267
  },
268
  {
269
  "cell_type": "code",
270
+ "execution_count": 5,
271
  "id": "4010d991-056a-43ce-8cca-30eeec8678f5",
272
  "metadata": {},
273
  "outputs": [],
 
275
  "import numpy as np\n",
276
  "from sklearn.model_selection import train_test_split\n",
277
  "from sklearn.linear_model import LogisticRegression\n",
 
278
  "from sklearn.metrics import accuracy_score\n",
279
+ "from transformers import GPT2Tokenizer, GPT2Model\n",
280
+ "import torch\n",
281
  "\n",
282
+ "# 初始化 GPT-2 模型和分词器\n",
283
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
284
+ "tokenizer.pad_token = tokenizer.eos_token # 将填充符号设置为 eos_token\n",
285
+ "model = GPT2Model.from_pretrained(\"gpt2\")\n",
286
  "\n",
287
  "def get_gpt2_feature(sequence):\n",
288
+ " \"\"\"\n",
289
+ " 使用 GPT-2 模型提取特征向量。\n",
290
+ " :param sequence: DNA 序列 (字符串格式)\n",
291
+ " :return: 平均特征向量 (numpy 数组)\n",
292
+ " \"\"\"\n",
293
+ " # DNA 序列分词并转换为 GPT-2 输入\n",
294
+ " inputs = tokenizer(sequence, return_tensors=\"pt\", padding=True, truncation=True)\n",
295
+ " with torch.no_grad():\n",
296
+ " outputs = model(**inputs)\n",
297
+ " # 提取最后一层的隐藏状态作为特征向量并平均,会对每个序列的所有 token 的特征进行平均,最终得到一个形状为 (1, 768) 的向量(对于 batch_size=1)\n",
298
+ " feature_vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()\n",
299
+ " return feature_vector\n"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 6,
305
+ "id": "057eee1e-9f9a-47a2-b577-588caec58d31",
306
+ "metadata": {},
307
+ "outputs": [
308
+ {
309
+ "name": "stderr",
310
+ "output_type": "stream",
311
+ "text": [
312
+ "Processing DNA data: 100%|██████████| 59196/59196 [25:16<00:00, 39.04it/s]\n"
313
+ ]
314
+ }
315
+ ],
316
+ "source": [
317
+ "from tqdm import tqdm\n",
318
+ "# 提取特征和标签\n",
319
  "X = []\n",
320
  "Y = []\n",
321
  "\n",
322
+ "# 存储特征向量和标签\n",
323
+ "for item in tqdm(dna_data[\"train\"], desc=\"Processing DNA data\"):\n",
324
  " sequence = item[\"sequence\"]\n",
325
  " label = item[\"label\"]\n",
326
  " x_v = get_gpt2_feature(sequence)\n",
327
  " y_v = label\n",
328
  " X.append(x_v)\n",
329
+ " Y.append(y_v)\n"
330
  ]
331
  },
332
  {
333
  "cell_type": "code",
334
+ "execution_count": 11,
335
+ "id": "51133c2a-42e7-4e11-a6f9-6812a4e54182",
336
  "metadata": {},
337
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  "source": [
339
+ "X = np.array(X).squeeze(1) # 去掉维度为1的那一维"
340
  ]
341
  },
342
  {
343
  "cell_type": "code",
344
+ "execution_count": 17,
345
+ "id": "5ab0c188-6476-43c4-b361-a2bfe0ec7a8a",
346
+ "metadata": {},
347
+ "outputs": [],
348
+ "source": [
349
+ "# 将数据分为训练集和测试集\n",
350
+ "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
351
+ "\n",
352
+ "# 创建逻辑回归模型\n",
353
+ "model = LogisticRegression(max_iter=200, solver='newton-cg')\n"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": 18,
359
+ "id": "fd9be2bf-331e-4905-99e6-832e58a0463a",
360
  "metadata": {},
361
  "outputs": [
362
  {
363
+ "name": "stderr",
364
+ "output_type": "stream",
365
+ "text": [
366
+ "Training Logistic Regression: 100%|██████████| 200/200 [27:45<00:00, 8.33s/it]\n"
367
+ ]
 
 
 
 
 
 
 
368
  }
369
  ],
370
  "source": [
371
+ "# 训练模型\n",
372
+ "for i in tqdm(range(200), desc=\"Training Logistic Regression\"):\n",
373
+ " model.fit(X_train, y_train)"
374
  ]
375
  },
376
  {
377
  "cell_type": "code",
378
+ "execution_count": 19,
379
+ "id": "5417e4e2-3bca-4718-83a1-f418ad8a65b6",
380
  "metadata": {},
381
  "outputs": [],
382
  "source": [
 
 
 
 
 
 
 
 
 
383
  "# 在测试集上进行预测\n",
384
+ "y_pred = model.predict(X_test)"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 20,
390
+ "id": "34371f07-0cbe-43cf-99a1-2ccd55e43e14",
391
+ "metadata": {},
392
+ "outputs": [
393
+ {
394
+ "name": "stdout",
395
+ "output_type": "stream",
396
+ "text": [
397
+ "Accuracy: 77.48%\n"
398
+ ]
399
+ }
400
+ ],
401
+ "source": [
402
  "# 计算准确率\n",
403
  "accuracy = accuracy_score(y_test, y_pred)\n",
404
+ "print(f\"Accuracy: {accuracy * 100:.2f}%\")"
405
+ ]
406
+ },
407
+ {
408
+ "cell_type": "code",
409
+ "execution_count": 21,
410
+ "id": "15472a1e-7813-4ccd-878b-e0cf5d7ce095",
411
+ "metadata": {},
412
+ "outputs": [
413
+ {
414
+ "name": "stdout",
415
+ "output_type": "stream",
416
+ "text": [
417
+ "True: 0, Predicted: 0\n",
418
+ "True: 0, Predicted: 1\n",
419
+ "True: 1, Predicted: 1\n",
420
+ "True: 0, Predicted: 0\n",
421
+ "True: 0, Predicted: 0\n"
422
+ ]
423
+ }
424
+ ],
425
+ "source": [
426
  "# 输出部分预测结果与真实标签对比\n",
427
  "for i in range(5):\n",
428
  " print(f\"True: {y_test[i]}, Predicted: {y_pred[i]}\")"
02-gpt2_bert/5-multi-seq-gpt.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/special_tokens_map-checkpoint.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer-checkpoint.json ADDED
The diff for this file is too large to render. See raw diff
 
02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer_config-checkpoint.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "extra_special_tokens": {},
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 1000000000000000019884624838656,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "tokenizer_class": "PreTrainedTokenizerFast",
52
+ "unk_token": "[UNK]"
53
+ }
02-gpt2_bert/gene_en_bpe.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tokenizers import (
2
+ decoders,
3
+ models,
4
+ normalizers,
5
+ pre_tokenizers,
6
+ processors,
7
+ trainers,
8
+ Tokenizer,
9
+ )
10
+
11
+ tokenizer = Tokenizer(models.BPE())
12
+ tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False) #use_regex=False,空格当成一般字符串
13
+ trainer = trainers.BpeTrainer(vocab_size=90000, special_tokens=["<|endoftext|>"]) #9w words
14
+
15
+ tokenizer.train(["dna_1g.txt","protein_1g.txt","english_500m.txt"]
16
+ , trainer=trainer) #all file list, take 10-20 min
17
+
18
+
19
+ tokenizer.save("gene_eng_dict.json")
02-gpt2_bert/gene_eng_dict.json ADDED
The diff for this file is too large to render. See raw diff
 
02-gpt2_bert/gene_eng_dict/merges.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:713f6b800eca1349925657153ee5e0d0543e7e48909e4db9a18685dbf0f38794
3
+ size 744912
02-gpt2_bert/gene_eng_dict/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
02-gpt2_bert/gene_eng_dict/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
02-gpt2_bert/gene_eng_dict/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1000000000000000019884624838656,
18
+ "tokenizer_class": "GPT2Tokenizer",
19
+ "unk_token": "<|endoftext|>"
20
+ }
02-gpt2_bert/gene_eng_dict/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
03-gene-task/.ipynb_checkpoints/1-category-task-checkpoint.ipynb CHANGED
@@ -1,6 +1,812 @@
1
  {
2
- "cells": [],
3
- "metadata": {},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  "nbformat": 4,
5
  "nbformat_minor": 5
6
  }
 
1
  {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "5840e900-43cb-4ab4-81a5-988b68fda9b1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 3.1 序列分类任务"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "958e7b5f-759a-431c-8af0-325271facb41",
14
+ "metadata": {},
15
+ "source": [
16
+ "基于 GPT-2 模型,可以通过微调(fine-tuning)或使用提示(prompt-based)方法来完成多种下游任务。\n",
17
+ "本章主要使用经典的微调方式,提示微调则属于chatgpt的范围,放在下一章,以下是几种常见的下游任务及其简单描述:\n",
18
+ "\n",
19
+ "\n",
20
+ "### 1. **文本分类**\n",
21
+ "\n",
22
+ "#### 任务描述\n",
23
+ "\n",
24
+ "文本分类是将文本分配到一个或多个预定义类别中的任务。例如,情感分析、主题分类等。生物序列中对应如启动序列等分类问题。\n",
25
+ "\n",
26
+ "#### 使用的模型类型\n",
27
+ "\n",
28
+ "- **GPT2ForSequenceClassification或AutoModelForSequenceClassification**:该模型在 GPT-2 的基础上添加了一个分类头,用于处理文本分类任务。通过微调这个模型,可以将其应用于多种分类任务。\n",
29
+ "\n",
30
+ "### 2. **机器翻译**\n",
31
+ "\n",
32
+ "#### 任务描述\n",
33
+ "\n",
34
+ "机器翻译是指将一种语言的文本转换为另一种语言的过程。生物学中,可以是生物序列到功能描述(英文)的翻译。\n",
35
+ "\n",
36
+ "#### 使用的模型类型\n",
37
+ "\n",
38
+ "- **AutoModelForSeq2SeqLM**:虽然 GPT-2 不是专门为机器翻译设计的模型,但可以通过构造特定格式的提示,让 GPT-2 根据上下文生成目标语言的翻译结果。\n",
39
+ "- **注意**:对于机器翻译任务,通常更推荐使用专门为此类任务设计的模型,如 T5 或 mBART。\n",
40
+ "\n",
41
+ "### 3. **词性标注 (POS Tagging)**\n",
42
+ "\n",
43
+ "#### 任务描述\n",
44
+ "\n",
45
+ "词性标注是指为每个单词分配其正确的词性标签(如名词、动词、形容词等)。生物学中,对应于结构预测任务,典型的如二级结构预测。\n",
46
+ "\n",
47
+ "#### 使用的模型类型\n",
48
+ "\n",
49
+ "- **AutoModelForTokenClassification**:该模型适用于标记级别的分类任务。通过微调,可以将 GPT-2 应用于词性标注,每个 token 的隐藏状态会被映射到相应的词性标签。\n",
50
+ "\n",
51
+ "### 4. **命名实体识别 (NER)**\n",
52
+ "\n",
53
+ "#### 任务描述\n",
54
+ "\n",
55
+ "命名实体识别是指识别文本中的人名、地名、组织机构等实体,并对其进行分类。生物学中,也对应于结构预测任务,典型的如膜结构预测。和词性标注类似。\n",
56
+ "\n",
57
+ "#### 使用的模型类型\n",
58
+ "\n",
59
+ "- **AutoModelForTokenClassification**:类似于词性标注,该模型可以用于 NER 任务,通过对每个 token 进行分类来识别和标注命名实体。\n",
60
+ "\n",
61
+ "### 5. **问答系统**\n",
62
+ "\n",
63
+ "#### 任务描述\n",
64
+ "\n",
65
+ "问答系统旨在根据给定的问题从文档或知识库中提取答案。目前一些最新的生物学大模型论文中,输入是包含生物序列的问题,回答则也是混合式的。一般是生物学领域的QA。\n",
66
+ "\n",
67
+ "#### 使用的模型类型\n",
68
+ "\n",
69
+ "- **AutoModelForQuestionAnswering**:该模型专门用于问答任务,能够理解问题并从上下文中提取答案。通过微调,它可以适应特定领域的问答需求。\n",
70
+ "\n",
71
+ "### 6. **文本生成**\n",
72
+ "\n",
73
+ "#### 任务描述\n",
74
+ "\n",
75
+ "文本生成是指根据给定的提示或前缀生成连贯的文本内容。生物学中,对应新的序列生成,如产生全新的蛋白质序列。\n",
76
+ "\n",
77
+ "#### 使用的模型类型\n",
78
+ "\n",
79
+ "- **GPT2LMHeadModel**:这是 GPT-2 的标准语言模型版本,擅长生成自然流畅的文本。它可以根据输入的提示生成后续文本,广泛应用于创作、对话系统等领域。\n",
80
+ "\n",
81
+ "### 6. **回归问题**\n",
82
+ "\n",
83
+ "#### 任务描述\n",
84
+ "\n",
85
+ "生物序列相关的回归问题,输入为序列,输出为一个float值。\n",
86
+ "\n",
87
+ "#### 使用的模型类型\n",
88
+ "\n",
89
+ "- huggingface没有特定的header,但一般回归问题,输出使用一个线性层即可,设定损失函数为均方误差(MSE)即可。最简单的,就是使用AutoModelForTokenClassification,类别数设置为1,输出的label为实测float值即可。\n",
90
+ "一个官方推荐的 [例子](https://github.com/huggingface/transformers/blob/7ae6f070044b0171a71f3269613bf02fd9fca6f2/src/transformers/models/bert/modeling_bert.py#L1564-L1575)\n",
91
+ "\n",
92
+ "### 小结\n",
93
+ "\n",
94
+ "GPT-2 可以通过微调或提示工程应用于多种下游任务。不同的任务需要使用特定类型的模型,这些模型基于 GPT-2 并添加了额外的组件或进���了调整,以更好地适应特定的任务需求\n",
95
+ "\n",
96
+ "<img src=\"img/gpt2-ft.png\" width=\"800px\" />"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 1,
102
+ "id": "eca17933-7b8f-44de-8c59-ea7a1c8a3b33",
103
+ "metadata": {},
104
+ "outputs": [
105
+ {
106
+ "data": {
107
+ "text/plain": [
108
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
109
+ ]
110
+ },
111
+ "execution_count": 1,
112
+ "metadata": {},
113
+ "output_type": "execute_result"
114
+ }
115
+ ],
116
+ "source": [
117
+ "import subprocess\n",
118
+ "import os\n",
119
+ "# 设置环境变量, autodl一般区域\n",
120
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
121
+ "output = result.stdout\n",
122
+ "for line in output.splitlines():\n",
123
+ " if '=' in line:\n",
124
+ " var, value = line.split('=', 1)\n",
125
+ " os.environ[var] = value\n",
126
+ "\n",
127
+ "\"\"\"\n",
128
+ "import os\n",
129
+ "\n",
130
+ "# 设置环境变量, autodl专区 其他idc\n",
131
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
132
+ "\n",
133
+ "# 打印环境变量以确认设置成功\n",
134
+ "print(os.environ.get('HF_ENDPOINT'))\n",
135
+ "\"\"\""
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 2,
141
+ "id": "108d9c3c-ae4d-4110-a532-a40a6fe1f9df",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "from transformers import AutoTokenizer, AutoModel\n",
146
+ "from tokenizers import Tokenizer\n",
147
+ "from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
148
+ "from transformers import AutoModelForSequenceClassification\n",
149
+ "from transformers import DataCollatorWithPadding"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 6,
155
+ "id": "bcdc9f7a-1ea5-4647-b87e-ac72ddf17818",
156
+ "metadata": {},
157
+ "outputs": [
158
+ {
159
+ "data": {
160
+ "application/vnd.jupyter.widget-view+json": {
161
+ "model_id": "c2e31c61549449e78a4e1fe0e884233f",
162
+ "version_major": 2,
163
+ "version_minor": 0
164
+ },
165
+ "text/plain": [
166
+ "tokenizer_config.json: 0%| | 0.00/580 [00:00<?, ?B/s]"
167
+ ]
168
+ },
169
+ "metadata": {},
170
+ "output_type": "display_data"
171
+ },
172
+ {
173
+ "data": {
174
+ "application/vnd.jupyter.widget-view+json": {
175
+ "model_id": "da2009ca96634f759f052a9a4ff7e41e",
176
+ "version_major": 2,
177
+ "version_minor": 0
178
+ },
179
+ "text/plain": [
180
+ "vocab.json: 0%| | 0.00/642k [00:00<?, ?B/s]"
181
+ ]
182
+ },
183
+ "metadata": {},
184
+ "output_type": "display_data"
185
+ },
186
+ {
187
+ "data": {
188
+ "application/vnd.jupyter.widget-view+json": {
189
+ "model_id": "b6b6ec58d8cb4878aa2e0786ff0bbcf4",
190
+ "version_major": 2,
191
+ "version_minor": 0
192
+ },
193
+ "text/plain": [
194
+ "merges.txt: 0%| | 0.00/323k [00:00<?, ?B/s]"
195
+ ]
196
+ },
197
+ "metadata": {},
198
+ "output_type": "display_data"
199
+ },
200
+ {
201
+ "data": {
202
+ "application/vnd.jupyter.widget-view+json": {
203
+ "model_id": "5dbb5171eb6242bdbded42c87ef46c27",
204
+ "version_major": 2,
205
+ "version_minor": 0
206
+ },
207
+ "text/plain": [
208
+ "special_tokens_map.json: 0%| | 0.00/473 [00:00<?, ?B/s]"
209
+ ]
210
+ },
211
+ "metadata": {},
212
+ "output_type": "display_data"
213
+ }
214
+ ],
215
+ "source": [
216
+ "#set tokenizer\n",
217
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/dna_gpt2_v0\")\n",
218
+ "tokenizer.pad_token = tokenizer.eos_token"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 3,
224
+ "id": "0e930ef5-865a-4528-84b5-ddae6d710a99",
225
+ "metadata": {},
226
+ "outputs": [
227
+ {
228
+ "name": "stderr",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/dna_gpt2_v0 and are newly initialized: ['score.weight']\n",
232
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
233
+ ]
234
+ },
235
+ {
236
+ "data": {
237
+ "text/plain": [
238
+ "GPT2ForSequenceClassification(\n",
239
+ " (transformer): GPT2Model(\n",
240
+ " (wte): Embedding(30000, 768)\n",
241
+ " (wpe): Embedding(1024, 768)\n",
242
+ " (drop): Dropout(p=0.1, inplace=False)\n",
243
+ " (h): ModuleList(\n",
244
+ " (0-11): 12 x GPT2Block(\n",
245
+ " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
246
+ " (attn): GPT2SdpaAttention(\n",
247
+ " (c_attn): Conv1D(nf=2304, nx=768)\n",
248
+ " (c_proj): Conv1D(nf=768, nx=768)\n",
249
+ " (attn_dropout): Dropout(p=0.1, inplace=False)\n",
250
+ " (resid_dropout): Dropout(p=0.1, inplace=False)\n",
251
+ " )\n",
252
+ " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
253
+ " (mlp): GPT2MLP(\n",
254
+ " (c_fc): Conv1D(nf=3072, nx=768)\n",
255
+ " (c_proj): Conv1D(nf=768, nx=3072)\n",
256
+ " (act): NewGELUActivation()\n",
257
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
258
+ " )\n",
259
+ " )\n",
260
+ " )\n",
261
+ " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
262
+ " )\n",
263
+ " (score): Linear(in_features=768, out_features=2, bias=False)\n",
264
+ ")"
265
+ ]
266
+ },
267
+ "execution_count": 3,
268
+ "metadata": {},
269
+ "output_type": "execute_result"
270
+ }
271
+ ],
272
+ "source": [
273
+ "#set model\n",
274
+ "model = AutoModelForSequenceClassification.from_pretrained('dnagpt/dna_gpt2_v0', num_labels=2)\n",
275
+ "model.config.pad_token_id = model.config.eos_token_id\n",
276
+ "model"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "markdown",
281
+ "id": "bd14794b-e507-4c1d-be47-0e0144835f18",
282
+ "metadata": {},
283
+ "source": [
284
+ "在生物学中,**启动子(promoter)** 是一段特定的DNA序列,它位于基因的上游(通常是5'端),并且是转录起始的关键调控元件。启动子的主要功能是为RNA聚合酶提供结合位点,并招募其他转录因子,以启动基因转录过程。以下是关于启动子的一些重要概念和特点:\n",
285
+ "\n",
286
+ "### 启动子的功能\n",
287
+ "\n",
288
+ "1. **转录起始**:\n",
289
+ " - 启动子是基因表达的第一步,它决定了何时、何地以及多频繁地进行转录。\n",
290
+ " \n",
291
+ "2. **调控基因表达**:\n",
292
+ " - 不同类型的启动子可以调节不同组织或细胞类型中的基因表达水平。例如,在某些细胞中高度活跃而在其他细胞中不活跃。\n",
293
+ "\n",
294
+ "3. **与转录因子和其他蛋白质相互作用**:\n",
295
+ " - 启动子区域通常包含多个顺式作用元件(cis-regulatory elements),这些元件可以与特定的转录因子或其他调控蛋白结合,进一步精细调整基因表达。\n",
296
+ " \n",
297
+ " \n",
298
+ "在生物学中,启动子(promoter)序列的二分类问题通常是指将DNA序列分为两类:**启动子序列**和**非启动子序列**。这种分类任务的目标是通过机器学习或生物信息学方法来预测给定的DNA序列是否具有启动子功能。\n",
299
+ "\n",
300
+ "### 二分类问题中的两个类别\n",
301
+ "\n",
302
+ "1. **启动子序列(Promoter Sequences)**:\n",
303
+ " - 这些序列包含能够指导转录起始的调控元件,通常是位于基因5'端上游区域的一段DNA。\n",
304
+ " - 启动子序列可能含有特定的保守基序(motifs),如TATA盒、CAAT盒等,这些基序对于RNA聚合酶及其辅助因子的结合至关重要。\n",
305
+ "\n",
306
+ "2. **非启动子序列(Non-Promoter Sequences)**:\n",
307
+ " - 这类序列指的是那些不具有启动子功能的DNA片段。它们可以来自基因内部(编码区或内含子)、基因间区域(intergenic regions)或其他调控元件(如增强子、沉默子等),但明确不是启动子。\n",
308
+ " - 非启动子序列不具备启动转录的能力,或者至少在自然条件下不会作为主要的转录起始点。\n",
309
+ "\n",
310
+ "### 启动子的研究意义\n",
311
+ "\n",
312
+ "理解启动子的工作机制对于揭示基因表达调控网络非常重要。这不仅有助于基础科学研究,而且对于医学应用也有着深远的影响,比如开发新的治疗策略来纠正异常的基因表达模式,或者利用合成生物学设计定制化的基因表达系统。\n"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 8,
318
+ "id": "aee08f3f-6cda-4975-8cb9-9a7bfacb9eac",
319
+ "metadata": {},
320
+ "outputs": [
321
+ {
322
+ "data": {
323
+ "application/vnd.jupyter.widget-view+json": {
324
+ "model_id": "82d2ec71cf6648469040897d9174a55f",
325
+ "version_major": 2,
326
+ "version_minor": 0
327
+ },
328
+ "text/plain": [
329
+ "README.md: 0%| | 0.00/314 [00:00<?, ?B/s]"
330
+ ]
331
+ },
332
+ "metadata": {},
333
+ "output_type": "display_data"
334
+ },
335
+ {
336
+ "data": {
337
+ "application/vnd.jupyter.widget-view+json": {
338
+ "model_id": "40183e0714ea4155a2c0772fb7c72a00",
339
+ "version_major": 2,
340
+ "version_minor": 0
341
+ },
342
+ "text/plain": [
343
+ "train-00000-of-00001.parquet: 0%| | 0.00/8.66M [00:00<?, ?B/s]"
344
+ ]
345
+ },
346
+ "metadata": {},
347
+ "output_type": "display_data"
348
+ },
349
+ {
350
+ "data": {
351
+ "application/vnd.jupyter.widget-view+json": {
352
+ "model_id": "8e5ebe15df194e3c8bf5811777755947",
353
+ "version_major": 2,
354
+ "version_minor": 0
355
+ },
356
+ "text/plain": [
357
+ "Generating train split: 0%| | 0/59195 [00:00<?, ? examples/s]"
358
+ ]
359
+ },
360
+ "metadata": {},
361
+ "output_type": "display_data"
362
+ }
363
+ ],
364
+ "source": [
365
+ "from datasets import load_dataset\n",
366
+ "# 1. load ~11k samples from promoters prediction dataset\n",
367
+ "dataset = load_dataset(\"dnagpt/dna_promoter_300\")['train'].train_test_split(test_size=0.1)"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 9,
373
+ "id": "6ac9fe5b-2175-42d8-949c-cb12bc8fb65c",
374
+ "metadata": {},
375
+ "outputs": [
376
+ {
377
+ "data": {
378
+ "text/plain": [
379
+ "DatasetDict({\n",
380
+ " train: Dataset({\n",
381
+ " features: ['sequence', 'label'],\n",
382
+ " num_rows: 53275\n",
383
+ " })\n",
384
+ " test: Dataset({\n",
385
+ " features: ['sequence', 'label'],\n",
386
+ " num_rows: 5920\n",
387
+ " })\n",
388
+ "})"
389
+ ]
390
+ },
391
+ "execution_count": 9,
392
+ "metadata": {},
393
+ "output_type": "execute_result"
394
+ }
395
+ ],
396
+ "source": [
397
+ "dataset"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 10,
403
+ "id": "b5025f95-ca5d-42b1-95e1-55495f77d009",
404
+ "metadata": {},
405
+ "outputs": [
406
+ {
407
+ "data": {
408
+ "text/plain": [
409
+ "{'sequence': 'CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGGCCCGCTGCACGAGAGACCATTATGGTGATCCGCCCGCCTGACACTACTGATATGTTGGGATTACAGGCGTGAGCCACGGCGCCCGGCGGGCAAGACACCCTCAGAGCACAGGGTGAATCCATGGTTAAAATACAGCGGGAAGTTAGCGCCGAAGTCGCCGTGTAATTTGTGCGCGGTTCAGGTTCATGTATTCAGAATCATTTTACTAGGTTTAGGGCTCGCCGCTGCCTCAGTGGCTTTCAGGCGCTT',\n",
410
+ " 'label': 0}"
411
+ ]
412
+ },
413
+ "execution_count": 10,
414
+ "metadata": {},
415
+ "output_type": "execute_result"
416
+ }
417
+ ],
418
+ "source": [
419
+ "dataset[\"train\"][0]"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 13,
425
+ "id": "ac999213-67b1-4294-8d92-80b8c6c68acd",
426
+ "metadata": {},
427
+ "outputs": [
428
+ {
429
+ "name": "stdout",
430
+ "output_type": "stream",
431
+ "text": [
432
+ "dna datasets mean token lenght 52.41266891891892 min token length 33 max token length 60\n"
433
+ ]
434
+ }
435
+ ],
436
+ "source": [
437
+ "token_len_list = []\n",
438
+ "for item in dataset[\"test\"]:\n",
439
+ " inputs = tokenizer.tokenize(item[\"sequence\"])\n",
440
+ " token_len_list.append( len(inputs) )\n",
441
+ "\n",
442
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
443
+ "min_len = min(token_len_list)\n",
444
+ "max_len = max(token_len_list)\n",
445
+ "\n",
446
+ "print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 14,
452
+ "id": "72a2dec3-043b-41e4-afd8-4dbd8c8fcbb0",
453
+ "metadata": {},
454
+ "outputs": [
455
+ {
456
+ "data": {
457
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABdnElEQVR4nO3deZyN9f//8eeZfTEzxjIbZowte2NLEyIUkQotiqylLBXSotVSREiopG9ZKh+lT4sP2UVCkrJlrNHImGEyjDH7zPX7Y36OTmO7zpwzZ4bH/XY7N3Nd1/v1vt7nOE7z7Hpf72MxDMMQAAAAAOCqubl6AAAAAABQ2hCkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpADAyUaPHi2LxVIs52rTpo3atGlj3V63bp0sFou+/PLLYjl/3759VbVq1WI5l73S0tL06KOPKiwsTBaLRcOGDXPq+c7//ScnJzv1PNe6vn37qkyZMq4eBgBYEaQAwIS5c+fKYrFYHz4+PoqIiFCHDh00ffp0nT171iHnSUhI0OjRo7V9+3aH9OdIJXlsV2P8+PGaO3euBg0apE8++USPPPJIoTbnw8+VHv8MraVBcQdrs9LT0zV69GitW7fO1UMBgCvycPUAAKA0Gjt2rKKjo5WTk6PExEStW7dOw4YN09SpU7V48WI1bNjQ2vbll1/WCy+8YKr/hIQEjRkzRlWrVlVMTMxV161cudLUeexxubF9+OGHys/Pd/oYimLt2rW6+eab9dprr12yTbdu3VSjRg3rdlpamgYNGqSuXbuqW7du1v2hoaFOHev1Jj09XWPGjJGkUhdSAVx/CFIAYIc777xTTZs2tW6PGjVKa9eu1V133aW7775bcXFx8vX1lSR5eHjIw8O5H7fp6eny8/OTl5eXU89zJZ6eni49/9U4ceKE6tate9k2DRs2tAnDycnJGjRokBo2bKhevXo5e4gAgFKAqX0A4CBt27bVK6+8oj///FOffvqpdf/F7pFatWqVWrZsqbJly6pMmTK64YYb9OKLL0oqmH7VrFkzSVK/fv2s08jmzp0rqeD/1NevX1/btm3TrbfeKj8/P2vtv++ROi8vL08vvviiwsLC5O/vr7vvvltHjx61aVO1alX17du3UO0/+7zS2C52j9S5c+f0zDPPqEqVKvL29tYNN9ygyZMnyzAMm3YWi0VDhw7VN998o/r168vb21v16tXT8uXLL/6C/8uJEyc0YMAAhYaGysfHRzfeeKPmzZtnPX5+Wtvhw4e1dOlS69iPHDlyVf1fzNq1a9WqVSv5+/urbNmyuueeexQXF3fFuj///FM1atRQ/fr1lZSUJEk6ffq0hg0bZn2datSooYkTJ9pc4Tty5IgsFosmT56s2bNnq3r16vL29lazZs20detWu5/HvzljLIsWLVLdunXl4+Oj+vXr6+uvv7Z5vxw5ckQVK1aUJI0ZM8b69zN69Gibfo4dO6Z7771XZcqUUcWKFTVy5Ejl5eXZtFm4cKGaNGmigIAABQYGqkGDBnrnnXcc9voAgMQVKQBwqEceeUQvvviiVq5cqccee+yibX7//XfdddddatiwocaOHStvb28dPHhQGzdulCTVqVNHY8eO1auvvqqBAweqVatWkqRbbrnF2sfff/+tO++8Uz169FCvXr2uOMXsjTfekMVi0fPPP68TJ05o2rRpat++vbZv3269cnY1rmZs/2QYhu6++259//33GjBggGJiYrRixQo9++yzOnbsmN5++22b9j/++KO++uorDR48WAEBAZo+fbq6d++u+Ph4lS9f/pLjysjIUJs2bXTw4EENHTpU0dHRWrRokfr27avTp0/r6aefVp06dfTJJ59o+PDhqly5sp555hlJsv7ybtbq1at15513qlq1aho9erQyMjI0Y8YMtWjRQr/++uslF904dOiQ
2rZtq3LlymnVqlWqUKGC0tPT1bp1ax07dkyPP/64IiMjtWnTJo0aNUrHjx/XtGnTbPpYsGCBzp49q8cff1wWi0WTJk1St27d9McffxT5qqAzxrJ06VI9+OCDatCggSZMmKCUlBQNGDBAlSpVsvZTsWJFvf/++4WmUP7zymBeXp46dOig5s2ba/LkyVq9erWmTJmi6tWra9CgQZIK/ifFQw89pHbt2mnixImSpLi4OG3cuFFPP/10kV4bALBhAACu2pw5cwxJxtatWy/ZJigoyGjUqJF1+7XXXjP++XH79ttvG5KMkydPXrKPrVu3GpKMOXPmFDrWunVrQ5Ixa9asix5r3bq1dfv77783JBmVKlUyUlNTrfu/+OILQ5LxzjvvWPdFRUUZffr0uWKflxtbnz59jKioKOv2N998Y0gyXn/9dZt29913n2GxWIyDBw9a90kyvLy8bPbt2LHDkGTMmDGj0Ln+adq0aYYk49NPP7Xuy87ONmJjY40yZcrYPPeoqCijc+fOl+3v306ePGlIMl577TXrvpiYGCMkJMT4+++/bcbr5uZm9O7d27rv/N//yZMnjbi4OCMiIsJo1qyZcerUKWubcePGGf7+/sb+/fttzvvCCy8Y7u7uRnx8vGEYhnH48GFDklG+fHmb+m+//daQZPzvf/+77PM4/35YtGjRJds4YywNGjQwKleubJw9e9a6b926dYYkm/fLxV7n8/r06WNIMsaOHWuzv1GjRkaTJk2s208//bQRGBho5ObmXva1AICiYmofADhYmTJlLrt6X9myZSVJ3377rd0LM3h7e6tfv35X3b53794KCAiwbt93330KDw/Xd999Z9f5r9Z3330nd3d3PfXUUzb7n3nmGRmGoWXLltnsb9++vapXr27dbtiwoQIDA/XHH39c8TxhYWF66KGHrPs8PT311FNPKS0tTevXr3fAs7ng+PHj2r59u/r27aty5crZjPf222+/6Ou6e/dutW7dWlWrVtXq1asVHBxsPbZo0SK1atVKwcHBSk5Otj7at2+vvLw8/fDDDzZ9Pfjggzb1568MXul1uhqOHktCQoJ27dql3r172yxf3rp1azVo0MD0+J544gmb7VatWtk877Jly+rcuXNatWqV6b4BwAyCFAA4WFpamk1o+bcHH3xQLVq00KOPPqrQ0FD16NFDX3zxhalQValSJVMLS9SsWdNm22KxqEaNGkW6P+hq/Pnnn4qIiCj0etSpU8d6/J8iIyML9REcHKyUlJQrnqdmzZpyc7P9z9qlzlNU5/u74YYbCh2rU6eOkpOTde7cOZv9Xbp0UUBAgFasWKHAwECbYwcOHNDy5ctVsWJFm0f79u0lFdz/9U//fp3OB5krvU5Xw9FjOf9a/XMVxPMutu9yfHx8Ck3F/Pf7Y/DgwapVq5buvPNOVa5cWf3797/q++wAwAzukQIAB/rrr7905syZy/6C6Ovrqx9++EHff/+9li5dquXLl+vzzz9X27ZttXLlSrm7u1/xPGbua7pal/rS4Ly8vKsakyNc6jzGvxamKI26d++uefPm6bPPPtPjjz9ucyw/P1+33367nnvuuYvW1qpVy2bbma9TSRrLv13N+zAkJETbt2/XihUrtGzZMi1btkxz5sxR7969bRYfAYCiIkgBgAN98sknkqQOHTpctp2bm5vatWundu3aaerUqRo/frxeeuklff/992rfvv0lQ429Dhw4YLNtGIYOHjxocyN/cHCwTp8+Xaj2zz//VLVq1azbZsYWFRWl1atX6+zZszZXpfbu3Ws97ghRUVHauXOn8vPzba5KOfo8/zyfJO3bt6/Qsb1796pChQry9/e32f/WW2/Jw8PDupDGww8/bD1WvXp1paWlWa/6uJKjx3L+tTp48GChY//e56j3vZeXl7p06aIuXbooPz9fgwcP1gcffKBXXnnF9FUwALgUpvYBgIOsXbtW48aNU3R0tHr27HnJdqdOnSq07/wX22ZlZUmS9Zfw
iwUbe8yfP9/mvq0vv/xSx48f15133mndV716df3000/Kzs627luyZEmhZdLNjK1Tp07Ky8vTzJkzbfa//fbbslgsNucvik6dOikxMVGff/65dV9ubq5mzJihMmXKqHXr1g45z3nh4eGKiYnRvHnzbF6H3bt3a+XKlerUqVOhGovFotmzZ+u+++5Tnz59tHjxYuuxBx54QJs3b9aKFSsK1Z0+fVq5ubkOHf/lOHosERERql+/vubPn6+0tDTr/vXr12vXrl02bf38/Kznsdfff/9ts+3m5mb9Hwbn/30BgCNwRQoA7LBs2TLt3btXubm5SkpK0tq1a7Vq1SpFRUVp8eLF8vHxuWTt2LFj9cMPP6hz586KiorSiRMn9N5776ly5cpq2bKlpIJQU7ZsWc2aNUsBAQHy9/dX8+bNFR0dbdd4y5Urp5YtW6pfv35KSkrStGnTVKNGDZsl2h999FF9+eWX6tixox544AEdOnRIn376qc3iD2bH1qVLF91222166aWXdOTIEd14441auXKlvv32Ww0bNqxQ3/YaOHCgPvjgA/Xt21fbtm1T1apV9eWXX2rjxo2aNm3aZe9Zs9dbb72lO++8U7GxsRowYIB1+fOgoKBC3310npubmz799FPde++9euCBB/Tdd9+pbdu2evbZZ7V48WLddddd6tu3r5o0aaJz585p165d+vLLL3XkyBFVqFDBYWP/73//a71a9099+vRxyljGjx+ve+65Ry1atFC/fv2UkpKimTNnqn79+jbhytfXV3Xr1tXnn3+uWrVqqVy5cqpfv77q169/1ed69NFHderUKbVt21aVK1fWn3/+qRkzZigmJsZ6zxwAOIRL1wwEgFLm/PLn5x9eXl5GWFiYcfvttxvvvPOOzTLb5/17+fM1a9YY99xzjxEREWF4eXkZERERxkMPPVRouelvv/3WqFu3ruHh4WGz3Hjr1q2NevXqXXR8l1r+/D//+Y8xatQoIyQkxPD19TU6d+5s/Pnnn4Xqp0yZYlSqVMnw9vY2WrRoYfzyyy+F+rzc2P69/LlhGMbZs2eN4cOHGxEREYanp6dRs2ZN46233jLy8/Nt2kkyhgwZUmhMl1qW/d+SkpKMfv36GRUqVDC8vLyMBg0aXHSJdkctf24YhrF69WqjRYsWhq+vrxEYGGh06dLF2LNnj02bfy5/fl56errRunVro0yZMsZPP/1kGEbB6zRq1CijRo0ahpeXl1GhQgXjlltuMSZPnmxkZ2cbhnFhyfG33nqr0BgvNr5/O/9+uNRjw4YNThvLwoULjdq1axve3t5G/fr1jcWLFxvdu3c3ateubdNu06ZNRpMmTQwvLy+bfvr06WP4+/sXOte//319+eWXxh133GGEhIQYXl5eRmRkpPH4448bx48fv+xrAwBmWQzjGriDFwAAlDoxMTGqWLEiS5UDKJW4RwoAADhVTk5OoXur1q1bpx07dqhNmzauGRQAFBFXpAAAgFMdOXJE7du3V69evRQREaG9e/dq1qxZCgoK0u7du1W+fHlXDxEATGOxCQAA4FTBwcFq0qSJ/u///k8nT56Uv7+/OnfurDfffJMQBaDU4ooUAAAAAJjEPVIAAAAAYBJBCgAAAABM4h4pSfn5+UpISFBAQIAsFourhwMAAADARQzD0NmzZxURESE3t0tfdyJISUpISFCVKlVcPQwAAAAAJcTRo0dVuXLlSx4nSEkKCAiQVPBiBQYGung0AADgmlW7tnT8uBQeLu3da758Zm0dP3tc4QHh2jvUfD2AK0tNTVWVKlWsGeFSCFKSdTpfYGAgQQoAADjP+WlCbm6SHb9zuPm4STkFf/I7C+BcV7rlh8UmAAAAAMAkghQAAAAAmESQAgAAAACTuEcKAACguGzdKuXlSe7u9pU/tlV5Rp7cLfbVA3AcghQAAEBxCQ8vWnlA0eoBOA5T+wAAAADAJIIUAAAAAJjE1D4AAIDiMnu2lJYmlSkjDRxovnzbbKVlp6mM
VxkNbGK+HoDjWAzDMFw9CFdLTU1VUFCQzpw5w5fbAQAA56lcWTp2TKpUSfrrL/PlUyvr2NljqhRQSX+NMF8P4MquNhswtQ8AAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHDdqFVLCgqSQkPtKy9fS0E+QQr1t68egONYDMMwXD0IV7vaby8GAAAoTvHx8UpOTnZK3xUqVFBkZKRT+gZKs6vNBlyRAgAAKIHi4+NVu04dZaSnO6V/Xz8/7Y2LI0wBdiJIAQAAlEDJycnKSE/X4MmzFVG9lkP7Tji0X++NHKjk5GSCFGAnghQAAEAJFlG9lqLrxbh6GAD+hSAFAABQXHr2lJKTpQoVpM8+M13+7sbHdDbrbwV4l9eQFh86YYAArhZBCgAAoLisXy8dOyZVqmRXedyJjUrJSFCwb4SDBwbALL5HCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQXB57TDpzRgoKsqv8thq9lZGdKl+vQAcPDIBZBCkAAIDi8tprRSrv3uAFBw0EQFExtQ8AAABF17evZLFITzxR+NiQIQXH+vYt7lFdnfNj/+ejY8cLx48ckQYMkKKjJV9fqXr1glCcnX11/RuGdOedBf1+843tsaeekpo0kby9pZgYxzwfFAuuSAEAAMAxqlSRFi6U3n67IHBIUmamtGCBFBnp2rFdSceO0pw5F7a9vS/8vHevlJ8vffCBVKOGtHt3wTTNc+ekyZOv3Pe0aQUh6lL695e2bJF27rR7+Ch+XJECAACAYzRuXBCmvvrqwr6vvioIUY0a2bbNz5cmTLhwlefGG6Uvv7xwPC/P9irQDTdI77xj20ffvtK99xaEmfBwqXz5gqtfOTnmx+7tLYWFXXgEB184dj5k3XGHVK2adPfd0siRts/zUrZvl6ZMkT7++OLHp08vGHO1aubHDJciSAEAABSXypULrkxUrmxX+dCv66rngrIa+nVdBw/Mgfr3t72y8/HHUr9+hdtNmCDNny/NmiX9/rs0fLjUq5e0fn3B8fz8gtdp0SJpzx7p1VelF1+UvvjCtp/vv5cOHSr4c948ae7cgsd5o0dLVateedzr1kkhIQWBbdAg6e+/L9/+zBmpXLnLt0lPlx5+WHr33YJwhmsKU/sAAADgOL16SaNGSX/+WbC9cWPBdL916y60ycqSxo+XVq+WYmML9lWrJv34Y8H0udatJU9PacyYCzXR0dLmzQVB6oEHLuwPDpZmzpTc3aXataXOnaU1awqm3klShQoF9zRdTseOUrduBec4dKggsN15Z8H53N0Ltz94UJox48rT+oYPl265Rbrnnsu3Q6lEkAIAAIDjVKxYEGbmzi1YZKFz54Iw808HDxZcrbn9dtv92dm2UwDffbfgilZ8vJSRUXD83wsy1KtnG3bCw6Vduy5sDx1a8LicHj0u/NyggdSwYUH4WrdOatfOtu2xYwXB6/77L4S1i1m8WFq7Vvrtt8ufG6UWQQoAAACO1b//hfDy7ruFj6elFfy5dKlUqZLtsfOLPCxcWHAf0pQpBVetAgKkt94qWJThnzw9bbctloJpgUVRrVpB+Dt40DZIJSRIt91WcJVp9uzL97F2bcHVrbJlbfd37y61amV7hQ6lEkEKAAAAjtWxY8HVI4tF6tCh8PG6dQsCU3x8wTS+i9m4sSCwDB58Yd+hQ84Z77/99VfBPVLh4Rf2HTtWEKKaNCm4B8ztCksNvPCC9OijtvsaNChY0bBLF8ePGcWOIAUAAADHcneX4uIu/PxvAQEFV5uGDy+4etSyZcHiDRs3SoGBUp8+Us2aBYtRrFhRcO/SJ59IW7cW/GzGzJnS118X3Dd1MWlpBfdide9esCDEoUPSc88VLHN+PgQeOya1aSNFRRXcF3Xy5IX684tI
HDtWcPVq/nzpppsurP73b5GRts/h4MGCMSQmFkxf3L69YH/dupKXl7nnimJFkAIAAIDjBQZe/vi4cQX3U02YIP3xR8EUuMaNCxZ6kKTHHy+4v+jBBwuubD30UMHVqWXLzI0jOfnyV7Lc3Qu+v2nePOn0aSkiomCZ83HjLkwzXLWqIPAcPFh4xUXDKPgzJ0fat6/g3i8zHn30wkqF0oV7xA4fvrrVBuEyFsM4/7d//UpNTVVQUJDOnDmjwCv9owcAALBX5coFVy4qVSqYPnYZv/76q5o0aaLXv16n6HoxkgqWP0/JSFCwb4Rmdt1j9zAO/75dL3dto23btqlx48Z29wNci642G/A9UgAAAABgEkEKAAAAAEziHikAAIDi8umnBV9Ge/7eG5MG3/KBcvOy5eHOIgSAqxGkAAAAikubNkUqrxvayjHj+P/izq+s5wQVKlRQZGSk0/oHXI0gBQAAcJ05fTJJFotFvXr1cto5fP38tDcujjCFaxZBCgAA4DqTnnpGhmGo37jpql6/ocP7Tzi0X++NHKjk5GSCFK5ZBCkAAIDism7dhXuk7Jjmtydpg/UeKUdM8wuPrmFdWh2AOQQpAACA4tKr11V/j9TFvLfpcYd8jxSAomP5cwAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATOJ7pAAAAOwUHx+v5OTkq25fPydHXpKyc3K0+9dfL9s2Li6uiKMD4EwEKQAAADvEx8erdp06ykhPv+qao5IqSzpx4oSaNGlyVTXZWdn2DRCAUxGkAAAA7JCcnKyM9HQNnjxbEdVrXVXNvH/8/PoV2u5Yv0qLpr2h3Nxc676ZXfeYHygApyBIAQAAFEFE9VqKrhfj8H4TDu13eJ8AHIfFJgAAAADAJIIUAAAAAJjE1D4AAIBiUn/Gm/I6m6rsgEDtfvIF0/X/3fWmMrJT5esVqO4NzNcDcByCFAAAQDGp8cV8+SUlKD00wq4g9f3B+UrJSFCwbwRBCnAxpvYBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1wapPLy8vTKK68oOjpavr6+ql69usaNGyfDMKxtDMPQq6++qvDwcPn6+qp9+/Y6cOCATT+nTp1Sz549FRgYqLJly2rAgAFKS0sr7qcDAAAA4Drh0iA1ceJEvf/++5o5c6bi4uI0ceJETZo0STNmzLC2mTRpkqZPn65Zs2Zpy5Yt8vf3V4cOHZSZmWlt07NnT/3+++9atWqVlixZoh9++EEDBw50xVMCAAAAcB1w6Rfybtq0Sffcc486d+4sSapatar+85//6Oeff5ZUcDVq2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjh+Li4rR8+XJt3bpVTZs2lSTNmDFDnTp10uTJkxUREeGaJwcAAPAvJ25qIe+Uv5UVXN6u+johLXQ2628FeNtXD8BxXBqkbrnlFs2ePVv79+9XrVq1tGPHDv3444+aOnWqJOnw4cNKTExU+/btrTVBQUFq3ry5Nm/erB49emjz5s0qW7asNURJUvv27eXm5qYtW7aoa9euhc6blZWlrKws63ZqaqoTnyUAAECBTVM+LFL9kBZFqwfgOC4NUi+88IJSU1NVu3Ztubu7Ky8vT2+88YZ69uwpSUpMTJQkhYaG2tSFhoZajyUmJiokJMTmuIeHh8qVK2dt828TJkzQmDFjHP10AAAAAFwnXHqP1BdffKHPPvtMCxYs0K+//qp58+Zp8uTJmjdvnlPPO2rUKJ05c8b6OHr0qFPPBwClwboj62QZY9HpzNOSpLnb56rsm2VdOiYAAEoqlwapZ599Vi+88IJ69OihBg0a6JFHHtHw4cM1YcIESVJYWJgkKSkpyaYuKSnJeiwsLEwnTpywOZ6bm6tTp05Z2/ybt7e3AgMDbR4AUJL1/aavLGMsemLJE4WODVk6RJYxFvX9pq9Dz/lg
vQe1/8n9Du3zalWdVlWWMRabx5s/vmk9vi95n26bd5tCJ4fK53UfVXunml5e+7Jy8nIu22/8mXh1XtBZfm/4KeStED278lnl5ufatFl3ZJ0af9BY3q97q8b0Gpq7fa4zniIAoJRz6dS+9PR0ubnZZjl3d3fl5+dLkqKjoxUWFqY1a9YoJiZGUsH9TFu2bNGgQYMkSbGxsTp9+rS2bdumJk2aSJLWrl2r/Px8NW/evPieDAA4WZXAKlq4e6He7vC2fD19JUmZuZlasHuBIoMiHX4+X09f63lcYWybsXqsyWPW7QCvAOvPnu6e6t2wtxqHN1ZZn7LakbRDj/3vMeUb+RrfbvxF+8vLz1PnBZ0VViZMmwZs0vGzx9X7m97ydPe01hxOOazOCzrriSZP6LNun2nN4TV6dPGjCi8Trg41Ojj3CeO60LZ3F/kkn1RmhYpaO/9/puvfWNNFZzJPKsinol5qZ74egOO4NEh16dJFb7zxhiIjI1WvXj399ttvmjp1qvr37y9JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XklSnTh117NhRjz32mGbNmqWcnBwNHTpUPXr0YMU+ANeUxuGNdSjlkL6K+0o9GxbcS/pV3FeKDIpUdNlom7b5Rr4m/jhRs3+drcS0RNUqX0uv3PqK7qt7n7XNdwe+07Dlw3Q09ahurnyz+tzYx6aPudvnatjyYTr9wmlJ0qFThzRi5Qj99NdPOpd9TnUq1tGEdhPUvtqFBYGqTquqgU0G6uCpg1q0Z5GCfYL18q0va2AT819JEeAdoLAyF59ZUC24mqoFV7NuR5WN0roj67QhfsMl+1t5aKX2nNyj1Y+sVmiZUMWExWjcbeP0/OrnNbrNaHm5e2nWL7MUXTZaUzpMkSTVqVhHP8b/qLd/epsgBYcIPHxIfkkJSj9r30JXx1MPKSUjQenZLJQFuJpLp/bNmDFD9913nwYPHqw6depo5MiRevzxxzVu3Dhrm+eee05PPvmkBg4cqGbNmiktLU3Lly+Xj4+Ptc1nn32m2rVrq127durUqZNatmyp2bNnu+IpAYBT9Y/prznb51i3P/7tY/WL6Veo3YQNEzR/53zN6jxLvw/+XcNvHq5eX/XS+iPrJUlHzxxVt8+7qUutLtr++HY92uhRvbD6hcueOy07TZ1qdNKa3mv02+O/qWP1juryny6KPxNv027K5ilqGtFUvz3+mwY3G6xBSwdpX/I+6/E2c9tc1TTEN398U+UnlVejDxrprY1vFZqC908HTx3U8oPL1Tqq9SXbbP5rsxqENFBomQsLGHWo3kGpWan6/cTv1jb/DIbn22z+a/MVxwsAuL649IpUQECApk2bpmnTpl2yjcVi0dixYzV27NhLtilXrpwWLFjghBECQMnSq2EvjVozSn+e/lOStPHoRi28b6HWHVlnbZOVm6XxP47X6kdWK7ZKrKSCKzg/xv+oD7Z9oNZVW+v9X95X9XLVrVdebqhwg3ad2KWJGyde8tw3ht2oG8NutG6PaztOX+/9Wov3LdbQm4Za93eq2UmDmw2WJD3f4nm9/dPb+v7I97qhwg2SpMigSIWXCb/s83yq+VNqHN5Y5XzLadPRTRq1ZpSOpx3X1A5Tbdrd8tEt+vX4r8rKy9LAxgM19rZL/7ciMS3RJkRJsm4npiVeaONfuE1qVqoycjJcOtURAFCyuDRIAQDMqehfUZ1rddbc7XNlyFDnmp1Vwa+CTZuDpw4qPSddt39yu83+7LxsNQpvJEmKS45T80q295HGVo697LnTstM0et1oLT2wVMfPHldufq4ycjMKXZFqGNLQ+rPFYlFYmTCdOHdhUaD5Xedf8XmOiB1xob/QhvJy99LjSx7XhHYT5O3hbT32+X2f62z2We1I3KFnVz2ryZsm67kWz12xfwAAioogBQClTP+Y/hq6rOAK0Lud3i10PC07TZK09OGlqhRYyeaYt7t3ofZXa+TKkVr1xypNvn2yapSr
IV9PX933xX3Kzsu2aefp7mmzbZFF+Ua+3eeVpOaVmis3P1dHTh+xXtmSpCpBVSRJdSvWVZ6Rp4H/G6hnYp+Ru5t7oT7CyoTp52M/2+xLSkuyHjv/Z9K5pEJtAr0DuRoFALBBkAKAUqZjjY7KzsuWRRZ1qF54AYS6FevK291b8Wfi1brqxe8ZqlOhjhbvW2yz76e/frrseTce3ai+N/ZV1zpdJRUEtiOnj9j3JEzanrhdbhY3hfiHXLJNvpGvnPwc5Rv5clfhIBVbOVZvbHhDJ86dsPaz6o9VCvQOVN2Kda1tvjv4nU3dqj9WXfFqHQDg+kOQAoBSxt3NXXFD4qw//1uAd4BG3jJSw1cMV76Rr5aRLXUm64w2xm9UoHeg+sT00RNNn9CUzVP07Mpn9WjjR7Xt+DbN3TH3suetWa6mvtr7lbrc0EUWWfTK96/YdaWp99e9VSmgkia0n3DR45uPbtaWY1t0W9XbFOAdoM1HN2v4iuHq1bCXgn2DJUmf7fxMnu6eahDSQN4e3vol4ReNWjNKD9Z70HpF7Ou4rzVqzSjtHbpXknRH9TtUt2JdPfL1I5rUfpIS0xL18tqXNaTZEOt0wSeaPqGZW2fquVXPqX+j/lp7eK2++P0LLX14qennCQC4thGkAKAUCvS+/BeJj7ttnCr6VdSEHyfoj5Q/VNanrBqHN9aLrV6UVLDgw38f+K+GrxiuGT/P0E2VbtL4tuPVf3H/S/Y5tcNU9f+2v2756BZV8Kug51s8r9Qs80swx5+Jl5vl0ovGent4a+HuhRq9brSy8rIUXTZaw28ebnPflIebhyZunKj9f++XYRiKKhuloc2GanjscGubM1lntO/vC6sFuru5a8lDSzRo6SDFfhQrfy9/9bmxj80CFdHB0Vr68FINXzFc72x5R5UDK+v/7v4/lj4HABRiMQzDcPUgXC01NVVBQUE6c+aMAgMv/8sJAACAJP36669q0qSJXv96naLrxVxVzb2t6hZ8j1RohL7ZsOeybTcu/kLvjRyoFz/9TvVuukWSNPTrukrJSFCwb4Rmdr18vdm+Henw79v1ctc22rZtmxo3buzw/gFnutpswBUpAACAYrJryHPyTE9Tjl8Zu+q7NXhOmTlp8vG0rx6A4xCkAAAAismhHn2LVN+2RtHqATjOpSepAwAAAAAuiiAFAAAAACYxtQ8AAKCY+JxIlCU/T4abuzJDwkzXp2QkKt/Ik5vFXcG+5usBOA5BCgAAoJh07N72qlftu5hXlrd1yKp9AIqOqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGCSh6sHAAAAcL1YM+9bueXlKt/dvl/BXmz3rfLzc+Xmxq9wgKvxrxAAAKCYnK1Ws0j1EYFFqwfgOEztAwAAAACTCFIAAAAAYBJT+wAAAIpJ1P8WySMjQ7m+vvqzy/2m6zceWaTs3Ax5efiqRVXz9QAchyAFAABQTBpNek1+SQlKD42wK0j957fXlJKRoGDfCIIU4GJM7QMAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYxBfyAgAAFJOMiiE2f5pV1jfE5k8ArkOQAgAAKCYrvlpXpPrXOxatHoDjEKQAAMA1LT4+XsnJyQ7vNy4uzuF9Aig9CFIAAOCaFR8fr9p16igjPd1p58jOynZa3wBKLoIUAAC4ZiUnJysjPV2DJ89WRPVaDu17x/pVWjTtDeXm5jq0XwClA0EKAABc8yKq11J0vRiH9plwaL/pmmavDJP3mRRlBQVr67hppus/+nmY0rJSVMY7WANuMl8PwHEIUgAAAMWk0rqV8ktKUHpohLbaUf/bsZVKyUhQsG+Ew8cGwBy+RwoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACY
RJACAAAAAJMIUgAAAABgEl/ICwAAUEyO3NVdXmdOKzuorF31t1TtrnPZp+XvZV89AMchSAEAABST7c+PK1L9w42KVg/AcZjaBwAAAAAmEaQAAAAAwCSCFAAAAACYxD1SAAAAxaRzh2byO5Go9JAwLV2x1XT9yCXNlJKeqGC/ME2+y3w9AMfhihQAAEAx8Uw/J89zZ+WZfs6u+sycc8rMPavMHPvqATgOQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHC9+HnsVLlnZirPx8eu+v43TVVOXqY83e2rB+A4BCkAAIBiknBbxyLVN65UtHoAjsPUPgAAAAAwiSAFAAAAACYxtQ8AAKCYBO/eLvecbOV5eimlfozp+sOntis3L1se7l6KLme+HoDjEKQAAACKSetBD8svKUHpoRH6ZsMe0/VT1j+slIwEBftGaGZX8/UAHIcgBQAAAKeIi4tzSr8VKlRQZGSkU/oGrhZBCgAAAA51+mSSLBaLevXq5ZT+ff38tDcujjAFlyJIAQAAwKHSU8/IMAz1Gzdd1es3dGjfCYf2672RA5WcnEyQgksRpAAAAOAU4dE1FF0vxtXDAJyC5c8BAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAExi1T4AAIBismTZFkmGJItd9W/dtUWGDFnsrAfgOAQpAACAYpJbJqBI9b6eRasH4DhM7QMAAAAAkwhSAAAAAGCSy4PUsWPH1KtXL5UvX16+vr5q0KCBfvnlF+txwzD06quvKjw8XL6+vmrfvr0OHDhg08epU6fUs2dPBQYGqmzZshowYIDS0tKK+6kAAABcVu2PZ6rB9Amq/fFMu+q/i5up/+6coO/i7KsH4DguDVIpKSlq0aKFPD09tWzZMu3Zs0dTpkxRcHCwtc2kSZM0ffp0zZo1S1u2bJG/v786dOigzMxMa5uePXvq999/16pVq7RkyRL98MMPGjhwoCueEgAAwCXVnvOeGsycqNpz3rOr/ru97+mr3RP13V776gE4jksXm5g4caKqVKmiOXPmWPdFR0dbfzYMQ9OmTdPLL7+se+65R5I0f/58hYaG6ptvvlGPHj0UFxen5cuXa+vWrWratKkkacaMGerUqZMmT56siIiI4n1SAAAAAK55Lr0itXjxYjVt2lT333+/QkJC1KhRI3344YfW44cPH1ZiYqLat29v3RcUFKTmzZtr8+bNkqTNmzerbNmy1hAlSe3bt5ebm5u2bNly0fNmZWUpNTXV5gEAAAAAV8ulQeqPP/7Q+++/r5o1a2rFihUaNGiQnnrqKc2bN0+SlJiYKEkKDQ21qQsNDbUeS0xMVEhIiM1xDw8PlStXztrm3yZMmKCgoCDro0qVKo5+agAAAACuYS4NUvn5+WrcuLHGjx+vRo0aaeDAgXrsscc0a9Ysp5531KhROnPmjPVx9OhRp54PAAAAwLXFpUEqPDxcdevWtdlXp04dxcfHS5LCwsIkSUlJSTZtkpKSrMfCwsJ04sQJm+O5ubk6deqUtc2/eXt7KzAw0OYBAAAAAFfLpUGqRYsW2rdvn82+/fv3KyoqSlLBwhNhYWFas2aN9Xhqaqq2bNmi2NhYSVJsbKxOnz6tbdu2WdusXbtW+fn5at68eTE8CwAAAADXG5eu2jd8+HDdcsstGj9+vB544AH9/PPPmj17tmbPni1JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XUsEVrI4dO1qnBObk5Gjo0KHq0aMHK/YBAAAAcAqXBqlmzZrp66+/1qhRozR27FhFR0dr2rRp6tmzp7XNc889p3PnzmngwIE6ffq0WrZsqeXLl8vHx8fa5rPPPtPQoUPVrl07ubm5qXv37po+fbornhIAAACA64BLg5Qk3XXXXbrrrrsuedxisWjs2LEaO3bsJduUK1dOCxYscMbw
AAAAHOZUvYZKD6+kzHLl7aqPLtdQ5TMrKdDHvnoAjuPyIAUAAHC9+GHWwiLVP9O6aPUAHMeli00AAAAAQGlEkAIAAAAAkwhSAAAAAGAS90gBAAAUk1uf6CGfU38rs1x5u+6XmrK+h1Iz/1agT3nulwJcjCAFAABQTMr9vlN+SQlKD7Xvuy4Pn9qplIwEBfvyXZmAqzG1DwAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQTPb2GyzPtLPKKRNgV32n2oOVkXNWvp721QNwHLuC1B9//KFq1ao5eiwAAADXtL39hxapvlOdotUDcBy7pvbVqFFDt912mz799FNlZmY6ekwAAAAAUKLZFaR+/fVXNWzYUCNGjFBYWJgef/xx/fzzz44eGwAAAACUSHYFqZiYGL3zzjtKSEjQxx9/rOPHj6tly5aqX7++pk6dqpMnTzp6nAAAAKWeR9pZeaSlyiPtrF31GTlnlZ6Tqowc++oBOE6RVu3z8PBQt27dtGjRIk2cOFEHDx7UyJEjVaVKFfXu3VvHjx931DgBAABKvbvubK4HGkfqrjub21X/7JLmemxRpJ5dYl89AMcpUpD65ZdfNHjwYIWHh2vq1KkaOXKkDh06pFWrVikhIUH33HOPo8YJAAAAACWGXav2TZ06VXPmzNG+ffvUqVMnzZ8/X506dZKbW0Eui46O1ty5c1W1alVHjhUAAAAASgS7gtT777+v/v37q2/fvgoPD79om5CQEH300UdFGhwAAAAAlER2BakDBw5csY2Xl5f69OljT/cAAAAAUKLZdY/UnDlztGjRokL7Fy1apHnz5hV5UAAAAABQktkVpCZMmKAKFSoU2h8SEqLx48cXeVAAAAAAUJLZFaTi4+MVHR1daH9UVJTi4+OLPCgAAAAAKMnsClIhISHauXNnof07duxQ+fLlizwoAAAAACjJ7Fps4qGHHtJTTz2lgIAA3XrrrZKk9evX6+mnn1aPHj0cOkAAAOB68fHxSk5OdkrfFSpUUGRkpFP6BgBnsStIjRs3TkeOHFG7du3k4VHQRX5+vnr37s09UgAAXGPi4+NVu04dZaSnO6V/Xz8/7Y2Luy7C1Pr3F8g9J1t5nl521T/TeoFy87Ll4W5fPQDHsStIeXl56fPPP9e4ceO0Y8cO+fr6qkGDBoqKinL0+AAAgIslJycrIz1dgyfPVkT1Wg7tO+HQfr03cqCSk5OviyCVUj+mSPXR5YpWD8Bx7ApS59WqVUu1ajn2AxUAAJRMEdVrKbpejKuHAQAlgl1BKi8vT3PnztWaNWt04sQJ5efn2xxfu3atQwYHAAAAACWRXUHq6aef1ty5c9W5c2fVr19fFovF0eMCAAC45kR8v1zumZnK8/FRwm0dTdf/emy5cvIy5enuo8aVzNcDcBy7gtTChQv1xRdfqFOnTo4eDwAAwDXrpldHyC8pQemhEfpmg/kg9PHPI5SSkaBg3wg17kqQAlzJru+R8vLyUo0aNRw9FgAAAAAoFewKUs8884zeeecdGYbh6PEAAAAAQIln19S+H3/8Ud9//72WLVumevXqydPT0+b4V1995ZDBAQAAAEBJZFeQKlu2rLp27erosQAAAABAqWBXkJozZ46jxwEAAAAApYZd90hJUm5urlavXq0PPvhAZ8+elSQlJCQoLS3NYYMDAAAAgJLIritSf/75pzp27Kj4+HhlZWXp9ttvV0BAgCZOnKisrCzNmjXL0eMEAAAAgBLDritSTz/9tJo2baqUlBT5+vpa93ft2lVr1qxx2OAAAAAAoCSy64rUhg0btGnTJnl5ednsr1q1qo4dO+aQgQEAAFxrcvz8leMfoBw/f7vqfTz95ZMTIB9P++oBOI5dQSo/P195eXmF9v/1118KCAgo8qAAAACuRUtXbC1S/eS7ilYPwHHsmtp3xx13aNq0adZti8WitLQ0vfbaa+rU
qZOjxgYAAAAAJZJdV6SmTJmiDh06qG7dusrMzNTDDz+sAwcOqEKFCvrPf/7j6DECAAAAQIliV5CqXLmyduzYoYULF2rnzp1KS0vTgAED1LNnT5vFJwAAAADgWmRXkJIkDw8P9erVy5FjAQAAuKbFTHxFXmdOKzuorLY/P850/YLfXtG57NPy9yqrhxuZrwfgOHYFqfnz51/2eO/eve0aDAAAwLWs6pL/yi8pQemhEXYFqU1H/quUjAQF+0YQpAAXsytIPf300zbbOTk5Sk9Pl5eXl/z8/AhSAAAAAK5pdq3al5KSYvNIS0vTvn371LJlSxabAAAAAHDNsytIXUzNmjX15ptvFrpaBQAAAADXGocFKalgAYqEhARHdgkAAAAAJY5d90gtXrzYZtswDB0/flwzZ85UixYtHDIwAAAAACip7ApS9957r822xWJRxYoV1bZtW02ZMsUR4wIAAACAEsuuIJWfn+/ocQAAAABAqeHQe6QAAAAA4Hpg1xWpESNGXHXbqVOn2nMKAACAa86xNnfI+0yKsoKC7apvVOkOpWWlqIy3ffUAHMeuIPXbb7/pt99+U05Ojm644QZJ0v79++Xu7q7GjRtb21ksFseMEgAA4Bqwddy0ItUPuKlo9QAcx64g1aVLFwUEBGjevHkKDi74PyIpKSnq16+fWrVqpWeeecahgwQAAACAksSue6SmTJmiCRMmWEOUJAUHB+v1119n1T4AAAAA1zy7glRqaqpOnjxZaP/Jkyd19uzZIg8KAAAAAEoyu6b2de3aVf369dOUKVN00003SZK2bNmiZ599Vt26dXPoAAEAAK4VHbq1ke/JE8qoGKIVX60zXf/y8jY6nXFCZX1D9HpH8/UAHMeuIDVr1iyNHDlSDz/8sHJycgo68vDQgAED9NZbbzl0gAAAANcK35Mn5JeUYHf96YwTSsmwvx6A49gVpPz8/PTee+/prbfe0qFDhyRJ1atXl7+/v0MHBwAAAAAlUZG+kPf48eM6fvy4atasKX9/fxmG4ahxAQAAAECJZVeQ+vvvv9WuXTvVqlVLnTp10vHjxyVJAwYMYOlzAAAAANc8u4LU8OHD5enpqfj4ePn5+Vn3P/jgg1q+fLnDBgcAAAAAJZFd90itXLlSK1asUOXKlW3216xZU3/++adDBgYAAAAAJZVdV6TOnTtncyXqvFOnTsnb27vIgwIAAACAksyuINWqVSvNnz/fum2xWJSfn69Jkybptttuc9jgAAAAAKAksmtq36RJk9SuXTv98ssvys7O1nPPPafff/9dp06d0saNGx09RgAAAAAoUewKUvXr19f+/fs1c+ZMBQQEKC0tTd26ddOQIUMUHh7u6DECAABcE357bow8MjKU6+trV/1DjcYoOzdDXh721QNwHNNBKicnRx07dtSsWbP00ksvOWNMAAAA16Q/u9xfpPoWVYtWD8BxTN8j5enpqZ07dzpjLAAAAABQKti12ESvXr300UcfOXosAAAAAFAq2HWPVG5urj7++GOtXr1aTZo0kb+/v83xqVOnOmRwAAAA15KAPw7ILS9X+e4eOlutpun6hNQDys/PlZubhyICzdcDcBxTQeqPP/5Q1apVtXv3bjVu3FiStH//fps2FovFcaMDAAC4hrTrc4/8khKUHhqhbzbsMV0/fs09SslIULBvhGZ2NV8PwHFMBamaNWvq+PHj+v777yVJDz74oKZPn67Q0FCnDA4AAAAASiJT90gZhmGzvWzZMp07d86hAwIAAACAks6uxSbO+3ewAgAAAIDrgakgZbFYCt0DxT1RAAAAAK43pu6RMgxDffv2lbe3tyQpMzNTTzzxRKFV+7766ivHjRAAAAAAShhTV6T69OmjkJAQBQUFKSgoSL169VJERIR1+/zDHm+++aYsFouGDRtm3ZeZmakhQ4aofPnyKlOmjLp3766kpCSbuvj4eHXu3Fl+fn4KCQnRs88+q9zcXLvGAAAAAABXw9QVqTlz5jhlEFu3
btUHH3yghg0b2uwfPny4li5dqkWLFikoKEhDhw5Vt27dtHHjRklSXl6eOnfurLCwMG3atEnHjx9X79695enpqfHjxztlrAAAAABQpMUmHCEtLU09e/bUhx9+qODgYOv+M2fO6KOPPtLUqVPVtm1bNWnSRHPmzNGmTZv0008/SZJWrlypPXv26NNPP1VMTIzuvPNOjRs3Tu+++66ys7Nd9ZQAAAAAXONcHqSGDBmizp07q3379jb7t23bppycHJv9tWvXVmRkpDZv3ixJ2rx5sxo0aGDzPVYdOnRQamqqfv/990ueMysrS6mpqTYPAAAAALhapqb2OdrChQv166+/auvWrYWOJSYmysvLS2XLlrXZHxoaqsTERGubf38Z8Pnt820uZsKECRozZkwRRw8AAGDO8v+ulSU/T4abu1314zquVb6RJzeLffUAHMdlQero0aN6+umntWrVKvn4+BTruUeNGqURI0ZYt1NTU1WlSpViHQMAALj+ZIaEFak+2Ldo9QAcx2VT+7Zt26YTJ06ocePG8vDwkIeHh9avX6/p06fLw8NDoaGhys7O1unTp23qkpKSFBZW8CESFhZWaBW/89vn21yMt7e3AgMDbR4AAAAAcLVcFqTatWunXbt2afv27dZH06ZN1bNnT+vPnp6eWrNmjbVm3759io+PV2xsrCQpNjZWu3bt0okTJ6xtVq1apcDAQNWtW7fYnxMAAACA64PLpvYFBASofv36Nvv8/f1Vvnx56/4BAwZoxIgRKleunAIDA/Xkk08qNjZWN998syTpjjvuUN26dfXII49o0qRJSkxM1Msvv6whQ4ZYvzQYAACgpKi+cK4809OU41dGh3r0NV2/9uBcZeakycezjNrWMF8PwHFcutjElbz99ttyc3NT9+7dlZWVpQ4dOui9996zHnd3d9eSJUs0aNAgxcbGyt/fX3369NHYsWNdOGoAAICLa/DuJPklJSg9NMKuIPXVrklKyUhQsG/EdR+k4uLinNJvhQoVFBkZ6ZS+cW0pUUFq3bp1Nts+Pj5699139e67716yJioqSt99952TRwYAAICS4PTJJFksFvXq1csp/fv6+WlvXBxhCldUooIUAAAAcDnpqWdkGIb6jZuu6vUbOrTvhEP79d7IgUpOTiZI4YoIUgAAACh1wqNrKLpejKuHgeuYy1btAwAAAIDSiiAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1i1DwAAoJikRldXdkCgMitUtKs+PLC6/LwCFeRjXz0AxyFIAQAAFJO18/9XpPqX2hWtHoDjMLUPAAAAAEwiSAEAAACASQQpAAAAADCJe6QAAACKyS3PPCbvlL+VFVxem6Z8aLr+3Y2P6WzW3wrwLq8hLczXA3AcghQAAEAxCfl5o/ySEpQeGmFXfdyJjUrJSFCwr331AByHqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk/hCXgAArgHx8fFKTk52St9xcXFO6fd6dPCB3vI6m6rsgEC76m+r0VsZ2any9bKvHoDjEKQAACjl4uPjVbtOHWWkpzv1PNlZ2U7t/3qw+8kXilTfvUHR6gE4DkEKAIBSLjk5WRnp6Ro8ebYiqtdyeP871q/SomlvKDc31+F9A0BpRZACAOAaEVG9lqLrxTi834RD+x3eJwCUdiw2AQAAAAAmcUUKAACgmNzbqq78khKUHhqhbzbsMV0/9Ou6SslIULBvhGZ2NV8PwHG4IgUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACZ5uHoAAAAA14tNkz+Qe3a28ry87KoffMsHys3Lloe7ffUAHIcgBQAAUExONG9VpPq6oUWrB+A4TO0DAAAAAJMIUgAAAABgElP7AAAAiknIlg3We6Tsmea3J2mD
9R4ppvkBrkWQAgAAKCa3jHxcfkkJSg+N0Dcb9piuf2/T40rJSFCwb4RmdjVfD8BxmNoHAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmebh6AAAAANeLbzbsKVL9zK5FqwfgOFyRAgAAAACTCFIAAAAAYBJBCgAAAABM4h4pAACAYlJ/xpvyOpuq7IBA7X7yBdP1/931pjKyU+XrFajuDczXA3AcghQAAEAxqfHFfPklJSg9NMKuIPX9wflKyUhQsG8EQQpwMab2AQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAEziC3kBAACKyYmbWsg75W9lBZe3q75OSAudzfpbAd721QNwHIIUAABAMdk05cMi1Q9pUbR6AI7D1D4AAAAAMIkgBQAAAAAmEaQAAAAAwCTukQIAACgmbXt3kU/ySWVWqKi18/9nuv6NNV10JvOkgnwq6qV25usBOA5BCgAAoJgEHj4kv6QEpZ9Ntav+eOohpWQkKD3bvnoAjsPUPgAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJfCEvAABAMdk15Dl5pqcpx6+MXfXdGjynzJw0+XjaVw/AcQhSAAAAxeRQj75Fqm9bo2j1AByHqX0AAAAAYBJBCgAAAABMcmmQmjBhgpo1a6aAgACFhITo3nvv1b59+2zaZGZmasiQISpfvrzKlCmj7t27KykpyaZNfHy8OnfuLD8/P4WEhOjZZ59Vbm5ucT4VAACAK/I5kSjfxGPyOZFoV31KRqL+Tj+mlAz76gE4jkvvkVq/fr2GDBmiZs2aKTc3Vy+++KLuuOMO7dmzR/7+/pKk4cOHa+nSpVq0aJGCgoI0dOhQdevWTRs3bpQk5eXlqXPnzgoLC9OmTZt0/Phx9e7dW56enho/frwrnx4AADbi4+OVnJzs8H7j4uIc3ieco2P3tvJLSlB6aIS+2bDHdP0ry9sqJSNBwb4RmtnVfD0Ax3FpkFq+fLnN9ty5cxUSEqJt27bp1ltv1ZkzZ/TRRx9pwYIFatu2rSRpzpw5qlOnjn766SfdfPPNWrlypfbs2aPVq1crNDRUMTExGjdunJ5//nmNHj1aXl5ehc6blZWlrKws63ZqaqpznygA4LoXHx+v2nXqKCM93WnnyM7KdlrfAABbJWrVvjNnzkiSypUrJ0natm2bcnJy1L59e2ub2rVrKzIyUps3b9bNN9+szZs3q0GDBgoNDbW26dChgwYNGqTff/9djRo1KnSeCRMmaMyYMU5+NgAAXJCcnKyM9HQNnjxbEdVrObTvHetXadG0N5jWDgDFqMQEqfz8fA0bNkwtWrRQ/fr1JUmJiYny8vJS2bJlbdqGhoYqMTHR2uafIer88fPHLmbUqFEaMWKEdTs1NVVVqlRx1FMBAOCSIqrXUnS9GIf2mXBov0P7AwBcWYkJUkOGDNHu3bv1448/Ov1c3t7e8vb2dvp5AAAAAFybSsTy50OHDtWSJUv0/fffq3Llytb9YWFhys7O1unTp23aJyUlKSwszNrm36v4nd8+3wYAAAAAHMmlQcowDA0dOlRff/211q5dq+joaJvjTZo0kaenp9asWWPdt2/fPsXHxys2NlaSFBsbq127dunEiRPWNqtWrVJgYKDq1q1bPE8EAAAAwHXFpVP7hgwZogULFujbb79VQECA9Z6moKAg+fr6KigoSAMGDNCIESNUrlw5BQYG6sknn1RsbKxuvvlmSdIdd9yhunXr6pFHHtGkSZOUmJiol19+WUOGDGH6HgAAAACncGmQev/99yVJbdq0sdk/Z84c9e3bV5L09ttvy83NTd27d1dWVpY6dOig9957z9rW3d1dS5Ys0aBBgxQbGyt/f3/16dNH
Y8eOLa6nAQAAAOA649IgZRjGFdv4+Pjo3Xff1bvvvnvJNlFRUfruu+8cOTQAAAAAuKQSs2ofAADAtW7NvG/llperfHf7fgV7sd23ys/PlZsbv8IBrsa/QgAAgGJytlrNItVHBBatHoDjlIjlzwEAAACgNCFIAQAAAIBJTO0DAAAoJlH/WySPjAzl+vrqzy73m67feGSRsnMz5OXhqxZVzdcDcByCFAAAQDFpNOk1+SUlKD00wq4g9Z/fXlNKRoKCfSMIUk4UFxfntL4rVKigyMhIp/WP4kOQAgAAACSdPpkki8WiXr16Oe0cvn5+2hsXR5i6BhCkAAAAAEnpqWdkGIb6jZuu6vUbOrz/hEP79d7IgUpOTiZIXQMIUgAAAMA/hEfXUHS9GFcPAyUcq/YBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJjEqn0AAADFJKNiiM2fZpX1DbH5E4DrEKQAAACKyYqv1hWp/vWORasH4DhM7QMAAAAAkwhSAAAAAGASQQoAAAAATOIeKQAAgGLS7JVh8j6ToqygYG0dN810/Uc/D1NaVorKeAdrwE3m6wE4DkEKAACgmFRat1J+SQlKD43QVjvqfzu2UikZCQr2jXD42ACYw9Q+AAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEl8IS8AAEAxOXJXd3mdOa3soLJ21d9StbvOZZ+Wv5d99QAchyAFAABQTLY/P65I9Q83Klo9AMdhah8AAAAAmESQAgAAAACTCFIAAAAAYBL3SAEAABSTzh2aye9EotJDwrR0xVbT9SOXNFNKeqKC/cI0+S7z9QAchyAFAMD/Fx8fr+TkZKf0HRcX55R+Ubp4pp+T57mz8kwPsKs+M+ecMnPPKjPHvnoAjkOQAgBABSGqdp06ykhPd+p5srOyndo/AKB4EKQAAJCUnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BAMWPIAUAwD9EVK+l6HoxDu834dB+h/cJAHAdVu0DAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJVfsAAACKyc9jp8o9M1N5Pj521fe/aapy8jLl6W5fPQDHIUgBAAAUk4TbOhapvnGlotUDcBym9gEAAACASQQpAAAAADCJqX0AAADFJHj3drnnZCvP00sp9WNM1x8+tV25ednycPdSdDnz9QAchyAFAABQTFoPelh+SQlKD43QNxv2mK6fsv5hpWQkKNg3QjO7mq8H4DhM7QMAAAAAk7giBQAAABSjuLg4p/RboUIFRUZGOqVvFEaQAgAAAIrB6ZNJslgs6tWrl1P69/Xz0964OMJUMSFIAQAAAMUgPfWMDMNQv3HTVb1+Q4f2nXBov94bOVDJyckEqWJCkAIAAACKUXh0DUXXi3H1MFBELDYBAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1hsAgAAoJgsWbZFkiHJYlf9W3dtkSFDFjvrATgOQQoAAKCY5JYJKFK9r2fR6gE4DkEKAFCqxMfHKzk52eH9xsXFObxPAMC1iyAFACg14uPjVbtOHWWkpzvtHNlZ2U7rGwBw7SBIAQBKjeTkZGWkp2vw5NmKqF7LoX3vWL9Ki6a9odzcXIf2C/xT7Y9nyjPtrHLKBGhv/6Gm67+Lm6mMnLPy9QxQpzrm6wE4DkEKAFDqRFSvpeh6MQ7tM+HQfof2B1xM7TnvyS8pQemhEfYFqb3vKSUjQcG+EQQpwMVY/hwAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEl/ICwAAUExO1Wuo9PBKyixX3q766HINVT6zkgJ97KsH4DgEKQAAgGLyw6yFRap/pnXR6gE4DkEKAOBQ8fHxSk5OdkrfcXFxTukXAACzCFIAAIeJ
j49X7Tp1lJGe7tTzZGdlO7V/ACitnPk/nCpUqKDIyEin9V/aEKQAAA6TnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BoDQ7fTJJFotFvXr1cto5fP38tDcujjD1/xGkAAAOF1G9lqLrxTi834RD+x3eJ1Ccbn2ih3xO/a3McuXtul9qyvoeSs38W4E+5blfCjbSU8/IMAz1Gzdd1es3dHj/CYf2672RA5WcnEyQ+v8IUgAAAMWk3O875ZeUoPTQCLvqD5/aqZSMBAX72lePa194dA2n/I8sFMb3SAEAAACASVyRAoDrkLNW1mNVPQDA9YIgBeCynLmUtVS6VwBy5mvjzNelOFbWY1U9AMC1jiAF4JKK4xfu0roCkLNfG2e+Ls5cWY9V9QAA14trJki9++67euutt5SYmKgbb7xRM2bM0E033eTqYQGlmrOXsi7NKwA587U5/7ps2LBBderUcWjf0oXpd85YWY9V9QAA14trIkh9/vnnGjFihGbNmqXmzZtr2rRp6tChg/bt26eQkBBXDw8o9Zy1lPV5zrqvpjimDTrjtSmO7wKRmH4HADCvNP8329GuiSA1depUPfbYY+rXr58kadasWVq6dKk+/vhjvfDCCy4enXnOvO8iKytL3t7eTulbcv59HaX1Xh1nj91Zf6/OXjjA2YGhtE4bdPZ3gTD9DgBgFv/NLqzUB6ns7Gxt27ZNo0aNsu5zc3NT+/bttXnz5ovWZGVlKSsry7p95swZSVJqaqpzB3sVjh49qqbNmikzI8NJZ7BIMpzUt+Tt46NP5s9XaGioQ/tNSkpS7969lZmZ6dB+/6k0j93Zf68Htv+izPRzDu/34G9bZRiGbn/kcYVHVXNo338n/qWl/zdDK1as0A033ODQviVp3759kqQjv+9w+GtzfnpcdmaGU173nP//+fdn3C65WRz7vjk/dmf07ez+Gbtr+k88fFCStG3bNqWlpTm0b6l4/q2aeV1uzc5UrqTU7EzFbd1ouv/ctEwpR8rNu3K9o8deUvovrX07u39nj704/pt95MgRlS1b1qF92+N8JjCMy7+OFuNKLUq4hIQEVapUSZs2bVJsbKx1/3PPPaf169dry5YthWpGjx6tMWPGFOcwAQAAAJQiR48eVeXKlS95vNRfkbLHqFGjNGLECOt2fn6+Tp06pfLly8tisbhwZNee1NRUValSRUePHlVgYKCrhwMX4r0AifcBCvA+wHm8FyCVvPeBYRg6e/asIiIiLtuu1AepChUqyN3dXUlJSTb7k5KSFBYWdtEab2/vQveTlITLiNeywMDAEvEPA67HewES7wMU4H2A83gvQCpZ74OgoKArtnErhnE4lZeXl5o0aaI1a9ZY9+Xn52vNmjU2U/0AAAAAwFFK/RUpSRoxYoT69Omjpk2b6qabbtK0adN07tw56yp+AAAAAOBI10SQevDBB3Xy5Em9+uqrSkxMVExMjJYvX+7w1ddgnre3t1577TWnLrmO0oH3AiTeByjA+wDn8V6AVHrfB6V+1T4AAAAAKG6l/h4pAAAAAChuBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFIrs/fffV8OGDa1fohYbG6tly5ZZj7dp00YWi8Xm8cQTT7hwxCgOb775piwWi4YNG2bdl5mZqSFDhqh8+fIqU6aMunfvXujLtHHtudh7gc+F68Po0aML/T3Xrl3bepzPhOvDld4HfB5cX44dO6ZevXqpfPny8vX1VYMGDfTLL79YjxuGoVdffVXh4eHy9fVV+/btdeDAAReO+NKuieXP4VqVK1fWm2++qZo1a8owDM2bN0/33HOPfvvtN9WrV0+S9Nhjj2ns2LHWGj8/P1cNF8Vg69at+uCDD9SwYUOb/cOHD9fSpUu1aNEiBQUFaejQoerWrZs2btzoopHC2S71XpD4XLhe1KtXT6tXr7Zue3hc+NWDz4Trx+XeBxKfB9eLlJQUtWjRQrfddpuWLVumihUr6sCBAwoO
Dra2mTRpkqZPn6558+YpOjpar7zyijp06KA9e/bIx8fHhaMvjCCFIuvSpYvN9htvvKH3339fP/30kzVI+fn5KSwszBXDQzFLS0tTz5499eGHH+r111+37j9z5ow++ugjLViwQG3btpUkzZkzR3Xq1NFPP/2km2++2VVDhpNc6r1wHp8L1wcPD4+L/j3zmXB9udT74Dw+D64PEydOVJUqVTRnzhzrvujoaOvPhmFo2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjR7GP+XKY2geHysvL08KFC3Xu3DnFxsZa93/22WeqUKGC6tevr1GjRik9Pd2Fo4QzDRkyRJ07d1b79u1t9m/btk05OTk2+2vXrq3IyEht3ry5uIeJYnCp98J5fC5cHw4cOKCIiAhVq1ZNPXv2VHx8vCQ+E643l3ofnMfnwfVh8eLFatq0qe6//36FhISoUaNG+vDDD63HDx8+rMTERJvPhaCgIDVv3rxEfi5wRQoOsWvXLsXGxiozM1NlypTR119/rbp160qSHn74YUVFRSkiIkI7d+7U888/r3379umrr75y8ajhaAsXLtSvv/6qrVu3FjqWmJgoLy8vlS1b1mZ/aGioEhMTi2mEKC6Xey9IfC5cL5o3b665c+fqhhtu0PHjxzVmzBi1atVKu3fv5jPhOnK590FAQACfB9eRP/74Q++//75GjBihF198UVu3btVTTz0lLy8v9enTx/pvPzQ01KaupH4uEKTgEDfccIO2b9+uM2fO6Msvv1SfPn20fv161a1bVwMHDrS2a9CggcLDw9WuXTsdOnRI1atXd+Go4UhHjx7V008/rVWrVpW4OcwoXlfzXuBz4fpw5513Wn9u2LChmjdvrqioKH3xxRfy9fV14chQnC73PhgwYACfB9eR/Px8NW3aVOPHj5ckNWrUSLt379asWbPUp08fF4/OPKb2wSG8vLxUo0YNNWnSRBMmTNCNN96od95556JtmzdvLkk6ePBgcQ4RTrZt2zadOHFCjRs3loeHhzw8PLR+/XpNnz5dHh4eCg0NVXZ2tk6fPm1Tl5SUxLz4a8yV3gt5eXmFavhcuD6ULVtWtWrV0sGDBxUWFsZnwnXqn++Di+Hz4NoVHh5unbF0Xp06daxTPc//2//36p0l9XOBIAWnyM/PV1ZW1kWPbd++XVLBPyZcO9q1a6ddu3Zp+/bt1kfTpk3Vs2dP68+enp5as2aNtWbfvn2Kj4+3uZ8Opd+V3gvu7u6FavhcuD6kpaXp0KFDCg8PV5MmTfhMuE79831wMXweXLtatGihffv22ezbv3+/oqKiJBUsPBEWFmbzuZCamqotW7aUyM8FpvahyEaNGqU777xTkZGROnv2rBYsWKB169ZpxYoVOnTokBYsWKBOnTqpfPny2rlzp4YPH65bb731ossho/QKCAhQ/fr1bfb5+/urfPny1v0DBgzQiBEjVK5cOQUGBurJJ59UbGwsq3NdY670XuBz4foxcuRIdenSRVFRUUpISNBrr70md3d3PfTQQwoKCuIz4TpxufcBnwfXl+HDh+uWW27R+PHj9cADD+jnn3/W7NmzNXv2bEmyfufg66+/rpo1a1qXP4+IiNC9997r2sFfBEEKRXbixAn17t1bx48fV1BQkBo2bKgVK1bo9ttv19GjR7V69WpNmzZN586dU5UqVdS9e3e9/PLLrh42XODtt9+Wm5ubunfvrqysLHXo0EHvvfeeq4eFYubl5cXnwnXir7/+0kMPPaS///5bFStWVMuWLfXTTz+pYsWKkvhMuF5c7n2QmZnJ58F1pFmzZvr66681atQojR07VtHR0Zo2bZp69uxpbfPcc8/p3LlzGjhwoE6fPq2WLVtq+fLlJfL+a4thGIarBwEAAAAApQn3SAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAEqUI0eOyGKxaPv27a4e
SonRpk0bDRs2zNXDAAD8A0EKAOBwFovlso/Ro0e7eoiFlISwsm7dOlksFp0+fdql4wAAXJmHqwcAALj2HD9+3Prz559/rldffVX79u2z7itTpowrhgUAgMNwRQoA4HBhYWHWR1BQkCwWi3U7JCREU6dOVeXKleXt7a2YmBgtX778kn3l5eWpf//+ql27tuLj4yVJ3377rRo3biwfHx9Vq1ZNY8aMUW5urrXGYrHo//7v/9S1a1f5+fmpZs2aWrx4cZGe048//qhWrVrJ19dXVapU0VNPPaVz585Zj1etWlXjx49X//79FRAQoMjISM2ePdumj02bNikmJkY+Pj5q2rSpvvnmG+s0xiNHjui2226TJAUHB8tisahv377W2vz8fD333HMqV66cwsLCSuRVPQC4nhCkAADF6p133tGUKVM0efJk7dy5Ux06dNDdd9+tAwcOFGqblZWl+++/X9u3b9eGDRsUGRmpDRs2qHfv3nr66ae1Z88effDBB5o7d67eeOMNm9oxY8bogQce0M6dO9WpUyf17NlTp06dsmvMhw4dUseOHdW9e3ft3LlTn3/+uX788UcNHTrUpt2UKVPUtGlT/fbbbxo8eLAGDRpkvRKXmpqqLl26qEGDBvr11181btw4Pf/889baKlWq6L///a8kad++fTp+/Ljeeecd6/F58+bJ399fW7Zs0aRJkzR27FitWrXKrucDAHAAAwAAJ5ozZ44RFBRk3Y6IiDDeeOMNmzbNmjUzBg8ebBiGYRw+fNiQZGzYsMFo166d0bJlS+P06dPWtu3atTPGjx9vU//JJ58Y4eHh1m1Jxssvv2zdTktLMyQZy5Ytu+Q4W7dubTz99NMXPTZgwABj4MCBNvs2bNhguLm5GRkZGYZhGEZUVJTRq1cv6/H8/HwjJCTEeP/99w3DMIz333/fKF++vLW9YRjGhx9+aEgyfvvtN8MwDOP77783JBkpKSmFxtayZUubfc2aNTOef/75Sz4fAIBzcY8UAKDYpKamKiEhQS1atLDZ36JFC+3YscNm30MPPaTKlStr7dq18vX1te7fsWOHNm7caHMFKi8vT5mZmUpPT5efn58kqWHDhtbj/v7+CgwM1IkTJ+wa944dO7Rz50599tln1n2GYSg/P1+HDx9WnTp1Cp3z/HTG8+fct2+fGjZsKB8fH2ubm2666arH8M++JSk8PNzu5wMAKDqCFACgROrUqZM+/fRTbd68WW3btrXuT0tL05gxY9StW7dCNf8MKZ6enjbHLBaL8vPz7RpLWlqaHn/8cT311FOFjkVGRjrlnP/mzL4BAOYRpAAAxSYwMFARERHauHGjWrdubd2/cePGQldnBg0apPr16+vuu+/W0qVLre0bN26sffv2qUaNGsU27saNG2vPnj1FOucNN9ygTz/9VFlZWfL29pYkbd261aaNl5eXpIIrbACAko0gBQAoVs8++6xee+01Va9eXTExMZozZ462b99uM23uvCeffFJ5eXm66667tGzZMrVs2VKvvvqq7rrrLkVGRuq+++6Tm5ubduzYod27d+v1118v0thOnjxZ6IuAw8PD9fzzz+vmm2/W0KFD9eijj8rf31979uzRqlWrNHPmzKvq++GHH9ZLL72kgQMH6oUXXlB8fLwmT54sqeDqkiRFRUXJYrFoyZIl6tSpk3x9fVkqHgBKKFbtAwAUq6eeekojRozQM888owYNGmj58uVavHixatasedH2w4YN05gxY9SpUydt2rRJHTp00JIlS7Ry5Uo1a9ZMN998s95++21FRUUVeWwLFixQo0aNbB4ffvihGjZsqPXr12v//v1q1aqVGjVqpFdffVURERFX3XdgYKD+97//afv27YqJidFLL72kV199VdKFKYmVKlXSmDFj9MILLyg0NLTQqoAAgJLDYhiG4epBAABwPfrss8/Ur18/nTlzxmZBDQBAycfUPgAAisn8+fNVrVo1VapUSTt27NDzzz+vBx54gBAFAKUQ
QQoAgGKSmJioV199VYmJiQoPD9f9999f6IuEAQClA1P7AAAAAMAkFpsAAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmPT/ALFDWFiHUDxIAAAAAElFTkSuQmCC",
458
+ "text/plain": [
459
+ "<Figure size 1000x600 with 1 Axes>"
460
+ ]
461
+ },
462
+ "metadata": {},
463
+ "output_type": "display_data"
464
+ }
465
+ ],
466
+ "source": [
467
+ "#统计图\n",
468
+ "import matplotlib.pyplot as plt\n",
469
+ "import seaborn as sns\n",
470
+ "import numpy as np\n",
471
+ "\n",
472
+ "# 假设这是您的 token_len_list\n",
473
+ "\n",
474
+ "# 设置画布大小\n",
475
+ "plt.figure(figsize=(10, 6))\n",
476
+ "\n",
477
+ "# 使用 seaborn 生成直方图\n",
478
+ "sns.histplot(token_len_list, bins=30, kde=False, color=\"skyblue\", edgecolor=\"black\")\n",
479
+ "\n",
480
+ "# 添加标题和标签\n",
481
+ "plt.title(\"Distribution of Token Lengths\")\n",
482
+ "plt.xlabel(\"Token Length\")\n",
483
+ "plt.ylabel(\"Frequency\")\n",
484
+ "\n",
485
+ "# 显示平均值线\n",
486
+ "mean_value = np.mean(token_len_list)\n",
487
+ "plt.axvline(mean_value, color='red', linestyle='dashed', linewidth=2)\n",
488
+ "plt.text(mean_value + 2, plt.ylim()[1]*0.9, f'Mean: {mean_value:.2f}', color='red')\n",
489
+ "\n",
490
+ "# 显示中位数线\n",
491
+ "median_value = np.median(token_len_list)\n",
492
+ "plt.axvline(median_value, color='green', linestyle='dashed', linewidth=2)\n",
493
+ "plt.text(median_value - 10, plt.ylim()[1]*0.8, f'Median: {median_value:.2f}', color='green')\n",
494
+ "\n",
495
+ "# 显示图形\n",
496
+ "plt.show()"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": 15,
502
+ "id": "9a65c8bc-6bf0-4605-8c38-409bbb14f2c7",
503
+ "metadata": {},
504
+ "outputs": [
505
+ {
506
+ "data": {
507
+ "application/vnd.jupyter.widget-view+json": {
508
+ "model_id": "a4e97d92506f419581c3711f26d7f683",
509
+ "version_major": 2,
510
+ "version_minor": 0
511
+ },
512
+ "text/plain": [
513
+ "Map: 0%| | 0/53275 [00:00<?, ? examples/s]"
514
+ ]
515
+ },
516
+ "metadata": {},
517
+ "output_type": "display_data"
518
+ },
519
+ {
520
+ "data": {
521
+ "application/vnd.jupyter.widget-view+json": {
522
+ "model_id": "9004c03cb9b24411b6bd9d33662402fb",
523
+ "version_major": 2,
524
+ "version_minor": 0
525
+ },
526
+ "text/plain": [
527
+ "Map: 0%| | 0/5920 [00:00<?, ? examples/s]"
528
+ ]
529
+ },
530
+ "metadata": {},
531
+ "output_type": "display_data"
532
+ }
533
+ ],
534
+ "source": [
535
+ "# 2. tokenize\n",
536
+ "def tokenize_function(examples):\n",
537
+ " examples['label'] = [int(item) for item in examples['label']]\n",
538
+ " return tokenizer(examples['sequence'], truncation=True, padding='max_length', max_length=128)\n",
539
+ "\n",
540
+ "# 3. 对数据集应用分词函数\n",
541
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
542
+ "\n",
543
+ "# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
544
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 22,
550
+ "id": "4b0faa94-d0c4-4ce8-9976-dcefcb766f0b",
551
+ "metadata": {},
552
+ "outputs": [
553
+ {
554
+ "name": "stderr",
555
+ "output_type": "stream",
556
+ "text": [
557
+ "/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
558
+ " warnings.warn(\n",
559
+ "/tmp/ipykernel_2549/341301010.py:29: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
560
+ " trainer = Trainer(\n"
561
+ ]
562
+ }
563
+ ],
564
+ "source": [
565
+ "from transformers import TrainingArguments, Trainer\n",
566
+ "import numpy as np\n",
567
+ "import torch.nn as nn\n",
568
+ "\n",
569
+ "\n",
570
+ "\n",
571
+ "def compute_metrics(eval_pred):\n",
572
+ " predictions, labels = eval_pred\n",
573
+ " predictions = np.argmax(predictions, axis=1)\n",
574
+ " return {'accuracy': (predictions==labels).sum() / len(labels)}\n",
575
+ "\n",
576
+ "# change training hyperparameters to archive better quality\n",
577
+ "training_args = TrainingArguments(\n",
578
+ " output_dir=\"ds_job_category_v0\",\n",
579
+ " learning_rate=1e-5,\n",
580
+ " lr_scheduler_type=\"constant_with_warmup\",\n",
581
+ " warmup_ratio=0.1,\n",
582
+ " optim='adamw_torch',\n",
583
+ " weight_decay=0.0,\n",
584
+ " per_device_train_batch_size=20,\n",
585
+ " per_device_eval_batch_size=20,\n",
586
+ " num_train_epochs=10,\n",
587
+ " evaluation_strategy=\"epoch\",\n",
588
+ " save_strategy=\"epoch\",\n",
589
+ " logging_strategy=\"epoch\",\n",
590
+ " load_best_model_at_end=True\n",
591
+ ")\n",
592
+ "\n",
593
+ "trainer = Trainer(\n",
594
+ " model=model,\n",
595
+ " args=training_args,\n",
596
+ " train_dataset=tokenized_datasets[\"train\"],\n",
597
+ " eval_dataset=tokenized_datasets[\"test\"],\n",
598
+ " tokenizer=tokenizer,\n",
599
+ " data_collator=data_collator,\n",
600
+ " compute_metrics=compute_metrics,\n",
601
+ ")"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": 17,
607
+ "id": "9b067740-9c0f-4df8-a5af-b68ec9d1f3e0",
608
+ "metadata": {},
609
+ "outputs": [
610
+ {
611
+ "data": {
612
+ "text/html": [
613
+ "\n",
614
+ " <div>\n",
615
+ " \n",
616
+ " <progress value='26640' max='26640' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
617
+ " [26640/26640 1:00:13, Epoch 10/10]\n",
618
+ " </div>\n",
619
+ " <table border=\"1\" class=\"dataframe\">\n",
620
+ " <thead>\n",
621
+ " <tr style=\"text-align: left;\">\n",
622
+ " <th>Epoch</th>\n",
623
+ " <th>Training Loss</th>\n",
624
+ " <th>Validation Loss</th>\n",
625
+ " <th>Accuracy</th>\n",
626
+ " </tr>\n",
627
+ " </thead>\n",
628
+ " <tbody>\n",
629
+ " <tr>\n",
630
+ " <td>1</td>\n",
631
+ " <td>0.324900</td>\n",
632
+ " <td>0.237557</td>\n",
633
+ " <td>0.916216</td>\n",
634
+ " </tr>\n",
635
+ " <tr>\n",
636
+ " <td>2</td>\n",
637
+ " <td>0.193100</td>\n",
638
+ " <td>0.212998</td>\n",
639
+ " <td>0.925338</td>\n",
640
+ " </tr>\n",
641
+ " <tr>\n",
642
+ " <td>3</td>\n",
643
+ " <td>0.126900</td>\n",
644
+ " <td>0.278650</td>\n",
645
+ " <td>0.923480</td>\n",
646
+ " </tr>\n",
647
+ " <tr>\n",
648
+ " <td>4</td>\n",
649
+ " <td>0.076900</td>\n",
650
+ " <td>0.362979</td>\n",
651
+ " <td>0.922804</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <td>5</td>\n",
655
+ " <td>0.047400</td>\n",
656
+ " <td>0.518552</td>\n",
657
+ " <td>0.915372</td>\n",
658
+ " </tr>\n",
659
+ " <tr>\n",
660
+ " <td>6</td>\n",
661
+ " <td>0.032000</td>\n",
662
+ " <td>0.698843</td>\n",
663
+ " <td>0.918412</td>\n",
664
+ " </tr>\n",
665
+ " <tr>\n",
666
+ " <td>7</td>\n",
667
+ " <td>0.029000</td>\n",
668
+ " <td>0.760331</td>\n",
669
+ " <td>0.915709</td>\n",
670
+ " </tr>\n",
671
+ " <tr>\n",
672
+ " <td>8</td>\n",
673
+ " <td>0.025900</td>\n",
674
+ " <td>0.769762</td>\n",
675
+ " <td>0.921959</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <td>9</td>\n",
679
+ " <td>0.021800</td>\n",
680
+ " <td>0.740165</td>\n",
681
+ " <td>0.923142</td>\n",
682
+ " </tr>\n",
683
+ " <tr>\n",
684
+ " <td>10</td>\n",
685
+ " <td>0.021300</td>\n",
686
+ " <td>0.738664</td>\n",
687
+ " <td>0.922973</td>\n",
688
+ " </tr>\n",
689
+ " </tbody>\n",
690
+ "</table><p>"
691
+ ],
692
+ "text/plain": [
693
+ "<IPython.core.display.HTML object>"
694
+ ]
695
+ },
696
+ "metadata": {},
697
+ "output_type": "display_data"
698
+ },
699
+ {
700
+ "data": {
701
+ "text/plain": [
702
+ "TrainOutput(global_step=26640, training_loss=0.08990609108864724, metrics={'train_runtime': 3619.5996, 'train_samples_per_second': 147.185, 'train_steps_per_second': 7.36, 'total_flos': 3.4801460969472e+16, 'train_loss': 0.08990609108864724, 'epoch': 10.0})"
703
+ ]
704
+ },
705
+ "execution_count": 17,
706
+ "metadata": {},
707
+ "output_type": "execute_result"
708
+ }
709
+ ],
710
+ "source": [
711
+ "trainer.train()"
712
+ ]
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "execution_count": 20,
717
+ "id": "aa26e020-2dfd-4e0e-b330-250ee3e44a44",
718
+ "metadata": {},
719
+ "outputs": [
720
+ {
721
+ "data": {
722
+ "text/html": [],
723
+ "text/plain": [
724
+ "<IPython.core.display.HTML object>"
725
+ ]
726
+ },
727
+ "metadata": {},
728
+ "output_type": "display_data"
729
+ },
730
+ {
731
+ "data": {
732
+ "text/plain": [
733
+ "{'accuracy': 0.9253378378378379, 'f1': 0.927062706270627}"
734
+ ]
735
+ },
736
+ "execution_count": 20,
737
+ "metadata": {},
738
+ "output_type": "execute_result"
739
+ }
740
+ ],
741
+ "source": [
742
+ "#模型测试\n",
743
+ "import evaluate\n",
744
+ "predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
745
+ "preds = np.argmax(predictions.predictions, axis=-1)\n",
746
+ "metric = evaluate.load(\"glue\", \"mrpc\")\n",
747
+ "ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
748
+ "ret"
749
+ ]
750
+ },
751
+ {
752
+ "cell_type": "code",
753
+ "execution_count": 21,
754
+ "id": "5e6d99ad-66a0-4b85-9380-ae2b7ee88056",
755
+ "metadata": {},
756
+ "outputs": [
757
+ {
758
+ "data": {
759
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAHHCAYAAACcHAM1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABRWklEQVR4nO3deVgV1f8H8PeAclkv4AKIEoKCguJuRrgmgYpbWGaSool9LdDc0MwNtyzcUnMpS1HT1Eot0VQUd3FNlFxIEEUTsERAUPb5/eGPySt45crFO17er555Hu7MmTOfuQ/px885Z0YQRVEEERERkYwZ6DoAIiIiomdhwkJERESyx4SFiIiIZI8JCxEREckeExYiIiKSPSYsREREJHtMWIiIiEj2mLAQERGR7DFhISIiItljwkL0Erp69Sp8fHxgaWkJQRCwfft2rfZ//fp1CIKAiIgIrfb7MuvcuTM6d+6s6zCIqiwmLETPKTExEf/73//g7OwMY2NjKJVKeHl5YfHixXj48GGlXjswMBBxcXGYM2cO1q9fjzZt2lTq9V6kIUOGQBAEKJXKMr/Hq1evQhAECIKA+fPna9z/7du3ERYWhtjYWC1ES0QvSjVdB0D0Mtq5cyfeeecdKBQKDB48GE2bNkV+fj6OHj2K0NBQXLx4Ed9++22lXPvhw4eIiYnB5MmTERISUinXcHR0xMOHD1G9evVK6f9ZqlWrhgcPHmDHjh3o37+/yrENGzbA2NgYubm5z9X37du3MWPGDNSvXx8tWrQo93l79+59rusRkXYwYSHSUFJSEgYMGABHR0dER0ejTp060rHg4GAkJCRg586dlXb9f/75BwBgZWVVadcQBAHGxsaV1v+zKBQKeHl54ccffyyVsGzcuBF+fn745ZdfXkgsDx48gKmpKYyMjF7I9YiobBwSItJQeHg4srOz8f3336skKyUaNmyITz75RPpcWFiIWbNmoUGDBlAoFKhfvz4+++wz5OXlqZxXv3599OzZE0ePHsWrr74KY2NjODs7Y926dVKbsLAwODo6AgBCQ0MhCALq168P4NFQSsnPjwsLC4MgCCr7oqKi0L59e1hZWcHc3ByNGjXCZ599Jh1/2hyW6OhodOjQAWZmZrCyskKfPn1w+fLlMq+XkJCAIUOGwMrKCpaWlhg6dCgePHjw9C/2CQMHDsTvv/+OjIwMad/p06dx9epVDBw4sFT79PR0jB8/Hh4eHjA3N4dSqUT37t1x/vx5qc3BgwfRtm1bAMDQoUOloaWS++zcuTOaNm2Ks2fPomPHjjA1NZW+lyfnsAQGBsLY2LjU/fv6+sLa2hq3b98u970S0bMxYSHS0I4dO+Ds7IzXX3+9XO2DgoIwbdo0tGrVCosWLUKnTp0wd+5cDBgwoFTbhIQEvP3223jzzTexYMECWFtbY8iQIbh48SIAwN/fH4sWLQIAvPfee1i/fj2++uorjeK/ePEievbsiby8PMycORMLFixA7969cezYMbXn7du3D76+vrhz5w7CwsIwduxYHD9+HF5eXrh+/Xqp9v3798f9+/cxd+5c9O/fHxEREZgxY0a54/T394cgCNi6dau0b+PGjWjcuDFatWpVqv21a9ewfft29OzZEwsXLkRoaCji4uLQqVMnKXlwc3PDzJkzAQAffvgh1q9fj/Xr16Njx45SP3fv3kX37t3RokULfPXVV+jSpUuZ8S1evBi1a9dGYGAgioqKAADffPMN9u7di6VLl8Le3r7c90pE5SASUbllZmaKAMQ+ffqUq31sbKwIQAwKClLZP378eBGAGB0dLe1zdHQUAYiHDx+W9t25c0dUKBTiuHHjpH1JSUkiAHHevHkqfQYGBoqOjo6lYpg+fbr4+P/qixYtEgGI//zzz1PjLrnGmjVrpH0tWrQQbWxsxLt370r7zp8/LxoYGIiDBw8udb0PPvhApc+33npLrFmz5lOv+fh9mJmZiaIoim+//bbYtWtX
URRFsaioSLSzsxNnzJhR5neQm5srFhUVlboPhUIhzpw5U9p3+vTpUvdWolOnTiIAceXKlWUe69Spk8q+PXv2iADE2bNni9euXRPNzc3Fvn37PvMeiUhzrLAQaSArKwsAYGFhUa72u3btAgCMHTtWZf+4ceMAoNRcF3d3d3To0EH6XLt2bTRq1AjXrl177pifVDL35ddff0VxcXG5zklJSUFsbCyGDBmCGjVqSPubNWuGN998U7rPx40YMULlc4cOHXD37l3pOyyPgQMH4uDBg0hNTUV0dDRSU1PLHA4CHs17MTB49EdaUVER7t69Kw13/fHHH+W+pkKhwNChQ8vV1sfHB//73/8wc+ZM+Pv7w9jYGN988025r0VE5ceEhUgDSqUSAHD//v1ytb9x4wYMDAzQsGFDlf12dnawsrLCjRs3VPa/8sorpfqwtrbGvXv3njPi0t599114eXkhKCgItra2GDBgALZs2aI2eSmJs1GjRqWOubm54d9//0VOTo7K/ifvxdraGgA0upcePXrAwsICmzdvxoYNG9C2bdtS32WJ4uJiLFq0CC4uLlAoFKhVqxZq166NCxcuIDMzs9zXrFu3rkYTbOfPn48aNWogNjYWS5YsgY2NTbnPJaLyY8JCpAGlUgl7e3v8+eefGp335KTXpzE0NCxzvyiKz32NkvkVJUxMTHD48GHs27cPgwYNwoULF/Duu+/izTffLNW2IipyLyUUCgX8/f2xdu1abNu27anVFQD4/PPPMXbsWHTs2BE//PAD9uzZg6ioKDRp0qTclSTg0fejiXPnzuHOnTsAgLi4OI3OJaLyY8JCpKGePXsiMTERMTExz2zr6OiI4uJiXL16VWV/WloaMjIypBU/2mBtba2yoqbEk1UcADAwMEDXrl2xcOFCXLp0CXPmzEF0dDQOHDhQZt8lccbHx5c6duXKFdSqVQtmZmYVu4GnGDhwIM6dO4f79++XOVG5xM8//4wuXbrg+++/x4ABA+Dj4wNvb+9S30l5k8fyyMnJwdChQ+Hu7o4PP/wQ4eHhOH36tNb6J6L/MGEh0tCECRNgZmaGoKAgpKWllTqemJiIxYsXA3g0pAGg1EqehQsXAgD8/Py0FleDBg2QmZmJCxcuSPtSUlKwbds2lXbp6emlzi15gNqTS61L1KlTBy1atMDatWtVEoA///wTe/fule6zMnTp0gWzZs3C119/DTs7u6e2MzQ0LFW9+emnn/D333+r7CtJrMpK7jQ1ceJEJCcnY+3atVi4cCHq16+PwMDAp36PRPT8+OA4Ig01aNAAGzduxLvvvgs3NzeVJ90eP34cP/30E4YMGQIAaN68OQIDA/Htt98iIyMDnTp1wqlTp7B27Vr07dv3qUtmn8eAAQMwceJEvPXWWxg1ahQePHiAFStWwNXVVWXS6cyZM3H48GH4+fnB0dERd+7cwfLly1GvXj20b9/+qf3PmzcP3bt3h6enJ4YNG4aHDx9i6dKlsLS0RFhYmNbu40kGBgaYMmXKM9v17NkTM2fOxNChQ/H6668jLi4OGzZsgLOzs0q7Bg0awMrKCitXroSFhQXMzMzQrl07ODk5aRRXdHQ0li9fjunTp0vLrNesWYPOnTtj6tSpCA8P16g/InoGHa9SInpp/fXXX+Lw4cPF+vXri0ZGRqKFhYXo5eUlLl26VMzNzZXaFRQUiDNmzBCdnJzE6tWriw4ODuKkSZNU2ojio2XNfn5+pa7z5HLapy1rFkVR3Lt3r9i0aVPRyMhIbNSokfjDDz+UWta8f/9+sU+fPqK9vb1oZGQk2tvbi++99574119/lbrGk0t/9+3bJ3p5eYkmJiaiUqkUe/XqJV66dEmlTcn1nlw2vWbNGhGAmJSU9NTvVBRVlzU/zdOWNY8bN06sU6eOaGJiInp5eYkxMTFlLkf+9ddfRXd3d7FatWoq99mpUyexSZMmZV7z8X6ysrJER0dHsVWrVmJBQYFKuzFjxogGBgZiTEyM2nsg
Is0IoqjBDDgiIiIiHeAcFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHB8dVsuLiYty+fRsWFhZafSQ4ERG9GKIo4v79+7C3t5feCK5tubm5yM/P10pfRkZGMDY21kpfcsKEpZLdvn0bDg4Oug6DiIgq6ObNm6hXr57W+83NzYWJRU2g8IFW+rOzs0NSUpLeJS1MWCqZhYUFAMCo7WgI1RQ6joaoctzYMUnXIRBVmvv3s+Di9Ir057m25efnA4UPoHAPBAyNKtZZUT5SL61Ffn4+ExbSTMkwkFBNwYSF9JZSqdR1CESVrtKH9asZQ6hgwiIK+js1lQkLERGRHAgAKpoU6fFUSSYsREREciAYPNoq2oee0t87IyIiIr3BCgsREZEcCIIWhoT0d0yICQsREZEccEhILf29MyIiItIbrLAQERHJAYeE1GLCQkREJAtaGBLS44ET/b0zIiIi0hussBAREckBh4TUYsJCREQkB1wlpJb+3hkRERHpDVZYiIiI5IBDQmoxYSEiIpIDDgmpxYSFiIhIDlhhUUt/UzEiIiLSG6ywEBERyQGHhNRiwkJERCQHgqCFhIVDQkREREQ6wwoLERGRHBgIj7aK9qGnmLAQERHJAeewqKW/d0ZERER6gxUWIiIiOeBzWNRiwkJERCQHHBJSS3/vjIiIiPQGKyxERERywCEhtZiwEBERyQGHhNRiwkJERCQHrLCopb+pGBEREekNVliIiIjkgENCajFhISIikgMOCamlv6kYERER6Q1WWIiIiGRBC0NCelyHYMJCREQkBxwSUkt/UzEiIiLSG6ywEBERyYEgaGGVkP5WWJiwEBERyQGXNaulv3dGREREas2dOxdt27aFhYUFbGxs0LdvX8THx6u06dy5MwRBUNlGjBih0iY5ORl+fn4wNTWFjY0NQkNDUVhYqNLm4MGDaNWqFRQKBRo2bIiIiAiNYmXCQkREJAclk24rumng0KFDCA4OxokTJxAVFYWCggL4+PggJydHpd3w4cORkpIibeHh4dKxoqIi+Pn5IT8/H8ePH8fatWsRERGBadOmSW2SkpLg5+eHLl26IDY2FqNHj0ZQUBD27NlT7lg5JERERCQHOhgS2r17t8rniIgI2NjY4OzZs+jYsaO039TUFHZ2dmX2sXfvXly6dAn79u2Dra0tWrRogVmzZmHixIkICwuDkZERVq5cCScnJyxYsAAA4ObmhqNHj2LRokXw9fUtV6yssBAREcmBFissWVlZKlteXl65QsjMzAQA1KhRQ2X/hg0bUKtWLTRt2hSTJk3CgwcPpGMxMTHw8PCAra2ttM/X1xdZWVm4ePGi1Mbb21ulT19fX8TExJT762GFhYiISM84ODiofJ4+fTrCwsLUnlNcXIzRo0fDy8sLTZs2lfYPHDgQjo6OsLe3x4ULFzBx4kTEx8dj69atAIDU1FSVZAWA9Dk1NVVtm6ysLDx8+BAmJibPvCcmLERERHKgxSGhmzdvQqlUSrsVCsUzTw0ODsaff/6Jo0ePquz/8MMPpZ89PDxQp04ddO3aFYmJiWjQoEHF4tUAh4SIiIjkQItDQkqlUmV7VsISEhKCyMhIHDhwAPXq1VPbtl27dgCAhIQEAICdnR3S0tJU2pR8Lpn38rQ2SqWyXNUVgAkLERFRlSWKIkJCQrBt2zZER0fDycnpmefExsYCAOrUqQMA8PT0RFxcHO7cuSO1iYqKglKphLu7u9Rm//79Kv1ERUXB09Oz3LEyYSEiIpKBJ5918rybJoKDg/HDDz9g48aNsLCwQGpqKlJTU/Hw4UMAQGJiImbNmoWzZ8/i+vXr+O233zB48GB07NgRzZo1AwD4+PjA3d0dgwYNwvnz57Fnzx5MmTIFwcHBUmVnxIgRuHbtGiZMmIArV65g+fLl2LJlC8aMGVPuWJmwEBERyYAuEpYVK1YgMzMTnTt3Rp06daRt8+bNAAAjIyPs27cPPj4+aNy4
McaNG4d+/fphx44dUh+GhoaIjIyEoaEhPD098f7772Pw4MGYOXOm1MbJyQk7d+5EVFQUmjdvjgULFuC7774r95JmgJNuiYiIqixRFNUed3BwwKFDh57Zj6OjI3bt2qW2TefOnXHu3DmN4nscExYiIiI5EP5/q2gfeooJCxERkQw8z5BOGZ1oJxgZ4hwWIiIikj1WWIiIiGSAFRb1mLAQERHJABMW9ZiwEBERyQATFvU4h4WIiIhkjxUWIiIiOeCyZrWYsBAREckAh4TU45AQERERyR4rLERERDIgCNBChUU7scgRExYiIiIZEKCFISE9zlg4JERERESyxwoLERGRDHDSrXpMWIiIiOSAy5rV4pAQERERyR4rLERERHKghSEhkUNCREREVJm0MYel4quM5IsJCxERkQwwYVGPc1iIiIhI9lhhISIikgOuElKLCQsREZEMcEhIPQ4JERERkeyxwkJERCQDrLCox4SFiIhIBpiwqMchISIiIpI9VliIiIhkgBUW9ZiwEBERyQGXNavFISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBJizqMWEhIiKSASYs6nEOCxEREckeKyxERERywFVCajFhISIikgEOCanHISEiIiKSvZeiwiIIArZt24a+ffvqOhR6Aca81wE9O7jB5ZVayM0rwKmLNxG2KgoJN++qtGvrXg9ThnVF68b1UFRcjD8TU9Fvwnrk5hdKbXzauSB0cGc0cbZFXn4hjp2/jvenbZKOd2zphMkfvAE3J1s8yM3Hpj3nMev7/SgqLn5h90u0KGIvIg+cx9UbaTBWVMerHk6YPrIPXBxtAQD3MnPwxbe7cODkFdxKu4eaVubw69QMn43wg9LcBACwMfIEQmZuKLP/+N2fo3YNixd2P/R8WGFRT+cJS2pqKubMmYOdO3fi77//ho2NDVq0aIHRo0eja9euug4Poihi+vTpWLVqFTIyMuDl5YUVK1bAxcVF16HprdebO+K7X0/hXPzfqGZggKlB3tgaPhivDf0aD3ILADxKVn7+YhAW/XgEE5fuQmFRMZo626FYFKV+enVww+JxvTHr+/04fC4J1QwN4FbfRjre1NkWW+a+jwUbDmPE3G2oU8sCC8f0goGhgGkr977w+6aq69gfCRj2Tge0dHNEUVERZq3YgX4jlyFm82SYmSiQ8m8mUv7NxMxP+qKRkx1upqRj3BebkfJvJtZ+MQwA8JZ3K3R9zV2l3+CZPyAvv4DJyktCgBYSFj2exKLThOX69evw8vKClZUV5s2bBw8PDxQUFGDPnj0IDg7GlStXdBkeACA8PBxLlizB2rVr4eTkhKlTp8LX1xeXLl2CsbGxrsPTS+98+oPK54+/3IaEbRPRwtUexy/cAADM+bgbvtl2El/9eFRq93gFxtDAAHNDumPaN1H44fc/pP3xN/6Rfn6rS1NcvJaGeesPAQCSbqcj7Nu9WD2tP8LXHkT2w/xKuT+iJ/285GOVz8umvQ9X389w/vJNvN6qIdwb2GPdl0HScad6tTH5o14YMX0dCguLUK2aIUyMjWBibCS1+ffefRw58xeWTBn4wu6DqDLpdA7Lxx9/DEEQcOrUKfTr1w+urq5o0qQJxo4dixMnTjz1vIkTJ8LV1RWmpqZwdnbG1KlTUVBQIB0/f/48unTpAgsLCyiVSrRu3RpnzpwBANy4cQO9evWCtbU1zMzM0KRJE+zatavM64iiiK+++gpTpkxBnz590KxZM6xbtw63b9/G9u3btfpd0NMpzR4lhveyHgIAalmZoa27A/7JyMGepcMQ/3MoIhcNxWtNX5HOae5aB3VrW6JYLMahb0bg8k/j8dPc91UqLEbVqyHvseEjAHiYVwgTRXU0d7V/AXdGVLas7FwAgJWlqZo2D2FhZoxq1QzLPL5p1ymYGBuh9xstKiNEqgQlQ0IV3fSVzhKW9PR07N69G8HBwTAzMyt13MrK6qnnWlhYICIiApcuXcLixYuxatUqLFq0SDoe
EBCAevXq4fTp0zh79iw+/fRTVK9eHQAQHByMvLw8HD58GHFxcfjyyy9hbm5e5nWSkpKQmpoKb29vaZ+lpSXatWuHmJiY57xz0oQgCJgb3A0n4m7g8vU7AID6dawBAJ8O7oy1O8/i7U/X4/zVFGyfHwjnujVU2wR2wfwfDmHAZxuQkf0QOxYNgZXFozH/6DMJeLWJA/q90RQGBgLq1LLAhMGdAAB2Ncv+nSCqbMXFxfhs4S9o19wZ7g3KTpzvZmRj/urdCOz7+lP7+eG3E3jbt7VK1YVkTtDSpqd0NiSUkJAAURTRuHFjjc+dMmWK9HP9+vUxfvx4bNq0CRMmTAAAJCcnIzQ0VOr78fkmycnJ6NevHzw8PAAAzs7OT71OamoqAMDW1lZlv62trXTsSXl5ecjLy5M+Z2VlaXJr9IT5n/jBzckG3UetlvYZGDz6PzIi8gw27o4FAMQl7Eanlk54v3srzPxuHwz+/18ZC344jB1HLgMAgsO34+LmcejbqQkiIs/gwJlETPtmLxaO7oWVk/yRl1+E+T8cwuvN6qO4WASRLoSG/4TL11Kw69vRZR7Pyn6Id8esRCMnO0z8sEeZbU5dSMJfSalYGTaoEiMlerF0lrCI4vP/hbB582YsWbIEiYmJyM7ORmFhIZRKpXR87NixCAoKwvr16+Ht7Y133nkHDRo0AACMGjUKH330Efbu3Qtvb2/069cPzZo1q/D9lJg7dy5mzJihtf6qsvBRPeD7mit6jF6N2//+l/il3r0PQHU+CgDEJ/+LejaWj9qkl26TX1CE6yn3pDYAsPznGCz/OQZ2NS2Qcf8hXrGzwvThb+J6yr1Kuy+ip5kwbwv2HP0TO7/5BHVtrUsdv5+Ti3c+WQELUwXWhw9H9acMB63/9Tg8XOuhhdsrZR4neeIqIfV0NiTk4uICQRA0nlgbExODgIAA9OjRA5GRkTh37hwmT56M/Pz/JkiGhYXh4sWL8PPzQ3R0NNzd3bFt2zYAQFBQEK5du4ZBgwYhLi4Obdq0wdKlS8u8lp2dHQAgLS1NZX9aWpp07EmTJk1CZmamtN28eVOj+6NHwkf1gF97N/QeF4Hk1AyVY8mpGbj9bxYaOtRS2d+wXk3cTHvU9vxfKcjNL1BpU83QAK/YWkltHpd69z5y8wvR7w0P3ErLwPmrKdq+JaKnEkURE+Ztwc6DF/Dr8pFwrFurVJus7IfoN3IZjKobYsOC/8FYUb3MvrIf5OHX/efwfu/XKjts0jLOYVFPZwlLjRo14Ovri2XLliEnJ6fU8YyMjDLPO378OBwdHTF58mS0adMGLi4uuHHjRql2rq6uGDNmDPbu3Qt/f3+sWbNGOubg4IARI0Zg69atGDduHFatWlXmtZycnGBnZ4f9+/dL+7KysnDy5El4enqWeY5CoYBSqVTZSDPzP/FDf+9mGD77Z2Q/yIeNtTlsrM1hbPRfQXDp5mP431vt0LujO5zsa+CzoW/A5ZVaWP//K4LuP8jDmh1n8OmQzujSpgEaOtTEgtE9AQDbD12U+hn5rhfcnWzQuH5tjH+/E0a/1x4Tv/6dQ0L0QoWGb8GW38/g21mBMDc1Rtq/WUj7NwsPcx/9Qywr+yH6jVqOB7n5WDJlIO5n50ptiopUnxm0LeoPFBYVo3/3trq4FaoAQdDOpq90uqx52bJl8PLywquvvoqZM2eiWbNmKCwsRFRUFFasWIHLly+XOsfFxQXJycnYtGkT2rZti507d0rVEwB4+PAhQkND8fbbb8PJyQm3bt3C6dOn0a9fPwDA6NGj0b17d7i6uuLevXs4cOAA3NzcyoxPEASMHj0as2fPhouLi7Ss2d7eng+xq0TD+rwKANj51Qcq+z/+cht+3BMLAFj5ywkYG1XD5x93g5WFCS5eS4V/6Dpcv/3fUM60lXtRWFSMlZ/6w1hRDWcv/40+4yOQ+f8rMADA+9WGGBfQAUbVq+HPxFQETP0R+04lVP5NEj1m9S+Pluf3GrFEZf/X
0wIwsOdruBB/C2f/vA4AaO0/U6VN7PYwvGJfU/r8w28x6Nm5OSwtnr7CiOhlJIgVmUyiBSkpKZgzZw4iIyORkpKC2rVro3Xr1hgzZgw6d+78KMgnnnQ7YcIErF69Gnl5efDz88Nrr72GsLAwZGRkID8/H4GBgTh27BjS0tJQq1Yt+Pv7Y968eTA2NsbIkSPx+++/49atW1AqlejWrRsWLVqEmjVrlhlfyYPjvv32W2RkZKB9+/ZYvnw5XF1dy3V/WVlZsLS0hMJzIoRqCm18ZUSyk74/TNchEFWarKws2NWyQmZmZqVUzUv+nnAe+TMMFKVXzWqiOC8H15a+XWmx6pLOExZ9x4SFqgImLKTPXljCMupnGFYwYSnKy8G1JfqZsPDlh0RERCR7On+XEBEREXFZ87MwYSEiIpIBbazy0eN8hUNCREREJH+ssBAREcmAgYEgvXrkeYkVPF/OmLAQERHJAIeE1OOQEBEREckeKyxEREQywFVC6jFhISIikgEOCanHhIWIiEgGWGFRj3NYiIiISPZYYSEiIpIBVljUY4WFiIhIBkrmsFR008TcuXPRtm1bWFhYwMbGBn379kV8fLxKm9zcXAQHB6NmzZowNzdHv379kJaWptImOTkZfn5+MDU1hY2NDUJDQ1FYWKjS5uDBg2jVqhUUCgUaNmyIiIgIjWJlwkJERFRFHTp0CMHBwThx4gSioqJQUFAAHx8f5OTkSG3GjBmDHTt24KeffsKhQ4dw+/Zt+Pv7S8eLiorg5+eH/Px8HD9+HGvXrkVERASmTZsmtUlKSoKfnx+6dOmC2NhYjB49GkFBQdizZ0+5YxVEURS1c9tUlpLXhis8J0KoptB1OESVIn1/mK5DIKo0WVlZsKtlhczMTCiVykrp39LSEh6f/gZDY7MK9VWUm4O4L3o/d6z//PMPbGxscOjQIXTs2BGZmZmoXbs2Nm7ciLfffhsAcOXKFbi5uSEmJgavvfYafv/9d/Ts2RO3b9+Gra0tAGDlypWYOHEi/vnnHxgZGWHixInYuXMn/vzzT+laAwYMQEZGBnbv3l2u2FhhISIikgFtDgllZWWpbHl5eeWKITMzEwBQo0YNAMDZs2dRUFAAb29vqU3jxo3xyiuvICYmBgAQExMDDw8PKVkBAF9fX2RlZeHixYtSm8f7KGlT0kd5MGEhIiLSMw4ODrC0tJS2uXPnPvOc4uJijB49Gl5eXmjatCkAIDU1FUZGRrCyslJpa2tri9TUVKnN48lKyfGSY+raZGVl4eHDh+W6J64SIiIikgFtrhK6efOmypCQQvHsKQnBwcH4888/cfTo0QrFUFmYsBAREcmANp90q1QqNZrDEhISgsjISBw+fBj16tWT9tvZ2SE/Px8ZGRkqVZa0tDTY2dlJbU6dOqXSX8kqosfbPLmyKC0tDUqlEiYmJuWKkUNCREREVZQoiggJCcG2bdsQHR0NJycnleOtW7dG9erVsX//fmlffHw8kpOT4enpCQDw9PREXFwc7ty5I7WJioqCUqmEu7u71ObxPkralPRRHqywEBERyYAuHhwXHByMjRs34tdff4WFhYU058TS0hImJiawtLTEsGHDMHbsWNSoUQNKpRIjR46Ep6cnXnvtNQCAj48P3N3dMWjQIISHhyM1NRVTpkxBcHCwNBQ1YsQIfP3115gwYQI++OADREdHY8uWLdi5c2e5Y2XCQkREJAO6ePnhihUrAACdO3dW2b9mzRoMGTIEALBo0SIYGBigX79+yMvLg6+vL5YvXy61NTQ0RGRkJD766CN4enrCzMwMgYGBmDlzptTGyckJO3fuxJgxY7B48WLUq1cP3333HXx9fcsdKxMWIiIiGdBFhaU8j2IzNjbGsmXLsGzZsqe2cXR0xK5du9T207lzZ5w7d06j+B7HOSxEREQke6ywEBERyYEWhoSgv+8+ZMJCREQkB3xbs3ocEiIiIiLZY4WFiIhIBnSxSuhlwoSFiIhIBjgkpB6HhIiI
iEj2WGEhIiKSAQ4JqceEhYiISAY4JKQeh4SIiIhI9lhhISIikgFWWNRjwkJERCQDnMOiHhMWIiIiGWCFRT3OYSEiIiLZY4WFiIhIBjgkpB4TFiIiIhngkJB6HBIiIiIi2WOFhYiISAYEaGFISCuRyBMTFiIiIhkwEAQYVDBjqej5csYhISIiIpI9VliIiIhkgKuE1GPCQkREJANcJaQeExYiIiIZMBAebRXtQ19xDgsRERHJHissREREciBoYUhHjyssTFiIiIhkgJNu1eOQEBEREckeKyxEREQyIPz/fxXtQ18xYSEiIpIBrhJSj0NCREREJHussBAREckAHxynXrkSlt9++63cHfbu3fu5gyEiIqqquEpIvXIlLH379i1XZ4IgoKioqCLxEBEREZVSroSluLi4suMgIiKq0gwEAQYVLJFU9Hw5q9AcltzcXBgbG2srFiIioiqLQ0LqabxKqKioCLNmzULdunVhbm6Oa9euAQCmTp2K77//XusBEhERVQUlk24ruukrjROWOXPmICIiAuHh4TAyMpL2N23aFN99951WgyMiIiICniNhWbduHb799lsEBATA0NBQ2t+8eXNcuXJFq8ERERFVFSVDQhXd9JXGc1j+/vtvNGzYsNT+4uJiFBQUaCUoIiKiqoaTbtXTuMLi7u6OI0eOlNr/888/o2XLlloJioiIiOhxGldYpk2bhsDAQPz9998oLi7G1q1bER8fj3Xr1iEyMrIyYiQiItJ7wv9vFe1DX2lcYenTpw927NiBffv2wczMDNOmTcPly5exY8cOvPnmm5URIxERkd7jKiH1nus5LB06dEBUVJS2YyEiIiIq03M/OO7MmTO4fPkygEfzWlq3bq21oIiIiKoaA+HRVtE+9JXGCcutW7fw3nvv4dixY7CysgIAZGRk4PXXX8emTZtQr149bcdIRESk9/i2ZvU0nsMSFBSEgoICXL58Genp6UhPT8fly5dRXFyMoKCgyoiRiIiIqjiNKyyHDh3C8ePH0ahRI2lfo0aNsHTpUnTo0EGrwREREVUlelwgqTCNExYHB4cyHxBXVFQEe3t7rQRFRERU1XBISD2Nh4TmzZuHkSNH4syZM9K+M2fO4JNPPsH8+fO1GhwREVFVUTLptqKbvipXhcXa2lola8vJyUG7du1Qrdqj0wsLC1GtWjV88MEH6Nu3b6UESkRERFVXuRKWr776qpLDICIiqto4JKReuRKWwMDAyo6DiIioSuOj+dV77gfHAUBubi7y8/NV9imVygoFRERERPQkjROWnJwcTJw4EVu2bMHdu3dLHS8qKtJKYERERFWJgSDAoIJDOhU9X840XiU0YcIEREdHY8WKFVAoFPjuu+8wY8YM2NvbY926dZURIxERkd4TBO1s+krjCsuOHTuwbt06dO7cGUOHDkWHDh3QsGFDODo6YsOGDQgICKiMOImIiKgK07jCkp6eDmdnZwCP5qukp6cDANq3b4/Dhw9rNzoiIqIqomSVUEU3faVxwuLs7IykpCQAQOPGjbFlyxYAjyovJS9DJCIiIs1wSEg9jROWoUOH4vz58wCATz/9FMuWLYOxsTHGjBmD0NBQrQdIREREpPEcljFjxkg/e3t748qVKzh79iwaNmyIZs2aaTU4IiKiqoKrhNTTuMLyJEdHR/j7+zNZISIiqgBdDAkdPnwYvXr1gr29PQRBwPbt21WODxkypNQcmW7duqm0SU9PR0BAAJRKJaysrDBs2DBkZ2ertLlw4QI6dOgAY2NjODg4IDw8XOPvp1wVliVLlpS7w1GjRmkcBBERUVWni0fz5+TkoHnz5vjggw/g7+9fZptu3bphzZo10meFQqFyPCAgACkpKYiKikJBQQGGDh2KDz/8EBs3bgQAZGVlwcfHB97e3li5ciXi4uLwwQcfwMrKCh9++GG5Yy1XwrJo0aJydSYIAhMWIiKil0T37t3RvXt3tW0UCgXs7OzKPHb58mXs3r0bp0+fRps2bQAA
S5cuRY8ePTB//nzY29tjw4YNyM/Px+rVq2FkZIQmTZogNjYWCxcu1H7CUrIqiJ5fcuRnfG0B6S3rtiG6DoGo0ohF+c9upAUGqPg8jQrP8yjDwYMHYWNjA2tra7zxxhuYPXs2atasCQCIiYmBlZWVlKwAj+a3GhgY4OTJk3jrrbcQExODjh07wsjISGrj6+uLL7/8Evfu3YO1tXW54qjQu4SIiIhIO7Q5JJSVlaWyX6FQlBrKKY9u3brB398fTk5OSExMxGeffYbu3bsjJiYGhoaGSE1NhY2Njco51apVQ40aNZCamgoASE1NhZOTk0obW1tb6RgTFiIioirKwcFB5fP06dMRFhamcT8DBgyQfvbw8ECzZs3QoEEDHDx4EF27dq1omBphwkJERCQDggAYVHBVckmB5ubNmyrTEJ6nulIWZ2dn1KpVCwkJCejatSvs7Oxw584dlTaFhYVIT0+X5r3Y2dkhLS1NpU3J56fNjSlLZQx3ERERkYYMBO1swKNX5zy+aSthuXXrFu7evYs6deoAADw9PZGRkYGzZ89KbaKjo1FcXIx27dpJbQ4fPoyCggKpTVRUFBo1alTu4SCACQsREVGVlZ2djdjYWMTGxgJ4tMgmNjYWycnJyM7ORmhoKE6cOIHr169j//796NOnDxo2bAhfX18AgJubG7p164bhw4fj1KlTOHbsGEJCQjBgwADY29sDAAYOHAgjIyMMGzYMFy9exObNm7F48WKMHTtWo1ifK2E5cuQI3n//fXh6euLvv/8GAKxfvx5Hjx59nu6IiIiqPF28/PDMmTNo2bIlWrZsCQAYO3YsWrZsiWnTpsHQ0BAXLlxA79694erqimHDhqF169Y4cuSISsVmw4YNaNy4Mbp27YoePXqgffv2+Pbbb6XjlpaW2Lt3L5KSktC6dWuMGzcO06ZN02hJM/Acc1h++eUXDBo0CAEBATh37hzy8vIAAJmZmfj888+xa9cuTbskIiKq8gy0MIdF0/M7d+4MURSfenzPnj3P7KNGjRrSQ+KeplmzZjhy5IhmwT1B4wrL7NmzsXLlSqxatQrVq1eX9nt5eeGPP/6oUDBEREREZdG4whIfH4+OHTuW2m9paYmMjAxtxERERFTlPM+7gMrqQ19pXGGxs7NDQkJCqf1Hjx6Fs7OzVoIiIiKqakre1lzRTV9pnLAMHz4cn3zyCU6ePAlBEHD79m1s2LAB48ePx0cffVQZMRIREek9Ay1t+krjIaFPP/0UxcXF6Nq1Kx48eICOHTtCoVBg/PjxGDlyZGXESERERFWcxgmLIAiYPHkyQkNDkZCQgOzsbLi7u8Pc3Lwy4iMiIqoSOIdFved+NL+RkRHc3d21GQsREVGVZYCKz0ExgP5mLBonLF26dFH7YJro6OgKBURERET0JI0TlhYtWqh8LigoQGxsLP78808EBgZqKy4iIqIqhUNC6mmcsCxatKjM/WFhYcjOzq5wQERERFWRLp50+zLR2gqo999/H6tXr9ZWd0RERESS5550+6SYmBgYGxtrqzsiIqIqRRBQ4Um3HBJ6jL+/v8pnURSRkpKCM2fOYOrUqVoLjIiIqCrhHBb1NE5YLC0tVT4bGBigUaNGmDlzJnx8fLQWGBEREVEJjRKWoqIiDB06FB4eHrC2tq6smIiIiKocTrpVT6NJt4aGhvDx8eFbmYmIiLRM0NJ/+krjVUJNmzbFtWvXKiMWIiKiKqukwlLRTV9pnLDMnj0b48ePR2RkJFJSUpCVlaWyEREREWlbueewzJw5E+PGjUOPHj0AAL1791Z5RL8oihAEAUVFRdqPkoiISM9xDot65U5YZsyYgREjRuDAgQOVGQ8REVGVJAiC2nf1lbcPfVXuhEUURQBAp06dKi0YIiIiorJotKxZnzM3IiIiXeKQkHoaJSyurq7PTFrS09MrFBAREVFVxCfdqqdRwjJjxoxST7olIiIiqmwaJSwDBgyAjY1NZcVCRERUZRkIQoVffljR8+Ws
3AkL568QERFVHs5hUa/cD44rWSVERERE9KKVu8JSXFxcmXEQERFVbVqYdKvHrxLSbA4LERERVQ4DCDCoYMZR0fPljAkLERGRDHBZs3oav/yQiIiI6EVjhYWIiEgGuEpIPSYsREREMsDnsKjHISEiIiKSPVZYiIiIZICTbtVjwkJERCQDBtDCkJAeL2vmkBARERHJHissREREMsAhIfWYsBAREcmAASo+7KHPwyb6fG9ERESkJ1hhISIikgFBECBUcEynoufLGRMWIiIiGRBQ8Zct62+6woSFiIhIFvikW/U4h4WIiIhkjxUWIiIimdDf+kjFMWEhIiKSAT6HRT0OCREREZHsscJCREQkA1zWrB4TFiIiIhngk27V0+d7IyIiIj3BCgsREZEMcEhIPSYsREREMsAn3arHISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBrhJSjwkLERGRDLDCop4+J2NERESkJ1hhISIikgGuElKPCQsREZEM8OWH6nFIiIiIiGSPFRYiIiIZMIAAgwoO6lT0fDljhYWIiEgGSoaEKrpp4vDhw+jVqxfs7e0hCAK2b9+uclwURUybNg116tSBiYkJvL29cfXqVZU26enpCAgIgFKphJWVFYYNG4bs7GyVNhcuXECHDh1gbGwMBwcHhIeHa/z9MGEhIiKqonJyctC8eXMsW7aszOPh4eFYsmQJVq5ciZMnT8LMzAy+vr7Izc2V2gQEBODixYuIiopCZGQkDh8+jA8//FA6npWVBR8fHzg6OuLs2bOYN28ewsLC8O2332oUK4eEiIiIZED4//8q2ocmunfvju7du5d5TBRFfPXVV5gyZQr69OkDAFi3bh1sbW2xfft2DBgwAJcvX8bu3btx+vRptGnTBgCwdOlS9OjRA/Pnz4e9vT02bNiA/Px8rF69GkZGRmjSpAliY2OxcOFClcTmWVhhISIikgFtDgllZWWpbHl5eRrHk5SUhNTUVHh7e0v7LC0t0a5dO8TExAAAYmJiYGVlJSUrAODt7Q0DAwOcPHlSatOxY0cYGRlJbXx9fREfH4979+6VOx4mLERERHrGwcEBlpaW0jZ37lyN+0hNTQUA2Nraquy3tbWVjqWmpsLGxkbleLVq1VCjRg2VNmX18fg1yoNDQkRERDIgaGGVUMmQ0M2bN6FUKqX9CoWiQv3KASssREREMqDNISGlUqmyPU/CYmdnBwBIS0tT2Z+WliYds7Ozw507d1SOFxYWIj09XaVNWX08fo3yYMJCREQkA7pY1qyOk5MT7OzssH//fmlfVlYWTp48CU9PTwCAp6cnMjIycPbsWalNdHQ0iouL0a5dO6nN4cOHUVBQILWJiopCo0aNYG1tXe54mLAQERFVUdnZ2YiNjUVsbCyARxNtY2NjkZycDEEQMHr0aMyePRu//fYb4uLiMHjwYNjb26Nv374AADc3N3Tr1g3Dhw/HqVOncOzYMYSEhGDAgAGwt7cHAAwcOBBGRkYYNmwYLl68iM2bN2Px4sUYO3asRrFyDgsREZEM6GJZ85kzZ9ClSxfpc0kSERgYiIiICEyYMAE5OTn48MMPkZGRgfbt22P37t0wNjaWztmwYQNCQkLQtWtXGBgYoF+/fliyZIl03NLSEnv37kVwcDBat26NWrVqYdq0aRotaQYAQRRFUaMzSCNZWVmwtLRE2t1MlQlQRPrEum2IrkMgqjRiUT7y4lYhM7Ny/hwv+Xvi19PXYGZuUaG+crLvo09b50qLVZc4JERERESyxyEhIiIiGdDFkNDLhAkLERGRDGhjlY82VwnJDYeEiIiISPZYYSEiIpIBARUf0tHjAgsTFiIiIjkwEB5tFe1DX3FIiIiIiGTvpaiwCIKAbdu2SU/Wo6pl4Zo9iDxwHldvpMFYUR2vNnNGWEgfuNT/7+2fEVuP4uc9Z3Ah/hbu5+TienQ4LC1MVfpp1nsabqakq+ybFtwbY4b4vJD7ICoxZogPenZpDhdHW+TmFeDUhWsI
+/pXJNz4750sNjUtMHPUW+jcrjHMTRVIuHEHC1bvwY4DsVIbK6UpwkPfgW/7phBFEb9Fx2LSgp+R8zBfatPXuyXGDvVFg1dscPdeNlZtOYSlP+wHyQ9XCamn8wpLamoqRo4cCWdnZygUCjg4OKBXr14q7y7Qpa1bt8LHxwc1a9aEIAjS44vpxTn+RwKC3umIvavHY+vXISgoLIL/yK+R8zBPavMwtwBdPd2fmXx89j8/XPn9c2n78N1OlR0+USmvt2qI7346DJ8P5sM/5GtUr2aIrUtDYGpsJLVZETYYDR1tMHDsN/B673PsOBCLNXM/gIdrPanNqlmBaOxcB/4hX2PAmJV4vWVDfPXZQOm49+vu+HbWEKz55SheHzAH47/cjI8GvoHh73R8ofdL5SO3dwnJjU4rLNevX4eXlxesrKwwb948eHh4oKCgAHv27EFwcDCuXLmiy/AAADk5OWjfvj369++P4cOH6zqcKunnpcEqn5dPfx8uPpMQe/kmvFo1BAB8NPDRo6WPnv1LbV/mpsawraVfT3+kl887o5arfP54xg9IiPoCLdwccPxcIgDg1WbOGP/FJvxx6QYAYMHqPfj4vTfQws0BcX/dgmt9W3i/3gRdBocj9nIyAGDi/J+w5auPMHXxNqT+m4l3u7+KnQfPY83WowCAG3/fxaKIvfgk8E2s+unwC7xjKg8BFZ80q8f5im4rLB9//DEEQcCpU6fQr18/uLq6okmTJhg7dixOnDjx1PMmTpwIV1dXmJqawtnZGVOnTlV5C+T58+fRpUsXWFhYQKlUonXr1jhz5gwA4MaNG+jVqxesra1hZmaGJk2aYNeuXU+91qBBgzBt2jR4e3tr78apQrKycwEA1krTZ7Qs7au1e+HsPQEdA77AkvX7UFhYpO3wiDSmNH/0XpZ7WQ+kfacuXMNbb7aGldIUgiDA/83WUCiq4ejZqwCAth5OyMh6ICUrAHDwVDyKi0W0buoIADAyqoa8/EKVa+Xm5aOurTUc6tSo7Nsi0iqdVVjS09Oxe/duzJkzB2ZmZqWOW1lZPfVcCwsLREREwN7eHnFxcRg+fDgsLCwwYcIEAEBAQABatmyJFStWwNDQELGxsahevToAIDg4GPn5+Th8+DDMzMxw6dIlmJuba+2+8vLykJf331BFVlaW1vomoLi4GJMW/ox2zZ3h3tBeo3P/924nNG/sACulGU5duIaZy35D2r+ZmDOmXyVFS/RsgiBg7ti3cSI2EZcTU6T9QyetxurPP0DS/nAUFBbhYW4+BoWuQtKtfwEAtjWV+OfefZW+ioqKcS/rAWxrPqoiRp+4jDlj/NEx0hVHzlyFs0NtBAd0BQDY1bIsNaeLdMsAAgwqOKZjoMc1Fp0lLAkJCRBFEY0bN9b43ClTpkg/169fH+PHj8emTZukhCU5ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnitxGKXPnzsWMGTO02if9Z3z4FlxOTMHvq8ZofG7JH9QA0NSlLoyqV8OYz3/EtODeUBhV12aYROU2f0J/uDWog+7DF6nsnzyiJywtTNDn4yVIz8hBj07NsGbuB+gx/CtcSrxdrr7XbjsGp7q1sGnhCFSvZoj7OblYuekgJv3PD8XFxZVxO1QBHBJST2dDQhV5SfTmzZvh5eUFOzs7mJubY8qUKUhO/q8sOnbsWAQFBcHb2xtffPEFEhMTpWOjRo3C7Nmz4eXlhenTp+PChQsVuo8nTZo0CZmZmdJ28+ZNrfZflYWGb8GeI39ix4pRqGtrXeH+Wjepj8KiYiTf5r8ySTfCQ9+Bb4em6PXREty+kyHtr1+3Fj58txNGzvoBh0//hT+v/o3w737HucvJCPr/CbNpd7NQ21r1zb6GhgawVpoi7e5/ld2wr39FvU7j0Kz3NDTq9pk0J+b633cr/waJtEhnCYuLiwsEQdB4Ym1MTAwCAgLQo0cPREZG4ty5c5g8eTLy8/9bxhcWFoaLFy/Cz88P0dHR
cHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cu1dp9KRQKKJVKlY0qRhRFhIZvwc6D5/HbilFwrFtLK/3G/XULBgYCateo2OvciZ5HeOg78OvcHL0/WoLk26rJQ8lqoeJi1X/YFRWJEP7/yWCn45JgpTRF88YO0vGObVxhYCDg7J83VM4rLhaR8k8mCgqL0M+nNU5duIa7GdmVcVtUEYKWNj2ls4SlRo0a8PX1xbJly5CTk1PqeEZGRpnnHT9+HI6Ojpg8eTLatGkDFxcX3Lhxo1Q7V1dXjBkzBnv37oW/vz/WrFkjHXNwcMCIESOwdetWjBs3DqtWrdLafZH2jf9yC7b8fhqrZg2Buakx0v7NQtq/WXiY+1+SmvZvFuLib+HazUfj+xcTbiMu/hbuZT763Tp14RpWbDyAuL9u4fqtf7Hl99OYvOgX9O/eFlbPMXmXqCLmT+yP/t3bYvjUCGQ/yIVNTQvY1LSAseLR0ORf11ORmHwHiya9h1bujqhftxaCA95Al3aNsOvg+f9vk4Z9xy9i8eSBaOXuiHbNnBEe2h9b9/6B1H8zAQA1LM0w1L89XBxt0dS1LuaO64c+XVti0oJfdHbv9HSClv7TVzpd1rxs2TJ4eXnh1VdfxcyZM9GsWTMUFhYiKioKK1aswOXLl0ud4+LiguTkZGzatAlt27bFzp07peoJADx8+BChoaF4++234eTkhFu3buH06dPo1+/RxMrRo0eje/fucHV1xb1793DgwAG4ubk9Ncb09HQkJyfj9u1HY8bx8fEAADs7O9jZ2Wnz66CnWP3LEQBAzxGLVfYvm/Y+BvZ6DQCwZusRfLnqd+mY34dfqbRRGFXH1qiz+GLVLuQXFMLRviY+eq8LggPeeDE3QfSYYW8/GtbZ+c1olf0fz1iPHyNPorCoGP1Hr8D0kD74ceH/YGaqQNLNf/Bx2HpEHb8ktR8+dS3mhfbH9uUjpQfHfTr/J5U+B/i1w8xP3oIgPKrK9BqxWBoWInqZCGJFJpNoQUpKCubMmYPIyEikpKSgdu3aaN26NcaMGYPOnTs/CvKJJ91OmDABq1evRl5eHvz8/PDaa68hLCwMGRkZyM/PR2BgII4dO4a0tDTUqlUL/v7+mDdvHoyNjTFy5Ej8/vvvuHXrFpRKJbp164ZFixahZs2aZcYXERGBoUOHlto/ffp0hIWFPfP+srKyYGlpibS7mRweIr1l3TZE1yEQVRqxKB95cauQmVk5f46X/D2xPzYZ5hYV6z/7fha6tnil0mLVJZ0nLPqOCQtVBUxYSJ+9qIQlWksJyxt6mrDo/NH8RERERM/yUrz8kIiISO/xQSxqMWEhIiKSAb6tWT0mLERERDKgjbct6/PbmjmHhYiIiGSPFRYiIiIZ4BQW9ZiwEBERyQEzFrU4JERERESyxwoLERGRDHCVkHpMWIiIiGSAq4TU45AQERERyR4rLERERDLAObfqMWEhIiKSA2YsanFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQBXCanHhIWIiEgGOIVFPc5hISIiItljhYWIiEgOWGJRiwkLERGRDHDSrXocEiIiIiLZY4WFiIhIBrhKSD0mLERERDLAKSzqcUiIiIiIZI8VFiIiIjlgiUUtJixEREQywFVC6nFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQCnsKjHhIWIiEgOmLGoxTksREREJHussBAREckAVwmpx4SFiIhIDrQw6VaP8xUOCREREZH8scJCREQkA5xzqx4TFiIiIjlgxqIWh4SIiIhI9lhhISIikgGuElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiIZ4Jxb9VhhISIikgNBS5sGwsLCIAiCyta4cWPpeG5uLoKDg1GzZk2Ym5ujX79+SEtLU+kjOTkZfn5+MDU1hY2NDUJDQ1FYWPgcX4B6rLAQERHJgK4m3TZp0gT79u2TPler9l9qMGbMGOzcuRM//fQTLC0tERISAn9/fxw7
dgwAUFRUBD8/P9jZ2eH48eNISUnB4MGDUb16dXz++ecVupcnMWEhIiKqwqpVqwY7O7tS+zMzM/H9999j48aNeOONNwAAa9asgZubG06cOIHXXnsNe/fuxaVLl7Bv3z7Y2tqiRYsWmDVrFiZOnIiwsDAYGRlpLU4OCREREcmAgP9WCj339v99ZWVlqWx5eXlPve7Vq1dhb28PZ2dnBAQEIDk5GQBw9uxZFBQUwNvbW2rbuHFjvPLKK4iJiQEAxMTEwMPDA7a2tlIbX19fZGVl4eLFi1r9fpiwEBERyYA2p7A4ODjA0tJS2ubOnVvmNdu1a4eIiAjs3r0bK1asQFJSEjp06ID79+8jNTUVRkZGsLKyUjnH1tYWqampAIDU1FSVZKXkeMkxbeKQEBERkZ65efMmlEql9FmhUJTZrnv37tLPzZo1Q7t27eDo6IgtW7bAxMSk0uPUBCssREREMlDh4aDHHjynVCpVtqclLE+ysrKCq6srEhISYGdnh/z8fGRkZKi0SUtLk+a82NnZlVo1VPK5rHkxFcGEhYiISBZ0sK75CdnZ2UhMTESdOnXQunVrVK9eHfv375eOx8fHIzk5GZ6engAAT09PxMXF4c6dO1KbqKgoKJVKuLu7VyiWJ3FIiIiIqIoaP348evXqBUdHR9y+fRvTp0+HoaEh3nvvPVhaWmLYsGEYO3YsatSoAaVSiZEjR8LT0xOvvfYaAMDHxwfu7u4YNGgQwsPDkZqaiilTpiA4OLjcVZ3yYsJCREQkA7p4l9CtW7fw3nvv4e7du6hduzbat2+PEydOoHbt2gCARYsWwcDAAP369UNeXh58fX2xfPly6XxDQ0NERkbio48+gqenJ8zMzBAYGIiZM2dW7EbKIIiiKGq9V5JkZWXB0tISaXczVSZAEekT67Yhug6BqNKIRfnIi1uFzMzK+XO85O+JKzf+gUUF+7+flYXGjrUrLVZd4hwWIiIikj0OCREREcmALoaEXiZMWIiIiGRAV+8SelkwYSEiIpKDiq9Krvj5MsY5LERERCR7rLAQERHJAAss6jFhISIikgFOulWPQ0JEREQke6ywEBERyQBXCanHhIWIiEgOOIlFLQ4JERERkeyxwkJERCQDLLCox4SFiIhIBrhKSD0OCREREZHsscJCREQkCxVfJaTPg0JMWIiIiGSAQ0LqcUiIiIiIZI8JCxEREckeh4SIiIhkgENC6jFhISIikgE+ml89DgkRERGR7LHCQkREJAMcElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiI5YIlFLSYsREREMsBVQupxSIiIiIhkjxUWIiIiGeAqIfWYsBAREckAp7Cox4SFiIhIDpixqMU5LERERCR7rLAQERHJAFcJqceEhYiISAY46VY9JiyVTBRFAMD9rCwdR0JUecSifF2HQFRpSn6/S/48ryxZWvh7Qht9yBUTlkp2//59AEBDJwcdR0JERBVx//59WFpaar1fIyMj2NnZwUVLf0/Y2dnByMhIK33JiSBWdspYxRUXF+P27duwsLCAoM+1OpnIysqCg4MDbt68CaVSqetwiLSOv+MvniiKuH//Puzt7WFgUDlrVXJzc5Gfr51KpZGREYyNjbXSl5ywwlLJDAwMUK9ePV2HUeUolUr+YU56jb/jL1ZlVFYeZ2xsrJdJhjZxWTMRERHJHhMWIiIikj0mLKRXFAoFpk+fDoVCoetQiCoFf8epquKkWyIiIpI9VliIiIhI9piwEBERkewxYSEiIiLZY8JCsiYIArZv367rMIgqBX+/icqPCQvpTGpqKkaOHAlnZ2coFAo4ODigV69e2L9/v65DA/Do6ZbTpk1DnTp1YGJiAm9vb1y9elXXYdFLQu6/31u3boWPjw9q1qwJQRAQGxur65CI1GLCQjpx/fp1tG7dGtHR0Zg3bx7i4uKwe/dudOnSBcHBwboODwAQHh6OJUuWYOXKlTh58iTMzMzg6+uL3NxcXYdGMvcy/H7n5OSg
ffv2+PLLL3UdClH5iEQ60L17d7Fu3bpidnZ2qWP37t2TfgYgbtu2Tfo8YcIE0cXFRTQxMRGdnJzEKVOmiPn5+dLx2NhYsXPnzqK5ubloYWEhtmrVSjx9+rQoiqJ4/fp1sWfPnqKVlZVoamoquru7izt37iwzvuLiYtHOzk6cN2+etC8jI0NUKBTijz/+WMG7J30n99/vxyUlJYkAxHPnzj33/RK9CHyXEL1w6enp2L17N+bMmQMzM7NSx62srJ56roWFBSIiImBvb4+4uDgMHz4cFhYWmDBhAgAgICAALVu2xIoVK2BoaIjY2FhUr14dABAcHIz8/HwcPnwYZmZmuHTpEszNzcu8TlJSElJTU+Ht7S3ts7S0RLt27RATE4MBAwZU4BsgffYy/H4TvYyYsNALl5CQAFEU0bhxY43PnTJlivRz/fr1MX78eGzatEn6Az05ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnp14nNTUVAGBra6uy39bWVjpGVJaX4feb6GXEOSz0wokVeLjy5s2b4eXlBTs7O5ibm2PKlClITk6Wjo8dOxZBQUHw9vbGF198gcTEROnYqFGjMHv2bHh5eWH69Om4cOFChe6DqCz8/SaqHExY6IVzcXGBIAi4cuWKRufFxMQgICAAPXr0QGRkJM6dO4fJkycjPz9fahMWFoaLFy/Cz88P0dHRcHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cuLfNadnZ2AIC0tDSV/WlpadIxorK8DL/fRC8l3U6hoaqqW7duGk9KnD9/vujs7KzSdtiwYaKlpeVTrzNgwACxV69eZR779NNPRQ8PjzKPlUy6nT9/vrQvMzOTk26pXOT++/04TrqllwUrLKQTy5YtQ1FREV599VX88ssvuHr1Ki5fvowlS5bA09OzzHNcXFyQnJyMTZs2ITExEUuWLJH+dQkADx8+REhICA4ePIgbN27g2LFjOH36NNzc3AAAo0ePxp49e5CUlIQ//vgDBw4ckI49SRAEjB49GrNnz8Zvv/2GuLg4DB48GPb29ujbt6/Wvw/SL3L//QYeTQ6OjY3FpUuXAADx8fGIjY3lHC2SL11nTFR13b59WwwODhYdHR1FIyMjsW7dumLv3r3FAwcOSG3wxLLP0NBQsWbNmqK5ubn47rvviosWLZL+BZqXlycOGDBAdHBwEI2MjER7e3sxJCREfPjwoSiKohgSEiI2aNBAVCgUYu3atcVBgwaJ//7771PjKy4uFqdOnSra2tqKCoVC7Nq1qxgfH18ZXwXpIbn/fq9Zs0YEUGqbPn16JXwbRBUniGIFZogRERERvQAcEiIiIiLZY8JCREREsseEhYiIiGSPCQsRERHJHhMWIiIikj0mLERERCR7TFiIiIhI9piwEFUBQ4YMUXlCb+fOnTF69OgXHsfBgwchCAIyMjKe2kYQBGzfvr3cfYaFhaFFixYViuv69esQBAGxsbEV6oeIKg8TFiIdGTJkCARBgCAIMDIyQsOGDTFz5kwUFhZW+rW3bt2KWbNmlatteZIMIqLKVk3XARBVZd26dcOaNWuQl5eHXbt2ITg4GNWrV8ekSZNKtc3Pz4eRkZFWrlujRg2t9ENE9KKwwkKkQwqFAnZ2dnB0dMRHH30Eb29v/PbbbwD+G8aZM2cO7O3t0ahRIwDAzZs30b9/f1hZWaFGjRro06cPrl+/LvVZVFSEsWPHwsrKCjVr1sSECRPw5Bs4nhwSysvLw8SJE+Hg4ACFQoGGDRvi+++/x/Xr19GlSxcAgLW1NQRBwJAhQwAAxcXFmDt3LpycnGBiYoLmzZvj559/VrnOrl274OrqChMTE3Tp0kUlzvKaOHEiXF1dYWpqCmdnZ0ydOhUFBQWl2n3zzTdwcHCAqakp+vfvj8zMTJXj3333Hdzc3GBsbIzGjRtj+fLlGsdCRLrDhIVIRkxMTJCfny993r9/P+Lj4xEVFYXIyEgUFBTA19cXFhYWOHLk
CI4dOwZzc3N069ZNOm/BggWIiIjA6tWrcfToUaSnp6u89bcsgwcPxo8//oglS5bg8uXL+Oabb2Bubg4HBwf88ssvAB69zTclJQWLFy8GAMydOxfr1q3DypUrcfHiRYwZMwbvv/8+Dh06BOBRYuXv749evXohNjYWQUFB+PTTTzX+TiwsLBAREYFLly5h8eLFWLVqFRYtWqTSJiEhAVu2bMGOHTuwe/dunDt3Dh9//LF0fMOGDZg2bRrmzJmDy5cv4/PPP8fUqVOxdu1ajeMhIh3R8csXiaqswMBAsU+fPqIoPnozdFRUlKhQKMTx48dLx21tbcW8vDzpnPXr14uNGjUSi4uLpX15eXmiiYmJuGfPHlEURbFOnTpieHi4dLygoECsV6+edC1RFMVOnTqJn3zyiSiKohgfHy8CEKOiosqM88CBAyIA8d69e9K+3Nxc0dTUVDx+/LhK22HDhonvvfeeKIqiOGnSJNHd3V3l+MSJE0v19SQ88QbjJ82bN09s3bq19Hn69OmioaGheOvWLWnf77//LhoYGIgpKSmiKIpigwYNxI0bN6r0M2vWLNHT01MURVFMSkoSAYjnzp176nWJSLc4h4VIhyIjI2Fubo6CggIUFxdj4MCBCAsLk457eHiozFs5f/48EhISYGFhodJPbm4uEhMTkZmZiZSUFLRr1046Vq1aNbRp06bUsFCJ2NhYGBoaolOnTuWOOyEhAQ8ePMCbb76psj8/Px8tW7YEAFy+fFklDgDw9PQs9zVKbN68GUuWLEFiYiKys7NRWFgIpVKp0uaVV15B3bp1Va5TXFyM+Ph4WFhYIDExEcOGDcPw4cOlNoWFhbC0tNQ4HiLSDSYsRDrUpUsXrFixAkZGRrC3t0e1aqr/S5qZmal8zs7ORuvWrbFhw4ZSfdWuXfu5YjAxMdH4nOzsbADAzp07VRIF4NG8HG2JiYlBQEAAZsyYAV9fX1haWmLTpk1YsGCBxrGuWrWqVAJlaGiotViJqHIxYSHSITMzMzRs2LDc7Vu1aoXNmzfDxsamVJWhRJ06dXDy5El07NgRwKNKwtmzZ9GqVasy23t4eKC4uBiHDh2Ct7d3qeMlFZ6ioiJpn7u7OxQKBZKTk59amXFzc5MmEJc4ceLEs2/yMcePH4ejoyMmT54s7btx40apdsnJybh9+zbs7e2l6xgYGKBRo0awtbWFvb09rl27hoCAAI2uT0TywUm3RC+RgIAA1KpVC3369MGRI0eQlJSEgwcPYtSoUbh16xYA4JNPPsEXX3yB7du348qVK/j444/VPkOlfv36CAwMxAcffIDt27dLfW7ZsgUA4OjoCEEQEBkZiX/++QfZ2dmwsLDA+PHjMWbMGKxduxaJiYn4448/sHTpUmki64gRI3D16lWEhoYiPj4eGzduREREhEb36+LiguTkZGzatAmJiYlYsmRJmROIjY2NERgYiPPnz+PIkSMYNWoU+vfvDzs7OwDAjBkzMHfuXCxZsgR//fUX4uLisGbNGixcuFCjeIhId5iwEL1ETE1NcfjwYbzyyivw9/eHm5sbhg0bhtzcXKniMm7cOAwaNAiBgYHw9PSEhYUF3nrrLbX9rlixAm+//TY+/vhjNG7cGMOHD0dOTg4AoG7dupgxYwY+/fRT2NraIiQkBAAwa9YsTJ06FXPnzoWbmxu6deuGnTt3wsnJCcCjeSW//PILtm/fjubNm2PlypX4/PPPNbrf3r17Y8yYMQgJCUGLFi1w/PhxTJ06tVS7hg0bwt/fHz169ICPjw+aNWumsmw5KCgI3333HdasWQMPDw906tQJERERUqxEJH+C+LSZeEREREQywQoLERERyR4TFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHhIWIiIhkjwkLERERyR4TFiIiIpI9JixEREQke0xYiIiISPb+D8eEamDpGfNzAAAAAElFTkSuQmCC",
760
+ "text/plain": [
761
+ "<Figure size 640x480 with 2 Axes>"
762
+ ]
763
+ },
764
+ "metadata": {},
765
+ "output_type": "display_data"
766
+ }
767
+ ],
768
+ "source": [
769
+ "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
770
+ "import matplotlib.pyplot as plt\n",
771
+ "\n",
772
+ "# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
773
+ "cm = confusion_matrix(predictions.label_ids, preds)\n",
774
+ "\n",
775
+ "# 可视化混淆矩阵\n",
776
+ "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
777
+ "disp.plot(cmap=plt.cm.Blues)\n",
778
+ "plt.title('Confusion Matrix')\n",
779
+ "plt.show()"
780
+ ]
781
+ },
782
+ {
783
+ "cell_type": "code",
784
+ "execution_count": null,
785
+ "id": "23e3a640-88d7-4a1e-8515-7c417d50f018",
786
+ "metadata": {},
787
+ "outputs": [],
788
+ "source": []
789
+ }
790
+ ],
791
+ "metadata": {
792
+ "kernelspec": {
793
+ "display_name": "Python 3 (ipykernel)",
794
+ "language": "python",
795
+ "name": "python3"
796
+ },
797
+ "language_info": {
798
+ "codemirror_mode": {
799
+ "name": "ipython",
800
+ "version": 3
801
+ },
802
+ "file_extension": ".py",
803
+ "mimetype": "text/x-python",
804
+ "name": "python",
805
+ "nbconvert_exporter": "python",
806
+ "pygments_lexer": "ipython3",
807
+ "version": "3.12.3"
808
+ }
809
+ },
810
  "nbformat": 4,
811
  "nbformat_minor": 5
812
  }
03-gene-task/.ipynb_checkpoints/3-multi-seq-task-checkpoint.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
03-gene-task/.ipynb_checkpoints/5-regression-task-checkpoint.ipynb CHANGED
@@ -1,9 +1,563 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": null,
6
- "id": "0dd6e0d3-287c-4798-bb9b-5734ff4abf93",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": []
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c499a5c3-0244-41c4-9947-e166206204e2",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 3.5 回归类任务"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "4678171b-bbc8-49dd-ad04-48f5ef89b45e",
14
+ "metadata": {},
15
+ "source": [
16
+ "GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值。\n",
17
+ "\n",
18
+ "使用 GPT-2 进行回归问题的解决,可以将回归问题转化为自回归语言模型任务。GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值(如情感评分、价格预测等)。\n",
19
+ "\n",
20
+ "---\n",
21
+ "\n",
22
+ "### **1. 使用 GPT-2 做回归的核心思路**\n",
23
+ "\n",
24
+ "1. **调整输出层**:\n",
25
+ " - 默认情况下,GPT-2 的输出是一个词汇表大小的概率分布,用于预测下一个 token。\n",
26
+ " - 对于回归问题,可以将模型的最后一层替换为一个线性层,使得输出变为一个标量或多个连续值。\n",
27
+ " - gpt2的huggingface实现中,可以简单设置1个分类的分类header,实现回归预测。\n",
28
+ "\n",
29
+ "2. **损失函数**:\n",
30
+ " - 对于回归问题,使用均方误差(MSE)或均绝对误差(MAE)作为损失函数,而不是分类任务中常用的交叉熵。\n",
31
+ "\n",
32
+ "3. **输入格式**:\n",
33
+ " - 输入数据仍然是文本,可以通过特定的模板形式加入上下文信息。\n",
34
+ "\n",
35
+ "---\n",
36
+ "\n",
37
+ "### **2. GPT-2 回归任务的实现步骤**\n",
38
+ "\n",
39
+ "#### **(1)加载基础模型**\n",
40
+ "\n",
41
+ "从 Hugging Face Transformers 库加载 GPT-2 模型和分词器,并调整其配置以适应回归任务。\n",
42
+ "\n",
43
+ "```python\n",
44
+ "from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, AutoModelForSequenceClassification\n",
45
+ "\n",
46
+ "# 加载分词器\n",
47
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
48
+ "\n",
49
+ "# 调整模型配置,num_labels=1 表示回归任务\n",
50
+ "config = GPT2Config.from_pretrained(\"gpt2\", num_labels=1)\n",
51
+ "\n",
52
+ "# 加载模型,增加回归输出\n",
53
+ "model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", config=config)\n",
54
+ "```\n",
55
+ "\n",
56
+ "---\n",
57
+ "\n",
58
+ "### **3. 课程数据集**\n",
59
+ "\n",
60
+ "本例程使用了蛋白质稳定性分析的数据集,也就是一个蛋白质序列,对应一个float的数值,做回归预测分析。\n",
61
+ "\n",
62
+ "**蛋白质稳定性分析**是研究蛋白质在不同条件下保持其结构和功能的能力的过程。蛋白质稳定性是生物化学和生物技术领域的重要课题,影响着蛋白质的折叠、功能执行、以及在应用中的可用性(如工业酶、药物开发等)。\n"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 1,
68
+ "id": "1e8c0f86-af78-43e1-8db4-e2a2ea22f815",
69
+ "metadata": {},
70
+ "outputs": [
71
+ {
72
+ "data": {
73
+ "text/plain": [
74
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
75
+ ]
76
+ },
77
+ "execution_count": 1,
78
+ "metadata": {},
79
+ "output_type": "execute_result"
80
+ }
81
+ ],
82
+ "source": [
83
+ "import subprocess\n",
84
+ "import os\n",
85
+ "# 设置环境变量, autodl一般区域\n",
86
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
87
+ "output = result.stdout\n",
88
+ "for line in output.splitlines():\n",
89
+ " if '=' in line:\n",
90
+ " var, value = line.split('=', 1)\n",
91
+ " os.environ[var] = value\n",
92
+ "\n",
93
+ "\"\"\"\n",
94
+ "import os\n",
95
+ "\n",
96
+ "# 设置环境变量, autodl专区 其他idc\n",
97
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
98
+ "\n",
99
+ "# 打印环境变量以确认设置成功\n",
100
+ "print(os.environ.get('HF_ENDPOINT'))\n",
101
+ "\"\"\""
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 2,
107
+ "id": "c51a8d69-9a36-47e7-8084-f64e6a72e4f7",
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "from transformers import AutoTokenizer, AutoModel\n",
112
+ "from tokenizers import Tokenizer\n",
113
+ "from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
114
+ "from transformers import AutoModelForSequenceClassification\n",
115
+ "from transformers import DataCollatorWithPadding"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 3,
121
+ "id": "a5aeb7c1-2d2a-4f57-ad8c-659613870e59",
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "#set tokenizer\n",
126
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
127
+ "tokenizer.pad_token = tokenizer.eos_token"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 4,
133
+ "id": "ad0c19cd-96a5-463e-8b7d-439646fef429",
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "name": "stderr",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['score.weight']\n",
141
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "#set model\n",
147
+ "model = AutoModelForSequenceClassification.from_pretrained('dnagpt/gene_eng_gpt2_v0', num_labels=1)\n",
148
+ "model.config.pad_token_id = model.config.eos_token_id"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 5,
154
+ "id": "8c48cb0a-6142-4afc-823e-08fb33f74222",
155
+ "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "data": {
159
+ "text/plain": [
160
+ "DatasetDict({\n",
161
+ " train: Dataset({\n",
162
+ " features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
163
+ " num_rows: 62079\n",
164
+ " })\n",
165
+ " test: Dataset({\n",
166
+ " features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
167
+ " num_rows: 6898\n",
168
+ " })\n",
169
+ "})"
170
+ ]
171
+ },
172
+ "execution_count": 5,
173
+ "metadata": {},
174
+ "output_type": "execute_result"
175
+ }
176
+ ],
177
+ "source": [
178
+ "from datasets import load_dataset\n",
179
+ "# 1. load ~11k samples from promoters prediction dataset\n",
180
+ "dataset = load_dataset(\"csv\", data_files=\"data/protein_stab.csv\")['train'].train_test_split(test_size=0.1)\n",
181
+ "dataset"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "685dd025-f00a-4869-bc30-9843c77b6d8a",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/plain": [
193
+ "{'seq_id': 'train_prot_32672',\n",
194
+ " 'seq_type': 'prot',\n",
195
+ " 'seq': 'FYRLIIFKYPDYIDTYLRLAAIAKEKNNLQLSIEGNGSGGNGSGGNGSGN',\n",
196
+ " 'label': 0.7599999904632561}"
197
+ ]
198
+ },
199
+ "execution_count": 6,
200
+ "metadata": {},
201
+ "output_type": "execute_result"
202
+ }
203
+ ],
204
+ "source": [
205
+ "dataset[\"train\"][0]"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 7,
211
+ "id": "6e10dbbb-73ef-4b67-8290-77f8896298f5",
212
+ "metadata": {},
213
+ "outputs": [
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "datasets mean token lenght 17.24006958538707 min token length 12 max token length 35\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "token_len_list = []\n",
224
+ "for item in dataset[\"test\"]:\n",
225
+ " inputs = tokenizer.tokenize(item[\"seq\"])\n",
226
+ " token_len_list.append( len(inputs) )\n",
227
+ "\n",
228
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
229
+ "min_len = min(token_len_list)\n",
230
+ "max_len = max(token_len_list)\n",
231
+ "\n",
232
+ "print(\"datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 25,
238
+ "id": "ac58b5b4-bff0-404d-bcf5-2b93db2b37c0",
239
+ "metadata": {},
240
+ "outputs": [
241
+ {
242
+ "data": {
243
+ "application/vnd.jupyter.widget-view+json": {
244
+ "model_id": "419cce8c5ba249ac8c8773dd2d69992d",
245
+ "version_major": 2,
246
+ "version_minor": 0
247
+ },
248
+ "text/plain": [
249
+ "Map: 0%| | 0/62079 [00:00<?, ? examples/s]"
250
+ ]
251
+ },
252
+ "metadata": {},
253
+ "output_type": "display_data"
254
+ },
255
+ {
256
+ "name": "stderr",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.\n",
260
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
261
+ ]
262
+ },
263
+ {
264
+ "data": {
265
+ "application/vnd.jupyter.widget-view+json": {
266
+ "model_id": "0b9ea09fe3ea49b19f7d52aca7949acf",
267
+ "version_major": 2,
268
+ "version_minor": 0
269
+ },
270
+ "text/plain": [
271
+ "Map: 0%| | 0/6898 [00:00<?, ? examples/s]"
272
+ ]
273
+ },
274
+ "metadata": {},
275
+ "output_type": "display_data"
276
+ }
277
+ ],
278
+ "source": [
279
+ "# 2. tokenize\n",
280
+ "def tokenize_function(examples):\n",
281
+ " return tokenizer(examples['seq'], truncation=True, padding='max_length')\n",
282
+ "\n",
283
+ "# 3. 对数据集应用分词函数\n",
284
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
285
+ "\n",
286
+ "# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
287
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 26,
293
+ "id": "94f6d643-2cf7-4651-9a8d-1884b2bddd1c",
294
+ "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "name": "stderr",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
301
+ " warnings.warn(\n",
302
+ "/tmp/ipykernel_1347/4285456223.py:23: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
303
+ " trainer = Trainer(\n"
304
+ ]
305
+ }
306
+ ],
307
+ "source": [
308
+ "from transformers import TrainingArguments, Trainer\n",
309
+ "import numpy as np\n",
310
+ "from sklearn.metrics import mean_squared_error\n",
311
+ "\n",
312
+ "\n",
313
+ "def compute_metrics(eval_pred):\n",
314
+ " predictions, labels = eval_pred\n",
315
+ " rmse = mean_squared_error(labels, predictions)\n",
316
+ " return {\"rmse\": rmse}\n",
317
+ "\n",
318
+ "# 设置训练参数\n",
319
+ "training_args = TrainingArguments(\n",
320
+ " output_dir='./results',\n",
321
+ " evaluation_strategy=\"epoch\",\n",
322
+ " learning_rate=2e-5,\n",
323
+ " per_device_train_batch_size=20,\n",
324
+ " per_device_eval_batch_size=20,\n",
325
+ " num_train_epochs=10,\n",
326
+ " weight_decay=0.01,\n",
327
+ ")\n",
328
+ "\n",
329
+ "# 使用Trainer API进行训练(假设已有train_dataset和eval_dataset)\n",
330
+ "trainer = Trainer(\n",
331
+ " model=model,\n",
332
+ " args=training_args,\n",
333
+ " train_dataset=tokenized_datasets[\"train\"],\n",
334
+ " eval_dataset=tokenized_datasets[\"test\"],\n",
335
+ " tokenizer=tokenizer,\n",
336
+ " data_collator=data_collator,\n",
337
+ " compute_metrics=compute_metrics,\n",
338
+ ")"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "id": "dfe12979-d977-4404-bf9e-18c1f91a3e39",
345
+ "metadata": {},
346
+ "outputs": [
347
+ {
348
+ "data": {
349
+ "text/html": [
350
+ "\n",
351
+ " <div>\n",
352
+ " \n",
353
+ " <progress value='30987' max='31040' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
354
+ " [30987/31040 1:00:56 < 00:06, 8.47 it/s, Epoch 9.98/10]\n",
355
+ " </div>\n",
356
+ " <table border=\"1\" class=\"dataframe\">\n",
357
+ " <thead>\n",
358
+ " <tr style=\"text-align: left;\">\n",
359
+ " <th>Epoch</th>\n",
360
+ " <th>Training Loss</th>\n",
361
+ " <th>Validation Loss</th>\n",
362
+ " <th>Rmse</th>\n",
363
+ " </tr>\n",
364
+ " </thead>\n",
365
+ " <tbody>\n",
366
+ " <tr>\n",
367
+ " <td>1</td>\n",
368
+ " <td>0.044600</td>\n",
369
+ " <td>0.163462</td>\n",
370
+ " <td>0.163462</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <td>2</td>\n",
374
+ " <td>0.041900</td>\n",
375
+ " <td>0.157900</td>\n",
376
+ " <td>0.157900</td>\n",
377
+ " </tr>\n",
378
+ " <tr>\n",
379
+ " <td>3</td>\n",
380
+ " <td>0.037700</td>\n",
381
+ " <td>0.159724</td>\n",
382
+ " <td>0.159724</td>\n",
383
+ " </tr>\n",
384
+ " <tr>\n",
385
+ " <td>4</td>\n",
386
+ " <td>0.031700</td>\n",
387
+ " <td>0.157686</td>\n",
388
+ " <td>0.157686</td>\n",
389
+ " </tr>\n",
390
+ " <tr>\n",
391
+ " <td>5</td>\n",
392
+ " <td>0.028800</td>\n",
393
+ " <td>0.157124</td>\n",
394
+ " <td>0.157124</td>\n",
395
+ " </tr>\n",
396
+ " <tr>\n",
397
+ " <td>6</td>\n",
398
+ " <td>0.025400</td>\n",
399
+ " <td>0.150852</td>\n",
400
+ " <td>0.150852</td>\n",
401
+ " </tr>\n",
402
+ " <tr>\n",
403
+ " <td>7</td>\n",
404
+ " <td>0.022300</td>\n",
405
+ " <td>0.159293</td>\n",
406
+ " <td>0.159293</td>\n",
407
+ " </tr>\n",
408
+ " <tr>\n",
409
+ " <td>8</td>\n",
410
+ " <td>0.019600</td>\n",
411
+ " <td>0.154608</td>\n",
412
+ " <td>0.154608</td>\n",
413
+ " </tr>\n",
414
+ " <tr>\n",
415
+ " <td>9</td>\n",
416
+ " <td>0.017300</td>\n",
417
+ " <td>0.156104</td>\n",
418
+ " <td>0.156104</td>\n",
419
+ " </tr>\n",
420
+ " </tbody>\n",
421
+ "</table><p>"
422
+ ],
423
+ "text/plain": [
424
+ "<IPython.core.display.HTML object>"
425
+ ]
426
+ },
427
+ "metadata": {},
428
+ "output_type": "display_data"
429
+ },
430
+ {
431
+ "name": "stderr",
432
+ "output_type": "stream",
433
+ "text": [
434
+ "IOPub message rate exceeded.\n",
435
+ "The Jupyter server will temporarily stop sending output\n",
436
+ "to the client in order to avoid crashing it.\n",
437
+ "To change this limit, set the config variable\n",
438
+ "`--ServerApp.iopub_msg_rate_limit`.\n",
439
+ "\n",
440
+ "Current values:\n",
441
+ "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
442
+ "ServerApp.rate_limit_window=3.0 (secs)\n",
443
+ "\n"
444
+ ]
445
+ }
446
+ ],
447
+ "source": [
448
+ "# 开始训练\n",
449
+ "trainer.train()"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": null,
455
+ "id": "060c4618-40d0-4934-bab8-36aab3a46de5",
456
+ "metadata": {},
457
+ "outputs": [],
458
+ "source": [
459
+ "#模型测试\n",
460
+ "predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
461
+ "predictions"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": 18,
467
+ "id": "1f8ef885-5bc9-4668-905b-6b2235209654",
468
+ "metadata": {},
469
+ "outputs": [
470
+ {
471
+ "data": {
472
+ "text/html": [
473
+ "\n",
474
+ " <div>\n",
475
+ " \n",
476
+ " <progress value='345' max='345' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
477
+ " [345/345 00:09]\n",
478
+ " </div>\n",
479
+ " "
480
+ ],
481
+ "text/plain": [
482
+ "<IPython.core.display.HTML object>"
483
+ ]
484
+ },
485
+ "metadata": {},
486
+ "output_type": "display_data"
487
+ },
488
+ {
489
+ "data": {
490
+ "text/plain": [
491
+ "{'eval_loss': 0.15949687361717224,\n",
492
+ " 'eval_rmse': 0.15949687361717224,\n",
493
+ " 'eval_runtime': 9.1483,\n",
494
+ " 'eval_samples_per_second': 754.017,\n",
495
+ " 'eval_steps_per_second': 37.712,\n",
496
+ " 'epoch': 10.0}"
497
+ ]
498
+ },
499
+ "execution_count": 18,
500
+ "metadata": {},
501
+ "output_type": "execute_result"
502
+ }
503
+ ],
504
+ "source": [
505
+ "trainer.evaluate()"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 23,
511
+ "id": "afabdbe9-9b96-4f9e-bef2-1d819431f8d1",
512
+ "metadata": {},
513
+ "outputs": [
514
+ {
515
+ "name": "stdout",
516
+ "output_type": "stream",
517
+ "text": [
518
+ "[[ 1.7208484 ]\n",
519
+ " [ 0.00225139]\n",
520
+ " [ 0.3325616 ]\n",
521
+ " [-0.34372616]\n",
522
+ " [-0.45505935]\n",
523
+ " [-0.06892765]\n",
524
+ " [ 0.15099108]\n",
525
+ " [ 0.12211376]\n",
526
+ " [ 0.3947332 ]\n",
527
+ " [ 0.23186803]]\n"
528
+ ]
529
+ }
530
+ ],
531
+ "source": [
532
+ "predictions.predictions[0:10].squeeze()"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": 24,
538
+ "id": "fa9d17fd-eece-4c1e-99e0-3d19d36f7584",
539
+ "metadata": {},
540
+ "outputs": [
541
+ {
542
+ "data": {
543
+ "text/plain": [
544
+ "array([ 1.69, 0.84, 0.58, -0.15, 0.23, 0.03, 0.15, 0.2 , 0.51,\n",
545
+ " 1.1 ], dtype=float32)"
546
+ ]
547
+ },
548
+ "execution_count": 24,
549
+ "metadata": {},
550
+ "output_type": "execute_result"
551
+ }
552
+ ],
553
+ "source": [
554
+ "predictions.label_ids[0:10]"
555
+ ]
556
+ },
557
  {
558
  "cell_type": "code",
559
  "execution_count": null,
560
+ "id": "52252015-e068-414b-bd8a-79a5d1a2beec",
561
  "metadata": {},
562
  "outputs": [],
563
  "source": []
03-gene-task/1-category-task.ipynb CHANGED
@@ -1,9 +1,788 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": null,
6
- "id": "0e595792-7a4d-42d1-9e96-fd76cf00efe8",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": []
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "5840e900-43cb-4ab4-81a5-988b68fda9b1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 3.1 序列分类任务"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "958e7b5f-759a-431c-8af0-325271facb41",
14
+ "metadata": {},
15
+ "source": [
16
+ "基于 GPT-2 模型,可以通过微调(fine-tuning)或使用提示(prompt-based)方法来完成多种下游任务。\n",
17
+ "本章主要使用经典的微调方式,提示微调则属于chatgpt的范围,放在下一章,以下是几种常见的下游任务及其简单描述:\n",
18
+ "\n",
19
+ "\n",
20
+ "### 1. **文本分类**\n",
21
+ "\n",
22
+ "#### 任务描述\n",
23
+ "\n",
24
+ "文本分类是将文本分配到一个或多个预定义类别中的任务。例如,情感分析、主题分类等。生物序列中对应如启动序列等分类问题。\n",
25
+ "\n",
26
+ "#### 使用的模型类型\n",
27
+ "\n",
28
+ "- **GPT2ForSequenceClassification或AutoModelForSequenceClassification**:该模型在 GPT-2 的基础上添加了一个分类头,用于处理文本分类任务。通过微调这个模型,可以将其应用于多种分类任务。\n",
29
+ "\n",
30
+ "### 2. **机器翻译**\n",
31
+ "\n",
32
+ "#### 任务描述\n",
33
+ "\n",
34
+ "机器翻译是指将一种语言的文本转换为另一种语言的过程。生物学中,可以是生物序列到功能描述(英文)的翻译。\n",
35
+ "\n",
36
+ "#### 使用的模型类型\n",
37
+ "\n",
38
+ "- **AutoModelForSeq2SeqLM**:虽然 GPT-2 不是专门为机器翻译设计的模型,但可以通过构造特定格式的提示,让 GPT-2 根据上下文生成目标语言的翻译结果。\n",
39
+ "- **注意**:对于机器翻译任务,通常更推荐使用专门为此类任务设计的模型,如 T5 或 mBART。\n",
40
+ "\n",
41
+ "### 3. **词性标注 (POS Tagging)**\n",
42
+ "\n",
43
+ "#### 任务描述\n",
44
+ "\n",
45
+ "词性标注是指为每个单词分配其正确的词性标签(如名词、动词、形容词等)。生物学中,对应于结构预测任务,典型的如二级结构预测。\n",
46
+ "\n",
47
+ "#### 使用的模型类型\n",
48
+ "\n",
49
+ "- **AutoModelForTokenClassification**:该模型适用于标记级别的分类任务。通过微调,可以将 GPT-2 应用于词性标注,每个 token 的隐藏状态会被映射到相应的词性标签。\n",
50
+ "\n",
51
+ "### 4. **命名实体识别 (NER)**\n",
52
+ "\n",
53
+ "#### 任务描述\n",
54
+ "\n",
55
+ "命名实体识别是指识别文本中的人名、地名、组织机构等实体,并对其进行分类。生物学中,也对应于结构预测任务,典型的如膜结构预测。和词性标注类似。\n",
56
+ "\n",
57
+ "#### 使用的模型类型\n",
58
+ "\n",
59
+ "- **AutoModelForTokenClassification**:类似于词性标注,该模型可以用于 NER 任务,通过对每个 token 进行分类来识别和标注命名实体。\n",
60
+ "\n",
61
+ "### 5. **问答系统**\n",
62
+ "\n",
63
+ "#### 任务描述\n",
64
+ "\n",
65
+ "问答系统旨在根据给定的问题从文档或知识库中提取答案。目前一些最新的生物学大模型论文中,输入是包含生物序列的问题,回答则也是混合式的。一般是生物学领域的QA。\n",
66
+ "\n",
67
+ "#### 使用的模型类型\n",
68
+ "\n",
69
+ "- **AutoModelForQuestionAnswering**:该模型专门用于问答任务,能够理解问题并从上下文中提取答案。通过微调,它可以适应特定领域的问答需求。\n",
70
+ "\n",
71
+ "### 6. **文本生成**\n",
72
+ "\n",
73
+ "#### 任务描述\n",
74
+ "\n",
75
+ "文本生成是指根据给定的提示或前缀生成连贯的文本内容。生物学中,对应新的序列生成,如产生全新的蛋白质序列。\n",
76
+ "\n",
77
+ "#### 使用的模型类型\n",
78
+ "\n",
79
+ "- **GPT2LMHeadModel**:这是 GPT-2 的标准语言模型版本,擅长生成自然流畅的文本。它可以根据输入的提示生成后续文本,广泛应用于创作、对话系统等领域。\n",
80
+ "\n",
81
+ "### 6. **回归问题**\n",
82
+ "\n",
83
+ "#### 任务描述\n",
84
+ "\n",
85
+ "生物序列相关的回归问题,输入为序列,输出为一个float值。\n",
86
+ "\n",
87
+ "#### 使用的模型类型\n",
88
+ "\n",
89
+ "- huggingface没有特定的header,但一般回归问题,输出使用一个线性层即可,设定损失函数为均方误差(MSE)即可。最简单的,就是使用AutoModelForTokenClassification,类别数设置为1,输出的label为实测float值即可。\n",
90
+ "一个官方推荐的 [例子](https://github.com/huggingface/transformers/blob/7ae6f070044b0171a71f3269613bf02fd9fca6f2/src/transformers/models/bert/modeling_bert.py#L1564-L1575)\n",
91
+ "\n",
92
+ "### 小结\n",
93
+ "\n",
94
+ "GPT-2 可以通过微调或提示工程应用于多种下游任务。不同的任务需要使用特定类型的模型,这些模型基于 GPT-2 并添加了额外的组件或进行了调整,以更好地适应���定的任务需求\n",
95
+ "\n",
96
+ "<img src=\"img/gpt2-ft.png\" width=\"800px\" />"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 1,
102
+ "id": "eca17933-7b8f-44de-8c59-ea7a1c8a3b33",
103
+ "metadata": {},
104
+ "outputs": [
105
+ {
106
+ "data": {
107
+ "text/plain": [
108
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
109
+ ]
110
+ },
111
+ "execution_count": 1,
112
+ "metadata": {},
113
+ "output_type": "execute_result"
114
+ }
115
+ ],
116
+ "source": [
117
+ "import subprocess\n",
118
+ "import os\n",
119
+ "# 设置环境变量, autodl一般区域\n",
120
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
121
+ "output = result.stdout\n",
122
+ "for line in output.splitlines():\n",
123
+ " if '=' in line:\n",
124
+ " var, value = line.split('=', 1)\n",
125
+ " os.environ[var] = value\n",
126
+ "\n",
127
+ "\"\"\"\n",
128
+ "import os\n",
129
+ "\n",
130
+ "# 设置环境变量, autodl专区 其他idc\n",
131
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
132
+ "\n",
133
+ "# 打印环境变量以确认设置成功\n",
134
+ "print(os.environ.get('HF_ENDPOINT'))\n",
135
+ "\"\"\""
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 2,
141
+ "id": "108d9c3c-ae4d-4110-a532-a40a6fe1f9df",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "from transformers import AutoTokenizer, AutoModel\n",
146
+ "from tokenizers import Tokenizer\n",
147
+ "from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
148
+ "from transformers import AutoModelForSequenceClassification\n",
149
+ "from transformers import DataCollatorWithPadding"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 6,
155
+ "id": "bcdc9f7a-1ea5-4647-b87e-ac72ddf17818",
156
+ "metadata": {},
157
+ "outputs": [
158
+ {
159
+ "data": {
160
+ "application/vnd.jupyter.widget-view+json": {
161
+ "model_id": "c2e31c61549449e78a4e1fe0e884233f",
162
+ "version_major": 2,
163
+ "version_minor": 0
164
+ },
165
+ "text/plain": [
166
+ "tokenizer_config.json: 0%| | 0.00/580 [00:00<?, ?B/s]"
167
+ ]
168
+ },
169
+ "metadata": {},
170
+ "output_type": "display_data"
171
+ },
172
+ {
173
+ "data": {
174
+ "application/vnd.jupyter.widget-view+json": {
175
+ "model_id": "da2009ca96634f759f052a9a4ff7e41e",
176
+ "version_major": 2,
177
+ "version_minor": 0
178
+ },
179
+ "text/plain": [
180
+ "vocab.json: 0%| | 0.00/642k [00:00<?, ?B/s]"
181
+ ]
182
+ },
183
+ "metadata": {},
184
+ "output_type": "display_data"
185
+ },
186
+ {
187
+ "data": {
188
+ "application/vnd.jupyter.widget-view+json": {
189
+ "model_id": "b6b6ec58d8cb4878aa2e0786ff0bbcf4",
190
+ "version_major": 2,
191
+ "version_minor": 0
192
+ },
193
+ "text/plain": [
194
+ "merges.txt: 0%| | 0.00/323k [00:00<?, ?B/s]"
195
+ ]
196
+ },
197
+ "metadata": {},
198
+ "output_type": "display_data"
199
+ },
200
+ {
201
+ "data": {
202
+ "application/vnd.jupyter.widget-view+json": {
203
+ "model_id": "5dbb5171eb6242bdbded42c87ef46c27",
204
+ "version_major": 2,
205
+ "version_minor": 0
206
+ },
207
+ "text/plain": [
208
+ "special_tokens_map.json: 0%| | 0.00/473 [00:00<?, ?B/s]"
209
+ ]
210
+ },
211
+ "metadata": {},
212
+ "output_type": "display_data"
213
+ }
214
+ ],
215
+ "source": [
216
+ "#set tokenizer\n",
217
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/dna_gpt2_v0\")\n",
218
+ "tokenizer.pad_token = tokenizer.eos_token"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 3,
224
+ "id": "0e930ef5-865a-4528-84b5-ddae6d710a99",
225
+ "metadata": {},
226
+ "outputs": [
227
+ {
228
+ "name": "stderr",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/dna_gpt2_v0 and are newly initialized: ['score.weight']\n",
232
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
233
+ ]
234
+ },
235
+ {
236
+ "data": {
237
+ "text/plain": [
238
+ "GPT2ForSequenceClassification(\n",
239
+ " (transformer): GPT2Model(\n",
240
+ " (wte): Embedding(30000, 768)\n",
241
+ " (wpe): Embedding(1024, 768)\n",
242
+ " (drop): Dropout(p=0.1, inplace=False)\n",
243
+ " (h): ModuleList(\n",
244
+ " (0-11): 12 x GPT2Block(\n",
245
+ " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
246
+ " (attn): GPT2SdpaAttention(\n",
247
+ " (c_attn): Conv1D(nf=2304, nx=768)\n",
248
+ " (c_proj): Conv1D(nf=768, nx=768)\n",
249
+ " (attn_dropout): Dropout(p=0.1, inplace=False)\n",
250
+ " (resid_dropout): Dropout(p=0.1, inplace=False)\n",
251
+ " )\n",
252
+ " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
253
+ " (mlp): GPT2MLP(\n",
254
+ " (c_fc): Conv1D(nf=3072, nx=768)\n",
255
+ " (c_proj): Conv1D(nf=768, nx=3072)\n",
256
+ " (act): NewGELUActivation()\n",
257
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
258
+ " )\n",
259
+ " )\n",
260
+ " )\n",
261
+ " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
262
+ " )\n",
263
+ " (score): Linear(in_features=768, out_features=2, bias=False)\n",
264
+ ")"
265
+ ]
266
+ },
267
+ "execution_count": 3,
268
+ "metadata": {},
269
+ "output_type": "execute_result"
270
+ }
271
+ ],
272
+ "source": [
273
+ "#set model\n",
274
+ "model = AutoModelForSequenceClassification.from_pretrained('dnagpt/dna_gpt2_v0', num_labels=2)\n",
275
+ "model.config.pad_token_id = model.config.eos_token_id\n",
276
+ "model"
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "markdown",
281
+ "id": "bd14794b-e507-4c1d-be47-0e0144835f18",
282
+ "metadata": {},
283
+ "source": [
284
+ "在生物学中,**启动子(promoter)** 是一段特定的DNA序列,它位于基因的上游(通常是5'端),并且是转录起始的关键调控元件。启动子的主要功能是为RNA聚合酶提供结合位点,并招募其他转录因子,以启动基因转录过程。以下是关于启动子的一些重要概念和特点:\n",
285
+ "\n",
286
+ "### 启动子的功能\n",
287
+ "\n",
288
+ "1. **转录起始**:\n",
289
+ " - 启动子是基因表达的第一步,它决定了何时、何地以及多频繁地进行转录。\n",
290
+ " \n",
291
+ "2. **调控基因表达**:\n",
292
+ " - 不同类型的启动子可以调节不同组织或细胞类型中的基因表达水平。例如,在某些细胞中高度活跃而在其他细胞中不活跃。\n",
293
+ "\n",
294
+ "3. **与转录因子和其他蛋白质相互作用**:\n",
295
+ " - 启动子区域通常包含多个顺式作用元件(cis-regulatory elements),这些元件可以与特定的转录因子或其他调控蛋白结合,进一步精细调整基因表达。\n",
296
+ " \n",
297
+ " \n",
298
+ "在生物学中,启动子(promoter)序列的二分类问题通常是指将DNA序列分为两类:**启动子序列**和**非启动子序列**。这种分类任务的目标是通过机器学习或生物信息学方法来预测给定的DNA序列是否具有启动子功能。\n",
299
+ "\n",
300
+ "### 二分类问题中的两个类别\n",
301
+ "\n",
302
+ "1. **启动子序列(Promoter Sequences)**:\n",
303
+ " - 这些序列包含能够指导转录起始的调控元件,通常是位于基因5'端上游区域的一段DNA。\n",
304
+ " - 启动子序列可能含有特定的保守基序(motifs),如TATA盒、CAAT盒等,这些基序对于RNA聚合酶及其辅助因子的结合至关重要。\n",
305
+ "\n",
306
+ "2. **非启动子序列(Non-Promoter Sequences)**:\n",
307
+ " - 这类序列指的是那些不具有启动子功能的DNA片段。它们可以来自基因内部(编码区或内含子)、基因间区域(intergenic regions)或其他调控元件(如增强子、沉默子等),但明确不是启动子。\n",
308
+ " - 非启动子序列不具备启动转录的能力,或者至少在自然条件下不会作为主要的转录起始点。\n",
309
+ "\n",
310
+ "### 启动子的研究意义\n",
311
+ "\n",
312
+ "理解启动子的工作机制对于揭示基因表达调控网络非常重要。这不仅有助于基础科学研究,而且对于医学应用也有着深远的影响,比如开发新的治疗策略来纠正异常的基因表达模式,或者利用合成生物学设计定制化的基因表达系统。\n"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 8,
318
+ "id": "aee08f3f-6cda-4975-8cb9-9a7bfacb9eac",
319
+ "metadata": {},
320
+ "outputs": [
321
+ {
322
+ "data": {
323
+ "application/vnd.jupyter.widget-view+json": {
324
+ "model_id": "82d2ec71cf6648469040897d9174a55f",
325
+ "version_major": 2,
326
+ "version_minor": 0
327
+ },
328
+ "text/plain": [
329
+ "README.md: 0%| | 0.00/314 [00:00<?, ?B/s]"
330
+ ]
331
+ },
332
+ "metadata": {},
333
+ "output_type": "display_data"
334
+ },
335
+ {
336
+ "data": {
337
+ "application/vnd.jupyter.widget-view+json": {
338
+ "model_id": "40183e0714ea4155a2c0772fb7c72a00",
339
+ "version_major": 2,
340
+ "version_minor": 0
341
+ },
342
+ "text/plain": [
343
+ "train-00000-of-00001.parquet: 0%| | 0.00/8.66M [00:00<?, ?B/s]"
344
+ ]
345
+ },
346
+ "metadata": {},
347
+ "output_type": "display_data"
348
+ },
349
+ {
350
+ "data": {
351
+ "application/vnd.jupyter.widget-view+json": {
352
+ "model_id": "8e5ebe15df194e3c8bf5811777755947",
353
+ "version_major": 2,
354
+ "version_minor": 0
355
+ },
356
+ "text/plain": [
357
+ "Generating train split: 0%| | 0/59195 [00:00<?, ? examples/s]"
358
+ ]
359
+ },
360
+ "metadata": {},
361
+ "output_type": "display_data"
362
+ }
363
+ ],
364
+ "source": [
365
+ "from datasets import load_dataset\n",
366
+ "# 1. load ~11k samples from promoters prediction dataset\n",
367
+ "dataset = load_dataset(\"dnagpt/dna_promoter_300\")['train'].train_test_split(test_size=0.1)"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 9,
373
+ "id": "6ac9fe5b-2175-42d8-949c-cb12bc8fb65c",
374
+ "metadata": {},
375
+ "outputs": [
376
+ {
377
+ "data": {
378
+ "text/plain": [
379
+ "DatasetDict({\n",
380
+ " train: Dataset({\n",
381
+ " features: ['sequence', 'label'],\n",
382
+ " num_rows: 53275\n",
383
+ " })\n",
384
+ " test: Dataset({\n",
385
+ " features: ['sequence', 'label'],\n",
386
+ " num_rows: 5920\n",
387
+ " })\n",
388
+ "})"
389
+ ]
390
+ },
391
+ "execution_count": 9,
392
+ "metadata": {},
393
+ "output_type": "execute_result"
394
+ }
395
+ ],
396
+ "source": [
397
+ "dataset"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 10,
403
+ "id": "b5025f95-ca5d-42b1-95e1-55495f77d009",
404
+ "metadata": {},
405
+ "outputs": [
406
+ {
407
+ "data": {
408
+ "text/plain": [
409
+ "{'sequence': 'CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGGCCCGCTGCACGAGAGACCATTATGGTGATCCGCCCGCCTGACACTACTGATATGTTGGGATTACAGGCGTGAGCCACGGCGCCCGGCGGGCAAGACACCCTCAGAGCACAGGGTGAATCCATGGTTAAAATACAGCGGGAAGTTAGCGCCGAAGTCGCCGTGTAATTTGTGCGCGGTTCAGGTTCATGTATTCAGAATCATTTTACTAGGTTTAGGGCTCGCCGCTGCCTCAGTGGCTTTCAGGCGCTT',\n",
410
+ " 'label': 0}"
411
+ ]
412
+ },
413
+ "execution_count": 10,
414
+ "metadata": {},
415
+ "output_type": "execute_result"
416
+ }
417
+ ],
418
+ "source": [
419
+ "dataset[\"train\"][0]"
420
+ ]
421
+ },
422
+ {
423
+ "cell_type": "code",
424
+ "execution_count": 13,
425
+ "id": "ac999213-67b1-4294-8d92-80b8c6c68acd",
426
+ "metadata": {},
427
+ "outputs": [
428
+ {
429
+ "name": "stdout",
430
+ "output_type": "stream",
431
+ "text": [
432
+ "dna datasets mean token lenght 52.41266891891892 min token length 33 max token length 60\n"
433
+ ]
434
+ }
435
+ ],
436
+ "source": [
437
+ "token_len_list = []\n",
438
+ "for item in dataset[\"test\"]:\n",
439
+ " inputs = tokenizer.tokenize(item[\"sequence\"])\n",
440
+ " token_len_list.append( len(inputs) )\n",
441
+ "\n",
442
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
443
+ "min_len = min(token_len_list)\n",
444
+ "max_len = max(token_len_list)\n",
445
+ "\n",
446
+ "print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 14,
452
+ "id": "72a2dec3-043b-41e4-afd8-4dbd8c8fcbb0",
453
+ "metadata": {},
454
+ "outputs": [
455
+ {
456
+ "data": {
457
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABdnElEQVR4nO3deZyN9f//8eeZfTEzxjIbZowte2NLEyIUkQotiqylLBXSotVSREiopG9ZKh+lT4sP2UVCkrJlrNHImGEyjDH7zPX7Y36OTmO7zpwzZ4bH/XY7N3Nd1/v1vt7nOE7z7Hpf72MxDMMQAAAAAOCqubl6AAAAAABQ2hCkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpADAyUaPHi2LxVIs52rTpo3atGlj3V63bp0sFou+/PLLYjl/3759VbVq1WI5l73S0tL06KOPKiwsTBaLRcOGDXPq+c7//ScnJzv1PNe6vn37qkyZMq4eBgBYEaQAwIS5c+fKYrFYHz4+PoqIiFCHDh00ffp0nT171iHnSUhI0OjRo7V9+3aH9OdIJXlsV2P8+PGaO3euBg0apE8++USPPPJIoTbnw8+VHv8MraVBcQdrs9LT0zV69GitW7fO1UMBgCvycPUAAKA0Gjt2rKKjo5WTk6PExEStW7dOw4YN09SpU7V48WI1bNjQ2vbll1/WCy+8YKr/hIQEjRkzRlWrVlVMTMxV161cudLUeexxubF9+OGHys/Pd/oYimLt2rW6+eab9dprr12yTbdu3VSjRg3rdlpamgYNGqSuXbuqW7du1v2hoaFOHev1Jj09XWPGjJGkUhdSAVx/CFIAYIc777xTTZs2tW6PGjVKa9eu1V133aW7775bcXFx8vX1lSR5eHjIw8O5H7fp6eny8/OTl5eXU89zJZ6eni49/9U4ceKE6tate9k2DRs2tAnDycnJGjRokBo2bKhevXo5e4gAgFKAqX0A4CBt27bVK6+8oj///FOffvqpdf/F7pFatWqVWrZsqbJly6pMmTK64YYb9OKLL0oqmH7VrFkzSVK/fv2s08jmzp0rqeD/1NevX1/btm3TrbfeKj8/P2vtv++ROi8vL08vvviiwsLC5O/vr7vvvltHjx61aVO1alX17du3UO0/+7zS2C52j9S5c+f0zDPPqEqVKvL29tYNN9ygyZMnyzAMm3YWi0VDhw7VN998o/r168vb21v16tXT8uXLL/6C/8uJEyc0YMAAhYaGysfHRzfeeKPmzZtnPX5+Wtvhw4e1dOlS69iPHDlyVf1fzNq1a9WqVSv5+/urbNmyuueeexQXF3fFuj///FM1atRQ/fr1lZSUJEk6ffq0hg0bZn2datSooYkTJ9pc4Tty5IgsFosmT56s2bNnq3r16vL29lazZs20detWu5/HvzljLIsWLVLdunXl4+Oj+vXr6+uvv7Z5vxw5ckQVK1aUJI0ZM8b69zN69Gibfo4dO6Z7771XZcqUUcWKFTVy5Ejl5eXZtFm4cKGaNGmigIAABQYGqkGDBnrnnXcc9voAgMQVKQBwqEceeUQvvviiVq5cqccee+yibX7//XfdddddatiwocaOHStvb28dPHhQGzdulCTVqVNHY8eO1auvvqqBAweqVatWkqRbbrnF2sfff/+tO++8Uz169FCvXr2uOMXsjTfekMVi0fPPP68TJ05o2rRpat++vbZv3269cnY1rmZs/2QYhu6++259//33GjBggGJiYrRixQo9++yzOnbsmN5++22b9j/++KO++uorDR48WAEBAZo+fbq6d++u+Ph4lS9f/pLjysjIUJs2bXTw4EENHTpU0dHRWrRokfr27avTp0/r6aefVp06dfTJJ59o+PDhqly5sp555hlJsv7ybtbq1at15513qlq1aho9erQyMjI0Y8YMtWjRQr/++uslF904dOiQ
2rZtq3LlymnVqlWqUKGC0tPT1bp1ax07dkyPP/64IiMjtWnTJo0aNUrHjx/XtGnTbPpYsGCBzp49q8cff1wWi0WTJk1St27d9McffxT5qqAzxrJ06VI9+OCDatCggSZMmKCUlBQNGDBAlSpVsvZTsWJFvf/++4WmUP7zymBeXp46dOig5s2ba/LkyVq9erWmTJmi6tWra9CgQZIK/ifFQw89pHbt2mnixImSpLi4OG3cuFFPP/10kV4bALBhAACu2pw5cwxJxtatWy/ZJigoyGjUqJF1+7XXXjP++XH79ttvG5KMkydPXrKPrVu3GpKMOXPmFDrWunVrQ5Ixa9asix5r3bq1dfv77783JBmVKlUyUlNTrfu/+OILQ5LxzjvvWPdFRUUZffr0uWKflxtbnz59jKioKOv2N998Y0gyXn/9dZt29913n2GxWIyDBw9a90kyvLy8bPbt2LHDkGTMmDGj0Ln+adq0aYYk49NPP7Xuy87ONmJjY40yZcrYPPeoqCijc+fOl+3v306ePGlIMl577TXrvpiYGCMkJMT4+++/bcbr5uZm9O7d27rv/N//yZMnjbi4OCMiIsJo1qyZcerUKWubcePGGf7+/sb+/fttzvvCCy8Y7u7uRnx8vGEYhnH48GFDklG+fHmb+m+//daQZPzvf/+77PM4/35YtGjRJds4YywNGjQwKleubJw9e9a6b926dYYkm/fLxV7n8/r06WNIMsaOHWuzv1GjRkaTJk2s208//bQRGBho5ObmXva1AICiYmofADhYmTJlLrt6X9myZSVJ3377rd0LM3h7e6tfv35X3b53794KCAiwbt93330KDw/Xd999Z9f5r9Z3330nd3d3PfXUUzb7n3nmGRmGoWXLltnsb9++vapXr27dbtiwoQIDA/XHH39c8TxhYWF66KGHrPs8PT311FNPKS0tTevXr3fAs7ng+PHj2r59u/r27aty5crZjPf222+/6Ou6e/dutW7dWlWrVtXq1asVHBxsPbZo0SK1atVKwcHBSk5Otj7at2+vvLw8/fDDDzZ9Pfjggzb1568MXul1uhqOHktCQoJ27dql3r172yxf3rp1azVo0MD0+J544gmb7VatWtk877Jly+rcuXNatWqV6b4BwAyCFAA4WFpamk1o+bcHH3xQLVq00KOPPqrQ0FD16NFDX3zxhalQValSJVMLS9SsWdNm22KxqEaNGkW6P+hq/Pnnn4qIiCj0etSpU8d6/J8iIyML9REcHKyUlJQrnqdmzZpyc7P9z9qlzlNU5/u74YYbCh2rU6eOkpOTde7cOZv9Xbp0UUBAgFasWKHAwECbYwcOHNDy5ctVsWJFm0f79u0lFdz/9U//fp3OB5krvU5Xw9FjOf9a/XMVxPMutu9yfHx8Ck3F/Pf7Y/DgwapVq5buvPNOVa5cWf3797/q++wAwAzukQIAB/rrr7905syZy/6C6Ovrqx9++EHff/+9li5dquXLl+vzzz9X27ZttXLlSrm7u1/xPGbua7pal/rS4Ly8vKsakyNc6jzGvxamKI26d++uefPm6bPPPtPjjz9ucyw/P1+33367nnvuuYvW1qpVy2bbma9TSRrLv13N+zAkJETbt2/XihUrtGzZMi1btkxz5sxR7969bRYfAYCiIkgBgAN98sknkqQOHTpctp2bm5vatWundu3aaerUqRo/frxeeuklff/992rfvv0lQ429Dhw4YLNtGIYOHjxocyN/cHCwTp8+Xaj2zz//VLVq1azbZsYWFRWl1atX6+zZszZXpfbu3Ws97ghRUVHauXOn8vPzba5KOfo8/zyfJO3bt6/Qsb1796pChQry9/e32f/WW2/Jw8PDupDGww8/bD1WvXp1paWlWa/6uJKjx3L+tTp48GChY//e56j3vZeXl7p06aIuXbooPz9fgwcP1gcffKBXXnnF9FUwALgUpvYBgIOsXbtW48aNU3R0tHr27HnJdqdOnSq07/wX22ZlZUmS9Zfw
iwUbe8yfP9/mvq0vv/xSx48f15133mndV716df3000/Kzs627luyZEmhZdLNjK1Tp07Ky8vTzJkzbfa//fbbslgsNucvik6dOikxMVGff/65dV9ubq5mzJihMmXKqHXr1g45z3nh4eGKiYnRvHnzbF6H3bt3a+XKlerUqVOhGovFotmzZ+u+++5Tnz59tHjxYuuxBx54QJs3b9aKFSsK1Z0+fVq5ubkOHf/lOHosERERql+/vubPn6+0tDTr/vXr12vXrl02bf38/Kznsdfff/9ts+3m5mb9Hwbn/30BgCNwRQoA7LBs2TLt3btXubm5SkpK0tq1a7Vq1SpFRUVp8eLF8vHxuWTt2LFj9cMPP6hz586KiorSiRMn9N5776ly5cpq2bKlpIJQU7ZsWc2aNUsBAQHy9/dX8+bNFR0dbdd4y5Urp5YtW6pfv35KSkrStGnTVKNGDZsl2h999FF9+eWX6tixox544AEdOnRIn376qc3iD2bH1qVLF91222166aWXdOTIEd14441auXKlvv32Ww0bNqxQ3/YaOHCgPvjgA/Xt21fbtm1T1apV9eWXX2rjxo2aNm3aZe9Zs9dbb72lO++8U7GxsRowYIB1+fOgoKBC3310npubmz799FPde++9euCBB/Tdd9+pbdu2evbZZ7V48WLddddd6tu3r5o0aaJz585p165d+vLLL3XkyBFVqFDBYWP/73//a71a9099+vRxyljGjx+ve+65Ry1atFC/fv2UkpKimTNnqn79+jbhytfXV3Xr1tXnn3+uWrVqqVy5cqpfv77q169/1ed69NFHderUKbVt21aVK1fWn3/+qRkzZigmJsZ6zxwAOIRL1wwEgFLm/PLn5x9eXl5GWFiYcfvttxvvvPOOzTLb5/17+fM1a9YY99xzjxEREWF4eXkZERERxkMPPVRouelvv/3WqFu3ruHh4WGz3Hjr1q2NevXqXXR8l1r+/D//+Y8xatQoIyQkxPD19TU6d+5s/Pnnn4Xqp0yZYlSqVMnw9vY2WrRoYfzyyy+F+rzc2P69/LlhGMbZs2eN4cOHGxEREYanp6dRs2ZN46233jLy8/Nt2kkyhgwZUmhMl1qW/d+SkpKMfv36GRUqVDC8vLyMBg0aXHSJdkctf24YhrF69WqjRYsWhq+vrxEYGGh06dLF2LNnj02bfy5/fl56errRunVro0yZMsZPP/1kGEbB6zRq1CijRo0ahpeXl1GhQgXjlltuMSZPnmxkZ2cbhnFhyfG33nqr0BgvNr5/O/9+uNRjw4YNThvLwoULjdq1axve3t5G/fr1jcWLFxvdu3c3ateubdNu06ZNRpMmTQwvLy+bfvr06WP4+/sXOte//319+eWXxh133GGEhIQYXl5eRmRkpPH4448bx48fv+xrAwBmWQzjGriDFwAAlDoxMTGqWLEiS5UDKJW4RwoAADhVTk5OoXur1q1bpx07dqhNmzauGRQAFBFXpAAAgFMdOXJE7du3V69evRQREaG9e/dq1qxZCgoK0u7du1W+fHlXDxEATGOxCQAA4FTBwcFq0qSJ/u///k8nT56Uv7+/OnfurDfffJMQBaDU4ooUAAAAAJjEPVIAAAAAYBJBCgAAAABM4h4pSfn5+UpISFBAQIAsFourhwMAAADARQzD0NmzZxURESE3t0tfdyJISUpISFCVKlVcPQwAAAAAJcTRo0dVuXLlSx4nSEkKCAiQVPBiBQYGung0AADgmlW7tnT8uBQeLu3da758Zm0dP3tc4QHh2jvUfD2AK0tNTVWVKlWsGeFSCFKSdTpfYGAgQQoAADjP+WlCbm6SHb9zuPm4STkFf/I7C+BcV7rlh8UmAAAAAMAkghQAAAAAmESQAgAAAACTuEcKAACguGzdKuXlSe7u9pU/tlV5Rp7cLfbVA3AcghQAAEBxCQ8vWnlA0eoBOA5T+wAAAADAJIIUAAAAAJjE1D4AAIDiMnu2lJYmlSkjDRxovnzbbKVlp6mM
VxkNbGK+HoDjWAzDMFw9CFdLTU1VUFCQzpw5w5fbAQAA56lcWTp2TKpUSfrrL/PlUyvr2NljqhRQSX+NMF8P4MquNhswtQ8AAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHDdqFVLCgqSQkPtKy9fS0E+QQr1t68egONYDMMwXD0IV7vaby8GAAAoTvHx8UpOTnZK3xUqVFBkZKRT+gZKs6vNBlyRAgAAKIHi4+NVu04dZaSnO6V/Xz8/7Y2LI0wBdiJIAQAAlEDJycnKSE/X4MmzFVG9lkP7Tji0X++NHKjk5GSCFGAnghQAAEAJFlG9lqLrxbh6GAD+hSAFAABQXHr2lJKTpQoVpM8+M13+7sbHdDbrbwV4l9eQFh86YYAArhZBCgAAoLisXy8dOyZVqmRXedyJjUrJSFCwb4SDBwbALL5HCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQXB57TDpzRgoKsqv8thq9lZGdKl+vQAcPDIBZBCkAAIDi8tprRSrv3uAFBw0EQFExtQ8AAABF17evZLFITzxR+NiQIQXH+vYt7lFdnfNj/+ejY8cLx48ckQYMkKKjJV9fqXr1glCcnX11/RuGdOedBf1+843tsaeekpo0kby9pZgYxzwfFAuuSAEAAMAxqlSRFi6U3n67IHBIUmamtGCBFBnp2rFdSceO0pw5F7a9vS/8vHevlJ8vffCBVKOGtHt3wTTNc+ekyZOv3Pe0aQUh6lL695e2bJF27rR7+Ch+XJECAACAYzRuXBCmvvrqwr6vvioIUY0a2bbNz5cmTLhwlefGG6Uvv7xwPC/P9irQDTdI77xj20ffvtK99xaEmfBwqXz5gqtfOTnmx+7tLYWFXXgEB184dj5k3XGHVK2adPfd0siRts/zUrZvl6ZMkT7++OLHp08vGHO1aubHDJciSAEAABSXypULrkxUrmxX+dCv66rngrIa+nVdBw/Mgfr3t72y8/HHUr9+hdtNmCDNny/NmiX9/rs0fLjUq5e0fn3B8fz8gtdp0SJpzx7p1VelF1+UvvjCtp/vv5cOHSr4c948ae7cgsd5o0dLVateedzr1kkhIQWBbdAg6e+/L9/+zBmpXLnLt0lPlx5+WHr33YJwhmsKU/sAAADgOL16SaNGSX/+WbC9cWPBdL916y60ycqSxo+XVq+WYmML9lWrJv34Y8H0udatJU9PacyYCzXR0dLmzQVB6oEHLuwPDpZmzpTc3aXataXOnaU1awqm3klShQoF9zRdTseOUrduBec4dKggsN15Z8H53N0Ltz94UJox48rT+oYPl265Rbrnnsu3Q6lEkAIAAIDjVKxYEGbmzi1YZKFz54Iw808HDxZcrbn9dtv92dm2UwDffbfgilZ8vJSRUXD83wsy1KtnG3bCw6Vduy5sDx1a8LicHj0u/NyggdSwYUH4WrdOatfOtu2xYwXB6/77L4S1i1m8WFq7Vvrtt8ufG6UWQQoAAACO1b//hfDy7ruFj6elFfy5dKlUqZLtsfOLPCxcWHAf0pQpBVetAgKkt94qWJThnzw9bbctloJpgUVRrVpB+Dt40DZIJSRIt91WcJVp9uzL97F2bcHVrbJlbfd37y61amV7hQ6lEkEKAAAAjtWxY8HVI4tF6tCh8PG6dQsCU3x8wTS+i9m4sSCwDB58Yd+hQ84Z77/99VfBPVLh4Rf2HTtWEKKaNCm4B8ztCksNvPCC9OijtvsaNChY0bBLF8ePGcWOIAUAAADHcneX4uIu/PxvAQEFV5uGDy+4etSyZcHiDRs3SoGBUp8+Us2aBYtRrFhRcO/SJ59IW7cW/GzGzJnS118X3Dd1MWlpBfdide9esCDEoUPSc88VLHN+PgQeOya1aSNFRRXcF3Xy5IX684tI
HDtWcPVq/nzpppsurP73b5GRts/h4MGCMSQmFkxf3L69YH/dupKXl7nnimJFkAIAAIDjBQZe/vi4cQX3U02YIP3xR8EUuMaNCxZ6kKTHHy+4v+jBBwuubD30UMHVqWXLzI0jOfnyV7Lc3Qu+v2nePOn0aSkiomCZ83HjLkwzXLWqIPAcPFh4xUXDKPgzJ0fat6/g3i8zHn30wkqF0oV7xA4fvrrVBuEyFsM4/7d//UpNTVVQUJDOnDmjwCv9owcAALBX5coFVy4qVSqYPnYZv/76q5o0aaLXv16n6HoxkgqWP0/JSFCwb4Rmdt1j9zAO/75dL3dto23btqlx48Z29wNci642G/A9UgAAAABgEkEKAAAAAEziHikAAIDi8umnBV9Ge/7eG5MG3/KBcvOy5eHOIgSAqxGkAAAAikubNkUqrxvayjHj+P/izq+s5wQVKlRQZGSk0/oHXI0gBQAAcJ05fTJJFotFvXr1cto5fP38tDcujjCFaxZBCgAA4DqTnnpGhmGo37jpql6/ocP7Tzi0X++NHKjk5GSCFK5ZBCkAAIDism7dhXuk7Jjmtydpg/UeKUdM8wuPrmFdWh2AOQQpAACA4tKr11V/j9TFvLfpcYd8jxSAomP5cwAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATOJ7pAAAAOwUHx+v5OTkq25fPydHXpKyc3K0+9dfL9s2Li6uiKMD4EwEKQAAADvEx8erdp06ykhPv+qao5IqSzpx4oSaNGlyVTXZWdn2DRCAUxGkAAAA7JCcnKyM9HQNnjxbEdVrXVXNvH/8/PoV2u5Yv0qLpr2h3Nxc676ZXfeYHygApyBIAQAAFEFE9VqKrhfj8H4TDu13eJ8AHIfFJgAAAADAJIIUAAAAAJjE1D4AAIBiUn/Gm/I6m6rsgEDtfvIF0/X/3fWmMrJT5esVqO4NzNcDcByCFAAAQDGp8cV8+SUlKD00wq4g9f3B+UrJSFCwbwRBCnAxpvYBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1wapPLy8vTKK68oOjpavr6+ql69usaNGyfDMKxtDMPQq6++qvDwcPn6+qp9+/Y6cOCATT+nTp1Sz549FRgYqLJly2rAgAFKS0sr7qcDAAAA4Drh0iA1ceJEvf/++5o5c6bi4uI0ceJETZo0STNmzLC2mTRpkqZPn65Zs2Zpy5Yt8vf3V4cOHZSZmWlt07NnT/3+++9atWqVlixZoh9++EEDBw50xVMCAAAAcB1w6Rfybtq0Sffcc486d+4sSapatar+85//6Oeff5ZUcDVq2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjh+Li4rR8+XJt3bpVTZs2lSTNmDFDnTp10uTJkxUREeGaJwcAAPAvJ25qIe+Uv5UVXN6u+johLXQ2628FeNtXD8BxXBqkbrnlFs2ePVv79+9XrVq1tGPHDv3444+aOnWqJOnw4cNKTExU+/btrTVBQUFq3ry5Nm/erB49emjz5s0qW7asNURJUvv27eXm5qYtW7aoa9euhc6blZWlrKws63ZqaqoTnyUAAECBTVM+LFL9kBZFqwfgOC4NUi+88IJSU1NVu3Ztubu7Ky8vT2+88YZ69uwpSUpMTJQkhYaG2tSFhoZajyUmJiokJMTmuIeHh8qVK2dt828TJkzQmDFjHP10AAAAAFwnXHqP1BdffKHPPvtMCxYs0K+//qp58+Zp8uTJmjdvnlPPO2rUKJ05c8b6OHr0qFPPBwClwboj62QZY9HpzNOSpLnb56rsm2VdOiYAAEoqlwapZ599Vi+88IJ69OihBg0a6JFHHtHw4cM1YcIESVJYWJgkKSkpyaYuKSnJeiwsLEwnTpywOZ6bm6tTp05Z2/ybt7e3AgMDbR4AUJL1/aavLGMsemLJE4WODVk6RJYxFvX9pq9Dz/lg
vQe1/8n9Du3zalWdVlWWMRabx5s/vmk9vi95n26bd5tCJ4fK53UfVXunml5e+7Jy8nIu22/8mXh1XtBZfm/4KeStED278lnl5ufatFl3ZJ0af9BY3q97q8b0Gpq7fa4zniIAoJRz6dS+9PR0ubnZZjl3d3fl5+dLkqKjoxUWFqY1a9YoJiZGUsH9TFu2bNGgQYMkSbGxsTp9+rS2bdumJk2aSJLWrl2r/Px8NW/evPieDAA4WZXAKlq4e6He7vC2fD19JUmZuZlasHuBIoMiHX4+X09f63lcYWybsXqsyWPW7QCvAOvPnu6e6t2wtxqHN1ZZn7LakbRDj/3vMeUb+RrfbvxF+8vLz1PnBZ0VViZMmwZs0vGzx9X7m97ydPe01hxOOazOCzrriSZP6LNun2nN4TV6dPGjCi8Trg41Ojj3CeO60LZ3F/kkn1RmhYpaO/9/puvfWNNFZzJPKsinol5qZ74egOO4NEh16dJFb7zxhiIjI1WvXj399ttvmjp1qvr37y9JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XklSnTh117NhRjz32mGbNmqWcnBwNHTpUPXr0YMU+ANeUxuGNdSjlkL6K+0o9GxbcS/pV3FeKDIpUdNlom7b5Rr4m/jhRs3+drcS0RNUqX0uv3PqK7qt7n7XNdwe+07Dlw3Q09ahurnyz+tzYx6aPudvnatjyYTr9wmlJ0qFThzRi5Qj99NdPOpd9TnUq1tGEdhPUvtqFBYGqTquqgU0G6uCpg1q0Z5GCfYL18q0va2AT819JEeAdoLAyF59ZUC24mqoFV7NuR5WN0roj67QhfsMl+1t5aKX2nNyj1Y+sVmiZUMWExWjcbeP0/OrnNbrNaHm5e2nWL7MUXTZaUzpMkSTVqVhHP8b/qLd/epsgBYcIPHxIfkkJSj9r30JXx1MPKSUjQenZLJQFuJpLp/bNmDFD9913nwYPHqw6depo5MiRevzxxzVu3Dhrm+eee05PPvmkBg4cqGbNmiktLU3Lly+Xj4+Ptc1nn32m2rVrq127durUqZNatmyp2bNnu+IpAYBT9Y/prznb51i3P/7tY/WL6Veo3YQNEzR/53zN6jxLvw/+XcNvHq5eX/XS+iPrJUlHzxxVt8+7qUutLtr++HY92uhRvbD6hcueOy07TZ1qdNKa3mv02+O/qWP1juryny6KPxNv027K5ilqGtFUvz3+mwY3G6xBSwdpX/I+6/E2c9tc1TTEN398U+UnlVejDxrprY1vFZqC908HTx3U8oPL1Tqq9SXbbP5rsxqENFBomQsLGHWo3kGpWan6/cTv1jb/DIbn22z+a/MVxwsAuL649IpUQECApk2bpmnTpl2yjcVi0dixYzV27NhLtilXrpwWLFjghBECQMnSq2EvjVozSn+e/lOStPHoRi28b6HWHVlnbZOVm6XxP47X6kdWK7ZKrKSCKzg/xv+oD7Z9oNZVW+v9X95X9XLVrVdebqhwg3ad2KWJGyde8tw3ht2oG8NutG6PaztOX+/9Wov3LdbQm4Za93eq2UmDmw2WJD3f4nm9/dPb+v7I97qhwg2SpMigSIWXCb/s83yq+VNqHN5Y5XzLadPRTRq1ZpSOpx3X1A5Tbdrd8tEt+vX4r8rKy9LAxgM19rZL/7ciMS3RJkRJsm4npiVeaONfuE1qVqoycjJcOtURAFCyuDRIAQDMqehfUZ1rddbc7XNlyFDnmp1Vwa+CTZuDpw4qPSddt39yu83+7LxsNQpvJEmKS45T80q295HGVo697LnTstM0et1oLT2wVMfPHldufq4ycjMKXZFqGNLQ+rPFYlFYmTCdOHdhUaD5Xedf8XmOiB1xob/QhvJy99LjSx7XhHYT5O3hbT32+X2f62z2We1I3KFnVz2ryZsm67kWz12xfwAAioogBQClTP+Y/hq6rOAK0Lud3i10PC07TZK09OGlqhRYyeaYt7t3ofZXa+TKkVr1xypNvn2yapSr
IV9PX933xX3Kzsu2aefp7mmzbZFF+Ua+3eeVpOaVmis3P1dHTh+xXtmSpCpBVSRJdSvWVZ6Rp4H/G6hnYp+Ru5t7oT7CyoTp52M/2+xLSkuyHjv/Z9K5pEJtAr0DuRoFALBBkAKAUqZjjY7KzsuWRRZ1qF54AYS6FevK291b8Wfi1brqxe8ZqlOhjhbvW2yz76e/frrseTce3ai+N/ZV1zpdJRUEtiOnj9j3JEzanrhdbhY3hfiHXLJNvpGvnPwc5Rv5clfhIBVbOVZvbHhDJ86dsPaz6o9VCvQOVN2Kda1tvjv4nU3dqj9WXfFqHQDg+kOQAoBSxt3NXXFD4qw//1uAd4BG3jJSw1cMV76Rr5aRLXUm64w2xm9UoHeg+sT00RNNn9CUzVP07Mpn9WjjR7Xt+DbN3TH3suetWa6mvtr7lbrc0EUWWfTK96/YdaWp99e9VSmgkia0n3DR45uPbtaWY1t0W9XbFOAdoM1HN2v4iuHq1bCXgn2DJUmf7fxMnu6eahDSQN4e3vol4ReNWjNKD9Z70HpF7Ou4rzVqzSjtHbpXknRH9TtUt2JdPfL1I5rUfpIS0xL18tqXNaTZEOt0wSeaPqGZW2fquVXPqX+j/lp7eK2++P0LLX14qennCQC4thGkAKAUCvS+/BeJj7ttnCr6VdSEHyfoj5Q/VNanrBqHN9aLrV6UVLDgw38f+K+GrxiuGT/P0E2VbtL4tuPVf3H/S/Y5tcNU9f+2v2756BZV8Kug51s8r9Qs80swx5+Jl5vl0ovGent4a+HuhRq9brSy8rIUXTZaw28ebnPflIebhyZunKj9f++XYRiKKhuloc2GanjscGubM1lntO/vC6sFuru5a8lDSzRo6SDFfhQrfy9/9bmxj80CFdHB0Vr68FINXzFc72x5R5UDK+v/7v4/lj4HABRiMQzDcPUgXC01NVVBQUE6c+aMAgMv/8sJAACAJP36669q0qSJXv96naLrxVxVzb2t6hZ8j1RohL7ZsOeybTcu/kLvjRyoFz/9TvVuukWSNPTrukrJSFCwb4Rmdr18vdm+Henw79v1ctc22rZtmxo3buzw/gFnutpswBUpAACAYrJryHPyTE9Tjl8Zu+q7NXhOmTlp8vG0rx6A4xCkAAAAismhHn2LVN+2RtHqATjOpSepAwAAAAAuiiAFAAAAACYxtQ8AAKCY+JxIlCU/T4abuzJDwkzXp2QkKt/Ik5vFXcG+5usBOA5BCgAAoJh07N72qlftu5hXlrd1yKp9AIqOqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGCSh6sHAAAAcL1YM+9bueXlKt/dvl/BXmz3rfLzc+Xmxq9wgKvxrxAAAKCYnK1Ws0j1EYFFqwfgOEztAwAAAACTCFIAAAAAYBJT+wAAAIpJ1P8WySMjQ7m+vvqzy/2m6zceWaTs3Ax5efiqRVXz9QAchyAFAABQTBpNek1+SQlKD42wK0j957fXlJKRoGDfCIIU4GJM7QMAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYxBfyAgAAFJOMiiE2f5pV1jfE5k8ArkOQAgAAKCYrvlpXpPrXOxatHoDjEKQAAMA1LT4+XsnJyQ7vNy4uzuF9Aig9CFIAAOCaFR8fr9p16igjPd1p58jOynZa3wBKLoIUAAC4ZiUnJysjPV2DJ89WRPVaDu17x/pVWjTtDeXm5jq0XwClA0EKAABc8yKq11J0vRiH9plwaL/pmmavDJP3mRRlBQVr67hppus/+nmY0rJSVMY7WANuMl8PwHEIUgAAAMWk0rqV8ktKUHpohLbaUf/bsZVKyUhQsG+Ew8cGwBy+RwoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACY
RJACAAAAAJMIUgAAAABgEl/ICwAAUEyO3NVdXmdOKzuorF31t1TtrnPZp+XvZV89AMchSAEAABST7c+PK1L9w42KVg/AcZjaBwAAAAAmEaQAAAAAwCSCFAAAAACYxD1SAAAAxaRzh2byO5Go9JAwLV2x1XT9yCXNlJKeqGC/ME2+y3w9AMfhihQAAEAx8Uw/J89zZ+WZfs6u+sycc8rMPavMHPvqATgOQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHC9+HnsVLlnZirPx8eu+v43TVVOXqY83e2rB+A4BCkAAIBiknBbxyLVN65UtHoAjsPUPgAAAAAwiSAFAAAAACYxtQ8AAKCYBO/eLvecbOV5eimlfozp+sOntis3L1se7l6KLme+HoDjEKQAAACKSetBD8svKUHpoRH6ZsMe0/VT1j+slIwEBftGaGZX8/UAHIcgBQAAAKeIi4tzSr8VKlRQZGSkU/oGrhZBCgAAAA51+mSSLBaLevXq5ZT+ff38tDcujjAFlyJIAQAAwKHSU8/IMAz1Gzdd1es3dGjfCYf2672RA5WcnEyQgksRpAAAAOAU4dE1FF0vxtXDAJyC5c8BAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAExi1T4AAIBismTZFkmGJItd9W/dtUWGDFnsrAfgOAQpAACAYpJbJqBI9b6eRasH4DhM7QMAAAAAkwhSAAAAAGCSy4PUsWPH1KtXL5UvX16+vr5q0KCBfvnlF+txwzD06quvKjw8XL6+vmrfvr0OHDhg08epU6fUs2dPBQYGqmzZshowYIDS0tKK+6kAAABcVu2PZ6rB9Amq/fFMu+q/i5up/+6coO/i7KsH4DguDVIpKSlq0aKFPD09tWzZMu3Zs0dTpkxRcHCwtc2kSZM0ffp0zZo1S1u2bJG/v786dOigzMxMa5uePXvq999/16pVq7RkyRL98MMPGjhwoCueEgAAwCXVnvOeGsycqNpz3rOr/ru97+mr3RP13V776gE4jksXm5g4caKqVKmiOXPmWPdFR0dbfzYMQ9OmTdPLL7+se+65R5I0f/58hYaG6ptvvlGPHj0UFxen5cuXa+vWrWratKkkacaMGerUqZMmT56siIiI4n1SAAAAAK55Lr0itXjxYjVt2lT333+/QkJC1KhRI3344YfW44cPH1ZiYqLat29v3RcUFKTmzZtr8+bNkqTNmzerbNmy1hAlSe3bt5ebm5u2bNly0fNmZWUpNTXV5gEAAAAAV8ulQeqPP/7Q+++/r5o1a2rFihUaNGiQnnrqKc2bN0+SlJiYKEkKDQ21qQsNDbUeS0xMVEhIiM1xDw8PlStXztrm3yZMmKCgoCDro0qVKo5+agAAAACuYS4NUvn5+WrcuLHGjx+vRo0aaeDAgXrsscc0a9Ysp5531KhROnPmjPVx9OhRp54PAAAAwLXFpUEqPDxcdevWtdlXp04dxcfHS5LCwsIkSUlJSTZtkpKSrMfCwsJ04sQJm+O5ubk6deqUtc2/eXt7KzAw0OYBAAAAAFfLpUGqRYsW2rdvn82+/fv3KyoqSlLBwhNhYWFas2aN9Xhqaqq2bNmi2NhYSVJsbKxOnz6tbdu2WdusXbtW+fn5at68eTE8CwAAAADXG5eu2jd8+HDdcsstGj9+vB544AH9/PPPmj17tmbPni1JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XUsEVrI4dO1qnBObk5Gjo0KHq0aMHK/YBAAAAcAqXBqlmzZrp66+/1qhRozR27FhFR0dr2rRp6tmzp7XNc889p3PnzmngwIE6ffq0WrZsqeXLl8vHx8fa5rPPPtPQoUPVrl07ubm5qXv37po+fbornhIAAACA64BLg5Qk3XXXXbrrrrsuedxisWjs2LEaO3bsJduUK1dOCxYscMbw
AAAAHOZUvYZKD6+kzHLl7aqPLtdQ5TMrKdDHvnoAjuPyIAUAAHC9+GHWwiLVP9O6aPUAHMeli00AAAAAQGlEkAIAAAAAkwhSAAAAAGAS90gBAAAUk1uf6CGfU38rs1x5u+6XmrK+h1Iz/1agT3nulwJcjCAFAABQTMr9vlN+SQlKD7Xvuy4Pn9qplIwEBfvyXZmAqzG1DwAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQTPb2GyzPtLPKKRNgV32n2oOVkXNWvp721QNwHLuC1B9//KFq1ao5eiwAAADXtL39hxapvlOdotUDcBy7pvbVqFFDt912mz799FNlZmY6ekwAAAAAUKLZFaR+/fVXNWzYUCNGjFBYWJgef/xx/fzzz44eGwAAAACUSHYFqZiYGL3zzjtKSEjQxx9/rOPHj6tly5aqX7++pk6dqpMnTzp6nAAAAKWeR9pZeaSlyiPtrF31GTlnlZ6Tqowc++oBOE6RVu3z8PBQt27dtGjRIk2cOFEHDx7UyJEjVaVKFfXu3VvHjx931DgBAABKvbvubK4HGkfqrjub21X/7JLmemxRpJ5dYl89AMcpUpD65ZdfNHjwYIWHh2vq1KkaOXKkDh06pFWrVikhIUH33HOPo8YJAAAAACWGXav2TZ06VXPmzNG+ffvUqVMnzZ8/X506dZKbW0Eui46O1ty5c1W1alVHjhUAAAAASgS7gtT777+v/v37q2/fvgoPD79om5CQEH300UdFGhwAAAAAlER2BakDBw5csY2Xl5f69OljT/cAAAAAUKLZdY/UnDlztGjRokL7Fy1apHnz5hV5UAAAAABQktkVpCZMmKAKFSoU2h8SEqLx48cXeVAAAAAAUJLZFaTi4+MVHR1daH9UVJTi4+OLPCgAAAAAKMnsClIhISHauXNnof07duxQ+fLlizwoAAAAACjJ7Fps4qGHHtJTTz2lgIAA3XrrrZKk9evX6+mnn1aPHj0cOkAAAOB68fHxSk5OdkrfFSpUUGRkpFP6BgBnsStIjRs3TkeOHFG7du3k4VHQRX5+vnr37s09UgAAXGPi4+NVu04dZaSnO6V/Xz8/7Y2Luy7C1Pr3F8g9J1t5nl521T/TeoFy87Ll4W5fPQDHsStIeXl56fPPP9e4ceO0Y8cO+fr6qkGDBoqKinL0+AAAgIslJycrIz1dgyfPVkT1Wg7tO+HQfr03cqCSk5OviyCVUj+mSPXR5YpWD8Bx7ApS59WqVUu1ajn2AxUAAJRMEdVrKbpejKuHAQAlgl1BKi8vT3PnztWaNWt04sQJ5efn2xxfu3atQwYHAAAAACWRXUHq6aef1ty5c9W5c2fVr19fFovF0eMCAAC45kR8v1zumZnK8/FRwm0dTdf/emy5cvIy5enuo8aVzNcDcBy7gtTChQv1xRdfqFOnTo4eDwAAwDXrpldHyC8pQemhEfpmg/kg9PHPI5SSkaBg3wg17kqQAlzJru+R8vLyUo0aNRw9FgAAAAAoFewKUs8884zeeecdGYbh6PEAAAAAQIln19S+H3/8Ud9//72WLVumevXqydPT0+b4V1995ZDBAQAAAEBJZFeQKlu2rLp27erosQAAAABAqWBXkJozZ46jxwEAAAAApYZd90hJUm5urlavXq0PPvhAZ8+elSQlJCQoLS3NYYMDAAAAgJLIritSf/75pzp27Kj4+HhlZWXp9ttvV0BAgCZOnKisrCzNmjXL0eMEAAAAgBLDritSTz/9tJo2baqUlBT5+vpa93ft2lVr1qxx2OAAAAAAoCSy64rUhg0btGnTJnl5ednsr1q1qo4dO+aQgQEAAFxrcvz8leMfoBw/f7vqfTz95ZMTIB9P++oBOI5dQSo/P195eXmF9v/1118KCAgo8qAAAACuRUtXbC1S/eS7ilYPwHHsmtp3xx13aNq0adZti8WitLQ0vfbaa+rU
qZOjxgYAAAAAJZJdV6SmTJmiDh06qG7dusrMzNTDDz+sAwcOqEKFCvrPf/7j6DECAAAAQIliV5CqXLmyduzYoYULF2rnzp1KS0vTgAED1LNnT5vFJwAAAADgWmRXkJIkDw8P9erVy5FjAQAAuKbFTHxFXmdOKzuorLY/P850/YLfXtG57NPy9yqrhxuZrwfgOHYFqfnz51/2eO/eve0aDAAAwLWs6pL/yi8pQemhEXYFqU1H/quUjAQF+0YQpAAXsytIPf300zbbOTk5Sk9Pl5eXl/z8/AhSAAAAAK5pdq3al5KSYvNIS0vTvn371LJlSxabAAAAAHDNsytIXUzNmjX15ptvFrpaBQAAAADXGocFKalgAYqEhARHdgkAAAAAJY5d90gtXrzYZtswDB0/flwzZ85UixYtHDIwAAAAACip7ApS9957r822xWJRxYoV1bZtW02ZMsUR4wIAAACAEsuuIJWfn+/ocQAAAABAqeHQe6QAAAAA4Hpg1xWpESNGXHXbqVOn2nMKAACAa86xNnfI+0yKsoKC7apvVOkOpWWlqIy3ffUAHMeuIPXbb7/pt99+U05Ojm644QZJ0v79++Xu7q7GjRtb21ksFseMEgAA4Bqwddy0ItUPuKlo9QAcx64g1aVLFwUEBGjevHkKDi74PyIpKSnq16+fWrVqpWeeecahgwQAAACAksSue6SmTJmiCRMmWEOUJAUHB+v1119n1T4AAAAA1zy7glRqaqpOnjxZaP/Jkyd19uzZIg8KAAAAAEoyu6b2de3aVf369dOUKVN00003SZK2bNmiZ599Vt26dXPoAAEAAK4VHbq1ke/JE8qoGKIVX60zXf/y8jY6nXFCZX1D9HpH8/UAHMeuIDVr1iyNHDlSDz/8sHJycgo68vDQgAED9NZbbzl0gAAAANcK35Mn5JeUYHf96YwTSsmwvx6A49gVpPz8/PTee+/prbfe0qFDhyRJ1atXl7+/v0MHBwAAAAAlUZG+kPf48eM6fvy4atasKX9/fxmG4ahxAQAAAECJZVeQ+vvvv9WuXTvVqlVLnTp10vHjxyVJAwYMYOlzAAAAANc8u4LU8OHD5enpqfj4ePn5+Vn3P/jgg1q+fLnDBgcAAAAAJZFd90itXLlSK1asUOXKlW3216xZU3/++adDBgYAAAAAJZVdV6TOnTtncyXqvFOnTsnb27vIgwIAAACAksyuINWqVSvNnz/fum2xWJSfn69Jkybptttuc9jgAAAAAKAksmtq36RJk9SuXTv98ssvys7O1nPPPafff/9dp06d0saNGx09RgAAAAAoUewKUvXr19f+/fs1c+ZMBQQEKC0tTd26ddOQIUMUHh7u6DECAABcE357bow8MjKU6+trV/1DjcYoOzdDXh721QNwHNNBKicnRx07dtSsWbP00ksvOWNMAAAA16Q/u9xfpPoWVYtWD8BxTN8j5enpqZ07dzpjLAAAAABQKti12ESvXr300UcfOXosAAAAAFAq2HWPVG5urj7++GOtXr1aTZo0kb+/v83xqVOnOmRwAAAA15KAPw7ILS9X+e4eOlutpun6hNQDys/PlZubhyICzdcDcBxTQeqPP/5Q1apVtXv3bjVu3FiStH//fps2FovFcaMDAAC4hrTrc4/8khKUHhqhbzbsMV0/fs09SslIULBvhGZ2NV8PwHFMBamaNWvq+PHj+v777yVJDz74oKZPn67Q0FCnDA4AAAAASiJT90gZhmGzvWzZMp07d86hAwIAAACAks6uxSbO+3ewAgAAAIDrgakgZbFYCt0DxT1RAAAAAK43pu6RMgxDffv2lbe3tyQpMzNTTzzxRKFV+7766ivHjRAAAAAAShhTV6T69OmjkJAQBQUFKSgoSL169VJERIR1+/zDHm+++aYsFouGDRtm3ZeZmakhQ4aofPnyKlOmjLp3766kpCSbuvj4eHXu3Fl+fn4KCQnRs88+q9zcXLvGAAAAAABXw9QVqTlz5jhlEFu3
btUHH3yghg0b2uwfPny4li5dqkWLFikoKEhDhw5Vt27dtHHjRklSXl6eOnfurLCwMG3atEnHjx9X79695enpqfHjxztlrAAAAABQpMUmHCEtLU09e/bUhx9+qODgYOv+M2fO6KOPPtLUqVPVtm1bNWnSRHPmzNGmTZv0008/SZJWrlypPXv26NNPP1VMTIzuvPNOjRs3Tu+++66ys7Nd9ZQAAAAAXONcHqSGDBmizp07q3379jb7t23bppycHJv9tWvXVmRkpDZv3ixJ2rx5sxo0aGDzPVYdOnRQamqqfv/990ueMysrS6mpqTYPAAAAALhapqb2OdrChQv166+/auvWrYWOJSYmysvLS2XLlrXZHxoaqsTERGubf38Z8Pnt820uZsKECRozZkwRRw8AAGDO8v+ulSU/T4abu1314zquVb6RJzeLffUAHMdlQero0aN6+umntWrVKvn4+BTruUeNGqURI0ZYt1NTU1WlSpViHQMAALj+ZIaEFak+2Ldo9QAcx2VT+7Zt26YTJ06ocePG8vDwkIeHh9avX6/p06fLw8NDoaGhys7O1unTp23qkpKSFBZW8CESFhZWaBW/89vn21yMt7e3AgMDbR4AAAAAcLVcFqTatWunXbt2afv27dZH06ZN1bNnT+vPnp6eWrNmjbVm3759io+PV2xsrCQpNjZWu3bt0okTJ6xtVq1apcDAQNWtW7fYnxMAAACA64PLpvYFBASofv36Nvv8/f1Vvnx56/4BAwZoxIgRKleunAIDA/Xkk08qNjZWN998syTpjjvuUN26dfXII49o0qRJSkxM1Msvv6whQ4ZYvzQYAACgpKi+cK4809OU41dGh3r0NV2/9uBcZeakycezjNrWMF8PwHFcutjElbz99ttyc3NT9+7dlZWVpQ4dOui9996zHnd3d9eSJUs0aNAgxcbGyt/fX3369NHYsWNdOGoAAICLa/DuJPklJSg9NMKuIPXVrklKyUhQsG/EdR+k4uLinNJvhQoVFBkZ6ZS+cW0pUUFq3bp1Nts+Pj5699139e67716yJioqSt99952TRwYAAICS4PTJJFksFvXq1csp/fv6+WlvXBxhCldUooIUAAAAcDnpqWdkGIb6jZuu6vUbOrTvhEP79d7IgUpOTiZI4YoIUgAAACh1wqNrKLpejKuHgeuYy1btAwAAAIDSiiAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1i1DwAAoJikRldXdkCgMitUtKs+PLC6/LwCFeRjXz0AxyFIAQAAFJO18/9XpPqX2hWtHoDjMLUPAAAAAEwiSAEAAACASQQpAAAAADCJe6QAAACKyS3PPCbvlL+VFVxem6Z8aLr+3Y2P6WzW3wrwLq8hLczXA3AcghQAAEAxCfl5o/ySEpQeGmFXfdyJjUrJSFCwr331AByHqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk/hCXgAArgHx8fFKTk52St9xcXFO6fd6dPCB3vI6m6rsgEC76m+r0VsZ2any9bKvHoDjEKQAACjl4uPjVbtOHWWkpzv1PNlZ2U7t/3qw+8kXilTfvUHR6gE4DkEKAIBSLjk5WRnp6Ro8ebYiqtdyeP871q/SomlvKDc31+F9A0BpRZACAOAaEVG9lqLrxTi834RD+x3eJwCUdiw2AQAAAAAmcUUKAACgmNzbqq78khKUHhqhbzbsMV0/9Ou6SslIULBvhGZ2NV8PwHG4IgUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACZ5uHoAAAAA14tNkz+Qe3a28ry87KoffMsHys3Lloe7ffUAHIcgBQAAUExONG9VpPq6oUWrB+A4TO0DAAAAAJMIUgAAAABgElP7AAAAiknIlg3We6Tsmea3J2mD
9R4ppvkBrkWQAgAAKCa3jHxcfkkJSg+N0Dcb9piuf2/T40rJSFCwb4RmdjVfD8BxmNoHAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmebh6AAAAANeLbzbsKVL9zK5FqwfgOFyRAgAAAACTCFIAAAAAYBJBCgAAAABM4h4pAACAYlJ/xpvyOpuq7IBA7X7yBdP1/931pjKyU+XrFajuDczXA3AcghQAAEAxqfHFfPklJSg9NMKuIPX9wflKyUhQsG8EQQpwMab2AQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAEziC3kBAACKyYmbWsg75W9lBZe3q75OSAudzfpbAd721QNwHIIUAABAMdk05cMi1Q9pUbR6AI7D1D4AAAAAMIkgBQAAAAAmEaQAAAAAwCTukQIAACgmbXt3kU/ySWVWqKi18/9nuv6NNV10JvOkgnwq6qV25usBOA5BCgAAoJgEHj4kv6QEpZ9Ntav+eOohpWQkKD3bvnoAjsPUPgAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJfCEvAABAMdk15Dl5pqcpx6+MXfXdGjynzJw0+XjaVw/AcQhSAAAAxeRQj75Fqm9bo2j1AByHqX0AAAAAYBJBCgAAAABMcmmQmjBhgpo1a6aAgACFhITo3nvv1b59+2zaZGZmasiQISpfvrzKlCmj7t27KykpyaZNfHy8OnfuLD8/P4WEhOjZZ59Vbm5ucT4VAACAK/I5kSjfxGPyOZFoV31KRqL+Tj+mlAz76gE4jkvvkVq/fr2GDBmiZs2aKTc3Vy+++KLuuOMO7dmzR/7+/pKk4cOHa+nSpVq0aJGCgoI0dOhQdevWTRs3bpQk5eXlqXPnzgoLC9OmTZt0/Phx9e7dW56enho/frwrnx4AADbi4+OVnJzs8H7j4uIc3ieco2P3tvJLSlB6aIS+2bDHdP0ry9sqJSNBwb4RmtnVfD0Ax3FpkFq+fLnN9ty5cxUSEqJt27bp1ltv1ZkzZ/TRRx9pwYIFatu2rSRpzpw5qlOnjn766SfdfPPNWrlypfbs2aPVq1crNDRUMTExGjdunJ5//nmNHj1aXl5ehc6blZWlrKws63ZqaqpznygA4LoXHx+v2nXqKCM93WnnyM7KdlrfAABbJWrVvjNnzkiSypUrJ0natm2bcnJy1L59e2ub2rVrKzIyUps3b9bNN9+szZs3q0GDBgoNDbW26dChgwYNGqTff/9djRo1KnSeCRMmaMyYMU5+NgAAXJCcnKyM9HQNnjxbEdVrObTvHetXadG0N5jWDgDFqMQEqfz8fA0bNkwtWrRQ/fr1JUmJiYny8vJS2bJlbdqGhoYqMTHR2uafIer88fPHLmbUqFEaMWKEdTs1NVVVqlRx1FMBAOCSIqrXUnS9GIf2mXBov0P7AwBcWYkJUkOGDNHu3bv1448/Ov1c3t7e8vb2dvp5AAAAAFybSsTy50OHDtWSJUv0/fffq3Llytb9YWFhys7O1unTp23aJyUlKSwszNrm36v4nd8+3wYAAAAAHMmlQcowDA0dOlRff/211q5dq+joaJvjTZo0kaenp9asWWPdt2/fPsXHxys2NlaSFBsbq127dunEiRPWNqtWrVJgYKDq1q1bPE8EAAAAwHXFpVP7hgwZogULFujbb79VQECA9Z6moKAg+fr6KigoSAMGDNCIESNUrlw5BQYG6sknn1RsbKxuvvlmSdIdd9yhunXr6pFHHtGkSZOUmJiol19+WUOGDGH6HgAAAACncGmQev/99yVJbdq0sdk/Z84c9e3bV5L09ttvy83NTd27d1dWVpY6dOig9957z9rW3d1dS5Ys0aBBgxQbGyt/f3/16dNH
Y8eOLa6nAQAAAOA649IgZRjGFdv4+Pjo3Xff1bvvvnvJNlFRUfruu+8cOTQAAAAAuKQSs2ofAADAtW7NvG/llperfHf7fgV7sd23ys/PlZsbv8IBrsa/QgAAgGJytlrNItVHBBatHoDjlIjlzwEAAACgNCFIAQAAAIBJTO0DAAAoJlH/WySPjAzl+vrqzy73m67feGSRsnMz5OXhqxZVzdcDcByCFAAAQDFpNOk1+SUlKD00wq4g9Z/fXlNKRoKCfSMIUk4UFxfntL4rVKigyMhIp/WP4kOQAgAAACSdPpkki8WiXr16Oe0cvn5+2hsXR5i6BhCkAAAAAEnpqWdkGIb6jZuu6vUbOrz/hEP79d7IgUpOTiZIXQMIUgAAAMA/hEfXUHS9GFcPAyUcq/YBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJjEqn0AAADFJKNiiM2fZpX1DbH5E4DrEKQAAACKyYqv1hWp/vWORasH4DhM7QMAAAAAkwhSAAAAAGASQQoAAAAATOIeKQAAgGLS7JVh8j6ToqygYG0dN810/Uc/D1NaVorKeAdrwE3m6wE4DkEKAACgmFRat1J+SQlKD43QVjvqfzu2UikZCQr2jXD42ACYw9Q+AAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEl8IS8AAEAxOXJXd3mdOa3soLJ21d9StbvOZZ+Wv5d99QAchyAFAABQTLY/P65I9Q83Klo9AMdhah8AAAAAmESQAgAAAACTCFIAAAAAYBL3SAEAABSTzh2aye9EotJDwrR0xVbT9SOXNFNKeqKC/cI0+S7z9QAchyAFAMD/Fx8fr+TkZKf0HRcX55R+Ubp4pp+T57mz8kwPsKs+M+ecMnPPKjPHvnoAjkOQAgBABSGqdp06ykhPd+p5srOyndo/AKB4EKQAAJCUnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BAMWPIAUAwD9EVK+l6HoxDu834dB+h/cJAHAdVu0DAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJVfsAAACKyc9jp8o9M1N5Pj521fe/aapy8jLl6W5fPQDHIUgBAAAUk4TbOhapvnGlotUDcBym9gEAAACASQQpAAAAADCJqX0AAADFJHj3drnnZCvP00sp9WNM1x8+tV25ednycPdSdDnz9QAchyAFAABQTFoPelh+SQlKD43QNxv2mK6fsv5hpWQkKNg3QjO7mq8H4DhM7QMAAAAAk7giBQAAABSjuLg4p/RboUIFRUZGOqVvFEaQAgAAAIrB6ZNJslgs6tWrl1P69/Xz0964OMJUMSFIAQAAAMUgPfWMDMNQv3HTVb1+Q4f2nXBov94bOVDJyckEqWJCkAIAAACKUXh0DUXXi3H1MFBELDYBAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1hsAgAAoJgsWbZFkiHJYlf9W3dtkSFDFjvrATgOQQoAAKCY5JYJKFK9r2fR6gE4DkEKAFCqxMfHKzk52eH9xsXFObxPAMC1iyAFACg14uPjVbtOHWWkpzvtHNlZ2U7rGwBw7SBIAQBKjeTkZGWkp2vw5NmKqF7LoX3vWL9Ki6a9odzcXIf2C/xT7Y9nyjPtrHLKBGhv/6Gm67+Lm6mMnLPy9QxQpzrm6wE4DkEKAFDqRFSvpeh6MQ7tM+HQfof2B1xM7TnvyS8pQemhEfYFqb3vKSUjQcG+EQQpwMVY/hwAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEl/ICwAAUExO1Wuo9PBKyixX3q766HINVT6zkgJ97KsH4DgEKQAAgGLyw6yFRap/pnXR6gE4DkEKAOBQ8fHxSk5OdkrfcXFxTukXAACzCFIAAIeJ
j49X7Tp1lJGe7tTzZGdlO7V/ACitnPk/nCpUqKDIyEin9V/aEKQAAA6TnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BoDQ7fTJJFotFvXr1cto5fP38tDcujjD1/xGkAAAOF1G9lqLrxTi834RD+x3eJ1Ccbn2ih3xO/a3McuXtul9qyvoeSs38W4E+5blfCjbSU8/IMAz1Gzdd1es3dHj/CYf2672RA5WcnEyQ+v8IUgAAAMWk3O875ZeUoPTQCLvqD5/aqZSMBAX72lePa194dA2n/I8sFMb3SAEAAACASVyRAoDrkLNW1mNVPQDA9YIgBeCynLmUtVS6VwBy5mvjzNelOFbWY1U9AMC1jiAF4JKK4xfu0roCkLNfG2e+Ls5cWY9V9QAA14trJki9++67euutt5SYmKgbb7xRM2bM0E033eTqYQGlmrOXsi7NKwA587U5/7ps2LBBderUcWjf0oXpd85YWY9V9QAA14trIkh9/vnnGjFihGbNmqXmzZtr2rRp6tChg/bt26eQkBBXDw8o9Zy1lPV5zrqvpjimDTrjtSmO7wKRmH4HADCvNP8329GuiSA1depUPfbYY+rXr58kadasWVq6dKk+/vhjvfDCCy4enXnOvO8iKytL3t7eTulbcv59HaX1Xh1nj91Zf6/OXjjA2YGhtE4bdPZ3gTD9DgBgFv/NLqzUB6ns7Gxt27ZNo0aNsu5zc3NT+/bttXnz5ovWZGVlKSsry7p95swZSVJqaqpzB3sVjh49qqbNmikzI8NJZ7BIMpzUt+Tt46NP5s9XaGioQ/tNSkpS7969lZmZ6dB+/6k0j93Zf68Htv+izPRzDu/34G9bZRiGbn/kcYVHVXNo338n/qWl/zdDK1as0A033ODQviVp3759kqQjv+9w+GtzfnpcdmaGU173nP//+fdn3C65WRz7vjk/dmf07ez+Gbtr+k88fFCStG3bNqWlpTm0b6l4/q2aeV1uzc5UrqTU7EzFbd1ouv/ctEwpR8rNu3K9o8deUvovrX07u39nj704/pt95MgRlS1b1qF92+N8JjCMy7+OFuNKLUq4hIQEVapUSZs2bVJsbKx1/3PPPaf169dry5YthWpGjx6tMWPGFOcwAQAAAJQiR48eVeXKlS95vNRfkbLHqFGjNGLECOt2fn6+Tp06pfLly8tisbhwZNee1NRUValSRUePHlVgYKCrhwMX4r0AifcBCvA+wHm8FyCVvPeBYRg6e/asIiIiLtuu1AepChUqyN3dXUlJSTb7k5KSFBYWdtEab2/vQveTlITLiNeywMDAEvEPA67HewES7wMU4H2A83gvQCpZ74OgoKArtnErhnE4lZeXl5o0aaI1a9ZY9+Xn52vNmjU2U/0AAAAAwFFK/RUpSRoxYoT69Omjpk2b6qabbtK0adN07tw56yp+AAAAAOBI10SQevDBB3Xy5Em9+uqrSkxMVExMjJYvX+7w1ddgnre3t1577TWnLrmO0oH3AiTeByjA+wDn8V6AVHrfB6V+1T4AAAAAKG6l/h4pAAAAAChuBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFIrs/fffV8OGDa1fohYbG6tly5ZZj7dp00YWi8Xm8cQTT7hwxCgOb775piwWi4YNG2bdl5mZqSFDhqh8+fIqU6aMunfvXujLtHHtudh7gc+F68Po0aML/T3Xrl3bepzPhOvDld4HfB5cX44dO6ZevXqpfPny8vX1VYMGDfTLL79YjxuGoVdffVXh4eHy9fVV+/btdeDAAReO+NKuieXP4VqVK1fWm2++qZo1a8owDM2bN0/33HOPfvvtN9WrV0+S9Nhjj2ns2LHWGj8/P1cNF8Vg69at+uCDD9SwYUOb/cOHD9fSpUu1aNEiBQUFaejQoerWrZs2btzoopHC2S71XpD4XLhe1KtXT6tXr7Zue3hc+NWDz4Trx+XeBxKfB9eLlJQUtWjRQrfddpuWLVumihUr6sCBAwoO
Dra2mTRpkqZPn6558+YpOjpar7zyijp06KA9e/bIx8fHhaMvjCCFIuvSpYvN9htvvKH3339fP/30kzVI+fn5KSwszBXDQzFLS0tTz5499eGHH+r111+37j9z5ow++ugjLViwQG3btpUkzZkzR3Xq1NFPP/2km2++2VVDhpNc6r1wHp8L1wcPD4+L/j3zmXB9udT74Dw+D64PEydOVJUqVTRnzhzrvujoaOvPhmFo2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjR7GP+XKY2geHysvL08KFC3Xu3DnFxsZa93/22WeqUKGC6tevr1GjRik9Pd2Fo4QzDRkyRJ07d1b79u1t9m/btk05OTk2+2vXrq3IyEht3ry5uIeJYnCp98J5fC5cHw4cOKCIiAhVq1ZNPXv2VHx8vCQ+E643l3ofnMfnwfVh8eLFatq0qe6//36FhISoUaNG+vDDD63HDx8+rMTERJvPhaCgIDVv3rxEfi5wRQoOsWvXLsXGxiozM1NlypTR119/rbp160qSHn74YUVFRSkiIkI7d+7U888/r3379umrr75y8ajhaAsXLtSvv/6qrVu3FjqWmJgoLy8vlS1b1mZ/aGioEhMTi2mEKC6Xey9IfC5cL5o3b665c+fqhhtu0PHjxzVmzBi1atVKu3fv5jPhOnK590FAQACfB9eRP/74Q++//75GjBihF198UVu3btVTTz0lLy8v9enTx/pvPzQ01KaupH4uEKTgEDfccIO2b9+uM2fO6Msvv1SfPn20fv161a1bVwMHDrS2a9CggcLDw9WuXTsdOnRI1atXd+Go4UhHjx7V008/rVWrVpW4OcwoXlfzXuBz4fpw5513Wn9u2LChmjdvrqioKH3xxRfy9fV14chQnC73PhgwYACfB9eR/Px8NW3aVOPHj5ckNWrUSLt379asWbPUp08fF4/OPKb2wSG8vLxUo0YNNWnSRBMmTNCNN96od95556JtmzdvLkk6ePBgcQ4RTrZt2zadOHFCjRs3loeHhzw8PLR+/XpNnz5dHh4eCg0NVXZ2tk6fPm1Tl5SUxLz4a8yV3gt5eXmFavhcuD6ULVtWtWrV0sGDBxUWFsZnwnXqn++Di+Hz4NoVHh5unbF0Xp06daxTPc//2//36p0l9XOBIAWnyM/PV1ZW1kWPbd++XVLBPyZcO9q1a6ddu3Zp+/bt1kfTpk3Vs2dP68+enp5as2aNtWbfvn2Kj4+3uZ8Opd+V3gvu7u6FavhcuD6kpaXp0KFDCg8PV5MmTfhMuE79831wMXweXLtatGihffv22ezbv3+/oqKiJBUsPBEWFmbzuZCamqotW7aUyM8FpvahyEaNGqU777xTkZGROnv2rBYsWKB169ZpxYoVOnTokBYsWKBOnTqpfPny2rlzp4YPH65bb731ossho/QKCAhQ/fr1bfb5+/urfPny1v0DBgzQiBEjVK5cOQUGBurJJ59UbGwsq3NdY670XuBz4foxcuRIdenSRVFRUUpISNBrr70md3d3PfTQQwoKCuIz4TpxufcBnwfXl+HDh+uWW27R+PHj9cADD+jnn3/W7NmzNXv2bEmyfufg66+/rpo1a1qXP4+IiNC9997r2sFfBEEKRXbixAn17t1bx48fV1BQkBo2bKgVK1bo9ttv19GjR7V69WpNmzZN586dU5UqVdS9e3e9/PLLrh42XODtt9+Wm5ubunfvrqysLHXo0EHvvfeeq4eFYubl5cXnwnXir7/+0kMPPaS///5bFStWVMuWLfXTTz+pYsWKkvhMuF5c7n2QmZnJ58F1pFmzZvr66681atQojR07VtHR0Zo2bZp69uxpbfPcc8/p3LlzGjhwoE6fPq2WLVtq+fLlJfL+a4thGIarBwEAAAAApQn3SAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAEqUI0eOyGKxaPv27a4e
SonRpk0bDRs2zNXDAAD8A0EKAOBwFovlso/Ro0e7eoiFlISwsm7dOlksFp0+fdql4wAAXJmHqwcAALj2HD9+3Prz559/rldffVX79u2z7itTpowrhgUAgMNwRQoA4HBhYWHWR1BQkCwWi3U7JCREU6dOVeXKleXt7a2YmBgtX778kn3l5eWpf//+ql27tuLj4yVJ3377rRo3biwfHx9Vq1ZNY8aMUW5urrXGYrHo//7v/9S1a1f5+fmpZs2aWrx4cZGe048//qhWrVrJ19dXVapU0VNPPaVz585Zj1etWlXjx49X//79FRAQoMjISM2ePdumj02bNikmJkY+Pj5q2rSpvvnmG+s0xiNHjui2226TJAUHB8tisahv377W2vz8fD333HMqV66cwsLCSuRVPQC4nhCkAADF6p133tGUKVM0efJk7dy5Ux06dNDdd9+tAwcOFGqblZWl+++/X9u3b9eGDRsUGRmpDRs2qHfv3nr66ae1Z88effDBB5o7d67eeOMNm9oxY8bogQce0M6dO9WpUyf17NlTp06dsmvMhw4dUseOHdW9e3ft3LlTn3/+uX788UcNHTrUpt2UKVPUtGlT/fbbbxo8eLAGDRpkvRKXmpqqLl26qEGDBvr11181btw4Pf/889baKlWq6L///a8kad++fTp+/Ljeeecd6/F58+bJ399fW7Zs0aRJkzR27FitWrXKrucDAHAAAwAAJ5ozZ44RFBRk3Y6IiDDeeOMNmzbNmjUzBg8ebBiGYRw+fNiQZGzYsMFo166d0bJlS+P06dPWtu3atTPGjx9vU//JJ58Y4eHh1m1Jxssvv2zdTktLMyQZy5Ytu+Q4W7dubTz99NMXPTZgwABj4MCBNvs2bNhguLm5GRkZGYZhGEZUVJTRq1cv6/H8/HwjJCTEeP/99w3DMIz333/fKF++vLW9YRjGhx9+aEgyfvvtN8MwDOP77783JBkpKSmFxtayZUubfc2aNTOef/75Sz4fAIBzcY8UAKDYpKamKiEhQS1atLDZ36JFC+3YscNm30MPPaTKlStr7dq18vX1te7fsWOHNm7caHMFKi8vT5mZmUpPT5efn58kqWHDhtbj/v7+CgwM1IkTJ+wa944dO7Rz50599tln1n2GYSg/P1+HDx9WnTp1Cp3z/HTG8+fct2+fGjZsKB8fH2ubm2666arH8M++JSk8PNzu5wMAKDqCFACgROrUqZM+/fRTbd68WW3btrXuT0tL05gxY9StW7dCNf8MKZ6enjbHLBaL8vPz7RpLWlqaHn/8cT311FOFjkVGRjrlnP/mzL4BAOYRpAAAxSYwMFARERHauHGjWrdubd2/cePGQldnBg0apPr16+vuu+/W0qVLre0bN26sffv2qUaNGsU27saNG2vPnj1FOucNN9ygTz/9VFlZWfL29pYkbd261aaNl5eXpIIrbACAko0gBQAoVs8++6xee+01Va9eXTExMZozZ462b99uM23uvCeffFJ5eXm66667tGzZMrVs2VKvvvqq7rrrLkVGRuq+++6Tm5ubduzYod27d+v1118v0thOnjxZ6IuAw8PD9fzzz+vmm2/W0KFD9eijj8rf31979uzRqlWrNHPmzKvq++GHH9ZLL72kgQMH6oUXXlB8fLwmT54sqeDqkiRFRUXJYrFoyZIl6tSpk3x9fVkqHgBKKFbtAwAUq6eeekojRozQM888owYNGmj58uVavHixatasedH2w4YN05gxY9SpUydt2rRJHTp00JIlS7Ry5Uo1a9ZMN998s95++21FRUUVeWwLFixQo0aNbB4ffvihGjZsqPXr12v//v1q1aqVGjVqpFdffVURERFX3XdgYKD+97//afv27YqJidFLL72kV199VdKFKYmVKlXSmDFj9MILLyg0NLTQqoAAgJLDYhiG4epBAABwPfrss8/Ur18/nTlzxmZBDQBAycfUPgAAisn8+fNVrVo1VapUSTt27NDzzz+vBx54gBAFAKUQ
QQoAgGKSmJioV199VYmJiQoPD9f9999f6IuEAQClA1P7AAAAAMAkFpsAAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmPT/ALFDWFiHUDxIAAAAAElFTkSuQmCC",
458
+ "text/plain": [
459
+ "<Figure size 1000x600 with 1 Axes>"
460
+ ]
461
+ },
462
+ "metadata": {},
463
+ "output_type": "display_data"
464
+ }
465
+ ],
466
+ "source": [
467
+ "#统计图\n",
468
+ "import matplotlib.pyplot as plt\n",
469
+ "import seaborn as sns\n",
470
+ "import numpy as np\n",
471
+ "\n",
472
+ "# 假设这是您的 token_len_list\n",
473
+ "\n",
474
+ "# 设置画布大小\n",
475
+ "plt.figure(figsize=(10, 6))\n",
476
+ "\n",
477
+ "# 使用 seaborn 生成直方图\n",
478
+ "sns.histplot(token_len_list, bins=30, kde=False, color=\"skyblue\", edgecolor=\"black\")\n",
479
+ "\n",
480
+ "# 添加标题和标签\n",
481
+ "plt.title(\"Distribution of Token Lengths\")\n",
482
+ "plt.xlabel(\"Token Length\")\n",
483
+ "plt.ylabel(\"Frequency\")\n",
484
+ "\n",
485
+ "# 显示平均值线\n",
486
+ "mean_value = np.mean(token_len_list)\n",
487
+ "plt.axvline(mean_value, color='red', linestyle='dashed', linewidth=2)\n",
488
+ "plt.text(mean_value + 2, plt.ylim()[1]*0.9, f'Mean: {mean_value:.2f}', color='red')\n",
489
+ "\n",
490
+ "# 显示中位数线\n",
491
+ "median_value = np.median(token_len_list)\n",
492
+ "plt.axvline(median_value, color='green', linestyle='dashed', linewidth=2)\n",
493
+ "plt.text(median_value - 10, plt.ylim()[1]*0.8, f'Median: {median_value:.2f}', color='green')\n",
494
+ "\n",
495
+ "# 显示图形\n",
496
+ "plt.show()"
497
+ ]
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": 15,
502
+ "id": "9a65c8bc-6bf0-4605-8c38-409bbb14f2c7",
503
+ "metadata": {},
504
+ "outputs": [
505
+ {
506
+ "data": {
507
+ "application/vnd.jupyter.widget-view+json": {
508
+ "model_id": "a4e97d92506f419581c3711f26d7f683",
509
+ "version_major": 2,
510
+ "version_minor": 0
511
+ },
512
+ "text/plain": [
513
+ "Map: 0%| | 0/53275 [00:00<?, ? examples/s]"
514
+ ]
515
+ },
516
+ "metadata": {},
517
+ "output_type": "display_data"
518
+ },
519
+ {
520
+ "data": {
521
+ "application/vnd.jupyter.widget-view+json": {
522
+ "model_id": "9004c03cb9b24411b6bd9d33662402fb",
523
+ "version_major": 2,
524
+ "version_minor": 0
525
+ },
526
+ "text/plain": [
527
+ "Map: 0%| | 0/5920 [00:00<?, ? examples/s]"
528
+ ]
529
+ },
530
+ "metadata": {},
531
+ "output_type": "display_data"
532
+ }
533
+ ],
534
+ "source": [
535
+ "# 2. tokenize\n",
536
+ "def tokenize_function(examples):\n",
537
+ " examples['label'] = [int(item) for item in examples['label']]\n",
538
+ " return tokenizer(examples['sequence'], truncation=True, padding='max_length', max_length=128)\n",
539
+ "\n",
540
+ "# 3. 对数据集应用分词函数\n",
541
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
542
+ "\n",
543
+ "# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
544
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
545
+ ]
546
+ },
547
+ {
548
+ "cell_type": "code",
549
+ "execution_count": 22,
550
+ "id": "4b0faa94-d0c4-4ce8-9976-dcefcb766f0b",
551
+ "metadata": {},
552
+ "outputs": [
553
+ {
554
+ "name": "stderr",
555
+ "output_type": "stream",
556
+ "text": [
557
+ "/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
558
+ " warnings.warn(\n",
559
+ "/tmp/ipykernel_2549/341301010.py:29: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
560
+ " trainer = Trainer(\n"
561
+ ]
562
+ }
563
+ ],
564
+ "source": [
565
+ "from transformers import TrainingArguments, Trainer\n",
566
+ "import numpy as np\n",
567
+ "import torch.nn as nn\n",
568
+ "\n",
569
+ "\n",
570
+ "\n",
571
+ "def compute_metrics(eval_pred):\n",
572
+ " predictions, labels = eval_pred\n",
573
+ " predictions = np.argmax(predictions, axis=1)\n",
574
+ " return {'accuracy': (predictions==labels).sum() / len(labels)}\n",
575
+ "\n",
576
+ "# change training hyperparameters to archive better quality\n",
577
+ "training_args = TrainingArguments(\n",
578
+ " output_dir=\"ds_job_category_v0\",\n",
579
+ " learning_rate=1e-5,\n",
580
+ " lr_scheduler_type=\"constant_with_warmup\",\n",
581
+ " warmup_ratio=0.1,\n",
582
+ " optim='adamw_torch',\n",
583
+ " weight_decay=0.0,\n",
584
+ " per_device_train_batch_size=20,\n",
585
+ " per_device_eval_batch_size=20,\n",
586
+ " num_train_epochs=10,\n",
587
+ " evaluation_strategy=\"epoch\",\n",
588
+ " save_strategy=\"epoch\",\n",
589
+ " logging_strategy=\"epoch\",\n",
590
+ " load_best_model_at_end=True\n",
591
+ ")\n",
592
+ "\n",
593
+ "trainer = Trainer(\n",
594
+ " model=model,\n",
595
+ " args=training_args,\n",
596
+ " train_dataset=tokenized_datasets[\"train\"],\n",
597
+ " eval_dataset=tokenized_datasets[\"test\"],\n",
598
+ " tokenizer=tokenizer,\n",
599
+ " data_collator=data_collator,\n",
600
+ " compute_metrics=compute_metrics,\n",
601
+ ")"
602
+ ]
603
+ },
604
+ {
605
+ "cell_type": "code",
606
+ "execution_count": 17,
607
+ "id": "9b067740-9c0f-4df8-a5af-b68ec9d1f3e0",
608
+ "metadata": {},
609
+ "outputs": [
610
+ {
611
+ "data": {
612
+ "text/html": [
613
+ "\n",
614
+ " <div>\n",
615
+ " \n",
616
+ " <progress value='26640' max='26640' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
617
+ " [26640/26640 1:00:13, Epoch 10/10]\n",
618
+ " </div>\n",
619
+ " <table border=\"1\" class=\"dataframe\">\n",
620
+ " <thead>\n",
621
+ " <tr style=\"text-align: left;\">\n",
622
+ " <th>Epoch</th>\n",
623
+ " <th>Training Loss</th>\n",
624
+ " <th>Validation Loss</th>\n",
625
+ " <th>Accuracy</th>\n",
626
+ " </tr>\n",
627
+ " </thead>\n",
628
+ " <tbody>\n",
629
+ " <tr>\n",
630
+ " <td>1</td>\n",
631
+ " <td>0.324900</td>\n",
632
+ " <td>0.237557</td>\n",
633
+ " <td>0.916216</td>\n",
634
+ " </tr>\n",
635
+ " <tr>\n",
636
+ " <td>2</td>\n",
637
+ " <td>0.193100</td>\n",
638
+ " <td>0.212998</td>\n",
639
+ " <td>0.925338</td>\n",
640
+ " </tr>\n",
641
+ " <tr>\n",
642
+ " <td>3</td>\n",
643
+ " <td>0.126900</td>\n",
644
+ " <td>0.278650</td>\n",
645
+ " <td>0.923480</td>\n",
646
+ " </tr>\n",
647
+ " <tr>\n",
648
+ " <td>4</td>\n",
649
+ " <td>0.076900</td>\n",
650
+ " <td>0.362979</td>\n",
651
+ " <td>0.922804</td>\n",
652
+ " </tr>\n",
653
+ " <tr>\n",
654
+ " <td>5</td>\n",
655
+ " <td>0.047400</td>\n",
656
+ " <td>0.518552</td>\n",
657
+ " <td>0.915372</td>\n",
658
+ " </tr>\n",
659
+ " <tr>\n",
660
+ " <td>6</td>\n",
661
+ " <td>0.032000</td>\n",
662
+ " <td>0.698843</td>\n",
663
+ " <td>0.918412</td>\n",
664
+ " </tr>\n",
665
+ " <tr>\n",
666
+ " <td>7</td>\n",
667
+ " <td>0.029000</td>\n",
668
+ " <td>0.760331</td>\n",
669
+ " <td>0.915709</td>\n",
670
+ " </tr>\n",
671
+ " <tr>\n",
672
+ " <td>8</td>\n",
673
+ " <td>0.025900</td>\n",
674
+ " <td>0.769762</td>\n",
675
+ " <td>0.921959</td>\n",
676
+ " </tr>\n",
677
+ " <tr>\n",
678
+ " <td>9</td>\n",
679
+ " <td>0.021800</td>\n",
680
+ " <td>0.740165</td>\n",
681
+ " <td>0.923142</td>\n",
682
+ " </tr>\n",
683
+ " <tr>\n",
684
+ " <td>10</td>\n",
685
+ " <td>0.021300</td>\n",
686
+ " <td>0.738664</td>\n",
687
+ " <td>0.922973</td>\n",
688
+ " </tr>\n",
689
+ " </tbody>\n",
690
+ "</table><p>"
691
+ ],
692
+ "text/plain": [
693
+ "<IPython.core.display.HTML object>"
694
+ ]
695
+ },
696
+ "metadata": {},
697
+ "output_type": "display_data"
698
+ },
699
+ {
700
+ "data": {
701
+ "text/plain": [
702
+ "TrainOutput(global_step=26640, training_loss=0.08990609108864724, metrics={'train_runtime': 3619.5996, 'train_samples_per_second': 147.185, 'train_steps_per_second': 7.36, 'total_flos': 3.4801460969472e+16, 'train_loss': 0.08990609108864724, 'epoch': 10.0})"
703
+ ]
704
+ },
705
+ "execution_count": 17,
706
+ "metadata": {},
707
+ "output_type": "execute_result"
708
+ }
709
+ ],
710
+ "source": [
711
+ "trainer.train()"
712
+ ]
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "execution_count": 20,
717
+ "id": "aa26e020-2dfd-4e0e-b330-250ee3e44a44",
718
+ "metadata": {},
719
+ "outputs": [
720
+ {
721
+ "data": {
722
+ "text/html": [],
723
+ "text/plain": [
724
+ "<IPython.core.display.HTML object>"
725
+ ]
726
+ },
727
+ "metadata": {},
728
+ "output_type": "display_data"
729
+ },
730
+ {
731
+ "data": {
732
+ "text/plain": [
733
+ "{'accuracy': 0.9253378378378379, 'f1': 0.927062706270627}"
734
+ ]
735
+ },
736
+ "execution_count": 20,
737
+ "metadata": {},
738
+ "output_type": "execute_result"
739
+ }
740
+ ],
741
+ "source": [
742
+ "#模型测试\n",
743
+ "import evaluate\n",
744
+ "predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
745
+ "preds = np.argmax(predictions.predictions, axis=-1)\n",
746
+ "metric = evaluate.load(\"glue\", \"mrpc\")\n",
747
+ "ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
748
+ "ret"
749
+ ]
750
+ },
751
+ {
752
+ "cell_type": "code",
753
+ "execution_count": 21,
754
+ "id": "5e6d99ad-66a0-4b85-9380-ae2b7ee88056",
755
+ "metadata": {},
756
+ "outputs": [
757
+ {
758
+ "data": {
759
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAHHCAYAAACcHAM1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABRWklEQVR4nO3deVgV1f8H8PeAclkv4AKIEoKCguJuRrgmgYpbWGaSool9LdDc0MwNtyzcUnMpS1HT1Eot0VQUd3FNlFxIEEUTsERAUPb5/eGPySt45crFO17er555Hu7MmTOfuQ/px885Z0YQRVEEERERkYwZ6DoAIiIiomdhwkJERESyx4SFiIiIZI8JCxEREckeExYiIiKSPSYsREREJHtMWIiIiEj2mLAQERGR7DFhISIiItljwkL0Erp69Sp8fHxgaWkJQRCwfft2rfZ//fp1CIKAiIgIrfb7MuvcuTM6d+6s6zCIqiwmLETPKTExEf/73//g7OwMY2NjKJVKeHl5YfHixXj48GGlXjswMBBxcXGYM2cO1q9fjzZt2lTq9V6kIUOGQBAEKJXKMr/Hq1evQhAECIKA+fPna9z/7du3ERYWhtjYWC1ES0QvSjVdB0D0Mtq5cyfeeecdKBQKDB48GE2bNkV+fj6OHj2K0NBQXLx4Ed9++22lXPvhw4eIiYnB5MmTERISUinXcHR0xMOHD1G9evVK6f9ZqlWrhgcPHmDHjh3o37+/yrENGzbA2NgYubm5z9X37du3MWPGDNSvXx8tWrQo93l79+59rusRkXYwYSHSUFJSEgYMGABHR0dER0ejTp060rHg4GAkJCRg586dlXb9f/75BwBgZWVVadcQBAHGxsaV1v+zKBQKeHl54ccffyyVsGzcuBF+fn745ZdfXkgsDx48gKmpKYyMjF7I9YiobBwSItJQeHg4srOz8f3336skKyUaNmyITz75RPpcWFiIWbNmoUGDBlAoFKhfvz4+++wz5OXlqZxXv3599OzZE0ePHsWrr74KY2NjODs7Y926dVKbsLAwODo6AgBCQ0MhCALq168P4NFQSsnPjwsLC4MgCCr7oqKi0L59e1hZWcHc3ByNGjXCZ599Jh1/2hyW6OhodOjQAWZmZrCyskKfPn1w+fLlMq+XkJCAIUOGwMrKCpaWlhg6dCgePHjw9C/2CQMHDsTvv/+OjIwMad/p06dx9epVDBw4sFT79PR0jB8/Hh4eHjA3N4dSqUT37t1x/vx5qc3BgwfRtm1bAMDQoUOloaWS++zcuTOaNm2Ks2fPomPHjjA1NZW+lyfnsAQGBsLY2LjU/fv6+sLa2hq3b98u970S0bMxYSHS0I4dO+Ds7IzXX3+9XO2DgoIwbdo0tGrVCosWLUKnTp0wd+5cDBgwoFTbhIQEvP3223jzzTexYMECWFtbY8iQIbh48SIAwN/fH4sWLQIAvPfee1i/fj2++uorjeK/ePEievbsiby8PMycORMLFixA7969cezYMbXn7du3D76+vrhz5w7CwsIwduxYHD9+HF5eXrh+/Xqp9v3798f9+/cxd+5c9O/fHxEREZgxY0a54/T394cgCNi6dau0b+PGjWjcuDFatWpVqv21a9ewfft29OzZEwsXLkRoaCji4uLQqVMnKXlwc3PDzJkzAQAffvgh1q9fj/Xr16Njx45SP3fv3kX37t3RokULfPXVV+jSpUuZ8S1evBi1a9dGYGAgioqKAADffPMN9u7di6VLl8Le3r7c90pE5SASUbllZmaKAMQ+ffqUq31sbKwIQAwKClLZP378eBGAGB0dLe1zdHQUAYiHDx+W9t25c0dUKBTiuHHjpH1JSUkiAHHevHkqfQYGBoqOjo6lYpg+fbr4+P/qixYtEgGI//zzz1PjLrnGmjVrpH0tWrQQbWxsxLt370r7zp8/LxoYGIiDBw8udb0PPvhApc+33npLrFmz5lOv+fh9mJmZiaIoim+//bbYtWtX
URRFsaioSLSzsxNnzJhR5neQm5srFhUVlboPhUIhzpw5U9p3+vTpUvdWolOnTiIAceXKlWUe69Spk8q+PXv2iADE2bNni9euXRPNzc3Fvn37PvMeiUhzrLAQaSArKwsAYGFhUa72u3btAgCMHTtWZf+4ceMAoNRcF3d3d3To0EH6XLt2bTRq1AjXrl177pifVDL35ddff0VxcXG5zklJSUFsbCyGDBmCGjVqSPubNWuGN998U7rPx40YMULlc4cOHXD37l3pOyyPgQMH4uDBg0hNTUV0dDRSU1PLHA4CHs17MTB49EdaUVER7t69Kw13/fHHH+W+pkKhwNChQ8vV1sfHB//73/8wc+ZM+Pv7w9jYGN988025r0VE5ceEhUgDSqUSAHD//v1ytb9x4wYMDAzQsGFDlf12dnawsrLCjRs3VPa/8sorpfqwtrbGvXv3njPi0t599114eXkhKCgItra2GDBgALZs2aI2eSmJs1GjRqWOubm54d9//0VOTo7K/ifvxdraGgA0upcePXrAwsICmzdvxoYNG9C2bdtS32WJ4uJiLFq0CC4uLlAoFKhVqxZq166NCxcuIDMzs9zXrFu3rkYTbOfPn48aNWogNjYWS5YsgY2NTbnPJaLyY8JCpAGlUgl7e3v8+eefGp335KTXpzE0NCxzvyiKz32NkvkVJUxMTHD48GHs27cPgwYNwoULF/Duu+/izTffLNW2IipyLyUUCgX8/f2xdu1abNu27anVFQD4/PPPMXbsWHTs2BE//PAD9uzZg6ioKDRp0qTclSTg0fejiXPnzuHOnTsAgLi4OI3OJaLyY8JCpKGePXsiMTERMTExz2zr6OiI4uJiXL16VWV/WloaMjIypBU/2mBtba2yoqbEk1UcADAwMEDXrl2xcOFCXLp0CXPmzEF0dDQOHDhQZt8lccbHx5c6duXKFdSqVQtmZmYVu4GnGDhwIM6dO4f79++XOVG5xM8//4wuXbrg+++/x4ABA+Dj4wNvb+9S30l5k8fyyMnJwdChQ+Hu7o4PP/wQ4eHhOH36tNb6J6L/MGEh0tCECRNgZmaGoKAgpKWllTqemJiIxYsXA3g0pAGg1EqehQsXAgD8/Py0FleDBg2QmZmJCxcuSPtSUlKwbds2lXbp6emlzi15gNqTS61L1KlTBy1atMDatWtVEoA///wTe/fule6zMnTp0gWzZs3C119/DTs7u6e2MzQ0LFW9+emnn/D333+r7CtJrMpK7jQ1ceJEJCcnY+3atVi4cCHq16+PwMDAp36PRPT8+OA4Ig01aNAAGzduxLvvvgs3NzeVJ90eP34cP/30E4YMGQIAaN68OQIDA/Htt98iIyMDnTp1wqlTp7B27Vr07dv3qUtmn8eAAQMwceJEvPXWWxg1ahQePHiAFStWwNXVVWXS6cyZM3H48GH4+fnB0dERd+7cwfLly1GvXj20b9/+qf3PmzcP3bt3h6enJ4YNG4aHDx9i6dKlsLS0RFhYmNbu40kGBgaYMmXKM9v17NkTM2fOxNChQ/H6668jLi4OGzZsgLOzs0q7Bg0awMrKCitXroSFhQXMzMzQrl07ODk5aRRXdHQ0li9fjunTp0vLrNesWYPOnTtj6tSpCA8P16g/InoGHa9SInpp/fXXX+Lw4cPF+vXri0ZGRqKFhYXo5eUlLl26VMzNzZXaFRQUiDNmzBCdnJzE6tWriw4ODuKkSZNU2ojio2XNfn5+pa7z5HLapy1rFkVR3Lt3r9i0aVPRyMhIbNSokfjDDz+UWta8f/9+sU+fPqK9vb1oZGQk2tvbi++99574119/lbrGk0t/9+3bJ3p5eYkmJiaiUqkUe/XqJV66dEmlTcn1nlw2vWbNGhGAmJSU9NTvVBRVlzU/zdOWNY8bN06sU6eOaGJiInp5eYkxMTFlLkf+9ddfRXd3d7FatWoq99mpUyexSZMmZV7z8X6ysrJER0dHsVWrVmJBQYFKuzFjxogGBgZiTEyM2nsg
Is0IoqjBDDgiIiIiHeAcFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHB8dVsuLiYty+fRsWFhZafSQ4ERG9GKIo4v79+7C3t5feCK5tubm5yM/P10pfRkZGMDY21kpfcsKEpZLdvn0bDg4Oug6DiIgq6ObNm6hXr57W+83NzYWJRU2g8IFW+rOzs0NSUpLeJS1MWCqZhYUFAMCo7WgI1RQ6joaoctzYMUnXIRBVmvv3s+Di9Ir057m25efnA4UPoHAPBAyNKtZZUT5SL61Ffn4+ExbSTMkwkFBNwYSF9JZSqdR1CESVrtKH9asZQ6hgwiIK+js1lQkLERGRHAgAKpoU6fFUSSYsREREciAYPNoq2oee0t87IyIiIr3BCgsREZEcCIIWhoT0d0yICQsREZEccEhILf29MyIiItIbrLAQERHJAYeE1GLCQkREJAtaGBLS44ET/b0zIiIi0hussBAREckBh4TUYsJCREQkB1wlpJb+3hkRERHpDVZYiIiI5IBDQmoxYSEiIpIDDgmpxYSFiIhIDlhhUUt/UzEiIiLSG6ywEBERyQGHhNRiwkJERCQHgqCFhIVDQkREREQ6wwoLERGRHBgIj7aK9qGnmLAQERHJAeewqKW/d0ZERER6gxUWIiIiOeBzWNRiwkJERCQHHBJSS3/vjIiIiPQGKyxERERywCEhtZiwEBERyQGHhNRiwkJERCQHrLCopb+pGBEREekNVliIiIjkgENCajFhISIikgMOCamlv6kYERER6Q1WWIiIiGRBC0NCelyHYMJCREQkBxwSUkt/UzEiIiLSG6ywEBERyYEgaGGVkP5WWJiwEBERyQGXNaulv3dGREREas2dOxdt27aFhYUFbGxs0LdvX8THx6u06dy5MwRBUNlGjBih0iY5ORl+fn4wNTWFjY0NQkNDUVhYqNLm4MGDaNWqFRQKBRo2bIiIiAiNYmXCQkREJAclk24rumng0KFDCA4OxokTJxAVFYWCggL4+PggJydHpd3w4cORkpIibeHh4dKxoqIi+Pn5IT8/H8ePH8fatWsRERGBadOmSW2SkpLg5+eHLl26IDY2FqNHj0ZQUBD27NlT7lg5JERERCQHOhgS2r17t8rniIgI2NjY4OzZs+jYsaO039TUFHZ2dmX2sXfvXly6dAn79u2Dra0tWrRogVmzZmHixIkICwuDkZERVq5cCScnJyxYsAAA4ObmhqNHj2LRokXw9fUtV6yssBAREcmBFissWVlZKlteXl65QsjMzAQA1KhRQ2X/hg0bUKtWLTRt2hSTJk3CgwcPpGMxMTHw8PCAra2ttM/X1xdZWVm4ePGi1Mbb21ulT19fX8TExJT762GFhYiISM84ODiofJ4+fTrCwsLUnlNcXIzRo0fDy8sLTZs2lfYPHDgQjo6OsLe3x4ULFzBx4kTEx8dj69atAIDU1FSVZAWA9Dk1NVVtm6ysLDx8+BAmJibPvCcmLERERHKgxSGhmzdvQqlUSrsVCsUzTw0ODsaff/6Jo0ePquz/8MMPpZ89PDxQp04ddO3aFYmJiWjQoEHF4tUAh4SIiIjkQItDQkqlUmV7VsISEhKCyMhIHDhwAPXq1VPbtl27dgCAhIQEAICdnR3S0tJU2pR8Lpn38rQ2SqWyXNUVgAkLERFRlSWKIkJCQrBt2zZER0fDycnpmefExsYCAOrUqQMA8PT0RFxcHO7cuSO1iYqKglKphLu7u9Rm//79Kv1ERUXB09Oz3LEyYSEiIpKBJ5918rybJoKDg/HDDz9g48aNsLCwQGpqKlJTU/Hw4UMAQGJiImbNmoWzZ8/i+vXr+O233zB48GB07NgRzZo1AwD4+PjA3d0dgwYNwvnz57Fnzx5MmTIFwcHBUmVnxIgRuHbtGiZMmIArV65g+fLl2LJlC8aMGVPuWJmwEBERyYAuEpYVK1YgMzMTnTt3Rp06daRt8+bNAAAjIyPs27cPPj4+aNy4
McaNG4d+/fphx44dUh+GhoaIjIyEoaEhPD098f7772Pw4MGYOXOm1MbJyQk7d+5EVFQUmjdvjgULFuC7774r95JmgJNuiYiIqixRFNUed3BwwKFDh57Zj6OjI3bt2qW2TefOnXHu3DmN4nscExYiIiI5EP5/q2gfeooJCxERkQw8z5BOGZ1oJxgZ4hwWIiIikj1WWIiIiGSAFRb1mLAQERHJABMW9ZiwEBERyQATFvU4h4WIiIhkjxUWIiIiOeCyZrWYsBAREckAh4TU45AQERERyR4rLERERDIgCNBChUU7scgRExYiIiIZEKCFISE9zlg4JERERESyxwoLERGRDHDSrXpMWIiIiOSAy5rV4pAQERERyR4rLERERHKghSEhkUNCREREVJm0MYel4quM5IsJCxERkQwwYVGPc1iIiIhI9lhhISIikgOuElKLCQsREZEMcEhIPQ4JERERkeyxwkJERCQDrLCox4SFiIhIBpiwqMchISIiIpI9VliIiIhkgBUW9ZiwEBERyQGXNavFISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBJizqMWEhIiKSASYs6nEOCxEREckeKyxERERywFVCajFhISIikgEOCanHISEiIiKSvZeiwiIIArZt24a+ffvqOhR6Aca81wE9O7jB5ZVayM0rwKmLNxG2KgoJN++qtGvrXg9ThnVF68b1UFRcjD8TU9Fvwnrk5hdKbXzauSB0cGc0cbZFXn4hjp2/jvenbZKOd2zphMkfvAE3J1s8yM3Hpj3nMev7/SgqLn5h90u0KGIvIg+cx9UbaTBWVMerHk6YPrIPXBxtAQD3MnPwxbe7cODkFdxKu4eaVubw69QMn43wg9LcBACwMfIEQmZuKLP/+N2fo3YNixd2P/R8WGFRT+cJS2pqKubMmYOdO3fi77//ho2NDVq0aIHRo0eja9euug4Poihi+vTpWLVqFTIyMuDl5YUVK1bAxcVF16HprdebO+K7X0/hXPzfqGZggKlB3tgaPhivDf0aD3ILADxKVn7+YhAW/XgEE5fuQmFRMZo626FYFKV+enVww+JxvTHr+/04fC4J1QwN4FbfRjre1NkWW+a+jwUbDmPE3G2oU8sCC8f0goGhgGkr977w+6aq69gfCRj2Tge0dHNEUVERZq3YgX4jlyFm82SYmSiQ8m8mUv7NxMxP+qKRkx1upqRj3BebkfJvJtZ+MQwA8JZ3K3R9zV2l3+CZPyAvv4DJyktCgBYSFj2exKLThOX69evw8vKClZUV5s2bBw8PDxQUFGDPnj0IDg7GlStXdBkeACA8PBxLlizB2rVr4eTkhKlTp8LX1xeXLl2CsbGxrsPTS+98+oPK54+/3IaEbRPRwtUexy/cAADM+bgbvtl2El/9eFRq93gFxtDAAHNDumPaN1H44fc/pP3xN/6Rfn6rS1NcvJaGeesPAQCSbqcj7Nu9WD2tP8LXHkT2w/xKuT+iJ/285GOVz8umvQ9X389w/vJNvN6qIdwb2GPdl0HScad6tTH5o14YMX0dCguLUK2aIUyMjWBibCS1+ffefRw58xeWTBn4wu6DqDLpdA7Lxx9/DEEQcOrUKfTr1w+urq5o0qQJxo4dixMnTjz1vIkTJ8LV1RWmpqZwdnbG1KlTUVBQIB0/f/48unTpAgsLCyiVSrRu3RpnzpwBANy4cQO9evWCtbU1zMzM0KRJE+zatavM64iiiK+++gpTpkxBnz590KxZM6xbtw63b9/G9u3btfpd0NMpzR4lhveyHgIAalmZoa27A/7JyMGepcMQ/3MoIhcNxWtNX5HOae5aB3VrW6JYLMahb0bg8k/j8dPc91UqLEbVqyHvseEjAHiYVwgTRXU0d7V/AXdGVLas7FwAgJWlqZo2D2FhZoxq1QzLPL5p1ymYGBuh9xstKiNEqgQlQ0IV3fSVzhKW9PR07N69G8HBwTAzMyt13MrK6qnnWlhYICIiApcuXcLixYuxatUqLFq0SDoe
EBCAevXq4fTp0zh79iw+/fRTVK9eHQAQHByMvLw8HD58GHFxcfjyyy9hbm5e5nWSkpKQmpoKb29vaZ+lpSXatWuHmJiY57xz0oQgCJgb3A0n4m7g8vU7AID6dawBAJ8O7oy1O8/i7U/X4/zVFGyfHwjnujVU2wR2wfwfDmHAZxuQkf0QOxYNgZXFozH/6DMJeLWJA/q90RQGBgLq1LLAhMGdAAB2Ncv+nSCqbMXFxfhs4S9o19wZ7g3KTpzvZmRj/urdCOz7+lP7+eG3E3jbt7VK1YVkTtDSpqd0NiSUkJAAURTRuHFjjc+dMmWK9HP9+vUxfvx4bNq0CRMmTAAAJCcnIzQ0VOr78fkmycnJ6NevHzw8PAAAzs7OT71OamoqAMDW1lZlv62trXTsSXl5ecjLy5M+Z2VlaXJr9IT5n/jBzckG3UetlvYZGDz6PzIi8gw27o4FAMQl7Eanlk54v3srzPxuHwz+/18ZC344jB1HLgMAgsO34+LmcejbqQkiIs/gwJlETPtmLxaO7oWVk/yRl1+E+T8cwuvN6qO4WASRLoSG/4TL11Kw69vRZR7Pyn6Id8esRCMnO0z8sEeZbU5dSMJfSalYGTaoEiMlerF0lrCI4vP/hbB582YsWbIEiYmJyM7ORmFhIZRKpXR87NixCAoKwvr16+Ht7Y133nkHDRo0AACMGjUKH330Efbu3Qtvb2/069cPzZo1q/D9lJg7dy5mzJihtf6qsvBRPeD7mit6jF6N2//+l/il3r0PQHU+CgDEJ/+LejaWj9qkl26TX1CE6yn3pDYAsPznGCz/OQZ2NS2Qcf8hXrGzwvThb+J6yr1Kuy+ip5kwbwv2HP0TO7/5BHVtrUsdv5+Ti3c+WQELUwXWhw9H9acMB63/9Tg8XOuhhdsrZR4neeIqIfV0NiTk4uICQRA0nlgbExODgIAA9OjRA5GRkTh37hwmT56M/Pz/JkiGhYXh4sWL8PPzQ3R0NNzd3bFt2zYAQFBQEK5du4ZBgwYhLi4Obdq0wdKlS8u8lp2dHQAgLS1NZX9aWpp07EmTJk1CZmamtN28eVOj+6NHwkf1gF97N/QeF4Hk1AyVY8mpGbj9bxYaOtRS2d+wXk3cTHvU9vxfKcjNL1BpU83QAK/YWkltHpd69z5y8wvR7w0P3ErLwPmrKdq+JaKnEkURE+Ztwc6DF/Dr8pFwrFurVJus7IfoN3IZjKobYsOC/8FYUb3MvrIf5OHX/efwfu/XKjts0jLOYVFPZwlLjRo14Ovri2XLliEnJ6fU8YyMjDLPO378OBwdHTF58mS0adMGLi4uuHHjRql2rq6uGDNmDPbu3Qt/f3+sWbNGOubg4IARI0Zg69atGDduHFatWlXmtZycnGBnZ4f9+/dL+7KysnDy5El4enqWeY5CoYBSqVTZSDPzP/FDf+9mGD77Z2Q/yIeNtTlsrM1hbPRfQXDp5mP431vt0LujO5zsa+CzoW/A5ZVaWP//K4LuP8jDmh1n8OmQzujSpgEaOtTEgtE9AQDbD12U+hn5rhfcnWzQuH5tjH+/E0a/1x4Tv/6dQ0L0QoWGb8GW38/g21mBMDc1Rtq/WUj7NwsPcx/9Qywr+yH6jVqOB7n5WDJlIO5n50ptiopUnxm0LeoPFBYVo3/3trq4FaoAQdDOpq90uqx52bJl8PLywquvvoqZM2eiWbNmKCwsRFRUFFasWIHLly+XOsfFxQXJycnYtGkT2rZti507d0rVEwB4+PAhQkND8fbbb8PJyQm3bt3C6dOn0a9fPwDA6NGj0b17d7i6uuLevXs4cOAA3NzcyoxPEASMHj0as2fPhouLi7Ss2d7eng+xq0TD+rwKANj51Qcq+z/+cht+3BMLAFj5ywkYG1XD5x93g5WFCS5eS4V/6Dpcv/3fUM60lXtRWFSMlZ/6w1hRDWcv/40+4yOQ+f8rMADA+9WGGBfQAUbVq+HPxFQETP0R+04lVP5NEj1m9S+Pluf3GrFEZf/X
0wIwsOdruBB/C2f/vA4AaO0/U6VN7PYwvGJfU/r8w28x6Nm5OSwtnr7CiOhlJIgVmUyiBSkpKZgzZw4iIyORkpKC2rVro3Xr1hgzZgw6d+78KMgnnnQ7YcIErF69Gnl5efDz88Nrr72GsLAwZGRkID8/H4GBgTh27BjS0tJQq1Yt+Pv7Y968eTA2NsbIkSPx+++/49atW1AqlejWrRsWLVqEmjVrlhlfyYPjvv32W2RkZKB9+/ZYvnw5XF1dy3V/WVlZsLS0hMJzIoRqCm18ZUSyk74/TNchEFWarKws2NWyQmZmZqVUzUv+nnAe+TMMFKVXzWqiOC8H15a+XWmx6pLOExZ9x4SFqgImLKTPXljCMupnGFYwYSnKy8G1JfqZsPDlh0RERCR7On+XEBEREXFZ87MwYSEiIpIBbazy0eN8hUNCREREJH+ssBAREcmAgYEgvXrkeYkVPF/OmLAQERHJAIeE1OOQEBEREckeKyxEREQywFVC6jFhISIikgEOCanHhIWIiEgGWGFRj3NYiIiISPZYYSEiIpIBVljUY4WFiIhIBkrmsFR008TcuXPRtm1bWFhYwMbGBn379kV8fLxKm9zcXAQHB6NmzZowNzdHv379kJaWptImOTkZfn5+MDU1hY2NDUJDQ1FYWKjS5uDBg2jVqhUUCgUaNmyIiIgIjWJlwkJERFRFHTp0CMHBwThx4gSioqJQUFAAHx8f5OTkSG3GjBmDHTt24KeffsKhQ4dw+/Zt+Pv7S8eLiorg5+eH/Px8HD9+HGvXrkVERASmTZsmtUlKSoKfnx+6dOmC2NhYjB49GkFBQdizZ0+5YxVEURS1c9tUlpLXhis8J0KoptB1OESVIn1/mK5DIKo0WVlZsKtlhczMTCiVykrp39LSEh6f/gZDY7MK9VWUm4O4L3o/d6z//PMPbGxscOjQIXTs2BGZmZmoXbs2Nm7ciLfffhsAcOXKFbi5uSEmJgavvfYafv/9d/Ts2RO3b9+Gra0tAGDlypWYOHEi/vnnHxgZGWHixInYuXMn/vzzT+laAwYMQEZGBnbv3l2u2FhhISIikgFtDgllZWWpbHl5eeWKITMzEwBQo0YNAMDZs2dRUFAAb29vqU3jxo3xyiuvICYmBgAQExMDDw8PKVkBAF9fX2RlZeHixYtSm8f7KGlT0kd5MGEhIiLSMw4ODrC0tJS2uXPnPvOc4uJijB49Gl5eXmjatCkAIDU1FUZGRrCyslJpa2tri9TUVKnN48lKyfGSY+raZGVl4eHDh+W6J64SIiIikgFtrhK6efOmypCQQvHsKQnBwcH4888/cfTo0QrFUFmYsBAREcmANp90q1QqNZrDEhISgsjISBw+fBj16tWT9tvZ2SE/Px8ZGRkqVZa0tDTY2dlJbU6dOqXSX8kqosfbPLmyKC0tDUqlEiYmJuWKkUNCREREVZQoiggJCcG2bdsQHR0NJycnleOtW7dG9erVsX//fmlffHw8kpOT4enpCQDw9PREXFwc7ty5I7WJioqCUqmEu7u71ObxPkralPRRHqywEBERyYAuHhwXHByMjRs34tdff4WFhYU058TS0hImJiawtLTEsGHDMHbsWNSoUQNKpRIjR46Ep6cnXnvtNQCAj48P3N3dMWjQIISHhyM1NRVTpkxBcHCwNBQ1YsQIfP3115gwYQI++OADREdHY8uWLdi5c2e5Y2XCQkREJAO6ePnhihUrAACdO3dW2b9mzRoMGTIEALBo0SIYGBigX79+yMvLg6+vL5YvXy61NTQ0RGRkJD766CN4enrCzMwMgYGBmDlzptTGyckJO3fuxJgxY7B48WLUq1cP3333HXx9fcsdKxMWIiIiGdBFhaU8j2IzNjbGsmXLsGzZsqe2cXR0xK5du9T207lzZ5w7d06j+B7HOSxEREQke6ywEBERyYEWhoSgv+8+ZMJCREQkB3xbs3ocEiIiIiLZY4WFiIhIBnSxSuhlwoSFiIhIBjgkpB6HhIiI
iEj2WGEhIiKSAQ4JqceEhYiISAY4JKQeh4SIiIhI9lhhISIikgFWWNRjwkJERCQDnMOiHhMWIiIiGWCFRT3OYSEiIiLZY4WFiIhIBjgkpB4TFiIiIhngkJB6HBIiIiIi2WOFhYiISAYEaGFISCuRyBMTFiIiIhkwEAQYVDBjqej5csYhISIiIpI9VliIiIhkgKuE1GPCQkREJANcJaQeExYiIiIZMBAebRXtQ19xDgsRERHJHissREREciBoYUhHjyssTFiIiIhkgJNu1eOQEBEREckeKyxEREQyIPz/fxXtQ18xYSEiIpIBrhJSj0NCREREJHussBAREckAHxynXrkSlt9++63cHfbu3fu5gyEiIqqquEpIvXIlLH379i1XZ4IgoKioqCLxEBEREZVSroSluLi4suMgIiKq0gwEAQYVLJFU9Hw5q9AcltzcXBgbG2srFiIioiqLQ0LqabxKqKioCLNmzULdunVhbm6Oa9euAQCmTp2K77//XusBEhERVQUlk24ruukrjROWOXPmICIiAuHh4TAyMpL2N23aFN99951WgyMiIiICniNhWbduHb799lsEBATA0NBQ2t+8eXNcuXJFq8ERERFVFSVDQhXd9JXGc1j+/vtvNGzYsNT+4uJiFBQUaCUoIiKiqoaTbtXTuMLi7u6OI0eOlNr/888/o2XLlloJioiIiOhxGldYpk2bhsDAQPz9998oLi7G1q1bER8fj3Xr1iEyMrIyYiQiItJ7wv9vFe1DX2lcYenTpw927NiBffv2wczMDNOmTcPly5exY8cOvPnmm5URIxERkd7jKiH1nus5LB06dEBUVJS2YyEiIiIq03M/OO7MmTO4fPkygEfzWlq3bq21oIiIiKoaA+HRVtE+9JXGCcutW7fw3nvv4dixY7CysgIAZGRk4PXXX8emTZtQr149bcdIRESk9/i2ZvU0nsMSFBSEgoICXL58Genp6UhPT8fly5dRXFyMoKCgyoiRiIiIqjiNKyyHDh3C8ePH0ahRI2lfo0aNsHTpUnTo0EGrwREREVUlelwgqTCNExYHB4cyHxBXVFQEe3t7rQRFRERU1XBISD2Nh4TmzZuHkSNH4syZM9K+M2fO4JNPPsH8+fO1GhwREVFVUTLptqKbvipXhcXa2lola8vJyUG7du1Qrdqj0wsLC1GtWjV88MEH6Nu3b6UESkRERFVXuRKWr776qpLDICIiqto4JKReuRKWwMDAyo6DiIioSuOj+dV77gfHAUBubi7y8/NV9imVygoFRERERPQkjROWnJwcTJw4EVu2bMHdu3dLHS8qKtJKYERERFWJgSDAoIJDOhU9X840XiU0YcIEREdHY8WKFVAoFPjuu+8wY8YM2NvbY926dZURIxERkd4TBO1s+krjCsuOHTuwbt06dO7cGUOHDkWHDh3QsGFDODo6YsOGDQgICKiMOImIiKgK07jCkp6eDmdnZwCP5qukp6cDANq3b4/Dhw9rNzoiIqIqomSVUEU3faVxwuLs7IykpCQAQOPGjbFlyxYAjyovJS9DJCIiIs1wSEg9jROWoUOH4vz58wCATz/9FMuWLYOxsTHGjBmD0NBQrQdIREREpPEcljFjxkg/e3t748qVKzh79iwaNmyIZs2aaTU4IiKiqoKrhNTTuMLyJEdHR/j7+zNZISIiqgBdDAkdPnwYvXr1gr29PQRBwPbt21WODxkypNQcmW7duqm0SU9PR0BAAJRKJaysrDBs2DBkZ2ertLlw4QI6dOgAY2NjODg4IDw8XOPvp1wVliVLlpS7w1GjRmkcBBERUVWni0fz5+TkoHnz5vjggw/g7+9fZptu3bphzZo10meFQqFyPCAgACkpKYiKikJBQQGGDh2KDz/8EBs3bgQAZGVlwcfHB97e3li5ciXi4uLwwQcfwMrKCh9++GG5Yy1XwrJo0aJydSYIAhMWIiKil0T37t3RvXt3tW0UCgXs7OzKPHb58mXs3r0bp0+fRps2bQAA
S5cuRY8ePTB//nzY29tjw4YNyM/Px+rVq2FkZIQmTZogNjYWCxcu1H7CUrIqiJ5fcuRnfG0B6S3rtiG6DoGo0ohF+c9upAUGqPg8jQrP8yjDwYMHYWNjA2tra7zxxhuYPXs2atasCQCIiYmBlZWVlKwAj+a3GhgY4OTJk3jrrbcQExODjh07wsjISGrj6+uLL7/8Evfu3YO1tXW54qjQu4SIiIhIO7Q5JJSVlaWyX6FQlBrKKY9u3brB398fTk5OSExMxGeffYbu3bsjJiYGhoaGSE1NhY2Njco51apVQ40aNZCamgoASE1NhZOTk0obW1tb6RgTFiIioirKwcFB5fP06dMRFhamcT8DBgyQfvbw8ECzZs3QoEEDHDx4EF27dq1omBphwkJERCQDggAYVHBVckmB5ubNmyrTEJ6nulIWZ2dn1KpVCwkJCejatSvs7Oxw584dlTaFhYVIT0+X5r3Y2dkhLS1NpU3J56fNjSlLZQx3ERERkYYMBO1swKNX5zy+aSthuXXrFu7evYs6deoAADw9PZGRkYGzZ89KbaKjo1FcXIx27dpJbQ4fPoyCggKpTVRUFBo1alTu4SCACQsREVGVlZ2djdjYWMTGxgJ4tMgmNjYWycnJyM7ORmhoKE6cOIHr169j//796NOnDxo2bAhfX18AgJubG7p164bhw4fj1KlTOHbsGEJCQjBgwADY29sDAAYOHAgjIyMMGzYMFy9exObNm7F48WKMHTtWo1ifK2E5cuQI3n//fXh6euLvv/8GAKxfvx5Hjx59nu6IiIiqPF28/PDMmTNo2bIlWrZsCQAYO3YsWrZsiWnTpsHQ0BAXLlxA79694erqimHDhqF169Y4cuSISsVmw4YNaNy4Mbp27YoePXqgffv2+Pbbb6XjlpaW2Lt3L5KSktC6dWuMGzcO06ZN02hJM/Acc1h++eUXDBo0CAEBATh37hzy8vIAAJmZmfj888+xa9cuTbskIiKq8gy0MIdF0/M7d+4MURSfenzPnj3P7KNGjRrSQ+KeplmzZjhy5IhmwT1B4wrL7NmzsXLlSqxatQrVq1eX9nt5eeGPP/6oUDBEREREZdG4whIfH4+OHTuW2m9paYmMjAxtxERERFTlPM+7gMrqQ19pXGGxs7NDQkJCqf1Hjx6Fs7OzVoIiIiKqakre1lzRTV9pnLAMHz4cn3zyCU6ePAlBEHD79m1s2LAB48ePx0cffVQZMRIREek9Ay1t+krjIaFPP/0UxcXF6Nq1Kx48eICOHTtCoVBg/PjxGDlyZGXESERERFWcxgmLIAiYPHkyQkNDkZCQgOzsbLi7u8Pc3Lwy4iMiIqoSOIdFved+NL+RkRHc3d21GQsREVGVZYCKz0ExgP5mLBonLF26dFH7YJro6OgKBURERET0JI0TlhYtWqh8LigoQGxsLP78808EBgZqKy4iIqIqhUNC6mmcsCxatKjM/WFhYcjOzq5wQERERFWRLp50+zLR2gqo999/H6tXr9ZWd0RERESS5550+6SYmBgYGxtrqzsiIqIqRRBQ4Um3HBJ6jL+/v8pnURSRkpKCM2fOYOrUqVoLjIiIqCrhHBb1NE5YLC0tVT4bGBigUaNGmDlzJnx8fLQWGBEREVEJjRKWoqIiDB06FB4eHrC2tq6smIiIiKocTrpVT6NJt4aGhvDx8eFbmYmIiLRM0NJ/+krjVUJNmzbFtWvXKiMWIiKiKqukwlLRTV9pnLDMnj0b48ePR2RkJFJSUpCVlaWyEREREWlbueewzJw5E+PGjUOPHj0AAL1791Z5RL8oihAEAUVFRdqPkoiISM9xDot65U5YZsyYgREjRuDAgQOVGQ8REVGVJAiC2nf1lbcPfVXuhEUURQBAp06dKi0YIiIiorJotKxZnzM3IiIiXeKQkHoaJSyurq7PTFrS09MrFBAREVFVxCfdqqdRwjJjxoxST7olIiIiqmwaJSwDBgyAjY1NZcVCRERUZRkIQoVffljR8+Ws
3AkL568QERFVHs5hUa/cD44rWSVERERE9KKVu8JSXFxcmXEQERFVbVqYdKvHrxLSbA4LERERVQ4DCDCoYMZR0fPljAkLERGRDHBZs3oav/yQiIiI6EVjhYWIiEgGuEpIPSYsREREMsDnsKjHISEiIiKSPVZYiIiIZICTbtVjwkJERCQDBtDCkJAeL2vmkBARERHJHissREREMsAhIfWYsBAREcmAASo+7KHPwyb6fG9ERESkJ1hhISIikgFBECBUcEynoufLGRMWIiIiGRBQ8Zct62+6woSFiIhIFvikW/U4h4WIiIhkjxUWIiIimdDf+kjFMWEhIiKSAT6HRT0OCREREZHsscJCREQkA1zWrB4TFiIiIhngk27V0+d7IyIiIj3BCgsREZEMcEhIPSYsREREMsAn3arHISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBrhJSjwkLERGRDLDCop4+J2NERESkJ1hhISIikgGuElKPCQsREZEM8OWH6nFIiIiIiGSPFRYiIiIZMIAAgwoO6lT0fDljhYWIiEgGSoaEKrpp4vDhw+jVqxfs7e0hCAK2b9+uclwURUybNg116tSBiYkJvL29cfXqVZU26enpCAgIgFKphJWVFYYNG4bs7GyVNhcuXECHDh1gbGwMBwcHhIeHa/z9MGEhIiKqonJyctC8eXMsW7aszOPh4eFYsmQJVq5ciZMnT8LMzAy+vr7Izc2V2gQEBODixYuIiopCZGQkDh8+jA8//FA6npWVBR8fHzg6OuLs2bOYN28ewsLC8O2332oUK4eEiIiIZED4//8q2ocmunfvju7du5d5TBRFfPXVV5gyZQr69OkDAFi3bh1sbW2xfft2DBgwAJcvX8bu3btx+vRptGnTBgCwdOlS9OjRA/Pnz4e9vT02bNiA/Px8rF69GkZGRmjSpAliY2OxcOFClcTmWVhhISIikgFtDgllZWWpbHl5eRrHk5SUhNTUVHh7e0v7LC0t0a5dO8TExAAAYmJiYGVlJSUrAODt7Q0DAwOcPHlSatOxY0cYGRlJbXx9fREfH4979+6VOx4mLERERHrGwcEBlpaW0jZ37lyN+0hNTQUA2Nraquy3tbWVjqWmpsLGxkbleLVq1VCjRg2VNmX18fg1yoNDQkRERDIgaGGVUMmQ0M2bN6FUKqX9CoWiQv3KASssREREMqDNISGlUqmyPU/CYmdnBwBIS0tT2Z+WliYds7Ozw507d1SOFxYWIj09XaVNWX08fo3yYMJCREQkA7pY1qyOk5MT7OzssH//fmlfVlYWTp48CU9PTwCAp6cnMjIycPbsWalNdHQ0iouL0a5dO6nN4cOHUVBQILWJiopCo0aNYG1tXe54mLAQERFVUdnZ2YiNjUVsbCyARxNtY2NjkZycDEEQMHr0aMyePRu//fYb4uLiMHjwYNjb26Nv374AADc3N3Tr1g3Dhw/HqVOncOzYMYSEhGDAgAGwt7cHAAwcOBBGRkYYNmwYLl68iM2bN2Px4sUYO3asRrFyDgsREZEM6GJZ85kzZ9ClSxfpc0kSERgYiIiICEyYMAE5OTn48MMPkZGRgfbt22P37t0wNjaWztmwYQNCQkLQtWtXGBgYoF+/fliyZIl03NLSEnv37kVwcDBat26NWrVqYdq0aRotaQYAQRRFUaMzSCNZWVmwtLRE2t1MlQlQRPrEum2IrkMgqjRiUT7y4lYhM7Ny/hwv+Xvi19PXYGZuUaG+crLvo09b50qLVZc4JERERESyxyEhIiIiGdDFkNDLhAkLERGRDGhjlY82VwnJDYeEiIiISPZYYSEiIpIBARUf0tHjAgsTFiIiIjkwEB5tFe1DX3FIiIiIiGTvpaiwCIKAbdu2SU/Wo6pl4Zo9iDxwHldvpMFYUR2vNnNGWEgfuNT/7+2fEVuP4uc9Z3Ah/hbu5+TienQ4LC1MVfpp1nsabqakq+ybFtwbY4b4vJD7ICoxZogPenZpDhdHW+TmFeDUhWsI
+/pXJNz4750sNjUtMHPUW+jcrjHMTRVIuHEHC1bvwY4DsVIbK6UpwkPfgW/7phBFEb9Fx2LSgp+R8zBfatPXuyXGDvVFg1dscPdeNlZtOYSlP+wHyQ9XCamn8wpLamoqRo4cCWdnZygUCjg4OKBXr14q7y7Qpa1bt8LHxwc1a9aEIAjS44vpxTn+RwKC3umIvavHY+vXISgoLIL/yK+R8zBPavMwtwBdPd2fmXx89j8/XPn9c2n78N1OlR0+USmvt2qI7346DJ8P5sM/5GtUr2aIrUtDYGpsJLVZETYYDR1tMHDsN/B673PsOBCLNXM/gIdrPanNqlmBaOxcB/4hX2PAmJV4vWVDfPXZQOm49+vu+HbWEKz55SheHzAH47/cjI8GvoHh73R8ofdL5SO3dwnJjU4rLNevX4eXlxesrKwwb948eHh4oKCgAHv27EFwcDCuXLmiy/AAADk5OWjfvj369++P4cOH6zqcKunnpcEqn5dPfx8uPpMQe/kmvFo1BAB8NPDRo6WPnv1LbV/mpsawraVfT3+kl887o5arfP54xg9IiPoCLdwccPxcIgDg1WbOGP/FJvxx6QYAYMHqPfj4vTfQws0BcX/dgmt9W3i/3gRdBocj9nIyAGDi/J+w5auPMHXxNqT+m4l3u7+KnQfPY83WowCAG3/fxaKIvfgk8E2s+unwC7xjKg8BFZ80q8f5im4rLB9//DEEQcCpU6fQr18/uLq6okmTJhg7dixOnDjx1PMmTpwIV1dXmJqawtnZGVOnTlV5C+T58+fRpUsXWFhYQKlUonXr1jhz5gwA4MaNG+jVqxesra1hZmaGJk2aYNeuXU+91qBBgzBt2jR4e3tr78apQrKycwEA1krTZ7Qs7au1e+HsPQEdA77AkvX7UFhYpO3wiDSmNH/0XpZ7WQ+kfacuXMNbb7aGldIUgiDA/83WUCiq4ejZqwCAth5OyMh6ICUrAHDwVDyKi0W0buoIADAyqoa8/EKVa+Xm5aOurTUc6tSo7Nsi0iqdVVjS09Oxe/duzJkzB2ZmZqWOW1lZPfVcCwsLREREwN7eHnFxcRg+fDgsLCwwYcIEAEBAQABatmyJFStWwNDQELGxsahevToAIDg4GPn5+Th8+DDMzMxw6dIlmJuba+2+8vLykJf331BFVlaW1vomoLi4GJMW/ox2zZ3h3tBeo3P/924nNG/sACulGU5duIaZy35D2r+ZmDOmXyVFS/RsgiBg7ti3cSI2EZcTU6T9QyetxurPP0DS/nAUFBbhYW4+BoWuQtKtfwEAtjWV+OfefZW+ioqKcS/rAWxrPqoiRp+4jDlj/NEx0hVHzlyFs0NtBAd0BQDY1bIsNaeLdMsAAgwqOKZjoMc1Fp0lLAkJCRBFEY0bN9b43ClTpkg/169fH+PHj8emTZukhCU5ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnitxGKXPnzsWMGTO02if9Z3z4FlxOTMHvq8ZofG7JH9QA0NSlLoyqV8OYz3/EtODeUBhV12aYROU2f0J/uDWog+7DF6nsnzyiJywtTNDn4yVIz8hBj07NsGbuB+gx/CtcSrxdrr7XbjsGp7q1sGnhCFSvZoj7OblYuekgJv3PD8XFxZVxO1QBHBJST2dDQhV5SfTmzZvh5eUFOzs7mJubY8qUKUhO/q8sOnbsWAQFBcHb2xtffPEFEhMTpWOjRo3C7Nmz4eXlhenTp+PChQsVuo8nTZo0CZmZmdJ28+ZNrfZflYWGb8GeI39ix4pRqGtrXeH+Wjepj8KiYiTf5r8ySTfCQ9+Bb4em6PXREty+kyHtr1+3Fj58txNGzvoBh0//hT+v/o3w737HucvJCPr/CbNpd7NQ21r1zb6GhgawVpoi7e5/ld2wr39FvU7j0Kz3NDTq9pk0J+b633cr/waJtEhnCYuLiwsEQdB4Ym1MTAwCAgLQo0cPREZG4ty5c5g8eTLy8/9bxhcWFoaLFy/Cz88P0dHR
cHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cu1dp9KRQKKJVKlY0qRhRFhIZvwc6D5/HbilFwrFtLK/3G/XULBgYCateo2OvciZ5HeOg78OvcHL0/WoLk26rJQ8lqoeJi1X/YFRWJEP7/yWCn45JgpTRF88YO0vGObVxhYCDg7J83VM4rLhaR8k8mCgqL0M+nNU5duIa7GdmVcVtUEYKWNj2ls4SlRo0a8PX1xbJly5CTk1PqeEZGRpnnHT9+HI6Ojpg8eTLatGkDFxcX3Lhxo1Q7V1dXjBkzBnv37oW/vz/WrFkjHXNwcMCIESOwdetWjBs3DqtWrdLafZH2jf9yC7b8fhqrZg2Buakx0v7NQtq/WXiY+1+SmvZvFuLib+HazUfj+xcTbiMu/hbuZT763Tp14RpWbDyAuL9u4fqtf7Hl99OYvOgX9O/eFlbPMXmXqCLmT+yP/t3bYvjUCGQ/yIVNTQvY1LSAseLR0ORf11ORmHwHiya9h1bujqhftxaCA95Al3aNsOvg+f9vk4Z9xy9i8eSBaOXuiHbNnBEe2h9b9/6B1H8zAQA1LM0w1L89XBxt0dS1LuaO64c+XVti0oJfdHbv9HSClv7TVzpd1rxs2TJ4eXnh1VdfxcyZM9GsWTMUFhYiKioKK1aswOXLl0ud4+LiguTkZGzatAlt27bFzp07peoJADx8+BChoaF4++234eTkhFu3buH06dPo1+/RxMrRo0eje/fucHV1xb1793DgwAG4ubk9Ncb09HQkJyfj9u1HY8bx8fEAADs7O9jZ2Wnz66CnWP3LEQBAzxGLVfYvm/Y+BvZ6DQCwZusRfLnqd+mY34dfqbRRGFXH1qiz+GLVLuQXFMLRviY+eq8LggPeeDE3QfSYYW8/GtbZ+c1olf0fz1iPHyNPorCoGP1Hr8D0kD74ceH/YGaqQNLNf/Bx2HpEHb8ktR8+dS3mhfbH9uUjpQfHfTr/J5U+B/i1w8xP3oIgPKrK9BqxWBoWInqZCGJFJpNoQUpKCubMmYPIyEikpKSgdu3aaN26NcaMGYPOnTs/CvKJJ91OmDABq1evRl5eHvz8/PDaa68hLCwMGRkZyM/PR2BgII4dO4a0tDTUqlUL/v7+mDdvHoyNjTFy5Ej8/vvvuHXrFpRKJbp164ZFixahZs2aZcYXERGBoUOHlto/ffp0hIWFPfP+srKyYGlpibS7mRweIr1l3TZE1yEQVRqxKB95cauQmVk5f46X/D2xPzYZ5hYV6z/7fha6tnil0mLVJZ0nLPqOCQtVBUxYSJ+9qIQlWksJyxt6mrDo/NH8RERERM/yUrz8kIiISO/xQSxqMWEhIiKSAb6tWT0mLERERDKgjbct6/PbmjmHhYiIiGSPFRYiIiIZ4BQW9ZiwEBERyQEzFrU4JERERESyxwoLERGRDHCVkHpMWIiIiGSAq4TU45AQERERyR4rLERERDLAObfqMWEhIiKSA2YsanFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQBXCanHhIWIiEgGOIVFPc5hISIiItljhYWIiEgOWGJRiwkLERGRDHDSrXocEiIiIiLZY4WFiIhIBrhKSD0mLERERDLAKSzqcUiIiIiIZI8VFiIiIjlgiUUtJixEREQywFVC6nFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQCnsKjHhIWIiEgOmLGoxTksREREJHussBAREckAVwmpx4SFiIhIDrQw6VaP8xUOCREREZH8scJCREQkA5xzqx4TFiIiIjlgxqIWh4SIiIhI9lhhISIikgGuElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiIZ4Jxb9VhhISIikgNBS5sGwsLCIAiCyta4cWPpeG5uLoKDg1GzZk2Ym5ujX79+SEtLU+kjOTkZfn5+MDU1hY2NDUJDQ1FYWPgcX4B6rLAQERHJgK4m3TZp0gT79u2TPler9l9qMGbMGOzcuRM//fQTLC0tERISAn9/fxw7
dgwAUFRUBD8/P9jZ2eH48eNISUnB4MGDUb16dXz++ecVupcnMWEhIiKqwqpVqwY7O7tS+zMzM/H9999j48aNeOONNwAAa9asgZubG06cOIHXXnsNe/fuxaVLl7Bv3z7Y2tqiRYsWmDVrFiZOnIiwsDAYGRlpLU4OCREREcmAgP9WCj339v99ZWVlqWx5eXlPve7Vq1dhb28PZ2dnBAQEIDk5GQBw9uxZFBQUwNvbW2rbuHFjvPLKK4iJiQEAxMTEwMPDA7a2tlIbX19fZGVl4eLFi1r9fpiwEBERyYA2p7A4ODjA0tJS2ubOnVvmNdu1a4eIiAjs3r0bK1asQFJSEjp06ID79+8jNTUVRkZGsLKyUjnH1tYWqampAIDU1FSVZKXkeMkxbeKQEBERkZ65efMmlEql9FmhUJTZrnv37tLPzZo1Q7t27eDo6IgtW7bAxMSk0uPUBCssREREMlDh4aDHHjynVCpVtqclLE+ysrKCq6srEhISYGdnh/z8fGRkZKi0SUtLk+a82NnZlVo1VPK5rHkxFcGEhYiISBZ0sK75CdnZ2UhMTESdOnXQunVrVK9eHfv375eOx8fHIzk5GZ6engAAT09PxMXF4c6dO1KbqKgoKJVKuLu7VyiWJ3FIiIiIqIoaP348evXqBUdHR9y+fRvTp0+HoaEh3nvvPVhaWmLYsGEYO3YsatSoAaVSiZEjR8LT0xOvvfYaAMDHxwfu7u4YNGgQwsPDkZqaiilTpiA4OLjcVZ3yYsJCREQkA7p4l9CtW7fw3nvv4e7du6hduzbat2+PEydOoHbt2gCARYsWwcDAAP369UNeXh58fX2xfPly6XxDQ0NERkbio48+gqenJ8zMzBAYGIiZM2dW7EbKIIiiKGq9V5JkZWXB0tISaXczVSZAEekT67Yhug6BqNKIRfnIi1uFzMzK+XO85O+JKzf+gUUF+7+flYXGjrUrLVZd4hwWIiIikj0OCREREcmALoaEXiZMWIiIiGRAV+8SelkwYSEiIpKDiq9Krvj5MsY5LERERCR7rLAQERHJAAss6jFhISIikgFOulWPQ0JEREQke6ywEBERyQBXCanHhIWIiEgOOIlFLQ4JERERkeyxwkJERCQDLLCox4SFiIhIBrhKSD0OCREREZHsscJCREQkCxVfJaTPg0JMWIiIiGSAQ0LqcUiIiIiIZI8JCxEREckeh4SIiIhkgENC6jFhISIikgE+ml89DgkRERGR7LHCQkREJAMcElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiI5YIlFLSYsREREMsBVQupxSIiIiIhkjxUWIiIiGeAqIfWYsBAREckAp7Cox4SFiIhIDpixqMU5LERERCR7rLAQERHJAFcJqceEhYiISAY46VY9JiyVTBRFAMD9rCwdR0JUecSifF2HQFRpSn6/S/48ryxZWvh7Qht9yBUTlkp2//59AEBDJwcdR0JERBVx//59WFpaar1fIyMj2NnZwUVLf0/Y2dnByMhIK33JiSBWdspYxRUXF+P27duwsLCAoM+1OpnIysqCg4MDbt68CaVSqetwiLSOv+MvniiKuH//Puzt7WFgUDlrVXJzc5Gfr51KpZGREYyNjbXSl5ywwlLJDAwMUK9ePV2HUeUolUr+YU56jb/jL1ZlVFYeZ2xsrJdJhjZxWTMRERHJHhMWIiIikj0mLKRXFAoFpk+fDoVCoetQiCoFf8epquKkWyIiIpI9VliIiIhI9piwEBERkewxYSEiIiLZY8JCsiYIArZv367rMIgqBX+/icqPCQvpTGpqKkaOHAlnZ2coFAo4ODigV69e2L9/v65DA/Do6ZbTpk1DnTp1YGJiAm9vb1y9elXXYdFLQu6/31u3boWPjw9q1qwJQRAQGxur65CI1GLCQjpx/fp1tG7dGtHR0Zg3bx7i4uKwe/dudOnSBcHBwboODwAQHh6OJUuWYOXKlTh58iTMzMzg6+uL3NxcXYdGMvcy/H7n5OSg
ffv2+PLLL3UdClH5iEQ60L17d7Fu3bpidnZ2qWP37t2TfgYgbtu2Tfo8YcIE0cXFRTQxMRGdnJzEKVOmiPn5+dLx2NhYsXPnzqK5ubloYWEhtmrVSjx9+rQoiqJ4/fp1sWfPnqKVlZVoamoquru7izt37iwzvuLiYtHOzk6cN2+etC8jI0NUKBTijz/+WMG7J30n99/vxyUlJYkAxHPnzj33/RK9CHyXEL1w6enp2L17N+bMmQMzM7NSx62srJ56roWFBSIiImBvb4+4uDgMHz4cFhYWmDBhAgAgICAALVu2xIoVK2BoaIjY2FhUr14dABAcHIz8/HwcPnwYZmZmuHTpEszNzcu8TlJSElJTU+Ht7S3ts7S0RLt27RATE4MBAwZU4BsgffYy/H4TvYyYsNALl5CQAFEU0bhxY43PnTJlivRz/fr1MX78eGzatEn6Az05ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnp14nNTUVAGBra6uy39bWVjpGVJaX4feb6GXEOSz0wokVeLjy5s2b4eXlBTs7O5ibm2PKlClITk6Wjo8dOxZBQUHw9vbGF198gcTEROnYqFGjMHv2bHh5eWH69Om4cOFChe6DqCz8/SaqHExY6IVzcXGBIAi4cuWKRufFxMQgICAAPXr0QGRkJM6dO4fJkycjPz9fahMWFoaLFy/Cz88P0dHRcHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cuLfNadnZ2AIC0tDSV/WlpadIxorK8DL/fRC8l3U6hoaqqW7duGk9KnD9/vujs7KzSdtiwYaKlpeVTrzNgwACxV69eZR779NNPRQ8PjzKPlUy6nT9/vrQvMzOTk26pXOT++/04TrqllwUrLKQTy5YtQ1FREV599VX88ssvuHr1Ki5fvowlS5bA09OzzHNcXFyQnJyMTZs2ITExEUuWLJH+dQkADx8+REhICA4ePIgbN27g2LFjOH36NNzc3AAAo0ePxp49e5CUlIQ//vgDBw4ckI49SRAEjB49GrNnz8Zvv/2GuLg4DB48GPb29ujbt6/Wvw/SL3L//QYeTQ6OjY3FpUuXAADx8fGIjY3lHC2SL11nTFR13b59WwwODhYdHR1FIyMjsW7dumLv3r3FAwcOSG3wxLLP0NBQsWbNmqK5ubn47rvviosWLZL+BZqXlycOGDBAdHBwEI2MjER7e3sxJCREfPjwoSiKohgSEiI2aNBAVCgUYu3atcVBgwaJ//7771PjKy4uFqdOnSra2tqKCoVC7Nq1qxgfH18ZXwXpIbn/fq9Zs0YEUGqbPn16JXwbRBUniGIFZogRERERvQAcEiIiIiLZY8JCREREsseEhYiIiGSPCQsRERHJHhMWIiIikj0mLERERCR7TFiIiIhI9piwEFUBQ4YMUXlCb+fOnTF69OgXHsfBgwchCAIyMjKe2kYQBGzfvr3cfYaFhaFFixYViuv69esQBAGxsbEV6oeIKg8TFiIdGTJkCARBgCAIMDIyQsOGDTFz5kwUFhZW+rW3bt2KWbNmlatteZIMIqLKVk3XARBVZd26dcOaNWuQl5eHXbt2ITg4GNWrV8ekSZNKtc3Pz4eRkZFWrlujRg2t9ENE9KKwwkKkQwqFAnZ2dnB0dMRHH30Eb29v/PbbbwD+G8aZM2cO7O3t0ahRIwDAzZs30b9/f1hZWaFGjRro06cPrl+/LvVZVFSEsWPHwsrKCjVr1sSECRPw5Bs4nhwSysvLw8SJE+Hg4ACFQoGGDRvi+++/x/Xr19GlSxcAgLW1NQRBwJAhQwAAxcXFmDt3LpycnGBiYoLmzZvj559/VrnOrl274OrqChMTE3Tp0kUlzvKaOHEiXF1dYWpqCmdnZ0ydOhUFBQWl2n3zzTdwcHCAqakp+vfvj8zMTJXj3333Hdzc3GBsbIzGjRtj+fLlGsdCRLrDhIVIRkxMTJCfny993r9/P+Lj4xEVFYXIyEgUFBTA19cXFhYWOHLk
CI4dOwZzc3N069ZNOm/BggWIiIjA6tWrcfToUaSnp6u89bcsgwcPxo8//oglS5bg8uXL+Oabb2Bubg4HBwf88ssvAB69zTclJQWLFy8GAMydOxfr1q3DypUrcfHiRYwZMwbvv/8+Dh06BOBRYuXv749evXohNjYWQUFB+PTTTzX+TiwsLBAREYFLly5h8eLFWLVqFRYtWqTSJiEhAVu2bMGOHTuwe/dunDt3Dh9//LF0fMOGDZg2bRrmzJmDy5cv4/PPP8fUqVOxdu1ajeMhIh3R8csXiaqswMBAsU+fPqIoPnozdFRUlKhQKMTx48dLx21tbcW8vDzpnPXr14uNGjUSi4uLpX15eXmiiYmJuGfPHlEURbFOnTpieHi4dLygoECsV6+edC1RFMVOnTqJn3zyiSiKohgfHy8CEKOiosqM88CBAyIA8d69e9K+3Nxc0dTUVDx+/LhK22HDhonvvfeeKIqiOGnSJNHd3V3l+MSJE0v19SQ88QbjJ82bN09s3bq19Hn69OmioaGheOvWLWnf77//LhoYGIgpKSmiKIpigwYNxI0bN6r0M2vWLNHT01MURVFMSkoSAYjnzp176nWJSLc4h4VIhyIjI2Fubo6CggIUFxdj4MCBCAsLk457eHiozFs5f/48EhISYGFhodJPbm4uEhMTkZmZiZSUFLRr1046Vq1aNbRp06bUsFCJ2NhYGBoaolOnTuWOOyEhAQ8ePMCbb76psj8/Px8tW7YEAFy+fFklDgDw9PQs9zVKbN68GUuWLEFiYiKys7NRWFgIpVKp0uaVV15B3bp1Va5TXFyM+Ph4WFhYIDExEcOGDcPw4cOlNoWFhbC0tNQ4HiLSDSYsRDrUpUsXrFixAkZGRrC3t0e1aqr/S5qZmal8zs7ORuvWrbFhw4ZSfdWuXfu5YjAxMdH4nOzsbADAzp07VRIF4NG8HG2JiYlBQEAAZsyYAV9fX1haWmLTpk1YsGCBxrGuWrWqVAJlaGiotViJqHIxYSHSITMzMzRs2LDc7Vu1aoXNmzfDxsamVJWhRJ06dXDy5El07NgRwKNKwtmzZ9GqVasy23t4eKC4uBiHDh2Ct7d3qeMlFZ6ioiJpn7u7OxQKBZKTk59amXFzc5MmEJc4ceLEs2/yMcePH4ejoyMmT54s7btx40apdsnJybh9+zbs7e2l6xgYGKBRo0awtbWFvb09rl27hoCAAI2uT0TywUm3RC+RgIAA1KpVC3369MGRI0eQlJSEgwcPYtSoUbh16xYA4JNPPsEXX3yB7du348qVK/j444/VPkOlfv36CAwMxAcffIDt27dLfW7ZsgUA4OjoCEEQEBkZiX/++QfZ2dmwsLDA+PHjMWbMGKxduxaJiYn4448/sHTpUmki64gRI3D16lWEhoYiPj4eGzduREREhEb36+LiguTkZGzatAmJiYlYsmRJmROIjY2NERgYiPPnz+PIkSMYNWoU+vfvDzs7OwDAjBkzMHfuXCxZsgR//fUX4uLisGbNGixcuFCjeIhId5iwEL1ETE1NcfjwYbzyyivw9/eHm5sbhg0bhtzcXKniMm7cOAwaNAiBgYHw9PSEhYUF3nrrLbX9rlixAm+//TY+/vhjNG7cGMOHD0dOTg4AoG7dupgxYwY+/fRT2NraIiQkBAAwa9YsTJ06FXPnzoWbmxu6deuGnTt3wsnJCcCjeSW//PILtm/fjubNm2PlypX4/PPPNbrf3r17Y8yYMQgJCUGLFi1w/PhxTJ06tVS7hg0bwt/fHz169ICPjw+aNWumsmw5KCgI3333HdasWQMPDw906tQJERERUqxEJH+C+LSZeEREREQywQoLERERyR4TFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHhIWIiIhkjwkLERERyR4TFiIiIpI9JixEREQke0xYiIiISPb+D8eEamDpGfNzAAAAAElFTkSuQmCC",
760
+ "text/plain": [
761
+ "<Figure size 640x480 with 2 Axes>"
762
+ ]
763
+ },
764
+ "metadata": {},
765
+ "output_type": "display_data"
766
+ }
767
+ ],
768
+ "source": [
769
+ "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
770
+ "import matplotlib.pyplot as plt\n",
771
+ "\n",
772
+ "# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
773
+ "cm = confusion_matrix(predictions.label_ids, preds)\n",
774
+ "\n",
775
+ "# 可视化混淆矩阵\n",
776
+ "disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
777
+ "disp.plot(cmap=plt.cm.Blues)\n",
778
+ "plt.title('Confusion Matrix')\n",
779
+ "plt.show()"
780
+ ]
781
+ },
782
  {
783
  "cell_type": "code",
784
  "execution_count": null,
785
+ "id": "23e3a640-88d7-4a1e-8515-7c417d50f018",
786
  "metadata": {},
787
  "outputs": [],
788
  "source": []
03-gene-task/2-structure-predict.ipynb CHANGED
@@ -1,9 +1,962 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": null,
6
- "id": "9de04383-42fa-493d-95ec-6295f65a1c39",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": []
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "212e1052-e0d9-404f-a4ee-db199a4c6d17",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 3.2 序列结构预测"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "0eb5d83c-8dd6-498b-adc9-1f74c97c3427",
14
+ "metadata": {},
15
+ "source": [
16
+ "蛋白质的结构可分为四级:\n",
17
+ "\n",
18
+ "1. 一级结构也就是氨基酸序列;\n",
19
+ "2. 二级结构是周期性的结构构象,比如α螺旋β折叠等\n",
20
+ "3. 三级结构是整条多肽链的三维空间结构\n",
21
+ "4. 四级结构是几个蛋白质分子形成的复合体结构,比如三聚体,四聚 体等\n",
22
+ "\n",
23
+ "\n",
24
+ "二级结构(Secondary Structure)是指生物大分子如蛋白质和核酸(RNA 和 DNA)中局部的、有规则的空间构象。这些结构是由分子内的一些化学键或相互作用稳定下来的,但不涉及整个分子的整体折叠状态。以下是关于蛋白质和 RNA 二级结构的简单介绍:\n",
25
+ "\n",
26
+ "### 蛋白质的二级结构\n",
27
+ "\n",
28
+ "蛋白质的二级结构主要由主链原子间的氢键形成,具体包括以下几种常见的类型:\n",
29
+ "\n",
30
+ "1. **α-螺旋 (Alpha Helix)**\n",
31
+ " - **描述**:一个右手螺旋结构,每个氨基酸残基沿螺旋轴旋转约 100 度,并沿着轴向上移动约 1.5 Å。\n",
32
+ " - **特点**:通过相邻的肽键之间形成的氢键稳定,通常每 3.6 个氨基酸残基转一圈。\n",
33
+ "\n",
34
+ "2. **β-折叠片 (Beta Sheet)**\n",
35
+ " - **描述**:由多个几乎平行或反平行排列的多肽链组成,链间通过氢键连接。\n",
36
+ " - **特点**:可以是平行(所有链同向)或反平行(相邻链方向相反),提供了高度刚性的平面结构。\n",
37
+ "\n",
38
+ "3. **转角 (Turns)**\n",
39
+ " - **描述**:短的序列片段,通常包含 3 到 4 个氨基酸残基,用于改变多肽链的方向。\n",
40
+ " - **特点**:最常见的类型是 β-转角(beta turn),它使得链可以在空间上回折。\n",
41
+ "\n",
42
+ "4. **无规则卷曲 (Random Coil)**\n",
43
+ " - **描述**:没有固定模式的区域,可能是由于缺乏足够的氢键或其他稳定力。\n",
44
+ " - **特点**:虽然称为“无规则”,但实际上可能在特定环境下具有功能性意义。\n",
45
+ "\n",
46
+ "\n",
47
+ "<img src=\"img/protein-structure-1.png\" width=\"500px\" />\n",
48
+ "\n",
49
+ "蛋白质的二级结构经常用图形来形象的描述。比如下图中黄色的箭头代表对应的氨基酸 具有β折片结构。波浪线代表螺旋结构,小鼓包是转角。此外,以字母形式书写的二级结构序列能够更加精准的描述。\n",
50
+ "其中,E 代表β折叠,H 代表α螺旋,T 代表转角。没有写任何字母的地方是松散的 coil 结构。很多序列预测数据集中,一般不区分转角和coil结构。\n",
51
+ "\n",
52
+ "\n",
53
+ "<img src=\"img/protein-structure-2.png\" width=\"500px\" />\n"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "markdown",
58
+ "id": "c90a583c-f6a5-4a41-8e7e-da27b7e95c50",
59
+ "metadata": {},
60
+ "source": [
61
+ "获得实验测定的蛋白质或 RNA 的二级结构数据,通常需要依赖于实验室技术和公共数据库中已发表的实验结果。以下是一些常用的资源和方法,帮助你获取经过实验验证的二级结构数据:\n",
62
+ "\n",
63
+ "### 1. **蛋白质二级结构数据**\n",
64
+ "\n",
65
+ "#### a. **PDB (Protein Data Bank)**\n",
66
+ "\n",
67
+ "- **网址**:[RCSB PDB](https://www.rcsb.org/)\n",
68
+ "- **特点**:PDB 是一个全球性的生物大分子结构数据库,包含通过 X 射线晶体学、核磁共振(NMR)和冷冻电镜(Cryo-EM)等实验方法测定的蛋白质三维结构。\n",
69
+ "- **使用方法**:\n",
70
+ " - 搜索特定蛋白质的 PDB ID 或名称。\n",
71
+ " - 查看详细条目页面,其中包含了蛋白质的三级结构信息,可以通过可视化工具如 PyMOL 或 Chimera 来观察二级结构元素(如 α-螺旋、β-折叠片等)。\n",
72
+ "\n",
73
+ "<img src=\"img/pdb1.png\" width=\"600px\" />\n",
74
+ "\n",
75
+ "From https://www.rcsb.org/sequence/9rsa\n",
76
+ "\n",
77
+ "#### b. **PDBe (Protein Data Bank in Europe)**\n",
78
+ "\n",
79
+ "- **网址**:[PDBe](https://www.ebi.ac.uk/pdbe/)\n",
80
+ "- **特点**:PDBe 是欧洲的 PDB 镜像站点,提供了与 RCSB PDB 类似的功能,并且有额外的分析工具和注释信息。\n",
81
+ "- **使用方法**:\n",
82
+ " - 搜索蛋白质的 PDB ID 或名称。\n",
83
+ " - 使用 PDBe-KB 和其他工具来获取详细的结构信息和二级结构注释。\n",
84
+ "\n",
85
+ "#### c. **Biomolecule Structure Knowledgebase (BSK)**\n",
86
+ "\n",
87
+ "- **网址**:[BSK](https://bsk.pdbj.org/)\n",
88
+ "- **特点**:BSK 是日本的 PDB 镜像站点,同样提供丰富的结构数据和分析工具。\n",
89
+ "- **使用方法**:\n",
90
+ " - 搜索蛋白质的 PDB ID 或名称。\n",
91
+ " - 浏览条目以获取详细的结构信息和二级结构注释。\n",
92
+ "\n",
93
+ "\n",
94
+ "\n",
95
+ "### 3. **实验方法**\n",
96
+ "\n",
97
+ "如果你需要最新的或特定条件下的二级结构数据,可能需要参考文献中的实验方法。以下是一些常见的实验技术:\n",
98
+ "\n",
99
+ "#### a. **X 射线晶体学**\n",
100
+ "\n",
101
+ "- **原理**:通过解析蛋白质或 RNA 晶体的衍射图案来确定其三维结构。\n",
102
+ "- **应用**:适用于能够形成稳定晶体的分子。\n",
103
+ "\n",
104
+ "#### b. **核磁共振(NMR)**\n",
105
+ "\n",
106
+ "- **原理**:利用核磁共振波谱技术来确定溶液状态下分子的结构。\n",
107
+ "- **应用**:适用于较小的蛋白质和 RNA 分子。\n",
108
+ "\n",
109
+ "#### c. **冷冻电镜(Cryo-EM)**\n",
110
+ "\n",
111
+ "- **原理**:通过低温冷冻样品并在电子显微镜下成像来确定分子结构。\n",
112
+ "- **应用**:适用于较大的复合物和难以结晶的分子。\n",
113
+ "\n",
114
+ "\n",
115
+ "\n",
116
+ "### 4. **文献检索**\n",
117
+ "\n",
118
+ "#### a. **PubMed**\n",
119
+ "\n",
120
+ "- **网址**:[PubMed](https://pubmed.ncbi.nlm.nih.gov/)\n",
121
+ "- **特点**:PubMed 是一个广泛使用的生物医学文献数据库,提供了大量关于蛋白质和 RNA 功能及结构的研究论文。\n",
122
+ "- **使用方法**:\n",
123
+ " - 使用关键词搜索与特定蛋白质或 RNA 相关的实验研究。\n",
124
+ " - 阅读论文以获取详细的实验数据和二级结构描述。\n",
125
+ "\n",
126
+ "### 总结\n",
127
+ "\n",
128
+ "获得实验测定的蛋白质或 RNA 的二级结构数据主要依赖于公共数据库如 PDB 和 NDB,这些数据库收录了通过多种实验方法测定的结构信息。此外,查阅相关文献也是一种重要的途径,可以找到最新的或特定条件下的实验结果。对于具体的实验方法,如 X 射线晶体学、NMR 和 Cryo-EM 等,它们各自有适用的场景和优势。\n"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "markdown",
133
+ "id": "1cadfd11-2130-429d-848f-39371356ca10",
134
+ "metadata": {},
135
+ "source": [
136
+ "## 整理好的数据\n",
137
+ "\n",
138
+ "https://huggingface.co/datasets/proteinea/secondary_structure_prediction\n",
139
+ "\n",
140
+ "<img src=\"img/ds_structure.png\" width=\"600px\" />\n",
141
+ "\n",
142
+ "https://huggingface.co/datasets/genbio-ai/rna-secondary-structure-prediction"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": 24,
148
+ "id": "134a72e3-597a-446e-9193-d060a6e677f6",
149
+ "metadata": {},
150
+ "outputs": [
151
+ {
152
+ "data": {
153
+ "text/plain": [
154
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
155
+ ]
156
+ },
157
+ "execution_count": 24,
158
+ "metadata": {},
159
+ "output_type": "execute_result"
160
+ }
161
+ ],
162
+ "source": [
163
+ "import subprocess\n",
164
+ "import os\n",
165
+ "# 设置环境变量, autodl一般区域\n",
166
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
167
+ "output = result.stdout\n",
168
+ "for line in output.splitlines():\n",
169
+ " if '=' in line:\n",
170
+ " var, value = line.split('=', 1)\n",
171
+ " os.environ[var] = value\n",
172
+ "\n",
173
+ "\"\"\"\n",
174
+ "import os\n",
175
+ "\n",
176
+ "# 设置环境变量, autodl专区 其他idc\n",
177
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
178
+ "\n",
179
+ "# 打印环境变量以确认设置成功\n",
180
+ "print(os.environ.get('HF_ENDPOINT'))\n",
181
+ "\"\"\""
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 25,
187
+ "id": "b43dd5f2-6b23-4b51-ad04-7b7ded732cb7",
188
+ "metadata": {},
189
+ "outputs": [],
190
+ "source": [
191
+ "from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer\n",
192
+ "from tokenizers import Tokenizer\n",
193
+ "from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
194
+ "from transformers import AutoModelForTokenClassification \n",
195
+ "from transformers import DataCollatorWithPadding"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 26,
201
+ "id": "4c66fa5b-b8b8-4dfd-ada1-32ed9e690c33",
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "#set tokenizer,dna protein \n",
206
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
207
+ "tokenizer.pad_token = tokenizer.eos_token"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 27,
213
+ "id": "70a3fd79-48bf-4452-a7ee-689f1b11e987",
214
+ "metadata": {},
215
+ "outputs": [],
216
+ "source": [
217
+ "from datasets import load_dataset\n",
218
+ "# 1. load ~11k samples from promoters prediction dataset\n",
219
+ "dataset = load_dataset(\"proteinea/secondary_structure_prediction\")['train'].train_test_split(test_size=0.1)"
220
+ ]
221
+ },
222
+ {
223
+ "cell_type": "code",
224
+ "execution_count": 28,
225
+ "id": "13cd141e-98c3-47da-8e21-cba5576707fe",
226
+ "metadata": {},
227
+ "outputs": [
228
+ {
229
+ "data": {
230
+ "text/plain": [
231
+ "DatasetDict({\n",
232
+ " train: Dataset({\n",
233
+ " features: ['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'],\n",
234
+ " num_rows: 9712\n",
235
+ " })\n",
236
+ " test: Dataset({\n",
237
+ " features: ['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'],\n",
238
+ " num_rows: 1080\n",
239
+ " })\n",
240
+ "})"
241
+ ]
242
+ },
243
+ "execution_count": 28,
244
+ "metadata": {},
245
+ "output_type": "execute_result"
246
+ }
247
+ ],
248
+ "source": [
249
+ "dataset"
250
+ ]
251
+ },
252
+ {
253
+ "cell_type": "code",
254
+ "execution_count": 29,
255
+ "id": "7936af74-3f5f-43c1-aa69-fd7b08989e24",
256
+ "metadata": {},
257
+ "outputs": [
258
+ {
259
+ "data": {
260
+ "text/plain": [
261
+ "{'input': 'MTQTQPVTPTPPASFQTQHDPRTRLGATPLPGGAGTRFRLWTSTARTVAVRVNGTEHVMTSLGGGIYELELPVGPGARYLFVLDGVPTPDPYARFLPDGVHGEAEVVDFGTFDWTDADWHGIKLADCVFYEVHVGTFTPEGTYRAAAEKLPYLKELGVTAIQVMPLAAFDGQRGWGYDGAAFYAPYAPYGRPEDLMALVDAAHRLGLGVFLDVVYNHFGPSGNYLSSYAPSYFTDRFSSAWGMGLDYAEPHMRRYVTGNARMWLRDYHFDGLRLDATPYMTDDSETHILTELAQEIHELGGTHLLLAEDHRNLPDLVTVNHLDGIWTDDFHHETRVTLTGEQEGYYAGYRGGAEALAYTIRRGWRYEGQFWAVKGEEHERGHPSDALEAPNFVYCIQNHDQIGNRPLGERLHQSDGVTLHEYRGAAALLLTLPMTPLLFQGQEWAASTPFQFFSDHAGELGQAVSEGRKKEFGGFSGFSGEDVPDPQAEQTFLNSKLNWAEREGGEHARTLRLYRDLLRLRREDPVLHNRQRENLTTGHDGDVLWVRTVTGAGERVLLWNLGQDTRAVAEVKLPFTVPRRLLLHTEGREDLTLGAGEAVLVG',\n",
262
+ " 'dssp3': 'CCCCCCCCCCCCCCCCCCCCHHHCCEEEECHHHCCEEEEEECCCCCCEEEEECCEEEECEEEECCEEEEEECCCCCCEEEEEECCEEECCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCHHHCCEEEECHHHHCCCCCHHHHHHCHHHHHHHCCCEEEECCCEECCCCCCCCCCCCEEEEECHHHCCHHHHHHHHHHHHHCCCEEEEEECCCCCCCCCCCHHHHCHHHEEEEEECCCCEEECCCCHHHHHHHHHHHHHHHHHHCCCEEEECCHHHCCCCCCCCHHHHHHHHHHCCCCCCEEEEECCCCCCHHHHCCCCCEEECCHHHHHHHHHHHCCCCHHHHHCCCCHHHHHHHHHHCCCCEEEEECCCCCCEEEECCCCCCCHHHEEEECCCHHHHHCCCCCCCHHHCCCCCHHHHHHHHHHHHHCCCEEEEECCHHHCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCHHHHHCCCCCCHHHHCHHHHHHHHHHHHHHHHHHHCCCCCCCCHHHEEEEEECCEEEEEEEECCEEEEEEEECCCCCEEHHHCCCCCCCCCCEEEECCCCCCCEECCCCEEEEC',\n",
263
+ " 'dssp8': 'CCCCCCCCCCCCCCCCCSCCGGGCSEEEECGGGCCEEEEEECSSCSSEEEEETTEEEECEEEETTEEEEEESCCTTCEEEEEETTEEECCTTCSCCTTCTTSCEECCCTTSSCCCCTTCCCCCGGGCCEEEECHHHHSSSCSHHHHHHTHHHHHHHTCCEEEECCCEECSSSCCCSTTCCEEEEECGGGCCHHHHHHHHHHHHHTTCEEEEEECCSCCCSSSCCHHHHCGGGEEEEEECSSSEEECTTSHHHHHHHHHHHHIIIIIHCCSEEEETTGGGCCCCSSSCHHHHHHHHHHTTCSCCEEEEECSSCCTHHHHTTCCSEEECTHHHHHHHHHHHCCCSGGGGGCCCSHHHHHHHHHHSSSCEEEEECCTTCCEEEECCCTTCCGGGEEEESCCHHHHHTSTTCCCGGGSTTCCHHHHHHHHHHHHHSSSEEEEETTGGGTCSSCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCTTSHHHHHTTSCCSGGGGSHHHHHHHHHHHHHHHHHHHCTTTTCCCGGGEEEEEETTEEEEEEEETTEEEEEEEECSSSCEEGGGSCCSSCCCCCEEEETTCCSSSEECTTCEEEEC',\n",
264
+ " 'disorder': '0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0',\n",
265
+ " 'cb513_mask': '1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 
1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0'}"
266
+ ]
267
+ },
268
+ "execution_count": 29,
269
+ "metadata": {},
270
+ "output_type": "execute_result"
271
+ }
272
+ ],
273
+ "source": [
274
+ "dataset[\"train\"][0]"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 30,
280
+ "id": "47b1ac0c-e934-4ac3-b869-509515b15aa1",
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "name": "stdout",
285
+ "output_type": "stream",
286
+ "text": [
287
+ "dna datasets mean token lenght 96.07685185185186 min token length 7 max token length 576\n"
288
+ ]
289
+ }
290
+ ],
291
+ "source": [
292
+ "token_len_list = []\n",
293
+ "for item in dataset[\"test\"]:\n",
294
+ " inputs = tokenizer.tokenize(item[\"input\"])\n",
295
+ " token_len_list.append( len(inputs) )\n",
296
+ "\n",
297
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
298
+ "min_len = min(token_len_list)\n",
299
+ "max_len = max(token_len_list)\n",
300
+ "\n",
301
+ "print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": 31,
307
+ "id": "1b32de6e-fe08-426e-983e-7dd157c9af62",
308
+ "metadata": {},
309
+ "outputs": [
310
+ {
311
+ "name": "stdout",
312
+ "output_type": "stream",
313
+ "text": [
314
+ "Number of unique labels: 3\n",
315
+ "Label to ID mapping: {'C': 0, 'H': 1, 'E': 2, '<pad>': 3}\n"
316
+ ]
317
+ }
318
+ ],
319
+ "source": [
320
+ "from collections import Counter\n",
321
+ "\n",
322
+ "# Confirm the number of labels and create a mapping from string labels to integer IDs.\n",
323
+ "all_labels = [label for item in dataset[\"train\"] for label in item[\"dssp3\"]]\n",
324
+ "label_counts = Counter(all_labels)\n",
325
+ "num_labels = len(label_counts)\n",
326
+ "\n",
327
+ "# Define a special ID for padding. Make sure this ID is not used by any actual label.\n",
328
+ "# If you have 3 classes, start with 3 or higher.\n",
329
+ "pad_token_label_id = num_labels # Assuming no other labels have this ID.\n",
330
+ "\n",
331
+ "label_to_id = {label: i for i, (label, _) in enumerate(label_counts.items())}\n",
332
+ "label_to_id['<pad>'] = pad_token_label_id # Add padding token to the mapping.\n",
333
+ "id_to_label = {v: k for k, v in label_to_id.items()}\n",
334
+ "\n",
335
+ "print(f\"Number of unique labels: {num_labels}\")\n",
336
+ "print(\"Label to ID mapping:\", label_to_id)"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "code",
341
+ "execution_count": 32,
342
+ "id": "2bd65f47-3325-4357-a896-9a0abf160e8a",
343
+ "metadata": {},
344
+ "outputs": [
345
+ {
346
+ "name": "stderr",
347
+ "output_type": "stream",
348
+ "text": [
349
+ "Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
350
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
351
+ ]
352
+ }
353
+ ],
354
+ "source": [
355
+ "#set model\n",
356
+ "#model = AutoModelForTokenClassification.from_pretrained('dnagpt/gene_eng_gpt2_v0', )\n",
357
+ "model = AutoModelForTokenClassification.from_pretrained(\n",
358
+ " 'dnagpt/gene_eng_gpt2_v0',\n",
359
+ " num_labels=num_labels + 1, # Include the padding label in the count.\n",
360
+ " id2label=id_to_label,\n",
361
+ " label2id=label_to_id\n",
362
+ ")"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": 33,
368
+ "id": "e247ac1e-bcd4-4aaf-9f91-dc939e5abe89",
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "# 5. Preprocess the data\n",
373
+ "from transformers import DataCollatorForTokenClassification\n",
374
+ "import torch\n",
375
+ "# Define the maximum sequence length based on your model or dataset requirements.\n",
376
+ "max_seq_length = 128 # Adjust this value as needed.\n",
377
+ "\n",
378
+ "def preprocess_function(examples):\n",
379
+ " tokenized_inputs = tokenizer(\n",
380
+ " examples[\"input\"], \n",
381
+ " truncation=True, \n",
382
+ " padding='max_length', \n",
383
+ " max_length=max_seq_length,\n",
384
+ " return_tensors=\"pt\" # Return PyTorch tensors directly.\n",
385
+ " )\n",
386
+ " \n",
387
+ " labels = []\n",
388
+ " for label in examples['dssp3']:\n",
389
+ " label_ids = [label_to_id[l] if l in label_to_id else pad_token_label_id for l in label]\n",
390
+ " # Ensure labels are padded/truncated to the same length as inputs.\n",
391
+ " if len(label_ids) > max_seq_length:\n",
392
+ " label_ids = label_ids[:max_seq_length]\n",
393
+ " else:\n",
394
+ " label_ids = label_ids + [pad_token_label_id] * (max_seq_length - len(label_ids))\n",
395
+ " \n",
396
+ " labels.append(label_ids)\n",
397
+ " \n",
398
+ " tokenized_inputs[\"labels\"] = torch.tensor(labels)\n",
399
+ "\n",
400
+ " return tokenized_inputs"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 34,
406
+ "id": "8144d093-e8d3-41ff-ae4f-82aa1f28d689",
407
+ "metadata": {},
408
+ "outputs": [
409
+ {
410
+ "data": {
411
+ "application/vnd.jupyter.widget-view+json": {
412
+ "model_id": "707978d4f8304cada1041f8e794d79b7",
413
+ "version_major": 2,
414
+ "version_minor": 0
415
+ },
416
+ "text/plain": [
417
+ "Map: 0%| | 0/9712 [00:00<?, ? examples/s]"
418
+ ]
419
+ },
420
+ "metadata": {},
421
+ "output_type": "display_data"
422
+ },
423
+ {
424
+ "data": {
425
+ "application/vnd.jupyter.widget-view+json": {
426
+ "model_id": "7ab7ae3ed05244bab1fe13050aad3764",
427
+ "version_major": 2,
428
+ "version_minor": 0
429
+ },
430
+ "text/plain": [
431
+ "Map: 0%| | 0/1080 [00:00<?, ? examples/s]"
432
+ ]
433
+ },
434
+ "metadata": {},
435
+ "output_type": "display_data"
436
+ }
437
+ ],
438
+ "source": [
439
+ "tokenized_datasets = dataset.map(preprocess_function, batched=True)"
440
+ ]
441
+ },
442
+ {
443
+ "cell_type": "code",
444
+ "execution_count": 35,
445
+ "id": "de5067da-a010-4e0d-b99b-659ee2d3cf3c",
446
+ "metadata": {},
447
+ "outputs": [],
448
+ "source": [
449
+ "# Remove columns that are not required by the model.\n",
450
+ "columns_to_remove = ['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask']\n",
451
+ "tokenized_datasets.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"], output_all_columns=True)"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 36,
457
+ "id": "fcce6f4e-9716-4fe4-9250-e201a442bbbc",
458
+ "metadata": {},
459
+ "outputs": [],
460
+ "source": [
461
+ "# Set up data collator for handling padding during batching.\n",
462
+ "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=8, label_pad_token_id=pad_token_label_id)"
463
+ ]
464
+ },
465
+ {
466
+ "cell_type": "code",
467
+ "execution_count": 37,
468
+ "id": "fa3e62b3-dba4-4cef-9bb7-de410f4bb444",
469
+ "metadata": {},
470
+ "outputs": [
471
+ {
472
+ "name": "stderr",
473
+ "output_type": "stream",
474
+ "text": [
475
+ "/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
476
+ " warnings.warn(\n",
477
+ "/tmp/ipykernel_1443/204012889.py:41: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
478
+ " trainer = Trainer(\n"
479
+ ]
480
+ }
481
+ ],
482
+ "source": [
483
+ "# 6. Prepare training\n",
484
+ "import evaluate\n",
485
+ "import numpy as np\n",
486
+ "\n",
487
+ "metric = evaluate.load(\"seqeval\")\n",
488
+ "\n",
489
+ "def compute_metrics(p):\n",
490
+ " predictions, labels = p\n",
491
+ " predictions = np.argmax(predictions, axis=2)\n",
492
+ "\n",
493
+ " # Remove ignored index (special tokens)\n",
494
+ " true_predictions = [\n",
495
+ " [id_to_label[p] for (p, l) in zip(prediction, label) if l != pad_token_label_id]\n",
496
+ " for prediction, label in zip(predictions, labels)\n",
497
+ " ]\n",
498
+ " true_labels = [\n",
499
+ " [id_to_label[l] for (p, l) in zip(prediction, label) if l != pad_token_label_id]\n",
500
+ " for prediction, label in zip(predictions, labels)\n",
501
+ " ]\n",
502
+ "\n",
503
+ " results = metric.compute(predictions=true_predictions, references=true_labels)\n",
504
+ " return {\n",
505
+ " \"precision\": results[\"overall_precision\"],\n",
506
+ " \"recall\": results[\"overall_recall\"],\n",
507
+ " \"f1\": results[\"overall_f1\"],\n",
508
+ " \"accuracy\": results[\"overall_accuracy\"],\n",
509
+ " }\n",
510
+ " \n",
511
+ "training_args = TrainingArguments(\n",
512
+ " output_dir=\"./results\",\n",
513
+ " evaluation_strategy=\"epoch\",\n",
514
+ " learning_rate=1e-5,\n",
515
+ " lr_scheduler_type=\"constant_with_warmup\",\n",
516
+ " optim='adamw_torch',\n",
517
+ " per_device_train_batch_size=16,\n",
518
+ " per_device_eval_batch_size=16,\n",
519
+ " num_train_epochs=20,\n",
520
+ " weight_decay=0.01,\n",
521
+ ")\n",
522
+ "\n",
523
+ "trainer = Trainer(\n",
524
+ " model=model,\n",
525
+ " args=training_args,\n",
526
+ " train_dataset=tokenized_datasets[\"train\"],\n",
527
+ " eval_dataset=tokenized_datasets[\"test\"],\n",
528
+ " tokenizer=tokenizer,\n",
529
+ " data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
530
+ " compute_metrics=compute_metrics,\n",
531
+ ")"
532
+ ]
533
+ },
534
+ {
535
+ "cell_type": "code",
536
+ "execution_count": 38,
537
+ "id": "8a76f326-1097-47bb-bb9d-03b77c4f8f4f",
538
+ "metadata": {},
539
+ "outputs": [
540
+ {
541
+ "data": {
542
+ "text/html": [
543
+ "\n",
544
+ " <div>\n",
545
+ " \n",
546
+ " <progress value='8001' max='12140' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
547
+ " [ 8001/12140 09:41 < 05:00, 13.76 it/s, Epoch 13.18/20]\n",
548
+ " </div>\n",
549
+ " <table border=\"1\" class=\"dataframe\">\n",
550
+ " <thead>\n",
551
+ " <tr style=\"text-align: left;\">\n",
552
+ " <th>Epoch</th>\n",
553
+ " <th>Training Loss</th>\n",
554
+ " <th>Validation Loss</th>\n",
555
+ " <th>Precision</th>\n",
556
+ " <th>Recall</th>\n",
557
+ " <th>F1</th>\n",
558
+ " <th>Accuracy</th>\n",
559
+ " </tr>\n",
560
+ " </thead>\n",
561
+ " <tbody>\n",
562
+ " <tr>\n",
563
+ " <td>1</td>\n",
564
+ " <td>1.102200</td>\n",
565
+ " <td>0.923186</td>\n",
566
+ " <td>0.314843</td>\n",
567
+ " <td>0.125521</td>\n",
568
+ " <td>0.179485</td>\n",
569
+ " <td>0.503214</td>\n",
570
+ " </tr>\n",
571
+ " <tr>\n",
572
+ " <td>2</td>\n",
573
+ " <td>0.942100</td>\n",
574
+ " <td>0.883362</td>\n",
575
+ " <td>0.357413</td>\n",
576
+ " <td>0.153466</td>\n",
577
+ " <td>0.214731</td>\n",
578
+ " <td>0.521366</td>\n",
579
+ " </tr>\n",
580
+ " <tr>\n",
581
+ " <td>3</td>\n",
582
+ " <td>0.898500</td>\n",
583
+ " <td>0.895442</td>\n",
584
+ " <td>0.355443</td>\n",
585
+ " <td>0.194234</td>\n",
586
+ " <td>0.251199</td>\n",
587
+ " <td>0.522545</td>\n",
588
+ " </tr>\n",
589
+ " <tr>\n",
590
+ " <td>4</td>\n",
591
+ " <td>0.870200</td>\n",
592
+ " <td>0.891230</td>\n",
593
+ " <td>0.367170</td>\n",
594
+ " <td>0.050731</td>\n",
595
+ " <td>0.089145</td>\n",
596
+ " <td>0.526761</td>\n",
597
+ " </tr>\n",
598
+ " <tr>\n",
599
+ " <td>5</td>\n",
600
+ " <td>0.831900</td>\n",
601
+ " <td>0.890030</td>\n",
602
+ " <td>0.373252</td>\n",
603
+ " <td>0.197096</td>\n",
604
+ " <td>0.257971</td>\n",
605
+ " <td>0.530358</td>\n",
606
+ " </tr>\n",
607
+ " <tr>\n",
608
+ " <td>6</td>\n",
609
+ " <td>0.815800</td>\n",
610
+ " <td>0.867876</td>\n",
611
+ " <td>0.378696</td>\n",
612
+ " <td>0.236628</td>\n",
613
+ " <td>0.291262</td>\n",
614
+ " <td>0.540153</td>\n",
615
+ " </tr>\n",
616
+ " <tr>\n",
617
+ " <td>7</td>\n",
618
+ " <td>0.800900</td>\n",
619
+ " <td>0.873521</td>\n",
620
+ " <td>0.380925</td>\n",
621
+ " <td>0.212640</td>\n",
622
+ " <td>0.272927</td>\n",
623
+ " <td>0.544393</td>\n",
624
+ " </tr>\n",
625
+ " <tr>\n",
626
+ " <td>8</td>\n",
627
+ " <td>0.785100</td>\n",
628
+ " <td>0.872138</td>\n",
629
+ " <td>0.385372</td>\n",
630
+ " <td>0.156363</td>\n",
631
+ " <td>0.222462</td>\n",
632
+ " <td>0.547684</td>\n",
633
+ " </tr>\n",
634
+ " <tr>\n",
635
+ " <td>9</td>\n",
636
+ " <td>0.774100</td>\n",
637
+ " <td>0.885855</td>\n",
638
+ " <td>0.384813</td>\n",
639
+ " <td>0.180280</td>\n",
640
+ " <td>0.245531</td>\n",
641
+ " <td>0.549681</td>\n",
642
+ " </tr>\n",
643
+ " <tr>\n",
644
+ " <td>10</td>\n",
645
+ " <td>0.750800</td>\n",
646
+ " <td>0.884582</td>\n",
647
+ " <td>0.388464</td>\n",
648
+ " <td>0.206529</td>\n",
649
+ " <td>0.269681</td>\n",
650
+ " <td>0.555933</td>\n",
651
+ " </tr>\n",
652
+ " <tr>\n",
653
+ " <td>11</td>\n",
654
+ " <td>0.737500</td>\n",
655
+ " <td>0.886323</td>\n",
656
+ " <td>0.396929</td>\n",
657
+ " <td>0.202713</td>\n",
658
+ " <td>0.268369</td>\n",
659
+ " <td>0.557624</td>\n",
660
+ " </tr>\n",
661
+ " <tr>\n",
662
+ " <td>12</td>\n",
663
+ " <td>0.731000</td>\n",
664
+ " <td>0.878285</td>\n",
665
+ " <td>0.365956</td>\n",
666
+ " <td>0.315728</td>\n",
667
+ " <td>0.338991</td>\n",
668
+ " <td>0.555857</td>\n",
669
+ " </tr>\n",
670
+ " <tr>\n",
671
+ " <td>13</td>\n",
672
+ " <td>0.708900</td>\n",
673
+ " <td>0.912278</td>\n",
674
+ " <td>0.377030</td>\n",
675
+ " <td>0.249346</td>\n",
676
+ " <td>0.300174</td>\n",
677
+ " <td>0.555030</td>\n",
678
+ " </tr>\n",
679
+ " </tbody>\n",
680
+ "</table><p>"
681
+ ],
682
+ "text/plain": [
683
+ "<IPython.core.display.HTML object>"
684
+ ]
685
+ },
686
+ "metadata": {},
687
+ "output_type": "display_data"
688
+ },
689
+ {
690
+ "name": "stderr",
691
+ "output_type": "stream",
692
+ "text": [
693
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
694
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
695
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
696
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
697
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
698
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
699
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
700
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
701
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
702
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
703
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
704
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
705
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
706
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
707
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
708
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
709
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
710
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
711
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
712
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
713
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
714
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
715
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
716
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
717
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
718
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
719
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
720
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
721
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
722
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
723
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
724
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
725
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
726
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
727
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
728
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
729
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
730
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
731
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
732
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
733
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
734
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
735
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
736
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
737
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
738
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
739
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
740
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
741
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
742
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
743
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
744
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
745
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
746
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
747
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
748
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
749
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
750
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
751
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
752
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
753
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
754
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
755
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
756
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
757
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
758
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
759
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
760
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
761
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
762
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
763
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
764
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
765
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
766
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
767
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
768
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
769
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
770
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
771
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
772
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
773
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
774
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
775
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
776
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
777
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
778
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
779
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
780
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
781
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
782
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
783
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
784
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
785
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
786
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
787
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
788
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
789
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
790
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
791
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
792
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
793
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
794
+ " warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
795
+ "/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
796
+ " _warn_prf(average, modifier, msg_start, len(result))\n"
797
+ ]
798
+ },
799
+ {
800
+ "ename": "RuntimeError",
801
+ "evalue": "[enforce fail at inline_container.cc:595] . unexpected pos 1216226560 vs 1216226452",
802
+ "output_type": "error",
803
+ "traceback": [
804
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
805
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
806
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:628\u001b[0m, in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _open_zipfile_writer(f) \u001b[38;5;28;01mas\u001b[39;00m opened_zipfile:\n\u001b[0;32m--> 628\u001b[0m \u001b[43m_save\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_protocol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_disable_byteorder_record\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n",
807
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:862\u001b[0m, in \u001b[0;36m_save\u001b[0;34m(obj, zip_file, pickle_module, pickle_protocol, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 861\u001b[0m num_bytes \u001b[38;5;241m=\u001b[39m storage\u001b[38;5;241m.\u001b[39mnbytes()\n\u001b[0;32m--> 862\u001b[0m \u001b[43mzip_file\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite_record\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_bytes\u001b[49m\u001b[43m)\u001b[49m\n",
808
+ "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:764] . PytorchStreamWriter failed writing file data/94: file write failed",
809
+ "\nDuring handling of the above exception, another exception occurred:\n",
810
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
811
+ "Cell \u001b[0;32mIn[38], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Start training\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
812
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:2164\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 2162\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 2163\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2165\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2166\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2167\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2168\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2169\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
813
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:2591\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2589\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mepoch \u001b[38;5;241m=\u001b[39m epoch \u001b[38;5;241m+\u001b[39m (step \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m steps_skipped) \u001b[38;5;241m/\u001b[39m steps_in_epoch\n\u001b[1;32m 2590\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[0;32m-> 2591\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_log_save_evaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2592\u001b[0m \u001b[43m \u001b[49m\u001b[43mtr_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_norm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_time\u001b[49m\n\u001b[1;32m 2593\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2594\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2595\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_substep_end(args, 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n",
814
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:3056\u001b[0m, in \u001b[0;36mTrainer._maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time)\u001b[0m\n\u001b[1;32m 3053\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol\u001b[38;5;241m.\u001b[39mshould_save \u001b[38;5;241m=\u001b[39m is_new_best_metric\n\u001b[1;32m 3055\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol\u001b[38;5;241m.\u001b[39mshould_save:\n\u001b[0;32m-> 3056\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_checkpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3057\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_save(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n",
815
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:3192\u001b[0m, in \u001b[0;36mTrainer._save_checkpoint\u001b[0;34m(self, model, trial)\u001b[0m\n\u001b[1;32m 3188\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_model(output_dir, _internal_call\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 3190\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39msave_only_model:\n\u001b[1;32m 3191\u001b[0m \u001b[38;5;66;03m# Save optimizer and scheduler\u001b[39;00m\n\u001b[0;32m-> 3192\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_optimizer_and_scheduler\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[38;5;66;03m# Save RNG state\u001b[39;00m\n\u001b[1;32m 3194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_save_rng_state(output_dir)\n",
816
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:3313\u001b[0m, in \u001b[0;36mTrainer._save_optimizer_and_scheduler\u001b[0;34m(self, output_dir)\u001b[0m\n\u001b[1;32m 3308\u001b[0m save_fsdp_optimizer(\n\u001b[1;32m 3309\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfsdp_plugin, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptimizer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, output_dir\n\u001b[1;32m 3310\u001b[0m )\n\u001b[1;32m 3311\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n\u001b[1;32m 3312\u001b[0m \u001b[38;5;66;03m# deepspeed.save_checkpoint above saves model/optim/sched\u001b[39;00m\n\u001b[0;32m-> 3313\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstate_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mOPTIMIZER_NAME\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3315\u001b[0m \u001b[38;5;66;03m# Save SCHEDULER & SCALER\u001b[39;00m\n\u001b[1;32m 3316\u001b[0m is_deepspeed_custom_scheduler \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_deepspeed_enabled \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\n\u001b[1;32m 3317\u001b[0m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlr_scheduler, DeepSpeedSchedulerWrapper\n\u001b[1;32m 3318\u001b[0m )\n",
817
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:627\u001b[0m, in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 624\u001b[0m _check_save_filelike(f)\n\u001b[1;32m 626\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _use_new_zipfile_serialization:\n\u001b[0;32m--> 627\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_open_zipfile_writer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mas\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43m_save\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_protocol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_disable_byteorder_record\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mreturn\u001b[39;49;00m\n",
818
+ "File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:475\u001b[0m, in \u001b[0;36m_open_zipfile_writer_file.__exit__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfile_like\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite_end_of_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_stream \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 477\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_stream\u001b[38;5;241m.\u001b[39mclose()\n",
819
+ "\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:595] . unexpected pos 1216226560 vs 1216226452"
820
+ ]
821
+ }
822
+ ],
823
+ "source": [
824
+ "# Start training\n",
825
+ "trainer.train()"
826
+ ]
827
+ },
828
+ {
829
+ "cell_type": "code",
830
+ "execution_count": 39,
831
+ "id": "950c460f-5631-4c9a-819b-1e3ac484cc65",
832
+ "metadata": {},
833
+ "outputs": [
834
+ {
835
+ "data": {
836
+ "text/plain": [
837
+ "{'eval_loss': 0.9122781157493591,\n",
838
+ " 'eval_precision': 0.3770299145299145,\n",
839
+ " 'eval_recall': 0.2493464283190843,\n",
840
+ " 'eval_f1': 0.3001743716242079,\n",
841
+ " 'eval_accuracy': 0.5550300748427384}"
842
+ ]
843
+ },
844
+ "execution_count": 39,
845
+ "metadata": {},
846
+ "output_type": "execute_result"
847
+ }
848
+ ],
849
+ "source": [
850
+ "results = trainer.evaluate()\n",
851
+ "results"
852
+ ]
853
+ },
854
+ {
855
+ "cell_type": "code",
856
+ "execution_count": 40,
857
+ "id": "8174c1c6-a5bc-4fe3-8f9b-356625531e7d",
858
+ "metadata": {},
859
+ "outputs": [
860
+ {
861
+ "name": "stdout",
862
+ "output_type": "stream",
863
+ "text": [
864
+ ">>> Perplexity: 2.49\n"
865
+ ]
866
+ }
867
+ ],
868
+ "source": [
869
+ "import math\n",
870
+ "eval_results = trainer.evaluate()\n",
871
+ "print(f\">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
872
+ ]
873
+ },
874
+ {
875
+ "cell_type": "code",
876
+ "execution_count": 41,
877
+ "id": "6a22f131-9e5f-4125-942a-22d1b1e6373b",
878
+ "metadata": {},
879
+ "outputs": [
880
+ {
881
+ "data": {
882
+ "text/plain": [
883
+ "('./secondary_structure_model/tokenizer_config.json',\n",
884
+ " './secondary_structure_model/special_tokens_map.json',\n",
885
+ " './secondary_structure_model/vocab.json',\n",
886
+ " './secondary_structure_model/merges.txt',\n",
887
+ " './secondary_structure_model/added_tokens.json')"
888
+ ]
889
+ },
890
+ "execution_count": 41,
891
+ "metadata": {},
892
+ "output_type": "execute_result"
893
+ }
894
+ ],
895
+ "source": [
896
+ "# 保存模型\n",
897
+ "model.save_pretrained(\"./secondary_structure_model\")\n",
898
+ "tokenizer.save_pretrained(\"./secondary_structure_model\")"
899
+ ]
900
+ },
901
+ {
902
+ "cell_type": "code",
903
+ "execution_count": 42,
904
+ "id": "d5817a6c-c707-4005-9210-2a12ff0d43b0",
905
+ "metadata": {},
906
+ "outputs": [],
907
+ "source": [
908
+ "# 加载模型\n",
909
+ "model = AutoModelForTokenClassification.from_pretrained(\"./secondary_structure_model\")\n",
910
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"./secondary_structure_model\")"
911
+ ]
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "execution_count": 43,
916
+ "id": "2f6ebdc6-8ff8-4947-ada4-05ff4b28e0f3",
917
+ "metadata": {},
918
+ "outputs": [],
919
+ "source": [
920
+ "# 进行预测\n",
921
+ "def predict_secondary_structure(sequence):\n",
922
+ " inputs = tokenizer(sequence, return_tensors=\"pt\", truncation=True, padding=True)\n",
923
+ " outputs = model(**inputs)\n",
924
+ " predictions = outputs.logits.argmax(dim=-1)\n",
925
+ " return predictions"
926
+ ]
927
+ },
928
+ {
929
+ "cell_type": "code",
930
+ "execution_count": 44,
931
+ "id": "841ebba8-7619-411f-a11e-841de3a3f064",
932
+ "metadata": {},
933
+ "outputs": [
934
+ {
935
+ "name": "stderr",
936
+ "output_type": "stream",
937
+ "text": [
938
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
939
+ ]
940
+ },
941
+ {
942
+ "name": "stdout",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "tensor([[0, 0, 0, 0, 2, 2, 2, 2]])\n"
946
+ ]
947
+ }
948
+ ],
949
+ "source": [
950
+ "# 示例预测\n",
951
+ "sequence = \"ACDEFGHIKLMNPQRSTVWY\"\n",
952
+ "predictions = predict_secondary_structure(sequence)\n",
953
+ "print(predictions)"
954
+ ]
955
+ },
956
  {
957
  "cell_type": "code",
958
  "execution_count": null,
959
+ "id": "37e7d22e-0545-422b-b8ba-7990ca127d8a",
960
  "metadata": {},
961
  "outputs": [],
962
  "source": []
03-gene-task/3-multi-seq-task.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
03-gene-task/4-fun-predict.ipynb CHANGED
@@ -1,9 +1,763 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": null,
6
- "id": "c8a5134f-ab2e-486d-928d-ea4ba2d1f126",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": []
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "ce0fa061-3f49-46c3-ba5c-8dcca7d283d3",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 3.4 功能预测任务"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "e0fe3429-dbff-4e83-827a-2e31da60dfc3",
14
+ "metadata": {
15
+ "jp-MarkdownHeadingCollapsed": true
16
+ },
17
+ "source": [
18
+ "## 数据准备\n",
19
+ "\n",
20
+ "获得蛋白质序列及其对应的功能描述是生物信息学中的一个重要任务,通常涉及从公共数据库中检索数据或通过实验方法确定。以下是几种常用的方法和资源,帮助你获取蛋白质序列和功能描述:\n",
21
+ "\n",
22
+ "### 1. **使用公共数据库**\n",
23
+ "\n",
24
+ "#### a. **UniProt (Universal Protein Resource)**\n",
25
+ "\n",
26
+ "- **网址**:[UniProt](https://www.uniprot.org/)\n",
27
+ "- **特点**:UniProt 是一个综合性的蛋白质数据库,提供了丰富的注释信息,包括蛋白质序列、结构、功能、亚细胞定位等。\n",
28
+ "- **使用方法**:\n",
29
+ " - 在搜索栏中输入蛋白质名称、基因名称或序列 ID。\n",
30
+ " - 浏览结果页面以查看详细的注释信息,包括功能描述、GO(Gene Ontology)术语、文献引用等。\n",
31
+ "\n",
32
+ "\n",
33
+ "<img src=\"img/function.png\" width=\"500px\" />\n",
34
+ "\n",
35
+ "<img src=\"img/sequence.png\" width=\"500px\" />\n",
36
+ "\n",
37
+ "#### b. **NCBI (National Center for Biotechnology Information)**\n",
38
+ "\n",
39
+ "- **网址**:[NCBI](https://www.ncbi.nlm.nih.gov/)\n",
40
+ "- **特点**:NCBI 提供多个相关数据库,如 GenBank、RefSeq 和 Protein 数据库,涵盖广泛的生物物种和蛋白质信息。\n",
41
+ "- **使用方法**:\n",
42
+ " - 使用 NCBI 的搜索工具 Entrez 或 BLAST 搜索蛋白质序列或功能描述。\n",
43
+ " - 访问特定的蛋白质条目以获取详细信息,包括序列、功能、参考文献等。\n",
44
+ "\n",
45
+ "#### c. **PDB (Protein Data Bank)**\n",
46
+ "\n",
47
+ "- **网址**:[PDB](https://www.rcsb.org/)\n",
48
+ "- **特点**:PDB 主要包含蛋白质的三维结构信息,但也提供相关的功能描述和文献引用。\n",
49
+ "- **使用方法**:\n",
50
+ " - 使用 PDB 的搜索功能查找特定蛋白质的结构信息。\n",
51
+ " - 查看每个条目的详细页面以获取功能描述和其他相关信息。\n",
52
+ "\n",
53
+ "#### d. **Ensembl**\n",
54
+ "\n",
55
+ "- **网址**:[Ensembl](https://www.ensembl.org/)\n",
56
+ "- **特点**:Ensembl 提供基因组浏览器和注释信息,涵盖了多种物种的基因和蛋白质数据。\n",
57
+ "- **使用方法**:\n",
58
+ " - 使用 Ensembl 的搜索功能查找特定蛋白质或基因。\n",
59
+ " - 浏览条目页面以获取详细的注释信息,包括功能描述、GO 术语等。\n",
60
+ "\n",
61
+ "### 2. **通过生物信息学工具**\n",
62
+ "\n",
63
+ "#### a. **BLAST (Basic Local Alignment Search Tool)**\n",
64
+ "\n",
65
+ "- **网址**:[BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)\n",
66
+ "- **特点**:BLAST 是一种常用的比对工具,可以帮助你根据已知的蛋白质序列找到相似的序列,并获取其功能描述。\n",
67
+ "- **使用方法**:\n",
68
+ " - 输入你的蛋白质序列。\n",
69
+ " - 选择适当的数据库(如 NR、Swiss-Prot 等)进行比对。\n",
70
+ " - 分析比对结果,查看相似序列的功能描述。\n",
71
+ "\n",
72
+ "#### b. **InterProScan**\n",
73
+ "\n",
74
+ "- **网址**:[InterPro](https://www.ebi.ac.uk/interpro/)\n",
75
+ "- **特点**:InterProScan 是一种用于识别蛋白质家族、结构域和重要位点的工具,可以提供详细的注释信息。\n",
76
+ "- **使用方法**:\n",
77
+ " - 输入你的蛋白质序列。\n",
78
+ " - 运行 InterProScan 分析,获取功能描述、结构域信息等。\n",
79
+ "\n",
80
+ "### 3. **通过文献和出版物**\n",
81
+ "\n",
82
+ "#### a. **PubMed**\n",
83
+ "\n",
84
+ "- **网址**:[PubMed](https://pubmed.ncbi.nlm.nih.gov/)\n",
85
+ "- **特点**:PubMed 是一个广泛使用的生物医学文献数据库,提供了大量关于蛋白质功能的研究论文。\n",
86
+ "- **使用方法**:\n",
87
+ " - 使用关键词搜索与特定蛋白质相关的研究论文。\n",
88
+ " - 阅读论文以获取详细的实验数据和功能描述。\n",
89
+ "\n",
90
+ "#### b. **Google Scholar**\n",
91
+ "\n",
92
+ "- **网址**:[Google Scholar](https://scholar.google.com/)\n",
93
+ "- **特点**:Google Scholar 是一个学术搜索引擎,涵盖广泛的科学文献。\n",
94
+ "- **使用方法**:\n",
95
+ " - 使用关键词搜索与特定蛋白质相关的研究论文。\n",
96
+ " - 阅读论文以获取详细的实验数据和功能描述。"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "markdown",
101
+ "id": "e794b698-ef26-4695-9b77-ff7314210e8b",
102
+ "metadata": {},
103
+ "source": [
104
+ "## 整理好的数据\n",
105
+ "\n",
106
+ "https://huggingface.co/datasets/PharMolix/MutaDescribe\n",
107
+ "\n",
108
+ "<img src=\"img/dataset.png\" width=\"500px\" />\n",
109
+ "\n",
110
+ "https://huggingface.co/datasets/jonghyunlee/UniProt_function_text_descriptions?row=2"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 1,
116
+ "id": "e6c59f74-877a-4a74-9017-e61eb713e285",
117
+ "metadata": {},
118
+ "outputs": [
119
+ {
120
+ "data": {
121
+ "text/plain": [
122
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
123
+ ]
124
+ },
125
+ "execution_count": 1,
126
+ "metadata": {},
127
+ "output_type": "execute_result"
128
+ }
129
+ ],
130
+ "source": [
131
+ "import subprocess\n",
132
+ "import os\n",
133
+ "# 设置环境变量, autodl一般区域\n",
134
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
135
+ "output = result.stdout\n",
136
+ "for line in output.splitlines():\n",
137
+ " if '=' in line:\n",
138
+ " var, value = line.split('=', 1)\n",
139
+ " os.environ[var] = value\n",
140
+ "\n",
141
+ "\"\"\"\n",
142
+ "import os\n",
143
+ "\n",
144
+ "# 设置环境变量, autodl专区 其他idc\n",
145
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
146
+ "\n",
147
+ "# 打印环境变量以确认设置成功\n",
148
+ "print(os.environ.get('HF_ENDPOINT'))\n",
149
+ "\"\"\""
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 2,
155
+ "id": "f038b5f3-b2a5-45bd-b66a-0475b1f2c026",
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "from transformers import AutoTokenizer, AutoModel\n",
160
+ "from tokenizers import Tokenizer\n",
161
+ "from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
162
+ "from transformers import AutoModelForSeq2SeqLM \n",
163
+ "from transformers import DataCollatorWithPadding"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": 3,
169
+ "id": "7c861666-010e-46d6-aaf0-c63e52920d99",
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "#set tokenizer,dna protein \n",
174
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
175
+ "tokenizer.pad_token = tokenizer.eos_token"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": 4,
181
+ "id": "32532c2d-962d-4aa0-a823-87d6c62a411f",
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "from datasets import load_dataset\n",
186
+ "# 1. load ~11k samples from promoters prediction dataset\n",
187
+ "dataset = load_dataset(\"jonghyunlee/UniProt_function_text_descriptions\")['train'].select(range(5000)).train_test_split(test_size=0.05)"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": 5,
193
+ "id": "3829dc86-1274-440b-aa2b-e100f112e9bf",
194
+ "metadata": {},
195
+ "outputs": [
196
+ {
197
+ "data": {
198
+ "text/plain": [
199
+ "DatasetDict({\n",
200
+ " train: Dataset({\n",
201
+ " features: ['entry', 'entry_name', 'protein_name', 'sequence', 'function'],\n",
202
+ " num_rows: 4750\n",
203
+ " })\n",
204
+ " test: Dataset({\n",
205
+ " features: ['entry', 'entry_name', 'protein_name', 'sequence', 'function'],\n",
206
+ " num_rows: 250\n",
207
+ " })\n",
208
+ "})"
209
+ ]
210
+ },
211
+ "execution_count": 5,
212
+ "metadata": {},
213
+ "output_type": "execute_result"
214
+ }
215
+ ],
216
+ "source": [
217
+ "dataset"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 6,
223
+ "id": "6cb4a847-8fd1-4cdd-a905-e0595250c712",
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "data": {
228
+ "text/plain": [
229
+ "{'entry': 'A1TFU9',\n",
230
+ " 'entry_name': 'HPXO_MYCVP',\n",
231
+ " 'protein_name': 'FAD-dependent urate hydroxylase (EC 1.14.13.113) (Flavoprotein urate hydroxylase)',\n",
232
+ " 'sequence': 'MKVVIVGAGMGGMSAAIALRQIGIDTVVYERVTENKPVGAAISVWSNGVKCLNYLGLQEETAELGGKVETMSYVDGHTGDTMCRFSMHPLIEQVGQRPYPIARAELQLMLMKAYGIDDINFGMKMVGVENDTAGSAAKATFADGTTVSADVIIGADGAGSITREYVLGGPVSRRYAGYVNYNGLVSTDDAIGPATEWTTYVGDGKRVSVMPVSDDRFYFFFDVVEPQGSPYEEGRVREVLRAHFAGWTPGVQTLIDTLDPLATNRVEILDLDPFHTWVKGRVAVLGDAAHNTTPDIGQGGCSAMEDAIALQWAFKDHPDDVHAALAAYQSARTERAADLVLRARKRCDVTHAKDPQVTSRWYDELRNEDGTNIIRGIVGNIVGGPLTPVTAATEG',\n",
233
+ " 'function': 'Catalyzes the hydroxylation of urate to 5-hydroxyisourate (HIU). Is likely to be involved in the urate degradation pathway to allantoin. Prefers NADH over NADPH as the electron donor. '}"
234
+ ]
235
+ },
236
+ "execution_count": 6,
237
+ "metadata": {},
238
+ "output_type": "execute_result"
239
+ }
240
+ ],
241
+ "source": [
242
+ "dataset[\"train\"][0]"
243
+ ]
244
+ },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 10,
248
+ "id": "2eb330e7-28ea-46f1-b9bb-093352e1c5d8",
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "name": "stdout",
253
+ "output_type": "stream",
254
+ "text": [
255
+ "dna datasets mean token lenght 269.515 min token length 24 max token length 4577\n"
256
+ ]
257
+ }
258
+ ],
259
+ "source": [
260
+ "token_len_list = []\n",
261
+ "for item in dataset[\"test\"].select(range(200)):\n",
262
+ " inputs = tokenizer.tokenize(item[\"sequence\"])\n",
263
+ " token_len_list.append( len(inputs) )\n",
264
+ "\n",
265
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
266
+ "min_len = min(token_len_list)\n",
267
+ "max_len = max(token_len_list)\n",
268
+ "\n",
269
+ "print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 11,
275
+ "id": "f904a4ad-6fe1-4588-b5fd-5541e26a9bfd",
276
+ "metadata": {},
277
+ "outputs": [
278
+ {
279
+ "name": "stdout",
280
+ "output_type": "stream",
281
+ "text": [
282
+ "dna datasets mean token lenght 271.02 min token length 23 max token length 1934\n"
283
+ ]
284
+ }
285
+ ],
286
+ "source": [
287
+ "token_len_list = []\n",
288
+ "for item in dataset[\"test\"].select(range(50)):\n",
289
+ " inputs = tokenizer.tokenize(item[\"function\"])\n",
290
+ " token_len_list.append( len(inputs) )\n",
291
+ "\n",
292
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
293
+ "min_len = min(token_len_list)\n",
294
+ "max_len = max(token_len_list)\n",
295
+ "\n",
296
+ "print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
297
+ ]
298
+ },
299
+ {
300
+ "cell_type": "code",
301
+ "execution_count": 6,
302
+ "id": "73ea434d-6262-4d47-a6e9-3ffb136d8cd0",
303
+ "metadata": {},
304
+ "outputs": [
305
+ {
306
+ "data": {
307
+ "application/vnd.jupyter.widget-view+json": {
308
+ "model_id": "32b5e6b926f24df282da36e7f14cc9ba",
309
+ "version_major": 2,
310
+ "version_minor": 0
311
+ },
312
+ "text/plain": [
313
+ "Map: 0%| | 0/4750 [00:00<?, ? examples/s]"
314
+ ]
315
+ },
316
+ "metadata": {},
317
+ "output_type": "display_data"
318
+ },
319
+ {
320
+ "data": {
321
+ "application/vnd.jupyter.widget-view+json": {
322
+ "model_id": "6082815e684f4469bee48ada6a049998",
323
+ "version_major": 2,
324
+ "version_minor": 0
325
+ },
326
+ "text/plain": [
327
+ "Map: 0%| | 0/250 [00:00<?, ? examples/s]"
328
+ ]
329
+ },
330
+ "metadata": {},
331
+ "output_type": "display_data"
332
+ }
333
+ ],
334
+ "source": [
335
+ "max_length = 128\n",
336
+ "\n",
337
+ "def preprocess_function(examples):\n",
338
+ " # 直接从 examples 中提取字段\n",
339
+ " inputs = examples[\"sequence\"] # 获取所有样本的 \"sequence\"\n",
340
+ " targets = examples[\"function\"] # 获取所有样本的 \"function\"\n",
341
+ "\n",
342
+ " # 对数据进行编码\n",
343
+ " model_inputs = tokenizer(\n",
344
+ " inputs, text_target=targets, max_length=max_length, truncation=True\n",
345
+ " )\n",
346
+ " return model_inputs\n",
347
+ "\n",
348
+ "\n",
349
+ "# 应用分词\n",
350
+ "tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset[\"train\"].column_names,)\n"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": 10,
356
+ "id": "36c17576-cf54-4bfd-8d98-fe41a4b7d2cc",
357
+ "metadata": {},
358
+ "outputs": [
359
+ {
360
+ "name": "stdout",
361
+ "output_type": "stream",
362
+ "text": [
363
+ "{'input_ids': [9206, 609, 532, 2065, 487, 1241, 50, 1785, 4070, 827, 28817, 3840, 1105, 8309, 2993, 3449, 47036, 22588, 5215, 636, 4189, 12265, 3721, 7075, 69183, 3040, 814, 1209, 1910, 217, 474, 13943, 15033, 535, 558, 51164, 12333, 56886, 1174, 338, 20934, 9865, 46, 1131, 3021, 336, 11005, 20318, 748, 396, 46, 38, 46, 54, 12036, 482, 4807, 284, 13333, 87969, 1482, 618, 371, 46, 49, 29703, 46, 5669, 55496, 40, 2682, 2186, 84535, 471, 12020, 280, 1751, 46, 545, 3968, 1660, 354, 1309, 84775, 328, 3802, 52, 46, 33718, 797, 46, 39, 487, 965, 16953, 790, 8503, 53823, 365, 39878, 41235, 17957, 25823, 785, 967, 1371, 543, 8660, 1510, 308, 46, 46, 35663, 3804, 4662, 15100, 8524, 2378, 254, 2399, 38462, 1700, 3223, 1296, 478, 1972, 809, 251], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [31598, 406, 418, 11784, 170, 2001, 170, 4596, 26168, 170, 36668, 69, 170, 3225, 283, 829, 170, 223, 362, 170, 5700, 83, 170, 450, 170, 65, 170, 14420, 270, 170, 2001, 170, 5429, 52715, 468, 370, 3550, 46, 33, 170, 37558, 77, 170, 39404, 73, 65, 170, 1389, 170, 5739, 548, 456, 170, 1389, 170, 15898, 456, 170, 666, 170, 2021, 672, 170, 2001, 170, 13336, 1471, 1882, 170, 59356, 307, 170, 8, 518, 46, 13, 73310, 562, 170, 1389, 170, 518, 46, 13, 1681, 65, 9, 14, 26398, 78, 26168, 170, 36668, 69, 170, 4655, 792, 170, 436, 170, 35585, 24270, 952, 170, 248, 170, 4655, 792, 170, 1978, 170, 21544, 13, 35, 80, 39, 170, 21033, 597, 13, 78252, 952, 3550, 46, 33, 170, 37558, 77]}\n",
364
+ "{'input_ids': [45, 504, 1187, 659, 46, 5874, 86301, 412, 51, 86301, 3970, 4570, 59926, 3476, 43, 1517, 46, 244, 46, 335, 9187, 1342, 36689, 14000, 542, 40307, 3757, 14421, 412, 3762, 256, 1588, 723, 12505, 36170, 4898, 846, 87891, 3670, 4020, 4651, 12182, 1121, 60975, 264, 1201, 404, 10714, 256, 1396, 4709, 22460, 2538, 254, 1173, 71302, 423, 11201, 1259, 4013, 87933, 23361, 3410, 46, 41032, 407, 1131, 23083, 2151, 333, 4143, 28020, 213, 52, 21197, 46, 58776, 46, 10528, 1424, 19812, 5974, 46, 54], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [50, 18991, 19522, 170, 1978, 170, 4067, 26168, 170, 1389, 170, 11307, 170, 666, 170, 2250, 85871, 170, 2001, 170, 666, 170, 11086, 655, 1836, 170, 1285, 79, 45056, 17704, 170, 298, 36851, 875, 170, 8371, 6081, 170, 1389, 170, 666, 170, 10434, 10228, 69, 170, 8, 70, 6326, 9, 170, 82935, 468, 69, 14, 26398, 1268, 456, 170, 2901, 170, 70249, 362, 6356, 170, 248, 170, 79448, 456, 170, 1389, 170, 3774, 456, 170, 79448, 69, 170, 11086, 655, 597, 170, 1338, 982, 14, 170]}\n"
365
+ ]
366
+ }
367
+ ],
368
+ "source": [
369
+ "# 查看处理后的数据,使用正确的拆分(例如 'train')\n",
370
+ "print(tokenized_datasets['train'][0]) # 查看 'train' 数据集中的第一个样本\n",
371
+ "print(tokenized_datasets[\"test\"][0])# 查看 'test' 数据集中的第一个样本\n"
372
+ ]
373
+ },
374
+ {
375
+ "cell_type": "code",
376
+ "execution_count": 16,
377
+ "id": "9aec8d0b-ef4f-43f3-b986-53edfc7a509f",
378
+ "metadata": {},
379
+ "outputs": [
380
+ {
381
+ "data": {
382
+ "application/vnd.jupyter.widget-view+json": {
383
+ "model_id": "086f8f974e99413da19fc5c18030f1c6",
384
+ "version_major": 2,
385
+ "version_minor": 0
386
+ },
387
+ "text/plain": [
388
+ "model.safetensors: 69%|######9 | 430M/620M [00:00<?, ?B/s]"
389
+ ]
390
+ },
391
+ "metadata": {},
392
+ "output_type": "display_data"
393
+ },
394
+ {
395
+ "data": {
396
+ "application/vnd.jupyter.widget-view+json": {
397
+ "model_id": "187fabd0fd8b49419f7fa1ecf3ff6216",
398
+ "version_major": 2,
399
+ "version_minor": 0
400
+ },
401
+ "text/plain": [
402
+ "generation_config.json: 0%| | 0.00/111 [00:00<?, ?B/s]"
403
+ ]
404
+ },
405
+ "metadata": {},
406
+ "output_type": "display_data"
407
+ },
408
+ {
409
+ "name": "stderr",
410
+ "output_type": "stream",
411
+ "text": [
412
+ "/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
413
+ " warnings.warn(\n"
414
+ ]
415
+ },
416
+ {
417
+ "name": "stdout",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "[2025-01-05 16:06:01,794] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
421
+ ]
422
+ },
423
+ {
424
+ "name": "stderr",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
428
+ "collect2: error: ld returned 1 exit status\n",
429
+ "/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
430
+ "/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
431
+ "/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
432
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
433
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
434
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
435
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
436
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
437
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
438
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
439
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
440
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
441
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
442
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
443
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
444
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
445
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
446
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
447
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
448
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
449
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
450
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
451
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
452
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
453
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
454
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
455
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
456
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
457
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
458
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
459
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
460
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
461
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
462
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
463
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
464
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
465
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
466
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
467
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
468
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
469
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
470
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
471
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
472
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
473
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
474
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
475
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
476
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
477
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
478
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
479
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
480
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
481
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
482
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
483
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
484
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
485
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
486
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
487
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
488
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
489
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
490
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
491
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
492
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
493
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
494
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
495
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
496
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
497
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
498
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
499
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
500
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
501
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
502
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
503
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
504
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
505
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
506
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
507
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
508
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
509
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
510
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
511
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
512
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
513
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
514
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
515
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
516
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
517
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
518
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
519
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
520
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
521
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
522
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
523
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
524
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
525
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
526
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
527
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
528
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
529
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
530
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
531
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
532
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
533
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
534
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
535
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
536
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
537
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
538
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
539
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
540
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
541
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
542
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
543
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
544
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
545
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
546
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
547
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
548
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
549
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
550
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
551
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
552
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
553
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
554
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
555
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
556
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
557
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
558
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
559
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
560
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
561
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
562
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
563
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
564
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
565
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
566
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
567
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
568
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
569
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
570
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
571
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
572
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
573
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
574
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
575
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
576
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
577
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
578
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
579
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
580
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
581
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
582
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
583
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
584
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
585
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
586
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
587
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
588
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
589
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
590
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
591
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
592
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
593
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
594
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
595
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
596
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
597
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
598
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
599
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
600
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
601
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
602
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
603
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
604
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
605
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
606
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
607
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
608
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
609
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
610
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
611
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
612
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
613
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
614
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
615
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
616
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
617
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
618
+ "/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
619
+ "collect2: error: ld returned 1 exit status\n"
620
+ ]
621
+ },
622
+ {
623
+ "data": {
624
+ "text/html": [
625
+ "\n",
626
+ " <div>\n",
627
+ " \n",
628
+ " <progress value='2970' max='2970' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
629
+ " [2970/2970 03:02, Epoch 5/5]\n",
630
+ " </div>\n",
631
+ " <table border=\"1\" class=\"dataframe\">\n",
632
+ " <thead>\n",
633
+ " <tr style=\"text-align: left;\">\n",
634
+ " <th>Epoch</th>\n",
635
+ " <th>Training Loss</th>\n",
636
+ " <th>Validation Loss</th>\n",
637
+ " </tr>\n",
638
+ " </thead>\n",
639
+ " <tbody>\n",
640
+ " <tr>\n",
641
+ " <td>1</td>\n",
642
+ " <td>5.123600</td>\n",
643
+ " <td>4.953184</td>\n",
644
+ " </tr>\n",
645
+ " <tr>\n",
646
+ " <td>2</td>\n",
647
+ " <td>4.958600</td>\n",
648
+ " <td>4.907465</td>\n",
649
+ " </tr>\n",
650
+ " <tr>\n",
651
+ " <td>3</td>\n",
652
+ " <td>4.893500</td>\n",
653
+ " <td>4.876421</td>\n",
654
+ " </tr>\n",
655
+ " <tr>\n",
656
+ " <td>4</td>\n",
657
+ " <td>4.801000</td>\n",
658
+ " <td>4.860046</td>\n",
659
+ " </tr>\n",
660
+ " <tr>\n",
661
+ " <td>5</td>\n",
662
+ " <td>4.709600</td>\n",
663
+ " <td>4.876781</td>\n",
664
+ " </tr>\n",
665
+ " </tbody>\n",
666
+ "</table><p>"
667
+ ],
668
+ "text/plain": [
669
+ "<IPython.core.display.HTML object>"
670
+ ]
671
+ },
672
+ "metadata": {},
673
+ "output_type": "display_data"
674
+ },
675
+ {
676
+ "data": {
677
+ "text/plain": [
678
+ "TrainOutput(global_step=2970, training_loss=4.859284495344066, metrics={'train_runtime': 183.1426, 'train_samples_per_second': 129.68, 'train_steps_per_second': 16.217, 'total_flos': 1551421440000000.0, 'train_loss': 4.859284495344066, 'epoch': 5.0})"
679
+ ]
680
+ },
681
+ "execution_count": 16,
682
+ "metadata": {},
683
+ "output_type": "execute_result"
684
+ }
685
+ ],
686
+ "source": [
687
+ "from transformers import Trainer, TrainingArguments, GPT2LMHeadModel\n",
688
+ "from transformers import DataCollatorForSeq2Seq\n",
689
+ "\n",
690
+ "\n",
691
+ "# 加载预训练的 GPT-2 模型\n",
692
+ "model = GPT2LMHeadModel.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
693
+ "\n",
694
+ "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
695
+ "\n",
696
+ "# 训练参数\n",
697
+ "training_args = TrainingArguments(\n",
698
+ " output_dir=\"gpt2_protein_function\", # 输出目录\n",
699
+ " evaluation_strategy=\"epoch\",\n",
700
+ " learning_rate=5e-5,\n",
701
+ " per_device_train_batch_size=8,\n",
702
+ " per_device_eval_batch_size=8,\n",
703
+ " num_train_epochs=5,\n",
704
+ " save_steps=500,\n",
705
+ " save_total_limit=2,\n",
706
+ " logging_steps=500,\n",
707
+ ")\n",
708
+ "\n",
709
+ "# 初始化 Trainer\n",
710
+ "trainer = Trainer(\n",
711
+ " model=model,\n",
712
+ " args=training_args,\n",
713
+ " train_dataset=tokenized_datasets['train'], # 传递 'train' 拆分数据\n",
714
+ " eval_dataset=tokenized_datasets['test'], # 传递 'test' 拆分数据\n",
715
+ " data_collator=data_collator,\n",
716
+ ")\n",
717
+ "\n",
718
+ "# 训练模型\n",
719
+ "trainer.train()\n"
720
+ ]
721
+ },
722
+ {
723
+ "cell_type": "code",
724
+ "execution_count": null,
725
+ "id": "0e9e5526-39ab-4a75-a09d-f4d14a691211",
726
+ "metadata": {},
727
+ "outputs": [],
728
+ "source": [
729
+ "# 保存训练好的模型和 tokenizer\n",
730
+ "model.save_pretrained(run_path)\n",
731
+ "tokenizer.save_pretrained(run_path)\n"
732
+ ]
733
+ },
734
+ {
735
+ "cell_type": "code",
736
+ "execution_count": 19,
737
+ "id": "18cba087-6620-4d08-ae3a-0ffbd37c7f69",
738
+ "metadata": {},
739
+ "outputs": [
740
+ {
741
+ "name": "stdout",
742
+ "output_type": "stream",
743
+ "text": [
744
+ "{'eval_loss': 4.8767805099487305, 'eval_runtime': 0.489, 'eval_samples_per_second': 511.252, 'eval_steps_per_second': 65.44, 'epoch': 5.0}\n",
745
+ "Perplexity: 131.21\n"
746
+ ]
747
+ }
748
+ ],
749
+ "source": [
750
+ "import math\n",
751
+ "# 评估模型在验证集上的表现\n",
752
+ "eval_results = trainer.evaluate()\n",
753
+ "print(eval_results)\n",
754
+ "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
755
+ ]
756
+ },
757
  {
758
  "cell_type": "code",
759
  "execution_count": null,
760
+ "id": "4e032392-f018-44de-b1f4-0143025a660c",
761
  "metadata": {},
762
  "outputs": [],
763
  "source": []
03-gene-task/5-regression-task.ipynb CHANGED
@@ -1,9 +1,563 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
  "execution_count": null,
6
- "id": "0dd6e0d3-287c-4798-bb9b-5734ff4abf93",
7
  "metadata": {},
8
  "outputs": [],
9
  "source": []
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c499a5c3-0244-41c4-9947-e166206204e2",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 3.5 回归类任务"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "4678171b-bbc8-49dd-ad04-48f5ef89b45e",
14
+ "metadata": {},
15
+ "source": [
16
+ "GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值。\n",
17
+ "\n",
18
+ "使用 GPT-2 进行回归问题的解决,可以将回归问题转化为自回归语言模型任务。GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值(如情感评分、价格预测等)。\n",
19
+ "\n",
20
+ "---\n",
21
+ "\n",
22
+ "### **1. 使用 GPT-2 做回归的核心思路**\n",
23
+ "\n",
24
+ "1. **调整输出层**:\n",
25
+ " - 默认情况下,GPT-2 的输出是一个词汇表大小的概率分布,用于预测下一个 token。\n",
26
+ " - 对于回归问题,可以将模型的最后一层替换为一个线性层,使得输出变为一个标量或多个连续值。\n",
27
+ " - gpt2的huggingface实现中,可以简单设置1个分类的分类header,实现回归预测。\n",
28
+ "\n",
29
+ "2. **损失函数**:\n",
30
+ " - 对于回归问题,使用均方误差(MSE)或均绝对误差(MAE)作为损失函数,而不是分类任务中常用的交叉熵。\n",
31
+ "\n",
32
+ "3. **输入格式**:\n",
33
+ " - 输入数据仍然是文本,可以通过特定的模板形式加入上下文信息。\n",
34
+ "\n",
35
+ "---\n",
36
+ "\n",
37
+ "### **2. GPT-2 回归任务的实现步骤**\n",
38
+ "\n",
39
+ "#### **(1)加载基础模型**\n",
40
+ "\n",
41
+ "从 Hugging Face Transformers 库加载 GPT-2 模型和分词器,并调整其配置以适应回归任务。\n",
42
+ "\n",
43
+ "```python\n",
44
+ "from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, AutoModelForSequenceClassification\n",
45
+ "\n",
46
+ "# 加载分词器\n",
47
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
48
+ "\n",
49
+ "# 调整模型配置,num_labels=1 表示回归任务\n",
50
+ "config = GPT2Config.from_pretrained(\"gpt2\", num_labels=1)\n",
51
+ "\n",
52
+ "# 加载模型,增加回归输出\n",
53
+ "model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", config=config)\n",
54
+ "```\n",
55
+ "\n",
56
+ "---\n",
57
+ "\n",
58
+ "### **3. 课程数据集**\n",
59
+ "\n",
60
+ "本例程使用了蛋白质稳定性分析的数据集,也就是一个蛋白质序列,对应一个float的数值,做回归预测分析。\n",
61
+ "\n",
62
+ "**蛋白质稳定性分析**是研究蛋白质在不同条件下保持其结构和功能的能力的过程。蛋白质稳定性是生物化学和生物技术领域的重要课题,影响着蛋白质的折叠、功能执行、以及在应用中的可用性(如工业酶、药物开发等)。\n"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 1,
68
+ "id": "1e8c0f86-af78-43e1-8db4-e2a2ea22f815",
69
+ "metadata": {},
70
+ "outputs": [
71
+ {
72
+ "data": {
73
+ "text/plain": [
74
+ "\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
75
+ ]
76
+ },
77
+ "execution_count": 1,
78
+ "metadata": {},
79
+ "output_type": "execute_result"
80
+ }
81
+ ],
82
+ "source": [
83
+ "import subprocess\n",
84
+ "import os\n",
85
+ "# 设置环境变量, autodl一般区域\n",
86
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
87
+ "output = result.stdout\n",
88
+ "for line in output.splitlines():\n",
89
+ " if '=' in line:\n",
90
+ " var, value = line.split('=', 1)\n",
91
+ " os.environ[var] = value\n",
92
+ "\n",
93
+ "\"\"\"\n",
94
+ "import os\n",
95
+ "\n",
96
+ "# 设置环境变量, autodl专区 其他idc\n",
97
+ "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
98
+ "\n",
99
+ "# 打印环境变量以确认设置成功\n",
100
+ "print(os.environ.get('HF_ENDPOINT'))\n",
101
+ "\"\"\""
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 2,
107
+ "id": "c51a8d69-9a36-47e7-8084-f64e6a72e4f7",
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "from transformers import AutoTokenizer, AutoModel\n",
112
+ "from tokenizers import Tokenizer\n",
113
+ "from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
114
+ "from transformers import AutoModelForSequenceClassification\n",
115
+ "from transformers import DataCollatorWithPadding"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 3,
121
+ "id": "a5aeb7c1-2d2a-4f57-ad8c-659613870e59",
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "#set tokenizer\n",
126
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
127
+ "tokenizer.pad_token = tokenizer.eos_token"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 4,
133
+ "id": "ad0c19cd-96a5-463e-8b7d-439646fef429",
134
+ "metadata": {},
135
+ "outputs": [
136
+ {
137
+ "name": "stderr",
138
+ "output_type": "stream",
139
+ "text": [
140
+ "Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['score.weight']\n",
141
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
142
+ ]
143
+ }
144
+ ],
145
+ "source": [
146
+ "#set model\n",
147
+ "model = AutoModelForSequenceClassification.from_pretrained('dnagpt/gene_eng_gpt2_v0', num_labels=1)\n",
148
+ "model.config.pad_token_id = model.config.eos_token_id"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": 5,
154
+ "id": "8c48cb0a-6142-4afc-823e-08fb33f74222",
155
+ "metadata": {},
156
+ "outputs": [
157
+ {
158
+ "data": {
159
+ "text/plain": [
160
+ "DatasetDict({\n",
161
+ " train: Dataset({\n",
162
+ " features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
163
+ " num_rows: 62079\n",
164
+ " })\n",
165
+ " test: Dataset({\n",
166
+ " features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
167
+ " num_rows: 6898\n",
168
+ " })\n",
169
+ "})"
170
+ ]
171
+ },
172
+ "execution_count": 5,
173
+ "metadata": {},
174
+ "output_type": "execute_result"
175
+ }
176
+ ],
177
+ "source": [
178
+ "from datasets import load_dataset\n",
179
+ "# 1. load ~11k samples from promoters prediction dataset\n",
180
+ "dataset = load_dataset(\"csv\", data_files=\"data/protein_stab.csv\")['train'].train_test_split(test_size=0.1)\n",
181
+ "dataset"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 6,
187
+ "id": "685dd025-f00a-4869-bc30-9843c77b6d8a",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/plain": [
193
+ "{'seq_id': 'train_prot_32672',\n",
194
+ " 'seq_type': 'prot',\n",
195
+ " 'seq': 'FYRLIIFKYPDYIDTYLRLAAIAKEKNNLQLSIEGNGSGGNGSGGNGSGN',\n",
196
+ " 'label': 0.7599999904632561}"
197
+ ]
198
+ },
199
+ "execution_count": 6,
200
+ "metadata": {},
201
+ "output_type": "execute_result"
202
+ }
203
+ ],
204
+ "source": [
205
+ "dataset[\"train\"][0]"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 7,
211
+ "id": "6e10dbbb-73ef-4b67-8290-77f8896298f5",
212
+ "metadata": {},
213
+ "outputs": [
214
+ {
215
+ "name": "stdout",
216
+ "output_type": "stream",
217
+ "text": [
218
+ "datasets mean token lenght 17.24006958538707 min token length 12 max token length 35\n"
219
+ ]
220
+ }
221
+ ],
222
+ "source": [
223
+ "token_len_list = []\n",
224
+ "for item in dataset[\"test\"]:\n",
225
+ " inputs = tokenizer.tokenize(item[\"seq\"])\n",
226
+ " token_len_list.append( len(inputs) )\n",
227
+ "\n",
228
+ "mean_len = sum(token_len_list)/len(token_len_list)\n",
229
+ "min_len = min(token_len_list)\n",
230
+ "max_len = max(token_len_list)\n",
231
+ "\n",
232
+ "print(\"datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 25,
238
+ "id": "ac58b5b4-bff0-404d-bcf5-2b93db2b37c0",
239
+ "metadata": {},
240
+ "outputs": [
241
+ {
242
+ "data": {
243
+ "application/vnd.jupyter.widget-view+json": {
244
+ "model_id": "419cce8c5ba249ac8c8773dd2d69992d",
245
+ "version_major": 2,
246
+ "version_minor": 0
247
+ },
248
+ "text/plain": [
249
+ "Map: 0%| | 0/62079 [00:00<?, ? examples/s]"
250
+ ]
251
+ },
252
+ "metadata": {},
253
+ "output_type": "display_data"
254
+ },
255
+ {
256
+ "name": "stderr",
257
+ "output_type": "stream",
258
+ "text": [
259
+ "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.\n",
260
+ "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
261
+ ]
262
+ },
263
+ {
264
+ "data": {
265
+ "application/vnd.jupyter.widget-view+json": {
266
+ "model_id": "0b9ea09fe3ea49b19f7d52aca7949acf",
267
+ "version_major": 2,
268
+ "version_minor": 0
269
+ },
270
+ "text/plain": [
271
+ "Map: 0%| | 0/6898 [00:00<?, ? examples/s]"
272
+ ]
273
+ },
274
+ "metadata": {},
275
+ "output_type": "display_data"
276
+ }
277
+ ],
278
+ "source": [
279
+ "# 2. tokenize\n",
280
+ "def tokenize_function(examples):\n",
281
+ " return tokenizer(examples['seq'], truncation=True, padding='max_length')\n",
282
+ "\n",
283
+ "# 3. 对数据集应用分词函数\n",
284
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
285
+ "\n",
286
+ "# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
287
+ "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": 26,
293
+ "id": "94f6d643-2cf7-4651-9a8d-1884b2bddd1c",
294
+ "metadata": {},
295
+ "outputs": [
296
+ {
297
+ "name": "stderr",
298
+ "output_type": "stream",
299
+ "text": [
300
+ "/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
301
+ " warnings.warn(\n",
302
+ "/tmp/ipykernel_1347/4285456223.py:23: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
303
+ " trainer = Trainer(\n"
304
+ ]
305
+ }
306
+ ],
307
+ "source": [
308
+ "from transformers import TrainingArguments, Trainer\n",
309
+ "import numpy as np\n",
310
+ "from sklearn.metrics import mean_squared_error\n",
311
+ "\n",
312
+ "\n",
313
+ "def compute_metrics(eval_pred):\n",
314
+ " predictions, labels = eval_pred\n",
315
+ " rmse = mean_squared_error(labels, predictions)\n",
316
+ " return {\"rmse\": rmse}\n",
317
+ "\n",
318
+ "# 设置训练参数\n",
319
+ "training_args = TrainingArguments(\n",
320
+ " output_dir='./results',\n",
321
+ " evaluation_strategy=\"epoch\",\n",
322
+ " learning_rate=2e-5,\n",
323
+ " per_device_train_batch_size=20,\n",
324
+ " per_device_eval_batch_size=20,\n",
325
+ " num_train_epochs=10,\n",
326
+ " weight_decay=0.01,\n",
327
+ ")\n",
328
+ "\n",
329
+ "# 使用Trainer API进行训练(假设已有train_dataset和eval_dataset)\n",
330
+ "trainer = Trainer(\n",
331
+ " model=model,\n",
332
+ " args=training_args,\n",
333
+ " train_dataset=tokenized_datasets[\"train\"],\n",
334
+ " eval_dataset=tokenized_datasets[\"test\"],\n",
335
+ " tokenizer=tokenizer,\n",
336
+ " data_collator=data_collator,\n",
337
+ " compute_metrics=compute_metrics,\n",
338
+ ")"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "id": "dfe12979-d977-4404-bf9e-18c1f91a3e39",
345
+ "metadata": {},
346
+ "outputs": [
347
+ {
348
+ "data": {
349
+ "text/html": [
350
+ "\n",
351
+ " <div>\n",
352
+ " \n",
353
+ " <progress value='30987' max='31040' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
354
+ " [30987/31040 1:00:56 < 00:06, 8.47 it/s, Epoch 9.98/10]\n",
355
+ " </div>\n",
356
+ " <table border=\"1\" class=\"dataframe\">\n",
357
+ " <thead>\n",
358
+ " <tr style=\"text-align: left;\">\n",
359
+ " <th>Epoch</th>\n",
360
+ " <th>Training Loss</th>\n",
361
+ " <th>Validation Loss</th>\n",
362
+ " <th>Rmse</th>\n",
363
+ " </tr>\n",
364
+ " </thead>\n",
365
+ " <tbody>\n",
366
+ " <tr>\n",
367
+ " <td>1</td>\n",
368
+ " <td>0.044600</td>\n",
369
+ " <td>0.163462</td>\n",
370
+ " <td>0.163462</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <td>2</td>\n",
374
+ " <td>0.041900</td>\n",
375
+ " <td>0.157900</td>\n",
376
+ " <td>0.157900</td>\n",
377
+ " </tr>\n",
378
+ " <tr>\n",
379
+ " <td>3</td>\n",
380
+ " <td>0.037700</td>\n",
381
+ " <td>0.159724</td>\n",
382
+ " <td>0.159724</td>\n",
383
+ " </tr>\n",
384
+ " <tr>\n",
385
+ " <td>4</td>\n",
386
+ " <td>0.031700</td>\n",
387
+ " <td>0.157686</td>\n",
388
+ " <td>0.157686</td>\n",
389
+ " </tr>\n",
390
+ " <tr>\n",
391
+ " <td>5</td>\n",
392
+ " <td>0.028800</td>\n",
393
+ " <td>0.157124</td>\n",
394
+ " <td>0.157124</td>\n",
395
+ " </tr>\n",
396
+ " <tr>\n",
397
+ " <td>6</td>\n",
398
+ " <td>0.025400</td>\n",
399
+ " <td>0.150852</td>\n",
400
+ " <td>0.150852</td>\n",
401
+ " </tr>\n",
402
+ " <tr>\n",
403
+ " <td>7</td>\n",
404
+ " <td>0.022300</td>\n",
405
+ " <td>0.159293</td>\n",
406
+ " <td>0.159293</td>\n",
407
+ " </tr>\n",
408
+ " <tr>\n",
409
+ " <td>8</td>\n",
410
+ " <td>0.019600</td>\n",
411
+ " <td>0.154608</td>\n",
412
+ " <td>0.154608</td>\n",
413
+ " </tr>\n",
414
+ " <tr>\n",
415
+ " <td>9</td>\n",
416
+ " <td>0.017300</td>\n",
417
+ " <td>0.156104</td>\n",
418
+ " <td>0.156104</td>\n",
419
+ " </tr>\n",
420
+ " </tbody>\n",
421
+ "</table><p>"
422
+ ],
423
+ "text/plain": [
424
+ "<IPython.core.display.HTML object>"
425
+ ]
426
+ },
427
+ "metadata": {},
428
+ "output_type": "display_data"
429
+ },
430
+ {
431
+ "name": "stderr",
432
+ "output_type": "stream",
433
+ "text": [
434
+ "IOPub message rate exceeded.\n",
435
+ "The Jupyter server will temporarily stop sending output\n",
436
+ "to the client in order to avoid crashing it.\n",
437
+ "To change this limit, set the config variable\n",
438
+ "`--ServerApp.iopub_msg_rate_limit`.\n",
439
+ "\n",
440
+ "Current values:\n",
441
+ "ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
442
+ "ServerApp.rate_limit_window=3.0 (secs)\n",
443
+ "\n"
444
+ ]
445
+ }
446
+ ],
447
+ "source": [
448
+ "# 开始训练\n",
449
+ "trainer.train()"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": null,
455
+ "id": "060c4618-40d0-4934-bab8-36aab3a46de5",
456
+ "metadata": {},
457
+ "outputs": [],
458
+ "source": [
459
+ "#模型测试\n",
460
+ "predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
461
+ "predictions"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": 18,
467
+ "id": "1f8ef885-5bc9-4668-905b-6b2235209654",
468
+ "metadata": {},
469
+ "outputs": [
470
+ {
471
+ "data": {
472
+ "text/html": [
473
+ "\n",
474
+ " <div>\n",
475
+ " \n",
476
+ " <progress value='345' max='345' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
477
+ " [345/345 00:09]\n",
478
+ " </div>\n",
479
+ " "
480
+ ],
481
+ "text/plain": [
482
+ "<IPython.core.display.HTML object>"
483
+ ]
484
+ },
485
+ "metadata": {},
486
+ "output_type": "display_data"
487
+ },
488
+ {
489
+ "data": {
490
+ "text/plain": [
491
+ "{'eval_loss': 0.15949687361717224,\n",
492
+ " 'eval_rmse': 0.15949687361717224,\n",
493
+ " 'eval_runtime': 9.1483,\n",
494
+ " 'eval_samples_per_second': 754.017,\n",
495
+ " 'eval_steps_per_second': 37.712,\n",
496
+ " 'epoch': 10.0}"
497
+ ]
498
+ },
499
+ "execution_count": 18,
500
+ "metadata": {},
501
+ "output_type": "execute_result"
502
+ }
503
+ ],
504
+ "source": [
505
+ "trainer.evaluate()"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": 23,
511
+ "id": "afabdbe9-9b96-4f9e-bef2-1d819431f8d1",
512
+ "metadata": {},
513
+ "outputs": [
514
+ {
515
+ "name": "stdout",
516
+ "output_type": "stream",
517
+ "text": [
518
+ "[[ 1.7208484 ]\n",
519
+ " [ 0.00225139]\n",
520
+ " [ 0.3325616 ]\n",
521
+ " [-0.34372616]\n",
522
+ " [-0.45505935]\n",
523
+ " [-0.06892765]\n",
524
+ " [ 0.15099108]\n",
525
+ " [ 0.12211376]\n",
526
+ " [ 0.3947332 ]\n",
527
+ " [ 0.23186803]]\n"
528
+ ]
529
+ }
530
+ ],
531
+ "source": [
532
+ "predictions.predictions[0:10].squeeze()"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": 24,
538
+ "id": "fa9d17fd-eece-4c1e-99e0-3d19d36f7584",
539
+ "metadata": {},
540
+ "outputs": [
541
+ {
542
+ "data": {
543
+ "text/plain": [
544
+ "array([ 1.69, 0.84, 0.58, -0.15, 0.23, 0.03, 0.15, 0.2 , 0.51,\n",
545
+ " 1.1 ], dtype=float32)"
546
+ ]
547
+ },
548
+ "execution_count": 24,
549
+ "metadata": {},
550
+ "output_type": "execute_result"
551
+ }
552
+ ],
553
+ "source": [
554
+ "predictions.label_ids[0:10]"
555
+ ]
556
+ },
557
  {
558
  "cell_type": "code",
559
  "execution_count": null,
560
+ "id": "52252015-e068-414b-bd8a-79a5d1a2beec",
561
  "metadata": {},
562
  "outputs": [],
563
  "source": []
03-gene-task/data/.ipynb_checkpoints/protein_stab-checkpoint.csv ADDED
The diff for this file is too large to render. See raw diff
 
03-gene-task/data/dna_protein_full.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d096f1fcfa22524d216190ef7a65fc3025755c3dc82e167ff4f059562b85f046
3
+ size 26089081
03-gene-task/data/protein_stab.csv ADDED
The diff for this file is too large to render. See raw diff
 
03-gene-task/img/.ipynb_checkpoints/dataset-checkpoint.png ADDED
03-gene-task/img/2_structure.png ADDED
03-gene-task/img/dataset.png ADDED
03-gene-task/img/ds_structure.png ADDED
03-gene-task/img/function.png ADDED
03-gene-task/img/gpt2-ft.png ADDED
03-gene-task/img/pdb1.png ADDED
03-gene-task/img/protein-structure-1-2.png ADDED

Git LFS Details

  • SHA256: 731d20ac62df8f9b63bbb718572388ba684ad34fbe790d7cf2ddbcc3e0a5c53c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
03-gene-task/img/protein-structure-1.png ADDED
03-gene-task/img/protein-structure-2.png ADDED
03-gene-task/img/sequence.png ADDED
04-gene-sft/.ipynb_checkpoints/1-finetue-intro-checkpoint.ipynb ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "32216f81-0979-4afd-8c8c-16729cd0dab6",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 4.1 模型微调VS指令微调"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "7cd7f9b2-c0a3-48c2-848e-a1e9c7188f03",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 一个典型的知乎问题\n",
17
+ "\n",
18
+ "### **问题**\n",
19
+ "\n",
20
+ "用LLM实现文本二分类,微调base模型还是微调chat模型比较好?[问题](https://www.zhihu.com/question/632473480/answer/38930949853)\n",
21
+ "\n",
22
+ "我想用开源LLM(例如chatglm,baichuan)实现文本二分类(比如正负情感分类),有一组训练数据可以用于微调模型,提升分类性能,这时候应该选择base模型还是chat模型?\n",
23
+ "\n",
24
+ "\n",
25
+ "### **回答**\n",
26
+ "1 如果是使用2分类的header,base模型好一些。\n",
27
+ "\n",
28
+ "也就是使用如下类似的的设置。\n",
29
+ "\n",
30
+ "model = AutoModelForSequenceClassification.from_pretrained(\n",
31
+ "\"yuanzhoulvpi/gpt2_chinese\", num_labels=2\n",
32
+ ")\n",
33
+ "\n",
34
+ "\n",
35
+ "\n",
36
+ "2 如果是把分类问题,改成指令微调的模式,就是像\n",
37
+ "\n",
38
+ "```\n",
39
+ "{\n",
40
+ "\n",
41
+ "\"instruction\": \"你现在在做一项情感分类的任务,如果是积极情感,则回答积极。消极情感则回答消极。\"\n",
42
+ "\"input\":他家的奶茶超级好喝。。。\n",
43
+ "\"output\":“积极”\n",
44
+ "\n",
45
+ "}\n",
46
+ "```\n",
47
+ "\n",
48
+ "然后进行指令微调,lora/peft调整部分参数就行,一般是chat模型比较好。\n",
49
+ "\n",
50
+ "\n",
51
+ "\n",
52
+ "这种二分类问题,用llm就是大材小用了,一般就是选个小的的模型,用AutoModelForSequenceClassification效果最好,如果追求SOTA,有些研究表明搞成指令微调模式效果可能更好。"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "id": "2cfcc1e9-ddda-4a1c-871b-0508fd421ed5",
58
+ "metadata": {},
59
+ "source": [
60
+ "## 大模型微调(Fine-tuning)和指令微调(Instruction Tuning)\n",
61
+ "\n",
62
+ "普通的大模型微调(Fine-tuning)和指令微调(Instruction Tuning)是两种不同的训练方法,它们适用于不同的应用场景,并且在实现细节上也有所区别。\n",
63
+ "\n",
64
+ "\n",
65
+ "#### 1. **定义**\n",
66
+ "\n",
67
+ "普通微调是指在一个预训练好的大模型基础上,针对特定任务添加一个或多个新层(通常称为头部或 header),然后使用特定任务的数据集对整个模型(包括新添加的层)进行再训练的过程。对于分类任务,常见的做法是在 GPT-2 的顶部添加一个分类头。\n",
68
+ "\n",
69
+ "#### 2. **具体步骤**\n",
70
+ "\n",
71
+ "- **添加分类头**:为 GPT-2 添加一个分类头,该头通常包含线性层(全连接层)以及可能的激活函数和归一化层。\n",
72
+ " \n",
73
+ "- **准备数据**:准备好用于微调的任务特定数据集,如文本分类、情感分析等。\n",
74
+ " \n",
75
+ "- **微调过程**:\n",
76
+ " - 使用任务特定的数据集对整个模型(包括预训练权重和新添加的分类头)进行再训练。\n",
77
+ " - 通常会调整学习率、批量大小等超参数以优化性能。\n",
78
+ " - 可能只对新添加的层进行训练,或者对整个模型进行微调(取决于资源和需求)。\n",
79
+ "\n",
80
+ "#### 3. **适用场景**\n",
81
+ "\n",
82
+ "- **任务明确**:当有清晰的任务目标时,例如文本分类、命名实体识别等。\n",
83
+ "- **标签数据可用**:拥有足够的标注数据来进行监督学习。\n",
84
+ "\n",
85
+ "#### 4. **优点**\n",
86
+ "\n",
87
+ "- **针对性强**:能够有效地提升模型在特定任务上的表现。\n",
88
+ "- **资源利用效率高**:相比于从头开始训练,微调需要的计算资源和时间较少。\n",
89
+ "\n",
90
+ "#### 5. **缺点**\n",
91
+ "\n",
92
+ "- **泛化能力有限**:微调后的模型可能在未见过的任务或领域中表现不佳。\n",
93
+ "\n",
94
+ "### 指令微调(Instruction Tuning)\n",
95
+ "\n",
96
+ "#### 1. **定义**\n",
97
+ "\n",
98
+ "指令微调是一种更为通用的微调方法,它旨在让模型理解和遵循自然语言指令,而不是直接针对某个特定任务进行优化。这种方法通过提供一系列指令-输出对来训练模型,使其学会根据指令生成适当的响应。\n",
99
+ "\n",
100
+ "#### 2. **具体步骤**\n",
101
+ "\n",
102
+ "- **构造指令数据集**:创建一个包含各种指令及其预期输出的数据集。这些指令可以覆盖多种任务类型,如问答、翻译、摘要生成等。\n",
103
+ " \n",
104
+ "- **微调过程**:\n",
105
+ " - 使用指令数据集对模型进行训练,使模型能够理解并执行不同类型的指令。\n",
106
+ " - 强调模型对自然语言指令的理解和执行,而非特定于某一任务的优化。\n",
107
+ "\n",
108
+ "#### 3. **适用场景**\n",
109
+ "\n",
110
+ "- **多任务适应**:当希望模型能够在多种不同类型的任务中表现出色时。\n",
111
+ "- **少样本学习**:在仅有少量示例的情况下,仍然可以让模型快速适应新任务。\n",
112
+ "\n",
113
+ "#### 4. **优点**\n",
114
+ "\n",
115
+ "- **灵活性高**:模型可以在没有额外训练的情况下处理新的任务。\n",
116
+ "- **跨领域泛化能力强**:更有可能在未曾见过的任务或领域中保持良好的性能。\n",
117
+ "\n",
118
+ "#### 5. **缺点**\n",
119
+ "\n",
120
+ "- **复杂度增加**:指令微调通常涉及更多的训练数据和更复杂的训练过程。\n",
121
+ "- **评估难度较大**:由于任务的多样性,评估模型性能变得更加困难。\n",
122
+ "\n",
123
+ "\n",
124
+ "### 小结\n",
125
+ "\n",
126
+ "普通微调侧重于提高模型在特定任务上的性能,而指令微调则更加注重模型对自然语言指令的理解和执行能力。选择哪种方法取决于你的具体需求和应用场景。如果你有一个明确的任务并且有大量的标注数据,那么普通微调可能是更好的选择;如果你希望模型具有更高的灵活性和跨任务适应能力,则可以考虑指令微调。"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "markdown",
131
+ "id": "6203be53-18a5-447d-9071-32e031934b9c",
132
+ "metadata": {},
133
+ "source": [
134
+ "## 从GPT到chatGPT\n",
135
+ "\n",
136
+ "关键点在于指令微调(Instruction Tuning)\n",
137
+ "* 将所有任务统一为指令形式\n",
138
+ "* 多任务精调\n",
139
+ "* 与人类对齐(多样性)\n",
140
+ "* 进一步分为有监督指令微调和带有人类反馈的强化学习(RLHF)\n",
141
+ "\n",
142
+ "告别微调\n",
143
+ "\n",
144
+ "因为GPT-3使用了天量级的数据来进行预训练,所以学到的知识也更多更通用,以致于GPT-3打出的口号就是“告别微调的GPT-3”。\n",
145
+ "\n",
146
+ "相比于BERT这种预训练+微调的两阶段模型,GPT-3的目标是模型更加通用,从而解决BERT这种下游任务微调需要依赖领域标注数据的情况。\n",
147
+ "\n",
148
+ "拿我们实际业务举例,我主要做分本分类任务。对于使用BERT来完成文本分类任务来说,首先我需要使用海量的无标注文本数据进行预训练学习语言学知识。\n",
149
+ "\n",
150
+ "幸运的是这种预训练过程一般是一次性的,训练完成后可以把模型保存下来继续使用。很多大厂比如谷歌、Facebook等把得到的预训练模型开源了出来,所以咱们只需要导入预训练好的模型权重就可以直接使用了,相当于完成了模型的预训练过程;第二阶段就是微调了,对于文本分类等下游任务来说, 我们需要一批带标签的训练语料来微调模型。不同的下游任务会需要特定的训练语料。这时候面临的一个最大的问题是训练语料是需要人工标注的,而标注的成本是非常高的。除此之外不同的标注人员因为经验阅历等不同导致对同一条文本的理解也不同,所以容易出现标注不一致的问题。当标注数据量较少时还容易出现模型过拟合。归根结底就是微调是需要标注数据的,而获取标注数据的成本是很高的。\n",
151
+ "\n",
152
+ "为了解决这个问题,GPT-3可以让NLPer不用标注训练语料就能很好的完成下游任务,让GPT-3更通用更便利。GPT-3不需要进行微调的结构图如下所示:\n",
153
+ "\n",
154
+ "<img src='img/sft.png' width='600px' />"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "markdown",
159
+ "id": "28e037df-734b-4fe7-ac07-311f1b3a7d7b",
160
+ "metadata": {},
161
+ "source": [
162
+ "## 指令微调数据构建\n",
163
+ "\n",
164
+ "<img src='img/sft2.png' width='800px' />\n",
165
+ "\n",
166
+ "\n",
167
+ "\n",
168
+ "根据典型的分类语料数据,构建指令微调数据\n",
169
+ "\n",
170
+ "目前如llama等都使用Alpaca格式\n",
171
+ "\n",
172
+ "指令数据当做一般的文本,进行无监督的训练,和预训练流程一致"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "id": "64312191-423f-4a18-aa0c-036374e93fb2",
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "import subprocess\n",
183
+ "import os\n",
184
+ "# 设置环境变量, autodl一般区域\n",
185
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
186
+ "output = result.stdout\n",
187
+ "for line in output.splitlines():\n",
188
+ " if '=' in line:\n",
189
+ " var, value = line.split('=', 1)\n",
190
+ " os.environ[var] = value"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "id": "32c16282-f9f1-4545-b522-daf2b39b4ead",
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "#原始模型\n",
201
+ "from transformers import AutoModel\n",
202
+ "model = AutoModel.from_pretrained(\"gpt2\")\n",
203
+ "model"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "id": "1149163f-4d89-472e-8d45-ebcbb5f9575e",
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "#分类微调模型\n",
214
+ "from transformers import AutoModelForSequenceClassification\n",
215
+ "ft_model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", num_labels=2)\n",
216
+ "ft_model"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": 1,
222
+ "id": "09735059-507c-48c4-893f-ca0da21ce5e8",
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "#指令微调模型\n",
227
+ "from transformers import AutoModelForCausalLM\n",
228
+ "sft_model = AutoModelForMaskedLM.from_pretrained(\"gpt2\")\n",
229
+ "sft_model"
230
+ ]
231
+ }
232
+ ],
233
+ "metadata": {
234
+ "kernelspec": {
235
+ "display_name": "Python 3 (ipykernel)",
236
+ "language": "python",
237
+ "name": "python3"
238
+ },
239
+ "language_info": {
240
+ "codemirror_mode": {
241
+ "name": "ipython",
242
+ "version": 3
243
+ },
244
+ "file_extension": ".py",
245
+ "mimetype": "text/x-python",
246
+ "name": "python",
247
+ "nbconvert_exporter": "python",
248
+ "pygments_lexer": "ipython3",
249
+ "version": "3.12.3"
250
+ }
251
+ },
252
+ "nbformat": 4,
253
+ "nbformat_minor": 5
254
+ }
04-gene-sft/.ipynb_checkpoints/2-gpt2-instruction-ft-checkpoint.ipynb ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c2e6b786-d18e-4a0d-aa59-0792dcb49c5f",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 4.2 基于GPT2的指令微调"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "dc04d5e3-7623-4d59-9f3b-ad03e339db11",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "from datasets import load_dataset\n",
19
+ "# 1. load ~11k samples from promoters prediction dataset\n",
20
+ "dataset = load_dataset(\"dnagpt/dna_promoter_300\")\n",
21
+ "dataset"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "id": "93d09d8d-f521-49f7-b0e0-7ac089dfbf49",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "def build_prompt(example):\n",
32
+ " if int(example['label']) == 1:\n",
33
+ " label = 'promoter'\n",
34
+ " else:\n",
35
+ " label = 'Non-promoter'\n",
36
+ "\n",
37
+ " instruction = \"Determine core promoter detection of following dna sequence, The result will be one of the following: Non-promoter, promoter.\"\n",
38
+ " \n",
39
+ " input = example[\"sequence\"]\n",
40
+ " input_text = f\"\\n\\n### Input:\\n{input}\"\n",
41
+ "\n",
42
+ "\n",
43
+ " output = label\n",
44
+ "\n",
45
+ " prompt = {\"instruction\":instruction, \n",
46
+ " \"input\":input,\n",
47
+ " \"output\":output\n",
48
+ " }\n",
49
+ "\n",
50
+ " return prompt"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "9f9c0e5a-6591-47ac-b358-d746a00dfc0a",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "example = dna_dataset[\"train\"][0]\n",
61
+ "print(build_prompt(example))"
62
+ ]
63
+ },
64
+ {
65
+ "cell_type": "code",
66
+ "execution_count": null,
67
+ "id": "83070a23-1604-4d28-b371-e01060331ed5",
68
+ "metadata": {},
69
+ "outputs": [],
70
+ "source": [
71
+ "import json\n",
72
+ "ins_file = open(\"data/dna_promoter_300.jsonl\", \"w\")\n",
73
+ "ins_list = []\n",
74
+ "for ins in dna_dataset[\"train\"]:\n",
75
+ " if ins[\"sequence\"]==\"sequence\":\n",
76
+ " continue\n",
77
+ " ins = build_prompt(ins)\n",
78
+ " ins_file.write(json.dumps(ins)+\"\\n\")\n",
79
+ " ins_list.append(ins)\n",
80
+ "ins_file.close()"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "id": "89fb8ed3-aa58-462f-b2a6-ce445c597a33",
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "dna_ft_dataset = load_dataset(\"json\", data_files='data/dna_promoter_300.jsonl')\n",
91
+ "dna_ft_dataset"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "e4f7b75f-6ccb-4fda-8004-40df7d52678f",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "data = dna_ft_dataset[\"train\"].train_test_split(train_size=0.9, seed=42)\n",
102
+ "data"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "id": "36d9ee0e-8423-4529-aa7e-fda2728fab2f",
109
+ "metadata": {},
110
+ "outputs": [],
111
+ "source": [
112
+ "# 初始化tokenizer\n",
113
+ "from datasets import load_dataset\n",
114
+ "from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig\n",
115
+ "from transformers import GPT2Tokenizer,GPT2Model,AutoModel\n",
116
+ "from transformers import DataCollatorForLanguageModeling\n",
117
+ "from transformers import Trainer, TrainingArguments\n",
118
+ "from tokenizers import Tokenizer\n",
119
+ "from transformers import GPT2TokenizerFast\n",
120
+ "\n",
121
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
122
+ "tokenizer.pad_token = tokenizer.eos_token"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "code",
127
+ "execution_count": null,
128
+ "id": "871baee0-f06f-4422-a741-af533f7d92e1",
129
+ "metadata": {},
130
+ "outputs": [],
131
+ "source": [
132
+ "#构建提示词\n",
133
+ "def format_input(entry):\n",
134
+ " instruction_text = (\n",
135
+ " f\"Below is an instruction that describes a task. \"\n",
136
+ " f\"Write a response that appropriately completes the request.\"\n",
137
+ " f\"\\n\\n### Instruction:\\n{entry['instruction']}\"\n",
138
+ " )\n",
139
+ "\n",
140
+ " input_text = f\"\\n\\n### Input:\\n{entry['input']}\" if entry[\"input\"] else \"\"\n",
141
+ "\n",
142
+ " return instruction_text + input_text + \"\\n\\n### Response:\\n\"\n",
143
+ "\n",
144
+ "#构建提示词\n",
145
+ "def build_prompt(entry):\n",
146
+ "\n",
147
+ " input_data = format_input(entry)\n",
148
+ "\n",
149
+ " desired_response = entry['output']\n",
150
+ "\n",
151
+ " return input_data + desired_response"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": null,
157
+ "id": "bca1c275-cc3d-43df-923e-e6604d584226",
158
+ "metadata": {},
159
+ "outputs": [],
160
+ "source": [
161
+ "example = data[\"test\"][0]\n",
162
+ "example"
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": null,
168
+ "id": "76f2e027-0a31-4919-bb7e-404c786e1599",
169
+ "metadata": {},
170
+ "outputs": [],
171
+ "source": [
172
+ "prompt = build_prompt(example)\n",
173
+ "print(prompt)"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "code",
178
+ "execution_count": null,
179
+ "id": "932b54ca-7e27-47cd-b67d-7ef8386b6608",
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "print('tokens: ', ' '.join(tokenizer.tokenize(prompt)))"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": null,
189
+ "id": "26671faf-68d0-4a44-978e-e1a24e86c9b1",
190
+ "metadata": {},
191
+ "outputs": [],
192
+ "source": [
193
+ "def tokenize_function(example):\n",
194
+ " prompt = build_prompt(example)\n",
195
+ " result = tokenizer(prompt, padding='max_length', truncation=True, max_length=1024) # max_length=1024\n",
196
+ " return result\n",
197
+ "\n",
198
+ "\n",
199
+ "# Use batched=False for easy\n",
200
+ "tokenized_datasets = data.map(\n",
201
+ " tokenize_function, batched=False,remove_columns=['instruction', 'input', 'output']\n",
202
+ ")\n",
203
+ "tokenized_datasets"
204
+ ]
205
+ },
206
+ {
207
+ "cell_type": "code",
208
+ "execution_count": null,
209
+ "id": "3d46c8b1-9fb3-431a-87ea-c278468543e7",
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "tokenized_datasets[\"train\"]"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "26985c81-4335-4ac0-9a5a-84a5b4f2d0e4",
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "# 创建DataCollator\n",
224
+ "data_collator = DataCollatorForLanguageModeling(\n",
225
+ " tokenizer=tokenizer,\n",
226
+ " mlm=False, # 因为GPT2是自回归模型,不需要MLM\n",
227
+ ")"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": null,
233
+ "id": "e18d3095-d6dd-423b-84fb-dca4a629d450",
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "model = GPT2LMHeadModel.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": null,
243
+ "id": "12134cf2-676a-4176-a733-35caab2fd520",
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=1000):\n",
248
+ " # Tokenize\n",
249
+ " input_ids = tokenizer.encode(\n",
250
+ " text,\n",
251
+ " return_tensors=\"pt\",\n",
252
+ " truncation=True,\n",
253
+ " max_length=max_input_tokens\n",
254
+ " # return_attention_mask=True,\n",
255
+ " )\n",
256
+ "\n",
257
+ " # Generate\n",
258
+ " device = model.device\n",
259
+ " generated_tokens_with_prompt = model.generate(\n",
260
+ " input_ids=input_ids.to(device),\n",
261
+ " #max_length=max_output_tokens,\n",
262
+ " max_new_tokens=5,\n",
263
+ " )\n",
264
+ "\n",
265
+ " # Decode\n",
266
+ " #generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)\n",
267
+ " # Strip the prompt\n",
268
+ " #generated_text_answer = generated_text_with_prompt[0][len(text):]\n",
269
+ " \n",
270
+ " generated_text_with_prompt = tokenizer.decode(generated_tokens_with_prompt[0], skip_special_tokens=True)\n",
271
+ " generated_text_answer = generated_text_with_prompt[len(text):]\n",
272
+ "\n",
273
+ "\n",
274
+ " return generated_text_answer\n",
275
+ "\n",
276
+ "# 如果需要进一步清理\n",
277
+ "def clean_generated_text(text):\n",
278
+ " # 去除 'Ġ' 符号并替换为空格\n",
279
+ " text = text.replace('Ġ', ' ')\n",
280
+ " # 去除多余的空格\n",
281
+ " text = ' '.join(text.split())\n",
282
+ " return text"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "id": "b9a2e2a9-a1ff-44b0-a550-623a16d0d7a2",
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "input_text = format_input(data[\"test\"][0])\n",
293
+ "\n",
294
+ "print(\"input (test):\", input_text)\n",
295
+ "\n",
296
+ "print(\"--------------------------\\n\")\n",
297
+ "\n",
298
+ "print(\"model's answer: \\n\")\n",
299
+ "print(inference(input_text, model, tokenizer))"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": null,
305
+ "id": "63b54fe2-f077-4ca8-974e-1bcc41ce57d6",
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": [
309
+ "training_args = TrainingArguments(\n",
310
+ " output_dir='./results_small',\n",
311
+ " overwrite_output_dir=True,\n",
312
+ " num_train_epochs=3,\n",
313
+ " per_device_train_batch_size=8,\n",
314
+ " save_steps=2000,\n",
315
+ " save_total_limit=2,\n",
316
+ " prediction_loss_only=True,\n",
317
+ " fp16=True, #v100没法用\n",
318
+ " )"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": null,
324
+ "id": "61df123d-e67d-4548-998a-de1e2781e774",
325
+ "metadata": {},
326
+ "outputs": [],
327
+ "source": [
328
+ "# 初始化Trainer\n",
329
+ "trainer = Trainer(\n",
330
+ " model=model,\n",
331
+ " args=training_args,\n",
332
+ " train_dataset=tokenized_datasets['train'],\n",
333
+ " eval_dataset=tokenized_datasets['test'],\n",
334
+ " data_collator=data_collator\n",
335
+ ")"
336
+ ]
337
+ },
338
+ {
339
+ "cell_type": "code",
340
+ "execution_count": null,
341
+ "id": "a9cd936a-5ea6-43e3-9848-27080f818606",
342
+ "metadata": {},
343
+ "outputs": [],
344
+ "source": [
345
+ "# 开始训练\n",
346
+ "trainer.train()"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": null,
352
+ "id": "315aae76-44b4-4513-8139-40ef22934873",
353
+ "metadata": {},
354
+ "outputs": [],
355
+ "source": [
356
+ "save_dir = 'gpt_ft/final'\n",
357
+ "trainer.save_model(save_dir)\n",
358
+ "print(\"Saved model to:\", save_dir)"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": null,
364
+ "id": "28d2dbbc-02ff-4120-b230-b19905a786cd",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "ave_dir = 'gpt_ft/final'\n",
369
+ "finetuned_model = GPT2LMHeadModel.from_pretrained(save_dir, local_files_only=True)"
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": null,
375
+ "id": "08987c3c-063a-4e9b-9ebb-e637b0b5bccd",
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "finetuned_model"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "code",
384
+ "execution_count": null,
385
+ "id": "d75010e8-6d6a-40ef-852e-0d705adc3da8",
386
+ "metadata": {},
387
+ "outputs": [],
388
+ "source": [
389
+ "print(\"input (test):\", input_text)\n",
390
+ "\n",
391
+ "print(\"--------------------------\\n\")\n",
392
+ "\n",
393
+ "print(\"model's answer: \\n\")\n",
394
+ "print(inference(input_text, finetuned_model, tokenizer))\n",
395
+ "\n",
396
+ "print(\"--------------------------\\n\")\n",
397
+ "print(\"real answer: \\n\")\n",
398
+ "print(data[\"test\"][0][\"output\"])"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": null,
404
+ "id": "64365e15-510e-4abf-92f5-c78b660b37dc",
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "test_data = data[\"test\"].select(range(100))\n",
409
+ "\n",
410
+ "data_list = []\n",
411
+ "\n",
412
+ "for entry in test_data:\n",
413
+ " input_text = format_input(entry)\n",
414
+ " #print(input_text)\n",
415
+ " response_text = inference(input_text, finetuned_model, tokenizer)\n",
416
+ " #print(response_text)\n",
417
+ " data = {\n",
418
+ " \"instruction\":entry[\"instruction\"],\n",
419
+ " \"input\":entry[\"input\"],\n",
420
+ " \"output\":entry[\"output\"],\n",
421
+ " \"model_response\":response_text\n",
422
+ " }\n",
423
+ "\n",
424
+ " data_list.append(data)"
425
+ ]
426
+ },
427
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "id": "a45fb780-fc3f-401c-b6e0-6f7d0c1682de",
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "import json\n",
435
+ "\n",
436
+ "# 定义输出文件路径\n",
437
+ "output_file = 'gpt2-small3-1024.json'\n",
438
+ "\n",
439
+ "# 将 Dataset 对象导出为 JSON 文件\n",
440
+ "# test_data.to_json(output_file)\n",
441
+ "with open(output_file, \"w\") as file:\n",
442
+ " json.dump(data_list, file, indent=4) # \"indent\" for pretty-printing"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": null,
448
+ "id": "a83c8881-c763-4bba-8b85-584a6722a38e",
449
+ "metadata": {},
450
+ "outputs": [],
451
+ "source": [
452
+ "import json\n",
453
+ "\n",
454
+ "\n",
455
+ "\n",
456
+ "with open(output_file, \"r\") as file:\n",
457
+ " test_data = json.load(file)\n",
458
+ "\n",
459
+ "all_num = len(test_data)\n",
460
+ "right_sum = 0\n",
461
+ "same_sum = 0\n",
462
+ "for item in test_data:\n",
463
+ " output = item[\"output\"]\n",
464
+ " #output = \" \".join(tokenizer.tokenize(output))\n",
465
+ " model_response = item[\"model_response\"]\n",
466
+ " if model_response == output: #same it\n",
467
+ " same_sum = same_sum + 1\n",
468
+ " \n",
469
+ " if model_response.find(output)!=-1: #find it\n",
470
+ " right_sum = right_sum + 1\n",
471
+ "\n",
472
+ "\n",
473
+ "print(\"presicion\", right_sum/all_num, \"same\", same_sum/all_num)"
474
+ ]
475
+ }
476
+ ],
477
+ "metadata": {
478
+ "kernelspec": {
479
+ "display_name": "Python 3 (ipykernel)",
480
+ "language": "python",
481
+ "name": "python3"
482
+ },
483
+ "language_info": {
484
+ "codemirror_mode": {
485
+ "name": "ipython",
486
+ "version": 3
487
+ },
488
+ "file_extension": ".py",
489
+ "mimetype": "text/x-python",
490
+ "name": "python",
491
+ "nbconvert_exporter": "python",
492
+ "pygments_lexer": "ipython3",
493
+ "version": "3.12.3"
494
+ }
495
+ },
496
+ "nbformat": 4,
497
+ "nbformat_minor": 5
498
+ }
04-gene-sft/.ipynb_checkpoints/3-llama-expand-dict-checkpoint.ipynb ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "6bad311a-c949-4246-9e6b-6d4ec76699b7",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 4.3 基于llama的基因数据词典扩充"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "d42860cf-14fc-48f5-ac6c-1fd92a6a92ba",
14
+ "metadata": {},
15
+ "source": [
16
+ "前面介绍了huggingface自带的分词器构建代码,这里介绍下更为通用的sentencepiece,部分huggingface其实就是来自于这个框架。\n",
17
+ "\n",
18
+ "SentencePiece 是一个语言无关的分词框架,由 Google 开发并开源。它不同于传统的基于词汇表(如词典)的分词方法,而是采用一种无监督的学习方式来训练模型,从而将文本分割成“子词”单元(subword units)。这种方法使得 SentencePiece 在处理未知词、罕见词以及多语言文本时表现出色。\n",
19
+ "\n",
20
+ "### 主要特点\n",
21
+ "\n",
22
+ "1. **语言无关**:\n",
23
+ " - SentencePiece 不依赖于任何特定语言的规则或词典,因此它可以应用于任何语言,甚至是混合语言的文本。\n",
24
+ "\n",
25
+ "2. **子词分词**:\n",
26
+ " - 它生成的是子词级别的 token,而不是完整的单词。这种方式可以有效地处理 OOV (out-of-vocabulary) 问题,并且有助于减少词汇表的大小。\n",
27
+ "\n",
28
+ "3. **无监督学习**:\n",
29
+ " - SentencePiece 使用无监督的方法从原始文本中学习分词规则,这意味着你只需要提供未标注的文本数据即可训练分词模型。\n",
30
+ "\n",
31
+ "4. **灵活的分词粒度**:\n",
32
+ " - 可以通过调整参数控制分词的粒度,即生成的子词单元的平均长度。这允许根据具体应用需求优化性能。\n",
33
+ "\n",
34
+ "5. **支持 BPE 和 Unigram LM**:\n",
35
+ " - SentencePiece 实现了两种流行的分词算法:Byte Pair Encoding (BPE) 和 Unigram Language Model (Unigram LM)。这两种方法各有优劣,可以根据任务选择合适的一种。\n",
36
+ "\n",
37
+ "6. **易于集成**:\n",
38
+ " - 提供了多种编程语言的绑定,包括 Python、C++、Go 等,方便在不同环境中使用。\n",
39
+ "\n",
40
+ "### 工作流程\n",
41
+ "\n",
42
+ "1. **准备语料库**:\n",
43
+ " - 收集用于训练分词模型的未标注文本数据。\n",
44
+ "\n",
45
+ "2. **训练模型**:\n",
46
+ " - 使用 `sentencepiece_trainer` 工具对收集到的文本进行训练,生成分词模型文件。\n",
47
+ " ```bash\n",
48
+ " spm_train --input=your_corpus.txt --model_prefix=myprefix --vocab_size=8000\n",
49
+ " ```\n",
50
+ "\n",
51
+ "3. **编码和解码**:\n",
52
+ " - 训练完成后,可以使用生成的模型对新文本进行编码(分词)和解码(还原)。\n",
53
+ " ```python\n",
54
+ " import sentencepiece as spm\n",
55
+ "\n",
56
+ " # 加载训练好的模型\n",
57
+ " sp = spm.SentencePieceProcessor(model_file='myprefix.model')\n",
58
+ "\n",
59
+ " # 分词\n",
60
+ " encoded = sp.encode(\"这是一个测试句子。\", out_type=str)\n",
61
+ " print(encoded)\n",
62
+ "\n",
63
+ " # 还原\n",
64
+ " decoded = sp.decode(encoded)\n",
65
+ " print(decoded)\n",
66
+ " ```\n",
67
+ "\n",
68
+ "### 应用场景\n",
69
+ "\n",
70
+ "- **自然语言处理 (NLP)**:广泛应用于各种 NLP 任务,如机器翻译、文本分类、情感分析等。\n",
71
+ "- **多语言支持**:特别适合处理包含多种语言的文本。\n",
72
+ "- **低资源语言**:对于那些缺乏丰富词汇资源的语言尤其有用。\n",
73
+ "- **预训练语言模型**:许多现代预训练语言模型(如 BERT、T5、mBART)都采用了 SentencePiece 作为其分词工具。\n",
74
+ "\n",
75
+ "### 小结\n",
76
+ "\n",
77
+ "SentencePiece 是一个强大而灵活的分词框架,适用于广泛的文本处理任务。它的无监督学习特性、语言无关性和高效的子词分词能力使其成为处理复杂和多样化文本数据的理想选择。希望这个简单的介绍能帮助你理解 SentencePiece 的基本概念和应用场景。如果有更多问题或需要进一步的帮助,请随时提问!"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "id": "a8dedb50-a428-4146-8edf-84e699abf81b",
83
+ "metadata": {},
84
+ "source": [
85
+ "## GENE分词器构建"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "39b5bf12-eaf0-432e-a2b0-99ba437daf3e",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "!pip install sentencepiece"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "3b732b8e-53d1-4bfa-891b-2d63b886cc4a",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "import sentencepiece as spm\n",
106
+ "\n",
107
+ "spm.SentencePieceTrainer.train(input='../01-data_env/data/dna_1g.txt,../01-data_env/data/protein_1g.txt',\n",
108
+ " model_prefix='gene_bpe_seg', \n",
109
+ " vocab_size=60000,\n",
110
+ " model_type='bpe', #默认是unigram\n",
111
+ " num_threads=10,\n",
112
+ " )"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "id": "19a06b82-31b8-48cb-9c83-ec016da2da8a",
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": [
122
+ "from sentencepiece import SentencePieceProcessor\n",
123
+ "model_path = \"gene_bpe_seg.model\"\n",
124
+ "sp_model = SentencePieceProcessor(model_file=model_path)\n",
125
+ "mm = sp_model.EncodeAsPieces(\"TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV\")\n",
126
+ "print(mm)"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "markdown",
131
+ "id": "958f7bd6-060f-48f4-8afe-02c3032312eb",
132
+ "metadata": {},
133
+ "source": [
134
+ "## 合并词典到llama\n",
135
+ "\n",
136
+ "我们以基础版本的llama为例,进行合并,请注意llama的使用限制。\n",
137
+ "\n",
138
+ "新版本的llama需要自行认证下载。[链接](https://huggingface.co/meta-llama)\n",
139
+ "\n",
140
+ "```\n",
141
+ "#建议在终端下执行\n",
142
+ "pip install -U huggingface_hub\n",
143
+ "export HF_ENDPOINT=https://hf-mirror.com\n",
144
+ "huggingface-cli download --resume-download yahma/llama-7b-hf --local-dir llama-7b-hf\n",
145
+ "```"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "id": "3bafcc33-2923-4026-bc39-c6ec716d2e3c",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "import os\n",
156
+ "os.environ[\"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION\"]=\"python\"\n",
157
+ "from transformers import LlamaTokenizer\n",
158
+ "from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model\n",
159
+ "import sentencepiece as spm"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "id": "66cb86ed-3225-4bb0-8aca-6005bc918d03",
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": [
169
+ "llama_tokenizer_dir = \"llama-7b-hf\" \n",
170
+ "dna_sp_model_file = \"gene_bpe_seg.model\"\n",
171
+ "\n",
172
+ "# load\n",
173
+ "llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)\n",
174
+ "dna_sp_model = spm.SentencePieceProcessor()\n",
175
+ "dna_sp_model.Load(dna_sp_model_file)\n",
176
+ "\n",
177
+ "llama_spm = sp_pb2_model.ModelProto()\n",
178
+ "llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())\n",
179
+ "dna_spm = sp_pb2_model.ModelProto()\n",
180
+ "dna_spm.ParseFromString(dna_sp_model.serialized_model_proto())\n",
181
+ "\n",
182
+ "# print number of tokens\n",
183
+ "print(len(llama_tokenizer),len(dna_sp_model))\n",
184
+ "print(llama_tokenizer.all_special_tokens)\n",
185
+ "print(llama_tokenizer.all_special_ids)\n",
186
+ "print(llama_tokenizer.special_tokens_map)"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "id": "7ba4240e-bc08-4be0-8ca3-c4e7a47fa055",
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "## Add dna tokens to LLaMA tokenizer\n",
197
+ "llama_spm_tokens_set=set(p.piece for p in llama_spm.pieces)\n",
198
+ "print(len(llama_spm_tokens_set))\n",
199
+ "print(f\"Before:{len(llama_spm_tokens_set)}\")\n",
200
+ "for p in dna_spm.pieces:\n",
201
+ " piece = p.piece\n",
202
+ " score = p.score\n",
203
+ " if piece not in llama_spm_tokens_set:\n",
204
+ " new_p = sp_pb2_model.ModelProto().SentencePiece()\n",
205
+ " new_p.piece = piece\n",
206
+ " new_p.score = score # 0?\n",
207
+ " llama_spm.pieces.append(new_p)\n",
208
+ "print(f\"New model pieces: {len(llama_spm.pieces)}\")"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": null,
214
+ "id": "a240a7d8-c1a9-4473-a5c5-157a25f97c16",
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "## Save\n",
219
+ "output_sp_dir = 'merged_gene_eng_tokenizer_sp'\n",
220
+ "output_hf_dir = 'merged_gene_eng_tokenizer_hf' # the path to save dna-LLaMA tokenizer\n",
221
+ "os.makedirs(output_sp_dir,exist_ok=True)\n",
222
+ "with open(output_sp_dir+'/gene_eng_llama_tokenizer.model', 'wb') as f:\n",
223
+ " f.write(llama_spm.SerializeToString())\n",
224
+ "\n",
225
+ "tokenizer = LlamaTokenizer(vocab_file=output_sp_dir+'/gene_eng_llama_tokenizer.model')\n",
226
+ "tokenizer.save_pretrained(output_hf_dir)\n",
227
+ "print(f\"gene-LLaMA tokenizer has been saved to {output_hf_dir}\")"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "code",
232
+ "execution_count": null,
233
+ "id": "cbd1f648-f8a0-4f16-b516-2ce3e7c7cfee",
234
+ "metadata": {},
235
+ "outputs": [],
236
+ "source": [
237
+ "# Test\n",
238
+ "llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)\n",
239
+ "dna_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)\n",
240
+ "print(tokenizer.all_special_tokens)\n",
241
+ "print(tokenizer.all_special_ids)\n",
242
+ "print(tokenizer.special_tokens_map)\n",
243
+ "text='''TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV,\n",
244
+ "The primary use of LLaMA is research on large language models, including'''\n",
245
+ "print(\"Test text:\\n\",text)\n",
246
+ "print(f\"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}\")\n",
247
+ "print(f\"Tokenized by GENE-LLaMA tokenizer:{dna_llama_tokenizer.tokenize(text)}\")"
248
+ ]
249
+ }
250
+ ],
251
+ "metadata": {
252
+ "kernelspec": {
253
+ "display_name": "Python 3 (ipykernel)",
254
+ "language": "python",
255
+ "name": "python3"
256
+ },
257
+ "language_info": {
258
+ "codemirror_mode": {
259
+ "name": "ipython",
260
+ "version": 3
261
+ },
262
+ "file_extension": ".py",
263
+ "mimetype": "text/x-python",
264
+ "name": "python",
265
+ "nbconvert_exporter": "python",
266
+ "pygments_lexer": "ipython3",
267
+ "version": "3.12.3"
268
+ }
269
+ },
270
+ "nbformat": 4,
271
+ "nbformat_minor": 5
272
+ }
04-gene-sft/.ipynb_checkpoints/4-deepspeed-intro-checkpoint.ipynb ADDED
@@ -0,0 +1,593 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c2e5c9f4-4378-4d39-bc4f-fb4b4a2b2481",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 4.4 deepspeed分布式训练简介"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "3383c2d7-91a9-4940-b3b2-698fb7d9dbb7",
14
+ "metadata": {},
15
+ "source": [
16
+ "## 使用gpt2+deepspeed训练"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "markdown",
21
+ "id": "c0d29667-1e75-46df-8f65-cae27609ee3f",
22
+ "metadata": {},
23
+ "source": [
24
+ "## deepspeed简介\n",
25
+ "\n",
26
+ "DeepSpeed 是微软开发的开源深度学习优化库,专为大规模模型训练和推理设计,能够显著提升训练速度、降低显存占用,并支持分布式计算。以下是 DeepSpeed 的关键特点和功能:\n",
27
+ "\n",
28
+ "---\n",
29
+ "\n",
30
+ "### **1. 核心特点**\n",
31
+ "\n",
32
+ "#### **(1)高效分布式训练**\n",
33
+ "DeepSpeed 提供先进的分布式技术(如 ZeRO 优化器),支持数百亿甚至上万亿参数的模型训练,同时降低单设备显存需求。\n",
34
+ "\n",
35
+ "#### **(2)显存优化**\n",
36
+ "通过显存分片(ZeRO)、梯度累积和混合精度训练,DeepSpeed 能够在有限显存的情况下训练大模型。\n",
37
+ "\n",
38
+ "#### **(3)性能提升**\n",
39
+ "DeepSpeed 优化了通信和计算过程,可提升多 GPU 分布式训练效率。\n",
40
+ "\n",
41
+ "#### **(4)灵活性**\n",
42
+ "与 PyTorch 无缝集成,并兼容 Hugging Face `transformers` 和其他主流深度学习库。\n",
43
+ "\n",
44
+ "#### **(5)推理优化**\n",
45
+ "支持高效推理(如量化和张量并行),适合大模型的生产部署。\n",
46
+ "\n",
47
+ "---\n",
48
+ "\n",
49
+ "### **2. 核心技术**\n",
50
+ "\n",
51
+ "#### **(1)ZeRO 优化器**\n",
52
+ "ZeRO(Zero Redundancy Optimizer)是 DeepSpeed 的核心技术之一,分为 3 个阶段:\n",
53
+ "- **Stage 1**:分片优化器状态(如动量、方差)。\n",
54
+ "- **Stage 2**:分片优化器状态和梯度。\n",
55
+ "- **Stage 3**:分片优化器状态、梯度和模型参数,实现全分片优化。\n",
56
+ "\n",
57
+ "每个阶段都进一步减少显存需求,Stage 3 可支持超大规模模型(如 GPT-3)。\n",
58
+ "\n",
59
+ "#### **(2)混合精度训练**\n",
60
+ "通过 FP16 或 BF16(半精度浮点数)计算,显著减少显存占用并提升计算效率。\n",
61
+ "\n",
62
+ "#### **(3)数据并行与模型并行**\n",
63
+ "- 数据并行:将数据划分到多个设备,每个设备计算部分梯度。\n",
64
+ "- 模型并行:将模型的不同部分分配到多个设备。\n",
65
+ "- 张量并行:将张量运算分解到多个 GPU 上。\n",
66
+ "\n",
67
+ "#### **(4)梯度累积**\n",
68
+ "支持更大的有效批量大小,适合显存受限的设备。\n",
69
+ "\n",
70
+ "#### **(5)推理优化**\n",
71
+ "- 推理阶段的显存优化和加速技术。\n",
72
+ "- 量化推理,减少模型大小和运行时开销。\n",
73
+ "\n",
74
+ "---\n",
75
+ "\n",
76
+ "### **3. 适用场景**\n",
77
+ "\n",
78
+ "#### **(1)大规模模型训练**\n",
79
+ "适合训练数十亿或上万亿参数的模型,如 GPT-3、BERT、T5 等。\n",
80
+ "\n",
81
+ "#### **(2)分布式训练**\n",
82
+ "支持单机多卡、多机多卡分布式训练,能高效利用多 GPU 环境。\n",
83
+ "\n",
84
+ "#### **(3)显存受限的模型微调**\n",
85
+ "通过显存优化技术,能在较小显存设备(如 16GB GPU)上微调大模型。\n",
86
+ "\n",
87
+ "#### **(4)高效推理**\n",
88
+ "适用于大语言模型的生产部署,支持推理加速和量化。\n",
89
+ "\n",
90
+ "---\n",
91
+ "\n",
92
+ "### **4. 优势与局限性**\n",
93
+ "\n",
94
+ "#### **优势**\n",
95
+ "1. 显存需求显著降低,适合超大规模模型训练。\n",
96
+ "2. 支持多种分布式模式,扩展性强。\n",
97
+ "3. 与 PyTorch 和 Hugging Face 无缝集成。\n",
98
+ "4. 推理优化技术降低部署成本。\n",
99
+ "\n",
100
+ "#### **局限性**\n",
101
+ "1. 配置和调优可能较为复杂。\n",
102
+ "2. 对小规模模型或数据集的性能提升有限。\n",
103
+ "\n",
104
+ "---\n",
105
+ "\n",
106
+ "### **5. 安装与基本用法**\n",
107
+ "\n",
108
+ "#### **安装**\n",
109
+ "```bash\n",
110
+ "pip install deepspeed\n",
111
+ "```\n",
112
+ "\n",
113
+ "#### **基本用法**\n",
114
+ "DeepSpeed 通过配置文件启用特性,例如 ZeRO 优化器:\n",
115
+ "```python\n",
116
+ "from transformers import GPT2LMHeadModel, TrainingArguments, Trainer\n",
117
+ "import deepspeed\n",
118
+ "\n",
119
+ "# 配置 DeepSpeed\n",
120
+ "deepspeed_config = {\n",
121
+ " \"train_batch_size\": 64,\n",
122
+ " \"gradient_accumulation_steps\": 8,\n",
123
+ " \"fp16\": {\n",
124
+ " \"enabled\": True\n",
125
+ " },\n",
126
+ " \"zero_optimization\": {\n",
127
+ " \"stage\": 2,\n",
128
+ " \"overlap_comm\": True\n",
129
+ " }\n",
130
+ "}\n",
131
+ "\n",
132
+ "# 保存配置文件\n",
133
+ "import json\n",
134
+ "with open(\"deepspeed_config.json\", \"w\") as f:\n",
135
+ " json.dump(deepspeed_config, f)\n",
136
+ "\n",
137
+ "# 集成到 Hugging Face Trainer\n",
138
+ "training_args = TrainingArguments(\n",
139
+ " output_dir=\"./results\",\n",
140
+ " per_device_train_batch_size=4,\n",
141
+ " num_train_epochs=3,\n",
142
+ " learning_rate=5e-5,\n",
143
+ " fp16=True,\n",
144
+ " deepspeed=\"./deepspeed_config.json\" # DeepSpeed 配置文件\n",
145
+ ")\n",
146
+ "\n",
147
+ "trainer = Trainer(\n",
148
+ " model=GPT2LMHeadModel.from_pretrained(\"gpt2\"),\n",
149
+ " args=training_args,\n",
150
+ " train_dataset=train_dataset,\n",
151
+ " eval_dataset=eval_dataset\n",
152
+ ")\n",
153
+ "\n",
154
+ "trainer.train()\n",
155
+ "```\n",
156
+ "\n",
157
+ "---\n",
158
+ "\n",
159
+ "### **6. 总结**\n",
160
+ "\n",
161
+ "DeepSpeed 是大模型训练的强力工具,特别是在多 GPU 环境下,其显存优化和分布式训练技术能显著提升训练效率。适用于以下场景:\n",
162
+ "- 超大规模模型的训练和微调。\n",
163
+ "- 多机多卡环境的分布式训练。\n",
164
+ "- 高效推理部署。\n",
165
+ "\n",
166
+ "如果需要进一步优化模型训练或部署性能,DeepSpeed 是值得尝试的工具!"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "markdown",
171
+ "id": "75b8219d-8069-4b18-96c8-d5024ee049f1",
172
+ "metadata": {},
173
+ "source": [
174
+ "## 大模型并行方法\n",
175
+ "\n",
176
+ "大模型的并行训练旨在克服单个 GPU 显存的限制和加速训练过程,通常适用于参数规模较大的模型(如 GPT-3、T5 等)。并行训练主要包括以下几种方法,每种方法适用于不同的场景和模型特性。\n",
177
+ "\n",
178
+ "---\n",
179
+ "\n",
180
+ "### **1. 数据并行(Data Parallelism)**\n",
181
+ "\n",
182
+ "#### **原理**\n",
183
+ "- 将数据切分成多个小批次,每个 GPU 处理其中一部分。\n",
184
+ "- 模型副本被复制到每个 GPU。\n",
185
+ "- 每个 GPU 独立计算梯度,最终通过梯度同步(如 AllReduce 操作)更新参数。\n",
186
+ "\n",
187
+ "#### **特点**\n",
188
+ "- **优点**:\n",
189
+ " - 实现简单,是最常用的并行方法。\n",
190
+ " - 对模型大小没有限制。\n",
191
+ "- **缺点**:\n",
192
+ " - 模型副本需要完整加载到每个 GPU,占用显存。\n",
193
+ " - 在超大规模模型中,显存压力较大。\n",
194
+ "\n",
195
+ "#### **适用场景**\n",
196
+ "- 参数规模适中,显存可以容纳整个模型的场景。\n",
197
+ "\n",
198
+ "---\n",
199
+ "\n",
200
+ "### **2. 模型并行(Model Parallelism)**\n",
201
+ "\n",
202
+ "#### **原理**\n",
203
+ "- 将模型切分成不同的部分,将不同部分分配到不同的 GPU。\n",
204
+ "- 前向传播和后向传播时,数据在模型的不同部分之间传递。\n",
205
+ "\n",
206
+ "#### **特点**\n",
207
+ "- **优点**:\n",
208
+ " - 不需要复制整个模型,可以支持超大规模模型。\n",
209
+ "- **缺点**:\n",
210
+ " - GPU 之间通信频繁,可能成为性能瓶颈。\n",
211
+ " - 实现复杂,切分模型需要精心设计。\n",
212
+ " \n",
213
+ "#### **适用场景**\n",
214
+ "- 单个 GPU 无法容纳完整模型参数的场景。\n",
215
+ "\n",
216
+ "#### **具体实现**\n",
217
+ "- 将 Transformer 的不同层分配到不同的 GPU。\n",
218
+ "- 常用工具:DeepSpeed 的 Pipeline Parallelism、NVIDIA Megatron-LM。\n",
219
+ "\n",
220
+ "---\n",
221
+ "\n",
222
+ "### **3. 张量并行(Tensor Parallelism)**\n",
223
+ "\n",
224
+ "#### **原理**\n",
225
+ "- 将模型内部的张量(如权重矩阵)切分为多个子张量,并分配到不同 GPU。\n",
226
+ "- GPU 之间协作完成矩阵计算。\n",
227
+ "\n",
228
+ "#### **特点**\n",
229
+ "- **优点**:\n",
230
+ " - 减少了每个 GPU 的显存占用,同时保持模型整体完整性。\n",
231
+ "- **缺点**:\n",
232
+ " - 实现较复杂,需要优化通信操作。\n",
233
+ " - 通信开销较高,适合较大批量的训练。\n",
234
+ "\n",
235
+ "#### **适用场景**\n",
236
+ "- 参数非常大的模型(如 GPT-3)。\n",
237
+ "- 需要极致优化显存的场景。\n",
238
+ "\n",
239
+ "#### **具体实现**\n",
240
+ "- NVIDIA 的 Megatron-LM 和 Hugging Face Transformers 提供了张量并行的支持。\n",
241
+ "\n",
242
+ "---\n",
243
+ "\n",
244
+ "### **4. 管道并行(Pipeline Parallelism)**\n",
245
+ "\n",
246
+ "#### **原理**\n",
247
+ "- 将模型分为不同的部分(通常是按层划分),每部分分配到不同的 GPU。\n",
248
+ "- 数据按照流水线的方式流经每个 GPU。\n",
249
+ "\n",
250
+ "#### **特点**\n",
251
+ "- **优点**:\n",
252
+ " - 减少每个 GPU 的显存压力。\n",
253
+ " - 通过流水线增加计算效率。\n",
254
+ "- **缺点**:\n",
255
+ " - 引入流水线延迟。\n",
256
+ " - 实现复杂,需管理数据依赖和同步。\n",
257
+ "\n",
258
+ "#### **适用场景**\n",
259
+ "- 模型非常深,层数较多的场景。\n",
260
+ "\n",
261
+ "#### **具体实现**\n",
262
+ "- DeepSpeed 的 Pipeline Parallelism。\n",
263
+ "\n",
264
+ "---\n",
265
+ "\n",
266
+ "### **5. 混合并行(Hybrid Parallelism)**\n",
267
+ "\n",
268
+ "#### **原理**\n",
269
+ "- 将数据并行、模型并行、张量并行和管道并行组合使用,充分利用多 GPU 资源。\n",
270
+ "- 不同的并行方法在不同维度协同工作。\n",
271
+ "\n",
272
+ "#### **特点**\n",
273
+ "- **优点**:\n",
274
+ " - 灵活且适应性强,适合超大规模模型。\n",
275
+ "- **缺点**:\n",
276
+ " - 配置复杂,依赖于框架和训练任务。\n",
277
+ "\n",
278
+ "#### **适用场景**\n",
279
+ "- 超大规模模型(如 GPT-3 或参数量 >1T)。\n",
280
+ "- 多机多卡的大型训练环境。\n",
281
+ "\n",
282
+ "#### **具体实现**\n",
283
+ "- NVIDIA Megatron-LM 和 DeepSpeed 的混合并行支持。\n",
284
+ "\n",
285
+ "---\n",
286
+ "\n",
287
+ "### **6. ZeRO 优化并行(Zero Redundancy Optimizer)**\n",
288
+ "\n",
289
+ "#### **原理**\n",
290
+ "- 通过分片存储模型参数、优化器状态和梯度,显著减少每个 GPU 的显存占用。\n",
291
+ "\n",
292
+ "#### **特点**\n",
293
+ "- **优点**:\n",
294
+ " - 极大降低显存需求。\n",
295
+ " - 支持超大规模模型。\n",
296
+ "- **缺点**:\n",
297
+ " - 对 GPU 间通信要求较高。\n",
298
+ " - 比数据并行复杂。\n",
299
+ "\n",
300
+ "#### **适用场景**\n",
301
+ "- 超大模型的高效训练。\n",
302
+ "\n",
303
+ "#### **具体实现**\n",
304
+ "- DeepSpeed 提供的 ZeRO Stage 1/2/3。\n",
305
+ "\n",
306
+ "---\n",
307
+ "\n",
308
+ "### **方法对比**\n",
309
+ "\n",
310
+ "| 并行方法 | 主要优点 | 主要缺点 | 适用场景 |\n",
311
+ "|---------------|-------------------------------|-------------------------------|---------------------------|\n",
312
+ "| 数据并行 | 简单高效,易实现 | 模型副本占用大量显存 | 模型规模适中,显存足够 |\n",
313
+ "| 模型并行 | 支持大模型 | 通信开销大,切分复杂 | 超大模型,显存有限 |\n",
314
+ "| 张量并行 | 高效利用显存 | 实现复杂,通信频繁 | 参数规模极大的模型 |\n",
315
+ "| 管道并行 | 显存需求降低,适合深模型 | 流水线延迟,数据同步复杂 | 层数多的大型模型 |\n",
316
+ "| 混合并行 | 灵活适配超大规模模型 | 配置复杂,依赖框架 | 超大规模模型(如 GPT-3) |\n",
317
+ "| ZeRO 并行 | 极大节省显存,占用少 | 通信成本高 | 超大规模模型显存优化 |\n",
318
+ "\n",
319
+ "---\n",
320
+ "\n",
321
+ "### **总结**\n",
322
+ "- **中等规模模型**:优先使用 **数据并行**。\n",
323
+ "- **单卡显存不足**:采用 **模型并行** 或 **张量并行**。\n",
324
+ "- **超大规模模型**:使用 **混合并行** 或 DeepSpeed 的 **ZeRO 优化**。\n",
325
+ "\n",
326
+ "对于现代超大规模模型,通常采用混合并行方法,比如 NVIDIA 的 Megatron-LM 和微软的 DeepSpeed,它们综合了多种并行策略,能够有效利用计算资源并加速训练。如果您有具体的硬件环境或模型需求,可以进一步探讨适合的并行方案!"
327
+ ]
328
+ },
329
+ {
330
+ "cell_type": "markdown",
331
+ "id": "cd848439-bac8-46b2-9a0f-59ae7c343954",
332
+ "metadata": {},
333
+ "source": [
334
+ "## deepspeed的并行设计\n",
335
+ "\n",
336
+ "\n",
337
+ "是的,DeepSpeed 支持多种并行策略,包括 **数据并行**、**模型并行** 和 **张量并行**,并且可以通过其配置文件灵活地设置这些并行模式。\n",
338
+ "\n",
339
+ "---\n",
340
+ "\n",
341
+ "### **1. 数据并行**\n",
342
+ "\n",
343
+ "#### **原理**\n",
344
+ "在数据并行中,DeepSpeed 将数据批次划分到多个 GPU,每个 GPU 上都有模型的完整副本,计算独立的梯度。最终通过 `AllReduce` 操作同步梯度并更新模型参数。\n",
345
+ "\n",
346
+ "#### **如何设置**\n",
347
+ "DeepSpeed 默认支持数据并行,启用 `zero_optimization` 后会自动结合 ZeRO 优化器进行分片数据并行:\n",
348
+ "```json\n",
349
+ "{\n",
350
+ " \"train_batch_size\": 64,\n",
351
+ " \"gradient_accumulation_steps\": 8,\n",
352
+ " \"fp16\": {\n",
353
+ " \"enabled\": true\n",
354
+ " },\n",
355
+ " \"zero_optimization\": {\n",
356
+ " \"stage\": 1\n",
357
+ " }\n",
358
+ "}\n",
359
+ "```\n",
360
+ "\n",
361
+ "---\n",
362
+ "\n",
363
+ "### **2. 模型并行**\n",
364
+ "\n",
365
+ "#### **原理**\n",
366
+ "模型并行将模型的不同部分(如 Transformer 层或权重张量)分布到多个 GPU。DeepSpeed 本身不直接实现模型并行,但可以与模型并行框架(如 NVIDIA Megatron-LM)集成。\n",
367
+ "\n",
368
+ "#### **如何设置**\n",
369
+ "如果使用模型并行(如层级分割):\n",
370
+ "1. 使用 DeepSpeed 的 Pipeline Parallelism:\n",
371
+ " ```json\n",
372
+ " {\n",
373
+ " \"train_batch_size\": 64,\n",
374
+ " \"pipeline_parallel_size\": 2, # 设置流水线并行 GPU 数量\n",
375
+ " \"fp16\": {\n",
376
+ " \"enabled\": true\n",
377
+ " },\n",
378
+ " \"zero_optimization\": {\n",
379
+ " \"stage\": 1\n",
380
+ " }\n",
381
+ " }\n",
382
+ " ```\n",
383
+ "\n",
384
+ "2. 与 NVIDIA Megatron-LM 集成:\n",
385
+ " 在代码中使用 Megatron-LM 的模型并行支持,然后结合 DeepSpeed:\n",
386
+ " ```python\n",
387
+ " from megatron import get_model_parallel_world_size\n",
388
+ " import deepspeed\n",
389
+ "\n",
390
+ " model = MyModel(...)\n",
391
+ " model = deepspeed.initialize(\n",
392
+ " model=model,\n",
393
+ " model_parallel_size=get_model_parallel_world_size(),\n",
394
+ " config=\"./deepspeed_config.json\"\n",
395
+ " )\n",
396
+ " ```\n",
397
+ "\n",
398
+ "---\n",
399
+ "\n",
400
+ "### **3. 张量并行**\n",
401
+ "\n",
402
+ "#### **原理**\n",
403
+ "张量并行将模型参数张量(如权重矩阵)分片到多个 GPU,并通过通信协作完成计算。DeepSpeed 提供了张量并行的支持(在 ZeRO Stage 3 中),或者通过集成 Megatron-LM 实现。\n",
404
+ "\n",
405
+ "#### **如何设置**\n",
406
+ "1. **使用 ZeRO Stage 3**:\n",
407
+ " ZeRO Stage 3 会分片模型参数和优化器状态,类似于张量并行的效果:\n",
408
+ " ```json\n",
409
+ " {\n",
410
+ " \"train_batch_size\": 64,\n",
411
+ " \"gradient_accumulation_steps\": 8,\n",
412
+ " \"fp16\": {\n",
413
+ " \"enabled\": true\n",
414
+ " },\n",
415
+ " \"zero_optimization\": {\n",
416
+ " \"stage\": 3,\n",
417
+ " \"offload_optimizer\": {\n",
418
+ " \"device\": \"cpu\",\n",
419
+ " \"pin_memory\": true\n",
420
+ " },\n",
421
+ " \"offload_param\": {\n",
422
+ " \"device\": \"cpu\",\n",
423
+ " \"pin_memory\": true\n",
424
+ " }\n",
425
+ " }\n",
426
+ " }\n",
427
+ " ```\n",
428
+ "\n",
429
+ "2. **集成 Megatron-LM**:\n",
430
+ " 如果需要更复杂的张量并行方案(如矩阵切分),可以通过 Megatron-LM 实现,然后与 DeepSpeed 集成。\n",
431
+ "\n",
432
+ "---\n",
433
+ "\n",
434
+ "### **4. 混合并行**\n",
435
+ "\n",
436
+ "#### **原理**\n",
437
+ "混合并行结合了数据并行、模型并行和张量并行。DeepSpeed 提供了对这些模式的集成支持,允许您灵活配置。\n",
438
+ "\n",
439
+ "#### **如何设置**\n",
440
+ "结合数据并行和流水线并行:\n",
441
+ "```json\n",
442
+ "{\n",
443
+ " \"train_batch_size\": 64,\n",
444
+ " \"gradient_accumulation_steps\": 8,\n",
445
+ " \"fp16\": {\n",
446
+ " \"enabled\": true\n",
447
+ " },\n",
448
+ " \"pipeline_parallel_size\": 2, # 流水线并行\n",
449
+ " \"zero_optimization\": {\n",
450
+ " \"stage\": 2\n",
451
+ " }\n",
452
+ "}\n",
453
+ "```\n",
454
+ "\n",
455
+ "与张量并行结合:\n",
456
+ "1. 在代码中配置张量并行:\n",
457
+ " ```python\n",
458
+ " from megatron import get_tensor_parallel_world_size\n",
459
+ " model = MyModel(...)\n",
460
+ " model = deepspeed.initialize(\n",
461
+ " model=model,\n",
462
+ " tensor_parallel_size=get_tensor_parallel_world_size(),\n",
463
+ " config=\"./deepspeed_config.json\"\n",
464
+ " )\n",
465
+ " ```\n",
466
+ "\n",
467
+ "2. DeepSpeed 配置文件中启用 ZeRO Stage 3。\n",
468
+ "\n",
469
+ "---\n",
470
+ "\n",
471
+ "### **5. 选择并行策略**\n",
472
+ "\n",
473
+ "| 并行模式 | **支持方式** | **适用场景** |\n",
474
+ "|---------------|------------------------------------------|-----------------------------------------|\n",
475
+ "| 数据并行 | 默认支持,结合 ZeRO 优化器 | 模型参数较小,显存压力不大的场景 |\n",
476
+ "| 模型并行 | 使用 Pipeline Parallelism 或集成 Megatron-LM | 模型参数非常大,单 GPU 无法容纳的场景 |\n",
477
+ "| 张量并行 | ZeRO Stage 3 或集成 Megatron-LM | 参数矩阵非常大,需要分片计算的场景 |\n",
478
+ "| 混合并行 | 结合数据并行、模型并行和张量并行 | 超大规模模型(如 GPT-3)训练 |\n",
479
+ "\n",
480
+ "---\n",
481
+ "\n",
482
+ "### **6. 示例代码**\n",
483
+ "\n",
484
+ "以下是集成 ZeRO 和 Pipeline Parallelism 的完整示例:\n",
485
+ "```python\n",
486
+ "import deepspeed\n",
487
+ "from transformers import GPT2LMHeadModel, TrainingArguments, Trainer\n",
488
+ "from datasets import load_dataset\n",
489
+ "\n",
490
+ "# 加载数据\n",
491
+ "dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train\")\n",
492
+ "\n",
493
+ "# 加载模型\n",
494
+ "model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
495
+ "\n",
496
+ "# 配置 DeepSpeed\n",
497
+ "deepspeed_config = {\n",
498
+ " \"train_batch_size\": 64,\n",
499
+ " \"gradient_accumulation_steps\": 8,\n",
500
+ " \"pipeline_parallel_size\": 2, # 流水线并行\n",
501
+ " \"fp16\": {\n",
502
+ " \"enabled\": True\n",
503
+ " },\n",
504
+ " \"zero_optimization\": {\n",
505
+ " \"stage\": 2\n",
506
+ " }\n",
507
+ "}\n",
508
+ "\n",
509
+ "# 保存配置文件\n",
510
+ "import json\n",
511
+ "with open(\"deepspeed_config.json\", \"w\") as f:\n",
512
+ " json.dump(deepspeed_config, f)\n",
513
+ "\n",
514
+ "# 训练参数\n",
515
+ "training_args = TrainingArguments(\n",
516
+ " output_dir=\"./results\",\n",
517
+ " per_device_train_batch_size=4,\n",
518
+ " num_train_epochs=3,\n",
519
+ " deepspeed=\"./deepspeed_config.json\", # 指定 DeepSpeed 配置文件\n",
520
+ ")\n",
521
+ "\n",
522
+ "# 初始化 Trainer\n",
523
+ "trainer = Trainer(\n",
524
+ " model=model,\n",
525
+ " args=training_args,\n",
526
+ " train_dataset=dataset\n",
527
+ ")\n",
528
+ "\n",
529
+ "# 开始训练\n",
530
+ "trainer.train()\n",
531
+ "```\n",
532
+ "\n",
533
+ "---\n",
534
+ "\n",
535
+ "### **总结**\n",
536
+ "\n",
537
+ "- **数据并行**:默认支持,结合 ZeRO 进行优化。\n",
538
+ "- **模型并行**:使用 Pipeline Parallelism 或与 Megatron-LM 集成。\n",
539
+ "- **张量并行**:通过 ZeRO Stage 3 或 Megatron-LM 实现。\n",
540
+ "- **混合并行**:灵活结合多种并行方法,用于超大规模模型。\n",
541
+ "\n",
542
+ "DeepSpeed 的配置高度灵活,可以根据模型大小、显存限制和硬件条件选择适合的并行策略。"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "markdown",
547
+ "id": "ab2812bc-f743-4f18-b49c-972781484dc6",
548
+ "metadata": {},
549
+ "source": [
550
+ "## gpt2的训练\n",
551
+ "\n",
552
+ "```\n",
553
+ "#一般方式训练gpt2\n",
554
+ "python pretain_gpt2.py\n",
555
+ "\n",
556
+ "\n",
557
+ "#deepspeed训练gpt2, 只多一行代码\n",
558
+ "torchrun --nproc_per_node=6 deepspeed_pretrain_gpt2.py\n",
559
+ "\n",
560
+ "```"
561
+ ]
562
+ },
563
+ {
564
+ "cell_type": "code",
565
+ "execution_count": null,
566
+ "id": "9cb60dc2-4cec-492d-836b-67694829acf2",
567
+ "metadata": {},
568
+ "outputs": [],
569
+ "source": []
570
+ }
571
+ ],
572
+ "metadata": {
573
+ "kernelspec": {
574
+ "display_name": "Python 3 (ipykernel)",
575
+ "language": "python",
576
+ "name": "python3"
577
+ },
578
+ "language_info": {
579
+ "codemirror_mode": {
580
+ "name": "ipython",
581
+ "version": 3
582
+ },
583
+ "file_extension": ".py",
584
+ "mimetype": "text/x-python",
585
+ "name": "python",
586
+ "nbconvert_exporter": "python",
587
+ "pygments_lexer": "ipython3",
588
+ "version": "3.12.3"
589
+ }
590
+ },
591
+ "nbformat": 4,
592
+ "nbformat_minor": 5
593
+ }
04-gene-sft/.ipynb_checkpoints/5-llama-continue-train-checkpoint.ipynb ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "1e6d4978-4f0f-4268-aa23-d864857bd6c8",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 4.5 基于llama的基因大模型持续预训练"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "1ad15cdf-386a-48bf-b44d-5014b1df8f8e",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": []
18
+ }
19
+ ],
20
+ "metadata": {
21
+ "kernelspec": {
22
+ "display_name": "Python 3 (ipykernel)",
23
+ "language": "python",
24
+ "name": "python3"
25
+ },
26
+ "language_info": {
27
+ "codemirror_mode": {
28
+ "name": "ipython",
29
+ "version": 3
30
+ },
31
+ "file_extension": ".py",
32
+ "mimetype": "text/x-python",
33
+ "name": "python",
34
+ "nbconvert_exporter": "python",
35
+ "pygments_lexer": "ipython3",
36
+ "version": "3.12.3"
37
+ }
38
+ },
39
+ "nbformat": 4,
40
+ "nbformat_minor": 5
41
+ }
04-gene-sft/.ipynb_checkpoints/6-llama-instruction-ft-checkpoint.ipynb ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "963e9ae0-ac68-44be-8c7d-fb9842784362",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 4.6 基于llama的基因大模型指令微调"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "182b82c4-d484-4c15-a600-03c3b51367ec",
14
+ "metadata": {},
15
+ "source": [
16
+ "**PEFT**(Parameter-Efficient Fine-Tuning,参数高效微调)是一种优化技术,旨在以最小的参数更新实现对大规模预训练模型(如 GPT、BERT 等)的微调。PEFT 技术通过减少微调所需的参数量,显著降低了存储和计算开销,同时保留模型的性能,特别适合资源受限的场景和领域特定任务的定制化。\n",
17
+ "\n",
18
+ "---\n",
19
+ "\n",
20
+ "### **1. 核心思想**\n",
21
+ "传统的微调方式需要更新整个预训练模型的所有参数,PEFT 技术通过只调整少量的参数(如特定层或额外添加的小型模块)实现微调目标,大幅减少了训练开销和存储需求。\n",
22
+ "\n",
23
+ "---\n",
24
+ "\n",
25
+ "### **2. 常见的 PEFT 方法**\n",
26
+ "\n",
27
+ "#### **(1)Adapter 模型**\n",
28
+ "- 在每一层 Transformer 的输出中插入小型适配器模块,仅训练适配器模块的参数。\n",
29
+ "- 原始模型参数保持冻结不变。\n",
30
+ "- 优点:适配器模块参数量小,能适应不同任务。\n",
31
+ "\n",
32
+ "示例方法:\n",
33
+ "- **AdapterFusion**\n",
34
+ "- **MAD-X**\n",
35
+ "\n",
36
+ "---\n",
37
+ "\n",
38
+ "#### **(2)Prefix Tuning**\n",
39
+ "- 在 Transformer 的输入前添加一组可学习的前缀向量,这些前缀与模型的注意力机制交互。\n",
40
+ "- 只调整前缀向量的参数,而不更新原始模型。\n",
41
+ "- 优点:对生成任务效果显著,参数量进一步减少。\n",
42
+ "\n",
43
+ "---\n",
44
+ "\n",
45
+ "#### **(3)LoRA(Low-Rank Adaptation)**\n",
46
+ "- 将预训练模型中的部分权重分解为两个低秩矩阵,仅调整这些低秩矩阵的参数。\n",
47
+ "- 原始权重保持冻结状态。\n",
48
+ "- 优点:参数量极小,计算高效。\n",
49
+ " \n",
50
+ "---\n",
51
+ "\n",
52
+ "#### **(4)Prompt Tuning**\n",
53
+ "- 在输入文本中添加可学习的提示(Prompt)。\n",
54
+ "- 适合 NLP 任务中的文本生成、分类等。\n",
55
+ "- 优点:实现简单,易于集成到现有框架。\n",
56
+ "\n",
57
+ "---\n",
58
+ "\n",
59
+ "### **3. PEFT 的优势**\n",
60
+ "\n",
61
+ "1. **显著减少参数更新量**:\n",
62
+ " - 微调传统的大模型(如 GPT-3)需要更新数百亿参数,而 PEFT 仅需更新百万级别甚至更少的参数。\n",
63
+ "\n",
64
+ "2. **高效存储**:\n",
65
+ " - 每个任务的微调结果只需存储少量额外参数,而不是整个模型。\n",
66
+ "\n",
67
+ "3. **适用多任务**:\n",
68
+ " - 同一预训练模型可以通过不同的 PEFT 模块适配多个任务,无需重新训练。\n",
69
+ "\n",
70
+ "4. **降低计算开销**:\n",
71
+ " - 训练所需的内存和计算显著减少,适合资源有限的环境。\n",
72
+ "\n",
73
+ "---\n",
74
+ "\n",
75
+ "### **4. 应用场景**\n",
76
+ "\n",
77
+ "1. **领域特定任务**:\n",
78
+ " - 医疗、法律、金融等领域微调预训练模型。\n",
79
+ "\n",
80
+ "2. **多任务学习**:\n",
81
+ " - 适配多个任务,复用同一模型的预训练权重。\n",
82
+ "\n",
83
+ "3. **资源受限场景**:\n",
84
+ " - 移动设备、边缘设备上的模型部署。\n",
85
+ "\n",
86
+ "---\n",
87
+ "\n",
88
+ "### **5. Hugging Face PEFT 库**\n",
89
+ "\n",
90
+ "Hugging Face 提供了专门的 PEFT 库,支持多种参数高效微调技术:\n",
91
+ "- **安装**:\n",
92
+ " ```bash\n",
93
+ " pip install peft\n",
94
+ " ```\n",
95
+ "- **使用 LoRA 微调示例**:\n",
96
+ " ```python\n",
97
+ " from transformers import AutoModelForCausalLM, AutoTokenizer\n",
98
+ " from peft import LoraConfig, get_peft_model, TaskType\n",
99
+ "\n",
100
+ " # 加载模型和分词器\n",
101
+ " model_name = \"gpt2\"\n",
102
+ " model = AutoModelForCausalLM.from_pretrained(model_name)\n",
103
+ " tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
104
+ "\n",
105
+ " # 配置 LoRA\n",
106
+ " lora_config = LoraConfig(\n",
107
+ " task_type=TaskType.CAUSAL_LM,\n",
108
+ " r=8,\n",
109
+ " lora_alpha=32,\n",
110
+ " target_modules=[\"q_proj\", \"v_proj\"],\n",
111
+ " lora_dropout=0.1,\n",
112
+ " bias=\"none\"\n",
113
+ " )\n",
114
+ "\n",
115
+ " # 使用 LoRA 微调模型\n",
116
+ " model = get_peft_model(model, lora_config)\n",
117
+ " model.print_trainable_parameters()\n",
118
+ "\n",
119
+ " # 微调代码...\n",
120
+ " ```\n",
121
+ "\n",
122
+ "---\n",
123
+ "\n",
124
+ "### **6. PEFT 的局限性**\n",
125
+ "1. **特定任务限制**:\n",
126
+ " - 在一些复杂任务中,PEFT 方法可能不如全量微调效果好。\n",
127
+ "\n",
128
+ "2. **需要设计合适的模块**:\n",
129
+ " - 不同任务需要选择和设计合适的 PEFT 技术。\n",
130
+ "\n",
131
+ "3. **与模型架构相关**:\n",
132
+ " - PEFT 技术可能需要对模型架构进行一定程度的修改。\n",
133
+ "\n",
134
+ "---\n",
135
+ "\n",
136
+ "### **7. 总结**\n",
137
+ "PEFT 是一个极具潜力的技术,特别适合在有限资源下对大模型进行微调。它在许多领域和任务中已显示出良好的效果,例如 LoRA 和 Adapter 模型已经成为高效微调的主流方法。\n",
138
+ "\n",
139
+ "如果您需要实现高效微调,可以结合 Hugging Face 的 PEFT 库快速上手。"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 1,
145
+ "id": "5aa3d240-44e1-4811-8f61-d6ff2500a798",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "import subprocess\n",
150
+ "import os\n",
151
+ "# 设置环境变量, autodl一般区域\n",
152
+ "result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
153
+ "output = result.stdout\n",
154
+ "for line in output.splitlines():\n",
155
+ " if '=' in line:\n",
156
+ " var, value = line.split('=', 1)\n",
157
+ " os.environ[var] = value"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "markdown",
162
+ "id": "17bdb69d-3f0f-465e-bd60-2047a088e264",
163
+ "metadata": {},
164
+ "source": [
165
+ "如果您不确定模型中有哪些模块可以微调,可以打印模型结构:"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "41a0c049-9134-4d89-aad0-1aa2241a9fca",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "from transformers import AutoModelForCausalLM\n",
176
+ "\n",
177
+ "model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
178
+ "\n",
179
+ "# 打印所有模块名称\n",
180
+ "for name, module in model.named_modules():\n",
181
+ " print(name)"
182
+ ]
183
+ }
184
+ ],
185
+ "metadata": {
186
+ "kernelspec": {
187
+ "display_name": "Python 3 (ipykernel)",
188
+ "language": "python",
189
+ "name": "python3"
190
+ },
191
+ "language_info": {
192
+ "codemirror_mode": {
193
+ "name": "ipython",
194
+ "version": 3
195
+ },
196
+ "file_extension": ".py",
197
+ "mimetype": "text/x-python",
198
+ "name": "python",
199
+ "nbconvert_exporter": "python",
200
+ "pygments_lexer": "ipython3",
201
+ "version": "3.12.3"
202
+ }
203
+ },
204
+ "nbformat": 4,
205
+ "nbformat_minor": 5
206
+ }
04-gene-sft/.ipynb_checkpoints/build_gene_bpe_seg-checkpoint.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sentencepiece as spm
2
+
3
+ spm.SentencePieceTrainer.train(input='../01-data_env/data/dna_1g.txt,../01-data_env/data/protein_1g.txt',
4
+ model_prefix='gene_bpe_seg',
5
+ vocab_size=60000,
6
+ model_type='bpe', #默认是unigram
7
+ num_threads=10,
8
+ )
9
+
10
+ from sentencepiece import SentencePieceProcessor
11
+ model_path = "gene_bpe_seg.model"
12
+ sp_model = SentencePieceProcessor(model_file=model_path)
13
+ mm = sp_model.EncodeAsPieces("TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV")
14
+ print(mm)
04-gene-sft/.ipynb_checkpoints/deepspeed_pretrain_gpt2-checkpoint.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import os
2
+
3
+ # # 设置环境变量
4
+ # os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
5
+
6
+ # # 打印环境变量以确认设置成功
7
+ # print(os.environ.get('HF_ENDPOINT'))
8
+ import subprocess
9
+ import os
10
+ # 设置环境变量, autodl一般区域
11
+ result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
12
+ output = result.stdout
13
+ for line in output.splitlines():
14
+ if '=' in line:
15
+ var, value = line.split('=', 1)
16
+ os.environ[var] = value
17
+
18
+
19
+
20
+ import math
21
+ from transformers import (
22
+ GPT2Config,
23
+ GPT2LMHeadModel,
24
+ GPT2TokenizerFast,
25
+ TrainingArguments,
26
+ Trainer,
27
+ DataCollatorForLanguageModeling,
28
+ )
29
+ from datasets import Dataset
30
+ from datasets import load_dataset
31
+ import evaluate
32
+ import numpy as np
33
+ from transformers import AutoTokenizer,AutoConfig
34
+
35
+
36
+ # 加载 OpenWebText 数据集
37
+ dataset = load_dataset("text", data_files="../01-data_env/data/dna_1g.txt")["train"].train_test_split(test_size=0.01, shuffle=True)
38
+
39
+ # 定义最大输入长度
40
+ max_length = 256
41
+
42
+
43
+ # 数据预处理
44
+ def preprocess_function(examples):
45
+ return tokenizer(examples["text"], truncation=True, max_length=max_length)
46
+
47
+
48
+
49
+ # 初始化 GPT-2 分词器
50
+ tokenizer = AutoTokenizer.from_pretrained("gpt2_tokenizer")
51
+ tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=5)
52
+
53
+ # 4. 创建一个数据收集器,用于动态填充和遮蔽
54
+ data_collator = DataCollatorForLanguageModeling(
55
+ tokenizer=tokenizer, mlm=False
56
+ )
57
+
58
+
59
+
60
+ # 加载并调整 GPT-2 配置
61
+ config = AutoConfig.from_pretrained(
62
+ "gpt2", # 加载 GPT-2 的默认配置
63
+ vocab_size=len(tokenizer), # 更新词汇表大小为自定义分词器的词汇表大小
64
+ n_ctx=max_length, # 最大上下文长度(序列长度)
65
+ n_positions=max_length, # 最大位置编码长度,通常与 n_ctx 一致
66
+ )
67
+
68
+ # 初始化 GPT-2 模型
69
+ model = GPT2LMHeadModel(config)
70
+
71
+ # 定义训练参数
72
+ training_args = TrainingArguments(
73
+ output_dir="./gpt2-small",
74
+ overwrite_output_dir=True,
75
+ num_train_epochs=5,
76
+ per_device_train_batch_size=64,
77
+ save_steps=10000,
78
+ save_total_limit=2,
79
+ logging_dir="./logs",
80
+ logging_steps=20000,
81
+ evaluation_strategy="steps",
82
+ eval_steps=10000,
83
+ learning_rate=5e-5,
84
+ warmup_steps=500,
85
+ weight_decay=0.01,
86
+ fp16=True, # 启用混合精度训练
87
+ deepspeed="ds_zero2_no_offload.json"
88
+ )
89
+
90
+
91
+ # 初始化 Trainer
92
+ trainer = Trainer(
93
+ model=model,
94
+ args=training_args,
95
+ train_dataset=tokenized_dataset["train"],
96
+ eval_dataset=tokenized_dataset["test"],
97
+ tokenizer=tokenizer,
98
+ data_collator=data_collator,
99
+ )
100
+
101
+ # 开始训练
102
+ trainer.train()
103
+
104
+
105
+ # 评估 perplexity
106
+ eval_results = trainer.evaluate()
107
+ perplexity = math.exp(eval_results["eval_loss"])
108
+ print(f"Perplexity: {perplexity}")
109
+
110
+
111
+
112
+ out_model_path = "gpt2-small-gene-openweb"
113
+ trainer.save_model(out_model_path)
114
+ tokenizer.save_pretrained(out_model_path)
04-gene-sft/.ipynb_checkpoints/ds_zero2_no_offload-checkpoint.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 100,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1e-10
9
+ },
10
+
11
+ "zero_optimization": {
12
+ "stage": 2,
13
+ "allgather_partitions": true,
14
+ "allgather_bucket_size": 1e8,
15
+ "overlap_comm": true,
16
+ "reduce_scatter": true,
17
+ "reduce_bucket_size": 1e8,
18
+ "contiguous_gradients": true
19
+ },
20
+
21
+ "gradient_accumulation_steps": "auto",
22
+ "gradient_clipping": "auto",
23
+ "steps_per_print": 2000,
24
+ "train_batch_size": "auto",
25
+ "train_micro_batch_size_per_gpu": "auto",
26
+ "wall_clock_breakdown": false
27
+ }