Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See the raw diff for the full set of changes.
- .gitattributes +3 -0
- 02-gpt2_bert/.ipynb_checkpoints/2-dna-gpt-checkpoint.ipynb +507 -27
- 02-gpt2_bert/.ipynb_checkpoints/3-dna-bert-checkpoint.ipynb +533 -32
- 02-gpt2_bert/.ipynb_checkpoints/5-multi-seq-gpt-checkpoint.ipynb +0 -0
- 02-gpt2_bert/1-dna-bpe.ipynb +1 -1
- 02-gpt2_bert/2-dna-gpt.ipynb +507 -27
- 02-gpt2_bert/3-dna-bert.ipynb +0 -0
- 02-gpt2_bert/4-gene-feature.ipynb +125 -161
- 02-gpt2_bert/5-multi-seq-gpt.ipynb +0 -0
- 02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/special_tokens_map-checkpoint.json +7 -0
- 02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer-checkpoint.json +0 -0
- 02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer_config-checkpoint.json +53 -0
- 02-gpt2_bert/gene_en_bpe.py +19 -0
- 02-gpt2_bert/gene_eng_dict.json +0 -0
- 02-gpt2_bert/gene_eng_dict/merges.txt +3 -0
- 02-gpt2_bert/gene_eng_dict/special_tokens_map.json +5 -0
- 02-gpt2_bert/gene_eng_dict/tokenizer.json +0 -0
- 02-gpt2_bert/gene_eng_dict/tokenizer_config.json +20 -0
- 02-gpt2_bert/gene_eng_dict/vocab.json +0 -0
- 03-gene-task/.ipynb_checkpoints/1-category-task-checkpoint.ipynb +808 -2
- 03-gene-task/.ipynb_checkpoints/3-multi-seq-task-checkpoint.ipynb +0 -0
- 03-gene-task/.ipynb_checkpoints/5-regression-task-checkpoint.ipynb +555 -1
- 03-gene-task/1-category-task.ipynb +780 -1
- 03-gene-task/2-structure-predict.ipynb +954 -1
- 03-gene-task/3-multi-seq-task.ipynb +0 -0
- 03-gene-task/4-fun-predict.ipynb +755 -1
- 03-gene-task/5-regression-task.ipynb +555 -1
- 03-gene-task/data/.ipynb_checkpoints/protein_stab-checkpoint.csv +0 -0
- 03-gene-task/data/dna_protein_full.json +3 -0
- 03-gene-task/data/protein_stab.csv +0 -0
- 03-gene-task/img/.ipynb_checkpoints/dataset-checkpoint.png +0 -0
- 03-gene-task/img/2_structure.png +0 -0
- 03-gene-task/img/dataset.png +0 -0
- 03-gene-task/img/ds_structure.png +0 -0
- 03-gene-task/img/function.png +0 -0
- 03-gene-task/img/gpt2-ft.png +0 -0
- 03-gene-task/img/pdb1.png +0 -0
- 03-gene-task/img/protein-structure-1-2.png +3 -0
- 03-gene-task/img/protein-structure-1.png +0 -0
- 03-gene-task/img/protein-structure-2.png +0 -0
- 03-gene-task/img/sequence.png +0 -0
- 04-gene-sft/.ipynb_checkpoints/1-finetue-intro-checkpoint.ipynb +254 -0
- 04-gene-sft/.ipynb_checkpoints/2-gpt2-instruction-ft-checkpoint.ipynb +498 -0
- 04-gene-sft/.ipynb_checkpoints/3-llama-expand-dict-checkpoint.ipynb +272 -0
- 04-gene-sft/.ipynb_checkpoints/4-deepspeed-intro-checkpoint.ipynb +593 -0
- 04-gene-sft/.ipynb_checkpoints/5-llama-continue-train-checkpoint.ipynb +41 -0
- 04-gene-sft/.ipynb_checkpoints/6-llama-instruction-ft-checkpoint.ipynb +206 -0
- 04-gene-sft/.ipynb_checkpoints/build_gene_bpe_seg-checkpoint.py +14 -0
- 04-gene-sft/.ipynb_checkpoints/deepspeed_pretrain_gpt2-checkpoint.py +114 -0
- 04-gene-sft/.ipynb_checkpoints/ds_zero2_no_offload-checkpoint.json +27 -0
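The commit message above says the files were pushed with `huggingface_hub`'s folder-upload helper. A minimal sketch of that kind of call is shown below; the `repo_id` and token are placeholders for illustration, not values taken from this commit.

```python
# Sketch only: upload a local course folder to the Hub in a single commit.
# repo_id and token are assumed placeholders, not read from this diff.
from huggingface_hub import HfApi

api = HfApi(token="hf_***")  # token elided, as in the notebook below
api.upload_folder(
    folder_path=".",                # local folder containing 02-gpt2_bert/, 03-gene-task/, ...
    repo_id="dnagpt/example-repo",  # placeholder repository id
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```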
.gitattributes
CHANGED
@@ -35,4 +35,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.psd filter=lfs diff=lfs merge=lfs -text
 *.txt filter=lfs diff=lfs merge=lfs -text
+03-gene-task/data/dna_protein_full.json filter=lfs diff=lfs merge=lfs -text
+03-gene-task/img/protein-structure-1-2.png filter=lfs diff=lfs merge=lfs -text
+04-gene-sft/sft_data/train_data.json filter=lfs diff=lfs merge=lfs -text
 img/gpt2_bridge.png filter=lfs diff=lfs merge=lfs -text
02-gpt2_bert/.ipynb_checkpoints/2-dna-gpt-checkpoint.ipynb
CHANGED
@@ -49,9 +49,9 @@
 "\n",
 "### Historical background\n",
 "\n",
-"- **Release date**: GPT-
 " \n",
-"- **Development motivation**: GPT-2
 "\n",
 "- **Ethical considerations**: Because of GPT-2's powerful generation ability, OpenAI was initially cautious about releasing the model, worrying that it could be misused (for example, to generate fake news or malicious content). It therefore chose a staged release, accompanied by extensive ethical discussion and research.\n",
 "\n",
@@ -73,6 +73,24 @@
 {
 "cell_type": "code",
 "execution_count": 1,
 "id": "70581590-096f-45f8-b13b-b84e88615849",
 "metadata": {},
 "outputs": [],
@@ -96,7 +114,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
 "id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
 "metadata": {},
 "outputs": [],
@@ -117,7 +135,7 @@
 },
 {
 "cell_type": "code",
-"execution_count":
 "id": "87435829-f522-4820-a51d-11fa4afee6d7",
 "metadata": {},
 "outputs": [],
@@ -136,58 +154,53 @@
 ]
 },
 {
-"cell_type": "
-"
-"id": "0a0adfdd-4be9-4027-a12d-3bf848be3012",
 "metadata": {},
-"outputs": [],
 "source": [
-"
 ]
 },
 {
 "cell_type": "code",
-"execution_count":
 "id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"
 ]
 },
 "metadata": {},
 "output_type": "display_data"
 },
 {
-"
-
-
-
-
-},
-"text/plain": [
-"Map (num_proc=15):   0%|          | 0/971635 [00:00<?, ? examples/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
 },
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
-"Map (num_proc=
 ]
 },
 "metadata": {},
@@ -197,6 +210,7 @@
 "source": [
 "# 1. load dna dataset\n",
 "raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
 "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
 "\n",
 "# 2. tokenize\n",
@@ -212,6 +226,174 @@
 ")"
 ]
 },
 {
 "cell_type": "code",
 "execution_count": 5,
@@ -4656,10 +4838,308 @@
 },
 {
 "cell_type": "code",
-"execution_count":
 "id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
 "metadata": {},
 "outputs": [],
 "source": []
 }
 ],
"\n",
"### Historical background\n",
"\n",
"- **Release date**: The first version of GPT (Generative Pre-trained Transformer), GPT-1, was released by OpenAI in 2018. Specifically, the GPT-1 research paper, \"Improving Language Understanding by Generative Pre-Training\", was published in June 2018.\n",
" \n",
"- **Development motivation**: GPT-2, published in 2019, was a major improvement on GPT-1. It introduced more parameters and a larger training dataset, which significantly boosted the model's capability. GPT-2 also demonstrated strong text-generation ability, even producing convincing article-length passages, which sparked a broad discussion about AI ethics and social impact.\n",
"\n",
"- **Ethical considerations**: Because of GPT-2's powerful generation ability, OpenAI was initially cautious about releasing the model, worrying that it could be misused (for example, to generate fake news or malicious content). It therefore chose a staged release, accompanied by extensive ethical discussion and research.\n",
"\n",

{
"cell_type": "code",
"execution_count": 1,
"id": "83af3495-b1fd-4ea1-84d7-9224b7094c0f",
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"import os\n",
"# Set environment variables (AutoDL general region)\n",
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
"output = result.stdout\n",
"for line in output.splitlines():\n",
"    if '=' in line:\n",
"        var, value = line.split('=', 1)\n",
"        os.environ[var] = value"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "70581590-096f-45f8-b13b-b84e88615849",
"metadata": {},
"outputs": [],
},
{
"cell_type": "code",
"execution_count": 6,
"id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
"metadata": {},
"outputs": [],

},
{
"cell_type": "code",
"execution_count": 7,
"id": "87435829-f522-4820-a51d-11fa4afee6d7",
"metadata": {},
"outputs": [],

]
},
{
"cell_type": "markdown",
"id": "05875e2f-32e7-485d-9399-99dc1e4bf71f",
"metadata": {},
"source": [
"## Training data\n",
"\n",
"Next comes the training dataset; the most important part is constructing the model's inputs and outputs.\n",
"\n",
"Here we use DataCollatorForLanguageModeling, which is designed specifically for language modeling (as the name suggests). Besides stacking and padding batches, it also takes care of creating the language-model labels — in causal language modeling the inputs serve as the labels too (just shifted by one element), and this data collator creates them on the fly during training, so we do not need to duplicate input_ids."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3db6964a82794db7ac007c7aa513ad33",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=15):   0%|          | 0/90 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "ba2c0d0e766949c79e4db6e6bd881f06",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map (num_proc=10):   0%|          | 0/10 [00:00<?, ? examples/s]"
]
},
"metadata": {},

"source": [
"# 1. load dna dataset\n",
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
"#dataset = raw_dataset[\"train\"].select(range(100)).train_test_split(test_size=0.1, shuffle=True)\n",
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
"\n",
"# 2. tokenize\n",

")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "2eb1ff7a-f733-404b-a6ed-da82a677da3f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[{'input_ids': [20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978, 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412, 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65, 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84, 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137, 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419, 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468, 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65, 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138, 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003, 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772, 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079, 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269, 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614, 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}]\n"
]
}
],
"source": [
"samples = [tokenized_datasets[\"train\"][0]]\n",
"print(samples)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "260283a4-5ceb-4ef6-be1b-a4765fb74b20",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
|
259 |
+
" 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
|
260 |
+
" 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
|
261 |
+
" 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
|
262 |
+
" 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
|
263 |
+
" 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
|
264 |
+
" 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
|
265 |
+
" 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
|
266 |
+
" 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
|
267 |
+
" 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
|
268 |
+
" 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
|
269 |
+
" 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
|
270 |
+
" 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
|
271 |
+
" 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
|
272 |
+
" 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0,\n",
|
273 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
274 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
275 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
276 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
277 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
278 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
279 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
280 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
281 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
282 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
283 |
+
" 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
284 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
285 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
286 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
287 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
288 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
289 |
+
" 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
290 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
291 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
292 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
293 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
|
294 |
+
" 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
|
295 |
+
" 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
|
296 |
+
" 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
|
297 |
+
" 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
|
298 |
+
" 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
|
299 |
+
" 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
|
300 |
+
" 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
|
301 |
+
" 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
|
302 |
+
" 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
|
303 |
+
" 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
|
304 |
+
" 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
|
305 |
+
" 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
|
306 |
+
" 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
|
307 |
+
" 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, -100,\n",
|
308 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
309 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
310 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
311 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
312 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
313 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
314 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
315 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
316 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
317 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
318 |
+
" -100, -100, -100, -100, -100, -100]])}\n"
|
]
}
],
"source": [
"io_data = data_collator(samples)\n",
"print(io_data)"
]
},
{
"cell_type": "markdown",
"id": "80a84504-eaa3-43a9-ba13-3a2b73942c59",
"metadata": {},
"source": [
"This code shows how to load the DNA dataset, tokenize it, and prepare the data for language-model training. Let's go through it section by section, paying particular attention to the `DataCollatorForLanguageModeling` function.\n",
"\n",
"### 1. Load the DNA dataset\n",
"\n",
"```python\n",
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
"```\n",
"\n",
"- **`load_dataset`**: uses Hugging Face's `datasets` library to load a text file as a dataset; here it is the local DNA-sequence file `dna_1g.txt`.\n",
"- **`train_test_split`**: splits the raw dataset into a training set and a test set, with the test set taking 10% (`test_size=0.1`) and the data shuffled (`shuffle=True`).\n",
"\n",
"### 2. Define the tokenization function\n",
"\n",
"```python\n",
"def tokenize_function(examples):\n",
"    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
"```\n",
"\n",
"- **`tokenize_function`**: a custom tokenization function applied to every record in the dataset.\n",
"- **Parameters**:\n",
"  - `examples['text']`: the text content of each record.\n",
"  - `truncation=True`: truncates every input sequence to the maximum length given by `max_length`.\n",
"  - `padding='max_length'`: pads every input sequence to `max_length`, so that all sequences in a batch have the same length.\n",
"  - `max_length`: the maximum sequence length, to be set according to the task and the model's requirements.\n",
"\n",
"### 3. Apply the tokenization function to the dataset\n",
"\n",
"```python\n",
"tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'], num_proc=15)\n",
"```\n",
"\n",
"- **`map`**: applies `tokenize_function` to the whole dataset; `batched=True` processes the records in batches, which speeds things up considerably.\n",
"- **`remove_columns=['text']`**: the original text column is no longer needed after tokenization, so it is removed.\n",
"- **`num_proc=15`**: the number of CPU processes to use, which can be adjusted to your hardware to speed up tokenization.\n",
"\n",
"### 4. Create the data collator\n",
"\n",
"```python\n",
"data_collator = DataCollatorForLanguageModeling(\n",
"    tokenizer=tokenizer, mlm=False\n",
")\n",
"```\n",
"\n",
"#### `DataCollatorForLanguageModeling` in detail\n",
"\n",
"`DataCollatorForLanguageModeling` is a Hugging Face utility that dynamically prepares batches while training a language model. It serves two kinds of task:\n",
"\n",
"- **Masked Language Modeling (MLM)**: mask some tokens and predict them, as commonly used to pretrain models such as BERT.\n",
"- **Causal Language Modeling (CLM)**: predict the next token from the preceding context, as used by generative models such as the GPT family.\n",
"\n",
"In this example `mlm=False` indicates causal language modeling (CLM): each token can only be predicted from the tokens before it, which is exactly what a generative model like GPT needs.\n",
"\n",
"- **`tokenizer=tokenizer`**: the tokenizer object used for encoding and decoding.\n",
"- **`mlm=False`**: switches MLM off, since we do not need to mask any tokens; for causal language modeling the model predicts the next token from the preceding context."
]
},
{
"cell_type": "markdown",
"id": "3fbe9480-c394-4bab-bdee-e80f21e0259a",
"metadata": {},
"source": [
"### Start training"
]
},
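To make the collator's behaviour concrete, here is a small self-contained sketch (an added illustration, not part of the committed notebook). It uses the stock `gpt2` tokenizer as a stand-in for the DNA BPE tokenizer and shows padded positions becoming `-100` in `labels`, matching the `io_data` printout earlier in this notebook.

```python
# Illustration only (not from the committed notebook): with mlm=False,
# DataCollatorForLanguageModeling copies input_ids into labels and sets
# padded positions to -100 so the loss ignores them.
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tok = AutoTokenizer.from_pretrained("gpt2")   # stand-in tokenizer, not the DNA BPE tokenizer
tok.pad_token = tok.eos_token

collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)
enc = tok(["ACGTACGTACGT", "ACG"], truncation=True, padding="max_length", max_length=8)
samples = [{k: v[i] for k, v in enc.items()} for i in range(2)]

batch = collator(samples)
print(batch["input_ids"].shape)   # torch.Size([2, 8])
print(batch["labels"][1])         # trailing pad positions show up as -100
```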
{
"cell_type": "code",
"execution_count": 5,
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = GPT2Tokenizer.from_pretrained(\"dna_bpe_dict\")\n",
"tokenizer.pad_token = tokenizer.eos_token"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "76f7c636-20c0-47a1-83c1-72e5ee101c0f",
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer, AutoModel\n",
"model = AutoModel.from_pretrained('dna_gpt2_v0')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c041ad1b-7fe4-4d00-a77e-8ab17f020600",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-12-30 20:29:16,315] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
|
4878 |
+
"/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
|
4879 |
+
"collect2: error: ld returned 1 exit status\n",
|
4880 |
+
"/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
4881 |
+
"/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
4882 |
+
"/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
4883 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
|
4884 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
|
4885 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
|
4886 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
|
4887 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
|
4888 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
|
4889 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
|
4890 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4891 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
|
4892 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
|
4893 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
4894 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
|
4895 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4896 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
|
4897 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
|
4898 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
4899 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4900 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
|
4901 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
|
4902 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4903 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
4904 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
4905 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
|
4906 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
|
4907 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
|
4908 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
|
4909 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
|
4910 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
|
4911 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
|
4912 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
4913 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
4914 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
|
4915 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
|
4916 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
|
4917 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
|
4918 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
|
4919 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
|
4920 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
|
4921 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
|
4922 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4923 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
|
4924 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
|
4925 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4926 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
|
4927 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
|
4928 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4929 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
|
4930 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
|
4931 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
|
4932 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
|
4933 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
|
4934 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
|
4935 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
4936 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
|
4937 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
|
4938 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
|
4939 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
|
4940 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
|
4941 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
|
4942 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4943 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
|
4944 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4945 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4946 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
|
4947 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4948 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
4949 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
4950 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
4951 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
|
4952 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
|
4953 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
|
4954 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
4955 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4956 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
4957 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
|
4958 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
|
4959 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4960 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
|
4961 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
|
4962 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
|
4963 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
|
4964 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
|
4965 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
|
4966 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
|
4967 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
|
4968 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
4969 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
|
4970 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
|
4971 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
|
4972 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
|
4973 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4974 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
|
4975 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
|
4976 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
|
4977 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4978 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
4979 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
|
4980 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
|
4981 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
|
4982 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
|
4983 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
|
4984 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
4985 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
|
4986 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
4987 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
|
4988 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4989 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
4990 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
4991 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
|
4992 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4993 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
|
4994 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
4995 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
4996 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
|
4997 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
4998 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
|
4999 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
|
5000 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
|
5001 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
|
5002 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
5003 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
5004 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
5005 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
|
5006 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
|
5007 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
5008 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
|
5009 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
|
5010 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
5011 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
5012 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
5013 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
|
5014 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
5015 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
5016 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
|
5017 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
5018 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
|
5019 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
|
5020 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
|
5021 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
|
5022 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
|
5023 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
|
5024 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
|
5025 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
|
5026 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
|
5027 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
|
5028 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
|
5029 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
|
5030 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
|
5031 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
|
5032 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
5033 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
|
5034 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
|
5035 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
5036 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
|
5037 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
|
5038 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
|
5039 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
|
5040 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
|
5041 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
5042 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
|
5043 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
5044 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
5045 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
|
5046 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
|
5047 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
5048 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
|
5049 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
5050 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
5051 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
5052 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
|
5053 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
|
5054 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
|
5055 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
|
5056 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
|
5057 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
|
5058 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
5059 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
|
5060 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
5061 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
|
5062 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
5063 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
5064 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
|
5065 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
|
5066 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
|
5067 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
5068 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
|
5069 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
|
5070 |
+
"collect2: error: ld returned 1 exit status\n"
|
5071 |
+
]
|
5072 |
+
},
|
5073 |
+
{
|
5074 |
+
"data": {
|
5075 |
+
"application/vnd.jupyter.widget-view+json": {
|
5076 |
+
"model_id": "857d0b6286fb4eaaafcb8911cef664dc",
|
5077 |
+
"version_major": 2,
|
5078 |
+
"version_minor": 0
|
5079 |
+
},
|
5080 |
+
"text/plain": [
|
5081 |
+
"model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
|
5082 |
+
]
|
5083 |
+
},
|
5084 |
+
"metadata": {},
|
5085 |
+
"output_type": "display_data"
|
5086 |
+
},
|
5087 |
+
{
|
5088 |
+
"data": {
|
5089 |
+
"text/plain": [
|
5090 |
+
"CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', commit_message='Upload model', commit_description='', oid='e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
|
5091 |
+
]
|
5092 |
+
},
|
5093 |
+
"execution_count": 6,
|
5094 |
+
"metadata": {},
|
5095 |
+
"output_type": "execute_result"
|
5096 |
+
}
|
5097 |
+
],
|
5098 |
+
"source": [
|
5099 |
+
"model.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_***\")"
|
5100 |
+
]
|
5101 |
+
},
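Rather than pasting an access token into every push call, the huggingface_hub client can be authenticated once per session; a minimal sketch (the token value below is a placeholder, not the redacted "hf_***" above):

    from huggingface_hub import login

    # Authenticate once; push_to_hub then picks up the stored credential.
    login(token="hf_your_token_here")  # placeholder token

    model.push_to_hub("dnagpt/dna_gpt2_v0")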
|
5102 |
+
{
|
5103 |
+
"cell_type": "code",
|
5104 |
+
"execution_count": 7,
|
5105 |
+
"id": "8a28a45b-56ba-4328-8edf-4cd7ee9289c5",
|
5106 |
+
"metadata": {},
|
5107 |
+
"outputs": [
|
5108 |
+
{
|
5109 |
+
"data": {
|
5110 |
+
"application/vnd.jupyter.widget-view+json": {
|
5111 |
+
"model_id": "42c48d91578f41439d7b3ec26a6d566c",
|
5112 |
+
"version_major": 2,
|
5113 |
+
"version_minor": 0
|
5114 |
+
},
|
5115 |
+
"text/plain": [
|
5116 |
+
"README.md: 0%| | 0.00/5.17k [00:00<?, ?B/s]"
|
5117 |
+
]
|
5118 |
+
},
|
5119 |
+
"metadata": {},
|
5120 |
+
"output_type": "display_data"
|
5121 |
+
},
|
5122 |
+
{
|
5123 |
+
"data": {
|
5124 |
+
"text/plain": [
|
5125 |
+
"CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/16138639cb17307b84421e443a1c67f4fe188121', commit_message='Upload tokenizer', commit_description='', oid='16138639cb17307b84421e443a1c67f4fe188121', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
|
5126 |
+
]
|
5127 |
+
},
|
5128 |
+
"execution_count": 7,
|
5129 |
+
"metadata": {},
|
5130 |
+
"output_type": "execute_result"
|
5131 |
+
}
|
5132 |
+
],
|
5133 |
+
"source": [
|
5134 |
+
"tokenizer.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_**\")"
|
5135 |
+
]
|
5136 |
+
},
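A quick way to verify the upload is to pull the model and tokenizer back from the Hub and generate a short continuation; a sketch assuming the repo above holds a standard GPT-2 causal-LM checkpoint (the input sequence is an arbitrary example):

    from transformers import AutoTokenizer, AutoModelForCausalLM

    tok = AutoTokenizer.from_pretrained("dnagpt/dna_gpt2_v0")
    mdl = AutoModelForCausalLM.from_pretrained("dnagpt/dna_gpt2_v0")

    inputs = tok("TTAGCCATGACCTT", return_tensors="pt")          # arbitrary DNA fragment
    out = mdl.generate(**inputs, max_new_tokens=20, do_sample=True)
    print(tok.decode(out[0]))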
|
5137 |
+
{
|
5138 |
+
"cell_type": "code",
|
5139 |
+
"execution_count": null,
|
5140 |
+
"id": "ec5364cc-4386-4db8-a400-cd788657de84",
|
5141 |
+
"metadata": {},
|
5142 |
+
"outputs": [],
|
5143 |
"source": []
|
5144 |
}
|
5145 |
],
|
02-gpt2_bert/.ipynb_checkpoints/3-dna-bert-checkpoint.ipynb
CHANGED
@@ -1,8 +1,102 @@
|
|
1 |
{
|
2 |
"cells": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
"id": "a3ec4b86-2029-4d50-9bbf-64b208249165",
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
@@ -10,12 +104,13 @@
|
|
10 |
"from tokenizers import Tokenizer\n",
|
11 |
"from tokenizers.models import WordPiece\n",
|
12 |
"from tokenizers.trainers import WordPieceTrainer\n",
|
13 |
-
"from tokenizers.pre_tokenizers import Whitespace"
|
|
|
14 |
]
|
15 |
},
|
16 |
{
|
17 |
"cell_type": "code",
|
18 |
-
"execution_count":
|
19 |
"id": "47b3fc92-df22-4e4b-bdf9-671bda924c49",
|
20 |
"metadata": {},
|
21 |
"outputs": [],
|
@@ -29,11 +124,20 @@
|
|
29 |
"execution_count": null,
|
30 |
"id": "73f59aa6-8cce-4124-a3ee-7a5617b91ea7",
|
31 |
"metadata": {},
|
32 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
"source": [
|
34 |
"# 设置训练参数\n",
|
35 |
"trainer = WordPieceTrainer(\n",
|
36 |
-
" vocab_size=
|
37 |
" min_frequency=2, # 最小词频\n",
|
38 |
" special_tokens=[\n",
|
39 |
" \"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"\n",
|
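This hunk only shows fragments of the tokenizer-training cell; a minimal sketch of how the imports and trainer settings above typically fit together (the vocab_size value is an assumption, since it is elided here, and the training file is the dna_1g.txt corpus used later in this notebook):

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordPieceTrainer

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=30_000,  # assumed value; the real one is not shown in this diff
        min_frequency=2,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )
    tokenizer.train(files=["../01-data_env/data/dna_1g.txt"], trainer=trainer)
    tokenizer.save("dna_wordpiece_dict.json")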
@@ -45,7 +149,7 @@
|
|
45 |
},
|
46 |
{
|
47 |
"cell_type": "code",
|
48 |
-
"execution_count":
|
49 |
"id": "7a0ccd64-5172-4f40-9868-cdf02687ae10",
|
50 |
"metadata": {},
|
51 |
"outputs": [],
|
@@ -75,10 +179,23 @@
|
|
75 |
},
|
76 |
{
|
77 |
"cell_type": "code",
|
78 |
-
"execution_count":
|
79 |
"id": "48e1f20b-cd1a-49fa-be2b-aba30a24e706",
|
80 |
"metadata": {},
|
81 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
"source": [
|
83 |
"new_tokenizer = Tokenizer.from_file(\"dna_wordpiece_dict.json\")\n",
|
84 |
"\n",
|
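Once dna_wordpiece_dict.json exists, the raw tokenizers object is usually wrapped so that it can be passed to transformers models; a sketch under that assumption (the save directory mirrors the dna_wordpiece_dict folder shipped with this repo):

    from tokenizers import Tokenizer
    from transformers import PreTrainedTokenizerFast

    new_tokenizer = Tokenizer.from_file("dna_wordpiece_dict.json")
    tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=new_tokenizer,
        unk_token="[UNK]", pad_token="[PAD]", cls_token="[CLS]",
        sep_token="[SEP]", mask_token="[MASK]",
    )
    tokenizer.save_pretrained("dna_wordpiece_dict")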
@@ -95,7 +212,7 @@
|
|
95 |
},
|
96 |
{
|
97 |
"cell_type": "code",
|
98 |
-
"execution_count":
|
99 |
"id": "c94dc601-86ec-421c-8638-c8d8b5078682",
|
100 |
"metadata": {},
|
101 |
"outputs": [],
|
@@ -112,7 +229,7 @@
|
|
112 |
},
|
113 |
{
|
114 |
"cell_type": "code",
|
115 |
-
"execution_count":
|
116 |
"id": "b2658cd2-0ac5-483e-b04d-2716993770e3",
|
117 |
"metadata": {},
|
118 |
"outputs": [],
|
@@ -123,49 +240,103 @@
|
|
123 |
},
|
124 |
{
|
125 |
"cell_type": "code",
|
126 |
-
"execution_count":
|
127 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
"metadata": {},
|
129 |
"outputs": [],
|
130 |
"source": [
|
131 |
-
"
|
132 |
-
"\n",
|
133 |
-
"# Building the config\n",
|
134 |
-
"#config = BertConfig()\n",
|
135 |
"\n",
|
|
|
|
|
136 |
"\n",
|
137 |
"# 构建配置\n",
|
138 |
-
"config =
|
139 |
-
"
|
140 |
-
"
|
141 |
-
"
|
142 |
-
"
|
143 |
-
"
|
144 |
-
" eos_token_id=tokenizer.sep_token_id # BERT 使用 [SEP] 作为句子结束标记\n",
|
145 |
")\n",
|
146 |
"\n",
|
147 |
-
"\n",
|
148 |
"# Building the model from the config\n",
|
149 |
-
"model =
|
150 |
]
|
151 |
},
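The concrete BertConfig arguments are elided in this diff apart from eos_token_id, and the model class is not shown either; a minimal sketch of a configuration built from the wrapped tokenizer, with a masked-LM head assumed because the collator below uses mlm=True (every value marked "assumed" is not taken from the notebook):

    from transformers import BertConfig, BertForMaskedLM

    config = BertConfig(
        vocab_size=tokenizer.vocab_size,      # assumed: tie the embedding table to the DNA vocab
        pad_token_id=tokenizer.pad_token_id,  # assumed
        bos_token_id=tokenizer.cls_token_id,  # assumed: [CLS] opens a sequence
        eos_token_id=tokenizer.sep_token_id,  # BERT uses [SEP] as the end-of-sentence token
    )

    # Building the model from the config
    model = BertForMaskedLM(config)
    print(f"parameters: {model.num_parameters():,}")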
|
152 |
{
|
153 |
"cell_type": "code",
|
154 |
-
"execution_count":
|
155 |
"id": "afc2cdd1-228e-4ee7-95f5-07718f00723d",
|
156 |
"metadata": {},
|
157 |
"outputs": [],
|
158 |
"source": [
|
159 |
"# 1. load dna dataset\n",
|
160 |
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
|
|
|
161 |
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
|
162 |
"\n",
|
163 |
"# 2. tokenize\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
"def tokenize_function(examples):\n",
|
165 |
" return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
|
166 |
"\n",
|
|
|
167 |
"# 3. 对数据集应用分词函数\n",
|
168 |
-
"tokenized_datasets = dataset.map(tokenize_function, batched=
|
169 |
"\n",
|
170 |
"# 4. 创建一个数据收集器,用于动态填充和遮蔽,注意mlm=true\n",
|
171 |
"data_collator = DataCollatorForLanguageModeling(\n",
|
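The batched map call and the collator arguments are only partially visible in this hunk; a sketch of the usual completion (max_length and the masking probability are assumptions, not values read from the notebook):

    from transformers import DataCollatorForLanguageModeling

    max_length = 256  # assumed; the notebook sets this earlier

    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],  # keep only input_ids / attention_mask
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,              # masked-language-modelling objective, as the comment above notes
        mlm_probability=0.15,  # assumed masking rate
    )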
@@ -175,13 +346,71 @@
|
|
175 |
},
|
176 |
{
|
177 |
"cell_type": "code",
|
178 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
"id": "604491f9-2ee7-4722-aad6-02e98457b5ee",
|
180 |
"metadata": {},
|
181 |
"outputs": [],
|
182 |
"source": [
|
183 |
"run_path = \"bert_run\"\n",
|
184 |
-
"train_epoches =
|
185 |
"batch_size = 10\n",
|
186 |
"\n",
|
187 |
"\n",
|
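run_path, train_epoches and batch_size feed a standard Trainer setup; a sketch of how they are typically wired together (the remaining arguments are assumptions, and train_epoches keeps whatever value the cell above assigns, since that value is elided in this diff):

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir=run_path,
        overwrite_output_dir=True,
        num_train_epochs=train_epoches,
        per_device_train_batch_size=batch_size,
        save_total_limit=2,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
    )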
@@ -208,10 +437,182 @@
|
|
208 |
},
|
209 |
{
|
210 |
"cell_type": "code",
|
211 |
-
"execution_count":
|
212 |
"id": "d91a8bfb-f3ff-4031-a0d7-ebedc200d65a",
|
213 |
"metadata": {},
|
214 |
-
"outputs": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
"source": [
|
216 |
"trainer.train()\n",
|
217 |
"trainer.save_model(\"dna_bert_v0\")"
|
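After saving to dna_bert_v0, a quick sanity check is to run the fill-mask pipeline against the new checkpoint; a sketch assuming the tokenizer was saved to the same directory (the input sequence is an arbitrary example):

    from transformers import pipeline

    fill = pipeline("fill-mask", model="dna_bert_v0", tokenizer="dna_bert_v0")
    for pred in fill("TTAGCCATG [MASK] GACCTTACG"):  # arbitrary DNA fragment with one masked token
        print(pred["token_str"], round(pred["score"], 4))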
@@ -219,7 +620,27 @@
|
|
219 |
},
|
220 |
{
|
221 |
"cell_type": "code",
|
222 |
-
"execution_count":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
"id": "fc4ad6ad-6433-471f-8510-1ae46558d4ce",
|
224 |
"metadata": {},
|
225 |
"outputs": [],
|
@@ -227,6 +648,86 @@
|
|
227 |
"#upload model\n",
|
228 |
"#model.push_to_hub(\"dna_bert_v0\", organization=\"dnagpt\", use_auth_token=\"hf_*******\")"
|
229 |
]
|
|
|
|
230 |
}
|
231 |
],
|
232 |
"metadata": {
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6c42f2f6-2332-40c7-9b69-50a0f0c12901",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 2.3 从头训练dna bert大模型"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"attachments": {
|
13 |
+
"6a042b8f-c47d-4f6d-b601-b80124836ec4.jpg": {
|
14 |
+
"image/jpeg": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAYEBQYFBAYGBQYHBwYIChAKCgkJChQODwwQFxQYGBcU\nFhYaHSUfGhsjHBYWICwgIyYnKSopGR8tMC0oMCUoKSj/2wBDAQcHBwoIChMKChMoGhYaKCgoKCgo\nKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCgoKCj/wAARCAGaBDgDASIA\nAhEBAxEB/8QAHAABAAIDAQEBAAAAAAAAAAAAAAUGAwQHAgEI/8QAWhAAAQMCAQQLCgkLAQQKAwEA\nAAECAwQFEQYSIVUTFRYxNEFRkZPR0gcUU1RhcXJ0krIXIjI1UoGUorEIMzZCVnN1oaOkwSMkYmTC\nJTdDY4KDlbPh8ERF0/H/xAAbAQEBAAMBAQEAAAAAAAAAAAAAAwECBAUGB//EAD8RAQABAgEGCwYF\nBAIDAQEAAAABAgMRBBITITFRFBUyM0FSU4GRobEFNGFxctEGkqLB8CI1QuEjQ4LC8SRF/9oADAMB\nAAIRAxEAPwD9UgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAujfI+a9WuGT\nY5bjSMfvZrpmov4gSAMcE8NQzPgljlbyscjk/kZAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAGrUzPdJsEC4Owxe76KdZiqYpjGRllqIolwe9EXk4zyysgc7BHoi+XQY4omRp8VNPGq76\nnpzWvTByIqcinNOU69jbNbINBjnUj0xXGnXRp/UXqN86KK4rjGGoADYAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAACCuGU9BSzrTw7JWVW9sVO3PVPOu8RvX7diM65VhClqzXdnC3GKdBWUu9\n+m009kSNnFs0yIvMFuuUEWmWyxvam/sUyKvMc3GVrdVh9NX2dHArm+n80fdZgV2nyso9lSG4w1Fv\nmXinZg3nLBG9krEfG5r2LpRzVxRTos5Tav8AN1Y4ePgjdsXLPLjB6ABdEAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAqoiKqrgiGpdbhT2u3zVlW/Mhi\nbiq8a+RDk17ymyiyht9fV26N1JZoGrnuRcFcnJjxr5EA37ncbplzfZrXZ53U9qhXCSVqqmenKv8A\nhCZp+5hYoqfCd1TI/D40mejf5YH3uURxUmRj6pGYvc973qm+uG8hCuqcpsu3q2kTa2zq5Wq/H5Sf\ni78AK7c5GZHZQNTJi5vqE3pIt9PRXDQpa4u6Hd2RJJU5PTbFxvbnJ/gmbdYMnsjKPvqskjWVN+ef\nS5V5Gp1EnYcrbNfZFgo6hEm4opW5jnJ5EXfAxZLZY23KFVigV0NUiYrDJoX6uUspzXukZOtoEblD\nZ02CpgejpEZoRUx38C85O3FLtZaStbo2aNHKnIvGBIgByo1qqq4ImlQNe4V1NbqV1RWzMhhbvucu\nBBNyhr61M60WSpmi4pZ3JC13mRdP8jBZKfdHW7d16Z9G1ypb4HfJRqLhsqpxq7i5EwLZvAVtbxe6\nZM+tsD3R8a007ZFT6tCknZ7zRXeN60ki7JGuEkT0zXsXkVq6UJErmVlskaxLzamo260aK9Ebo74Y\nml0TuXFN7kXACwTytggklfjmsarlw5ETErNLlZPVU8c9NYLnJDImcx6Nbg5OXfJd9ZFcMnX1lOuM\nM9KsrF8itxQ18iv0TtXq7fwA1N0lZ+zt05m9Z6TKuGHBblb7hQMX/tJocWJ51THAseB8c1r2q1zU\nVqpgqKm+BF3K7upoIJqKiqLhHNpR1NgqInLvkfukrP2dunM3rPOTzEteUVztEOPeisbVwM4o85VR\nzU8mKY/WWfACpx5XzSVc1MywXRZ4ka57c1uhFxw4/Ipn3SVn7O3Tmb1n21fpte/3FP8A8xZMANW2\nVb62kbNLSzUrlVU2KXDOTmNoADzLIyKNz5XtYxqYq5y4IhAy5Y2ONyolc16JvujY5yJ9aIatXAmU\neUU9HUKq2u3o3ZIscEmlcmKI7lREw0eUssNNDDGkcMMbGImCNa1EQDUtd5t11RVoKuKZU32td8ZP\nOm+SBX8o8noayFaqga2lusKZ8FRGmC4ppzV5UXeVCQyeuG2llo6xW5rpY0c5vI7jTnAkAAAAAAAA\nAAAAAHieRIoXvX9VMSPp6iBiK188ezKuL/jJjibNcucsMXE52K+ZNJCVOTNHNK+RHysc5cVwXE5M\npq1xS2hNtkY75L2r5lPquRqYuVETylMS00y3B1HT3GVs7eJW6PNiZK6ypRw7LX3N6R45qYNVVVec\n5mVomqqZrVbLNEiKmCorkMttlR8KszkdmLgjsd9vEpXaXJmhliZKlRLKxyYouOGJN0cDKOeCKFFS\nNY1jRPNpT/JaxVhVhvYlJAA7moAAK1lNlrZcm62Gkuk8jJ5WZ7GsjV+KfURXwqZM+Hqvsz+ojMo2\ntf3bcnmvajm95TaFTE6P3rT+Ai9hAKY3up5LY/6lZNEnLJA9qfgWqz3i33mm74tdXDVRcbo3Y4ef\nkM8lDSSNVslNA5q8Sxopy/Lu0w5C3O35U5PolJEtQynrqWPRHKxy4Z2HKgHTbtcKe1W6orq16sp4\nGK97kTHBEKe3uq5Luajm1FS5q6UVKZ6ov8iR7prkf3PL45N5aRyoeu5zTwuyFsSuhjVVo48VVqfR\nAjfhUyZ8PVfZn9Q+FTJnw9V9mf1F171p/ARewg71p/ARewgES/Ki2R5MLf3yvbbUbnq9WKi4Y4b2\n/vlfTuq5MKiKlRUqi8aUz+oyd2ZrWdzG9tYiNRIm4IiYfrIT2S1NAuTdrVYYlVaaPTmJ9FAK98Ke\nTHh6r7M/qJG05f5M3WpSnpLrBs7tCMkxYq85Y+9afwEXsIV/KzIqzZR0EkNVRxRz5q7HPG1Gvjdx\nKioBJZQ32hsFAlZcXvbArkbixivXFfIhWfhUyZ8PVfZn9RrdyC4VdXabjZ7w9KmptFStNsj9Oc39\nVS/d60/gIvYQCkL3V8lkejFqqhHrvNWnfiv1YHr4VMmfD1X2Z/URWUcMSd2fJpiRsRq0kuKZqYKd\nK71p/ARewgEPkxlVa8pVnS1ySv2HDPz4nM3/ADk6eI4o48djYxmO/mpgewAAAAAAAAAAAAACkZWX\napqLgtuhiqmUbNE8kLFVz/InkM1rvdotcLYY6GppGcbnQrivlVS44JyHl7GORc9rVTyoeTPs+7pa\nr0XNc76ccI3Rrehwy1NuLU0ao3ThjO
/Y0aC50VwbjSVEcnkRdPMbhR8oNrqqsdDY6WWW5tX87Srm\ntYv+8u8WawJcm0LW3jYlnTecxcVVPL5TTJ8qqruTaqjHDpjk/L4T4mUZLTboi5TOGPRO3/54N2pp\n4aqJYqiJksa77XpihWqi3VeTz3VVmV8tFjjLRuXHBONWkhlRbKq4Uf8AsFVJDOzSjUdg1/kUr9jt\n9JcM6B9bcKS5R6JIXy6UXlTHfQ58rqmq9FFNGFX+NWOHdH2navklMRamua8aemnDHx+8bF0tlfBc\nqOOppnYsem9xovIptFaydsdXZK6VG1KVFHNpXO0Oa7l8pZT2MkuXblqJvU5tXS8/KaLdFyYtTjT0\nAAOlAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHNu63\nPJU1VntDHK1lTLnPw49KJ/ksGVlDDb+59cqSmY1kUVKqIjU5CB7rkElPJabtExXJSy4PVOLSip+B\naat0WVOSFQ2gmYqVlO5jXLvNcqcf1gQfc4XDueuVf++IXJDKC4vyfprVk/QOmq2ZyPqJdEUeKrzn\n2zXZ2TGT9TY77SzUc6tk2KZUzo5FXiRyFp7mKNTI2hVERFVHKvl0qBTYKeS0ZQrV5dU81ZnL/pVS\nrnws/wDDxFiveRVoyhhSvs0zKWpd8ZssHyHL5UTeXyoT+WrUdkrc0ciL/oOXShRbNZqy0ZJwX6y3\nB8MjadZp6eX40ciImK4JxKBoXm85RWO1VVov9P31BMxY46lV/wA8f16S89y9ix5F0KK5FxznaOLF\nymzaJ4cr8k4pa6mYjalrkczfRFRVTFOYq3cymmteUF2yenermQqskfkwXBfxQDpZA5dzyQZJ3HYV\nzZZWJTtXkWRyMRfvE8VzuhJhkpVS4KqQSQ1C4cjJWOX+SKBPUdPHSUkNPC1GxRMSNiJxIiYIZQio\nqIqLii6QACoipgu8QVZbr1LVSPpr5sELlxbH3qx2anJiu+YdqsoP2i/s4+oCSnpIKGxT01JGkUEc\nL0Yxu8iYKa2RX6J2r1dv4GjYK6qr8mLi6vmbPNDJUwbIjEbnIxVRFwTRxEbkpk/Uz5N26Vt+ucSP\nhaqMY5uDfImgC9YnmSRkUbpJHtYxqYq5y4IiFd3NVf7RXb2mdk9NyUppXNW51ldcWtXFGVEyqz62\npgigYsm37aXy43qNFSke1tLTOX/tGtVVc9PIqrgnmLOeY2NjY1kbUaxqYI1EwREPQFbtX6bXz9xT\n/wDMWQrdq/Ta+fuKf/mLFK/Y4nvzXOzUVc1qYqvmQD0Ct7rG6lv32BxPUc/fVLHMkcsWe3HMlbmu\nb5FTiUDWtdtZb5K57ZFe6qnWd2KYYYoiYfyN8EVfb7SWhsbJc+asmXNgpIUzpZl/3W8nKq6E41Ay\nX+5MtVrmqX6XombExN9710Nan1njJehfbbBRUsuGysjTPw+kulf5qaFstFXWXBl1v6sWoZwekYuM\ndP5cf1n+XmLGAAAAAAAAAAAAAAac+muj8jFUyGObRXs8sa/iZDgv8ttCIr+9qSr2eKFnfb00v5DB\ns8dwRKe4Ma+NVxRU0YKeLqqrXSY+Q1E30wPNqu1RW9O3Yom3GMbVqhiZDE2ONqNY1MEROIxz6JKd\n3JKn80VP8mWPFWNx38EMVR8unTllb/lf8HoWuVDzZbwAPRaAAA5llG5rO7bk857kaneU2lVwOj99\nU/h4vbQ5Tl7aKS+d12w0Nwa59O+jlVyNerV0Lo0oWH4K8l/Fqn7S/rAucldSRtV0lVA1qb6ukRDm\n2W9WmXdxoMnLEiz0UVQyor6xqf6bGt3mIvGqrychI1PckyWnjViw1bfKlS/rIito7v3MIY66hrJb\nnk0x6NqaWZE2SBqrhntcm+iAW7umtRvc8vjU3kpHIg7nNRA3IWxI6aNFSjjxRXJ9Exd0SpirO5pd\n6mnej4ZaJZGOTjRURUKxkT3OMnbjklaaypp6hZ5qZj3qlQ9EVVTToRQOn99U/h4vbQd9U/h4vbQp\nXwVZL+LVP2l/WfU7leTCLj3tU/aX9YGTu0ae5le8PBN95CxZK/o1avVY/dQrXdijbD3LrxGxPiMh\na1EXkRzSJsOR+UE1koJYss7jDG+Bjmxtjbg1M1NCAdPNK8XOktFvmrK+ZkMEbVVVcuGPkTlUpa5F\nZRqipu4ufRtK7d8kb5k+7birljysgpl2R0FXnNkjRN9WJirVXzoBZe5FQVDaC6XmridDJd6t1SyN\nyYOazebj9RfiMyavNJf7JS3K3qve8zMURUwVq8aL5iTA5plH/wBdWTPqkp0s5plH/wBdWTPqkp0s\nAAAAAAAAAAAAAAAAAR19oqi4UDqelqe9lfoc9ExXDkQkQqoiYquCGl23TcomirZLe3XNFUVU7YU+\nhs99s0CRW+aimiT9R0eaq+dTZjyjkpZGxXuiko3LoSVPjRr9ZtV+VFtpZVhje+qqN7Y6dueuP1aD\nSnut1ronMZk690Lkw/2iRG4/UeFVoLH9GTXZxjowmuPLGY8XqxFy9/VlFuNfTjFM+eqfBZIpGTRt\nkic17HJijkXFFI68WSluma+VHR1DPkTxrg9v1kHkxS3i3XF8ctFsVulXFGJKjkiXyceBbzps1Rld\nr/low3xMecY+Tku0zkt3/irx3TE/ZW2w5R0HxYZ6aviTe2VM1/Oetsco10JaadF5Vm0FiIfKK7d4\nRMgpW7LcKj4sMSaV9JfIaXbegomrS1REfGJ8MYmVLd2b1cU6OmZn4YekxCLtlyvdblD3nOtPHFAm\nfNsSZ3mbivGbOVNJWUlruFfTXetjfGx0jGIjFa3yfJ3iSyctW1dDmyOz6mVc+aT6TlMeWf6K3T9w\n47cgtXLdrG9MzVOvXOOHw/nSjllyiu5hbiIiNWrp+KKvMNdbMnqmuiu9bJKkbcEkRioiuc1MdDd/\nSSKWesWPFl9r0eqaFVsapzZpgyyTOyJq2oqtxjjTFN9PjNPT8npZadWperm3FPCov+DtcjPYrs+W\nyT1NydGjqV8jJZGpg1yM33InmNTJa418lVJDdnfHqokradubhmMVcFj8qt+L7RCPqZ67J+is1DBB\n31LUPilZirWLHG7F6rhiqZ2hPrU3b4+800lDc6qkoo4aGTGR0MznO2JyZrkwVqcWn6gMtNea2nyl\nuCVkiPteztp2/Fw2B6tRUVV5FVcPOS+UVZNS7Xd7vzdlrIo36McWqulDQssEFdWZRwzNbLBNO1FT\nicisQjK6omglt1pr3q+ppq6B0Urv+3hzsEd6SbzvLgvGBJ0qVF8r7hstwqKWKmmWFkFOqNcmH6zl\nVFXTzHpzrpbG3CGSd9TTNpnTQVL2pnscmjMdhoXiVFw5T3DSW2/umqthkp6uKR0L3MkzJEVq4aVT\n/JoXB9VbZqm2Pq5KymqKKaVqy4LJErcE0u42rncfGgG5arbWVVspaiS93BHyxNe5ESPDFUx+ieq
i\nWtlro7LQVb0dFEktVWPajnoiqua1EwwzlwXzIhhsWTtLJZaF7pq3F0DFXCpeifJTymW3Zluyvrqe\nVyolZTxPgc9fl7Gitc3HjVMUX61A9VVsuNvhdU2u41VTNGmctPVORzZcP1UVERWryKasldLe7ram\n0NfUUtJU0L6nCJG4qucxExxRd7FSyXCtgt9FNV1UiMghar3OXkT/AD5CiWe01Hfthppp6mjmS3Ty\nPSJ2a5qulY7NXzY4fUBNXdlwsdK2viuk9VHHIxskFQ1q57XORvxVREVF0nm03asXLG60lXJnUTpU\nipkwRNjc2Jj1bjx45yr9SmG+2yS1sprjJXVNbFBURZ8FS7Obg56Nzk/3kzkX6jG+mfPJlLLTp/tN\nNcGTxL/vNgiXD60xQCx5Q1r6K2PdT4d8yuSGFF+m5cE5t/6iq2mtuVzpcmYpblPG+ppZZJ5YmtRZ\nHNwRF0ouG+S9JVMvt7ppotNJRwpL5NlemhPOjcecrdjo466HJCKV0rW95zrjG9WLvt40AuMFoqIp\no3uvFfIjXIqsdmYO8i4NIrJ+mrrrblrJrzWxvfPO3MjRiNajZXtRExbyIhL0tipqaoZNHNVucxcU\nR87nJ9aKpX8krPJU2XZW3OvhR1TU/Ejkwa3/AF372gCXtFRWU18qLXV1PfjEhbPHM5qNeiKqorXY\naOLfNnKCWso2RV1IqyRQLjPAiY57ONU8qb5H5Pwut2UFfQSyOqXuiZO2ok0yORVVM1y8eCp/Mlr7\ncm22hWTM2WeRUjhhTfkeu81P/u8BHXG7OrnUtDZJkWepakr50TFIYuN3nXeROo2cnqueqmu7Z356\nQVroY9G81GMXDnVSBsNHJkpcEbWq10F0eiumRMEinX/s/I1eLy+cl8leE3/+JP8A/bjAnwAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEJlurEyUuayMa9qQroVMTlWSdPlLaLRHeLLhUUcirslOi4\n7y4Lin+UOyXiibcbXVUb/kzRqznOe9zC8ttk9Tk5c3bDPHK7Yc/Qjl42+fjQCYs2VtlyngWgukTI\nZ3aHU9QmhV8i/wD1SEvEj8irhEzJ2u75jmfptb1V6pjxtw3i2ZRZGWm+O2WWJYKnw0PxXfXymBlJ\nk7kVSLU1D2tmVPzsq58si+TjAr2UVLlHe7JWVt0lS20McSvbSRri5+H0lJFz1j7kL3J4jhz6P8kR\nW3y+ZbukorDSrTWt3xZKiVN9PKv+ELBlXSss/c0qqN8qO2OnbEjt7OXFE0AZu5i9rMhqJz1RrW7I\nqqvF8dSv5Cu217oV7usOPezWrG13LiqYe6V/JxmUt7sEFntsPe9uRXZ9S7FqORVxVMePzIdUyUsM\nGT1pZSQLnv8AlSSYYK93KBMmCvpYq6hqKWdM6GeN0b05UVMF/EzgCvZHVsjqN1rrnf8ASNvwhkx3\n5Gp8mRPIqYfXiWEh73Y2XCWOqppn0dxh0R1MaacPouT9ZvkU02V+UdHhHV2iGvw3pqOdrMfOx+GH\n1KoFkI3KG5stNskqHJnyr8SGJPlSyLoa1PKqkc+732b4tHk7JE5d59XUxtannRiuU9WuxVDq9tzv\ntS2rr2IqQsY3Nhp8d/Mby/7y6QIjISGWmyJuMNQ/ZJo56psj+V2c7H+ZPZFfonavV2/gY7Naaijt\nFxppVjWSonqJGZqqqYPcqpjo8pu5OUUtusVDR1CtWWGJrHZq4pinIBIgAAAAK3av02vn7in/AOYs\nhEUNumgyjuVc9WbDURxMYiLpxbnY485LgAABAXy71KVjbVZmNluL25z3v+RTs+k7y8icZnsVgp7U\n6Soc99VcZk/1qubS9/kT6LeRqaCFt1NlHbaq4yR263VDqmodLsz6xzXK3eaipmLhghv9+ZUaotn2\n93/8wLGCud+ZUaotn293/wDMl7XJXS02dcqeGnnxX4kMqyNw4tKon4AbgAAAAAAAAAAAADUrEzZo\nH8WKtX6z2equNZadzW/K32+dDUjWWViPZI1EXiVm95N848op1xLaGtdaJ0ypLEmLkTBU5TUobfI+\nZrpmq1jVxXHjJbMn8Kz2P/kZk/hWex/8nFNmmas5005RXTTmwzmJfj1sDfoo56/gn4qecyfwrPY/\n+T3QIr3STu053xWr5E4/rXE67FONWLmluAA7moAAOZ5Qf9d+TvqUx0w5rlza8omZfWq+2G0suMdN\nTPic11QyLS5fKuJsbocvv2Lg/wDUY+sDoRU+6tVQUnc/vTqlUzX07o2ov6znaERPrIhb73QpPixZ\nI0MKr+tLXtcifUinikyNvV/udPX5d18M8NO/ZIbbSoqQtdxK5V+UB5uFNNSdwqSCpRUlZa/jIu+m\nKYlk7m/6B2H1OP3TNl1b6i55HXWgoI0kqZ6d0cbMUbivJiuhClZPV+XtmsdDbm5HwStpYWxI9bhG\nmdgmGOGIHUwc93Q5ffsXB/6jH1jdDl9+xcH/AKjH1gbndo/6s75+6b7yFiyV/Rq1eqx+6hWsrqO+\nZS9zKvpJbY2nvFQzNSlSdrkRUcn6+OG8harBTy0ljoKeobmzRQMY9uOOCo1EUDfPMzUdE9q6UVqo\np6CpiioBzbuGPVthu1Lj/p01ymYxPopnbx0ko3czsdysFTlBBcKbMp5611RTyo9qo9rvIi4p9ZeQ\nOaZR/wDXVkz6pKdLKPerFcanun2O7w06Ot9NTyRyy57UzXLvJhjipeAAAAAAAAAAAAAAAAANS6XC\nntlI6oq3o1ibycbl5EK6ymuOUa7LXvkoravyadi4PkTlcpgvarTZTxVN8Yr7amine1MWRu5XJylu\nieyWNr4nNcxyYo5q4oqHjVVzll2q3XOFNOrN6Z+M/DdHT07npYcFt010RjVV09EfCPjva9vt9Jb4\nkjo4GRN/3U0r51No1bnRNr6R8LnvjVdLXsXBWrylFkdeLRVrDc7pVQU6r/p1KM2SNfPxoYyjKYyP\nCnR/074wiI+ezBrYyfheM5/9W6ccZ+W3F0Q8TSxwsV8r2sanG5cCr0tFXV7EdFlNsjF8C1uP4m5D\nknSOej6+oqa13/ev0cyFKb1+7GNu33zMYeWLWqxatzhcueETj54MdXlEtTKtLYoVq6hdCyYf6bPK\nqm5YbJ3lI+srZO+blL8uV36qcjeRCWpaaCliSOmiZExOJqYGUvZyOc+Lt+c6qNm6PlG/4y0uZTEU\nzbsxhE7d8/P4fAMNbSw1tLLTVLM+GVua9uOGKGYHe5GvWUVPWUTqSoZnwOREVuKpvKip+BsIiImC\nbwAEfRWahoqySqpoMyeTHF2cq764rgi72k3KiGOogkhmaj43tVrmrxopkAGnbrbS25r0pI8xH4Z2\nLlXHBME3/Ih8uFro7hPSTVcDZJaSTZYX4qisd9X4G6AIqusNHVVTqlFnp6l2h8lPK6NX8mdhoX6z\n1R2KhpWVCIySWSobmSyzSK9728mcunDyISYAx08MdN
TxwwtzY42o1qY7yJvGC522lucCRVkSPai5\nzVRVRzHcStVNKL5UNsAQ0OTlEyeOWd9VVuiXOjbUzOkaxeVEXRj5VJJ1HA6uZWKz/aGRuia7FdDV\nVFVMPOiGcAYK+jgr6V9NVMz4X4KrcVTeVFTSnlRD5TUcFNLUyQszX1D0klXFVznZqNx5mobAA07Z\nbKO1xSR0MKRMkkWRyIqri5d9dJpS5NW18NHE1k0TaRrmQrDO9itRd9MUXFfrJkAQ8GT9LDMyVs9w\nVzHI5EdWyuRcOVFdgpI0NHBQ06QUrMyJHOfm4qulzlcu/wCVVM4A1+84O/u/Mz/aFj2JXYr8nHHD\nDznyWhp5a6Gskjzp4Wq2NyquDcd9UTex8psgDXuFHBcKOWlq40kglTBzV0fz4l8pGPyZoHTzTMfW\nxPmdnybFVysRzsETHBHYY4IhNgDStttht6PSGSpfn7+zTvl5s5VwN0AAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAeJZo4WK6WRjGpxuVEQD2VnKzI6hyicyaRz6erYmDZo99eTHlJOW/WqNcH19P9T8\nfwMS5S2dP/2EHOvUb6OvdLfR1blRZkxlhSxrDS5QtfCmhM/FXYedUUpveqWzKrDLpKqoZhi1+crm\nu5NPJ5EOwbprNrCDnXqNervWTtZHmVdTSTN3sHpj/gaKvdJo690tOPLXJmko2pT1cTImJg2ONmGH\nkRCo3Gtru6HdoKKhilp7NE/PfI5MMcOPz8iFiZSZEMnWZrKHO38FRcOYnIMoLFBGkcFZTRsTea1M\nE/AaKvdJo690pajpoqOlip6diMhiajGNTiRDMQ+6azawg516hums2sIOdeoaKvdJo690pgEPums2\nsIOdeobprNrCDnXqGir3SaOvdKYBD7prNrCDnXqG6azawg516hoq90mjr3SmAQ+6azawg516hums\n2sIOdeoaKvdJo690pgEPums2sIOdeobprNrCDnXqGir3SaOvdKYBD7prNrCDnXqG6azawg516hoq\n90mjr3SmAQ+6azawg516hums2sIOdeoaKvdJo690pgEPums2sIOdeobprNrCDnXqGir3SaOvdKYB\nD7prNrCDnXqG6azawg516hoq90mjr3SmAQ+6azawg516hums2sIOdeoaKvdJo690pgEPums2sIOd\neo9JlJaF3rhBzjRV7pNHXulLA06e50NSqJBVwPcvE16Y8xuGsxMbWsxMbQAGGAAAAAAAAA0Z071l\ndJp2B64uw/VXl8ymzPURQJjK9G+TjNR10hXFGRyP8zThynLcltf0Xa4id3T4KU266tcQ2EVFRFRc\nUXjBGLOiKqwRVEePEiYpzHzvh7tEqVKt5GtzTzuH5Ljy/Kfs30Ve5uyOWeRYIsf99ybyJyec3mNR\njUa1MERMEQjYq9kTEayllaiciHrbRPF5uY6rftPIqIwz/KfsxNm5PQkQR22ieLzcw20b4vNzG/G+\nR9fyn7MaCvckQR+2jfATcw20Z4CbmHG+R9fyn7Ggr3JAEftozwM3MNtGeBm5hxvkfaev2NBc3JAE\nftrH4GbmG2sfgZuYcb5H2keZoLm5IAj9tYvBTcw21i8FLzION8i7SPM0FzckAR+2sXgpeYbbQ+Dl\n5hxvkXaQaC5uSAI/baHwcvMNtofBy8w43yLtINBc3JAEfttD9CTmG20H0JOYcb5F2sGgubkgCP22\ng+jJzDbaD6MnMON8i7WDQXNyQBH7bQfRk5httB9GT2TPG+RdrBoLm5IA0Ntafkk9kbbU/JJ7I43y\nLtY8TQ3NzfBoba0//eeyem3OlVdL1TztU2j2rkc7LtPixobm5ug8RTRypjG9rk8ins7aa6a4zqZx\nhpMYbQAGzAAAAAA8Twx1ETop2NkjcmCtcmKKVt+T1XbnufYK1YY1XFaab40f1chZwq4b5zX8lt38\nJrjXGyY1THevayiu1jFM6p6J1x4Kwlyv8HxZ7O2ZfpQy6DzJW36sasUdniia7Qq1D0VOYkrhlNY7\nc5W112oYHpvtfO1HJ9WOJFP7ouSTF036i+pyr/gjxdcmMNJVh3euDM+0bFE66aYn5z6YsVtyLiSq\nWqub2PkVcdigbmMTmLgxrWMRrERGomCInEVL4SckNfUfO7qHwk5Ia+o+d3UdGS+z7eSxMWqMMdu+\nUL/tOMonG5cicPjC3AqPwk5Ia+o+d3UPhJyQ19R87uo6cyrchwmz148YW4FR+EnJDX1Hzu6h8JOS\nGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO7qHwk5Ia+o+d3UMyrccJs9ePGFuBUfhJyQ19R87uofCT\nkhr6j53dQzKtxwmz148YW4FR+EnJDX1Hzu6h8JOSGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO7qHw\nk5Ia+o+d3UMyrccJs9ePGFuBUfhJyQ19R87uofCTkhr6j53dQzKtxwmz148YW4FR+EnJDX1Hzu6h\n8JOSGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO7qHwk5Ia+o+d3UMyrccJs9ePGFuBUfhJyQ19R87u\nofCTkhr6j53dQzKtxwmz148YW4FR+EnJDX1Hzu6h8JOSGvqPnd1DMq3HCbPXjxhbgVH4SckNfUfO\n7qCd0jJFf/31Hzr1DMq3HCbPXjxhbgVqmy7yXqXI2K/W9VX6UyN/En6Srp6yJJaSeKeNd50b0ci/\nWhiaZja3puUV8mYlmABhuAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABHZQ3Day0VFSmGe1uDEX6S6EM\n0xNU4QzETVOEIy/X6WOqS3WmPZq52hV30Z5zXpsklqnJPe6uapmXSrEdg1PIbeRtr70t6Vc/xqup\n/wBR7nb+C6UQsJeq5o/6bfitVXo/6aPFCxZL2eNuHeTHeVyqpk3OWjxCHmU3q+uprfTOnrJmxRJx\nu4/InKQL8rWOXGltlfMzicrWsx9pUJ6S5PTLSK7k7JlIbnLR4hDzDc5aPEIeYjN1c2pK3pI+0N1c\n2pK3pI+0Zzrm+WcbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ\n8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXN\nqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w\n3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqS\nt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3O\nWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6\nSPtDOub5M
bvxSe5y0eIQ8w3OWjxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8w3OWj\nxCHmIzdXNqSt6SPtDdXNqSt6SPtDOub5MbvxSe5y0eIQ8x8XJu0L/wDgQ/zI7dXNx2Wt9uPtGxR5\nVUUszIquOoonvXBq1DMGqvJnJoMZ9yOmWJquRtmXypyPtMyLmQuhdxKxy6CMliu2TC7LHK6utqfK\na75TELofHNR7Va5EVqpgqKbU36tlWuGYvVbKtcNa210FxpGVFM7OY7nReRTaKdQtWwZVLRtVUoq1\nM6NOJruQuJrcoimdWyWLlEUzq2SAAmmAAAaVfVOY5sFOmM7/AORuOVGtVy7yJiR9qZsjpap+lz3Y\nJ5EPOy65XVVRk1qcJrxxndEbe/ohW3ERE1z0MlNb2MXPn/1ZV31dvG41EamDURE8h9IWryjpY6h8\nFJFUV0zND0pmZyNXkVy6E5zpyfJbWTU5tunD175aVVzVrqlNAr239bxWCvw/eRJ/zDb+u1BXdJF2\ni+MNMYWEFe2/rtQV3SRdobf12oK7pIu0MYMYWEFe2/rtQV3SRdobf12oK7pIu0MYMYWEFe2/rtQV\n3SRdobf12oK7pIu0MYMYWEFe2/rtQV3SRdobf12oK7pIu0MYMYWEFe2/rtQV3SRdobf12oK7pIu0\nMYMYWEFe2/rtQV3SRdobf12oK7pIu0MYM6FhBXtv67UFd0kXaG39dqCu6SLtDGDOhYQV7b+u1BXd\nJF2ht/Xagruki7QxgzoWEFe2/rtQV3SRdobf12oK7pIu0MYM6FhGCchXtv67UFd0kXaG39dqCu6S\nLtDGDOjesOCciDBORCvbf12oK7pIu0Nv67UFd0kXaMajOjesGCciDNTkQr+39dqCu6SLtH3dFMzT\nU2W4xM43NRkmH1Ncqj+kzo3p/Nb9FOY8uhicnxo2L9RgttwpblBs1HKkjMcF4lavIqb6KbRibdFW\nqYbYy0J7czHPplWKRN7BdB6oapz3rBUJmzN/mbpoXaPBjahmiSNcceVDyspyeMixynJowiNdVMbJ\njp1dExtWoq0n9Ffc3weYXpJE16bzkxPR61NUVRFUbJQmMAAGwAACMykvlFk9aZrhcpEZDGm9xuXi\nRPKctpqfKvumOWpqKqSyZOOX/TjjxSSZvL5fwMt8jdl73Um2eRVdZrOiSTtRdD38i/8A3iOvRRsi\njbHG1GsaiI1qJgiIW5uPi8/CcrrnGcKI1fOenuUC29yLJSja3ZqSWrkTfdNKun6kwJVvc6ySamix\n0n15y/5LYV+65WW23zup0WSpnbocyBudmryKu8hG5fzIzq6sI+MumjI7XJoojwanweZJ6jo+Zesf\nB5knqOj9leswLl1HxWusX/xM6z5u6j1VWe3H1nJxrkvb0/mj7r8Wz2X6f9Nj4PMk9R0fsr1j4PMk\n9R0fsr1mvu6j1VWe3H1jd1Hqqs9uPrHGuS9vT+aPuzxbPZfp/wBNj4PMk9R0fsr1j4PMk9R0fsr1\nmvu6j1VWe3H1jd1Hqqs9uPrHGuS9vT+aPucWz2X6f9Nj4PMk9R0fsr1j4PMk9R0fsr1mvu6j1VWe\n3H1jd1Hqqs9uPrHGuS9vT+aPucWz2X6f9Nj4PMk9R0fsr1j4PMk9R0fsr1mvu6j1VWe3H1jd1Hqq\ns9uPrHGuS9vT+aPucWz2X6f9Nj4PMk9R0fsr1j4PMk9R0fsr1mvu6j1VWe3H1jd1Hqqs9uPrHGuS\n9vT+aPucWz2X6f8ATY+DzJPUdH7K9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7n\nFs9l+n/TY+DzJPUdH7K9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7nFs9l+n/TY\n+DzJPUdH7K9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7nFs9l+n/TY+DzJPUdH7\nK9Y+DzJPUdH7K9Zr7uo9VVntx9Y3dR6qrPbj6xxrkvb0/mj7nFs9l+n/AE2Pg8yT1HR+yvWPg8yT\n1HR+yvWa+7qPVVZ7cfWN3Ueqqz24+sca5L29P5o+5xbPZfp/02Pg8yT1HR+yvWPg8yT1HR+yvWa+\n7qPVVZ7cfWN3Ueqqz24+sca5L29P5o+5xbPZfp/02Pg8yT1HR+yvWPg7yT1HR8y9Zr7uo9VVntx9\nZkjy6ps7/Wt9bG3lRGu/kimY9qZNM4Rep/NH3Yn2dhttfp/08VPcyyRnYrVs0LMeNjnNX8StXDuU\nOtb3VuRV2q7fWN0pE9+LH+T/AP3E6Xa7nSXSn2ahmbIzeVN5WryKnEbh203auiXLXkVir/HCfhqn\nyc6yEy5qqi5uyeyrgSjvcehrsMGz+bynRTn3diyZ21sLrrQIsd2tibPFKzQ5Wt0qn+fqLBkBf0yl\nyToLkuCTSMzZkTikbod/PT9ZmuImM6GtiuuiubFycemJ3x94WEAE3YAAAAAAAAAAAAAAAAAAAAAA\nAAAAAVTuhKr6Ohp0XRNUIi/h/ktZU8udNXY05alPxaWyfnIWsc5C1sajGNa1MERMEQ+gLvKRRUa6\nSrccoKl0vxoaNUiiau9nYYud5+I9mrS6a67L/wAbJ+CG0dduMKYd9qMKIAAbqAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAeJomTROjlaj2OTBWrvKewBKZF1Mj6Ooo5nq91JJmNc5dKsVMW4/gWIqmR3zpd\n08sXuqWs4qowmYedXGFUwqeXTdjktdUmh0c6JiWti5zUXlQqvdC+baVeSdpaIPzLPRQrXzdPepXz\ndPe9gAiiAADDWrm0kq/7qmO1phQxebE9XHgM3oi38Ch9FDzZ1+0I+FH/ALK/9Xejcq6mSKgjp6dy\nslqpEhRyb7UXfVPqPlHTRUdMyCnYjI2JgiJ+Jhyo4fZU/wCJX3VNw7q3LcAATTAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAARNWiW68UddB8VJ5EgnRN5yL8lV8qKWsqmUfBKbyVcPvoWstRsWo2BirG51\nLKi/RUynio/MSeippfjOtVRO6VKdsNe1OzqGPHi0G2aVn4CzzqbpzezZxyS1M9WPRvd5cgAO5MPM\nrsyNzuRFU9GGt0Uk3oL+AYnY5h3Dmd81GU1yfpfPXObj5EVTqpy38n7Tk7c15a551Ipe5cuTII//\nAD0oLLS4S2+xyLTuzZpnJE1yb6Y768xzmKNsTEaxNCfz8pde6P8AN9Cn/Ep7qlMPzv8AGd6vS27O\nP9OGPfjg+s9i0Rm1V9OOAAD4l7YAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADbsdY+23ulnjXBkr0h\nmTici6EX6lOrHG5dDoF5J4vfQ7Im8fqH4TvV3chwrnHNmYj5YRP7vl/a1EUX8Y6YeZo2zRPjkRHM\ne1WuReNFOW/k/vdDZ73bnKqpSV7mpj5dH/KdUOU9xDRdstmcTbkvvy
H1dPIq7ngXtWUWp+r0dWAB\nJ2AAAAAAAAAAAAAAAAAAAAAAAAAAAFTy54dYvWU95pbCp5c8OsXrKe80tk/OR/Ohaxy4WwLvKAu8\npFFz+k4bdfXZP8G0atJw26+uyf4No66OTD0LfIgABu3AAAAAAAAYK6R0NDUSswz2RucmPKiKpV6S\n53WngtFTWVMNRDcGq3NSLMdG/Y1cipp0powLXPE2aGSJ+OY9qtXDkVMCIosm6OlfC7ZaqfYGKyFJ\npVckaKmC5qcS4aMTEtaomZ1K/acoa5bIlxqap0z+9klWFaTY2Zy6E+PjvYqStVWXSz0c1bWVNPVw\ntpnyrGjUY5r0RMEbyt06Taocm6WkpkpkqK2alSPYkgmmzmZvJhge6fJ6jjc5Z31FUixOga2okz0Y\nxd9qecxhLSKasEdV191tPe7qyphqEqo34I2LN2N6Mzkw5UMMN0utJT2yoq6mGoir2ORESLMdG/Y1\nei7+lNBLQZO0ka4yS1U+bGsUaTSq7Y2qmC5v1cZ8pMm6OnWNXS1U+xRujiSaVXJGipguanEuHGMJ\nM2pXqS+3J9gmr3VyvmbS7LmOo8xqO0bzsdJJZO3eqqrz3o+rbVxLTbM5ywbErHYoiInKhuQ5M00d\nE6kWrr30qx7FsT58Wo3yaCS2vgSvgq0zkmhhWBunQrVw3+XeERJFNXS3AAbKgAAAAAAAAAA28jvn\nW7/+V7qlrKpkd863f/yvdUtZx18qXn3OVKqd0P5spv37S0QfmWeihV+6H82U379paIPzLPRQpXzV\nPe3q5unvewARRAABrXLgM3onqg4FD6KHm5cBm9E9UHA4fRQ83/8Aof8Ah/7K/wDV3oXKj5wsvrC+\n6puGnlR84WX1hfdU3Dtr2uW5tAAaJgAAAAAAABS8pLzX0+VW19PWOpqdKJk/xKTZ3K5XuauOnQmC\nIXQhbnk9BXXTbBKutpqlYUgV1PNmZzEcrkRdHKqm1OHSzCs1GVVwoLhdIKpzHU0dOiU86x5uE2x5\n2Dk8vIb1Dcbxe5KtKKrgpEoookVXRZ2yyvjR646dDdKIStRkvb6qhrqWq2aZlY1qSufJi5VamCOR\neJfKeajJaikXGKarp86JsEuwzK3ZmNTBEdyrhox3zOMGpoxVt5utbVwUdVTUi0UUefgzZEllc3Hf\nx0NNG2Xq8X+sjbQVcNE3a9lSrHQ56LIr3NVMcd74pPVOTFFI9HU8lTR/6SQvSmlViPYiYIi/Vx75\njnyToXzskp5qykzadtNm08ysRY2qqoi86jGBWEyquVdVUzGVK0bXU2e9IqRZ8ZEerV49CaD1dspb\njSV13j2zbGlFFC6Ji0au2VXRo5c5f1dPMWWTJOiSaGSknraJYoUgTvaZWIrUXHTy6TbisNIxtwRz\nppFr2MjndI/FXI1mYn14GcaTGG/QyyTUVPLM1GyPja5yIuKIqpipnMNHTspKSGnjVysiYkbVcuK4\nImGlTMTYAAAAAAAAAABE5ScDp/W4PfQtZVMpOB0/rcHvoWsrRsWo2B4qPzEnoqezHUfmJPRUxe5u\nr5SpTthrWfgLPOpumlZ+AM86m6cvsz3O19Meje9y5AAdyYYK7gc/oO/AzmCu4HP6DvwDE7HM/wAn\n39Grj67IdTOWfk+/o1cfXZDqZS7y5cuQe70fJUO6P830PrKe6pTS5d0j5vofWU91Smn5r+M/erf0\n/vL672NzVXz/AGAAfHvYAABguEroaCpljwz44nObjyoiqVimvVWqQ4VLZ1kp3SuRYczMVG46OXSW\nqoibPTywvxzJGqx2HIqYGpNaqeWKmjdno2nYsbMHacFbm6fqPSyLKMnt0TTepxmfhG777sHNft3K\nqomicP8A6haa7Vbbc6pfOs0ixtzWOp8xqOcqIi48aaTauE9yt1LPI+phmRI85qqzNc12KIujjTSb\ncNlhZA6CSapmgVmZsckmKInk8p9bZafCXZ3zVCyR7FjK/FUbyIdVWVZJn4xEYY7M2NcasI6MMNfz\n+KMWr2bhjrw3tO/XOpo3SJA5qIlIsqYtx+Niif5NGS9VcTapI6ls+x0+y57ocxWOxRN7jQl0sVMs\nczZpKiZZY9iV0kmKo3kTkM9baqesejpc9F2NYviuwxaqov8AgzayrIrcU0VU44bZwj4d++Nsd5Va\nv1TNUTh3q/Ne6yKGs2KpZPsdOkiPdDmK1yuRMMOMzVFyr4aLZIJ1nlWWNiNkp9j310+cma+0U1bn\nbNnoqxbCua7D4uKLz6Dy20R/F2Woqpka9r2pJJjgrVxQ2jLcjzYnMjHHXGEa9nw+ezBjQX8ZjHz/\nAJ+6N20qa7v11HO2FkMLZURY85UXBcWr9aG9QzVaWN1VUTtllfDsrcGZqN+Ljh5TPFaaWKWtkja5\nq1aYSoi6N7DRyb5sMpY20KUiY7Ekex7+nDDDfOS/lOTzEUWqdWMdEY4Ya9e3b4q27VyJxqnXr6fB\nBR19wp4KOaeeOZlVE5UTY81WORiuTzpoMFFeKpLZ35LO6V2wI/Y1p8xucuH63GmKktT2Omhzc6Se\nXMYsbNkkV2YipguH1H2mssMMKQLPUy06R7HsUkmLc3DDeOqcqyLCcacdcf4xGMYzq+GrCOhKLN/G\nNfn8mtVVNwtlLLUVE0NQxIVdm5uaqP0YYcqaTxVVdfbnRJUTxz7PG/DBmbmPRud9aaDdis1M3P2Z\n0tQjo1iRJn52a1eJBHZqdqqskk8yoxY2bJJjmNXQuHWSpynJY5URO3H+mIx1asN2E7d7ebV3o9fH\nxRrbhcKWnpJaiaOZtVE5UwjzVY5GZyedNBrxXWt2nlq1qldI2nSTNdTZrUVcOPjJiCx00SIjnzy5\nsaxs2STOzGqmC4ch8ZY4W0rqZ1TVvgVmx5jpcURPIWjK8i6aemP8Y1xjP7YR0YtNDf3+bXs1xnnu\nTqd1QlRGkKPVyxbGrVx3vKTxrd5xd+R1KZ2yMj2JNOhU8psnlZXdt3a4qtxhq8+7CPJ12aKqKcKp\nxAAcqoAAMcu/D++i99DsqbxxqXfh/fxe+h2VN4/Svwd7lV9U+kPmvbPPx8v3kU5T3EvnzLn+JL78\nh1ZTlPcS+fMuf4kvvyH2NHIqfO3+ftd/o6sACTsAAAAAAAAAAAAAAAAAAAAAAAAAAAKnlzw6xesp\n7zS2FTy54dYvWU95pbJ+cj+dC1jlwtgXeAIoqDE1Y7pdo3aHJVK/DyORFQ2CQyjs9QtXtjbWJJKr\nc2aDHDZETeVF5UIF9yiiXNqoqqnem+2SB2KcyKdNuuMMJdlq5Tm4TLeBH7cUXhX9C/sjbii8K/oX\n9k3zo3q59O9IAj9uKLwr+hf2RtxReFf0L+yM6N5n070gCP24ovCv6F/ZG3FF4V/Qv7Izo3mfTvSA\nI/bii8K/oX9kbcUXhX9C/sjOjeZ9O9IAj9uKLwr+hf2RtxReFf0L+yM6N5n070gCP24ovCv6F/ZG\n3FF4V/Qv7Izo3mfTvSAI/
bii8K/oX9kbcUXhX9C/sjOjeZ9O9IAj9uKLwr+hf2RtxReFf0L+yM6N\n5n070gCP24ovCv6F/ZG3FF4V/Qv7Izo3mfTvSAI/bii8K/oX9kbcUXhX9C/sjOjeZ9O9IAj9uKLw\nr+hf2RtxReFf0L+yM6N5n070gCP24ovCv6F/ZG3FF4V/Qv7Izo3mfTvSAI/bei4pHr/5L+ozQyVd\neux2ujme52jZpWLHGzyqq7/mQTXTHSxNymOlLZGJnV13lT5KyMZ9aN/+S1GhY7ay1UDKdrle/FXS\nSLvvcu+pvnJM4zi4KpxmZVTuh/NlN+/aWiD8yz0UKv3Q/mym/ftLRB+ZZ6KFa+ap71aubp73sAEU\nQAAa1y4DN5j1QcDh9FDzcuAzeY9UPA4fRQ82P7hP0f8Asr/1d6GyrTNntMy/JZUoir50VDbNq7UE\ndyoJKaVVajtLXJvtcm8qFeSvqrcmw3ilnxboSpgjWSN6cq4aWr5FO6uJnW5q6ZnXCXBEbo7Xx1D0\n88EnZG6S1eMu6CTsmmEp4SlwRG6S1eMu6CTsjdJavGXdBJ2RhJhKXBEbpLV4y7oJOyN0lq8Zd0En\nZGEmEpcERuktXjLugk7I3SWrxl3QSdkYSYSlwRG6S1eMu6CTsjdJavGXdBJ2RhJhKXBEbpLV4y7o\nJOyN0lq8Zd0EnZGEmEpcERuktXjLugk7I3SWrxl3QSdkYSYSlwRG6S1eMu6CTsjdJavGXdBJ2RhJ\nhKXBEbpLV4y7oJOyN0lq8Zd0EnZGEmEpcERuktXjLugk7I3SWrxl3QSdkYSYSlwRG6S1eMu6CTsj\ndJavGXdBJ2RhJhKXBEbpLV4y7oJOyN0lq8Zd0EnZGEmEpcERuktXjLugk7ITKChfop0qah/E2Kne\nqrzogwkzZesofjR0USfKkq4sE8y4/wCC1FdtdBVVtwjuFyi73ZEi9706ri5FXfc7Dj8hYitMYQtT\nGEBjqPzEnoqZDHU8Hk9FTS9zdXylvTthrWfgDPOpumlZ+AM+s3Tm9me52vpj0b3eXIADuTDBXcDn\n9B34GcwV3A5/Qd+AYnY5n+T7+jVx9dkOpnLPyff0auPrsh1Mpd5cuXIPd6PkqfdGYq2mmk4mVDVX\n60VClHV7rQxXKgmpKhFzJG4YpvovEqHMrhabja5FZVU0ssSfJqIWq5rk8qJpRT4j8Veyr+VTRfsU\n52EYTEbX0nsrKrdqJt3JwxaoMK1DE30lRf3Tuod8x8knRO6j4rivLexq/LP2e3wqz148YZgYe+Y+\nSTondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2NX5Z+xwqz148YZg\nYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2NX5Z+xwqz1\n48YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2NX5Z+\nxwqz148YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0TuocWZb2\nNX5Z+xwqz148YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qHfMfJJ0Tuo\ncWZb2NX5Z+xwqz148YZgYe+Y+STondQ75j5JOid1DizLexq/LP2OFWevHjDMDD3zHySdE7qPcb3S\nuzYIKiVy7yMhcq/gZj2XlszhFmr8s/YnKrEf5x4w9IxZamkibpc+ojRE/wDEi/4OxJvFIySycqUr\nI7hc49i2PTDAq4qir+s7y+Qu5+lfh7ILmQ5HmXdVUzjhu/mD5n2jlFN+9nUbI1CnKe4l8+Zc/wAS\nX35DqynKe4l8+Zc/xJffkPoqORU8W/z9rv8AR1YAEnYAAAAAAAAAAAAAAAAAAAAAAAAAAAVPLnh1\ni9ZT3mlsKnlzw6xesp7zS2T85H86FrHLhbAARRAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAVTuh/NlN+/aWiD8yz0UKv3Q/mym/ftLRB+ZZ6KFq+ap71qubp73sAEUQAAa1z4DL5j1Q8Dh9FD\nxc+Ay+Y90PBIvRQ82P7hP0R6yr/1d7OAD0kgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMdT\nweT0VMhjqeDyeipK/wA3V8pZp2w1rPwCM3TTtHAI/rNw5vZnudr6Y9G93lyAA7kwwV3A5/Qd+BnM\nFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD3ej5AAJusAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAFOU9xL58y5/iS+/IdWU5T3EvnzLn+JL78hWjkVOO/z9rv9HVgASdgAAAA\nAAAAAAAAAAAAAAAAAAAAAABU8ueHWL1lPeaWwqeXPDrF6ynvNLZPzkfzoWscuFsABFEAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABVO6H82U379paIPzLPRQq/dD+bKb9+0tEH5lnooWr5qnvW\nq5unvewARRAABq3TgMvmMlFwSH0UMd04DL5jJRcEi9FDzY/uE/RHrKv/AFd7MAD0kgAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAMdVweT0VMhjquDy+ipK/zVXylmnbDXtHAIzcNO0cAjNw5/Zvu\nlr6Y9G93lyAA7UwwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD\n3ej5AAJusAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOU9xL58y5/iS+/IdWU5T3EvnzLn+J\nL78hWjkVOO/z9rv9HVgASdgAAAAAAAAAAAAAAAAAAAAAAAAAABU8ueHWL1lPeaWwqeXPDrF6ynvN\nLZPzkfzoWscuFsABFEAAAAAAAAAAAAAAABVb13QcmLLlJS2G5XaCC51GGZC7ix3sV4sS0SSMjidJ\nI5rY2orlcq6ETlOTZadw2yZVd0Gnypqq6shla5jpqePDNlVnydK6U3kxOpVtFDWW6eimRVgmidC9\nEXD4qpgv8lAgMkcvcm8r6qspsn7pDWT0q4SMbii4Y4Ypjvp5Tdyvyps+SFpW5ZQVsdJS5yMRztKu\ncvEicalG7kvcXtHc3vNfcqGvq6yapbsTEmRESNmOOGjfXyk33XO5zQd0rJ6K2XCpnpHwTJNDPEiK\nrXYKioqLoVFRQLVYbxQX6009ytFTHVUVQ3OjlYuhTfK53PskqLIjJWjsVtkllhp0VVklX4z3KuKq\nvJ5ixgAAAAAAAAAAAAAAAAVTuh/NlN+/aWiD8yz0UKv3Q/mym/ftLRB+ZZ6KFq+ap71qubp73sAE\nUQAAat04DL5jJR8Ei9FDHdOAy+YyUfBYvRQ82n+4VfRHrKv/AF97MAD0kgAAAAAAAA
AAAABHZQ3u\n35PWie53iqjpaKBMXyP3k8nnNfJLKe0ZW2htysFZHV0jnKzPbxOTfRU4lNLuj5HUWXeSlVYrjLLD\nFMqObLFhnMcm8unf8xodybue0Hc3ybfabfUzVSyzLPLNKiIrnKiJoRNCJgiAbmVeX+TWSdfR0V+u\nkNJU1S/6bHY729ivInlLO17XxpI1yKxUzkci6FTlOV91buKWfui3+hutfX1dLLA1I5Ww4KkrEXHD\nTvL5Tp9PSxU9DHSRoqQxxpG1Mf1UTACuWHug5MX7KCrstqu0FRcabHPib5NC4Lx4FqOS5B9w+yZH\nZdVeUtJXVk8smfsMEmCNiz1xXSml3kOtAAAAAAAAAAAAAAAAADFVcGl9FTKYqrg0voqSv81V8pbU\n7YYLRwCM3DTtPAIzcOf2b7pa+mPRtd5cgAO1MMFdwOf0HfgZzBXcDn9B34Bidjmf5Pv6NXH12Q6m\ncs/J9/Rq4+uyHUyl3ly5cg93o+QACbrAAAAAAAAAAAAAGhfbvQWK1VFyu1THTUVO3OkleuCIhpZH\n5VWbK+07Y5P1sdXSo9Y3OboVrk4lRd4w90DJOiy2yVrbFcpJYoKlE/1IlwcxyLiip9ZD9yLucUHc\n1sE9uoKqerkqJtmmnlREVy4YIiIm8iIBJZX5e5N5IVNJT5Q3SGjmqlwja7FVwxwxXDeTyllilZLE\nyWJ7XxvRHNci4oqLvKcv7rfcZtHdIu1vuFdX1dHPTN2J+woipJHjjhp3l8p0qgooaG209DAipTwR\nNhYiriua1ME/kgFds/dByYvOU1TYLddoJrpT458LePDfwXeXAtZyTI7uG2TJjuiVGVdLXVk0rnPf\nDTPwzYnPxzlxTSu+uB1sAAAAAAAAAAAAAAAAApynuJfPmXP8SX35DqynKe4l8+Zc/wASX35CtHIq\ncd/n7Xf6OrAAk7AAAAAAAAAAAAAAAAAAAAAAAAAAACp5c8OsXrKe80thU8ueHWL1lPeaWyfnI/nQ\ntY5cLYACKIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACqd0P5spv37S0QfmWeihV+6H82U\n379paIPzLPRQtXzVPetVzdPe9gAiiAADUuvAZfMZaPgsXooYrrwGXzGaj4LF6KHm0/3Cr6I9ZVnm\no+bKAD0kgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMVXwaX0VMpiq+DS+ipHKOaq+UtqdsM\nNp4BGbZqWngEZtkPZ3ulr6Y9G13lyAA7UwwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8\nn39Grj67IdTKXeXLlyD3ej5AAJusAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFOU9xL58y5/\niS+/IdWU5T3EvnzLn+JL78hWjkVOO/z9rv8AR1YAEnYAAAAAAAAAAAAAAAAAAAAAAAAAAAVPLnh1\ni9ZT3mlsKnlzw6xesp7zS2T85H86FrHLhbAARRAad0uVLbKfZquTNaq4NaiYucvIicakC/KqpeuN\nNZpnM4lmmbGq/VpMxEzsZimZ2QtQKnunuOpWfbW9k+7p7jqVn21vZM5lW5to6ty1gqm6e46lZ9tb\n2RunuOpWfbW9kZlW40dW5awVTdPcdSs+2t7I3T3HUrPtreyMyrcaOrctYKpunuOpWfbW9kbp7jqV\nn21vZGZVuNHVuWsFU3T3HUrPtreyN09x1Kz7a3sjMq3Gjq3LWCqbp7jqVn21vZG6e46lZ9tb2RmV\nbjR1blrBVN09x1Kz7a3sjdPcdSs+2t7IzKtxo6ty1gqm6e46lZ9tb2RunuOpWfbW9kZlW40dW5aw\nVTdPcdSs+2t7I3T3HUrPtreyMyrcaOrctYKpunuOpWfbW9kbp7jqVn21vZGZVuNHVuWsFU3T3HUr\nPtreyN09x1Kz7a3sjMq3Gjq3LWCqbp7jqVn21vZG6e46lZ9tb2RmVbjR1blrBVN09x47Kz6qxvZM\n1PlZEkjWXKjnokcuCSuVHx4+Vyb31oJpmOhiaKo2wsoPjXI5qOaqKi6UVD6atVU7ofzZTfv2log/\nMs9FCr90P5spv37S0QfmWeihavmqe9arm6e97ABFEAAGpdeAS+YzUnBYvRQw3XgEvmM1JwWL0UPN\np/uFX0R6yrPNx82UBVwTFd4gZ8o2vlfFa6OevVi4OkYqNjReTOXf+rE9JJPAru3N44rHH9dc3sjb\nm86ji+3N7JjOhjOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6D\nOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3\nN7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzey\nM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86j\ni+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I25vOo4vtzeyM6DOhYgV3bm86ji+3N7I28uUemex\nyZnHsNSyRebQMYM6FiBo2q6U1zic6mc5HMXB8b0zXsXkVDeMshiq+Cy+iplMVXwWX0VI5RzVfyn0\nbU8qGG1cAi8xtmpauAReY2yPs73W19MejN3lyAA7GgYK7gc/oO/AzmCu4HP6DvwDE7HM/wAn39Gr\nj67IdTOWfk+/o1cfXZDqZS7y5cuQe70fIAKxdMsqKknfBSQy1srFwcseCMReTOX/ABic927RZpz7\nlURG+dTtpoqrnCmMZWcFHXLqfis/PVJ2T5u6qNTf3Sdk4uN8h7anxhbgl/qT4LyCjbuqjU390nZG\n7qo1N/dJ2RxvkPbU+MHBL/UnwXkFG3dVGpv7pOyN3VRqb+6TsjjfIe2p8YOCX+pPgvIKNu6qNTf3\nSdkbuqjU390nZHG+Q9tT4wcEv9SfBeQUbd1Uam/uk7I3dVGpv7pOyON8h7anxg4Jf6k+C8go27qo\n1N/dJ2Ru6qNTf3Sdkcb5D21PjBwS/wBSfBeQUbd1Uam/uk7I3dVGpv7pOyON8h7anxg4Jf6k+C8g\no27qo1N/dJ2Ru6qNTf3Sdkcb5D21PjBwS/1J8F5BRt3VRqb+6Tsjd1Uam/uk7I43yHtqfGDgl/qT\n4LyCjbuqjU390nZG7qo1N/dJ2RxvkPbU+MHBL/UnwXkFG3dVGpv7pOyN3VRqb+6TsjjfIe2p8YOC\nX+pPgvIKNu6qNTf3SdkbuqjU390nZHG+Q9tT4wcEv9SfBeQUbd1Uam/uk7J7jy6fnJs1ola3jWOd\nrl5sEMx7WyGZwi9T4wcEvx/hPguwI6zXmjvEKyUb1zm6HxuTBzV8qEid8TExjDnmMNUinKe4l8+Z\nc/xJffkOrKcp7iXz5lz/ABJffkLUcipx3
+ftd/o6sACTsAAAAAAAAAAAAAAAAAAAAAAAAAAAKnlz\nw6xesp7zS2FTy54dYvWU95pbJ+cj+dC1jlwtgAXeUiiotdKtflDWTSLnMpXJBEnE1cMXL5zIatLw\n26r/AMbJ+CG0dduMKYd9qMKIAAbqAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAeXsa9jmvRHNcmCovGe\ngBJZEzv72qqJ7lclLJmsVd/MVMUT6iyFUyO+dLv54vdUtZxVRhMvOrjCqYVTuh/NlN+/aWiD8yz0\nUKv3Q/mym/ftLRB+ZZ6KFa+ap71Kubp73sAEUQAAal24BIZqTg0XooYLtwCQz0vBovRQ82j+4VfR\nHrKs83HzRGVsz0oYaWJysdVytiVyb6N4/wCR7p4Y6eFkULUZGxMGtTiNfKjh9lT/AIlfdU3Dtr2u\nW5tAAaJgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIq4f7FdaCvi+K58qQTYfrtdvY+ZS1FUyj4JTe\ntw++hay1GxajYGKs4LL6KmUw1nBZfRUnlPM1/KfRWjlQxWrgMXmNs1bXwGLzG0R9n+62/pj0Zucu\nQAHY0DBXcDn9B34GcwV3A5/Qd+AYnY5n+T7+jVx9dkOpnLPyff0auPrsh1Mpd5cuXIPd6Pkr2XNb\nJR2J7YHKySd6Qo5N9EXf/kc9jY2NiMYmDU3kLr3SPm+h9ZT3VKYfnX4zu1aa3ax1YY9+OH7PrPYt\nEZlVXTiAA+Ke0AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA2LTVOt96o6mNc1HSNikT6TXLhp+vA6y\nhxuX5UH7+L30OyJvH6f+ErtVzIcKp5NUxHywif3fMe16Ipv4x0wKcp7iXz5lz/El9+Q6spynuJfP\nmXP8SX35D6yjkVPn7/P2u/0dWABJ2AAAAAAAAAAAAAAAAAAAAAAAAAAAFTy54dYvWU95pbCp5c8O\nsXrKe80tk/OR/Ohaxy4WwLvKAu8pFFz+k4bdfXZP8G0atJw26+uyf4No66OTD0LfIgABu3AAAAAA\nAAa1yc5luqnsVWubE9UVOJc1Sn0ck1HBYZ4q+pmlrmq2eGWXPxTYnOz0TiwVELu9jZGOY9Ec1yYK\ni8aGjRWW20L1fSUNPC5W5qqxmC4chiYa1UzM6lLtFRXQZMJcca11R3oj0llqkkaqr+tmfzJOvctm\nopKmiuc89S6hklSGR2yJIqIi7InJhj9ZP0ditVE9H0lBTxORqtxa3DRyGSitNBRPkfSUkMTpEzXK\n1u+nJ5vIYwaRRMQrNcslrSidR3CoqVqoH7Jny5+ODMdkTk0mCKWehprNNBcKmaStiek0Ukufo2Jz\ns9OTBUTnLXTWa20qyLT0UEayNVrs1u+i76eYUdlttGrlpaGniVzVYqtZvtXfTzDAzJUqkfWMyUmr\nXPr2zLR56Svq85FVcNKN4lJXJ2WpZlAyB610UL6PZFjqpdk2R2cnxmrxYf5JuHJ60Qo5IrdTMRzc\nxURm+nIb/esGzxT7EzZomKxj8NLWrvonk0IMCKJhmABsqAAAAAAAAAADbyO+dbv/AOV7qlrKpkd8\n63f/AMr3VLWcdfKl59zlSqndD+bKb9+0tEH5lnooVfuh/NlN+/aWiD8yz0UKV81T3t6ubp73sAEU\nQAAad24BIZ6Xg0XooYLvwCQz0vBovRQ82j+4V/RHrKs83HzQeVHzhZfWF91TcNPKj5wsvrC+6puH\nbXtctzaAA0TAAAAAAAACiZTOqp8tXUsa1skDLeyXYqeqSFEcsjkzlx39CYF7I242K13KoSevoKeo\nmRuYj3txVG444ebSptTOBCh3G7XG01d8ldUTrbmQNgwc/OdBIseLXY+VdCqb9udJeJLhthc6mlSh\npoNi2OXMwR0SOWV30tOKcmguCWe3JTTU6UUGwTtRkrM3Q9qJgiL9Rjq7DaqxYVqaCnkWJqMZnM3m\npvJ5vIZzoZxVikc681ta2ru9RHFSU0ToZIn7Ej8W4rKqcf4GjZ5qi/VsbrjcKulzbVHNjFNsaZ6v\nemeqb2lERS719ktle+N1ZQwTOjTNarm7ycnm8h5rbBaa2ZktXbqaaRjEja5zMcGpvJ5hnQYufUtb\ncbtWUiTPrp07zzlSnqUgzlR6tR/lxRDJd5q1lwyg2Jbs9KOGBYlhqfiwqsWKq5P1tOlS+VuT1orn\nsfV26mlexmxtVzN5vInkNiC2UMDJ2Q0sTGztayVEb8trW5qIvLgmgznwYvVre6W20kkkiSvdExzn\nomCOVUTSbR4gijghjhhY1kUbUa1rd5qJvIeybAAAAAAAAAAAInKTgdP63B76FrKplJwOn9bg99C1\nlaNi1GwMNZwWX0VMxhreCS+ipPKeZr+U+itHKhjtfAYvMbRq2vgMXmNonkHutv6Y9GbnLkAB1tAw\nV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD3ej5Kh3R/m+h9ZT3\nVKaXLukfN9D6ynuqU0/Nfxn71b+n95fXexuaq+f7AAPj3sAAA1ro5zLZVvYqtc2F6oqcS5qlThnq\nYmUrkfWR7LSve500mcki5mPxeReMucjGyRuZI1HMcitci7yovEYX0dNIyJj4I3NiTNYip8lMMME+\nrQelkWW28nomiunHH7fdzX7FVyqKonDD7qvDLU09mfUotSyR0TE2SSdHomcqIrsOLfxNm6t2vpqh\nKa4TOesGfmOfnLocnx0Xi3yap7VQUyu2CkhZnNzVwbvpyHqnt1HTtkbDTRMbImDkRu+nJ5jqq9pW\nZrzoiduOGEa9mqcZnZh59CMZLXm4Y9H83ILKOrmY6Xved6IlCr/iO3lzkwXzmlVz1NMlWxjqyFve\nmejZZM5XOzkTFq8X/wAloitVDFFJHHSxNZImD0zflJyGaejpqhUWaFj1RqsTOTHQvF/IWvaVi1FN\nEUYxHy17OjZvKsluVzNWOtT62oqqaGtbG+sgTvVHo2WTOcq5yJi1eI2axlVFQtSnfVQTSTxsa6Sf\nZMcVLNUUVNU47PBHJizY/jJj8Xfw8xihtVBB+apImaUdobxpvKbx7Vs5sY064nHZjE7Pj8OmJY4J\nXjOvUgI6mS5uuivlnhfTwtVWMercx6IuJJUEboMnFmSaZ8slOkiukfnKi5vFyEmlJTo+Z6QsR8yY\nSLh8tPKe0hjSBIUY1IkbmZmGjDkOS/l9FcRTRThTjE4fKNfmrRk9VM41TjOtVmSS0lPb5IqyaR9T\nC/ZGPfnfqKucnJgpiopqqGx9+f7Vsy0zXZ8k+e1VVExXN+vEs1Pa6GmztgpYY85uaua3fTkPlNaa\nCmdnQUkMa4ZvxW8XIdU+07GExmzOuOiNeuZwnCfjh07EoyW5jGv+av50IeuctrpZZaSvllmWnV6M\ne7Px3vjpyYYnmtz7e+nSlrJpknhkz85+dvNxR6cmn8SdprdR0qvWnpoo1emDsG76cnmPMFsooM/Y\n
aWJmema7Bu+nJ5iVOX2o2xM7eiNerVjuzdsbe5vOT1zs1eOr/wCq8kktHT0L4auaR9TA9ZGPfnYY\nMxzk5NJgjdUMyelqVdVtlWmRySOqM5FVcN5OItFPa6GmztgpYY85uaua3fTkPEdmt0SOSOjhajm5\nqojd9OQtHtSx1Z2xOyNeuZw2/HDp2NOCXN6Lsr5mXhYXd8xxrAj8yofn5y476FjMewRbM2XY27K1\nuY12GlE5DIeVld+L9cVxGGp12bc26c2ZAAcqoAAMcu/D++i99DsqbxxqXfh/fxe+h2VN4/Svwd7l\nV9U+kPmvbPPx8v3kU5T3EvnzLn+JL78h1ZTlPcS+fMuf4kvvyH2NHIqfO3+ftd/o6sACTsAAAAAA\nAAAAAAAAAAAAAAAAAAAAAKnlzw6xesp7zS2FTy54dYvWU95pbJ+cj+dC1jlwtgAIoqEjFgvN2gdo\nds+yp5Uciaf5GcmsoLI6vlZV0UjYa6NM1FcmLZG/Rd1kA+K7QrmzWiZ68sEjXIvOqHRRcjDCXXau\n0xThLKDBjcdS1/3O0MbjqWv+52jfSU71dLRvZwYMbjqWv+52hjcdS1/3O0NJTvNLRvZwYMbjqWv+\n52hjcdS1/wBztDSU7zS0b2cGDG46lr/udoY3HUtf9ztDSU7zS0b2cGDG46lr/udoY3HUtf8Ac7Q0\nlO80tG9nBgxuOpa/7naGNx1LX/c7Q0lO80tG9nBgxuOpa/7naGNx1LX/AHO0NJTvNLRvZwYMbjqW\nv+52hjcdS1/3O0NJTvNLRvZwYMbjqWv+52hjcdS1/wBztDSU7zS0b2cGDG46lr/udoY3HUtf9ztD\nSU7zS0b2cGDG46lr/udoY3HUtf8Ac7Q0lO80tG9nBgxuOpa/7naGNx1LX/c7Q0lO80tG9nBgxuK7\n1lr+dnaM0NrvFeqRrA23wr8qWR6Peif7rU4/KqmJuUx0sTeojpb2RTc+e61CfIdK2NF5c1NP4lpN\na20MNuoo6ambmxsTDTvqvGq+U2TmmcZxcNU4ziqndD+bKb9+0tEH5lnooVfuh/NlN+/aWiD8yz0U\nK181T3q1c3T3vYAIogAA07vwCQ2Kbg8Xooa934BIbFLweP0UPMo/uFf0R6yrPNx80Hlcmx7W1K/I\nhqW5y8iKmBtm/W0sVbSyU9Q3OikTNchXUp7va02JsCXKmb8iRr0ZKicjkXQvnQ9CunHY5q6ZnYkw\nRe2Vam/Yrjj5Nj7Q2yrNRXL+n2jTNlpmylARe2VZqK5f0+0NsqzUVy/p9oZsmbKUBF7ZVmorl/T7\nQ2yrNRXL+n2hmyZspQEXtlWaiuX9PtDbKs1Fcv6faGbJmylARe2VZqK5f0+0NsqzUVy/p9oZsmbK\nUBF7ZVmorl/T7Q2yrNRXL+n2hmyZspQEXtlWaiuX9PtDbKs1Fcv6faGbJmylARe2VZqK5f0+0Nsq\nzUVy/p9oZsmbKUBF7ZVmorl/T7Q2yrNRXL+n2hmyZspQEXtlWaiuX9PtDbKs1Fcv6faGbJmylARe\n2VZqK5f0+0NsqzUVy/p9oZsmbKUBF7ZVmorl/T7Q2yrNRXL+n2hmyZspQEXtlWaiuX9PtH1K25Sa\nILHVI7iWaRjE50VRmyZsvN+/1HW+mbpfLVMVE8jVzlX+RaiEtFpmZVrX3SRklYrc1jI/kQt5Ex31\n8pNlKYwhWmMIDDW8El9FTMYa7gk3oqSyrma/lPopRyoY7XwGLzG0ats4DF5jaJ5B7rb+mPRm5y5A\nAdbQMFdwOf0HfgZzBXcDn9B34Bidjmf5Pv6NXH12Q6mcs/J9/Rq4+uyHUyl3ly5cg93o+Sq90WJX\nWWKZExSGdrneRN7/ACUg65VU8VVTSQVDEfFI1WuavGhz245J3OikclC1tbS/q4uRsjU5Fx0L5z47\n8Texr+XZl7J4xmnVMfD4PovZmWUWMaLmqJQoNtbVd037TVfUres+bVXfVFX93tHx3EHtHsp8vu9j\njDJ+u1QbW1V31RV/d7Q2qu+qKv7vaHEHtHsp8meMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7R7Kf\nI4wyfrw1QbW1V31RV/d7Q2qu+qKv7vaHEHtHsp8jjDJ+vDVBtbVXfVFX93tDaq76oq/u9ocQe0ey\nnyOMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7R7KfI4wyfrw1QbW1V31RV/d7Q2qu+qKv7vaHEHtH\nsp8jjDJ+vDVBtbVXfVFX93tDaq76oq/u9ocQe0eynyOMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7\nR7KfI4wyfrw1QbW1V31RV/d7Q2qu+qKv7vaHEHtHsp8jjDJ+vDVBtbVXfVFX93tDaq76oq/u9ocQ\ne0eynyOMMn68NUG1tVd9UVf3e0Nqrvqir+72hxB7R7KfI4wyfrw1QbW1V31RV/d7R7jst5ldmstc\nrVXjke1qfiZj8P8AtGZw0U+X3Yn2hk0f5tOGJaiuooGJi6SoZh9S4r+B2BN4q+S2TLrdP37cHskr\nMMGNZ8mJPJyr5S0H6J7C9nV+zski1c5UzjL53L8pjKLudTs2CnKe4l8+Zc/xJffkOrKcp7iXz5lz\n/El9+Q96jkVPGv8AP2u/0dWABJ2AAAAAAAAAAAAAAAAAAAAAAAAAAAFTy54dYvWU95pbCp5c8OsX\nrKe80tk/OR/Ohaxy4WwAEUQHxyo1MXKiInGpHzXy1wuVstwpWOTiWRAJHAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzaz\no+lQCVwGBFborNrOj6VBuis2s6PpUAlcBgRW6Kzazo+lQborNrOj6VAJUEWmUNnXeudJ0qG9TVdP\nVNzqaeOVvKxyKBmAAFU7ofzZTfv2log/Ms9FCr90P5spv37S0QfmWeihavmqe9arm6e97ABFEAAG\nnd+ASfUbFNweP0UNa8cAf9Rs03B4/RQ8yj+4V/RT6yrPNR82QAwVVZTUrc6pniiTle5EPTSZxgRa\n5Q2dN+50nSofN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nwIrdFZtZ0nSoN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nwIrdFZtZ0nSoN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0nSoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nwIrdFZtZ0nSoN0Vm1nSdKgErgMCK3RWbWdJ0qDdFZtZ0n
SoBK4DAit0Vm1nSdKg3RWbWdJ0qASuA\nIrdFZtZ0nSoZYb3a5nI2K4Ur3LxJIgEgAio5EVFxReNAAMFdwSb0VM5gruBzeipz5VzFfyn0bUcq\nHi2cBi8xtGtbOAxeY2TXIfdrf0x6M3OXIADqaBgruBz+g78DOYK7gc/oO/AMTscz/J9/Rq4+uyHU\nzln5Pv6NXH12Q6mUu8uXLkHu9HyADDU1dPStzqmeOJvK9yITdbMMCLXKGzouC3Oj6VD5uis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwGBFborNrOj6VBuis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwGBFborNrOj6VBuis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwGBFborNrOj6VBuis2s6Pp\nUAlcBgRW6Kzazo+lQborNrOj6VAJXAYEVuis2s6PpUG6Kzazo+lQCVwBFborNrOj6VDNBerZUPRs\nFfTSOXibIigb4CLimKbwAKcp7iXz5lz/ABJffkOrKcp7iXz5lz/El9+QrRyKnHf5+13+jqwAJOwA\nAAAAAAAAAAAAAAAAAAAAAAAAAAqeXPDrF6ynvNLYVPLnh1i9ZT3mlsn5yP50LWOXC2ABd4iio96q\n5LxcZ4M9zbfTP2NWNXDZXpv4+RDFHSU0bcI6eJqeRiGCzrnUsr133VEqr7am8ddFMRDvtURFMMew\nReCj9hBsEXgo/YQyA2UwY9gi8FH7CDYIvBR+whkAMGPYIvBR+wg2CLwUfsIZADBj2CLwUfsINgi8\nFH7CGQAwY9gi8FH7CDYIvBR+whkAMGPYIvBR+wg2CLwUfsIZADBj2CLwUfsINgi8FH7CGQAwY9gi\n8FH7CDYIvBR+whkAMGPYIvBR+wg2CLwUfsIZADBj2CLwUfsINgi8FH7CGQAwY9gi8FH7CDYIvBR+\nwhkAMGPYIvBR+wg2CLwUfsIZADBiWCFd+GP2ENd9EkT0noF71qm6Wvj0IvkcnGhugTETtYmmJjCV\nlsFx20tkVQ5uZJpZI3kcmhSRKzkOv+jcmcTap2HMilmOOYwl50xhOCqd0P5spv37S0QfmWeihV+6\nH82U379paIPzLPRQrXzVPerVzdPe9gAiiAADSvHAH/UbNNweP0UNa8cAf50Nmn/MR+ih5lv+4V/T\nT6yrPNx82lf691utzpImo6d7kjiavG5d4h6SzwNXZq1Eq6x2l8sqZ2nkROJDYyp01lmYu8tTp+pq\nm4d9c9DluT0MKUtOm9Tw9GnUO9afwEPRp1GYE02HvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9\nafwEPRp1GYAYe9afwEPRp1DvWn8BD0adRmAGHvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9afw\nEPRp1GYAYe9afwEPRp1DvWn8BD0adRmAGHvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9afwEPR\np1GYAYe9afwEPRp1DvWn8BD0adRmAGHvWn8BD0adQ71p/AQ9GnUZgBh71p/AQ9GnUO9afwEPRp1G\nYAYe9afwEPRp1GOa30czVbLSwOReJY0NoARdC59kucFO17nW6qdmNa5cdifvoiLyKWkquUWimpHJ\nvtq4cPrdgWotTOML0TjAYK7gc3oqZzBX8Dm9FSOV8xX8p9FKOVDzbeAw+Y2TWtvAYfMbJrkPu1v6\nY9GbnLkAB1NAwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39Grj67IdTKXeXLlyD3ej\n5I3KK5pabTNVZuc9MGsbyuXQhy6fPq5lqK56zzu0q5+lE8iJxIXbujuXa2jbxOqW48ylLPgvxfl1\n61VRk9urCJjGcOnXg+o9j2KKoquVRjOx52Nn0GcyDY2fQbzIegfDaWvfL3cync87Gz6DeZBsbPoN\n5kPQGlr3yZlO552Nn0G8yDY2fQbzIegNLXvkzKdzzsbPoN5kGxs+g3mQ9AaWvfJmU7nnY2fQbzIN\njZ9BvMh6A0te+TMp3POxs+g3mQbGz6DeZD0Bpa98mZTuedjZ9BvMg2Nn0G8yHoDS175Mync87Gz6\nDeZBsbPoN5kPQGlr3yZlO552Nn0G8yDY2fQbzIegNLXvkzKdzzsbPoN5kGxs+g3mQ9AaWvfJmU7n\nnY2fQbzINjZ9BvMh6A0te+TMp3POxs+g3mQbGz6DeZD0Bpa98mZTuedjZ9BvMh5dBE5MHRsX6jID\nMXrlM4xVPixNFM6phY8ibxNT3BlsqJHSQSoqwq5cVaqac3HkwL8cktrlbe7U5N/vlqc6KdbTeP1X\n8O5ZcyvIaa7s4zEzGPyfKe0bNNm/NNOwU5T3EvnzLn+JL78h1ZTlPcS+fMuf4kvvyH0NHIqeLf5+\n13+jqwAJOwAAAAAAAAAAAAAAAAAAAAAAAAAAAqeXPDrF6ynvNLYVPLnh1i9ZT3mlsn5yP50LWOXC\n2Bd4Bd4ii51ZeBO/fzf+443zQsvAnfv5v/ccb52U8mHo0cmAAGzYAAAAAAABjqZUgp5ZnIqpGxXq\niceCYkHRZSbMtKtTQTU0dWxXQSOejkeqNV2bo3lwRSarIlno54WqiOkjcxFXeTFMCuUljuT4rbBc\nJaJtPQNVY0gz1dI/MVqK5VRMETFV0GJaVY46megylWpt6V89BJT0Sw7NsiytcuHEmamnEysv6xo9\n9xoZ6SFIHVDZHKjkzW76LhvL5CIteSlRDau8KmntcaLBsTqmnV6yqqbyqioib6ElNa7rc6aWlulT\nTxUrqd0ObTYqsjlwweucmjDDQiGNbWJrwe2ZQqxWd+0E1KksbpYVc5Fz8ExwXDeXA80uUuelOtVQ\nTUzKmN0kD1ejkfg3OzdG8uCGKe0XO47Clyko2tpo3JHsGd/qPVuajnYpoTyJiY4LFcpoKCC4S0TI\naFjtiSDPVXvVisRXKqJgiYroQazGpmhymlfa33B9skZSpDsyO2Zqq5OJMENy3Xl9TcGUdVRSUs0k\nOzx5z0cjm4oi729vkDSZL1sVnloFprRE59PsK1ETpM9y8q4oTFusEdtvEVXRNjihWmWGdiK5Vc7F\nFaqY/WNZE16sU8ADZUAAAAAAAAAAG/kP8i6etr7rSzlYyH+RdPW191pZziq2y86vlSqndD+bKb9+\n0tEH5lnooVfuh/NlN+/aWiD8yz0UK181T3qVc3T3vYAIogAA0rxwB/nQ2af8xH6KGteOAv8AOhtU\n/wCYj9FDzLf9wr+mn1lWebj5ygcqPnCy+sL7qm4aeVHzhZfWF91TcO6va5bm0ABomAAAAAAAAEBd\ncoJaS9LbKW3SVcyU7ahzklaxEarlam/5UJ8qOUWTlVX5R7ZQ01rq4lpG0+x1qvTNVHudnJmovLgZ\npw6SGw3K6mSt
uFHPTyRVNHT98KxXIqPTNxVqLyoen5Tulc9tuts9YsMLJqjNejdjz25yNTHfdhpw\nIy5ZHVNfSXJdmpqatnRi074s5WxKjM1zVxTHNXeNqmsd4tTqhbRNQu77iiSXZ85FikYxGZ7cE+Mi\noiaFw0obYUs6m3NlMr5XMt1uqKxY4WzTIioxY0cmKNwXfdhxGFcrNnqGR2u3T1zXUratXMe1qoxy\nqmGC8eKKG2e8W+qqJrZVUlQ+riY2Z9XnNVJGphnpmpguPJoNSiybutmq432aWgkZ3iykc6qV6Kjk\ne5yuRGpp0u3sRqNTZ3XpUTwx2y3y1eyQJPisrY81McMFx48UPVTlVNDPXsbaJ5GUDI31D2zNTMzm\nZ2CcuCEQ7Iqqp6mnfDFa7gxlPsbkrc9vx1crlciNRcExU258i2V0t4nr46bZquKJsGxOfhCrI81e\nTRjgqb+gz/SalvpJ2VVLDURY7HKxHtx38FTEymvb45YaCmiqXNdOyNrXubvK5EwVUNgmwAAAAAAA\nAAACJyk4HT+twe+hayqZScDp/W4PfQtZWjYtRsDBX8Dm9FTOYLhwKb0VJZZzFfyn0Vo5UPNt4DD6\nJsmtbeAw+ibJrkXu1v6Y9GbnKkAB1NAwV3A5/Qd+BnMFdwOf0HfgGJ2OZ/k+/o1cfXZDqZyz8n39\nGrj67IdTKXeXLlyD3ej5Kh3R/m+h9ZT3VKaXLuj/ADfQ+sp7qlNPzX8Z+9W/p/eX13sbmqvn+wAD\n497AAAMVVMlPTTTORVSNivVE48ExIll9VGxOqKOSJs0SyxrnouciNzsPJoJSuidUUVRCxUR0kbmI\nq7yKqKhCbnUijpu9dijlbA6GZcVwfi3DR9ek9LIqclmidPt6Nu757/hPdtc1+b0VRo9n+21FelWk\ndVVFI+GBGI9HbIjldjvJgnGuIlvL6eOV1XRSxKyPZE0oqOTHDDHiXTvGlTWGdKJ1NJFQwLmNRJoM\n5XK5qoqKqKiaMUNupoLhXwTx1k0ETHRZjWQ4uRXY45y4pjxbx1VW8iivozcdeudmrDDXOPTj+yUV\nX83pxw+G3yZ7nd46BXI+J71bAs+heJFRMP5mCa+rTpKlVRyRSNi2Zrc9FzkxRN/i3zXq7RXXCOoW\nrfTMkdT7BGkauVN/FVVVTyCsydRVlSi2KJk1PsUjXKulcUVF4/KLVrIKYppuTr6dvw6cfn0T3bWK\nqsomZmmNTPPfu9mTd9UckcjItmRuei5zcUTf4t8+1N8dSUqz1dFJGxHtYmEiOVcV8hrV2TmcyoZQ\nbFE2anSJ7XK7BXI5FRePRvntlnncxjHQUEDWzMkXYVcucjV0ouKG0Uez5pir469cxOGr4/PrdzGd\nlOMx/PT7NqovCMdOlNTuqEhjbKqteiIrVRVx/kZqO4vntz6yWmfCxGbI1Fciq5uGPEaVFZJKV9zR\nszXQ1EeZC1ccWJguhfJpJCKkeyzMo1c3ZEgSLO4scMMTlvxklMRTb164168cMNfw2q0Temcavjq1\ndzTgvmc2J1RSSwNmjWSJyuRyOwTHDRvLgKa9rJSd9TUj4aZYtlR6yNcuCpiiYIYI7TXSw00VXJTN\njpo3NYkWcqucrVaiqq7yaeIw0dgmZQd6TQ0EabCkazQq5XqqYYKqKmG+h1TayDCdcY4xsmdmM7Ne\n3DDfrSivKMY/m7+dCQS8LG1762klpo2xrKjlVHIqJxaN5fIfEvKxqiVdJJAr43SR4uRc7BMVTyLg\neJrfX18EsFfPDHE6JWIkGK5zuJy4pxYbx5nt1fXOYtc+mbsLHIxIs5c56twzlx3k072klTbyT/LC\nNuOEzq1asN+M7duHwbzVe6MfCO/H9nuG+YtY6opJYGyxrJEquRUfgmOGjeXA8tvkneD6ySheyFIt\nlRdlaqryJh9ZiZaK6aGnirJKZrKaJzY0izlVzlbm4qq8Rrw2GpbbJKRYbdGr4diWWNX5zt7f0eQt\nFrIOnDbGOudmM7Ne7De0z8o+P87t/wAkvRXJ09Z3tPTPp5Vj2RuLkcip9RIkTRWhlFcknpUYyJ0O\nZI3FVVXIuhfxJY8rK9DnxNjZh5+M+suuzn5v/JtAAcqoAAMtB882r1pn+TribyHI6D55tXrTP8nX\nE3kP0/8ACX9v/wDKf2fL+1/eO6BTlPcS+fMuf4kvvyHVlOU9xL58y5/iS+/IfWUcip4F/n7Xf6Or\nAAk7AAAAAAAAAAAAAAAAAAAAAAAAAAACp5c8OsXrKe80thU8u9E9mfxNqU/FpbJ+cj+dC1jlwtgX\neAIoud2dMKSRq77Z5UX21N4xXWB1muk6yoqUNU9ZWS8THrvtXk5UPbXsemLXNVPIp10TE0w77VUT\nTD0D5inKMU5TdR9B8xTlGKcoH0HzFOUYpygfQfMU5RinKB9B8xTlGKcoH0HzFOUYpygfQfMU5Rin\nKB9B8xTlGKcoH0HzFOUYpygfQfMU5RinKB9B8xTlGKcoH0HzFOUYpygfQfMU5UNaprGRqkcKbNUv\n0Rws0ucv+POYmcNpMxGuU1kOn+lcncS1TsPZQsxGZN251stUcMqo6dyrJKqb2culSTOOZxl5szjO\nKqd0P5spv37S0QfmWeihV+6DpoKNvGtQ0tEKYRMTyIVr5qnvVq5unvewARRAABpXjgL/ADobVP8A\nmI/RQ1bxwB/nQ2afTBH6KHmW/wC4V/TT6yrPNx80FlTorbM7iSpX3VNw+5R0MldbsKfBKmF6SxY8\nbk4vrI+33SCrbmudsVQ3RJDJocxeTA7646XLcjpb4PmcnKgxTlQmm+g+YpyoMU5UA+g+YpyoMU5U\nA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+\nYpyoMU5UA+g+YpyoMU5UA+g+YpyoMU5UA+g+YpyoeZJY42q6SRjUTjVcAI3KPTS0reNauHD2kLUV\nSnXbu606wIq0FI/ZHS8Uj95ETlRC1lqIwhaiMIDXuHApvRU2DXuK4UM3okMsnDJ7nyn0Wt8qHy28\nBh9E2TXt6YUUPomwMi1ZPb+mPQucqQAHS0DDXcDn9B34GYxVaY0sycrF/AMTscx/J9/Rq4+uyHUz\nlvcA+LYrvHxsrnoqHUil3ly5Mg93oVHujp/0bRLxJUp+ClMOmZT2xbtZ5qeNUSZMHxqu9nJpQ5ir\nlZI6KdqwzsXB8b9Cop+f/jHJLtVdvKKYxpiMJ+GvF9V7GvUxFVuZ17XoDFOUaOU+Gwl7uMAGjlGj\nlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlG\njlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlGjlGEmMAGjlPLntamLnIieVTMUzOqIMYbFuRXXq1Im/3\n01fxOtpvIc7yJtklbdI7g9itpKfHY1VPzj10Yp5EOiH6t+HMluZLkNNN2
MJmZnD5vk/aV2m7fmad\nkahTlPcS+e8uP4kvvyHVjlPcN+PXZYzJvSXFcF/8T1/yfRUcip4l/n7Xf6OrAAm7AAAAAAAAAAAA\nAAAAAAAAAAAAAAACsd0GFz7MydnyqeVr/q3uos5hrKeOrpZaeZMY5Gq1yec3t1ZlUVN7dWZVFRRV\nDaqjhnjXFsjEcn1oZim5P1z7FWOs90dmxo7GnmXeVF4i5IqKiKi4opm5RmT8Ohm5RmT8HiWJk0bo\n5WNex2hWuTFFIiTJezyOxWiY30VVv4KTQJpoLcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AILcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AILcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AILcnZvFV6R3WNydm8VXpHdZOgCC3J2bxVekd1jcnZ\nvFV6R3WToAgtydm8VXpHdY3J2bxVekd1k6AIJMk7Mi8E/qO6yQoLVQ2/FaOmiicu+5E0r9ZugAAa\nN3udNa6R09S9Ew+S1N9y8iGYiZnCGYiZnCEBlcvfV6tNAzS7ZNkcnIhbUTBMCq5K0c9ZWzXq4NzZ\nJdELF/VaWore1YUR0K3dWFEdAACKIAANe4M2Sjlam/hifLbJslFEvImCmyulNJFxuW31To38HkXF\nq8inl5VVwbKacoq5MxmzO7XjE/Lo71qIz6JpjbtShpV9poa9yOq6aOR6aEcqaU+s3UVFTFFxQHqb\nUUGuSto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6V/W\nTgAg9yto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6V/\nWTgAg9yto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6V\n/WTgAg9yto8Wd0r+sblbR4s7pX9ZOACD3K2jxZ3Sv6xuVtHizulf1k4AIPcraPFndK/rG5W0eLO6\nV/WTgAg9yto8Wd0r+s9x5M2hjse82u9NyuT+akyAPMUbImIyNrWMTQjWpgiHoAAaV3fm0T043KjU\nN1VRExXQhGKu2Fa1G8HhXFV5VPN9p3P+GbFHLr1RHz2z8ojWrZj+rOnZDfpm5lPG3kahkAPQopii\nmKY6E5nGcQAGzAHIioqLvKAByfuTP2qy0yrsUy5r9n74javG1V4udDrByzuo2eutF6o8s7DGslRS\n/Fq4k/7SPl5tHMXjJPKW3ZT2qOttsyOxT/UjVfjxu40chW5Gd/XDhySrRTOT1bY2fGJTZpXC1UNw\nwWspYpVTeVzdKfWboJO5ALkhZFXgTfad1jchZPEk9t3WT4NcyncznSgNyFk8ST23dY3IWTxJPbd1\nk+BmU7jOlAbkLJ4kntu6xuQsniSe27rJ8DMp3GdKA3IWTxJPbd1jchZPEk9t3WT4GZTuM6UBuQsn\niSe27rG5CyeJJ7busnwMyncZ0oDchZPEk9t3WNyFk8ST23dZPgZlO4zpQG5CyeJJ7busbkLJ4knt\nu6yfAzKdxnSgNyFk8ST23dY3IWTxJPbd1k+BmU7jOlAbkLJ4kntu6xuQsniSe27rJ8DMp3GdKA3I\nWTxJPbd1jchZPEk9t3WT4GZTuM6UBuQsniSe27rG5CyeJJ7busnwMyncZ0oDchZPEk9t3WNyFk8S\nT23dZPgZlO4zpQG5CyeJJ7busywZL2aB6OZQxKqb2di78SaAiimNkGMvjGtY1GsRGtTQiImCIfQf\nHORjVc5URqaVVeI2Yad6r47XaKyunVEjp4nSLj5ExwKB3AaOSHI2aumRc+uqny4rxomj8UUisvr5\nNlzdoskMmH59Or0dX1bdLGtRd7HjRP5rgdWtFvp7Ta6Wgo2ZlPTxpGxPIifiVmMyjCdsuGirT5Rn\n08mmJjvnb4NsAEncAAAAAAAAAAAAAAAAAAAAAAAAAAAAANG7WululPsVUzHD5Lk32r5CvNtl/tHx\nbdVMqqdN6OXfQt4KU3aqYw2wpTcmmMNsKlt1lBFols+evK1T4uUN61HLzqW4YG2lp6kebbSU9WPN\nUd0V51HLzjdFedRy85bsBgNJR1I8zSUdX1VHdFedRy843RXnUcvOW7AYDSUdSPM0lHV9VR3RXnUc\nvON0V51HLzluwGA0lHUjzNJR1fVUd0V51HLzjdFedRy85bsBgNJR1I8zSUdX1VHdFedRy843RXnU\ncvOW7AYDSUdSPM0lHV9VR3RXnUcvON0V51HLzluwGA0lHUjzNJR1fVUd0V51HLzjdFedRy85bsBg\nNJR1I8zSUdX1VHdFedRy843RXnUcvOW7AYDSUdSPM0lHV9VR3RXnUcvON0V51HLzluwGA0lHUjzN\nJR1fVUd0V51HLzjdFedRy85bsBgNJR1I8zSUdX1VHdFedRy843RXnUcvOW7AYDSUdSPM0lHV9VR3\nRXnUcvON0V51HLzluwGA0lHUjzNJR1fVUd0V51HLzn1MoL2u9Y3/AFqW3AYDSUdSPM0lPVjzVJa3\nKer+LDRQ0yL+s9d4zW/Jhz6lKu9VC1k6aUYvyULOBN6cMKYwYm9OGFMYCIiIiImCJxAAikAAAAAB\n4miZNGrJG4tU9g1qppriaaoxiWYnDXCN72qqVf8AZpEfH9Bx978qm6H0ir5iRB53F029Vi7VTG7V\nMecSppceVESj+/5eOkkG2EnikpIAzwTKu3n8tJn0dVH7YSeKyjbF/ispIAcFyrt5/LBn0dXzR+2L\n/FZRti7xWUkAOC5X2/6YM+jq+aP2yd4rKNsneKy8xIAcFyvt/wBMGfR1fNHbZL4rLzDbJfFZeYkQ\nOC5X2/6YM+jq+aO2zXxaXmG2f/DS8xIgcFyzt/0wZ9HV80dtn/w0vMNs/wDh5eYkQODZZ2/6YM+j\nq+aP2zTxeXmPm2aeLy8xIjAcGyzt/wBMGfR1fNH7Zp4vLzDbNPAS8xIYDAcGyzt/0x9zPt9XzR+2\nbfAS8w20b4CXmJDAYDg2Wdv+mPuZ9vq+aP20b4GXmG2jPAy8xIYIME5BwfLe3j8sfczrfV80ftoz\nwMvMNtGeCl5iQwTkGCcg4PlvbR+WPuZ1vq+aP20j8FLzDbSPwUvMSGCcgwTkHB8t7aPy/wCzOt9X\nzR+2kfgpeY+LcnO0RU0rl8qEjgnIBwbLJ23/AApj7yZ9HV80asVXWfn12KL6Kb6m/DEyGNGRpg1D\n2C2T5FRYqm5MzVVPTO3/AFHya1XJqjDZAADsaAAAAAD45qOarXIitVMFReM5nf8AuZvhuT7rkZcH\n2iucuLok/NPXzcR00G1Nc07E
b1ii9GFcOTNvHdMtXxKuzUlya3RskTkRV/mely+y0ZofkVMq+Ryn\nVxgnIhvpI6aYQ4LXHJuz5T+zk3wg5Y/sTU86j4Qcsf2JqedTrOCciDBORDGfT1Tg13tZ8I+zk3wg\n5Y/sTU86j4Qcsf2JqedTrOCciDBORBn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTk\nQZ9PVODXe1nwj7OTfCDlj+xNTzqPhByx/Ymp51Os4JyIME5EGfT1Tg13tZ8I+zk3wg5Y/sTU86j4\nQcsf2JqedTrOCciDBORBn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTkQZ9PVODXe1\nnwj7OTfCDlj+xNTzqPhByx/Ymp51Os4JyIME5EGfT1Tg13tZ8I+zk3wg5Y/sTU86j4Qcsf2JqedT\nrOCciDBORBn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTkQZ9PVODXe1nwj7OTfCDl\nj+xNTzqPhByx/Ymp51Os4JyIME5EGfT1Tg13tZ8I+zk3wg5Y/sTU86j4Qcsf2JqedTrOCciDBORB\nn09U4Nd7WfCPs5N8IOWP7E1POo+EHLH9iannU6zgnIgwTkQZ9PVODXe1nwj7OTfCDlj+xNTzqfU7\noGWK72RNR9aqdYwTkQYJyIM+nqnBrvaz4R9nKFyyy+qvi0mSCROXeWR+j+amKTJvL3KzBmUVzhtd\nvd8uClX4zk5Fw6zrmAM6TDZEHA5q5yuZjds9EJknkxbMl7elLa4UbjpfI7S968qqTYBOZmZxl10U\nU0Rm0xhAADDYAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\nAAAAAAAB/9k=\n"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"cell_type": "markdown",
|
18 |
+
"id": "7ee017c2-79bf-4890-ba8a-61c52ca4810b",
|
19 |
+
"metadata": {},
|
20 |
+
"source": [
|
21 |
+
"### BERT大模型简介\n",
|
22 |
+
"\n",
|
23 |
+
"BERT(Bidirectional Encoder Representations from Transformers)是一个在自然语言处理领域具有里程碑意义的预训练模型,由Google AI在2018年提出。它基于Transformer架构中的编码器部分构建,并采用了双向训练方法来理解文本中词语的上下文信息。\n",
|
24 |
+
"\n",
|
25 |
+
"### BERT的主要特点:\n",
|
26 |
+
"\n",
|
27 |
+
"- **双向性**:与传统的单向语言模型不同,BERT能够同时利用左右两侧的上下文信息来进行预测,这使得它在理解语义方面更为强大。\n",
|
28 |
+
"- **预训练和微调**:BERT的训练分为两个阶段。首先是在大规模无标签语料库上进行预训练,然后针对特定任务使用少量标注数据进行微调。这种迁移学习的方法显著提高了下游任务的表现。\n",
|
29 |
+
"- **Masked Language Model (MLM)**:作为预训练的一部分,BERT随机遮蔽输入序列中的一些词,并尝试根据上下文恢复这些被遮蔽的词。这种方法增强了模型对句子内部结构的理解。\n",
|
30 |
+
"- **Next Sentence Prediction (NSP)**:另一个预训练任务是判断给定的两个句子是否连续出现,帮助BERT更好地捕捉句子间的逻辑关系。\n",
|
31 |
+
"- **输入表示**:BERT的输入由三部分组成:\n",
|
32 |
+
"\n",
|
33 |
+
" Token Embeddings:将词语映射为向量。\n",
|
34 |
+
"\n",
|
35 |
+
" Segment Embeddings:用于区分句子对(如问答任务中的问题和答案)。\n",
|
36 |
+
"\n",
|
37 |
+
" Position Embeddings:表示词语在序列中的位置。\n",
|
38 |
+
"\n",
|
39 |
+
"**特殊标记**:\n",
|
40 |
+
"\n",
|
41 |
+
" [CLS]:用于分类任务的输出。\n",
|
42 |
+
"\n",
|
43 |
+
" [SEP]:用于分隔句子对。\n",
|
44 |
+
"\n",
|
45 |
+
" [MASK]:用于掩码语言模型任务。\n",
|
46 |
+
"\n",
|
47 |
+
"### 模型配置\n",
|
48 |
+
"\n",
|
49 |
+
"BERT有两种主要变体:\n",
|
50 |
+
"- **BERT Base**:12层(或称作块/层),每层有12个自注意力头,总参数量约为1.1亿。\n",
|
51 |
+
"- **BERT Large**:24层,每层有16个自注意力头,参数量增加到约3.4亿,理论上具备更强的表达能力。\n",
|
52 |
+
"\n",
|
53 |
+
"### 网络结构\n",
|
54 |
+
"BERT Base的基本网络结构和GPT2的区别如下所示:\n",
|
55 |
+
"\n",
|
56 |
+
"![bert.jpg](attachment:6a042b8f-c47d-4f6d-b601-b80124836ec4.jpg)"
|
57 |
+
]
|
58 |
+
},
|
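The masked-language-modeling and sentence-pair ideas described above are easy to see in action. The sketch below is not part of the notebook: it uses the public English checkpoint `bert-base-uncased` (an assumed, widely available model chosen only for illustration; it needs network access to download) to show how `[CLS]`/`[SEP]` frame a sentence pair and how the MLM head fills a `[MASK]` position. The DNA model trained later in this notebook works the same way, just on DNA word pieces.

```python
from transformers import AutoTokenizer, pipeline

# Illustration only: a generic English BERT, not the DNA model trained below.
name = "bert-base-uncased"
tok = AutoTokenizer.from_pretrained(name)

# [CLS] sentence A [SEP] sentence B [SEP]; token_type_ids mark the two segments.
enc = tok("how are you", "fine thanks")
print(tok.convert_ids_to_tokens(enc["input_ids"]))
print(enc["token_type_ids"])

# MLM in action: rank candidate fillers for the [MASK] position.
unmasker = pipeline("fill-mask", model=name)
for cand in unmasker("The capital of France is [MASK].")[:3]:
    print(cand["token_str"], round(cand["score"], 3))
```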
59 |
{
|
60 |
"cell_type": "code",
|
61 |
+
"execution_count": 31,
|
62 |
+
"id": "602ad045-0a4c-4f48-afd2-04d09c0c0f71",
|
63 |
+
"metadata": {},
|
64 |
+
"outputs": [
|
65 |
+
{
|
66 |
+
"data": {
|
67 |
+
"text/plain": [
|
68 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
69 |
+
]
|
70 |
+
},
|
71 |
+
"execution_count": 31,
|
72 |
+
"metadata": {},
|
73 |
+
"output_type": "execute_result"
|
74 |
+
}
|
75 |
+
],
|
76 |
+
"source": [
|
77 |
+
"import subprocess\n",
|
78 |
+
"import os\n",
|
79 |
+
"# 设置环境变量, autodl一般区域\n",
|
80 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
81 |
+
"output = result.stdout\n",
|
82 |
+
"for line in output.splitlines():\n",
|
83 |
+
" if '=' in line:\n",
|
84 |
+
" var, value = line.split('=', 1)\n",
|
85 |
+
" os.environ[var] = value\n",
|
86 |
+
"\"\"\"\n",
|
87 |
+
"import os\n",
|
88 |
+
"\n",
|
89 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
90 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
91 |
+
"\n",
|
92 |
+
"# 打印环境变量以确认设置成功\n",
|
93 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
94 |
+
"\"\"\""
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": 32,
|
100 |
"id": "a3ec4b86-2029-4d50-9bbf-64b208249165",
|
101 |
"metadata": {},
|
102 |
"outputs": [],
|
|
|
104 |
"from tokenizers import Tokenizer\n",
|
105 |
"from tokenizers.models import WordPiece\n",
|
106 |
"from tokenizers.trainers import WordPieceTrainer\n",
|
107 |
+
"from tokenizers.pre_tokenizers import Whitespace\n",
|
108 |
+
"from transformers import PreTrainedTokenizerFast,AutoModelForMaskedLM"
|
109 |
]
|
110 |
},
|
111 |
{
|
112 |
"cell_type": "code",
|
113 |
+
"execution_count": 16,
|
114 |
"id": "47b3fc92-df22-4e4b-bdf9-671bda924c49",
|
115 |
"metadata": {},
|
116 |
"outputs": [],
|
|
|
124 |
"execution_count": null,
|
125 |
"id": "73f59aa6-8cce-4124-a3ee-7a5617b91ea7",
|
126 |
"metadata": {},
|
127 |
+
"outputs": [
|
128 |
+
{
|
129 |
+
"name": "stdout",
|
130 |
+
"output_type": "stream",
|
131 |
+
"text": [
|
132 |
+
"\n",
|
133 |
+
"\n"
|
134 |
+
]
|
135 |
+
}
|
136 |
+
],
|
137 |
"source": [
|
138 |
"# 设置训练参数\n",
|
139 |
"trainer = WordPieceTrainer(\n",
|
140 |
+
" vocab_size=30000, # 词汇表大小\n",
|
141 |
" min_frequency=2, # 最小词频\n",
|
142 |
" special_tokens=[\n",
|
143 |
" \"[PAD]\", \"[UNK]\", \"[CLS]\", \"[SEP]\", \"[MASK]\"\n",
|
|
|
149 |
},
|
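The cells that actually instantiate the WordPiece model and run the training loop are collapsed in this diff, so the following is a minimal sketch of how the pieces above typically fit together, assuming the DNA corpus path used elsewhere in the notebook (`../01-data_env/data/dna_1g.txt`) and the output file name `dna_wordpiece_dict.json` that a later cell loads.

```python
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

# Empty WordPiece model; [UNK] is returned for anything outside the learned vocabulary.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

# Learn the DNA word pieces from the raw sequence file and save the result.
tokenizer.train(files=["../01-data_env/data/dna_1g.txt"], trainer=trainer)
tokenizer.save("dna_wordpiece_dict.json")
```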
150 |
{
|
151 |
"cell_type": "code",
|
152 |
+
"execution_count": 4,
|
153 |
"id": "7a0ccd64-5172-4f40-9868-cdf02687ae10",
|
154 |
"metadata": {},
|
155 |
"outputs": [],
|
|
|
179 |
},
|
180 |
{
|
181 |
"cell_type": "code",
|
182 |
+
"execution_count": 17,
|
183 |
"id": "48e1f20b-cd1a-49fa-be2b-aba30a24e706",
|
184 |
"metadata": {},
|
185 |
+
"outputs": [
|
186 |
+
{
|
187 |
+
"data": {
|
188 |
+
"text/plain": [
|
189 |
+
"('dna_wordpiece_dict/tokenizer_config.json',\n",
|
190 |
+
" 'dna_wordpiece_dict/special_tokens_map.json',\n",
|
191 |
+
" 'dna_wordpiece_dict/tokenizer.json')"
|
192 |
+
]
|
193 |
+
},
|
194 |
+
"execution_count": 17,
|
195 |
+
"metadata": {},
|
196 |
+
"output_type": "execute_result"
|
197 |
+
}
|
198 |
+
],
|
199 |
"source": [
|
200 |
"new_tokenizer = Tokenizer.from_file(\"dna_wordpiece_dict.json\")\n",
|
201 |
"\n",
|
|
|
212 |
},
|
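The cell above reloads `dna_wordpiece_dict.json` and, judging from its output (`tokenizer_config.json`, `special_tokens_map.json`, `tokenizer.json` under `dna_wordpiece_dict/`), wraps it in a `PreTrainedTokenizerFast` and calls `save_pretrained`; the wrapping lines are collapsed in this diff, so this is a sketch of that step under those assumptions.

```python
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

new_tokenizer = Tokenizer.from_file("dna_wordpiece_dict.json")

# Wrap the raw tokenizer so it can be used with transformers models and Trainer.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenizer.save_pretrained("dna_wordpiece_dict")
```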
213 |
{
|
214 |
"cell_type": "code",
|
215 |
+
"execution_count": 33,
|
216 |
"id": "c94dc601-86ec-421c-8638-c8d8b5078682",
|
217 |
"metadata": {},
|
218 |
"outputs": [],
|
|
|
229 |
},
|
230 |
{
|
231 |
"cell_type": "code",
|
232 |
+
"execution_count": 34,
|
233 |
"id": "b2658cd2-0ac5-483e-b04d-2716993770e3",
|
234 |
"metadata": {},
|
235 |
"outputs": [],
|
|
|
240 |
},
|
241 |
{
|
242 |
"cell_type": "code",
|
243 |
+
"execution_count": 19,
|
244 |
+
"id": "20b35091-791e-4a6f-8f2d-fda39348daa3",
|
245 |
+
"metadata": {},
|
246 |
+
"outputs": [
|
247 |
+
{
|
248 |
+
"data": {
|
249 |
+
"text/plain": [
|
250 |
+
"{'input_ids': [5, 761, 12283], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}"
|
251 |
+
]
|
252 |
+
},
|
253 |
+
"execution_count": 19,
|
254 |
+
"metadata": {},
|
255 |
+
"output_type": "execute_result"
|
256 |
+
}
|
257 |
+
],
|
258 |
+
"source": [
|
259 |
+
"tokenizer(\"ATCGGATCG\")"
|
260 |
+
]
|
261 |
+
},
|
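The three ids in the output above are easier to read as word pieces. The small follow-up below is not in the notebook; the exact pieces depend on the trained vocabulary, and note that no `[CLS]`/`[SEP]` ids appear in the encoding, presumably because no post-processor was attached when the fast tokenizer was built.

```python
enc = tokenizer("ATCGGATCG")
# Map the ids back to the learned WordPiece strings (continuation pieces carry a '##' prefix).
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
```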
262 |
+
{
|
263 |
+
"cell_type": "code",
|
264 |
+
"execution_count": 35,
|
265 |
+
"id": "3a485c0a-c1a7-4b1c-b16e-3086593fd328",
|
266 |
+
"metadata": {},
|
267 |
+
"outputs": [
|
268 |
+
{
|
269 |
+
"data": {
|
270 |
+
"text/plain": [
|
271 |
+
"PreTrainedTokenizerFast(name_or_path='dna_wordpiece_dict', vocab_size=30000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={\n",
|
272 |
+
"\t0: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
273 |
+
"\t1: AddedToken(\"[UNK]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
274 |
+
"\t2: AddedToken(\"[CLS]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
275 |
+
"\t3: AddedToken(\"[SEP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
276 |
+
"\t4: AddedToken(\"[MASK]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
|
277 |
+
"}\n",
|
278 |
+
")"
|
279 |
+
]
|
280 |
+
},
|
281 |
+
"execution_count": 35,
|
282 |
+
"metadata": {},
|
283 |
+
"output_type": "execute_result"
|
284 |
+
}
|
285 |
+
],
|
286 |
+
"source": [
|
287 |
+
"tokenizer"
|
288 |
+
]
|
289 |
+
},
|
290 |
+
{
|
291 |
+
"cell_type": "code",
|
292 |
+
"execution_count": 36,
|
293 |
+
"id": "a48642d6-69d1-4fff-903a-b0711ab7691b",
|
294 |
"metadata": {},
|
295 |
"outputs": [],
|
296 |
"source": [
|
297 |
+
"from transformers import BertConfig,BertForMaskedLM\n",
|
|
|
|
|
|
|
298 |
"\n",
|
299 |
+
"# 设置最大输入长度\n",
|
300 |
+
"max_length = 128 # 最大输入长度\n",
|
301 |
"\n",
|
302 |
"# 构建配置\n",
|
303 |
+
"config = BertConfig(\n",
|
304 |
+
" vocab_size=len(tokenizer), # 词汇表大小,与分词器一致\n",
|
305 |
+
" max_position_embeddings=max_length, # 最大位置嵌入数,与 max_length 一致\n",
|
306 |
+
" pad_token_id=tokenizer.pad_token_id, # 填充标记的 ID\n",
|
307 |
+
" bos_token_id=tokenizer.cls_token_id, # 句子开始标记的 ID(BERT 使用 [CLS])\n",
|
308 |
+
" eos_token_id=tokenizer.sep_token_id, # 句子结束标记的 ID(BERT 使用 [SEP])\n",
|
|
|
309 |
")\n",
|
310 |
"\n",
|
|
|
311 |
"# Building the model from the config\n",
|
312 |
+
"model = BertForMaskedLM(config)"
|
313 |
]
|
314 |
},
|
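As a quick sanity check on the configuration above, one can count the parameters and push a single tokenized sequence through the freshly initialized network. This sketch assumes the `tokenizer` and `model` objects defined in the surrounding cells.

```python
import torch

# Total parameter count; dominated by the 30k-entry word embedding matrix.
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")

# One forward pass just to confirm shapes; with untrained weights the loss is roughly ln(vocab_size).
enc = tokenizer("ATCGGATCG", return_tensors="pt")
with torch.no_grad():
    out = model(**enc, labels=enc["input_ids"])
print(out.logits.shape, float(out.loss))
```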
315 |
{
|
316 |
"cell_type": "code",
|
317 |
+
"execution_count": 37,
|
318 |
"id": "afc2cdd1-228e-4ee7-95f5-07718f00723d",
|
319 |
"metadata": {},
|
320 |
"outputs": [],
|
321 |
"source": [
|
322 |
"# 1. load dna dataset\n",
|
323 |
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
|
324 |
+
"#dataset = raw_dataset[\"train\"].select(range(1000)).train_test_split(test_size=0.1, shuffle=True)\n",
|
325 |
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
|
326 |
"\n",
|
327 |
"# 2. tokenize\n",
|
328 |
+
"# def tokenize_function(examples):\n",
|
329 |
+
"# return tokenizer(examples['text'][:100], truncation=True, padding='max_length', max_length=max_length)\n",
|
330 |
+
"\n",
|
331 |
+
"# 2. tokenize, 必须设置最大长度\n",
|
332 |
+
"#默认是100,设置成1000就行了。否则如果字符串长超过100,就是有bug,只生成1个unk了\n",
|
333 |
+
"tokenizer._tokenizer.model.max_input_chars_per_word = 10000\n",
|
334 |
"def tokenize_function(examples):\n",
|
335 |
" return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
|
336 |
"\n",
|
337 |
+
"\n",
|
338 |
"# 3. 对数据集应用分词函数\n",
|
339 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=False, remove_columns=['text'], num_proc=15) # 设置为你的 CPU 核心数或根据需要调整\n",
|
340 |
"\n",
|
341 |
"# 4. 创建一个数据收集器,用于动态填充和遮蔽,注意mlm=true\n",
|
342 |
"data_collator = DataCollatorForLanguageModeling(\n",
|
|
|
346 |
},
|
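To see what the `mlm=True` collator actually feeds the model, the sketch below (not in the notebook) collates two tokenized examples: about 15% of positions are selected, most of them replaced by `[MASK]` (a few by random tokens or left unchanged), and `labels` keeps the original ids only at the selected positions, with -100 everywhere else so that untouched tokens do not contribute to the loss.

```python
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
print(batch["input_ids"].shape)                                      # (2, 128)
print(int((batch["input_ids"] == tokenizer.mask_token_id).sum()), "positions replaced by [MASK]")
print(int((batch["labels"] != -100).sum()), "positions scored by the MLM loss")
```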
347 |
{
|
348 |
"cell_type": "code",
|
349 |
+
"execution_count": 38,
|
350 |
+
"id": "9345610a-631e-49bb-a10c-bc0646a694fa",
|
351 |
+
"metadata": {},
|
352 |
+
"outputs": [
|
353 |
+
{
|
354 |
+
"data": {
|
355 |
+
"text/plain": [
|
356 |
+
"{'text': 'GAATCATATTTCTCTTAGTAATGCTTTCTATTCAAAATATTAAATGGAAATTTTAAAAAGAATTTTAGAGCTAATTCTAAACAATCTCTTTTACATTCATTCAACTGCCATTTTACTTTTCTGAAGTTGCTACCTTATTTTCTCTACCAAATATCATTATTTTAATTCATTGATTACATACACAGCTAGTTCTAAAGTTAAAAAAAGTACTTACTAAGCACTTATTAAACAATAAAATATGTTATTCACATTTGAAGCATAAGTAACCAATAAGTAACCAATTGTAACCAATGGTTACGTTAGCATAAATAGGATAAATATTTTAATGTATTAGCATTAGTTTTCTGATAGCACCCTGATCTGCTTGCACAAGAACCCAGGGACGGTTTTACTAGTACGCCAAATAAGCTGCAGCCAAGGGCTCTTATAATATTTAGAGGAACCTATTTTAAAACATGGAACTTTTTTCTTTAAAGTTCTATAAAATTTGATTATTTATGAGTGATAAAGGCCTTAAAAATTTATTCTGCTTTGGCTCCCTGAAATGTTACAAATGTTTTAGGTACTAAAGTGTTATTTATATGAATGCCATATACACTCATAAATTATAACTAAGATTGACTAAATAGCTAATGAATTCCCACAATTTGCCATAAAACATTAACCGGCTTGCCTTACCGTACCAAGTTTAAACTATTTAAAGTTAAAAATTTGGCTTGGGCTAAAGTCACCCTTAGTGCCAAAGTATTATTAATGTGAATCAGCATTTTTTGTTTTGAATCGAATCTTGAATTATAGCAGGTCCTGATTCGCGTTTAAATCATATTTCAAATCAATGTTTTTAAGTCTAAATTTAAGCTTTTTTTGTCTTGTTTTTTTGTCCTGTTTTTTTTTGTCCTGTTTTTGGCCCTGTGAATCAGCATTTTTTGTTTTGAATCGAATCTTGAATCGAAGTAGGTCTTGATTCGCGTTTTTAATCACATTTCAAATCAAAATTTTT'}"
|
357 |
+
]
|
358 |
+
},
|
359 |
+
"execution_count": 38,
|
360 |
+
"metadata": {},
|
361 |
+
"output_type": "execute_result"
|
362 |
+
}
|
363 |
+
],
|
364 |
+
"source": [
|
365 |
+
"dataset[\"train\"][0]"
|
366 |
+
]
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"cell_type": "code",
|
370 |
+
"execution_count": 24,
|
371 |
+
"id": "8dfc0ff6-2b11-4020-98aa-c8b2b83b9bbc",
|
372 |
+
"metadata": {},
|
373 |
+
"outputs": [
|
374 |
+
{
|
375 |
+
"data": {
|
376 |
+
"text/plain": [
|
377 |
+
"['GAA',\n",
|
378 |
+
" '##TCATATT',\n",
|
379 |
+
" '##TCTCTTA',\n",
|
380 |
+
" '##GTAATG',\n",
|
381 |
+
" '##CTTTCTATT',\n",
|
382 |
+
" '##CAAAATATTA',\n",
|
383 |
+
" '##AA',\n",
|
384 |
+
" '##TGGAA',\n",
|
385 |
+
" '##A',\n",
|
386 |
+
" '##TTTTAAAAA',\n",
|
387 |
+
" '##GAATTTTA',\n",
|
388 |
+
" '##GAGCTAA',\n",
|
389 |
+
" '##TT',\n",
|
390 |
+
" '##CTAAACAA',\n",
|
391 |
+
" '##TCTCTTTTA',\n",
|
392 |
+
" '##CATT',\n",
|
393 |
+
" '##CAT']"
|
394 |
+
]
|
395 |
+
},
|
396 |
+
"execution_count": 24,
|
397 |
+
"metadata": {},
|
398 |
+
"output_type": "execute_result"
|
399 |
+
}
|
400 |
+
],
|
401 |
+
"source": [
|
402 |
+
"tokenizer.tokenize(dataset[\"train\"][0][\"text\"][:100])"
|
403 |
+
]
|
404 |
+
},
|
405 |
+
{
|
406 |
+
"cell_type": "code",
|
407 |
+
"execution_count": 39,
|
408 |
"id": "604491f9-2ee7-4722-aad6-02e98457b5ee",
|
409 |
"metadata": {},
|
410 |
"outputs": [],
|
411 |
"source": [
|
412 |
"run_path = \"bert_run\"\n",
|
413 |
+
"train_epoches = 200\n",
|
414 |
"batch_size = 10\n",
|
415 |
"\n",
|
416 |
"\n",
|
|
|
437 |
},
|
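The cell that builds the `TrainingArguments` and the `Trainer` is collapsed in this diff. Below is a minimal sketch of how masked-language-model pretraining is usually wired up from the objects defined above (`run_path`, `train_epoches`, `batch_size`, `model`, `tokenized_datasets`, `data_collator`); `logging_steps=500` matches the loss table printed further down, while the other argument values are assumptions and may differ from the notebook.

```python
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=run_path,
    num_train_epochs=train_epoches,
    per_device_train_batch_size=batch_size,
    logging_steps=500,
    save_steps=2000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,          # applies the dynamic [MASK] corruption per batch
)
```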
438 |
{
|
439 |
"cell_type": "code",
|
440 |
+
"execution_count": 26,
|
441 |
"id": "d91a8bfb-f3ff-4031-a0d7-ebedc200d65a",
|
442 |
"metadata": {},
|
443 |
+
"outputs": [
|
444 |
+
{
|
445 |
+
"data": {
|
446 |
+
"text/html": [
|
447 |
+
"\n",
|
448 |
+
" <div>\n",
|
449 |
+
" \n",
|
450 |
+
" <progress value='18000' max='18000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
451 |
+
" [18000/18000 13:45, Epoch 200/200]\n",
|
452 |
+
" </div>\n",
|
453 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
454 |
+
" <thead>\n",
|
455 |
+
" <tr style=\"text-align: left;\">\n",
|
456 |
+
" <th>Step</th>\n",
|
457 |
+
" <th>Training Loss</th>\n",
|
458 |
+
" </tr>\n",
|
459 |
+
" </thead>\n",
|
460 |
+
" <tbody>\n",
|
461 |
+
" <tr>\n",
|
462 |
+
" <td>500</td>\n",
|
463 |
+
" <td>9.029000</td>\n",
|
464 |
+
" </tr>\n",
|
465 |
+
" <tr>\n",
|
466 |
+
" <td>1000</td>\n",
|
467 |
+
" <td>8.534200</td>\n",
|
468 |
+
" </tr>\n",
|
469 |
+
" <tr>\n",
|
470 |
+
" <td>1500</td>\n",
|
471 |
+
" <td>8.344000</td>\n",
|
472 |
+
" </tr>\n",
|
473 |
+
" <tr>\n",
|
474 |
+
" <td>2000</td>\n",
|
475 |
+
" <td>8.243700</td>\n",
|
476 |
+
" </tr>\n",
|
477 |
+
" <tr>\n",
|
478 |
+
" <td>2500</td>\n",
|
479 |
+
" <td>8.190700</td>\n",
|
480 |
+
" </tr>\n",
|
481 |
+
" <tr>\n",
|
482 |
+
" <td>3000</td>\n",
|
483 |
+
" <td>8.172000</td>\n",
|
484 |
+
" </tr>\n",
|
485 |
+
" <tr>\n",
|
486 |
+
" <td>3500</td>\n",
|
487 |
+
" <td>8.167900</td>\n",
|
488 |
+
" </tr>\n",
|
489 |
+
" <tr>\n",
|
490 |
+
" <td>4000</td>\n",
|
491 |
+
" <td>8.123100</td>\n",
|
492 |
+
" </tr>\n",
|
493 |
+
" <tr>\n",
|
494 |
+
" <td>4500</td>\n",
|
495 |
+
" <td>8.081900</td>\n",
|
496 |
+
" </tr>\n",
|
497 |
+
" <tr>\n",
|
498 |
+
" <td>5000</td>\n",
|
499 |
+
" <td>8.115100</td>\n",
|
500 |
+
" </tr>\n",
|
501 |
+
" <tr>\n",
|
502 |
+
" <td>5500</td>\n",
|
503 |
+
" <td>8.094800</td>\n",
|
504 |
+
" </tr>\n",
|
505 |
+
" <tr>\n",
|
506 |
+
" <td>6000</td>\n",
|
507 |
+
" <td>8.088200</td>\n",
|
508 |
+
" </tr>\n",
|
509 |
+
" <tr>\n",
|
510 |
+
" <td>6500</td>\n",
|
511 |
+
" <td>8.089600</td>\n",
|
512 |
+
" </tr>\n",
|
513 |
+
" <tr>\n",
|
514 |
+
" <td>7000</td>\n",
|
515 |
+
" <td>8.068900</td>\n",
|
516 |
+
" </tr>\n",
|
517 |
+
" <tr>\n",
|
518 |
+
" <td>7500</td>\n",
|
519 |
+
" <td>8.067000</td>\n",
|
520 |
+
" </tr>\n",
|
521 |
+
" <tr>\n",
|
522 |
+
" <td>8000</td>\n",
|
523 |
+
" <td>8.066400</td>\n",
|
524 |
+
" </tr>\n",
|
525 |
+
" <tr>\n",
|
526 |
+
" <td>8500</td>\n",
|
527 |
+
" <td>8.036600</td>\n",
|
528 |
+
" </tr>\n",
|
529 |
+
" <tr>\n",
|
530 |
+
" <td>9000</td>\n",
|
531 |
+
" <td>8.057600</td>\n",
|
532 |
+
" </tr>\n",
|
533 |
+
" <tr>\n",
|
534 |
+
" <td>9500</td>\n",
|
535 |
+
" <td>8.057800</td>\n",
|
536 |
+
" </tr>\n",
|
537 |
+
" <tr>\n",
|
538 |
+
" <td>10000</td>\n",
|
539 |
+
" <td>8.069700</td>\n",
|
540 |
+
" </tr>\n",
|
541 |
+
" <tr>\n",
|
542 |
+
" <td>10500</td>\n",
|
543 |
+
" <td>8.032500</td>\n",
|
544 |
+
" </tr>\n",
|
545 |
+
" <tr>\n",
|
546 |
+
" <td>11000</td>\n",
|
547 |
+
" <td>8.042600</td>\n",
|
548 |
+
" </tr>\n",
|
549 |
+
" <tr>\n",
|
550 |
+
" <td>11500</td>\n",
|
551 |
+
" <td>8.037500</td>\n",
|
552 |
+
" </tr>\n",
|
553 |
+
" <tr>\n",
|
554 |
+
" <td>12000</td>\n",
|
555 |
+
" <td>8.068900</td>\n",
|
556 |
+
" </tr>\n",
|
557 |
+
" <tr>\n",
|
558 |
+
" <td>12500</td>\n",
|
559 |
+
" <td>8.047800</td>\n",
|
560 |
+
" </tr>\n",
|
561 |
+
" <tr>\n",
|
562 |
+
" <td>13000</td>\n",
|
563 |
+
" <td>8.055800</td>\n",
|
564 |
+
" </tr>\n",
|
565 |
+
" <tr>\n",
|
566 |
+
" <td>13500</td>\n",
|
567 |
+
" <td>8.050900</td>\n",
|
568 |
+
" </tr>\n",
|
569 |
+
" <tr>\n",
|
570 |
+
" <td>14000</td>\n",
|
571 |
+
" <td>8.054800</td>\n",
|
572 |
+
" </tr>\n",
|
573 |
+
" <tr>\n",
|
574 |
+
" <td>14500</td>\n",
|
575 |
+
" <td>8.026000</td>\n",
|
576 |
+
" </tr>\n",
|
577 |
+
" <tr>\n",
|
578 |
+
" <td>15000</td>\n",
|
579 |
+
" <td>8.050300</td>\n",
|
580 |
+
" </tr>\n",
|
581 |
+
" <tr>\n",
|
582 |
+
" <td>15500</td>\n",
|
583 |
+
" <td>8.054800</td>\n",
|
584 |
+
" </tr>\n",
|
585 |
+
" <tr>\n",
|
586 |
+
" <td>16000</td>\n",
|
587 |
+
" <td>8.059600</td>\n",
|
588 |
+
" </tr>\n",
|
589 |
+
" <tr>\n",
|
590 |
+
" <td>16500</td>\n",
|
591 |
+
" <td>8.042800</td>\n",
|
592 |
+
" </tr>\n",
|
593 |
+
" <tr>\n",
|
594 |
+
" <td>17000</td>\n",
|
595 |
+
" <td>8.024000</td>\n",
|
596 |
+
" </tr>\n",
|
597 |
+
" <tr>\n",
|
598 |
+
" <td>17500</td>\n",
|
599 |
+
" <td>8.030000</td>\n",
|
600 |
+
" </tr>\n",
|
601 |
+
" <tr>\n",
|
602 |
+
" <td>18000</td>\n",
|
603 |
+
" <td>8.050600</td>\n",
|
604 |
+
" </tr>\n",
|
605 |
+
" </tbody>\n",
|
606 |
+
"</table><p>"
|
607 |
+
],
|
608 |
+
"text/plain": [
|
609 |
+
"<IPython.core.display.HTML object>"
|
610 |
+
]
|
611 |
+
},
|
612 |
+
"metadata": {},
|
613 |
+
"output_type": "display_data"
|
614 |
+
}
|
615 |
+
],
|
616 |
"source": [
|
617 |
"trainer.train()\n",
|
618 |
"trainer.save_model(\"dna_bert_v0\")"
|
|
|
620 |
},
|
621 |
{
|
622 |
"cell_type": "code",
|
623 |
+
"execution_count": 28,
|
624 |
+
"id": "438f877b-63ca-473f-aa34-2a3291a52c18",
|
625 |
+
"metadata": {},
|
626 |
+
"outputs": [
|
627 |
+
{
|
628 |
+
"name": "stdout",
|
629 |
+
"output_type": "stream",
|
630 |
+
"text": [
|
631 |
+
"Perplexity: 145488.27\n"
|
632 |
+
]
|
633 |
+
}
|
634 |
+
],
|
635 |
+
"source": [
|
636 |
+
"import math\n",
|
637 |
+
"eval_results = trainer.evaluate()\n",
|
638 |
+
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
|
639 |
+
]
|
640 |
+
},
|
641 |
+
{
|
642 |
+
"cell_type": "code",
|
643 |
+
"execution_count": 29,
|
644 |
"id": "fc4ad6ad-6433-471f-8510-1ae46558d4ce",
|
645 |
"metadata": {},
|
646 |
"outputs": [],
|
|
|
648 |
"#upload model\n",
|
649 |
"#model.push_to_hub(\"dna_bert_v0\", organization=\"dnagpt\", use_auth_token=\"hf_*******\")"
|
650 |
]
|
651 |
+
},
|
652 |
+
{
|
653 |
+
"cell_type": "code",
|
654 |
+
"execution_count": 30,
|
655 |
+
"id": "bb01748e-4835-4014-bcb5-360931b26c99",
|
656 |
+
"metadata": {},
|
657 |
+
"outputs": [
|
658 |
+
{
|
659 |
+
"name": "stderr",
|
660 |
+
"output_type": "stream",
|
661 |
+
"text": [
|
662 |
+
"Some weights of BertModel were not initialized from the model checkpoint at dna_bert_v0 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
|
663 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
664 |
+
]
|
665 |
+
},
|
666 |
+
{
|
667 |
+
"data": {
|
668 |
+
"text/plain": [
|
669 |
+
"BertModel(\n",
|
670 |
+
" (embeddings): BertEmbeddings(\n",
|
671 |
+
" (word_embeddings): Embedding(30000, 768, padding_idx=0)\n",
|
672 |
+
" (position_embeddings): Embedding(128, 768)\n",
|
673 |
+
" (token_type_embeddings): Embedding(2, 768)\n",
|
674 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
675 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
676 |
+
" )\n",
|
677 |
+
" (encoder): BertEncoder(\n",
|
678 |
+
" (layer): ModuleList(\n",
|
679 |
+
" (0-11): 12 x BertLayer(\n",
|
680 |
+
" (attention): BertAttention(\n",
|
681 |
+
" (self): BertSdpaSelfAttention(\n",
|
682 |
+
" (query): Linear(in_features=768, out_features=768, bias=True)\n",
|
683 |
+
" (key): Linear(in_features=768, out_features=768, bias=True)\n",
|
684 |
+
" (value): Linear(in_features=768, out_features=768, bias=True)\n",
|
685 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
686 |
+
" )\n",
|
687 |
+
" (output): BertSelfOutput(\n",
|
688 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
689 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
690 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
691 |
+
" )\n",
|
692 |
+
" )\n",
|
693 |
+
" (intermediate): BertIntermediate(\n",
|
694 |
+
" (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
|
695 |
+
" (intermediate_act_fn): GELUActivation()\n",
|
696 |
+
" )\n",
|
697 |
+
" (output): BertOutput(\n",
|
698 |
+
" (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
|
699 |
+
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
|
700 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
701 |
+
" )\n",
|
702 |
+
" )\n",
|
703 |
+
" )\n",
|
704 |
+
" )\n",
|
705 |
+
" (pooler): BertPooler(\n",
|
706 |
+
" (dense): Linear(in_features=768, out_features=768, bias=True)\n",
|
707 |
+
" (activation): Tanh()\n",
|
708 |
+
" )\n",
|
709 |
+
")"
|
710 |
+
]
|
711 |
+
},
|
712 |
+
"execution_count": 30,
|
713 |
+
"metadata": {},
|
714 |
+
"output_type": "execute_result"
|
715 |
+
}
|
716 |
+
],
|
717 |
+
"source": [
|
718 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
719 |
+
"import torch\n",
|
720 |
+
"model = AutoModel.from_pretrained('dna_bert_v0')\n",
|
721 |
+
"model"
|
722 |
+
]
|
723 |
+
},
|
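With the `BertModel` reloaded above, the encoder can serve as a DNA feature extractor. The sketch below assumes the WordPiece tokenizer saved earlier under `dna_wordpiece_dict`; it mean-pools the last hidden states into one 768-dimensional vector per sequence.

```python
tokenizer = AutoTokenizer.from_pretrained("dna_wordpiece_dict")

inputs = tokenizer("ATCGGATCGATCGGATCG", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Average over the sequence dimension to get a fixed-size embedding, e.g. torch.Size([1, 768]).
embedding = outputs.last_hidden_state.mean(dim=1)
print(embedding.shape)
```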
724 |
+
{
|
725 |
+
"cell_type": "code",
|
726 |
+
"execution_count": null,
|
727 |
+
"id": "894a6afa-070c-40af-b3f1-cd45bf541c53",
|
728 |
+
"metadata": {},
|
729 |
+
"outputs": [],
|
730 |
+
"source": []
|
731 |
}
|
732 |
],
|
733 |
"metadata": {
|
02-gpt2_bert/.ipynb_checkpoints/5-multi-seq-gpt-checkpoint.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
02-gpt2_bert/1-dna-bpe.ipynb
CHANGED
@@ -284,7 +284,7 @@
|
|
284 |
"id": "c24f10dc-1117-4493-9333-5ed6d898f44a",
|
285 |
"metadata": {},
|
286 |
"source": [
|
287 |
-
"###
|
288 |
"\n",
|
289 |
"以上方法展示了如何对 DNA 和蛋白质序列进行“分词”,以提取有用的特征。选择哪种方法取决于具体的任务需求和数据特性。对于简单的分类或回归任务,K-mer 分解或滑动窗口可能是足够的;而对于更复杂的任务,如序列标注或结构预测,基于词汇表的方法或嵌入表示可能会提供更好的性能。\n",
|
290 |
"\n",
|
|
|
284 |
"id": "c24f10dc-1117-4493-9333-5ed6d898f44a",
|
285 |
"metadata": {},
|
286 |
"source": [
|
287 |
+
"### **训练DNA BPE分词器**\n",
|
288 |
"\n",
|
289 |
"以上方法展示了如何对 DNA 和蛋白质序列进行“分词”,以提取有用的特征。选择哪种方法取决于具体的任务需求和数据特性。对于简单的分类或回归任务,K-mer 分解或滑动窗口可能是足够的;而对于更复杂的任务,如序列标注或结构预测,基于词汇表的方法或嵌入表示可能会提供更好的性能。\n",
|
290 |
"\n",
|
02-gpt2_bert/2-dna-gpt.ipynb
CHANGED
@@ -49,9 +49,9 @@
|
|
49 |
"\n",
|
50 |
"### 历史背景\n",
|
51 |
"\n",
|
52 |
-
"- **发布日期**:GPT-
|
53 |
" \n",
|
54 |
-
"- **开发动机**:GPT-2
|
55 |
"\n",
|
56 |
"- **伦理考虑**:由于 GPT-2 的强大生成能力,OpenAI 初始时对模型的发布采取了谨慎的态度,担心其可能被滥用(例如用于生成假新闻或恶意内容)。因此,他们选择了分阶段发布,并进行了广泛的伦理讨论和研究。\n",
|
57 |
"\n",
|
@@ -73,6 +73,24 @@
|
|
73 |
{
|
74 |
"cell_type": "code",
|
75 |
"execution_count": 1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
"id": "70581590-096f-45f8-b13b-b84e88615849",
|
77 |
"metadata": {},
|
78 |
"outputs": [],
|
@@ -96,7 +114,7 @@
|
|
96 |
},
|
97 |
{
|
98 |
"cell_type": "code",
|
99 |
-
"execution_count":
|
100 |
"id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
|
101 |
"metadata": {},
|
102 |
"outputs": [],
|
@@ -117,7 +135,7 @@
|
|
117 |
},
|
118 |
{
|
119 |
"cell_type": "code",
|
120 |
-
"execution_count":
|
121 |
"id": "87435829-f522-4820-a51d-11fa4afee6d7",
|
122 |
"metadata": {},
|
123 |
"outputs": [],
|
@@ -136,58 +154,53 @@
|
|
136 |
]
|
137 |
},
|
138 |
{
|
139 |
-
"cell_type": "
|
140 |
-
"
|
141 |
-
"id": "0a0adfdd-4be9-4027-a12d-3bf848be3012",
|
142 |
"metadata": {},
|
143 |
-
"outputs": [],
|
144 |
"source": [
|
145 |
-
"
|
|
|
|
|
|
|
|
|
146 |
]
|
147 |
},
|
148 |
{
|
149 |
"cell_type": "code",
|
150 |
-
"execution_count":
|
151 |
"id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
|
152 |
"metadata": {},
|
153 |
"outputs": [
|
154 |
{
|
155 |
"data": {
|
156 |
"application/vnd.jupyter.widget-view+json": {
|
157 |
-
"model_id": "
|
158 |
"version_major": 2,
|
159 |
"version_minor": 0
|
160 |
},
|
161 |
"text/plain": [
|
162 |
-
"
|
163 |
]
|
164 |
},
|
165 |
"metadata": {},
|
166 |
"output_type": "display_data"
|
167 |
},
|
168 |
{
|
169 |
-
"
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
},
|
175 |
-
"text/plain": [
|
176 |
-
"Map (num_proc=15): 0%| | 0/971635 [00:00<?, ? examples/s]"
|
177 |
-
]
|
178 |
-
},
|
179 |
-
"metadata": {},
|
180 |
-
"output_type": "display_data"
|
181 |
},
|
182 |
{
|
183 |
"data": {
|
184 |
"application/vnd.jupyter.widget-view+json": {
|
185 |
-
"model_id": "
|
186 |
"version_major": 2,
|
187 |
"version_minor": 0
|
188 |
},
|
189 |
"text/plain": [
|
190 |
-
"Map (num_proc=
|
191 |
]
|
192 |
},
|
193 |
"metadata": {},
|
@@ -197,6 +210,7 @@
|
|
197 |
"source": [
|
198 |
"# 1. load dna dataset\n",
|
199 |
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
|
|
|
200 |
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
|
201 |
"\n",
|
202 |
"# 2. tokenize\n",
|
@@ -212,6 +226,174 @@
|
|
212 |
")"
|
213 |
]
|
214 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
{
|
216 |
"cell_type": "code",
|
217 |
"execution_count": 5,
|
@@ -4656,10 +4838,308 @@
|
|
4656 |
},
|
4657 |
{
|
4658 |
"cell_type": "code",
|
4659 |
-
"execution_count":
|
4660 |
"id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
|
4661 |
"metadata": {},
|
4662 |
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4663 |
"source": []
|
4664 |
}
|
4665 |
],
|
|
|
49 |
"\n",
|
50 |
"### 历史背景\n",
|
51 |
"\n",
|
52 |
+
"- **发布日期**:GPT(Generative Pre-trained Transformer)的第一个版本,即 GPT-1,是在 2018 年由 OpenAI 发布的。具体来说,关于 GPT-1 的研究论文《Improving Language Understanding by Generative Pre-Training》在 2018 年 6 月发布。\n",
|
53 |
" \n",
|
54 |
+
"- **开发动机**:GPT-2 2019年发表,是在 GPT-1 的基础上进行的重大改进。它引入了更多的参数和更大的训练数据集,显著提升了模型的能力。此外,GPT-2 还展示了强大的文本生成能力,甚至能够生成逼真的文章段落,这引发了关于 AI 伦理和社会影响的广泛讨论。\n",
|
55 |
"\n",
|
56 |
"- **伦理考虑**:由于 GPT-2 的强大生成能力,OpenAI 初始时对模型的发布采取了谨慎的态度,担心其可能被滥用(例如用于生成假新闻或恶意内容)。因此,他们选择了分阶段发布,并进行了广泛的伦理讨论和研究。\n",
|
57 |
"\n",
|
|
|
73 |
{
|
74 |
"cell_type": "code",
|
75 |
"execution_count": 1,
|
76 |
+
"id": "83af3495-b1fd-4ea1-84d7-9224b7094c0f",
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [],
|
79 |
+
"source": [
|
80 |
+
"import subprocess\n",
|
81 |
+
"import os\n",
|
82 |
+
"# 设置环境变量, autodl一般区域\n",
|
83 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
84 |
+
"output = result.stdout\n",
|
85 |
+
"for line in output.splitlines():\n",
|
86 |
+
" if '=' in line:\n",
|
87 |
+
" var, value = line.split('=', 1)\n",
|
88 |
+
" os.environ[var] = value"
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "code",
|
93 |
+
"execution_count": 2,
|
94 |
"id": "70581590-096f-45f8-b13b-b84e88615849",
|
95 |
"metadata": {},
|
96 |
"outputs": [],
|
|
|
114 |
},
|
115 |
{
|
116 |
"cell_type": "code",
|
117 |
+
"execution_count": 6,
|
118 |
"id": "34bd2710-fc99-4eda-9364-343f62f56b1e",
|
119 |
"metadata": {},
|
120 |
"outputs": [],
|
|
|
135 |
},
|
136 |
{
|
137 |
"cell_type": "code",
|
138 |
+
"execution_count": 7,
|
139 |
"id": "87435829-f522-4820-a51d-11fa4afee6d7",
|
140 |
"metadata": {},
|
141 |
"outputs": [],
|
|
|
154 |
]
|
155 |
},
|
156 |
{
|
157 |
+
"cell_type": "markdown",
|
158 |
+
"id": "05875e2f-32e7-485d-9399-99dc1e4bf71f",
|
|
|
159 |
"metadata": {},
|
|
|
160 |
"source": [
|
161 |
+
"## 训练数据\n",
|
162 |
+
"\n",
|
163 |
+
"接着是训练数据集,最重要的是构建模型的输入和输出。\n",
|
164 |
+
"\n",
|
165 |
+
"这里使用DataCollatorForLanguageModeling ,它是专为语言建模而设计(顾名思义)。除了堆叠和填充批次,它还负责创建语言模型标签——在因果语言建模中,输入也用作标签(只是移动了一个元素),并且这个数据整理器在训练期间即时创建它们,所以我们不需要复制 input_ids。"
|
166 |
]
|
167 |
},
|
168 |
{
|
169 |
"cell_type": "code",
|
170 |
+
"execution_count": 9,
|
171 |
"id": "6a6bbb6d-8f7d-4652-8961-bffc0466beaf",
|
172 |
"metadata": {},
|
173 |
"outputs": [
|
174 |
{
|
175 |
"data": {
|
176 |
"application/vnd.jupyter.widget-view+json": {
|
177 |
+
"model_id": "3db6964a82794db7ac007c7aa513ad33",
|
178 |
"version_major": 2,
|
179 |
"version_minor": 0
|
180 |
},
|
181 |
"text/plain": [
|
182 |
+
"Map (num_proc=15): 0%| | 0/90 [00:00<?, ? examples/s]"
|
183 |
]
|
184 |
},
|
185 |
"metadata": {},
|
186 |
"output_type": "display_data"
|
187 |
},
|
188 |
{
|
189 |
+
"name": "stderr",
|
190 |
+
"output_type": "stream",
|
191 |
+
"text": [
|
192 |
+
"num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.\n"
|
193 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
194 |
},
|
195 |
{
|
196 |
"data": {
|
197 |
"application/vnd.jupyter.widget-view+json": {
|
198 |
+
"model_id": "ba2c0d0e766949c79e4db6e6bd881f06",
|
199 |
"version_major": 2,
|
200 |
"version_minor": 0
|
201 |
},
|
202 |
"text/plain": [
|
203 |
+
"Map (num_proc=10): 0%| | 0/10 [00:00<?, ? examples/s]"
|
204 |
]
|
205 |
},
|
206 |
"metadata": {},
|
|
|
210 |
"source": [
|
211 |
"# 1. load dna dataset\n",
|
212 |
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
|
213 |
+
"#dataset = raw_dataset[\"train\"].select(range(100)).train_test_split(test_size=0.1, shuffle=True)\n",
|
214 |
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
|
215 |
"\n",
|
216 |
"# 2. tokenize\n",
|
|
|
226 |
")"
|
227 |
]
|
228 |
},
|
229 |
+
{
|
230 |
+
"cell_type": "code",
|
231 |
+
"execution_count": 10,
|
232 |
+
"id": "2eb1ff7a-f733-404b-a6ed-da82a677da3f",
|
233 |
+
"metadata": {},
|
234 |
+
"outputs": [
|
235 |
+
{
|
236 |
+
"name": "stdout",
|
237 |
+
"output_type": "stream",
|
238 |
+
"text": [
|
239 |
+
"[{'input_ids': [20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978, 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412, 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65, 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84, 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137, 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419, 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468, 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65, 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138, 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003, 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772, 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079, 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269, 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614, 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}]\n"
|
240 |
+
]
|
241 |
+
}
|
242 |
+
],
|
243 |
+
"source": [
|
244 |
+
"samples = [tokenized_datasets[\"train\"][0]]\n",
|
245 |
+
"print(samples)"
|
246 |
+
]
|
247 |
+
},
|
248 |
+
{
|
249 |
+
"cell_type": "code",
|
250 |
+
"execution_count": 11,
|
251 |
+
"id": "260283a4-5ceb-4ef6-be1b-a4765fb74b20",
|
252 |
+
"metadata": {},
|
253 |
+
"outputs": [
|
254 |
+
{
|
255 |
+
"name": "stdout",
|
256 |
+
"output_type": "stream",
|
257 |
+
"text": [
|
258 |
+
"{'input_ids': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
|
259 |
+
" 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
|
260 |
+
" 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
|
261 |
+
" 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
|
262 |
+
" 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
|
263 |
+
" 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
|
264 |
+
" 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
|
265 |
+
" 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
|
266 |
+
" 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
|
267 |
+
" 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
|
268 |
+
" 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
|
269 |
+
" 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
|
270 |
+
" 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
|
271 |
+
" 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
|
272 |
+
" 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, 0,\n",
|
273 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
274 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
275 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
276 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
277 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
278 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
279 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
280 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
281 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
282 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
283 |
+
" 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
284 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
285 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
286 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
287 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
288 |
+
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
289 |
+
" 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
290 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
291 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
292 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
293 |
+
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([[ 20, 1378, 237, 486, 4849, 14831, 21023, 8523, 3566, 25978,\n",
|
294 |
+
" 16916, 1923, 911, 67, 637, 3261, 29515, 703, 181, 4412,\n",
|
295 |
+
" 3663, 8540, 3932, 19218, 3968, 126, 3289, 14553, 198, 65,\n",
|
296 |
+
" 24267, 2733, 1232, 32, 514, 919, 35, 4068, 172, 84,\n",
|
297 |
+
" 7767, 1062, 527, 8291, 2514, 28, 1429, 542, 864, 137,\n",
|
298 |
+
" 2492, 1922, 1744, 637, 3239, 282, 333, 1722, 120, 4419,\n",
|
299 |
+
" 39, 10654, 156, 1816, 1816, 1816, 5469, 4208, 1179, 468,\n",
|
300 |
+
" 112, 4596, 533, 188, 1959, 47, 1400, 64, 1986, 65,\n",
|
301 |
+
" 2086, 834, 16609, 1468, 414, 34, 682, 560, 49, 3138,\n",
|
302 |
+
" 14211, 2879, 16844, 122, 671, 262, 118, 1049, 347, 1003,\n",
|
303 |
+
" 113, 288, 1168, 11881, 13826, 297, 90, 189, 2166, 25772,\n",
|
304 |
+
" 5951, 27687, 20193, 205, 640, 50, 1082, 2015, 210, 7079,\n",
|
305 |
+
" 2295, 17153, 10491, 5749, 199, 108, 25861, 372, 8448, 269,\n",
|
306 |
+
" 103, 220, 243, 1150, 315, 823, 152, 9798, 229, 614,\n",
|
307 |
+
" 85, 2043, 48, 234, 5146, 524, 48, 468, 12858, -100,\n",
|
308 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
309 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
310 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
311 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
312 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
313 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
314 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
315 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
316 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
317 |
+
" -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,\n",
|
318 |
+
" -100, -100, -100, -100, -100, -100]])}\n"
|
319 |
+
]
|
320 |
+
}
|
321 |
+
],
|
322 |
+
"source": [
|
323 |
+
"io_data = data_collator(samples)\n",
|
324 |
+
"print(io_data)"
|
325 |
+
]
|
326 |
+
},
|
327 |
+
{
|
328 |
+
"cell_type": "markdown",
|
329 |
+
"id": "80a84504-eaa3-43a9-ba13-3a2b73942c59",
|
330 |
+
"metadata": {},
|
331 |
+
"source": [
|
332 |
+
"这段代码展示了如何加载 DNA 数据集、对其进行分词处理,并为语言模型训练准备数据。让我们逐段解析代码,并特别关注 `DataCollatorForLanguageModeling` 函数。\n",
|
333 |
+
"\n",
|
334 |
+
"### 1. 加载 DNA 数据集\n",
|
335 |
+
"\n",
|
336 |
+
"```python\n",
|
337 |
+
"raw_dataset = load_dataset('text', data_files=\"../01-data_env/data/dna_1g.txt\")\n",
|
338 |
+
"dataset = raw_dataset[\"train\"].train_test_split(test_size=0.1, shuffle=True)\n",
|
339 |
+
"```\n",
|
340 |
+
"\n",
|
341 |
+
"- **`load_dataset`**:使用 Hugging Face 的 `datasets` 库加载文本文件作为数据集。这里指定的是一个本地的 DNA 序列文本文件 `dna_1g.txt`。\n",
|
342 |
+
"- **`train_test_split`**:将原始数据集分割为训练集和测试集,其中测试集占 10%(`test_size=0.1`),并随机打乱数据(`shuffle=True`)。\n",
|
343 |
+
"\n",
|
344 |
+
"### 2. 定义分词函数\n",
|
345 |
+
"\n",
|
346 |
+
"```python\n",
|
347 |
+
"def tokenize_function(examples):\n",
|
348 |
+
" return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
|
349 |
+
"```\n",
|
350 |
+
"\n",
|
351 |
+
"- **`tokenize_function`**:这是一个自定义的分词函数,用于对数据集中的每条记录进行分词处理。\n",
|
352 |
+
"- **参数解释**:\n",
|
353 |
+
" - `examples['text']`:获取数据集中每条记录的文本内容。\n",
|
354 |
+
" - `truncation=True`:确保所有输入序列被截断到 `max_length` 指定的最大长度。\n",
|
355 |
+
" - `padding='max_length'`:将所有输入序列填充到 `max_length` 指定的最大长度,以保证批次内所有序列具有相同的长度。\n",
|
356 |
+
" - `max_length`:指定最大序列长度,需要根据具体任务和模型要求设置。\n",
|
357 |
+
"\n",
|
358 |
+
"### 3. 对数据集应用分词函数\n",
|
359 |
+
"\n",
|
360 |
+
"```python\n",
|
361 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'], num_proc=15)\n",
|
362 |
+
"```\n",
|
363 |
+
"\n",
|
364 |
+
"- **`map`**:将 `tokenize_function` 应用到整个数据集上。`batched=True` 表示批量处理,可以显著提高处理速度。\n",
|
365 |
+
"- **`remove_columns=['text']`**:分词后不再需要原始文本列,因此将其移除。\n",
|
366 |
+
"- **`num_proc=15`**:指定使用的 CPU 核心数(或进程数),可以根据你的硬件资源调整。这有助于加速分词过程。\n",
|
367 |
+
"\n",
|
368 |
+
"### 4. 创建数据收集器\n",
|
369 |
+
"\n",
|
370 |
+
"```python\n",
|
371 |
+
"data_collator = DataCollatorForLanguageModeling(\n",
|
372 |
+
" tokenizer=tokenizer, mlm=False\n",
|
373 |
+
")\n",
|
374 |
+
"```\n",
|
375 |
+
"\n",
|
376 |
+
"#### `DataCollatorForLanguageModeling` 函数详解\n",
|
377 |
+
"\n",
|
378 |
+
"`DataCollatorForLanguageModeling` 是 Hugging Face 提供的一个工具,用于在训练语言模型时动态地处理批次数据。它主要用于两种任务:\n",
|
379 |
+
"\n",
|
380 |
+
"- **Masked Language Modeling (MLM)**:遮蔽某些 token 并预测它们,常用于预训练模型(如 BERT)。\n",
|
381 |
+
"- **Causal Language Modeling (CLM)**:基于前文预测下一个 token,适用于生成式模型(如 GPT 系列)。\n",
|
382 |
+
"\n",
|
383 |
+
"在这个例子中,`mlm=False` 表明我们正在处理因果语言建模(CLM),即每个 token 只能依赖于其前面的 token 进行预测。这对于像 GPT 这样的生成模型非常适用。\n",
|
384 |
+
"\n",
|
385 |
+
"- **`tokenizer=tokenizer`**:指定用于编码和解码的分词器对象。\n",
|
386 |
+
"- **`mlm=False`**:关闭 MLM 模式,因为我们不需要遮蔽任何 token。对于因果语言建模,模型会尝试根据之前的上下文预测下一个 token。"
|
387 |
+
]
|
388 |
+
},
|
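As the explanation above notes, with `mlm=False` the collator simply copies `input_ids` into `labels` (turning padding into -100), and the one-token shift is applied inside the model's loss computation. The short check below is not in the notebook; it just makes that visible on the first training example.

```python
import torch

batch = data_collator([tokenized_datasets["train"][0]])
# True: labels equal input_ids everywhere except at ignored padding positions.
print(bool(torch.all((batch["labels"] == batch["input_ids"]) | (batch["labels"] == -100))))
```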
389 |
+
{
|
390 |
+
"cell_type": "markdown",
|
391 |
+
"id": "3fbe9480-c394-4bab-bdee-e80f21e0259a",
|
392 |
+
"metadata": {},
|
393 |
+
"source": [
|
394 |
+
"### 开始训练"
|
395 |
+
]
|
396 |
+
},
|
397 |
{
|
398 |
"cell_type": "code",
|
399 |
"execution_count": 5,
|
|
|
4838 |
},
|
4839 |
{
|
4840 |
"cell_type": "code",
|
4841 |
+
"execution_count": 3,
|
4842 |
"id": "cbf27648-d758-45ac-8665-e3466cd2fb65",
|
4843 |
"metadata": {},
|
4844 |
"outputs": [],
|
4845 |
+
"source": [
|
4846 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dna_bpe_dict\")\n",
|
4847 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
4848 |
+
]
|
4849 |
+
},
|
4850 |
+
{
|
4851 |
+
"cell_type": "code",
|
4852 |
+
"execution_count": 5,
|
4853 |
+
"id": "76f7c636-20c0-47a1-83c1-72e5ee101c0f",
|
4854 |
+
"metadata": {},
|
4855 |
+
"outputs": [],
|
4856 |
+
"source": [
|
4857 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
4858 |
+
"model = AutoModel.from_pretrained('dna_gpt2_v0')"
|
4859 |
+
]
|
4860 |
+
},
|
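`AutoModel` returns the bare `GPT2Model`, which only produces hidden states; to sample new sequences from the checkpoint, the language-modeling head is needed. A sketch under that assumption, reusing the BPE tokenizer loaded from `dna_bpe_dict` above (the sampling parameters are illustrative):

```python
from transformers import GPT2LMHeadModel

lm_model = GPT2LMHeadModel.from_pretrained("dna_gpt2_v0")

inputs = tokenizer("ATCGGATCG", return_tensors="pt")
generated = lm_model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=True,
    top_k=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(generated[0]))
```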
4861 |
+
{
|
4862 |
+
"cell_type": "code",
|
4863 |
+
"execution_count": 6,
|
4864 |
+
"id": "c041ad1b-7fe4-4d00-a77e-8ab17f020600",
|
4865 |
+
"metadata": {},
|
4866 |
+
"outputs": [
|
4867 |
+
{
|
4868 |
+
"name": "stdout",
|
4869 |
+
"output_type": "stream",
|
4870 |
+
"text": [
|
4871 |
+
"[2024-12-30 20:29:16,315] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
4872 |
+
]
|
4873 |
+
},
|
4874 |
+
{
|
4875 |
+
"name": "stderr",
|
4876 |
+
"output_type": "stream",
|
4877 |
+
"text": [
|
4878 |
+
"/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
|
4879 |
+
"collect2: error: ld returned 1 exit status\n",
|
4880 |
+
"/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
4881 |
+
"/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
4882 |
+
"/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
4883 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
|
4884 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
|
4885 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
|
4886 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
|
4887 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
|
4888 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
|
4889 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
|
4890 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4891 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
|
4892 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
|
4893 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
4894 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
|
4895 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4896 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
|
4897 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
|
4898 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
4899 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4900 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
|
4901 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
|
4902 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4903 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
4904 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
4905 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
|
4906 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
|
4907 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
|
4908 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
|
4909 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
|
4910 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
|
4911 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
|
4912 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
4913 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
4914 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
|
4915 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
|
4916 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
|
4917 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
|
4918 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
|
4919 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
|
4920 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
|
4921 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
|
4922 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4923 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
|
4924 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
|
4925 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4926 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
|
4927 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
|
4928 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4929 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
|
4930 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
|
4931 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
|
4932 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
|
4933 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
|
4934 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
|
4935 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
4936 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
|
4937 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
|
4938 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
|
4939 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
|
4940 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
|
4941 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
|
4942 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4943 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
|
4944 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4945 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4946 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
|
4947 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4948 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
4949 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
4950 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
4951 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
|
4952 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
|
4953 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
|
4954 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
4955 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
4956 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
4957 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
|
4958 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
|
4959 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4960 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
|
4961 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
|
4962 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
|
4963 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
|
4964 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
|
4965 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
|
4966 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
|
4967 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
|
4968 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
4969 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
|
4970 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
|
4971 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
|
4972 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
|
4973 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4974 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
|
4975 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
|
4976 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
|
4977 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
4978 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
4979 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
|
4980 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
|
4981 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
|
4982 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
|
4983 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
|
4984 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
4985 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
|
4986 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
4987 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
|
4988 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4989 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
4990 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
4991 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
|
4992 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
4993 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
|
4994 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
4995 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
4996 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
|
4997 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
4998 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
|
4999 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
|
5000 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
|
5001 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
|
5002 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
5003 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
5004 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
5005 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
|
5006 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
|
5007 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
5008 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
|
5009 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
|
5010 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
5011 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
5012 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
5013 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
|
5014 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
5015 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
5016 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
|
5017 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
5018 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
|
5019 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
|
5020 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
|
5021 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
|
5022 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
|
5023 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
|
5024 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
|
5025 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
|
5026 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
|
5027 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
|
5028 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
|
5029 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
|
5030 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
|
5031 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
|
5032 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
5033 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
|
5034 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
|
5035 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
5036 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
|
5037 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
|
5038 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
|
5039 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
|
5040 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
|
5041 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
5042 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
|
5043 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
5044 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
5045 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
|
5046 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
|
5047 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
5048 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
|
5049 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
5050 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
5051 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
5052 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
|
5053 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
|
5054 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
|
5055 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
|
5056 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
|
5057 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
|
5058 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
5059 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
|
5060 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
5061 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
|
5062 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
5063 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
5064 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
|
5065 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
|
5066 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
|
5067 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
5068 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
|
5069 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
|
5070 |
+
"collect2: error: ld returned 1 exit status\n"
|
5071 |
+
]
|
5072 |
+
},
|
5073 |
+
{
|
5074 |
+
"data": {
|
5075 |
+
"application/vnd.jupyter.widget-view+json": {
|
5076 |
+
"model_id": "857d0b6286fb4eaaafcb8911cef664dc",
|
5077 |
+
"version_major": 2,
|
5078 |
+
"version_minor": 0
|
5079 |
+
},
|
5080 |
+
"text/plain": [
|
5081 |
+
"model.safetensors: 0%| | 0.00/436M [00:00<?, ?B/s]"
|
5082 |
+
]
|
5083 |
+
},
|
5084 |
+
"metadata": {},
|
5085 |
+
"output_type": "display_data"
|
5086 |
+
},
|
5087 |
+
{
|
5088 |
+
"data": {
|
5089 |
+
"text/plain": [
|
5090 |
+
"CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', commit_message='Upload model', commit_description='', oid='e7c5a5c59e28329114d3ab64cb7b585f198b1f5f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
|
5091 |
+
]
|
5092 |
+
},
|
5093 |
+
"execution_count": 6,
|
5094 |
+
"metadata": {},
|
5095 |
+
"output_type": "execute_result"
|
5096 |
+
}
|
5097 |
+
],
|
5098 |
+
"source": [
|
5099 |
+
"model.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_***\")"
|
5100 |
+
]
|
5101 |
+
},
|
5102 |
+
{
|
5103 |
+
"cell_type": "code",
|
5104 |
+
"execution_count": 7,
|
5105 |
+
"id": "8a28a45b-56ba-4328-8edf-4cd7ee9289c5",
|
5106 |
+
"metadata": {},
|
5107 |
+
"outputs": [
|
5108 |
+
{
|
5109 |
+
"data": {
|
5110 |
+
"application/vnd.jupyter.widget-view+json": {
|
5111 |
+
"model_id": "42c48d91578f41439d7b3ec26a6d566c",
|
5112 |
+
"version_major": 2,
|
5113 |
+
"version_minor": 0
|
5114 |
+
},
|
5115 |
+
"text/plain": [
|
5116 |
+
"README.md: 0%| | 0.00/5.17k [00:00<?, ?B/s]"
|
5117 |
+
]
|
5118 |
+
},
|
5119 |
+
"metadata": {},
|
5120 |
+
"output_type": "display_data"
|
5121 |
+
},
|
5122 |
+
{
|
5123 |
+
"data": {
|
5124 |
+
"text/plain": [
|
5125 |
+
"CommitInfo(commit_url='https://huggingface.co/dnagpt/dna_gpt2_v0/commit/16138639cb17307b84421e443a1c67f4fe188121', commit_message='Upload tokenizer', commit_description='', oid='16138639cb17307b84421e443a1c67f4fe188121', pr_url=None, repo_url=RepoUrl('https://huggingface.co/dnagpt/dna_gpt2_v0', endpoint='https://huggingface.co', repo_type='model', repo_id='dnagpt/dna_gpt2_v0'), pr_revision=None, pr_num=None)"
|
5126 |
+
]
|
5127 |
+
},
|
5128 |
+
"execution_count": 7,
|
5129 |
+
"metadata": {},
|
5130 |
+
"output_type": "execute_result"
|
5131 |
+
}
|
5132 |
+
],
|
5133 |
+
"source": [
|
5134 |
+
"tokenizer.push_to_hub(\"dnagpt/dna_gpt2_v0\", token=\"hf_**\")"
|
5135 |
+
]
|
5136 |
+
},
|
5137 |
+
{
|
5138 |
+
"cell_type": "code",
|
5139 |
+
"execution_count": null,
|
5140 |
+
"id": "ec5364cc-4386-4db8-a400-cd788657de84",
|
5141 |
+
"metadata": {},
|
5142 |
+
"outputs": [],
|
5143 |
"source": []
|
5144 |
}
|
5145 |
],
|
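The two cells above push the trained weights and the tokenizer to the `dnagpt/dna_gpt2_v0` repo. As a quick sanity check (not part of the notebook; the prompt and generation settings are illustrative assumptions), the published checkpoint can be pulled straight back from the Hub:

```python
# Hypothetical check: reload the model pushed above and generate a short continuation.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("dnagpt/dna_gpt2_v0")
model = GPT2LMHeadModel.from_pretrained("dnagpt/dna_gpt2_v0")

inputs = tokenizer("GCT ACT TGG AGC", return_tensors="pt")  # example DNA fragment (assumed input style)
outputs = model.generate(**inputs, max_new_tokens=20, do_sample=True,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0]))
```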
02-gpt2_bert/3-dna-bert.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
02-gpt2_bert/4-gene-feature.ipynb
CHANGED
@@ -153,7 +153,7 @@
|
|
153 |
},
|
154 |
{
|
155 |
"cell_type": "code",
|
156 |
-
"execution_count":
|
157 |
"id": "f1ca177c-a80f-48a1-b2f9-16c13b3350db",
|
158 |
"metadata": {},
|
159 |
"outputs": [
|
@@ -163,7 +163,7 @@
|
|
163 |
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
164 |
]
|
165 |
},
|
166 |
-
"execution_count":
|
167 |
"metadata": {},
|
168 |
"output_type": "execute_result"
|
169 |
}
|
@@ -193,10 +193,18 @@
|
|
193 |
},
|
194 |
{
|
195 |
"cell_type": "code",
|
196 |
-
"execution_count":
|
197 |
"id": "2295739c-e80a-47be-9400-88bfab4b0bb6",
|
198 |
"metadata": {},
|
199 |
"outputs": [
|
|
200 |
{
|
201 |
"data": {
|
202 |
"text/plain": [
|
@@ -208,7 +216,7 @@
|
|
208 |
"})"
|
209 |
]
|
210 |
},
|
211 |
-
"execution_count":
|
212 |
"metadata": {},
|
213 |
"output_type": "execute_result"
|
214 |
}
|
@@ -229,7 +237,7 @@
|
|
229 |
},
|
230 |
{
|
231 |
"cell_type": "code",
|
232 |
-
"execution_count":
|
233 |
"id": "9a47a1b1-21f2-4d71-801c-50f88e326ed3",
|
234 |
"metadata": {},
|
235 |
"outputs": [
|
@@ -240,7 +248,7 @@
|
|
240 |
" 'label': 0}"
|
241 |
]
|
242 |
},
|
243 |
-
"execution_count":
|
244 |
"metadata": {},
|
245 |
"output_type": "execute_result"
|
246 |
}
|
@@ -259,7 +267,7 @@
|
|
259 |
},
|
260 |
{
|
261 |
"cell_type": "code",
|
262 |
-
"execution_count":
|
263 |
"id": "4010d991-056a-43ce-8cca-30eeec8678f5",
|
264 |
"metadata": {},
|
265 |
"outputs": [],
|
@@ -267,198 +275,154 @@
|
|
267 |
"import numpy as np\n",
|
268 |
"from sklearn.model_selection import train_test_split\n",
|
269 |
"from sklearn.linear_model import LogisticRegression\n",
|
270 |
-
"from sklearn.datasets import load_iris\n",
|
271 |
"from sklearn.metrics import accuracy_score\n",
|
|
272 |
"\n",
|
|
273 |
"\n",
|
274 |
"def get_gpt2_feature(sequence):\n",
|
275 |
-
"
|
276 |
-
"
|
277 |
-
"
|
278 |
-
"
|
279 |
-
"
|
280 |
-
"
|
281 |
-
"\n",
|
282 |
"X = []\n",
|
283 |
"Y = []\n",
|
284 |
"\n",
|
285 |
-
"
|
|
|
286 |
" sequence = item[\"sequence\"]\n",
|
287 |
" label = item[\"label\"]\n",
|
288 |
" x_v = get_gpt2_feature(sequence)\n",
|
289 |
" y_v = label\n",
|
290 |
" X.append(x_v)\n",
|
291 |
-
" Y.append(y_v)"
|
292 |
]
|
293 |
},
|
294 |
{
|
295 |
"cell_type": "code",
|
296 |
-
"execution_count":
|
297 |
-
"id": "
|
298 |
"metadata": {},
|
299 |
-
"outputs": [
|
300 |
-
{
|
301 |
-
"data": {
|
302 |
-
"text/plain": [
|
303 |
-
"array([[5.1, 3.5, 1.4, 0.2],\n",
|
304 |
-
" [4.9, 3. , 1.4, 0.2],\n",
|
305 |
-
" [4.7, 3.2, 1.3, 0.2],\n",
|
306 |
-
" [4.6, 3.1, 1.5, 0.2],\n",
|
307 |
-
" [5. , 3.6, 1.4, 0.2],\n",
|
308 |
-
" [5.4, 3.9, 1.7, 0.4],\n",
|
309 |
-
" [4.6, 3.4, 1.4, 0.3],\n",
|
310 |
-
" [5. , 3.4, 1.5, 0.2],\n",
|
311 |
-
" [4.4, 2.9, 1.4, 0.2],\n",
|
312 |
-
" [4.9, 3.1, 1.5, 0.1],\n",
|
313 |
-
" [5.4, 3.7, 1.5, 0.2],\n",
|
314 |
-
" [4.8, 3.4, 1.6, 0.2],\n",
|
315 |
-
" [4.8, 3. , 1.4, 0.1],\n",
|
316 |
-
" [4.3, 3. , 1.1, 0.1],\n",
|
317 |
-
" [5.8, 4. , 1.2, 0.2],\n",
|
318 |
-
" [5.7, 4.4, 1.5, 0.4],\n",
|
319 |
-
" [5.4, 3.9, 1.3, 0.4],\n",
|
320 |
-
" [5.1, 3.5, 1.4, 0.3],\n",
|
321 |
-
" [5.7, 3.8, 1.7, 0.3],\n",
|
322 |
-
" [5.1, 3.8, 1.5, 0.3],\n",
|
323 |
-
" [5.4, 3.4, 1.7, 0.2],\n",
|
324 |
-
" [5.1, 3.7, 1.5, 0.4],\n",
|
325 |
-
" [4.6, 3.6, 1. , 0.2],\n",
|
326 |
-
" [5.1, 3.3, 1.7, 0.5],\n",
|
327 |
-
" [4.8, 3.4, 1.9, 0.2],\n",
|
328 |
-
" [5. , 3. , 1.6, 0.2],\n",
|
329 |
-
" [5. , 3.4, 1.6, 0.4],\n",
|
330 |
-
" [5.2, 3.5, 1.5, 0.2],\n",
|
331 |
-
" [5.2, 3.4, 1.4, 0.2],\n",
|
332 |
-
" [4.7, 3.2, 1.6, 0.2],\n",
|
333 |
-
" [4.8, 3.1, 1.6, 0.2],\n",
|
334 |
-
" [5.4, 3.4, 1.5, 0.4],\n",
|
335 |
-
" [5.2, 4.1, 1.5, 0.1],\n",
|
336 |
-
" [5.5, 4.2, 1.4, 0.2],\n",
|
337 |
-
" [4.9, 3.1, 1.5, 0.2],\n",
|
338 |
-
" [5. , 3.2, 1.2, 0.2],\n",
|
339 |
-
" [5.5, 3.5, 1.3, 0.2],\n",
|
340 |
-
" [4.9, 3.6, 1.4, 0.1],\n",
|
341 |
-
" [4.4, 3. , 1.3, 0.2],\n",
|
342 |
-
" [5.1, 3.4, 1.5, 0.2],\n",
|
343 |
-
" [5. , 3.5, 1.3, 0.3],\n",
|
344 |
-
" [4.5, 2.3, 1.3, 0.3],\n",
|
345 |
-
" [4.4, 3.2, 1.3, 0.2],\n",
|
346 |
-
" [5. , 3.5, 1.6, 0.6],\n",
|
347 |
-
" [5.1, 3.8, 1.9, 0.4],\n",
|
348 |
-
" [4.8, 3. , 1.4, 0.3],\n",
|
349 |
-
" [5.1, 3.8, 1.6, 0.2],\n",
|
350 |
-
" [4.6, 3.2, 1.4, 0.2],\n",
|
351 |
-
" [5.3, 3.7, 1.5, 0.2],\n",
|
352 |
-
" [5. , 3.3, 1.4, 0.2],\n",
|
353 |
-
" [7. , 3.2, 4.7, 1.4],\n",
|
354 |
-
" [6.4, 3.2, 4.5, 1.5],\n",
|
355 |
-
" [6.9, 3.1, 4.9, 1.5],\n",
|
356 |
-
" [5.5, 2.3, 4. , 1.3],\n",
|
357 |
-
" [6.5, 2.8, 4.6, 1.5],\n",
|
358 |
-
" [5.7, 2.8, 4.5, 1.3],\n",
|
359 |
-
" [6.3, 3.3, 4.7, 1.6],\n",
|
360 |
-
" [4.9, 2.4, 3.3, 1. ],\n",
|
361 |
-
" [6.6, 2.9, 4.6, 1.3],\n",
|
362 |
-
" [5.2, 2.7, 3.9, 1.4],\n",
|
363 |
-
" [5. , 2. , 3.5, 1. ],\n",
|
364 |
-
" [5.9, 3. , 4.2, 1.5],\n",
|
365 |
-
" [6. , 2.2, 4. , 1. ],\n",
|
366 |
-
" [6.1, 2.9, 4.7, 1.4],\n",
|
367 |
-
" [5.6, 2.9, 3.6, 1.3],\n",
|
368 |
-
" [6.7, 3.1, 4.4, 1.4],\n",
|
369 |
-
" [5.6, 3. , 4.5, 1.5],\n",
|
370 |
-
" [5.8, 2.7, 4.1, 1. ],\n",
|
371 |
-
" [6.2, 2.2, 4.5, 1.5],\n",
|
372 |
-
" [5.6, 2.5, 3.9, 1.1],\n",
|
373 |
-
" [5.9, 3.2, 4.8, 1.8],\n",
|
374 |
-
" [6.1, 2.8, 4. , 1.3],\n",
|
375 |
-
" [6.3, 2.5, 4.9, 1.5],\n",
|
376 |
-
" [6.1, 2.8, 4.7, 1.2],\n",
|
377 |
-
" [6.4, 2.9, 4.3, 1.3],\n",
|
378 |
-
" [6.6, 3. , 4.4, 1.4],\n",
|
379 |
-
" [6.8, 2.8, 4.8, 1.4],\n",
|
380 |
-
" [6.7, 3. , 5. , 1.7],\n",
|
381 |
-
" [6. , 2.9, 4.5, 1.5],\n",
|
382 |
-
" [5.7, 2.6, 3.5, 1. ],\n",
|
383 |
-
" [5.5, 2.4, 3.8, 1.1],\n",
|
384 |
-
" [5.5, 2.4, 3.7, 1. ],\n",
|
385 |
-
" [5.8, 2.7, 3.9, 1.2],\n",
|
386 |
-
" [6. , 2.7, 5.1, 1.6],\n",
|
387 |
-
" [5.4, 3. , 4.5, 1.5],\n",
|
388 |
-
" [6. , 3.4, 4.5, 1.6],\n",
|
389 |
-
" [6.7, 3.1, 4.7, 1.5],\n",
|
390 |
-
" [6.3, 2.3, 4.4, 1.3],\n",
|
391 |
-
" [5.6, 3. , 4.1, 1.3],\n",
|
392 |
-
" [5.5, 2.5, 4. , 1.3],\n",
|
393 |
-
" [5.5, 2.6, 4.4, 1.2],\n",
|
394 |
-
" [6.1, 3. , 4.6, 1.4],\n",
|
395 |
-
" [5.8, 2.6, 4. , 1.2],\n",
|
396 |
-
" [5. , 2.3, 3.3, 1. ],\n",
|
397 |
-
" [5.6, 2.7, 4.2, 1.3],\n",
|
398 |
-
" [5.7, 3. , 4.2, 1.2],\n",
|
399 |
-
" [5.7, 2.9, 4.2, 1.3],\n",
|
400 |
-
" [6.2, 2.9, 4.3, 1.3],\n",
|
401 |
-
" [5.1, 2.5, 3. , 1.1],\n",
|
402 |
-
" [5.7, 2.8, 4.1, 1.3]])"
|
403 |
-
]
|
404 |
-
},
|
405 |
-
"execution_count": 49,
|
406 |
-
"metadata": {},
|
407 |
-
"output_type": "execute_result"
|
408 |
-
}
|
409 |
-
],
|
410 |
"source": [
|
411 |
-
"X"
|
412 |
]
|
413 |
},
|
414 |
{
|
415 |
"cell_type": "code",
|
416 |
-
"execution_count":
|
417 |
-
"id": "
|
|
418 |
"metadata": {},
|
419 |
"outputs": [
|
420 |
{
|
421 |
-
"
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
|
427 |
-
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])"
|
428 |
-
]
|
429 |
-
},
|
430 |
-
"execution_count": 51,
|
431 |
-
"metadata": {},
|
432 |
-
"output_type": "execute_result"
|
433 |
}
|
434 |
],
|
435 |
"source": [
|
436 |
-
"
|
|
|
|
|
437 |
]
|
438 |
},
|
439 |
{
|
440 |
"cell_type": "code",
|
441 |
-
"execution_count":
|
442 |
-
"id": "
|
443 |
"metadata": {},
|
444 |
"outputs": [],
|
445 |
"source": [
|
446 |
-
"# 将数据分为训练集和测试集\n",
|
447 |
-
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
448 |
-
"\n",
|
449 |
-
"# 创建逻辑回归模型\n",
|
450 |
-
"model = LogisticRegression()\n",
|
451 |
-
"\n",
|
452 |
-
"# 训练模型\n",
|
453 |
-
"model.fit(X_train, y_train)\n",
|
454 |
-
"\n",
|
455 |
"# 在测试集上进行预测\n",
|
456 |
-
"y_pred = model.predict(X_test)
|
457 |
-
458 |
"# 计算准确率\n",
|
459 |
"accuracy = accuracy_score(y_test, y_pred)\n",
|
460 |
-
"print(f\"Accuracy: {accuracy * 100:.2f}%\")
|
461 |
-
462 |
"# 输出部分预测结果与真实标签对比\n",
|
463 |
"for i in range(5):\n",
|
464 |
" print(f\"True: {y_test[i]}, Predicted: {y_pred[i]}\")"
|
|
|
153 |
},
|
154 |
{
|
155 |
"cell_type": "code",
|
156 |
+
"execution_count": 2,
|
157 |
"id": "f1ca177c-a80f-48a1-b2f9-16c13b3350db",
|
158 |
"metadata": {},
|
159 |
"outputs": [
|
|
|
163 |
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
164 |
]
|
165 |
},
|
166 |
+
"execution_count": 2,
|
167 |
"metadata": {},
|
168 |
"output_type": "execute_result"
|
169 |
}
|
|
|
193 |
},
|
194 |
{
|
195 |
"cell_type": "code",
|
196 |
+
"execution_count": 3,
|
197 |
"id": "2295739c-e80a-47be-9400-88bfab4b0bb6",
|
198 |
"metadata": {},
|
199 |
"outputs": [
|
200 |
+
{
|
201 |
+
"name": "stderr",
|
202 |
+
"output_type": "stream",
|
203 |
+
"text": [
|
204 |
+
"Using the latest cached version of the dataset since dnagpt/dna_core_promoter couldn't be found on the Hugging Face Hub\n",
|
205 |
+
"Found the latest cached dataset configuration 'default' at /root/.cache/huggingface/datasets/dnagpt___dna_core_promoter/default/0.0.0/809065798bf4928f67397ddba23e4aa9cc5ac3ed (last modified on Fri Dec 27 16:05:19 2024).\n"
|
206 |
+
]
|
207 |
+
},
|
208 |
{
|
209 |
"data": {
|
210 |
"text/plain": [
|
|
|
216 |
"})"
|
217 |
]
|
218 |
},
|
219 |
+
"execution_count": 3,
|
220 |
"metadata": {},
|
221 |
"output_type": "execute_result"
|
222 |
}
|
|
|
237 |
},
|
238 |
{
|
239 |
"cell_type": "code",
|
240 |
+
"execution_count": 4,
|
241 |
"id": "9a47a1b1-21f2-4d71-801c-50f88e326ed3",
|
242 |
"metadata": {},
|
243 |
"outputs": [
|
|
|
248 |
" 'label': 0}"
|
249 |
]
|
250 |
},
|
251 |
+
"execution_count": 4,
|
252 |
"metadata": {},
|
253 |
"output_type": "execute_result"
|
254 |
}
|
|
|
267 |
},
|
268 |
{
|
269 |
"cell_type": "code",
|
270 |
+
"execution_count": 5,
|
271 |
"id": "4010d991-056a-43ce-8cca-30eeec8678f5",
|
272 |
"metadata": {},
|
273 |
"outputs": [],
|
|
|
275 |
"import numpy as np\n",
|
276 |
"from sklearn.model_selection import train_test_split\n",
|
277 |
"from sklearn.linear_model import LogisticRegression\n",
|
|
|
278 |
"from sklearn.metrics import accuracy_score\n",
|
279 |
+
"from transformers import GPT2Tokenizer, GPT2Model\n",
|
280 |
+
"import torch\n",
|
281 |
"\n",
|
282 |
+
"# 初始化 GPT-2 模型和分词器\n",
|
283 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
|
284 |
+
"tokenizer.pad_token = tokenizer.eos_token # 将填充符号设置为 eos_token\n",
|
285 |
+
"model = GPT2Model.from_pretrained(\"gpt2\")\n",
|
286 |
"\n",
|
287 |
"def get_gpt2_feature(sequence):\n",
|
288 |
+
" \"\"\"\n",
|
289 |
+
" 使用 GPT-2 模型提取特征向量。\n",
|
290 |
+
" :param sequence: DNA 序列 (字符串格式)\n",
|
291 |
+
" :return: 平均特征向量 (numpy 数组)\n",
|
292 |
+
" \"\"\"\n",
|
293 |
+
" # 将 DNA 序列分词并转换为 GPT-2 输入\n",
|
294 |
+
" inputs = tokenizer(sequence, return_tensors=\"pt\", padding=True, truncation=True)\n",
|
295 |
+
" with torch.no_grad():\n",
|
296 |
+
" outputs = model(**inputs)\n",
|
297 |
+
" # 提取最后一层的隐藏状态作为特征向量并平均,会对每个序列的所有 token 的特征进行平均,最终得到一个形状为 (1, 768) 的向量(对于 batch_size=1)\n",
|
298 |
+
" feature_vector = outputs.last_hidden_state.mean(dim=1).detach().numpy()\n",
|
299 |
+
" return feature_vector\n"
|
300 |
+
]
|
301 |
+
},
|
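The cell above extracts one feature vector at a time, which is why the extraction pass later takes about 25 minutes. A possible speed-up (my own sketch, not code from the notebook) is to batch the sequences and, if available, run on GPU; the mean-pooling below is masked so padding tokens do not dilute the average:

```python
# Sketch of a batched variant of get_gpt2_feature; `model` and `tokenizer` are the
# GPT2Model / GPT2Tokenizer defined in the cell above, `np` was imported earlier.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device).eval()

def get_gpt2_features_batch(sequences, batch_size=32):
    feats = []
    for i in range(0, len(sequences), batch_size):
        batch = sequences[i:i + batch_size]
        enc = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            hidden = model(**enc).last_hidden_state            # (B, T, 768)
        mask = enc["attention_mask"].unsqueeze(-1)             # (B, T, 1)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # masked mean over real tokens
        feats.append(pooled.cpu().numpy())
    return np.concatenate(feats, axis=0)
```

For example, `get_gpt2_features_batch([item["sequence"] for item in dna_data["train"]])` would replace the per-item loop in the next cell.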
302 |
+
{
|
303 |
+
"cell_type": "code",
|
304 |
+
"execution_count": 6,
|
305 |
+
"id": "057eee1e-9f9a-47a2-b577-588caec58d31",
|
306 |
+
"metadata": {},
|
307 |
+
"outputs": [
|
308 |
+
{
|
309 |
+
"name": "stderr",
|
310 |
+
"output_type": "stream",
|
311 |
+
"text": [
|
312 |
+
"Processing DNA data: 100%|██████████| 59196/59196 [25:16<00:00, 39.04it/s]\n"
|
313 |
+
]
|
314 |
+
}
|
315 |
+
],
|
316 |
+
"source": [
|
317 |
+
"from tqdm import tqdm\n",
|
318 |
+
"# 提取特征和标签\n",
|
319 |
"X = []\n",
|
320 |
"Y = []\n",
|
321 |
"\n",
|
322 |
+
"# 存储特征向量和标签\n",
|
323 |
+
"for item in tqdm(dna_data[\"train\"], desc=\"Processing DNA data\"):\n",
|
324 |
" sequence = item[\"sequence\"]\n",
|
325 |
" label = item[\"label\"]\n",
|
326 |
" x_v = get_gpt2_feature(sequence)\n",
|
327 |
" y_v = label\n",
|
328 |
" X.append(x_v)\n",
|
329 |
+
" Y.append(y_v)\n"
|
330 |
]
|
331 |
},
|
332 |
{
|
333 |
"cell_type": "code",
|
334 |
+
"execution_count": 11,
|
335 |
+
"id": "51133c2a-42e7-4e11-a6f9-6812a4e54182",
|
336 |
"metadata": {},
|
337 |
+
"outputs": [],
|
338 |
"source": [
|
339 |
+
"X = np.array(X).squeeze(1) # 去掉维度为1的那一维"
|
340 |
]
|
341 |
},
|
342 |
{
|
343 |
"cell_type": "code",
|
344 |
+
"execution_count": 17,
|
345 |
+
"id": "5ab0c188-6476-43c4-b361-a2bfe0ec7a8a",
|
346 |
+
"metadata": {},
|
347 |
+
"outputs": [],
|
348 |
+
"source": [
|
349 |
+
"# 将数据分为训练集和测试集\n",
|
350 |
+
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
|
351 |
+
"\n",
|
352 |
+
"# 创建逻辑回归模型\n",
|
353 |
+
"model = LogisticRegression(max_iter=200, solver='newton-cg')\n"
|
354 |
+
]
|
355 |
+
},
|
356 |
+
{
|
357 |
+
"cell_type": "code",
|
358 |
+
"execution_count": 18,
|
359 |
+
"id": "fd9be2bf-331e-4905-99e6-832e58a0463a",
|
360 |
"metadata": {},
|
361 |
"outputs": [
|
362 |
{
|
363 |
+
"name": "stderr",
|
364 |
+
"output_type": "stream",
|
365 |
+
"text": [
|
366 |
+
"Training Logistic Regression: 100%|██████████| 200/200 [27:45<00:00, 8.33s/it]\n"
|
367 |
+
]
|
368 |
}
|
369 |
],
|
370 |
"source": [
|
371 |
+
"# 训练模型\n",
|
372 |
+
"for i in tqdm(range(200), desc=\"Training Logistic Regression\"):\n",
|
373 |
+
" model.fit(X_train, y_train)"
|
374 |
]
|
375 |
},
|
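One thing worth noting about the cell above: the `tqdm` loop refits the same `LogisticRegression` from scratch 200 times, which is why it runs for ~27 minutes. `max_iter=200` already caps the solver's internal iterations, so a single call gives the same model in roughly 1/200th of the time (a suggested simplification, not what the notebook ran):

```python
# Fit once; the newton-cg solver already iterates internally up to max_iter.
model.fit(X_train, y_train)
```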
376 |
{
|
377 |
"cell_type": "code",
|
378 |
+
"execution_count": 19,
|
379 |
+
"id": "5417e4e2-3bca-4718-83a1-f418ad8a65b6",
|
380 |
"metadata": {},
|
381 |
"outputs": [],
|
382 |
"source": [
|
|
|
383 |
"# 在测试集上进行预测\n",
|
384 |
+
"y_pred = model.predict(X_test)"
|
385 |
+
]
|
386 |
+
},
|
387 |
+
{
|
388 |
+
"cell_type": "code",
|
389 |
+
"execution_count": 20,
|
390 |
+
"id": "34371f07-0cbe-43cf-99a1-2ccd55e43e14",
|
391 |
+
"metadata": {},
|
392 |
+
"outputs": [
|
393 |
+
{
|
394 |
+
"name": "stdout",
|
395 |
+
"output_type": "stream",
|
396 |
+
"text": [
|
397 |
+
"Accuracy: 77.48%\n"
|
398 |
+
]
|
399 |
+
}
|
400 |
+
],
|
401 |
+
"source": [
|
402 |
"# 计算准确率\n",
|
403 |
"accuracy = accuracy_score(y_test, y_pred)\n",
|
404 |
+
"print(f\"Accuracy: {accuracy * 100:.2f}%\")"
|
405 |
+
]
|
406 |
+
},
|
407 |
+
{
|
408 |
+
"cell_type": "code",
|
409 |
+
"execution_count": 21,
|
410 |
+
"id": "15472a1e-7813-4ccd-878b-e0cf5d7ce095",
|
411 |
+
"metadata": {},
|
412 |
+
"outputs": [
|
413 |
+
{
|
414 |
+
"name": "stdout",
|
415 |
+
"output_type": "stream",
|
416 |
+
"text": [
|
417 |
+
"True: 0, Predicted: 0\n",
|
418 |
+
"True: 0, Predicted: 1\n",
|
419 |
+
"True: 1, Predicted: 1\n",
|
420 |
+
"True: 0, Predicted: 0\n",
|
421 |
+
"True: 0, Predicted: 0\n"
|
422 |
+
]
|
423 |
+
}
|
424 |
+
],
|
425 |
+
"source": [
|
426 |
"# 输出部分预测结果与真实标签对比\n",
|
427 |
"for i in range(5):\n",
|
428 |
" print(f\"True: {y_test[i]}, Predicted: {y_pred[i]}\")"
|
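Since extracting GPT-2 features for all 59,196 sequences is the slow step, it may be worth caching `X` and `Y` to disk so later experiments (different classifiers or splits) can skip the extraction. A minimal sketch, with hypothetical file names:

```python
# Cache the extracted features and labels (file names are placeholders).
np.save("dna_gpt2_features.npy", X)
np.save("dna_gpt2_labels.npy", np.array(Y))

# Later: reload instead of re-running get_gpt2_feature over the whole dataset.
X = np.load("dna_gpt2_features.npy")
Y = np.load("dna_gpt2_labels.npy")
```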
02-gpt2_bert/5-multi-seq-gpt.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/special_tokens_map-checkpoint.json
ADDED
@@ -0,0 +1,7 @@
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer-checkpoint.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
02-gpt2_bert/dna_wordpiece_dict/.ipynb_checkpoints/tokenizer_config-checkpoint.json
ADDED
@@ -0,0 +1,53 @@
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "[PAD]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "[UNK]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "[CLS]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"3": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"4": {
|
36 |
+
"content": "[MASK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": false,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"extra_special_tokens": {},
|
47 |
+
"mask_token": "[MASK]",
|
48 |
+
"model_max_length": 1000000000000000019884624838656,
|
49 |
+
"pad_token": "[PAD]",
|
50 |
+
"sep_token": "[SEP]",
|
51 |
+
"tokenizer_class": "PreTrainedTokenizerFast",
|
52 |
+
"unk_token": "[UNK]"
|
53 |
+
}
|
02-gpt2_bert/gene_en_bpe.py
ADDED
@@ -0,0 +1,19 @@
|
|
1 |
+
from tokenizers import (
|
2 |
+
decoders,
|
3 |
+
models,
|
4 |
+
normalizers,
|
5 |
+
pre_tokenizers,
|
6 |
+
processors,
|
7 |
+
trainers,
|
8 |
+
Tokenizer,
|
9 |
+
)
|
10 |
+
|
11 |
+
tokenizer = Tokenizer(models.BPE())
|
12 |
+
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)  # use_regex=False: treat spaces as ordinary characters
|
13 |
+
trainer = trainers.BpeTrainer(vocab_size=90000, special_tokens=["<|endoftext|>"])  # 90k-word vocabulary
|
14 |
+
|
15 |
+
tokenizer.train(["dna_1g.txt","protein_1g.txt","english_500m.txt"]
|
16 |
+
, trainer=trainer)  # full file list; takes 10-20 min
|
17 |
+
|
18 |
+
|
19 |
+
tokenizer.save("gene_eng_dict.json")
|
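The script saves the trained BPE vocabulary to `gene_eng_dict.json` (the raw `tokenizers` format that also appears as a file in this commit). A small sketch of how such a file is typically reloaded and wrapped for use with `transformers` — the special-token wiring below is an assumption, mirroring the `<|endoftext|>` token used during training:

```python
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

raw = Tokenizer.from_file("gene_eng_dict.json")
gene_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw,
    unk_token="<|endoftext|>",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)
print(gene_tokenizer.tokenize("TCGA ATCG protein sequence MKV"))
```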
02-gpt2_bert/gene_eng_dict.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
02-gpt2_bert/gene_eng_dict/merges.txt
ADDED
@@ -0,0 +1,3 @@
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:713f6b800eca1349925657153ee5e0d0543e7e48909e4db9a18685dbf0f38794
|
3 |
+
size 744912
|
02-gpt2_bert/gene_eng_dict/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
1 |
+
{
|
2 |
+
"bos_token": "<|endoftext|>",
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"unk_token": "<|endoftext|>"
|
5 |
+
}
|
02-gpt2_bert/gene_eng_dict/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
02-gpt2_bert/gene_eng_dict/tokenizer_config.json
ADDED
@@ -0,0 +1,20 @@
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"added_tokens_decoder": {
|
4 |
+
"0": {
|
5 |
+
"content": "<|endoftext|>",
|
6 |
+
"lstrip": false,
|
7 |
+
"normalized": false,
|
8 |
+
"rstrip": false,
|
9 |
+
"single_word": false,
|
10 |
+
"special": true
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"bos_token": "<|endoftext|>",
|
14 |
+
"clean_up_tokenization_spaces": false,
|
15 |
+
"eos_token": "<|endoftext|>",
|
16 |
+
"extra_special_tokens": {},
|
17 |
+
"model_max_length": 1000000000000000019884624838656,
|
18 |
+
"tokenizer_class": "GPT2Tokenizer",
|
19 |
+
"unk_token": "<|endoftext|>"
|
20 |
+
}
|
02-gpt2_bert/gene_eng_dict/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
03-gene-task/.ipynb_checkpoints/1-category-task-checkpoint.ipynb
CHANGED
@@ -1,6 +1,812 @@
|
|
1 |
{
|
2 |
-
"cells": [
|
3 |
-
|
4 |
"nbformat": 4,
|
5 |
"nbformat_minor": 5
|
6 |
}
|
|
|
1 |
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "5840e900-43cb-4ab4-81a5-988b68fda9b1",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 3.1 序列分类任务"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "958e7b5f-759a-431c-8af0-325271facb41",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"基于 GPT-2 模型,可以通过微调(fine-tuning)或使用提示(prompt-based)方法来完成多种下游任务。\n",
|
17 |
+
"本章主要使用经典的微调方式,提示微调则属于chatgpt的范围,放在下一章,以下是几种常见的下游任务及其简单描述:\n",
|
18 |
+
"\n",
|
19 |
+
"\n",
|
20 |
+
"### 1. **文本分类**\n",
|
21 |
+
"\n",
|
22 |
+
"#### 任务描述\n",
|
23 |
+
"\n",
|
24 |
+
"文本分类是将文本分配到一个或多个预定义类别中的任务。例如,情感分析、主题分类等。生物序列中对应如启动序列等分类问题。\n",
|
25 |
+
"\n",
|
26 |
+
"#### 使用的模型类型\n",
|
27 |
+
"\n",
|
28 |
+
"- **GPT2ForSequenceClassification或AutoModelForSequenceClassification**:该模型在 GPT-2 的基础上添加了一个分类头,用于处理文本分类任务。通过微调这个模型,可以将其应用于多种分类任务。\n",
|
29 |
+
"\n",
|
30 |
+
"### 2. **机器翻译**\n",
|
31 |
+
"\n",
|
32 |
+
"#### 任务描述\n",
|
33 |
+
"\n",
|
34 |
+
"机器翻译是指将一种语言的文本转换为另一种语言的过程。生物学中,可以是生物序列到功能描述(英文)的翻译。\n",
|
35 |
+
"\n",
|
36 |
+
"#### 使用的模型类型\n",
|
37 |
+
"\n",
|
38 |
+
"- **AutoModelForSeq2SeqLM**:虽然 GPT-2 不是专门为机器翻译设计的模型,但可以通过构造特定格式的提示,让 GPT-2 根据上下文生成目标语言的翻译结果。\n",
|
39 |
+
"- **注意**:对于机器翻译任务,通常更推荐使用专门为此类任务设计的模型,如 T5 或 mBART。\n",
|
40 |
+
"\n",
|
41 |
+
"### 3. **词性标注 (POS Tagging)**\n",
|
42 |
+
"\n",
|
43 |
+
"#### 任务描述\n",
|
44 |
+
"\n",
|
45 |
+
"词性标注是指为每个单词分配其正确的词性标签(如名词、动词、形容词等)。生物学中,对应于结构预测任务,典型的如二级结构预测。\n",
|
46 |
+
"\n",
|
47 |
+
"#### 使用的模型类型\n",
|
48 |
+
"\n",
|
49 |
+
"- **AutoModelForTokenClassification**:该模型适用于标记级别的分类任务。通过微调,可以将 GPT-2 应用于词性标注,每个 token 的隐藏状态会被映射到相应的词性标签。\n",
|
50 |
+
"\n",
|
51 |
+
"### 4. **命名实体识别 (NER)**\n",
|
52 |
+
"\n",
|
53 |
+
"#### 任务描述\n",
|
54 |
+
"\n",
|
55 |
+
"命名实体识别是指识别文本中的人名、地名、组织机构等实体,并对其进行分类。生物学中,也对应于结构预测任务,典型的如膜结构预测。和词性标注类似。\n",
|
56 |
+
"\n",
|
57 |
+
"#### 使用的模型类型\n",
|
58 |
+
"\n",
|
59 |
+
"- **AutoModelForTokenClassification**:类似于词性标注,该模型可以用于 NER 任务,通过对每个 token 进行分类来识别和标注命名实体。\n",
|
60 |
+
"\n",
|
61 |
+
"### 5. **问答系统**\n",
|
62 |
+
"\n",
|
63 |
+
"#### 任务描述\n",
|
64 |
+
"\n",
|
65 |
+
"问答系统旨在根据给定的问题从文档或知识库中提取答案。目前一些最新的生物学大模型论文中,输入是包含生物序列的问题,回答则也是混合式的。一般是生物学领域的QA。\n",
|
66 |
+
"\n",
|
67 |
+
"#### 使用的模型类型\n",
|
68 |
+
"\n",
|
69 |
+
"- **AutoModelForQuestionAnswering**:该模型专门用于问答任务,能够理解问题并从上下文中提取答案。通过微调,它可以适应特定领域的问答需求。\n",
|
70 |
+
"\n",
|
71 |
+
"### 6. **文本生成**\n",
|
72 |
+
"\n",
|
73 |
+
"#### 任务描述\n",
|
74 |
+
"\n",
|
75 |
+
"文本生成是指根据给定的提示或前缀生成连贯的文本内容。生物学中,对应新的序列生成,如产生全新的蛋白质序列。\n",
|
76 |
+
"\n",
|
77 |
+
"#### 使用的模型类型\n",
|
78 |
+
"\n",
|
79 |
+
"- **GPT2LMHeadModel**:这是 GPT-2 的标准语言模型版本,擅长生成自然流畅的文本。它可以根据输入的提示生成后续文本,广泛应用于创作、对话系统等领域。\n",
|
80 |
+
"\n",
|
81 |
+
"### 6. **回归问题**\n",
|
82 |
+
"\n",
|
83 |
+
"#### 任务描述\n",
|
84 |
+
"\n",
|
85 |
+
"生物序列相关的回归问题,输入为序列,输出为一个float值。\n",
|
86 |
+
"\n",
|
87 |
+
"#### 使用的模型类型\n",
|
88 |
+
"\n",
|
89 |
+
"- huggingface没有特定的header,但一般回归问题,输出使用一个线性层即可,设定损失函数为均方误差(MSE)即可。最简单的,就是使用AutoModelForTokenClassification,类别数设置为1,输出的label为实测float值即可。\n",
|
90 |
+
"一个官方推荐的 [例子](https://github.com/huggingface/transformers/blob/7ae6f070044b0171a71f3269613bf02fd9fca6f2/src/transformers/models/bert/modeling_bert.py#L1564-L1575)\n",
|
91 |
+
"\n",
|
92 |
+
"### 小结\n",
|
93 |
+
"\n",
|
94 |
+
"GPT-2 可以通过微调或提示工程应用于多种下游任务。不同的任务需要使用特定类型的模型,这些模型基于 GPT-2 并添加了额外的组件或进���了调整,以更好地适应特定的任务需求\n",
|
95 |
+
"\n",
|
96 |
+
"<img src=\"img/gpt2-ft.png\" width=\"800px\" />"
|
97 |
+
]
|
98 |
+
},
|
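For the regression setup sketched in point 7 above, one concrete way to wire it up in `transformers` (an illustration, not code from this notebook; it uses the sequence-level classification head rather than the token-level one) is to request a single output label, which makes the head compute an MSE loss when the labels are floats:

```python
from transformers import AutoModelForSequenceClassification

# num_labels=1 + problem_type="regression" -> single linear output trained with MSELoss.
reg_model = AutoModelForSequenceClassification.from_pretrained(
    "dnagpt/dna_gpt2_v0",
    num_labels=1,
    problem_type="regression",
)
reg_model.config.pad_token_id = reg_model.config.eos_token_id
```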
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": 1,
|
102 |
+
"id": "eca17933-7b8f-44de-8c59-ea7a1c8a3b33",
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [
|
105 |
+
{
|
106 |
+
"data": {
|
107 |
+
"text/plain": [
|
108 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
109 |
+
]
|
110 |
+
},
|
111 |
+
"execution_count": 1,
|
112 |
+
"metadata": {},
|
113 |
+
"output_type": "execute_result"
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"source": [
|
117 |
+
"import subprocess\n",
|
118 |
+
"import os\n",
|
119 |
+
"# 设置环境变量, autodl一般区域\n",
|
120 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
121 |
+
"output = result.stdout\n",
|
122 |
+
"for line in output.splitlines():\n",
|
123 |
+
" if '=' in line:\n",
|
124 |
+
" var, value = line.split('=', 1)\n",
|
125 |
+
" os.environ[var] = value\n",
|
126 |
+
"\n",
|
127 |
+
"\"\"\"\n",
|
128 |
+
"import os\n",
|
129 |
+
"\n",
|
130 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
131 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
132 |
+
"\n",
|
133 |
+
"# 打印环境变量以确认设置成功\n",
|
134 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
135 |
+
"\"\"\""
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 2,
|
141 |
+
"id": "108d9c3c-ae4d-4110-a532-a40a6fe1f9df",
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [],
|
144 |
+
"source": [
|
145 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
146 |
+
"from tokenizers import Tokenizer\n",
|
147 |
+
"from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
|
148 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
149 |
+
"from transformers import DataCollatorWithPadding"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 6,
|
155 |
+
"id": "bcdc9f7a-1ea5-4647-b87e-ac72ddf17818",
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [
|
158 |
+
{
|
159 |
+
"data": {
|
160 |
+
"application/vnd.jupyter.widget-view+json": {
|
161 |
+
"model_id": "c2e31c61549449e78a4e1fe0e884233f",
|
162 |
+
"version_major": 2,
|
163 |
+
"version_minor": 0
|
164 |
+
},
|
165 |
+
"text/plain": [
|
166 |
+
"tokenizer_config.json: 0%| | 0.00/580 [00:00<?, ?B/s]"
|
167 |
+
]
|
168 |
+
},
|
169 |
+
"metadata": {},
|
170 |
+
"output_type": "display_data"
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"data": {
|
174 |
+
"application/vnd.jupyter.widget-view+json": {
|
175 |
+
"model_id": "da2009ca96634f759f052a9a4ff7e41e",
|
176 |
+
"version_major": 2,
|
177 |
+
"version_minor": 0
|
178 |
+
},
|
179 |
+
"text/plain": [
|
180 |
+
"vocab.json: 0%| | 0.00/642k [00:00<?, ?B/s]"
|
181 |
+
]
|
182 |
+
},
|
183 |
+
"metadata": {},
|
184 |
+
"output_type": "display_data"
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"data": {
|
188 |
+
"application/vnd.jupyter.widget-view+json": {
|
189 |
+
"model_id": "b6b6ec58d8cb4878aa2e0786ff0bbcf4",
|
190 |
+
"version_major": 2,
|
191 |
+
"version_minor": 0
|
192 |
+
},
|
193 |
+
"text/plain": [
|
194 |
+
"merges.txt: 0%| | 0.00/323k [00:00<?, ?B/s]"
|
195 |
+
]
|
196 |
+
},
|
197 |
+
"metadata": {},
|
198 |
+
"output_type": "display_data"
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"data": {
|
202 |
+
"application/vnd.jupyter.widget-view+json": {
|
203 |
+
"model_id": "5dbb5171eb6242bdbded42c87ef46c27",
|
204 |
+
"version_major": 2,
|
205 |
+
"version_minor": 0
|
206 |
+
},
|
207 |
+
"text/plain": [
|
208 |
+
"special_tokens_map.json: 0%| | 0.00/473 [00:00<?, ?B/s]"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
"metadata": {},
|
212 |
+
"output_type": "display_data"
|
213 |
+
}
|
214 |
+
],
|
215 |
+
"source": [
|
216 |
+
"#set tokenizer\n",
|
217 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/dna_gpt2_v0\")\n",
|
218 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
219 |
+
]
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"cell_type": "code",
|
223 |
+
"execution_count": 3,
|
224 |
+
"id": "0e930ef5-865a-4528-84b5-ddae6d710a99",
|
225 |
+
"metadata": {},
|
226 |
+
"outputs": [
|
227 |
+
{
|
228 |
+
"name": "stderr",
|
229 |
+
"output_type": "stream",
|
230 |
+
"text": [
|
231 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/dna_gpt2_v0 and are newly initialized: ['score.weight']\n",
|
232 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
233 |
+
]
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"data": {
|
237 |
+
"text/plain": [
|
238 |
+
"GPT2ForSequenceClassification(\n",
|
239 |
+
" (transformer): GPT2Model(\n",
|
240 |
+
" (wte): Embedding(30000, 768)\n",
|
241 |
+
" (wpe): Embedding(1024, 768)\n",
|
242 |
+
" (drop): Dropout(p=0.1, inplace=False)\n",
|
243 |
+
" (h): ModuleList(\n",
|
244 |
+
" (0-11): 12 x GPT2Block(\n",
|
245 |
+
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
246 |
+
" (attn): GPT2SdpaAttention(\n",
|
247 |
+
" (c_attn): Conv1D(nf=2304, nx=768)\n",
|
248 |
+
" (c_proj): Conv1D(nf=768, nx=768)\n",
|
249 |
+
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
|
250 |
+
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
|
251 |
+
" )\n",
|
252 |
+
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
253 |
+
" (mlp): GPT2MLP(\n",
|
254 |
+
" (c_fc): Conv1D(nf=3072, nx=768)\n",
|
255 |
+
" (c_proj): Conv1D(nf=768, nx=3072)\n",
|
256 |
+
" (act): NewGELUActivation()\n",
|
257 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
258 |
+
" )\n",
|
259 |
+
" )\n",
|
260 |
+
" )\n",
|
261 |
+
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
262 |
+
" )\n",
|
263 |
+
" (score): Linear(in_features=768, out_features=2, bias=False)\n",
|
264 |
+
")"
|
265 |
+
]
|
266 |
+
},
|
267 |
+
"execution_count": 3,
|
268 |
+
"metadata": {},
|
269 |
+
"output_type": "execute_result"
|
270 |
+
}
|
271 |
+
],
|
272 |
+
"source": [
|
273 |
+
"#set model\n",
|
274 |
+
"model = AutoModelForSequenceClassification.from_pretrained('dnagpt/dna_gpt2_v0', num_labels=2)\n",
|
275 |
+
"model.config.pad_token_id = model.config.eos_token_id\n",
|
276 |
+
"model"
|
277 |
+
]
|
278 |
+
},
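{
"cell_type": "code",
"execution_count": null,
"id": "added-example-logits-shape",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: one dummy forward pass to confirm that the newly\n",
"# initialized score head returns two logits per sequence (num_labels=2).\n",
"# The input string is arbitrary.\n",
"import torch\n",
"\n",
"enc = tokenizer(\"ATGCGTACGTTAGC\", return_tensors=\"pt\")\n",
"with torch.no_grad():\n",
"    out = model(**enc)\n",
"print(out.logits.shape)  # expected: torch.Size([1, 2])"
]
},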
|
279 |
+
{
|
280 |
+
"cell_type": "markdown",
|
281 |
+
"id": "bd14794b-e507-4c1d-be47-0e0144835f18",
|
282 |
+
"metadata": {},
|
283 |
+
"source": [
|
284 |
+
"在生物学中,**启动子(promoter)** 是一段特定的DNA序列,它位于基因的上游(通常是5'端),并且是转录起始的关键调控元件。启动子的主要功能是为RNA聚合酶提供结合位点,并招募其他转录因子,以启动基因转录过程。以下是关于启动子的一些重要概念和特点:\n",
|
285 |
+
"\n",
|
286 |
+
"### 启动子的功能\n",
|
287 |
+
"\n",
|
288 |
+
"1. **转录起始**:\n",
|
289 |
+
" - 启动子是基因表达的第一步,它决定了何时、何地以及多频繁地进行转录。\n",
|
290 |
+
" \n",
|
291 |
+
"2. **调控基因表达**:\n",
|
292 |
+
" - 不同类型的启动子可以调节不同组织或细胞类型中的基因表达水平。例如,在某些细胞中高度活跃而在其他细胞中不活跃。\n",
|
293 |
+
"\n",
|
294 |
+
"3. **与转录因子和其他蛋白质相互作用**:\n",
|
295 |
+
" - 启动子区域通常包含多个顺式作用元件(cis-regulatory elements),这些元件可以与特定的转录因子或其他调控蛋白结合,进一步精细调整基因表达。\n",
|
296 |
+
" \n",
|
297 |
+
" \n",
|
298 |
+
"在生物学中,启动子(promoter)序列的二分类问题通常是指将DNA序列分为两类:**启动子序列**和**非启动子序列**。这种分类任务的目标是通过机器学习或生物信息学方法来预测给定的DNA序列是否具有启动子功能。\n",
|
299 |
+
"\n",
|
300 |
+
"### 二分类问题中的两个类别\n",
|
301 |
+
"\n",
|
302 |
+
"1. **启动子序列(Promoter Sequences)**:\n",
|
303 |
+
" - 这些序列包含能够指导转录起始的调控元件,通常是位于基因5'端上游区域的一段DNA。\n",
|
304 |
+
" - 启动子序列可能含有特定的保守基序(motifs),如TATA盒、CAAT盒等,这些基序对于RNA聚合酶及其辅助因子的结合至关重要。\n",
|
305 |
+
"\n",
|
306 |
+
"2. **非启动子序列(Non-Promoter Sequences)**:\n",
|
307 |
+
" - 这类序列指的是那些不具有启动子功能的DNA片段。它们可以来自基因内部(编码区或内含子)、基因间区域(intergenic regions)或其他调控元件(如增强子、沉默子等),但明确不是启动子。\n",
|
308 |
+
" - 非启动子序列不具备启动转录的能力,或者至少在自然条件下不会作为主要的转录起始点。\n",
|
309 |
+
"\n",
|
310 |
+
"### 启动子的研究意义\n",
|
311 |
+
"\n",
|
312 |
+
"理解启动子的工作机制对于揭示基因表达调控网络非常重要。这不仅有助于基础科学研究,而且对于医学应用也有着深远的影响,比如开发新的治疗策略来纠正异常的基因表达模式,或者利用合成生物学设计定制化的基因表达系统。\n"
|
313 |
+
]
|
314 |
+
},
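{
"cell_type": "code",
"execution_count": null,
"id": "added-example-promoter-inference",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch of the binary decision described above: score one sequence\n",
"# as promoter vs non-promoter. It assumes the model has already been fine-tuned (see the\n",
"# Trainer cells below) and that label 1 marks a promoter; the sequence itself is arbitrary.\n",
"import torch\n",
"\n",
"seq = \"CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGG\"\n",
"enc = tokenizer(seq, return_tensors=\"pt\").to(model.device)\n",
"with torch.no_grad():\n",
"    pred = model(**enc).logits.argmax(dim=-1).item()\n",
"print(\"promoter\" if pred == 1 else \"non-promoter\")"
]
},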
|
315 |
+
{
|
316 |
+
"cell_type": "code",
|
317 |
+
"execution_count": 8,
|
318 |
+
"id": "aee08f3f-6cda-4975-8cb9-9a7bfacb9eac",
|
319 |
+
"metadata": {},
|
320 |
+
"outputs": [
|
321 |
+
{
|
322 |
+
"data": {
|
323 |
+
"application/vnd.jupyter.widget-view+json": {
|
324 |
+
"model_id": "82d2ec71cf6648469040897d9174a55f",
|
325 |
+
"version_major": 2,
|
326 |
+
"version_minor": 0
|
327 |
+
},
|
328 |
+
"text/plain": [
|
329 |
+
"README.md: 0%| | 0.00/314 [00:00<?, ?B/s]"
|
330 |
+
]
|
331 |
+
},
|
332 |
+
"metadata": {},
|
333 |
+
"output_type": "display_data"
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"data": {
|
337 |
+
"application/vnd.jupyter.widget-view+json": {
|
338 |
+
"model_id": "40183e0714ea4155a2c0772fb7c72a00",
|
339 |
+
"version_major": 2,
|
340 |
+
"version_minor": 0
|
341 |
+
},
|
342 |
+
"text/plain": [
|
343 |
+
"train-00000-of-00001.parquet: 0%| | 0.00/8.66M [00:00<?, ?B/s]"
|
344 |
+
]
|
345 |
+
},
|
346 |
+
"metadata": {},
|
347 |
+
"output_type": "display_data"
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"data": {
|
351 |
+
"application/vnd.jupyter.widget-view+json": {
|
352 |
+
"model_id": "8e5ebe15df194e3c8bf5811777755947",
|
353 |
+
"version_major": 2,
|
354 |
+
"version_minor": 0
|
355 |
+
},
|
356 |
+
"text/plain": [
|
357 |
+
"Generating train split: 0%| | 0/59195 [00:00<?, ? examples/s]"
|
358 |
+
]
|
359 |
+
},
|
360 |
+
"metadata": {},
|
361 |
+
"output_type": "display_data"
|
362 |
+
}
|
363 |
+
],
|
364 |
+
"source": [
|
365 |
+
"from datasets import load_dataset\n",
|
366 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
367 |
+
"dataset = load_dataset(\"dnagpt/dna_promoter_300\")['train'].train_test_split(test_size=0.1)"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
{
|
371 |
+
"cell_type": "code",
|
372 |
+
"execution_count": 9,
|
373 |
+
"id": "6ac9fe5b-2175-42d8-949c-cb12bc8fb65c",
|
374 |
+
"metadata": {},
|
375 |
+
"outputs": [
|
376 |
+
{
|
377 |
+
"data": {
|
378 |
+
"text/plain": [
|
379 |
+
"DatasetDict({\n",
|
380 |
+
" train: Dataset({\n",
|
381 |
+
" features: ['sequence', 'label'],\n",
|
382 |
+
" num_rows: 53275\n",
|
383 |
+
" })\n",
|
384 |
+
" test: Dataset({\n",
|
385 |
+
" features: ['sequence', 'label'],\n",
|
386 |
+
" num_rows: 5920\n",
|
387 |
+
" })\n",
|
388 |
+
"})"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
"execution_count": 9,
|
392 |
+
"metadata": {},
|
393 |
+
"output_type": "execute_result"
|
394 |
+
}
|
395 |
+
],
|
396 |
+
"source": [
|
397 |
+
"dataset"
|
398 |
+
]
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"cell_type": "code",
|
402 |
+
"execution_count": 10,
|
403 |
+
"id": "b5025f95-ca5d-42b1-95e1-55495f77d009",
|
404 |
+
"metadata": {},
|
405 |
+
"outputs": [
|
406 |
+
{
|
407 |
+
"data": {
|
408 |
+
"text/plain": [
|
409 |
+
"{'sequence': 'CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGGCCCGCTGCACGAGAGACCATTATGGTGATCCGCCCGCCTGACACTACTGATATGTTGGGATTACAGGCGTGAGCCACGGCGCCCGGCGGGCAAGACACCCTCAGAGCACAGGGTGAATCCATGGTTAAAATACAGCGGGAAGTTAGCGCCGAAGTCGCCGTGTAATTTGTGCGCGGTTCAGGTTCATGTATTCAGAATCATTTTACTAGGTTTAGGGCTCGCCGCTGCCTCAGTGGCTTTCAGGCGCTT',\n",
|
410 |
+
" 'label': 0}"
|
411 |
+
]
|
412 |
+
},
|
413 |
+
"execution_count": 10,
|
414 |
+
"metadata": {},
|
415 |
+
"output_type": "execute_result"
|
416 |
+
}
|
417 |
+
],
|
418 |
+
"source": [
|
419 |
+
"dataset[\"train\"][0]"
|
420 |
+
]
|
421 |
+
},
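{
"cell_type": "code",
"execution_count": null,
"id": "added-example-label-balance",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: check how the 0/1 labels are distributed in the training\n",
"# split before fine-tuning, to confirm the two classes are reasonably balanced.\n",
"from collections import Counter\n",
"\n",
"print(Counter(dataset[\"train\"][\"label\"]))"
]
},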
|
422 |
+
{
|
423 |
+
"cell_type": "code",
|
424 |
+
"execution_count": 13,
|
425 |
+
"id": "ac999213-67b1-4294-8d92-80b8c6c68acd",
|
426 |
+
"metadata": {},
|
427 |
+
"outputs": [
|
428 |
+
{
|
429 |
+
"name": "stdout",
|
430 |
+
"output_type": "stream",
|
431 |
+
"text": [
|
432 |
+
"dna datasets mean token lenght 52.41266891891892 min token length 33 max token length 60\n"
|
433 |
+
]
|
434 |
+
}
|
435 |
+
],
|
436 |
+
"source": [
|
437 |
+
"token_len_list = []\n",
|
438 |
+
"for item in dataset[\"test\"]:\n",
|
439 |
+
" inputs = tokenizer.tokenize(item[\"sequence\"])\n",
|
440 |
+
" token_len_list.append( len(inputs) )\n",
|
441 |
+
"\n",
|
442 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
443 |
+
"min_len = min(token_len_list)\n",
|
444 |
+
"max_len = max(token_len_list)\n",
|
445 |
+
"\n",
|
446 |
+
"print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
447 |
+
]
|
448 |
+
},
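{
"cell_type": "code",
"execution_count": null,
"id": "added-example-maxlen-check",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: since the longest test sequence is about 60 tokens, the\n",
"# max_length=128 used in the tokenization step below truncates nothing; this just confirms\n",
"# that no sequence exceeds the limit.\n",
"over_limit = sum(1 for n in token_len_list if n > 128)\n",
"print(f\"sequences longer than 128 tokens: {over_limit} / {len(token_len_list)}\")"
]
},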
|
449 |
+
{
|
450 |
+
"cell_type": "code",
|
451 |
+
"execution_count": 14,
|
452 |
+
"id": "72a2dec3-043b-41e4-afd8-4dbd8c8fcbb0",
|
453 |
+
"metadata": {},
|
454 |
+
"outputs": [
|
455 |
+
{
|
456 |
+
"data": {
|
457 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABdnElEQVR4nO3deZyN9f//8eeZfTEzxjIbZowte2NLEyIUkQotiqylLBXSotVSREiopG9ZKh+lT4sP2UVCkrJlrNHImGEyjDH7zPX7Y36OTmO7zpwzZ4bH/XY7N3Nd1/v1vt7nOE7z7Hpf72MxDMMQAAAAAOCqubl6AAAAAABQ2hCkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpADAyUaPHi2LxVIs52rTpo3atGlj3V63bp0sFou+/PLLYjl/3759VbVq1WI5l73S0tL06KOPKiwsTBaLRcOGDXPq+c7//ScnJzv1PNe6vn37qkyZMq4eBgBYEaQAwIS5c+fKYrFYHz4+PoqIiFCHDh00ffp0nT171iHnSUhI0OjRo7V9+3aH9OdIJXlsV2P8+PGaO3euBg0apE8++USPPPJIoTbnw8+VHv8MraVBcQdrs9LT0zV69GitW7fO1UMBgCvycPUAAKA0Gjt2rKKjo5WTk6PExEStW7dOw4YN09SpU7V48WI1bNjQ2vbll1/WCy+8YKr/hIQEjRkzRlWrVlVMTMxV161cudLUeexxubF9+OGHys/Pd/oYimLt2rW6+eab9dprr12yTbdu3VSjRg3rdlpamgYNGqSuXbuqW7du1v2hoaFOHev1Jj09XWPGjJGkUhdSAVx/CFIAYIc777xTTZs2tW6PGjVKa9eu1V133aW7775bcXFx8vX1lSR5eHjIw8O5H7fp6eny8/OTl5eXU89zJZ6eni49/9U4ceKE6tate9k2DRs2tAnDycnJGjRokBo2bKhevXo5e4gAgFKAqX0A4CBt27bVK6+8oj///FOffvqpdf/F7pFatWqVWrZsqbJly6pMmTK64YYb9OKLL0oqmH7VrFkzSVK/fv2s08jmzp0rqeD/1NevX1/btm3TrbfeKj8/P2vtv++ROi8vL08vvviiwsLC5O/vr7vvvltHjx61aVO1alX17du3UO0/+7zS2C52j9S5c+f0zDPPqEqVKvL29tYNN9ygyZMnyzAMm3YWi0VDhw7VN998o/r168vb21v16tXT8uXLL/6C/8uJEyc0YMAAhYaGysfHRzfeeKPmzZtnPX5+Wtvhw4e1dOlS69iPHDlyVf1fzNq1a9WqVSv5+/urbNmyuueeexQXF3fFuj///FM1atRQ/fr1lZSUJEk6ffq0hg0bZn2datSooYkTJ9pc4Tty5IgsFosmT56s2bNnq3r16vL29lazZs20detWu5/HvzljLIsWLVLdunXl4+Oj+vXr6+uvv7Z5vxw5ckQVK1aUJI0ZM8b69zN69Gibfo4dO6Z7771XZcqUUcWKFTVy5Ejl5eXZtFm4cKGaNGmigIAABQYGqkGDBnrnnXcc9voAgMQVKQBwqEceeUQvvviiVq5cqccee+yibX7//XfdddddatiwocaOHStvb28dPHhQGzdulCTVqVNHY8eO1auvvqqBAweqVatWkqRbbrnF2sfff/+tO++8Uz169FCvXr2uOMXsjTfekMVi0fPPP68TJ05o2rRpat++vbZv3269cnY1rmZs/2QYhu6++259//33GjBggGJiYrRixQo9++yzOnbsmN5++22b9j/++KO++uorDR48WAEBAZo+fbq6d++u+Ph4lS9f/pLjysjIUJs2bXTw4EENHTpU0dHRWrRokfr27avTp0/r6aefVp06dfTJJ59o+PDhqly5sp555hlJsv7ybtbq1at15513qlq1aho9erQyMjI0Y8YMtWjRQr/++uslF904dOiQ2rZtq3LlymnVqlWqUKGC0tPT1bp1ax07dkyPP/64IiMjtWnTJo0aNUrHjx/XtGnTbPpYsGCBzp49q8cff1wWi0WTJk1St27d9McffxT5qqAzxrJ06VI9+OCDatCggSZMmKCUlBQNGDBAlSpVsvZTsWJFvf/++4WmUP7zymBeXp46dOig5s2ba/LkyVq9erWmTJmi6tWra9CgQZIK/ifFQw89pHbt2mnixImSpLi4OG3cuFFPP/10kV4bALBhAACu2pw5cwxJxtatWy/ZJigoyGjUqJF1+7XXXjP++XH79ttvG5KMkydPXrKPrVu3GpKMOXPmFDrWunVrQ5Ixa9asix5r3bq1dfv77783JBmVKlUyUlNTrfu/+OILQ5LxzjvvWPdFRUUZffr0uWKflxtbnz59jKioKOv2N998Y0gyXn/9dZt29913n2GxWIyDBw9a90kyvLy8bPbt2LHDkGTMmDGj0Ln+adq0aYYk49NPP7Xuy87ONmJjY40yZcrYPPeoqCijc+fOl+3v306ePGlIMl577TXrvpiYGCMkJMT4+++/bcbr5uZm9O7d27rv/N//yZMnjbi4OCMiIsJo1qyZcerUKWubcePGGf7+/sb+/fttzvvCCy8Y7u7uRnx8vGEYhnH48GFDklG+fHmb+m+//daQZPzvf/+77PM4/35YtGjRJds4YywNGjQwKleubJw9e9a6b926dYYkm/fLxV7n8/r06WNIMsaOHWuzv1GjRkaTJk2s208//bQRGBho5ObmXva1AICiYmofADhYmTJlLrt6X9myZSVJ3377rd0LM3h7e6tfv35X3b53794KCAiwbt93330KDw/Xd999Z9f5r9Z3330nd3d3PfXUUzb7n3nmGRmGoWXLltnsb9++vapXr27dbtiwoQIDA/XHH39c8TxhYWF66KGHrPs8PT311FNPKS0tTevXr3fAs7ng+PHj2r59u/r27aty5crZjPf222+/6Ou6e/dutW7dWlWrVtXq1asVHBxsPbZo0SK1atVKwcHBSk5Otj7at2+vvLw8/fDDDzZ9Pfjggzb1568MXul1uhqOHktCQoJ27dql3r172yxf3rp1azVo0MD0+J544gmb7VatWtk877Jly+rcuXNatWqV6b4BwAyCFAA4WFpamk1o+bcHH3xQLVq00KOPPqrQ0FD16NFDX3zxhalQValSJVMLS9SsWdNm22KxqEaNGkW6P+hq/Pnnn4qIiCj0etSpU8d6/J8iIyML9REcHKyUlJQrnqdmzZpyc7P9z9qlzlNU5/u74YYbCh2rU6eOkpOTde7cOZv9Xbp0UUBAgFasWKHAwECbYwcOHNDy5ctVsWJFm0f79u0lFdz/9U//fp3OB5krvU5Xw9FjOf9a/XMVxPMutu9yfHx8Ck3F/Pf7Y/DgwapVq5buvPNOVa5cWf3797/q++wAwAzukQIAB/rrr7905syZy/6C6Ovrqx9++EHff/+9li5dquXLl+vzzz9X27ZttXLlSrm7u1/xPGbua7pal/rS4Ly8vKsakyNc6jzGvxamKI26d++uefPm6
bPPPtPjjz9ucyw/P1+33367nnvuuYvW1qpVy2bbma9TSRrLv13N+zAkJETbt2/XihUrtGzZMi1btkxz5sxR7969bRYfAYCiIkgBgAN98sknkqQOHTpctp2bm5vatWundu3aaerUqRo/frxeeuklff/992rfvv0lQ429Dhw4YLNtGIYOHjxocyN/cHCwTp8+Xaj2zz//VLVq1azbZsYWFRWl1atX6+zZszZXpfbu3Ws97ghRUVHauXOn8vPzba5KOfo8/zyfJO3bt6/Qsb1796pChQry9/e32f/WW2/Jw8PDupDGww8/bD1WvXp1paWlWa/6uJKjx3L+tTp48GChY//e56j3vZeXl7p06aIuXbooPz9fgwcP1gcffKBXXnnF9FUwALgUpvYBgIOsXbtW48aNU3R0tHr27HnJdqdOnSq07/wX22ZlZUmS9ZfwiwUbe8yfP9/mvq0vv/xSx48f15133mndV716df3000/Kzs627luyZEmhZdLNjK1Tp07Ky8vTzJkzbfa//fbbslgsNucvik6dOikxMVGff/65dV9ubq5mzJihMmXKqHXr1g45z3nh4eGKiYnRvHnzbF6H3bt3a+XKlerUqVOhGovFotmzZ+u+++5Tnz59tHjxYuuxBx54QJs3b9aKFSsK1Z0+fVq5ubkOHf/lOHosERERql+/vubPn6+0tDTr/vXr12vXrl02bf38/Kznsdfff/9ts+3m5mb9Hwbn/30BgCNwRQoA7LBs2TLt3btXubm5SkpK0tq1a7Vq1SpFRUVp8eLF8vHxuWTt2LFj9cMPP6hz586KiorSiRMn9N5776ly5cpq2bKlpIJQU7ZsWc2aNUsBAQHy9/dX8+bNFR0dbdd4y5Urp5YtW6pfv35KSkrStGnTVKNGDZsl2h999FF9+eWX6tixox544AEdOnRIn376qc3iD2bH1qVLF91222166aWXdOTIEd14441auXKlvv32Ww0bNqxQ3/YaOHCgPvjgA/Xt21fbtm1T1apV9eWXX2rjxo2aNm3aZe9Zs9dbb72lO++8U7GxsRowYIB1+fOgoKBC3310npubmz799FPde++9euCBB/Tdd9+pbdu2evbZZ7V48WLddddd6tu3r5o0aaJz585p165d+vLLL3XkyBFVqFDBYWP/73//a71a9099+vRxyljGjx+ve+65Ry1atFC/fv2UkpKimTNnqn79+jbhytfXV3Xr1tXnn3+uWrVqqVy5cqpfv77q169/1ed69NFHderUKbVt21aVK1fWn3/+qRkzZigmJsZ6zxwAOIRL1wwEgFLm/PLn5x9eXl5GWFiYcfvttxvvvPOOzTLb5/17+fM1a9YY99xzjxEREWF4eXkZERERxkMPPVRouelvv/3WqFu3ruHh4WGz3Hjr1q2NevXqXXR8l1r+/D//+Y8xatQoIyQkxPD19TU6d+5s/Pnnn4Xqp0yZYlSqVMnw9vY2WrRoYfzyyy+F+rzc2P69/LlhGMbZs2eN4cOHGxEREYanp6dRs2ZN46233jLy8/Nt2kkyhgwZUmhMl1qW/d+SkpKMfv36GRUqVDC8vLyMBg0aXHSJdkctf24YhrF69WqjRYsWhq+vrxEYGGh06dLF2LNnj02bfy5/fl56errRunVro0yZMsZPP/1kGEbB6zRq1CijRo0ahpeXl1GhQgXjlltuMSZPnmxkZ2cbhnFhyfG33nqr0BgvNr5/O/9+uNRjw4YNThvLwoULjdq1axve3t5G/fr1jcWLFxvdu3c3ateubdNu06ZNRpMmTQwvLy+bfvr06WP4+/sXOte//319+eWXxh133GGEhIQYXl5eRmRkpPH4448bx48fv+xrAwBmWQzjGriDFwAAlDoxMTGqWLEiS5UDKJW4RwoAADhVTk5OoXur1q1bpx07dqhNmzauGRQAFBFXpAAAgFMdOXJE7du3V69evRQREaG9e/dq1qxZCgoK0u7du1W+fHlXDxEATGOxCQAA4FTBwcFq0qSJ/u///k8nT56Uv7+/OnfurDfffJMQBaDU4ooUAAAAAJjEPVIAAAAAYBJBCgAAAABM4h4pSfn5+UpISFBAQIAsFourhwMAAADARQzD0NmzZxURESE3t0tfdyJISUpISFCVKlVcPQwAAAAAJcTRo0dVuXLlSx4nSEkKCAiQVPBiBQYGung0AADgmlW7tnT8uBQeLu3da758Zm0dP3tc4QHh2jvUfD2AK0tNTVWVKlWsGeFSCFKSdTpfYGAgQQoAADjP+WlCbm6SHb9zuPm4STkFf/I7C+BcV7rlh8UmAAAAAMAkghQAAAAAmESQAgAAAACTuEcKAACguGzdKuXlSe7u9pU/tlV5Rp7cLfbVA3AcghQAAEBxCQ8vWnlA0eoBOA5T+wAAAADAJIIUAAAAAJjE1D4AAIDiMnu2lJYmlSkjDRxovnzbbKVlp6mMVxkNbGK+HoDjWAzDMFw9CFdLTU1VUFCQzpw5w5fbAQAA56lcWTp2TKpUSfrrL/PlUyvr2NljqhRQSX+NMF8P4MquNhswtQ8AAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHDdqFVLCgqSQkPtKy9fS0E+QQr1t68egONYDMMwXD0IV7vaby8GAAAoTvHx8UpOTnZK3xUqVFBkZKRT+gZKs6vNBlyRAgAAKIHi4+NVu04dZaSnO6V/Xz8/7Y2LI0wBdiJIAQAAlEDJycnKSE/X4MmzFVG9lkP7Tji0X++NHKjk5GSCFGAnghQAAEAJFlG9lqLrxbh6GAD+hSAFAABQXHr2lJKTpQoVpM8+M13+7sbHdDbrbwV4l9eQFh86YYAArhZBCgAAoLisXy8dOyZVqmRXedyJjUrJSFCwb4SDBwbALL5HCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQXB57TDpzRgoKsqv8thq9lZGdKl+vQAcPDIBZBCkAAIDi8tprRSrv3uAFBw0EQFExtQ8AAABF17evZLFITzxR+NiQIQXH+vYt7lFdnfNj/+ejY8cLx48ckQYMkKKjJV9fqXr1glCcnX11/RuGdOedBf1+843tsaeekpo0kby9pZgYxzwfFAuuSAEAAMAxqlSRFi6U3n67IHBIUmamtGCBFBnp2rFdSceO0pw5F7a9vS/8vHevlJ8vffCBVKOGtHt3wTTNc+ekyZOv3Pe0aQUh6lL695e2bJF27rR7+Ch+XJECAACAYzRuXBCmvvrqwr6vvioIUY0a2bbNz5cmTLhwlefGG6Uvv7xwPC/P9irQDTdI77xj20ffvtK99xaEmfBwqXz5gqtfOTnmx+7tLYWFXXgEB184dj5k3XGHVK2adPfd0siRts/zUrZvl6ZMkT7++OLHp08vGHO1aubHDJciSAEAABSXypULrkxUrmxX+dCv66rngrIa+nVdBw/Mgfr3t72y8/HHUr9+hdtNmCDNny/NmiX9
/rs0fLjUq5e0fn3B8fz8gtdp0SJpzx7p1VelF1+UvvjCtp/vv5cOHSr4c948ae7cgsd5o0dLVateedzr1kkhIQWBbdAg6e+/L9/+zBmpXLnLt0lPlx5+WHr33YJwhmsKU/sAAADgOL16SaNGSX/+WbC9cWPBdL916y60ycqSxo+XVq+WYmML9lWrJv34Y8H0udatJU9PacyYCzXR0dLmzQVB6oEHLuwPDpZmzpTc3aXataXOnaU1awqm3klShQoF9zRdTseOUrduBec4dKggsN15Z8H53N0Ltz94UJox48rT+oYPl265Rbrnnsu3Q6lEkAIAAIDjVKxYEGbmzi1YZKFz54Iw808HDxZcrbn9dtv92dm2UwDffbfgilZ8vJSRUXD83wsy1KtnG3bCw6Vduy5sDx1a8LicHj0u/NyggdSwYUH4WrdOatfOtu2xYwXB6/77L4S1i1m8WFq7Vvrtt8ufG6UWQQoAAACO1b//hfDy7ruFj6elFfy5dKlUqZLtsfOLPCxcWHAf0pQpBVetAgKkt94qWJThnzw9bbctloJpgUVRrVpB+Dt40DZIJSRIt91WcJVp9uzL97F2bcHVrbJlbfd37y61amV7hQ6lEkEKAAAAjtWxY8HVI4tF6tCh8PG6dQsCU3x8wTS+i9m4sSCwDB58Yd+hQ84Z77/99VfBPVLh4Rf2HTtWEKKaNCm4B8ztCksNvPCC9OijtvsaNChY0bBLF8ePGcWOIAUAAADHcneX4uIu/PxvAQEFV5uGDy+4etSyZcHiDRs3SoGBUp8+Us2aBYtRrFhRcO/SJ59IW7cW/GzGzJnS118X3Dd1MWlpBfdide9esCDEoUPSc88VLHN+PgQeOya1aSNFRRXcF3Xy5IX684tIHDtWcPVq/nzpppsurP73b5GRts/h4MGCMSQmFkxf3L69YH/dupKXl7nnimJFkAIAAIDjBQZe/vi4cQX3U02YIP3xR8EUuMaNCxZ6kKTHHy+4v+jBBwuubD30UMHVqWXLzI0jOfnyV7Lc3Qu+v2nePOn0aSkiomCZ83HjLkwzXLWqIPAcPFh4xUXDKPgzJ0fat6/g3i8zHn30wkqF0oV7xA4fvrrVBuEyFsM4/7d//UpNTVVQUJDOnDmjwCv9owcAALBX5coFVy4qVSqYPnYZv/76q5o0aaLXv16n6HoxkgqWP0/JSFCwb4Rmdt1j9zAO/75dL3dto23btqlx48Z29wNci642G/A9UgAAAABgEkEKAAAAAEziHikAAIDi8umnBV9Ge/7eG5MG3/KBcvOy5eHOIgSAqxGkAAAAikubNkUqrxvayjHj+P/izq+s5wQVKlRQZGSk0/oHXI0gBQAAcJ05fTJJFotFvXr1cto5fP38tDcujjCFaxZBCgAA4DqTnnpGhmGo37jpql6/ocP7Tzi0X++NHKjk5GSCFK5ZBCkAAIDism7dhXuk7Jjmtydpg/UeKUdM8wuPrmFdWh2AOQQpAACA4tKr11V/j9TFvLfpcYd8jxSAomP5cwAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATOJ7pAAAAOwUHx+v5OTkq25fPydHXpKyc3K0+9dfL9s2Li6uiKMD4EwEKQAAADvEx8erdp06ykhPv+qao5IqSzpx4oSaNGlyVTXZWdn2DRCAUxGkAAAA7JCcnKyM9HQNnjxbEdVrXVXNvH/8/PoV2u5Yv0qLpr2h3Nxc676ZXfeYHygApyBIAQAAFEFE9VqKrhfj8H4TDu13eJ8AHIfFJgAAAADAJIIUAAAAAJjE1D4AAIBiUn/Gm/I6m6rsgEDtfvIF0/X/3fWmMrJT5esVqO4NzNcDcByCFAAAQDGp8cV8+SUlKD00wq4g9f3B+UrJSFCwbwRBCnAxpvYBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1wapPLy8vTKK68oOjpavr6+ql69usaNGyfDMKxtDMPQq6++qvDwcPn6+qp9+/Y6cOCATT+nTp1Sz549FRgYqLJly2rAgAFKS0sr7qcDAAAA4Drh0iA1ceJEvf/++5o5c6bi4uI0ceJETZo0STNmzLC2mTRpkqZPn65Zs2Zpy5Yt8vf3V4cOHZSZmWlt07NnT/3+++9atWqVlixZoh9++EEDBw50xVMCAAAAcB1w6Rfybtq0Sffcc486d+4sSapatar+85//6Oeff5ZUcDVq2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjh+Li4rR8+XJt3bpVTZs2lSTNmDFDnTp10uTJkxUREeGaJwcAAPAvJ25qIe+Uv5UVXN6u+johLXQ2628FeNtXD8BxXBqkbrnlFs2ePVv79+9XrVq1tGPHDv3444+aOnWqJOnw4cNKTExU+/btrTVBQUFq3ry5Nm/erB49emjz5s0qW7asNURJUvv27eXm5qYtW7aoa9euhc6blZWlrKws63ZqaqoTnyUAAECBTVM+LFL9kBZFqwfgOC4NUi+88IJSU1NVu3Ztubu7Ky8vT2+88YZ69uwpSUpMTJQkhYaG2tSFhoZajyUmJiokJMTmuIeHh8qVK2dt828TJkzQmDFjHP10AAAAAFwnXHqP1BdffKHPPvtMCxYs0K+//qp58+Zp8uTJmjdvnlPPO2rUKJ05c8b6OHr0qFPPBwClwboj62QZY9HpzNOSpLnb56rsm2VdOiYAAEoqlwapZ599Vi+88IJ69OihBg0a6JFHHtHw4cM1YcIESVJYWJgkKSkpyaYuKSnJeiwsLEwnTpywOZ6bm6tTp05Z2/ybt7e3AgMDbR4AUJL1/aavLGMsemLJE4WODVk6RJYxFvX9pq9Dz/lgvQe1/8n9Du3zalWdVlWWMRabx5s/vmk9vi95n26bd5tCJ4fK53UfVXunml5e+7Jy8nIu22/8mXh1XtBZfm/4KeStED278lnl5ufatFl3ZJ0af9BY3q97q8b0Gpq7fa4zniIAoJRz6dS+9PR0ubnZZjl3d3fl5+dLkqKjoxUWFqY1a9YoJiZGUsH9TFu2bNGgQYMkSbGxsTp9+rS2bdumJk2aSJLWrl2r/Px8NW/evPieDAA4WZXAKlq4e6He7vC2fD19JUmZuZlasHuBIoMiHX4+X09f63lcYWybsXqsyWPW7QCvAOvPnu6e6t2wtxqHN1ZZn7LakbRDj/3vMeUb+RrfbvxF+8vLz1PnBZ0VViZMmwZs0vGzx9X7m97ydPe01hxOOazOCzrriSZP6LNun2nN4TV6dPGjCi8Trg41Ojj3CeO60LZ3F/kkn1RmhYpaO/9/puvfWNNFZzJPKsinol5qZ74egOO4NEh16dJFb7zxhiIjI1WvXj399ttvmjp1qvr37y9JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XklSnTh117NhRjz32mGbNmqWcnBwNHTpUPXr0YMU+ANeUxuGNdSjlkL6K+0o9GxbcS/pV3FeKDIpUdNlom7b5Rr4m/jhRs3+drcS0RNUqX0u
v3PqK7qt7n7XNdwe+07Dlw3Q09ahurnyz+tzYx6aPudvnatjyYTr9wmlJ0qFThzRi5Qj99NdPOpd9TnUq1tGEdhPUvtqFBYGqTquqgU0G6uCpg1q0Z5GCfYL18q0va2AT819JEeAdoLAyF59ZUC24mqoFV7NuR5WN0roj67QhfsMl+1t5aKX2nNyj1Y+sVmiZUMWExWjcbeP0/OrnNbrNaHm5e2nWL7MUXTZaUzpMkSTVqVhHP8b/qLd/epsgBYcIPHxIfkkJSj9r30JXx1MPKSUjQenZLJQFuJpLp/bNmDFD9913nwYPHqw6depo5MiRevzxxzVu3Dhrm+eee05PPvmkBg4cqGbNmiktLU3Lly+Xj4+Ptc1nn32m2rVrq127durUqZNatmyp2bNnu+IpAYBT9Y/prznb51i3P/7tY/WL6Veo3YQNEzR/53zN6jxLvw/+XcNvHq5eX/XS+iPrJUlHzxxVt8+7qUutLtr++HY92uhRvbD6hcueOy07TZ1qdNKa3mv02+O/qWP1juryny6KPxNv027K5ilqGtFUvz3+mwY3G6xBSwdpX/I+6/E2c9tc1TTEN398U+UnlVejDxrprY1vFZqC908HTx3U8oPL1Tqq9SXbbP5rsxqENFBomQsLGHWo3kGpWan6/cTv1jb/DIbn22z+a/MVxwsAuL649IpUQECApk2bpmnTpl2yjcVi0dixYzV27NhLtilXrpwWLFjghBECQMnSq2EvjVozSn+e/lOStPHoRi28b6HWHVlnbZOVm6XxP47X6kdWK7ZKrKSCKzg/xv+oD7Z9oNZVW+v9X95X9XLVrVdebqhwg3ad2KWJGyde8tw3ht2oG8NutG6PaztOX+/9Wov3LdbQm4Za93eq2UmDmw2WJD3f4nm9/dPb+v7I97qhwg2SpMigSIWXCb/s83yq+VNqHN5Y5XzLadPRTRq1ZpSOpx3X1A5Tbdrd8tEt+vX4r8rKy9LAxgM19rZL/7ciMS3RJkRJsm4npiVeaONfuE1qVqoycjJcOtURAFCyuDRIAQDMqehfUZ1rddbc7XNlyFDnmp1Vwa+CTZuDpw4qPSddt39yu83+7LxsNQpvJEmKS45T80q295HGVo697LnTstM0et1oLT2wVMfPHldufq4ycjMKXZFqGNLQ+rPFYlFYmTCdOHdhUaD5Xedf8XmOiB1xob/QhvJy99LjSx7XhHYT5O3hbT32+X2f62z2We1I3KFnVz2ryZsm67kWz12xfwAAioogBQClTP+Y/hq6rOAK0Lud3i10PC07TZK09OGlqhRYyeaYt7t3ofZXa+TKkVr1xypNvn2yapSrIV9PX933xX3Kzsu2aefp7mmzbZFF+Ua+3eeVpOaVmis3P1dHTh+xXtmSpCpBVSRJdSvWVZ6Rp4H/G6hnYp+Ru5t7oT7CyoTp52M/2+xLSkuyHjv/Z9K5pEJtAr0DuRoFALBBkAKAUqZjjY7KzsuWRRZ1qF54AYS6FevK291b8Wfi1brqxe8ZqlOhjhbvW2yz76e/frrseTce3ai+N/ZV1zpdJRUEtiOnj9j3JEzanrhdbhY3hfiHXLJNvpGvnPwc5Rv5clfhIBVbOVZvbHhDJ86dsPaz6o9VCvQOVN2Kda1tvjv4nU3dqj9WXfFqHQDg+kOQAoBSxt3NXXFD4qw//1uAd4BG3jJSw1cMV76Rr5aRLXUm64w2xm9UoHeg+sT00RNNn9CUzVP07Mpn9WjjR7Xt+DbN3TH3suetWa6mvtr7lbrc0EUWWfTK96/YdaWp99e9VSmgkia0n3DR45uPbtaWY1t0W9XbFOAdoM1HN2v4iuHq1bCXgn2DJUmf7fxMnu6eahDSQN4e3vol4ReNWjNKD9Z70HpF7Ou4rzVqzSjtHbpXknRH9TtUt2JdPfL1I5rUfpIS0xL18tqXNaTZEOt0wSeaPqGZW2fquVXPqX+j/lp7eK2++P0LLX14qennCQC4thGkAKAUCvS+/BeJj7ttnCr6VdSEHyfoj5Q/VNanrBqHN9aLrV6UVLDgw38f+K+GrxiuGT/P0E2VbtL4tuPVf3H/S/Y5tcNU9f+2v2756BZV8Kug51s8r9Qs80swx5+Jl5vl0ovGent4a+HuhRq9brSy8rIUXTZaw28ebnPflIebhyZunKj9f++XYRiKKhuloc2GanjscGubM1lntO/vC6sFuru5a8lDSzRo6SDFfhQrfy9/9bmxj80CFdHB0Vr68FINXzFc72x5R5UDK+v/7v4/lj4HABRiMQzDcPUgXC01NVVBQUE6c+aMAgMv/8sJAACAJP36669q0qSJXv96naLrxVxVzb2t6hZ8j1RohL7ZsOeybTcu/kLvjRyoFz/9TvVuukWSNPTrukrJSFCwb4Rmdr18vdm+Henw79v1ctc22rZtmxo3buzw/gFnutpswBUpAACAYrJryHPyTE9Tjl8Zu+q7NXhOmTlp8vG0rx6A4xCkAAAAismhHn2LVN+2RtHqATjOpSepAwAAAAAuiiAFAAAAACYxtQ8AAKCY+JxIlCU/T4abuzJDwkzXp2QkKt/Ik5vFXcG+5usBOA5BCgAAoJh07N72qlftu5hXlrd1yKp9AIqOqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGCSh6sHAAAAcL1YM+9bueXlKt/dvl/BXmz3rfLzc+Xmxq9wgKvxrxAAAKCYnK1Ws0j1EYFFqwfgOEztAwAAAACTCFIAAAAAYBJT+wAAAIpJ1P8WySMjQ7m+vvqzy/2m6zceWaTs3Ax5efiqRVXz9QAchyAFAABQTBpNek1+SQlKD42wK0j957fXlJKRoGDfCIIU4GJM7QMAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYxBfyAgAAFJOMiiE2f5pV1jfE5k8ArkOQAgAAKCYrvlpXpPrXOxatHoDjEKQAAMA1LT4+XsnJyQ7vNy4uzuF9Aig9CFIAAOCaFR8fr9p16igjPd1p58jOynZa3wBKLoIUAAC4ZiUnJysjPV2DJ89WRPVaDu17x/pVWjTtDeXm5jq0XwClA0EKAABc8yKq11J0vRiH9plwaL/pmmavDJP3mRRlBQVr67hppus/+nmY0rJSVMY7WANuMl8PwHEIUgAAAMWk0rqV8ktKUHpohLbaUf/bsZVKyUhQsG+Ew8cGwBy+RwoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEl/ICwAAUEyO3NVdXmdOKzuorF31t1TtrnPZp+XvZV89AMchSAEAABST7c+PK1L9w42KVg/AcZjaBwAAAAAmEaQAAAAAwCSCFAAAAACYxD1SAAAAxaRzh2byO5Go9JAwLV2x1XT9yCXNlJKeqGC/ME2+y3w9AMfhihQAAEAx8Uw/J89zZ+WZfs6u+sycc8rMPavMHPvqAT
gOQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHC9+HnsVLlnZirPx8eu+v43TVVOXqY83e2rB+A4BCkAAIBiknBbxyLVN65UtHoAjsPUPgAAAAAwiSAFAAAAACYxtQ8AAKCYBO/eLvecbOV5eimlfozp+sOntis3L1se7l6KLme+HoDjEKQAAACKSetBD8svKUHpoRH6ZsMe0/VT1j+slIwEBftGaGZX8/UAHIcgBQAAAKeIi4tzSr8VKlRQZGSkU/oGrhZBCgAAAA51+mSSLBaLevXq5ZT+ff38tDcujjAFlyJIAQAAwKHSU8/IMAz1Gzdd1es3dGjfCYf2672RA5WcnEyQgksRpAAAAOAU4dE1FF0vxtXDAJyC5c8BAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAExi1T4AAIBismTZFkmGJItd9W/dtUWGDFnsrAfgOAQpAACAYpJbJqBI9b6eRasH4DhM7QMAAAAAkwhSAAAAAGCSy4PUsWPH1KtXL5UvX16+vr5q0KCBfvnlF+txwzD06quvKjw8XL6+vmrfvr0OHDhg08epU6fUs2dPBQYGqmzZshowYIDS0tKK+6kAAABcVu2PZ6rB9Amq/fFMu+q/i5up/+6coO/i7KsH4DguDVIpKSlq0aKFPD09tWzZMu3Zs0dTpkxRcHCwtc2kSZM0ffp0zZo1S1u2bJG/v786dOigzMxMa5uePXvq999/16pVq7RkyRL98MMPGjhwoCueEgAAwCXVnvOeGsycqNpz3rOr/ru97+mr3RP13V776gE4jksXm5g4caKqVKmiOXPmWPdFR0dbfzYMQ9OmTdPLL7+se+65R5I0f/58hYaG6ptvvlGPHj0UFxen5cuXa+vWrWratKkkacaMGerUqZMmT56siIiI4n1SAAAAAK55Lr0itXjxYjVt2lT333+/QkJC1KhRI3344YfW44cPH1ZiYqLat29v3RcUFKTmzZtr8+bNkqTNmzerbNmy1hAlSe3bt5ebm5u2bNly0fNmZWUpNTXV5gEAAAAAV8ulQeqPP/7Q+++/r5o1a2rFihUaNGiQnnrqKc2bN0+SlJiYKEkKDQ21qQsNDbUeS0xMVEhIiM1xDw8PlStXztrm3yZMmKCgoCDro0qVKo5+agAAAACuYS4NUvn5+WrcuLHGjx+vRo0aaeDAgXrsscc0a9Ysp5531KhROnPmjPVx9OhRp54PAAAAwLXFpUEqPDxcdevWtdlXp04dxcfHS5LCwsIkSUlJSTZtkpKSrMfCwsJ04sQJm+O5ubk6deqUtc2/eXt7KzAw0OYBAAAAAFfLpUGqRYsW2rdvn82+/fv3KyoqSlLBwhNhYWFas2aN9Xhqaqq2bNmi2NhYSVJsbKxOnz6tbdu2WdusXbtW+fn5at68eTE8CwAAAADXG5eu2jd8+HDdcsstGj9+vB544AH9/PPPmj17tmbPni1JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XUsEVrI4dO1qnBObk5Gjo0KHq0aMHK/YBAAAAcAqXBqlmzZrp66+/1qhRozR27FhFR0dr2rRp6tmzp7XNc889p3PnzmngwIE6ffq0WrZsqeXLl8vHx8fa5rPPPtPQoUPVrl07ubm5qXv37po+fbornhIAAACA64BLg5Qk3XXXXbrrrrsuedxisWjs2LEaO3bsJduUK1dOCxYscMbwAAAAHOZUvYZKD6+kzHLl7aqPLtdQ5TMrKdDHvnoAjuPyIAUAAHC9+GHWwiLVP9O6aPUAHMeli00AAAAAQGlEkAIAAAAAkwhSAAAAAGAS90gBAAAUk1uf6CGfU38rs1x5u+6XmrK+h1Iz/1agT3nulwJcjCAFAABQTMr9vlN+SQlKD7Xvuy4Pn9qplIwEBfvyXZmAqzG1DwAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQTPb2GyzPtLPKKRNgV32n2oOVkXNWvp721QNwHLuC1B9//KFq1ao5eiwAAADXtL39hxapvlOdotUDcBy7pvbVqFFDt912mz799FNlZmY6ekwAAAAAUKLZFaR+/fVXNWzYUCNGjFBYWJgef/xx/fzzz44eGwAAAACUSHYFqZiYGL3zzjtKSEjQxx9/rOPHj6tly5aqX7++pk6dqpMnTzp6nAAAAKWeR9pZeaSlyiPtrF31GTlnlZ6Tqowc++oBOE6RVu3z8PBQt27dtGjRIk2cOFEHDx7UyJEjVaVKFfXu3VvHjx931DgBAABKvbvubK4HGkfqrjub21X/7JLmemxRpJ5dYl89AMcpUpD65ZdfNHjwYIWHh2vq1KkaOXKkDh06pFWrVikhIUH33HOPo8YJAAAAACWGXav2TZ06VXPmzNG+ffvUqVMnzZ8/X506dZKbW0Eui46O1ty5c1W1alVHjhUAAAAASgS7gtT777+v/v37q2/fvgoPD79om5CQEH300UdFGhwAAAAAlER2BakDBw5csY2Xl5f69OljT/cAAAAAUKLZdY/UnDlztGjRokL7Fy1apHnz5hV5UAAAAABQktkVpCZMmKAKFSoU2h8SEqLx48cXeVAAAAAAUJLZFaTi4+MVHR1daH9UVJTi4+OLPCgAAAAAKMnsClIhISHauXNnof07duxQ+fLlizwoAAAAACjJ7Fps4qGHHtJTTz2lgIAA3XrrrZKk9evX6+mnn1aPHj0cOkAAAOB68fHxSk5OdkrfFSpUUGRkpFP6BgBnsStIjRs3TkeOHFG7du3k4VHQRX5+vnr37s09UgAAXGPi4+NVu04dZaSnO6V/Xz8/7Y2Luy7C1Pr3F8g9J1t5nl521T/TeoFy87Ll4W5fPQDHsStIeXl56fPPP9e4ceO0Y8cO+fr6qkGDBoqKinL0+AAAgIslJycrIz1dgyfPVkT1Wg7tO+HQfr03cqCSk5OviyCVUj+mSPXR5YpWD8Bx7ApS59WqVUu1ajn2AxUAAJRMEdVrKbpejKuHAQAlgl1BKi8vT3PnztWaNWt04sQJ5efn2xxfu3atQwYHAAAAACWRXUHq6aef1ty5c9W5c2fVr19fFovF0eMCAAC45kR8v1zumZnK8/FRwm0dTdf/emy5cvIy5enuo8aVzNcDcBy7gtTChQv1xRdfqFOnTo4eDwAAwDXrpldHyC8pQemhEfpmg/kg9PHPI5SSkaBg3wg17kqQAlzJru+R8vLyUo0aNRw9FgAAAAAoFewKUs8884zeeecdGYbh6PEAAAAAQIln19S+H3/8Ud9//72WLVumevXqydPT0+b4V1995ZDBAQAAAEBJZFeQKlu2rLp27erosQAAAABAqWBXkJozZ46jxwEAAAAApYZd90hJUm5urlavXq0PPvhAZ8+elSQlJCQoLS3NYYMDAAAAgJLIritSf/75pzp27Kj4+HhlZWXp9ttvV0BAgCZOnKisrCzNm
jXL0eMEAAAAgBLDritSTz/9tJo2baqUlBT5+vpa93ft2lVr1qxx2OAAAAAAoCSy64rUhg0btGnTJnl5ednsr1q1qo4dO+aQgQEAAFxrcvz8leMfoBw/f7vqfTz95ZMTIB9P++oBOI5dQSo/P195eXmF9v/1118KCAgo8qAAAACuRUtXbC1S/eS7ilYPwHHsmtp3xx13aNq0adZti8WitLQ0vfbaa+rUqZOjxgYAAAAAJZJdV6SmTJmiDh06qG7dusrMzNTDDz+sAwcOqEKFCvrPf/7j6DECAAAAQIliV5CqXLmyduzYoYULF2rnzp1KS0vTgAED1LNnT5vFJwAAAADgWmRXkJIkDw8P9erVy5FjAQAAuKbFTHxFXmdOKzuorLY/P850/YLfXtG57NPy9yqrhxuZrwfgOHYFqfnz51/2eO/eve0aDAAAwLWs6pL/yi8pQemhEXYFqU1H/quUjAQF+0YQpAAXsytIPf300zbbOTk5Sk9Pl5eXl/z8/AhSAAAAAK5pdq3al5KSYvNIS0vTvn371LJlSxabAAAAAHDNsytIXUzNmjX15ptvFrpaBQAAAADXGocFKalgAYqEhARHdgkAAAAAJY5d90gtXrzYZtswDB0/flwzZ85UixYtHDIwAAAAACip7ApS9957r822xWJRxYoV1bZtW02ZMsUR4wIAAACAEsuuIJWfn+/ocQAAAABAqeHQe6QAAAAA4Hpg1xWpESNGXHXbqVOn2nMKAACAa86xNnfI+0yKsoKC7apvVOkOpWWlqIy3ffUAHMeuIPXbb7/pt99+U05Ojm644QZJ0v79++Xu7q7GjRtb21ksFseMEgAA4Bqwddy0ItUPuKlo9QAcx64g1aVLFwUEBGjevHkKDi74PyIpKSnq16+fWrVqpWeeecahgwQAAACAksSue6SmTJmiCRMmWEOUJAUHB+v1119n1T4AAAAA1zy7glRqaqpOnjxZaP/Jkyd19uzZIg8KAAAAAEoyu6b2de3aVf369dOUKVN00003SZK2bNmiZ599Vt26dXPoAAEAAK4VHbq1ke/JE8qoGKIVX60zXf/y8jY6nXFCZX1D9HpH8/UAHMeuIDVr1iyNHDlSDz/8sHJycgo68vDQgAED9NZbbzl0gAAAANcK35Mn5JeUYHf96YwTSsmwvx6A49gVpPz8/PTee+/prbfe0qFDhyRJ1atXl7+/v0MHBwAAAAAlUZG+kPf48eM6fvy4atasKX9/fxmG4ahxAQAAAECJZVeQ+vvvv9WuXTvVqlVLnTp10vHjxyVJAwYMYOlzAAAAANc8u4LU8OHD5enpqfj4ePn5+Vn3P/jgg1q+fLnDBgcAAAAAJZFd90itXLlSK1asUOXKlW3216xZU3/++adDBgYAAAAAJZVdV6TOnTtncyXqvFOnTsnb27vIgwIAAACAksyuINWqVSvNnz/fum2xWJSfn69Jkybptttuc9jgAAAAAKAksmtq36RJk9SuXTv98ssvys7O1nPPPafff/9dp06d0saNGx09RgAAAAAoUewKUvXr19f+/fs1c+ZMBQQEKC0tTd26ddOQIUMUHh7u6DECAABcE357bow8MjKU6+trV/1DjcYoOzdDXh721QNwHNNBKicnRx07dtSsWbP00ksvOWNMAAAA16Q/u9xfpPoWVYtWD8BxTN8j5enpqZ07dzpjLAAAAABQKti12ESvXr300UcfOXosAAAAAFAq2HWPVG5urj7++GOtXr1aTZo0kb+/v83xqVOnOmRwAAAA15KAPw7ILS9X+e4eOlutpun6hNQDys/PlZubhyICzdcDcBxTQeqPP/5Q1apVtXv3bjVu3FiStH//fps2FovFcaMDAAC4hrTrc4/8khKUHhqhbzbsMV0/fs09SslIULBvhGZ2NV8PwHFMBamaNWvq+PHj+v777yVJDz74oKZPn67Q0FCnDA4AAAAASiJT90gZhmGzvWzZMp07d86hAwIAAACAks6uxSbO+3ewAgAAAIDrgakgZbFYCt0DxT1RAAAAAK43pu6RMgxDffv2lbe3tyQpMzNTTzzxRKFV+7766ivHjRAAAAAAShhTV6T69OmjkJAQBQUFKSgoSL169VJERIR1+/zDHm+++aYsFouGDRtm3ZeZmakhQ4aofPnyKlOmjLp3766kpCSbuvj4eHXu3Fl+fn4KCQnRs88+q9zcXLvGAAAAAABXw9QVqTlz5jhlEFu3btUHH3yghg0b2uwfPny4li5dqkWLFikoKEhDhw5Vt27dtHHjRklSXl6eOnfurLCwMG3atEnHjx9X79695enpqfHjxztlrAAAAABQpMUmHCEtLU09e/bUhx9+qODgYOv+M2fO6KOPPtLUqVPVtm1bNWnSRHPmzNGmTZv0008/SZJWrlypPXv26NNPP1VMTIzuvPNOjRs3Tu+++66ys7Nd9ZQAAAAAXONcHqSGDBmizp07q3379jb7t23bppycHJv9tWvXVmRkpDZv3ixJ2rx5sxo0aGDzPVYdOnRQamqqfv/990ueMysrS6mpqTYPAAAAALhapqb2OdrChQv166+/auvWrYWOJSYmysvLS2XLlrXZHxoaqsTERGubf38Z8Pnt820uZsKECRozZkwRRw8AAGDO8v+ulSU/T4abu1314zquVb6RJzeLffUAHMdlQero0aN6+umntWrVKvn4+BTruUeNGqURI0ZYt1NTU1WlSpViHQMAALj+ZIaEFak+2Ldo9QAcx2VT+7Zt26YTJ06ocePG8vDwkIeHh9avX6/p06fLw8NDoaGhys7O1unTp23qkpKSFBZW8CESFhZWaBW/89vn21yMt7e3AgMDbR4AAAAAcLVcFqTatWunXbt2afv27dZH06ZN1bNnT+vPnp6eWrNmjbVm3759io+PV2xsrCQpNjZWu3bt0okTJ6xtVq1apcDAQNWtW7fYnxMAAACA64PLpvYFBASofv36Nvv8/f1Vvnx56/4BAwZoxIgRKleunAIDA/Xkk08qNjZWN998syTpjjvuUN26dfXII49o0qRJSkxM1Msvv6whQ4ZYvzQYAACgpKi+cK4809OU41dGh3r0NV2/9uBcZeakycezjNrWMF8PwHFcutjElbz99ttyc3NT9+7dlZWVpQ4dOui9996zHnd3d9eSJUs0aNAgxcbGyt/fX3369NHYsWNdOGoAAICLa/DuJPklJSg9NMKuIPXVrklKyUhQsG/EdR+k4uLinNJvhQoVFBkZ6ZS+cW0pUUFq3bp1Nts+Pj5699139e67716yJioqSt99952TRwYAAICS4PTJJFksFvXq1csp/fv6+WlvXBxhCldUooIUAAAAcDnpqWdkGIb6jZuu6vUbOrTvhEP79d7IgUpOTiZI4YoIUgAAACh1wqNrKLpejKuHgeuYy1btAwAAAIDSiiAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1i1DwAAoJikRldXdkCgMitUtKs+PLC6/LwCFeRjXz0AxyFIAQAAFJO18/9XpPqX2hWtHoDjMLUPAAAAAEwiSAEAAACASQQpAAAA
ADCJe6QAAACKyS3PPCbvlL+VFVxem6Z8aLr+3Y2P6WzW3wrwLq8hLczXA3AcghQAAEAxCfl5o/ySEpQeGmFXfdyJjUrJSFCwr331AByHqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk/hCXgAArgHx8fFKTk52St9xcXFO6fd6dPCB3vI6m6rsgEC76m+r0VsZ2any9bKvHoDjEKQAACjl4uPjVbtOHWWkpzv1PNlZ2U7t/3qw+8kXilTfvUHR6gE4DkEKAIBSLjk5WRnp6Ro8ebYiqtdyeP871q/SomlvKDc31+F9A0BpRZACAOAaEVG9lqLrxTi834RD+x3eJwCUdiw2AQAAAAAmcUUKAACgmNzbqq78khKUHhqhbzbsMV0/9Ou6SslIULBvhGZ2NV8PwHG4IgUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACZ5uHoAAAAA14tNkz+Qe3a28ry87KoffMsHys3Lloe7ffUAHIcgBQAAUExONG9VpPq6oUWrB+A4TO0DAAAAAJMIUgAAAABgElP7AAAAiknIlg3We6Tsmea3J2mD9R4ppvkBrkWQAgAAKCa3jHxcfkkJSg+N0Dcb9piuf2/T40rJSFCwb4RmdjVfD8BxmNoHAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmebh6AAAAANeLbzbsKVL9zK5FqwfgOFyRAgAAAACTCFIAAAAAYBJBCgAAAABM4h4pAACAYlJ/xpvyOpuq7IBA7X7yBdP1/931pjKyU+XrFajuDczXA3AcghQAAEAxqfHFfPklJSg9NMKuIPX9wflKyUhQsG8EQQpwMab2AQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAEziC3kBAACKyYmbWsg75W9lBZe3q75OSAudzfpbAd721QNwHIIUAABAMdk05cMi1Q9pUbR6AI7D1D4AAAAAMIkgBQAAAAAmEaQAAAAAwCTukQIAACgmbXt3kU/ySWVWqKi18/9nuv6NNV10JvOkgnwq6qV25usBOA5BCgAAoJgEHj4kv6QEpZ9Ntav+eOohpWQkKD3bvnoAjsPUPgAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJfCEvAABAMdk15Dl5pqcpx6+MXfXdGjynzJw0+XjaVw/AcQhSAAAAxeRQj75Fqm9bo2j1AByHqX0AAAAAYBJBCgAAAABMcmmQmjBhgpo1a6aAgACFhITo3nvv1b59+2zaZGZmasiQISpfvrzKlCmj7t27KykpyaZNfHy8OnfuLD8/P4WEhOjZZ59Vbm5ucT4VAACAK/I5kSjfxGPyOZFoV31KRqL+Tj+mlAz76gE4jkvvkVq/fr2GDBmiZs2aKTc3Vy+++KLuuOMO7dmzR/7+/pKk4cOHa+nSpVq0aJGCgoI0dOhQdevWTRs3bpQk5eXlqXPnzgoLC9OmTZt0/Phx9e7dW56enho/frwrnx4AADbi4+OVnJzs8H7j4uIc3ieco2P3tvJLSlB6aIS+2bDHdP0ry9sqJSNBwb4RmtnVfD0Ax3FpkFq+fLnN9ty5cxUSEqJt27bp1ltv1ZkzZ/TRRx9pwYIFatu2rSRpzpw5qlOnjn766SfdfPPNWrlypfbs2aPVq1crNDRUMTExGjdunJ5//nmNHj1aXl5ehc6blZWlrKws63ZqaqpznygA4LoXHx+v2nXqKCM93WnnyM7KdlrfAABbJWrVvjNnzkiSypUrJ0natm2bcnJy1L59e2ub2rVrKzIyUps3b9bNN9+szZs3q0GDBgoNDbW26dChgwYNGqTff/9djRo1KnSeCRMmaMyYMU5+NgAAXJCcnKyM9HQNnjxbEdVrObTvHetXadG0N5jWDgDFqMQEqfz8fA0bNkwtWrRQ/fr1JUmJiYny8vJS2bJlbdqGhoYqMTHR2uafIer88fPHLmbUqFEaMWKEdTs1NVVVqlRx1FMBAOCSIqrXUnS9GIf2mXBov0P7AwBcWYkJUkOGDNHu3bv1448/Ov1c3t7e8vb2dvp5AAAAAFybSsTy50OHDtWSJUv0/fffq3Llytb9YWFhys7O1unTp23aJyUlKSwszNrm36v4nd8+3wYAAAAAHMmlQcowDA0dOlRff/211q5dq+joaJvjTZo0kaenp9asWWPdt2/fPsXHxys2NlaSFBsbq127dunEiRPWNqtWrVJgYKDq1q1bPE8EAAAAwHXFpVP7hgwZogULFujbb79VQECA9Z6moKAg+fr6KigoSAMGDNCIESNUrlw5BQYG6sknn1RsbKxuvvlmSdIdd9yhunXr6pFHHtGkSZOUmJiol19+WUOGDGH6HgAAAACncGmQev/99yVJbdq0sdk/Z84c9e3bV5L09ttvy83NTd27d1dWVpY6dOig9957z9rW3d1dS5Ys0aBBgxQbGyt/f3/16dNHY8eOLa6nAQAAAOA649IgZRjGFdv4+Pjo3Xff1bvvvnvJNlFRUfruu+8cOTQAAAAAuKQSs2ofAADAtW7NvG/llperfHf7fgV7sd23ys/PlZsbv8IBrsa/QgAAgGJytlrNItVHBBatHoDjlIjlzwEAAACgNCFIAQAAAIBJTO0DAAAoJlH/WySPjAzl+vrqzy73m67feGSRsnMz5OXhqxZVzdcDcByCFAAAQDFpNOk1+SUlKD00wq4g9Z/fXlNKRoKCfSMIUk4UFxfntL4rVKigyMhIp/WP4kOQAgAAACSdPpkki8WiXr16Oe0cvn5+2hsXR5i6BhCkAAAAAEnpqWdkGIb6jZuu6vUbOrz/hEP79d7IgUpOTiZIXQMIUgAAAMA/hEfXUHS9GFcPAyUcq/YBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJjEqn0AAADFJKNiiM2fZpX1DbH5E4DrEKQAAACKyYqv1hWp/vWORasH4DhM7QMAAAAAkwhSAAAAAGASQQoAAAAATOIeKQAAgGLS7JVh8j6ToqygYG0dN810/Uc/D1NaVorKeAdrwE3m6wE4DkEKAACgmFRat1J+SQlKD43QVjvqfzu2UikZCQr2jXD42ACYw9Q+AAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEl8IS8AAEAxOXJXd3mdOa3soLJ21d9StbvOZZ+Wv5d99QAchyAFAABQTLY/P65I9Q83Klo9AMdhah8AAAAAmESQAgAAAACTCFIAAAAAYBL3SAEAABSTzh2aye9EotJDwrR0xVbT9SOXNFNKeqKC/cI0+S7z9QAchyAFAMD
/Fx8fr+TkZKf0HRcX55R+Ubp4pp+T57mz8kwPsKs+M+ecMnPPKjPHvnoAjkOQAgBABSGqdp06ykhPd+p5srOyndo/AKB4EKQAAJCUnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BAMWPIAUAwD9EVK+l6HoxDu834dB+h/cJAHAdVu0DAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJVfsAAACKyc9jp8o9M1N5Pj521fe/aapy8jLl6W5fPQDHIUgBAAAUk4TbOhapvnGlotUDcBym9gEAAACASQQpAAAAADCJqX0AAADFJHj3drnnZCvP00sp9WNM1x8+tV25ednycPdSdDnz9QAchyAFAABQTFoPelh+SQlKD43QNxv2mK6fsv5hpWQkKNg3QjO7mq8H4DhM7QMAAAAAk7giBQAAABSjuLg4p/RboUIFRUZGOqVvFEaQAgAAAIrB6ZNJslgs6tWrl1P69/Xz0964OMJUMSFIAQAAAMUgPfWMDMNQv3HTVb1+Q4f2nXBov94bOVDJyckEqWJCkAIAAACKUXh0DUXXi3H1MFBELDYBAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1hsAgAAoJgsWbZFkiHJYlf9W3dtkSFDFjvrATgOQQoAAKCY5JYJKFK9r2fR6gE4DkEKAFCqxMfHKzk52eH9xsXFObxPAMC1iyAFACg14uPjVbtOHWWkpzvtHNlZ2U7rGwBw7SBIAQBKjeTkZGWkp2vw5NmKqF7LoX3vWL9Ki6a9odzcXIf2C/xT7Y9nyjPtrHLKBGhv/6Gm67+Lm6mMnLPy9QxQpzrm6wE4DkEKAFDqRFSvpeh6MQ7tM+HQfof2B1xM7TnvyS8pQemhEfYFqb3vKSUjQcG+EQQpwMVY/hwAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEl/ICwAAUExO1Wuo9PBKyixX3q766HINVT6zkgJ97KsH4DgEKQAAgGLyw6yFRap/pnXR6gE4DkEKAOBQ8fHxSk5OdkrfcXFxTukXAACzCFIAAIeJj49X7Tp1lJGe7tTzZGdlO7V/ACitnPk/nCpUqKDIyEin9V/aEKQAAA6TnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BoDQ7fTJJFotFvXr1cto5fP38tDcujjD1/xGkAAAOF1G9lqLrxTi834RD+x3eJ1Ccbn2ih3xO/a3McuXtul9qyvoeSs38W4E+5blfCjbSU8/IMAz1Gzdd1es3dHj/CYf2672RA5WcnEyQ+v8IUgAAAMWk3O875ZeUoPTQCLvqD5/aqZSMBAX72lePa194dA2n/I8sFMb3SAEAAACASVyRAoDrkLNW1mNVPQDA9YIgBeCynLmUtVS6VwBy5mvjzNelOFbWY1U9AMC1jiAF4JKK4xfu0roCkLNfG2e+Ls5cWY9V9QAA14trJki9++67euutt5SYmKgbb7xRM2bM0E033eTqYQGlmrOXsi7NKwA587U5/7ps2LBBderUcWjf0oXpd85YWY9V9QAA14trIkh9/vnnGjFihGbNmqXmzZtr2rRp6tChg/bt26eQkBBXDw8o9Zy1lPV5zrqvpjimDTrjtSmO7wKRmH4HADCvNP8329GuiSA1depUPfbYY+rXr58kadasWVq6dKk+/vhjvfDCCy4enXnOvO8iKytL3t7eTulbcv59HaX1Xh1nj91Zf6/OXjjA2YGhtE4bdPZ3gTD9DgBgFv/NLqzUB6ns7Gxt27ZNo0aNsu5zc3NT+/bttXnz5ovWZGVlKSsry7p95swZSVJqaqpzB3sVjh49qqbNmikzI8NJZ7BIMpzUt+Tt46NP5s9XaGioQ/tNSkpS7969lZmZ6dB+/6k0j93Zf68Htv+izPRzDu/34G9bZRiGbn/kcYVHVXNo338n/qWl/zdDK1as0A033ODQviVp3759kqQjv+9w+GtzfnpcdmaGU173nP//+fdn3C65WRz7vjk/dmf07ez+Gbtr+k88fFCStG3bNqWlpTm0b6l4/q2aeV1uzc5UrqTU7EzFbd1ouv/ctEwpR8rNu3K9o8deUvovrX07u39nj704/pt95MgRlS1b1qF92+N8JjCMy7+OFuNKLUq4hIQEVapUSZs2bVJsbKx1/3PPPaf169dry5YthWpGjx6tMWPGFOcwAQAAAJQiR48eVeXKlS95vNRfkbLHqFGjNGLECOt2fn6+Tp06pfLly8tisbhwZNee1NRUValSRUePHlVgYKCrhwMX4r0AifcBCvA+wHm8FyCVvPeBYRg6e/asIiIiLtuu1AepChUqyN3dXUlJSTb7k5KSFBYWdtEab2/vQveTlITLiNeywMDAEvEPA67HewES7wMU4H2A83gvQCpZ74OgoKArtnErhnE4lZeXl5o0aaI1a9ZY9+Xn52vNmjU2U/0AAAAAwFFK/RUpSRoxYoT69Omjpk2b6qabbtK0adN07tw56yp+AAAAAOBI10SQevDBB3Xy5Em9+uqrSkxMVExMjJYvX+7w1ddgnre3t1577TWnLrmO0oH3AiTeByjA+wDn8V6AVHrfB6V+1T4AAAAAKG6l/h4pAAAAAChuBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFIrs/fffV8OGDa1fohYbG6tly5ZZj7dp00YWi8Xm8cQTT7hwxCgOb775piwWi4YNG2bdl5mZqSFDhqh8+fIqU6aMunfvXujLtHHtudh7gc+F68Po0aML/T3Xrl3bepzPhOvDld4HfB5cX44dO6ZevXqpfPny8vX1VYMGDfTLL79YjxuGoVdffVXh4eHy9fVV+/btdeDAAReO+NKuieXP4VqVK1fWm2++qZo1a8owDM2bN0/33HOPfvvtN9WrV0+S9Nhjj2ns2LHWGj8/P1cNF8Vg69at+uCDD9SwYUOb/cOHD9fSpUu1aNEiBQUFaejQoerWrZs2btzoopHC2S71XpD4XLhe1KtXT6tXr7Zue3hc+NWDz4Trx+XeBxKfB9eLlJQUtWjRQrfddpuWLVumihUr6sCBAwoODra2mTRpkqZPn6558+YpOjpar7zyijp06KA9e/bIx8fHhaMvjCCFIuvSpYvN9htvvKH3339fP/30kzVI+fn5KSwszBXDQzFLS0tTz5499eGHH+r111+37j9z5ow++ugjLViwQG3btpUkzZkzR3Xq1NFPP/2km2++2VVDhpNc6r1wHp8L1wcPD4+L/j3zmXB9udT74Dw+D64PEydOVJUqVTRnzhzrvujoaOvPhmFo2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjR7GP+XKY2geHysvL08KFC3Xu3DnFxsZa93/22WeqUKGC6tevr1GjRik9Pd2Fo4QzDRkyRJ07d1b79u1t9m/btk05OTk2+2vXrq3IyEht3ry5uIeJYnCp98J5fC5cHw4cOKCIiAhVq1ZNPXv2VHx8vCQ+E643l3ofnMfnwfVh8eLFat
q0qe6//36FhISoUaNG+vDDD63HDx8+rMTERJvPhaCgIDVv3rxEfi5wRQoOsWvXLsXGxiozM1NlypTR119/rbp160qSHn74YUVFRSkiIkI7d+7U888/r3379umrr75y8ajhaAsXLtSvv/6qrVu3FjqWmJgoLy8vlS1b1mZ/aGioEhMTi2mEKC6Xey9IfC5cL5o3b665c+fqhhtu0PHjxzVmzBi1atVKu3fv5jPhOnK590FAQACfB9eRP/74Q++//75GjBihF198UVu3btVTTz0lLy8v9enTx/pvPzQ01KaupH4uEKTgEDfccIO2b9+uM2fO6Msvv1SfPn20fv161a1bVwMHDrS2a9CggcLDw9WuXTsdOnRI1atXd+Go4UhHjx7V008/rVWrVpW4OcwoXlfzXuBz4fpw5513Wn9u2LChmjdvrqioKH3xxRfy9fV14chQnC73PhgwYACfB9eR/Px8NW3aVOPHj5ckNWrUSLt379asWbPUp08fF4/OPKb2wSG8vLxUo0YNNWnSRBMmTNCNN96od95556JtmzdvLkk6ePBgcQ4RTrZt2zadOHFCjRs3loeHhzw8PLR+/XpNnz5dHh4eCg0NVXZ2tk6fPm1Tl5SUxLz4a8yV3gt5eXmFavhcuD6ULVtWtWrV0sGDBxUWFsZnwnXqn++Di+Hz4NoVHh5unbF0Xp06daxTPc//2//36p0l9XOBIAWnyM/PV1ZW1kWPbd++XVLBPyZcO9q1a6ddu3Zp+/bt1kfTpk3Vs2dP68+enp5as2aNtWbfvn2Kj4+3uZ8Opd+V3gvu7u6FavhcuD6kpaXp0KFDCg8PV5MmTfhMuE79831wMXweXLtatGihffv22ezbv3+/oqKiJBUsPBEWFmbzuZCamqotW7aUyM8FpvahyEaNGqU777xTkZGROnv2rBYsWKB169ZpxYoVOnTokBYsWKBOnTqpfPny2rlzp4YPH65bb731ossho/QKCAhQ/fr1bfb5+/urfPny1v0DBgzQiBEjVK5cOQUGBurJJ59UbGwsq3NdY670XuBz4foxcuRIdenSRVFRUUpISNBrr70md3d3PfTQQwoKCuIz4TpxufcBnwfXl+HDh+uWW27R+PHj9cADD+jnn3/W7NmzNXv2bEmyfufg66+/rpo1a1qXP4+IiNC9997r2sFfBEEKRXbixAn17t1bx48fV1BQkBo2bKgVK1bo9ttv19GjR7V69WpNmzZN586dU5UqVdS9e3e9/PLLrh42XODtt9+Wm5ubunfvrqysLHXo0EHvvfeeq4eFYubl5cXnwnXir7/+0kMPPaS///5bFStWVMuWLfXTTz+pYsWKkvhMuF5c7n2QmZnJ58F1pFmzZvr66681atQojR07VtHR0Zo2bZp69uxpbfPcc8/p3LlzGjhwoE6fPq2WLVtq+fLlJfL+a4thGIarBwEAAAAApQn3SAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAEqUI0eOyGKxaPv27a4eSonRpk0bDRs2zNXDAAD8A0EKAOBwFovlso/Ro0e7eoiFlISwsm7dOlksFp0+fdql4wAAXJmHqwcAALj2HD9+3Prz559/rldffVX79u2z7itTpowrhgUAgMNwRQoA4HBhYWHWR1BQkCwWi3U7JCREU6dOVeXKleXt7a2YmBgtX778kn3l5eWpf//+ql27tuLj4yVJ3377rRo3biwfHx9Vq1ZNY8aMUW5urrXGYrHo//7v/9S1a1f5+fmpZs2aWrx4cZGe048//qhWrVrJ19dXVapU0VNPPaVz585Zj1etWlXjx49X//79FRAQoMjISM2ePdumj02bNikmJkY+Pj5q2rSpvvnmG+s0xiNHjui2226TJAUHB8tisahv377W2vz8fD333HMqV66cwsLCSuRVPQC4nhCkAADF6p133tGUKVM0efJk7dy5Ux06dNDdd9+tAwcOFGqblZWl+++/X9u3b9eGDRsUGRmpDRs2qHfv3nr66ae1Z88effDBB5o7d67eeOMNm9oxY8bogQce0M6dO9WpUyf17NlTp06dsmvMhw4dUseOHdW9e3ft3LlTn3/+uX788UcNHTrUpt2UKVPUtGlT/fbbbxo8eLAGDRpkvRKXmpqqLl26qEGDBvr11181btw4Pf/889baKlWq6L///a8kad++fTp+/Ljeeecd6/F58+bJ399fW7Zs0aRJkzR27FitWrXKrucDAHAAAwAAJ5ozZ44RFBRk3Y6IiDDeeOMNmzbNmjUzBg8ebBiGYRw+fNiQZGzYsMFo166d0bJlS+P06dPWtu3atTPGjx9vU//JJ58Y4eHh1m1Jxssvv2zdTktLMyQZy5Ytu+Q4W7dubTz99NMXPTZgwABj4MCBNvs2bNhguLm5GRkZGYZhGEZUVJTRq1cv6/H8/HwjJCTEeP/99w3DMIz333/fKF++vLW9YRjGhx9+aEgyfvvtN8MwDOP77783JBkpKSmFxtayZUubfc2aNTOef/75Sz4fAIBzcY8UAKDYpKamKiEhQS1atLDZ36JFC+3YscNm30MPPaTKlStr7dq18vX1te7fsWOHNm7caHMFKi8vT5mZmUpPT5efn58kqWHDhtbj/v7+CgwM1IkTJ+wa944dO7Rz50599tln1n2GYSg/P1+HDx9WnTp1Cp3z/HTG8+fct2+fGjZsKB8fH2ubm2666arH8M++JSk8PNzu5wMAKDqCFACgROrUqZM+/fRTbd68WW3btrXuT0tL05gxY9StW7dCNf8MKZ6enjbHLBaL8vPz7RpLWlqaHn/8cT311FOFjkVGRjrlnP/mzL4BAOYRpAAAxSYwMFARERHauHGjWrdubd2/cePGQldnBg0apPr16+vuu+/W0qVLre0bN26sffv2qUaNGsU27saNG2vPnj1FOucNN9ygTz/9VFlZWfL29pYkbd261aaNl5eXpIIrbACAko0gBQAoVs8++6xee+01Va9eXTExMZozZ462b99uM23uvCeffFJ5eXm66667tGzZMrVs2VKvvvqq7rrrLkVGRuq+++6Tm5ubduzYod27d+v1118v0thOnjxZ6IuAw8PD9fzzz+vmm2/W0KFD9eijj8rf31979uzRqlWrNHPmzKvq++GHH9ZLL72kgQMH6oUXXlB8fLwmT54sqeDqkiRFRUXJYrFoyZIl6tSpk3x9fVkqHgBKKFbtAwAUq6eeekojRozQM888owYNGmj58uVavHixatasedH2w4YN05gxY9SpUydt2rRJHTp00JIlS7Ry5Uo1a9ZMN998s95++21FRUUVeWwLFixQo0aNbB4ffvihGjZsqPXr12v//v1q1aqVGjVqpFdffVURERFX3XdgYKD+97//afv27YqJidFLL72kV199VdKFKYmVKlXSmDFj9MILLyg0NLTQqoAAgJLDYhiG4epBAABwPfrss8/Ur18/nTlzxmZBDQBAycfUPgAAisn8+fNVrVo1VapUSTt27NDzzz+vBx54gBAFA
KUQQQoAgGKSmJioV199VYmJiQoPD9f9999f6IuEAQClA1P7AAAAAMAkFpsAAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmPT/ALFDWFiHUDxIAAAAAElFTkSuQmCC",
|
458 |
+
"text/plain": [
|
459 |
+
"<Figure size 1000x600 with 1 Axes>"
|
460 |
+
]
|
461 |
+
},
|
462 |
+
"metadata": {},
|
463 |
+
"output_type": "display_data"
|
464 |
+
}
|
465 |
+
],
|
466 |
+
"source": [
|
467 |
+
"#统计图\n",
|
468 |
+
"import matplotlib.pyplot as plt\n",
|
469 |
+
"import seaborn as sns\n",
|
470 |
+
"import numpy as np\n",
|
471 |
+
"\n",
|
472 |
+
"# 假设这是您的 token_len_list\n",
|
473 |
+
"\n",
|
474 |
+
"# 设置画布大小\n",
|
475 |
+
"plt.figure(figsize=(10, 6))\n",
|
476 |
+
"\n",
|
477 |
+
"# 使用 seaborn 生成直方图\n",
|
478 |
+
"sns.histplot(token_len_list, bins=30, kde=False, color=\"skyblue\", edgecolor=\"black\")\n",
|
479 |
+
"\n",
|
480 |
+
"# 添加标题和标签\n",
|
481 |
+
"plt.title(\"Distribution of Token Lengths\")\n",
|
482 |
+
"plt.xlabel(\"Token Length\")\n",
|
483 |
+
"plt.ylabel(\"Frequency\")\n",
|
484 |
+
"\n",
|
485 |
+
"# 显示平均值线\n",
|
486 |
+
"mean_value = np.mean(token_len_list)\n",
|
487 |
+
"plt.axvline(mean_value, color='red', linestyle='dashed', linewidth=2)\n",
|
488 |
+
"plt.text(mean_value + 2, plt.ylim()[1]*0.9, f'Mean: {mean_value:.2f}', color='red')\n",
|
489 |
+
"\n",
|
490 |
+
"# 显示中位数线\n",
|
491 |
+
"median_value = np.median(token_len_list)\n",
|
492 |
+
"plt.axvline(median_value, color='green', linestyle='dashed', linewidth=2)\n",
|
493 |
+
"plt.text(median_value - 10, plt.ylim()[1]*0.8, f'Median: {median_value:.2f}', color='green')\n",
|
494 |
+
"\n",
|
495 |
+
"# 显示图形\n",
|
496 |
+
"plt.show()"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"cell_type": "code",
|
501 |
+
"execution_count": 15,
|
502 |
+
"id": "9a65c8bc-6bf0-4605-8c38-409bbb14f2c7",
|
503 |
+
"metadata": {},
|
504 |
+
"outputs": [
|
505 |
+
{
|
506 |
+
"data": {
|
507 |
+
"application/vnd.jupyter.widget-view+json": {
|
508 |
+
"model_id": "a4e97d92506f419581c3711f26d7f683",
|
509 |
+
"version_major": 2,
|
510 |
+
"version_minor": 0
|
511 |
+
},
|
512 |
+
"text/plain": [
|
513 |
+
"Map: 0%| | 0/53275 [00:00<?, ? examples/s]"
|
514 |
+
]
|
515 |
+
},
|
516 |
+
"metadata": {},
|
517 |
+
"output_type": "display_data"
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"data": {
|
521 |
+
"application/vnd.jupyter.widget-view+json": {
|
522 |
+
"model_id": "9004c03cb9b24411b6bd9d33662402fb",
|
523 |
+
"version_major": 2,
|
524 |
+
"version_minor": 0
|
525 |
+
},
|
526 |
+
"text/plain": [
|
527 |
+
"Map: 0%| | 0/5920 [00:00<?, ? examples/s]"
|
528 |
+
]
|
529 |
+
},
|
530 |
+
"metadata": {},
|
531 |
+
"output_type": "display_data"
|
532 |
+
}
|
533 |
+
],
|
534 |
+
"source": [
|
535 |
+
"# 2. tokenize\n",
|
536 |
+
"def tokenize_function(examples):\n",
|
537 |
+
" examples['label'] = [int(item) for item in examples['label']]\n",
|
538 |
+
" return tokenizer(examples['sequence'], truncation=True, padding='max_length', max_length=128)\n",
|
539 |
+
"\n",
|
540 |
+
"# 3. 对数据集应用分词函数\n",
|
541 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
|
542 |
+
"\n",
|
543 |
+
"# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
|
544 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
545 |
+
]
|
546 |
+
},
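{
"cell_type": "code",
"execution_count": null,
"id": "added-example-tokenized-sample",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: each mapped example already carries 128 input_ids\n",
"# (padding='max_length'), so DataCollatorWithPadding only needs to batch them;\n",
"# the attention_mask sum shows how many positions are real tokens.\n",
"sample = tokenized_datasets[\"train\"][0]\n",
"print(len(sample[\"input_ids\"]), sum(sample[\"attention_mask\"]))"
]
},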
|
547 |
+
{
|
548 |
+
"cell_type": "code",
|
549 |
+
"execution_count": 22,
|
550 |
+
"id": "4b0faa94-d0c4-4ce8-9976-dcefcb766f0b",
|
551 |
+
"metadata": {},
|
552 |
+
"outputs": [
|
553 |
+
{
|
554 |
+
"name": "stderr",
|
555 |
+
"output_type": "stream",
|
556 |
+
"text": [
|
557 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
558 |
+
" warnings.warn(\n",
|
559 |
+
"/tmp/ipykernel_2549/341301010.py:29: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
|
560 |
+
" trainer = Trainer(\n"
|
561 |
+
]
|
562 |
+
}
|
563 |
+
],
|
564 |
+
"source": [
|
565 |
+
"from transformers import TrainingArguments, Trainer\n",
|
566 |
+
"import numpy as np\n",
|
567 |
+
"import torch.nn as nn\n",
|
568 |
+
"\n",
|
569 |
+
"\n",
|
570 |
+
"\n",
|
571 |
+
"def compute_metrics(eval_pred):\n",
|
572 |
+
" predictions, labels = eval_pred\n",
|
573 |
+
" predictions = np.argmax(predictions, axis=1)\n",
|
574 |
+
" return {'accuracy': (predictions==labels).sum() / len(labels)}\n",
|
575 |
+
"\n",
|
576 |
+
"# change training hyperparameters to archive better quality\n",
|
577 |
+
"training_args = TrainingArguments(\n",
|
578 |
+
" output_dir=\"ds_job_category_v0\",\n",
|
579 |
+
" learning_rate=1e-5,\n",
|
580 |
+
" lr_scheduler_type=\"constant_with_warmup\",\n",
|
581 |
+
" warmup_ratio=0.1,\n",
|
582 |
+
" optim='adamw_torch',\n",
|
583 |
+
" weight_decay=0.0,\n",
|
584 |
+
" per_device_train_batch_size=20,\n",
|
585 |
+
" per_device_eval_batch_size=20,\n",
|
586 |
+
" num_train_epochs=10,\n",
|
587 |
+
" evaluation_strategy=\"epoch\",\n",
|
588 |
+
" save_strategy=\"epoch\",\n",
|
589 |
+
" logging_strategy=\"epoch\",\n",
|
590 |
+
" load_best_model_at_end=True\n",
|
591 |
+
")\n",
|
592 |
+
"\n",
|
593 |
+
"trainer = Trainer(\n",
|
594 |
+
" model=model,\n",
|
595 |
+
" args=training_args,\n",
|
596 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
597 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
598 |
+
" tokenizer=tokenizer,\n",
|
599 |
+
" data_collator=data_collator,\n",
|
600 |
+
" compute_metrics=compute_metrics,\n",
|
601 |
+
")"
|
602 |
+
]
|
603 |
+
},
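{
"cell_type": "code",
"execution_count": null,
"id": "added-example-step-count",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: the 26640 optimizer steps reported by the Trainer below\n",
"# follow from the data and batch size, assuming a single GPU:\n",
"# ceil(53275 / 20) = 2664 steps per epoch, times 10 epochs.\n",
"import math\n",
"\n",
"steps_per_epoch = math.ceil(len(tokenized_datasets[\"train\"]) / training_args.per_device_train_batch_size)\n",
"print(steps_per_epoch * int(training_args.num_train_epochs))"
]
},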
|
604 |
+
{
|
605 |
+
"cell_type": "code",
|
606 |
+
"execution_count": 17,
|
607 |
+
"id": "9b067740-9c0f-4df8-a5af-b68ec9d1f3e0",
|
608 |
+
"metadata": {},
|
609 |
+
"outputs": [
|
610 |
+
{
|
611 |
+
"data": {
|
612 |
+
"text/html": [
|
613 |
+
"\n",
|
614 |
+
" <div>\n",
|
615 |
+
" \n",
|
616 |
+
" <progress value='26640' max='26640' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
617 |
+
" [26640/26640 1:00:13, Epoch 10/10]\n",
|
618 |
+
" </div>\n",
|
619 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
620 |
+
" <thead>\n",
|
621 |
+
" <tr style=\"text-align: left;\">\n",
|
622 |
+
" <th>Epoch</th>\n",
|
623 |
+
" <th>Training Loss</th>\n",
|
624 |
+
" <th>Validation Loss</th>\n",
|
625 |
+
" <th>Accuracy</th>\n",
|
626 |
+
" </tr>\n",
|
627 |
+
" </thead>\n",
|
628 |
+
" <tbody>\n",
|
629 |
+
" <tr>\n",
|
630 |
+
" <td>1</td>\n",
|
631 |
+
" <td>0.324900</td>\n",
|
632 |
+
" <td>0.237557</td>\n",
|
633 |
+
" <td>0.916216</td>\n",
|
634 |
+
" </tr>\n",
|
635 |
+
" <tr>\n",
|
636 |
+
" <td>2</td>\n",
|
637 |
+
" <td>0.193100</td>\n",
|
638 |
+
" <td>0.212998</td>\n",
|
639 |
+
" <td>0.925338</td>\n",
|
640 |
+
" </tr>\n",
|
641 |
+
" <tr>\n",
|
642 |
+
" <td>3</td>\n",
|
643 |
+
" <td>0.126900</td>\n",
|
644 |
+
" <td>0.278650</td>\n",
|
645 |
+
" <td>0.923480</td>\n",
|
646 |
+
" </tr>\n",
|
647 |
+
" <tr>\n",
|
648 |
+
" <td>4</td>\n",
|
649 |
+
" <td>0.076900</td>\n",
|
650 |
+
" <td>0.362979</td>\n",
|
651 |
+
" <td>0.922804</td>\n",
|
652 |
+
" </tr>\n",
|
653 |
+
" <tr>\n",
|
654 |
+
" <td>5</td>\n",
|
655 |
+
" <td>0.047400</td>\n",
|
656 |
+
" <td>0.518552</td>\n",
|
657 |
+
" <td>0.915372</td>\n",
|
658 |
+
" </tr>\n",
|
659 |
+
" <tr>\n",
|
660 |
+
" <td>6</td>\n",
|
661 |
+
" <td>0.032000</td>\n",
|
662 |
+
" <td>0.698843</td>\n",
|
663 |
+
" <td>0.918412</td>\n",
|
664 |
+
" </tr>\n",
|
665 |
+
" <tr>\n",
|
666 |
+
" <td>7</td>\n",
|
667 |
+
" <td>0.029000</td>\n",
|
668 |
+
" <td>0.760331</td>\n",
|
669 |
+
" <td>0.915709</td>\n",
|
670 |
+
" </tr>\n",
|
671 |
+
" <tr>\n",
|
672 |
+
" <td>8</td>\n",
|
673 |
+
" <td>0.025900</td>\n",
|
674 |
+
" <td>0.769762</td>\n",
|
675 |
+
" <td>0.921959</td>\n",
|
676 |
+
" </tr>\n",
|
677 |
+
" <tr>\n",
|
678 |
+
" <td>9</td>\n",
|
679 |
+
" <td>0.021800</td>\n",
|
680 |
+
" <td>0.740165</td>\n",
|
681 |
+
" <td>0.923142</td>\n",
|
682 |
+
" </tr>\n",
|
683 |
+
" <tr>\n",
|
684 |
+
" <td>10</td>\n",
|
685 |
+
" <td>0.021300</td>\n",
|
686 |
+
" <td>0.738664</td>\n",
|
687 |
+
" <td>0.922973</td>\n",
|
688 |
+
" </tr>\n",
|
689 |
+
" </tbody>\n",
|
690 |
+
"</table><p>"
|
691 |
+
],
|
692 |
+
"text/plain": [
|
693 |
+
"<IPython.core.display.HTML object>"
|
694 |
+
]
|
695 |
+
},
|
696 |
+
"metadata": {},
|
697 |
+
"output_type": "display_data"
|
698 |
+
},
|
699 |
+
{
|
700 |
+
"data": {
|
701 |
+
"text/plain": [
|
702 |
+
"TrainOutput(global_step=26640, training_loss=0.08990609108864724, metrics={'train_runtime': 3619.5996, 'train_samples_per_second': 147.185, 'train_steps_per_second': 7.36, 'total_flos': 3.4801460969472e+16, 'train_loss': 0.08990609108864724, 'epoch': 10.0})"
|
703 |
+
]
|
704 |
+
},
|
705 |
+
"execution_count": 17,
|
706 |
+
"metadata": {},
|
707 |
+
"output_type": "execute_result"
|
708 |
+
}
|
709 |
+
],
|
710 |
+
"source": [
|
711 |
+
"trainer.train()"
|
712 |
+
]
|
713 |
+
},
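{
"cell_type": "code",
"execution_count": null,
"id": "added-example-best-epoch",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: with load_best_model_at_end=True the Trainer restores the\n",
"# checkpoint with the lowest validation loss (epoch 2 in the run above); this reads that\n",
"# epoch back from the recorded logs.\n",
"eval_logs = [log for log in trainer.state.log_history if \"eval_loss\" in log]\n",
"best = min(eval_logs, key=lambda log: log[\"eval_loss\"])\n",
"print(best[\"epoch\"], best[\"eval_loss\"], best[\"eval_accuracy\"])"
]
},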
|
714 |
+
{
|
715 |
+
"cell_type": "code",
|
716 |
+
"execution_count": 20,
|
717 |
+
"id": "aa26e020-2dfd-4e0e-b330-250ee3e44a44",
|
718 |
+
"metadata": {},
|
719 |
+
"outputs": [
|
720 |
+
{
|
721 |
+
"data": {
|
722 |
+
"text/html": [],
|
723 |
+
"text/plain": [
|
724 |
+
"<IPython.core.display.HTML object>"
|
725 |
+
]
|
726 |
+
},
|
727 |
+
"metadata": {},
|
728 |
+
"output_type": "display_data"
|
729 |
+
},
|
730 |
+
{
|
731 |
+
"data": {
|
732 |
+
"text/plain": [
|
733 |
+
"{'accuracy': 0.9253378378378379, 'f1': 0.927062706270627}"
|
734 |
+
]
|
735 |
+
},
|
736 |
+
"execution_count": 20,
|
737 |
+
"metadata": {},
|
738 |
+
"output_type": "execute_result"
|
739 |
+
}
|
740 |
+
],
|
741 |
+
"source": [
|
742 |
+
"#模型测试\n",
|
743 |
+
"import evaluate\n",
|
744 |
+
"predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
|
745 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
746 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
747 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
748 |
+
"ret"
|
749 |
+
]
|
750 |
+
},
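{
"cell_type": "code",
"execution_count": null,
"id": "added-example-sklearn-metrics",
"metadata": {},
"outputs": [],
"source": [
"# Minimal illustrative sketch: the GLUE/MRPC metric above is simply a convenient\n",
"# accuracy + F1 bundle for a binary task; the same numbers can be reproduced directly\n",
"# with scikit-learn (assumed to be installed).\n",
"from sklearn.metrics import accuracy_score, f1_score\n",
"\n",
"print(accuracy_score(predictions.label_ids, preds), f1_score(predictions.label_ids, preds))"
]
},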
|
751 |
+
{
|
752 |
+
"cell_type": "code",
|
753 |
+
"execution_count": 21,
|
754 |
+
"id": "5e6d99ad-66a0-4b85-9380-ae2b7ee88056",
|
755 |
+
"metadata": {},
|
756 |
+
"outputs": [
|
757 |
+
{
|
758 |
+
"data": {
|
759 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAHHCAYAAACcHAM1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABRWklEQVR4nO3deVgV1f8H8PeAclkv4AKIEoKCguJuRrgmgYpbWGaSool9LdDc0MwNtyzcUnMpS1HT1Eot0VQUd3FNlFxIEEUTsERAUPb5/eGPySt45crFO17er555Hu7MmTOfuQ/px885Z0YQRVEEERERkYwZ6DoAIiIiomdhwkJERESyx4SFiIiIZI8JCxEREckeExYiIiKSPSYsREREJHtMWIiIiEj2mLAQERGR7DFhISIiItljwkL0Erp69Sp8fHxgaWkJQRCwfft2rfZ//fp1CIKAiIgIrfb7MuvcuTM6d+6s6zCIqiwmLETPKTExEf/73//g7OwMY2NjKJVKeHl5YfHixXj48GGlXjswMBBxcXGYM2cO1q9fjzZt2lTq9V6kIUOGQBAEKJXKMr/Hq1evQhAECIKA+fPna9z/7du3ERYWhtjYWC1ES0QvSjVdB0D0Mtq5cyfeeecdKBQKDB48GE2bNkV+fj6OHj2K0NBQXLx4Ed9++22lXPvhw4eIiYnB5MmTERISUinXcHR0xMOHD1G9evVK6f9ZqlWrhgcPHmDHjh3o37+/yrENGzbA2NgYubm5z9X37du3MWPGDNSvXx8tWrQo93l79+59rusRkXYwYSHSUFJSEgYMGABHR0dER0ejTp060rHg4GAkJCRg586dlXb9f/75BwBgZWVVadcQBAHGxsaV1v+zKBQKeHl54ccffyyVsGzcuBF+fn745ZdfXkgsDx48gKmpKYyMjF7I9YiobBwSItJQeHg4srOz8f3336skKyUaNmyITz75RPpcWFiIWbNmoUGDBlAoFKhfvz4+++wz5OXlqZxXv3599OzZE0ePHsWrr74KY2NjODs7Y926dVKbsLAwODo6AgBCQ0MhCALq168P4NFQSsnPjwsLC4MgCCr7oqKi0L59e1hZWcHc3ByNGjXCZ599Jh1/2hyW6OhodOjQAWZmZrCyskKfPn1w+fLlMq+XkJCAIUOGwMrKCpaWlhg6dCgePHjw9C/2CQMHDsTvv/+OjIwMad/p06dx9epVDBw4sFT79PR0jB8/Hh4eHjA3N4dSqUT37t1x/vx5qc3BgwfRtm1bAMDQoUOloaWS++zcuTOaNm2Ks2fPomPHjjA1NZW+lyfnsAQGBsLY2LjU/fv6+sLa2hq3b98u970S0bMxYSHS0I4dO+Ds7IzXX3+9XO2DgoIwbdo0tGrVCosWLUKnTp0wd+5cDBgwoFTbhIQEvP3223jzzTexYMECWFtbY8iQIbh48SIAwN/fH4sWLQIAvPfee1i/fj2++uorjeK/ePEievbsiby8PMycORMLFixA7969cezYMbXn7du3D76+vrhz5w7CwsIwduxYHD9+HF5eXrh+/Xqp9v3798f9+/cxd+5c9O/fHxEREZgxY0a54/T394cgCNi6dau0b+PGjWjcuDFatWpVqv21a9ewfft29OzZEwsXLkRoaCji4uLQqVMnKXlwc3PDzJkzAQAffvgh1q9fj/Xr16Njx45SP3fv3kX37t3RokULfPXVV+jSpUuZ8S1evBi1a9dGYGAgioqKAADffPMN9u7di6VLl8Le3r7c90pE5SASUbllZmaKAMQ+ffqUq31sbKwIQAwKClLZP378eBGAGB0dLe1zdHQUAYiHDx+W9t25c0dUKBTiuHHjpH1JSUkiAHHevHkqfQYGBoqOjo6lYpg+fbr4+P/qixYtEgGI//zzz1PjLrnGmjVrpH0tWrQQbWxsxLt370r7zp8/LxoYGIiDBw8udb0PPvhApc+33npLrFmz5lOv+fh9mJmZiaIoim+//bbYtWtXURRFsaioSLSzsxNnzJhR5neQm5srFhUVlboPhUIhzpw5U9p3+vTpUvdWolOnTiIAceXKlWUe69Spk8q+PXv2iADE2bNni9euXRPNzc3Fvn37PvMeiUhzrLAQaSArKwsAYGFhUa72u3btAgCMHTtWZf+4ceMAoNRcF3d3d3To0EH6XLt2bTRq1AjXrl177pifVDL35ddff0VxcXG5zklJSUFsbCyGDBmCGjVqSPubNWuGN998U7rPx40YMULlc4cOHXD37l3pOyyPgQMH4uDBg0hNTUV0dDRSU1PLHA4CHs17MTB49EdaUVER7t69Kw13/fHHH+W+pkKhwNChQ8vV1sfHB//73/8wc+ZM+Pv7w9jYGN988025r0VE5ceEhUgDSqUSAHD//v1ytb9x4wYMDAzQsGFDlf12dnawsrLCjRs3VPa/8sorpfqwtrbGvXv3njPi0t599114eXkhKCgItra2GDBgALZs2aI2eSmJs1GjRqWOubm54d9//0VOTo7K/ifvxdraGgA0upcePXrAwsICmzdvxoYNG9C2bdtS32WJ4uJiLFq0CC4uLlAoFKhVqxZq166NCxcuIDMzs9zXrFu3rkYTbOfPn48aNWogNjYWS5YsgY2NTbnPJaLyY8JCpAGlUgl7e3v8+eefGp335KTXpzE0NCxzvyiKz32NkvkVJUxMTHD48GHs27cPgwYNwoULF/Duu+/izTffLNW2IipyLyUUCgX8/f2xdu1abNu27anVFQD4/PPPMXbsWHTs2BE//PAD9uzZg6ioKDRp0qTclSTg0fejiXPnzuHOnTsAgLi4OI3OJaLyY8JCpKGePXsiMTERMTExz2zr6OiI4uJiXL16VWV/WloaMjIypBU/2mBtba2yoqbEk1UcADAwMEDXrl2xcOFCXLp0CXPmzEF0dDQOHDhQZt8lccbHx5c6duXKFdSqVQtmZmYVu4GnGDhwIM6dO4f79++XOVG5xM8//4wuXbrg+++/x4ABA+Dj4wNvb+9S30l5k8fyyMnJwdChQ+Hu7o4PP/wQ4eHhOH36tNb6J6L/MGEh0tCECRNgZmaGoKAgpKWllTqemJiIxYsXA3g0pAGg1EqehQsXAgD8/Py0FleDBg2QmZmJCxcuSPtSUlKwbds2lXbp6emlzi15gNqTS61L1KlTBy1atMDatWtVEoA///wTe/fule6zMnTp0gWzZs3C119/DTs7u6e2MzQ0LFW9+emnn/D333+r7CtJrMpK7jQ1ceJEJCcnY+3atVi4cCHq16+PwMDAp36PRPT8+OA4Ig01aNAAGzduxLvvvgs3NzeVJ90eP34cP/30E4YMGQIAaN68OQIDA/Htt98iIyMDnTp1wqlTp7B27Vr07dv3qUtmn8eAAQMwceJEvPXWWxg1ahQePHiAFStWwNXVVWXS6cyZM3H48GH4+fnB0dERd+7cwfLly1GvXj20b9/+qf3PmzcP3bt3h6enJ4YNG4aHDx9i6dKlsLS0RFhYmNbu40kGBgaYMmXKM9v17NkTM2fOxNChQ/H6668jLi4OGzZsgLOzs0q7B
g0awMrKCitXroSFhQXMzMzQrl07ODk5aRRXdHQ0li9fjunTp0vLrNesWYPOnTtj6tSpCA8P16g/InoGHa9SInpp/fXXX+Lw4cPF+vXri0ZGRqKFhYXo5eUlLl26VMzNzZXaFRQUiDNmzBCdnJzE6tWriw4ODuKkSZNU2ojio2XNfn5+pa7z5HLapy1rFkVR3Lt3r9i0aVPRyMhIbNSokfjDDz+UWta8f/9+sU+fPqK9vb1oZGQk2tvbi++99574119/lbrGk0t/9+3bJ3p5eYkmJiaiUqkUe/XqJV66dEmlTcn1nlw2vWbNGhGAmJSU9NTvVBRVlzU/zdOWNY8bN06sU6eOaGJiInp5eYkxMTFlLkf+9ddfRXd3d7FatWoq99mpUyexSZMmZV7z8X6ysrJER0dHsVWrVmJBQYFKuzFjxogGBgZiTEyM2nsgIs0IoqjBDDgiIiIiHeAcFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHB8dVsuLiYty+fRsWFhZafSQ4ERG9GKIo4v79+7C3t5feCK5tubm5yM/P10pfRkZGMDY21kpfcsKEpZLdvn0bDg4Oug6DiIgq6ObNm6hXr57W+83NzYWJRU2g8IFW+rOzs0NSUpLeJS1MWCqZhYUFAMCo7WgI1RQ6joaoctzYMUnXIRBVmvv3s+Di9Ir057m25efnA4UPoHAPBAyNKtZZUT5SL61Ffn4+ExbSTMkwkFBNwYSF9JZSqdR1CESVrtKH9asZQ6hgwiIK+js1lQkLERGRHAgAKpoU6fFUSSYsREREciAYPNoq2oee0t87IyIiIr3BCgsREZEcCIIWhoT0d0yICQsREZEccEhILf29MyIiItIbrLAQERHJAYeE1GLCQkREJAtaGBLS44ET/b0zIiIi0hussBAREckBh4TUYsJCREQkB1wlpJb+3hkRERHpDVZYiIiI5IBDQmoxYSEiIpIDDgmpxYSFiIhIDlhhUUt/UzEiIiLSG6ywEBERyQGHhNRiwkJERCQHgqCFhIVDQkREREQ6wwoLERGRHBgIj7aK9qGnmLAQERHJAeewqKW/d0ZERER6gxUWIiIiOeBzWNRiwkJERCQHHBJSS3/vjIiIiPQGKyxERERywCEhtZiwEBERyQGHhNRiwkJERCQHrLCopb+pGBEREekNVliIiIjkgENCajFhISIikgMOCamlv6kYERER6Q1WWIiIiGRBC0NCelyHYMJCREQkBxwSUkt/UzEiIiLSG6ywEBERyYEgaGGVkP5WWJiwEBERyQGXNaulv3dGREREas2dOxdt27aFhYUFbGxs0LdvX8THx6u06dy5MwRBUNlGjBih0iY5ORl+fn4wNTWFjY0NQkNDUVhYqNLm4MGDaNWqFRQKBRo2bIiIiAiNYmXCQkREJAclk24rumng0KFDCA4OxokTJxAVFYWCggL4+PggJydHpd3w4cORkpIibeHh4dKxoqIi+Pn5IT8/H8ePH8fatWsRERGBadOmSW2SkpLg5+eHLl26IDY2FqNHj0ZQUBD27NlT7lg5JERERCQHOhgS2r17t8rniIgI2NjY4OzZs+jYsaO039TUFHZ2dmX2sXfvXly6dAn79u2Dra0tWrRogVmzZmHixIkICwuDkZERVq5cCScnJyxYsAAA4ObmhqNHj2LRokXw9fUtV6yssBAREcmBFissWVlZKlteXl65QsjMzAQA1KhRQ2X/hg0bUKtWLTRt2hSTJk3CgwcPpGMxMTHw8PCAra2ttM/X1xdZWVm4ePGi1Mbb21ulT19fX8TExJT762GFhYiISM84ODiofJ4+fTrCwsLUnlNcXIzRo0fDy8sLTZs2lfYPHDgQjo6OsLe3x4ULFzBx4kTEx8dj69atAIDU1FSVZAWA9Dk1NVVtm6ysLDx8+BAmJibPvCcmLERERHKgxSGhmzdvQqlUSrsVCsUzTw0ODsaff/6Jo0ePquz/8MMPpZ89PDxQp04ddO3aFYmJiWjQoEHF4tUAh4SIiIjkQItDQkqlUmV7VsISEhKCyMhIHDhwAPXq1VPbtl27dgCAhIQEAICdnR3S0tJU2pR8Lpn38rQ2SqWyXNUVgAkLERFRlSWKIkJCQrBt2zZER0fDycnpmefExsYCAOrUqQMA8PT0RFxcHO7cuSO1iYqKglKphLu7u9Rm//79Kv1ERUXB09Oz3LEyYSEiIpKBJ5918rybJoKDg/HDDz9g48aNsLCwQGpqKlJTU/Hw4UMAQGJiImbNmoWzZ8/i+vXr+O233zB48GB07NgRzZo1AwD4+PjA3d0dgwYNwvnz57Fnzx5MmTIFwcHBUmVnxIgRuHbtGiZMmIArV65g+fLl2LJlC8aMGVPuWJmwEBERyYAuEpYVK1YgMzMTnTt3Rp06daRt8+bNAAAjIyPs27cPPj4+aNy4McaNG4d+/fphx44dUh+GhoaIjIyEoaEhPD098f7772Pw4MGYOXOm1MbJyQk7d+5EVFQUmjdvjgULFuC7774r95JmgJNuiYiIqixRFNUed3BwwKFDh57Zj6OjI3bt2qW2TefOnXHu3DmN4nscExYiIiI5EP5/q2gfeooJCxERkQw8z5BOGZ1oJxgZ4hwWIiIikj1WWIiIiGSAFRb1mLAQERHJABMW9ZiwEBERyQATFvU4h4WIiIhkjxUWIiIiOeCyZrWYsBAREckAh4TU45AQERERyR4rLERERDIgCNBChUU7scgRExYiIiIZEKCFISE9zlg4JERERESyxwoLERGRDHDSrXpMWIiIiOSAy5rV4pAQERERyR4rLERERHKghSEhkUNCREREVJm0MYel4quM5IsJCxERkQwwYVGPc1iIiIhI9lhhISIikgOuElKLCQsREZEMcEhIPQ4JERERkeyxwkJERCQDrLCox4SFiIhIBpiwqMchISIiIpI9VliIiIhkgBUW9ZiwEBERyQGXNavFISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBJizqMWEhIiKSASYs6nEOCxEREckeKyxERERywFVCajFhISIikgEOCanHISEiIiKSvZeiwiIIArZt24a+ffvqOhR6Aca81wE9O7jB5ZVayM0rwKmLNxG2KgoJN++qtGvrXg9ThnVF68b1UFRcjD8TU9Fvwnrk5hdKbXzauSB0cGc0cbZFXn4hjp2/jvenbZKOd2zphMkfvAE3J1s8yM3Hpj3nMev7/SgqLn5h90u0KGIvIg+cx9UbaTBWVMerHk6YPrIPXBxtAQD3MnPwxbe7cODkFdxKu4eaVubw69QMn43wg9LcBACwMfIEQmZuKLP/+N2fo3YNixd2P/R8WGFRT+cJS2pqKubMmYOdO3fi77//ho2NDVq0aIHRo0eja9euug4Poihi+vTpWLVqFTIyMuDl5YUVK1bAxcVF16HprdebO+K7X0/hXPzfqGZggKlB3tgaPhivDf0aD3ILADxKVn7+YhAW/XgEE5fuQmFRMZo626FYFKV+enVww+JxvTHr+/04fC4J1QwN4FbfRjre1NkWW+a+jwUbDmPE
3G2oU8sCC8f0goGhgGkr977w+6aq69gfCRj2Tge0dHNEUVERZq3YgX4jlyFm82SYmSiQ8m8mUv7NxMxP+qKRkx1upqRj3BebkfJvJtZ+MQwA8JZ3K3R9zV2l3+CZPyAvv4DJyktCgBYSFj2exKLThOX69evw8vKClZUV5s2bBw8PDxQUFGDPnj0IDg7GlStXdBkeACA8PBxLlizB2rVr4eTkhKlTp8LX1xeXLl2CsbGxrsPTS+98+oPK54+/3IaEbRPRwtUexy/cAADM+bgbvtl2El/9eFRq93gFxtDAAHNDumPaN1H44fc/pP3xN/6Rfn6rS1NcvJaGeesPAQCSbqcj7Nu9WD2tP8LXHkT2w/xKuT+iJ/285GOVz8umvQ9X389w/vJNvN6qIdwb2GPdl0HScad6tTH5o14YMX0dCguLUK2aIUyMjWBibCS1+ffefRw58xeWTBn4wu6DqDLpdA7Lxx9/DEEQcOrUKfTr1w+urq5o0qQJxo4dixMnTjz1vIkTJ8LV1RWmpqZwdnbG1KlTUVBQIB0/f/48unTpAgsLCyiVSrRu3RpnzpwBANy4cQO9evWCtbU1zMzM0KRJE+zatavM64iiiK+++gpTpkxBnz590KxZM6xbtw63b9/G9u3btfpd0NMpzR4lhveyHgIAalmZoa27A/7JyMGepcMQ/3MoIhcNxWtNX5HOae5aB3VrW6JYLMahb0bg8k/j8dPc91UqLEbVqyHvseEjAHiYVwgTRXU0d7V/AXdGVLas7FwAgJWlqZo2D2FhZoxq1QzLPL5p1ymYGBuh9xstKiNEqgQlQ0IV3fSVzhKW9PR07N69G8HBwTAzMyt13MrK6qnnWlhYICIiApcuXcLixYuxatUqLFq0SDoeEBCAevXq4fTp0zh79iw+/fRTVK9eHQAQHByMvLw8HD58GHFxcfjyyy9hbm5e5nWSkpKQmpoKb29vaZ+lpSXatWuHmJiY57xz0oQgCJgb3A0n4m7g8vU7AID6dawBAJ8O7oy1O8/i7U/X4/zVFGyfHwjnujVU2wR2wfwfDmHAZxuQkf0QOxYNgZXFozH/6DMJeLWJA/q90RQGBgLq1LLAhMGdAAB2Ncv+nSCqbMXFxfhs4S9o19wZ7g3KTpzvZmRj/urdCOz7+lP7+eG3E3jbt7VK1YVkTtDSpqd0NiSUkJAAURTRuHFjjc+dMmWK9HP9+vUxfvx4bNq0CRMmTAAAJCcnIzQ0VOr78fkmycnJ6NevHzw8PAAAzs7OT71OamoqAMDW1lZlv62trXTsSXl5ecjLy5M+Z2VlaXJr9IT5n/jBzckG3UetlvYZGDz6PzIi8gw27o4FAMQl7Eanlk54v3srzPxuHwz+/18ZC344jB1HLgMAgsO34+LmcejbqQkiIs/gwJlETPtmLxaO7oWVk/yRl1+E+T8cwuvN6qO4WASRLoSG/4TL11Kw69vRZR7Pyn6Id8esRCMnO0z8sEeZbU5dSMJfSalYGTaoEiMlerF0lrCI4vP/hbB582YsWbIEiYmJyM7ORmFhIZRKpXR87NixCAoKwvr16+Ht7Y133nkHDRo0AACMGjUKH330Efbu3Qtvb2/069cPzZo1q/D9lJg7dy5mzJihtf6qsvBRPeD7mit6jF6N2//+l/il3r0PQHU+CgDEJ/+LejaWj9qkl26TX1CE6yn3pDYAsPznGCz/OQZ2NS2Qcf8hXrGzwvThb+J6yr1Kuy+ip5kwbwv2HP0TO7/5BHVtrUsdv5+Ti3c+WQELUwXWhw9H9acMB63/9Tg8XOuhhdsrZR4neeIqIfV0NiTk4uICQRA0nlgbExODgIAA9OjRA5GRkTh37hwmT56M/Pz/JkiGhYXh4sWL8PPzQ3R0NNzd3bFt2zYAQFBQEK5du4ZBgwYhLi4Obdq0wdKlS8u8lp2dHQAgLS1NZX9aWpp07EmTJk1CZmamtN28eVOj+6NHwkf1gF97N/QeF4Hk1AyVY8mpGbj9bxYaOtRS2d+wXk3cTHvU9vxfKcjNL1BpU83QAK/YWkltHpd69z5y8wvR7w0P3ErLwPmrKdq+JaKnEkURE+Ztwc6DF/Dr8pFwrFurVJus7IfoN3IZjKobYsOC/8FYUb3MvrIf5OHX/efwfu/XKjts0jLOYVFPZwlLjRo14Ovri2XLliEnJ6fU8YyMjDLPO378OBwdHTF58mS0adMGLi4uuHHjRql2rq6uGDNmDPbu3Qt/f3+sWbNGOubg4IARI0Zg69atGDduHFatWlXmtZycnGBnZ4f9+/dL+7KysnDy5El4enqWeY5CoYBSqVTZSDPzP/FDf+9mGD77Z2Q/yIeNtTlsrM1hbPRfQXDp5mP431vt0LujO5zsa+CzoW/A5ZVaWP//K4LuP8jDmh1n8OmQzujSpgEaOtTEgtE9AQDbD12U+hn5rhfcnWzQuH5tjH+/E0a/1x4Tv/6dQ0L0QoWGb8GW38/g21mBMDc1Rtq/WUj7NwsPcx/9Qywr+yH6jVqOB7n5WDJlIO5n50ptiopUnxm0LeoPFBYVo3/3trq4FaoAQdDOpq90uqx52bJl8PLywquvvoqZM2eiWbNmKCwsRFRUFFasWIHLly+XOsfFxQXJycnYtGkT2rZti507d0rVEwB4+PAhQkND8fbbb8PJyQm3bt3C6dOn0a9fPwDA6NGj0b17d7i6uuLevXs4cOAA3NzcyoxPEASMHj0as2fPhouLi7Ss2d7eng+xq0TD+rwKANj51Qcq+z/+cht+3BMLAFj5ywkYG1XD5x93g5WFCS5eS4V/6Dpcv/3fUM60lXtRWFSMlZ/6w1hRDWcv/40+4yOQ+f8rMADA+9WGGBfQAUbVq+HPxFQETP0R+04lVP5NEj1m9S+Pluf3GrFEZf/X0wIwsOdruBB/C2f/vA4AaO0/U6VN7PYwvGJfU/r8w28x6Nm5OSwtnr7CiOhlJIgVmUyiBSkpKZgzZw4iIyORkpKC2rVro3Xr1hgzZgw6d+78KMgnnnQ7YcIErF69Gnl5efDz88Nrr72GsLAwZGRkID8/H4GBgTh27BjS0tJQq1Yt+Pv7Y968eTA2NsbIkSPx+++/49atW1AqlejWrRsWLVqEmjVrlhlfyYPjvv32W2RkZKB9+/ZYvnw5XF1dy3V/WVlZsLS0hMJzIoRqCm18ZUSyk74/TNchEFWarKws2NWyQmZmZqVUzUv+nnAe+TMMFKVXzWqiOC8H15a+XWmx6pLOExZ9x4SFqgImLKTPXljCMupnGFYwYSnKy8G1JfqZsPDlh0RERCR7On+XEBEREXFZ87MwYSEiIpIBbazy0eN8hUNCREREJH+ssBAREcmAgYEgvXrkeYkVPF/OmLAQERHJAIeE1OOQEBEREckeKyxEREQywFVC6jFhISIikgEOCanHhIWIiEgGWGFRj3NYiIiISPZYYSEiIpIBVljUY4WFiIhIBkrmsFR008TcuXPRtm1bWFhYwMbGBn379kV8fLxKm9zcXAQHB6NmzZowNzdHv379kJaWptImOTkZfn5+MDU1hY2NDUJDQ1FYWKjS5uDBg2j
VqhUUCgUaNmyIiIgIjWJlwkJERFRFHTp0CMHBwThx4gSioqJQUFAAHx8f5OTkSG3GjBmDHTt24KeffsKhQ4dw+/Zt+Pv7S8eLiorg5+eH/Px8HD9+HGvXrkVERASmTZsmtUlKSoKfnx+6dOmC2NhYjB49GkFBQdizZ0+5YxVEURS1c9tUlpLXhis8J0KoptB1OESVIn1/mK5DIKo0WVlZsKtlhczMTCiVykrp39LSEh6f/gZDY7MK9VWUm4O4L3o/d6z//PMPbGxscOjQIXTs2BGZmZmoXbs2Nm7ciLfffhsAcOXKFbi5uSEmJgavvfYafv/9d/Ts2RO3b9+Gra0tAGDlypWYOHEi/vnnHxgZGWHixInYuXMn/vzzT+laAwYMQEZGBnbv3l2u2FhhISIikgFtDgllZWWpbHl5eeWKITMzEwBQo0YNAMDZs2dRUFAAb29vqU3jxo3xyiuvICYmBgAQExMDDw8PKVkBAF9fX2RlZeHixYtSm8f7KGlT0kd5MGEhIiLSMw4ODrC0tJS2uXPnPvOc4uJijB49Gl5eXmjatCkAIDU1FUZGRrCyslJpa2tri9TUVKnN48lKyfGSY+raZGVl4eHDh+W6J64SIiIikgFtrhK6efOmypCQQvHsKQnBwcH4888/cfTo0QrFUFmYsBAREcmANp90q1QqNZrDEhISgsjISBw+fBj16tWT9tvZ2SE/Px8ZGRkqVZa0tDTY2dlJbU6dOqXSX8kqosfbPLmyKC0tDUqlEiYmJuWKkUNCREREVZQoiggJCcG2bdsQHR0NJycnleOtW7dG9erVsX//fmlffHw8kpOT4enpCQDw9PREXFwc7ty5I7WJioqCUqmEu7u71ObxPkralPRRHqywEBERyYAuHhwXHByMjRs34tdff4WFhYU058TS0hImJiawtLTEsGHDMHbsWNSoUQNKpRIjR46Ep6cnXnvtNQCAj48P3N3dMWjQIISHhyM1NRVTpkxBcHCwNBQ1YsQIfP3115gwYQI++OADREdHY8uWLdi5c2e5Y2XCQkREJAO6ePnhihUrAACdO3dW2b9mzRoMGTIEALBo0SIYGBigX79+yMvLg6+vL5YvXy61NTQ0RGRkJD766CN4enrCzMwMgYGBmDlzptTGyckJO3fuxJgxY7B48WLUq1cP3333HXx9fcsdKxMWIiIiGdBFhaU8j2IzNjbGsmXLsGzZsqe2cXR0xK5du9T207lzZ5w7d06j+B7HOSxEREQke6ywEBERyYEWhoSgv+8+ZMJCREQkB3xbs3ocEiIiIiLZY4WFiIhIBnSxSuhlwoSFiIhIBjgkpB6HhIiIiEj2WGEhIiKSAQ4JqceEhYiISAY4JKQeh4SIiIhI9lhhISIikgFWWNRjwkJERCQDnMOiHhMWIiIiGWCFRT3OYSEiIiLZY4WFiIhIBjgkpB4TFiIiIhngkJB6HBIiIiIi2WOFhYiISAYEaGFISCuRyBMTFiIiIhkwEAQYVDBjqej5csYhISIiIpI9VliIiIhkgKuE1GPCQkREJANcJaQeExYiIiIZMBAebRXtQ19xDgsRERHJHissREREciBoYUhHjyssTFiIiIhkgJNu1eOQEBEREckeKyxEREQyIPz/fxXtQ18xYSEiIpIBrhJSj0NCREREJHussBAREckAHxynXrkSlt9++63cHfbu3fu5gyEiIqqquEpIvXIlLH379i1XZ4IgoKioqCLxEBEREZVSroSluLi4suMgIiKq0gwEAQYVLJFU9Hw5q9AcltzcXBgbG2srFiIioiqLQ0LqabxKqKioCLNmzULdunVhbm6Oa9euAQCmTp2K77//XusBEhERVQUlk24ruukrjROWOXPmICIiAuHh4TAyMpL2N23aFN99951WgyMiIiICniNhWbduHb799lsEBATA0NBQ2t+8eXNcuXJFq8ERERFVFSVDQhXd9JXGc1j+/vtvNGzYsNT+4uJiFBQUaCUoIiKiqoaTbtXTuMLi7u6OI0eOlNr/888/o2XLlloJioiIiOhxGldYpk2bhsDAQPz9998oLi7G1q1bER8fj3Xr1iEyMrIyYiQiItJ7wv9vFe1DX2lcYenTpw927NiBffv2wczMDNOmTcPly5exY8cOvPnmm5URIxERkd7jKiH1nus5LB06dEBUVJS2YyEiIiIq03M/OO7MmTO4fPkygEfzWlq3bq21oIiIiKoaA+HRVtE+9JXGCcutW7fw3nvv4dixY7CysgIAZGRk4PXXX8emTZtQr149bcdIRESk9/i2ZvU0nsMSFBSEgoICXL58Genp6UhPT8fly5dRXFyMoKCgyoiRiIiIqjiNKyyHDh3C8ePH0ahRI2lfo0aNsHTpUnTo0EGrwREREVUlelwgqTCNExYHB4cyHxBXVFQEe3t7rQRFRERU1XBISD2Nh4TmzZuHkSNH4syZM9K+M2fO4JNPPsH8+fO1GhwREVFVUTLptqKbvipXhcXa2lola8vJyUG7du1Qrdqj0wsLC1GtWjV88MEH6Nu3b6UESkRERFVXuRKWr776qpLDICIiqto4JKReuRKWwMDAyo6DiIioSuOj+dV77gfHAUBubi7y8/NV9imVygoFRERERPQkjROWnJwcTJw4EVu2bMHdu3dLHS8qKtJKYERERFWJgSDAoIJDOhU9X840XiU0YcIEREdHY8WKFVAoFPjuu+8wY8YM2NvbY926dZURIxERkd4TBO1s+krjCsuOHTuwbt06dO7cGUOHDkWHDh3QsGFDODo6YsOGDQgICKiMOImIiKgK07jCkp6eDmdnZwCP5qukp6cDANq3b4/Dhw9rNzoiIqIqomSVUEU3faVxwuLs7IykpCQAQOPGjbFlyxYAjyovJS9DJCIiIs1wSEg9jROWoUOH4vz58wCATz/9FMuWLYOxsTHGjBmD0NBQrQdIREREpPEcljFjxkg/e3t748qVKzh79iwaNmyIZs2aaTU4IiKiqoKrhNTTuMLyJEdHR/j7+zNZISIiqgBdDAkdPnwYvXr1gr29PQRBwPbt21WODxkypNQcmW7duqm0SU9PR0BAAJRKJaysrDBs2DBkZ2ertLlw4QI6dOgAY2NjODg4IDw8XOPvp1wVliVLlpS7w1GjRmkcBBERUVWni0fz5+TkoHnz5vjggw/g7+9fZptu3bphzZo10meFQqFyPCAgACkpKYiKikJBQQGGDh2KDz/8EBs3bgQAZGVlwcfHB97e3li5ciXi4uLwwQcfwMrKCh9++GG5Yy1XwrJo0aJydSYIAhMWIiKil0T37t3RvXt3tW0UCgXs7OzKPHb58mXs3r0bp0+fRps2bQAAS5cuRY8ePTB//nzY29tjw4YNyM/Px+rVq2FkZIQmTZogNjYWCxcu1H7CUrIqiJ5fcuRnfG0B6S3rtiG6DoGo0ohF+c9upAUGqPg8jQrP8yjDwYMHYWNjA2tra7zxxhuYPXs2atasCQCIiYmBlZWVlKwAj+a3GhgY4OTJk3jrrbcQExODjh07wsjISGrj6+uLL7/8Evfu3YO1tXW54qjQu4SIiIhIO7
Q5JJSVlaWyX6FQlBrKKY9u3brB398fTk5OSExMxGeffYbu3bsjJiYGhoaGSE1NhY2Njco51apVQ40aNZCamgoASE1NhZOTk0obW1tb6RgTFiIioirKwcFB5fP06dMRFhamcT8DBgyQfvbw8ECzZs3QoEEDHDx4EF27dq1omBphwkJERCQDggAYVHBVckmB5ubNmyrTEJ6nulIWZ2dn1KpVCwkJCejatSvs7Oxw584dlTaFhYVIT0+X5r3Y2dkhLS1NpU3J56fNjSlLZQx3ERERkYYMBO1swKNX5zy+aSthuXXrFu7evYs6deoAADw9PZGRkYGzZ89KbaKjo1FcXIx27dpJbQ4fPoyCggKpTVRUFBo1alTu4SCACQsREVGVlZ2djdjYWMTGxgJ4tMgmNjYWycnJyM7ORmhoKE6cOIHr169j//796NOnDxo2bAhfX18AgJubG7p164bhw4fj1KlTOHbsGEJCQjBgwADY29sDAAYOHAgjIyMMGzYMFy9exObNm7F48WKMHTtWo1ifK2E5cuQI3n//fXh6euLvv/8GAKxfvx5Hjx59nu6IiIiqPF28/PDMmTNo2bIlWrZsCQAYO3YsWrZsiWnTpsHQ0BAXLlxA79694erqimHDhqF169Y4cuSISsVmw4YNaNy4Mbp27YoePXqgffv2+Pbbb6XjlpaW2Lt3L5KSktC6dWuMGzcO06ZN02hJM/Acc1h++eUXDBo0CAEBATh37hzy8vIAAJmZmfj888+xa9cuTbskIiKq8gy0MIdF0/M7d+4MURSfenzPnj3P7KNGjRrSQ+KeplmzZjhy5IhmwT1B4wrL7NmzsXLlSqxatQrVq1eX9nt5eeGPP/6oUDBEREREZdG4whIfH4+OHTuW2m9paYmMjAxtxERERFTlPM+7gMrqQ19pXGGxs7NDQkJCqf1Hjx6Fs7OzVoIiIiKqakre1lzRTV9pnLAMHz4cn3zyCU6ePAlBEHD79m1s2LAB48ePx0cffVQZMRIREek9Ay1t+krjIaFPP/0UxcXF6Nq1Kx48eICOHTtCoVBg/PjxGDlyZGXESERERFWcxgmLIAiYPHkyQkNDkZCQgOzsbLi7u8Pc3Lwy4iMiIqoSOIdFved+NL+RkRHc3d21GQsREVGVZYCKz0ExgP5mLBonLF26dFH7YJro6OgKBURERET0JI0TlhYtWqh8LigoQGxsLP78808EBgZqKy4iIqIqhUNC6mmcsCxatKjM/WFhYcjOzq5wQERERFWRLp50+zLR2gqo999/H6tXr9ZWd0RERESS5550+6SYmBgYGxtrqzsiIqIqRRBQ4Um3HBJ6jL+/v8pnURSRkpKCM2fOYOrUqVoLjIiIqCrhHBb1NE5YLC0tVT4bGBigUaNGmDlzJnx8fLQWGBEREVEJjRKWoqIiDB06FB4eHrC2tq6smIiIiKocTrpVT6NJt4aGhvDx8eFbmYmIiLRM0NJ/+krjVUJNmzbFtWvXKiMWIiKiKqukwlLRTV9pnLDMnj0b48ePR2RkJFJSUpCVlaWyEREREWlbueewzJw5E+PGjUOPHj0AAL1791Z5RL8oihAEAUVFRdqPkoiISM9xDot65U5YZsyYgREjRuDAgQOVGQ8REVGVJAiC2nf1lbcPfVXuhEUURQBAp06dKi0YIiIiorJotKxZnzM3IiIiXeKQkHoaJSyurq7PTFrS09MrFBAREVFVxCfdqqdRwjJjxoxST7olIiIiqmwaJSwDBgyAjY1NZcVCRERUZRkIQoVffljR8+Ws3AkL568QERFVHs5hUa/cD44rWSVERERE9KKVu8JSXFxcmXEQERFVbVqYdKvHrxLSbA4LERERVQ4DCDCoYMZR0fPljAkLERGRDHBZs3oav/yQiIiI6EVjhYWIiEgGuEpIPSYsREREMsDnsKjHISEiIiKSPVZYiIiIZICTbtVjwkJERCQDBtDCkJAeL2vmkBARERHJHissREREMsAhIfWYsBAREcmAASo+7KHPwyb6fG9ERESkJ1hhISIikgFBECBUcEynoufLGRMWIiIiGRBQ8Zct62+6woSFiIhIFvikW/U4h4WIiIhkjxUWIiIimdDf+kjFMWEhIiKSAT6HRT0OCREREZHsscJCREQkA1zWrB4TFiIiIhngk27V0+d7IyIiIj3BCgsREZEMcEhIPSYsREREMsAn3arHISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBrhJSjwkLERGRDLDCop4+J2NERESkJ1hhISIikgGuElKPCQsREZEM8OWH6nFIiIiIiGSPFRYiIiIZMIAAgwoO6lT0fDljhYWIiEgGSoaEKrpp4vDhw+jVqxfs7e0hCAK2b9+uclwURUybNg116tSBiYkJvL29cfXqVZU26enpCAgIgFKphJWVFYYNG4bs7GyVNhcuXECHDh1gbGwMBwcHhIeHa/z9MGEhIiKqonJyctC8eXMsW7aszOPh4eFYsmQJVq5ciZMnT8LMzAy+vr7Izc2V2gQEBODixYuIiopCZGQkDh8+jA8//FA6npWVBR8fHzg6OuLs2bOYN28ewsLC8O2332oUK4eEiIiIZED4//8q2ocmunfvju7du5d5TBRFfPXVV5gyZQr69OkDAFi3bh1sbW2xfft2DBgwAJcvX8bu3btx+vRptGnTBgCwdOlS9OjRA/Pnz4e9vT02bNiA/Px8rF69GkZGRmjSpAliY2OxcOFClcTmWVhhISIikgFtDgllZWWpbHl5eRrHk5SUhNTUVHh7e0v7LC0t0a5dO8TExAAAYmJiYGVlJSUrAODt7Q0DAwOcPHlSatOxY0cYGRlJbXx9fREfH4979+6VOx4mLERERHrGwcEBlpaW0jZ37lyN+0hNTQUA2Nraquy3tbWVjqWmpsLGxkbleLVq1VCjRg2VNmX18fg1yoNDQkRERDIgaGGVUMmQ0M2bN6FUKqX9CoWiQv3KASssREREMqDNISGlUqmyPU/CYmdnBwBIS0tT2Z+WliYds7Ozw507d1SOFxYWIj09XaVNWX08fo3yYMJCREQkA7pY1qyOk5MT7OzssH//fmlfVlYWTp48CU9PTwCAp6cnMjIycPbsWalNdHQ0iouL0a5dO6nN4cOHUVBQILWJiopCo0aNYG1tXe54mLAQERFVUdnZ2YiNjUVsbCyARxNtY2NjkZycDEEQMHr0aMyePRu//fYb4uLiMHjwYNjb26Nv374AADc3N3Tr1g3Dhw/HqVOncOzYMYSEhGDAgAGwt7cHAAwcOBBGRkYYNmwYLl68iM2bN2Px4sUYO3asRrFyDgsREZEM6GJZ85kzZ9ClSxfpc0kSERgYiIiICEyYMAE5OTn48MMPkZGRgfbt22P37t0wNjaWztmwYQNCQkLQtWtXGBgYoF+/fliyZIl03NLSEnv37kVwcDBat26NWrVqYdq0aRotaQYAQRRFUaMzSCNZWVmwtLRE2t1MlQlQRPrEum2IrkMgqjRiUT7y4lYhM7Ny/hwv+Xvi19PXYGZuUaG+crLvo09b50qLVZc4JERERESyxyEhIiIiGdDFk
NDLhAkLERGRDGhjlY82VwnJDYeEiIiISPZYYSEiIpIBARUf0tHjAgsTFiIiIjkwEB5tFe1DX3FIiIiIiGTvpaiwCIKAbdu2SU/Wo6pl4Zo9iDxwHldvpMFYUR2vNnNGWEgfuNT/7+2fEVuP4uc9Z3Ah/hbu5+TienQ4LC1MVfpp1nsabqakq+ybFtwbY4b4vJD7ICoxZogPenZpDhdHW+TmFeDUhWsI+/pXJNz4750sNjUtMHPUW+jcrjHMTRVIuHEHC1bvwY4DsVIbK6UpwkPfgW/7phBFEb9Fx2LSgp+R8zBfatPXuyXGDvVFg1dscPdeNlZtOYSlP+wHyQ9XCamn8wpLamoqRo4cCWdnZygUCjg4OKBXr14q7y7Qpa1bt8LHxwc1a9aEIAjS44vpxTn+RwKC3umIvavHY+vXISgoLIL/yK+R8zBPavMwtwBdPd2fmXx89j8/XPn9c2n78N1OlR0+USmvt2qI7346DJ8P5sM/5GtUr2aIrUtDYGpsJLVZETYYDR1tMHDsN/B673PsOBCLNXM/gIdrPanNqlmBaOxcB/4hX2PAmJV4vWVDfPXZQOm49+vu+HbWEKz55SheHzAH47/cjI8GvoHh73R8ofdL5SO3dwnJjU4rLNevX4eXlxesrKwwb948eHh4oKCgAHv27EFwcDCuXLmiy/AAADk5OWjfvj369++P4cOH6zqcKunnpcEqn5dPfx8uPpMQe/kmvFo1BAB8NPDRo6WPnv1LbV/mpsawraVfT3+kl887o5arfP54xg9IiPoCLdwccPxcIgDg1WbOGP/FJvxx6QYAYMHqPfj4vTfQws0BcX/dgmt9W3i/3gRdBocj9nIyAGDi/J+w5auPMHXxNqT+m4l3u7+KnQfPY83WowCAG3/fxaKIvfgk8E2s+unwC7xjKg8BFZ80q8f5im4rLB9//DEEQcCpU6fQr18/uLq6okmTJhg7dixOnDjx1PMmTpwIV1dXmJqawtnZGVOnTlV5C+T58+fRpUsXWFhYQKlUonXr1jhz5gwA4MaNG+jVqxesra1hZmaGJk2aYNeuXU+91qBBgzBt2jR4e3tr78apQrKycwEA1krTZ7Qs7au1e+HsPQEdA77AkvX7UFhYpO3wiDSmNH/0XpZ7WQ+kfacuXMNbb7aGldIUgiDA/83WUCiq4ejZqwCAth5OyMh6ICUrAHDwVDyKi0W0buoIADAyqoa8/EKVa+Xm5aOurTUc6tSo7Nsi0iqdVVjS09Oxe/duzJkzB2ZmZqWOW1lZPfVcCwsLREREwN7eHnFxcRg+fDgsLCwwYcIEAEBAQABatmyJFStWwNDQELGxsahevToAIDg4GPn5+Th8+DDMzMxw6dIlmJuba+2+8vLykJf331BFVlaW1vomoLi4GJMW/ox2zZ3h3tBeo3P/924nNG/sACulGU5duIaZy35D2r+ZmDOmXyVFS/RsgiBg7ti3cSI2EZcTU6T9QyetxurPP0DS/nAUFBbhYW4+BoWuQtKtfwEAtjWV+OfefZW+ioqKcS/rAWxrPqoiRp+4jDlj/NEx0hVHzlyFs0NtBAd0BQDY1bIsNaeLdMsAAgwqOKZjoMc1Fp0lLAkJCRBFEY0bN9b43ClTpkg/169fH+PHj8emTZukhCU5ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnitxGKXPnzsWMGTO02if9Z3z4FlxOTMHvq8ZofG7JH9QA0NSlLoyqV8OYz3/EtODeUBhV12aYROU2f0J/uDWog+7DF6nsnzyiJywtTNDn4yVIz8hBj07NsGbuB+gx/CtcSrxdrr7XbjsGp7q1sGnhCFSvZoj7OblYuekgJv3PD8XFxZVxO1QBHBJST2dDQhV5SfTmzZvh5eUFOzs7mJubY8qUKUhO/q8sOnbsWAQFBcHb2xtffPEFEhMTpWOjRo3C7Nmz4eXlhenTp+PChQsVuo8nTZo0CZmZmdJ28+ZNrfZflYWGb8GeI39ix4pRqGtrXeH+Wjepj8KiYiTf5r8ySTfCQ9+Bb4em6PXREty+kyHtr1+3Fj58txNGzvoBh0//hT+v/o3w737HucvJCPr/CbNpd7NQ21r1zb6GhgawVpoi7e5/ld2wr39FvU7j0Kz3NDTq9pk0J+b633cr/waJtEhnCYuLiwsEQdB4Ym1MTAwCAgLQo0cPREZG4ty5c5g8eTLy8/9bxhcWFoaLFy/Cz88P0dHRcHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cu1dp9KRQKKJVKlY0qRhRFhIZvwc6D5/HbilFwrFtLK/3G/XULBgYCateo2OvciZ5HeOg78OvcHL0/WoLk26rJQ8lqoeJi1X/YFRWJEP7/yWCn45JgpTRF88YO0vGObVxhYCDg7J83VM4rLhaR8k8mCgqL0M+nNU5duIa7GdmVcVtUEYKWNj2ls4SlRo0a8PX1xbJly5CTk1PqeEZGRpnnHT9+HI6Ojpg8eTLatGkDFxcX3Lhxo1Q7V1dXjBkzBnv37oW/vz/WrFkjHXNwcMCIESOwdetWjBs3DqtWrdLafZH2jf9yC7b8fhqrZg2Buakx0v7NQtq/WXiY+1+SmvZvFuLib+HazUfj+xcTbiMu/hbuZT763Tp14RpWbDyAuL9u4fqtf7Hl99OYvOgX9O/eFlbPMXmXqCLmT+yP/t3bYvjUCGQ/yIVNTQvY1LSAseLR0ORf11ORmHwHiya9h1bujqhftxaCA95Al3aNsOvg+f9vk4Z9xy9i8eSBaOXuiHbNnBEe2h9b9/6B1H8zAQA1LM0w1L89XBxt0dS1LuaO64c+XVti0oJfdHbv9HSClv7TVzpd1rxs2TJ4eXnh1VdfxcyZM9GsWTMUFhYiKioKK1aswOXLl0ud4+LiguTkZGzatAlt27bFzp07peoJADx8+BChoaF4++234eTkhFu3buH06dPo1+/RxMrRo0eje/fucHV1xb1793DgwAG4ubk9Ncb09HQkJyfj9u1HY8bx8fEAADs7O9jZ2Wnz66CnWP3LEQBAzxGLVfYvm/Y+BvZ6DQCwZusRfLnqd+mY34dfqbRRGFXH1qiz+GLVLuQXFMLRviY+eq8LggPeeDE3QfSYYW8/GtbZ+c1olf0fz1iPHyNPorCoGP1Hr8D0kD74ceH/YGaqQNLNf/Bx2HpEHb8ktR8+dS3mhfbH9uUjpQfHfTr/J5U+B/i1w8xP3oIgPKrK9BqxWBoWInqZCGJFJpNoQUpKCubMmYPIyEikpKSgdu3aaN26NcaMGYPOnTs/CvKJJ91OmDABq1evRl5eHvz8/PDaa68hLCwMGRkZyM/PR2BgII4dO4a0tDTUqlUL/v7+mDdvHoyNjTFy5Ej8/vvvuHXrFpRKJbp164ZFixahZs2aZcYXERGBoUOHlto/ffp0hIWFPfP+srKyYGlpibS7mRweIr1l3TZE1yEQVRqxKB95cauQmVk5f46X/D2xPzYZ5hYV6z/7fha6tnil0mLVJZ0nLPqOCQtVBUxYSJ+9qIQlWksJyxt6mrDo/NH8RERERM/yUrz8kIiI
SO/xQSxqMWEhIiKSAb6tWT0mLERERDKgjbct6/PbmjmHhYiIiGSPFRYiIiIZ4BQW9ZiwEBERyQEzFrU4JERERESyxwoLERGRDHCVkHpMWIiIiGSAq4TU45AQERERyR4rLERERDLAObfqMWEhIiKSA2YsanFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQBXCanHhIWIiEgGOIVFPc5hISIiItljhYWIiEgOWGJRiwkLERGRDHDSrXocEiIiIiLZY4WFiIhIBrhKSD0mLERERDLAKSzqcUiIiIiIZI8VFiIiIjlgiUUtJixEREQywFVC6nFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQCnsKjHhIWIiEgOmLGoxTksREREJHussBAREckAVwmpx4SFiIhIDrQw6VaP8xUOCREREZH8scJCREQkA5xzqx4TFiIiIjlgxqIWh4SIiIhI9lhhISIikgGuElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiIZ4Jxb9VhhISIikgNBS5sGwsLCIAiCyta4cWPpeG5uLoKDg1GzZk2Ym5ujX79+SEtLU+kjOTkZfn5+MDU1hY2NDUJDQ1FYWPgcX4B6rLAQERHJgK4m3TZp0gT79u2TPler9l9qMGbMGOzcuRM//fQTLC0tERISAn9/fxw7dgwAUFRUBD8/P9jZ2eH48eNISUnB4MGDUb16dXz++ecVupcnMWEhIiKqwqpVqwY7O7tS+zMzM/H9999j48aNeOONNwAAa9asgZubG06cOIHXXnsNe/fuxaVLl7Bv3z7Y2tqiRYsWmDVrFiZOnIiwsDAYGRlpLU4OCREREcmAgP9WCj339v99ZWVlqWx5eXlPve7Vq1dhb28PZ2dnBAQEIDk5GQBw9uxZFBQUwNvbW2rbuHFjvPLKK4iJiQEAxMTEwMPDA7a2tlIbX19fZGVl4eLFi1r9fpiwEBERyYA2p7A4ODjA0tJS2ubOnVvmNdu1a4eIiAjs3r0bK1asQFJSEjp06ID79+8jNTUVRkZGsLKyUjnH1tYWqampAIDU1FSVZKXkeMkxbeKQEBERkZ65efMmlEql9FmhUJTZrnv37tLPzZo1Q7t27eDo6IgtW7bAxMSk0uPUBCssREREMlDh4aDHHjynVCpVtqclLE+ysrKCq6srEhISYGdnh/z8fGRkZKi0SUtLk+a82NnZlVo1VPK5rHkxFcGEhYiISBZ0sK75CdnZ2UhMTESdOnXQunVrVK9eHfv375eOx8fHIzk5GZ6engAAT09PxMXF4c6dO1KbqKgoKJVKuLu7VyiWJ3FIiIiIqIoaP348evXqBUdHR9y+fRvTp0+HoaEh3nvvPVhaWmLYsGEYO3YsatSoAaVSiZEjR8LT0xOvvfYaAMDHxwfu7u4YNGgQwsPDkZqaiilTpiA4OLjcVZ3yYsJCREQkA7p4l9CtW7fw3nvv4e7du6hduzbat2+PEydOoHbt2gCARYsWwcDAAP369UNeXh58fX2xfPly6XxDQ0NERkbio48+gqenJ8zMzBAYGIiZM2dW7EbKIIiiKGq9V5JkZWXB0tISaXczVSZAEekT67Yhug6BqNKIRfnIi1uFzMzK+XO85O+JKzf+gUUF+7+flYXGjrUrLVZd4hwWIiIikj0OCREREcmALoaEXiZMWIiIiGRAV+8SelkwYSEiIpKDiq9Krvj5MsY5LERERCR7rLAQERHJAAss6jFhISIikgFOulWPQ0JEREQke6ywEBERyQBXCanHhIWIiEgOOIlFLQ4JERERkeyxwkJERCQDLLCox4SFiIhIBrhKSD0OCREREZHsscJCREQkCxVfJaTPg0JMWIiIiGSAQ0LqcUiIiIiIZI8JCxEREckeh4SIiIhkgENC6jFhISIikgE+ml89DgkRERGR7LHCQkREJAMcElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiI5YIlFLSYsREREMsBVQupxSIiIiIhkjxUWIiIiGeAqIfWYsBAREckAp7Cox4SFiIhIDpixqMU5LERERCR7rLAQERHJAFcJqceEhYiISAY46VY9JiyVTBRFAMD9rCwdR0JUecSifF2HQFRpSn6/S/48ryxZWvh7Qht9yBUTlkp2//59AEBDJwcdR0JERBVx//59WFpaar1fIyMj2NnZwUVLf0/Y2dnByMhIK33JiSBWdspYxRUXF+P27duwsLCAoM+1OpnIysqCg4MDbt68CaVSqetwiLSOv+MvniiKuH//Puzt7WFgUDlrVXJzc5Gfr51KpZGREYyNjbXSl5ywwlLJDAwMUK9ePV2HUeUolUr+YU56jb/jL1ZlVFYeZ2xsrJdJhjZxWTMRERHJHhMWIiIikj0mLKRXFAoFpk+fDoVCoetQiCoFf8epquKkWyIiIpI9VliIiIhI9piwEBERkewxYSEiIiLZY8JCsiYIArZv367rMIgqBX+/icqPCQvpTGpqKkaOHAlnZ2coFAo4ODigV69e2L9/v65DA/Do6ZbTpk1DnTp1YGJiAm9vb1y9elXXYdFLQu6/31u3boWPjw9q1qwJQRAQGxur65CI1GLCQjpx/fp1tG7dGtHR0Zg3bx7i4uKwe/dudOnSBcHBwboODwAQHh6OJUuWYOXKlTh58iTMzMzg6+uL3NxcXYdGMvcy/H7n5OSgffv2+PLLL3UdClH5iEQ60L17d7Fu3bpidnZ2qWP37t2TfgYgbtu2Tfo8YcIE0cXFRTQxMRGdnJzEKVOmiPn5+dLx2NhYsXPnzqK5ubloYWEhtmrVSjx9+rQoiqJ4/fp1sWfPnqKVlZVoamoquru7izt37iwzvuLiYtHOzk6cN2+etC8jI0NUKBTijz/+WMG7J30n99/vxyUlJYkAxHPnzj33/RK9CHyXEL1w6enp2L17N+bMmQMzM7NSx62srJ56roWFBSIiImBvb4+4uDgMHz4cFhYWmDBhAgAgICAALVu2xIoVK2BoaIjY2FhUr14dABAcHIz8/HwcPnwYZmZmuHTpEszNzcu8TlJSElJTU+Ht7S3ts7S0RLt27RATE4MBAwZU4BsgffYy/H4TvYyYsNALl5CQAFEU0bhxY43PnTJlivRz/fr1MX78eGzatEn6Az05ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnp14nNTUVAGBra6uy39bWVjpGVJaX4feb6GXEOSz0wokVeLjy5s2b4eXlBTs7O5ibm2PKlClITk6Wjo8dOxZBQUHw9vbGF198gcTEROnYqFGjMHv2bHh5eWH69Om4cOFChe6DqCz8/SaqHExY6IVzcXGBIAi4cuWKRufFxMQgICAAPXr0QGRkJM6dO4fJkycjPz9fahMWFoaLFy/Cz88P0dHRcHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cuLfNadnZ2AIC0tDSV/WlpadIxorK8DL/fRC8l3U6hoaqqW7duGk9KnD9/vujs7KzSdtiwYaKlpeVTrzNgwACxV69eZR779NNPRQ8PjzKPlUy6nT9/vrQvMzOTk26pXOT++/04TrqllwUrLKQ
Ty5YtQ1FREV599VX88ssvuHr1Ki5fvowlS5bA09OzzHNcXFyQnJyMTZs2ITExEUuWLJH+dQkADx8+REhICA4ePIgbN27g2LFjOH36NNzc3AAAo0ePxp49e5CUlIQ//vgDBw4ckI49SRAEjB49GrNnz8Zvv/2GuLg4DB48GPb29ujbt6/Wvw/SL3L//QYeTQ6OjY3FpUuXAADx8fGIjY3lHC2SL11nTFR13b59WwwODhYdHR1FIyMjsW7dumLv3r3FAwcOSG3wxLLP0NBQsWbNmqK5ubn47rvviosWLZL+BZqXlycOGDBAdHBwEI2MjER7e3sxJCREfPjwoSiKohgSEiI2aNBAVCgUYu3atcVBgwaJ//7771PjKy4uFqdOnSra2tqKCoVC7Nq1qxgfH18ZXwXpIbn/fq9Zs0YEUGqbPn16JXwbRBUniGIFZogRERERvQAcEiIiIiLZY8JCREREsseEhYiIiGSPCQsRERHJHhMWIiIikj0mLERERCR7TFiIiIhI9piwEFUBQ4YMUXlCb+fOnTF69OgXHsfBgwchCAIyMjKe2kYQBGzfvr3cfYaFhaFFixYViuv69esQBAGxsbEV6oeIKg8TFiIdGTJkCARBgCAIMDIyQsOGDTFz5kwUFhZW+rW3bt2KWbNmlatteZIMIqLKVk3XARBVZd26dcOaNWuQl5eHXbt2ITg4GNWrV8ekSZNKtc3Pz4eRkZFWrlujRg2t9ENE9KKwwkKkQwqFAnZ2dnB0dMRHH30Eb29v/PbbbwD+G8aZM2cO7O3t0ahRIwDAzZs30b9/f1hZWaFGjRro06cPrl+/LvVZVFSEsWPHwsrKCjVr1sSECRPw5Bs4nhwSysvLw8SJE+Hg4ACFQoGGDRvi+++/x/Xr19GlSxcAgLW1NQRBwJAhQwAAxcXFmDt3LpycnGBiYoLmzZvj559/VrnOrl274OrqChMTE3Tp0kUlzvKaOHEiXF1dYWpqCmdnZ0ydOhUFBQWl2n3zzTdwcHCAqakp+vfvj8zMTJXj3333Hdzc3GBsbIzGjRtj+fLlGsdCRLrDhIVIRkxMTJCfny993r9/P+Lj4xEVFYXIyEgUFBTA19cXFhYWOHLkCI4dOwZzc3N069ZNOm/BggWIiIjA6tWrcfToUaSnp6u89bcsgwcPxo8//oglS5bg8uXL+Oabb2Bubg4HBwf88ssvAB69zTclJQWLFy8GAMydOxfr1q3DypUrcfHiRYwZMwbvv/8+Dh06BOBRYuXv749evXohNjYWQUFB+PTTTzX+TiwsLBAREYFLly5h8eLFWLVqFRYtWqTSJiEhAVu2bMGOHTuwe/dunDt3Dh9//LF0fMOGDZg2bRrmzJmDy5cv4/PPP8fUqVOxdu1ajeMhIh3R8csXiaqswMBAsU+fPqIoPnozdFRUlKhQKMTx48dLx21tbcW8vDzpnPXr14uNGjUSi4uLpX15eXmiiYmJuGfPHlEURbFOnTpieHi4dLygoECsV6+edC1RFMVOnTqJn3zyiSiKohgfHy8CEKOiosqM88CBAyIA8d69e9K+3Nxc0dTUVDx+/LhK22HDhonvvfeeKIqiOGnSJNHd3V3l+MSJE0v19SQ88QbjJ82bN09s3bq19Hn69OmioaGheOvWLWnf77//LhoYGIgpKSmiKIpigwYNxI0bN6r0M2vWLNHT01MURVFMSkoSAYjnzp176nWJSLc4h4VIhyIjI2Fubo6CggIUFxdj4MCBCAsLk457eHiozFs5f/48EhISYGFhodJPbm4uEhMTkZmZiZSUFLRr1046Vq1aNbRp06bUsFCJ2NhYGBoaolOnTuWOOyEhAQ8ePMCbb76psj8/Px8tW7YEAFy+fFklDgDw9PQs9zVKbN68GUuWLEFiYiKys7NRWFgIpVKp0uaVV15B3bp1Va5TXFyM+Ph4WFhYIDExEcOGDcPw4cOlNoWFhbC0tNQ4HiLSDSYsRDrUpUsXrFixAkZGRrC3t0e1aqr/S5qZmal8zs7ORuvWrbFhw4ZSfdWuXfu5YjAxMdH4nOzsbADAzp07VRIF4NG8HG2JiYlBQEAAZsyYAV9fX1haWmLTpk1YsGCBxrGuWrWqVAJlaGiotViJqHIxYSHSITMzMzRs2LDc7Vu1aoXNmzfDxsamVJWhRJ06dXDy5El07NgRwKNKwtmzZ9GqVasy23t4eKC4uBiHDh2Ct7d3qeMlFZ6ioiJpn7u7OxQKBZKTk59amXFzc5MmEJc4ceLEs2/yMcePH4ejoyMmT54s7btx40apdsnJybh9+zbs7e2l6xgYGKBRo0awtbWFvb09rl27hoCAAI2uT0TywUm3RC+RgIAA1KpVC3369MGRI0eQlJSEgwcPYtSoUbh16xYA4JNPPsEXX3yB7du348qVK/j444/VPkOlfv36CAwMxAcffIDt27dLfW7ZsgUA4OjoCEEQEBkZiX/++QfZ2dmwsLDA+PHjMWbMGKxduxaJiYn4448/sHTpUmki64gRI3D16lWEhoYiPj4eGzduREREhEb36+LiguTkZGzatAmJiYlYsmRJmROIjY2NERgYiPPnz+PIkSMYNWoU+vfvDzs7OwDAjBkzMHfuXCxZsgR//fUX4uLisGbNGixcuFCjeIhId5iwEL1ETE1NcfjwYbzyyivw9/eHm5sbhg0bhtzcXKniMm7cOAwaNAiBgYHw9PSEhYUF3nrrLbX9rlixAm+//TY+/vhjNG7cGMOHD0dOTg4AoG7dupgxYwY+/fRT2NraIiQkBAAwa9YsTJ06FXPnzoWbmxu6deuGnTt3wsnJCcCjeSW//PILtm/fjubNm2PlypX4/PPPNbrf3r17Y8yYMQgJCUGLFi1w/PhxTJ06tVS7hg0bwt/fHz169ICPjw+aNWumsmw5KCgI3333HdasWQMPDw906tQJERERUqxEJH+C+LSZeEREREQywQoLERERyR4TFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHhIWIiIhkjwkLERERyR4TFiIiIpI9JixEREQke0xYiIiISPb+D8eEamDpGfNzAAAAAElFTkSuQmCC",
|
760 |
+
"text/plain": [
|
761 |
+
"<Figure size 640x480 with 2 Axes>"
|
762 |
+
]
|
763 |
+
},
|
764 |
+
"metadata": {},
|
765 |
+
"output_type": "display_data"
|
766 |
+
}
|
767 |
+
],
|
768 |
+
"source": [
|
769 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
770 |
+
"import matplotlib.pyplot as plt\n",
|
771 |
+
"\n",
|
772 |
+
"# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
|
773 |
+
"cm = confusion_matrix(predictions.label_ids, preds)\n",
|
774 |
+
"\n",
|
775 |
+
"# 可视化混淆矩阵\n",
|
776 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
777 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
778 |
+
"plt.title('Confusion Matrix')\n",
|
779 |
+
"plt.show()"
|
780 |
+
]
|
781 |
+
},
|
782 |
+
{
|
783 |
+
"cell_type": "code",
|
784 |
+
"execution_count": null,
|
785 |
+
"id": "23e3a640-88d7-4a1e-8515-7c417d50f018",
|
786 |
+
"metadata": {},
|
787 |
+
"outputs": [],
|
788 |
+
"source": []
|
789 |
+
}
|
790 |
+
],
|
791 |
+
"metadata": {
|
792 |
+
"kernelspec": {
|
793 |
+
"display_name": "Python 3 (ipykernel)",
|
794 |
+
"language": "python",
|
795 |
+
"name": "python3"
|
796 |
+
},
|
797 |
+
"language_info": {
|
798 |
+
"codemirror_mode": {
|
799 |
+
"name": "ipython",
|
800 |
+
"version": 3
|
801 |
+
},
|
802 |
+
"file_extension": ".py",
|
803 |
+
"mimetype": "text/x-python",
|
804 |
+
"name": "python",
|
805 |
+
"nbconvert_exporter": "python",
|
806 |
+
"pygments_lexer": "ipython3",
|
807 |
+
"version": "3.12.3"
|
808 |
+
}
|
809 |
+
},
|
810 |
"nbformat": 4,
|
811 |
"nbformat_minor": 5
|
812 |
}
|
03-gene-task/.ipynb_checkpoints/3-multi-seq-task-checkpoint.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
03-gene-task/.ipynb_checkpoints/5-regression-task-checkpoint.ipynb
CHANGED
@@ -1,9 +1,563 @@
|
|
1 |
{
|
2 |
"cells": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": null,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": []
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "c499a5c3-0244-41c4-9947-e166206204e2",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 3.5 回归类任务"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "4678171b-bbc8-49dd-ad04-48f5ef89b45e",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值。\n",
|
17 |
+
"\n",
|
18 |
+
"使用 GPT-2 进行回归问题的解决,可以将回归问题转化为自回归语言模型任务。GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值(如情感评分、价格预测等)。\n",
|
19 |
+
"\n",
|
20 |
+
"---\n",
|
21 |
+
"\n",
|
22 |
+
"### **1. 使用 GPT-2 做回归的核心思路**\n",
|
23 |
+
"\n",
|
24 |
+
"1. **调整输出层**:\n",
|
25 |
+
" - 默认情况下,GPT-2 的输出是一个词汇表大小的概率分布,用于预测下一个 token。\n",
|
26 |
+
" - 对于回归问题,可以将模型的最后一层替换为一个线性层,使得输出变为一个标量或多个连续值。\n",
|
27 |
+
" - gpt2的huggingface实现中,可以简单设置1个分类的分类header,实现回归预测。\n",
|
28 |
+
"\n",
|
29 |
+
"2. **损失函数**:\n",
|
30 |
+
" - 对于回归问题,使用均方误差(MSE)或均绝对误差(MAE)作为损失函数,而不是分类任务中常用的交叉熵。\n",
|
31 |
+
"\n",
|
32 |
+
"3. **输入格式**:\n",
|
33 |
+
" - 输入数据仍然是文本,可以通过特定的模板形式加入上下文信息。\n",
|
34 |
+
"\n",
|
35 |
+
"---\n",
|
36 |
+
"\n",
|
37 |
+
"### **2. GPT-2 回归任务的实现步骤**\n",
|
38 |
+
"\n",
|
39 |
+
"#### **(1)加载基础模型**\n",
|
40 |
+
"\n",
|
41 |
+
"从 Hugging Face Transformers 库加载 GPT-2 模型和分词器,并调整其配置以适应回归任务。\n",
|
42 |
+
"\n",
|
43 |
+
"```python\n",
|
44 |
+
"from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, AutoModelForSequenceClassification\n",
|
45 |
+
"\n",
|
46 |
+
"# 加载分词器\n",
|
47 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
|
48 |
+
"\n",
|
49 |
+
"# 调整模型配置,num_labels=1 表示回归任务\n",
|
50 |
+
"config = GPT2Config.from_pretrained(\"gpt2\", num_labels=1)\n",
|
51 |
+
"\n",
|
52 |
+
"# 加载模型,增加回归输出\n",
|
53 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", config=config)\n",
|
54 |
+
"```\n",
|
55 |
+
"\n",
|
56 |
+
"---\n",
|
57 |
+
"\n",
|
58 |
+
"### **3. 课程数据集**\n",
|
59 |
+
"\n",
|
60 |
+
"本例程使用了蛋白质稳定性分析的数据集,也就是一个蛋白质序列,对应一个float的数值,做回归预测分析。\n",
|
61 |
+
"\n",
|
62 |
+
"**蛋白质稳定性分析**是研究蛋白质在不同条件下保持其结构和功能的能力的过程。蛋白质稳定性是生物化学和生物技术领域的重要课题,影响着蛋白质的折叠、功能执行、以及在应用中的可用性(如工业酶、药物开发等)。\n"
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"cell_type": "code",
|
67 |
+
"execution_count": 1,
|
68 |
+
"id": "1e8c0f86-af78-43e1-8db4-e2a2ea22f815",
|
69 |
+
"metadata": {},
|
70 |
+
"outputs": [
|
71 |
+
{
|
72 |
+
"data": {
|
73 |
+
"text/plain": [
|
74 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
75 |
+
]
|
76 |
+
},
|
77 |
+
"execution_count": 1,
|
78 |
+
"metadata": {},
|
79 |
+
"output_type": "execute_result"
|
80 |
+
}
|
81 |
+
],
|
82 |
+
"source": [
|
83 |
+
"import subprocess\n",
|
84 |
+
"import os\n",
|
85 |
+
"# 设置环境变量, autodl一般区域\n",
|
86 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
87 |
+
"output = result.stdout\n",
|
88 |
+
"for line in output.splitlines():\n",
|
89 |
+
" if '=' in line:\n",
|
90 |
+
" var, value = line.split('=', 1)\n",
|
91 |
+
" os.environ[var] = value\n",
|
92 |
+
"\n",
|
93 |
+
"\"\"\"\n",
|
94 |
+
"import os\n",
|
95 |
+
"\n",
|
96 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
97 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
98 |
+
"\n",
|
99 |
+
"# 打印环境变量以确认设置成功\n",
|
100 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
101 |
+
"\"\"\""
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "code",
|
106 |
+
"execution_count": 2,
|
107 |
+
"id": "c51a8d69-9a36-47e7-8084-f64e6a72e4f7",
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [],
|
110 |
+
"source": [
|
111 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
112 |
+
"from tokenizers import Tokenizer\n",
|
113 |
+
"from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
|
114 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
115 |
+
"from transformers import DataCollatorWithPadding"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"cell_type": "code",
|
120 |
+
"execution_count": 3,
|
121 |
+
"id": "a5aeb7c1-2d2a-4f57-ad8c-659613870e59",
|
122 |
+
"metadata": {},
|
123 |
+
"outputs": [],
|
124 |
+
"source": [
|
125 |
+
"#set tokenizer\n",
|
126 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
|
127 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": 4,
|
133 |
+
"id": "ad0c19cd-96a5-463e-8b7d-439646fef429",
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [
|
136 |
+
{
|
137 |
+
"name": "stderr",
|
138 |
+
"output_type": "stream",
|
139 |
+
"text": [
|
140 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['score.weight']\n",
|
141 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
142 |
+
]
|
143 |
+
}
|
144 |
+
],
|
145 |
+
"source": [
|
146 |
+
"#set model\n",
|
147 |
+
"model = AutoModelForSequenceClassification.from_pretrained('dnagpt/gene_eng_gpt2_v0', num_labels=1)\n",
|
148 |
+
"model.config.pad_token_id = model.config.eos_token_id"
|
149 |
+
]
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"cell_type": "code",
|
153 |
+
"execution_count": 5,
|
154 |
+
"id": "8c48cb0a-6142-4afc-823e-08fb33f74222",
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [
|
157 |
+
{
|
158 |
+
"data": {
|
159 |
+
"text/plain": [
|
160 |
+
"DatasetDict({\n",
|
161 |
+
" train: Dataset({\n",
|
162 |
+
" features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
|
163 |
+
" num_rows: 62079\n",
|
164 |
+
" })\n",
|
165 |
+
" test: Dataset({\n",
|
166 |
+
" features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
|
167 |
+
" num_rows: 6898\n",
|
168 |
+
" })\n",
|
169 |
+
"})"
|
170 |
+
]
|
171 |
+
},
|
172 |
+
"execution_count": 5,
|
173 |
+
"metadata": {},
|
174 |
+
"output_type": "execute_result"
|
175 |
+
}
|
176 |
+
],
|
177 |
+
"source": [
|
178 |
+
"from datasets import load_dataset\n",
|
179 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
180 |
+
"dataset = load_dataset(\"csv\", data_files=\"data/protein_stab.csv\")['train'].train_test_split(test_size=0.1)\n",
|
181 |
+
"dataset"
|
182 |
+
]
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"cell_type": "code",
|
186 |
+
"execution_count": 6,
|
187 |
+
"id": "685dd025-f00a-4869-bc30-9843c77b6d8a",
|
188 |
+
"metadata": {},
|
189 |
+
"outputs": [
|
190 |
+
{
|
191 |
+
"data": {
|
192 |
+
"text/plain": [
|
193 |
+
"{'seq_id': 'train_prot_32672',\n",
|
194 |
+
" 'seq_type': 'prot',\n",
|
195 |
+
" 'seq': 'FYRLIIFKYPDYIDTYLRLAAIAKEKNNLQLSIEGNGSGGNGSGGNGSGN',\n",
|
196 |
+
" 'label': 0.7599999904632561}"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
"execution_count": 6,
|
200 |
+
"metadata": {},
|
201 |
+
"output_type": "execute_result"
|
202 |
+
}
|
203 |
+
],
|
204 |
+
"source": [
|
205 |
+
"dataset[\"train\"][0]"
|
206 |
+
]
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"cell_type": "code",
|
210 |
+
"execution_count": 7,
|
211 |
+
"id": "6e10dbbb-73ef-4b67-8290-77f8896298f5",
|
212 |
+
"metadata": {},
|
213 |
+
"outputs": [
|
214 |
+
{
|
215 |
+
"name": "stdout",
|
216 |
+
"output_type": "stream",
|
217 |
+
"text": [
|
218 |
+
"datasets mean token lenght 17.24006958538707 min token length 12 max token length 35\n"
|
219 |
+
]
|
220 |
+
}
|
221 |
+
],
|
222 |
+
"source": [
|
223 |
+
"token_len_list = []\n",
|
224 |
+
"for item in dataset[\"test\"]:\n",
|
225 |
+
" inputs = tokenizer.tokenize(item[\"seq\"])\n",
|
226 |
+
" token_len_list.append( len(inputs) )\n",
|
227 |
+
"\n",
|
228 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
229 |
+
"min_len = min(token_len_list)\n",
|
230 |
+
"max_len = max(token_len_list)\n",
|
231 |
+
"\n",
|
232 |
+
"print(\"datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
233 |
+
]
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"cell_type": "code",
|
237 |
+
"execution_count": 25,
|
238 |
+
"id": "ac58b5b4-bff0-404d-bcf5-2b93db2b37c0",
|
239 |
+
"metadata": {},
|
240 |
+
"outputs": [
|
241 |
+
{
|
242 |
+
"data": {
|
243 |
+
"application/vnd.jupyter.widget-view+json": {
|
244 |
+
"model_id": "419cce8c5ba249ac8c8773dd2d69992d",
|
245 |
+
"version_major": 2,
|
246 |
+
"version_minor": 0
|
247 |
+
},
|
248 |
+
"text/plain": [
|
249 |
+
"Map: 0%| | 0/62079 [00:00<?, ? examples/s]"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"metadata": {},
|
253 |
+
"output_type": "display_data"
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"name": "stderr",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.\n",
|
260 |
+
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
|
261 |
+
]
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"data": {
|
265 |
+
"application/vnd.jupyter.widget-view+json": {
|
266 |
+
"model_id": "0b9ea09fe3ea49b19f7d52aca7949acf",
|
267 |
+
"version_major": 2,
|
268 |
+
"version_minor": 0
|
269 |
+
},
|
270 |
+
"text/plain": [
|
271 |
+
"Map: 0%| | 0/6898 [00:00<?, ? examples/s]"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
"metadata": {},
|
275 |
+
"output_type": "display_data"
|
276 |
+
}
|
277 |
+
],
|
278 |
+
"source": [
|
279 |
+
"# 2. tokenize\n",
|
280 |
+
"def tokenize_function(examples):\n",
|
281 |
+
" return tokenizer(examples['seq'], truncation=True, padding='max_length')\n",
|
282 |
+
"\n",
|
283 |
+
"# 3. 对数据集应用分词函数\n",
|
284 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
|
285 |
+
"\n",
|
286 |
+
"# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
|
287 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
288 |
+
]
|
289 |
+
},
|
290 |
+
{
|
291 |
+
"cell_type": "code",
|
292 |
+
"execution_count": 26,
|
293 |
+
"id": "94f6d643-2cf7-4651-9a8d-1884b2bddd1c",
|
294 |
+
"metadata": {},
|
295 |
+
"outputs": [
|
296 |
+
{
|
297 |
+
"name": "stderr",
|
298 |
+
"output_type": "stream",
|
299 |
+
"text": [
|
300 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
301 |
+
" warnings.warn(\n",
|
302 |
+
"/tmp/ipykernel_1347/4285456223.py:23: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
|
303 |
+
" trainer = Trainer(\n"
|
304 |
+
]
|
305 |
+
}
|
306 |
+
],
|
307 |
+
"source": [
|
308 |
+
"from transformers import TrainingArguments, Trainer\n",
|
309 |
+
"import numpy as np\n",
|
310 |
+
"from sklearn.metrics import mean_squared_error\n",
|
311 |
+
"\n",
|
312 |
+
"\n",
|
313 |
+
"def compute_metrics(eval_pred):\n",
|
314 |
+
" predictions, labels = eval_pred\n",
|
315 |
+
" rmse = mean_squared_error(labels, predictions)\n",
|
316 |
+
" return {\"rmse\": rmse}\n",
|
317 |
+
"\n",
|
318 |
+
"# 设置训练参数\n",
|
319 |
+
"training_args = TrainingArguments(\n",
|
320 |
+
" output_dir='./results',\n",
|
321 |
+
" evaluation_strategy=\"epoch\",\n",
|
322 |
+
" learning_rate=2e-5,\n",
|
323 |
+
" per_device_train_batch_size=20,\n",
|
324 |
+
" per_device_eval_batch_size=20,\n",
|
325 |
+
" num_train_epochs=10,\n",
|
326 |
+
" weight_decay=0.01,\n",
|
327 |
+
")\n",
|
328 |
+
"\n",
|
329 |
+
"# 使用Trainer API进行训练(假设已有train_dataset和eval_dataset)\n",
|
330 |
+
"trainer = Trainer(\n",
|
331 |
+
" model=model,\n",
|
332 |
+
" args=training_args,\n",
|
333 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
334 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
335 |
+
" tokenizer=tokenizer,\n",
|
336 |
+
" data_collator=data_collator,\n",
|
337 |
+
" compute_metrics=compute_metrics,\n",
|
338 |
+
")"
|
339 |
+
]
|
340 |
+
},
|
341 |
+
{
|
342 |
+
"cell_type": "code",
|
343 |
+
"execution_count": null,
|
344 |
+
"id": "dfe12979-d977-4404-bf9e-18c1f91a3e39",
|
345 |
+
"metadata": {},
|
346 |
+
"outputs": [
|
347 |
+
{
|
348 |
+
"data": {
|
349 |
+
"text/html": [
|
350 |
+
"\n",
|
351 |
+
" <div>\n",
|
352 |
+
" \n",
|
353 |
+
" <progress value='30987' max='31040' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
354 |
+
" [30987/31040 1:00:56 < 00:06, 8.47 it/s, Epoch 9.98/10]\n",
|
355 |
+
" </div>\n",
|
356 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
357 |
+
" <thead>\n",
|
358 |
+
" <tr style=\"text-align: left;\">\n",
|
359 |
+
" <th>Epoch</th>\n",
|
360 |
+
" <th>Training Loss</th>\n",
|
361 |
+
" <th>Validation Loss</th>\n",
|
362 |
+
" <th>Rmse</th>\n",
|
363 |
+
" </tr>\n",
|
364 |
+
" </thead>\n",
|
365 |
+
" <tbody>\n",
|
366 |
+
" <tr>\n",
|
367 |
+
" <td>1</td>\n",
|
368 |
+
" <td>0.044600</td>\n",
|
369 |
+
" <td>0.163462</td>\n",
|
370 |
+
" <td>0.163462</td>\n",
|
371 |
+
" </tr>\n",
|
372 |
+
" <tr>\n",
|
373 |
+
" <td>2</td>\n",
|
374 |
+
" <td>0.041900</td>\n",
|
375 |
+
" <td>0.157900</td>\n",
|
376 |
+
" <td>0.157900</td>\n",
|
377 |
+
" </tr>\n",
|
378 |
+
" <tr>\n",
|
379 |
+
" <td>3</td>\n",
|
380 |
+
" <td>0.037700</td>\n",
|
381 |
+
" <td>0.159724</td>\n",
|
382 |
+
" <td>0.159724</td>\n",
|
383 |
+
" </tr>\n",
|
384 |
+
" <tr>\n",
|
385 |
+
" <td>4</td>\n",
|
386 |
+
" <td>0.031700</td>\n",
|
387 |
+
" <td>0.157686</td>\n",
|
388 |
+
" <td>0.157686</td>\n",
|
389 |
+
" </tr>\n",
|
390 |
+
" <tr>\n",
|
391 |
+
" <td>5</td>\n",
|
392 |
+
" <td>0.028800</td>\n",
|
393 |
+
" <td>0.157124</td>\n",
|
394 |
+
" <td>0.157124</td>\n",
|
395 |
+
" </tr>\n",
|
396 |
+
" <tr>\n",
|
397 |
+
" <td>6</td>\n",
|
398 |
+
" <td>0.025400</td>\n",
|
399 |
+
" <td>0.150852</td>\n",
|
400 |
+
" <td>0.150852</td>\n",
|
401 |
+
" </tr>\n",
|
402 |
+
" <tr>\n",
|
403 |
+
" <td>7</td>\n",
|
404 |
+
" <td>0.022300</td>\n",
|
405 |
+
" <td>0.159293</td>\n",
|
406 |
+
" <td>0.159293</td>\n",
|
407 |
+
" </tr>\n",
|
408 |
+
" <tr>\n",
|
409 |
+
" <td>8</td>\n",
|
410 |
+
" <td>0.019600</td>\n",
|
411 |
+
" <td>0.154608</td>\n",
|
412 |
+
" <td>0.154608</td>\n",
|
413 |
+
" </tr>\n",
|
414 |
+
" <tr>\n",
|
415 |
+
" <td>9</td>\n",
|
416 |
+
" <td>0.017300</td>\n",
|
417 |
+
" <td>0.156104</td>\n",
|
418 |
+
" <td>0.156104</td>\n",
|
419 |
+
" </tr>\n",
|
420 |
+
" </tbody>\n",
|
421 |
+
"</table><p>"
|
422 |
+
],
|
423 |
+
"text/plain": [
|
424 |
+
"<IPython.core.display.HTML object>"
|
425 |
+
]
|
426 |
+
},
|
427 |
+
"metadata": {},
|
428 |
+
"output_type": "display_data"
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"name": "stderr",
|
432 |
+
"output_type": "stream",
|
433 |
+
"text": [
|
434 |
+
"IOPub message rate exceeded.\n",
|
435 |
+
"The Jupyter server will temporarily stop sending output\n",
|
436 |
+
"to the client in order to avoid crashing it.\n",
|
437 |
+
"To change this limit, set the config variable\n",
|
438 |
+
"`--ServerApp.iopub_msg_rate_limit`.\n",
|
439 |
+
"\n",
|
440 |
+
"Current values:\n",
|
441 |
+
"ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
|
442 |
+
"ServerApp.rate_limit_window=3.0 (secs)\n",
|
443 |
+
"\n"
|
444 |
+
]
|
445 |
+
}
|
446 |
+
],
|
447 |
+
"source": [
|
448 |
+
"# 开始训练\n",
|
449 |
+
"trainer.train()"
|
450 |
+
]
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"cell_type": "code",
|
454 |
+
"execution_count": null,
|
455 |
+
"id": "060c4618-40d0-4934-bab8-36aab3a46de5",
|
456 |
+
"metadata": {},
|
457 |
+
"outputs": [],
|
458 |
+
"source": [
|
459 |
+
"#模型测试\n",
|
460 |
+
"predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
|
461 |
+
"predictions"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"cell_type": "code",
|
466 |
+
"execution_count": 18,
|
467 |
+
"id": "1f8ef885-5bc9-4668-905b-6b2235209654",
|
468 |
+
"metadata": {},
|
469 |
+
"outputs": [
|
470 |
+
{
|
471 |
+
"data": {
|
472 |
+
"text/html": [
|
473 |
+
"\n",
|
474 |
+
" <div>\n",
|
475 |
+
" \n",
|
476 |
+
" <progress value='345' max='345' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
477 |
+
" [345/345 00:09]\n",
|
478 |
+
" </div>\n",
|
479 |
+
" "
|
480 |
+
],
|
481 |
+
"text/plain": [
|
482 |
+
"<IPython.core.display.HTML object>"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
"metadata": {},
|
486 |
+
"output_type": "display_data"
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"data": {
|
490 |
+
"text/plain": [
|
491 |
+
"{'eval_loss': 0.15949687361717224,\n",
|
492 |
+
" 'eval_rmse': 0.15949687361717224,\n",
|
493 |
+
" 'eval_runtime': 9.1483,\n",
|
494 |
+
" 'eval_samples_per_second': 754.017,\n",
|
495 |
+
" 'eval_steps_per_second': 37.712,\n",
|
496 |
+
" 'epoch': 10.0}"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
"execution_count": 18,
|
500 |
+
"metadata": {},
|
501 |
+
"output_type": "execute_result"
|
502 |
+
}
|
503 |
+
],
|
504 |
+
"source": [
|
505 |
+
"trainer.evaluate()"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 23,
|
511 |
+
"id": "afabdbe9-9b96-4f9e-bef2-1d819431f8d1",
|
512 |
+
"metadata": {},
|
513 |
+
"outputs": [
|
514 |
+
{
|
515 |
+
"name": "stdout",
|
516 |
+
"output_type": "stream",
|
517 |
+
"text": [
|
518 |
+
"[[ 1.7208484 ]\n",
|
519 |
+
" [ 0.00225139]\n",
|
520 |
+
" [ 0.3325616 ]\n",
|
521 |
+
" [-0.34372616]\n",
|
522 |
+
" [-0.45505935]\n",
|
523 |
+
" [-0.06892765]\n",
|
524 |
+
" [ 0.15099108]\n",
|
525 |
+
" [ 0.12211376]\n",
|
526 |
+
" [ 0.3947332 ]\n",
|
527 |
+
" [ 0.23186803]]\n"
|
528 |
+
]
|
529 |
+
}
|
530 |
+
],
|
531 |
+
"source": [
|
532 |
+
"predictions.predictions[0:10].squeeze()"
|
533 |
+
]
|
534 |
+
},
|
535 |
+
{
|
536 |
+
"cell_type": "code",
|
537 |
+
"execution_count": 24,
|
538 |
+
"id": "fa9d17fd-eece-4c1e-99e0-3d19d36f7584",
|
539 |
+
"metadata": {},
|
540 |
+
"outputs": [
|
541 |
+
{
|
542 |
+
"data": {
|
543 |
+
"text/plain": [
|
544 |
+
"array([ 1.69, 0.84, 0.58, -0.15, 0.23, 0.03, 0.15, 0.2 , 0.51,\n",
|
545 |
+
" 1.1 ], dtype=float32)"
|
546 |
+
]
|
547 |
+
},
|
548 |
+
"execution_count": 24,
|
549 |
+
"metadata": {},
|
550 |
+
"output_type": "execute_result"
|
551 |
+
}
|
552 |
+
],
|
553 |
+
"source": [
|
554 |
+
"predictions.label_ids[0:10]"
|
555 |
+
]
|
556 |
+
},
|
557 |
{
|
558 |
"cell_type": "code",
|
559 |
"execution_count": null,
|
560 |
+
"id": "52252015-e068-414b-bd8a-79a5d1a2beec",
|
561 |
"metadata": {},
|
562 |
"outputs": [],
|
563 |
"source": []
|
03-gene-task/1-category-task.ipynb
CHANGED
@@ -1,9 +1,788 @@
|
|
1 |
{
|
2 |
"cells": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": null,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": []
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "5840e900-43cb-4ab4-81a5-988b68fda9b1",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 3.1 序列分类任务"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "958e7b5f-759a-431c-8af0-325271facb41",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"基于 GPT-2 模型,可以通过微调(fine-tuning)或使用提示(prompt-based)方法来完成多种下游任务。\n",
|
17 |
+
"本章主要使用经典的微调方式,提示微调则属于chatgpt的范围,放在下一章,以下是几种常见的下游任务及其简单描述:\n",
|
18 |
+
"\n",
|
19 |
+
"\n",
|
20 |
+
"### 1. **文本分类**\n",
|
21 |
+
"\n",
|
22 |
+
"#### 任务描述\n",
|
23 |
+
"\n",
|
24 |
+
"文本分类是将文本分配到一个或多个预定义类别中的任务。例如,情感分析、主题分类等。生物序列中对应如启动序列等分类问题。\n",
|
25 |
+
"\n",
|
26 |
+
"#### 使用的模型类型\n",
|
27 |
+
"\n",
|
28 |
+
"- **GPT2ForSequenceClassification或AutoModelForSequenceClassification**:该模型在 GPT-2 的基础上添加了一个分类头,用于处理文本分类任务。通过微调这个模型,可以将其应用于多种分类任务。\n",
|
29 |
+
"\n",
|
30 |
+
"### 2. **机器翻译**\n",
|
31 |
+
"\n",
|
32 |
+
"#### 任务描述\n",
|
33 |
+
"\n",
|
34 |
+
"机器翻译是指将一种语言的文本转换为另一种语言的过程。生物学中,可以是生物序列到功能描述(英文)的翻译。\n",
|
35 |
+
"\n",
|
36 |
+
"#### 使用的模型类型\n",
|
37 |
+
"\n",
|
38 |
+
"- **AutoModelForSeq2SeqLM**:虽然 GPT-2 不是专门为机器翻译设计的模型,但可以通过构造特定格式的提示,让 GPT-2 根据上下文生成目标语言的翻译结果。\n",
|
39 |
+
"- **注意**:对于机器翻译任务,通常更推荐使用专门为此类任务设计的模型,如 T5 或 mBART。\n",
|
40 |
+
"\n",
|
41 |
+
"### 3. **词性标注 (POS Tagging)**\n",
|
42 |
+
"\n",
|
43 |
+
"#### 任务描述\n",
|
44 |
+
"\n",
|
45 |
+
"词性标注是指为每个单词分配其正确的词性标签(如名词、动词、形容词等)。生物学中,对应于结构预测任务,典型的如二级结构预测。\n",
|
46 |
+
"\n",
|
47 |
+
"#### 使用的模型类型\n",
|
48 |
+
"\n",
|
49 |
+
"- **AutoModelForTokenClassification**:该模型适用于标记级别的分类任务。通过微调,可以将 GPT-2 应用于词性标注,每个 token 的隐藏状态会被映射到相应的词性标签。\n",
|
50 |
+
"\n",
|
51 |
+
"### 4. **命名实体识别 (NER)**\n",
|
52 |
+
"\n",
|
53 |
+
"#### 任务描述\n",
|
54 |
+
"\n",
|
55 |
+
"命名实体识别是指识别文本中的人名、地名、组织机构等实体,并对其进行分类。生物学中,也对应于结构预测任务,典型的如膜结构预测。和词性标注类似。\n",
|
56 |
+
"\n",
|
57 |
+
"#### 使用的模型类型\n",
|
58 |
+
"\n",
|
59 |
+
"- **AutoModelForTokenClassification**:类似于词性标注,该模型可以用于 NER 任务,通过对每个 token 进行分类来识别和标注命名实体。\n",
|
60 |
+
"\n",
|
61 |
+
"### 5. **问答系统**\n",
|
62 |
+
"\n",
|
63 |
+
"#### 任务描述\n",
|
64 |
+
"\n",
|
65 |
+
"问答系统旨在根据给定的问题从文档或知识库中提取答案。目前一些最新的生物学大模型论文中,输入是包含生物序列的问题,回答则也是混合式的。一般是生物学领域的QA。\n",
|
66 |
+
"\n",
|
67 |
+
"#### 使用的模型类型\n",
|
68 |
+
"\n",
|
69 |
+
"- **AutoModelForQuestionAnswering**:该模型专门用于问答任务,能够理解问题并从上下文中提取答案。通过微调,它可以适应特定领域的问答需求。\n",
|
70 |
+
"\n",
|
71 |
+
"### 6. **文本生成**\n",
|
72 |
+
"\n",
|
73 |
+
"#### 任务描述\n",
|
74 |
+
"\n",
|
75 |
+
"文本生成是指根据给定的提示或前缀生成连贯的文本内容。生物学中,对应新的序列生成,如产生全新的蛋白质序列。\n",
|
76 |
+
"\n",
|
77 |
+
"#### 使用的模型类型\n",
|
78 |
+
"\n",
|
79 |
+
"- **GPT2LMHeadModel**:这是 GPT-2 的标准语言模型版本,擅长生成自然流畅的文本。它可以根据输入的提示生成后续文本,广泛应用于创作、对话系统等领域。\n",
|
80 |
+
"\n",
|
81 |
+
"### 6. **回归问题**\n",
|
82 |
+
"\n",
|
83 |
+
"#### 任务描述\n",
|
84 |
+
"\n",
|
85 |
+
"生物序列相关的回归问题,输入为序列,输出为一个float值。\n",
|
86 |
+
"\n",
|
87 |
+
"#### 使用的模型类型\n",
|
88 |
+
"\n",
|
89 |
+
"- huggingface没有特定的header,但一般回归问题,输出使用一个线性层即可,设定损失函数为均方误差(MSE)即可。最简单的,就是使用AutoModelForTokenClassification,类别数设置为1,输出的label为实测float值即可。\n",
|
90 |
+
"一个官方推荐的 [例子](https://github.com/huggingface/transformers/blob/7ae6f070044b0171a71f3269613bf02fd9fca6f2/src/transformers/models/bert/modeling_bert.py#L1564-L1575)\n",
|
91 |
+
"\n",
|
92 |
+
"### 小结\n",
|
93 |
+
"\n",
|
94 |
+
"GPT-2 可以通过微调或提示工程应用于多种下游任务。不同的任务需要使用特定类型的模型,这些模型基于 GPT-2 并添加了额外的组件或进行了调整,以更好地适应���定的任务需求\n",
|
95 |
+
"\n",
|
96 |
+
"<img src=\"img/gpt2-ft.png\" width=\"800px\" />"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": 1,
|
102 |
+
"id": "eca17933-7b8f-44de-8c59-ea7a1c8a3b33",
|
103 |
+
"metadata": {},
|
104 |
+
"outputs": [
|
105 |
+
{
|
106 |
+
"data": {
|
107 |
+
"text/plain": [
|
108 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
109 |
+
]
|
110 |
+
},
|
111 |
+
"execution_count": 1,
|
112 |
+
"metadata": {},
|
113 |
+
"output_type": "execute_result"
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"source": [
|
117 |
+
"import subprocess\n",
|
118 |
+
"import os\n",
|
119 |
+
"# 设置环境变量, autodl一般区域\n",
|
120 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
121 |
+
"output = result.stdout\n",
|
122 |
+
"for line in output.splitlines():\n",
|
123 |
+
" if '=' in line:\n",
|
124 |
+
" var, value = line.split('=', 1)\n",
|
125 |
+
" os.environ[var] = value\n",
|
126 |
+
"\n",
|
127 |
+
"\"\"\"\n",
|
128 |
+
"import os\n",
|
129 |
+
"\n",
|
130 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
131 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
132 |
+
"\n",
|
133 |
+
"# 打印环境变量以确认设置成功\n",
|
134 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
135 |
+
"\"\"\""
|
136 |
+
]
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"cell_type": "code",
|
140 |
+
"execution_count": 2,
|
141 |
+
"id": "108d9c3c-ae4d-4110-a532-a40a6fe1f9df",
|
142 |
+
"metadata": {},
|
143 |
+
"outputs": [],
|
144 |
+
"source": [
|
145 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
146 |
+
"from tokenizers import Tokenizer\n",
|
147 |
+
"from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
|
148 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
149 |
+
"from transformers import DataCollatorWithPadding"
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 6,
|
155 |
+
"id": "bcdc9f7a-1ea5-4647-b87e-ac72ddf17818",
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [
|
158 |
+
{
|
159 |
+
"data": {
|
160 |
+
"application/vnd.jupyter.widget-view+json": {
|
161 |
+
"model_id": "c2e31c61549449e78a4e1fe0e884233f",
|
162 |
+
"version_major": 2,
|
163 |
+
"version_minor": 0
|
164 |
+
},
|
165 |
+
"text/plain": [
|
166 |
+
"tokenizer_config.json: 0%| | 0.00/580 [00:00<?, ?B/s]"
|
167 |
+
]
|
168 |
+
},
|
169 |
+
"metadata": {},
|
170 |
+
"output_type": "display_data"
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"data": {
|
174 |
+
"application/vnd.jupyter.widget-view+json": {
|
175 |
+
"model_id": "da2009ca96634f759f052a9a4ff7e41e",
|
176 |
+
"version_major": 2,
|
177 |
+
"version_minor": 0
|
178 |
+
},
|
179 |
+
"text/plain": [
|
180 |
+
"vocab.json: 0%| | 0.00/642k [00:00<?, ?B/s]"
|
181 |
+
]
|
182 |
+
},
|
183 |
+
"metadata": {},
|
184 |
+
"output_type": "display_data"
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"data": {
|
188 |
+
"application/vnd.jupyter.widget-view+json": {
|
189 |
+
"model_id": "b6b6ec58d8cb4878aa2e0786ff0bbcf4",
|
190 |
+
"version_major": 2,
|
191 |
+
"version_minor": 0
|
192 |
+
},
|
193 |
+
"text/plain": [
|
194 |
+
"merges.txt: 0%| | 0.00/323k [00:00<?, ?B/s]"
|
195 |
+
]
|
196 |
+
},
|
197 |
+
"metadata": {},
|
198 |
+
"output_type": "display_data"
|
199 |
+
},
|
200 |
+
{
|
201 |
+
"data": {
|
202 |
+
"application/vnd.jupyter.widget-view+json": {
|
203 |
+
"model_id": "5dbb5171eb6242bdbded42c87ef46c27",
|
204 |
+
"version_major": 2,
|
205 |
+
"version_minor": 0
|
206 |
+
},
|
207 |
+
"text/plain": [
|
208 |
+
"special_tokens_map.json: 0%| | 0.00/473 [00:00<?, ?B/s]"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
"metadata": {},
|
212 |
+
"output_type": "display_data"
|
213 |
+
}
|
214 |
+
],
|
215 |
+
"source": [
|
216 |
+
"#set tokenizer\n",
|
217 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/dna_gpt2_v0\")\n",
|
218 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
219 |
+
]
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"cell_type": "code",
|
223 |
+
"execution_count": 3,
|
224 |
+
"id": "0e930ef5-865a-4528-84b5-ddae6d710a99",
|
225 |
+
"metadata": {},
|
226 |
+
"outputs": [
|
227 |
+
{
|
228 |
+
"name": "stderr",
|
229 |
+
"output_type": "stream",
|
230 |
+
"text": [
|
231 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/dna_gpt2_v0 and are newly initialized: ['score.weight']\n",
|
232 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
233 |
+
]
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"data": {
|
237 |
+
"text/plain": [
|
238 |
+
"GPT2ForSequenceClassification(\n",
|
239 |
+
" (transformer): GPT2Model(\n",
|
240 |
+
" (wte): Embedding(30000, 768)\n",
|
241 |
+
" (wpe): Embedding(1024, 768)\n",
|
242 |
+
" (drop): Dropout(p=0.1, inplace=False)\n",
|
243 |
+
" (h): ModuleList(\n",
|
244 |
+
" (0-11): 12 x GPT2Block(\n",
|
245 |
+
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
246 |
+
" (attn): GPT2SdpaAttention(\n",
|
247 |
+
" (c_attn): Conv1D(nf=2304, nx=768)\n",
|
248 |
+
" (c_proj): Conv1D(nf=768, nx=768)\n",
|
249 |
+
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
|
250 |
+
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
|
251 |
+
" )\n",
|
252 |
+
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
253 |
+
" (mlp): GPT2MLP(\n",
|
254 |
+
" (c_fc): Conv1D(nf=3072, nx=768)\n",
|
255 |
+
" (c_proj): Conv1D(nf=768, nx=3072)\n",
|
256 |
+
" (act): NewGELUActivation()\n",
|
257 |
+
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
258 |
+
" )\n",
|
259 |
+
" )\n",
|
260 |
+
" )\n",
|
261 |
+
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
262 |
+
" )\n",
|
263 |
+
" (score): Linear(in_features=768, out_features=2, bias=False)\n",
|
264 |
+
")"
|
265 |
+
]
|
266 |
+
},
|
267 |
+
"execution_count": 3,
|
268 |
+
"metadata": {},
|
269 |
+
"output_type": "execute_result"
|
270 |
+
}
|
271 |
+
],
|
272 |
+
"source": [
|
273 |
+
"#set model\n",
|
274 |
+
"model = AutoModelForSequenceClassification.from_pretrained('dnagpt/dna_gpt2_v0', num_labels=2)\n",
|
275 |
+
"model.config.pad_token_id = model.config.eos_token_id\n",
|
276 |
+
"model"
|
277 |
+
]
|
278 |
+
},
|
279 |
+
{
|
280 |
+
"cell_type": "markdown",
|
281 |
+
"id": "bd14794b-e507-4c1d-be47-0e0144835f18",
|
282 |
+
"metadata": {},
|
283 |
+
"source": [
|
284 |
+
"在生物学中,**启动子(promoter)** 是一段特定的DNA序列,它位于基因的上游(通常是5'端),并且是转录起始的关键调控元件。启动子的主要功能是为RNA聚合酶提供结合位点,并招募其他转录因子,以启动基因转录过程。以下是关于启动子的一些重要概念和特点:\n",
|
285 |
+
"\n",
|
286 |
+
"### 启动子的功能\n",
|
287 |
+
"\n",
|
288 |
+
"1. **转录起始**:\n",
|
289 |
+
" - 启动子是基因表达的第一步,它决定了何时、何地以及多频繁地进行转录。\n",
|
290 |
+
" \n",
|
291 |
+
"2. **调控基因表达**:\n",
|
292 |
+
" - 不同类型的启动子可以调节不同组织或细胞类型中的基因表达水平。例如,在某些细胞中高度活跃而在其他细胞中不活跃。\n",
|
293 |
+
"\n",
|
294 |
+
"3. **与转录因子和其他蛋白质相互作用**:\n",
|
295 |
+
" - 启动子区域通常包含多个顺式作用元件(cis-regulatory elements),这些元件可以与特定的转录因子或其他调控蛋白结合,进一步精细调整基因表达。\n",
|
296 |
+
" \n",
|
297 |
+
" \n",
|
298 |
+
"在生物学中,启动子(promoter)序列的二分类问题通常是指将DNA序列分为两类:**启动子序列**和**非启动子序列**。这种分类任务的目标是通过机器学习或生物信息学方法来预测给定的DNA序列是否具有启动子功能。\n",
|
299 |
+
"\n",
|
300 |
+
"### 二分类问题中的两个类别\n",
|
301 |
+
"\n",
|
302 |
+
"1. **启动子序列(Promoter Sequences)**:\n",
|
303 |
+
" - 这些序列包含能够指导转录起始的调控元件,通常是位于基因5'端上游区域的一段DNA。\n",
|
304 |
+
" - 启动子序列可能含有特定的保守基序(motifs),如TATA盒、CAAT盒等,这些基序对于RNA聚合酶及其辅助因子的结合至关重要。\n",
|
305 |
+
"\n",
|
306 |
+
"2. **非启动子序列(Non-Promoter Sequences)**:\n",
|
307 |
+
" - 这类序列指的是那些不具有启动子功能的DNA片段。它们可以来自基因内部(编码区或内含子)、基因间区域(intergenic regions)或其他调控元件(如增强子、沉默子等),但明确不是启动子。\n",
|
308 |
+
" - 非启动子序列不具备启动转录的能力,或者至少在自然条件下不会作为主要的转录起始点。\n",
|
309 |
+
"\n",
|
310 |
+
"### 启动子的研究意义\n",
|
311 |
+
"\n",
|
312 |
+
"理解启动子的工作机制对于揭示基因表达调控网络非常重要。这不仅有助于基础科学研究,而且对于医学应用也有着深远的影响,比如开发新的治疗策略来纠正异常的基因表达模式,或者利用合成生物学设计定制化的基因表达系统。\n"
|
313 |
+
]
|
314 |
+
},
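The cells that follow load the promoter dataset and fine-tune the classifier on it. As a bridge from the description above, here is a minimal sketch (not part of the original notebook) of how a single (sequence, label) record is framed for this binary task with the tokenizer loaded earlier; the DNA fragment is a shortened copy of the example record shown further below.

```python
# Minimal sketch: turn one (sequence, label) record into model inputs.
# Assumes the `tokenizer` loaded above; the fragment is a shortened copy of the
# example sequence shown later in this notebook and is used only for illustration.
example = {
    "sequence": "CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGG",
    "label": 0,  # 0 = non-promoter, 1 = promoter
}

enc = tokenizer(
    example["sequence"],
    truncation=True,
    padding="max_length",
    max_length=128,  # same length budget used by tokenize_function below
    return_tensors="pt",
)
print(enc["input_ids"].shape, enc["attention_mask"].shape, example["label"])
```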
|
315 |
+
{
|
316 |
+
"cell_type": "code",
|
317 |
+
"execution_count": 8,
|
318 |
+
"id": "aee08f3f-6cda-4975-8cb9-9a7bfacb9eac",
|
319 |
+
"metadata": {},
|
320 |
+
"outputs": [
|
321 |
+
{
|
322 |
+
"data": {
|
323 |
+
"application/vnd.jupyter.widget-view+json": {
|
324 |
+
"model_id": "82d2ec71cf6648469040897d9174a55f",
|
325 |
+
"version_major": 2,
|
326 |
+
"version_minor": 0
|
327 |
+
},
|
328 |
+
"text/plain": [
|
329 |
+
"README.md: 0%| | 0.00/314 [00:00<?, ?B/s]"
|
330 |
+
]
|
331 |
+
},
|
332 |
+
"metadata": {},
|
333 |
+
"output_type": "display_data"
|
334 |
+
},
|
335 |
+
{
|
336 |
+
"data": {
|
337 |
+
"application/vnd.jupyter.widget-view+json": {
|
338 |
+
"model_id": "40183e0714ea4155a2c0772fb7c72a00",
|
339 |
+
"version_major": 2,
|
340 |
+
"version_minor": 0
|
341 |
+
},
|
342 |
+
"text/plain": [
|
343 |
+
"train-00000-of-00001.parquet: 0%| | 0.00/8.66M [00:00<?, ?B/s]"
|
344 |
+
]
|
345 |
+
},
|
346 |
+
"metadata": {},
|
347 |
+
"output_type": "display_data"
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"data": {
|
351 |
+
"application/vnd.jupyter.widget-view+json": {
|
352 |
+
"model_id": "8e5ebe15df194e3c8bf5811777755947",
|
353 |
+
"version_major": 2,
|
354 |
+
"version_minor": 0
|
355 |
+
},
|
356 |
+
"text/plain": [
|
357 |
+
"Generating train split: 0%| | 0/59195 [00:00<?, ? examples/s]"
|
358 |
+
]
|
359 |
+
},
|
360 |
+
"metadata": {},
|
361 |
+
"output_type": "display_data"
|
362 |
+
}
|
363 |
+
],
|
364 |
+
"source": [
|
365 |
+
"from datasets import load_dataset\n",
|
366 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
367 |
+
"dataset = load_dataset(\"dnagpt/dna_promoter_300\")['train'].train_test_split(test_size=0.1)"
|
368 |
+
]
|
369 |
+
},
|
370 |
+
{
|
371 |
+
"cell_type": "code",
|
372 |
+
"execution_count": 9,
|
373 |
+
"id": "6ac9fe5b-2175-42d8-949c-cb12bc8fb65c",
|
374 |
+
"metadata": {},
|
375 |
+
"outputs": [
|
376 |
+
{
|
377 |
+
"data": {
|
378 |
+
"text/plain": [
|
379 |
+
"DatasetDict({\n",
|
380 |
+
" train: Dataset({\n",
|
381 |
+
" features: ['sequence', 'label'],\n",
|
382 |
+
" num_rows: 53275\n",
|
383 |
+
" })\n",
|
384 |
+
" test: Dataset({\n",
|
385 |
+
" features: ['sequence', 'label'],\n",
|
386 |
+
" num_rows: 5920\n",
|
387 |
+
" })\n",
|
388 |
+
"})"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
"execution_count": 9,
|
392 |
+
"metadata": {},
|
393 |
+
"output_type": "execute_result"
|
394 |
+
}
|
395 |
+
],
|
396 |
+
"source": [
|
397 |
+
"dataset"
|
398 |
+
]
|
399 |
+
},
|
400 |
+
{
|
401 |
+
"cell_type": "code",
|
402 |
+
"execution_count": 10,
|
403 |
+
"id": "b5025f95-ca5d-42b1-95e1-55495f77d009",
|
404 |
+
"metadata": {},
|
405 |
+
"outputs": [
|
406 |
+
{
|
407 |
+
"data": {
|
408 |
+
"text/plain": [
|
409 |
+
"{'sequence': 'CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGGCCCGCTGCACGAGAGACCATTATGGTGATCCGCCCGCCTGACACTACTGATATGTTGGGATTACAGGCGTGAGCCACGGCGCCCGGCGGGCAAGACACCCTCAGAGCACAGGGTGAATCCATGGTTAAAATACAGCGGGAAGTTAGCGCCGAAGTCGCCGTGTAATTTGTGCGCGGTTCAGGTTCATGTATTCAGAATCATTTTACTAGGTTTAGGGCTCGCCGCTGCCTCAGTGGCTTTCAGGCGCTT',\n",
|
410 |
+
" 'label': 0}"
|
411 |
+
]
|
412 |
+
},
|
413 |
+
"execution_count": 10,
|
414 |
+
"metadata": {},
|
415 |
+
"output_type": "execute_result"
|
416 |
+
}
|
417 |
+
],
|
418 |
+
"source": [
|
419 |
+
"dataset[\"train\"][0]"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"cell_type": "code",
|
424 |
+
"execution_count": 13,
|
425 |
+
"id": "ac999213-67b1-4294-8d92-80b8c6c68acd",
|
426 |
+
"metadata": {},
|
427 |
+
"outputs": [
|
428 |
+
{
|
429 |
+
"name": "stdout",
|
430 |
+
"output_type": "stream",
|
431 |
+
"text": [
|
432 |
+
"dna datasets mean token lenght 52.41266891891892 min token length 33 max token length 60\n"
|
433 |
+
]
|
434 |
+
}
|
435 |
+
],
|
436 |
+
"source": [
|
437 |
+
"token_len_list = []\n",
|
438 |
+
"for item in dataset[\"test\"]:\n",
|
439 |
+
" inputs = tokenizer.tokenize(item[\"sequence\"])\n",
|
440 |
+
" token_len_list.append( len(inputs) )\n",
|
441 |
+
"\n",
|
442 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
443 |
+
"min_len = min(token_len_list)\n",
|
444 |
+
"max_len = max(token_len_list)\n",
|
445 |
+
"\n",
|
446 |
+
"print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
447 |
+
]
|
448 |
+
},
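The statistics above (mean about 52 BPE tokens per 300-bp sequence, maximum 60) are what make the `max_length=128` used in the tokenization step below a comfortable budget. A quick sanity check, reusing `token_len_list` from the cell above:

```python
# Sanity check: count sequences that would be truncated at a given max_length.
# Assumes token_len_list from the cell above.
max_length = 128
truncated = sum(1 for n in token_len_list if n > max_length)
print(f"{truncated} of {len(token_len_list)} test sequences exceed max_length={max_length}")
```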
|
449 |
+
{
|
450 |
+
"cell_type": "code",
|
451 |
+
"execution_count": 14,
|
452 |
+
"id": "72a2dec3-043b-41e4-afd8-4dbd8c8fcbb0",
|
453 |
+
"metadata": {},
|
454 |
+
"outputs": [
|
455 |
+
{
|
456 |
+
"data": {
|
457 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIjCAYAAAAJLyrXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABdnElEQVR4nO3deZyN9f//8eeZfTEzxjIbZowte2NLEyIUkQotiqylLBXSotVSREiopG9ZKh+lT4sP2UVCkrJlrNHImGEyjDH7zPX7Y36OTmO7zpwzZ4bH/XY7N3Nd1/v1vt7nOE7z7Hpf72MxDMMQAAAAAOCqubl6AAAAAABQ2hCkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpADAyUaPHi2LxVIs52rTpo3atGlj3V63bp0sFou+/PLLYjl/3759VbVq1WI5l73S0tL06KOPKiwsTBaLRcOGDXPq+c7//ScnJzv1PNe6vn37qkyZMq4eBgBYEaQAwIS5c+fKYrFYHz4+PoqIiFCHDh00ffp0nT171iHnSUhI0OjRo7V9+3aH9OdIJXlsV2P8+PGaO3euBg0apE8++USPPPJIoTbnw8+VHv8MraVBcQdrs9LT0zV69GitW7fO1UMBgCvycPUAAKA0Gjt2rKKjo5WTk6PExEStW7dOw4YN09SpU7V48WI1bNjQ2vbll1/WCy+8YKr/hIQEjRkzRlWrVlVMTMxV161cudLUeexxubF9+OGHys/Pd/oYimLt2rW6+eab9dprr12yTbdu3VSjRg3rdlpamgYNGqSuXbuqW7du1v2hoaFOHev1Jj09XWPGjJGkUhdSAVx/CFIAYIc777xTTZs2tW6PGjVKa9eu1V133aW7775bcXFx8vX1lSR5eHjIw8O5H7fp6eny8/OTl5eXU89zJZ6eni49/9U4ceKE6tate9k2DRs2tAnDycnJGjRokBo2bKhevXo5e4gAgFKAqX0A4CBt27bVK6+8oj///FOffvqpdf/F7pFatWqVWrZsqbJly6pMmTK64YYb9OKLL0oqmH7VrFkzSVK/fv2s08jmzp0rqeD/1NevX1/btm3TrbfeKj8/P2vtv++ROi8vL08vvviiwsLC5O/vr7vvvltHjx61aVO1alX17du3UO0/+7zS2C52j9S5c+f0zDPPqEqVKvL29tYNN9ygyZMnyzAMm3YWi0VDhw7VN998o/r168vb21v16tXT8uXLL/6C/8uJEyc0YMAAhYaGysfHRzfeeKPmzZtnPX5+Wtvhw4e1dOlS69iPHDlyVf1fzNq1a9WqVSv5+/urbNmyuueeexQXF3fFuj///FM1atRQ/fr1lZSUJEk6ffq0hg0bZn2datSooYkTJ9pc4Tty5IgsFosmT56s2bNnq3r16vL29lazZs20detWu5/HvzljLIsWLVLdunXl4+Oj+vXr6+uvv7Z5vxw5ckQVK1aUJI0ZM8b69zN69Gibfo4dO6Z7771XZcqUUcWKFTVy5Ejl5eXZtFm4cKGaNGmigIAABQYGqkGDBnrnnXcc9voAgMQVKQBwqEceeUQvvviiVq5cqccee+yibX7//XfdddddatiwocaOHStvb28dPHhQGzdulCTVqVNHY8eO1auvvqqBAweqVatWkqRbbrnF2sfff/+tO++8Uz169FCvXr2uOMXsjTfekMVi0fPPP68TJ05o2rRpat++vbZv3269cnY1rmZs/2QYhu6++259//33GjBggGJiYrRixQo9++yzOnbsmN5++22b9j/++KO++uorDR48WAEBAZo+fbq6d++u+Ph4lS9f/pLjysjIUJs2bXTw4EENHTpU0dHRWrRokfr27avTp0/r6aefVp06dfTJJ59o+PDhqly5sp555hlJsv7ybtbq1at15513qlq1aho9erQyMjI0Y8YMtWjRQr/++uslF904dOiQ2rZtq3LlymnVqlWqUKGC0tPT1bp1ax07dkyPP/64IiMjtWnTJo0aNUrHjx/XtGnTbPpYsGCBzp49q8cff1wWi0WTJk1St27d9McffxT5qqAzxrJ06VI9+OCDatCggSZMmKCUlBQNGDBAlSpVsvZTsWJFvf/++4WmUP7zymBeXp46dOig5s2ba/LkyVq9erWmTJmi6tWra9CgQZIK/ifFQw89pHbt2mnixImSpLi4OG3cuFFPP/10kV4bALBhAACu2pw5cwxJxtatWy/ZJigoyGjUqJF1+7XXXjP++XH79ttvG5KMkydPXrKPrVu3GpKMOXPmFDrWunVrQ5Ixa9asix5r3bq1dfv77783JBmVKlUyUlNTrfu/+OILQ5LxzjvvWPdFRUUZffr0uWKflxtbnz59jKioKOv2N998Y0gyXn/9dZt29913n2GxWIyDBw9a90kyvLy8bPbt2LHDkGTMmDGj0Ln+adq0aYYk49NPP7Xuy87ONmJjY40yZcrYPPeoqCijc+fOl+3v306ePGlIMl577TXrvpiYGCMkJMT4+++/bcbr5uZm9O7d27rv/N//yZMnjbi4OCMiIsJo1qyZcerUKWubcePGGf7+/sb+/fttzvvCCy8Y7u7uRnx8vGEYhnH48GFDklG+fHmb+m+//daQZPzvf/+77PM4/35YtGjRJds4YywNGjQwKleubJw9e9a6b926dYYkm/fLxV7n8/r06WNIMsaOHWuzv1GjRkaTJk2s208//bQRGBho5ObmXva1AICiYmofADhYmTJlLrt6X9myZSVJ3377rd0LM3h7e6tfv35X3b53794KCAiwbt93330KDw/Xd999Z9f5r9Z3330nd3d3PfXUUzb7n3nmGRmGoWXLltnsb9++vapXr27dbtiwoQIDA/XHH39c8TxhYWF66KGHrPs8PT311FNPKS0tTevXr3fAs7ng+PHj2r59u/r27aty5crZjPf222+/6Ou6e/dutW7dWlWrVtXq1asVHBxsPbZo0SK1atVKwcHBSk5Otj7at2+vvLw8/fDDDzZ9Pfjggzb1568MXul1uhqOHktCQoJ27dql3r172yxf3rp1azVo0MD0+J544gmb7VatWtk877Jly+rcuXNatWqV6b4BwAyCFAA4WFpamk1o+bcHH3xQLVq00KOPPqrQ0FD16NFDX3zxhalQValSJVMLS9SsWdNm22KxqEaNGkW6P+hq/Pnnn4qIiCj0etSpU8d6/J8iIyML9REcHKyUlJQrnqdmzZpyc7P9z9qlzlNU5/u74YYbCh2rU6eOkpOTde7cOZv9Xbp0UUBAgFasWKHAwECbYwcOHNDy5ctVsWJFm0f79u0lFdz/9U//fp3OB5krvU5Xw9FjOf9a/XMVxPMutu9yfHx8Ck3F/Pf7Y/DgwapVq5buvPNOVa5cWf3797/q++wAwAzukQIAB/rrr7905syZy/6C6Ovrqx9++EHff/+9li5dquXLl+vzzz9X27ZttXLlSrm7u1/xPGbua7pal/rS4Ly8vKsakyNc6jzGvxamKI26d++uefPm6
bPPPtPjjz9ucyw/P1+33367nnvuuYvW1qpVy2bbma9TSRrLv13N+zAkJETbt2/XihUrtGzZMi1btkxz5sxR7969bRYfAYCiIkgBgAN98sknkqQOHTpctp2bm5vatWundu3aaerUqRo/frxeeuklff/992rfvv0lQ429Dhw4YLNtGIYOHjxocyN/cHCwTp8+Xaj2zz//VLVq1azbZsYWFRWl1atX6+zZszZXpfbu3Ws97ghRUVHauXOn8vPzba5KOfo8/zyfJO3bt6/Qsb1796pChQry9/e32f/WW2/Jw8PDupDGww8/bD1WvXp1paWlWa/6uJKjx3L+tTp48GChY//e56j3vZeXl7p06aIuXbooPz9fgwcP1gcffKBXXnnF9FUwALgUpvYBgIOsXbtW48aNU3R0tHr27HnJdqdOnSq07/wX22ZlZUmS9ZfwiwUbe8yfP9/mvq0vv/xSx48f15133mndV716df3000/Kzs627luyZEmhZdLNjK1Tp07Ky8vTzJkzbfa//fbbslgsNucvik6dOikxMVGff/65dV9ubq5mzJihMmXKqHXr1g45z3nh4eGKiYnRvHnzbF6H3bt3a+XKlerUqVOhGovFotmzZ+u+++5Tnz59tHjxYuuxBx54QJs3b9aKFSsK1Z0+fVq5ubkOHf/lOHosERERql+/vubPn6+0tDTr/vXr12vXrl02bf38/Kznsdfff/9ts+3m5mb9Hwbn/30BgCNwRQoA7LBs2TLt3btXubm5SkpK0tq1a7Vq1SpFRUVp8eLF8vHxuWTt2LFj9cMPP6hz586KiorSiRMn9N5776ly5cpq2bKlpIJQU7ZsWc2aNUsBAQHy9/dX8+bNFR0dbdd4y5Urp5YtW6pfv35KSkrStGnTVKNGDZsl2h999FF9+eWX6tixox544AEdOnRIn376qc3iD2bH1qVLF91222166aWXdOTIEd14441auXKlvv32Ww0bNqxQ3/YaOHCgPvjgA/Xt21fbtm1T1apV9eWXX2rjxo2aNm3aZe9Zs9dbb72lO++8U7GxsRowYIB1+fOgoKBC3310npubmz799FPde++9euCBB/Tdd9+pbdu2evbZZ7V48WLddddd6tu3r5o0aaJz585p165d+vLLL3XkyBFVqFDBYWP/73//a71a9099+vRxyljGjx+ve+65Ry1atFC/fv2UkpKimTNnqn79+jbhytfXV3Xr1tXnn3+uWrVqqVy5cqpfv77q169/1ed69NFHderUKbVt21aVK1fWn3/+qRkzZigmJsZ6zxwAOIRL1wwEgFLm/PLn5x9eXl5GWFiYcfvttxvvvPOOzTLb5/17+fM1a9YY99xzjxEREWF4eXkZERERxkMPPVRouelvv/3WqFu3ruHh4WGz3Hjr1q2NevXqXXR8l1r+/D//+Y8xatQoIyQkxPD19TU6d+5s/Pnnn4Xqp0yZYlSqVMnw9vY2WrRoYfzyyy+F+rzc2P69/LlhGMbZs2eN4cOHGxEREYanp6dRs2ZN46233jLy8/Nt2kkyhgwZUmhMl1qW/d+SkpKMfv36GRUqVDC8vLyMBg0aXHSJdkctf24YhrF69WqjRYsWhq+vrxEYGGh06dLF2LNnj02bfy5/fl56errRunVro0yZMsZPP/1kGEbB6zRq1CijRo0ahpeXl1GhQgXjlltuMSZPnmxkZ2cbhnFhyfG33nqr0BgvNr5/O/9+uNRjw4YNThvLwoULjdq1axve3t5G/fr1jcWLFxvdu3c3ateubdNu06ZNRpMmTQwvLy+bfvr06WP4+/sXOte//319+eWXxh133GGEhIQYXl5eRmRkpPH4448bx48fv+xrAwBmWQzjGriDFwAAlDoxMTGqWLEiS5UDKJW4RwoAADhVTk5OoXur1q1bpx07dqhNmzauGRQAFBFXpAAAgFMdOXJE7du3V69evRQREaG9e/dq1qxZCgoK0u7du1W+fHlXDxEATGOxCQAA4FTBwcFq0qSJ/u///k8nT56Uv7+/OnfurDfffJMQBaDU4ooUAAAAAJjEPVIAAAAAYBJBCgAAAABM4h4pSfn5+UpISFBAQIAsFourhwMAAADARQzD0NmzZxURESE3t0tfdyJISUpISFCVKlVcPQwAAAAAJcTRo0dVuXLlSx4nSEkKCAiQVPBiBQYGung0AADgmlW7tnT8uBQeLu3da758Zm0dP3tc4QHh2jvUfD2AK0tNTVWVKlWsGeFSCFKSdTpfYGAgQQoAADjP+WlCbm6SHb9zuPm4STkFf/I7C+BcV7rlh8UmAAAAAMAkghQAAAAAmESQAgAAAACTuEcKAACguGzdKuXlSe7u9pU/tlV5Rp7cLfbVA3AcghQAAEBxCQ8vWnlA0eoBOA5T+wAAAADAJIIUAAAAAJjE1D4AAIDiMnu2lJYmlSkjDRxovnzbbKVlp6mMVxkNbGK+HoDjWAzDMFw9CFdLTU1VUFCQzpw5w5fbAQAA56lcWTp2TKpUSfrrL/PlUyvr2NljqhRQSX+NMF8P4MquNhswtQ8AAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHDdqFVLCgqSQkPtKy9fS0E+QQr1t68egONYDMMwXD0IV7vaby8GAAAoTvHx8UpOTnZK3xUqVFBkZKRT+gZKs6vNBlyRAgAAKIHi4+NVu04dZaSnO6V/Xz8/7Y2LI0wBdiJIAQAAlEDJycnKSE/X4MmzFVG9lkP7Tji0X++NHKjk5GSCFGAnghQAAEAJFlG9lqLrxbh6GAD+hSAFAABQXHr2lJKTpQoVpM8+M13+7sbHdDbrbwV4l9eQFh86YYAArhZBCgAAoLisXy8dOyZVqmRXedyJjUrJSFCwb4SDBwbALL5HCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQXB57TDpzRgoKsqv8thq9lZGdKl+vQAcPDIBZBCkAAIDi8tprRSrv3uAFBw0EQFExtQ8AAABF17evZLFITzxR+NiQIQXH+vYt7lFdnfNj/+ejY8cLx48ckQYMkKKjJV9fqXr1glCcnX11/RuGdOedBf1+843tsaeekpo0kby9pZgYxzwfFAuuSAEAAMAxqlSRFi6U3n67IHBIUmamtGCBFBnp2rFdSceO0pw5F7a9vS/8vHevlJ8vffCBVKOGtHt3wTTNc+ekyZOv3Pe0aQUh6lL695e2bJF27rR7+Ch+XJECAACAYzRuXBCmvvrqwr6vvioIUY0a2bbNz5cmTLhwlefGG6Uvv7xwPC/P9irQDTdI77xj20ffvtK99xaEmfBwqXz5gqtfOTnmx+7tLYWFXXgEB184dj5k3XGHVK2adPfd0siRts/zUrZvl6ZMkT7++OLHp08vGHO1aubHDJciSAEAABSXypULrkxUrmxX+dCv66rngrIa+nVdBw/Mgfr3t72y8/HHUr9+hdtNmCDNny/NmiX9
/rs0fLjUq5e0fn3B8fz8gtdp0SJpzx7p1VelF1+UvvjCtp/vv5cOHSr4c948ae7cgsd5o0dLVateedzr1kkhIQWBbdAg6e+/L9/+zBmpXLnLt0lPlx5+WHr33YJwhmsKU/sAAADgOL16SaNGSX/+WbC9cWPBdL916y60ycqSxo+XVq+WYmML9lWrJv34Y8H0udatJU9PacyYCzXR0dLmzQVB6oEHLuwPDpZmzpTc3aXataXOnaU1awqm3klShQoF9zRdTseOUrduBec4dKggsN15Z8H53N0Ltz94UJox48rT+oYPl265Rbrnnsu3Q6lEkAIAAIDjVKxYEGbmzi1YZKFz54Iw808HDxZcrbn9dtv92dm2UwDffbfgilZ8vJSRUXD83wsy1KtnG3bCw6Vduy5sDx1a8LicHj0u/NyggdSwYUH4WrdOatfOtu2xYwXB6/77L4S1i1m8WFq7Vvrtt8ufG6UWQQoAAACO1b//hfDy7ruFj6elFfy5dKlUqZLtsfOLPCxcWHAf0pQpBVetAgKkt94qWJThnzw9bbctloJpgUVRrVpB+Dt40DZIJSRIt91WcJVp9uzL97F2bcHVrbJlbfd37y61amV7hQ6lEkEKAAAAjtWxY8HVI4tF6tCh8PG6dQsCU3x8wTS+i9m4sSCwDB58Yd+hQ84Z77/99VfBPVLh4Rf2HTtWEKKaNCm4B8ztCksNvPCC9OijtvsaNChY0bBLF8ePGcWOIAUAAADHcneX4uIu/PxvAQEFV5uGDy+4etSyZcHiDRs3SoGBUp8+Us2aBYtRrFhRcO/SJ59IW7cW/GzGzJnS118X3Dd1MWlpBfdide9esCDEoUPSc88VLHN+PgQeOya1aSNFRRXcF3Xy5IX684tIHDtWcPVq/nzpppsurP73b5GRts/h4MGCMSQmFkxf3L69YH/dupKXl7nnimJFkAIAAIDjBQZe/vi4cQX3U02YIP3xR8EUuMaNCxZ6kKTHHy+4v+jBBwuubD30UMHVqWXLzI0jOfnyV7Lc3Qu+v2nePOn0aSkiomCZ83HjLkwzXLWqIPAcPFh4xUXDKPgzJ0fat6/g3i8zHn30wkqF0oV7xA4fvrrVBuEyFsM4/7d//UpNTVVQUJDOnDmjwCv9owcAALBX5coFVy4qVSqYPnYZv/76q5o0aaLXv16n6HoxkgqWP0/JSFCwb4Rmdt1j9zAO/75dL3dto23btqlx48Z29wNci642G/A9UgAAAABgEkEKAAAAAEziHikAAIDi8umnBV9Ge/7eG5MG3/KBcvOy5eHOIgSAqxGkAAAAikubNkUqrxvayjHj+P/izq+s5wQVKlRQZGSk0/oHXI0gBQAAcJ05fTJJFotFvXr1cto5fP38tDcujjCFaxZBCgAA4DqTnnpGhmGo37jpql6/ocP7Tzi0X++NHKjk5GSCFK5ZBCkAAIDism7dhXuk7Jjmtydpg/UeKUdM8wuPrmFdWh2AOQQpAACA4tKr11V/j9TFvLfpcYd8jxSAomP5cwAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATOJ7pAAAAOwUHx+v5OTkq25fPydHXpKyc3K0+9dfL9s2Li6uiKMD4EwEKQAAADvEx8erdp06ykhPv+qao5IqSzpx4oSaNGlyVTXZWdn2DRCAUxGkAAAA7JCcnKyM9HQNnjxbEdVrXVXNvH/8/PoV2u5Yv0qLpr2h3Nxc676ZXfeYHygApyBIAQAAFEFE9VqKrhfj8H4TDu13eJ8AHIfFJgAAAADAJIIUAAAAAJjE1D4AAIBiUn/Gm/I6m6rsgEDtfvIF0/X/3fWmMrJT5esVqO4NzNcDcByCFAAAQDGp8cV8+SUlKD00wq4g9f3B+UrJSFCwbwRBCnAxpvYBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1wapPLy8vTKK68oOjpavr6+ql69usaNGyfDMKxtDMPQq6++qvDwcPn6+qp9+/Y6cOCATT+nTp1Sz549FRgYqLJly2rAgAFKS0sr7qcDAAAA4Drh0iA1ceJEvf/++5o5c6bi4uI0ceJETZo0STNmzLC2mTRpkqZPn65Zs2Zpy5Yt8vf3V4cOHZSZmWlt07NnT/3+++9atWqVlixZoh9++EEDBw50xVMCAAAAcB1w6Rfybtq0Sffcc486d+4sSapatar+85//6Oeff5ZUcDVq2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjh+Li4rR8+XJt3bpVTZs2lSTNmDFDnTp10uTJkxUREeGaJwcAAPAvJ25qIe+Uv5UVXN6u+johLXQ2628FeNtXD8BxXBqkbrnlFs2ePVv79+9XrVq1tGPHDv3444+aOnWqJOnw4cNKTExU+/btrTVBQUFq3ry5Nm/erB49emjz5s0qW7asNURJUvv27eXm5qYtW7aoa9euhc6blZWlrKws63ZqaqoTnyUAAECBTVM+LFL9kBZFqwfgOC4NUi+88IJSU1NVu3Ztubu7Ky8vT2+88YZ69uwpSUpMTJQkhYaG2tSFhoZajyUmJiokJMTmuIeHh8qVK2dt828TJkzQmDFjHP10AAAAAFwnXHqP1BdffKHPPvtMCxYs0K+//qp58+Zp8uTJmjdvnlPPO2rUKJ05c8b6OHr0qFPPBwClwboj62QZY9HpzNOSpLnb56rsm2VdOiYAAEoqlwapZ599Vi+88IJ69OihBg0a6JFHHtHw4cM1YcIESVJYWJgkKSkpyaYuKSnJeiwsLEwnTpywOZ6bm6tTp05Z2/ybt7e3AgMDbR4AUJL1/aavLGMsemLJE4WODVk6RJYxFvX9pq9Dz/lgvQe1/8n9Du3zalWdVlWWMRabx5s/vmk9vi95n26bd5tCJ4fK53UfVXunml5e+7Jy8nIu22/8mXh1XtBZfm/4KeStED278lnl5ufatFl3ZJ0af9BY3q97q8b0Gpq7fa4zniIAoJRz6dS+9PR0ubnZZjl3d3fl5+dLkqKjoxUWFqY1a9YoJiZGUsH9TFu2bNGgQYMkSbGxsTp9+rS2bdumJk2aSJLWrl2r/Px8NW/evPieDAA4WZXAKlq4e6He7vC2fD19JUmZuZlasHuBIoMiHX4+X09f63lcYWybsXqsyWPW7QCvAOvPnu6e6t2wtxqHN1ZZn7LakbRDj/3vMeUb+RrfbvxF+8vLz1PnBZ0VViZMmwZs0vGzx9X7m97ydPe01hxOOazOCzrriSZP6LNun2nN4TV6dPGjCi8Trg41Ojj3CeO60LZ3F/kkn1RmhYpaO/9/puvfWNNFZzJPKsinol5qZ74egOO4NEh16dJFb7zxhiIjI1WvXj399ttvmjp1qvr37y9JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XklSnTh117NhRjz32mGbNmqWcnBwNHTpUPXr0YMU+ANeUxuGNdSjlkL6K+0o9GxbcS/pV3FeKDIpUdNlom7b5Rr4m/jhRs3+drcS0RNUqX0u
v3PqK7qt7n7XNdwe+07Dlw3Q09ahurnyz+tzYx6aPudvnatjyYTr9wmlJ0qFThzRi5Qj99NdPOpd9TnUq1tGEdhPUvtqFBYGqTquqgU0G6uCpg1q0Z5GCfYL18q0va2AT819JEeAdoLAyF59ZUC24mqoFV7NuR5WN0roj67QhfsMl+1t5aKX2nNyj1Y+sVmiZUMWExWjcbeP0/OrnNbrNaHm5e2nWL7MUXTZaUzpMkSTVqVhHP8b/qLd/epsgBYcIPHxIfkkJSj9r30JXx1MPKSUjQenZLJQFuJpLp/bNmDFD9913nwYPHqw6depo5MiRevzxxzVu3Dhrm+eee05PPvmkBg4cqGbNmiktLU3Lly+Xj4+Ptc1nn32m2rVrq127durUqZNatmyp2bNnu+IpAYBT9Y/prznb51i3P/7tY/WL6Veo3YQNEzR/53zN6jxLvw/+XcNvHq5eX/XS+iPrJUlHzxxVt8+7qUutLtr++HY92uhRvbD6hcueOy07TZ1qdNKa3mv02+O/qWP1juryny6KPxNv027K5ilqGtFUvz3+mwY3G6xBSwdpX/I+6/E2c9tc1TTEN398U+UnlVejDxrprY1vFZqC908HTx3U8oPL1Tqq9SXbbP5rsxqENFBomQsLGHWo3kGpWan6/cTv1jb/DIbn22z+a/MVxwsAuL649IpUQECApk2bpmnTpl2yjcVi0dixYzV27NhLtilXrpwWLFjghBECQMnSq2EvjVozSn+e/lOStPHoRi28b6HWHVlnbZOVm6XxP47X6kdWK7ZKrKSCKzg/xv+oD7Z9oNZVW+v9X95X9XLVrVdebqhwg3ad2KWJGyde8tw3ht2oG8NutG6PaztOX+/9Wov3LdbQm4Za93eq2UmDmw2WJD3f4nm9/dPb+v7I97qhwg2SpMigSIWXCb/s83yq+VNqHN5Y5XzLadPRTRq1ZpSOpx3X1A5Tbdrd8tEt+vX4r8rKy9LAxgM19rZL/7ciMS3RJkRJsm4npiVeaONfuE1qVqoycjJcOtURAFCyuDRIAQDMqehfUZ1rddbc7XNlyFDnmp1Vwa+CTZuDpw4qPSddt39yu83+7LxsNQpvJEmKS45T80q295HGVo697LnTstM0et1oLT2wVMfPHldufq4ycjMKXZFqGNLQ+rPFYlFYmTCdOHdhUaD5Xedf8XmOiB1xob/QhvJy99LjSx7XhHYT5O3hbT32+X2f62z2We1I3KFnVz2ryZsm67kWz12xfwAAioogBQClTP+Y/hq6rOAK0Lud3i10PC07TZK09OGlqhRYyeaYt7t3ofZXa+TKkVr1xypNvn2yapSrIV9PX933xX3Kzsu2aefp7mmzbZFF+Ua+3eeVpOaVmis3P1dHTh+xXtmSpCpBVSRJdSvWVZ6Rp4H/G6hnYp+Ru5t7oT7CyoTp52M/2+xLSkuyHjv/Z9K5pEJtAr0DuRoFALBBkAKAUqZjjY7KzsuWRRZ1qF54AYS6FevK291b8Wfi1brqxe8ZqlOhjhbvW2yz76e/frrseTce3ai+N/ZV1zpdJRUEtiOnj9j3JEzanrhdbhY3hfiHXLJNvpGvnPwc5Rv5clfhIBVbOVZvbHhDJ86dsPaz6o9VCvQOVN2Kda1tvjv4nU3dqj9WXfFqHQDg+kOQAoBSxt3NXXFD4qw//1uAd4BG3jJSw1cMV76Rr5aRLXUm64w2xm9UoHeg+sT00RNNn9CUzVP07Mpn9WjjR7Xt+DbN3TH3suetWa6mvtr7lbrc0EUWWfTK96/YdaWp99e9VSmgkia0n3DR45uPbtaWY1t0W9XbFOAdoM1HN2v4iuHq1bCXgn2DJUmf7fxMnu6eahDSQN4e3vol4ReNWjNKD9Z70HpF7Ou4rzVqzSjtHbpXknRH9TtUt2JdPfL1I5rUfpIS0xL18tqXNaTZEOt0wSeaPqGZW2fquVXPqX+j/lp7eK2++P0LLX14qennCQC4thGkAKAUCvS+/BeJj7ttnCr6VdSEHyfoj5Q/VNanrBqHN9aLrV6UVLDgw38f+K+GrxiuGT/P0E2VbtL4tuPVf3H/S/Y5tcNU9f+2v2756BZV8Kug51s8r9Qs80swx5+Jl5vl0ovGent4a+HuhRq9brSy8rIUXTZaw28ebnPflIebhyZunKj9f++XYRiKKhuloc2GanjscGubM1lntO/vC6sFuru5a8lDSzRo6SDFfhQrfy9/9bmxj80CFdHB0Vr68FINXzFc72x5R5UDK+v/7v4/lj4HABRiMQzDcPUgXC01NVVBQUE6c+aMAgMv/8sJAACAJP36669q0qSJXv96naLrxVxVzb2t6hZ8j1RohL7ZsOeybTcu/kLvjRyoFz/9TvVuukWSNPTrukrJSFCwb4Rmdr18vdm+Henw79v1ctc22rZtmxo3buzw/gFnutpswBUpAACAYrJryHPyTE9Tjl8Zu+q7NXhOmTlp8vG0rx6A4xCkAAAAismhHn2LVN+2RtHqATjOpSepAwAAAAAuiiAFAAAAACYxtQ8AAKCY+JxIlCU/T4abuzJDwkzXp2QkKt/Ik5vFXcG+5usBOA5BCgAAoJh07N72qlftu5hXlrd1yKp9AIqOqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGCSh6sHAAAAcL1YM+9bueXlKt/dvl/BXmz3rfLzc+Xmxq9wgKvxrxAAAKCYnK1Ws0j1EYFFqwfgOEztAwAAAACTCFIAAAAAYBJT+wAAAIpJ1P8WySMjQ7m+vvqzy/2m6zceWaTs3Ax5efiqRVXz9QAchyAFAABQTBpNek1+SQlKD42wK0j957fXlJKRoGDfCIIU4GJM7QMAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYxBfyAgAAFJOMiiE2f5pV1jfE5k8ArkOQAgAAKCYrvlpXpPrXOxatHoDjEKQAAMA1LT4+XsnJyQ7vNy4uzuF9Aig9CFIAAOCaFR8fr9p16igjPd1p58jOynZa3wBKLoIUAAC4ZiUnJysjPV2DJ89WRPVaDu17x/pVWjTtDeXm5jq0XwClA0EKAABc8yKq11J0vRiH9plwaL/pmmavDJP3mRRlBQVr67hppus/+nmY0rJSVMY7WANuMl8PwHEIUgAAAMWk0rqV8ktKUHpohLbaUf/bsZVKyUhQsG+Ew8cGwBy+RwoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEl/ICwAAUEyO3NVdXmdOKzuorF31t1TtrnPZp+XvZV89AMchSAEAABST7c+PK1L9w42KVg/AcZjaBwAAAAAmEaQAAAAAwCSCFAAAAACYxD1SAAAAxaRzh2byO5Go9JAwLV2x1XT9yCXNlJKeqGC/ME2+y3w9AMfhihQAAEAx8Uw/J89zZ+WZfs6u+sycc8rMPavMHPvqAT
gOQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgkoerBwAAAHC9+HnsVLlnZirPx8eu+v43TVVOXqY83e2rB+A4BCkAAIBiknBbxyLVN65UtHoAjsPUPgAAAAAwiSAFAAAAACYxtQ8AAKCYBO/eLvecbOV5eimlfozp+sOntis3L1se7l6KLme+HoDjEKQAAACKSetBD8svKUHpoRH6ZsMe0/VT1j+slIwEBftGaGZX8/UAHIcgBQAAAKeIi4tzSr8VKlRQZGSkU/oGrhZBCgAAAA51+mSSLBaLevXq5ZT+ff38tDcujjAFlyJIAQAAwKHSU8/IMAz1Gzdd1es3dGjfCYf2672RA5WcnEyQgksRpAAAAOAU4dE1FF0vxtXDAJyC5c8BAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAExi1T4AAIBismTZFkmGJItd9W/dtUWGDFnsrAfgOAQpAACAYpJbJqBI9b6eRasH4DhM7QMAAAAAkwhSAAAAAGCSy4PUsWPH1KtXL5UvX16+vr5q0KCBfvnlF+txwzD06quvKjw8XL6+vmrfvr0OHDhg08epU6fUs2dPBQYGqmzZshowYIDS0tKK+6kAAABcVu2PZ6rB9Amq/fFMu+q/i5up/+6coO/i7KsH4DguDVIpKSlq0aKFPD09tWzZMu3Zs0dTpkxRcHCwtc2kSZM0ffp0zZo1S1u2bJG/v786dOigzMxMa5uePXvq999/16pVq7RkyRL98MMPGjhwoCueEgAAwCXVnvOeGsycqNpz3rOr/ru97+mr3RP13V776gE4jksXm5g4caKqVKmiOXPmWPdFR0dbfzYMQ9OmTdPLL7+se+65R5I0f/58hYaG6ptvvlGPHj0UFxen5cuXa+vWrWratKkkacaMGerUqZMmT56siIiI4n1SAAAAAK55Lr0itXjxYjVt2lT333+/QkJC1KhRI3344YfW44cPH1ZiYqLat29v3RcUFKTmzZtr8+bNkqTNmzerbNmy1hAlSe3bt5ebm5u2bNly0fNmZWUpNTXV5gEAAAAAV8ulQeqPP/7Q+++/r5o1a2rFihUaNGiQnnrqKc2bN0+SlJiYKEkKDQ21qQsNDbUeS0xMVEhIiM1xDw8PlStXztrm3yZMmKCgoCDro0qVKo5+agAAAACuYS4NUvn5+WrcuLHGjx+vRo0aaeDAgXrsscc0a9Ysp5531KhROnPmjPVx9OhRp54PAAAAwLXFpUEqPDxcdevWtdlXp04dxcfHS5LCwsIkSUlJSTZtkpKSrMfCwsJ04sQJm+O5ubk6deqUtc2/eXt7KzAw0OYBAAAAAFfLpUGqRYsW2rdvn82+/fv3KyoqSlLBwhNhYWFas2aN9Xhqaqq2bNmi2NhYSVJsbKxOnz6tbdu2WdusXbtW+fn5at68eTE8CwAAAADXG5eu2jd8+HDdcsstGj9+vB544AH9/PPPmj17tmbPni1JslgsGjZsmF5//XXVrFlT0dHReuWVVxQREaF7771XUsEVrI4dO1qnBObk5Gjo0KHq0aMHK/YBAAAAcAqXBqlmzZrp66+/1qhRozR27FhFR0dr2rRp6tmzp7XNc889p3PnzmngwIE6ffq0WrZsqeXLl8vHx8fa5rPPPtPQoUPVrl07ubm5qXv37po+fbornhIAAACA64BLg5Qk3XXXXbrrrrsuedxisWjs2LEaO3bsJduUK1dOCxYscMbwAAAAHOZUvYZKD6+kzHLl7aqPLtdQ5TMrKdDHvnoAjuPyIAUAAHC9+GHWwiLVP9O6aPUAHMeli00AAAAAQGlEkAIAAAAAkwhSAAAAAGAS90gBAAAUk1uf6CGfU38rs1x5u+6XmrK+h1Iz/1agT3nulwJcjCAFAABQTMr9vlN+SQlKD7Xvuy4Pn9qplIwEBfvyXZmAqzG1DwAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASX8gLAABQTPb2GyzPtLPKKRNgV32n2oOVkXNWvp721QNwHLuC1B9//KFq1ao5eiwAAADXtL39hxapvlOdotUDcBy7pvbVqFFDt912mz799FNlZmY6ekwAAAAAUKLZFaR+/fVXNWzYUCNGjFBYWJgef/xx/fzzz44eGwAAAACUSHYFqZiYGL3zzjtKSEjQxx9/rOPHj6tly5aqX7++pk6dqpMnTzp6nAAAAKWeR9pZeaSlyiPtrF31GTlnlZ6Tqowc++oBOE6RVu3z8PBQt27dtGjRIk2cOFEHDx7UyJEjVaVKFfXu3VvHjx931DgBAABKvbvubK4HGkfqrjub21X/7JLmemxRpJ5dYl89AMcpUpD65ZdfNHjwYIWHh2vq1KkaOXKkDh06pFWrVikhIUH33HOPo8YJAAAAACWGXav2TZ06VXPmzNG+ffvUqVMnzZ8/X506dZKbW0Eui46O1ty5c1W1alVHjhUAAAAASgS7gtT777+v/v37q2/fvgoPD79om5CQEH300UdFGhwAAAAAlER2BakDBw5csY2Xl5f69OljT/cAAAAAUKLZdY/UnDlztGjRokL7Fy1apHnz5hV5UAAAAABQktkVpCZMmKAKFSoU2h8SEqLx48cXeVAAAAAAUJLZFaTi4+MVHR1daH9UVJTi4+OLPCgAAAAAKMnsClIhISHauXNnof07duxQ+fLlizwoAAAAACjJ7Fps4qGHHtJTTz2lgIAA3XrrrZKk9evX6+mnn1aPHj0cOkAAAOB68fHxSk5OdkrfFSpUUGRkpFP6BgBnsStIjRs3TkeOHFG7du3k4VHQRX5+vnr37s09UgAAXGPi4+NVu04dZaSnO6V/Xz8/7Y2Luy7C1Pr3F8g9J1t5nl521T/TeoFy87Ll4W5fPQDHsStIeXl56fPPP9e4ceO0Y8cO+fr6qkGDBoqKinL0+AAAgIslJycrIz1dgyfPVkT1Wg7tO+HQfr03cqCSk5OviyCVUj+mSPXR5YpWD8Bx7ApS59WqVUu1ajn2AxUAAJRMEdVrKbpejKuHAQAlgl1BKi8vT3PnztWaNWt04sQJ5efn2xxfu3atQwYHAAAAACWRXUHq6aef1ty5c9W5c2fVr19fFovF0eMCAAC45kR8v1zumZnK8/FRwm0dTdf/emy5cvIy5enuo8aVzNcDcBy7gtTChQv1xRdfqFOnTo4eDwAAwDXrpldHyC8pQemhEfpmg/kg9PHPI5SSkaBg3wg17kqQAlzJru+R8vLyUo0aNRw9FgAAAAAoFewKUs8884zeeecdGYbh6PEAAAAAQIln19S+H3/8Ud9//72WLVumevXqydPT0+b4V1995ZDBAQAAAEBJZFeQKlu2rLp27erosQAAAABAqWBXkJozZ46jxwEAAAAApYZd90hJUm5urlavXq0PPvhAZ8+elSQlJCQoLS3NYYMDAAAAgJLIritSf/75pzp27Kj4+HhlZWXp9ttvV0BAgCZOnKisrCzNm
jXL0eMEAAAAgBLDritSTz/9tJo2baqUlBT5+vpa93ft2lVr1qxx2OAAAAAAoCSy64rUhg0btGnTJnl5ednsr1q1qo4dO+aQgQEAAFxrcvz8leMfoBw/f7vqfTz95ZMTIB9P++oBOI5dQSo/P195eXmF9v/1118KCAgo8qAAAACuRUtXbC1S/eS7ilYPwHHsmtp3xx13aNq0adZti8WitLQ0vfbaa+rUqZOjxgYAAAAAJZJdV6SmTJmiDh06qG7dusrMzNTDDz+sAwcOqEKFCvrPf/7j6DECAAAAQIliV5CqXLmyduzYoYULF2rnzp1KS0vTgAED1LNnT5vFJwAAAADgWmRXkJIkDw8P9erVy5FjAQAAuKbFTHxFXmdOKzuorLY/P850/YLfXtG57NPy9yqrhxuZrwfgOHYFqfnz51/2eO/eve0aDAAAwLWs6pL/yi8pQemhEXYFqU1H/quUjAQF+0YQpAAXsytIPf300zbbOTk5Sk9Pl5eXl/z8/AhSAAAAAK5pdq3al5KSYvNIS0vTvn371LJlSxabAAAAAHDNsytIXUzNmjX15ptvFrpaBQAAAADXGocFKalgAYqEhARHdgkAAAAAJY5d90gtXrzYZtswDB0/flwzZ85UixYtHDIwAAAAACip7ApS9957r822xWJRxYoV1bZtW02ZMsUR4wIAAACAEsuuIJWfn+/ocQAAAABAqeHQe6QAAAAA4Hpg1xWpESNGXHXbqVOn2nMKAACAa86xNnfI+0yKsoKC7apvVOkOpWWlqIy3ffUAHMeuIPXbb7/pt99+U05Ojm644QZJ0v79++Xu7q7GjRtb21ksFseMEgAA4Bqwddy0ItUPuKlo9QAcx64g1aVLFwUEBGjevHkKDi74PyIpKSnq16+fWrVqpWeeecahgwQAAACAksSue6SmTJmiCRMmWEOUJAUHB+v1119n1T4AAAAA1zy7glRqaqpOnjxZaP/Jkyd19uzZIg8KAAAAAEoyu6b2de3aVf369dOUKVN00003SZK2bNmiZ599Vt26dXPoAAEAAK4VHbq1ke/JE8qoGKIVX60zXf/y8jY6nXFCZX1D9HpH8/UAHMeuIDVr1iyNHDlSDz/8sHJycgo68vDQgAED9NZbbzl0gAAAANcK35Mn5JeUYHf96YwTSsmwvx6A49gVpPz8/PTee+/prbfe0qFDhyRJ1atXl7+/v0MHBwAAAAAlUZG+kPf48eM6fvy4atasKX9/fxmG4ahxAQAAAECJZVeQ+vvvv9WuXTvVqlVLnTp10vHjxyVJAwYMYOlzAAAAANc8u4LU8OHD5enpqfj4ePn5+Vn3P/jgg1q+fLnDBgcAAAAAJZFd90itXLlSK1asUOXKlW3216xZU3/++adDBgYAAAAAJZVdV6TOnTtncyXqvFOnTsnb27vIgwIAAACAksyuINWqVSvNnz/fum2xWJSfn69Jkybptttuc9jgAAAAAKAksmtq36RJk9SuXTv98ssvys7O1nPPPafff/9dp06d0saNGx09RgAAAAAoUewKUvXr19f+/fs1c+ZMBQQEKC0tTd26ddOQIUMUHh7u6DECAABcE357bow8MjKU6+trV/1DjcYoOzdDXh721QNwHNNBKicnRx07dtSsWbP00ksvOWNMAAAA16Q/u9xfpPoWVYtWD8BxTN8j5enpqZ07dzpjLAAAAABQKti12ESvXr300UcfOXosAAAAAFAq2HWPVG5urj7++GOtXr1aTZo0kb+/v83xqVOnOmRwAAAA15KAPw7ILS9X+e4eOlutpun6hNQDys/PlZubhyICzdcDcBxTQeqPP/5Q1apVtXv3bjVu3FiStH//fps2FovFcaMDAAC4hrTrc4/8khKUHhqhbzbsMV0/fs09SslIULBvhGZ2NV8PwHFMBamaNWvq+PHj+v777yVJDz74oKZPn67Q0FCnDA4AAAAASiJT90gZhmGzvWzZMp07d86hAwIAAACAks6uxSbO+3ewAgAAAIDrgakgZbFYCt0DxT1RAAAAAK43pu6RMgxDffv2lbe3tyQpMzNTTzzxRKFV+7766ivHjRAAAAAAShhTV6T69OmjkJAQBQUFKSgoSL169VJERIR1+/zDHm+++aYsFouGDRtm3ZeZmakhQ4aofPnyKlOmjLp3766kpCSbuvj4eHXu3Fl+fn4KCQnRs88+q9zcXLvGAAAAAABXw9QVqTlz5jhlEFu3btUHH3yghg0b2uwfPny4li5dqkWLFikoKEhDhw5Vt27dtHHjRklSXl6eOnfurLCwMG3atEnHjx9X79695enpqfHjxztlrAAAAABQpMUmHCEtLU09e/bUhx9+qODgYOv+M2fO6KOPPtLUqVPVtm1bNWnSRHPmzNGmTZv0008/SZJWrlypPXv26NNPP1VMTIzuvPNOjRs3Tu+++66ys7Nd9ZQAAAAAXONcHqSGDBmizp07q3379jb7t23bppycHJv9tWvXVmRkpDZv3ixJ2rx5sxo0aGDzPVYdOnRQamqqfv/990ueMysrS6mpqTYPAAAAALhapqb2OdrChQv166+/auvWrYWOJSYmysvLS2XLlrXZHxoaqsTERGubf38Z8Pnt820uZsKECRozZkwRRw8AAGDO8v+ulSU/T4abu1314zquVb6RJzeLffUAHMdlQero0aN6+umntWrVKvn4+BTruUeNGqURI0ZYt1NTU1WlSpViHQMAALj+ZIaEFak+2Ldo9QAcx2VT+7Zt26YTJ06ocePG8vDwkIeHh9avX6/p06fLw8NDoaGhys7O1unTp23qkpKSFBZW8CESFhZWaBW/89vn21yMt7e3AgMDbR4AAAAAcLVcFqTatWunXbt2afv27dZH06ZN1bNnT+vPnp6eWrNmjbVm3759io+PV2xsrCQpNjZWu3bt0okTJ6xtVq1apcDAQNWtW7fYnxMAAACA64PLpvYFBASofv36Nvv8/f1Vvnx56/4BAwZoxIgRKleunAIDA/Xkk08qNjZWN998syTpjjvuUN26dfXII49o0qRJSkxM1Msvv6whQ4ZYvzQYAACgpKi+cK4809OU41dGh3r0NV2/9uBcZeakycezjNrWMF8PwHFcutjElbz99ttyc3NT9+7dlZWVpQ4dOui9996zHnd3d9eSJUs0aNAgxcbGyt/fX3369NHYsWNdOGoAAICLa/DuJPklJSg9NMKuIPXVrklKyUhQsG/EdR+k4uLinNJvhQoVFBkZ6ZS+cW0pUUFq3bp1Nts+Pj5699139e67716yJioqSt99952TRwYAAICS4PTJJFksFvXq1csp/fv6+WlvXBxhCldUooIUAAAAcDnpqWdkGIb6jZuu6vUbOrTvhEP79d7IgUpOTiZI4YoIUgAAACh1wqNrKLpejKuHgeuYy1btAwAAAIDSiiAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1i1DwAAoJikRldXdkCgMitUtKs+PLC6/LwCFeRjXz0AxyFIAQAAFJO18/9XpPqX2hWtHoDjMLUPAAAAAEwiSAEAAACASQQpAAAA
ADCJe6QAAACKyS3PPCbvlL+VFVxem6Z8aLr+3Y2P6WzW3wrwLq8hLczXA3AcghQAAEAxCfl5o/ySEpQeGmFXfdyJjUrJSFCwr331AByHqX0AAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk/hCXgAArgHx8fFKTk52St9xcXFO6fd6dPCB3vI6m6rsgEC76m+r0VsZ2any9bKvHoDjEKQAACjl4uPjVbtOHWWkpzv1PNlZ2U7t/3qw+8kXilTfvUHR6gE4DkEKAIBSLjk5WRnp6Ro8ebYiqtdyeP871q/SomlvKDc31+F9A0BpRZACAOAaEVG9lqLrxTi834RD+x3eJwCUdiw2AQAAAAAmcUUKAACgmNzbqq78khKUHhqhbzbsMV0/9Ou6SslIULBvhGZ2NV8PwHG4IgUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEkEKQAAAAAwiSAFAAAAACZ5uHoAAAAA14tNkz+Qe3a28ry87KoffMsHys3Lloe7ffUAHIcgBQAAUExONG9VpPq6oUWrB+A4TO0DAAAAAJMIUgAAAABgElP7AAAAiknIlg3We6Tsmea3J2mD9R4ppvkBrkWQAgAAKCa3jHxcfkkJSg+N0Dcb9piuf2/T40rJSFCwb4RmdjVfD8BxmNoHAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmebh6AAAAANeLbzbsKVL9zK5FqwfgOFyRAgAAAACTCFIAAAAAYBJBCgAAAABM4h4pAACAYlJ/xpvyOpuq7IBA7X7yBdP1/931pjKyU+XrFajuDczXA3AcghQAAEAxqfHFfPklJSg9NMKuIPX9wflKyUhQsG8EQQpwMab2AQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEkEKAAAAAEziC3kBAACKyYmbWsg75W9lBZe3q75OSAudzfpbAd721QNwHIIUAABAMdk05cMi1Q9pUbR6AI7D1D4AAAAAMIkgBQAAAAAmEaQAAAAAwCTukQIAACgmbXt3kU/ySWVWqKi18/9nuv6NNV10JvOkgnwq6qV25usBOA5BCgAAoJgEHj4kv6QEpZ9Ntav+eOohpWQkKD3bvnoAjsPUPgAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAkwhSAAAAAGASQQoAAAAATCJIAQAAAIBJfCEvAABAMdk15Dl5pqcpx6+MXfXdGjynzJw0+XjaVw/AcQhSAAAAxeRQj75Fqm9bo2j1AByHqX0AAAAAYBJBCgAAAABMcmmQmjBhgpo1a6aAgACFhITo3nvv1b59+2zaZGZmasiQISpfvrzKlCmj7t27KykpyaZNfHy8OnfuLD8/P4WEhOjZZ59Vbm5ucT4VAACAK/I5kSjfxGPyOZFoV31KRqL+Tj+mlAz76gE4jkvvkVq/fr2GDBmiZs2aKTc3Vy+++KLuuOMO7dmzR/7+/pKk4cOHa+nSpVq0aJGCgoI0dOhQdevWTRs3bpQk5eXlqXPnzgoLC9OmTZt0/Phx9e7dW56enho/frwrnx4AADbi4+OVnJzs8H7j4uIc3ieco2P3tvJLSlB6aIS+2bDHdP0ry9sqJSNBwb4RmtnVfD0Ax3FpkFq+fLnN9ty5cxUSEqJt27bp1ltv1ZkzZ/TRRx9pwYIFatu2rSRpzpw5qlOnjn766SfdfPPNWrlypfbs2aPVq1crNDRUMTExGjdunJ5//nmNHj1aXl5ehc6blZWlrKws63ZqaqpznygA4LoXHx+v2nXqKCM93WnnyM7KdlrfAABbJWrVvjNnzkiSypUrJ0natm2bcnJy1L59e2ub2rVrKzIyUps3b9bNN9+szZs3q0GDBgoNDbW26dChgwYNGqTff/9djRo1KnSeCRMmaMyYMU5+NgAAXJCcnKyM9HQNnjxbEdVrObTvHetXadG0N5jWDgDFqMQEqfz8fA0bNkwtWrRQ/fr1JUmJiYny8vJS2bJlbdqGhoYqMTHR2uafIer88fPHLmbUqFEaMWKEdTs1NVVVqlRx1FMBAOCSIqrXUnS9GIf2mXBov0P7AwBcWYkJUkOGDNHu3bv1448/Ov1c3t7e8vb2dvp5AAAAAFybSsTy50OHDtWSJUv0/fffq3Llytb9YWFhys7O1unTp23aJyUlKSwszNrm36v4nd8+3wYAAAAAHMmlQcowDA0dOlRff/211q5dq+joaJvjTZo0kaenp9asWWPdt2/fPsXHxys2NlaSFBsbq127dunEiRPWNqtWrVJgYKDq1q1bPE8EAAAAwHXFpVP7hgwZogULFujbb79VQECA9Z6moKAg+fr6KigoSAMGDNCIESNUrlw5BQYG6sknn1RsbKxuvvlmSdIdd9yhunXr6pFHHtGkSZOUmJiol19+WUOGDGH6HgAAAACncGmQev/99yVJbdq0sdk/Z84c9e3bV5L09ttvy83NTd27d1dWVpY6dOig9957z9rW3d1dS5Ys0aBBgxQbGyt/f3/16dNHY8eOLa6nAQAAAOA649IgZRjGFdv4+Pjo3Xff1bvvvnvJNlFRUfruu+8cOTQAAAAAuKQSs2ofAADAtW7NvG/llperfHf7fgV7sd23ys/PlZsbv8IBrsa/QgAAgGJytlrNItVHBBatHoDjlIjlzwEAAACgNCFIAQAAAIBJTO0DAAAoJlH/WySPjAzl+vrqzy73m67feGSRsnMz5OXhqxZVzdcDcByCFAAAQDFpNOk1+SUlKD00wq4g9Z/fXlNKRoKCfSMIUk4UFxfntL4rVKigyMhIp/WP4kOQAgAAACSdPpkki8WiXr16Oe0cvn5+2hsXR5i6BhCkAAAAAEnpqWdkGIb6jZuu6vUbOrz/hEP79d7IgUpOTiZIXQMIUgAAAMA/hEfXUHS9GFcPAyUcq/YBAAAAgEkEKQAAAAAwiSAFAAAAACYRpAAAAADAJIIUAAAAAJjEqn0AAADFJKNiiM2fZpX1DbH5E4DrEKQAAACKyYqv1hWp/vWORasH4DhM7QMAAAAAkwhSAAAAAGASQQoAAAAATOIeKQAAgGLS7JVh8j6ToqygYG0dN810/Uc/D1NaVorKeAdrwE3m6wE4DkEKAACgmFRat1J+SQlKD43QVjvqfzu2UikZCQr2jXD42ACYw9Q+AAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAAAAgEl8IS8AAEAxOXJXd3mdOa3soLJ21d9StbvOZZ+Wv5d99QAchyAFAABQTLY/P65I9Q83Klo9AMdhah8AAAAAmESQAgAAAACTCFIAAAAAYBL3SAEAABSTzh2aye9EotJDwrR0xVbT9SOXNFNKeqKC/cI0+S7z9QAchyAFAMD
/Fx8fr+TkZKf0HRcX55R+Ubp4pp+T57mz8kwPsKs+M+ecMnPPKjPHvnoAjkOQAgBABSGqdp06ykhPd+p5srOyndo/AKB4EKQAAJCUnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BAMWPIAUAwD9EVK+l6HoxDu834dB+h/cJAHAdVu0DAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJVfsAAACKyc9jp8o9M1N5Pj521fe/aapy8jLl6W5fPQDHIUgBAAAUk4TbOhapvnGlotUDcBym9gEAAACASQQpAAAAADCJqX0AAADFJHj3drnnZCvP00sp9WNM1x8+tV25ednycPdSdDnz9QAchyAFAABQTFoPelh+SQlKD43QNxv2mK6fsv5hpWQkKNg3QjO7mq8H4DhM7QMAAAAAk7giBQAAABSjuLg4p/RboUIFRUZGOqVvFEaQAgAAAIrB6ZNJslgs6tWrl1P69/Xz0964OMJUMSFIAQAAAMUgPfWMDMNQv3HTVb1+Q4f2nXBov94bOVDJyckEqWJCkAIAAACKUXh0DUXXi3H1MFBELDYBAAAAACYRpAAAAADAJIIUAAAAAJhEkAIAAAAAk1hsAgAAoJgsWbZFkiHJYlf9W3dtkSFDFjvrATgOQQoAAKCY5JYJKFK9r2fR6gE4DkEKAFCqxMfHKzk52eH9xsXFObxPAMC1iyAFACg14uPjVbtOHWWkpzvtHNlZ2U7rGwBw7SBIAQBKjeTkZGWkp2vw5NmKqF7LoX3vWL9Ki6a9odzcXIf2C/xT7Y9nyjPtrHLKBGhv/6Gm67+Lm6mMnLPy9QxQpzrm6wE4DkEKAFDqRFSvpeh6MQ7tM+HQfof2B1xM7TnvyS8pQemhEfYFqb3vKSUjQcG+EQQpwMVY/hwAAAAATCJIAQAAAIBJBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFAAAAACYRJACAAAAAJMIUgAAAABgEl/ICwAAUExO1Wuo9PBKyixX3q766HINVT6zkgJ97KsH4DgEKQAAgGLyw6yFRap/pnXR6gE4DkEKAOBQ8fHxSk5OdkrfcXFxTukXAACzCFIAAIeJj49X7Tp1lJGe7tTzZGdlO7V/ACitnPk/nCpUqKDIyEin9V/aEKQAAA6TnJysjPR0DZ48WxHVazm8/x3rV2nRtDeUm5vr8L4BoDQ7fTJJFotFvXr1cto5fP38tDcujjD1/xGkAAAOF1G9lqLrxTi834RD+x3eJ1Ccbn2ih3xO/a3McuXtul9qyvoeSs38W4E+5blfCjbSU8/IMAz1Gzdd1es3dHj/CYf2672RA5WcnEyQ+v8IUgAAAMWk3O875ZeUoPTQCLvqD5/aqZSMBAX72lePa194dA2n/I8sFMb3SAEAAACASVyRAoDrkLNW1mNVPQDA9YIgBeCynLmUtVS6VwBy5mvjzNelOFbWY1U9AMC1jiAF4JKK4xfu0roCkLNfG2e+Ls5cWY9V9QAA14trJki9++67euutt5SYmKgbb7xRM2bM0E033eTqYQGlmrOXsi7NKwA587U5/7ps2LBBderUcWjf0oXpd85YWY9V9QAA14trIkh9/vnnGjFihGbNmqXmzZtr2rRp6tChg/bt26eQkBBXDw8o9Zy1lPV5zrqvpjimDTrjtSmO7wKRmH4HADCvNP8329GuiSA1depUPfbYY+rXr58kadasWVq6dKk+/vhjvfDCCy4enXnOvO8iKytL3t7eTulbcv59HaX1Xh1nj91Zf6/OXjjA2YGhtE4bdPZ3gTD9DgBgFv/NLqzUB6ns7Gxt27ZNo0aNsu5zc3NT+/bttXnz5ovWZGVlKSsry7p95swZSVJqaqpzB3sVjh49qqbNmikzI8NJZ7BIMpzUt+Tt46NP5s9XaGioQ/tNSkpS7969lZmZ6dB+/6k0j93Zf68Htv+izPRzDu/34G9bZRiGbn/kcYVHVXNo338n/qWl/zdDK1as0A033ODQviVp3759kqQjv+9w+GtzfnpcdmaGU173nP//+fdn3C65WRz7vjk/dmf07ez+Gbtr+k88fFCStG3bNqWlpTm0b6l4/q2aeV1uzc5UrqTU7EzFbd1ouv/ctEwpR8rNu3K9o8deUvovrX07u39nj704/pt95MgRlS1b1qF92+N8JjCMy7+OFuNKLUq4hIQEVapUSZs2bVJsbKx1/3PPPaf169dry5YthWpGjx6tMWPGFOcwAQAAAJQiR48eVeXKlS95vNRfkbLHqFGjNGLECOt2fn6+Tp06pfLly8tisbhwZNee1NRUValSRUePHlVgYKCrhwMX4r0AifcBCvA+wHm8FyCVvPeBYRg6e/asIiIiLtuu1AepChUqyN3dXUlJSTb7k5KSFBYWdtEab2/vQveTlITLiNeywMDAEvEPA67HewES7wMU4H2A83gvQCpZ74OgoKArtnErhnE4lZeXl5o0aaI1a9ZY9+Xn52vNmjU2U/0AAAAAwFFK/RUpSRoxYoT69Omjpk2b6qabbtK0adN07tw56yp+AAAAAOBI10SQevDBB3Xy5Em9+uqrSkxMVExMjJYvX+7w1ddgnre3t1577TWnLrmO0oH3AiTeByjA+wDn8V6AVHrfB6V+1T4AAAAAKG6l/h4pAAAAAChuBCkAAAAAMIkgBQAAAAAmEaQAAAAAwCSCFIrs/fffV8OGDa1fohYbG6tly5ZZj7dp00YWi8Xm8cQTT7hwxCgOb775piwWi4YNG2bdl5mZqSFDhqh8+fIqU6aMunfvXujLtHHtudh7gc+F68Po0aML/T3Xrl3bepzPhOvDld4HfB5cX44dO6ZevXqpfPny8vX1VYMGDfTLL79YjxuGoVdffVXh4eHy9fVV+/btdeDAAReO+NKuieXP4VqVK1fWm2++qZo1a8owDM2bN0/33HOPfvvtN9WrV0+S9Nhjj2ns2LHWGj8/P1cNF8Vg69at+uCDD9SwYUOb/cOHD9fSpUu1aNEiBQUFaejQoerWrZs2btzoopHC2S71XpD4XLhe1KtXT6tXr7Zue3hc+NWDz4Trx+XeBxKfB9eLlJQUtWjRQrfddpuWLVumihUr6sCBAwoODra2mTRpkqZPn6558+YpOjpar7zyijp06KA9e/bIx8fHhaMvjCCFIuvSpYvN9htvvKH3339fP/30kzVI+fn5KSwszBXDQzFLS0tTz5499eGHH+r111+37j9z5ow++ugjLViwQG3btpUkzZkzR3Xq1NFPP/2km2++2VVDhpNc6r1wHp8L1wcPD4+L/j3zmXB9udT74Dw+D64PEydOVJUqVTRnzhzrvujoaOvPhmFo2rRpevnll3XPPfdIkubPn6/Q0FB988036tGjR7GP+XKY2geHysvL08KFC3Xu3DnFxsZa93/22WeqUKGC6tevr1GjRik9Pd2Fo4QzDRkyRJ07d1b79u1t9m/btk05OTk2+2vXrq3IyEht3ry5uIeJYnCp98J5fC5cHw4cOKCIiAhVq1ZNPXv2VHx8vCQ+E643l3ofnMfnwfVh8eLFat
q0qe6//36FhISoUaNG+vDDD63HDx8+rMTERJvPhaCgIDVv3rxEfi5wRQoOsWvXLsXGxiozM1NlypTR119/rbp160qSHn74YUVFRSkiIkI7d+7U888/r3379umrr75y8ajhaAsXLtSvv/6qrVu3FjqWmJgoLy8vlS1b1mZ/aGioEhMTi2mEKC6Xey9IfC5cL5o3b665c+fqhhtu0PHjxzVmzBi1atVKu3fv5jPhOnK590FAQACfB9eRP/74Q++//75GjBihF198UVu3btVTTz0lLy8v9enTx/pvPzQ01KaupH4uEKTgEDfccIO2b9+uM2fO6Msvv1SfPn20fv161a1bVwMHDrS2a9CggcLDw9WuXTsdOnRI1atXd+Go4UhHjx7V008/rVWrVpW4OcwoXlfzXuBz4fpw5513Wn9u2LChmjdvrqioKH3xxRfy9fV14chQnC73PhgwYACfB9eR/Px8NW3aVOPHj5ckNWrUSLt379asWbPUp08fF4/OPKb2wSG8vLxUo0YNNWnSRBMmTNCNN96od95556JtmzdvLkk6ePBgcQ4RTrZt2zadOHFCjRs3loeHhzw8PLR+/XpNnz5dHh4eCg0NVXZ2tk6fPm1Tl5SUxLz4a8yV3gt5eXmFavhcuD6ULVtWtWrV0sGDBxUWFsZnwnXqn++Di+Hz4NoVHh5unbF0Xp06daxTPc//2//36p0l9XOBIAWnyM/PV1ZW1kWPbd++XVLBPyZcO9q1a6ddu3Zp+/bt1kfTpk3Vs2dP68+enp5as2aNtWbfvn2Kj4+3uZ8Opd+V3gvu7u6FavhcuD6kpaXp0KFDCg8PV5MmTfhMuE79831wMXweXLtatGihffv22ezbv3+/oqKiJBUsPBEWFmbzuZCamqotW7aUyM8FpvahyEaNGqU777xTkZGROnv2rBYsWKB169ZpxYoVOnTokBYsWKBOnTqpfPny2rlzp4YPH65bb731ossho/QKCAhQ/fr1bfb5+/urfPny1v0DBgzQiBEjVK5cOQUGBurJJ59UbGwsq3NdY670XuBz4foxcuRIdenSRVFRUUpISNBrr70md3d3PfTQQwoKCuIz4TpxufcBnwfXl+HDh+uWW27R+PHj9cADD+jnn3/W7NmzNXv2bEmyfufg66+/rpo1a1qXP4+IiNC9997r2sFfBEEKRXbixAn17t1bx48fV1BQkBo2bKgVK1bo9ttv19GjR7V69WpNmzZN586dU5UqVdS9e3e9/PLLrh42XODtt9+Wm5ubunfvrqysLHXo0EHvvfeeq4eFYubl5cXnwnXir7/+0kMPPaS///5bFStWVMuWLfXTTz+pYsWKkvhMuF5c7n2QmZnJ58F1pFmzZvr66681atQojR07VtHR0Zo2bZp69uxpbfPcc8/p3LlzGjhwoE6fPq2WLVtq+fLlJfL+a4thGIarBwEAAAAApQn3SAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmESQAgAAAACTCFIAAAAAYBJBCgAAAABMIkgBAEqUI0eOyGKxaPv27a4eSonRpk0bDRs2zNXDAAD8A0EKAOBwFovlso/Ro0e7eoiFlISwsm7dOlksFp0+fdql4wAAXJmHqwcAALj2HD9+3Prz559/rldffVX79u2z7itTpowrhgUAgMNwRQoA4HBhYWHWR1BQkCwWi3U7JCREU6dOVeXKleXt7a2YmBgtX778kn3l5eWpf//+ql27tuLj4yVJ3377rRo3biwfHx9Vq1ZNY8aMUW5urrXGYrHo//7v/9S1a1f5+fmpZs2aWrx4cZGe048//qhWrVrJ19dXVapU0VNPPaVz585Zj1etWlXjx49X//79FRAQoMjISM2ePdumj02bNikmJkY+Pj5q2rSpvvnmG+s0xiNHjui2226TJAUHB8tisahv377W2vz8fD333HMqV66cwsLCSuRVPQC4nhCkAADF6p133tGUKVM0efJk7dy5Ux06dNDdd9+tAwcOFGqblZWl+++/X9u3b9eGDRsUGRmpDRs2qHfv3nr66ae1Z88effDBB5o7d67eeOMNm9oxY8bogQce0M6dO9WpUyf17NlTp06dsmvMhw4dUseOHdW9e3ft3LlTn3/+uX788UcNHTrUpt2UKVPUtGlT/fbbbxo8eLAGDRpkvRKXmpqqLl26qEGDBvr11181btw4Pf/889baKlWq6L///a8kad++fTp+/Ljeeecd6/F58+bJ399fW7Zs0aRJkzR27FitWrXKrucDAHAAAwAAJ5ozZ44RFBRk3Y6IiDDeeOMNmzbNmjUzBg8ebBiGYRw+fNiQZGzYsMFo166d0bJlS+P06dPWtu3atTPGjx9vU//JJ58Y4eHh1m1Jxssvv2zdTktLMyQZy5Ytu+Q4W7dubTz99NMXPTZgwABj4MCBNvs2bNhguLm5GRkZGYZhGEZUVJTRq1cv6/H8/HwjJCTEeP/99w3DMIz333/fKF++vLW9YRjGhx9+aEgyfvvtN8MwDOP77783JBkpKSmFxtayZUubfc2aNTOef/75Sz4fAIBzcY8UAKDYpKamKiEhQS1atLDZ36JFC+3YscNm30MPPaTKlStr7dq18vX1te7fsWOHNm7caHMFKi8vT5mZmUpPT5efn58kqWHDhtbj/v7+CgwM1IkTJ+wa944dO7Rz50599tln1n2GYSg/P1+HDx9WnTp1Cp3z/HTG8+fct2+fGjZsKB8fH2ubm2666arH8M++JSk8PNzu5wMAKDqCFACgROrUqZM+/fRTbd68WW3btrXuT0tL05gxY9StW7dCNf8MKZ6enjbHLBaL8vPz7RpLWlqaHn/8cT311FOFjkVGRjrlnP/mzL4BAOYRpAAAxSYwMFARERHauHGjWrdubd2/cePGQldnBg0apPr16+vuu+/W0qVLre0bN26sffv2qUaNGsU27saNG2vPnj1FOucNN9ygTz/9VFlZWfL29pYkbd261aaNl5eXpIIrbACAko0gBQAoVs8++6xee+01Va9eXTExMZozZ462b99uM23uvCeffFJ5eXm66667tGzZMrVs2VKvvvqq7rrrLkVGRuq+++6Tm5ubduzYod27d+v1118v0thOnjxZ6IuAw8PD9fzzz+vmm2/W0KFD9eijj8rf31979uzRqlWrNHPmzKvq++GHH9ZLL72kgQMH6oUXXlB8fLwmT54sqeDqkiRFRUXJYrFoyZIl6tSpk3x9fVkqHgBKKFbtAwAUq6eeekojRozQM888owYNGmj58uVavHixatasedH2w4YN05gxY9SpUydt2rRJHTp00JIlS7Ry5Uo1a9ZMN998s95++21FRUUVeWwLFixQo0aNbB4ffvihGjZsqPXr12v//v1q1aqVGjVqpFdffVURERFX3XdgYKD+97//afv27YqJidFLL72kV199VdKFKYmVKlXSmDFj9MILLyg0NLTQqoAAgJLDYhiG4epBAABwPfrss8/Ur18/nTlzxmZBDQBAycfUPgAAisn8+fNVrVo1VapUSTt27NDzzz+vBx54gBAFA
KUQQQoAgGKSmJioV199VYmJiQoPD9f9999f6IuEAQClA1P7AAAAAMAkFpsAAAAAAJMIUgAAAABgEkEKAAAAAEwiSAEAAACASQQpAAAAADCJIAUAAAAAJhGkAAAAAMAkghQAAAAAmPT/ALFDWFiHUDxIAAAAAElFTkSuQmCC",
|
458 |
+
"text/plain": [
|
459 |
+
"<Figure size 1000x600 with 1 Axes>"
|
460 |
+
]
|
461 |
+
},
|
462 |
+
"metadata": {},
|
463 |
+
"output_type": "display_data"
|
464 |
+
}
|
465 |
+
],
|
466 |
+
"source": [
|
467 |
+
"#统计图\n",
|
468 |
+
"import matplotlib.pyplot as plt\n",
|
469 |
+
"import seaborn as sns\n",
|
470 |
+
"import numpy as np\n",
|
471 |
+
"\n",
|
472 |
+
"# 假设这是您的 token_len_list\n",
|
473 |
+
"\n",
|
474 |
+
"# 设置画布大小\n",
|
475 |
+
"plt.figure(figsize=(10, 6))\n",
|
476 |
+
"\n",
|
477 |
+
"# 使用 seaborn 生成直方图\n",
|
478 |
+
"sns.histplot(token_len_list, bins=30, kde=False, color=\"skyblue\", edgecolor=\"black\")\n",
|
479 |
+
"\n",
|
480 |
+
"# 添加标题和标签\n",
|
481 |
+
"plt.title(\"Distribution of Token Lengths\")\n",
|
482 |
+
"plt.xlabel(\"Token Length\")\n",
|
483 |
+
"plt.ylabel(\"Frequency\")\n",
|
484 |
+
"\n",
|
485 |
+
"# 显示平均值线\n",
|
486 |
+
"mean_value = np.mean(token_len_list)\n",
|
487 |
+
"plt.axvline(mean_value, color='red', linestyle='dashed', linewidth=2)\n",
|
488 |
+
"plt.text(mean_value + 2, plt.ylim()[1]*0.9, f'Mean: {mean_value:.2f}', color='red')\n",
|
489 |
+
"\n",
|
490 |
+
"# 显示中位数线\n",
|
491 |
+
"median_value = np.median(token_len_list)\n",
|
492 |
+
"plt.axvline(median_value, color='green', linestyle='dashed', linewidth=2)\n",
|
493 |
+
"plt.text(median_value - 10, plt.ylim()[1]*0.8, f'Median: {median_value:.2f}', color='green')\n",
|
494 |
+
"\n",
|
495 |
+
"# 显示图形\n",
|
496 |
+
"plt.show()"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
{
|
500 |
+
"cell_type": "code",
|
501 |
+
"execution_count": 15,
|
502 |
+
"id": "9a65c8bc-6bf0-4605-8c38-409bbb14f2c7",
|
503 |
+
"metadata": {},
|
504 |
+
"outputs": [
|
505 |
+
{
|
506 |
+
"data": {
|
507 |
+
"application/vnd.jupyter.widget-view+json": {
|
508 |
+
"model_id": "a4e97d92506f419581c3711f26d7f683",
|
509 |
+
"version_major": 2,
|
510 |
+
"version_minor": 0
|
511 |
+
},
|
512 |
+
"text/plain": [
|
513 |
+
"Map: 0%| | 0/53275 [00:00<?, ? examples/s]"
|
514 |
+
]
|
515 |
+
},
|
516 |
+
"metadata": {},
|
517 |
+
"output_type": "display_data"
|
518 |
+
},
|
519 |
+
{
|
520 |
+
"data": {
|
521 |
+
"application/vnd.jupyter.widget-view+json": {
|
522 |
+
"model_id": "9004c03cb9b24411b6bd9d33662402fb",
|
523 |
+
"version_major": 2,
|
524 |
+
"version_minor": 0
|
525 |
+
},
|
526 |
+
"text/plain": [
|
527 |
+
"Map: 0%| | 0/5920 [00:00<?, ? examples/s]"
|
528 |
+
]
|
529 |
+
},
|
530 |
+
"metadata": {},
|
531 |
+
"output_type": "display_data"
|
532 |
+
}
|
533 |
+
],
|
534 |
+
"source": [
|
535 |
+
"# 2. tokenize\n",
|
536 |
+
"def tokenize_function(examples):\n",
|
537 |
+
" examples['label'] = [int(item) for item in examples['label']]\n",
|
538 |
+
" return tokenizer(examples['sequence'], truncation=True, padding='max_length', max_length=128)\n",
|
539 |
+
"\n",
|
540 |
+
"# 3. 对数据集应用分词函数\n",
|
541 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
|
542 |
+
"\n",
|
543 |
+
"# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
|
544 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
545 |
+
]
|
546 |
+
},
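Before training, the short sketch below (not part of the original notebook) peeks at one tokenized record and shows how `data_collator` turns a few records into padded tensors, which is roughly what `Trainer` does for each batch.

```python
# Minimal sketch: inspect one tokenized record and batch a few of them.
# Assumes `tokenized_datasets` and `data_collator` from the cell above.
record = tokenized_datasets["train"][0]
print(list(record.keys()))       # includes 'sequence', 'label', 'input_ids', 'attention_mask'
print(len(record["input_ids"]))  # 128, because padding='max_length' was used above

# Keep only the keys the model consumes (Trainer drops the raw 'sequence' column itself);
# the collator pads them to a common length and returns PyTorch tensors.
features = [
    {k: tokenized_datasets["train"][i][k] for k in ("input_ids", "attention_mask", "label")}
    for i in range(4)
]
batch = data_collator(features)
print({k: v.shape for k, v in batch.items()})  # 'label' is renamed to 'labels'
```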
|
547 |
+
{
|
548 |
+
"cell_type": "code",
|
549 |
+
"execution_count": 22,
|
550 |
+
"id": "4b0faa94-d0c4-4ce8-9976-dcefcb766f0b",
|
551 |
+
"metadata": {},
|
552 |
+
"outputs": [
|
553 |
+
{
|
554 |
+
"name": "stderr",
|
555 |
+
"output_type": "stream",
|
556 |
+
"text": [
|
557 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
558 |
+
" warnings.warn(\n",
|
559 |
+
"/tmp/ipykernel_2549/341301010.py:29: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
|
560 |
+
" trainer = Trainer(\n"
|
561 |
+
]
|
562 |
+
}
|
563 |
+
],
|
564 |
+
"source": [
|
565 |
+
"from transformers import TrainingArguments, Trainer\n",
|
566 |
+
"import numpy as np\n",
|
567 |
+
"import torch.nn as nn\n",
|
568 |
+
"\n",
|
569 |
+
"\n",
|
570 |
+
"\n",
|
571 |
+
"def compute_metrics(eval_pred):\n",
|
572 |
+
" predictions, labels = eval_pred\n",
|
573 |
+
" predictions = np.argmax(predictions, axis=1)\n",
|
574 |
+
" return {'accuracy': (predictions==labels).sum() / len(labels)}\n",
|
575 |
+
"\n",
|
576 |
+
"# change training hyperparameters to archive better quality\n",
|
577 |
+
"training_args = TrainingArguments(\n",
|
578 |
+
" output_dir=\"ds_job_category_v0\",\n",
|
579 |
+
" learning_rate=1e-5,\n",
|
580 |
+
" lr_scheduler_type=\"constant_with_warmup\",\n",
|
581 |
+
" warmup_ratio=0.1,\n",
|
582 |
+
" optim='adamw_torch',\n",
|
583 |
+
" weight_decay=0.0,\n",
|
584 |
+
" per_device_train_batch_size=20,\n",
|
585 |
+
" per_device_eval_batch_size=20,\n",
|
586 |
+
" num_train_epochs=10,\n",
|
587 |
+
" evaluation_strategy=\"epoch\",\n",
|
588 |
+
" save_strategy=\"epoch\",\n",
|
589 |
+
" logging_strategy=\"epoch\",\n",
|
590 |
+
" load_best_model_at_end=True\n",
|
591 |
+
")\n",
|
592 |
+
"\n",
|
593 |
+
"trainer = Trainer(\n",
|
594 |
+
" model=model,\n",
|
595 |
+
" args=training_args,\n",
|
596 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
597 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
598 |
+
" tokenizer=tokenizer,\n",
|
599 |
+
" data_collator=data_collator,\n",
|
600 |
+
" compute_metrics=compute_metrics,\n",
|
601 |
+
")"
|
602 |
+
]
|
603 |
+
},
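The two FutureWarnings above come from arguments that still work here but are scheduled for removal: `evaluation_strategy` and `Trainer(tokenizer=...)`. A sketch of the equivalent setup without them, assuming a transformers version that already accepts `eval_strategy` and `processing_class` (which the warnings themselves indicate):

```python
# Same configuration as above, minus the deprecated arguments flagged in the warnings.
# Assumes a transformers version that supports eval_strategy / processing_class.
training_args = TrainingArguments(
    output_dir="ds_job_category_v0",
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.1,
    optim="adamw_torch",
    weight_decay=0.0,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=10,
    eval_strategy="epoch",       # replaces evaluation_strategy
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    processing_class=tokenizer,  # replaces tokenizer=
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
```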
|
604 |
+
{
|
605 |
+
"cell_type": "code",
|
606 |
+
"execution_count": 17,
|
607 |
+
"id": "9b067740-9c0f-4df8-a5af-b68ec9d1f3e0",
|
608 |
+
"metadata": {},
|
609 |
+
"outputs": [
|
610 |
+
{
|
611 |
+
"data": {
|
612 |
+
"text/html": [
|
613 |
+
"\n",
|
614 |
+
" <div>\n",
|
615 |
+
" \n",
|
616 |
+
" <progress value='26640' max='26640' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
617 |
+
" [26640/26640 1:00:13, Epoch 10/10]\n",
|
618 |
+
" </div>\n",
|
619 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
620 |
+
" <thead>\n",
|
621 |
+
" <tr style=\"text-align: left;\">\n",
|
622 |
+
" <th>Epoch</th>\n",
|
623 |
+
" <th>Training Loss</th>\n",
|
624 |
+
" <th>Validation Loss</th>\n",
|
625 |
+
" <th>Accuracy</th>\n",
|
626 |
+
" </tr>\n",
|
627 |
+
" </thead>\n",
|
628 |
+
" <tbody>\n",
|
629 |
+
" <tr>\n",
|
630 |
+
" <td>1</td>\n",
|
631 |
+
" <td>0.324900</td>\n",
|
632 |
+
" <td>0.237557</td>\n",
|
633 |
+
" <td>0.916216</td>\n",
|
634 |
+
" </tr>\n",
|
635 |
+
" <tr>\n",
|
636 |
+
" <td>2</td>\n",
|
637 |
+
" <td>0.193100</td>\n",
|
638 |
+
" <td>0.212998</td>\n",
|
639 |
+
" <td>0.925338</td>\n",
|
640 |
+
" </tr>\n",
|
641 |
+
" <tr>\n",
|
642 |
+
" <td>3</td>\n",
|
643 |
+
" <td>0.126900</td>\n",
|
644 |
+
" <td>0.278650</td>\n",
|
645 |
+
" <td>0.923480</td>\n",
|
646 |
+
" </tr>\n",
|
647 |
+
" <tr>\n",
|
648 |
+
" <td>4</td>\n",
|
649 |
+
" <td>0.076900</td>\n",
|
650 |
+
" <td>0.362979</td>\n",
|
651 |
+
" <td>0.922804</td>\n",
|
652 |
+
" </tr>\n",
|
653 |
+
" <tr>\n",
|
654 |
+
" <td>5</td>\n",
|
655 |
+
" <td>0.047400</td>\n",
|
656 |
+
" <td>0.518552</td>\n",
|
657 |
+
" <td>0.915372</td>\n",
|
658 |
+
" </tr>\n",
|
659 |
+
" <tr>\n",
|
660 |
+
" <td>6</td>\n",
|
661 |
+
" <td>0.032000</td>\n",
|
662 |
+
" <td>0.698843</td>\n",
|
663 |
+
" <td>0.918412</td>\n",
|
664 |
+
" </tr>\n",
|
665 |
+
" <tr>\n",
|
666 |
+
" <td>7</td>\n",
|
667 |
+
" <td>0.029000</td>\n",
|
668 |
+
" <td>0.760331</td>\n",
|
669 |
+
" <td>0.915709</td>\n",
|
670 |
+
" </tr>\n",
|
671 |
+
" <tr>\n",
|
672 |
+
" <td>8</td>\n",
|
673 |
+
" <td>0.025900</td>\n",
|
674 |
+
" <td>0.769762</td>\n",
|
675 |
+
" <td>0.921959</td>\n",
|
676 |
+
" </tr>\n",
|
677 |
+
" <tr>\n",
|
678 |
+
" <td>9</td>\n",
|
679 |
+
" <td>0.021800</td>\n",
|
680 |
+
" <td>0.740165</td>\n",
|
681 |
+
" <td>0.923142</td>\n",
|
682 |
+
" </tr>\n",
|
683 |
+
" <tr>\n",
|
684 |
+
" <td>10</td>\n",
|
685 |
+
" <td>0.021300</td>\n",
|
686 |
+
" <td>0.738664</td>\n",
|
687 |
+
" <td>0.922973</td>\n",
|
688 |
+
" </tr>\n",
|
689 |
+
" </tbody>\n",
|
690 |
+
"</table><p>"
|
691 |
+
],
|
692 |
+
"text/plain": [
|
693 |
+
"<IPython.core.display.HTML object>"
|
694 |
+
]
|
695 |
+
},
|
696 |
+
"metadata": {},
|
697 |
+
"output_type": "display_data"
|
698 |
+
},
|
699 |
+
{
|
700 |
+
"data": {
|
701 |
+
"text/plain": [
|
702 |
+
"TrainOutput(global_step=26640, training_loss=0.08990609108864724, metrics={'train_runtime': 3619.5996, 'train_samples_per_second': 147.185, 'train_steps_per_second': 7.36, 'total_flos': 3.4801460969472e+16, 'train_loss': 0.08990609108864724, 'epoch': 10.0})"
|
703 |
+
]
|
704 |
+
},
|
705 |
+
"execution_count": 17,
|
706 |
+
"metadata": {},
|
707 |
+
"output_type": "execute_result"
|
708 |
+
}
|
709 |
+
],
|
710 |
+
"source": [
|
711 |
+
"trainer.train()"
|
712 |
+
]
|
713 |
+
},
|
714 |
+
{
|
715 |
+
"cell_type": "code",
|
716 |
+
"execution_count": 20,
|
717 |
+
"id": "aa26e020-2dfd-4e0e-b330-250ee3e44a44",
|
718 |
+
"metadata": {},
|
719 |
+
"outputs": [
|
720 |
+
{
|
721 |
+
"data": {
|
722 |
+
"text/html": [],
|
723 |
+
"text/plain": [
|
724 |
+
"<IPython.core.display.HTML object>"
|
725 |
+
]
|
726 |
+
},
|
727 |
+
"metadata": {},
|
728 |
+
"output_type": "display_data"
|
729 |
+
},
|
730 |
+
{
|
731 |
+
"data": {
|
732 |
+
"text/plain": [
|
733 |
+
"{'accuracy': 0.9253378378378379, 'f1': 0.927062706270627}"
|
734 |
+
]
|
735 |
+
},
|
736 |
+
"execution_count": 20,
|
737 |
+
"metadata": {},
|
738 |
+
"output_type": "execute_result"
|
739 |
+
}
|
740 |
+
],
|
741 |
+
"source": [
|
742 |
+
"#模型测试\n",
|
743 |
+
"import evaluate\n",
|
744 |
+
"predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
|
745 |
+
"preds = np.argmax(predictions.predictions, axis=-1)\n",
|
746 |
+
"metric = evaluate.load(\"glue\", \"mrpc\")\n",
|
747 |
+
"ret = metric.compute(predictions=preds, references=predictions.label_ids)\n",
|
748 |
+
"ret"
|
749 |
+
]
|
750 |
+
},
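Beyond these aggregate metrics, the fine-tuned classifier can be queried on a single sequence. A minimal inference sketch (not part of the original notebook; the fragment is a shortened copy of the example record shown earlier, and label 1 is assumed to be the promoter class):

```python
# Minimal inference sketch: score one DNA sequence with the fine-tuned classifier.
# Assumes `model` and `tokenizer` from the cells above; label 1 is assumed to mean "promoter".
import torch

seq = "CCTGACGCCCACCGCAAGCTGCCGGGTAAGACCGGGTCGACTTCAGCGCGG"
inputs = tokenizer(seq, truncation=True, max_length=128, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
pred = logits.argmax(dim=-1).item()
print("promoter" if pred == 1 else "non-promoter")
```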
|
751 |
+
{
|
752 |
+
"cell_type": "code",
|
753 |
+
"execution_count": 21,
|
754 |
+
"id": "5e6d99ad-66a0-4b85-9380-ae2b7ee88056",
|
755 |
+
"metadata": {},
|
756 |
+
"outputs": [
|
757 |
+
{
|
758 |
+
"data": {
|
759 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAHHCAYAAACcHAM1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABRWklEQVR4nO3deVgV1f8H8PeAclkv4AKIEoKCguJuRrgmgYpbWGaSool9LdDc0MwNtyzcUnMpS1HT1Eot0VQUd3FNlFxIEEUTsERAUPb5/eGPySt45crFO17er555Hu7MmTOfuQ/px885Z0YQRVEEERERkYwZ6DoAIiIiomdhwkJERESyx4SFiIiIZI8JCxEREckeExYiIiKSPSYsREREJHtMWIiIiEj2mLAQERGR7DFhISIiItljwkL0Erp69Sp8fHxgaWkJQRCwfft2rfZ//fp1CIKAiIgIrfb7MuvcuTM6d+6s6zCIqiwmLETPKTExEf/73//g7OwMY2NjKJVKeHl5YfHixXj48GGlXjswMBBxcXGYM2cO1q9fjzZt2lTq9V6kIUOGQBAEKJXKMr/Hq1evQhAECIKA+fPna9z/7du3ERYWhtjYWC1ES0QvSjVdB0D0Mtq5cyfeeecdKBQKDB48GE2bNkV+fj6OHj2K0NBQXLx4Ed9++22lXPvhw4eIiYnB5MmTERISUinXcHR0xMOHD1G9evVK6f9ZqlWrhgcPHmDHjh3o37+/yrENGzbA2NgYubm5z9X37du3MWPGDNSvXx8tWrQo93l79+59rusRkXYwYSHSUFJSEgYMGABHR0dER0ejTp060rHg4GAkJCRg586dlXb9f/75BwBgZWVVadcQBAHGxsaV1v+zKBQKeHl54ccffyyVsGzcuBF+fn745ZdfXkgsDx48gKmpKYyMjF7I9YiobBwSItJQeHg4srOz8f3336skKyUaNmyITz75RPpcWFiIWbNmoUGDBlAoFKhfvz4+++wz5OXlqZxXv3599OzZE0ePHsWrr74KY2NjODs7Y926dVKbsLAwODo6AgBCQ0MhCALq168P4NFQSsnPjwsLC4MgCCr7oqKi0L59e1hZWcHc3ByNGjXCZ599Jh1/2hyW6OhodOjQAWZmZrCyskKfPn1w+fLlMq+XkJCAIUOGwMrKCpaWlhg6dCgePHjw9C/2CQMHDsTvv/+OjIwMad/p06dx9epVDBw4sFT79PR0jB8/Hh4eHjA3N4dSqUT37t1x/vx5qc3BgwfRtm1bAMDQoUOloaWS++zcuTOaNm2Ks2fPomPHjjA1NZW+lyfnsAQGBsLY2LjU/fv6+sLa2hq3b98u970S0bMxYSHS0I4dO+Ds7IzXX3+9XO2DgoIwbdo0tGrVCosWLUKnTp0wd+5cDBgwoFTbhIQEvP3223jzzTexYMECWFtbY8iQIbh48SIAwN/fH4sWLQIAvPfee1i/fj2++uorjeK/ePEievbsiby8PMycORMLFixA7969cezYMbXn7du3D76+vrhz5w7CwsIwduxYHD9+HF5eXrh+/Xqp9v3798f9+/cxd+5c9O/fHxEREZgxY0a54/T394cgCNi6dau0b+PGjWjcuDFatWpVqv21a9ewfft29OzZEwsXLkRoaCji4uLQqVMnKXlwc3PDzJkzAQAffvgh1q9fj/Xr16Njx45SP3fv3kX37t3RokULfPXVV+jSpUuZ8S1evBi1a9dGYGAgioqKAADffPMN9u7di6VLl8Le3r7c90pE5SASUbllZmaKAMQ+ffqUq31sbKwIQAwKClLZP378eBGAGB0dLe1zdHQUAYiHDx+W9t25c0dUKBTiuHHjpH1JSUkiAHHevHkqfQYGBoqOjo6lYpg+fbr4+P/qixYtEgGI//zzz1PjLrnGmjVrpH0tWrQQbWxsxLt370r7zp8/LxoYGIiDBw8udb0PPvhApc+33npLrFmz5lOv+fh9mJmZiaIoim+//bbYtWtXURRFsaioSLSzsxNnzJhR5neQm5srFhUVlboPhUIhzpw5U9p3+vTpUvdWolOnTiIAceXKlWUe69Spk8q+PXv2iADE2bNni9euXRPNzc3Fvn37PvMeiUhzrLAQaSArKwsAYGFhUa72u3btAgCMHTtWZf+4ceMAoNRcF3d3d3To0EH6XLt2bTRq1AjXrl177pifVDL35ddff0VxcXG5zklJSUFsbCyGDBmCGjVqSPubNWuGN998U7rPx40YMULlc4cOHXD37l3pOyyPgQMH4uDBg0hNTUV0dDRSU1PLHA4CHs17MTB49EdaUVER7t69Kw13/fHHH+W+pkKhwNChQ8vV1sfHB//73/8wc+ZM+Pv7w9jYGN988025r0VE5ceEhUgDSqUSAHD//v1ytb9x4wYMDAzQsGFDlf12dnawsrLCjRs3VPa/8sorpfqwtrbGvXv3njPi0t599114eXkhKCgItra2GDBgALZs2aI2eSmJs1GjRqWOubm54d9//0VOTo7K/ifvxdraGgA0upcePXrAwsICmzdvxoYNG9C2bdtS32WJ4uJiLFq0CC4uLlAoFKhVqxZq166NCxcuIDMzs9zXrFu3rkYTbOfPn48aNWogNjYWS5YsgY2NTbnPJaLyY8JCpAGlUgl7e3v8+eefGp335KTXpzE0NCxzvyiKz32NkvkVJUxMTHD48GHs27cPgwYNwoULF/Duu+/izTffLNW2IipyLyUUCgX8/f2xdu1abNu27anVFQD4/PPPMXbsWHTs2BE//PAD9uzZg6ioKDRp0qTclSTg0fejiXPnzuHOnTsAgLi4OI3OJaLyY8JCpKGePXsiMTERMTExz2zr6OiI4uJiXL16VWV/WloaMjIypBU/2mBtba2yoqbEk1UcADAwMEDXrl2xcOFCXLp0CXPmzEF0dDQOHDhQZt8lccbHx5c6duXKFdSqVQtmZmYVu4GnGDhwIM6dO4f79++XOVG5xM8//4wuXbrg+++/x4ABA+Dj4wNvb+9S30l5k8fyyMnJwdChQ+Hu7o4PP/wQ4eHhOH36tNb6J6L/MGEh0tCECRNgZmaGoKAgpKWllTqemJiIxYsXA3g0pAGg1EqehQsXAgD8/Py0FleDBg2QmZmJCxcuSPtSUlKwbds2lXbp6emlzi15gNqTS61L1KlTBy1atMDatWtVEoA///wTe/fule6zMnTp0gWzZs3C119/DTs7u6e2MzQ0LFW9+emnn/D333+r7CtJrMpK7jQ1ceJEJCcnY+3atVi4cCHq16+PwMDAp36PRPT8+OA4Ig01aNAAGzduxLvvvgs3NzeVJ90eP34cP/30E4YMGQIAaN68OQIDA/Htt98iIyMDnTp1wqlTp7B27Vr07dv3qUtmn8eAAQMwceJEvPXWWxg1ahQePHiAFStWwNXVVWXS6cyZM3H48GH4+fnB0dERd+7cwfLly1GvXj20b9/+qf3PmzcP3bt3h6enJ4YNG4aHDx9i6dKlsLS0RFhYmNbu40kGBgaYMmXKM9v17NkTM2fOxNChQ/H6668jLi4OGzZsgLOzs0q7B
g0awMrKCitXroSFhQXMzMzQrl07ODk5aRRXdHQ0li9fjunTp0vLrNesWYPOnTtj6tSpCA8P16g/InoGHa9SInpp/fXXX+Lw4cPF+vXri0ZGRqKFhYXo5eUlLl26VMzNzZXaFRQUiDNmzBCdnJzE6tWriw4ODuKkSZNU2ojio2XNfn5+pa7z5HLapy1rFkVR3Lt3r9i0aVPRyMhIbNSokfjDDz+UWta8f/9+sU+fPqK9vb1oZGQk2tvbi++99574119/lbrGk0t/9+3bJ3p5eYkmJiaiUqkUe/XqJV66dEmlTcn1nlw2vWbNGhGAmJSU9NTvVBRVlzU/zdOWNY8bN06sU6eOaGJiInp5eYkxMTFlLkf+9ddfRXd3d7FatWoq99mpUyexSZMmZV7z8X6ysrJER0dHsVWrVmJBQYFKuzFjxogGBgZiTEyM2nsgIs0IoqjBDDgiIiIiHeAcFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHB8dVsuLiYty+fRsWFhZafSQ4ERG9GKIo4v79+7C3t5feCK5tubm5yM/P10pfRkZGMDY21kpfcsKEpZLdvn0bDg4Oug6DiIgq6ObNm6hXr57W+83NzYWJRU2g8IFW+rOzs0NSUpLeJS1MWCqZhYUFAMCo7WgI1RQ6joaoctzYMUnXIRBVmvv3s+Di9Ir057m25efnA4UPoHAPBAyNKtZZUT5SL61Ffn4+ExbSTMkwkFBNwYSF9JZSqdR1CESVrtKH9asZQ6hgwiIK+js1lQkLERGRHAgAKpoU6fFUSSYsREREciAYPNoq2oee0t87IyIiIr3BCgsREZEcCIIWhoT0d0yICQsREZEccEhILf29MyIiItIbrLAQERHJAYeE1GLCQkREJAtaGBLS44ET/b0zIiIi0hussBAREckBh4TUYsJCREQkB1wlpJb+3hkRERHpDVZYiIiI5IBDQmoxYSEiIpIDDgmpxYSFiIhIDlhhUUt/UzEiIiLSG6ywEBERyQGHhNRiwkJERCQHgqCFhIVDQkREREQ6wwoLERGRHBgIj7aK9qGnmLAQERHJAeewqKW/d0ZERER6gxUWIiIiOeBzWNRiwkJERCQHHBJSS3/vjIiIiPQGKyxERERywCEhtZiwEBERyQGHhNRiwkJERCQHrLCopb+pGBEREekNVliIiIjkgENCajFhISIikgMOCamlv6kYERER6Q1WWIiIiGRBC0NCelyHYMJCREQkBxwSUkt/UzEiIiLSG6ywEBERyYEgaGGVkP5WWJiwEBERyQGXNaulv3dGREREas2dOxdt27aFhYUFbGxs0LdvX8THx6u06dy5MwRBUNlGjBih0iY5ORl+fn4wNTWFjY0NQkNDUVhYqNLm4MGDaNWqFRQKBRo2bIiIiAiNYmXCQkREJAclk24rumng0KFDCA4OxokTJxAVFYWCggL4+PggJydHpd3w4cORkpIibeHh4dKxoqIi+Pn5IT8/H8ePH8fatWsRERGBadOmSW2SkpLg5+eHLl26IDY2FqNHj0ZQUBD27NlT7lg5JERERCQHOhgS2r17t8rniIgI2NjY4OzZs+jYsaO039TUFHZ2dmX2sXfvXly6dAn79u2Dra0tWrRogVmzZmHixIkICwuDkZERVq5cCScnJyxYsAAA4ObmhqNHj2LRokXw9fUtV6yssBAREcmBFissWVlZKlteXl65QsjMzAQA1KhRQ2X/hg0bUKtWLTRt2hSTJk3CgwcPpGMxMTHw8PCAra2ttM/X1xdZWVm4ePGi1Mbb21ulT19fX8TExJT762GFhYiISM84ODiofJ4+fTrCwsLUnlNcXIzRo0fDy8sLTZs2lfYPHDgQjo6OsLe3x4ULFzBx4kTEx8dj69atAIDU1FSVZAWA9Dk1NVVtm6ysLDx8+BAmJibPvCcmLERERHKgxSGhmzdvQqlUSrsVCsUzTw0ODsaff/6Jo0ePquz/8MMPpZ89PDxQp04ddO3aFYmJiWjQoEHF4tUAh4SIiIjkQItDQkqlUmV7VsISEhKCyMhIHDhwAPXq1VPbtl27dgCAhIQEAICdnR3S0tJU2pR8Lpn38rQ2SqWyXNUVgAkLERFRlSWKIkJCQrBt2zZER0fDycnpmefExsYCAOrUqQMA8PT0RFxcHO7cuSO1iYqKglKphLu7u9Rm//79Kv1ERUXB09Oz3LEyYSEiIpKBJ5918rybJoKDg/HDDz9g48aNsLCwQGpqKlJTU/Hw4UMAQGJiImbNmoWzZ8/i+vXr+O233zB48GB07NgRzZo1AwD4+PjA3d0dgwYNwvnz57Fnzx5MmTIFwcHBUmVnxIgRuHbtGiZMmIArV65g+fLl2LJlC8aMGVPuWJmwEBERyYAuEpYVK1YgMzMTnTt3Rp06daRt8+bNAAAjIyPs27cPPj4+aNy4McaNG4d+/fphx44dUh+GhoaIjIyEoaEhPD098f7772Pw4MGYOXOm1MbJyQk7d+5EVFQUmjdvjgULFuC7774r95JmgJNuiYiIqixRFNUed3BwwKFDh57Zj6OjI3bt2qW2TefOnXHu3DmN4nscExYiIiI5EP5/q2gfeooJCxERkQw8z5BOGZ1oJxgZ4hwWIiIikj1WWIiIiGSAFRb1mLAQERHJABMW9ZiwEBERyQATFvU4h4WIiIhkjxUWIiIiOeCyZrWYsBAREckAh4TU45AQERERyR4rLERERDIgCNBChUU7scgRExYiIiIZEKCFISE9zlg4JERERESyxwoLERGRDHDSrXpMWIiIiOSAy5rV4pAQERERyR4rLERERHKghSEhkUNCREREVJm0MYel4quM5IsJCxERkQwwYVGPc1iIiIhI9lhhISIikgOuElKLCQsREZEMcEhIPQ4JERERkeyxwkJERCQDrLCox4SFiIhIBpiwqMchISIiIpI9VliIiIhkgBUW9ZiwEBERyQGXNavFISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBJizqMWEhIiKSASYs6nEOCxEREckeKyxERERywFVCajFhISIikgEOCanHISEiIiKSvZeiwiIIArZt24a+ffvqOhR6Aca81wE9O7jB5ZVayM0rwKmLNxG2KgoJN++qtGvrXg9ThnVF68b1UFRcjD8TU9Fvwnrk5hdKbXzauSB0cGc0cbZFXn4hjp2/jvenbZKOd2zphMkfvAE3J1s8yM3Hpj3nMev7/SgqLn5h90u0KGIvIg+cx9UbaTBWVMerHk6YPrIPXBxtAQD3MnPwxbe7cODkFdxKu4eaVubw69QMn43wg9LcBACwMfIEQmZuKLP/+N2fo3YNixd2P/R8WGFRT+cJS2pqKubMmYOdO3fi77//ho2NDVq0aIHRo0eja9euug4Poihi+vTpWLVqFTIyMuDl5YUVK1bAxcVF16HprdebO+K7X0/hXPzfqGZggKlB3tgaPhivDf0aD3ILADxKVn7+YhAW/XgEE5fuQmFRMZo626FYFKV+enVww+JxvTHr+/04fC4J1QwN4FbfRjre1NkWW+a+jwUbDmPE
3G2oU8sCC8f0goGhgGkr977w+6aq69gfCRj2Tge0dHNEUVERZq3YgX4jlyFm82SYmSiQ8m8mUv7NxMxP+qKRkx1upqRj3BebkfJvJtZ+MQwA8JZ3K3R9zV2l3+CZPyAvv4DJyktCgBYSFj2exKLThOX69evw8vKClZUV5s2bBw8PDxQUFGDPnj0IDg7GlStXdBkeACA8PBxLlizB2rVr4eTkhKlTp8LX1xeXLl2CsbGxrsPTS+98+oPK54+/3IaEbRPRwtUexy/cAADM+bgbvtl2El/9eFRq93gFxtDAAHNDumPaN1H44fc/pP3xN/6Rfn6rS1NcvJaGeesPAQCSbqcj7Nu9WD2tP8LXHkT2w/xKuT+iJ/285GOVz8umvQ9X389w/vJNvN6qIdwb2GPdl0HScad6tTH5o14YMX0dCguLUK2aIUyMjWBibCS1+ffefRw58xeWTBn4wu6DqDLpdA7Lxx9/DEEQcOrUKfTr1w+urq5o0qQJxo4dixMnTjz1vIkTJ8LV1RWmpqZwdnbG1KlTUVBQIB0/f/48unTpAgsLCyiVSrRu3RpnzpwBANy4cQO9evWCtbU1zMzM0KRJE+zatavM64iiiK+++gpTpkxBnz590KxZM6xbtw63b9/G9u3btfpd0NMpzR4lhveyHgIAalmZoa27A/7JyMGepcMQ/3MoIhcNxWtNX5HOae5aB3VrW6JYLMahb0bg8k/j8dPc91UqLEbVqyHvseEjAHiYVwgTRXU0d7V/AXdGVLas7FwAgJWlqZo2D2FhZoxq1QzLPL5p1ymYGBuh9xstKiNEqgQlQ0IV3fSVzhKW9PR07N69G8HBwTAzMyt13MrK6qnnWlhYICIiApcuXcLixYuxatUqLFq0SDoeEBCAevXq4fTp0zh79iw+/fRTVK9eHQAQHByMvLw8HD58GHFxcfjyyy9hbm5e5nWSkpKQmpoKb29vaZ+lpSXatWuHmJiY57xz0oQgCJgb3A0n4m7g8vU7AID6dawBAJ8O7oy1O8/i7U/X4/zVFGyfHwjnujVU2wR2wfwfDmHAZxuQkf0QOxYNgZXFozH/6DMJeLWJA/q90RQGBgLq1LLAhMGdAAB2Ncv+nSCqbMXFxfhs4S9o19wZ7g3KTpzvZmRj/urdCOz7+lP7+eG3E3jbt7VK1YVkTtDSpqd0NiSUkJAAURTRuHFjjc+dMmWK9HP9+vUxfvx4bNq0CRMmTAAAJCcnIzQ0VOr78fkmycnJ6NevHzw8PAAAzs7OT71OamoqAMDW1lZlv62trXTsSXl5ecjLy5M+Z2VlaXJr9IT5n/jBzckG3UetlvYZGDz6PzIi8gw27o4FAMQl7Eanlk54v3srzPxuHwz+/18ZC344jB1HLgMAgsO34+LmcejbqQkiIs/gwJlETPtmLxaO7oWVk/yRl1+E+T8cwuvN6qO4WASRLoSG/4TL11Kw69vRZR7Pyn6Id8esRCMnO0z8sEeZbU5dSMJfSalYGTaoEiMlerF0lrCI4vP/hbB582YsWbIEiYmJyM7ORmFhIZRKpXR87NixCAoKwvr16+Ht7Y133nkHDRo0AACMGjUKH330Efbu3Qtvb2/069cPzZo1q/D9lJg7dy5mzJihtf6qsvBRPeD7mit6jF6N2//+l/il3r0PQHU+CgDEJ/+LejaWj9qkl26TX1CE6yn3pDYAsPznGCz/OQZ2NS2Qcf8hXrGzwvThb+J6yr1Kuy+ip5kwbwv2HP0TO7/5BHVtrUsdv5+Ti3c+WQELUwXWhw9H9acMB63/9Tg8XOuhhdsrZR4neeIqIfV0NiTk4uICQRA0nlgbExODgIAA9OjRA5GRkTh37hwmT56M/Pz/JkiGhYXh4sWL8PPzQ3R0NNzd3bFt2zYAQFBQEK5du4ZBgwYhLi4Obdq0wdKlS8u8lp2dHQAgLS1NZX9aWpp07EmTJk1CZmamtN28eVOj+6NHwkf1gF97N/QeF4Hk1AyVY8mpGbj9bxYaOtRS2d+wXk3cTHvU9vxfKcjNL1BpU83QAK/YWkltHpd69z5y8wvR7w0P3ErLwPmrKdq+JaKnEkURE+Ztwc6DF/Dr8pFwrFurVJus7IfoN3IZjKobYsOC/8FYUb3MvrIf5OHX/efwfu/XKjts0jLOYVFPZwlLjRo14Ovri2XLliEnJ6fU8YyMjDLPO378OBwdHTF58mS0adMGLi4uuHHjRql2rq6uGDNmDPbu3Qt/f3+sWbNGOubg4IARI0Zg69atGDduHFatWlXmtZycnGBnZ4f9+/dL+7KysnDy5El4enqWeY5CoYBSqVTZSDPzP/FDf+9mGD77Z2Q/yIeNtTlsrM1hbPRfQXDp5mP431vt0LujO5zsa+CzoW/A5ZVaWP//K4LuP8jDmh1n8OmQzujSpgEaOtTEgtE9AQDbD12U+hn5rhfcnWzQuH5tjH+/E0a/1x4Tv/6dQ0L0QoWGb8GW38/g21mBMDc1Rtq/WUj7NwsPcx/9Qywr+yH6jVqOB7n5WDJlIO5n50ptiopUnxm0LeoPFBYVo3/3trq4FaoAQdDOpq90uqx52bJl8PLywquvvoqZM2eiWbNmKCwsRFRUFFasWIHLly+XOsfFxQXJycnYtGkT2rZti507d0rVEwB4+PAhQkND8fbbb8PJyQm3bt3C6dOn0a9fPwDA6NGj0b17d7i6uuLevXs4cOAA3NzcyoxPEASMHj0as2fPhouLi7Ss2d7eng+xq0TD+rwKANj51Qcq+z/+cht+3BMLAFj5ywkYG1XD5x93g5WFCS5eS4V/6Dpcv/3fUM60lXtRWFSMlZ/6w1hRDWcv/40+4yOQ+f8rMADA+9WGGBfQAUbVq+HPxFQETP0R+04lVP5NEj1m9S+Pluf3GrFEZf/X0wIwsOdruBB/C2f/vA4AaO0/U6VN7PYwvGJfU/r8w28x6Nm5OSwtnr7CiOhlJIgVmUyiBSkpKZgzZw4iIyORkpKC2rVro3Xr1hgzZgw6d+78KMgnnnQ7YcIErF69Gnl5efDz88Nrr72GsLAwZGRkID8/H4GBgTh27BjS0tJQq1Yt+Pv7Y968eTA2NsbIkSPx+++/49atW1AqlejWrRsWLVqEmjVrlhlfyYPjvv32W2RkZKB9+/ZYvnw5XF1dy3V/WVlZsLS0hMJzIoRqCm18ZUSyk74/TNchEFWarKws2NWyQmZmZqVUzUv+nnAe+TMMFKVXzWqiOC8H15a+XWmx6pLOExZ9x4SFqgImLKTPXljCMupnGFYwYSnKy8G1JfqZsPDlh0RERCR7On+XEBEREXFZ87MwYSEiIpIBbazy0eN8hUNCREREJH+ssBAREcmAgYEgvXrkeYkVPF/OmLAQERHJAIeE1OOQEBEREckeKyxEREQywFVC6jFhISIikgEOCanHhIWIiEgGWGFRj3NYiIiISPZYYSEiIpIBVljUY4WFiIhIBkrmsFR008TcuXPRtm1bWFhYwMbGBn379kV8fLxKm9zcXAQHB6NmzZowNzdHv379kJaWptImOTkZfn5+MDU1hY2NDUJDQ1FYWKjS5uDBg2j
VqhUUCgUaNmyIiIgIjWJlwkJERFRFHTp0CMHBwThx4gSioqJQUFAAHx8f5OTkSG3GjBmDHTt24KeffsKhQ4dw+/Zt+Pv7S8eLiorg5+eH/Px8HD9+HGvXrkVERASmTZsmtUlKSoKfnx+6dOmC2NhYjB49GkFBQdizZ0+5YxVEURS1c9tUlpLXhis8J0KoptB1OESVIn1/mK5DIKo0WVlZsKtlhczMTCiVykrp39LSEh6f/gZDY7MK9VWUm4O4L3o/d6z//PMPbGxscOjQIXTs2BGZmZmoXbs2Nm7ciLfffhsAcOXKFbi5uSEmJgavvfYafv/9d/Ts2RO3b9+Gra0tAGDlypWYOHEi/vnnHxgZGWHixInYuXMn/vzzT+laAwYMQEZGBnbv3l2u2FhhISIikgFtDgllZWWpbHl5eeWKITMzEwBQo0YNAMDZs2dRUFAAb29vqU3jxo3xyiuvICYmBgAQExMDDw8PKVkBAF9fX2RlZeHixYtSm8f7KGlT0kd5MGEhIiLSMw4ODrC0tJS2uXPnPvOc4uJijB49Gl5eXmjatCkAIDU1FUZGRrCyslJpa2tri9TUVKnN48lKyfGSY+raZGVl4eHDh+W6J64SIiIikgFtrhK6efOmypCQQvHsKQnBwcH4888/cfTo0QrFUFmYsBAREcmANp90q1QqNZrDEhISgsjISBw+fBj16tWT9tvZ2SE/Px8ZGRkqVZa0tDTY2dlJbU6dOqXSX8kqosfbPLmyKC0tDUqlEiYmJuWKkUNCREREVZQoiggJCcG2bdsQHR0NJycnleOtW7dG9erVsX//fmlffHw8kpOT4enpCQDw9PREXFwc7ty5I7WJioqCUqmEu7u71ObxPkralPRRHqywEBERyYAuHhwXHByMjRs34tdff4WFhYU058TS0hImJiawtLTEsGHDMHbsWNSoUQNKpRIjR46Ep6cnXnvtNQCAj48P3N3dMWjQIISHhyM1NRVTpkxBcHCwNBQ1YsQIfP3115gwYQI++OADREdHY8uWLdi5c2e5Y2XCQkREJAO6ePnhihUrAACdO3dW2b9mzRoMGTIEALBo0SIYGBigX79+yMvLg6+vL5YvXy61NTQ0RGRkJD766CN4enrCzMwMgYGBmDlzptTGyckJO3fuxJgxY7B48WLUq1cP3333HXx9fcsdKxMWIiIiGdBFhaU8j2IzNjbGsmXLsGzZsqe2cXR0xK5du9T207lzZ5w7d06j+B7HOSxEREQke6ywEBERyYEWhoSgv+8+ZMJCREQkB3xbs3ocEiIiIiLZY4WFiIhIBnSxSuhlwoSFiIhIBjgkpB6HhIiIiEj2WGEhIiKSAQ4JqceEhYiISAY4JKQeh4SIiIhI9lhhISIikgFWWNRjwkJERCQDnMOiHhMWIiIiGWCFRT3OYSEiIiLZY4WFiIhIBjgkpB4TFiIiIhngkJB6HBIiIiIi2WOFhYiISAYEaGFISCuRyBMTFiIiIhkwEAQYVDBjqej5csYhISIiIpI9VliIiIhkgKuE1GPCQkREJANcJaQeExYiIiIZMBAebRXtQ19xDgsRERHJHissREREciBoYUhHjyssTFiIiIhkgJNu1eOQEBEREckeKyxEREQyIPz/fxXtQ18xYSEiIpIBrhJSj0NCREREJHussBAREckAHxynXrkSlt9++63cHfbu3fu5gyEiIqqquEpIvXIlLH379i1XZ4IgoKioqCLxEBEREZVSroSluLi4suMgIiKq0gwEAQYVLJFU9Hw5q9AcltzcXBgbG2srFiIioiqLQ0LqabxKqKioCLNmzULdunVhbm6Oa9euAQCmTp2K77//XusBEhERVQUlk24ruukrjROWOXPmICIiAuHh4TAyMpL2N23aFN99951WgyMiIiICniNhWbduHb799lsEBATA0NBQ2t+8eXNcuXJFq8ERERFVFSVDQhXd9JXGc1j+/vtvNGzYsNT+4uJiFBQUaCUoIiKiqoaTbtXTuMLi7u6OI0eOlNr/888/o2XLlloJioiIiOhxGldYpk2bhsDAQPz9998oLi7G1q1bER8fj3Xr1iEyMrIyYiQiItJ7wv9vFe1DX2lcYenTpw927NiBffv2wczMDNOmTcPly5exY8cOvPnmm5URIxERkd7jKiH1nus5LB06dEBUVJS2YyEiIiIq03M/OO7MmTO4fPkygEfzWlq3bq21oIiIiKoaA+HRVtE+9JXGCcutW7fw3nvv4dixY7CysgIAZGRk4PXXX8emTZtQr149bcdIRESk9/i2ZvU0nsMSFBSEgoICXL58Genp6UhPT8fly5dRXFyMoKCgyoiRiIiIqjiNKyyHDh3C8ePH0ahRI2lfo0aNsHTpUnTo0EGrwREREVUlelwgqTCNExYHB4cyHxBXVFQEe3t7rQRFRERU1XBISD2Nh4TmzZuHkSNH4syZM9K+M2fO4JNPPsH8+fO1GhwREVFVUTLptqKbvipXhcXa2lola8vJyUG7du1Qrdqj0wsLC1GtWjV88MEH6Nu3b6UESkRERFVXuRKWr776qpLDICIiqto4JKReuRKWwMDAyo6DiIioSuOj+dV77gfHAUBubi7y8/NV9imVygoFRERERPQkjROWnJwcTJw4EVu2bMHdu3dLHS8qKtJKYERERFWJgSDAoIJDOhU9X840XiU0YcIEREdHY8WKFVAoFPjuu+8wY8YM2NvbY926dZURIxERkd4TBO1s+krjCsuOHTuwbt06dO7cGUOHDkWHDh3QsGFDODo6YsOGDQgICKiMOImIiKgK07jCkp6eDmdnZwCP5qukp6cDANq3b4/Dhw9rNzoiIqIqomSVUEU3faVxwuLs7IykpCQAQOPGjbFlyxYAjyovJS9DJCIiIs1wSEg9jROWoUOH4vz58wCATz/9FMuWLYOxsTHGjBmD0NBQrQdIREREpPEcljFjxkg/e3t748qVKzh79iwaNmyIZs2aaTU4IiKiqoKrhNTTuMLyJEdHR/j7+zNZISIiqgBdDAkdPnwYvXr1gr29PQRBwPbt21WODxkypNQcmW7duqm0SU9PR0BAAJRKJaysrDBs2DBkZ2ertLlw4QI6dOgAY2NjODg4IDw8XOPvp1wVliVLlpS7w1GjRmkcBBERUVWni0fz5+TkoHnz5vjggw/g7+9fZptu3bphzZo10meFQqFyPCAgACkpKYiKikJBQQGGDh2KDz/8EBs3bgQAZGVlwcfHB97e3li5ciXi4uLwwQcfwMrKCh9++GG5Yy1XwrJo0aJydSYIAhMWIiKil0T37t3RvXt3tW0UCgXs7OzKPHb58mXs3r0bp0+fRps2bQAAS5cuRY8ePTB//nzY29tjw4YNyM/Px+rVq2FkZIQmTZogNjYWCxcu1H7CUrIqiJ5fcuRnfG0B6S3rtiG6DoGo0ohF+c9upAUGqPg8jQrP8yjDwYMHYWNjA2tra7zxxhuYPXs2atasCQCIiYmBlZWVlKwAj+a3GhgY4OTJk3jrrbcQExODjh07wsjISGrj6+uLL7/8Evfu3YO1tXW54qjQu4SIiIhIO7
Q5JJSVlaWyX6FQlBrKKY9u3brB398fTk5OSExMxGeffYbu3bsjJiYGhoaGSE1NhY2Njco51apVQ40aNZCamgoASE1NhZOTk0obW1tb6RgTFiIioirKwcFB5fP06dMRFhamcT8DBgyQfvbw8ECzZs3QoEEDHDx4EF27dq1omBphwkJERCQDggAYVHBVckmB5ubNmyrTEJ6nulIWZ2dn1KpVCwkJCejatSvs7Oxw584dlTaFhYVIT0+X5r3Y2dkhLS1NpU3J56fNjSlLZQx3ERERkYYMBO1swKNX5zy+aSthuXXrFu7evYs6deoAADw9PZGRkYGzZ89KbaKjo1FcXIx27dpJbQ4fPoyCggKpTVRUFBo1alTu4SCACQsREVGVlZ2djdjYWMTGxgJ4tMgmNjYWycnJyM7ORmhoKE6cOIHr169j//796NOnDxo2bAhfX18AgJubG7p164bhw4fj1KlTOHbsGEJCQjBgwADY29sDAAYOHAgjIyMMGzYMFy9exObNm7F48WKMHTtWo1ifK2E5cuQI3n//fXh6euLvv/8GAKxfvx5Hjx59nu6IiIiqPF28/PDMmTNo2bIlWrZsCQAYO3YsWrZsiWnTpsHQ0BAXLlxA79694erqimHDhqF169Y4cuSISsVmw4YNaNy4Mbp27YoePXqgffv2+Pbbb6XjlpaW2Lt3L5KSktC6dWuMGzcO06ZN02hJM/Acc1h++eUXDBo0CAEBATh37hzy8vIAAJmZmfj888+xa9cuTbskIiKq8gy0MIdF0/M7d+4MURSfenzPnj3P7KNGjRrSQ+KeplmzZjhy5IhmwT1B4wrL7NmzsXLlSqxatQrVq1eX9nt5eeGPP/6oUDBEREREZdG4whIfH4+OHTuW2m9paYmMjAxtxERERFTlPM+7gMrqQ19pXGGxs7NDQkJCqf1Hjx6Fs7OzVoIiIiKqakre1lzRTV9pnLAMHz4cn3zyCU6ePAlBEHD79m1s2LAB48ePx0cffVQZMRIREek9Ay1t+krjIaFPP/0UxcXF6Nq1Kx48eICOHTtCoVBg/PjxGDlyZGXESERERFWcxgmLIAiYPHkyQkNDkZCQgOzsbLi7u8Pc3Lwy4iMiIqoSOIdFved+NL+RkRHc3d21GQsREVGVZYCKz0ExgP5mLBonLF26dFH7YJro6OgKBURERET0JI0TlhYtWqh8LigoQGxsLP78808EBgZqKy4iIqIqhUNC6mmcsCxatKjM/WFhYcjOzq5wQERERFWRLp50+zLR2gqo999/H6tXr9ZWd0RERESS5550+6SYmBgYGxtrqzsiIqIqRRBQ4Um3HBJ6jL+/v8pnURSRkpKCM2fOYOrUqVoLjIiIqCrhHBb1NE5YLC0tVT4bGBigUaNGmDlzJnx8fLQWGBEREVEJjRKWoqIiDB06FB4eHrC2tq6smIiIiKocTrpVT6NJt4aGhvDx8eFbmYmIiLRM0NJ/+krjVUJNmzbFtWvXKiMWIiKiKqukwlLRTV9pnLDMnj0b48ePR2RkJFJSUpCVlaWyEREREWlbueewzJw5E+PGjUOPHj0AAL1791Z5RL8oihAEAUVFRdqPkoiISM9xDot65U5YZsyYgREjRuDAgQOVGQ8REVGVJAiC2nf1lbcPfVXuhEUURQBAp06dKi0YIiIiorJotKxZnzM3IiIiXeKQkHoaJSyurq7PTFrS09MrFBAREVFVxCfdqqdRwjJjxoxST7olIiIiqmwaJSwDBgyAjY1NZcVCRERUZRkIQoVffljR8+Ws3AkL568QERFVHs5hUa/cD44rWSVERERE9KKVu8JSXFxcmXEQERFVbVqYdKvHrxLSbA4LERERVQ4DCDCoYMZR0fPljAkLERGRDHBZs3oav/yQiIiI6EVjhYWIiEgGuEpIPSYsREREMsDnsKjHISEiIiKSPVZYiIiIZICTbtVjwkJERCQDBtDCkJAeL2vmkBARERHJHissREREMsAhIfWYsBAREcmAASo+7KHPwyb6fG9ERESkJ1hhISIikgFBECBUcEynoufLGRMWIiIiGRBQ8Zct62+6woSFiIhIFvikW/U4h4WIiIhkjxUWIiIimdDf+kjFMWEhIiKSAT6HRT0OCREREZHsscJCREQkA1zWrB4TFiIiIhngk27V0+d7IyIiIj3BCgsREZEMcEhIPSYsREREMsAn3arHISEiIiKSPVZYiIiIZIBDQuoxYSEiIpIBrhJSjwkLERGRDLDCop4+J2NERESkJ1hhISIikgGuElKPCQsREZEM8OWH6nFIiIiIiGSPFRYiIiIZMIAAgwoO6lT0fDljhYWIiEgGSoaEKrpp4vDhw+jVqxfs7e0hCAK2b9+uclwURUybNg116tSBiYkJvL29cfXqVZU26enpCAgIgFKphJWVFYYNG4bs7GyVNhcuXECHDh1gbGwMBwcHhIeHa/z9MGEhIiKqonJyctC8eXMsW7aszOPh4eFYsmQJVq5ciZMnT8LMzAy+vr7Izc2V2gQEBODixYuIiopCZGQkDh8+jA8//FA6npWVBR8fHzg6OuLs2bOYN28ewsLC8O2332oUK4eEiIiIZED4//8q2ocmunfvju7du5d5TBRFfPXVV5gyZQr69OkDAFi3bh1sbW2xfft2DBgwAJcvX8bu3btx+vRptGnTBgCwdOlS9OjRA/Pnz4e9vT02bNiA/Px8rF69GkZGRmjSpAliY2OxcOFClcTmWVhhISIikgFtDgllZWWpbHl5eRrHk5SUhNTUVHh7e0v7LC0t0a5dO8TExAAAYmJiYGVlJSUrAODt7Q0DAwOcPHlSatOxY0cYGRlJbXx9fREfH4979+6VOx4mLERERHrGwcEBlpaW0jZ37lyN+0hNTQUA2Nraquy3tbWVjqWmpsLGxkbleLVq1VCjRg2VNmX18fg1yoNDQkRERDIgaGGVUMmQ0M2bN6FUKqX9CoWiQv3KASssREREMqDNISGlUqmyPU/CYmdnBwBIS0tT2Z+WliYds7Ozw507d1SOFxYWIj09XaVNWX08fo3yYMJCREQkA7pY1qyOk5MT7OzssH//fmlfVlYWTp48CU9PTwCAp6cnMjIycPbsWalNdHQ0iouL0a5dO6nN4cOHUVBQILWJiopCo0aNYG1tXe54mLAQERFVUdnZ2YiNjUVsbCyARxNtY2NjkZycDEEQMHr0aMyePRu//fYb4uLiMHjwYNjb26Nv374AADc3N3Tr1g3Dhw/HqVOncOzYMYSEhGDAgAGwt7cHAAwcOBBGRkYYNmwYLl68iM2bN2Px4sUYO3asRrFyDgsREZEM6GJZ85kzZ9ClSxfpc0kSERgYiIiICEyYMAE5OTn48MMPkZGRgfbt22P37t0wNjaWztmwYQNCQkLQtWtXGBgYoF+/fliyZIl03NLSEnv37kVwcDBat26NWrVqYdq0aRotaQYAQRRFUaMzSCNZWVmwtLRE2t1MlQlQRPrEum2IrkMgqjRiUT7y4lYhM7Ny/hwv+Xvi19PXYGZuUaG+crLvo09b50qLVZc4JERERESyxyEhIiIiGdDFk
NDLhAkLERGRDGhjlY82VwnJDYeEiIiISPZYYSEiIpIBARUf0tHjAgsTFiIiIjkwEB5tFe1DX3FIiIiIiGTvpaiwCIKAbdu2SU/Wo6pl4Zo9iDxwHldvpMFYUR2vNnNGWEgfuNT/7+2fEVuP4uc9Z3Ah/hbu5+TienQ4LC1MVfpp1nsabqakq+ybFtwbY4b4vJD7ICoxZogPenZpDhdHW+TmFeDUhWsI+/pXJNz4750sNjUtMHPUW+jcrjHMTRVIuHEHC1bvwY4DsVIbK6UpwkPfgW/7phBFEb9Fx2LSgp+R8zBfatPXuyXGDvVFg1dscPdeNlZtOYSlP+wHyQ9XCamn8wpLamoqRo4cCWdnZygUCjg4OKBXr14q7y7Qpa1bt8LHxwc1a9aEIAjS44vpxTn+RwKC3umIvavHY+vXISgoLIL/yK+R8zBPavMwtwBdPd2fmXx89j8/XPn9c2n78N1OlR0+USmvt2qI7346DJ8P5sM/5GtUr2aIrUtDYGpsJLVZETYYDR1tMHDsN/B673PsOBCLNXM/gIdrPanNqlmBaOxcB/4hX2PAmJV4vWVDfPXZQOm49+vu+HbWEKz55SheHzAH47/cjI8GvoHh73R8ofdL5SO3dwnJjU4rLNevX4eXlxesrKwwb948eHh4oKCgAHv27EFwcDCuXLmiy/AAADk5OWjfvj369++P4cOH6zqcKunnpcEqn5dPfx8uPpMQe/kmvFo1BAB8NPDRo6WPnv1LbV/mpsawraVfT3+kl887o5arfP54xg9IiPoCLdwccPxcIgDg1WbOGP/FJvxx6QYAYMHqPfj4vTfQws0BcX/dgmt9W3i/3gRdBocj9nIyAGDi/J+w5auPMHXxNqT+m4l3u7+KnQfPY83WowCAG3/fxaKIvfgk8E2s+unwC7xjKg8BFZ80q8f5im4rLB9//DEEQcCpU6fQr18/uLq6okmTJhg7dixOnDjx1PMmTpwIV1dXmJqawtnZGVOnTlV5C+T58+fRpUsXWFhYQKlUonXr1jhz5gwA4MaNG+jVqxesra1hZmaGJk2aYNeuXU+91qBBgzBt2jR4e3tr78apQrKycwEA1krTZ7Qs7au1e+HsPQEdA77AkvX7UFhYpO3wiDSmNH/0XpZ7WQ+kfacuXMNbb7aGldIUgiDA/83WUCiq4ejZqwCAth5OyMh6ICUrAHDwVDyKi0W0buoIADAyqoa8/EKVa+Xm5aOurTUc6tSo7Nsi0iqdVVjS09Oxe/duzJkzB2ZmZqWOW1lZPfVcCwsLREREwN7eHnFxcRg+fDgsLCwwYcIEAEBAQABatmyJFStWwNDQELGxsahevToAIDg4GPn5+Th8+DDMzMxw6dIlmJuba+2+8vLykJf331BFVlaW1vomoLi4GJMW/ox2zZ3h3tBeo3P/924nNG/sACulGU5duIaZy35D2r+ZmDOmXyVFS/RsgiBg7ti3cSI2EZcTU6T9QyetxurPP0DS/nAUFBbhYW4+BoWuQtKtfwEAtjWV+OfefZW+ioqKcS/rAWxrPqoiRp+4jDlj/NEx0hVHzlyFs0NtBAd0BQDY1bIsNaeLdMsAAgwqOKZjoMc1Fp0lLAkJCRBFEY0bN9b43ClTpkg/169fH+PHj8emTZukhCU5ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnitxGKXPnzsWMGTO02if9Z3z4FlxOTMHvq8ZofG7JH9QA0NSlLoyqV8OYz3/EtODeUBhV12aYROU2f0J/uDWog+7DF6nsnzyiJywtTNDn4yVIz8hBj07NsGbuB+gx/CtcSrxdrr7XbjsGp7q1sGnhCFSvZoj7OblYuekgJv3PD8XFxZVxO1QBHBJST2dDQhV5SfTmzZvh5eUFOzs7mJubY8qUKUhO/q8sOnbsWAQFBcHb2xtffPEFEhMTpWOjRo3C7Nmz4eXlhenTp+PChQsVuo8nTZo0CZmZmdJ28+ZNrfZflYWGb8GeI39ix4pRqGtrXeH+Wjepj8KiYiTf5r8ySTfCQ9+Bb4em6PXREty+kyHtr1+3Fj58txNGzvoBh0//hT+v/o3w737HucvJCPr/CbNpd7NQ21r1zb6GhgawVpoi7e5/ld2wr39FvU7j0Kz3NDTq9pk0J+b633cr/waJtEhnCYuLiwsEQdB4Ym1MTAwCAgLQo0cPREZG4ty5c5g8eTLy8/9bxhcWFoaLFy/Cz88P0dHRcHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cu1dp9KRQKKJVKlY0qRhRFhIZvwc6D5/HbilFwrFtLK/3G/XULBgYCateo2OvciZ5HeOg78OvcHL0/WoLk26rJQ8lqoeJi1X/YFRWJEP7/yWCn45JgpTRF88YO0vGObVxhYCDg7J83VM4rLhaR8k8mCgqL0M+nNU5duIa7GdmVcVtUEYKWNj2ls4SlRo0a8PX1xbJly5CTk1PqeEZGRpnnHT9+HI6Ojpg8eTLatGkDFxcX3Lhxo1Q7V1dXjBkzBnv37oW/vz/WrFkjHXNwcMCIESOwdetWjBs3DqtWrdLafZH2jf9yC7b8fhqrZg2Buakx0v7NQtq/WXiY+1+SmvZvFuLib+HazUfj+xcTbiMu/hbuZT763Tp14RpWbDyAuL9u4fqtf7Hl99OYvOgX9O/eFlbPMXmXqCLmT+yP/t3bYvjUCGQ/yIVNTQvY1LSAseLR0ORf11ORmHwHiya9h1bujqhftxaCA95Al3aNsOvg+f9vk4Z9xy9i8eSBaOXuiHbNnBEe2h9b9/6B1H8zAQA1LM0w1L89XBxt0dS1LuaO64c+XVti0oJfdHbv9HSClv7TVzpd1rxs2TJ4eXnh1VdfxcyZM9GsWTMUFhYiKioKK1aswOXLl0ud4+LiguTkZGzatAlt27bFzp07peoJADx8+BChoaF4++234eTkhFu3buH06dPo1+/RxMrRo0eje/fucHV1xb1793DgwAG4ubk9Ncb09HQkJyfj9u1HY8bx8fEAADs7O9jZ2Wnz66CnWP3LEQBAzxGLVfYvm/Y+BvZ6DQCwZusRfLnqd+mY34dfqbRRGFXH1qiz+GLVLuQXFMLRviY+eq8LggPeeDE3QfSYYW8/GtbZ+c1olf0fz1iPHyNPorCoGP1Hr8D0kD74ceH/YGaqQNLNf/Bx2HpEHb8ktR8+dS3mhfbH9uUjpQfHfTr/J5U+B/i1w8xP3oIgPKrK9BqxWBoWInqZCGJFJpNoQUpKCubMmYPIyEikpKSgdu3aaN26NcaMGYPOnTs/CvKJJ91OmDABq1evRl5eHvz8/PDaa68hLCwMGRkZyM/PR2BgII4dO4a0tDTUqlUL/v7+mDdvHoyNjTFy5Ej8/vvvuHXrFpRKJbp164ZFixahZs2aZcYXERGBoUOHlto/ffp0hIWFPfP+srKyYGlpibS7mRweIr1l3TZE1yEQVRqxKB95cauQmVk5f46X/D2xPzYZ5hYV6z/7fha6tnil0mLVJZ0nLPqOCQtVBUxYSJ+9qIQlWksJyxt6mrDo/NH8RERERM/yUrz8kIiI
SO/xQSxqMWEhIiKSAb6tWT0mLERERDKgjbct6/PbmjmHhYiIiGSPFRYiIiIZ4BQW9ZiwEBERyQEzFrU4JERERESyxwoLERGRDHCVkHpMWIiIiGSAq4TU45AQERERyR4rLERERDLAObfqMWEhIiKSA2YsanFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQBXCanHhIWIiEgGOIVFPc5hISIiItljhYWIiEgOWGJRiwkLERGRDHDSrXocEiIiIiLZY4WFiIhIBrhKSD0mLERERDLAKSzqcUiIiIiIZI8VFiIiIjlgiUUtJixEREQywFVC6nFIiIiIiGSPFRYiIiIZ4Coh9ZiwEBERyQCnsKjHhIWIiEgOmLGoxTksREREJHussBAREckAVwmpx4SFiIhIDrQw6VaP8xUOCREREZH8scJCREQkA5xzqx4TFiIiIjlgxqIWh4SIiIhI9lhhISIikgGuElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiIZ4Jxb9VhhISIikgNBS5sGwsLCIAiCyta4cWPpeG5uLoKDg1GzZk2Ym5ujX79+SEtLU+kjOTkZfn5+MDU1hY2NDUJDQ1FYWPgcX4B6rLAQERHJgK4m3TZp0gT79u2TPler9l9qMGbMGOzcuRM//fQTLC0tERISAn9/fxw7dgwAUFRUBD8/P9jZ2eH48eNISUnB4MGDUb16dXz++ecVupcnMWEhIiKqwqpVqwY7O7tS+zMzM/H9999j48aNeOONNwAAa9asgZubG06cOIHXXnsNe/fuxaVLl7Bv3z7Y2tqiRYsWmDVrFiZOnIiwsDAYGRlpLU4OCREREcmAgP9WCj339v99ZWVlqWx5eXlPve7Vq1dhb28PZ2dnBAQEIDk5GQBw9uxZFBQUwNvbW2rbuHFjvPLKK4iJiQEAxMTEwMPDA7a2tlIbX19fZGVl4eLFi1r9fpiwEBERyYA2p7A4ODjA0tJS2ubOnVvmNdu1a4eIiAjs3r0bK1asQFJSEjp06ID79+8jNTUVRkZGsLKyUjnH1tYWqampAIDU1FSVZKXkeMkxbeKQEBERkZ65efMmlEql9FmhUJTZrnv37tLPzZo1Q7t27eDo6IgtW7bAxMSk0uPUBCssREREMlDh4aDHHjynVCpVtqclLE+ysrKCq6srEhISYGdnh/z8fGRkZKi0SUtLk+a82NnZlVo1VPK5rHkxFcGEhYiISBZ0sK75CdnZ2UhMTESdOnXQunVrVK9eHfv375eOx8fHIzk5GZ6engAAT09PxMXF4c6dO1KbqKgoKJVKuLu7VyiWJ3FIiIiIqIoaP348evXqBUdHR9y+fRvTp0+HoaEh3nvvPVhaWmLYsGEYO3YsatSoAaVSiZEjR8LT0xOvvfYaAMDHxwfu7u4YNGgQwsPDkZqaiilTpiA4OLjcVZ3yYsJCREQkA7p4l9CtW7fw3nvv4e7du6hduzbat2+PEydOoHbt2gCARYsWwcDAAP369UNeXh58fX2xfPly6XxDQ0NERkbio48+gqenJ8zMzBAYGIiZM2dW7EbKIIiiKGq9V5JkZWXB0tISaXczVSZAEekT67Yhug6BqNKIRfnIi1uFzMzK+XO85O+JKzf+gUUF+7+flYXGjrUrLVZd4hwWIiIikj0OCREREcmALoaEXiZMWIiIiGRAV+8SelkwYSEiIpKDiq9Krvj5MsY5LERERCR7rLAQERHJAAss6jFhISIikgFOulWPQ0JEREQke6ywEBERyQBXCanHhIWIiEgOOIlFLQ4JERERkeyxwkJERCQDLLCox4SFiIhIBrhKSD0OCREREZHsscJCREQkCxVfJaTPg0JMWIiIiGSAQ0LqcUiIiIiIZI8JCxEREckeh4SIiIhkgENC6jFhISIikgE+ml89DgkRERGR7LHCQkREJAMcElKPCQsREZEM8NH86nFIiIiIiGSPFRYiIiI5YIlFLSYsREREMsBVQupxSIiIiIhkjxUWIiIiGeAqIfWYsBAREckAp7Cox4SFiIhIDpixqMU5LERERCR7rLAQERHJAFcJqceEhYiISAY46VY9JiyVTBRFAMD9rCwdR0JUecSifF2HQFRpSn6/S/48ryxZWvh7Qht9yBUTlkp2//59AEBDJwcdR0JERBVx//59WFpaar1fIyMj2NnZwUVLf0/Y2dnByMhIK33JiSBWdspYxRUXF+P27duwsLCAoM+1OpnIysqCg4MDbt68CaVSqetwiLSOv+MvniiKuH//Puzt7WFgUDlrVXJzc5Gfr51KpZGREYyNjbXSl5ywwlLJDAwMUK9ePV2HUeUolUr+YU56jb/jL1ZlVFYeZ2xsrJdJhjZxWTMRERHJHhMWIiIikj0mLKRXFAoFpk+fDoVCoetQiCoFf8epquKkWyIiIpI9VliIiIhI9piwEBERkewxYSEiIiLZY8JCsiYIArZv367rMIgqBX+/icqPCQvpTGpqKkaOHAlnZ2coFAo4ODigV69e2L9/v65DA/Do6ZbTpk1DnTp1YGJiAm9vb1y9elXXYdFLQu6/31u3boWPjw9q1qwJQRAQGxur65CI1GLCQjpx/fp1tG7dGtHR0Zg3bx7i4uKwe/dudOnSBcHBwboODwAQHh6OJUuWYOXKlTh58iTMzMzg6+uL3NxcXYdGMvcy/H7n5OSgffv2+PLLL3UdClH5iEQ60L17d7Fu3bpidnZ2qWP37t2TfgYgbtu2Tfo8YcIE0cXFRTQxMRGdnJzEKVOmiPn5+dLx2NhYsXPnzqK5ubloYWEhtmrVSjx9+rQoiqJ4/fp1sWfPnqKVlZVoamoquru7izt37iwzvuLiYtHOzk6cN2+etC8jI0NUKBTijz/+WMG7J30n99/vxyUlJYkAxHPnzj33/RK9CHyXEL1w6enp2L17N+bMmQMzM7NSx62srJ56roWFBSIiImBvb4+4uDgMHz4cFhYWmDBhAgAgICAALVu2xIoVK2BoaIjY2FhUr14dABAcHIz8/HwcPnwYZmZmuHTpEszNzcu8TlJSElJTU+Ht7S3ts7S0RLt27RATE4MBAwZU4BsgffYy/H4TvYyYsNALl5CQAFEU0bhxY43PnTJlivRz/fr1MX78eGzatEn6Az05ORmhoaFS3y4uLlL75ORk9OvXDx4eHgAAZ2fnp14nNTUVAGBra6uy39bWVjpGVJaX4feb6GXEOSz0wokVeLjy5s2b4eXlBTs7O5ibm2PKlClITk6Wjo8dOxZBQUHw9vbGF198gcTEROnYqFGjMHv2bHh5eWH69Om4cOFChe6DqCz8/SaqHExY6IVzcXGBIAi4cuWKRufFxMQgICAAPXr0QGRkJM6dO4fJkycjPz9fahMWFoaLFy/Cz88P0dHRcHd3x7Zt2wAAQUFBuHbtGgYNGoS4uDi0adMGS5cuLfNadnZ2AIC0tDSV/WlpadIxorK8DL/fRC8l3U6hoaqqW7duGk9KnD9/vujs7KzSdtiwYaKlpeVTrzNgwACxV69eZR779NNPRQ8PjzKPlUy6nT9/vrQvMzOTk26pXOT++/04TrqllwUrLKQ
Ty5YtQ1FREV599VX88ssvuHr1Ki5fvowlS5bA09OzzHNcXFyQnJyMTZs2ITExEUuWLJH+dQkADx8+REhICA4ePIgbN27g2LFjOH36NNzc3AAAo0ePxp49e5CUlIQ//vgDBw4ckI49SRAEjB49GrNnz8Zvv/2GuLg4DB48GPb29ujbt6/Wvw/SL3L//QYeTQ6OjY3FpUuXAADx8fGIjY3lHC2SL11nTFR13b59WwwODhYdHR1FIyMjsW7dumLv3r3FAwcOSG3wxLLP0NBQsWbNmqK5ubn47rvviosWLZL+BZqXlycOGDBAdHBwEI2MjER7e3sxJCREfPjwoSiKohgSEiI2aNBAVCgUYu3atcVBgwaJ//7771PjKy4uFqdOnSra2tqKCoVC7Nq1qxgfH18ZXwXpIbn/fq9Zs0YEUGqbPn16JXwbRBUniGIFZogRERERvQAcEiIiIiLZY8JCREREsseEhYiIiGSPCQsRERHJHhMWIiIikj0mLERERCR7TFiIiIhI9piwEFUBQ4YMUXlCb+fOnTF69OgXHsfBgwchCAIyMjKe2kYQBGzfvr3cfYaFhaFFixYViuv69esQBAGxsbEV6oeIKg8TFiIdGTJkCARBgCAIMDIyQsOGDTFz5kwUFhZW+rW3bt2KWbNmlatteZIMIqLKVk3XARBVZd26dcOaNWuQl5eHXbt2ITg4GNWrV8ekSZNKtc3Pz4eRkZFWrlujRg2t9ENE9KKwwkKkQwqFAnZ2dnB0dMRHH30Eb29v/PbbbwD+G8aZM2cO7O3t0ahRIwDAzZs30b9/f1hZWaFGjRro06cPrl+/LvVZVFSEsWPHwsrKCjVr1sSECRPw5Bs4nhwSysvLw8SJE+Hg4ACFQoGGDRvi+++/x/Xr19GlSxcAgLW1NQRBwJAhQwAAxcXFmDt3LpycnGBiYoLmzZvj559/VrnOrl274OrqChMTE3Tp0kUlzvKaOHEiXF1dYWpqCmdnZ0ydOhUFBQWl2n3zzTdwcHCAqakp+vfvj8zMTJXj3333Hdzc3GBsbIzGjRtj+fLlGsdCRLrDhIVIRkxMTJCfny993r9/P+Lj4xEVFYXIyEgUFBTA19cXFhYWOHLkCI4dOwZzc3N069ZNOm/BggWIiIjA6tWrcfToUaSnp6u89bcsgwcPxo8//oglS5bg8uXL+Oabb2Bubg4HBwf88ssvAB69zTclJQWLFy8GAMydOxfr1q3DypUrcfHiRYwZMwbvv/8+Dh06BOBRYuXv749evXohNjYWQUFB+PTTTzX+TiwsLBAREYFLly5h8eLFWLVqFRYtWqTSJiEhAVu2bMGOHTuwe/dunDt3Dh9//LF0fMOGDZg2bRrmzJmDy5cv4/PPP8fUqVOxdu1ajeMhIh3R8csXiaqswMBAsU+fPqIoPnozdFRUlKhQKMTx48dLx21tbcW8vDzpnPXr14uNGjUSi4uLpX15eXmiiYmJuGfPHlEURbFOnTpieHi4dLygoECsV6+edC1RFMVOnTqJn3zyiSiKohgfHy8CEKOiosqM88CBAyIA8d69e9K+3Nxc0dTUVDx+/LhK22HDhonvvfeeKIqiOGnSJNHd3V3l+MSJE0v19SQ88QbjJ82bN09s3bq19Hn69OmioaGheOvWLWnf77//LhoYGIgpKSmiKIpigwYNxI0bN6r0M2vWLNHT01MURVFMSkoSAYjnzp176nWJSLc4h4VIhyIjI2Fubo6CggIUFxdj4MCBCAsLk457eHiozFs5f/48EhISYGFhodJPbm4uEhMTkZmZiZSUFLRr1046Vq1aNbRp06bUsFCJ2NhYGBoaolOnTuWOOyEhAQ8ePMCbb76psj8/Px8tW7YEAFy+fFklDgDw9PQs9zVKbN68GUuWLEFiYiKys7NRWFgIpVKp0uaVV15B3bp1Va5TXFyM+Ph4WFhYIDExEcOGDcPw4cOlNoWFhbC0tNQ4HiLSDSYsRDrUpUsXrFixAkZGRrC3t0e1aqr/S5qZmal8zs7ORuvWrbFhw4ZSfdWuXfu5YjAxMdH4nOzsbADAzp07VRIF4NG8HG2JiYlBQEAAZsyYAV9fX1haWmLTpk1YsGCBxrGuWrWqVAJlaGiotViJqHIxYSHSITMzMzRs2LDc7Vu1aoXNmzfDxsamVJWhRJ06dXDy5El07NgRwKNKwtmzZ9GqVasy23t4eKC4uBiHDh2Ct7d3qeMlFZ6ioiJpn7u7OxQKBZKTk59amXFzc5MmEJc4ceLEs2/yMcePH4ejoyMmT54s7btx40apdsnJybh9+zbs7e2l6xgYGKBRo0awtbWFvb09rl27hoCAAI2uT0TywUm3RC+RgIAA1KpVC3369MGRI0eQlJSEgwcPYtSoUbh16xYA4JNPPsEXX3yB7du348qVK/j444/VPkOlfv36CAwMxAcffIDt27dLfW7ZsgUA4OjoCEEQEBkZiX/++QfZ2dmwsLDA+PHjMWbMGKxduxaJiYn4448/sHTpUmki64gRI3D16lWEhoYiPj4eGzduREREhEb36+LiguTkZGzatAmJiYlYsmRJmROIjY2NERgYiPPnz+PIkSMYNWoU+vfvDzs7OwDAjBkzMHfuXCxZsgR//fUX4uLisGbNGixcuFCjeIhId5iwEL1ETE1NcfjwYbzyyivw9/eHm5sbhg0bhtzcXKniMm7cOAwaNAiBgYHw9PSEhYUF3nrrLbX9rlixAm+//TY+/vhjNG7cGMOHD0dOTg4AoG7dupgxYwY+/fRT2NraIiQkBAAwa9YsTJ06FXPnzoWbmxu6deuGnTt3wsnJCcCjeSW//PILtm/fjubNm2PlypX4/PPPNbrf3r17Y8yYMQgJCUGLFi1w/PhxTJ06tVS7hg0bwt/fHz169ICPjw+aNWumsmw5KCgI3333HdasWQMPDw906tQJERERUqxEJH+C+LSZeEREREQywQoLERERyR4TFiIiIpI9JixEREQke0xYiIiISPaYsBAREZHsMWEhIiIi2WPCQkRERLLHhIWIiIhkjwkLERERyR4TFiIiIpI9JixEREQke0xYiIiISPb+D8eEamDpGfNzAAAAAElFTkSuQmCC",
|
760 |
+
"text/plain": [
|
761 |
+
"<Figure size 640x480 with 2 Axes>"
|
762 |
+
]
|
763 |
+
},
|
764 |
+
"metadata": {},
|
765 |
+
"output_type": "display_data"
|
766 |
+
}
|
767 |
+
],
|
768 |
+
"source": [
|
769 |
+
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
770 |
+
"import matplotlib.pyplot as plt\n",
|
771 |
+
"\n",
|
772 |
+
"# 假设 predictions.label_ids 是真实的标签,preds 是模型的预测\n",
|
773 |
+
"cm = confusion_matrix(predictions.label_ids, preds)\n",
|
774 |
+
"\n",
|
775 |
+
"# 可视化混淆矩阵\n",
|
776 |
+
"disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])\n",
|
777 |
+
"disp.plot(cmap=plt.cm.Blues)\n",
|
778 |
+
"plt.title('Confusion Matrix')\n",
|
779 |
+
"plt.show()"
|
780 |
+
]
|
781 |
+
},
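As a follow-up to the confusion-matrix cell above, here is a minimal sketch that reads per-class precision and recall directly off the matrix entries. It assumes only the `cm` array computed above (rows = true labels, columns = predictions, sklearn's convention) and is an illustration, not part of the original notebook.

```python
import numpy as np

# Per-class precision/recall derived from the confusion matrix `cm` above.
cm = np.asarray(cm)
for k in range(cm.shape[0]):
    tp = cm[k, k]                    # correctly predicted as class k
    fp = cm[:, k].sum() - tp         # predicted k, actually another class
    fn = cm[k, :].sum() - tp         # actually k, predicted another class
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    print(f"Class {k}: precision={precision:.3f}, recall={recall:.3f}")
```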
|
782 |
{
|
783 |
"cell_type": "code",
|
784 |
"execution_count": null,
|
785 |
+
"id": "23e3a640-88d7-4a1e-8515-7c417d50f018",
|
786 |
"metadata": {},
|
787 |
"outputs": [],
|
788 |
"source": []
|
03-gene-task/2-structure-predict.ipynb
CHANGED
@@ -1,9 +1,962 @@
|
|
1 |
{
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": null,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": []
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "212e1052-e0d9-404f-a4ee-db199a4c6d17",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 3.2 序列结构预测"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "0eb5d83c-8dd6-498b-adc9-1f74c97c3427",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"蛋白质的结构可分为四级:\n",
|
17 |
+
"\n",
|
18 |
+
"1. 一级结构也就是氨基酸序列;\n",
|
19 |
+
"2. 二级结构是周期性的结构构象,比如α螺旋β折叠等\n",
|
20 |
+
"3. 三级结构是整条多肽链的三维空间结构\n",
|
21 |
+
"4. 四级结构是几个蛋白质分子形成的复合体结构,比如三聚体,四聚 体等\n",
|
22 |
+
"\n",
|
23 |
+
"\n",
|
24 |
+
"二级结构(Secondary Structure)是指生物大分子如蛋白质和核酸(RNA 和 DNA)中局部的、有规则的空间构象。这些结构是由分子内的一些化学键或相互作用稳定下来的,但不涉及整个分子的整体折叠状态。以下是关于蛋白质和 RNA 二级结构的简单介绍:\n",
|
25 |
+
"\n",
|
26 |
+
"### 蛋白质的二级结构\n",
|
27 |
+
"\n",
|
28 |
+
"蛋白质的二级结构主要由主链原子间的氢键形成,具体包括以下几种常见的类型:\n",
|
29 |
+
"\n",
|
30 |
+
"1. **α-螺旋 (Alpha Helix)**\n",
|
31 |
+
" - **描述**:一个右手螺旋结构,每个氨基酸残基沿螺旋轴旋转约 100 度,并沿着轴向上移动约 1.5 Å。\n",
|
32 |
+
" - **特点**:通过相邻的肽键之间形成的氢键稳定,通常每 3.6 个氨基酸残基转一圈。\n",
|
33 |
+
"\n",
|
34 |
+
"2. **β-折叠片 (Beta Sheet)**\n",
|
35 |
+
" - **描述**:由多个几乎平行或反平行排列的多肽链组成,链间通过氢键连接。\n",
|
36 |
+
" - **特点**:可以是平行(所有链同向)或反平行(相邻链方向相反),提供了高度刚性的平面结构。\n",
|
37 |
+
"\n",
|
38 |
+
"3. **转角 (Turns)**\n",
|
39 |
+
" - **描述**:短的序列片段,通常包含 3 到 4 个氨基酸残基,用于改变多肽链的方向。\n",
|
40 |
+
" - **特点**:最常见的类型是 β-转角(beta turn),它使得链可以在空间上回折。\n",
|
41 |
+
"\n",
|
42 |
+
"4. **无规则卷曲 (Random Coil)**\n",
|
43 |
+
" - **描述**:没有固定模式的区域,可能是由于缺乏足够的氢键或其他稳定力。\n",
|
44 |
+
" - **特点**:虽然称为“无规则”,但实际上可能在特定环境下具有功能性意义。\n",
|
45 |
+
"\n",
|
46 |
+
"\n",
|
47 |
+
"<img src=\"img/protein-structure-1.png\" width=\"500px\" />\n",
|
48 |
+
"\n",
|
49 |
+
"蛋白质的二级结构经常用图形来形象的描述。比如下图中黄色的箭头代表对应的氨基酸 具有β折片结构。波浪线代表螺旋结构,小鼓包是转角。此外,以字母形式书写的二级结构序列能够更加精准的描述。\n",
|
50 |
+
"其中,E 代表β折叠,H 代表α螺旋,T 代表转角。没有写任何字母的地方是松散的 coil 结构。很多序列预测数据集中,一般不区分转角和coil结构。\n",
|
51 |
+
"\n",
|
52 |
+
"\n",
|
53 |
+
"<img src=\"img/protein-structure-2.png\" width=\"500px\" />\n"
|
54 |
+
]
|
55 |
+
},
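To make the letter encoding above concrete, here is a minimal sketch that turns a dssp3-style secondary-structure string into one integer label per residue. The three-class `C/H/E` mapping is an assumption chosen to match the dssp3 labels used later in this notebook; the example string is made up.

```python
# Hypothetical example: encode a dssp3 string as per-residue class ids.
label_to_id = {"C": 0, "H": 1, "E": 2}

dssp3 = "CCHHHHCCEEEECC"   # one character per amino-acid residue
label_ids = [label_to_id[c] for c in dssp3]

print(label_ids)  # [0, 0, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 0, 0]
```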
|
56 |
+
{
|
57 |
+
"cell_type": "markdown",
|
58 |
+
"id": "c90a583c-f6a5-4a41-8e7e-da27b7e95c50",
|
59 |
+
"metadata": {},
|
60 |
+
"source": [
|
61 |
+
"获得实验测定的蛋白质或 RNA 的二级结构数据,通常需要依赖于实验室技术和公共数据库中已发表的实验结果。以下是一些常用的资源和方法,帮助你获取经过实验验证的二级结构数据:\n",
|
62 |
+
"\n",
|
63 |
+
"### 1. **蛋白质二级结构数据**\n",
|
64 |
+
"\n",
|
65 |
+
"#### a. **PDB (Protein Data Bank)**\n",
|
66 |
+
"\n",
|
67 |
+
"- **网址**:[RCSB PDB](https://www.rcsb.org/)\n",
|
68 |
+
"- **特点**:PDB 是一个全球性的生物大分子结构数据库,包含通过 X 射线晶体学、核磁共振(NMR)和冷冻电镜(Cryo-EM)等实验方法测定的蛋白质三维结构。\n",
|
69 |
+
"- **使用方法**:\n",
|
70 |
+
" - 搜索特定蛋白质的 PDB ID 或名称。\n",
|
71 |
+
" - 查看详细条目页面,其中包含了蛋白质的三级结构信息,可以通过可视化工具如 PyMOL 或 Chimera 来观察二级结构元素(如 α-螺旋、β-折叠片等)。\n",
|
72 |
+
"\n",
|
73 |
+
"<img src=\"img/pdb1.png\" width=\"600px\" />\n",
|
74 |
+
"\n",
|
75 |
+
"From https://www.rcsb.org/sequence/9rsa\n",
|
76 |
+
"\n",
|
77 |
+
"#### b. **PDBe (Protein Data Bank in Europe)**\n",
|
78 |
+
"\n",
|
79 |
+
"- **网址**:[PDBe](https://www.ebi.ac.uk/pdbe/)\n",
|
80 |
+
"- **特点**:PDBe 是欧洲的 PDB 镜像站点,提供了与 RCSB PDB 类似的功能,并且有额外的分析工具和注释信息。\n",
|
81 |
+
"- **使用方法**:\n",
|
82 |
+
" - 搜索蛋白质的 PDB ID 或名称。\n",
|
83 |
+
" - 使用 PDBe-KB 和其他工具来获取详细的结构信息和二级结构注释。\n",
|
84 |
+
"\n",
|
85 |
+
"#### c. **Biomolecule Structure Knowledgebase (BSK)**\n",
|
86 |
+
"\n",
|
87 |
+
"- **网址**:[BSK](https://bsk.pdbj.org/)\n",
|
88 |
+
"- **特点**:BSK 是日本的 PDB 镜像站点,同样提供丰富的结构数据和分析工具。\n",
|
89 |
+
"- **使用方法**:\n",
|
90 |
+
" - 搜索蛋白质的 PDB ID 或名称。\n",
|
91 |
+
" - 浏览条目以获取详细的结构信息和二级结构注释。\n",
|
92 |
+
"\n",
|
93 |
+
"\n",
|
94 |
+
"\n",
|
95 |
+
"### 3. **实验方法**\n",
|
96 |
+
"\n",
|
97 |
+
"如果你需要最新的或特定条件下的二级结构数据,可能需要参考文献中的实验方法。以下是一些常见的实验技术:\n",
|
98 |
+
"\n",
|
99 |
+
"#### a. **X 射线晶体学**\n",
|
100 |
+
"\n",
|
101 |
+
"- **原理**:通过解析蛋白质或 RNA 晶体的衍射图案来确定其三维结构。\n",
|
102 |
+
"- **应用**:适用于能够形成稳定晶体的分子。\n",
|
103 |
+
"\n",
|
104 |
+
"#### b. **核磁共振(NMR)**\n",
|
105 |
+
"\n",
|
106 |
+
"- **原理**:利用核磁共振波谱技术来确定溶液状态下分子的结构。\n",
|
107 |
+
"- **应用**:适用于较小的蛋白质和 RNA 分子。\n",
|
108 |
+
"\n",
|
109 |
+
"#### c. **冷冻电镜(Cryo-EM)**\n",
|
110 |
+
"\n",
|
111 |
+
"- **原理**:通过低温冷冻样品并在电子显微镜下成像来确定分子结构。\n",
|
112 |
+
"- **应用**:适用于较大的复合物和难以结晶的分子。\n",
|
113 |
+
"\n",
|
114 |
+
"\n",
|
115 |
+
"\n",
|
116 |
+
"### 4. **文献检索**\n",
|
117 |
+
"\n",
|
118 |
+
"#### a. **PubMed**\n",
|
119 |
+
"\n",
|
120 |
+
"- **网址**:[PubMed](https://pubmed.ncbi.nlm.nih.gov/)\n",
|
121 |
+
"- **特点**:PubMed 是一个广泛使用的生物医学文献数据库,提供了大量关于蛋白质和 RNA 功能及结构的研究论文。\n",
|
122 |
+
"- **使用方法**:\n",
|
123 |
+
" - 使用关键词搜索与特定蛋白质或 RNA 相关的实验研究。\n",
|
124 |
+
" - 阅读论文以获取详细的实验数据和二级结构描述。\n",
|
125 |
+
"\n",
|
126 |
+
"### 总结\n",
|
127 |
+
"\n",
|
128 |
+
"获得实验测定的蛋白质或 RNA 的二级结构数据主要依赖于公共数据库如 PDB 和 NDB,这些数据库收录了通过多种实验方法测定的结构信息。此外,查阅相关文献也是一种重要的途径,可以找到最新的或特定条件下的实验结果。对于具体的实验方法,如 X 射线晶体学、NMR 和 Cryo-EM 等,它们各自有适用的场景和优势。\n"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "markdown",
|
133 |
+
"id": "1cadfd11-2130-429d-848f-39371356ca10",
|
134 |
+
"metadata": {},
|
135 |
+
"source": [
|
136 |
+
"## 整理好的数据\n",
|
137 |
+
"\n",
|
138 |
+
"https://huggingface.co/datasets/proteinea/secondary_structure_prediction\n",
|
139 |
+
"\n",
|
140 |
+
"<img src=\"img/ds_structure.png\" width=\"600px\" />\n",
|
141 |
+
"\n",
|
142 |
+
"https://huggingface.co/datasets/genbio-ai/rna-secondary-structure-prediction"
|
143 |
+
]
|
144 |
+
},
|
145 |
+
{
|
146 |
+
"cell_type": "code",
|
147 |
+
"execution_count": 24,
|
148 |
+
"id": "134a72e3-597a-446e-9193-d060a6e677f6",
|
149 |
+
"metadata": {},
|
150 |
+
"outputs": [
|
151 |
+
{
|
152 |
+
"data": {
|
153 |
+
"text/plain": [
|
154 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
155 |
+
]
|
156 |
+
},
|
157 |
+
"execution_count": 24,
|
158 |
+
"metadata": {},
|
159 |
+
"output_type": "execute_result"
|
160 |
+
}
|
161 |
+
],
|
162 |
+
"source": [
|
163 |
+
"import subprocess\n",
|
164 |
+
"import os\n",
|
165 |
+
"# 设置环境变量, autodl一般区域\n",
|
166 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
167 |
+
"output = result.stdout\n",
|
168 |
+
"for line in output.splitlines():\n",
|
169 |
+
" if '=' in line:\n",
|
170 |
+
" var, value = line.split('=', 1)\n",
|
171 |
+
" os.environ[var] = value\n",
|
172 |
+
"\n",
|
173 |
+
"\"\"\"\n",
|
174 |
+
"import os\n",
|
175 |
+
"\n",
|
176 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
177 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
178 |
+
"\n",
|
179 |
+
"# 打印环境变量以确认设置成功\n",
|
180 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
181 |
+
"\"\"\""
|
182 |
+
]
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"cell_type": "code",
|
186 |
+
"execution_count": 25,
|
187 |
+
"id": "b43dd5f2-6b23-4b51-ad04-7b7ded732cb7",
|
188 |
+
"metadata": {},
|
189 |
+
"outputs": [],
|
190 |
+
"source": [
|
191 |
+
"from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer\n",
|
192 |
+
"from tokenizers import Tokenizer\n",
|
193 |
+
"from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
|
194 |
+
"from transformers import AutoModelForTokenClassification \n",
|
195 |
+
"from transformers import DataCollatorWithPadding"
|
196 |
+
]
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"cell_type": "code",
|
200 |
+
"execution_count": 26,
|
201 |
+
"id": "4c66fa5b-b8b8-4dfd-ada1-32ed9e690c33",
|
202 |
+
"metadata": {},
|
203 |
+
"outputs": [],
|
204 |
+
"source": [
|
205 |
+
"#set tokenizer,dna protein \n",
|
206 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
|
207 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
208 |
+
]
|
209 |
+
},
|
210 |
+
{
|
211 |
+
"cell_type": "code",
|
212 |
+
"execution_count": 27,
|
213 |
+
"id": "70a3fd79-48bf-4452-a7ee-689f1b11e987",
|
214 |
+
"metadata": {},
|
215 |
+
"outputs": [],
|
216 |
+
"source": [
|
217 |
+
"from datasets import load_dataset\n",
|
218 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
219 |
+
"dataset = load_dataset(\"proteinea/secondary_structure_prediction\")['train'].train_test_split(test_size=0.1)"
|
220 |
+
]
|
221 |
+
},
|
222 |
+
{
|
223 |
+
"cell_type": "code",
|
224 |
+
"execution_count": 28,
|
225 |
+
"id": "13cd141e-98c3-47da-8e21-cba5576707fe",
|
226 |
+
"metadata": {},
|
227 |
+
"outputs": [
|
228 |
+
{
|
229 |
+
"data": {
|
230 |
+
"text/plain": [
|
231 |
+
"DatasetDict({\n",
|
232 |
+
" train: Dataset({\n",
|
233 |
+
" features: ['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'],\n",
|
234 |
+
" num_rows: 9712\n",
|
235 |
+
" })\n",
|
236 |
+
" test: Dataset({\n",
|
237 |
+
" features: ['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask'],\n",
|
238 |
+
" num_rows: 1080\n",
|
239 |
+
" })\n",
|
240 |
+
"})"
|
241 |
+
]
|
242 |
+
},
|
243 |
+
"execution_count": 28,
|
244 |
+
"metadata": {},
|
245 |
+
"output_type": "execute_result"
|
246 |
+
}
|
247 |
+
],
|
248 |
+
"source": [
|
249 |
+
"dataset"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
{
|
253 |
+
"cell_type": "code",
|
254 |
+
"execution_count": 29,
|
255 |
+
"id": "7936af74-3f5f-43c1-aa69-fd7b08989e24",
|
256 |
+
"metadata": {},
|
257 |
+
"outputs": [
|
258 |
+
{
|
259 |
+
"data": {
|
260 |
+
"text/plain": [
|
261 |
+
"{'input': 'MTQTQPVTPTPPASFQTQHDPRTRLGATPLPGGAGTRFRLWTSTARTVAVRVNGTEHVMTSLGGGIYELELPVGPGARYLFVLDGVPTPDPYARFLPDGVHGEAEVVDFGTFDWTDADWHGIKLADCVFYEVHVGTFTPEGTYRAAAEKLPYLKELGVTAIQVMPLAAFDGQRGWGYDGAAFYAPYAPYGRPEDLMALVDAAHRLGLGVFLDVVYNHFGPSGNYLSSYAPSYFTDRFSSAWGMGLDYAEPHMRRYVTGNARMWLRDYHFDGLRLDATPYMTDDSETHILTELAQEIHELGGTHLLLAEDHRNLPDLVTVNHLDGIWTDDFHHETRVTLTGEQEGYYAGYRGGAEALAYTIRRGWRYEGQFWAVKGEEHERGHPSDALEAPNFVYCIQNHDQIGNRPLGERLHQSDGVTLHEYRGAAALLLTLPMTPLLFQGQEWAASTPFQFFSDHAGELGQAVSEGRKKEFGGFSGFSGEDVPDPQAEQTFLNSKLNWAEREGGEHARTLRLYRDLLRLRREDPVLHNRQRENLTTGHDGDVLWVRTVTGAGERVLLWNLGQDTRAVAEVKLPFTVPRRLLLHTEGREDLTLGAGEAVLVG',\n",
|
262 |
+
" 'dssp3': 'CCCCCCCCCCCCCCCCCCCCHHHCCEEEECHHHCCEEEEEECCCCCCEEEEECCEEEECEEEECCEEEEEECCCCCCEEEEEECCEEECCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCHHHCCEEEECHHHHCCCCCHHHHHHCHHHHHHHCCCEEEECCCEECCCCCCCCCCCCEEEEECHHHCCHHHHHHHHHHHHHCCCEEEEEECCCCCCCCCCCHHHHCHHHEEEEEECCCCEEECCCCHHHHHHHHHHHHHHHHHHCCCEEEECCHHHCCCCCCCCHHHHHHHHHHCCCCCCEEEEECCCCCCHHHHCCCCCEEECCHHHHHHHHHHHCCCCHHHHHCCCCHHHHHHHHHHCCCCEEEEECCCCCCEEEECCCCCCCHHHEEEECCCHHHHHCCCCCCCHHHCCCCCHHHHHHHHHHHHHCCCEEEEECCHHHCCCCCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCCCCHHHHHCCCCCCHHHHCHHHHHHHHHHHHHHHHHHHCCCCCCCCHHHEEEEEECCEEEEEEEECCEEEEEEEECCCCCEEHHHCCCCCCCCCCEEEECCCCCCCEECCCCEEEEC',\n",
|
263 |
+
" 'dssp8': 'CCCCCCCCCCCCCCCCCSCCGGGCSEEEECGGGCCEEEEEECSSCSSEEEEETTEEEECEEEETTEEEEEESCCTTCEEEEEETTEEECCTTCSCCTTCTTSCEECCCTTSSCCCCTTCCCCCGGGCCEEEECHHHHSSSCSHHHHHHTHHHHHHHTCCEEEECCCEECSSSCCCSTTCCEEEEECGGGCCHHHHHHHHHHHHHTTCEEEEEECCSCCCSSSCCHHHHCGGGEEEEEECSSSEEECTTSHHHHHHHHHHHHIIIIIHCCSEEEETTGGGCCCCSSSCHHHHHHHHHHTTCSCCEEEEECSSCCTHHHHTTCCSEEECTHHHHHHHHHHHCCCSGGGGGCCCSHHHHHHHHHHSSSCEEEEECCTTCCEEEECCCTTCCGGGEEEESCCHHHHHTSTTCCCGGGSTTCCHHHHHHHHHHHHHSSSEEEEETTGGGTCSSCCCCCCCCCHHHHHHHHHHHHHHCCCCCCCCCCCCCCTTSHHHHHTTSCCSGGGGSHHHHHHHHHHHHHHHHHHHCTTTTCCCGGGEEEEEETTEEEEEEEETTEEEEEEEECSSSCEEGGGSCCSSCCCCCEEEETTCCSSSEECTTCEEEEC',\n",
|
264 |
+
" 'disorder': '0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0',\n",
|
265 |
+
" 'cb513_mask': '1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0'}"
|
266 |
+
]
|
267 |
+
},
|
268 |
+
"execution_count": 29,
|
269 |
+
"metadata": {},
|
270 |
+
"output_type": "execute_result"
|
271 |
+
}
|
272 |
+
],
|
273 |
+
"source": [
|
274 |
+
"dataset[\"train\"][0]"
|
275 |
+
]
|
276 |
+
},
|
277 |
+
{
|
278 |
+
"cell_type": "code",
|
279 |
+
"execution_count": 30,
|
280 |
+
"id": "47b1ac0c-e934-4ac3-b869-509515b15aa1",
|
281 |
+
"metadata": {},
|
282 |
+
"outputs": [
|
283 |
+
{
|
284 |
+
"name": "stdout",
|
285 |
+
"output_type": "stream",
|
286 |
+
"text": [
|
287 |
+
"dna datasets mean token lenght 96.07685185185186 min token length 7 max token length 576\n"
|
288 |
+
]
|
289 |
+
}
|
290 |
+
],
|
291 |
+
"source": [
|
292 |
+
"token_len_list = []\n",
|
293 |
+
"for item in dataset[\"test\"]:\n",
|
294 |
+
" inputs = tokenizer.tokenize(item[\"input\"])\n",
|
295 |
+
" token_len_list.append( len(inputs) )\n",
|
296 |
+
"\n",
|
297 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
298 |
+
"min_len = min(token_len_list)\n",
|
299 |
+
"max_len = max(token_len_list)\n",
|
300 |
+
"\n",
|
301 |
+
"print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
302 |
+
]
|
303 |
+
},
|
304 |
+
{
|
305 |
+
"cell_type": "code",
|
306 |
+
"execution_count": 31,
|
307 |
+
"id": "1b32de6e-fe08-426e-983e-7dd157c9af62",
|
308 |
+
"metadata": {},
|
309 |
+
"outputs": [
|
310 |
+
{
|
311 |
+
"name": "stdout",
|
312 |
+
"output_type": "stream",
|
313 |
+
"text": [
|
314 |
+
"Number of unique labels: 3\n",
|
315 |
+
"Label to ID mapping: {'C': 0, 'H': 1, 'E': 2, '<pad>': 3}\n"
|
316 |
+
]
|
317 |
+
}
|
318 |
+
],
|
319 |
+
"source": [
|
320 |
+
"from collections import Counter\n",
|
321 |
+
"\n",
|
322 |
+
"# Confirm the number of labels and create a mapping from string labels to integer IDs.\n",
|
323 |
+
"all_labels = [label for item in dataset[\"train\"] for label in item[\"dssp3\"]]\n",
|
324 |
+
"label_counts = Counter(all_labels)\n",
|
325 |
+
"num_labels = len(label_counts)\n",
|
326 |
+
"\n",
|
327 |
+
"# Define a special ID for padding. Make sure this ID is not used by any actual label.\n",
|
328 |
+
"# If you have 3 classes, start with 3 or higher.\n",
|
329 |
+
"pad_token_label_id = num_labels # Assuming no other labels have this ID.\n",
|
330 |
+
"\n",
|
331 |
+
"label_to_id = {label: i for i, (label, _) in enumerate(label_counts.items())}\n",
|
332 |
+
"label_to_id['<pad>'] = pad_token_label_id # Add padding token to the mapping.\n",
|
333 |
+
"id_to_label = {v: k for k, v in label_to_id.items()}\n",
|
334 |
+
"\n",
|
335 |
+
"print(f\"Number of unique labels: {num_labels}\")\n",
|
336 |
+
"print(\"Label to ID mapping:\", label_to_id)"
|
337 |
+
]
|
338 |
+
},
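One caveat about the mapping built above: `Counter` preserves first-appearance order, so which of C/H/E receives id 0 depends on which class happens to occur first in the shuffled training split. A minimal sketch of a deterministic alternative follows; sorting the observed labels is my own choice, not something the original notebook does.

```python
# Deterministic label mapping: sorted labels always get the same ids,
# independent of how the dataset happens to be shuffled.
unique_labels = sorted(set(all_labels))              # e.g. ['C', 'E', 'H']
label_to_id = {label: i for i, label in enumerate(unique_labels)}
pad_token_label_id = len(label_to_id)                # id reserved for padding positions
label_to_id["<pad>"] = pad_token_label_id
id_to_label = {v: k for k, v in label_to_id.items()}

print(label_to_id)  # {'C': 0, 'E': 1, 'H': 2, '<pad>': 3}
```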
|
339 |
+
{
|
340 |
+
"cell_type": "code",
|
341 |
+
"execution_count": 32,
|
342 |
+
"id": "2bd65f47-3325-4357-a896-9a0abf160e8a",
|
343 |
+
"metadata": {},
|
344 |
+
"outputs": [
|
345 |
+
{
|
346 |
+
"name": "stderr",
|
347 |
+
"output_type": "stream",
|
348 |
+
"text": [
|
349 |
+
"Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
|
350 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
351 |
+
]
|
352 |
+
}
|
353 |
+
],
|
354 |
+
"source": [
|
355 |
+
"#set model\n",
|
356 |
+
"#model = AutoModelForTokenClassification.from_pretrained('dnagpt/gene_eng_gpt2_v0', )\n",
|
357 |
+
"model = AutoModelForTokenClassification.from_pretrained(\n",
|
358 |
+
" 'dnagpt/gene_eng_gpt2_v0',\n",
|
359 |
+
" num_labels=num_labels + 1, # Include the padding label in the count.\n",
|
360 |
+
" id2label=id_to_label,\n",
|
361 |
+
" label2id=label_to_id\n",
|
362 |
+
")"
|
363 |
+
]
|
364 |
+
},
|
365 |
+
{
|
366 |
+
"cell_type": "code",
|
367 |
+
"execution_count": 33,
|
368 |
+
"id": "e247ac1e-bcd4-4aaf-9f91-dc939e5abe89",
|
369 |
+
"metadata": {},
|
370 |
+
"outputs": [],
|
371 |
+
"source": [
|
372 |
+
"# 5. Preprocess the data\n",
|
373 |
+
"from transformers import DataCollatorForTokenClassification\n",
|
374 |
+
"import torch\n",
|
375 |
+
"# Define the maximum sequence length based on your model or dataset requirements.\n",
|
376 |
+
"max_seq_length = 128 # Adjust this value as needed.\n",
|
377 |
+
"\n",
|
378 |
+
"def preprocess_function(examples):\n",
|
379 |
+
" tokenized_inputs = tokenizer(\n",
|
380 |
+
" examples[\"input\"], \n",
|
381 |
+
" truncation=True, \n",
|
382 |
+
" padding='max_length', \n",
|
383 |
+
" max_length=max_seq_length,\n",
|
384 |
+
" return_tensors=\"pt\" # Return PyTorch tensors directly.\n",
|
385 |
+
" )\n",
|
386 |
+
" \n",
|
387 |
+
" labels = []\n",
|
388 |
+
" for label in examples['dssp3']:\n",
|
389 |
+
" label_ids = [label_to_id[l] if l in label_to_id else pad_token_label_id for l in label]\n",
|
390 |
+
" # Ensure labels are padded/truncated to the same length as inputs.\n",
|
391 |
+
" if len(label_ids) > max_seq_length:\n",
|
392 |
+
" label_ids = label_ids[:max_seq_length]\n",
|
393 |
+
" else:\n",
|
394 |
+
" label_ids = label_ids + [pad_token_label_id] * (max_seq_length - len(label_ids))\n",
|
395 |
+
" \n",
|
396 |
+
" labels.append(label_ids)\n",
|
397 |
+
" \n",
|
398 |
+
" tokenized_inputs[\"labels\"] = torch.tensor(labels)\n",
|
399 |
+
"\n",
|
400 |
+
" return tokenized_inputs"
|
401 |
+
]
|
402 |
+
},
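Two things are worth checking before training (my own observations, not claims from the notebook): the BPE tokenizer merges several residues into one token, so the number of tokens generally differs from the number of per-residue dssp3 labels, and with `max_seq_length = 128` the longer sequences (the statistics above reported up to 576 tokens) are truncated. A minimal sanity-check sketch, assuming the `tokenizer`, `dataset`, and `max_seq_length` objects defined above:

```python
# Compare residue count, dssp3 label count, and BPE token count for one sample.
sample = dataset["train"][0]
tokens = tokenizer.tokenize(sample["input"])

print("residues     :", len(sample["input"]))
print("dssp3 labels :", len(sample["dssp3"]))
print("BPE tokens   :", len(tokens))
print("kept after truncation:", min(len(tokens), max_seq_length))
```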
|
403 |
+
{
|
404 |
+
"cell_type": "code",
|
405 |
+
"execution_count": 34,
|
406 |
+
"id": "8144d093-e8d3-41ff-ae4f-82aa1f28d689",
|
407 |
+
"metadata": {},
|
408 |
+
"outputs": [
|
409 |
+
{
|
410 |
+
"data": {
|
411 |
+
"application/vnd.jupyter.widget-view+json": {
|
412 |
+
"model_id": "707978d4f8304cada1041f8e794d79b7",
|
413 |
+
"version_major": 2,
|
414 |
+
"version_minor": 0
|
415 |
+
},
|
416 |
+
"text/plain": [
|
417 |
+
"Map: 0%| | 0/9712 [00:00<?, ? examples/s]"
|
418 |
+
]
|
419 |
+
},
|
420 |
+
"metadata": {},
|
421 |
+
"output_type": "display_data"
|
422 |
+
},
|
423 |
+
{
|
424 |
+
"data": {
|
425 |
+
"application/vnd.jupyter.widget-view+json": {
|
426 |
+
"model_id": "7ab7ae3ed05244bab1fe13050aad3764",
|
427 |
+
"version_major": 2,
|
428 |
+
"version_minor": 0
|
429 |
+
},
|
430 |
+
"text/plain": [
|
431 |
+
"Map: 0%| | 0/1080 [00:00<?, ? examples/s]"
|
432 |
+
]
|
433 |
+
},
|
434 |
+
"metadata": {},
|
435 |
+
"output_type": "display_data"
|
436 |
+
}
|
437 |
+
],
|
438 |
+
"source": [
|
439 |
+
"tokenized_datasets = dataset.map(preprocess_function, batched=True)"
|
440 |
+
]
|
441 |
+
},
|
442 |
+
{
|
443 |
+
"cell_type": "code",
|
444 |
+
"execution_count": 35,
|
445 |
+
"id": "de5067da-a010-4e0d-b99b-659ee2d3cf3c",
|
446 |
+
"metadata": {},
|
447 |
+
"outputs": [],
|
448 |
+
"source": [
|
449 |
+
"# Remove columns that are not required by the model.\n",
|
450 |
+
"columns_to_remove = ['input', 'dssp3', 'dssp8', 'disorder', 'cb513_mask']\n",
|
451 |
+
"tokenized_datasets.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"], output_all_columns=True)"
|
452 |
+
]
|
453 |
+
},
|
454 |
+
{
|
455 |
+
"cell_type": "code",
|
456 |
+
"execution_count": 36,
|
457 |
+
"id": "fcce6f4e-9716-4fe4-9250-e201a442bbbc",
|
458 |
+
"metadata": {},
|
459 |
+
"outputs": [],
|
460 |
+
"source": [
|
461 |
+
"# Set up data collator for handling padding during batching.\n",
|
462 |
+
"data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=8, label_pad_token_id=pad_token_label_id)"
|
463 |
+
]
|
464 |
+
},
|
465 |
+
{
|
466 |
+
"cell_type": "code",
|
467 |
+
"execution_count": 37,
|
468 |
+
"id": "fa3e62b3-dba4-4cef-9bb7-de410f4bb444",
|
469 |
+
"metadata": {},
|
470 |
+
"outputs": [
|
471 |
+
{
|
472 |
+
"name": "stderr",
|
473 |
+
"output_type": "stream",
|
474 |
+
"text": [
|
475 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
476 |
+
" warnings.warn(\n",
|
477 |
+
"/tmp/ipykernel_1443/204012889.py:41: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
|
478 |
+
" trainer = Trainer(\n"
|
479 |
+
]
|
480 |
+
}
|
481 |
+
],
|
482 |
+
"source": [
|
483 |
+
"# 6. Prepare training\n",
|
484 |
+
"import evaluate\n",
|
485 |
+
"import numpy as np\n",
|
486 |
+
"\n",
|
487 |
+
"metric = evaluate.load(\"seqeval\")\n",
|
488 |
+
"\n",
|
489 |
+
"def compute_metrics(p):\n",
|
490 |
+
" predictions, labels = p\n",
|
491 |
+
" predictions = np.argmax(predictions, axis=2)\n",
|
492 |
+
"\n",
|
493 |
+
" # Remove ignored index (special tokens)\n",
|
494 |
+
" true_predictions = [\n",
|
495 |
+
" [id_to_label[p] for (p, l) in zip(prediction, label) if l != pad_token_label_id]\n",
|
496 |
+
" for prediction, label in zip(predictions, labels)\n",
|
497 |
+
" ]\n",
|
498 |
+
" true_labels = [\n",
|
499 |
+
" [id_to_label[l] for (p, l) in zip(prediction, label) if l != pad_token_label_id]\n",
|
500 |
+
" for prediction, label in zip(predictions, labels)\n",
|
501 |
+
" ]\n",
|
502 |
+
"\n",
|
503 |
+
" results = metric.compute(predictions=true_predictions, references=true_labels)\n",
|
504 |
+
" return {\n",
|
505 |
+
" \"precision\": results[\"overall_precision\"],\n",
|
506 |
+
" \"recall\": results[\"overall_recall\"],\n",
|
507 |
+
" \"f1\": results[\"overall_f1\"],\n",
|
508 |
+
" \"accuracy\": results[\"overall_accuracy\"],\n",
|
509 |
+
" }\n",
|
510 |
+
" \n",
|
511 |
+
"training_args = TrainingArguments(\n",
|
512 |
+
" output_dir=\"./results\",\n",
|
513 |
+
" evaluation_strategy=\"epoch\",\n",
|
514 |
+
" learning_rate=1e-5,\n",
|
515 |
+
" lr_scheduler_type=\"constant_with_warmup\",\n",
|
516 |
+
" optim='adamw_torch',\n",
|
517 |
+
" per_device_train_batch_size=16,\n",
|
518 |
+
" per_device_eval_batch_size=16,\n",
|
519 |
+
" num_train_epochs=20,\n",
|
520 |
+
" weight_decay=0.01,\n",
|
521 |
+
")\n",
|
522 |
+
"\n",
|
523 |
+
"trainer = Trainer(\n",
|
524 |
+
" model=model,\n",
|
525 |
+
" args=training_args,\n",
|
526 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
527 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
528 |
+
" tokenizer=tokenizer,\n",
|
529 |
+
" data_collator=DataCollatorWithPadding(tokenizer=tokenizer),\n",
|
530 |
+
" compute_metrics=compute_metrics,\n",
|
531 |
+
")"
|
532 |
+
]
|
533 |
+
},
|
534 |
+
{
|
535 |
+
"cell_type": "code",
|
536 |
+
"execution_count": 38,
|
537 |
+
"id": "8a76f326-1097-47bb-bb9d-03b77c4f8f4f",
|
538 |
+
"metadata": {},
|
539 |
+
"outputs": [
|
540 |
+
{
|
541 |
+
"data": {
|
542 |
+
"text/html": [
|
543 |
+
"\n",
|
544 |
+
" <div>\n",
|
545 |
+
" \n",
|
546 |
+
" <progress value='8001' max='12140' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
547 |
+
" [ 8001/12140 09:41 < 05:00, 13.76 it/s, Epoch 13.18/20]\n",
|
548 |
+
" </div>\n",
|
549 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
550 |
+
" <thead>\n",
|
551 |
+
" <tr style=\"text-align: left;\">\n",
|
552 |
+
" <th>Epoch</th>\n",
|
553 |
+
" <th>Training Loss</th>\n",
|
554 |
+
" <th>Validation Loss</th>\n",
|
555 |
+
" <th>Precision</th>\n",
|
556 |
+
" <th>Recall</th>\n",
|
557 |
+
" <th>F1</th>\n",
|
558 |
+
" <th>Accuracy</th>\n",
|
559 |
+
" </tr>\n",
|
560 |
+
" </thead>\n",
|
561 |
+
" <tbody>\n",
|
562 |
+
" <tr>\n",
|
563 |
+
" <td>1</td>\n",
|
564 |
+
" <td>1.102200</td>\n",
|
565 |
+
" <td>0.923186</td>\n",
|
566 |
+
" <td>0.314843</td>\n",
|
567 |
+
" <td>0.125521</td>\n",
|
568 |
+
" <td>0.179485</td>\n",
|
569 |
+
" <td>0.503214</td>\n",
|
570 |
+
" </tr>\n",
|
571 |
+
" <tr>\n",
|
572 |
+
" <td>2</td>\n",
|
573 |
+
" <td>0.942100</td>\n",
|
574 |
+
" <td>0.883362</td>\n",
|
575 |
+
" <td>0.357413</td>\n",
|
576 |
+
" <td>0.153466</td>\n",
|
577 |
+
" <td>0.214731</td>\n",
|
578 |
+
" <td>0.521366</td>\n",
|
579 |
+
" </tr>\n",
|
580 |
+
" <tr>\n",
|
581 |
+
" <td>3</td>\n",
|
582 |
+
" <td>0.898500</td>\n",
|
583 |
+
" <td>0.895442</td>\n",
|
584 |
+
" <td>0.355443</td>\n",
|
585 |
+
" <td>0.194234</td>\n",
|
586 |
+
" <td>0.251199</td>\n",
|
587 |
+
" <td>0.522545</td>\n",
|
588 |
+
" </tr>\n",
|
589 |
+
" <tr>\n",
|
590 |
+
" <td>4</td>\n",
|
591 |
+
" <td>0.870200</td>\n",
|
592 |
+
" <td>0.891230</td>\n",
|
593 |
+
" <td>0.367170</td>\n",
|
594 |
+
" <td>0.050731</td>\n",
|
595 |
+
" <td>0.089145</td>\n",
|
596 |
+
" <td>0.526761</td>\n",
|
597 |
+
" </tr>\n",
|
598 |
+
" <tr>\n",
|
599 |
+
" <td>5</td>\n",
|
600 |
+
" <td>0.831900</td>\n",
|
601 |
+
" <td>0.890030</td>\n",
|
602 |
+
" <td>0.373252</td>\n",
|
603 |
+
" <td>0.197096</td>\n",
|
604 |
+
" <td>0.257971</td>\n",
|
605 |
+
" <td>0.530358</td>\n",
|
606 |
+
" </tr>\n",
|
607 |
+
" <tr>\n",
|
608 |
+
" <td>6</td>\n",
|
609 |
+
" <td>0.815800</td>\n",
|
610 |
+
" <td>0.867876</td>\n",
|
611 |
+
" <td>0.378696</td>\n",
|
612 |
+
" <td>0.236628</td>\n",
|
613 |
+
" <td>0.291262</td>\n",
|
614 |
+
" <td>0.540153</td>\n",
|
615 |
+
" </tr>\n",
|
616 |
+
" <tr>\n",
|
617 |
+
" <td>7</td>\n",
|
618 |
+
" <td>0.800900</td>\n",
|
619 |
+
" <td>0.873521</td>\n",
|
620 |
+
" <td>0.380925</td>\n",
|
621 |
+
" <td>0.212640</td>\n",
|
622 |
+
" <td>0.272927</td>\n",
|
623 |
+
" <td>0.544393</td>\n",
|
624 |
+
" </tr>\n",
|
625 |
+
" <tr>\n",
|
626 |
+
" <td>8</td>\n",
|
627 |
+
" <td>0.785100</td>\n",
|
628 |
+
" <td>0.872138</td>\n",
|
629 |
+
" <td>0.385372</td>\n",
|
630 |
+
" <td>0.156363</td>\n",
|
631 |
+
" <td>0.222462</td>\n",
|
632 |
+
" <td>0.547684</td>\n",
|
633 |
+
" </tr>\n",
|
634 |
+
" <tr>\n",
|
635 |
+
" <td>9</td>\n",
|
636 |
+
" <td>0.774100</td>\n",
|
637 |
+
" <td>0.885855</td>\n",
|
638 |
+
" <td>0.384813</td>\n",
|
639 |
+
" <td>0.180280</td>\n",
|
640 |
+
" <td>0.245531</td>\n",
|
641 |
+
" <td>0.549681</td>\n",
|
642 |
+
" </tr>\n",
|
643 |
+
" <tr>\n",
|
644 |
+
" <td>10</td>\n",
|
645 |
+
" <td>0.750800</td>\n",
|
646 |
+
" <td>0.884582</td>\n",
|
647 |
+
" <td>0.388464</td>\n",
|
648 |
+
" <td>0.206529</td>\n",
|
649 |
+
" <td>0.269681</td>\n",
|
650 |
+
" <td>0.555933</td>\n",
|
651 |
+
" </tr>\n",
|
652 |
+
" <tr>\n",
|
653 |
+
" <td>11</td>\n",
|
654 |
+
" <td>0.737500</td>\n",
|
655 |
+
" <td>0.886323</td>\n",
|
656 |
+
" <td>0.396929</td>\n",
|
657 |
+
" <td>0.202713</td>\n",
|
658 |
+
" <td>0.268369</td>\n",
|
659 |
+
" <td>0.557624</td>\n",
|
660 |
+
" </tr>\n",
|
661 |
+
" <tr>\n",
|
662 |
+
" <td>12</td>\n",
|
663 |
+
" <td>0.731000</td>\n",
|
664 |
+
" <td>0.878285</td>\n",
|
665 |
+
" <td>0.365956</td>\n",
|
666 |
+
" <td>0.315728</td>\n",
|
667 |
+
" <td>0.338991</td>\n",
|
668 |
+
" <td>0.555857</td>\n",
|
669 |
+
" </tr>\n",
|
670 |
+
" <tr>\n",
|
671 |
+
" <td>13</td>\n",
|
672 |
+
" <td>0.708900</td>\n",
|
673 |
+
" <td>0.912278</td>\n",
|
674 |
+
" <td>0.377030</td>\n",
|
675 |
+
" <td>0.249346</td>\n",
|
676 |
+
" <td>0.300174</td>\n",
|
677 |
+
" <td>0.555030</td>\n",
|
678 |
+
" </tr>\n",
|
679 |
+
" </tbody>\n",
|
680 |
+
"</table><p>"
|
681 |
+
],
|
682 |
+
"text/plain": [
|
683 |
+
"<IPython.core.display.HTML object>"
|
684 |
+
]
|
685 |
+
},
|
686 |
+
"metadata": {},
|
687 |
+
"output_type": "display_data"
|
688 |
+
},
|
689 |
+
{
|
690 |
+
"name": "stderr",
|
691 |
+
"output_type": "stream",
|
692 |
+
"text": [
|
693 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
694 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
695 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
696 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
697 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
698 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
699 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
700 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
701 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
702 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
703 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
704 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
705 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
706 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
707 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
708 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
709 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
710 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
711 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
712 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
713 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
714 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
715 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
716 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
717 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
718 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
719 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
720 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
721 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
722 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
723 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
724 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
725 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
726 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
727 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
728 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
729 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
730 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
731 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
732 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
733 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
734 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
735 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
736 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
737 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
738 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
739 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
740 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
741 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
742 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
743 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
744 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
745 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
746 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
747 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
748 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
749 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
750 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
751 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
752 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
753 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
754 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
755 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
756 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
757 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
758 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
759 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
760 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
761 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
762 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
763 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
764 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
765 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
766 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
767 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
768 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
769 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
770 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
771 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
772 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
773 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
774 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
775 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
776 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
777 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
778 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
779 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
780 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
781 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
782 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
783 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
784 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
785 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
786 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
787 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
788 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n",
|
789 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: C seems not to be NE tag.\n",
|
790 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
791 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: H seems not to be NE tag.\n",
|
792 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
793 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/sequence_labeling.py:171: UserWarning: <pad> seems not to be NE tag.\n",
|
794 |
+
" warnings.warn('{} seems not to be NE tag.'.format(chunk))\n",
|
795 |
+
"/root/miniconda3/lib/python3.12/site-packages/seqeval/metrics/v1.py:57: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n",
|
796 |
+
" _warn_prf(average, modifier, msg_start, len(result))\n"
|
797 |
+
]
|
798 |
+
},
|
799 |
+
{
|
800 |
+
"ename": "RuntimeError",
|
801 |
+
"evalue": "[enforce fail at inline_container.cc:595] . unexpected pos 1216226560 vs 1216226452",
|
802 |
+
"output_type": "error",
|
803 |
+
"traceback": [
|
804 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
805 |
+
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
806 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:628\u001b[0m, in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _open_zipfile_writer(f) \u001b[38;5;28;01mas\u001b[39;00m opened_zipfile:\n\u001b[0;32m--> 628\u001b[0m \u001b[43m_save\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_protocol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_disable_byteorder_record\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m\n",
|
807 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:862\u001b[0m, in \u001b[0;36m_save\u001b[0;34m(obj, zip_file, pickle_module, pickle_protocol, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 861\u001b[0m num_bytes \u001b[38;5;241m=\u001b[39m storage\u001b[38;5;241m.\u001b[39mnbytes()\n\u001b[0;32m--> 862\u001b[0m \u001b[43mzip_file\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite_record\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnum_bytes\u001b[49m\u001b[43m)\u001b[49m\n",
|
808 |
+
"\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:764] . PytorchStreamWriter failed writing file data/94: file write failed",
|
809 |
+
"\nDuring handling of the above exception, another exception occurred:\n",
|
810 |
+
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
|
811 |
+
"Cell \u001b[0;32mIn[38], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Start training\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
|
812 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:2164\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 2162\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 2163\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 2164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2165\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2166\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2167\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2168\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2169\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
|
813 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:2591\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2589\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mepoch \u001b[38;5;241m=\u001b[39m epoch \u001b[38;5;241m+\u001b[39m (step \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m steps_skipped) \u001b[38;5;241m/\u001b[39m steps_in_epoch\n\u001b[1;32m 2590\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[0;32m-> 2591\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_log_save_evaluate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2592\u001b[0m \u001b[43m \u001b[49m\u001b[43mtr_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgrad_norm\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_time\u001b[49m\n\u001b[1;32m 2593\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2594\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2595\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_substep_end(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n",
|
814 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:3056\u001b[0m, in \u001b[0;36mTrainer._maybe_log_save_evaluate\u001b[0;34m(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time)\u001b[0m\n\u001b[1;32m 3053\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol\u001b[38;5;241m.\u001b[39mshould_save \u001b[38;5;241m=\u001b[39m is_new_best_metric\n\u001b[1;32m 3055\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol\u001b[38;5;241m.\u001b[39mshould_save:\n\u001b[0;32m-> 3056\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_checkpoint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3057\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_save(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n",
|
815 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:3192\u001b[0m, in \u001b[0;36mTrainer._save_checkpoint\u001b[0;34m(self, model, trial)\u001b[0m\n\u001b[1;32m 3188\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msave_model(output_dir, _internal_call\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 3190\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39msave_only_model:\n\u001b[1;32m 3191\u001b[0m \u001b[38;5;66;03m# Save optimizer and scheduler\u001b[39;00m\n\u001b[0;32m-> 3192\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_save_optimizer_and_scheduler\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3193\u001b[0m \u001b[38;5;66;03m# Save RNG state\u001b[39;00m\n\u001b[1;32m 3194\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_save_rng_state(output_dir)\n",
|
816 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/transformers/trainer.py:3313\u001b[0m, in \u001b[0;36mTrainer._save_optimizer_and_scheduler\u001b[0;34m(self, output_dir)\u001b[0m\n\u001b[1;32m 3308\u001b[0m save_fsdp_optimizer(\n\u001b[1;32m 3309\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mfsdp_plugin, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptimizer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel, output_dir\n\u001b[1;32m 3310\u001b[0m )\n\u001b[1;32m 3311\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mshould_save:\n\u001b[1;32m 3312\u001b[0m \u001b[38;5;66;03m# deepspeed.save_checkpoint above saves model/optim/sched\u001b[39;00m\n\u001b[0;32m-> 3313\u001b[0m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstate_dict\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_dir\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mOPTIMIZER_NAME\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3315\u001b[0m \u001b[38;5;66;03m# Save SCHEDULER & SCALER\u001b[39;00m\n\u001b[1;32m 3316\u001b[0m is_deepspeed_custom_scheduler \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mis_deepspeed_enabled \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\n\u001b[1;32m 3317\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlr_scheduler, DeepSpeedSchedulerWrapper\n\u001b[1;32m 3318\u001b[0m )\n",
|
817 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:627\u001b[0m, in \u001b[0;36msave\u001b[0;34m(obj, f, pickle_module, pickle_protocol, _use_new_zipfile_serialization, _disable_byteorder_record)\u001b[0m\n\u001b[1;32m 624\u001b[0m _check_save_filelike(f)\n\u001b[1;32m 626\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _use_new_zipfile_serialization:\n\u001b[0;32m--> 627\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mwith\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43m_open_zipfile_writer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mas\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43m_save\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopened_zipfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpickle_protocol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_disable_byteorder_record\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mreturn\u001b[39;49;00m\n",
|
818 |
+
"File \u001b[0;32m~/miniconda3/lib/python3.12/site-packages/torch/serialization.py:475\u001b[0m, in \u001b[0;36m_open_zipfile_writer_file.__exit__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__exit__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 475\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfile_like\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite_end_of_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_stream \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 477\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfile_stream\u001b[38;5;241m.\u001b[39mclose()\n",
|
819 |
+
"\u001b[0;31mRuntimeError\u001b[0m: [enforce fail at inline_container.cc:595] . unexpected pos 1216226560 vs 1216226452"
|
820 |
+
]
|
821 |
+
}
|
822 |
+
],
|
823 |
+
"source": [
|
824 |
+
"# Start training\n",
|
825 |
+
"trainer.train()"
|
826 |
+
]
|
827 |
+
},
|
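Note: the traceback above ends in `PytorchStreamWriter ... file write failed` while the Trainer was writing the optimizer state (a file of roughly 1.2 GB). That error almost always means the device holding `output_dir` filled up mid-save. Below is a minimal sketch, not part of the original notebook, of how to keep checkpoints small enough to avoid this, using standard `TrainingArguments` fields; the output path is hypothetical. The `save_only_model` flag is the same attribute the traceback itself checks (`self.args.save_only_model`), so it exists in this transformers version.

```python
# Sketch only: shrink checkpoints so a nearly-full disk does not abort training mid-save.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="./secondary_structure_ckpt",  # hypothetical path on a disk with free space
    save_strategy="epoch",
    save_total_limit=1,     # keep only the most recent checkpoint
    save_only_model=True,   # skip optimizer/scheduler state (the large file that failed above)
)

# If an earlier checkpoint was written successfully, training can also be resumed from it:
# trainer.train(resume_from_checkpoint=True)
```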
828 |
+
{
|
829 |
+
"cell_type": "code",
|
830 |
+
"execution_count": 39,
|
831 |
+
"id": "950c460f-5631-4c9a-819b-1e3ac484cc65",
|
832 |
+
"metadata": {},
|
833 |
+
"outputs": [
|
834 |
+
{
|
835 |
+
"data": {
|
836 |
+
"text/plain": [
|
837 |
+
"{'eval_loss': 0.9122781157493591,\n",
|
838 |
+
" 'eval_precision': 0.3770299145299145,\n",
|
839 |
+
" 'eval_recall': 0.2493464283190843,\n",
|
840 |
+
" 'eval_f1': 0.3001743716242079,\n",
|
841 |
+
" 'eval_accuracy': 0.5550300748427384}"
|
842 |
+
]
|
843 |
+
},
|
844 |
+
"execution_count": 39,
|
845 |
+
"metadata": {},
|
846 |
+
"output_type": "execute_result"
|
847 |
+
}
|
848 |
+
],
|
849 |
+
"source": [
|
850 |
+
"results = trainer.evaluate()\n",
|
851 |
+
"results"
|
852 |
+
]
|
853 |
+
},
|
854 |
+
{
|
855 |
+
"cell_type": "code",
|
856 |
+
"execution_count": 40,
|
857 |
+
"id": "8174c1c6-a5bc-4fe3-8f9b-356625531e7d",
|
858 |
+
"metadata": {},
|
859 |
+
"outputs": [
|
860 |
+
{
|
861 |
+
"name": "stdout",
|
862 |
+
"output_type": "stream",
|
863 |
+
"text": [
|
864 |
+
">>> Perplexity: 2.49\n"
|
865 |
+
]
|
866 |
+
}
|
867 |
+
],
|
868 |
+
"source": [
|
869 |
+
"import math\n",
|
870 |
+
"eval_results = trainer.evaluate()\n",
|
871 |
+
"print(f\">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
|
872 |
+
]
|
873 |
+
},
|
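The perplexity printed above is simply the exponential of the mean cross-entropy loss returned by `trainer.evaluate()`:

$$\mathrm{PPL} = \exp\Big(\frac{1}{N}\sum_{i=1}^{N}\mathcal{L}_i\Big) = \exp(\mathrm{eval\_loss})$$

Since this model carries a token-classification head, the value measures uncertainty over the small secondary-structure label set rather than over a vocabulary; exp(0.912) is about 2.49, matching the eval_loss reported two cells above.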
874 |
+
{
|
875 |
+
"cell_type": "code",
|
876 |
+
"execution_count": 41,
|
877 |
+
"id": "6a22f131-9e5f-4125-942a-22d1b1e6373b",
|
878 |
+
"metadata": {},
|
879 |
+
"outputs": [
|
880 |
+
{
|
881 |
+
"data": {
|
882 |
+
"text/plain": [
|
883 |
+
"('./secondary_structure_model/tokenizer_config.json',\n",
|
884 |
+
" './secondary_structure_model/special_tokens_map.json',\n",
|
885 |
+
" './secondary_structure_model/vocab.json',\n",
|
886 |
+
" './secondary_structure_model/merges.txt',\n",
|
887 |
+
" './secondary_structure_model/added_tokens.json')"
|
888 |
+
]
|
889 |
+
},
|
890 |
+
"execution_count": 41,
|
891 |
+
"metadata": {},
|
892 |
+
"output_type": "execute_result"
|
893 |
+
}
|
894 |
+
],
|
895 |
+
"source": [
|
896 |
+
"# 保存模型\n",
|
897 |
+
"model.save_pretrained(\"./secondary_structure_model\")\n",
|
898 |
+
"tokenizer.save_pretrained(\"./secondary_structure_model\")"
|
899 |
+
]
|
900 |
+
},
|
901 |
+
{
|
902 |
+
"cell_type": "code",
|
903 |
+
"execution_count": 42,
|
904 |
+
"id": "d5817a6c-c707-4005-9210-2a12ff0d43b0",
|
905 |
+
"metadata": {},
|
906 |
+
"outputs": [],
|
907 |
+
"source": [
|
908 |
+
"# 加载模型\n",
|
909 |
+
"model = AutoModelForTokenClassification.from_pretrained(\"./secondary_structure_model\")\n",
|
910 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"./secondary_structure_model\")"
|
911 |
+
]
|
912 |
+
},
|
913 |
+
{
|
914 |
+
"cell_type": "code",
|
915 |
+
"execution_count": 43,
|
916 |
+
"id": "2f6ebdc6-8ff8-4947-ada4-05ff4b28e0f3",
|
917 |
+
"metadata": {},
|
918 |
+
"outputs": [],
|
919 |
+
"source": [
|
920 |
+
"# 进行预测\n",
|
921 |
+
"def predict_secondary_structure(sequence):\n",
|
922 |
+
" inputs = tokenizer(sequence, return_tensors=\"pt\", truncation=True, padding=True)\n",
|
923 |
+
" outputs = model(**inputs)\n",
|
924 |
+
" predictions = outputs.logits.argmax(dim=-1)\n",
|
925 |
+
" return predictions"
|
926 |
+
]
|
927 |
+
},
|
928 |
+
{
|
929 |
+
"cell_type": "code",
|
930 |
+
"execution_count": 44,
|
931 |
+
"id": "841ebba8-7619-411f-a11e-841de3a3f064",
|
932 |
+
"metadata": {},
|
933 |
+
"outputs": [
|
934 |
+
{
|
935 |
+
"name": "stderr",
|
936 |
+
"output_type": "stream",
|
937 |
+
"text": [
|
938 |
+
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
|
939 |
+
]
|
940 |
+
},
|
941 |
+
{
|
942 |
+
"name": "stdout",
|
943 |
+
"output_type": "stream",
|
944 |
+
"text": [
|
945 |
+
"tensor([[0, 0, 0, 0, 2, 2, 2, 2]])\n"
|
946 |
+
]
|
947 |
+
}
|
948 |
+
],
|
949 |
+
"source": [
|
950 |
+
"# 示例预测\n",
|
951 |
+
"sequence = \"ACDEFGHIKLMNPQRSTVWY\"\n",
|
952 |
+
"predictions = predict_secondary_structure(sequence)\n",
|
953 |
+
"print(predictions)"
|
954 |
+
]
|
955 |
+
},
|
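The example above prints raw class indices. Here is a minimal sketch, assuming `model.config.id2label` was populated when the classification head was created, that maps the indices back to tag names; if the config still holds default LABEL_0-style names, substitute your own index-to-tag mapping such as {0: "H", 1: "E", 2: "C"}.

```python
# Sketch only: map predicted label ids back to secondary-structure tags.
import torch

def predict_tags(sequence):
    inputs = tokenizer(sequence, return_tensors="pt", truncation=True)
    with torch.no_grad():                       # inference only, no gradients needed
        logits = model(**inputs).logits         # shape: (1, seq_len, num_labels)
    ids = logits.argmax(dim=-1)[0].tolist()     # best label index per token
    return [model.config.id2label[i] for i in ids]

print(predict_tags("ACDEFGHIKLMNPQRSTVWY"))
```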
956 |
{
|
957 |
"cell_type": "code",
|
958 |
"execution_count": null,
|
959 |
+
"id": "37e7d22e-0545-422b-b8ba-7990ca127d8a",
|
960 |
"metadata": {},
|
961 |
"outputs": [],
|
962 |
"source": []
|
03-gene-task/3-multi-seq-task.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
03-gene-task/4-fun-predict.ipynb
CHANGED
@@ -1,9 +1,763 @@
|
|
1 |
{
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": null,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": []
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "ce0fa061-3f49-46c3-ba5c-8dcca7d283d3",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 3.4 功能预测任务"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "e0fe3429-dbff-4e83-827a-2e31da60dfc3",
|
14 |
+
"metadata": {
|
15 |
+
"jp-MarkdownHeadingCollapsed": true
|
16 |
+
},
|
17 |
+
"source": [
|
18 |
+
"## 数据准备\n",
|
19 |
+
"\n",
|
20 |
+
"获得蛋白质序列及其对应的功能描述是生物信息学中的一个重要任务,通常涉及从公共数据库中检索数据或通过实验方法确定。以下是几种常用的方法和资源,帮助你获取蛋白质序列和功能描述:\n",
|
21 |
+
"\n",
|
22 |
+
"### 1. **使用公共数据库**\n",
|
23 |
+
"\n",
|
24 |
+
"#### a. **UniProt (Universal Protein Resource)**\n",
|
25 |
+
"\n",
|
26 |
+
"- **网址**:[UniProt](https://www.uniprot.org/)\n",
|
27 |
+
"- **特点**:UniProt 是一个综合性的蛋白质数据库,提供了丰富的注释信息,包括蛋白质序列、结构、功能、亚细胞定位等。\n",
|
28 |
+
"- **使用方法**:\n",
|
29 |
+
" - 在搜索栏中输入蛋白质名称、基因名称或序列 ID。\n",
|
30 |
+
" - 浏览结果页面以查看详细的注释信息,包括功能描述、GO(Gene Ontology)术语、文献引用等。\n",
|
31 |
+
"\n",
|
32 |
+
"\n",
|
33 |
+
"<img src=\"img/function.png\" width=\"500px\" />\n",
|
34 |
+
"\n",
|
35 |
+
"<img src=\"img/sequence.png\" width=\"500px\" />\n",
|
36 |
+
"\n",
|
37 |
+
"#### b. **NCBI (National Center for Biotechnology Information)**\n",
|
38 |
+
"\n",
|
39 |
+
"- **网址**:[NCBI](https://www.ncbi.nlm.nih.gov/)\n",
|
40 |
+
"- **特点**:NCBI 提供多个相关数据库,如 GenBank、RefSeq 和 Protein 数据库,涵盖广泛的生物物种和蛋白质信息。\n",
|
41 |
+
"- **使用方法**:\n",
|
42 |
+
" - 使用 NCBI 的搜索工具 Entrez 或 BLAST 搜索蛋白质序列或功能描述。\n",
|
43 |
+
" - 访问特定的蛋白质条目以获取详细信息,包括序列、功能、参考文献等。\n",
|
44 |
+
"\n",
|
45 |
+
"#### c. **PDB (Protein Data Bank)**\n",
|
46 |
+
"\n",
|
47 |
+
"- **网址**:[PDB](https://www.rcsb.org/)\n",
|
48 |
+
"- **特点**:PDB 主要包含蛋白质的三维结构信息,但也提供相关的功能描述和文献引用。\n",
|
49 |
+
"- **使用方法**:\n",
|
50 |
+
" - 使用 PDB 的搜索功能查找特定蛋白质的结构信息。\n",
|
51 |
+
" - 查看每个条目的详细页面以获取功能描述和其他相关信息。\n",
|
52 |
+
"\n",
|
53 |
+
"#### d. **Ensembl**\n",
|
54 |
+
"\n",
|
55 |
+
"- **网址**:[Ensembl](https://www.ensembl.org/)\n",
|
56 |
+
"- **特点**:Ensembl 提供基因组浏览器和注释信息,涵盖了多种物种的基因和蛋白质数据。\n",
|
57 |
+
"- **使用方法**:\n",
|
58 |
+
" - 使用 Ensembl 的搜索功能查找特定蛋白质或基因。\n",
|
59 |
+
" - 浏览条目页面以获取详细的注释信息,包括功能描述、GO 术语等。\n",
|
60 |
+
"\n",
|
61 |
+
"### 2. **通过生物信息学工具**\n",
|
62 |
+
"\n",
|
63 |
+
"#### a. **BLAST (Basic Local Alignment Search Tool)**\n",
|
64 |
+
"\n",
|
65 |
+
"- **网址**:[BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi)\n",
|
66 |
+
"- **特点**:BLAST 是一种常用的比对工具,可以帮助你根据已知的蛋白质序列找到相似的序列,并获取其功能描述。\n",
|
67 |
+
"- **使用方法**:\n",
|
68 |
+
" - 输入你的蛋白质序列。\n",
|
69 |
+
" - 选择适当的数据库(如 NR、Swiss-Prot 等)进行比对。\n",
|
70 |
+
" - 分析比对结果,查看相似序列的功能描述。\n",
|
71 |
+
"\n",
|
72 |
+
"#### b. **InterProScan**\n",
|
73 |
+
"\n",
|
74 |
+
"- **网址**:[InterPro](https://www.ebi.ac.uk/interpro/)\n",
|
75 |
+
"- **特点**:InterProScan 是一种用于识别蛋白质家族、结构域和重要位点的工具,可以提供详细的注释信息。\n",
|
76 |
+
"- **使用方法**:\n",
|
77 |
+
" - 输入你的蛋白质序列。\n",
|
78 |
+
" - 运行 InterProScan 分析,获取功能描述、结构域信息等。\n",
|
79 |
+
"\n",
|
80 |
+
"### 3. **通过文献和出版物**\n",
|
81 |
+
"\n",
|
82 |
+
"#### a. **PubMed**\n",
|
83 |
+
"\n",
|
84 |
+
"- **网址**:[PubMed](https://pubmed.ncbi.nlm.nih.gov/)\n",
|
85 |
+
"- **特点**:PubMed 是一个广泛使用的生物医学文献数据库,提供了大量关于蛋白质功能的研究论文。\n",
|
86 |
+
"- **使用方法**:\n",
|
87 |
+
" - 使用关键词搜索与特定蛋白质相关的研究论文。\n",
|
88 |
+
" - 阅读论文以获取详细的实验数据和功能描述。\n",
|
89 |
+
"\n",
|
90 |
+
"#### b. **Google Scholar**\n",
|
91 |
+
"\n",
|
92 |
+
"- **网址**:[Google Scholar](https://scholar.google.com/)\n",
|
93 |
+
"- **特点**:Google Scholar 是一个学术搜索引擎,涵盖广泛的科学文献。\n",
|
94 |
+
"- **使用方法**:\n",
|
95 |
+
" - 使用关键词搜索与特定蛋白质相关的研究论文。\n",
|
96 |
+
" - 阅读论文以获取详细的实验数据和功能描述。"
|
97 |
+
]
|
98 |
+
},
|
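Besides the web interfaces listed above, UniProt entries can also be pulled programmatically. The sketch below is illustrative only: it assumes UniProt's public REST endpoint rest.uniprot.org/uniprotkb/{accession}.json and its current JSON layout (sequence under `sequence.value`, function text in the `comments` list with `commentType == "FUNCTION"`); field names may change over time.

```python
# Illustrative sketch: fetch one UniProt entry and extract its sequence and FUNCTION comment.
import requests

def fetch_uniprot(accession):
    url = f"https://rest.uniprot.org/uniprotkb/{accession}.json"
    entry = requests.get(url, timeout=30).json()
    sequence = entry["sequence"]["value"]
    function = next(
        (c["texts"][0]["value"]
         for c in entry.get("comments", [])
         if c.get("commentType") == "FUNCTION"),
        None,
    )
    return sequence, function

seq, func = fetch_uniprot("A1TFU9")  # accession taken from the sample record shown later in this notebook
print(seq[:60], "...")
print(func)
```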
99 |
+
{
|
100 |
+
"cell_type": "markdown",
|
101 |
+
"id": "e794b698-ef26-4695-9b77-ff7314210e8b",
|
102 |
+
"metadata": {},
|
103 |
+
"source": [
|
104 |
+
"## 整理好的数据\n",
|
105 |
+
"\n",
|
106 |
+
"https://huggingface.co/datasets/PharMolix/MutaDescribe\n",
|
107 |
+
"\n",
|
108 |
+
"<img src=\"img/dataset.png\" width=\"500px\" />\n",
|
109 |
+
"\n",
|
110 |
+
"https://huggingface.co/datasets/jonghyunlee/UniProt_function_text_descriptions?row=2"
|
111 |
+
]
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"cell_type": "code",
|
115 |
+
"execution_count": 1,
|
116 |
+
"id": "e6c59f74-877a-4a74-9017-e61eb713e285",
|
117 |
+
"metadata": {},
|
118 |
+
"outputs": [
|
119 |
+
{
|
120 |
+
"data": {
|
121 |
+
"text/plain": [
|
122 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
123 |
+
]
|
124 |
+
},
|
125 |
+
"execution_count": 1,
|
126 |
+
"metadata": {},
|
127 |
+
"output_type": "execute_result"
|
128 |
+
}
|
129 |
+
],
|
130 |
+
"source": [
|
131 |
+
"import subprocess\n",
|
132 |
+
"import os\n",
|
133 |
+
"# 设置环境变量, autodl一般区域\n",
|
134 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
135 |
+
"output = result.stdout\n",
|
136 |
+
"for line in output.splitlines():\n",
|
137 |
+
" if '=' in line:\n",
|
138 |
+
" var, value = line.split('=', 1)\n",
|
139 |
+
" os.environ[var] = value\n",
|
140 |
+
"\n",
|
141 |
+
"\"\"\"\n",
|
142 |
+
"import os\n",
|
143 |
+
"\n",
|
144 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
145 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
146 |
+
"\n",
|
147 |
+
"# 打印环境变量以确认设置成功\n",
|
148 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
149 |
+
"\"\"\""
|
150 |
+
]
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"cell_type": "code",
|
154 |
+
"execution_count": 2,
|
155 |
+
"id": "f038b5f3-b2a5-45bd-b66a-0475b1f2c026",
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [],
|
158 |
+
"source": [
|
159 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
160 |
+
"from tokenizers import Tokenizer\n",
|
161 |
+
"from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
|
162 |
+
"from transformers import AutoModelForSeq2SeqLM \n",
|
163 |
+
"from transformers import DataCollatorWithPadding"
|
164 |
+
]
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"cell_type": "code",
|
168 |
+
"execution_count": 3,
|
169 |
+
"id": "7c861666-010e-46d6-aaf0-c63e52920d99",
|
170 |
+
"metadata": {},
|
171 |
+
"outputs": [],
|
172 |
+
"source": [
|
173 |
+
"#set tokenizer,dna protein \n",
|
174 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
|
175 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
176 |
+
]
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"cell_type": "code",
|
180 |
+
"execution_count": 4,
|
181 |
+
"id": "32532c2d-962d-4aa0-a823-87d6c62a411f",
|
182 |
+
"metadata": {},
|
183 |
+
"outputs": [],
|
184 |
+
"source": [
|
185 |
+
"from datasets import load_dataset\n",
|
186 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
187 |
+
"dataset = load_dataset(\"jonghyunlee/UniProt_function_text_descriptions\")['train'].select(range(5000)).train_test_split(test_size=0.05)"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"cell_type": "code",
|
192 |
+
"execution_count": 5,
|
193 |
+
"id": "3829dc86-1274-440b-aa2b-e100f112e9bf",
|
194 |
+
"metadata": {},
|
195 |
+
"outputs": [
|
196 |
+
{
|
197 |
+
"data": {
|
198 |
+
"text/plain": [
|
199 |
+
"DatasetDict({\n",
|
200 |
+
" train: Dataset({\n",
|
201 |
+
" features: ['entry', 'entry_name', 'protein_name', 'sequence', 'function'],\n",
|
202 |
+
" num_rows: 4750\n",
|
203 |
+
" })\n",
|
204 |
+
" test: Dataset({\n",
|
205 |
+
" features: ['entry', 'entry_name', 'protein_name', 'sequence', 'function'],\n",
|
206 |
+
" num_rows: 250\n",
|
207 |
+
" })\n",
|
208 |
+
"})"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
"execution_count": 5,
|
212 |
+
"metadata": {},
|
213 |
+
"output_type": "execute_result"
|
214 |
+
}
|
215 |
+
],
|
216 |
+
"source": [
|
217 |
+
"dataset"
|
218 |
+
]
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"cell_type": "code",
|
222 |
+
"execution_count": 6,
|
223 |
+
"id": "6cb4a847-8fd1-4cdd-a905-e0595250c712",
|
224 |
+
"metadata": {},
|
225 |
+
"outputs": [
|
226 |
+
{
|
227 |
+
"data": {
|
228 |
+
"text/plain": [
|
229 |
+
"{'entry': 'A1TFU9',\n",
|
230 |
+
" 'entry_name': 'HPXO_MYCVP',\n",
|
231 |
+
" 'protein_name': 'FAD-dependent urate hydroxylase (EC 1.14.13.113) (Flavoprotein urate hydroxylase)',\n",
|
232 |
+
" 'sequence': 'MKVVIVGAGMGGMSAAIALRQIGIDTVVYERVTENKPVGAAISVWSNGVKCLNYLGLQEETAELGGKVETMSYVDGHTGDTMCRFSMHPLIEQVGQRPYPIARAELQLMLMKAYGIDDINFGMKMVGVENDTAGSAAKATFADGTTVSADVIIGADGAGSITREYVLGGPVSRRYAGYVNYNGLVSTDDAIGPATEWTTYVGDGKRVSVMPVSDDRFYFFFDVVEPQGSPYEEGRVREVLRAHFAGWTPGVQTLIDTLDPLATNRVEILDLDPFHTWVKGRVAVLGDAAHNTTPDIGQGGCSAMEDAIALQWAFKDHPDDVHAALAAYQSARTERAADLVLRARKRCDVTHAKDPQVTSRWYDELRNEDGTNIIRGIVGNIVGGPLTPVTAATEG',\n",
|
233 |
+
" 'function': 'Catalyzes the hydroxylation of urate to 5-hydroxyisourate (HIU). Is likely to be involved in the urate degradation pathway to allantoin. Prefers NADH over NADPH as the electron donor. '}"
|
234 |
+
]
|
235 |
+
},
|
236 |
+
"execution_count": 6,
|
237 |
+
"metadata": {},
|
238 |
+
"output_type": "execute_result"
|
239 |
+
}
|
240 |
+
],
|
241 |
+
"source": [
|
242 |
+
"dataset[\"train\"][0]"
|
243 |
+
]
|
244 |
+
},
|
245 |
+
{
|
246 |
+
"cell_type": "code",
|
247 |
+
"execution_count": 10,
|
248 |
+
"id": "2eb330e7-28ea-46f1-b9bb-093352e1c5d8",
|
249 |
+
"metadata": {},
|
250 |
+
"outputs": [
|
251 |
+
{
|
252 |
+
"name": "stdout",
|
253 |
+
"output_type": "stream",
|
254 |
+
"text": [
|
255 |
+
"dna datasets mean token lenght 269.515 min token length 24 max token length 4577\n"
|
256 |
+
]
|
257 |
+
}
|
258 |
+
],
|
259 |
+
"source": [
|
260 |
+
"token_len_list = []\n",
|
261 |
+
"for item in dataset[\"test\"].select(range(200)):\n",
|
262 |
+
" inputs = tokenizer.tokenize(item[\"sequence\"])\n",
|
263 |
+
" token_len_list.append( len(inputs) )\n",
|
264 |
+
"\n",
|
265 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
266 |
+
"min_len = min(token_len_list)\n",
|
267 |
+
"max_len = max(token_len_list)\n",
|
268 |
+
"\n",
|
269 |
+
"print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
270 |
+
]
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"cell_type": "code",
|
274 |
+
"execution_count": 11,
|
275 |
+
"id": "f904a4ad-6fe1-4588-b5fd-5541e26a9bfd",
|
276 |
+
"metadata": {},
|
277 |
+
"outputs": [
|
278 |
+
{
|
279 |
+
"name": "stdout",
|
280 |
+
"output_type": "stream",
|
281 |
+
"text": [
|
282 |
+
"dna datasets mean token lenght 271.02 min token length 23 max token length 1934\n"
|
283 |
+
]
|
284 |
+
}
|
285 |
+
],
|
286 |
+
"source": [
|
287 |
+
"token_len_list = []\n",
|
288 |
+
"for item in dataset[\"test\"].select(range(50)):\n",
|
289 |
+
" inputs = tokenizer.tokenize(item[\"function\"])\n",
|
290 |
+
" token_len_list.append( len(inputs) )\n",
|
291 |
+
"\n",
|
292 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
293 |
+
"min_len = min(token_len_list)\n",
|
294 |
+
"max_len = max(token_len_list)\n",
|
295 |
+
"\n",
|
296 |
+
"print(\"dna datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
297 |
+
]
|
298 |
+
},
|
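The two cells above show that the protein sequences average roughly 270 tokens, while the preprocessing in the next cell caps inputs at 128 tokens. A quick, illustrative check (reusing the `tokenizer` and `dataset` objects defined in earlier cells) of how many sequences a candidate max_length would truncate:

```python
# Illustrative: count how many test sequences would be cut off at a candidate max_length.
candidate_max_length = 128
lengths = [len(tokenizer.tokenize(item["sequence"])) for item in dataset["test"].select(range(200))]
truncated = sum(length > candidate_max_length for length in lengths)
print(f"{truncated}/{len(lengths)} sequences exceed {candidate_max_length} tokens")
```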
299 |
+
{
|
300 |
+
"cell_type": "code",
|
301 |
+
"execution_count": 6,
|
302 |
+
"id": "73ea434d-6262-4d47-a6e9-3ffb136d8cd0",
|
303 |
+
"metadata": {},
|
304 |
+
"outputs": [
|
305 |
+
{
|
306 |
+
"data": {
|
307 |
+
"application/vnd.jupyter.widget-view+json": {
|
308 |
+
"model_id": "32b5e6b926f24df282da36e7f14cc9ba",
|
309 |
+
"version_major": 2,
|
310 |
+
"version_minor": 0
|
311 |
+
},
|
312 |
+
"text/plain": [
|
313 |
+
"Map: 0%| | 0/4750 [00:00<?, ? examples/s]"
|
314 |
+
]
|
315 |
+
},
|
316 |
+
"metadata": {},
|
317 |
+
"output_type": "display_data"
|
318 |
+
},
|
319 |
+
{
|
320 |
+
"data": {
|
321 |
+
"application/vnd.jupyter.widget-view+json": {
|
322 |
+
"model_id": "6082815e684f4469bee48ada6a049998",
|
323 |
+
"version_major": 2,
|
324 |
+
"version_minor": 0
|
325 |
+
},
|
326 |
+
"text/plain": [
|
327 |
+
"Map: 0%| | 0/250 [00:00<?, ? examples/s]"
|
328 |
+
]
|
329 |
+
},
|
330 |
+
"metadata": {},
|
331 |
+
"output_type": "display_data"
|
332 |
+
}
|
333 |
+
],
|
334 |
+
"source": [
|
335 |
+
"max_length = 128\n",
|
336 |
+
"\n",
|
337 |
+
"def preprocess_function(examples):\n",
|
338 |
+
" # 直接从 examples 中提取字段\n",
|
339 |
+
" inputs = examples[\"sequence\"] # 获取所有样本的 \"sequence\"\n",
|
340 |
+
" targets = examples[\"function\"] # 获取所有样本的 \"function\"\n",
|
341 |
+
"\n",
|
342 |
+
" # 对数据进行编码\n",
|
343 |
+
" model_inputs = tokenizer(\n",
|
344 |
+
" inputs, text_target=targets, max_length=max_length, truncation=True\n",
|
345 |
+
" )\n",
|
346 |
+
" return model_inputs\n",
|
347 |
+
"\n",
|
348 |
+
"\n",
|
349 |
+
"# 应用分词\n",
|
350 |
+
"tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset[\"train\"].column_names,)\n"
|
351 |
+
]
|
352 |
+
},
|
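The mapped examples have variable-length `input_ids` and `labels`, so batching needs a collator that pads both; `DataCollatorWithPadding`, imported earlier, only pads the inputs. Below is a minimal sketch using the standard `DataCollatorForSeq2Seq`, under the assumption that a sequence-to-sequence model has been loaded as `model` (for example via the `AutoModelForSeq2SeqLM` import above).

```python
# Sketch only: pad input_ids and labels together; -100 label padding is ignored by the loss.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

batch = data_collator([tokenized_datasets["train"][i] for i in range(4)])
print(batch["input_ids"].shape, batch["labels"].shape)
```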
353 |
+
{
|
354 |
+
"cell_type": "code",
|
355 |
+
"execution_count": 10,
|
356 |
+
"id": "36c17576-cf54-4bfd-8d98-fe41a4b7d2cc",
|
357 |
+
"metadata": {},
|
358 |
+
"outputs": [
|
359 |
+
{
|
360 |
+
"name": "stdout",
|
361 |
+
"output_type": "stream",
|
362 |
+
"text": [
|
363 |
+
"{'input_ids': [9206, 609, 532, 2065, 487, 1241, 50, 1785, 4070, 827, 28817, 3840, 1105, 8309, 2993, 3449, 47036, 22588, 5215, 636, 4189, 12265, 3721, 7075, 69183, 3040, 814, 1209, 1910, 217, 474, 13943, 15033, 535, 558, 51164, 12333, 56886, 1174, 338, 20934, 9865, 46, 1131, 3021, 336, 11005, 20318, 748, 396, 46, 38, 46, 54, 12036, 482, 4807, 284, 13333, 87969, 1482, 618, 371, 46, 49, 29703, 46, 5669, 55496, 40, 2682, 2186, 84535, 471, 12020, 280, 1751, 46, 545, 3968, 1660, 354, 1309, 84775, 328, 3802, 52, 46, 33718, 797, 46, 39, 487, 965, 16953, 790, 8503, 53823, 365, 39878, 41235, 17957, 25823, 785, 967, 1371, 543, 8660, 1510, 308, 46, 46, 35663, 3804, 4662, 15100, 8524, 2378, 254, 2399, 38462, 1700, 3223, 1296, 478, 1972, 809, 251], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [31598, 406, 418, 11784, 170, 2001, 170, 4596, 26168, 170, 36668, 69, 170, 3225, 283, 829, 170, 223, 362, 170, 5700, 83, 170, 450, 170, 65, 170, 14420, 270, 170, 2001, 170, 5429, 52715, 468, 370, 3550, 46, 33, 170, 37558, 77, 170, 39404, 73, 65, 170, 1389, 170, 5739, 548, 456, 170, 1389, 170, 15898, 456, 170, 666, 170, 2021, 672, 170, 2001, 170, 13336, 1471, 1882, 170, 59356, 307, 170, 8, 518, 46, 13, 73310, 562, 170, 1389, 170, 518, 46, 13, 1681, 65, 9, 14, 26398, 78, 26168, 170, 36668, 69, 170, 4655, 792, 170, 436, 170, 35585, 24270, 952, 170, 248, 170, 4655, 792, 170, 1978, 170, 21544, 13, 35, 80, 39, 170, 21033, 597, 13, 78252, 952, 3550, 46, 33, 170, 37558, 77]}\n",
|
364 |
+
"{'input_ids': [45, 504, 1187, 659, 46, 5874, 86301, 412, 51, 86301, 3970, 4570, 59926, 3476, 43, 1517, 46, 244, 46, 335, 9187, 1342, 36689, 14000, 542, 40307, 3757, 14421, 412, 3762, 256, 1588, 723, 12505, 36170, 4898, 846, 87891, 3670, 4020, 4651, 12182, 1121, 60975, 264, 1201, 404, 10714, 256, 1396, 4709, 22460, 2538, 254, 1173, 71302, 423, 11201, 1259, 4013, 87933, 23361, 3410, 46, 41032, 407, 1131, 23083, 2151, 333, 4143, 28020, 213, 52, 21197, 46, 58776, 46, 10528, 1424, 19812, 5974, 46, 54], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [50, 18991, 19522, 170, 1978, 170, 4067, 26168, 170, 1389, 170, 11307, 170, 666, 170, 2250, 85871, 170, 2001, 170, 666, 170, 11086, 655, 1836, 170, 1285, 79, 45056, 17704, 170, 298, 36851, 875, 170, 8371, 6081, 170, 1389, 170, 666, 170, 10434, 10228, 69, 170, 8, 70, 6326, 9, 170, 82935, 468, 69, 14, 26398, 1268, 456, 170, 2901, 170, 70249, 362, 6356, 170, 248, 170, 79448, 456, 170, 1389, 170, 3774, 456, 170, 79448, 69, 170, 11086, 655, 597, 170, 1338, 982, 14, 170]}\n"
|
365 |
+
]
|
366 |
+
}
|
367 |
+
],
|
368 |
+
"source": [
|
369 |
+
"# 查看处理后的数据,使用正确的拆分(例如 'train')\n",
|
370 |
+
"print(tokenized_datasets['train'][0]) # 查看 'train' 数据集中的第一个样本\n",
|
371 |
+
"print(tokenized_datasets[\"test\"][0])# 查看 'test' 数据集中的第一个样本\n"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "code",
|
376 |
+
"execution_count": 16,
|
377 |
+
"id": "9aec8d0b-ef4f-43f3-b986-53edfc7a509f",
|
378 |
+
"metadata": {},
|
379 |
+
"outputs": [
|
380 |
+
{
|
381 |
+
"data": {
|
382 |
+
"application/vnd.jupyter.widget-view+json": {
|
383 |
+
"model_id": "086f8f974e99413da19fc5c18030f1c6",
|
384 |
+
"version_major": 2,
|
385 |
+
"version_minor": 0
|
386 |
+
},
|
387 |
+
"text/plain": [
|
388 |
+
"model.safetensors: 69%|######9 | 430M/620M [00:00<?, ?B/s]"
|
389 |
+
]
|
390 |
+
},
|
391 |
+
"metadata": {},
|
392 |
+
"output_type": "display_data"
|
393 |
+
},
|
394 |
+
{
|
395 |
+
"data": {
|
396 |
+
"application/vnd.jupyter.widget-view+json": {
|
397 |
+
"model_id": "187fabd0fd8b49419f7fa1ecf3ff6216",
|
398 |
+
"version_major": 2,
|
399 |
+
"version_minor": 0
|
400 |
+
},
|
401 |
+
"text/plain": [
|
402 |
+
"generation_config.json: 0%| | 0.00/111 [00:00<?, ?B/s]"
|
403 |
+
]
|
404 |
+
},
|
405 |
+
"metadata": {},
|
406 |
+
"output_type": "display_data"
|
407 |
+
},
|
408 |
+
{
|
409 |
+
"name": "stderr",
|
410 |
+
"output_type": "stream",
|
411 |
+
"text": [
|
412 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
413 |
+
" warnings.warn(\n"
|
414 |
+
]
|
415 |
+
},
|
416 |
+
{
|
417 |
+
"name": "stdout",
|
418 |
+
"output_type": "stream",
|
419 |
+
"text": [
|
420 |
+
"[2025-01-05 16:06:01,794] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
|
421 |
+
]
|
422 |
+
},
|
423 |
+
{
|
424 |
+
"name": "stderr",
|
425 |
+
"output_type": "stream",
|
426 |
+
"text": [
|
427 |
+
"/root/miniconda3/compiler_compat/ld: cannot find -laio: No such file or directory\n",
|
428 |
+
"collect2: error: ld returned 1 exit status\n",
|
429 |
+
"/root/miniconda3/compiler_compat/ld: warning: libpthread.so.0, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
430 |
+
"/root/miniconda3/compiler_compat/ld: warning: libstdc++.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
431 |
+
"/root/miniconda3/compiler_compat/ld: warning: libm.so.6, needed by /usr/local/cuda/lib64/libcufile.so, not found (try using -rpath or -rpath-link)\n",
|
432 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'\n",
|
433 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'\n",
|
434 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'\n",
|
435 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'\n",
|
436 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsigned long, char)@GLIBCXX_3.4'\n",
|
437 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for bool@CXXABI_1.3'\n",
|
438 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_logic_error(char const*)@GLIBCXX_3.4'\n",
|
439 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
440 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::logic_error@GLIBCXX_3.4'\n",
|
441 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::~locale()@GLIBCXX_3.4'\n",
|
442 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
443 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_end_catch@CXXABI_1.3'\n",
|
444 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
445 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::logic_error::~logic_error()@GLIBCXX_3.4'\n",
|
446 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__si_class_type_info@CXXABI_1.3'\n",
|
447 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
448 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
449 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new[](unsigned long)@GLIBCXX_3.4'\n",
|
450 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak_hard()@GLIBCXX_3.4'\n",
|
451 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
452 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::basic_streambuf(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
453 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
454 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::string const&)@GLIBCXX_3.4'\n",
|
455 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned short@CXXABI_1.3'\n",
|
456 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::resize(unsigned long, char)@GLIBCXX_3.4'\n",
|
457 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char const*@CXXABI_1.3'\n",
|
458 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ctype<char>::_M_widen_init() const@GLIBCXX_3.4.11'\n",
|
459 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_invalid_argument(char const*)@GLIBCXX_3.4'\n",
|
460 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::operator=(std::locale const&)@GLIBCXX_3.4'\n",
|
461 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::_M_cache_locale(std::locale const&)@GLIBCXX_3.4'\n",
|
462 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
463 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_free_exception@CXXABI_1.3'\n",
|
464 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::notify_one()@GLIBCXX_3.4.11'\n",
|
465 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::~Init()@GLIBCXX_3.4'\n",
|
466 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::~basic_string()@GLIBCXX_3.4'\n",
|
467 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_pure_virtual@CXXABI_1.3'\n",
|
468 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::flush()@GLIBCXX_3.4'\n",
|
469 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for __cxxabiv1::__class_type_info@CXXABI_1.3'\n",
|
470 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_rethrow@CXXABI_1.3'\n",
|
471 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
472 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_fstream<char, std::char_traits<char> >::~basic_fstream()@GLIBCXX_3.4'\n",
|
473 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::compare(char const*) const@GLIBCXX_3.4'\n",
|
474 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
475 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale()@GLIBCXX_3.4'\n",
|
476 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::system_clock::now()@GLIBCXX_3.4.19'\n",
|
477 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_ifstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
478 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Hash_bytes(void const*, unsigned long, unsigned long)@CXXABI_1.3.5'\n",
|
479 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long long>(long long)@GLIBCXX_3.4.9'\n",
|
480 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for char*@CXXABI_1.3'\n",
|
481 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_Prime_rehash_policy::_M_need_rehash(unsigned long, unsigned long, unsigned long) const@GLIBCXX_3.4.18'\n",
|
482 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::out_of_range@GLIBCXX_3.4'\n",
|
483 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long>(unsigned long)@GLIBCXX_3.4.9'\n",
|
484 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base const*)@GLIBCXX_3.4'\n",
|
485 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::~ios_base()@GLIBCXX_3.4'\n",
|
486 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::range_error::~range_error()@GLIBCXX_3.4'\n",
|
487 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::~__basic_file()@GLIBCXX_3.4'\n",
|
488 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_acquire@CXXABI_1.3'\n",
|
489 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<bool>(bool)@GLIBCXX_3.4.9'\n",
|
490 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::overflow_error@GLIBCXX_3.4'\n",
|
491 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
492 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::range_error@GLIBCXX_3.4'\n",
|
493 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
494 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_filebuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
495 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete[](void*)@GLIBCXX_3.4'\n",
|
496 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
497 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(unsigned long, char, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
498 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_transfer(std::__detail::_List_node_base*, std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
499 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::replace(unsigned long, unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
500 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::exception@GLIBCXX_3.4'\n",
|
501 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_M_destroy(std::allocator<wchar_t> const&)@GLIBCXX_3.4'\n",
|
502 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream& std::istream::_M_extract<double>(double&)@GLIBCXX_3.4.9'\n",
|
503 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
504 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_fstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
505 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::basic_ifstream(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
506 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(std::string const&)@GLIBCXX_3.4'\n",
|
507 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator new(unsigned long)@GLIBCXX_3.4'\n",
|
508 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
509 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned int@CXXABI_1.3'\n",
|
510 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::append(char const*)@GLIBCXX_3.4'\n",
|
511 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::domain_error@GLIBCXX_3.4'\n",
|
512 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char, unsigned long) const@GLIBCXX_3.4'\n",
|
513 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::put(char)@GLIBCXX_3.4'\n",
|
514 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int@CXXABI_1.3'\n",
|
515 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_alloc()@GLIBCXX_3.4'\n",
|
516 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_thread_atexit@CXXABI_1.3.7'\n",
|
517 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_increment(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
518 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::~basic_ifstream()@GLIBCXX_3.4'\n",
|
519 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::Init::Init()@GLIBCXX_3.4'\n",
|
520 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::condition_variable()@GLIBCXX_3.4.11'\n",
|
521 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::basic_filebuf()@GLIBCXX_3.4'\n",
|
522 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
523 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::domain_error::~domain_error()@GLIBCXX_3.4'\n",
|
524 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cerr@GLIBCXX_3.4'\n",
|
525 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::find(char const*, unsigned long, unsigned long) const@GLIBCXX_3.4'\n",
|
526 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
527 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
528 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::str() const@GLIBCXX_3.4'\n",
|
529 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::invalid_argument@GLIBCXX_3.4'\n",
|
530 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void*@CXXABI_1.3'\n",
|
531 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(std::string const&)@GLIBCXX_3.4'\n",
|
532 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_ostringstream()@GLIBCXX_3.4'\n",
|
533 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_rebalance_for_erase(std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
534 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long@CXXABI_1.3'\n",
|
535 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_hook(std::__detail::_List_node_base*)@GLIBCXX_3.4.15'\n",
|
536 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__detail::_List_node_base::_M_unhook()@GLIBCXX_3.4.15'\n",
|
537 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
538 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<char, std::char_traits<char>, std::allocator<char> >::_M_sync(char*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
539 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<char, std::char_traits<char> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
540 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::locale::locale(std::locale const&)@GLIBCXX_3.4'\n",
|
541 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_istringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
542 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'\n",
|
543 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
544 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
545 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::exception::~exception()@GLIBCXX_3.4'\n",
|
546 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_create(unsigned long, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
547 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__basic_file<char>::is_open() const@GLIBCXX_3.4'\n",
|
548 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_istringstream()@GLIBCXX_3.4'\n",
|
549 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::swap(std::string&)@GLIBCXX_3.4'\n",
|
550 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long*@CXXABI_1.3'\n",
|
551 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ostringstream<char, std::char_traits<char>, std::allocator<char> >@GLIBCXX_3.4'\n",
|
552 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::basic_streambuf(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
553 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::init(std::basic_streambuf<char, std::char_traits<char> >*)@GLIBCXX_3.4'\n",
|
554 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_bad_cast()@GLIBCXX_3.4'\n",
|
555 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<char, std::char_traits<char> >::clear(std::_Ios_Iostate)@GLIBCXX_3.4'\n",
|
556 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >::operator=(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> > const&)@GLIBCXX_3.4'\n",
|
557 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `operator delete(void*)@GLIBCXX_3.4'\n",
|
558 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::operator<<(int)@GLIBCXX_3.4'\n",
|
559 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
560 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_Rep::_M_destroy(std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
561 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_iostream<wchar_t, std::char_traits<wchar_t> >::~basic_iostream()@GLIBCXX_3.4'\n",
|
562 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::runtime_error@GLIBCXX_3.4'\n",
|
563 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ofstream<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
564 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_insert_and_rebalance(bool, std::_Rb_tree_node_base*, std::_Rb_tree_node_base*, std::_Rb_tree_node_base&)@GLIBCXX_3.4'\n",
|
565 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringstream<char, std::char_traits<char>, std::allocator<char> >::~basic_stringstream()@GLIBCXX_3.4'\n",
|
566 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `VTT for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
567 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<long>(long)@GLIBCXX_3.4.9'\n",
|
568 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::get()@GLIBCXX_3.4'\n",
|
569 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned long long@CXXABI_1.3'\n",
|
570 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::operator<< <std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*)@GLIBCXX_3.4'\n",
|
571 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::out_of_range::~out_of_range()@GLIBCXX_3.4'\n",
|
572 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::length_error::~length_error()@GLIBCXX_3.4'\n",
|
573 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)@GLIBCXX_3.4.9'\n",
|
574 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::invalid_argument::~invalid_argument()@GLIBCXX_3.4'\n",
|
575 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::swap(std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >&)@GLIBCXX_3.4'\n",
|
576 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::cout@GLIBCXX_3.4'\n",
|
577 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<unsigned long long>(unsigned long long)@GLIBCXX_3.4.9'\n",
|
578 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for int*@CXXABI_1.3'\n",
|
579 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<void const*>(void const*)@GLIBCXX_3.4.9'\n",
|
580 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::underflow_error@GLIBCXX_3.4'\n",
|
581 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_streambuf<char, std::char_traits<char> >@GLIBCXX_3.4'\n",
|
582 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for std::out_of_range@GLIBCXX_3.4'\n",
|
583 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_allocate_exception@CXXABI_1.3'\n",
|
584 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_ios<wchar_t, std::char_traits<wchar_t> >@GLIBCXX_3.4'\n",
|
585 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for void const*@CXXABI_1.3'\n",
|
586 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ios<wchar_t, std::char_traits<wchar_t> >::init(std::basic_streambuf<wchar_t, std::char_traits<wchar_t> >*)@GLIBCXX_3.4'\n",
|
587 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::reserve(unsigned long)@GLIBCXX_3.4'\n",
|
588 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_begin_catch@CXXABI_1.3'\n",
|
589 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long@CXXABI_1.3'\n",
|
590 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_Rep::_S_empty_rep_storage@GLIBCXX_3.4'\n",
|
591 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_leak()@GLIBCXX_3.4'\n",
|
592 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::open(char const*, std::_Ios_Openmode)@GLIBCXX_3.4'\n",
|
593 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >::_M_sync(wchar_t*, unsigned long, unsigned long)@GLIBCXX_3.4'\n",
|
594 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::istream::getline(char*, long, char)@GLIBCXX_3.4'\n",
|
595 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_istream<char, std::char_traits<char> >& std::getline<char, std::char_traits<char>, std::allocator<char> >(std::basic_istream<char, std::char_traits<char> >&, std::basic_string<char, std::char_traits<char>, std::allocator<char> >&, char)@GLIBCXX_3.4'\n",
|
596 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringstream<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
597 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::condition_variable::~condition_variable()@GLIBCXX_3.4.11'\n",
|
598 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::basic_stringbuf<wchar_t, std::char_traits<wchar_t>, std::allocator<wchar_t> >@GLIBCXX_3.4'\n",
|
599 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::insert(unsigned long, char const*, unsigned long)@GLIBCXX_3.4'\n",
|
600 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::assign(char const*, unsigned long)@GLIBCXX_3.4'\n",
|
601 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for unsigned char@CXXABI_1.3'\n",
|
602 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ios_base::ios_base()@GLIBCXX_3.4'\n",
|
603 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_out_of_range(char const*)@GLIBCXX_3.4'\n",
|
604 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::overflow_error::~overflow_error()@GLIBCXX_3.4'\n",
|
605 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_length_error(char const*)@GLIBCXX_3.4'\n",
|
606 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::__throw_system_error(int)@GLIBCXX_3.4.11'\n",
|
607 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ofstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
608 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream& std::ostream::_M_insert<double>(double)@GLIBCXX_3.4.9'\n",
|
609 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_streambuf<char, std::char_traits<char> >::operator=(std::basic_streambuf<char, std::char_traits<char> > const&)@GLIBCXX_3.4'\n",
|
610 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `typeinfo for long long@CXXABI_1.3'\n",
|
611 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, unsigned long, std::allocator<char> const&)@GLIBCXX_3.4'\n",
|
612 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_ifstream<char, std::char_traits<char> >::close()@GLIBCXX_3.4'\n",
|
613 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_guard_release@CXXABI_1.3'\n",
|
614 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__cxa_throw@CXXABI_1.3'\n",
|
615 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::underflow_error::~underflow_error()@GLIBCXX_3.4'\n",
|
616 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::_Rb_tree_decrement(std::_Rb_tree_node_base*)@GLIBCXX_3.4'\n",
|
617 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `vtable for std::length_error@GLIBCXX_3.4'\n",
|
618 |
+
"/root/miniconda3/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::basic_filebuf<char, std::char_traits<char> >::~basic_filebuf()@GLIBCXX_3.4'\n",
|
619 |
+
"collect2: error: ld returned 1 exit status\n"
|
620 |
+
]
|
621 |
+
},
|
622 |
+
{
|
623 |
+
"data": {
|
624 |
+
"text/html": [
|
625 |
+
"\n",
|
626 |
+
" <div>\n",
|
627 |
+
" \n",
|
628 |
+
" <progress value='2970' max='2970' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
629 |
+
" [2970/2970 03:02, Epoch 5/5]\n",
|
630 |
+
" </div>\n",
|
631 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
632 |
+
" <thead>\n",
|
633 |
+
" <tr style=\"text-align: left;\">\n",
|
634 |
+
" <th>Epoch</th>\n",
|
635 |
+
" <th>Training Loss</th>\n",
|
636 |
+
" <th>Validation Loss</th>\n",
|
637 |
+
" </tr>\n",
|
638 |
+
" </thead>\n",
|
639 |
+
" <tbody>\n",
|
640 |
+
" <tr>\n",
|
641 |
+
" <td>1</td>\n",
|
642 |
+
" <td>5.123600</td>\n",
|
643 |
+
" <td>4.953184</td>\n",
|
644 |
+
" </tr>\n",
|
645 |
+
" <tr>\n",
|
646 |
+
" <td>2</td>\n",
|
647 |
+
" <td>4.958600</td>\n",
|
648 |
+
" <td>4.907465</td>\n",
|
649 |
+
" </tr>\n",
|
650 |
+
" <tr>\n",
|
651 |
+
" <td>3</td>\n",
|
652 |
+
" <td>4.893500</td>\n",
|
653 |
+
" <td>4.876421</td>\n",
|
654 |
+
" </tr>\n",
|
655 |
+
" <tr>\n",
|
656 |
+
" <td>4</td>\n",
|
657 |
+
" <td>4.801000</td>\n",
|
658 |
+
" <td>4.860046</td>\n",
|
659 |
+
" </tr>\n",
|
660 |
+
" <tr>\n",
|
661 |
+
" <td>5</td>\n",
|
662 |
+
" <td>4.709600</td>\n",
|
663 |
+
" <td>4.876781</td>\n",
|
664 |
+
" </tr>\n",
|
665 |
+
" </tbody>\n",
|
666 |
+
"</table><p>"
|
667 |
+
],
|
668 |
+
"text/plain": [
|
669 |
+
"<IPython.core.display.HTML object>"
|
670 |
+
]
|
671 |
+
},
|
672 |
+
"metadata": {},
|
673 |
+
"output_type": "display_data"
|
674 |
+
},
|
675 |
+
{
|
676 |
+
"data": {
|
677 |
+
"text/plain": [
|
678 |
+
"TrainOutput(global_step=2970, training_loss=4.859284495344066, metrics={'train_runtime': 183.1426, 'train_samples_per_second': 129.68, 'train_steps_per_second': 16.217, 'total_flos': 1551421440000000.0, 'train_loss': 4.859284495344066, 'epoch': 5.0})"
|
679 |
+
]
|
680 |
+
},
|
681 |
+
"execution_count": 16,
|
682 |
+
"metadata": {},
|
683 |
+
"output_type": "execute_result"
|
684 |
+
}
|
685 |
+
],
|
686 |
+
"source": [
|
687 |
+
"from transformers import Trainer, TrainingArguments, GPT2LMHeadModel\n",
|
688 |
+
"from transformers import DataCollatorForSeq2Seq\n",
|
689 |
+
"\n",
|
690 |
+
"\n",
|
691 |
+
"# 加载预训练的 GPT-2 模型\n",
|
692 |
+
"model = GPT2LMHeadModel.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
|
693 |
+
"\n",
|
694 |
+
"data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)\n",
|
695 |
+
"\n",
|
696 |
+
"# 训练参数\n",
|
697 |
+
"training_args = TrainingArguments(\n",
|
698 |
+
" output_dir=\"gpt2_protein_function\", # 输出目录\n",
|
699 |
+
" evaluation_strategy=\"epoch\",\n",
|
700 |
+
" learning_rate=5e-5,\n",
|
701 |
+
" per_device_train_batch_size=8,\n",
|
702 |
+
" per_device_eval_batch_size=8,\n",
|
703 |
+
" num_train_epochs=5,\n",
|
704 |
+
" save_steps=500,\n",
|
705 |
+
" save_total_limit=2,\n",
|
706 |
+
" logging_steps=500,\n",
|
707 |
+
")\n",
|
708 |
+
"\n",
|
709 |
+
"# 初始化 Trainer\n",
|
710 |
+
"trainer = Trainer(\n",
|
711 |
+
" model=model,\n",
|
712 |
+
" args=training_args,\n",
|
713 |
+
" train_dataset=tokenized_datasets['train'], # 传递 'train' 拆分数据\n",
|
714 |
+
" eval_dataset=tokenized_datasets['test'], # 传递 'test' 拆分数据\n",
|
715 |
+
" data_collator=data_collator,\n",
|
716 |
+
")\n",
|
717 |
+
"\n",
|
718 |
+
"# 训练模型\n",
|
719 |
+
"trainer.train()\n"
|
720 |
+
]
|
721 |
+
},
|
722 |
+
{
|
723 |
+
"cell_type": "code",
|
724 |
+
"execution_count": null,
|
725 |
+
"id": "0e9e5526-39ab-4a75-a09d-f4d14a691211",
|
726 |
+
"metadata": {},
|
727 |
+
"outputs": [],
|
728 |
+
"source": [
|
729 |
+
"# 保存训练好的模型和 tokenizer\n",
|
730 |
+
"model.save_pretrained(run_path)\n",
|
731 |
+
"tokenizer.save_pretrained(run_path)\n"
|
732 |
+
]
|
733 |
+
},
|
734 |
+
{
|
735 |
+
"cell_type": "code",
|
736 |
+
"execution_count": 19,
|
737 |
+
"id": "18cba087-6620-4d08-ae3a-0ffbd37c7f69",
|
738 |
+
"metadata": {},
|
739 |
+
"outputs": [
|
740 |
+
{
|
741 |
+
"name": "stdout",
|
742 |
+
"output_type": "stream",
|
743 |
+
"text": [
|
744 |
+
"{'eval_loss': 4.8767805099487305, 'eval_runtime': 0.489, 'eval_samples_per_second': 511.252, 'eval_steps_per_second': 65.44, 'epoch': 5.0}\n",
|
745 |
+
"Perplexity: 131.21\n"
|
746 |
+
]
|
747 |
+
}
|
748 |
+
],
|
749 |
+
"source": [
|
750 |
+
"import math\n",
|
751 |
+
"# 评估模型在验证集上的表现\n",
|
752 |
+
"eval_results = trainer.evaluate()\n",
|
753 |
+
"print(eval_results)\n",
|
754 |
+
"print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
|
755 |
+
]
|
756 |
+
},
|
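For a quick sanity check of the fine-tuned checkpoint, a minimal generation sketch is shown below. It assumes `model` and `tokenizer` are the objects trained and loaded in the cells above; the prompt string is a made-up protein fragment, not the notebook's actual prompt template.

```python
# Minimal sampling sketch (assumptions: `model`/`tokenizer` come from the cells
# above; the prompt is a hypothetical protein fragment, not the real template).
import torch

model.eval()
device = next(model.parameters()).device

prompt = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"  # hypothetical fragment
inputs = tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    generated = model.generate(
        **inputs,
        max_new_tokens=50,                     # keep the continuation short
        do_sample=True,                        # sample rather than greedy decode
        top_k=50,
        pad_token_id=tokenizer.eos_token_id,   # GPT-2 has no dedicated pad token
    )

print(tokenizer.decode(generated[0], skip_special_tokens=True))
```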
757 |
{
|
758 |
"cell_type": "code",
|
759 |
"execution_count": null,
|
760 |
+
"id": "4e032392-f018-44de-b1f4-0143025a660c",
|
761 |
"metadata": {},
|
762 |
"outputs": [],
|
763 |
"source": []
|
03-gene-task/5-regression-task.ipynb
CHANGED
@@ -1,9 +1,563 @@
|
|
1 |
{
|
2 |
"cells": [
|
3 |
{
|
4 |
"cell_type": "code",
|
5 |
"execution_count": null,
|
6 |
-
"id": "
|
7 |
"metadata": {},
|
8 |
"outputs": [],
|
9 |
"source": []
|
|
|
1 |
{
|
2 |
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "c499a5c3-0244-41c4-9947-e166206204e2",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 3.5 回归类任务"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "4678171b-bbc8-49dd-ad04-48f5ef89b45e",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值。\n",
|
17 |
+
"\n",
|
18 |
+
"使用 GPT-2 进行回归问题的解决,可以将回归问题转化为自回归语言模型任务。GPT-2 原本是设计用于生成自然语言的模型,但通过适当的调整和微调,它也可以用于回归任务,例如预测连续值(如情感评分、价格预测等)。\n",
|
19 |
+
"\n",
|
20 |
+
"---\n",
|
21 |
+
"\n",
|
22 |
+
"### **1. 使用 GPT-2 做回归的核心思路**\n",
|
23 |
+
"\n",
|
24 |
+
"1. **调整输出层**:\n",
|
25 |
+
" - 默认情况下,GPT-2 的输出是一个词汇表大小的概率分布,用于预测下一个 token。\n",
|
26 |
+
" - 对于回归问题,可以将模型的最后一层替换为一个线性层,使得输出变为一个标量或多个连续值。\n",
|
27 |
+
" - gpt2的huggingface实现中,可以简单设置1个分类的分类header,实现回归预测。\n",
|
28 |
+
"\n",
|
29 |
+
"2. **损失函数**:\n",
|
30 |
+
" - 对于回归问题,使用均方误差(MSE)或均绝对误差(MAE)作为损失函数,而不是分类任务中常用的交叉熵。\n",
|
31 |
+
"\n",
|
32 |
+
"3. **输入格式**:\n",
|
33 |
+
" - 输入数据仍然是文本,可以通过特定的模板形式加入上下文信息。\n",
|
34 |
+
"\n",
|
35 |
+
"---\n",
|
36 |
+
"\n",
|
37 |
+
"### **2. GPT-2 回归任务的实现步骤**\n",
|
38 |
+
"\n",
|
39 |
+
"#### **(1)加载基础模型**\n",
|
40 |
+
"\n",
|
41 |
+
"从 Hugging Face Transformers 库加载 GPT-2 模型和分词器,并调整其配置以适应回归任务。\n",
|
42 |
+
"\n",
|
43 |
+
"```python\n",
|
44 |
+
"from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, AutoModelForSequenceClassification\n",
|
45 |
+
"\n",
|
46 |
+
"# 加载分词器\n",
|
47 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")\n",
|
48 |
+
"\n",
|
49 |
+
"# 调整模型配置,num_labels=1 表示回归任务\n",
|
50 |
+
"config = GPT2Config.from_pretrained(\"gpt2\", num_labels=1)\n",
|
51 |
+
"\n",
|
52 |
+
"# 加载模型,增加回归输出\n",
|
53 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", config=config)\n",
|
54 |
+
"```\n",
|
55 |
+
"\n",
|
56 |
+
"---\n",
|
57 |
+
"\n",
|
58 |
+
"### **3. 课程数据集**\n",
|
59 |
+
"\n",
|
60 |
+
"本例程使用了蛋白质稳定性分析的数据集,也就是一个蛋白质序列,对应一个float的数值,做回归预测分析。\n",
|
61 |
+
"\n",
|
62 |
+
"**蛋白质稳定性分析**是研究蛋白质在不同条件下保持其结构和功能的能力的过程。蛋白质稳定性是生物化学和生物技术领域的重要课题,影响着蛋白质的折叠、功能执行、以及在应用中的可用性(如工业酶、药物开发等)。\n"
|
63 |
+
]
|
64 |
+
},
|
65 |
+
{
|
66 |
+
"cell_type": "code",
|
67 |
+
"execution_count": 1,
|
68 |
+
"id": "1e8c0f86-af78-43e1-8db4-e2a2ea22f815",
|
69 |
+
"metadata": {},
|
70 |
+
"outputs": [
|
71 |
+
{
|
72 |
+
"data": {
|
73 |
+
"text/plain": [
|
74 |
+
"\"\\nimport os\\n\\n# 设置环境变量, autodl专区 其他idc\\nos.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\\n\\n# 打印环境变量以确认设置成功\\nprint(os.environ.get('HF_ENDPOINT'))\\n\""
|
75 |
+
]
|
76 |
+
},
|
77 |
+
"execution_count": 1,
|
78 |
+
"metadata": {},
|
79 |
+
"output_type": "execute_result"
|
80 |
+
}
|
81 |
+
],
|
82 |
+
"source": [
|
83 |
+
"import subprocess\n",
|
84 |
+
"import os\n",
|
85 |
+
"# 设置环境变量, autodl一般区域\n",
|
86 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
87 |
+
"output = result.stdout\n",
|
88 |
+
"for line in output.splitlines():\n",
|
89 |
+
" if '=' in line:\n",
|
90 |
+
" var, value = line.split('=', 1)\n",
|
91 |
+
" os.environ[var] = value\n",
|
92 |
+
"\n",
|
93 |
+
"\"\"\"\n",
|
94 |
+
"import os\n",
|
95 |
+
"\n",
|
96 |
+
"# 设置环境变量, autodl专区 其他idc\n",
|
97 |
+
"os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
|
98 |
+
"\n",
|
99 |
+
"# 打印环境变量以确认设置成功\n",
|
100 |
+
"print(os.environ.get('HF_ENDPOINT'))\n",
|
101 |
+
"\"\"\""
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "code",
|
106 |
+
"execution_count": 2,
|
107 |
+
"id": "c51a8d69-9a36-47e7-8084-f64e6a72e4f7",
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [],
|
110 |
+
"source": [
|
111 |
+
"from transformers import AutoTokenizer, AutoModel\n",
|
112 |
+
"from tokenizers import Tokenizer\n",
|
113 |
+
"from transformers import GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n",
|
114 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
115 |
+
"from transformers import DataCollatorWithPadding"
|
116 |
+
]
|
117 |
+
},
|
118 |
+
{
|
119 |
+
"cell_type": "code",
|
120 |
+
"execution_count": 3,
|
121 |
+
"id": "a5aeb7c1-2d2a-4f57-ad8c-659613870e59",
|
122 |
+
"metadata": {},
|
123 |
+
"outputs": [],
|
124 |
+
"source": [
|
125 |
+
"#set tokenizer\n",
|
126 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
|
127 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": 4,
|
133 |
+
"id": "ad0c19cd-96a5-463e-8b7d-439646fef429",
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [
|
136 |
+
{
|
137 |
+
"name": "stderr",
|
138 |
+
"output_type": "stream",
|
139 |
+
"text": [
|
140 |
+
"Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at dnagpt/gene_eng_gpt2_v0 and are newly initialized: ['score.weight']\n",
|
141 |
+
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
|
142 |
+
]
|
143 |
+
}
|
144 |
+
],
|
145 |
+
"source": [
|
146 |
+
"#set model\n",
|
147 |
+
"model = AutoModelForSequenceClassification.from_pretrained('dnagpt/gene_eng_gpt2_v0', num_labels=1)\n",
|
148 |
+
"model.config.pad_token_id = model.config.eos_token_id"
|
149 |
+
]
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"cell_type": "code",
|
153 |
+
"execution_count": 5,
|
154 |
+
"id": "8c48cb0a-6142-4afc-823e-08fb33f74222",
|
155 |
+
"metadata": {},
|
156 |
+
"outputs": [
|
157 |
+
{
|
158 |
+
"data": {
|
159 |
+
"text/plain": [
|
160 |
+
"DatasetDict({\n",
|
161 |
+
" train: Dataset({\n",
|
162 |
+
" features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
|
163 |
+
" num_rows: 62079\n",
|
164 |
+
" })\n",
|
165 |
+
" test: Dataset({\n",
|
166 |
+
" features: ['seq_id', 'seq_type', 'seq', 'label'],\n",
|
167 |
+
" num_rows: 6898\n",
|
168 |
+
" })\n",
|
169 |
+
"})"
|
170 |
+
]
|
171 |
+
},
|
172 |
+
"execution_count": 5,
|
173 |
+
"metadata": {},
|
174 |
+
"output_type": "execute_result"
|
175 |
+
}
|
176 |
+
],
|
177 |
+
"source": [
|
178 |
+
"from datasets import load_dataset\n",
|
179 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
180 |
+
"dataset = load_dataset(\"csv\", data_files=\"data/protein_stab.csv\")['train'].train_test_split(test_size=0.1)\n",
|
181 |
+
"dataset"
|
182 |
+
]
|
183 |
+
},
|
184 |
+
{
|
185 |
+
"cell_type": "code",
|
186 |
+
"execution_count": 6,
|
187 |
+
"id": "685dd025-f00a-4869-bc30-9843c77b6d8a",
|
188 |
+
"metadata": {},
|
189 |
+
"outputs": [
|
190 |
+
{
|
191 |
+
"data": {
|
192 |
+
"text/plain": [
|
193 |
+
"{'seq_id': 'train_prot_32672',\n",
|
194 |
+
" 'seq_type': 'prot',\n",
|
195 |
+
" 'seq': 'FYRLIIFKYPDYIDTYLRLAAIAKEKNNLQLSIEGNGSGGNGSGGNGSGN',\n",
|
196 |
+
" 'label': 0.7599999904632561}"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
"execution_count": 6,
|
200 |
+
"metadata": {},
|
201 |
+
"output_type": "execute_result"
|
202 |
+
}
|
203 |
+
],
|
204 |
+
"source": [
|
205 |
+
"dataset[\"train\"][0]"
|
206 |
+
]
|
207 |
+
},
|
208 |
+
{
|
209 |
+
"cell_type": "code",
|
210 |
+
"execution_count": 7,
|
211 |
+
"id": "6e10dbbb-73ef-4b67-8290-77f8896298f5",
|
212 |
+
"metadata": {},
|
213 |
+
"outputs": [
|
214 |
+
{
|
215 |
+
"name": "stdout",
|
216 |
+
"output_type": "stream",
|
217 |
+
"text": [
|
218 |
+
"datasets mean token lenght 17.24006958538707 min token length 12 max token length 35\n"
|
219 |
+
]
|
220 |
+
}
|
221 |
+
],
|
222 |
+
"source": [
|
223 |
+
"token_len_list = []\n",
|
224 |
+
"for item in dataset[\"test\"]:\n",
|
225 |
+
" inputs = tokenizer.tokenize(item[\"seq\"])\n",
|
226 |
+
" token_len_list.append( len(inputs) )\n",
|
227 |
+
"\n",
|
228 |
+
"mean_len = sum(token_len_list)/len(token_len_list)\n",
|
229 |
+
"min_len = min(token_len_list)\n",
|
230 |
+
"max_len = max(token_len_list)\n",
|
231 |
+
"\n",
|
232 |
+
"print(\"datasets \", \"mean token lenght\", mean_len, \"min token length\", min_len, \"max token length\", max_len)"
|
233 |
+
]
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"cell_type": "code",
|
237 |
+
"execution_count": 25,
|
238 |
+
"id": "ac58b5b4-bff0-404d-bcf5-2b93db2b37c0",
|
239 |
+
"metadata": {},
|
240 |
+
"outputs": [
|
241 |
+
{
|
242 |
+
"data": {
|
243 |
+
"application/vnd.jupyter.widget-view+json": {
|
244 |
+
"model_id": "419cce8c5ba249ac8c8773dd2d69992d",
|
245 |
+
"version_major": 2,
|
246 |
+
"version_minor": 0
|
247 |
+
},
|
248 |
+
"text/plain": [
|
249 |
+
"Map: 0%| | 0/62079 [00:00<?, ? examples/s]"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"metadata": {},
|
253 |
+
"output_type": "display_data"
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"name": "stderr",
|
257 |
+
"output_type": "stream",
|
258 |
+
"text": [
|
259 |
+
"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.\n",
|
260 |
+
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
|
261 |
+
]
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"data": {
|
265 |
+
"application/vnd.jupyter.widget-view+json": {
|
266 |
+
"model_id": "0b9ea09fe3ea49b19f7d52aca7949acf",
|
267 |
+
"version_major": 2,
|
268 |
+
"version_minor": 0
|
269 |
+
},
|
270 |
+
"text/plain": [
|
271 |
+
"Map: 0%| | 0/6898 [00:00<?, ? examples/s]"
|
272 |
+
]
|
273 |
+
},
|
274 |
+
"metadata": {},
|
275 |
+
"output_type": "display_data"
|
276 |
+
}
|
277 |
+
],
|
278 |
+
"source": [
|
279 |
+
"# 2. tokenize\n",
|
280 |
+
"def tokenize_function(examples):\n",
|
281 |
+
" return tokenizer(examples['seq'], truncation=True, padding='max_length')\n",
|
282 |
+
"\n",
|
283 |
+
"# 3. 对数据集应用分词函数\n",
|
284 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
|
285 |
+
"\n",
|
286 |
+
"# 4. 创建一个数据收集器,用于动态填充和遮蔽\n",
|
287 |
+
"data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
|
288 |
+
]
|
289 |
+
},
|
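A side note on the warning captured above ("Asking to pad to max_length but no maximum length is provided"): since the longest test sequence tokenizes to only 35 tokens, one option is to pass an explicit `max_length`, or to drop padding in `map` entirely and let `DataCollatorWithPadding` pad each batch dynamically. A hedged alternative sketch:

```python
# Alternative tokenization sketch; max_length=64 is an assumption chosen to
# cover the observed maximum of 35 tokens, not a value taken from the notebook.
def tokenize_function_fixed(examples):
    return tokenizer(
        examples["seq"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )

# tokenized_datasets = dataset.map(tokenize_function_fixed, batched=True)
```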
290 |
+
{
|
291 |
+
"cell_type": "code",
|
292 |
+
"execution_count": 26,
|
293 |
+
"id": "94f6d643-2cf7-4651-9a8d-1884b2bddd1c",
|
294 |
+
"metadata": {},
|
295 |
+
"outputs": [
|
296 |
+
{
|
297 |
+
"name": "stderr",
|
298 |
+
"output_type": "stream",
|
299 |
+
"text": [
|
300 |
+
"/root/miniconda3/lib/python3.12/site-packages/transformers/training_args.py:1575: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead\n",
|
301 |
+
" warnings.warn(\n",
|
302 |
+
"/tmp/ipykernel_1347/4285456223.py:23: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n",
|
303 |
+
" trainer = Trainer(\n"
|
304 |
+
]
|
305 |
+
}
|
306 |
+
],
|
307 |
+
"source": [
|
308 |
+
"from transformers import TrainingArguments, Trainer\n",
|
309 |
+
"import numpy as np\n",
|
310 |
+
"from sklearn.metrics import mean_squared_error\n",
|
311 |
+
"\n",
|
312 |
+
"\n",
|
313 |
+
"def compute_metrics(eval_pred):\n",
|
314 |
+
" predictions, labels = eval_pred\n",
|
315 |
+
" rmse = mean_squared_error(labels, predictions)\n",
|
316 |
+
" return {\"rmse\": rmse}\n",
|
317 |
+
"\n",
|
318 |
+
"# 设置训练参数\n",
|
319 |
+
"training_args = TrainingArguments(\n",
|
320 |
+
" output_dir='./results',\n",
|
321 |
+
" evaluation_strategy=\"epoch\",\n",
|
322 |
+
" learning_rate=2e-5,\n",
|
323 |
+
" per_device_train_batch_size=20,\n",
|
324 |
+
" per_device_eval_batch_size=20,\n",
|
325 |
+
" num_train_epochs=10,\n",
|
326 |
+
" weight_decay=0.01,\n",
|
327 |
+
")\n",
|
328 |
+
"\n",
|
329 |
+
"# 使用Trainer API进行训练(假设已有train_dataset和eval_dataset)\n",
|
330 |
+
"trainer = Trainer(\n",
|
331 |
+
" model=model,\n",
|
332 |
+
" args=training_args,\n",
|
333 |
+
" train_dataset=tokenized_datasets[\"train\"],\n",
|
334 |
+
" eval_dataset=tokenized_datasets[\"test\"],\n",
|
335 |
+
" tokenizer=tokenizer,\n",
|
336 |
+
" data_collator=data_collator,\n",
|
337 |
+
" compute_metrics=compute_metrics,\n",
|
338 |
+
")"
|
339 |
+
]
|
340 |
+
},
|
341 |
+
{
|
342 |
+
"cell_type": "code",
|
343 |
+
"execution_count": null,
|
344 |
+
"id": "dfe12979-d977-4404-bf9e-18c1f91a3e39",
|
345 |
+
"metadata": {},
|
346 |
+
"outputs": [
|
347 |
+
{
|
348 |
+
"data": {
|
349 |
+
"text/html": [
|
350 |
+
"\n",
|
351 |
+
" <div>\n",
|
352 |
+
" \n",
|
353 |
+
" <progress value='30987' max='31040' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
354 |
+
" [30987/31040 1:00:56 < 00:06, 8.47 it/s, Epoch 9.98/10]\n",
|
355 |
+
" </div>\n",
|
356 |
+
" <table border=\"1\" class=\"dataframe\">\n",
|
357 |
+
" <thead>\n",
|
358 |
+
" <tr style=\"text-align: left;\">\n",
|
359 |
+
" <th>Epoch</th>\n",
|
360 |
+
" <th>Training Loss</th>\n",
|
361 |
+
" <th>Validation Loss</th>\n",
|
362 |
+
" <th>Rmse</th>\n",
|
363 |
+
" </tr>\n",
|
364 |
+
" </thead>\n",
|
365 |
+
" <tbody>\n",
|
366 |
+
" <tr>\n",
|
367 |
+
" <td>1</td>\n",
|
368 |
+
" <td>0.044600</td>\n",
|
369 |
+
" <td>0.163462</td>\n",
|
370 |
+
" <td>0.163462</td>\n",
|
371 |
+
" </tr>\n",
|
372 |
+
" <tr>\n",
|
373 |
+
" <td>2</td>\n",
|
374 |
+
" <td>0.041900</td>\n",
|
375 |
+
" <td>0.157900</td>\n",
|
376 |
+
" <td>0.157900</td>\n",
|
377 |
+
" </tr>\n",
|
378 |
+
" <tr>\n",
|
379 |
+
" <td>3</td>\n",
|
380 |
+
" <td>0.037700</td>\n",
|
381 |
+
" <td>0.159724</td>\n",
|
382 |
+
" <td>0.159724</td>\n",
|
383 |
+
" </tr>\n",
|
384 |
+
" <tr>\n",
|
385 |
+
" <td>4</td>\n",
|
386 |
+
" <td>0.031700</td>\n",
|
387 |
+
" <td>0.157686</td>\n",
|
388 |
+
" <td>0.157686</td>\n",
|
389 |
+
" </tr>\n",
|
390 |
+
" <tr>\n",
|
391 |
+
" <td>5</td>\n",
|
392 |
+
" <td>0.028800</td>\n",
|
393 |
+
" <td>0.157124</td>\n",
|
394 |
+
" <td>0.157124</td>\n",
|
395 |
+
" </tr>\n",
|
396 |
+
" <tr>\n",
|
397 |
+
" <td>6</td>\n",
|
398 |
+
" <td>0.025400</td>\n",
|
399 |
+
" <td>0.150852</td>\n",
|
400 |
+
" <td>0.150852</td>\n",
|
401 |
+
" </tr>\n",
|
402 |
+
" <tr>\n",
|
403 |
+
" <td>7</td>\n",
|
404 |
+
" <td>0.022300</td>\n",
|
405 |
+
" <td>0.159293</td>\n",
|
406 |
+
" <td>0.159293</td>\n",
|
407 |
+
" </tr>\n",
|
408 |
+
" <tr>\n",
|
409 |
+
" <td>8</td>\n",
|
410 |
+
" <td>0.019600</td>\n",
|
411 |
+
" <td>0.154608</td>\n",
|
412 |
+
" <td>0.154608</td>\n",
|
413 |
+
" </tr>\n",
|
414 |
+
" <tr>\n",
|
415 |
+
" <td>9</td>\n",
|
416 |
+
" <td>0.017300</td>\n",
|
417 |
+
" <td>0.156104</td>\n",
|
418 |
+
" <td>0.156104</td>\n",
|
419 |
+
" </tr>\n",
|
420 |
+
" </tbody>\n",
|
421 |
+
"</table><p>"
|
422 |
+
],
|
423 |
+
"text/plain": [
|
424 |
+
"<IPython.core.display.HTML object>"
|
425 |
+
]
|
426 |
+
},
|
427 |
+
"metadata": {},
|
428 |
+
"output_type": "display_data"
|
429 |
+
},
|
430 |
+
{
|
431 |
+
"name": "stderr",
|
432 |
+
"output_type": "stream",
|
433 |
+
"text": [
|
434 |
+
"IOPub message rate exceeded.\n",
|
435 |
+
"The Jupyter server will temporarily stop sending output\n",
|
436 |
+
"to the client in order to avoid crashing it.\n",
|
437 |
+
"To change this limit, set the config variable\n",
|
438 |
+
"`--ServerApp.iopub_msg_rate_limit`.\n",
|
439 |
+
"\n",
|
440 |
+
"Current values:\n",
|
441 |
+
"ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)\n",
|
442 |
+
"ServerApp.rate_limit_window=3.0 (secs)\n",
|
443 |
+
"\n"
|
444 |
+
]
|
445 |
+
}
|
446 |
+
],
|
447 |
+
"source": [
|
448 |
+
"# 开始训练\n",
|
449 |
+
"trainer.train()"
|
450 |
+
]
|
451 |
+
},
|
452 |
+
{
|
453 |
+
"cell_type": "code",
|
454 |
+
"execution_count": null,
|
455 |
+
"id": "060c4618-40d0-4934-bab8-36aab3a46de5",
|
456 |
+
"metadata": {},
|
457 |
+
"outputs": [],
|
458 |
+
"source": [
|
459 |
+
"#模型测试\n",
|
460 |
+
"predictions = trainer.predict(tokenized_datasets[\"test\"])\n",
|
461 |
+
"predictions"
|
462 |
+
]
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"cell_type": "code",
|
466 |
+
"execution_count": 18,
|
467 |
+
"id": "1f8ef885-5bc9-4668-905b-6b2235209654",
|
468 |
+
"metadata": {},
|
469 |
+
"outputs": [
|
470 |
+
{
|
471 |
+
"data": {
|
472 |
+
"text/html": [
|
473 |
+
"\n",
|
474 |
+
" <div>\n",
|
475 |
+
" \n",
|
476 |
+
" <progress value='345' max='345' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
477 |
+
" [345/345 00:09]\n",
|
478 |
+
" </div>\n",
|
479 |
+
" "
|
480 |
+
],
|
481 |
+
"text/plain": [
|
482 |
+
"<IPython.core.display.HTML object>"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
"metadata": {},
|
486 |
+
"output_type": "display_data"
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"data": {
|
490 |
+
"text/plain": [
|
491 |
+
"{'eval_loss': 0.15949687361717224,\n",
|
492 |
+
" 'eval_rmse': 0.15949687361717224,\n",
|
493 |
+
" 'eval_runtime': 9.1483,\n",
|
494 |
+
" 'eval_samples_per_second': 754.017,\n",
|
495 |
+
" 'eval_steps_per_second': 37.712,\n",
|
496 |
+
" 'epoch': 10.0}"
|
497 |
+
]
|
498 |
+
},
|
499 |
+
"execution_count": 18,
|
500 |
+
"metadata": {},
|
501 |
+
"output_type": "execute_result"
|
502 |
+
}
|
503 |
+
],
|
504 |
+
"source": [
|
505 |
+
"trainer.evaluate()"
|
506 |
+
]
|
507 |
+
},
|
508 |
+
{
|
509 |
+
"cell_type": "code",
|
510 |
+
"execution_count": 23,
|
511 |
+
"id": "afabdbe9-9b96-4f9e-bef2-1d819431f8d1",
|
512 |
+
"metadata": {},
|
513 |
+
"outputs": [
|
514 |
+
{
|
515 |
+
"name": "stdout",
|
516 |
+
"output_type": "stream",
|
517 |
+
"text": [
|
518 |
+
"[[ 1.7208484 ]\n",
|
519 |
+
" [ 0.00225139]\n",
|
520 |
+
" [ 0.3325616 ]\n",
|
521 |
+
" [-0.34372616]\n",
|
522 |
+
" [-0.45505935]\n",
|
523 |
+
" [-0.06892765]\n",
|
524 |
+
" [ 0.15099108]\n",
|
525 |
+
" [ 0.12211376]\n",
|
526 |
+
" [ 0.3947332 ]\n",
|
527 |
+
" [ 0.23186803]]\n"
|
528 |
+
]
|
529 |
+
}
|
530 |
+
],
|
531 |
+
"source": [
|
532 |
+
"predictions.predictions[0:10].squeeze()"
|
533 |
+
]
|
534 |
+
},
|
535 |
+
{
|
536 |
+
"cell_type": "code",
|
537 |
+
"execution_count": 24,
|
538 |
+
"id": "fa9d17fd-eece-4c1e-99e0-3d19d36f7584",
|
539 |
+
"metadata": {},
|
540 |
+
"outputs": [
|
541 |
+
{
|
542 |
+
"data": {
|
543 |
+
"text/plain": [
|
544 |
+
"array([ 1.69, 0.84, 0.58, -0.15, 0.23, 0.03, 0.15, 0.2 , 0.51,\n",
|
545 |
+
" 1.1 ], dtype=float32)"
|
546 |
+
]
|
547 |
+
},
|
548 |
+
"execution_count": 24,
|
549 |
+
"metadata": {},
|
550 |
+
"output_type": "execute_result"
|
551 |
+
}
|
552 |
+
],
|
553 |
+
"source": [
|
554 |
+
"predictions.label_ids[0:10]"
|
555 |
+
]
|
556 |
+
},
|
557 |
{
|
558 |
"cell_type": "code",
|
559 |
"execution_count": null,
|
560 |
+
"id": "52252015-e068-414b-bd8a-79a5d1a2beec",
|
561 |
"metadata": {},
|
562 |
"outputs": [],
|
563 |
"source": []
|
03-gene-task/data/.ipynb_checkpoints/protein_stab-checkpoint.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
03-gene-task/data/dna_protein_full.json
ADDED
@@ -0,0 +1,3 @@
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d096f1fcfa22524d216190ef7a65fc3025755c3dc82e167ff4f059562b85f046
|
3 |
+
size 26089081
|
03-gene-task/data/protein_stab.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
03-gene-task/img/.ipynb_checkpoints/dataset-checkpoint.png
ADDED
03-gene-task/img/2_structure.png
ADDED
03-gene-task/img/dataset.png
ADDED
03-gene-task/img/ds_structure.png
ADDED
03-gene-task/img/function.png
ADDED
03-gene-task/img/gpt2-ft.png
ADDED
03-gene-task/img/pdb1.png
ADDED
03-gene-task/img/protein-structure-1-2.png
ADDED
Git LFS Details
|
03-gene-task/img/protein-structure-1.png
ADDED
03-gene-task/img/protein-structure-2.png
ADDED
03-gene-task/img/sequence.png
ADDED
04-gene-sft/.ipynb_checkpoints/1-finetue-intro-checkpoint.ipynb
ADDED
@@ -0,0 +1,254 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "32216f81-0979-4afd-8c8c-16729cd0dab6",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 4.1 模型微调VS指令微调"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "7cd7f9b2-c0a3-48c2-848e-a1e9c7188f03",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"## 一个典型的知乎问题\n",
|
17 |
+
"\n",
|
18 |
+
"### **问题**\n",
|
19 |
+
"\n",
|
20 |
+
"用LLM实现文本二分类,微调base模型还是微调chat模型比较好?[问题](https://www.zhihu.com/question/632473480/answer/38930949853)\n",
|
21 |
+
"\n",
|
22 |
+
"我想用开源LLM(例如chatglm,baichuan)实现文本二分类(比如正负情感分类),有一组训练数据可以用于微调模型,提升分类性能,这时候应该选择base模型还是chat模型?\n",
|
23 |
+
"\n",
|
24 |
+
"\n",
|
25 |
+
"### **回答**\n",
|
26 |
+
"1 如果是使用2分类的header,base模型好一些。\n",
|
27 |
+
"\n",
|
28 |
+
"也就是使用如下类似的的设置。\n",
|
29 |
+
"\n",
|
30 |
+
"model = AutoModelForSequenceClassification.from_pretrained(\n",
|
31 |
+
"\"yuanzhoulvpi/gpt2_chinese\", num_labels=2\n",
|
32 |
+
")\n",
|
33 |
+
"\n",
|
34 |
+
"\n",
|
35 |
+
"\n",
|
36 |
+
"2 如果是把分类问题,改成指令微调的模式,就是像\n",
|
37 |
+
"\n",
|
38 |
+
"```\n",
|
39 |
+
"{\n",
|
40 |
+
"\n",
|
41 |
+
"\"instruction\": \"你现在在做一项情感分类的任务,如果是积极情感,则回答积极。消极情感则回答消极。\"\n",
|
42 |
+
"\"input\":他家的奶茶超级好喝。。。\n",
|
43 |
+
"\"output\":“积极”\n",
|
44 |
+
"\n",
|
45 |
+
"}\n",
|
46 |
+
"```\n",
|
47 |
+
"\n",
|
48 |
+
"然后进行指令微调,lora/peft调整部分参数就行,一般是chat模型比较好。\n",
|
49 |
+
"\n",
|
50 |
+
"\n",
|
51 |
+
"\n",
|
52 |
+
"这种二分类问题,用llm就是大材小用了,一般就是选个小的的模型,用AutoModelForSequenceClassification效果最好,如果追求SOTA,有些研究表明搞成指令微调模式效果可能更好。"
|
53 |
+
]
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"cell_type": "markdown",
|
57 |
+
"id": "2cfcc1e9-ddda-4a1c-871b-0508fd421ed5",
|
58 |
+
"metadata": {},
|
59 |
+
"source": [
|
60 |
+
"## 大模型微调(Fine-tuning)和指令微调(Instruction Tuning)\n",
|
61 |
+
"\n",
|
62 |
+
"普通的大模型微调(Fine-tuning)和指令微调(Instruction Tuning)是两种不同的训练方法,它们适用于不同的应用场景,并且在实现细节上也有所区别。\n",
|
63 |
+
"\n",
|
64 |
+
"\n",
|
65 |
+
"#### 1. **定义**\n",
|
66 |
+
"\n",
|
67 |
+
"普通微调是指在一个预训练好的大模型基础上,针对特定任务添加一个或多个新层(通常称为头部或 header),然后使用特定任务的数据集对整个模型(包括新添加的层)进行再训练的过程。对于分类任务,常见的做法是在 GPT-2 的顶部添加一个分类头。\n",
|
68 |
+
"\n",
|
69 |
+
"#### 2. **具体步骤**\n",
|
70 |
+
"\n",
|
71 |
+
"- **添加分类头**:为 GPT-2 添加一个分类头,该头通常包含线性层(全连接层)以及可能的激活函数和归一化层。\n",
|
72 |
+
" \n",
|
73 |
+
"- **准备数据**:准备好用于微调的任务特定数据集,如文本分类、情感分析等。\n",
|
74 |
+
" \n",
|
75 |
+
"- **微调过程**:\n",
|
76 |
+
" - 使用任务特定的数据集对整个模型(包括预训练权重和新添加的分类头)进行再训练。\n",
|
77 |
+
" - 通常会调整学习率、批量大小等超参数以优化性能。\n",
|
78 |
+
" - 可能只对新添加的层进行训练,或者对整个模型进行微调(取决于资源和需求)。\n",
|
79 |
+
"\n",
|
80 |
+
"#### 3. **适用场景**\n",
|
81 |
+
"\n",
|
82 |
+
"- **任务明确**:当有清晰的任务目标时,例如文本分类、命名实体识别等。\n",
|
83 |
+
"- **标签数据可用**:拥有足够的标注数据来进行监督学习。\n",
|
84 |
+
"\n",
|
85 |
+
"#### 4. **优点**\n",
|
86 |
+
"\n",
|
87 |
+
"- **针对性强**:能够有效地提升模型在特定任务上的表现。\n",
|
88 |
+
"- **资源利用效率高**:相比于从头开始训练,微调需要的计算资源和时间较少。\n",
|
89 |
+
"\n",
|
90 |
+
"#### 5. **缺点**\n",
|
91 |
+
"\n",
|
92 |
+
"- **泛化能力有限**:微调后的模型可能在未见过的任务或领域中表现不佳。\n",
|
93 |
+
"\n",
|
94 |
+
"### 指令微调(Instruction Tuning)\n",
|
95 |
+
"\n",
|
96 |
+
"#### 1. **定义**\n",
|
97 |
+
"\n",
|
98 |
+
"指令微调是一种更为通用的微调方法,它旨在让模型理解和遵循自然语言指令,而不是直接针对某个特定任务进行优化。这种方法通过提供一系列指令-输出对来训练模型,使其学会根据指令生成适当的响应。\n",
|
99 |
+
"\n",
|
100 |
+
"#### 2. **具体步骤**\n",
|
101 |
+
"\n",
|
102 |
+
"- **构造指令数据集**:创建一个包含各种指令及其预期输出的数据集。这些指令可以覆盖多种任务类型,如问答、翻译、摘要生成等。\n",
|
103 |
+
" \n",
|
104 |
+
"- **微调过程**:\n",
|
105 |
+
" - 使用指令数据集对模型进行训练,使模型能够理解并执行不同类型的指令。\n",
|
106 |
+
" - 强调模型对自然语言指令的理解和执行,而非特定于某一任务的优化。\n",
|
107 |
+
"\n",
|
108 |
+
"#### 3. **适用场景**\n",
|
109 |
+
"\n",
|
110 |
+
"- **多任务适应**:当希望模型能够在多种不同类型的任务中表现出色时。\n",
|
111 |
+
"- **少样本学习**:在仅有少量示例的情况下,仍然可以让模型快速适应新任务。\n",
|
112 |
+
"\n",
|
113 |
+
"#### 4. **优点**\n",
|
114 |
+
"\n",
|
115 |
+
"- **灵活性高**:模型可以在没有额外训练的情况下处理新的任务。\n",
|
116 |
+
"- **跨领域泛化能力强**:更有可能在未曾见过的任务或领域中保持良好的性能。\n",
|
117 |
+
"\n",
|
118 |
+
"#### 5. **缺点**\n",
|
119 |
+
"\n",
|
120 |
+
"- **复杂度增加**:指令微调通常涉及更多的训练数据和更复杂的训练过程。\n",
|
121 |
+
"- **评估难度较大**:由于任务的多样性,评估模型性能变得更加困难。\n",
|
122 |
+
"\n",
|
123 |
+
"\n",
|
124 |
+
"### 小结\n",
|
125 |
+
"\n",
|
126 |
+
"普通微调侧重于提高模型在特定任务上的性能,而指令微调则更加注重模型对自然语言指令的理解和执行能力。选择哪种方法取决于你的具体需求和应用场景。如果你有一个明确的任务并且有大量的标注数据,那么普通微调可能是更好的选择;如果你希望模型具有更高的灵活性和跨任务适应能力,则可以考虑指令微调。"
|
127 |
+
]
|
128 |
+
},
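{
"cell_type": "markdown",
"id": "ft-vs-sft-demo-note",
"metadata": {},
"source": [
"下面用一段最小示意代码对比两种微调方式在输入与损失上的差别:普通微调把文本映射到类别标签,损失来自分类头;指令微调把任务写成指令文本,损失来自下一个token的语言建模。示例中的模型名 gpt2 与文本都是占位假设,仅作演示。"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ft-vs-sft-demo-code",
"metadata": {},
"outputs": [],
"source": [
"# 示意:普通微调(分类头) vs 指令微调(因果语言建模)的输入与损失\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"tokenizer.pad_token = tokenizer.eos_token\n",
"\n",
"# 1) 普通微调:文本 -> 类别标签,交叉熵损失作用在分类头上\n",
"cls_model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", num_labels=2)\n",
"cls_model.config.pad_token_id = tokenizer.pad_token_id\n",
"batch = tokenizer([\"他家的奶茶超级好喝\"], return_tensors=\"pt\")\n",
"cls_out = cls_model(**batch, labels=torch.tensor([1]))  # 1 表示“积极”\n",
"print(\"分类微调 loss:\", cls_out.loss.item())\n",
"\n",
"# 2) 指令微调:任务写成指令文本,损失是对整段文本做下一个token预测\n",
"sft_model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
"prompt = \"### Instruction:\\n判断下面句子的情感\\n\\n### Input:\\n他家的奶茶超级好喝\\n\\n### Response:\\n积极\"\n",
"ids = tokenizer(prompt, return_tensors=\"pt\")\n",
"sft_out = sft_model(**ids, labels=ids[\"input_ids\"])\n",
"print(\"指令微调 loss:\", sft_out.loss.item())"
]
},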
|
129 |
+
{
|
130 |
+
"cell_type": "markdown",
|
131 |
+
"id": "6203be53-18a5-447d-9071-32e031934b9c",
|
132 |
+
"metadata": {},
|
133 |
+
"source": [
|
134 |
+
"## 从GPT到chatGPT\n",
|
135 |
+
"\n",
|
136 |
+
"关键点在于指令微调(Instruction Tuning)\n",
|
137 |
+
"* 将所有任务统一为指令形式\n",
|
138 |
+
"* 多任务精调\n",
|
139 |
+
"* 与人类对齐(多样性)\n",
|
140 |
+
"* 进一步分为有监督指令微调和带有人类反馈的强化学习(RLHF)\n",
|
141 |
+
"\n",
|
142 |
+
"告别微调\n",
|
143 |
+
"\n",
|
144 |
+
"因为GPT-3使用了天量级的数据来进行预训练,所以学到的知识也更多更通用,以致于GPT-3打出的口号就是“告别微调的GPT-3”。\n",
|
145 |
+
"\n",
|
146 |
+
"相比于BERT这种预训练+微调的两阶段模型,GPT-3的目标是模型更加通用,从而解决BERT这种下游任务微调需要依赖领域标注数据的情况。\n",
|
147 |
+
"\n",
|
148 |
+
"拿我们实际业务举例,我主要做分本分类任务。对于使用BERT来完成文本分类任务来说,首先我需要使用海量的无标注文本数据进行预训练学习语言学知识。\n",
|
149 |
+
"\n",
|
150 |
+
"幸运的是这种预训练过程一般是一次性的,训练完成后可以把模型保存下来继续使用。很多大厂比如谷歌、Facebook等把得到的预训练模型开源了出来,所以咱们只需要导入预训练好的模型权重就可以直接使用了,相当于完成了模型的预训练过程;第二阶段就是微调了,对于文本分类等下游任务来说, 我们需要一批带标签的训练语料来微调模型。不同的下游任务会需要特定的训练语料。这时候面临的一个最大的问题是训练语料是需要人工标注的,而标注的成本是非常高的。除此之外不同的标注人员因为经验阅历等不同导致对同一条文本的理解也不同,所以容易出现标注不一致的问题。当标注数据量较少时还容易出现模型过拟合。归根结底就是微调是需要标注数据的,而获取标注数据的成本是很高的。\n",
|
151 |
+
"\n",
|
152 |
+
"为了解决这个问题,GPT-3可以让NLPer不用标注训练语料就能很好的完成下游任务,让GPT-3更通用更便利。GPT-3不需要进行微调的结构图如下所示:\n",
|
153 |
+
"\n",
|
154 |
+
"<img src='img/sft.png' width='600px' />"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "markdown",
|
159 |
+
"id": "28e037df-734b-4fe7-ac07-311f1b3a7d7b",
|
160 |
+
"metadata": {},
|
161 |
+
"source": [
|
162 |
+
"## 指令微调数据构建\n",
|
163 |
+
"\n",
|
164 |
+
"<img src='img/sft2.png' width='800px' />\n",
|
165 |
+
"\n",
|
166 |
+
"\n",
|
167 |
+
"\n",
|
168 |
+
"根据典型的分类语料数据,构建指令微调数据\n",
|
169 |
+
"\n",
|
170 |
+
"目前如llama等都使用Alpaca格式\n",
|
171 |
+
"\n",
|
172 |
+
"指令数据当做一般的文本,进行无监督的训练,和预训练流程一致"
|
173 |
+
]
|
174 |
+
},
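{
"cell_type": "markdown",
"id": "alpaca-build-note",
"metadata": {},
"source": [
"下面给出把分类语料 (text, label) 转成 Alpaca 格式指令数据并写入 jsonl 的最小示意,其中的样例句子与文件名都是假设的占位:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "alpaca-build-code",
"metadata": {},
"outputs": [],
"source": [
"# 示意:把典型的分类语料转换为 Alpaca 格式的指令数据\n",
"import json\n",
"\n",
"samples = [(\"他家的奶茶超级好喝\", \"积极\"), (\"排队两小时,体验很差\", \"消极\")]\n",
"\n",
"with open(\"sentiment_alpaca.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
"    for text, label in samples:\n",
"        record = {\n",
"            \"instruction\": \"你现在在做一项情感分类任务,积极情感回答积极,消极情感回答消极。\",\n",
"            \"input\": text,\n",
"            \"output\": label,\n",
"        }\n",
"        f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")"
]
},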
|
175 |
+
{
|
176 |
+
"cell_type": "code",
|
177 |
+
"execution_count": null,
|
178 |
+
"id": "64312191-423f-4a18-aa0c-036374e93fb2",
|
179 |
+
"metadata": {},
|
180 |
+
"outputs": [],
|
181 |
+
"source": [
|
182 |
+
"import subprocess\n",
|
183 |
+
"import os\n",
|
184 |
+
"# 设置环境变量, autodl一般区域\n",
|
185 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
186 |
+
"output = result.stdout\n",
|
187 |
+
"for line in output.splitlines():\n",
|
188 |
+
" if '=' in line:\n",
|
189 |
+
" var, value = line.split('=', 1)\n",
|
190 |
+
" os.environ[var] = value"
|
191 |
+
]
|
192 |
+
},
|
193 |
+
{
|
194 |
+
"cell_type": "code",
|
195 |
+
"execution_count": null,
|
196 |
+
"id": "32c16282-f9f1-4545-b522-daf2b39b4ead",
|
197 |
+
"metadata": {},
|
198 |
+
"outputs": [],
|
199 |
+
"source": [
|
200 |
+
"#原始模型\n",
|
201 |
+
"from transformers import AutoModel\n",
|
202 |
+
"model = AutoModel.from_pretrained(\"gpt2\")\n",
|
203 |
+
"model"
|
204 |
+
]
|
205 |
+
},
|
206 |
+
{
|
207 |
+
"cell_type": "code",
|
208 |
+
"execution_count": null,
|
209 |
+
"id": "1149163f-4d89-472e-8d45-ebcbb5f9575e",
|
210 |
+
"metadata": {},
|
211 |
+
"outputs": [],
|
212 |
+
"source": [
|
213 |
+
"#分类微调模型\n",
|
214 |
+
"from transformers import AutoModelForSequenceClassification\n",
|
215 |
+
"ft_model = AutoModelForSequenceClassification.from_pretrained(\"gpt2\", num_labels=2)\n",
|
216 |
+
"ft_model"
|
217 |
+
]
|
218 |
+
},
|
219 |
+
{
|
220 |
+
"cell_type": "code",
|
221 |
+
"execution_count": 1,
|
222 |
+
"id": "09735059-507c-48c4-893f-ca0da21ce5e8",
|
223 |
+
"metadata": {},
|
224 |
+
"outputs": [],
|
225 |
+
"source": [
|
226 |
+
"#指令微调模型\n",
|
227 |
+
"from transformers import AutoModelForCausalLM\n",
|
228 |
+
"sft_model = AutoModelForMaskedLM.from_pretrained(\"gpt2\")\n",
|
229 |
+
"sft_model"
|
230 |
+
]
|
231 |
+
}
|
232 |
+
],
|
233 |
+
"metadata": {
|
234 |
+
"kernelspec": {
|
235 |
+
"display_name": "Python 3 (ipykernel)",
|
236 |
+
"language": "python",
|
237 |
+
"name": "python3"
|
238 |
+
},
|
239 |
+
"language_info": {
|
240 |
+
"codemirror_mode": {
|
241 |
+
"name": "ipython",
|
242 |
+
"version": 3
|
243 |
+
},
|
244 |
+
"file_extension": ".py",
|
245 |
+
"mimetype": "text/x-python",
|
246 |
+
"name": "python",
|
247 |
+
"nbconvert_exporter": "python",
|
248 |
+
"pygments_lexer": "ipython3",
|
249 |
+
"version": "3.12.3"
|
250 |
+
}
|
251 |
+
},
|
252 |
+
"nbformat": 4,
|
253 |
+
"nbformat_minor": 5
|
254 |
+
}
|
04-gene-sft/.ipynb_checkpoints/2-gpt2-instruction-ft-checkpoint.ipynb
ADDED
@@ -0,0 +1,498 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "c2e6b786-d18e-4a0d-aa59-0792dcb49c5f",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 4.2 基于GPT2的指令微调"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
+
"id": "dc04d5e3-7623-4d59-9f3b-ad03e339db11",
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"from datasets import load_dataset\n",
|
19 |
+
"# 1. load ~11k samples from promoters prediction dataset\n",
|
20 |
+
"dataset = load_dataset(\"dnagpt/dna_promoter_300\")\n",
|
21 |
+
"dataset"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": null,
|
27 |
+
"id": "93d09d8d-f521-49f7-b0e0-7ac089dfbf49",
|
28 |
+
"metadata": {},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"def build_prompt(example):\n",
|
32 |
+
" if int(example['label']) == 1:\n",
|
33 |
+
" label = 'promoter'\n",
|
34 |
+
" else:\n",
|
35 |
+
" label = 'Non-promoter'\n",
|
36 |
+
"\n",
|
37 |
+
" instruction = \"Determine core promoter detection of following dna sequence, The result will be one of the following: Non-promoter, promoter.\"\n",
|
38 |
+
" \n",
|
39 |
+
" input = example[\"sequence\"]\n",
|
40 |
+
" input_text = f\"\\n\\n### Input:\\n{input}\"\n",
|
41 |
+
"\n",
|
42 |
+
"\n",
|
43 |
+
" output = label\n",
|
44 |
+
"\n",
|
45 |
+
" prompt = {\"instruction\":instruction, \n",
|
46 |
+
" \"input\":input,\n",
|
47 |
+
" \"output\":output\n",
|
48 |
+
" }\n",
|
49 |
+
"\n",
|
50 |
+
" return prompt"
|
51 |
+
]
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"cell_type": "code",
|
55 |
+
"execution_count": null,
|
56 |
+
"id": "9f9c0e5a-6591-47ac-b358-d746a00dfc0a",
|
57 |
+
"metadata": {},
|
58 |
+
"outputs": [],
|
59 |
+
"source": [
|
60 |
+
"example = dna_dataset[\"train\"][0]\n",
|
61 |
+
"print(build_prompt(example))"
|
62 |
+
]
|
63 |
+
},
|
64 |
+
{
|
65 |
+
"cell_type": "code",
|
66 |
+
"execution_count": null,
|
67 |
+
"id": "83070a23-1604-4d28-b371-e01060331ed5",
|
68 |
+
"metadata": {},
|
69 |
+
"outputs": [],
|
70 |
+
"source": [
|
71 |
+
"import json\n",
|
72 |
+
"ins_file = open(\"data/dna_promoter_300.jsonl\", \"w\")\n",
|
73 |
+
"ins_list = []\n",
|
74 |
+
"for ins in dna_dataset[\"train\"]:\n",
|
75 |
+
" if ins[\"sequence\"]==\"sequence\":\n",
|
76 |
+
" continue\n",
|
77 |
+
" ins = build_prompt(ins)\n",
|
78 |
+
" ins_file.write(json.dumps(ins)+\"\\n\")\n",
|
79 |
+
" ins_list.append(ins)\n",
|
80 |
+
"ins_file.close()"
|
81 |
+
]
|
82 |
+
},
|
83 |
+
{
|
84 |
+
"cell_type": "code",
|
85 |
+
"execution_count": null,
|
86 |
+
"id": "89fb8ed3-aa58-462f-b2a6-ce445c597a33",
|
87 |
+
"metadata": {},
|
88 |
+
"outputs": [],
|
89 |
+
"source": [
|
90 |
+
"dna_ft_dataset = load_dataset(\"json\", data_files='data/dna_promoter_300.jsonl')\n",
|
91 |
+
"dna_ft_dataset"
|
92 |
+
]
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"cell_type": "code",
|
96 |
+
"execution_count": null,
|
97 |
+
"id": "e4f7b75f-6ccb-4fda-8004-40df7d52678f",
|
98 |
+
"metadata": {},
|
99 |
+
"outputs": [],
|
100 |
+
"source": [
|
101 |
+
"data = dna_ft_dataset[\"train\"].train_test_split(train_size=0.9, seed=42)\n",
|
102 |
+
"data"
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": null,
|
108 |
+
"id": "36d9ee0e-8423-4529-aa7e-fda2728fab2f",
|
109 |
+
"metadata": {},
|
110 |
+
"outputs": [],
|
111 |
+
"source": [
|
112 |
+
"# 初始化tokenizer\n",
|
113 |
+
"from datasets import load_dataset\n",
|
114 |
+
"from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig\n",
|
115 |
+
"from transformers import GPT2Tokenizer,GPT2Model,AutoModel\n",
|
116 |
+
"from transformers import DataCollatorForLanguageModeling\n",
|
117 |
+
"from transformers import Trainer, TrainingArguments\n",
|
118 |
+
"from tokenizers import Tokenizer\n",
|
119 |
+
"from transformers import GPT2TokenizerFast\n",
|
120 |
+
"\n",
|
121 |
+
"tokenizer = GPT2Tokenizer.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")\n",
|
122 |
+
"tokenizer.pad_token = tokenizer.eos_token"
|
123 |
+
]
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"cell_type": "code",
|
127 |
+
"execution_count": null,
|
128 |
+
"id": "871baee0-f06f-4422-a741-af533f7d92e1",
|
129 |
+
"metadata": {},
|
130 |
+
"outputs": [],
|
131 |
+
"source": [
|
132 |
+
"#构建提示词\n",
|
133 |
+
"def format_input(entry):\n",
|
134 |
+
" instruction_text = (\n",
|
135 |
+
" f\"Below is an instruction that describes a task. \"\n",
|
136 |
+
" f\"Write a response that appropriately completes the request.\"\n",
|
137 |
+
" f\"\\n\\n### Instruction:\\n{entry['instruction']}\"\n",
|
138 |
+
" )\n",
|
139 |
+
"\n",
|
140 |
+
" input_text = f\"\\n\\n### Input:\\n{entry['input']}\" if entry[\"input\"] else \"\"\n",
|
141 |
+
"\n",
|
142 |
+
" return instruction_text + input_text + \"\\n\\n### Response:\\n\"\n",
|
143 |
+
"\n",
|
144 |
+
"#构建提示词\n",
|
145 |
+
"def build_prompt(entry):\n",
|
146 |
+
"\n",
|
147 |
+
" input_data = format_input(entry)\n",
|
148 |
+
"\n",
|
149 |
+
" desired_response = entry['output']\n",
|
150 |
+
"\n",
|
151 |
+
" return input_data + desired_response"
|
152 |
+
]
|
153 |
+
},
|
154 |
+
{
|
155 |
+
"cell_type": "code",
|
156 |
+
"execution_count": null,
|
157 |
+
"id": "bca1c275-cc3d-43df-923e-e6604d584226",
|
158 |
+
"metadata": {},
|
159 |
+
"outputs": [],
|
160 |
+
"source": [
|
161 |
+
"example = data[\"test\"][0]\n",
|
162 |
+
"example"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"cell_type": "code",
|
167 |
+
"execution_count": null,
|
168 |
+
"id": "76f2e027-0a31-4919-bb7e-404c786e1599",
|
169 |
+
"metadata": {},
|
170 |
+
"outputs": [],
|
171 |
+
"source": [
|
172 |
+
"prompt = build_prompt(example)\n",
|
173 |
+
"print(prompt)"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"cell_type": "code",
|
178 |
+
"execution_count": null,
|
179 |
+
"id": "932b54ca-7e27-47cd-b67d-7ef8386b6608",
|
180 |
+
"metadata": {},
|
181 |
+
"outputs": [],
|
182 |
+
"source": [
|
183 |
+
"print('tokens: ', ' '.join(tokenizer.tokenize(prompt)))"
|
184 |
+
]
|
185 |
+
},
|
186 |
+
{
|
187 |
+
"cell_type": "code",
|
188 |
+
"execution_count": null,
|
189 |
+
"id": "26671faf-68d0-4a44-978e-e1a24e86c9b1",
|
190 |
+
"metadata": {},
|
191 |
+
"outputs": [],
|
192 |
+
"source": [
|
193 |
+
"def tokenize_function(example):\n",
|
194 |
+
" prompt = build_prompt(example)\n",
|
195 |
+
" result = tokenizer(prompt, padding='max_length', truncation=True, max_length=1024) # max_length=1024\n",
|
196 |
+
" return result\n",
|
197 |
+
"\n",
|
198 |
+
"\n",
|
199 |
+
"# Use batched=False for easy\n",
|
200 |
+
"tokenized_datasets = data.map(\n",
|
201 |
+
" tokenize_function, batched=False,remove_columns=['instruction', 'input', 'output']\n",
|
202 |
+
")\n",
|
203 |
+
"tokenized_datasets"
|
204 |
+
]
|
205 |
+
},
|
206 |
+
{
|
207 |
+
"cell_type": "code",
|
208 |
+
"execution_count": null,
|
209 |
+
"id": "3d46c8b1-9fb3-431a-87ea-c278468543e7",
|
210 |
+
"metadata": {},
|
211 |
+
"outputs": [],
|
212 |
+
"source": [
|
213 |
+
"tokenized_datasets[\"train\"]"
|
214 |
+
]
|
215 |
+
},
|
216 |
+
{
|
217 |
+
"cell_type": "code",
|
218 |
+
"execution_count": null,
|
219 |
+
"id": "26985c81-4335-4ac0-9a5a-84a5b4f2d0e4",
|
220 |
+
"metadata": {},
|
221 |
+
"outputs": [],
|
222 |
+
"source": [
|
223 |
+
"# 创建DataCollator\n",
|
224 |
+
"data_collator = DataCollatorForLanguageModeling(\n",
|
225 |
+
" tokenizer=tokenizer,\n",
|
226 |
+
" mlm=False, # 因为GPT2是自回归模型,不需要MLM\n",
|
227 |
+
")"
|
228 |
+
]
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"cell_type": "code",
|
232 |
+
"execution_count": null,
|
233 |
+
"id": "e18d3095-d6dd-423b-84fb-dca4a629d450",
|
234 |
+
"metadata": {},
|
235 |
+
"outputs": [],
|
236 |
+
"source": [
|
237 |
+
"model = GPT2LMHeadModel.from_pretrained(\"dnagpt/gene_eng_gpt2_v0\")"
|
238 |
+
]
|
239 |
+
},
|
240 |
+
{
|
241 |
+
"cell_type": "code",
|
242 |
+
"execution_count": null,
|
243 |
+
"id": "12134cf2-676a-4176-a733-35caab2fd520",
|
244 |
+
"metadata": {},
|
245 |
+
"outputs": [],
|
246 |
+
"source": [
|
247 |
+
"def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=1000):\n",
|
248 |
+
" # Tokenize\n",
|
249 |
+
" input_ids = tokenizer.encode(\n",
|
250 |
+
" text,\n",
|
251 |
+
" return_tensors=\"pt\",\n",
|
252 |
+
" truncation=True,\n",
|
253 |
+
" max_length=max_input_tokens\n",
|
254 |
+
" # return_attention_mask=True,\n",
|
255 |
+
" )\n",
|
256 |
+
"\n",
|
257 |
+
" # Generate\n",
|
258 |
+
" device = model.device\n",
|
259 |
+
" generated_tokens_with_prompt = model.generate(\n",
|
260 |
+
" input_ids=input_ids.to(device),\n",
|
261 |
+
" #max_length=max_output_tokens,\n",
|
262 |
+
" max_new_tokens=5,\n",
|
263 |
+
" )\n",
|
264 |
+
"\n",
|
265 |
+
" # Decode\n",
|
266 |
+
" #generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)\n",
|
267 |
+
" # Strip the prompt\n",
|
268 |
+
" #generated_text_answer = generated_text_with_prompt[0][len(text):]\n",
|
269 |
+
" \n",
|
270 |
+
" generated_text_with_prompt = tokenizer.decode(generated_tokens_with_prompt[0], skip_special_tokens=True)\n",
|
271 |
+
" generated_text_answer = generated_text_with_prompt[len(text):]\n",
|
272 |
+
"\n",
|
273 |
+
"\n",
|
274 |
+
" return generated_text_answer\n",
|
275 |
+
"\n",
|
276 |
+
"# 如果需要进一步清理\n",
|
277 |
+
"def clean_generated_text(text):\n",
|
278 |
+
" # 去除 'Ġ' 符号并替换为空格\n",
|
279 |
+
" text = text.replace('Ġ', ' ')\n",
|
280 |
+
" # 去除多余的空格\n",
|
281 |
+
" text = ' '.join(text.split())\n",
|
282 |
+
" return text"
|
283 |
+
]
|
284 |
+
},
|
285 |
+
{
|
286 |
+
"cell_type": "code",
|
287 |
+
"execution_count": null,
|
288 |
+
"id": "b9a2e2a9-a1ff-44b0-a550-623a16d0d7a2",
|
289 |
+
"metadata": {},
|
290 |
+
"outputs": [],
|
291 |
+
"source": [
|
292 |
+
"input_text = format_input(data[\"test\"][0])\n",
|
293 |
+
"\n",
|
294 |
+
"print(\"input (test):\", input_text)\n",
|
295 |
+
"\n",
|
296 |
+
"print(\"--------------------------\\n\")\n",
|
297 |
+
"\n",
|
298 |
+
"print(\"model's answer: \\n\")\n",
|
299 |
+
"print(inference(input_text, model, tokenizer))"
|
300 |
+
]
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"cell_type": "code",
|
304 |
+
"execution_count": null,
|
305 |
+
"id": "63b54fe2-f077-4ca8-974e-1bcc41ce57d6",
|
306 |
+
"metadata": {},
|
307 |
+
"outputs": [],
|
308 |
+
"source": [
|
309 |
+
"training_args = TrainingArguments(\n",
|
310 |
+
" output_dir='./results_small',\n",
|
311 |
+
" overwrite_output_dir=True,\n",
|
312 |
+
" num_train_epochs=3,\n",
|
313 |
+
" per_device_train_batch_size=8,\n",
|
314 |
+
" save_steps=2000,\n",
|
315 |
+
" save_total_limit=2,\n",
|
316 |
+
" prediction_loss_only=True,\n",
|
317 |
+
" fp16=True, #v100没法用\n",
|
318 |
+
" )"
|
319 |
+
]
|
320 |
+
},
|
321 |
+
{
|
322 |
+
"cell_type": "code",
|
323 |
+
"execution_count": null,
|
324 |
+
"id": "61df123d-e67d-4548-998a-de1e2781e774",
|
325 |
+
"metadata": {},
|
326 |
+
"outputs": [],
|
327 |
+
"source": [
|
328 |
+
"# 初始化Trainer\n",
|
329 |
+
"trainer = Trainer(\n",
|
330 |
+
" model=model,\n",
|
331 |
+
" args=training_args,\n",
|
332 |
+
" train_dataset=tokenized_datasets['train'],\n",
|
333 |
+
" eval_dataset=tokenized_datasets['test'],\n",
|
334 |
+
" data_collator=data_collator\n",
|
335 |
+
")"
|
336 |
+
]
|
337 |
+
},
|
338 |
+
{
|
339 |
+
"cell_type": "code",
|
340 |
+
"execution_count": null,
|
341 |
+
"id": "a9cd936a-5ea6-43e3-9848-27080f818606",
|
342 |
+
"metadata": {},
|
343 |
+
"outputs": [],
|
344 |
+
"source": [
|
345 |
+
"# 开始训练\n",
|
346 |
+
"trainer.train()"
|
347 |
+
]
|
348 |
+
},
|
349 |
+
{
|
350 |
+
"cell_type": "code",
|
351 |
+
"execution_count": null,
|
352 |
+
"id": "315aae76-44b4-4513-8139-40ef22934873",
|
353 |
+
"metadata": {},
|
354 |
+
"outputs": [],
|
355 |
+
"source": [
|
356 |
+
"save_dir = 'gpt_ft/final'\n",
|
357 |
+
"trainer.save_model(save_dir)\n",
|
358 |
+
"print(\"Saved model to:\", save_dir)"
|
359 |
+
]
|
360 |
+
},
|
361 |
+
{
|
362 |
+
"cell_type": "code",
|
363 |
+
"execution_count": null,
|
364 |
+
"id": "28d2dbbc-02ff-4120-b230-b19905a786cd",
|
365 |
+
"metadata": {},
|
366 |
+
"outputs": [],
|
367 |
+
"source": [
|
368 |
+
"ave_dir = 'gpt_ft/final'\n",
|
369 |
+
"finetuned_model = GPT2LMHeadModel.from_pretrained(save_dir, local_files_only=True)"
|
370 |
+
]
|
371 |
+
},
|
372 |
+
{
|
373 |
+
"cell_type": "code",
|
374 |
+
"execution_count": null,
|
375 |
+
"id": "08987c3c-063a-4e9b-9ebb-e637b0b5bccd",
|
376 |
+
"metadata": {},
|
377 |
+
"outputs": [],
|
378 |
+
"source": [
|
379 |
+
"finetuned_model"
|
380 |
+
]
|
381 |
+
},
|
382 |
+
{
|
383 |
+
"cell_type": "code",
|
384 |
+
"execution_count": null,
|
385 |
+
"id": "d75010e8-6d6a-40ef-852e-0d705adc3da8",
|
386 |
+
"metadata": {},
|
387 |
+
"outputs": [],
|
388 |
+
"source": [
|
389 |
+
"print(\"input (test):\", input_text)\n",
|
390 |
+
"\n",
|
391 |
+
"print(\"--------------------------\\n\")\n",
|
392 |
+
"\n",
|
393 |
+
"print(\"model's answer: \\n\")\n",
|
394 |
+
"print(inference(input_text, finetuned_model, tokenizer))\n",
|
395 |
+
"\n",
|
396 |
+
"print(\"--------------------------\\n\")\n",
|
397 |
+
"print(\"real answer: \\n\")\n",
|
398 |
+
"print(data[\"test\"][0][\"output\"])"
|
399 |
+
]
|
400 |
+
},
|
401 |
+
{
|
402 |
+
"cell_type": "code",
|
403 |
+
"execution_count": null,
|
404 |
+
"id": "64365e15-510e-4abf-92f5-c78b660b37dc",
|
405 |
+
"metadata": {},
|
406 |
+
"outputs": [],
|
407 |
+
"source": [
|
408 |
+
"test_data = data[\"test\"].select(range(100))\n",
|
409 |
+
"\n",
|
410 |
+
"data_list = []\n",
|
411 |
+
"\n",
|
412 |
+
"for entry in test_data:\n",
|
413 |
+
" input_text = format_input(entry)\n",
|
414 |
+
" #print(input_text)\n",
|
415 |
+
" response_text = inference(input_text, finetuned_model, tokenizer)\n",
|
416 |
+
" #print(response_text)\n",
|
417 |
+
" data = {\n",
|
418 |
+
" \"instruction\":entry[\"instruction\"],\n",
|
419 |
+
" \"input\":entry[\"input\"],\n",
|
420 |
+
" \"output\":entry[\"output\"],\n",
|
421 |
+
" \"model_response\":response_text\n",
|
422 |
+
" }\n",
|
423 |
+
"\n",
|
424 |
+
" data_list.append(data)"
|
425 |
+
]
|
426 |
+
},
|
427 |
+
{
|
428 |
+
"cell_type": "code",
|
429 |
+
"execution_count": null,
|
430 |
+
"id": "a45fb780-fc3f-401c-b6e0-6f7d0c1682de",
|
431 |
+
"metadata": {},
|
432 |
+
"outputs": [],
|
433 |
+
"source": [
|
434 |
+
"import json\n",
|
435 |
+
"\n",
|
436 |
+
"# 定义输出文件路径\n",
|
437 |
+
"output_file = 'gpt2-small3-1024.json'\n",
|
438 |
+
"\n",
|
439 |
+
"# 将 Dataset 对象导出为 JSON 文件\n",
|
440 |
+
"# test_data.to_json(output_file)\n",
|
441 |
+
"with open(output_file, \"w\") as file:\n",
|
442 |
+
" json.dump(data_list, file, indent=4) # \"indent\" for pretty-printing"
|
443 |
+
]
|
444 |
+
},
|
445 |
+
{
|
446 |
+
"cell_type": "code",
|
447 |
+
"execution_count": null,
|
448 |
+
"id": "a83c8881-c763-4bba-8b85-584a6722a38e",
|
449 |
+
"metadata": {},
|
450 |
+
"outputs": [],
|
451 |
+
"source": [
|
452 |
+
"import json\n",
|
453 |
+
"\n",
|
454 |
+
"\n",
|
455 |
+
"\n",
|
456 |
+
"with open(output_file, \"r\") as file:\n",
|
457 |
+
" test_data = json.load(file)\n",
|
458 |
+
"\n",
|
459 |
+
"all_num = len(test_data)\n",
|
460 |
+
"right_sum = 0\n",
|
461 |
+
"same_sum = 0\n",
|
462 |
+
"for item in test_data:\n",
|
463 |
+
" output = item[\"output\"]\n",
|
464 |
+
" #output = \" \".join(tokenizer.tokenize(output))\n",
|
465 |
+
" model_response = item[\"model_response\"]\n",
|
466 |
+
" if model_response == output: #same it\n",
|
467 |
+
" same_sum = same_sum + 1\n",
|
468 |
+
" \n",
|
469 |
+
" if model_response.find(output)!=-1: #find it\n",
|
470 |
+
" right_sum = right_sum + 1\n",
|
471 |
+
"\n",
|
472 |
+
"\n",
|
473 |
+
"print(\"presicion\", right_sum/all_num, \"same\", same_sum/all_num)"
|
474 |
+
]
|
475 |
+
}
|
476 |
+
],
|
477 |
+
"metadata": {
|
478 |
+
"kernelspec": {
|
479 |
+
"display_name": "Python 3 (ipykernel)",
|
480 |
+
"language": "python",
|
481 |
+
"name": "python3"
|
482 |
+
},
|
483 |
+
"language_info": {
|
484 |
+
"codemirror_mode": {
|
485 |
+
"name": "ipython",
|
486 |
+
"version": 3
|
487 |
+
},
|
488 |
+
"file_extension": ".py",
|
489 |
+
"mimetype": "text/x-python",
|
490 |
+
"name": "python",
|
491 |
+
"nbconvert_exporter": "python",
|
492 |
+
"pygments_lexer": "ipython3",
|
493 |
+
"version": "3.12.3"
|
494 |
+
}
|
495 |
+
},
|
496 |
+
"nbformat": 4,
|
497 |
+
"nbformat_minor": 5
|
498 |
+
}
|
04-gene-sft/.ipynb_checkpoints/3-llama-expand-dict-checkpoint.ipynb
ADDED
@@ -0,0 +1,272 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "6bad311a-c949-4246-9e6b-6d4ec76699b7",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 4.3 基于llama的基因数据词典扩充"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "d42860cf-14fc-48f5-ac6c-1fd92a6a92ba",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"前面介绍了huggingface自带的分词器构建代码,这里介绍下更为通用的sentencepiece,部分huggingface其实就是来自于这个框架。\n",
|
17 |
+
"\n",
|
18 |
+
"SentencePiece 是一个语言无关的分词框架,由 Google 开发并开源。它不同于传统的基于词汇表(如词典)的分词方法,而是采用一种无监督的学习方式来训练模型,从而将文本分割成“子词”单元(subword units)。这种方法使得 SentencePiece 在处理未知词、罕见词以及多语言文本时表现出色。\n",
|
19 |
+
"\n",
|
20 |
+
"### 主要特点\n",
|
21 |
+
"\n",
|
22 |
+
"1. **语言无关**:\n",
|
23 |
+
" - SentencePiece 不依赖于任何特定语言的规则或词典,因此它可以应用于任何语言,甚至是混合语言的文本。\n",
|
24 |
+
"\n",
|
25 |
+
"2. **子词分词**:\n",
|
26 |
+
" - 它生成的是子词级别的 token,而不是完整的单词。这种方式可以有效地处理 OOV (out-of-vocabulary) 问题,并且有助于减少词汇表的大小。\n",
|
27 |
+
"\n",
|
28 |
+
"3. **无监督学习**:\n",
|
29 |
+
" - SentencePiece 使用无监督的方法从原始文本中学习分词规则,这意味着你只需要提供未标注的文本数据即可训练分词模型。\n",
|
30 |
+
"\n",
|
31 |
+
"4. **灵活的分词粒度**:\n",
|
32 |
+
" - 可以通过调整参数控制分词的粒度,即生成的子词单元的平均长度。这允许根据具体应用需求优化性能。\n",
|
33 |
+
"\n",
|
34 |
+
"5. **支持 BPE 和 Unigram LM**:\n",
|
35 |
+
" - SentencePiece 实现了两种流行的分词算法:Byte Pair Encoding (BPE) 和 Unigram Language Model (Unigram LM)。这两种方法各有优劣,可以根据任务选择合适的一种。\n",
|
36 |
+
"\n",
|
37 |
+
"6. **易于集成**:\n",
|
38 |
+
" - 提供了多种编程语言的绑定,包括 Python、C++、Go 等,方便在不同环境中使用。\n",
|
39 |
+
"\n",
|
40 |
+
"### 工作流程\n",
|
41 |
+
"\n",
|
42 |
+
"1. **准备语料库**:\n",
|
43 |
+
" - 收集用于训练分词模型的未标注文本数据。\n",
|
44 |
+
"\n",
|
45 |
+
"2. **训练模型**:\n",
|
46 |
+
" - 使用 `sentencepiece_trainer` 工具对收集到的文本进行训练,生成分词模型文件。\n",
|
47 |
+
" ```bash\n",
|
48 |
+
" spm_train --input=your_corpus.txt --model_prefix=myprefix --vocab_size=8000\n",
|
49 |
+
" ```\n",
|
50 |
+
"\n",
|
51 |
+
"3. **编码和解码**:\n",
|
52 |
+
" - 训练完成后,可以使用生成的模型对新文本进行编码(分词)和解码(还原)。\n",
|
53 |
+
" ```python\n",
|
54 |
+
" import sentencepiece as spm\n",
|
55 |
+
"\n",
|
56 |
+
" # 加载训练好的模型\n",
|
57 |
+
" sp = spm.SentencePieceProcessor(model_file='myprefix.model')\n",
|
58 |
+
"\n",
|
59 |
+
" # 分词\n",
|
60 |
+
" encoded = sp.encode(\"这是一个测试句子。\", out_type=str)\n",
|
61 |
+
" print(encoded)\n",
|
62 |
+
"\n",
|
63 |
+
" # 还原\n",
|
64 |
+
" decoded = sp.decode(encoded)\n",
|
65 |
+
" print(decoded)\n",
|
66 |
+
" ```\n",
|
67 |
+
"\n",
|
68 |
+
"### 应用场景\n",
|
69 |
+
"\n",
|
70 |
+
"- **自然语言处理 (NLP)**:广泛应用于各种 NLP 任务,如机器翻译、文本分类、情感分析等。\n",
|
71 |
+
"- **多语言支持**:特别适合处理包含多种语言的文本。\n",
|
72 |
+
"- **低资源语言**:对于那些缺乏丰富词汇资源的语言尤其有用。\n",
|
73 |
+
"- **预训练语言模型**:许多现代预训练语言模型(如 BERT、T5、mBART)都采用了 SentencePiece 作为其分词工具。\n",
|
74 |
+
"\n",
|
75 |
+
"### 小结\n",
|
76 |
+
"\n",
|
77 |
+
"SentencePiece 是一个强大而灵活的分词框架,适用于广泛的文本处理任务。它的无监督学习特性、语言无关性和高效的子词分词能力使其成为处理复杂和多样化文本数据的理想选择。希望这个简单的介绍能帮助你理解 SentencePiece 的基本概念和应用场景。如果有更多问题或需要进一步的帮助,请随时提问!"
|
78 |
+
]
|
79 |
+
},
|
80 |
+
{
|
81 |
+
"cell_type": "markdown",
|
82 |
+
"id": "a8dedb50-a428-4146-8edf-84e699abf81b",
|
83 |
+
"metadata": {},
|
84 |
+
"source": [
|
85 |
+
"## GENE分词器构建"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": null,
|
91 |
+
"id": "39b5bf12-eaf0-432e-a2b0-99ba437daf3e",
|
92 |
+
"metadata": {},
|
93 |
+
"outputs": [],
|
94 |
+
"source": [
|
95 |
+
"!pip install sentencepiece"
|
96 |
+
]
|
97 |
+
},
|
98 |
+
{
|
99 |
+
"cell_type": "code",
|
100 |
+
"execution_count": null,
|
101 |
+
"id": "3b732b8e-53d1-4bfa-891b-2d63b886cc4a",
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [],
|
104 |
+
"source": [
|
105 |
+
"import sentencepiece as spm\n",
|
106 |
+
"\n",
|
107 |
+
"spm.SentencePieceTrainer.train(input='../01-data_env/data/dna_1g.txt,../01-data_env/data/protein_1g.txt',\n",
|
108 |
+
" model_prefix='gene_bpe_seg', \n",
|
109 |
+
" vocab_size=60000,\n",
|
110 |
+
" model_type='bpe', #默认是unigram\n",
|
111 |
+
" num_threads=10,\n",
|
112 |
+
" )"
|
113 |
+
]
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"cell_type": "code",
|
117 |
+
"execution_count": null,
|
118 |
+
"id": "19a06b82-31b8-48cb-9c83-ec016da2da8a",
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"from sentencepiece import SentencePieceProcessor\n",
|
123 |
+
"model_path = \"gene_bpe_seg.model\"\n",
|
124 |
+
"sp_model = SentencePieceProcessor(model_file=model_path)\n",
|
125 |
+
"mm = sp_model.EncodeAsPieces(\"TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV\")\n",
|
126 |
+
"print(mm)"
|
127 |
+
]
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"cell_type": "markdown",
|
131 |
+
"id": "958f7bd6-060f-48f4-8afe-02c3032312eb",
|
132 |
+
"metadata": {},
|
133 |
+
"source": [
|
134 |
+
"## 合并词典到llama\n",
|
135 |
+
"\n",
|
136 |
+
"我们以基础版本的llama为例,进行合并,请注意llama的使用限制。\n",
|
137 |
+
"\n",
|
138 |
+
"新版本的llama需要自行认证下载。[链接](https://huggingface.co/meta-llama)\n",
|
139 |
+
"\n",
|
140 |
+
"```\n",
|
141 |
+
"#建议在终端下执行\n",
|
142 |
+
"pip install -U huggingface_hub\n",
|
143 |
+
"export HF_ENDPOINT=https://hf-mirror.com\n",
|
144 |
+
"huggingface-cli download --resume-download yahma/llama-7b-hf --local-dir llama-7b-hf\n",
|
145 |
+
"```"
|
146 |
+
]
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"cell_type": "code",
|
150 |
+
"execution_count": null,
|
151 |
+
"id": "3bafcc33-2923-4026-bc39-c6ec716d2e3c",
|
152 |
+
"metadata": {},
|
153 |
+
"outputs": [],
|
154 |
+
"source": [
|
155 |
+
"import os\n",
|
156 |
+
"os.environ[\"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION\"]=\"python\"\n",
|
157 |
+
"from transformers import LlamaTokenizer\n",
|
158 |
+
"from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model\n",
|
159 |
+
"import sentencepiece as spm"
|
160 |
+
]
|
161 |
+
},
|
162 |
+
{
|
163 |
+
"cell_type": "code",
|
164 |
+
"execution_count": null,
|
165 |
+
"id": "66cb86ed-3225-4bb0-8aca-6005bc918d03",
|
166 |
+
"metadata": {},
|
167 |
+
"outputs": [],
|
168 |
+
"source": [
|
169 |
+
"llama_tokenizer_dir = \"llama-7b-hf\" \n",
|
170 |
+
"dna_sp_model_file = \"gene_bpe_seg.model\"\n",
|
171 |
+
"\n",
|
172 |
+
"# load\n",
|
173 |
+
"llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)\n",
|
174 |
+
"dna_sp_model = spm.SentencePieceProcessor()\n",
|
175 |
+
"dna_sp_model.Load(dna_sp_model_file)\n",
|
176 |
+
"\n",
|
177 |
+
"llama_spm = sp_pb2_model.ModelProto()\n",
|
178 |
+
"llama_spm.ParseFromString(llama_tokenizer.sp_model.serialized_model_proto())\n",
|
179 |
+
"dna_spm = sp_pb2_model.ModelProto()\n",
|
180 |
+
"dna_spm.ParseFromString(dna_sp_model.serialized_model_proto())\n",
|
181 |
+
"\n",
|
182 |
+
"# print number of tokens\n",
|
183 |
+
"print(len(llama_tokenizer),len(dna_sp_model))\n",
|
184 |
+
"print(llama_tokenizer.all_special_tokens)\n",
|
185 |
+
"print(llama_tokenizer.all_special_ids)\n",
|
186 |
+
"print(llama_tokenizer.special_tokens_map)"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": null,
|
192 |
+
"id": "7ba4240e-bc08-4be0-8ca3-c4e7a47fa055",
|
193 |
+
"metadata": {},
|
194 |
+
"outputs": [],
|
195 |
+
"source": [
|
196 |
+
"## Add dna tokens to LLaMA tokenizer\n",
|
197 |
+
"llama_spm_tokens_set=set(p.piece for p in llama_spm.pieces)\n",
|
198 |
+
"print(len(llama_spm_tokens_set))\n",
|
199 |
+
"print(f\"Before:{len(llama_spm_tokens_set)}\")\n",
|
200 |
+
"for p in dna_spm.pieces:\n",
|
201 |
+
" piece = p.piece\n",
|
202 |
+
" score = p.score\n",
|
203 |
+
" if piece not in llama_spm_tokens_set:\n",
|
204 |
+
" new_p = sp_pb2_model.ModelProto().SentencePiece()\n",
|
205 |
+
" new_p.piece = piece\n",
|
206 |
+
" new_p.score = score # 0?\n",
|
207 |
+
" llama_spm.pieces.append(new_p)\n",
|
208 |
+
"print(f\"New model pieces: {len(llama_spm.pieces)}\")"
|
209 |
+
]
|
210 |
+
},
|
211 |
+
{
|
212 |
+
"cell_type": "code",
|
213 |
+
"execution_count": null,
|
214 |
+
"id": "a240a7d8-c1a9-4473-a5c5-157a25f97c16",
|
215 |
+
"metadata": {},
|
216 |
+
"outputs": [],
|
217 |
+
"source": [
|
218 |
+
"## Save\n",
|
219 |
+
"output_sp_dir = 'merged_gene_eng_tokenizer_sp'\n",
|
220 |
+
"output_hf_dir = 'merged_gene_eng_tokenizer_hf' # the path to save dna-LLaMA tokenizer\n",
|
221 |
+
"os.makedirs(output_sp_dir,exist_ok=True)\n",
|
222 |
+
"with open(output_sp_dir+'/gene_eng_llama_tokenizer.model', 'wb') as f:\n",
|
223 |
+
" f.write(llama_spm.SerializeToString())\n",
|
224 |
+
"\n",
|
225 |
+
"tokenizer = LlamaTokenizer(vocab_file=output_sp_dir+'/gene_eng_llama_tokenizer.model')\n",
|
226 |
+
"tokenizer.save_pretrained(output_hf_dir)\n",
|
227 |
+
"print(f\"gene-LLaMA tokenizer has been saved to {output_hf_dir}\")"
|
228 |
+
]
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"cell_type": "code",
|
232 |
+
"execution_count": null,
|
233 |
+
"id": "cbd1f648-f8a0-4f16-b516-2ce3e7c7cfee",
|
234 |
+
"metadata": {},
|
235 |
+
"outputs": [],
|
236 |
+
"source": [
|
237 |
+
"# Test\n",
|
238 |
+
"llama_tokenizer = LlamaTokenizer.from_pretrained(llama_tokenizer_dir)\n",
|
239 |
+
"dna_llama_tokenizer = LlamaTokenizer.from_pretrained(output_hf_dir)\n",
|
240 |
+
"print(tokenizer.all_special_tokens)\n",
|
241 |
+
"print(tokenizer.all_special_ids)\n",
|
242 |
+
"print(tokenizer.special_tokens_map)\n",
|
243 |
+
"text='''TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV,\n",
|
244 |
+
"The primary use of LLaMA is research on large language models, including'''\n",
|
245 |
+
"print(\"Test text:\\n\",text)\n",
|
246 |
+
"print(f\"Tokenized by LLaMA tokenizer:{llama_tokenizer.tokenize(text)}\")\n",
|
247 |
+
"print(f\"Tokenized by GENE-LLaMA tokenizer:{dna_llama_tokenizer.tokenize(text)}\")"
|
248 |
+
]
|
249 |
+
}
|
250 |
+
],
|
251 |
+
"metadata": {
|
252 |
+
"kernelspec": {
|
253 |
+
"display_name": "Python 3 (ipykernel)",
|
254 |
+
"language": "python",
|
255 |
+
"name": "python3"
|
256 |
+
},
|
257 |
+
"language_info": {
|
258 |
+
"codemirror_mode": {
|
259 |
+
"name": "ipython",
|
260 |
+
"version": 3
|
261 |
+
},
|
262 |
+
"file_extension": ".py",
|
263 |
+
"mimetype": "text/x-python",
|
264 |
+
"name": "python",
|
265 |
+
"nbconvert_exporter": "python",
|
266 |
+
"pygments_lexer": "ipython3",
|
267 |
+
"version": "3.12.3"
|
268 |
+
}
|
269 |
+
},
|
270 |
+
"nbformat": 4,
|
271 |
+
"nbformat_minor": 5
|
272 |
+
}
|
04-gene-sft/.ipynb_checkpoints/4-deepspeed-intro-checkpoint.ipynb
ADDED
@@ -0,0 +1,593 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "c2e5c9f4-4378-4d39-bc4f-fb4b4a2b2481",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 4.4 deepspeed分布式训练简介"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "3383c2d7-91a9-4940-b3b2-698fb7d9dbb7",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"## 使用gpt2+deepspeed训练"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "markdown",
|
21 |
+
"id": "c0d29667-1e75-46df-8f65-cae27609ee3f",
|
22 |
+
"metadata": {},
|
23 |
+
"source": [
|
24 |
+
"## deepspeed简介\n",
|
25 |
+
"\n",
|
26 |
+
"DeepSpeed 是微软开发的开源深度学习优化库,专为大规模模型训练和推理设计,能够显著提升训练速度、降低显存占用,并支持分布式计算。以下是 DeepSpeed 的关键特点和功能:\n",
|
27 |
+
"\n",
|
28 |
+
"---\n",
|
29 |
+
"\n",
|
30 |
+
"### **1. 核心特点**\n",
|
31 |
+
"\n",
|
32 |
+
"#### **(1)高效分布式训练**\n",
|
33 |
+
"DeepSpeed 提供先进的分布式技术(如 ZeRO 优化器),支持数百亿甚至上万亿参数的模型训练,同时降低单设备显存需求。\n",
|
34 |
+
"\n",
|
35 |
+
"#### **(2)显存优化**\n",
|
36 |
+
"通过显存分片(ZeRO)、梯度累积和混合精度训练,DeepSpeed 能够在有限显存的情况下训练大模型。\n",
|
37 |
+
"\n",
|
38 |
+
"#### **(3)性能提升**\n",
|
39 |
+
"DeepSpeed 优化了通信和计算过程,可提升多 GPU 分布式训练效率。\n",
|
40 |
+
"\n",
|
41 |
+
"#### **(4)灵活性**\n",
|
42 |
+
"与 PyTorch 无缝集成,并兼容 Hugging Face `transformers` 和其他主流深度学习库。\n",
|
43 |
+
"\n",
|
44 |
+
"#### **(5)推理优化**\n",
|
45 |
+
"支持高效推理(如量化和张量并行),适合大模型的生产部署。\n",
|
46 |
+
"\n",
|
47 |
+
"---\n",
|
48 |
+
"\n",
|
49 |
+
"### **2. 核心技术**\n",
|
50 |
+
"\n",
|
51 |
+
"#### **(1)ZeRO 优化器**\n",
|
52 |
+
"ZeRO(Zero Redundancy Optimizer)是 DeepSpeed 的核心技术之一,分为 3 个阶段:\n",
|
53 |
+
"- **Stage 1**:分片优化器状态(如动量、方差)。\n",
|
54 |
+
"- **Stage 2**:分片优化器状态和梯度。\n",
|
55 |
+
"- **Stage 3**:分片优化器状态、梯度和模型参数,实现全分片优化。\n",
|
56 |
+
"\n",
|
57 |
+
"每个阶段都进一步减少显存需求,Stage 3 可支持超大规模模型(如 GPT-3)。\n",
|
58 |
+
"\n",
|
59 |
+
"#### **(2)混合精度训练**\n",
|
60 |
+
"通过 FP16 或 BF16(半精度浮点数)计算,显著减少显存占用并提升计算效率。\n",
|
61 |
+
"\n",
|
62 |
+
"#### **(3)数据并行与模型并行**\n",
|
63 |
+
"- 数据并行:将数据划分到多个设备,每个设备计算部分梯度。\n",
|
64 |
+
"- 模型并行:将模型的不同部分分配到多个设备。\n",
|
65 |
+
"- 张量并行:将张量运算分解到多个 GPU 上。\n",
|
66 |
+
"\n",
|
67 |
+
"#### **(4)梯度累积**\n",
|
68 |
+
"支持更大的有效批量大小,适合显存受限的设备。\n",
|
69 |
+
"\n",
|
70 |
+
"#### **(5)推理优化**\n",
|
71 |
+
"- 推理阶段的显存优化和加速技术。\n",
|
72 |
+
"- 量化推理,减少模型大小和运行时开销。\n",
|
73 |
+
"\n",
|
74 |
+
"---\n",
|
75 |
+
"\n",
|
76 |
+
"### **3. 适用场景**\n",
|
77 |
+
"\n",
|
78 |
+
"#### **(1)大规模模型训练**\n",
|
79 |
+
"适合训练数十亿或上万亿参数的模型,如 GPT-3、BERT、T5 等。\n",
|
80 |
+
"\n",
|
81 |
+
"#### **(2)分布式训练**\n",
|
82 |
+
"支持单机多卡、多机多卡分布式训练,能高效利用多 GPU 环境。\n",
|
83 |
+
"\n",
|
84 |
+
"#### **(3)显存受限的模型微调**\n",
|
85 |
+
"通过显存优化技术,能在较小显存设备(如 16GB GPU)上微调大模型。\n",
|
86 |
+
"\n",
|
87 |
+
"#### **(4)高效推理**\n",
|
88 |
+
"适用于大语言模型的生产部署,支持推理加速和量化。\n",
|
89 |
+
"\n",
|
90 |
+
"---\n",
|
91 |
+
"\n",
|
92 |
+
"### **4. 优势与局限性**\n",
|
93 |
+
"\n",
|
94 |
+
"#### **优势**\n",
|
95 |
+
"1. 显存需求显著降低,适合超大规模模型训练。\n",
|
96 |
+
"2. 支持多种分布式模式,扩展性强。\n",
|
97 |
+
"3. 与 PyTorch 和 Hugging Face 无缝集成。\n",
|
98 |
+
"4. 推理优化技术降低部署成本。\n",
|
99 |
+
"\n",
|
100 |
+
"#### **局限性**\n",
|
101 |
+
"1. 配置和调优可能较为复杂。\n",
|
102 |
+
"2. 对小规模模型或数据集的性能提升有限。\n",
|
103 |
+
"\n",
|
104 |
+
"---\n",
|
105 |
+
"\n",
|
106 |
+
"### **5. 安装与基本用法**\n",
|
107 |
+
"\n",
|
108 |
+
"#### **安装**\n",
|
109 |
+
"```bash\n",
|
110 |
+
"pip install deepspeed\n",
|
111 |
+
"```\n",
|
112 |
+
"\n",
|
113 |
+
"#### **基本用法**\n",
|
114 |
+
"DeepSpeed 通过配置文件启用特性,例如 ZeRO 优化器:\n",
|
115 |
+
"```python\n",
|
116 |
+
"from transformers import GPT2LMHeadModel, TrainingArguments, Trainer\n",
|
117 |
+
"import deepspeed\n",
|
118 |
+
"\n",
|
119 |
+
"# 配置 DeepSpeed\n",
|
120 |
+
"deepspeed_config = {\n",
|
121 |
+
" \"train_batch_size\": 64,\n",
|
122 |
+
" \"gradient_accumulation_steps\": 8,\n",
|
123 |
+
" \"fp16\": {\n",
|
124 |
+
" \"enabled\": True\n",
|
125 |
+
" },\n",
|
126 |
+
" \"zero_optimization\": {\n",
|
127 |
+
" \"stage\": 2,\n",
|
128 |
+
" \"overlap_comm\": True\n",
|
129 |
+
" }\n",
|
130 |
+
"}\n",
|
131 |
+
"\n",
|
132 |
+
"# 保存配置文件\n",
|
133 |
+
"import json\n",
|
134 |
+
"with open(\"deepspeed_config.json\", \"w\") as f:\n",
|
135 |
+
" json.dump(deepspeed_config, f)\n",
|
136 |
+
"\n",
|
137 |
+
"# 集成到 Hugging Face Trainer\n",
|
138 |
+
"training_args = TrainingArguments(\n",
|
139 |
+
" output_dir=\"./results\",\n",
|
140 |
+
" per_device_train_batch_size=4,\n",
|
141 |
+
" num_train_epochs=3,\n",
|
142 |
+
" learning_rate=5e-5,\n",
|
143 |
+
" fp16=True,\n",
|
144 |
+
" deepspeed=\"./deepspeed_config.json\" # DeepSpeed 配置文件\n",
|
145 |
+
")\n",
|
146 |
+
"\n",
|
147 |
+
"trainer = Trainer(\n",
|
148 |
+
" model=GPT2LMHeadModel.from_pretrained(\"gpt2\"),\n",
|
149 |
+
" args=training_args,\n",
|
150 |
+
" train_dataset=train_dataset,\n",
|
151 |
+
" eval_dataset=eval_dataset\n",
|
152 |
+
")\n",
|
153 |
+
"\n",
|
154 |
+
"trainer.train()\n",
|
155 |
+
"```\n",
|
156 |
+
"\n",
|
157 |
+
"---\n",
|
158 |
+
"\n",
|
159 |
+
"### **6. 总结**\n",
|
160 |
+
"\n",
|
161 |
+
"DeepSpeed 是大模型训练的强力工具,特别是在多 GPU 环境下,其显存优化和分布式训练技术能显著提升训练效率。适用于以下场景:\n",
|
162 |
+
"- 超大规模模型的训练和微调。\n",
|
163 |
+
"- 多机多卡环境的分布式训练。\n",
|
164 |
+
"- 高效推理部署。\n",
|
165 |
+
"\n",
|
166 |
+
"如果需要进一步优化模型训练或部署性能,DeepSpeed 是值得尝试的工具!"
|
167 |
+
]
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"cell_type": "markdown",
|
171 |
+
"id": "75b8219d-8069-4b18-96c8-d5024ee049f1",
|
172 |
+
"metadata": {},
|
173 |
+
"source": [
|
174 |
+
"## 大模型并行方法\n",
|
175 |
+
"\n",
|
176 |
+
"大模型的并行训练旨在克服单个 GPU 显存的限制和加速训练过程,通常适用于参数规模较大的模型(如 GPT-3、T5 等)。并行训练主要包括以下几种方法,每种方法适用于不同的场景和模型特性。\n",
|
177 |
+
"\n",
|
178 |
+
"---\n",
|
179 |
+
"\n",
|
180 |
+
"### **1. 数据并行(Data Parallelism)**\n",
|
181 |
+
"\n",
|
182 |
+
"#### **原理**\n",
|
183 |
+
"- 将数据切分成多个小批次,每个 GPU 处理其中一部分。\n",
|
184 |
+
"- 模型副本被复制到每个 GPU。\n",
|
185 |
+
"- 每个 GPU 独立计算梯度,最终通过梯度同步(如 AllReduce 操作)更新参数。\n",
|
186 |
+
"\n",
|
187 |
+
"#### **特点**\n",
|
188 |
+
"- **优点**:\n",
|
189 |
+
" - 实现简单,是最常用的并行方法。\n",
|
190 |
+
" - 对模型大小没有限制。\n",
|
191 |
+
"- **缺点**:\n",
|
192 |
+
" - 模型副本需要完整加载到每个 GPU,占用显存。\n",
|
193 |
+
" - 在超大规模模型中,显存压力较大。\n",
|
194 |
+
"\n",
|
195 |
+
"#### **适用场景**\n",
|
196 |
+
"- 参数规模适中,显存可以容纳整个模型的场景。\n",
|
197 |
+
"\n",
|
198 |
+
"---\n",
|
199 |
+
"\n",
|
200 |
+
"### **2. 模型并行(Model Parallelism)**\n",
|
201 |
+
"\n",
|
202 |
+
"#### **原理**\n",
|
203 |
+
"- 将模型切分成不同的部分,将不同部分分配到不同的 GPU。\n",
|
204 |
+
"- 前向传播和后向传播时,数据在模型的不同部分之间传递。\n",
|
205 |
+
"\n",
|
206 |
+
"#### **特点**\n",
|
207 |
+
"- **优点**:\n",
|
208 |
+
" - 不需要复制整个模型,可以支持超大规模模型。\n",
|
209 |
+
"- **缺点**:\n",
|
210 |
+
" - GPU 之间通信频繁,可能成为性能瓶颈。\n",
|
211 |
+
" - 实现复杂,切分模型需要精心设计。\n",
|
212 |
+
" \n",
|
213 |
+
"#### **适用场景**\n",
|
214 |
+
"- 单个 GPU 无法容纳完整模型参数的场景。\n",
|
215 |
+
"\n",
|
216 |
+
"#### **具体实现**\n",
|
217 |
+
"- 将 Transformer 的不同层分配到不同的 GPU。\n",
|
218 |
+
"- 常用工具:DeepSpeed 的 Pipeline Parallelism、NVIDIA Megatron-LM。\n",
|
219 |
+
"\n",
|
220 |
+
"---\n",
|
221 |
+
"\n",
|
222 |
+
"### **3. 张量并行(Tensor Parallelism)**\n",
|
223 |
+
"\n",
|
224 |
+
"#### **原理**\n",
|
225 |
+
"- 将模型内部的张量(如权重矩阵)切分为多个子张量,并分配到不同 GPU。\n",
|
226 |
+
"- GPU 之间协作完成矩阵计算。\n",
|
227 |
+
"\n",
|
228 |
+
"#### **特点**\n",
|
229 |
+
"- **优点**:\n",
|
230 |
+
" - 减少了每个 GPU 的显存占用,同时保持模型整体完整性。\n",
|
231 |
+
"- **缺点**:\n",
|
232 |
+
" - 实现较复杂,需要优化通信操作。\n",
|
233 |
+
" - 通信开销较高,适合较大批量的训练。\n",
|
234 |
+
"\n",
|
235 |
+
"#### **适用场景**\n",
|
236 |
+
"- 参数非常大的模型(如 GPT-3)。\n",
|
237 |
+
"- 需要极致优化显存的场景。\n",
|
238 |
+
"\n",
|
239 |
+
"#### **具体实现**\n",
|
240 |
+
"- NVIDIA 的 Megatron-LM 和 Hugging Face Transformers 提供了张量并行的支持。\n",
|
241 |
+
"\n",
|
242 |
+
"---\n",
|
243 |
+
"\n",
|
244 |
+
"### **4. 管道并行(Pipeline Parallelism)**\n",
|
245 |
+
"\n",
|
246 |
+
"#### **原理**\n",
|
247 |
+
"- 将模型分为不同的部分(通常是按层划分),每部分分配到不同的 GPU。\n",
|
248 |
+
"- 数据按照流水线的方式流经每个 GPU。\n",
|
249 |
+
"\n",
|
250 |
+
"#### **特点**\n",
|
251 |
+
"- **优点**:\n",
|
252 |
+
" - 减少每个 GPU 的显存压力。\n",
|
253 |
+
" - 通过流水线增加计算效率。\n",
|
254 |
+
"- **缺点**:\n",
|
255 |
+
" - 引入流水线延迟。\n",
|
256 |
+
" - 实现复杂,需管理数据依赖和同步。\n",
|
257 |
+
"\n",
|
258 |
+
"#### **适用场景**\n",
|
259 |
+
"- 模型非常深,层数较多的场景。\n",
|
260 |
+
"\n",
|
261 |
+
"#### **具体实现**\n",
|
262 |
+
"- DeepSpeed 的 Pipeline Parallelism。\n",
|
263 |
+
"\n",
|
264 |
+
"---\n",
|
265 |
+
"\n",
|
266 |
+
"### **5. 混合并行(Hybrid Parallelism)**\n",
|
267 |
+
"\n",
|
268 |
+
"#### **原理**\n",
|
269 |
+
"- 将数据并行、模型并行、张量并行和管道并行组合使用,充分利用多 GPU 资源。\n",
|
270 |
+
"- 不同的并行方法在不同维度协同工作。\n",
|
271 |
+
"\n",
|
272 |
+
"#### **特点**\n",
|
273 |
+
"- **优点**:\n",
|
274 |
+
" - 灵活且适应性强,适合超大规模模型。\n",
|
275 |
+
"- **缺点**:\n",
|
276 |
+
" - 配置复杂,依赖于框架和训练任务。\n",
|
277 |
+
"\n",
|
278 |
+
"#### **适用场景**\n",
|
279 |
+
"- 超大规模模型(如 GPT-3 或参数量 >1T)。\n",
|
280 |
+
"- 多机多卡的大型训练环境。\n",
|
281 |
+
"\n",
|
282 |
+
"#### **具体实现**\n",
|
283 |
+
"- NVIDIA Megatron-LM 和 DeepSpeed 的混合并行支持。\n",
|
284 |
+
"\n",
|
285 |
+
"---\n",
|
286 |
+
"\n",
|
287 |
+
"### **6. ZeRO 优化并行(Zero Redundancy Optimizer)**\n",
|
288 |
+
"\n",
|
289 |
+
"#### **原理**\n",
|
290 |
+
"- 通过分片存储模型参数、优化器状态和梯度,显著减少每个 GPU 的显存占用。\n",
|
291 |
+
"\n",
|
292 |
+
"#### **特点**\n",
|
293 |
+
"- **优点**:\n",
|
294 |
+
" - 极大降低显存需求。\n",
|
295 |
+
" - 支持超大规模模型。\n",
|
296 |
+
"- **缺点**:\n",
|
297 |
+
" - 对 GPU 间通信要求较高。\n",
|
298 |
+
" - 比数据并行复杂。\n",
|
299 |
+
"\n",
|
300 |
+
"#### **适用场景**\n",
|
301 |
+
"- 超大模型的高效训练。\n",
|
302 |
+
"\n",
|
303 |
+
"#### **具体实现**\n",
|
304 |
+
"- DeepSpeed 提供的 ZeRO Stage 1/2/3。\n",
|
305 |
+
"\n",
|
306 |
+
"---\n",
|
307 |
+
"\n",
|
308 |
+
"### **方法对比**\n",
|
309 |
+
"\n",
|
310 |
+
"| 并行方法 | 主要优点 | 主要缺点 | 适用场景 |\n",
|
311 |
+
"|---------------|-------------------------------|-------------------------------|---------------------------|\n",
|
312 |
+
"| 数据并行 | 简单高效,易实现 | 模型副本占用大量显存 | 模型规模适中,显存足够 |\n",
|
313 |
+
"| 模型并行 | 支持大模型 | 通信开销大,切分复杂 | 超大模型,显存有限 |\n",
|
314 |
+
"| 张量并行 | 高效利用显存 | 实现复杂,通信频繁 | 参数规模极大的模型 |\n",
|
315 |
+
"| 管道并行 | 显存需求降低,适合深模型 | 流水线延迟,数据同步复杂 | 层数多的大型模型 |\n",
|
316 |
+
"| 混合并行 | 灵活适配超大规模模型 | 配置复杂,依赖框架 | 超大规模模型(如 GPT-3) |\n",
|
317 |
+
"| ZeRO 并行 | 极大节省显存,占用少 | 通信成本高 | 超大规模模型显存优化 |\n",
|
318 |
+
"\n",
|
319 |
+
"---\n",
|
320 |
+
"\n",
|
321 |
+
"### **总结**\n",
|
322 |
+
"- **中等规模模型**:优先使用 **数据并行**。\n",
|
323 |
+
"- **单卡显存不足**:采用 **模型并行** 或 **张量并行**。\n",
|
324 |
+
"- **超大规模模型**:使用 **混合并行** 或 DeepSpeed 的 **ZeRO 优化**。\n",
|
325 |
+
"\n",
|
326 |
+
"对于现代超大规模模型,通常采用混合并行方法,比如 NVIDIA 的 Megatron-LM 和微软的 DeepSpeed,它们综合了多种并行策略,能够有效利用计算资源并加速训练。如果您有具体的硬件环境或模型需求,可以进一步探讨适合的并行方案!"
|
327 |
+
]
|
328 |
+
},
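{
"cell_type": "markdown",
"id": "ddp-data-parallel-note",
"metadata": {},
"source": [
"作为上表中“数据并行”一行的具体示意,下面是一段最小的 PyTorch DDP 脚本骨架(模型与数据均为占位假设,用 torchrun --nproc_per_node=N 启动),用来说明“每个进程持有完整模型副本、梯度通过 AllReduce 同步”这一点:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddp-data-parallel-code",
"metadata": {},
"outputs": [],
"source": [
"# 示意:最基础的数据并行(PyTorch DDP)\n",
"# 每个进程持有完整模型副本,backward 时梯度通过 AllReduce 自动同步\n",
"import torch\n",
"import torch.distributed as dist\n",
"from torch.nn.parallel import DistributedDataParallel as DDP\n",
"\n",
"def main():\n",
"    dist.init_process_group(backend=\"nccl\")\n",
"    rank = dist.get_rank()\n",
"    torch.cuda.set_device(rank)\n",
"\n",
"    model = DDP(torch.nn.Linear(128, 2).cuda(rank), device_ids=[rank])  # 占位模型\n",
"    opt = torch.optim.AdamW(model.parameters(), lr=1e-3)\n",
"\n",
"    x = torch.randn(32, 128).cuda(rank)            # 每个进程各自的一份占位数据\n",
"    y = torch.randint(0, 2, (32,)).cuda(rank)\n",
"    loss = torch.nn.functional.cross_entropy(model(x), y)\n",
"    loss.backward()                                 # 此处各进程梯度被 AllReduce 平均\n",
"    opt.step()\n",
"    dist.destroy_process_group()\n",
"\n",
"if __name__ == \"__main__\":\n",
"    main()"
]
},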
|
329 |
+
{
|
330 |
+
"cell_type": "markdown",
|
331 |
+
"id": "cd848439-bac8-46b2-9a0f-59ae7c343954",
|
332 |
+
"metadata": {},
|
333 |
+
"source": [
|
334 |
+
"## deepspeed的并行设计\n",
|
335 |
+
"\n",
|
336 |
+
"\n",
|
337 |
+
"是的,DeepSpeed 支持多种并行策略,包括 **数据并行**、**模型并行** 和 **张量并行**,并且可以通过其配置文件灵活地设置这些并行模式。\n",
|
338 |
+
"\n",
|
339 |
+
"---\n",
|
340 |
+
"\n",
|
341 |
+
"### **1. 数据并行**\n",
|
342 |
+
"\n",
|
343 |
+
"#### **原理**\n",
|
344 |
+
"在数据并行中,DeepSpeed 将数据批次划分到多个 GPU,每个 GPU 上都有模型的完整副本,计算独立的梯度。最终通过 `AllReduce` 操作同步梯度并更新模型参数。\n",
|
345 |
+
"\n",
|
346 |
+
"#### **如何设置**\n",
|
347 |
+
"DeepSpeed 默认支持数据并行,启用 `zero_optimization` 后会自动结合 ZeRO 优化器进行分片数据并行:\n",
|
348 |
+
"```json\n",
|
349 |
+
"{\n",
|
350 |
+
" \"train_batch_size\": 64,\n",
|
351 |
+
" \"gradient_accumulation_steps\": 8,\n",
|
352 |
+
" \"fp16\": {\n",
|
353 |
+
" \"enabled\": true\n",
|
354 |
+
" },\n",
|
355 |
+
" \"zero_optimization\": {\n",
|
356 |
+
" \"stage\": 1\n",
|
357 |
+
" }\n",
|
358 |
+
"}\n",
|
359 |
+
"```\n",
|
360 |
+
"\n",
|
361 |
+
"---\n",
|
362 |
+
"\n",
|
363 |
+
"### **2. 模型并行**\n",
|
364 |
+
"\n",
|
365 |
+
"#### **原理**\n",
|
366 |
+
"模型并行将模型的不同部分(如 Transformer 层或权重张量)分布到多个 GPU。DeepSpeed 本身不直接实现模型并行,但可以与模型并行框架(如 NVIDIA Megatron-LM)集成。\n",
|
367 |
+
"\n",
|
368 |
+
"#### **如何设置**\n",
|
369 |
+
"如果使用模型并行(如层级分割):\n",
|
370 |
+
"1. 使用 DeepSpeed 的 Pipeline Parallelism:\n",
|
371 |
+
" ```json\n",
|
372 |
+
" {\n",
|
373 |
+
" \"train_batch_size\": 64,\n",
|
374 |
+
" \"pipeline_parallel_size\": 2, # 设置流水线并行 GPU 数量\n",
|
375 |
+
" \"fp16\": {\n",
|
376 |
+
" \"enabled\": true\n",
|
377 |
+
" },\n",
|
378 |
+
" \"zero_optimization\": {\n",
|
379 |
+
" \"stage\": 1\n",
|
380 |
+
" }\n",
|
381 |
+
" }\n",
|
382 |
+
" ```\n",
|
383 |
+
"\n",
|
384 |
+
"2. 与 NVIDIA Megatron-LM 集成:\n",
|
385 |
+
" 在代码中使用 Megatron-LM 的模型并行支持,然后结合 DeepSpeed:\n",
|
386 |
+
" ```python\n",
|
387 |
+
" from megatron import get_model_parallel_world_size\n",
|
388 |
+
" import deepspeed\n",
|
389 |
+
"\n",
|
390 |
+
" model = MyModel(...)\n",
|
391 |
+
" model = deepspeed.initialize(\n",
|
392 |
+
" model=model,\n",
|
393 |
+
" model_parallel_size=get_model_parallel_world_size(),\n",
|
394 |
+
" config=\"./deepspeed_config.json\"\n",
|
395 |
+
" )\n",
|
396 |
+
" ```\n",
|
397 |
+
"\n",
|
398 |
+
"---\n",
|
399 |
+
"\n",
|
400 |
+
"### **3. 张量并行**\n",
|
401 |
+
"\n",
|
402 |
+
"#### **原理**\n",
|
403 |
+
"张量并行将模型参数张量(如权重矩阵)分片到多个 GPU,并通过通信协作完成计算。DeepSpeed 提供了张量并行的支持(在 ZeRO Stage 3 中),或者通过集成 Megatron-LM 实现。\n",
|
404 |
+
"\n",
|
405 |
+
"#### **如何设置**\n",
|
406 |
+
"1. **使用 ZeRO Stage 3**:\n",
|
407 |
+
" ZeRO Stage 3 会分片模型参数和优化器状态,类似于张量并行的效果:\n",
|
408 |
+
" ```json\n",
|
409 |
+
" {\n",
|
410 |
+
" \"train_batch_size\": 64,\n",
|
411 |
+
" \"gradient_accumulation_steps\": 8,\n",
|
412 |
+
" \"fp16\": {\n",
|
413 |
+
" \"enabled\": true\n",
|
414 |
+
" },\n",
|
415 |
+
" \"zero_optimization\": {\n",
|
416 |
+
" \"stage\": 3,\n",
|
417 |
+
" \"offload_optimizer\": {\n",
|
418 |
+
" \"device\": \"cpu\",\n",
|
419 |
+
" \"pin_memory\": true\n",
|
420 |
+
" },\n",
|
421 |
+
" \"offload_param\": {\n",
|
422 |
+
" \"device\": \"cpu\",\n",
|
423 |
+
" \"pin_memory\": true\n",
|
424 |
+
" }\n",
|
425 |
+
" }\n",
|
426 |
+
" }\n",
|
427 |
+
" ```\n",
|
428 |
+
"\n",
|
429 |
+
"2. **集成 Megatron-LM**:\n",
|
430 |
+
" 如果需要更复杂的张量并行方案(如矩阵切分),可以通过 Megatron-LM 实现,然后与 DeepSpeed 集成。\n",
|
431 |
+
"\n",
|
432 |
+
"---\n",
|
433 |
+
"\n",
|
434 |
+
"### **4. 混合并行**\n",
|
435 |
+
"\n",
|
436 |
+
"#### **原理**\n",
|
437 |
+
"混合并行结合了数据并行、模型并行和张量并行。DeepSpeed 提供了对这些模式的集成支持,允许您灵活配置。\n",
|
438 |
+
"\n",
|
439 |
+
"#### **如何设置**\n",
|
440 |
+
"结合数据并行和流水线并行:\n",
|
441 |
+
"```json\n",
|
442 |
+
"{\n",
|
443 |
+
" \"train_batch_size\": 64,\n",
|
444 |
+
" \"gradient_accumulation_steps\": 8,\n",
|
445 |
+
" \"fp16\": {\n",
|
446 |
+
" \"enabled\": true\n",
|
447 |
+
" },\n",
|
448 |
+
" \"pipeline_parallel_size\": 2, # 流水线并行\n",
|
449 |
+
" \"zero_optimization\": {\n",
|
450 |
+
" \"stage\": 2\n",
|
451 |
+
" }\n",
|
452 |
+
"}\n",
|
453 |
+
"```\n",
|
454 |
+
"\n",
|
455 |
+
"与张量并行结合:\n",
|
456 |
+
"1. 在代码中配置张量并行:\n",
|
457 |
+
" ```python\n",
|
458 |
+
" from megatron import get_tensor_parallel_world_size\n",
|
459 |
+
" model = MyModel(...)\n",
|
460 |
+
" model = deepspeed.initialize(\n",
|
461 |
+
" model=model,\n",
|
462 |
+
" tensor_parallel_size=get_tensor_parallel_world_size(),\n",
|
463 |
+
" config=\"./deepspeed_config.json\"\n",
|
464 |
+
" )\n",
|
465 |
+
" ```\n",
|
466 |
+
"\n",
|
467 |
+
"2. DeepSpeed 配置文件中启用 ZeRO Stage 3。\n",
|
468 |
+
"\n",
|
469 |
+
"---\n",
|
470 |
+
"\n",
|
471 |
+
"### **5. 选择并行策略**\n",
|
472 |
+
"\n",
|
473 |
+
"| 并行模式 | **支持方式** | **适用场景** |\n",
|
474 |
+
"|---------------|------------------------------------------|-----------------------------------------|\n",
|
475 |
+
"| 数据并行 | 默认支持,结合 ZeRO 优化器 | 模型参数较小,显存压力不大的场景 |\n",
|
476 |
+
"| 模型并行 | 使用 Pipeline Parallelism 或集成 Megatron-LM | 模型参数非常大,单 GPU 无法容纳的场景 |\n",
|
477 |
+
"| 张量并行 | ZeRO Stage 3 或集成 Megatron-LM | 参数矩阵非常大,需要分片计算的场景 |\n",
|
478 |
+
"| 混合并行 | 结合数据并行、模型并行和张量并行 | 超大规模模型(如 GPT-3)训练 |\n",
|
479 |
+
"\n",
|
480 |
+
"---\n",
|
481 |
+
"\n",
|
482 |
+
"### **6. 示例代码**\n",
|
483 |
+
"\n",
|
484 |
+
"以下是集成 ZeRO 和 Pipeline Parallelism 的完整示例:\n",
|
485 |
+
"```python\n",
|
486 |
+
"import deepspeed\n",
|
487 |
+
"from transformers import GPT2LMHeadModel, TrainingArguments, Trainer\n",
|
488 |
+
"from datasets import load_dataset\n",
|
489 |
+
"\n",
|
490 |
+
"# 加载数据\n",
|
491 |
+
"dataset = load_dataset(\"wikitext\", \"wikitext-2-raw-v1\", split=\"train\")\n",
|
492 |
+
"\n",
|
493 |
+
"# 加载模型\n",
|
494 |
+
"model = GPT2LMHeadModel.from_pretrained(\"gpt2\")\n",
|
495 |
+
"\n",
|
496 |
+
"# 配置 DeepSpeed\n",
|
497 |
+
"deepspeed_config = {\n",
|
498 |
+
" \"train_batch_size\": 64,\n",
|
499 |
+
" \"gradient_accumulation_steps\": 8,\n",
|
500 |
+
" \"pipeline_parallel_size\": 2, # 流水线并行\n",
|
501 |
+
" \"fp16\": {\n",
|
502 |
+
" \"enabled\": True\n",
|
503 |
+
" },\n",
|
504 |
+
" \"zero_optimization\": {\n",
|
505 |
+
" \"stage\": 2\n",
|
506 |
+
" }\n",
|
507 |
+
"}\n",
|
508 |
+
"\n",
|
509 |
+
"# 保存配置文件\n",
|
510 |
+
"import json\n",
|
511 |
+
"with open(\"deepspeed_config.json\", \"w\") as f:\n",
|
512 |
+
" json.dump(deepspeed_config, f)\n",
|
513 |
+
"\n",
|
514 |
+
"# 训练参数\n",
|
515 |
+
"training_args = TrainingArguments(\n",
|
516 |
+
" output_dir=\"./results\",\n",
|
517 |
+
" per_device_train_batch_size=4,\n",
|
518 |
+
" num_train_epochs=3,\n",
|
519 |
+
" deepspeed=\"./deepspeed_config.json\", # 指定 DeepSpeed 配置文件\n",
|
520 |
+
")\n",
|
521 |
+
"\n",
|
522 |
+
"# 初始化 Trainer\n",
|
523 |
+
"trainer = Trainer(\n",
|
524 |
+
" model=model,\n",
|
525 |
+
" args=training_args,\n",
|
526 |
+
" train_dataset=dataset\n",
|
527 |
+
")\n",
|
528 |
+
"\n",
|
529 |
+
"# 开始训练\n",
|
530 |
+
"trainer.train()\n",
|
531 |
+
"```\n",
|
532 |
+
"\n",
|
533 |
+
"---\n",
|
534 |
+
"\n",
|
535 |
+
"### **总结**\n",
|
536 |
+
"\n",
|
537 |
+
"- **数据并行**:默认支持,结合 ZeRO 进行优化。\n",
|
538 |
+
"- **模型并行**:使用 Pipeline Parallelism 或与 Megatron-LM 集成。\n",
|
539 |
+
"- **张量并行**:通过 ZeRO Stage 3 或 Megatron-LM 实现。\n",
|
540 |
+
"- **混合并行**:灵活结合多种并行方法,用于超大规模模型。\n",
|
541 |
+
"\n",
|
542 |
+
"DeepSpeed 的配置高度灵活,可以根据模型大小、显存限制和硬件条件选择适合的并行策略。"
|
543 |
+
]
|
544 |
+
},
|
545 |
+
{
|
546 |
+
"cell_type": "markdown",
|
547 |
+
"id": "ab2812bc-f743-4f18-b49c-972781484dc6",
|
548 |
+
"metadata": {},
|
549 |
+
"source": [
|
550 |
+
"## gpt2的训练\n",
|
551 |
+
"\n",
|
552 |
+
"```\n",
|
553 |
+
"#一般方式训练gpt2\n",
|
554 |
+
"python pretain_gpt2.py\n",
|
555 |
+
"\n",
|
556 |
+
"\n",
|
557 |
+
"#deepspeed训练gpt2, 只多一行代码\n",
|
558 |
+
"torchrun --nproc_per_node=6 deepspeed_pretrain_gpt2.py\n",
|
559 |
+
"\n",
|
560 |
+
"```"
|
561 |
+
]
|
562 |
+
},
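{
"cell_type": "markdown",
"id": "ds-one-line-note",
"metadata": {},
"source": [
"“只多一行”的含义如下面的示意骨架所示:在普通的 Trainer 预训练参数里指定一个 DeepSpeed 配置文件即可。注意这只是基于假设的示意,并非仓库中 deepspeed_pretrain_gpt2.py 的实际内容;配置文件名参考本目录下的 ds_zero2_no_offload.json。"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ds-one-line-code",
"metadata": {},
"outputs": [],
"source": [
"# 示意:普通 Trainer 训练脚本中启用 DeepSpeed 只需一行参数\n",
"from transformers import TrainingArguments\n",
"\n",
"training_args = TrainingArguments(\n",
"    output_dir=\"./results_ds\",\n",
"    per_device_train_batch_size=8,\n",
"    num_train_epochs=3,\n",
"    fp16=True,\n",
"    deepspeed=\"ds_zero2_no_offload.json\",  # 关键的一行:指向 DeepSpeed 配置文件\n",
")\n",
"# 其余 Trainer(model=..., args=training_args, ...) 的写法与普通训练完全一致"
]
},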
|
563 |
+
{
|
564 |
+
"cell_type": "code",
|
565 |
+
"execution_count": null,
|
566 |
+
"id": "9cb60dc2-4cec-492d-836b-67694829acf2",
|
567 |
+
"metadata": {},
|
568 |
+
"outputs": [],
|
569 |
+
"source": []
|
570 |
+
}
|
571 |
+
],
|
572 |
+
"metadata": {
|
573 |
+
"kernelspec": {
|
574 |
+
"display_name": "Python 3 (ipykernel)",
|
575 |
+
"language": "python",
|
576 |
+
"name": "python3"
|
577 |
+
},
|
578 |
+
"language_info": {
|
579 |
+
"codemirror_mode": {
|
580 |
+
"name": "ipython",
|
581 |
+
"version": 3
|
582 |
+
},
|
583 |
+
"file_extension": ".py",
|
584 |
+
"mimetype": "text/x-python",
|
585 |
+
"name": "python",
|
586 |
+
"nbconvert_exporter": "python",
|
587 |
+
"pygments_lexer": "ipython3",
|
588 |
+
"version": "3.12.3"
|
589 |
+
}
|
590 |
+
},
|
591 |
+
"nbformat": 4,
|
592 |
+
"nbformat_minor": 5
|
593 |
+
}
|
04-gene-sft/.ipynb_checkpoints/5-llama-continue-train-checkpoint.ipynb
ADDED
@@ -0,0 +1,41 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "1e6d4978-4f0f-4268-aa23-d864857bd6c8",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 4.5 基于llama的基因大模型持续预训练"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
+
"id": "1ad15cdf-386a-48bf-b44d-5014b1df8f8e",
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": []
|
18 |
+
}
|
19 |
+
],
|
20 |
+
"metadata": {
|
21 |
+
"kernelspec": {
|
22 |
+
"display_name": "Python 3 (ipykernel)",
|
23 |
+
"language": "python",
|
24 |
+
"name": "python3"
|
25 |
+
},
|
26 |
+
"language_info": {
|
27 |
+
"codemirror_mode": {
|
28 |
+
"name": "ipython",
|
29 |
+
"version": 3
|
30 |
+
},
|
31 |
+
"file_extension": ".py",
|
32 |
+
"mimetype": "text/x-python",
|
33 |
+
"name": "python",
|
34 |
+
"nbconvert_exporter": "python",
|
35 |
+
"pygments_lexer": "ipython3",
|
36 |
+
"version": "3.12.3"
|
37 |
+
}
|
38 |
+
},
|
39 |
+
"nbformat": 4,
|
40 |
+
"nbformat_minor": 5
|
41 |
+
}
|
04-gene-sft/.ipynb_checkpoints/6-llama-instruction-ft-checkpoint.ipynb
ADDED
@@ -0,0 +1,206 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "963e9ae0-ac68-44be-8c7d-fb9842784362",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# 4.6 基于llama的基因大模型指令微调"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "182b82c4-d484-4c15-a600-03c3b51367ec",
|
14 |
+
"metadata": {},
|
15 |
+
"source": [
|
16 |
+
"**PEFT**(Parameter-Efficient Fine-Tuning,参数高效微调)是一种优化技术,旨在以最小的参数更新实现对大规模预训练模型(如 GPT、BERT 等)的微调。PEFT 技术通过减少微调所需的参数量,显著降低了存储和计算开销,同时保留模型的性能,特别适合资源受限的场景和领域特定任务的定制化。\n",
|
17 |
+
"\n",
|
18 |
+
"---\n",
|
19 |
+
"\n",
|
20 |
+
"### **1. 核心思想**\n",
|
21 |
+
"传统的微调方式需要更新整个预训练模型的所有参数,PEFT 技术通过只调整少量的参数(如特定层或额外添加的小型模块)实现微调目标,大幅减少了训练开销和存储需求。\n",
|
22 |
+
"\n",
|
23 |
+
"---\n",
|
24 |
+
"\n",
|
25 |
+
"### **2. 常见的 PEFT 方法**\n",
|
26 |
+
"\n",
|
27 |
+
"#### **(1)Adapter 模型**\n",
|
28 |
+
"- 在每一层 Transformer 的输出中插入小型适配器模块,仅训练适配器模块的参数。\n",
|
29 |
+
"- 原始模型参数保持冻结不变。\n",
|
30 |
+
"- 优点:适配器模块参数量小,能适应不同任务。\n",
|
31 |
+
"\n",
|
32 |
+
"示例方法:\n",
|
33 |
+
"- **AdapterFusion**\n",
|
34 |
+
"- **MAD-X**\n",
|
35 |
+
"\n",
|
36 |
+
"---\n",
|
37 |
+
"\n",
|
38 |
+
"#### **(2)Prefix Tuning**\n",
|
39 |
+
"- 在 Transformer 的输入前添加一组可学习的前缀向量,这些前缀与模型的注意力机制交互。\n",
|
40 |
+
"- 只调整前缀向量的参数,而不更新原始模型。\n",
|
41 |
+
"- 优点:对生成任务效果显著,参数量进一步减少。\n",
|
42 |
+
"\n",
|
43 |
+
"---\n",
|
44 |
+
"\n",
|
45 |
+
"#### **(3)LoRA(Low-Rank Adaptation)**\n",
|
46 |
+
"- 将预训练模型中的部分权重分解为两个低秩矩阵,仅调整这些低秩矩阵的参数。\n",
|
47 |
+
"- 原始权重保持冻结状态。\n",
|
48 |
+
"- 优点:参数量极小,计算高效。\n",
|
49 |
+
" \n",
|
50 |
+
"---\n",
|
51 |
+
"\n",
|
52 |
+
"#### **(4)Prompt Tuning**\n",
|
53 |
+
"- 在输入文本中添加可学习的提示(Prompt)。\n",
|
54 |
+
"- 适合 NLP 任务中的文本生成、分类等。\n",
|
55 |
+
"- 优点:实现简单,易于集成到现有框架。\n",
|
56 |
+
"\n",
|
57 |
+
"---\n",
|
58 |
+
"\n",
|
59 |
+
"### **3. PEFT 的优势**\n",
|
60 |
+
"\n",
|
61 |
+
"1. **显著减少参数更新量**:\n",
|
62 |
+
" - 微调传统的大模型(如 GPT-3)需要更新数百亿参数,而 PEFT 仅需更新百万级别甚至更少的参数。\n",
|
63 |
+
"\n",
|
64 |
+
"2. **高效存储**:\n",
|
65 |
+
" - 每个任务的微调结果只需存储少量额外参数,而不是整个模型。\n",
|
66 |
+
"\n",
|
67 |
+
"3. **适用多任务**:\n",
|
68 |
+
" - 同一预训练模型可以通过不同的 PEFT 模块适配多个任务,无需重新训练。\n",
|
69 |
+
"\n",
|
70 |
+
"4. **降低计算开销**:\n",
|
71 |
+
" - 训练所需的内存和计算显著减少,适合资源有限的环境。\n",
|
72 |
+
"\n",
|
73 |
+
"---\n",
|
74 |
+
"\n",
|
75 |
+
"### **4. 应用场景**\n",
|
76 |
+
"\n",
|
77 |
+
"1. **领域特定任务**:\n",
|
78 |
+
" - 医疗、法律、金融等领域微调预训练模型。\n",
|
79 |
+
"\n",
|
80 |
+
"2. **多任务学习**:\n",
|
81 |
+
" - 适配多个任务,复用同一模型的预训练权重。\n",
|
82 |
+
"\n",
|
83 |
+
"3. **资源受限场景**:\n",
|
84 |
+
" - 移动设备、边缘设备上的模型部署。\n",
|
85 |
+
"\n",
|
86 |
+
"---\n",
|
87 |
+
"\n",
|
88 |
+
"### **5. Hugging Face PEFT 库**\n",
|
89 |
+
"\n",
|
90 |
+
"Hugging Face 提供了专门的 PEFT 库,支持多种参数高效微调技术:\n",
|
91 |
+
"- **安装**:\n",
|
92 |
+
" ```bash\n",
|
93 |
+
" pip install peft\n",
|
94 |
+
" ```\n",
|
95 |
+
"- **使用 LoRA 微调示例**:\n",
|
96 |
+
" ```python\n",
|
97 |
+
" from transformers import AutoModelForCausalLM, AutoTokenizer\n",
|
98 |
+
" from peft import LoraConfig, get_peft_model, TaskType\n",
|
99 |
+
"\n",
|
100 |
+
" # 加载模型和分词器\n",
|
101 |
+
" model_name = \"gpt2\"\n",
|
102 |
+
" model = AutoModelForCausalLM.from_pretrained(model_name)\n",
|
103 |
+
" tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
|
104 |
+
"\n",
|
105 |
+
" # 配置 LoRA\n",
|
106 |
+
" lora_config = LoraConfig(\n",
|
107 |
+
" task_type=TaskType.CAUSAL_LM,\n",
|
108 |
+
" r=8,\n",
|
109 |
+
" lora_alpha=32,\n",
|
110 |
+
" target_modules=[\"q_proj\", \"v_proj\"],\n",
|
111 |
+
" lora_dropout=0.1,\n",
|
112 |
+
" bias=\"none\"\n",
|
113 |
+
" )\n",
|
114 |
+
"\n",
|
115 |
+
" # 使用 LoRA 微调模型\n",
|
116 |
+
" model = get_peft_model(model, lora_config)\n",
|
117 |
+
" model.print_trainable_parameters()\n",
|
118 |
+
"\n",
|
119 |
+
" # 微调代码...\n",
|
120 |
+
" ```\n",
|
121 |
+
"\n",
|
122 |
+
"---\n",
|
123 |
+
"\n",
|
124 |
+
"### **6. PEFT 的局限性**\n",
|
125 |
+
"1. **特定任务限制**:\n",
|
126 |
+
" - 在一些复杂任务中,PEFT 方法可能不如全量微调效果好。\n",
|
127 |
+
"\n",
|
128 |
+
"2. **需要设计合适的模块**:\n",
|
129 |
+
" - 不同任务需要选择和设计合��的 PEFT 技术。\n",
|
130 |
+
"\n",
|
131 |
+
"3. **与模型架构相关**:\n",
|
132 |
+
" - PEFT 技术可能需要对模型架构进行一定程度的修改。\n",
|
133 |
+
"\n",
|
134 |
+
"---\n",
|
135 |
+
"\n",
|
136 |
+
"### **7. 总结**\n",
|
137 |
+
"PEFT 是一个极具潜力的技术,特别适合在有限资源下对大模型进行微调。它在许多领域和任务中已显示出良好的效果,例如 LoRA 和 Adapter 模型已经成为高效微调的主流方法。\n",
|
138 |
+
"\n",
|
139 |
+
"如果您需要实现高效微调,可以结合 Hugging Face 的 PEFT 库快速上手。"
|
140 |
+
]
|
141 |
+
},
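{
 "cell_type": "markdown",
 "id": "lora-trainer-sketch",
 "metadata": {},
 "source": [
  "A minimal sketch of the fine-tuning step left open in the example above, reusing the `Trainer` workflow from the earlier chapters (the dataset and hyperparameters are placeholders, not values from this repository):\n",
  "\n",
  "```python\n",
  "from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling\n",
  "\n",
  "# `model` is the get_peft_model(...) output and `tokenizer` the GPT-2 tokenizer from above;\n",
  "# `tokenized_dataset` is assumed to be an already-tokenized causal-LM dataset.\n",
  "tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default\n",
  "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)\n",
  "\n",
  "training_args = TrainingArguments(\n",
  "    output_dir=\"./gpt2-lora\",\n",
  "    per_device_train_batch_size=8,\n",
  "    num_train_epochs=3,\n",
  "    learning_rate=2e-4,\n",
  "    fp16=True,\n",
  ")\n",
  "\n",
  "trainer = Trainer(\n",
  "    model=model,                     # only the LoRA parameters are trainable\n",
  "    args=training_args,\n",
  "    train_dataset=tokenized_dataset,\n",
  "    data_collator=data_collator,\n",
  ")\n",
  "trainer.train()\n",
  "model.save_pretrained(\"./gpt2-lora-adapter\")  # saves just the adapter weights\n",
  "```"
 ]
},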
|
142 |
+
{
|
143 |
+
"cell_type": "code",
|
144 |
+
"execution_count": 1,
|
145 |
+
"id": "5aa3d240-44e1-4811-8f61-d6ff2500a798",
|
146 |
+
"metadata": {},
|
147 |
+
"outputs": [],
|
148 |
+
"source": [
|
149 |
+
"import subprocess\n",
|
150 |
+
"import os\n",
|
151 |
+
"# 设置环境变量, autodl一般区域\n",
|
152 |
+
"result = subprocess.run('bash -c \"source /etc/network_turbo && env | grep proxy\"', shell=True, capture_output=True, text=True)\n",
|
153 |
+
"output = result.stdout\n",
|
154 |
+
"for line in output.splitlines():\n",
|
155 |
+
" if '=' in line:\n",
|
156 |
+
" var, value = line.split('=', 1)\n",
|
157 |
+
" os.environ[var] = value"
|
158 |
+
]
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"cell_type": "markdown",
|
162 |
+
"id": "17bdb69d-3f0f-465e-bd60-2047a088e264",
|
163 |
+
"metadata": {},
|
164 |
+
"source": [
|
165 |
+
"如果您不确定模型中有哪些模块可以微调,可以打印模型结构:"
|
166 |
+
]
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"cell_type": "code",
|
170 |
+
"execution_count": null,
|
171 |
+
"id": "41a0c049-9134-4d89-aad0-1aa2241a9fca",
|
172 |
+
"metadata": {},
|
173 |
+
"outputs": [],
|
174 |
+
"source": [
|
175 |
+
"from transformers import AutoModelForCausalLM\n",
|
176 |
+
"\n",
|
177 |
+
"model = AutoModelForCausalLM.from_pretrained(\"gpt2\")\n",
|
178 |
+
"\n",
|
179 |
+
"# 打印所有模块名称\n",
|
180 |
+
"for name, module in model.named_modules():\n",
|
181 |
+
" print(name)"
|
182 |
+
]
|
183 |
+
}
|
184 |
+
],
|
185 |
+
"metadata": {
|
186 |
+
"kernelspec": {
|
187 |
+
"display_name": "Python 3 (ipykernel)",
|
188 |
+
"language": "python",
|
189 |
+
"name": "python3"
|
190 |
+
},
|
191 |
+
"language_info": {
|
192 |
+
"codemirror_mode": {
|
193 |
+
"name": "ipython",
|
194 |
+
"version": 3
|
195 |
+
},
|
196 |
+
"file_extension": ".py",
|
197 |
+
"mimetype": "text/x-python",
|
198 |
+
"name": "python",
|
199 |
+
"nbconvert_exporter": "python",
|
200 |
+
"pygments_lexer": "ipython3",
|
201 |
+
"version": "3.12.3"
|
202 |
+
}
|
203 |
+
},
|
204 |
+
"nbformat": 4,
|
205 |
+
"nbformat_minor": 5
|
206 |
+
}
|
04-gene-sft/.ipynb_checkpoints/build_gene_bpe_seg-checkpoint.py
ADDED
@@ -0,0 +1,14 @@
1 |
+
import sentencepiece as spm
|
2 |
+
|
3 |
+
spm.SentencePieceTrainer.train(input='../01-data_env/data/dna_1g.txt,../01-data_env/data/protein_1g.txt',
|
4 |
+
model_prefix='gene_bpe_seg',
|
5 |
+
vocab_size=60000,
|
6 |
+
                               model_type='bpe',  # the default model_type is 'unigram'
|
7 |
+
num_threads=10,
|
8 |
+
)
|
9 |
+
|
10 |
+
from sentencepiece import SentencePieceProcessor
|
11 |
+
model_path = "gene_bpe_seg.model"
|
12 |
+
sp_model = SentencePieceProcessor(model_file=model_path)
|
13 |
+
mm = sp_model.EncodeAsPieces("TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV")
|
14 |
+
print(mm)
|
04-gene-sft/.ipynb_checkpoints/deepspeed_pretrain_gpt2-checkpoint.py
ADDED
@@ -0,0 +1,114 @@
1 |
+
# import os
|
2 |
+
|
3 |
+
# # Set environment variables
|
4 |
+
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
|
5 |
+
|
6 |
+
# # Print the environment variable to confirm it is set
|
7 |
+
# print(os.environ.get('HF_ENDPOINT'))
|
8 |
+
import subprocess
|
9 |
+
import os
|
10 |
+
# Set proxy environment variables (AutoDL standard region)
|
11 |
+
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
|
12 |
+
output = result.stdout
|
13 |
+
for line in output.splitlines():
|
14 |
+
if '=' in line:
|
15 |
+
var, value = line.split('=', 1)
|
16 |
+
os.environ[var] = value
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
import math
|
21 |
+
from transformers import (
|
22 |
+
GPT2Config,
|
23 |
+
GPT2LMHeadModel,
|
24 |
+
GPT2TokenizerFast,
|
25 |
+
TrainingArguments,
|
26 |
+
Trainer,
|
27 |
+
DataCollatorForLanguageModeling,
|
28 |
+
)
|
29 |
+
from datasets import Dataset
|
30 |
+
from datasets import load_dataset
|
31 |
+
import evaluate
|
32 |
+
import numpy as np
|
33 |
+
from transformers import AutoTokenizer,AutoConfig
|
34 |
+
|
35 |
+
|
36 |
+
# Load the DNA text dataset and split off a small evaluation set
|
37 |
+
dataset = load_dataset("text", data_files="../01-data_env/data/dna_1g.txt")["train"].train_test_split(test_size=0.01, shuffle=True)
|
38 |
+
|
39 |
+
# Maximum input length
|
40 |
+
max_length = 256
|
41 |
+
|
42 |
+
|
43 |
+
# Data preprocessing: tokenize each line
|
44 |
+
def preprocess_function(examples):
|
45 |
+
return tokenizer(examples["text"], truncation=True, max_length=max_length)
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
# Initialize the GPT-2 tokenizer (custom-trained vocabulary)
|
50 |
+
tokenizer = AutoTokenizer.from_pretrained("gpt2_tokenizer")
|
51 |
+
tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=5)
|
52 |
+
|
53 |
+
# 4. Create a data collator for dynamic padding and label masking
|
54 |
+
data_collator = DataCollatorForLanguageModeling(
|
55 |
+
tokenizer=tokenizer, mlm=False
|
56 |
+
)
|
57 |
+
|
58 |
+
|
59 |
+
|
60 |
+
# Load and adjust the GPT-2 configuration
config = AutoConfig.from_pretrained(
    "gpt2",                      # start from the default GPT-2 configuration
    vocab_size=len(tokenizer),   # match the vocabulary size of the custom tokenizer
    n_ctx=max_length,            # maximum context (sequence) length
    n_positions=max_length,      # maximum position-embedding length, usually equal to n_ctx
)
|
67 |
+
|
68 |
+
# Initialize the GPT-2 model
|
69 |
+
model = GPT2LMHeadModel(config)
|
70 |
+
|
71 |
+
# Define training arguments
|
72 |
+
training_args = TrainingArguments(
|
73 |
+
output_dir="./gpt2-small",
|
74 |
+
overwrite_output_dir=True,
|
75 |
+
num_train_epochs=5,
|
76 |
+
per_device_train_batch_size=64,
|
77 |
+
save_steps=10000,
|
78 |
+
save_total_limit=2,
|
79 |
+
logging_dir="./logs",
|
80 |
+
logging_steps=20000,
|
81 |
+
evaluation_strategy="steps",
|
82 |
+
eval_steps=10000,
|
83 |
+
learning_rate=5e-5,
|
84 |
+
warmup_steps=500,
|
85 |
+
weight_decay=0.01,
|
86 |
+
    fp16=True,                       # enable mixed-precision training
|
87 |
+
deepspeed="ds_zero2_no_offload.json"
|
88 |
+
)
|
89 |
+
|
90 |
+
|
91 |
+
# Initialize the Trainer
|
92 |
+
trainer = Trainer(
|
93 |
+
model=model,
|
94 |
+
args=training_args,
|
95 |
+
train_dataset=tokenized_dataset["train"],
|
96 |
+
eval_dataset=tokenized_dataset["test"],
|
97 |
+
tokenizer=tokenizer,
|
98 |
+
data_collator=data_collator,
|
99 |
+
)
|
100 |
+
|
101 |
+
# Start training
|
102 |
+
trainer.train()
|
103 |
+
|
104 |
+
|
105 |
+
# Evaluate perplexity on the held-out split
|
106 |
+
eval_results = trainer.evaluate()
|
107 |
+
perplexity = math.exp(eval_results["eval_loss"])
|
108 |
+
print(f"Perplexity: {perplexity}")
|
109 |
+
|
110 |
+
|
111 |
+
|
112 |
+
out_model_path = "gpt2-small-gene-openweb"
|
113 |
+
trainer.save_model(out_model_path)
|
114 |
+
tokenizer.save_pretrained(out_model_path)
|
04-gene-sft/.ipynb_checkpoints/ds_zero2_no_offload-checkpoint.json
ADDED
@@ -0,0 +1,27 @@
1 |
+
{
|
2 |
+
"fp16": {
|
3 |
+
"enabled": "auto",
|
4 |
+
"loss_scale": 0,
|
5 |
+
"loss_scale_window": 100,
|
6 |
+
"initial_scale_power": 16,
|
7 |
+
"hysteresis": 2,
|
8 |
+
"min_loss_scale": 1e-10
|
9 |
+
},
|
10 |
+
|
11 |
+
"zero_optimization": {
|
12 |
+
"stage": 2,
|
13 |
+
"allgather_partitions": true,
|
14 |
+
"allgather_bucket_size": 1e8,
|
15 |
+
"overlap_comm": true,
|
16 |
+
"reduce_scatter": true,
|
17 |
+
"reduce_bucket_size": 1e8,
|
18 |
+
"contiguous_gradients": true
|
19 |
+
},
|
20 |
+
|
21 |
+
"gradient_accumulation_steps": "auto",
|
22 |
+
"gradient_clipping": "auto",
|
23 |
+
"steps_per_print": 2000,
|
24 |
+
"train_batch_size": "auto",
|
25 |
+
"train_micro_batch_size_per_gpu": "auto",
|
26 |
+
"wall_clock_breakdown": false
|
27 |
+
}
|