{ "cells": [ { "cell_type": "markdown", "id": "9131f25f-227b-4dbe-b28d-c5006df092c6", "metadata": {}, "source": [ "# 2.5 基于多模态数据构建大模型" ] }, { "cell_type": "code", "execution_count": null, "id": "1a30b35c-1f5f-41e6-8fe1-5f522c700e9e", "metadata": {}, "outputs": [], "source": [ "from tokenizers import (\n", " decoders,\n", " models,\n", " normalizers,\n", " pre_tokenizers,\n", " processors,\n", " trainers,\n", " Tokenizer,\n", ")\n", "from transformers import AutoTokenizer" ] }, { "cell_type": "code", "execution_count": null, "id": "688fa3b1-f2ca-457a-abde-117c79b54fa9", "metadata": {}, "outputs": [], "source": [ "tokenizer = Tokenizer(models.BPE())\n", "tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False) #use_regex=False,空格当成一般字符串\n", "trainer = trainers.BpeTrainer(vocab_size=90000, special_tokens=[\"<|endoftext|>\"]) #9w words" ] }, { "cell_type": "code", "execution_count": null, "id": "7d680700-1051-4af4-94d6-2ce3071a5979", "metadata": {}, "outputs": [], "source": [ "tokenizer.train([\"../01-data_env/data/dna_1g.txt\",\"../01-data_env/data/protein_1g.txt\",\"../01-data_env/data/english_500m.txt\"]\n", " , trainer=trainer) #all file list, take 10-20 min" ] }, { "cell_type": "code", "execution_count": null, "id": "74434ece-2f6e-46fa-9a9e-ff88e9364de8", "metadata": {}, "outputs": [], "source": [ "tokenizer.save(\"gene_eng_dict.json\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8ea34e18-6cee-40b9-ba96-d8734153eb9f", "metadata": {}, "outputs": [], "source": [ "#然后我们可以使用from_file() 方法从该文件里重新加载 Tokenizer 对象:\n", "new_tokenizer = Tokenizer.from_file(\"gene_eng_dict.json\")\n", "\n", "#要在 🤗 Transformers 中使用这个标记器,我们必须将它包裹在一个 PreTrainedTokenizerFast 类中\n", "from transformers import GPT2TokenizerFast\n", "gene_eng_tokenizer = GPT2TokenizerFast(tokenizer_object=new_tokenizer)\n", "gene_eng_tokenizer.save_pretrained(\"gene_eng_dict\")\n", "#dna_tokenizer.push_to_hub(\"dna_bpe_dict_1g\", organization=\"dnagpt\", use_auth_token=\"hf_*****\") # push to huggingface" ] }, { "cell_type": "code", "execution_count": null, "id": "16c7a3ef-c924-4fbb-b8ab-c12fab43f019", "metadata": {}, "outputs": [], "source": [ "tokenizer_new = AutoTokenizer.from_pretrained('gene_eng_dict')\n", "tokenizer_new.tokenize(\"TGGCGTGAACCCGGGATCGGG,hello world hello gene, MANITWMANHTGWSDFILLGLFRQSKHPALLCVVIFVVFLMAL\")" ] }, { "cell_type": "markdown", "id": "0ca0b2e3-f270-4645-abbb-cb8535e97a0a", "metadata": {}, "source": [ "## 训练混合模型" ] }, { "cell_type": "code", "execution_count": null, "id": "c9b1c9b4-57a8-4711-912d-307e55481f8a", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig,GPT2Tokenizer\n", "from transformers import GPT2Tokenizer,GPT2Model,AutoModel\n", "from transformers import DataCollatorForLanguageModeling\n", "from transformers import Trainer, TrainingArguments\n", "from transformers import LineByLineTextDataset\n", "from tokenizers import Tokenizer\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "3926a959-4224-4d78-9413-dc47a58087e0", "metadata": {}, "outputs": [], "source": [ "tokenizer = GPT2Tokenizer.from_pretrained(\"gene_eng_dict\")\n", "tokenizer.pad_token = tokenizer.eos_token" ] }, { "cell_type": "code", "execution_count": null, "id": "1c2f5a6d-d405-40dc-a802-f0c1dff50a1e", "metadata": {}, "outputs": [], "source": [ "max_length = 256 #最大输入长度\n", "\n", "config = AutoConfig.from_pretrained(\n", " \"gpt2\",\n", " vocab_size=len(tokenizer),\n", " 
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c2f5a6d-d405-40dc-a802-f0c1dff50a1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "max_length = 256  # maximum input length\n",
    "\n",
    "config = AutoConfig.from_pretrained(\n",
    "    \"gpt2\",\n",
    "    vocab_size=len(tokenizer),\n",
    "    n_positions=max_length,  # maximum context length (size of the position embeddings)\n",
    "    bos_token_id=tokenizer.bos_token_id,\n",
    "    eos_token_id=tokenizer.eos_token_id,\n",
    ")\n",
    "\n",
    "model = GPT2LMHeadModel(config)  # randomly initialized, for pretraining from scratch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8a47141-56a7-4e41-8cfd-1b381a64e2c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. load the mixed DNA / protein / English text dataset\n",
    "raw_dataset = load_dataset('text',\n",
    "                           data_files=[\"../01-data_env/data/dna_1g.txt\",\n",
    "                                       \"../01-data_env/data/protein_1g.txt\",\n",
    "                                       \"../01-data_env/data/english_500m.txt\"])\n",
    "\n",
    "dataset = raw_dataset[\"train\"].train_test_split(test_size=0.05, shuffle=True)\n",
    "\n",
    "# 2. tokenization function\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)\n",
    "\n",
    "# 3. apply the tokenization function to the dataset\n",
    "tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'],\n",
    "                                 num_proc=15)  # set to your CPU core count, or adjust as needed\n",
    "\n",
    "# 4. data collator for causal language modeling (mlm=False); it pads batches and builds the labels\n",
    "data_collator = DataCollatorForLanguageModeling(\n",
    "    tokenizer=tokenizer, mlm=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4f802a2-88e2-49c2-a654-9d6e0996433a",
   "metadata": {},
   "outputs": [],
   "source": [
    "run_path = \"gpt2_run\"\n",
    "train_epochs = 5\n",
    "batch_size = 10\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    output_dir=run_path,\n",
    "    overwrite_output_dir=True,\n",
    "    num_train_epochs=train_epochs,\n",
    "    per_device_train_batch_size=batch_size,\n",
    "    save_steps=2000,\n",
    "    save_total_limit=2,\n",
    "    prediction_loss_only=True,\n",
    "    fp16=True,  # fp16 mixed precision; requires a GPU with FP16 support\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"test\"],\n",
    "    data_collator=data_collator,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13fa4a99-ee7c-4d6a-853f-4be04a4ee43c",
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()\n",
    "trainer.save_model(\"gene_eng_gpt2_v0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca452721-3914-49be-a577-d4c257946578",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "eval_results = trainer.evaluate()\n",
    "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7e7a455-0e08-4a75-87c1-0f909829b1c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# upload the model to the Hugging Face Hub (optional)\n",
    "#model.push_to_hub(\"gene_eng_gpt2_v0\", organization=\"dnagpt\", use_auth_token=\"hf_*******\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}