dushuai112233 committed on
Commit
a093e2c
·
verified ·
1 Parent(s): 4ee2b4e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -32
app.py CHANGED
@@ -1,74 +1,76 @@
1
  from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
2
  from peft import LoraConfig, get_peft_model, TaskType
3
  from datasets import load_dataset
4
- from torch.utils.tensorboard import SummaryWriter
5
  import torch
6
  import os
7
 
8
  def main():
9
  # 基础模型位置
10
- model_name = "dushuai112233/Qwen2-1.5B-Instruct" # 使用你提供的模型
11
- # 设备
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
 
14
  # 加载分词器和模型
15
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
16
  model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
17
 
18
- # Setup PEFT (Low-Rank Adaption)
19
  peft_config = LoraConfig(
20
  task_type=TaskType.CAUSAL_LM,
21
  target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
22
  inference_mode=False,
23
- r=8, # 低秩矩阵的秩
24
- lora_alpha=32, # LoRA的alpha超参数
25
  lora_dropout=0.1
26
  )
27
  model = get_peft_model(model, peft_config)
28
 
29
- # 加载 Hugging Face 数据集
30
- ds = load_dataset("dushuai112233/medical") # 自动加载 train 和 val 分区
31
-
32
- # 提取训练集和验证集
33
  train_dataset = ds["train"]
34
  val_dataset = ds["validation"]
35
 
36
- # Tokenize the datasets
37
  def tokenize_function(examples):
38
- # 注意: 对于 Causal LM,通常会使用输入文本作为标签(shifted label)
39
  encodings = tokenizer(examples['question'], padding='max_length', truncation=True, max_length=128)
40
- encodings['labels'] = encodings['input_ids'].copy() # Causal LM labels should be same as input_ids
41
  return encodings
42
 
43
  train_dataset = train_dataset.map(tokenize_function, batched=True)
44
  val_dataset = val_dataset.map(tokenize_function, batched=True)
45
 
46
- # Define Training Arguments
47
  training_args = TrainingArguments(
48
- output_dir="./output", # 保存模型和日志的路径
49
- evaluation_strategy="epoch", # 每个epoch后进行验证
50
- per_device_train_batch_size=1, # 每个设备的batch size
51
- per_device_eval_batch_size=1, # 验证时的batch size
52
- logging_dir="./logs", # 日志目录
53
- logging_steps=10, # 每10步记录一次日志
54
- save_steps=100, # 每100步保存一次模型
55
- num_train_epochs=10, # 训练的epoch数
56
- save_total_limit=2, # 最大保存模型数
 
57
  )
58
 
59
- # Define the Trainer
60
  trainer = Trainer(
61
- model=model, # 训练的模型
62
- args=training_args, # 训练的参数
63
- train_dataset=train_dataset, # 训练数据集
64
- eval_dataset=val_dataset, # 验证数据集
65
- tokenizer=tokenizer, # 用于预处理的分词器
66
  )
67
 
68
- # Start Training
69
- trainer.train()
 
 
 
 
 
 
70
 
71
- # Save the model
72
  model.save_pretrained('./output')
73
 
74
  if __name__ == '__main__':
 
1
  from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
2
  from peft import LoraConfig, get_peft_model, TaskType
3
  from datasets import load_dataset
 
4
  import torch
5
  import os
6
 
7
  def main():
8
  # 基础模型位置
9
+ model_name = "dushuai112233/Qwen2-1.5B-Instruct"
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
 
12
  # 加载分词器和模型
13
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
14
  model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
15
 
16
+ # Setup PEFT
17
  peft_config = LoraConfig(
18
  task_type=TaskType.CAUSAL_LM,
19
  target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
20
  inference_mode=False,
21
+ r=8,
22
+ lora_alpha=32,
23
  lora_dropout=0.1
24
  )
25
  model = get_peft_model(model, peft_config)
26
 
27
+ # 加载数据集
28
+ ds = load_dataset("dushuai112233/medical")
 
 
29
  train_dataset = ds["train"]
30
  val_dataset = ds["validation"]
31
 
32
+ # 数据集预处理
33
  def tokenize_function(examples):
 
34
  encodings = tokenizer(examples['question'], padding='max_length', truncation=True, max_length=128)
35
+ encodings['labels'] = encodings['input_ids'].copy()
36
  return encodings
37
 
38
  train_dataset = train_dataset.map(tokenize_function, batched=True)
39
  val_dataset = val_dataset.map(tokenize_function, batched=True)
40
 
41
+ # 设置训练参数
42
  training_args = TrainingArguments(
43
+ output_dir="./output",
44
+ evaluation_strategy="epoch",
45
+ per_device_train_batch_size=1,
46
+ per_device_eval_batch_size=1,
47
+ logging_dir="./logs",
48
+ logging_steps=10,
49
+ save_steps=100, # 每 100 步保存一次检查点
50
+ save_total_limit=2, # 限制最多保存 2 个检查点
51
+ num_train_epochs=10,
52
+ load_best_model_at_end=False, # 是否在训练结束时加载最优模型
53
  )
54
 
55
+ # 定义 Trainer
56
  trainer = Trainer(
57
+ model=model,
58
+ args=training_args,
59
+ train_dataset=train_dataset,
60
+ eval_dataset=val_dataset,
61
+ tokenizer=tokenizer,
62
  )
63
 
64
+ # 检查是否有中断点
65
+ checkpoint = None
66
+ if os.path.exists("./output") and len(os.listdir("./output")) > 0:
67
+ checkpoint = max([os.path.join("./output", ckpt) for ckpt in os.listdir("./output")], key=os.path.getmtime)
68
+ print(f"Resuming training from checkpoint: {checkpoint}")
69
+
70
+ # 开始训练
71
+ trainer.train(resume_from_checkpoint=checkpoint)
72
 
73
+ # 保存最终模型
74
  model.save_pretrained('./output')
75
 
76
  if __name__ == '__main__':