dushuai112233 committed on
Commit 3098aa9 · verified · 1 Parent(s): c62ca9a

Update app.py

Files changed (1)
  1. app.py +212 -212
app.py CHANGED
@@ -1,212 +1,212 @@
- # import torch
- # from torch.utils.data import DataLoader
- # from torch.utils.tensorboard import SummaryWriter
- # from transformers import AutoModelForCausalLM, AutoTokenizer
- # from peft import LoraConfig, get_peft_model, TaskType
- # import pandas as pd
- # from qa_dataset import QADataset
- # from tqdm import tqdm
- # import os, time, sys
- #
- #
- # def train_model(model, train_loader, val_loader, optimizer, gradient_accumulation_steps,
- #                 device, num_epochs, model_output_dir, writer):
- #     batch_step = 0
- #     for epoch in range(num_epochs):
- #         time1 = time.time()
- #         model.train()
- #         for index, data in enumerate(tqdm(train_loader, file=sys.stdout, desc="Train Epoch: " + str(epoch))):
- #             input_ids = data['input_ids'].to(device, dtype=torch.long)
- #             attention_mask = data['attention_mask'].to(device, dtype=torch.long)
- #             labels = data['labels'].to(device, dtype=torch.long)
- #             # Forward pass
- #             outputs = model(
- #                 input_ids=input_ids,
- #                 attention_mask=attention_mask,
- #                 labels=labels,
- #             )
- #             loss = outputs.loss
- #             # Backward pass: compute gradients for the current batch
- #             loss.backward()
- #             # Step only every gradient_accumulation_steps batches
- #             if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(train_loader) - 1:
- #                 # Update the model parameters
- #                 optimizer.step()
- #                 # Clear the accumulated gradients
- #                 optimizer.zero_grad()
- #                 writer.add_scalar('Loss/train', loss, batch_step)
- #                 batch_step += 1
- #             # Print the loss every 100 steps
- #             if index % 100 == 0 or index == len(train_loader) - 1:
- #                 time2 = time.time()
- #                 tqdm.write(
- #                     f"{index}, epoch: {epoch} -loss: {str(loss)} ; each step's time spent: {(str(float(time2 - time1) / float(index + 0.0001)))}")
- #         # Validation
- #         model.eval()
- #         val_loss = validate_model(model, val_loader, device)
- #         writer.add_scalar('Loss/val', val_loss, epoch)
- #         print(f"val loss: {val_loss} , epoch: {epoch}")
- #         print("Save Model To ", model_output_dir)
- #         model.save_pretrained(model_output_dir)
- #
- #
- # def validate_model(model, val_loader, device):
- #     running_loss = 0.0
- #     with torch.no_grad():
- #         for _, data in enumerate(tqdm(val_loader, file=sys.stdout, desc="Validation Data")):
- #             input_ids = data['input_ids'].to(device, dtype=torch.long)
- #             attention_mask = data['attention_mask'].to(device, dtype=torch.long)
- #             labels = data['labels'].to(device, dtype=torch.long)
- #             outputs = model(
- #                 input_ids=input_ids,
- #                 attention_mask=attention_mask,
- #                 labels=labels,
- #             )
- #             loss = outputs.loss
- #             running_loss += loss.item()
- #     return running_loss / len(val_loader)
- #
- #
- # def main():
- #     # Path to the base model
- #     model_name = "model/Qwen2-1.5B-Instruct"
- #     # Training set
- #     train_json_path = "./data/train.json"
- #     # Validation set
- #     val_json_path = "./data/val.json"
- #     max_source_length = 128
- #     max_target_length = 256
- #     epochs = 10
- #     batch_size = 1
- #     lr = 1e-4
- #     gradient_accumulation_steps = 16
- #     lora_rank = 8
- #     lora_alpha = 32
- #     model_output_dir = "output"
- #     logs_dir = "logs"
- #     # Device
- #     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
- #     # Load the tokenizer and model
- #     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- #     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
- #     # Set up PEFT (LoRA)
- #     peft_config = LoraConfig(
- #         task_type=TaskType.CAUSAL_LM,
- #         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
- #         inference_mode=False,
- #         r=lora_rank,
- #         lora_alpha=lora_alpha,
- #         lora_dropout=0.1
- #     )
- #     model = get_peft_model(model, peft_config)
- #     model.is_parallelizable = True
- #     model.model_parallel = True
- #     model.print_trainable_parameters()
- #     print("Start Load Train Data...")
- #     train_params = {
- #         "batch_size": batch_size,
- #         "shuffle": True,
- #         "num_workers": 0,
- #     }
- #     training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
- #     training_loader = DataLoader(training_set, **train_params)
- #     print("Start Load Validation Data...")
- #     val_params = {
- #         "batch_size": batch_size,
- #         "shuffle": False,
- #         "num_workers": 0,
- #     }
- #     val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
- #     val_loader = DataLoader(val_set, **val_params)
- #     # TensorBoard logging
- #     writer = SummaryWriter(logs_dir)
- #     # Optimizer
- #     optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
- #     model = model.to(device)
- #     # Start training
- #     print("Start Training...")
- #     train_model(
- #         model=model,
- #         train_loader=training_loader,
- #         val_loader=val_loader,
- #         optimizer=optimizer,
- #         gradient_accumulation_steps=gradient_accumulation_steps,
- #         device=device,
- #         num_epochs=epochs,
- #         model_output_dir=model_output_dir,
- #         writer=writer
- #     )
- #     writer.close()
- #
- #
- # if __name__ == '__main__':
- #     main()
- #
- from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
- from peft import LoraConfig, get_peft_model, TaskType
- from datasets import load_dataset
- from torch.utils.tensorboard import SummaryWriter
- import os
- import torch
- def main():
-     # Path to the base model
-     model_name = "model/Qwen2-1.5B-Instruct"
-     # Device
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     # Load the tokenizer and model
-     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-
-     # Set up PEFT (Low-Rank Adaptation)
-     peft_config = LoraConfig(
-         task_type=TaskType.CAUSAL_LM,
-         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-         inference_mode=False,
-         r=8,  # rank of the low-rank matrices
-         lora_alpha=32,  # LoRA alpha hyperparameter
-         lora_dropout=0.1
-     )
-     model = get_peft_model(model, peft_config)
-
-     # Load the datasets (a single JSON file is exposed as the 'train' split)
-     train_dataset = load_dataset('json', data_files='./data/train.json', split='train')
-     val_dataset = load_dataset('json', data_files='./data/val.json', split='train')
-
-     # Tokenize the datasets
-     def tokenize_function(examples):
-         return tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=128)
-
-     train_dataset = train_dataset.map(tokenize_function, batched=True)
-     val_dataset = val_dataset.map(tokenize_function, batched=True)
-
-     # Define the training arguments
-     training_args = TrainingArguments(
-         output_dir="./output",  # path for saving the model and logs
-         evaluation_strategy="epoch",  # evaluate after every epoch
-         per_device_train_batch_size=1,  # per-device training batch size
-         per_device_eval_batch_size=1,  # per-device evaluation batch size
-         logging_dir="./logs",  # log directory
-         logging_steps=10,  # log every 10 steps
-         save_steps=100,  # save a checkpoint every 100 steps
-         num_train_epochs=10,  # number of training epochs
-         save_total_limit=2,  # maximum number of saved checkpoints
-     )
-
-     # Define the Trainer
-     trainer = Trainer(
-         model=model,  # the model to train
-         args=training_args,  # the training arguments
-         train_dataset=train_dataset,  # training dataset
-         eval_dataset=val_dataset,  # validation dataset
-         tokenizer=tokenizer,  # tokenizer used for preprocessing
-     )
-
-     # Start training
-     trainer.train()
-
-     # Save the model (LoRA adapter weights)
-     model.save_pretrained('./output')
-
- if __name__ == '__main__':
-     main()
 
+ # import torch
+ # from torch.utils.data import DataLoader
+ # from torch.utils.tensorboard import SummaryWriter
+ # from transformers import AutoModelForCausalLM, AutoTokenizer
+ # from peft import LoraConfig, get_peft_model, TaskType
+ # import pandas as pd
+ # from qa_dataset import QADataset
+ # from tqdm import tqdm
+ # import os, time, sys
+ #
+ #
+ # def train_model(model, train_loader, val_loader, optimizer, gradient_accumulation_steps,
+ #                 device, num_epochs, model_output_dir, writer):
+ #     batch_step = 0
+ #     for epoch in range(num_epochs):
+ #         time1 = time.time()
+ #         model.train()
+ #         for index, data in enumerate(tqdm(train_loader, file=sys.stdout, desc="Train Epoch: " + str(epoch))):
+ #             input_ids = data['input_ids'].to(device, dtype=torch.long)
+ #             attention_mask = data['attention_mask'].to(device, dtype=torch.long)
+ #             labels = data['labels'].to(device, dtype=torch.long)
+ #             # Forward pass
+ #             outputs = model(
+ #                 input_ids=input_ids,
+ #                 attention_mask=attention_mask,
+ #                 labels=labels,
+ #             )
+ #             loss = outputs.loss
+ #             # Backward pass: compute gradients for the current batch
+ #             loss.backward()
+ #             # Step only every gradient_accumulation_steps batches
+ #             if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(train_loader) - 1:
+ #                 # Update the model parameters
+ #                 optimizer.step()
+ #                 # Clear the accumulated gradients
+ #                 optimizer.zero_grad()
+ #                 writer.add_scalar('Loss/train', loss, batch_step)
+ #                 batch_step += 1
+ #             # Print the loss every 100 steps
+ #             if index % 100 == 0 or index == len(train_loader) - 1:
+ #                 time2 = time.time()
+ #                 tqdm.write(
+ #                     f"{index}, epoch: {epoch} -loss: {str(loss)} ; each step's time spent: {(str(float(time2 - time1) / float(index + 0.0001)))}")
+ #         # Validation
+ #         model.eval()
+ #         val_loss = validate_model(model, val_loader, device)
+ #         writer.add_scalar('Loss/val', val_loss, epoch)
+ #         print(f"val loss: {val_loss} , epoch: {epoch}")
+ #         print("Save Model To ", model_output_dir)
+ #         model.save_pretrained(model_output_dir)
+ #
+ #
+ # def validate_model(model, val_loader, device):
+ #     running_loss = 0.0
+ #     with torch.no_grad():
+ #         for _, data in enumerate(tqdm(val_loader, file=sys.stdout, desc="Validation Data")):
+ #             input_ids = data['input_ids'].to(device, dtype=torch.long)
+ #             attention_mask = data['attention_mask'].to(device, dtype=torch.long)
+ #             labels = data['labels'].to(device, dtype=torch.long)
+ #             outputs = model(
+ #                 input_ids=input_ids,
+ #                 attention_mask=attention_mask,
+ #                 labels=labels,
+ #             )
+ #             loss = outputs.loss
+ #             running_loss += loss.item()
+ #     return running_loss / len(val_loader)
+ #
+ #
+ # def main():
+ #     # Path to the base model
+ #     model_name = "model/Qwen2-1.5B-Instruct"
+ #     # Training set
+ #     train_json_path = "./data/train.json"
+ #     # Validation set
+ #     val_json_path = "./data/val.json"
+ #     max_source_length = 128
+ #     max_target_length = 256
+ #     epochs = 10
+ #     batch_size = 1
+ #     lr = 1e-4
+ #     gradient_accumulation_steps = 16
+ #     lora_rank = 8
+ #     lora_alpha = 32
+ #     model_output_dir = "output"
+ #     logs_dir = "logs"
+ #     # Device
+ #     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ #     # Load the tokenizer and model
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ #     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+ #     # Set up PEFT (LoRA)
+ #     peft_config = LoraConfig(
+ #         task_type=TaskType.CAUSAL_LM,
+ #         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+ #         inference_mode=False,
+ #         r=lora_rank,
+ #         lora_alpha=lora_alpha,
+ #         lora_dropout=0.1
+ #     )
+ #     model = get_peft_model(model, peft_config)
+ #     model.is_parallelizable = True
+ #     model.model_parallel = True
+ #     model.print_trainable_parameters()
+ #     print("Start Load Train Data...")
+ #     train_params = {
+ #         "batch_size": batch_size,
+ #         "shuffle": True,
+ #         "num_workers": 0,
+ #     }
+ #     training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
+ #     training_loader = DataLoader(training_set, **train_params)
+ #     print("Start Load Validation Data...")
+ #     val_params = {
+ #         "batch_size": batch_size,
+ #         "shuffle": False,
+ #         "num_workers": 0,
+ #     }
+ #     val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
+ #     val_loader = DataLoader(val_set, **val_params)
+ #     # TensorBoard logging
+ #     writer = SummaryWriter(logs_dir)
+ #     # Optimizer
+ #     optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
+ #     model = model.to(device)
+ #     # Start training
+ #     print("Start Training...")
+ #     train_model(
+ #         model=model,
+ #         train_loader=training_loader,
+ #         val_loader=val_loader,
+ #         optimizer=optimizer,
+ #         gradient_accumulation_steps=gradient_accumulation_steps,
+ #         device=device,
+ #         num_epochs=epochs,
+ #         model_output_dir=model_output_dir,
+ #         writer=writer
+ #     )
+ #     writer.close()
+ #
+ #
+ # if __name__ == '__main__':
+ #     main()
+ #
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+ from peft import LoraConfig, get_peft_model, TaskType
+ from datasets import load_dataset
+ from torch.utils.tensorboard import SummaryWriter
+ import os
+ import torch
+ def main():
+     # Path to the base model
+     model_name = "D:\\Qwen2-1.5B-Instruct"
+     # Device
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     # Load the tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+
+     # Set up PEFT (Low-Rank Adaptation)
+     peft_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+         inference_mode=False,
+         r=8,  # rank of the low-rank matrices
+         lora_alpha=32,  # LoRA alpha hyperparameter
+         lora_dropout=0.1
+     )
+     model = get_peft_model(model, peft_config)
+
+     # Load the datasets (a single JSON file is exposed as the 'train' split)
+     train_dataset = load_dataset('json', data_files='./data/train.json', split='train')
+     val_dataset = load_dataset('json', data_files='./data/val.json', split='train')
+
+     # Tokenize the datasets
+     def tokenize_function(examples):
+         return tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=128)
+
+     train_dataset = train_dataset.map(tokenize_function, batched=True)
+     val_dataset = val_dataset.map(tokenize_function, batched=True)
+
+     # Define the training arguments
+     training_args = TrainingArguments(
+         output_dir="./output",  # path for saving the model and logs
+         evaluation_strategy="epoch",  # evaluate after every epoch
+         per_device_train_batch_size=1,  # per-device training batch size
+         per_device_eval_batch_size=1,  # per-device evaluation batch size
+         logging_dir="./logs",  # log directory
+         logging_steps=10,  # log every 10 steps
+         save_steps=100,  # save a checkpoint every 100 steps
+         num_train_epochs=10,  # number of training epochs
+         save_total_limit=2,  # maximum number of saved checkpoints
+     )
+
+     # Define the Trainer
+     trainer = Trainer(
+         model=model,  # the model to train
+         args=training_args,  # the training arguments
+         train_dataset=train_dataset,  # training dataset
+         eval_dataset=val_dataset,  # validation dataset
+         tokenizer=tokenizer,  # tokenizer used for preprocessing
+     )
+
+     # Start training
+     trainer.train()
+
+     # Save the model (LoRA adapter weights)
+     model.save_pretrained('./output')
+
+ if __name__ == '__main__':
+     main()
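
A note on the Trainer-based script above: tokenize_function returns only input_ids and attention_mask, so the Trainer has no labels to compute a causal-LM loss from and training will fail. Below is a minimal sketch of a label-producing variant, assuming the same hypothetical 'input_text' field used in app.py; for causal LM fine-tuning the labels are simply a copy of the input ids, with padding positions masked out.

# Sketch only: assumes each record in train.json / val.json has an 'input_text' field.
def tokenize_function(examples):
    tokens = tokenizer(
        examples['input_text'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    # For causal LM fine-tuning the labels are the input ids themselves
    # (the model shifts them internally when computing the loss);
    # pad positions are set to -100 so they are ignored by the loss.
    tokens['labels'] = [
        [tok if tok != tokenizer.pad_token_id else -100 for tok in ids]
        for ids in tokens['input_ids']
    ]
    return tokens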
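
Also worth noting: because the model is wrapped with get_peft_model, model.save_pretrained('./output') stores only the LoRA adapter weights, not the full base model. A minimal sketch of loading the adapter back for inference, assuming the base-model path from the updated script (adjust to wherever the base model actually lives):

# Sketch only: re-attach the saved LoRA adapter to the base model for inference.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("D:\\Qwen2-1.5B-Instruct", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("D:\\Qwen2-1.5B-Instruct", trust_remote_code=True)
model = PeftModel.from_pretrained(base_model, "./output")  # adapter saved by app.py
model = model.merge_and_unload()  # optional: merge the adapter into the base weights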