dushuai112233 committed on
Commit 47f89ab · verified · 1 Parent(s): 016f433

Upload 5 files

Files changed (5)
  1. app.py +212 -0
  2. dataset.py +49 -0
  3. demo01.py +2 -0
  4. qa_dataset.py +60 -0
  5. test.py +34 -0
app.py ADDED
@@ -0,0 +1,212 @@
+ # import torch
+ # from torch.utils.data import DataLoader
+ # from torch.utils.tensorboard import SummaryWriter
+ # from transformers import AutoModelForCausalLM, AutoTokenizer
+ # from peft import LoraConfig, get_peft_model, TaskType
+ # import pandas as pd
+ # from qa_dataset import QADataset
+ # from tqdm import tqdm
+ # import os, time, sys
+ #
+ #
+ # def train_model(model, train_loader, val_loader, optimizer, gradient_accumulation_steps,
+ #                 device, num_epochs, model_output_dir, writer):
+ #     batch_step = 0
+ #     for epoch in range(num_epochs):
+ #         time1 = time.time()
+ #         model.train()
+ #         for index, data in enumerate(tqdm(train_loader, file=sys.stdout, desc="Train Epoch: " + str(epoch))):
+ #             input_ids = data['input_ids'].to(device, dtype=torch.long)
+ #             attention_mask = data['attention_mask'].to(device, dtype=torch.long)
+ #             labels = data['labels'].to(device, dtype=torch.long)
+ #             # Forward pass
+ #             outputs = model(
+ #                 input_ids=input_ids,
+ #                 attention_mask=attention_mask,
+ #                 labels=labels,
+ #             )
+ #             loss = outputs.loss
+ #             # Backward pass: accumulate gradients for this batch
+ #             loss.backward()
+ #             # Gradient accumulation: only step every gradient_accumulation_steps batches
+ #             if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(train_loader) - 1:
+ #                 # Update the model parameters
+ #                 optimizer.step()
+ #                 # Clear the accumulated gradients
+ #                 optimizer.zero_grad()
+ #             writer.add_scalar('Loss/train', loss.item(), batch_step)
+ #             batch_step += 1
+ #             # Print the loss every 100 steps
+ #             if index % 100 == 0 or index == len(train_loader) - 1:
+ #                 time2 = time.time()
+ #                 tqdm.write(
+ #                     f"{index}, epoch: {epoch} - loss: {loss.item()}; each step's time spent: {(time2 - time1) / float(index + 0.0001)}")
+ #         # Validate after each epoch
+ #         model.eval()
+ #         val_loss = validate_model(model, val_loader, device)
+ #         writer.add_scalar('Loss/val', val_loss, epoch)
+ #         print(f"val loss: {val_loss} , epoch: {epoch}")
+ #         print("Save Model To ", model_output_dir)
+ #         model.save_pretrained(model_output_dir)
+ #
+ #
+ # # Parameter order matches the call site above: (model, val_loader, device)
+ # def validate_model(model, val_loader, device):
+ #     running_loss = 0.0
+ #     with torch.no_grad():
+ #         for _, data in enumerate(tqdm(val_loader, file=sys.stdout, desc="Validation Data")):
+ #             input_ids = data['input_ids'].to(device, dtype=torch.long)
+ #             attention_mask = data['attention_mask'].to(device, dtype=torch.long)
+ #             labels = data['labels'].to(device, dtype=torch.long)
+ #             outputs = model(
+ #                 input_ids=input_ids,
+ #                 attention_mask=attention_mask,
+ #                 labels=labels,
+ #             )
+ #             loss = outputs.loss
+ #             running_loss += loss.item()
+ #     return running_loss / len(val_loader)
+ #
+ #
+ # def main():
+ #     # Path to the base model
+ #     model_name = "model/Qwen2-1.5B-Instruct"
+ #     # Training set
+ #     train_json_path = "./data/train.json"
+ #     # Validation set
+ #     val_json_path = "./data/val.json"
+ #     max_source_length = 128
+ #     max_target_length = 256
+ #     epochs = 10
+ #     batch_size = 1
+ #     lr = 1e-4
+ #     gradient_accumulation_steps = 16
+ #     lora_rank = 8
+ #     lora_alpha = 32
+ #     model_output_dir = "output"
+ #     logs_dir = "logs"
+ #     # Device
+ #     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ #     # Load the tokenizer and model
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ #     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+ #     # Set up PEFT (LoRA)
+ #     peft_config = LoraConfig(
+ #         task_type=TaskType.CAUSAL_LM,
+ #         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+ #         inference_mode=False,
+ #         r=lora_rank,
+ #         lora_alpha=lora_alpha,
+ #         lora_dropout=0.1
+ #     )
+ #     model = get_peft_model(model, peft_config)
+ #     model.is_parallelizable = True
+ #     model.model_parallel = True
+ #     model.print_trainable_parameters()
+ #     print("Start Load Train Data...")
+ #     train_params = {
+ #         "batch_size": batch_size,
+ #         "shuffle": True,
+ #         "num_workers": 0,
+ #     }
+ #     training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
+ #     training_loader = DataLoader(training_set, **train_params)
+ #     print("Start Load Validation Data...")
+ #     val_params = {
+ #         "batch_size": batch_size,
+ #         "shuffle": False,
+ #         "num_workers": 0,
+ #     }
+ #     val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
+ #     val_loader = DataLoader(val_set, **val_params)
+ #     # TensorBoard logging
+ #     writer = SummaryWriter(logs_dir)
+ #     # Optimizer
+ #     optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
+ #     model = model.to(device)
+ #     # Start training
+ #     print("Start Training...")
+ #     train_model(
+ #         model=model,
+ #         train_loader=training_loader,
+ #         val_loader=val_loader,
+ #         optimizer=optimizer,
+ #         gradient_accumulation_steps=gradient_accumulation_steps,
+ #         device=device,
+ #         num_epochs=epochs,
+ #         model_output_dir=model_output_dir,
+ #         writer=writer
+ #     )
+ #     writer.close()
+ #
+ #
+ # if __name__ == '__main__':
+ #     main()
+ #
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+ from peft import LoraConfig, get_peft_model, TaskType
+ from datasets import load_dataset
+
+
+ def main():
+     # Path to the base model (Trainer handles device placement and
+     # TensorBoard logging itself, so no manual setup is needed)
+     model_name = "model/Qwen2-1.5B-Instruct"
+
+     # Load the tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+
+     # Set up PEFT (Low-Rank Adaptation)
+     peft_config = LoraConfig(
+         task_type=TaskType.CAUSAL_LM,
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+         inference_mode=False,
+         r=8,  # rank of the low-rank update matrices
+         lora_alpha=32,  # LoRA alpha scaling hyperparameter
+         lora_dropout=0.1
+     )
+     model = get_peft_model(model, peft_config)
+
+     # Load the datasets; the json loader exposes a single file under the
+     # "train" split, so both files are requested with split='train'
+     train_dataset = load_dataset('json', data_files='./data/train.json', split='train')
+     val_dataset = load_dataset('json', data_files='./data/val.json', split='train')
+
+     # Tokenize the datasets: the JSONL records carry "question"/"answer"
+     # fields; copy input_ids into labels so Trainer computes a causal-LM loss
+     def tokenize_function(examples):
+         texts = [q + "\n" + a for q, a in zip(examples['question'], examples['answer'])]
+         tokenized = tokenizer(texts, padding='max_length', truncation=True, max_length=384)
+         tokenized['labels'] = [ids.copy() for ids in tokenized['input_ids']]
+         return tokenized
+
+     train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
+     val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)
+
+     # Define the training arguments
+     training_args = TrainingArguments(
+         output_dir="./output",  # where checkpoints are saved
+         evaluation_strategy="epoch",  # evaluate after every epoch (newer transformers names this eval_strategy)
+         per_device_train_batch_size=1,  # per-device training batch size
+         per_device_eval_batch_size=1,  # per-device evaluation batch size
+         logging_dir="./logs",  # TensorBoard log directory
+         logging_steps=10,  # log every 10 steps
+         save_steps=100,  # save a checkpoint every 100 steps
+         num_train_epochs=10,  # number of training epochs
+         save_total_limit=2,  # keep at most 2 checkpoints
+     )
+
+     # Define the Trainer
+     trainer = Trainer(
+         model=model,  # the model to train
+         args=training_args,  # the training arguments
+         train_dataset=train_dataset,  # training dataset
+         eval_dataset=val_dataset,  # validation dataset
+         tokenizer=tokenizer,  # tokenizer used for preprocessing
+     )
+
+     # Start training
+     trainer.train()
+
+     # Save the LoRA adapter
+     model.save_pretrained('./output')
+
+
+ if __name__ == '__main__':
+     main()
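
A note on the labels above: copying input_ids into labels inside tokenize_function also puts loss on the padding positions. A common alternative is to let a collator build the labels at batch time; a minimal sketch using transformers' stock DataCollatorForLanguageModeling (not part of this commit):

from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal-LM collation: the collator copies input_ids into
# labels and replaces padding positions with -100 so the loss ignores them.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)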
dataset.py ADDED
@@ -0,0 +1,49 @@
+ import json
+ import pandas as pd
+
+ data_path = [
+     "./data/Chinese-medical-dialogue-data-master/Data_数据/IM_内科/内科5000-33000.csv",
+     "./data/Chinese-medical-dialogue-data-master/Data_数据/Oncology_肿瘤科/肿瘤科5-10000.csv",
+     "./data/Chinese-medical-dialogue-data-master/Data_数据/Pediatric_儿科/儿科5-14000.csv",
+     "./data/Chinese-medical-dialogue-data-master/Data_数据/Surgical_外科/外科5-14000.csv",
+ ]
+
+ train_json_path = "./data/train.json"
+ val_json_path = "./data/val.json"
+ # Take 10,000 rows from each file for training
+ train_size = 10000
+ # Take 2,000 rows from each file for validation
+ val_size = 2000
+
+
+ def main():
+     # Open in "w" mode so a rerun overwrites instead of appending duplicates
+     train_f = open(train_json_path, "w", encoding='utf-8')
+     val_f = open(val_json_path, "w", encoding='utf-8')
+     for path in data_path:
+         # 'ANSI' is not a registered Python codec; these CSVs are
+         # GBK-encoded, which gb18030 supersets
+         data = pd.read_csv(path, encoding='gb18030')
+         train_count = 0
+         val_count = 0
+         for index, row in data.iterrows():
+             question = row["ask"]
+             answer = row["answer"]
+             line = {
+                 "question": question,
+                 "answer": answer
+             }
+             line = json.dumps(line, ensure_ascii=False)
+             if train_count < train_size:
+                 train_f.write(line + "\n")
+                 train_count = train_count + 1
+             elif val_count < val_size:
+                 val_f.write(line + "\n")
+                 val_count = val_count + 1
+             else:
+                 break
+     print("Data processing finished!")
+     train_f.close()
+     val_f.close()
+
+
+ if __name__ == '__main__':
+     main()
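
Each output line is a standalone JSON object, so train.json and val.json are JSON Lines files. A quick sketch of reading them back (paths as in the script above):

import json

# Every line looks like: {"question": "...", "answer": "..."}
with open("./data/train.json", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
print("records:", len(records))
print("first question:", records[0]["question"][:50])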
demo01.py ADDED
@@ -0,0 +1,2 @@
+ import numpy as np
+ print(np.__version__)
qa_dataset.py ADDED
@@ -0,0 +1,60 @@
+ from torch.utils.data import Dataset
+ import torch
+ import json
+ import numpy as np
+
+
+ class QADataset(Dataset):
+     def __init__(self, data_path, tokenizer, max_source_length, max_target_length) -> None:
+         super().__init__()
+         self.tokenizer = tokenizer
+         self.max_source_length = max_source_length
+         self.max_target_length = max_target_length
+         self.max_seq_length = self.max_source_length + self.max_target_length
+
+         self.data = []
+         if data_path:
+             with open(data_path, "r", encoding='utf-8') as f:
+                 for line in f:
+                     # Skip blank lines (each line ends with "\n", so strip first)
+                     if not line.strip():
+                         continue
+                     json_line = json.loads(line)
+                     question = json_line["question"]
+                     answer = json_line["answer"]
+                     self.data.append({
+                         "question": question,
+                         "answer": answer
+                     })
+         print("data loaded, size:", len(self.data))
+
+     def preprocess(self, question, answer):
+         messages = [
+             {"role": "system", "content": "你是一个医疗方面的专家,可以根据患者的问题进行解答。"},
+             {"role": "user", "content": question}
+         ]
+         # Render the chat template as text, ending with the generation prompt
+         prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         # truncation=True is required for max_length to take effect
+         instruction = self.tokenizer(prompt, add_special_tokens=False, truncation=True,
+                                      max_length=self.max_source_length)
+         response = self.tokenizer(answer, add_special_tokens=False, truncation=True,
+                                   max_length=self.max_target_length)
+         input_ids = instruction["input_ids"] + response["input_ids"] + [self.tokenizer.pad_token_id]
+         attention_mask = (instruction["attention_mask"] + response["attention_mask"] + [1])
+         # Mask the prompt tokens with -100 so the loss covers only the answer
+         labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [self.tokenizer.pad_token_id]
+         if len(input_ids) > self.max_seq_length:
+             input_ids = input_ids[:self.max_seq_length]
+             attention_mask = attention_mask[:self.max_seq_length]
+             labels = labels[:self.max_seq_length]
+         return input_ids, attention_mask, labels
+
+     def __getitem__(self, index):
+         item_data = self.data[index]
+
+         input_ids, attention_mask, labels = self.preprocess(**item_data)
+
+         return {
+             "input_ids": torch.LongTensor(np.array(input_ids)),
+             "attention_mask": torch.LongTensor(np.array(attention_mask)),
+             "labels": torch.LongTensor(np.array(labels))
+         }
+
+     def __len__(self):
+         return len(self.data)
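
A minimal usage sketch for QADataset (model path and lengths mirror the commented-out training script in app.py; batch_size=1 matches it, since preprocess truncates but does not pad, so larger batches would need a padding collate_fn):

from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from qa_dataset import QADataset

tokenizer = AutoTokenizer.from_pretrained("model/Qwen2-1.5B-Instruct", trust_remote_code=True)
dataset = QADataset("./data/train.json", tokenizer, max_source_length=128, max_target_length=256)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

batch = next(iter(loader))
print(batch["input_ids"].shape, batch["labels"].shape)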
test.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+ import torch
+
+ model_path = "model/Qwen2-1.5B-Instruct"
+ lora_dir = "output"
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ # Load the base model and attach the trained LoRA adapter
+ model = AutoModelForCausalLM.from_pretrained(model_path)
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = PeftModel.from_pretrained(model, lora_dir)
+ model.to(device)
+ model.eval()  # disable dropout (incl. LoRA dropout) for inference
+
+ prompt = """
+ 5月至今上腹靠右隐痛,右背隐痛带酸,便秘,喜睡,时有腹痛,头痛,腰酸症状?
+ """
+ messages = [
+     {"role": "system", "content": "你是一个医疗方面的专家,可以根据患者的问题进行解答。"},
+     {"role": "user", "content": prompt}
+ ]
+ # Render the chat template with the generation prompt appended
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True
+ )
+ print(text)
+ model_inputs = tokenizer([text], return_tensors="pt").to(device)
+ generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=258)
+ # Strip the prompt tokens, keeping only the newly generated ones
+ generated_ids = [
+     output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+ ]
+ response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+ print(response)
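
If the adapter should later be served without peft installed, the LoRA weights can be folded into the base model. A short sketch using PeftModel's merge_and_unload; the output directory name is illustrative:

# Fold the LoRA weights into the base model for adapter-free serving.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("output_merged")  # hypothetical directory
tokenizer.save_pretrained("output_merged")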