kmichiru committed on
Commit 14c9e49
1 Parent(s): 6af5a36

Upload 5 files

Files changed (5)
  1. README.md +7 -3
  2. adapter_config.json +26 -0
  3. adapter_model.bin +3 -0
  4. inference.py +163 -0
  5. isft_mistral.py +187 -0
README.md CHANGED
@@ -1,3 +1,7 @@
- ---
- license: apache-2.0
- ---
+ ## Training procedure
+
+ ### Framework versions
+
+ - PEFT 0.4.0
+
+ - PEFT 0.4.0
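For reference, the README above only records the PEFT version used to export the adapter. A minimal version check before trying to load it, assuming peft and transformers are installed locally:

import peft, transformers

# The adapter was exported with PEFT 0.4.0; newer releases can usually still load it,
# but printing the installed versions first makes mismatches easy to spot.
print("peft:", peft.__version__)
print("transformers:", transformers.__version__)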
adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "auto_mapping": null,
+   "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.1",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "lora_alpha": 64,
+   "lora_dropout": 0.05,
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 64,
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "k_proj",
+     "v_proj",
+     "o_proj",
+     "gate_proj",
+     "up_proj",
+     "down_proj"
+   ],
+   "task_type": "CAUSAL_LM"
+ }
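The config above describes a rank-64 LoRA adapter (alpha 64, dropout 0.05) applied to every attention and MLP projection of Mistral-7B-Instruct-v0.1. A minimal sketch of working with it in peft, assuming the adapter is published under the repo id used in inference.py below:

from peft import LoraConfig, PeftConfig

# Rebuild the same configuration by hand (values copied from adapter_config.json above)
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Or fetch the committed config straight from the Hub (repo id assumed from inference.py)
hub_config = PeftConfig.from_pretrained("kmichiru/Nikaido-7B-mistral-instruct-v0.3-vn_v2")
print(hub_config.base_model_name_or_path)  # mistralai/Mistral-7B-Instruct-v0.1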
adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04ea77ff2bb1943c52e1554007240514abd8bf16a9bb1b47e925a27a71bf555a
+ size 671250189
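adapter_model.bin is stored through Git LFS, so the blob above is only a pointer; the actual LoRA weights are about 671 MB. A sketch of fetching just that file with huggingface_hub, assuming the adapter lives under the repo id used in inference.py:

from huggingface_hub import hf_hub_download

# Download only the adapter weights (~671 MB) instead of cloning the whole repo.
adapter_path = hf_hub_download(
    repo_id="kmichiru/Nikaido-7B-mistral-instruct-v0.3-vn_v2",  # assumed repo id
    filename="adapter_model.bin",
)
print(adapter_path)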
inference.py ADDED
@@ -0,0 +1,163 @@
+ import torch
+ torch.cuda.empty_cache()
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ base_model = "mistralai/Mistral-7B-Instruct-v0.1"
+ # new_model = "kmichiru/Nikaido-7B-mistral-instruct-v0.1"
+ new_model = "kmichiru/Nikaido-7B-mistral-instruct-v0.3-vn_v2"
+
+ # Reload tokenizer
+ tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+ tokenizer.pad_token = tokenizer.eos_token
+ print(tokenizer.pad_token, tokenizer.pad_token_id)
+ tokenizer.padding_side = "right"
+
+ # Reload the base model and attach the LoRA adapter
+ base_model_reload = AutoModelForCausalLM.from_pretrained(
+     base_model, low_cpu_mem_usage=True,
+     return_dict=True, torch_dtype=torch.bfloat16,
+     device_map={"": 0})
+ model = PeftModel.from_pretrained(base_model_reload, new_model)
+ # model = model.merge_and_unload()
+
+ model.config.use_cache = True
+ model.eval()
+
+ def dialogue(role, content):
+     return {
+         "role": role,
+         "content": content
+     }
+
+ import json, random
+ TRAIN_DSET = "iroseka_dataset.jsonl"
+ try:
+     with open(TRAIN_DSET, "r", encoding="utf-8") as f:
+         examples = [json.loads(line) for line in f]
+ except FileNotFoundError:
+     print("Few-shot data not found, skipping...")
+     examples = []
+
+ def format_chat_history(example, few_shot=0):
+     user_msgs = []
+     # for inference each round, we only need the user messages
+     for msg in example["messages"]:
+         # if msg["role"] == "user":
+         user_msgs.append(msg["content"])
+     messages = [
+         dialogue("user", "\n".join(user_msgs)),  # join user messages together
+         # example["messages"][-1],  # the last message is the bot's response
+     ]
+
+     if few_shot > 0:
+         # randomly sample a few examples from the training data as few-shot context
+         few_shot_data = random.sample(examples, few_shot)
+         for few_shot_example in few_shot_data:
+             few_shot_msgs = []
+             for msg in few_shot_example["messages"]:
+                 if msg["role"] == "user":
+                     few_shot_msgs.append(msg["content"])
+             messages = [
+                 dialogue("user", "\n".join(few_shot_msgs)),
+                 few_shot_example["messages"][-1]
+             ] + messages
+
+     encodeds = tokenizer.apply_chat_template(messages, tokenize=False)
+     return encodeds
+
+ def format_chat_history_v2(example, few_shot):
+     # TODO: implement few-shot learning
+     user_msg = []
+     user_msg.append("<s>")
+     for msg in example["messages"]:
+         # [INST] What is your favourite condiment? [/INST]
+         user_msg.append(f"[INST] {msg['content']} [/INST]")
+     # user_msg.append("</s>")
+     if "next_speaker" in example:
+         # leave the prompt open so the model continues as the requested speaker
+         user_msg.append(f"[INST] {example['next_speaker']}: ")
+     return " ".join(user_msg)
+
+ from transformers import StoppingCriteria, StoppingCriteriaList
+ class StoppingCriteriaSub(StoppingCriteria):
+     def __init__(self, stops=[], encounters=1):
+         super().__init__()
+         self.stops = [stop.to("cuda") for stop in stops]
+
+     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+         # stop when any sequence in the batch ends with one of the stop token sequences
+         for seq in input_ids:
+             for stop in self.stops:
+                 if len(seq) >= len(stop) and torch.all((stop == seq[-len(stop):])).item():
+                     return True
+         return False
+
+ stop_words = ["[/INST]"]
+ stop_words_ids = [tokenizer(stop_word, return_tensors='pt', add_special_tokens=False)['input_ids'].squeeze() for stop_word in stop_words]
+ stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
+
+ def inference(chat_history):
+     # chat_history: dict, with "messages" key storing dialogue history, in OpenAI format
+     formatted = format_chat_history_v2(chat_history, few_shot=1)
+     print(formatted)
+     model_inputs = tokenizer(
+         [formatted],
+         return_tensors="pt",
+     )
+     print(model_inputs)
+     model_inputs = model_inputs.to(model.device)
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids=model_inputs.input_ids,
+             attention_mask=model_inputs.attention_mask,
+             # max_length=1024,
+             do_sample=True,
+             top_p=1,
+             # contrastive search
+             # top_k=50,
+             # penalty_alpha=0.6,
+             # num_return_sequences=1,
+             temperature=0.3,
+             # num_return_sequences=3,
+             use_cache=True,
+             # pad_token_id=tokenizer.eos_token_id,  # eos_token_id is not available for some models
+             pad_token_id=tokenizer.pad_token_id,  # pad_token was set to eos_token above
+             eos_token_id=tokenizer.eos_token_id,
+             bos_token_id=tokenizer.bos_token_id,
+             output_scores=True,
+             output_attentions=False,
+             output_hidden_states=False,
+             max_new_tokens=256,
+             # num_beams=9,
+             # num_beam_groups=3,
+             # repetition_penalty=1.0,
+             # diversity_penalty=0.5,
+             # num_beams=5,
+             # stopping_criteria=stopping_criteria,
+         )
+     # print(outputs)
+     text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+     def postprocess(t):
+         # keep only the final turn: split on [/INST], drop [INST] markers and empty chunks
+         t = t.split("[/INST]")
+         t = [x.replace("[INST]", "").strip() for x in t]
+         t = [x for x in t if x != ""]
+         return t[-1]
+     # text = [postprocess(t) for t in text]
+
+     return text
+
+ if __name__ == "__main__":
+     # sample Japanese dialogue history (narration and character lines) matching the training format
+     chat_history = {
+         "messages": [
+             # dialogue("system", ""),
+             dialogue("user", "傍白: 真紅の言葉が胸の中に滑り込んでくる。"),
+             dialogue("user", "悠馬: っ"),
+             dialogue("user", "傍白: 限界だった。"),
+             dialogue("user", "悠馬: 真紅,大好きです。これからもずっと一緒にいてください。"),
+         ],
+         "next_speaker": "真紅"
+     }
+     print(inference(chat_history))
+
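inference.py keeps the LoRA adapter separate from the base model (merge_and_unload is commented out). If a standalone checkpoint is preferred for deployment, a minimal sketch under the same model ids, assuming enough memory to hold the merged bf16 weights (roughly 14 GB):

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_model = "mistralai/Mistral-7B-Instruct-v0.1"
adapter = "kmichiru/Nikaido-7B-mistral-instruct-v0.3-vn_v2"

base = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.bfloat16)
merged = PeftModel.from_pretrained(base, adapter).merge_and_unload()  # fold LoRA deltas into the base weights

merged.save_pretrained("nikaido-7b-merged")  # standalone checkpoint, no peft needed at load time
AutoTokenizer.from_pretrained(base_model).save_pretrained("nikaido-7b-merged")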
isft_mistral.py ADDED
@@ -0,0 +1,187 @@
+ from datasets import load_dataset
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ import os
+
+ base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"
+
+ WORK = "vn_v2"
+ new_model_id = f"kmichiru/Nikaido-7B-mistral-instruct-v0.3-{WORK}"
+
+ # DSET = {
+ #     "train": f"dataset_iroseka/{WORK}_dataset.jsonl",
+ #     "eval": f"dataset_iroseka/{WORK}_validations.jsonl"
+ # }
+
+ DSET = {
+     "train": f"dataset_iroseka/{WORK}_train.jsonl",
+     "eval": f"dataset_iroseka/{WORK}_val.jsonl"
+ }
+
+ dataset = load_dataset("json", data_files=DSET)
+ # model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16)
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+ # max_length = 1024
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.padding_side = "right"
+
+ def dialogue(role, content):
+     return {
+         "role": role,
+         "content": content
+     }
+
+ def format_chat_history(example):
+     user_msgs = []
+     for msg in example["messages"]:
+         if msg["role"] == "user":
+             user_msgs.append(msg["content"])
+     messages = [
+         dialogue("user", "\n".join(user_msgs)),  # join user messages together
+         example["messages"][-1],  # the last message is the bot's response
+     ]
+     encodeds = tokenizer.apply_chat_template(messages, tokenize=False)
+     return encodeds
+
+ def prep_speaker(msg: str):
+     # expects "speaker: content"; unnamed lines are treated as narration (傍白)
+     msg = msg.replace("\u3000", " ")  # replace full-width spaces
+     speaker, content = msg.split(":", 1)
+     speaker = speaker.strip()
+     content = content.strip()
+     if len(speaker) == 0:
+         speaker = "傍白"
+
+     return f"{speaker}: {content}"
+
+ def format_chat_history_v2(example):
+     user_msg = []
+     user_msg.append("<s>")
+     for msg in example["messages"]:
+         # [INST] What is your favourite condiment? [/INST]
+         if msg["role"] != "system":
+             user_msg.append(f"[INST] {prep_speaker(msg['content'])} [/INST]")
+     # user_msg.append("</s>")
+     return " ".join(user_msg)
+
+ # def format_chat_history_v2(example):
+ #     user_msgs = []
+ #     for msg in example["messages"]:
+ #         if msg["role"] == "user":
+ #             user_msgs.append(msg["content"])
+ #     messages = [
+ #         dialogue("user", "\n".join(user_msgs)),  # join user messages together
+ #         example["messages"][-1],  # the last message is the bot's response
+ #     ]
+ #     encodeds = tokenizer.apply_chat_template(messages, tokenize=False)
+ #     return encodeds
+
+ print(format_chat_history_v2(dataset['train'][0]))
+
+ def generate_and_tokenize_prompt(prompt, max_length=2048):
+     result = tokenizer(
+         format_chat_history_v2(prompt),
+         truncation=True,
+         max_length=max_length,
+         padding="max_length",
+     )
+     result["labels"] = result["input_ids"]
+     return result
+
+ tokenized_dataset = dataset.map(generate_and_tokenize_prompt)
+ print(tokenized_dataset['train'][0])
+
+ # # stats on data length
+ # def plot_data_lengths(tokenized_dataset):
+ #     lengths = []
+ #     for split in tokenized_dataset:
+ #         lengths += [len(x['input_ids']) for x in tokenized_dataset[split]]
+ #     print(f"Max length: {max(lengths)}")
+ #     print(f"Min length: {min(lengths)}")
+ #     print(f"Mean length: {sum(lengths)/len(lengths)}")
+ #     print(f"Median length: {sorted(lengths)[len(lengths)//2]}")
+
+ # plot_data_lengths(tokenized_dataset)
+ print(tokenized_dataset['train'][0])
+
+ # Adding the LoRA adapters to the model layers
+ from peft import LoraConfig, get_peft_model
+ def print_trainable_parameters(model):
+     """
+     Prints the number of trainable parameters in the model.
+     """
+     trainable_params = 0
+     all_param = 0
+     for _, param in model.named_parameters():
+         all_param += param.numel()
+         if param.requires_grad:
+             trainable_params += param.numel()
+     print(
+         f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {100 * trainable_params / all_param}"
+     )
+ model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.bfloat16)
+ # model = prepare_model_for_kbit_training(model)
+ peft_config = LoraConfig(
+     r=64,
+     lora_alpha=64,
+     lora_dropout=0.05,
+     bias="none",
+     task_type="CAUSAL_LM",
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+ )
+ model = get_peft_model(model, peft_config)
+ print_trainable_parameters(model)
+ print(model)
+
+ import wandb, os
+ # wandb.login()
+
+ wandb_project = "NikaidoLM"
+ if len(wandb_project) > 0:
+     os.environ["WANDB_PROJECT"] = wandb_project
+
+ import transformers
+ from datetime import datetime
+
+ project = wandb_project
+ base_model_name = "mistral"
+ run_name = base_model_name + "-" + project
+ output_name = f"{run_name}-{WORK}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
+ output_dir = "/scratch/generalvision/mowentao/mistral-out/" + output_name
+
+ trainer = transformers.Trainer(
+     model=model,
+     train_dataset=tokenized_dataset["train"],
+     eval_dataset=tokenized_dataset["eval"],
+     args=transformers.TrainingArguments(
+         output_dir=output_dir,
+         warmup_steps=500,
+         per_device_train_batch_size=1,
+         gradient_accumulation_steps=2,
+         num_train_epochs=3,
+         weight_decay=5e-4,
+         # max_steps=10_000,
+         learning_rate=1e-4,  # want a small lr for finetuning
+         bf16=True,
+         optim="paged_adamw_32bit",
+         logging_steps=100,  # Log training loss every 100 steps
+         logging_dir=output_dir,  # Directory for storing logs
+         save_strategy="steps",  # Save checkpoints by step count
+         save_steps=500,  # Save a checkpoint every 500 steps
+         evaluation_strategy="steps",  # Evaluate by step count
+         eval_steps=100,  # Evaluate every 100 steps
+         do_eval=True,  # Perform evaluation at the end of training
+         report_to="wandb",  # Comment this out if you don't want to use Weights & Biases
+         run_name=output_name,  # Name of the W&B run (optional)
+         lr_scheduler_type="cosine",
+     ),
+     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+ )
+
+ model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
+ trainer.train()
+ trainer.model.save_pretrained(new_model_id)
+ wandb.finish()
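The training script only saves the adapter locally: trainer.model.save_pretrained(new_model_id) writes to a folder named after the repo id. A hypothetical follow-up for publishing that folder to the Hub so inference.py can load the adapter by repo id; the repo name and login step are assumptions, not part of the committed script:

from huggingface_hub import HfApi

repo_id = "kmichiru/Nikaido-7B-mistral-instruct-v0.3-vn_v2"  # matches new_model_id above
api = HfApi()
api.create_repo(repo_id, exist_ok=True)  # assumes `huggingface-cli login` was run beforehand
api.upload_folder(folder_path=repo_id, repo_id=repo_id)  # upload the folder written by save_pretrained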