84basi committed
Commit 73103bf • 1 Parent(s): ebb0a4f

add readme

Files changed (1)
  1. README.md +133 -0
README.md CHANGED
@@ -20,3 +20,136 @@ language:
This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
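
The steps below reproduce how this adapter was trained and how its evaluation outputs were generated. First, set up the Colab environment: reinstall Unsloth from GitHub, upgrade torch and xformers, and add FlashAttention only when the GPU reports compute capability 8.0 or higher (Ampere or newer):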
```python
!pip uninstall unsloth -y
!pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade torch
!pip install --upgrade xformers
!pip install ipywidgets --upgrade

import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
```
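
Next, load the base model llm-jp/llm-jp-3-13b through Unsloth with 4-bit quantization; `dtype = None` lets Unsloth choose the compute dtype for the detected GPU, and `max_seq_length` caps training sequences at 512 tokens: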
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth import FastLanguageModel
import torch

max_seq_length = 512
dtype = None
load_in_4bit = True

model_id = "llm-jp/llm-jp-3-13b"
new_model_id = "llm-jp-3-13b-finetune-2"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    trust_remote_code=True,
)
```
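
LoRA adapters are attached to all attention and MLP projection layers with rank 32, alpha 32, and dropout 0.05; Unsloth's gradient-checkpointing variant is enabled to reduce memory use: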
```python
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
    max_seq_length = max_seq_length,
)
```
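
Training data is the ichikara-instruction corpus (a Japanese instruction-tuning dataset), loaded from a local JSON file; `HF_TOKEN` is a Colab form field for a Hugging Face access token: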
```python
HF_TOKEN = "" #@param {type:"string"}

from datasets import load_dataset
dataset = load_dataset("json", data_files="/content/ichikara-instruction-003-001-1.json")
```
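
Each record is rendered into a two-section prompt, `### 指示` (instruction) followed by `### 回答` (answer), with the tokenizer's EOS token appended so the model learns where to stop; the result is stored in a `formatted_text` column: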
```python
prompt = """### 指示
{}
### 回答
{}"""

"""
formatting_prompts_func: reshape each record to match the prompt format
(### 指示 = instruction, ### 回答 = answer)
"""
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    input = examples["text"]
    output = examples["output"]
    text = prompt.format(input, output) + EOS_TOKEN
    return { "formatted_text" : text, }
pass

dataset = dataset.map(
    formatting_prompts_func,
    num_proc=4,
)
```
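
For illustration only, a single formatted training example would look roughly like the following (hypothetical instruction/answer pair; the exact end-of-sequence string depends on the llm-jp-3 tokenizer):

```
### 指示
日本で一番高い山は何ですか？
### 回答
富士山です。</s>
```

Training uses TRL's `SFTTrainer` on the `formatted_text` field for one epoch, with an effective batch size of 8 (2 per device × 4 gradient-accumulation steps), learning rate 2e-4, and bf16 when the GPU supports it: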
```python
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset["train"],
    max_seq_length = max_seq_length,
    dataset_text_field = "formatted_text",
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        num_train_epochs = 1,
        logging_steps = 10,
        warmup_steps = 10,
        save_steps = 100,
        save_total_limit = 2,
        max_steps = -1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        group_by_length = True,
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

trainer_stats = trainer.train()
```
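
After training, the evaluation prompts are read from elyza-tasks-100-TV_0.jsonl. Since a record may span multiple lines, lines are accumulated until a closing brace completes a JSON object: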
```python
import json

datasets = []
with open("/content/elyza-tasks-100-TV_0.jsonl", "r") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        # a record may span several lines; parse once the object is closed
        if item.endswith("}"):
            datasets.append(json.loads(item))
            item = ""
```
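
Inference is greedy (`do_sample=False`) with a repetition penalty of 1.2. Each task input is wrapped in the same `### 指示` / `### 回答` prompt used for training, and only the text after the final `### 回答` marker is kept as the prediction: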
```python
from tqdm import tqdm

FastLanguageModel.for_inference(model)

results = []
for dt in tqdm(datasets):
    input = dt["input"]

    # same 指示/回答 format as used during training
    prompt = f"""### 指示\n{input}\n### 回答\n"""

    inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)

    outputs = model.generate(**inputs, max_new_tokens = 512, use_cache = True, do_sample=False, repetition_penalty=1.2)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split('\n### 回答')[-1]

    results.append({"task_id": dt["task_id"], "input": input, "output": prediction})
```
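
Finally, the predictions are written out as one JSON object per line: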
```python
with open(f"{new_model_id}_output.jsonl", 'w', encoding='utf-8') as f:
    for result in results:
        json.dump(result, f, ensure_ascii=False)
        f.write('\n')
```
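
The snippet above stops at writing the evaluation outputs and does not show how the LoRA adapter itself was uploaded to this repository. A minimal sketch of that step, assuming the standard PEFT/Transformers `save_pretrained` and `push_to_hub` methods and reusing the `HF_TOKEN` and `new_model_id` values defined earlier, might look like:

```python
# Hypothetical upload step (not part of the original snippet):
# save the LoRA adapter and tokenizer locally, then push them to the Hub.
model.save_pretrained(new_model_id)
tokenizer.save_pretrained(new_model_id)
model.push_to_hub(new_model_id, token=HF_TOKEN)
tokenizer.push_to_hub(new_model_id, token=HF_TOKEN)
```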