Spaces:
Runtime error
Runtime error
zetavg
committed on
Commit
·
fb9b56d
1
Parent(s):
dba0ad5
save train info with model
Browse files- llama_lora/lib/finetune.py +21 -2
- llama_lora/ui/finetune_ui.py +19 -3
llama_lora/lib/finetune.py
CHANGED
@@ -2,6 +2,8 @@ import os
|
|
2 |
import sys
|
3 |
from typing import Any, List
|
4 |
|
|
|
|
|
5 |
import fire
|
6 |
import torch
|
7 |
import transformers
|
@@ -47,6 +49,10 @@ def train(
|
|
47 |
# logging
|
48 |
callbacks: List[Any] = []
|
49 |
):
|
|
|
|
|
|
|
|
|
50 |
device_map = "auto"
|
51 |
world_size = int(os.environ.get("WORLD_SIZE", 1))
|
52 |
ddp = world_size != 1
|
@@ -202,6 +208,12 @@ def train(
|
|
202 |
),
|
203 |
callbacks=callbacks,
|
204 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
model.config.use_cache = False
|
206 |
|
207 |
old_state_dict = model.state_dict
|
@@ -214,9 +226,16 @@ def train(
|
|
214 |
if torch.__version__ >= "2" and sys.platform != "win32":
|
215 |
model = torch.compile(model)
|
216 |
|
217 |
-
|
218 |
|
219 |
model.save_pretrained(output_dir)
|
220 |
print(f"Model saved to {output_dir}.")
|
221 |
|
222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import sys
|
3 |
from typing import Any, List
|
4 |
|
5 |
+
import json
|
6 |
+
|
7 |
import fire
|
8 |
import torch
|
9 |
import transformers
|
|
|
49 |
# logging
|
50 |
callbacks: List[Any] = []
|
51 |
):
|
52 |
+
if os.path.exists(output_dir):
|
53 |
+
if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
|
54 |
+
raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
|
55 |
+
|
56 |
device_map = "auto"
|
57 |
world_size = int(os.environ.get("WORLD_SIZE", 1))
|
58 |
ddp = world_size != 1
|
|
|
208 |
),
|
209 |
callbacks=callbacks,
|
210 |
)
|
211 |
+
|
212 |
+
if not os.path.exists(output_dir):
|
213 |
+
os.makedirs(output_dir)
|
214 |
+
with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
|
215 |
+
json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
|
216 |
+
|
217 |
model.config.use_cache = False
|
218 |
|
219 |
old_state_dict = model.state_dict
|
|
|
226 |
if torch.__version__ >= "2" and sys.platform != "win32":
|
227 |
model = torch.compile(model)
|
228 |
|
229 |
+
train_output = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
230 |
|
231 |
model.save_pretrained(output_dir)
|
232 |
print(f"Model saved to {output_dir}.")
|
233 |
|
234 |
+
with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
|
235 |
+
trainer_log_history = "\n".join([json.dumps(line) for line in trainer.state.log_history])
|
236 |
+
trainer_log_history_jsonl_file.write(trainer_log_history)
|
237 |
+
|
238 |
+
with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
|
239 |
+
json.dump(train_output, train_output_json_file, indent=2)
|
240 |
+
|
241 |
+
return train_output
|
llama_lora/ui/finetune_ui.py
CHANGED
@@ -419,11 +419,27 @@ Train data (first 10):
|
|
419 |
# Do not let other tqdm iterations interfere with the progress reporting after training starts.
|
420 |
# progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
|
421 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
results = Global.train_fn(
|
423 |
base_model, # base_model
|
424 |
tokenizer, # tokenizer
|
425 |
-
|
426 |
-
model_name), # output_dir
|
427 |
train_data,
|
428 |
# 128, # batch_size (is not used, use gradient_accumulation_steps instead)
|
429 |
micro_batch_size, # micro_batch_size
|
@@ -451,7 +467,7 @@ Train data (first 10):
|
|
451 |
return result_message
|
452 |
|
453 |
except Exception as e:
|
454 |
-
raise gr.Error(e)
|
455 |
|
456 |
|
457 |
def do_abort_training():
|
|
|
419 |
# Do not let other tqdm iterations interfere with the progress reporting after training starts.
|
420 |
# progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
|
421 |
|
422 |
+
output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
|
423 |
+
if not os.path.exists(output_dir):
|
424 |
+
os.makedirs(output_dir)
|
425 |
+
|
426 |
+
with open(os.path.join(output_dir, "info.json"), 'w') as info_json_file:
|
427 |
+
dataset_name = "N/A (from text input)"
|
428 |
+
if load_dataset_from == "Data Dir":
|
429 |
+
dataset_name = dataset_from_data_dir
|
430 |
+
|
431 |
+
info = {
|
432 |
+
'base_model': Global.base_model,
|
433 |
+
'prompt_template': template,
|
434 |
+
'dataset_name': dataset_name,
|
435 |
+
'dataset_rows': len(train_data),
|
436 |
+
}
|
437 |
+
json.dump(info, info_json_file, indent=2)
|
438 |
+
|
439 |
results = Global.train_fn(
|
440 |
base_model, # base_model
|
441 |
tokenizer, # tokenizer
|
442 |
+
output_dir, # output_dir
|
|
|
443 |
train_data,
|
444 |
# 128, # batch_size (is not used, use gradient_accumulation_steps instead)
|
445 |
micro_batch_size, # micro_batch_size
|
|
|
467 |
return result_message
|
468 |
|
469 |
except Exception as e:
|
470 |
+
raise gr.Error(f"{e} (To dismiss this error, click the 'Abort' button)")
|
471 |
|
472 |
|
473 |
def do_abort_training():
|