zetavg committed on
Commit
fb9b56d
·
1 Parent(s): dba0ad5

save train info with model

Browse files
llama_lora/lib/finetune.py CHANGED
@@ -2,6 +2,8 @@ import os
2
  import sys
3
  from typing import Any, List
4
 
 
 
5
  import fire
6
  import torch
7
  import transformers
@@ -47,6 +49,10 @@ def train(
47
  # logging
48
  callbacks: List[Any] = []
49
  ):
 
 
 
 
50
  device_map = "auto"
51
  world_size = int(os.environ.get("WORLD_SIZE", 1))
52
  ddp = world_size != 1
@@ -202,6 +208,12 @@ def train(
202
  ),
203
  callbacks=callbacks,
204
  )
 
 
 
 
 
 
205
  model.config.use_cache = False
206
 
207
  old_state_dict = model.state_dict
@@ -214,9 +226,16 @@ def train(
214
  if torch.__version__ >= "2" and sys.platform != "win32":
215
  model = torch.compile(model)
216
 
217
- result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
218
 
219
  model.save_pretrained(output_dir)
220
  print(f"Model saved to {output_dir}.")
221
 
222
- return result
 
 
 
 
 
 
 
 
2
  import sys
3
  from typing import Any, List
4
 
5
+ import json
6
+
7
  import fire
8
  import torch
9
  import transformers
 
49
  # logging
50
  callbacks: List[Any] = []
51
  ):
52
+ if os.path.exists(output_dir):
53
+ if (not os.path.isdir(output_dir)) or os.path.exists(os.path.join(output_dir, 'adapter_config.json')):
54
+ raise ValueError(f"The output directory already exists and is not empty. ({output_dir})")
55
+
56
  device_map = "auto"
57
  world_size = int(os.environ.get("WORLD_SIZE", 1))
58
  ddp = world_size != 1
 
208
  ),
209
  callbacks=callbacks,
210
  )
211
+
212
+ if not os.path.exists(output_dir):
213
+ os.makedirs(output_dir)
214
+ with open(os.path.join(output_dir, "trainer_args.json"), 'w') as trainer_args_json_file:
215
+ json.dump(trainer.args.to_dict(), trainer_args_json_file, indent=2)
216
+
217
  model.config.use_cache = False
218
 
219
  old_state_dict = model.state_dict
 
226
  if torch.__version__ >= "2" and sys.platform != "win32":
227
  model = torch.compile(model)
228
 
229
+ train_output = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
230
 
231
  model.save_pretrained(output_dir)
232
  print(f"Model saved to {output_dir}.")
233
 
234
+ with open(os.path.join(output_dir, "trainer_log_history.jsonl"), 'w') as trainer_log_history_jsonl_file:
235
+ trainer_log_history = "\n".join([json.dumps(line) for line in trainer.state.log_history])
236
+ trainer_log_history_jsonl_file.write(trainer_log_history)
237
+
238
+ with open(os.path.join(output_dir, "train_output.json"), 'w') as train_output_json_file:
239
+ json.dump(train_output, train_output_json_file, indent=2)
240
+
241
+ return train_output
llama_lora/ui/finetune_ui.py CHANGED
@@ -419,11 +419,27 @@ Train data (first 10):
419
  # Do not let other tqdm iterations interfere the progress reporting after training starts.
420
  # progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
421
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  results = Global.train_fn(
423
  base_model, # base_model
424
  tokenizer, # tokenizer
425
- os.path.join(Global.data_dir, "lora_models",
426
- model_name), # output_dir
427
  train_data,
428
  # 128, # batch_size (is not used, use gradient_accumulation_steps instead)
429
  micro_batch_size, # micro_batch_size
@@ -451,7 +467,7 @@ Train data (first 10):
451
  return result_message
452
 
453
  except Exception as e:
454
- raise gr.Error(e)
455
 
456
 
457
  def do_abort_training():
 
419
  # Do not let other tqdm iterations interfere the progress reporting after training starts.
420
  # progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
421
 
422
+ output_dir = os.path.join(Global.data_dir, "lora_models", model_name)
423
+ if not os.path.exists(output_dir):
424
+ os.makedirs(output_dir)
425
+
426
+ with open(os.path.join(output_dir, "info.json"), 'w') as info_json_file:
427
+ dataset_name = "N/A (from text input)"
428
+ if load_dataset_from == "Data Dir":
429
+ dataset_name = dataset_from_data_dir
430
+
431
+ info = {
432
+ 'base_model': Global.base_model,
433
+ 'prompt_template': template,
434
+ 'dataset_name': dataset_name,
435
+ 'dataset_rows': len(train_data),
436
+ }
437
+ json.dump(info, info_json_file, indent=2)
438
+
439
  results = Global.train_fn(
440
  base_model, # base_model
441
  tokenizer, # tokenizer
442
+ output_dir, # output_dir
 
443
  train_data,
444
  # 128, # batch_size (is not used, use gradient_accumulation_steps instead)
445
  micro_batch_size, # micro_batch_size
 
467
  return result_message
468
 
469
  except Exception as e:
470
+ raise gr.Error(f"{e} (To dismiss this error, click the 'Abort' button)")
471
 
472
 
473
  def do_abort_training():