jbilcke-hf HF Staff committed on
Commit
21e03a6
·
1 Parent(s): 1f07207

add more logs

Browse files
Files changed (1) hide show
  1. vms/ui/project/services/training.py +14 -0
vms/ui/project/services/training.py CHANGED
@@ -1495,10 +1495,16 @@ class TrainingService:
1495
  # Check in lora_weights directory
1496
  lora_weights_dir = self.app.output_path / "lora_weights"
1497
  if lora_weights_dir.exists():
 
 
 
 
1498
  lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
1499
  if lora_safetensors.exists():
1500
  logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
1501
  return str(lora_safetensors)
 
 
1502
 
1503
  # If not found in root or lora_weights, log the issue
1504
  logger.warning(f"Model weights not found at expected location: {model_output_safetensors_path}")
@@ -1509,10 +1515,18 @@ class TrainingService:
1509
  if checkpoints:
1510
  logger.info(f"Found {len(checkpoints)} checkpoint directories, but main weights file is missing")
1511
  latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
 
 
 
 
 
 
1512
  checkpoint_weights = latest_checkpoint / "pytorch_lora_weights.safetensors"
1513
  if checkpoint_weights.exists():
1514
  logger.info(f"Found weights in latest checkpoint: {checkpoint_weights}")
1515
  return str(checkpoint_weights)
 
 
1516
 
1517
  return None
1518
 
 
1495
  # Check in lora_weights directory
1496
  lora_weights_dir = self.app.output_path / "lora_weights"
1497
  if lora_weights_dir.exists():
1498
+ logger.info(f"Found lora_weights directory: {lora_weights_dir}")
1499
+ lora_weights_contents = list(lora_weights_dir.glob("*"))
1500
+ logger.info(f"Contents of lora_weights directory: {lora_weights_contents}")
1501
+
1502
  lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
1503
  if lora_safetensors.exists():
1504
  logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
1505
  return str(lora_safetensors)
1506
+ else:
1507
+ logger.info(f"pytorch_lora_weights.safetensors not found in lora_weights directory")
1508
 
1509
  # If not found in root or lora_weights, log the issue
1510
  logger.warning(f"Model weights not found at expected location: {model_output_safetensors_path}")
 
1515
  if checkpoints:
1516
  logger.info(f"Found {len(checkpoints)} checkpoint directories, but main weights file is missing")
1517
  latest_checkpoint = max(checkpoints, key=lambda x: int(x.name.split("_")[-1]))
1518
+ logger.info(f"Latest checkpoint directory: {latest_checkpoint}")
1519
+
1520
+ # Log contents of latest checkpoint
1521
+ checkpoint_contents = list(latest_checkpoint.glob("*"))
1522
+ logger.info(f"Contents of latest checkpoint {latest_checkpoint.name}: {checkpoint_contents}")
1523
+
1524
  checkpoint_weights = latest_checkpoint / "pytorch_lora_weights.safetensors"
1525
  if checkpoint_weights.exists():
1526
  logger.info(f"Found weights in latest checkpoint: {checkpoint_weights}")
1527
  return str(checkpoint_weights)
1528
+ else:
1529
+ logger.info(f"pytorch_lora_weights.safetensors not found in checkpoint directory")
1530
 
1531
  return None
1532