Tonic committed on
Commit
f559a91
·
verified ·
1 Parent(s): 11dffe6

Attempts to resolve the training-argument issue

Browse files
Files changed (2) hide show
  1. model.py +1 -5
  2. trainer.py +16 -13
model.py CHANGED
@@ -149,22 +149,18 @@ class SmolLM3Model:
149
  "fp16": self.config.fp16,
150
  "bf16": self.config.bf16,
151
  "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
152
- "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
153
  "report_to": None,
154
- "remove_unused_columns": False,
155
  "dataloader_pin_memory": getattr(self.config, 'dataloader_pin_memory', True),
156
  # Removed group_by_length as it's causing issues with newer transformers versions
157
  # Removed length_column_name as it might conflict with data collator
158
  "seed": 42,
159
- "data_seed": 42,
160
  "dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
161
  "max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
162
  "optim": self.config.optimizer,
163
  "lr_scheduler_type": self.config.scheduler,
164
- "warmup_ratio": 0.1,
165
  "save_strategy": "steps",
166
  "logging_strategy": "steps",
167
- "prediction_loss_only": True,
168
  }
169
 
170
  # Override with kwargs
 
149
  "fp16": self.config.fp16,
150
  "bf16": self.config.bf16,
151
  "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
 
152
  "report_to": None,
 
153
  "dataloader_pin_memory": getattr(self.config, 'dataloader_pin_memory', True),
154
  # Removed group_by_length as it's causing issues with newer transformers versions
155
  # Removed length_column_name as it might conflict with data collator
156
  "seed": 42,
 
157
  "dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
158
  "max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
159
  "optim": self.config.optimizer,
160
  "lr_scheduler_type": self.config.scheduler,
 
161
  "save_strategy": "steps",
162
  "logging_strategy": "steps",
163
+ # Removed prediction_loss_only as it might cause issues
164
  }
165
 
166
  # Override with kwargs
trainer.py CHANGED
@@ -104,22 +104,25 @@ class SmolLM3Trainer:
104
  # Add monitoring callbacks
105
  callbacks = []
106
 
 
107
  # Add simple console callback
108
- callbacks.append(SimpleConsoleCallback())
109
- logger.info("Added simple console monitoring callback")
110
 
111
  # Try to add Trackio callback if available
112
- if self.monitor and self.monitor.enable_tracking:
113
- try:
114
- trackio_callback = self.monitor.create_monitoring_callback()
115
- if trackio_callback:
116
- callbacks.append(trackio_callback)
117
- logger.info("Added Trackio monitoring callback")
118
- else:
119
- logger.warning("Failed to create Trackio callback")
120
- except Exception as e:
121
- logger.error(f"Error creating Trackio callback: {e}")
122
- logger.info("Continuing with console monitoring only")
 
 
123
 
124
  # Try standard Trainer first (more stable with callbacks)
125
  logger.info("Creating Trainer with training arguments...")
 
104
  # Add monitoring callbacks
105
  callbacks = []
106
 
107
+ # Temporarily disable callbacks to debug the bool object is not callable error
108
  # Add simple console callback
109
+ # callbacks.append(SimpleConsoleCallback())
110
+ # logger.info("Added simple console monitoring callback")
111
 
112
  # Try to add Trackio callback if available
113
+ # if self.monitor and self.monitor.enable_tracking:
114
+ # try:
115
+ # trackio_callback = self.monitor.create_monitoring_callback()
116
+ # if trackio_callback:
117
+ # callbacks.append(trackio_callback)
118
+ # logger.info("Added Trackio monitoring callback")
119
+ # else:
120
+ # logger.warning("Failed to create Trackio callback")
121
+ # except Exception as e:
122
+ # logger.error(f"Error creating Trackio callback: {e}")
123
+ # logger.info("Continuing with console monitoring only")
124
+
125
+ logger.info("Callbacks disabled for debugging")
126
 
127
  # Try standard Trainer first (more stable with callbacks)
128
  logger.info("Creating Trainer with training arguments...")