attempts to resolve training argument issue
- model.py +1 -5
- trainer.py +16 -13
model.py
CHANGED
@@ -149,22 +149,18 @@ class SmolLM3Model:
             "fp16": self.config.fp16,
             "bf16": self.config.bf16,
             "ddp_backend": self.config.ddp_backend if torch.cuda.device_count() > 1 else None,
-            "ddp_find_unused_parameters": self.config.ddp_find_unused_parameters if torch.cuda.device_count() > 1 else False,
             "report_to": None,
-            "remove_unused_columns": False,
             "dataloader_pin_memory": getattr(self.config, 'dataloader_pin_memory', True),
             # Removed group_by_length as it's causing issues with newer transformers versions
             # Removed length_column_name as it might conflict with data collator
             "seed": 42,
-            "data_seed": 42,
             "dataloader_num_workers": getattr(self.config, 'dataloader_num_workers', 4),
             "max_grad_norm": getattr(self.config, 'max_grad_norm', 1.0),
             "optim": self.config.optimizer,
             "lr_scheduler_type": self.config.scheduler,
-            "warmup_ratio": 0.1,
             "save_strategy": "steps",
             "logging_strategy": "steps",
-
+            # Removed prediction_loss_only as it might cause issues
         }
 
         # Override with kwargs
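Note: the change above works around version-specific TrainingArguments keys (ddp_find_unused_parameters, remove_unused_columns, data_seed, warmup_ratio) by deleting them from the argument dict. A more general way to avoid this class of failure, sketched below under the assumption that the arguments are first collected in a plain dict (here called args_dict), is to filter that dict against the fields the installed transformers version actually exposes; TrainingArguments is a dataclass, so its accepted keys can be listed directly. The helper name filter_training_args is hypothetical, not part of this repository.

import dataclasses
from transformers import TrainingArguments

def filter_training_args(args_dict: dict) -> dict:
    """Keep only keys that the installed TrainingArguments dataclass accepts.

    Sketch only: args_dict is assumed to be the plain dict of argument
    values built in SmolLM3Model (e.g. {"fp16": ..., "seed": 42, ...}).
    """
    supported = {f.name for f in dataclasses.fields(TrainingArguments)}
    dropped = set(args_dict) - supported
    if dropped:
        print(f"Dropping unsupported TrainingArguments keys: {sorted(dropped)}")
    return {k: v for k, v in args_dict.items() if k in supported}

# Hypothetical usage:
# training_args = TrainingArguments(output_dir="./out", **filter_training_args(args_dict))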
trainer.py
CHANGED
@@ -104,22 +104,25 @@ class SmolLM3Trainer:
         # Add monitoring callbacks
         callbacks = []
 
+        # Temporarily disable callbacks to debug the bool object is not callable error
         # Add simple console callback
-        callbacks.append(SimpleConsoleCallback())
-        logger.info("Added simple console monitoring callback")
+        # callbacks.append(SimpleConsoleCallback())
+        # logger.info("Added simple console monitoring callback")
 
         # Try to add Trackio callback if available
-        if self.monitor and self.monitor.enable_tracking:
-            try:
-                trackio_callback = self.monitor.create_monitoring_callback()
-                if trackio_callback:
-                    callbacks.append(trackio_callback)
-                    logger.info("Added Trackio monitoring callback")
-                else:
-                    logger.warning("Failed to create Trackio callback")
-            except Exception as e:
-                logger.error(f"Error creating Trackio callback: {e}")
-                logger.info("Continuing with console monitoring only")
+        # if self.monitor and self.monitor.enable_tracking:
+        #     try:
+        #         trackio_callback = self.monitor.create_monitoring_callback()
+        #         if trackio_callback:
+        #             callbacks.append(trackio_callback)
+        #             logger.info("Added Trackio monitoring callback")
+        #         else:
+        #             logger.warning("Failed to create Trackio callback")
+        #     except Exception as e:
+        #         logger.error(f"Error creating Trackio callback: {e}")
+        #         logger.info("Continuing with console monitoring only")
+
+        logger.info("Callbacks disabled for debugging")
 
         # Try standard Trainer first (more stable with callbacks)
         logger.info("Creating Trainer with training arguments...")
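Note: this change disables all callbacks to isolate the "'bool' object is not callable" error. For reference, everything passed to Trainer(callbacks=[...]) must be a transformers.TrainerCallback class or instance; if something else (for example the boolean result of an availability check) ends up in the list, the Trainer will try to call it and fail with exactly this error. The diff does not show the real SimpleConsoleCallback, so the sketch below is only an assumption about its shape, illustrating the expected callback interface.

from transformers import TrainerCallback

class SimpleConsoleCallback(TrainerCallback):
    """Minimal sketch of a console-logging callback (assumed shape;
    the actual SimpleConsoleCallback is defined elsewhere in the repo)."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Print whatever the Trainer logged at this step (loss, lr, epoch, ...).
        if logs:
            print(f"step {state.global_step}: {logs}")
        return control

# Hypothetical usage: pass instances (not call results) to the Trainer, e.g.
# callbacks = [SimpleConsoleCallback()]
# trainer = Trainer(model=model, args=training_args, callbacks=callbacks, ...)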