Tonic committed
Commit 5f8b28d · 1 parent: a509b8b

improves Spaces deployment, configuration for custom settings, and adds an interface for Spaces deployment
config/train_smollm3.py CHANGED
@@ -82,6 +82,8 @@ class SmolLM3Config:
      # HF Datasets configuration
      hf_token: Optional[str] = None
      dataset_repo: Optional[str] = None
+     # Monitoring mode: 'both' | 'dataset' | 'trackio' | 'none'
+     monitoring_mode: str = 'both'
  
  
      def __post_init__(self):
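Since `monitoring_mode` is a free-form string, a consumer of this config may want to validate it before a long run starts. A minimal sketch, illustrative only — this commit adds the field, not the guard:

```python
# Illustrative sketch, not part of this commit: fail fast on monitoring_mode typos.
VALID_MONITORING_MODES = {"both", "dataset", "trackio", "none"}

def validate_monitoring_mode(mode: str) -> str:
    """Normalize a monitoring_mode value and reject anything unrecognized."""
    normalized = mode.strip().lower()
    if normalized not in VALID_MONITORING_MODES:
        raise ValueError(
            f"monitoring_mode must be one of {sorted(VALID_MONITORING_MODES)}, got {mode!r}"
        )
    return normalized
```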
interface.py ADDED
@@ -0,0 +1,1165 @@
+ #!/usr/bin/env python3
+ """
+ Gradio Interface for SmolLM3/GPT-OSS Fine-tuning Pipeline
+
+ This app mirrors the core flow of launch.sh with a click-and-run UI.
+ Tokens are read from environment variables:
+ - HF_WRITE_TOKEN (required)
+ - HF_READ_TOKEN (optional; used to switch the Trackio Space token after training)
+
+ Key steps (configurable via UI):
+ 1) Optional HF Dataset repo setup for Trackio
+ 2) Optional Trackio Space deployment
+ 3) Training (SmolLM3 or GPT-OSS)
+ 4) Push trained model to the HF Hub
+ 5) Optionally switch the Trackio HF_TOKEN to the read token
+
+ This uses the existing scripts in scripts/ and config/ to avoid code duplication.
+ """
+
+ from __future__ import annotations
+
+ import os
+ import sys
+ import time
+ import json
+ import shlex
+ import traceback
+ import importlib.util
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, Any, Generator, Optional, Tuple
+
+ # Third-party
+ try:
+     import gradio as gr  # type: ignore
+ except Exception as _e:
+     raise RuntimeError(
+         "Gradio is required. Please install it first: pip install gradio"
+     ) from _e
+
+
+ # --------------------------------------------------------------------------------------
+ # Utilities
+ # --------------------------------------------------------------------------------------
+
+ PROJECT_ROOT = Path(__file__).resolve().parent
+
+
+ def mask_token(token: Optional[str]) -> str:
+     if not token:
+         return "<not set>"
+     token = str(token)
+     if len(token) <= 8:
+         return "*" * len(token)
+     return f"{token[:4]}****{token[-4:]}"
+
+
+ def get_python() -> str:
+     return sys.executable or "python"
+
+
+ def get_username_from_token(token: str) -> Optional[str]:
+     try:
+         from huggingface_hub import HfApi  # type: ignore
+         api = HfApi(token=token)
+         info = api.whoami()
+         if isinstance(info, dict):
+             return info.get("name") or info.get("username")
+         if isinstance(info, str):
+             return info
+     except Exception:
+         return None
+     return None
+
+
+ def detect_nvidia_driver() -> Tuple[bool, str]:
+     """Detect NVIDIA driver/GPU presence with multiple strategies.
+
+     Returns (available, human_message).
+     """
+     # 1) Try torch CUDA
+     try:
+         import torch  # type: ignore
+         if torch.cuda.is_available():
+             try:
+                 num = torch.cuda.device_count()
+                 names = [torch.cuda.get_device_name(i) for i in range(num)]
+                 return True, f"NVIDIA GPU detected: {', '.join(names)}"
+             except Exception:
+                 return True, "NVIDIA GPU detected (torch.cuda available)"
+     except Exception:
+         pass
+
+     # 2) Try NVML via pynvml
+     try:
+         import pynvml  # type: ignore
+         try:
+             pynvml.nvmlInit()
+             cnt = pynvml.nvmlDeviceGetCount()
+             names = []
+             for i in range(cnt):
+                 h = pynvml.nvmlDeviceGetHandleByIndex(i)
+                 names.append(pynvml.nvmlDeviceGetName(h).decode("utf-8", errors="ignore"))
+             drv = pynvml.nvmlSystemGetDriverVersion().decode("utf-8", errors="ignore")
+             pynvml.nvmlShutdown()
+             if cnt > 0:
+                 return True, f"NVIDIA driver {drv}; GPUs: {', '.join(names)}"
+         except Exception:
+             pass
+     except Exception:
+         pass
+
+     # 3) Try nvidia-smi
+     try:
+         import subprocess
+         res = subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True, timeout=3)
+         if res.returncode == 0 and res.stdout.strip():
+             return True, res.stdout.strip().splitlines()[0]
+     except Exception:
+         pass
+
+     return False, "No NVIDIA driver/GPU detected"
+
+
+ def duplicate_space_hint() -> str:
+     space_id = os.environ.get("SPACE_ID") or os.environ.get("HF_SPACE_ID")
+     if space_id:
+         space_url = f"https://huggingface.co/spaces/{space_id}"
+         dup_url = f"{space_url}?duplicate=true"
+         return (
+             f"ℹ️ No NVIDIA driver detected. If you're on Hugging Face Spaces, "
+             f"please duplicate this Space to GPU hardware: [Duplicate this Space]({dup_url})."
+         )
+     return (
+         "ℹ️ No NVIDIA driver detected. To enable training, run on a machine with an NVIDIA GPU/driver "
+         "or duplicate this Space on Hugging Face with GPU hardware."
+     )
+
+
+ def _write_generated_config(filename: str, content: str) -> Path:
+     """Write a generated config under config/ and return the full path."""
+     cfg_dir = PROJECT_ROOT / "config"
+     cfg_dir.mkdir(parents=True, exist_ok=True)
+     path = cfg_dir / filename
+     with open(path, "w", encoding="utf-8") as f:
+         f.write(content)
+     return path
+
+
+ def generate_medical_o1_config_file(
+     dataset_config: str,
+     system_message: Optional[str],
+     developer_message: Optional[str],
+     num_train_epochs: float,
+     batch_size: int,
+     gradient_accumulation_steps: int,
+     learning_rate: float,
+     max_seq_length: int,
+ ) -> Path:
+     """Create a GPT-OSS Medical o1 SFT config file from user inputs."""
+     # Sanitize quotes in messages
+     def _q(s: Optional[str]) -> str:
+         if s is None or s == "":
+             return "None"
+         return repr(s)
+
+     py = f"""
+ from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+ config = GPTOSSEnhancedCustomConfig(
+     dataset_name="FreedomIntelligence/medical-o1-reasoning-SFT",
+     dataset_config={repr(dataset_config)},
+     dataset_split="train",
+     dataset_format="medical_o1_sft",
+
+     # Field mapping and prefixes
+     input_field="Question",
+     target_field="Response",
+     question_field="Question",
+     reasoning_field="Complex_CoT",
+     response_field="Response",
+     reason_prefix="Reasoning: ",
+     answer_prefix="Final Answer: ",
+
+     # Optional context
+     system_message={_q(system_message)},
+     developer_message={_q(developer_message)},
+
+     # Training hyperparameters
+     num_train_epochs={num_train_epochs},
+     batch_size={batch_size},
+     gradient_accumulation_steps={gradient_accumulation_steps},
+     learning_rate={learning_rate},
+     min_lr=2e-5,
+     weight_decay=0.01,
+     warmup_ratio=0.03,
+
+     # Sequence length
+     max_seq_length={max_seq_length},
+
+     # Precision & performance
+     fp16=False,
+     bf16=True,
+     dataloader_num_workers=4,
+     dataloader_pin_memory=True,
+     dataloader_prefetch_factor=2,
+     group_by_length=True,
+     remove_unused_columns=True,
+
+     # LoRA & quantization
+     use_lora=True,
+     lora_config={{
+         "r": 16,
+         "lora_alpha": 32,
+         "lora_dropout": 0.05,
+         "target_modules": "all-linear",
+         "target_parameters": [
+             "7.mlp.experts.gate_up_proj",
+             "7.mlp.experts.down_proj",
+             "15.mlp.experts.gate_up_proj",
+             "15.mlp.experts.down_proj",
+             "23.mlp.experts.gate_up_proj",
+             "23.mlp.experts.down_proj",
+         ],
+         "bias": "none",
+         "task_type": "CAUSAL_LM",
+     }},
+     use_quantization=True,
+     quantization_config={{
+         "dequantize": True,
+         "load_in_4bit": False,
+     }},
+
+     # Logging & evaluation
+     eval_strategy="steps",
+     eval_steps=100,
+     logging_steps=10,
+     save_strategy="steps",
+     save_steps=500,
+     save_total_limit=3,
+     metric_for_best_model="eval_loss",
+     greater_is_better=False,
+ )
+ """
+     return _write_generated_config("_generated_gpt_oss_medical_o1_sft.py", py)
+
+
+ def generate_gpt_oss_custom_config_file(
+     dataset_name: str,
+     dataset_split: str,
+     dataset_format: str,
+     input_field: str,
+     target_field: Optional[str],
+     system_message: Optional[str],
+     developer_message: Optional[str],
+     model_identity: Optional[str],
+     max_samples: Optional[int],
+     min_length: int,
+     max_length: Optional[int],
+     num_train_epochs: float,
+     batch_size: int,
+     gradient_accumulation_steps: int,
+     learning_rate: float,
+     min_lr: float,
+     weight_decay: float,
+     warmup_ratio: float,
+     max_seq_length: int,
+     lora_r: int,
+     lora_alpha: int,
+     lora_dropout: float,
+     mixed_precision: str,  # "bf16"|"fp16"|"fp32"
+     num_workers: int,
+     quantization_type: str,  # "mxfp4"|"bnb4"|"none"
+     max_grad_norm: float,
+     logging_steps: int,
+     eval_steps: int,
+     save_steps: int,
+ ) -> Path:
+     # Precision flags
+     if mixed_precision.lower() == "bf16":
+         fp16_flag = False
+         bf16_flag = True
+     elif mixed_precision.lower() == "fp16":
+         fp16_flag = True
+         bf16_flag = False
+     else:
+         fp16_flag = False
+         bf16_flag = False
+
+     # Quantization flags/config
+     if quantization_type == "mxfp4":
+         use_quant = True
+         quant_cfg = '{"dequantize": True, "load_in_4bit": False}'
+     elif quantization_type == "bnb4":
+         use_quant = True
+         quant_cfg = '{"dequantize": False, "load_in_4bit": True, "bnb_4bit_compute_dtype": "bfloat16", "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4"}'
+     else:
+         use_quant = False
+         quant_cfg = '{"dequantize": False, "load_in_4bit": False}'
+
+     def _q(s: Optional[str]) -> str:
+         if s is None or s == "":
+             return "None"
+         return repr(s)
+
+     py = f"""
+ from config.train_gpt_oss_custom import GPTOSSEnhancedCustomConfig
+
+ config = GPTOSSEnhancedCustomConfig(
+     # Dataset
+     dataset_name={repr(dataset_name)},
+     dataset_split={repr(dataset_split)},
+     dataset_format={repr(dataset_format)},
+     input_field={repr(input_field)},
+     target_field={repr(target_field)} if {repr(target_field)} != 'None' else None,
+     system_message={_q(system_message)},
+     developer_message={_q(developer_message)},
+     max_samples={repr(max_samples)} if {repr(max_samples)} != 'None' else None,
+     min_length={min_length},
+     max_length={repr(max_length)} if {repr(max_length)} != 'None' else None,
+
+     # Training hyperparameters
+     num_train_epochs={num_train_epochs},
+     batch_size={batch_size},
+     gradient_accumulation_steps={gradient_accumulation_steps},
+     learning_rate={learning_rate},
+     min_lr={min_lr},
+     weight_decay={weight_decay},
+     warmup_ratio={warmup_ratio},
+     max_grad_norm={max_grad_norm},
+
+     # Model
+     max_seq_length={max_seq_length},
+
+     # Precision
+     fp16={str(fp16_flag)},
+     bf16={str(bf16_flag)},
+
+     # LoRA
+     lora_config={{
+         "r": {lora_r},
+         "lora_alpha": {lora_alpha},
+         "lora_dropout": {lora_dropout},
+         "target_modules": "all-linear",
+         "bias": "none",
+         "task_type": "CAUSAL_LM",
+     }},
+
+     # Quantization
+     use_quantization={str(use_quant)},
+     quantization_config={quant_cfg},
+
+     # Performance
+     dataloader_num_workers={num_workers},
+     dataloader_pin_memory=True,
+     group_by_length=True,
+
+     # Logging & eval
+     logging_steps={logging_steps},
+     eval_steps={eval_steps},
+     save_steps={save_steps},
+
+     # Chat template (Harmony)
+     chat_template_kwargs={{
+         "add_generation_prompt": True,
+         "tokenize": False,
+         "auto_insert_role": True,
+         "reasoning_effort": "medium",
+         "model_identity": {_q(model_identity) if _q(model_identity) != 'None' else repr('You are GPT-Tonic, a large language model trained by TonicAI.')},
+         "builtin_tools": [],
+     }},
+ )
+ """
+     return _write_generated_config("_generated_gpt_oss_custom.py", py)
+
+
+ def generate_smollm3_custom_config_file(
+     model_name: str,
+     dataset_name: Optional[str],
+     max_seq_length: int,
+     batch_size: int,
+     gradient_accumulation_steps: int,
+     learning_rate: float,
+     save_steps: int,
+     eval_steps: int,
+     logging_steps: int,
+     filter_bad_entries: bool,
+     input_field: str,
+     target_field: str,
+     sample_size: Optional[int],
+     sample_seed: int,
+     trainer_type: str,
+ ) -> Path:
+     # Create subclass to include dataset fields similar to other configs
+     def _bool(b: bool) -> str:
+         return "True" if b else "False"
+
+     ds_section = """
+     # HF Dataset configuration
+     dataset_name={}
+     dataset_split="train"
+     input_field={}
+     target_field={}
+     filter_bad_entries={}
+     bad_entry_field="bad_entry"
+     sample_size={}
+     sample_seed={}
+ """.format(
+         repr(dataset_name) if dataset_name else "None",
+         repr(input_field),
+         repr(target_field),
+         _bool(filter_bad_entries),
+         repr(sample_size) if sample_size is not None else "None",
+         sample_seed,
+     )
+
+     py = f"""
+ from dataclasses import dataclass
+ from typing import Optional
+ from config.train_smollm3 import SmolLM3Config
+
+ @dataclass
+ class SmolLM3GeneratedConfig(SmolLM3Config):
+ {ds_section}
+
+ config = SmolLM3GeneratedConfig(
+     trainer_type={repr(trainer_type.lower())},
+     model_name={repr(model_name)},
+     max_seq_length={max_seq_length},
+     use_flash_attention=True,
+     use_gradient_checkpointing=True,
+
+     batch_size={batch_size},
+     gradient_accumulation_steps={gradient_accumulation_steps},
+     learning_rate={learning_rate},
+     weight_decay=0.01,
+     warmup_steps=100,
+     max_iters=None,
+     eval_interval={eval_steps},
+     log_interval={logging_steps},
+     save_interval={save_steps},
+
+     optimizer="adamw",
+     beta1=0.9,
+     beta2=0.95,
+     eps=1e-8,
+     scheduler="cosine",
+     min_lr=1e-6,
+     fp16=True,
+     bf16=False,
+     save_steps={save_steps},
+     eval_steps={eval_steps},
+     logging_steps={logging_steps},
+     save_total_limit=3,
+     eval_strategy="steps",
+     metric_for_best_model="eval_loss",
+     greater_is_better=False,
+     load_best_model_at_end=True,
+ )
+ """
+     return _write_generated_config("_generated_smollm3_custom.py", py)
+
+ def ensure_dataset_repo(username: str, dataset_name: str, token: str) -> Tuple[str, bool, str]:
+     """Create or ensure dataset repo exists. Returns (repo_id, created_or_exists, message)."""
+     from huggingface_hub import create_repo  # type: ignore
+     repo_id = f"{username}/{dataset_name}"
+     try:
+         create_repo(repo_id=repo_id, repo_type="dataset", token=token, exist_ok=True, private=False)
+         return repo_id, True, f"Dataset repo ready: {repo_id}"
+     except Exception as e:
+         return repo_id, False, f"Failed to create dataset repo {repo_id}: {e}"
+
+
+ def import_config_object(config_path: Path) -> Optional[Any]:
+     """Import a config file and return its 'config' object if present, else None."""
+     try:
+         spec = importlib.util.spec_from_file_location("config_module", str(config_path))
+         if not spec or not spec.loader:
+             return None
+         module = importlib.util.module_from_spec(spec)
+         spec.loader.exec_module(module)  # type: ignore
+         if hasattr(module, "config"):
+             return getattr(module, "config")
+         return None
+     except Exception:
+         return None
+
+
+ def run_command_stream(args: list[str], env: Dict[str, str], cwd: Optional[Path] = None) -> Generator[str, None, int]:
+     """Run a command and yield stdout/stderr lines as they arrive. Returns exit code at the end."""
+     import subprocess
+
+     yield f"$ {' '.join(shlex.quote(a) for a in ([get_python()] + args))}"
+     process = subprocess.Popen(
+         [get_python()] + args,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.STDOUT,
+         text=True,
+         env=env,
+         cwd=str(cwd or PROJECT_ROOT),
+         bufsize=1,
+         universal_newlines=True,
+     )
+     assert process.stdout is not None
+     for line in iter(process.stdout.readline, ""):
+         yield line.rstrip()
+     process.stdout.close()
+     code = process.wait()
+     yield f"[exit_code={code}]"
+     return code
+
+
+ # --------------------------------------------------------------------------------------
+ # Configuration Mappings (mirror launch.sh)
+ # --------------------------------------------------------------------------------------
+
+ SMOL_CONFIGS = {
+     "Basic Training": {
+         "config_file": "config/train_smollm3.py",
+         "default_model": "HuggingFaceTB/SmolLM3-3B",
+     },
+     "H100 Lightweight (Rapid)": {
+         "config_file": "config/train_smollm3_h100_lightweight.py",
+         "default_model": "HuggingFaceTB/SmolLM3-3B",
+     },
+     "A100 Large Scale": {
+         "config_file": "config/train_smollm3_openhermes_fr_a100_large.py",
+         "default_model": "HuggingFaceTB/SmolLM3-3B",
+     },
+     "Multiple Passes": {
+         "config_file": "config/train_smollm3_openhermes_fr_a100_multiple_passes.py",
+         "default_model": "HuggingFaceTB/SmolLM3-3B",
+     },
+ }
+
+ GPT_OSS_CONFIGS = {
+     "GPT-OSS Basic Training": {
+         "config_file": "config/train_gpt_oss_basic.py",
+         "default_model": "openai/gpt-oss-20b",
+     },
+     "GPT-OSS H100 Optimized": {
+         "config_file": "config/train_gpt_oss_h100_optimized.py",
+         "default_model": "openai/gpt-oss-20b",
+     },
+     "GPT-OSS Multilingual Reasoning": {
+         "config_file": "config/train_gpt_oss_multilingual_reasoning.py",
+         "default_model": "openai/gpt-oss-20b",
+     },
+     "GPT-OSS Memory Optimized": {
+         "config_file": "config/train_gpt_oss_memory_optimized.py",
+         "default_model": "openai/gpt-oss-20b",
+     },
+     "GPT-OSS OpenHermes-FR (Recommended)": {
+         "config_file": "config/train_gpt_oss_openhermes_fr.py",
+         "default_model": "openai/gpt-oss-20b",
+     },
+     "GPT-OSS OpenHermes-FR Memory Optimized": {
+         "config_file": "config/train_gpt_oss_openhermes_fr_memory_optimized.py",
+         "default_model": "openai/gpt-oss-20b",
+     },
+     # Custom dataset and medical SFT can be added later as advanced UI panels
+ }
+
+
+ def get_config_map(family: str) -> Dict[str, Dict[str, str]]:
+     return SMOL_CONFIGS if family == "SmolLM3" else GPT_OSS_CONFIGS
+
+
+ # --------------------------------------------------------------------------------------
+ # Pipeline Orchestration
+ # --------------------------------------------------------------------------------------
+
+ @dataclass
+ class PipelineInputs:
+     model_family: str
+     config_choice: str
+     trainer_type: str  # "SFT" | "DPO"
+     monitoring_mode: str  # "both" | "trackio" | "dataset" | "none"
+     experiment_name: str
+     repo_short: str
+     author_name: str
+     model_description: str
+     trackio_space_name: Optional[str]
+     deploy_trackio_space: bool
+     create_dataset_repo: bool
+     push_to_hub: bool
+     switch_to_read_after: bool
+     scheduler_override: Optional[str]
+     min_lr: Optional[float]
+     min_lr_rate: Optional[float]
+
+
+ def make_defaults(model_family: str) -> Tuple[str, str]:
+     ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+     family_slug = "gpt-oss" if model_family == "GPT-OSS" else "smollm3"
+     exp = f"smolfactory-{family_slug}_{ts}"
+     repo_short = f"smolfactory-{datetime.now().strftime('%Y%m%d')}"
+     return exp, repo_short
+
+
+ def run_pipeline(params: PipelineInputs) -> Generator[str, None, None]:
+     # Tokens from environment
+     write_token = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN")
+     read_token = os.environ.get("HF_READ_TOKEN")
+
+     if not write_token:
+         yield "❌ HF_WRITE_TOKEN (or HF_TOKEN) is not set in the environment."
+         return
+
+     # Resolve username
+     username = get_username_from_token(write_token) or os.environ.get("HF_USERNAME")
+     if not username:
+         yield "❌ Could not resolve Hugging Face username from token."
+         return
+     yield f"✅ Authenticated as: {username}"
+
+     # Compute Trackio URL if applicable
+     trackio_url: Optional[str] = None
+     if params.monitoring_mode != "none" and params.trackio_space_name:
+         trackio_url = f"https://huggingface.co/spaces/{username}/{params.trackio_space_name}"
+         yield f"Trackio Space URL: {trackio_url}"
+
+     # Decide space deploy token per monitoring mode
+     space_deploy_token = write_token if params.monitoring_mode in ("both", "trackio") else (read_token or write_token)
+
+     # Dataset repo setup
+     dataset_repo = f"{username}/trackio-experiments"
+     if params.create_dataset_repo and params.monitoring_mode != "none":
+         yield f"Creating/ensuring dataset repo exists: {dataset_repo}"
+         rid, ok, msg = ensure_dataset_repo(username, "trackio-experiments", write_token)
+         yield ("✅ " if ok else "⚠️ ") + msg
+         dataset_repo = rid
+
+     # Resolve config file and model name
+     conf_map = get_config_map(params.model_family)
+     if params.config_choice not in conf_map:
+         yield f"❌ Unknown config choice: {params.config_choice}"
+         return
+     config_file = PROJECT_ROOT / conf_map[params.config_choice]["config_file"]
+     base_model_fallback = conf_map[params.config_choice]["default_model"]
+     if not config_file.exists():
+         yield f"❌ Config file not found: {config_file}"
+         return
+     cfg_obj = import_config_object(config_file)
+     base_model = getattr(cfg_obj, "model_name", base_model_fallback) if cfg_obj else base_model_fallback
+     dataset_name = getattr(cfg_obj, "dataset_name", None) if cfg_obj else None
+     batch_size = getattr(cfg_obj, "batch_size", None) if cfg_obj else None
+     learning_rate = getattr(cfg_obj, "learning_rate", None) if cfg_obj else None
+     max_seq_length = getattr(cfg_obj, "max_seq_length", None) if cfg_obj else None
+
+     # Prepare env for subprocesses
+     env = os.environ.copy()
+     env["HF_TOKEN"] = write_token
+     env["HUGGING_FACE_HUB_TOKEN"] = write_token
+     env["HF_USERNAME"] = username
+     env["TRACKIO_DATASET_REPO"] = dataset_repo
+     env["MONITORING_MODE"] = params.monitoring_mode
+
+     # Optional Trackio Space deployment
+     if params.deploy_trackio_space and params.monitoring_mode != "none" and params.trackio_space_name:
+         yield f"\n=== Deploying Trackio Space: {params.trackio_space_name} ==="
+         # deploy_trackio_space.py expects: space_name, token, git_email, git_name, dataset_repo
+         args = [
+             str(PROJECT_ROOT / "scripts/trackio_tonic/deploy_trackio_space.py"),
+             params.trackio_space_name,
+             space_deploy_token,
+             f"{username}@users.noreply.hf.co",
+             username,
+             dataset_repo,
+         ]
+         for line in run_command_stream(args, env, cwd=PROJECT_ROOT / "scripts/trackio_tonic"):
+             yield line
+
+     # Training output directory
+     out_dir = PROJECT_ROOT / "outputs" / f"{params.experiment_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+     out_dir.mkdir(parents=True, exist_ok=True)
+     yield f"\nOutput directory: {out_dir}"
+
+     # Scheduler overrides (GPT-OSS only)
+     if params.model_family == "GPT-OSS" and params.scheduler_override:
+         env["GPT_OSS_SCHEDULER"] = params.scheduler_override
+         if params.min_lr is not None:
+             env["GPT_OSS_MIN_LR"] = str(params.min_lr)
+         if params.min_lr_rate is not None:
+             env["GPT_OSS_MIN_LR_RATE"] = str(params.min_lr_rate)
+
+     # Start training
+     yield f"\n=== Starting Training ({params.model_family}) ==="
+     if params.model_family == "GPT-OSS":
+         args = [
+             str(PROJECT_ROOT / "scripts/training/train_gpt_oss.py"),
+             "--config", str(config_file),
+             "--experiment-name", params.experiment_name,
+             "--output-dir", str(out_dir),
+             "--trackio-url", trackio_url or "",
+             "--trainer-type", params.trainer_type.lower(),
+         ]
+     else:
+         args = [
+             str(PROJECT_ROOT / "scripts/training/train.py"),
+             "--config", str(config_file),
+             "--experiment-name", params.experiment_name,
+             "--output-dir", str(out_dir),
+             "--trackio-url", trackio_url or "",
+             "--trainer-type", params.trainer_type.lower(),
+         ]
+
+     # Stream training logs
+     train_failed = False
+     for line in run_command_stream(args, env):
+         yield line
+         if line.strip().startswith("[exit_code=") and not line.strip().endswith("[exit_code=0]"):
+             train_failed = True
+     if train_failed:
+         yield "❌ Training failed. Aborting remaining steps."
+         return
+
+     # Push to Hub
+     if params.push_to_hub:
+         yield "\n=== Pushing Model to Hugging Face Hub ==="
+         repo_name = f"{username}/{params.repo_short}"
+         if params.model_family == "GPT-OSS":
+             push_args = [
+                 str(PROJECT_ROOT / "scripts/model_tonic/push_gpt_oss_to_huggingface.py"),
+                 str(out_dir),
+                 repo_name,
+                 "--token", write_token,
+                 "--trackio-url", trackio_url or "",
+                 "--experiment-name", params.experiment_name,
+                 "--dataset-repo", dataset_repo,
+                 "--author-name", params.author_name or username,
+                 "--model-description", params.model_description,
+                 "--training-config-type", params.config_choice,
+                 "--model-name", base_model,
+             ]
+             if dataset_name:
+                 push_args += ["--dataset-name", str(dataset_name)]
+             if batch_size is not None:
+                 push_args += ["--batch-size", str(batch_size)]
+             if learning_rate is not None:
+                 push_args += ["--learning-rate", str(learning_rate)]
+             if max_seq_length is not None:
+                 push_args += ["--max-seq-length", str(max_seq_length)]
+             push_args += ["--trainer-type", params.trainer_type]
+         else:
+             push_args = [
+                 str(PROJECT_ROOT / "scripts/model_tonic/push_to_huggingface.py"),
+                 str(out_dir),
+                 repo_name,
+                 "--token", write_token,
+                 "--trackio-url", trackio_url or "",
+                 "--experiment-name", params.experiment_name,
+                 "--dataset-repo", dataset_repo,
+                 "--author-name", params.author_name or username,
+                 "--model-description", params.model_description,
+                 "--training-config-type", params.config_choice,
+                 "--model-name", base_model,
+             ]
+             if dataset_name:
+                 push_args += ["--dataset-name", str(dataset_name)]
+             if batch_size is not None:
+                 push_args += ["--batch-size", str(batch_size)]
+             if learning_rate is not None:
+                 push_args += ["--learning-rate", str(learning_rate)]
+             if max_seq_length is not None:
+                 push_args += ["--max-seq-length", str(max_seq_length)]
+             push_args += ["--trainer-type", params.trainer_type]
+
+         for line in run_command_stream(push_args, env):
+             yield line
+
+     # Switch Space token to read-only (security)
+     if params.switch_to_read_after and params.monitoring_mode in ("both", "trackio") and params.trackio_space_name and read_token:
+         yield "\n=== Switching Trackio Space HF_TOKEN to READ token ==="
+         space_id = f"{username}/{params.trackio_space_name}"
+         sw_args = [
+             str(PROJECT_ROOT / "scripts/trackio_tonic/switch_to_read_token.py"),
+             space_id,
+             read_token,
+             write_token,
+         ]
+         for line in run_command_stream(sw_args, env, cwd=PROJECT_ROOT / "scripts/trackio_tonic"):
+             yield line
+     elif params.switch_to_read_after and not read_token:
+         yield "⚠️ HF_READ_TOKEN not set; skipping token switch."
+
+     # Final summary
+     yield "\n🎉 Pipeline completed."
+     if params.monitoring_mode != "none" and trackio_url:
+         yield f"Trackio: {trackio_url}"
+     yield f"Model repo (if pushed): https://huggingface.co/{username}/{params.repo_short}"
+     yield f"Outputs: {out_dir}"
+
+
+ # --------------------------------------------------------------------------------------
+ # Gradio UI
+ # --------------------------------------------------------------------------------------
+
+ MODEL_FAMILIES = ["SmolLM3", "GPT-OSS"]
+ TRAINER_CHOICES = ["SFT", "DPO"]
+ MONITORING_CHOICES = ["both", "trackio", "dataset", "none"]
+ SCHEDULER_CHOICES = [None, "linear", "cosine", "cosine_with_min_lr", "constant"]
+
+
+ def ui_defaults(family: str) -> Tuple[str, str, str, str]:
+     exp, repo_short = make_defaults(family)
+     default_desc = (
+         "A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
+         if family == "GPT-OSS"
+         else "A fine-tuned SmolLM3-3B model optimized for instruction following and French language tasks."
+     )
+     trackio_space_name = f"trackio-monitoring-{datetime.now().strftime('%Y%m%d')}"
+     return exp, repo_short, default_desc, trackio_space_name
+
+
+ def on_family_change(family: str) -> Tuple[list[str], str, str, str, str]:
+     confs = list(get_config_map(family).keys())
+     exp, repo_short, desc, space = ui_defaults(family)
+     return confs, confs[0] if confs else "", exp, repo_short, desc
+
+
+ def start_pipeline(
+     model_family: str,
+     config_choice: str,
+     trainer_type: str,
+     monitoring_mode: str,
+     experiment_name: str,
+     repo_short: str,
+     author_name: str,
+     model_description: str,
+     trackio_space_name: str,
+     deploy_trackio_space: bool,
+     create_dataset_repo: bool,
+     push_to_hub: bool,
+     switch_to_read_after: bool,
+     scheduler_override: Optional[str],
+     min_lr: Optional[float],
+     min_lr_rate: Optional[float],
+ ) -> Generator[str, None, None]:
+     try:
+         params = PipelineInputs(
+             model_family=model_family,
+             config_choice=config_choice,
+             trainer_type=trainer_type,
+             monitoring_mode=monitoring_mode,
+             experiment_name=experiment_name,
+             repo_short=repo_short,
+             author_name=author_name,
+             model_description=model_description,
+             trackio_space_name=trackio_space_name or None,
+             deploy_trackio_space=deploy_trackio_space,
+             create_dataset_repo=create_dataset_repo,
+             push_to_hub=push_to_hub,
+             switch_to_read_after=switch_to_read_after,
+             scheduler_override=(scheduler_override or None),
+             min_lr=min_lr,
+             min_lr_rate=min_lr_rate,
+         )
+
+         # Show token presence
+         write_token = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN")
+         read_token = os.environ.get("HF_READ_TOKEN")
+         yield f"HF_WRITE_TOKEN: {mask_token(write_token)}"
+         yield f"HF_READ_TOKEN: {mask_token(read_token)}"
+
+         # Run the orchestrated pipeline
+         for line in run_pipeline(params):
+             yield line
+             # Small delay for smoother streaming
+             time.sleep(0.01)
+     except Exception as e:
+         yield f"❌ Error: {e}"
+         tb = traceback.format_exc(limit=2)
+         yield tb
+
+
+ with gr.Blocks(title="SmolLM3 / GPT-OSS Fine-tuning Pipeline") as demo:
+     # GPU/driver detection banner
+     has_gpu, gpu_msg = detect_nvidia_driver()
+     if has_gpu:
+         gr.Markdown(f"""
+ **SmolLM3 / GPT-OSS Fine-tuning Pipeline**
+ - {gpu_msg} — training is available on this runtime.
+ - Reads tokens from environment: `HF_WRITE_TOKEN` (required), `HF_READ_TOKEN` (optional)
+ - Select a config and run training; optionally deploy Trackio and push to Hub
+ """)
+     else:
+         gr.Markdown(f"""
+ **SmolLM3 / GPT-OSS Fine-tuning Pipeline**
+ - {duplicate_space_hint()}
+ - Reads tokens from environment: `HF_WRITE_TOKEN` (required), `HF_READ_TOKEN` (optional)
+ - You can still configure and push, but training requires a GPU runtime.
+ """)
+
+     with gr.Row():
+         model_family = gr.Dropdown(choices=MODEL_FAMILIES, value="SmolLM3", label="Model family")
+         trainer_type = gr.Radio(choices=TRAINER_CHOICES, value="SFT", label="Trainer type")
+         monitoring_mode = gr.Dropdown(choices=MONITORING_CHOICES, value="both", label="Monitoring mode")
+
+     config_choice = gr.Dropdown(choices=list(get_config_map("SmolLM3").keys()), value="Basic Training", label="Training configuration")
+
+     exp_default, repo_default, desc_default, trackio_space_default = ui_defaults("SmolLM3")
+     with gr.Row():
+         experiment_name = gr.Textbox(value=exp_default, label="Experiment name")
+         repo_short = gr.Textbox(value=repo_default, label="Model repo (short name)")
+
+     with gr.Row():
+         author_name = gr.Textbox(value=os.environ.get("HF_USERNAME", ""), label="Author name")
+         model_description = gr.Textbox(value=desc_default, label="Model description")
+
+     with gr.Row():
+         trackio_space_name = gr.Textbox(value=trackio_space_default, label="Trackio Space name (used when monitoring != none)")
+         deploy_trackio_space = gr.Checkbox(value=True, label="Deploy Trackio Space")
+         create_dataset_repo = gr.Checkbox(value=True, label="Create/ensure HF Dataset repo")
+
+     with gr.Row():
+         push_to_hub = gr.Checkbox(value=True, label="Push model to Hugging Face Hub")
+         switch_to_read_after = gr.Checkbox(value=True, label="Switch Space token to READ after training")
+
+     with gr.Tabs():
+         with gr.Tab("Run"):
+             with gr.Row():
+                 model_family = gr.Dropdown(choices=MODEL_FAMILIES, value="SmolLM3", label="Model family")
+                 trainer_type = gr.Radio(choices=TRAINER_CHOICES, value="SFT", label="Trainer type")
+                 monitoring_mode = gr.Dropdown(choices=MONITORING_CHOICES, value="both", label="Monitoring mode")
+
+             config_choice = gr.Dropdown(choices=list(get_config_map("SmolLM3").keys()), value="Basic Training", label="Training configuration")
+
+             exp_default, repo_default, desc_default, trackio_space_default = ui_defaults("SmolLM3")
+             with gr.Row():
+                 experiment_name = gr.Textbox(value=exp_default, label="Experiment name")
+                 repo_short = gr.Textbox(value=repo_default, label="Model repo (short name)")
+
+             with gr.Row():
+                 author_name = gr.Textbox(value=os.environ.get("HF_USERNAME", ""), label="Author name")
+                 model_description = gr.Textbox(value=desc_default, label="Model description")
+
+             with gr.Row():
+                 trackio_space_name = gr.Textbox(value=trackio_space_default, label="Trackio Space name (used when monitoring != none)")
+                 deploy_trackio_space = gr.Checkbox(value=True, label="Deploy Trackio Space")
+                 create_dataset_repo = gr.Checkbox(value=True, label="Create/ensure HF Dataset repo")
+
+             with gr.Row():
+                 push_to_hub = gr.Checkbox(value=True, label="Push model to Hugging Face Hub")
+                 switch_to_read_after = gr.Checkbox(value=True, label="Switch Space token to READ after training")
+
+             gr.Markdown("### Medical SFT (GPT-OSS o1)")
+             gr.Markdown("Configure GPT-OSS Medical o1 SFT (FreedomIntelligence/medical-o1-reasoning-SFT)")
+             med_dataset_config = gr.Dropdown(choices=["en", "en_mix", "zh", "zh_mix"], value="en", label="Dataset config")
+             med_system = gr.Textbox(value="You are GPT-Tonic, a large language model trained by TonicAI.", label="System message", lines=2)
+             med_developer = gr.Textbox(value="You are GPT-Tonic, an intelligent assistant that always answers health-related queries scientifically.", label="Developer message", lines=3)
+             with gr.Row():
+                 med_epochs = gr.Number(value=2.0, precision=2, label="Epochs")
+                 med_bs = gr.Number(value=4, precision=0, label="Batch size")
+                 med_gas = gr.Number(value=4, precision=0, label="Grad accumulation")
+                 med_lr = gr.Number(value=2e-4, precision=6, label="Learning rate")
+                 med_msl = gr.Number(value=2048, precision=0, label="Max seq length")
+             med_generate = gr.Button("Generate Medical Config")
+             med_status = gr.Textbox(label="Generated config path", interactive=False)
+
+             logs = gr.Textbox(value="", label="Logs", lines=20)
+             start_btn = gr.Button("Run Pipeline")
+
+         with gr.Tab("Advanced Config"):
+             with gr.Accordion("GPT-OSS Scheduler Overrides", open=False):
+                 scheduler_override = gr.Dropdown(choices=[c for c in SCHEDULER_CHOICES if c is not None], value=None, allow_custom_value=True, label="Scheduler override")
+                 min_lr = gr.Number(value=None, precision=6, label="min_lr (when cosine_with_min_lr)")
+                 min_lr_rate = gr.Number(value=None, precision=6, label="min_lr_rate (when cosine_with_min_lr)")
+
+             gr.Markdown("### GPT-OSS Custom Dataset")
+             with gr.Row():
+                 cds_dataset = gr.Textbox(value="legmlai/openhermes-fr", label="Dataset name")
+                 cds_split = gr.Textbox(value="train", label="Split")
+                 cds_format = gr.Dropdown(choices=["openhermes_fr", "messages", "text", "medical_o1_sft", "custom", "preference"], value="openhermes_fr", label="Format")
+             with gr.Row():
+                 cds_input = gr.Textbox(value="prompt", label="Input field")
+                 cds_target = gr.Textbox(value="accepted_completion", label="Target field (optional, blank for None)")
+             with gr.Row():
+                 cds_sys = gr.Textbox(value="", label="System message (optional)")
+                 cds_dev = gr.Textbox(value="", label="Developer message (optional)")
+             with gr.Row():
+                 cds_identity = gr.Textbox(value="You are GPT-Tonic, a large language model trained by TonicAI.", label="Model identity (chat_template_kwargs.model_identity)")
+             with gr.Row():
+                 cds_max_samples = gr.Number(value=None, precision=0, label="Max samples (optional)")
+                 cds_min_len = gr.Number(value=10, precision=0, label="Min length")
+                 cds_max_len = gr.Number(value=None, precision=0, label="Max length (optional)")
+             gr.Markdown("#### Training Hyperparameters")
+             with gr.Row():
+                 cds_epochs = gr.Number(value=1.0, precision=2, label="Epochs")
+                 cds_bs = gr.Number(value=4, precision=0, label="Batch size")
+                 cds_gas = gr.Number(value=4, precision=0, label="Grad accumulation")
+                 cds_lr = gr.Number(value=2e-4, precision=6, label="Learning rate")
+                 cds_minlr = gr.Number(value=2e-5, precision=6, label="Min LR")
+             with gr.Row():
+                 cds_wd = gr.Number(value=0.01, precision=6, label="Weight decay")
+                 cds_warm = gr.Number(value=0.03, precision=6, label="Warmup ratio")
+                 cds_msl = gr.Number(value=2048, precision=0, label="Max seq length")
+             gr.Markdown("#### LoRA / Precision / Quantization / Perf")
+             with gr.Row():
+                 cds_lora_r = gr.Number(value=16, precision=0, label="LoRA r")
+                 cds_lora_alpha = gr.Number(value=32, precision=0, label="LoRA alpha")
+                 cds_lora_dropout = gr.Number(value=0.05, precision=4, label="LoRA dropout")
+             with gr.Row():
+                 cds_precision = gr.Dropdown(choices=["bf16", "fp16", "fp32"], value="bf16", label="Mixed precision")
+                 cds_workers = gr.Number(value=4, precision=0, label="Data workers")
+                 cds_quant = gr.Dropdown(choices=["mxfp4", "bnb4", "none"], value="mxfp4", label="Quantization")
+             with gr.Row():
+                 cds_mgn = gr.Number(value=1.0, precision=4, label="Max grad norm")
+                 cds_log_steps = gr.Number(value=10, precision=0, label="Logging steps")
+                 cds_eval_steps = gr.Number(value=100, precision=0, label="Eval steps")
+                 cds_save_steps = gr.Number(value=500, precision=0, label="Save steps")
+             cds_generate = gr.Button("Generate GPT-OSS Custom Config")
+             cds_status = gr.Textbox(label="Generated config path", interactive=False)
+
+             gr.Markdown("### SmolLM3 Custom Configuration")
+             with gr.Row():
+                 sm_model = gr.Textbox(value="HuggingFaceTB/SmolLM3-3B", label="Model name")
+                 sm_dataset = gr.Textbox(value="legmlai/openhermes-fr", label="Dataset (optional; leave blank for local)")
+             with gr.Row():
+                 sm_msl = gr.Number(value=4096, precision=0, label="Max seq length")
+                 sm_bs = gr.Number(value=2, precision=0, label="Batch size")
+                 sm_gas = gr.Number(value=8, precision=0, label="Grad accumulation")
+                 sm_lr = gr.Number(value=5e-6, precision=8, label="Learning rate")
+             with gr.Row():
+                 sm_save = gr.Number(value=500, precision=0, label="Save steps")
+                 sm_eval = gr.Number(value=100, precision=0, label="Eval steps")
+                 sm_log = gr.Number(value=10, precision=0, label="Logging steps")
+             with gr.Row():
+                 sm_filter = gr.Checkbox(value=False, label="Filter bad entries")
+                 sm_in = gr.Textbox(value="prompt", label="Input field")
+                 sm_out = gr.Textbox(value="accepted_completion", label="Target field")
+             with gr.Row():
+                 sm_sample = gr.Number(value=None, precision=0, label="Sample size (optional)")
+                 sm_seed = gr.Number(value=42, precision=0, label="Sample seed")
+                 sm_trainer = gr.Dropdown(choices=["SFT", "DPO"], value="SFT", label="Trainer type")
+             sm_generate = gr.Button("Generate SmolLM3 Custom Config")
+             sm_status = gr.Textbox(label="Generated config path", interactive=False)
+
+     logs = gr.Textbox(value="", label="Logs", lines=20)
+
+     start_btn = gr.Button("Run Pipeline")
+
+     # Events
+     model_family.change(on_family_change, inputs=model_family, outputs=[config_choice, config_choice, experiment_name, repo_short, model_description])
+
+     # Generate config handlers
+     med_generate.click(
+         lambda dc, sysm, devm, ep, bs, gas, lr, msl: str(
+             generate_medical_o1_config_file(
+                 dataset_config=dc,
+                 system_message=sysm,
+                 developer_message=devm,
+                 num_train_epochs=float(ep or 2.0),
+                 batch_size=int(bs or 4),
+                 gradient_accumulation_steps=int(gas or 4),
+                 learning_rate=float(lr or 2e-4),
+                 max_seq_length=int(msl or 2048),
+             )
+         ),
+         inputs=[med_dataset_config, med_system, med_developer, med_epochs, med_bs, med_gas, med_lr, med_msl],
+         outputs=[med_status],
+     )
+
+     cds_generate.click(
+         lambda dname, dsplit, dformat, ifld, tfld, sm, dm, ident, ms, minl, maxl, ep, bs, gas, lr, minlr, wd, warm, msl, lr_, la, ld, prec, nw, q, mgn, logst, evst, savst: str(
+             generate_gpt_oss_custom_config_file(
+                 dataset_name=dname,
+                 dataset_split=dsplit,
+                 dataset_format=dformat,
+                 input_field=ifld,
+                 target_field=(tfld or None),
+                 system_message=sm,
+                 developer_message=dm,
+                 model_identity=ident,
+                 max_samples=(int(ms) if ms is not None else None),
+                 min_length=int(minl or 10),
+                 max_length=(int(maxl) if maxl is not None else None),
+                 num_train_epochs=float(ep or 1.0),
+                 batch_size=int(bs or 4),
+                 gradient_accumulation_steps=int(gas or 4),
+                 learning_rate=float(lr or 2e-4),
+                 min_lr=float(minlr or 2e-5),
+                 weight_decay=float(wd or 0.01),
+                 warmup_ratio=float(warm or 0.03),
+                 max_seq_length=int(msl or 2048),
+                 lora_r=int(lr_),
+                 lora_alpha=int(la),
+                 lora_dropout=float(ld),
+                 mixed_precision=prec,
+                 num_workers=int(nw or 4),
+                 quantization_type=q,
+                 max_grad_norm=float(mgn or 1.0),
+                 logging_steps=int(logst or 10),
+                 eval_steps=int(evst or 100),
+                 save_steps=int(savst or 500),
+             )
+         ),
+         inputs=[
+             cds_dataset, cds_split, cds_format, cds_input, cds_target, cds_sys, cds_dev, cds_identity,
+             cds_max_samples, cds_min_len, cds_max_len, cds_epochs, cds_bs, cds_gas, cds_lr, cds_minlr, cds_wd,
+             cds_warm, cds_msl, cds_lora_r, cds_lora_alpha, cds_lora_dropout, cds_precision, cds_workers, cds_quant,
+             cds_mgn, cds_log_steps, cds_eval_steps, cds_save_steps
+         ],
+         outputs=[cds_status],
+     )
+
+     sm_generate.click(
+         lambda mn, dn, msl, bs, gas, lr, sst, est, lst, fbe, ifld, tfld, ss, seed, tt: str(
+             generate_smollm3_custom_config_file(
+                 model_name=mn,
+                 dataset_name=(dn or None),
+                 max_seq_length=int(msl or 4096),
+                 batch_size=int(bs or 2),
+                 gradient_accumulation_steps=int(gas or 8),
+                 learning_rate=float(lr or 5e-6),
+                 save_steps=int(sst or 500),
+                 eval_steps=int(est or 100),
+                 logging_steps=int(lst or 10),
+                 filter_bad_entries=bool(fbe),
+                 input_field=ifld,
+                 target_field=tfld,
+                 sample_size=(int(ss) if ss is not None else None),
+                 sample_seed=int(seed or 42),
+                 trainer_type=tt,
+             )
+         ),
+         inputs=[
+             sm_model, sm_dataset, sm_msl, sm_bs, sm_gas, sm_lr, sm_save, sm_eval, sm_log,
+             sm_filter, sm_in, sm_out, sm_sample, sm_seed, sm_trainer,
+         ],
+         outputs=[sm_status],
+     )
+
+     start_btn.click(
+         start_pipeline,
+         inputs=[
+             model_family,
+             config_choice,
+             trainer_type,
+             monitoring_mode,
+             experiment_name,
+             repo_short,
+             author_name,
+             model_description,
+             trackio_space_name,
+             deploy_trackio_space,
+             create_dataset_repo,
+             push_to_hub,
+             switch_to_read_after,
+             scheduler_override,
+             min_lr,
+             min_lr_rate,
+         ],
+         outputs=[logs],
+     )
+
+
+ if __name__ == "__main__":
+     # Optional: allow setting server parameters via env
+     server_port = int(os.environ.get("INTERFACE_PORT", "7860"))
+     server_name = os.environ.get("INTERFACE_HOST", "0.0.0.0")
+     demo.queue().launch(server_name=server_name, server_port=server_port)
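Beyond the UI, the module's public objects make a headless smoke test straightforward. A minimal sketch, assuming `HF_WRITE_TOKEN` is exported and this snippet runs from the project root next to `interface.py`:

```python
# Minimal sketch: drive the pipeline without Gradio (assumes HF_WRITE_TOKEN is set).
from interface import PipelineInputs, make_defaults, run_pipeline

experiment_name, repo_short = make_defaults("SmolLM3")
params = PipelineInputs(
    model_family="SmolLM3",
    config_choice="Basic Training",
    trainer_type="SFT",
    monitoring_mode="none",          # skip Trackio/dataset logging for the smoke test
    experiment_name=experiment_name,
    repo_short=repo_short,
    author_name="",
    model_description="Smoke-test run",
    trackio_space_name=None,
    deploy_trackio_space=False,
    create_dataset_repo=False,
    push_to_hub=False,
    switch_to_read_after=False,
    scheduler_override=None,
    min_lr=None,
    min_lr_rate=None,
)

# run_pipeline is a generator of log lines, so it streams like the UI does.
for log_line in run_pipeline(params):
    print(log_line)
```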
launch.sh CHANGED
@@ -478,6 +478,7 @@ get_custom_dataset_config() {
      print_info "💬 Harmony Context (optional)"
      get_input "System message" "You are GPT-Tonic, a large language model trained by TonicAI." SYSTEM_MESSAGE
      get_input "Developer message" "You are an intelligent assistant that can answer customer service queries" DEVELOPER_MESSAGE
+     get_input "Model identity/persona (used in chat_template_kwargs.model_identity)" "You are GPT-Tonic, a large language model trained by TonicAI." MODEL_IDENTITY
  
      # Dataset Filtering Options
      echo ""
@@ -601,6 +602,27 @@ update_enhanced_gpt_oss_config() {
          ;;
      esac
  
+     # Safely serialize free-text fields to valid Python literals
+     SYSTEM_MESSAGE_LITERAL=$(SYSTEM_MESSAGE="$SYSTEM_MESSAGE" python - <<'PY'
+ import json, os
+ v = os.environ.get('SYSTEM_MESSAGE', '')
+ print('None' if not v else json.dumps(v))
+ PY
+ )
+     DEVELOPER_MESSAGE_LITERAL=$(DEVELOPER_MESSAGE="$DEVELOPER_MESSAGE" python - <<'PY'
+ import json, os
+ v = os.environ.get('DEVELOPER_MESSAGE', '')
+ print('None' if not v else json.dumps(v))
+ PY
+ )
+     MODEL_IDENTITY_DEFAULT="You are GPT-Tonic, a large language model trained by TonicAI."
+     MODEL_IDENTITY_LITERAL=$(MODEL_IDENTITY="${MODEL_IDENTITY:-$MODEL_IDENTITY_DEFAULT}" python - <<'PY'
+ import json, os
+ v = os.environ.get('MODEL_IDENTITY', '')
+ print(json.dumps(v))
+ PY
+ )
+
      # Create enhanced config file with all user choices
      cat > "$CONFIG_FILE" << EOF
  """
@@ -626,11 +648,22 @@ config = GPTOSSEnhancedCustomConfig(
      min_length=$MIN_LENGTH,
      max_length=$(if [ -n "$MAX_LENGTH" ]; then echo "$MAX_LENGTH"; else echo "None"; fi),
  
-     # Harmony context
-     system_message=$(if [ -n "$SYSTEM_MESSAGE" ]; then printf '%s' "\"$SYSTEM_MESSAGE\""; else echo "None"; fi),
-     developer_message=$(if [ -n "$DEVELOPER_MESSAGE" ]; then printf '%s' "\"$DEVELOPER_MESSAGE\""; else echo "None"; fi),
+     # ============================================================================
+     # HARMONY CONFIGURATION
+     # ============================================================================
+     system_message=$SYSTEM_MESSAGE_LITERAL,
+     developer_message=$DEVELOPER_MESSAGE_LITERAL,
      use_harmony_format=True,
  
+     chat_template_kwargs={
+         "add_generation_prompt": True,
+         "tokenize": False,
+         "auto_insert_role": True,
+         "reasoning_effort": "medium",
+         "model_identity": $MODEL_IDENTITY_LITERAL,
+         "builtin_tools": [],
+     },
+
      # Medical o1 SFT mapping (ignored unless dataset_format == 'medical_o1_sft')
      question_field=$(if [ -n "$MED_Q_FIELD" ]; then echo "\"$MED_Q_FIELD\""; else echo "\"Question\""; fi),
      reasoning_field=$(if [ -n "$MED_REASON_FIELD" ]; then echo "\"$MED_REASON_FIELD\""; else echo "\"Complex_CoT\""; fi),
@@ -792,7 +825,8 @@ config = SmolLM3Config(
      experiment_name="$EXPERIMENT_NAME",
  
      # HF Datasets configuration
-     dataset_repo="$TRACKIO_DATASET_REPO"
+     dataset_repo="$TRACKIO_DATASET_REPO",
+     monitoring_mode="$MONITORING_MODE",
  )
  EOF
  }
@@ -881,6 +915,35 @@ fi
  
  get_training_config "$TRAINING_CONFIG_TYPE"
  
+ # Step 2.4: Monitoring mode selection
+ print_step "Step 2.4: Monitoring Mode"
+ echo "=============================="
+ echo "Choose how to log your experiment:"
+ select_option "Select monitoring mode:" \
+     "Both (Trackio + Dataset)" \
+     "Trackio only" \
+     "Dataset only" \
+     "None (local only)" \
+     MONITORING_MODE_OPTION
+
+ case "$MONITORING_MODE_OPTION" in
+     "Both (Trackio + Dataset)") MONITORING_MODE="both" ;;
+     "Trackio only") MONITORING_MODE="trackio" ;;
+     "Dataset only") MONITORING_MODE="dataset" ;;
+     "None (local only)") MONITORING_MODE="none" ;;
+     *) MONITORING_MODE="both" ;;
+ esac
+
+ # Decide which token to use for the Trackio Space secret
+ # - dataset: read-only token (Space only needs to read datasets)
+ # - trackio/both: write token until end of training (Space writes to datasets)
+ # - none: Space is skipped
+ if [ "$MONITORING_MODE" = "dataset" ]; then
+     SPACE_DEPLOY_TOKEN="$HF_READ_TOKEN"
+ else
+     SPACE_DEPLOY_TOKEN="$HF_WRITE_TOKEN"
+ fi
+
  # 2.3 Set a family-specific default model description for the model card
  if [ "$MODEL_FAMILY" = "GPT-OSS" ]; then
      DEFAULT_MODEL_DESCRIPTION="A fine-tuned GPT-OSS-20B model optimized for multilingual reasoning and instruction following."
@@ -999,12 +1062,16 @@ get_input "Save steps" "500" SAVE_STEPS
  get_input "Evaluation steps" "100" EVAL_STEPS
  get_input "Logging steps" "10" LOGGING_STEPS
  
- # Step 5: Trackio Space configuration
- print_step "Step 5: Trackio Space Configuration"
- echo "======================================"
-
- get_input "Trackio Space name" "trackio-monitoring-$(date +%Y%m%d)" TRACKIO_SPACE_NAME
- TRACKIO_URL="https://huggingface.co/spaces/$HF_USERNAME/$TRACKIO_SPACE_NAME"
+ # Step 5: Trackio Space configuration (skip when local-only)
+ if [ "$MONITORING_MODE" != "none" ]; then
+     print_step "Step 5: Trackio Space Configuration"
+     echo "======================================"
+     get_input "Trackio Space name" "trackio-monitoring-$(date +%Y%m%d)" TRACKIO_SPACE_NAME
  
  # Step 6: Confirm configuration
  print_step "Step 6: Configuration Summary"
@@ -1029,6 +1096,7 @@ echo " Model Repo: $REPO_NAME (auto-generated)"
  echo " Author: $AUTHOR_NAME"
  echo " Trackio Space: $TRACKIO_URL"
  echo " HF Dataset: $TRACKIO_DATASET_REPO"
  echo ""
  
  read -p "Proceed with this configuration? (y/N): " confirm
@@ -1153,57 +1221,62 @@ get_input "Author name for model card" "$HF_USERNAME" AUTHOR_NAME
  print_info "Model description will be used in the model card and repository."
  get_input "Model description" "$DEFAULT_MODEL_DESCRIPTION" MODEL_DESCRIPTION
  
- # Step 9: Deploy Trackio Space (automated)
- print_step "Step 9: Deploying Trackio Space"
- echo "==================================="
-
- cd scripts/trackio_tonic
-
- print_info "Deploying Trackio Space ..."
- print_info "Space name: $TRACKIO_SPACE_NAME"
- print_info "Username will be auto-detected from token"
- print_info "Secrets will be set automatically via API"
-
- # Ensure environment variables are available for the script
- export HF_TOKEN="$HF_TOKEN"
- export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
- export HF_USERNAME="$HF_USERNAME"
-
- # Run deployment script with automated features
- python deploy_trackio_space.py "$TRACKIO_SPACE_NAME" "$HF_TOKEN" "$GIT_EMAIL" "$HF_USERNAME" "$TRACKIO_DATASET_REPO"
-
- print_status "Trackio Space deployed: $TRACKIO_URL"
-
- # Step 10: Setup HF Dataset (automated)
- print_step "Step 10: Setting up HF Dataset"
- echo "=================================="
-
- cd ../dataset_tonic
- print_info "Setting up HF Dataset with automated features..."
- print_info "Username will be auto-detected from token"
- print_info "Dataset repository: $TRACKIO_DATASET_REPO"
-
- # Ensure environment variables are available for the script
- export HF_TOKEN="$HF_TOKEN"
- export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
- export HF_USERNAME="$HF_USERNAME"
-
- python setup_hf_dataset.py "$HF_TOKEN"
-
- # Step 11: Configure Trackio (automated)
- print_step "Step 11: Configuring Trackio"
- echo "================================="
-
- cd ../trackio_tonic
- print_info "Configuring Trackio ..."
- print_info "Username will be auto-detected from token"
  
- # Ensure environment variables are available for the script
- export HF_TOKEN="$HF_TOKEN"
- export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
- export HF_USERNAME="$HF_USERNAME"
  
- python configure_trackio.py
  
  # Step 12: Training Configuration
  print_step "Step 12: Training Configuration"
@@ -1256,11 +1329,12 @@ print_info "Trackio: $TRACKIO_URL"
  # Ensure environment variables are available for training
  export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
  export HF_READ_TOKEN="$HF_READ_TOKEN"
- export HF_TOKEN="$HF_TOKEN"
- export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
  export HF_USERNAME="$HF_USERNAME"
  export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
  export OUTPUT_DIR="$OUTPUT_DIR"
  
  # Run the appropriate training script based on model type
  if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
@@ -1334,32 +1408,31 @@ else
      --trainer-type "$TRAINER_TYPE"
  fi
  
- # Step 16.5: Switch Trackio Space to Read Token (Security)
- print_step "Step 16.5: Switching to Read Token for Security"
- echo "===================================================="
-
- print_info "Switching Trackio Space HF_TOKEN from write token to read token for security..."
- print_info "This ensures the space can only read datasets, not write to repositories"
-
- # Ensure environment variables are available for token switch
- export HF_TOKEN="$HF_WRITE_TOKEN" # Use write token to update space
- export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
- export HF_USERNAME="$HF_USERNAME"
-
- # Switch HF_TOKEN in Trackio Space from write to read token
- cd scripts/trackio_tonic
- python switch_to_read_token.py "$HF_USERNAME/$TRACKIO_SPACE_NAME" "$HF_READ_TOKEN" "$HF_WRITE_TOKEN"
-
- if [ $? -eq 0 ]; then
-     print_status " Successfully switched Trackio Space HF_TOKEN to read token"
-     print_info "🔒 Space now uses read-only permissions for security"
  else
-     print_warning "⚠️ Failed to switch to read token, but continuing with pipeline"
-     print_info "You can manually switch the token in your Space settings later"
  fi
  
- cd ../..
-
  # Step 17: Deploy Demo Space
  print_step "Step 17: Deploying Demo Space"
  echo "=================================="
@@ -1387,7 +1460,8 @@ export HF_USERNAME="$HF_USERNAME"
      --hf-username "$HF_USERNAME" \
      --model-id "$DEMO_MODEL_ID" \
      --subfolder "$DEMO_SUBFOLDER" \
-     --space-name "${REPO_SHORT}-demo"
  
  if [ $? -eq 0 ]; then
      DEMO_SPACE_URL="https://huggingface.co/spaces/$HF_USERNAME/${REPO_SHORT}-demo"
1070
+ TRACKIO_URL="https://huggingface.co/spaces/$HF_USERNAME/$TRACKIO_SPACE_NAME"
1071
+ else
1072
+ TRACKIO_SPACE_NAME=""
1073
+ TRACKIO_URL=""
1074
+ fi
1075
 
1076
  # Step 6: Confirm configuration
1077
  print_step "Step 6: Configuration Summary"
 
1096
  echo " Author: $AUTHOR_NAME"
1097
  echo " Trackio Space: $TRACKIO_URL"
1098
  echo " HF Dataset: $TRACKIO_DATASET_REPO"
1099
+ echo " Monitoring Mode: $MONITORING_MODE"
1100
  echo ""
1101
 
1102
  read -p "Proceed with this configuration? (y/N): " confirm
 
1221
  print_info "Model description will be used in the model card and repository."
1222
  get_input "Model description" "$DEFAULT_MODEL_DESCRIPTION" MODEL_DESCRIPTION
1223
 
1224
+ # Step 9: Deploy Trackio Space (automated, skipped for local-only)
1225
+ if [ "$MONITORING_MODE" != "none" ]; then
1226
+ print_step "Step 9: Deploying Trackio Space"
1227
+ echo "==================================="
1228
+ cd scripts/trackio_tonic
1229
+ print_info "Deploying Trackio Space ..."
1230
+ print_info "Space name: $TRACKIO_SPACE_NAME"
1231
+ print_info "Username will be auto-detected from token"
1232
+ if [ "$MONITORING_MODE" = "dataset" ]; then
1233
+ print_info "Deploying with READ token (Space will NOT write to datasets)"
1234
+ else
1235
+ print_info "Deploying with WRITE token (Space will write to datasets during training)"
1236
+ fi
1237
+ # Ensure environment variables are available for the script
1238
+ export HF_TOKEN="$SPACE_DEPLOY_TOKEN"
1239
+ export HUGGING_FACE_HUB_TOKEN="$SPACE_DEPLOY_TOKEN"
1240
+ export HF_USERNAME="$HF_USERNAME"
1241
+ # Run deployment script with automated features (pass deploy token)
1242
+ python deploy_trackio_space.py "$TRACKIO_SPACE_NAME" "$SPACE_DEPLOY_TOKEN" "$GIT_EMAIL" "$HF_USERNAME" "$TRACKIO_DATASET_REPO"
1243
+ print_status "Trackio Space deployed: $TRACKIO_URL"
1244
+ else
1245
+ print_info "Skipping Trackio Space deployment (monitoring_mode=$MONITORING_MODE)"
1246
+ fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1247
 
1248
+ if [ "$MONITORING_MODE" != "none" ]; then
1249
+ # Step 10: Setup HF Dataset (automated) — required unless local-only
1250
+ print_step "Step 10: Setting up HF Dataset"
1251
+ echo "=================================="
1252
+ cd ../dataset_tonic
1253
+ print_info "Setting up HF Dataset with automated features..."
1254
+ print_info "Username will be auto-detected from token"
1255
+ print_info "Dataset repository: $TRACKIO_DATASET_REPO"
1256
+ # Ensure environment variables are available for the script
1257
+ export HF_TOKEN="$HF_WRITE_TOKEN"
1258
+ export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
1259
+ export HF_USERNAME="$HF_USERNAME"
1260
+ python setup_hf_dataset.py "$HF_TOKEN"
1261
+ else
1262
+ print_info "Skipping HF Dataset setup (monitoring_mode=$MONITORING_MODE)"
1263
+ fi
1264
 
1265
+ # Step 11: Configure Trackio (automated) — skipped for local-only
1266
+ if [ "$MONITORING_MODE" != "none" ]; then
1267
+ print_step "Step 11: Configuring Trackio"
1268
+ echo "================================="
1269
+ cd ../trackio_tonic
1270
+ print_info "Configuring Trackio ..."
1271
+ print_info "Username will be auto-detected from token"
1272
+ # Ensure environment variables are available for the script
1273
+ export HF_TOKEN="$SPACE_DEPLOY_TOKEN"
1274
+ export HUGGING_FACE_HUB_TOKEN="$SPACE_DEPLOY_TOKEN"
1275
+ export HF_USERNAME="$HF_USERNAME"
1276
+ python configure_trackio.py
1277
+ else
1278
+ print_info "Skipping Trackio configuration (monitoring_mode=$MONITORING_MODE)"
1279
+ fi
1280
 
1281
  # Step 12: Training Configuration
1282
  print_step "Step 12: Training Configuration"
 
1329
  # Ensure environment variables are available for training
1330
  export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
1331
  export HF_READ_TOKEN="$HF_READ_TOKEN"
1332
+ export HF_TOKEN="$HF_WRITE_TOKEN"
1333
+ export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
1334
  export HF_USERNAME="$HF_USERNAME"
1335
  export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
1336
  export OUTPUT_DIR="$OUTPUT_DIR"
1337
+ export MONITORING_MODE="$MONITORING_MODE"
1338
 
1339
  # Run the appropriate training script based on model type
1340
  if [[ "$MODEL_NAME" == *"gpt-oss"* ]]; then
 
1408
  --trainer-type "$TRAINER_TYPE"
1409
  fi
1410
 
1411
+ # Step 16.5: Switch Trackio Space to Read Token (Security) — only for trackio/both
1412
+ if [ "$MONITORING_MODE" = "trackio" ] || [ "$MONITORING_MODE" = "both" ]; then
1413
+ print_step "Step 16.5: Switching to Read Token for Security"
1414
+ echo "===================================================="
1415
+ print_info "Switching Trackio Space HF_TOKEN from write token to read token for security..."
1416
+ print_info "This ensures the space can only read datasets, not write to repositories"
1417
+ # Ensure environment variables are available for token switch
1418
+ export HF_TOKEN="$HF_WRITE_TOKEN" # Use write token to update space
1419
+ export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
1420
+ export HF_USERNAME="$HF_USERNAME"
1421
+ # Switch HF_TOKEN in Trackio Space from write to read token
1422
+ cd scripts/trackio_tonic
1423
+ python switch_to_read_token.py "$HF_USERNAME/$TRACKIO_SPACE_NAME" "$HF_READ_TOKEN" "$HF_WRITE_TOKEN"
1424
+ if [ $? -eq 0 ]; then
1425
+ print_status " Successfully switched Trackio Space HF_TOKEN to read token"
1426
+ print_info "🔒 Space now uses read-only permissions for security"
1427
+ else
1428
+ print_warning "⚠️ Failed to switch to read token, but continuing with pipeline"
1429
+ print_info "You can manually switch the token in your Space settings later"
1430
+ fi
1431
+ cd ../..
1432
  else
1433
+ print_info "Skipping token switch (monitoring_mode=$MONITORING_MODE)"
 
1434
  fi
1435
 
 
 
1436
  # Step 17: Deploy Demo Space
1437
  print_step "Step 17: Deploying Demo Space"
1438
  echo "=================================="
 
1460
  --hf-username "$HF_USERNAME" \
1461
  --model-id "$DEMO_MODEL_ID" \
1462
  --subfolder "$DEMO_SUBFOLDER" \
1463
+ --space-name "${REPO_SHORT}-demo" \
1464
+ --config-file "$CONFIG_FILE"
1465
 
1466
  if [ $? -eq 0 ]; then
1467
  DEMO_SPACE_URL="https://huggingface.co/spaces/$HF_USERNAME/${REPO_SHORT}-demo"
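
A note on the literal-serialization step above: embedding raw user text directly in the generated Python config would break on quotes and newlines, so each field is round-tripped through `json.dumps`, which emits an escape-safe double-quoted string that is also a valid Python string literal. A minimal standalone sketch of the same idea (input value illustrative):

```python
# Sketch of the serialization used by the launcher above (illustrative input).
# json.dumps yields an escape-safe, double-quoted string that is also a valid
# Python literal, so it can be pasted verbatim into the generated config file.
import json

system_message = 'You are "GPT-Tonic".\nBe concise.'
literal = 'None' if not system_message else json.dumps(system_message)
print(literal)  # -> "You are \"GPT-Tonic\".\nBe concise."
# In the generated config this becomes:
#     system_message="You are \"GPT-Tonic\".\nBe concise.",
```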
requirements/requirements_core.txt CHANGED
@@ -22,4 +22,6 @@ pynvml>=12.0.0
  # GPT-OSS specific dependencies
  # Note: GPT-OSS requires specific versions for optimal performance
  # These are compatible with the tutorial requirements
- bitsandbytes>=0.41.0 # For 4-bit quantization
+ bitsandbytes>=0.41.0 # For 4-bit quantization
+ triton >= 3.4.0
+ kernels
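
These pins back the quantized GPT-OSS path: bitsandbytes for 4-bit loading, and (per the GPT-OSS tutorial requirements this file references) Triton 3.4+ together with the `kernels` package for the model's quantized kernels. A quick, dependency-light sanity check you could run before training:

```python
# Check that the GPT-OSS-related dependencies resolve in the current env.
import importlib.util

for pkg in ("bitsandbytes", "triton", "kernels"):
    found = importlib.util.find_spec(pkg) is not None
    print(f"{pkg}: {'ok' if found else 'MISSING'}")
```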
scripts/deploy_demo_space.py CHANGED
@@ -39,7 +39,7 @@ class DemoSpaceDeployer:

      def __init__(self, hf_token: str, hf_username: str, model_id: str,
                   subfolder: str = "int4", space_name: Optional[str] = None,
-                  demo_type: Optional[str] = None):
+                  demo_type: Optional[str] = None, config_file: Optional[str] = None):
          self.hf_token = hf_token
          self.hf_username = hf_username
          # Allow passing just a repo name without username and auto-prefix
@@ -48,6 +48,13 @@ class DemoSpaceDeployer:
          self.space_name = space_name or f"{self.model_id.split('/')[-1]}-demo"
          self.space_id = f"{hf_username}/{self.space_name}"
          self.space_url = f"https://huggingface.co/spaces/{self.space_id}"
+         self.config_file = config_file
+
+         # Config-derived context
+         self.system_message: Optional[str] = None
+         self.developer_message: Optional[str] = None
+         self.model_identity: Optional[str] = None
+         self.reasoning_effort: Optional[str] = None

          # Determine demo type from model_id if not provided
          if demo_type is None:
@@ -64,6 +71,45 @@ class DemoSpaceDeployer:
          else:
              self.api = None
              logger.warning("huggingface_hub not available, using CLI fallback")
+
+         # Load optional config-specified messages
+         try:
+             self._load_config_messages()
+         except Exception as e:
+             logger.warning(f"Could not load config messages: {e}")
+
+     def _load_config_messages(self) -> None:
+         """Load system/developer/model_identity from a training config file if provided."""
+         if not self.config_file:
+             return
+         cfg_path = Path(self.config_file)
+         if not cfg_path.exists():
+             logger.warning(f"Config file not found: {cfg_path}")
+             return
+
+         # Ensure project root and config dir are importable for relative imports inside config
+         project_root = Path(__file__).parent.parent
+         if str(project_root) not in sys.path:
+             sys.path.insert(0, str(project_root))
+         cfg_dir = project_root / "config"
+         if str(cfg_dir) not in sys.path:
+             sys.path.insert(0, str(cfg_dir))
+
+         import importlib.util
+         spec = importlib.util.spec_from_file_location("config_module", str(cfg_path))
+         if not spec or not spec.loader:
+             return
+         module = importlib.util.module_from_spec(spec)
+         spec.loader.exec_module(module)  # type: ignore
+         cfg = getattr(module, "config", None)
+         if cfg is None:
+             return
+         self.system_message = getattr(cfg, "system_message", None)
+         self.developer_message = getattr(cfg, "developer_message", None)
+         chat_kwargs = getattr(cfg, "chat_template_kwargs", None)
+         if isinstance(chat_kwargs, dict):
+             self.model_identity = chat_kwargs.get("model_identity")
+             self.reasoning_effort = chat_kwargs.get("reasoning_effort")

      def _detect_demo_type(self, model_id: str) -> str:
          """Detect the appropriate demo type based on model ID"""
@@ -89,25 +135,34 @@ class DemoSpaceDeployer:
          if self.demo_type == "gpt":
              # For GPT-OSS models, we need more sophisticated environment setup
              model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
-
+             import json as _json
              env_setup = f"""
  # Environment variables for GPT-OSS model configuration
  import os
- os.environ['HF_MODEL_ID'] = '{self.model_id}'
- os.environ['LORA_MODEL_ID'] = '{self.model_id}'
+ os.environ['HF_MODEL_ID'] = {_json.dumps(self.model_id)}
+ os.environ['LORA_MODEL_ID'] = {_json.dumps(self.model_id)}
  os.environ['BASE_MODEL_ID'] = 'openai/gpt-oss-20b'
- os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
- os.environ['MODEL_NAME'] = '{model_name}'
+ os.environ['MODEL_SUBFOLDER'] = {_json.dumps(self.subfolder if self.subfolder else "")}
+ os.environ['MODEL_NAME'] = {_json.dumps(model_name)}
+ os.environ['MODEL_IDENTITY'] = {_json.dumps(self.model_identity or "")}
+ os.environ['SYSTEM_MESSAGE'] = {_json.dumps(self.system_message or (self.model_identity or ""))}
+ os.environ['DEVELOPER_MESSAGE'] = {_json.dumps(self.developer_message or "")}
+ os.environ['REASONING_EFFORT'] = {_json.dumps((self.reasoning_effort or "medium"))}

  """
          else:
              # For SmolLM models, use simpler setup
+             import json as _json
              env_setup = f"""
  # Environment variables for model configuration
  import os
- os.environ['HF_MODEL_ID'] = '{self.model_id}'
- os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
- os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'
+ os.environ['HF_MODEL_ID'] = {_json.dumps(self.model_id)}
+ os.environ['MODEL_SUBFOLDER'] = {_json.dumps(self.subfolder if self.subfolder else "")}
+ os.environ['MODEL_NAME'] = {_json.dumps(self.model_id.split("/")[-1])}
+ os.environ['MODEL_IDENTITY'] = {_json.dumps(self.model_identity or "")}
+ os.environ['SYSTEM_MESSAGE'] = {_json.dumps(self.system_message or (self.model_identity or ""))}
+ os.environ['DEVELOPER_MESSAGE'] = {_json.dumps(self.developer_message or "")}
+ os.environ['REASONING_EFFORT'] = {_json.dumps((self.reasoning_effort or "medium"))}

  """
          return env_setup
@@ -162,6 +217,40 @@ os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'
                  description="Display name for the model"
              )
              logger.info(f"✅ Successfully set MODEL_NAME variable: {model_name}")
+
+             # Optional context variables
+             if self.model_identity:
+                 self.api.add_space_variable(
+                     repo_id=self.space_id,
+                     key="MODEL_IDENTITY",
+                     value=self.model_identity,
+                     description="Default model identity/system persona"
+                 )
+                 logger.info("✅ Set MODEL_IDENTITY variable")
+             if self.system_message or self.model_identity:
+                 self.api.add_space_variable(
+                     repo_id=self.space_id,
+                     key="SYSTEM_MESSAGE",
+                     value=self.system_message or self.model_identity or "",
+                     description="Default system message"
+                 )
+                 logger.info("✅ Set SYSTEM_MESSAGE variable")
+             if self.developer_message:
+                 self.api.add_space_variable(
+                     repo_id=self.space_id,
+                     key="DEVELOPER_MESSAGE",
+                     value=self.developer_message,
+                     description="Default developer message"
+                 )
+                 logger.info("✅ Set DEVELOPER_MESSAGE variable")
+             if self.reasoning_effort:
+                 self.api.add_space_variable(
+                     repo_id=self.space_id,
+                     key="REASONING_EFFORT",
+                     value=self.reasoning_effort,
+                     description="Default reasoning effort (low|medium|high)"
+                 )
+                 logger.info("✅ Set REASONING_EFFORT variable")

          except Exception as e:
              logger.error(f"❌ Failed to set model variables: {e}")
@@ -314,28 +403,51 @@ os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'

          logger.info("✅ Updated app.py with model configuration")

-         # Create README.md for the space
-         readme_content = f"""# Demo: {self.model_id}
-
- This is an interactive demo for the fine-tuned model {self.model_id}.
-
- ## Features
- - Interactive chat interface
- - Customizable system prompts
- - Advanced generation parameters
- - Thinking mode support
-
- ## Model Information
- - **Model ID**: {self.model_id}
- - **Subfolder**: {self.subfolder if self.subfolder and self.subfolder.strip() else "main"}
- - **Deployed by**: {self.hf_username}
-
- ## Usage
- Simply start chatting with the model using the interface below!
-
- ---
- *This demo was automatically deployed by the SmolLM3 Fine-tuning Pipeline*
- """
+         # YAML front matter required by Hugging Face Spaces
+         yaml_front_matter = (
+             f"---\n"
+             f"title: {'GPT-OSS Demo' if self.demo_type == 'gpt' else 'SmolLM3 Demo'}\n"
+             f"emoji: {'🌟' if self.demo_type == 'gpt' else '💃🏻'}\n"
+             f"colorFrom: {'blue' if self.demo_type == 'gpt' else 'green'}\n"
+             f"colorTo: {'pink' if self.demo_type == 'gpt' else 'purple'}\n"
+             f"sdk: gradio\n"
+             f"sdk_version: 5.40.0\n"
+             f"app_file: app.py\n"
+             f"pinned: false\n"
+             f"short_description: Interactive demo for {self.model_id}\n"
+             + ("license: mit\n" if self.demo_type != 'gpt' else "") +
+             f"---\n\n"
+         )
+
+         # Create README.md for the space (include configuration details)
+         readme_content = (
+             yaml_front_matter
+             + f"# Demo: {self.model_id}\n\n"
+             + f"This is an interactive demo for the fine-tuned model {self.model_id}.\n\n"
+             + "## Features\n"
+               "- Interactive chat interface\n"
+               "- Customizable system & developer prompts\n"
+               "- Advanced generation parameters\n"
+               "- Thinking mode support\n\n"
+             + "## Model Information\n"
+               f"- **Model ID**: {self.model_id}\n"
+               f"- **Subfolder**: {self.subfolder if self.subfolder and self.subfolder.strip() else 'main'}\n"
+               f"- **Deployed by**: {self.hf_username}\n"
+             + ("- **Base Model**: openai/gpt-oss-20b\n" if self.demo_type == 'gpt' else "")
+             + "\n"
+             + "## Configuration\n"
+               "- **Model Identity**:\n\n"
+               f"```\n{self.model_identity or 'Not set'}\n```\n\n"
+               "- **System Message** (default):\n\n"
+               f"```\n{(self.system_message or self.model_identity) or 'Not set'}\n```\n\n"
+               "- **Developer Message** (default):\n\n"
+               f"```\n{self.developer_message or 'Not set'}\n```\n\n"
+               "These defaults come from the selected training configuration and can be adjusted in the UI when you run the demo.\n\n"
+             + "## Usage\n"
+               "Simply start chatting with the model using the interface below!\n\n"
+             + "---\n"
+               "*This demo was automatically deployed by the SmolFactory Fine-tuning Pipeline*\n"
+         )

          with open(Path(temp_dir) / "README.md", 'w', encoding='utf-8') as f:
              f.write(readme_content)
@@ -465,6 +577,12 @@ Simply start chatting with the model using the interface below!
          logger.info(f"   LORA_MODEL_ID={self.model_id}")
          logger.info(f"   BASE_MODEL_ID=openai/gpt-oss-20b")
          logger.info(f"   MODEL_NAME={model_name}")
+         if self.model_identity:
+             logger.info(f"   MODEL_IDENTITY={self.model_identity}")
+         if self.system_message:
+             logger.info(f"   SYSTEM_MESSAGE={self.system_message}")
+         if self.developer_message:
+             logger.info(f"   DEVELOPER_MESSAGE={self.developer_message}")

          logger.info(f"\n🔧 To set secrets in your Space:")
          logger.info(f"1. Go to your Space settings: {self.space_url}/settings")
@@ -574,6 +692,7 @@ def main():
      parser.add_argument("--subfolder", default="int4", help="Model subfolder (default: int4)")
      parser.add_argument("--space-name", help="Custom space name (optional)")
      parser.add_argument("--demo-type", choices=["smol", "gpt"], help="Demo type: 'smol' for SmolLM, 'gpt' for GPT-OSS (auto-detected if not specified)")
+     parser.add_argument("--config-file", help="Path to the training config file to import context (system/developer/model_identity)")

      args = parser.parse_args()

@@ -583,7 +702,8 @@ def main():
          model_id=args.model_id,
          subfolder=args.subfolder,
          space_name=args.space_name,
-         demo_type=args.demo_type
+         demo_type=args.demo_type,
+         config_file=args.config_file,
      )

      success = deployer.deploy()
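
The `--config-file` flow above hinges on one convention: the training config module defines a module-level `config` object, which `_load_config_messages` imports dynamically. A condensed sketch of that import pattern (the path below is illustrative):

```python
# Condensed sketch of the importlib pattern used by _load_config_messages.
# Assumes the config file ends with a module-level `config = ...` assignment.
import importlib.util
from pathlib import Path

def load_config_object(path: str):
    spec = importlib.util.spec_from_file_location("config_module", str(Path(path)))
    if spec is None or spec.loader is None:
        return None
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # executes the file; it must define `config`
    return getattr(module, "config", None)

cfg = load_config_object("config/train_gpt_oss_custom.py")  # illustrative path
if cfg is not None:
    print(getattr(cfg, "system_message", None))
    print((getattr(cfg, "chat_template_kwargs", None) or {}).get("model_identity"))
```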
scripts/training/train_gpt_oss.py CHANGED
@@ -980,7 +980,8 @@ def train_gpt_oss(config_path, experiment_name, output_dir, trackio_url, trainer
          log_metrics=True,
          log_config=True,
          hf_token=os.environ.get('HF_TOKEN'),
-         dataset_repo=os.environ.get('TRACKIO_DATASET_REPO')
+         dataset_repo=os.environ.get('TRACKIO_DATASET_REPO'),
+         monitoring_mode=os.environ.get('MONITORING_MODE', 'both'),
      )
      # Log configuration once
      try:
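
For reference, the shape of the monitor call site after this change, with the mode resolved from the environment variable exported by the launcher; everything outside the keyword arguments is illustrative:

```python
# Sketch of the updated monitor construction (experiment name illustrative).
import os
from src.monitoring import SmolLM3Monitor  # adjust the import to your path

monitor = SmolLM3Monitor(
    experiment_name="gpt-oss-run",
    log_metrics=True,
    log_config=True,
    hf_token=os.environ.get("HF_TOKEN"),
    dataset_repo=os.environ.get("TRACKIO_DATASET_REPO"),
    monitoring_mode=os.environ.get("MONITORING_MODE", "both"),
)
```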
src/monitoring.py CHANGED
@@ -31,7 +31,14 @@ except ImportError:
  logger = logging.getLogger(__name__)

  class SmolLM3Monitor:
-     """Monitoring and tracking for SmolLM3 fine-tuning experiments with HF Datasets support"""
+     """Monitoring and tracking for SmolLM3 fine-tuning experiments with HF Datasets support
+
+     Monitoring modes:
+     - "both": Log to Trackio Space and HF Datasets (plus local JSON files)
+     - "dataset": Log only to HF Datasets (plus local JSON files). Trackio Space is not written to
+     - "trackio": Log only to Trackio Space (plus local JSON files). HF Datasets writes are disabled
+     - "none": Local-only logging; no remote writes
+     """

      def __init__(
          self,
@@ -43,10 +50,25 @@ class SmolLM3Monitor:
          log_metrics: bool = True,
          log_config: bool = True,
          hf_token: Optional[str] = None,
-         dataset_repo: Optional[str] = None
+         dataset_repo: Optional[str] = None,
+         monitoring_mode: Optional[str] = None,
      ):
          self.experiment_name = experiment_name
-         self.enable_tracking = enable_tracking and TRACKIO_AVAILABLE
+         # Determine monitoring mode (env override supported)
+         mode_env = os.environ.get('MONITORING_MODE')
+         selected_mode = (monitoring_mode or mode_env or 'both').strip().lower()
+         if selected_mode not in ('both', 'dataset', 'trackio', 'none'):
+             selected_mode = 'both'
+         self.monitoring_mode = selected_mode
+
+         # Track which backends are active
+         self.use_trackio = (selected_mode in ('both', 'trackio')) and enable_tracking and TRACKIO_AVAILABLE
+         # HF dataset only if mode requires it and token is available (repo validated later)
+         self.hf_token = hf_token or os.environ.get('HF_TOKEN')
+         self.use_dataset = (selected_mode in ('both', 'dataset')) and bool(self.hf_token)
+
+         # For TRL compatibility, "enable_tracking" reflects Trackio availability
+         self.enable_tracking = self.use_trackio
          self.log_artifacts = log_artifacts
          self.log_metrics_enabled = log_metrics  # Rename to avoid conflict
          self.log_config_enabled = log_config  # Rename to avoid conflict
@@ -57,7 +79,6 @@ class SmolLM3Monitor:
          self.flush_interval = 10

          # HF Datasets configuration
-         self.hf_token = hf_token or os.environ.get('HF_TOKEN')
          self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')

          # Ensure dataset repository is properly set
@@ -73,19 +94,20 @@ class SmolLM3Monitor:

          # Initialize Trackio API client
          self.trackio_client = None
-         if self.enable_tracking:
+         if self.use_trackio:
              self._setup_trackio(trackio_url, trackio_token)

          # Initialize HF Datasets client
          self.hf_dataset_client = None
-         if self.hf_token:
+         self.dataset_manager = None
+         if self.use_dataset:
              self._setup_hf_datasets()

          logger.info("Initialized monitoring for experiment: %s", experiment_name)
          logger.info("Dataset repository: %s", self.dataset_repo)

          # Create experiment in Trackio if tracking is enabled
-         if self.enable_tracking and self.trackio_client:
+         if self.use_trackio and self.trackio_client:
              self._create_experiment()

      def _setup_hf_datasets(self):
@@ -136,6 +158,7 @@ class SmolLM3Monitor:
              if not space_id:
                  logger.warning("No Trackio Space configured via param or env (TRACKIO_URL/TRACKIO_SPACE_ID). Disabling Trackio tracking.")
                  self.enable_tracking = False
+                 self.use_trackio = False
                  return

              # Get HF token for Space resolution
@@ -151,6 +174,7 @@ class SmolLM3Monitor:
                      logger.warning(f"Trackio Space not accessible: {connection_test['error']}")
                      logger.info("Continuing with HF Datasets only")
                      self.enable_tracking = False
+                     self.use_trackio = False
                      return
                  logger.info("✅ Trackio Space connection successful")

@@ -158,11 +182,13 @@ class SmolLM3Monitor:
              logger.warning(f"Trackio Space not accessible: {e}")
              logger.info("Continuing with HF Datasets only")
              self.enable_tracking = False
+             self.use_trackio = False
              return

          except Exception as e:
              logger.error(f"Failed to setup Trackio: {e}")
              self.enable_tracking = False
+             self.use_trackio = False

      def _create_experiment(self):
          """Create experiment in Trackio and set experiment_id"""
@@ -218,6 +244,11 @@ class SmolLM3Monitor:
          - Artifacts/logs: union with de-dup, preserve order
          - Top-level scalar fields (e.g., status, name, description, created_at) update only when provided
          """
+         # Respect monitoring mode
+         if not self.use_dataset:
+             logger.debug("Dataset persistence disabled by monitoring_mode=%s", self.monitoring_mode)
+             return False
+
          if not self.dataset_manager:
              logger.warning("⚠️ Dataset manager not available")
              return False
@@ -401,7 +432,7 @@ class SmolLM3Monitor:

          try:
              # Log configuration as parameters
-             if self.enable_tracking and self.trackio_client:
+             if self.use_trackio and self.trackio_client:
                  try:
                      result = self.trackio_client.log_parameters(
                          experiment_id=self.experiment_id,
@@ -416,7 +447,8 @@ class SmolLM3Monitor:
                      logger.warning("Trackio configuration logging failed: %s", e)

              # Save to HF Dataset
-             self._save_to_hf_dataset(config)
+             if self.use_dataset:
+                 self._save_to_hf_dataset(config)

              # Also save config locally
              config_path = "config_{}_{}.json".format(
@@ -467,7 +499,7 @@ class SmolLM3Monitor:
              metrics['step'] = step

          # Log to Trackio (if available)
-         if self.enable_tracking and self.trackio_client:
+         if self.use_trackio and self.trackio_client:
              try:
                  result = self.trackio_client.log_metrics(
                      experiment_id=self.experiment_id,
@@ -486,18 +518,19 @@ class SmolLM3Monitor:
          self.metrics_history.append(metrics)

          # Save to HF Dataset periodically (configurable)
-         flush_every = max(1, int(getattr(self, 'flush_interval', 10)))
-         # Only append the delta since last flush to minimize risk
-         try:
-             if not hasattr(self, '_last_flushed_index'):
-                 self._last_flushed_index = 0
-             if len(self.metrics_history) - self._last_flushed_index >= flush_every:
-                 new_slice = self.metrics_history[self._last_flushed_index:]
-                 # Persist only the tail slice; merge code will union-append
-                 self._save_to_hf_dataset({'metrics': new_slice})
-                 self._last_flushed_index = len(self.metrics_history)
-         except Exception:
-             pass
+         if self.use_dataset:
+             flush_every = max(1, int(getattr(self, 'flush_interval', 10)))
+             # Only append the delta since last flush to minimize risk
+             try:
+                 if not hasattr(self, '_last_flushed_index'):
+                     self._last_flushed_index = 0
+                 if len(self.metrics_history) - self._last_flushed_index >= flush_every:
+                     new_slice = self.metrics_history[self._last_flushed_index:]
+                     # Persist only the tail slice; merge code will union-append
+                     self._save_to_hf_dataset({'metrics': new_slice})
+                     self._last_flushed_index = len(self.metrics_history)
+             except Exception:
+                 pass

          logger.debug("Metrics logged: %s", metrics)

@@ -518,7 +551,7 @@ class SmolLM3Monitor:
              "checkpoint_size": os.path.getsize(checkpoint_path) if os.path.exists(checkpoint_path) else 0
          }

-         if self.enable_tracking and self.trackio_client:
+         if self.use_trackio and self.trackio_client:
              result = self.trackio_client.log_parameters(
                  experiment_id=self.experiment_id,
                  parameters=checkpoint_info
@@ -531,10 +564,11 @@ class SmolLM3Monitor:

          self.artifacts.append(checkpoint_path)
          # Also preserve checkpoint info in HF dataset
-         try:
-             self._save_to_hf_dataset({'artifacts': [checkpoint_path], **checkpoint_info})
-         except Exception:
-             pass
+         if self.use_dataset:
+             try:
+                 self._save_to_hf_dataset({'artifacts': [checkpoint_path], **checkpoint_info})
+             except Exception:
+                 pass
          logger.info("Checkpoint logged: %s", checkpoint_path)

      except Exception as e:
@@ -597,7 +631,7 @@ class SmolLM3Monitor:
              summary['experiment_duration_hours'] = duration / 3600

          # Log final summary to Trackio
-         if self.enable_tracking and self.trackio_client:
+         if self.use_trackio and self.trackio_client:
              result = self.trackio_client.log_parameters(
                  experiment_id=self.experiment_id,
                  parameters=summary
@@ -609,7 +643,8 @@ class SmolLM3Monitor:
                  logger.error("Failed to log training summary to Trackio: %s", result)

          # Save to HF Dataset
-         self._save_to_hf_dataset(summary)
+         if self.use_dataset:
+             self._save_to_hf_dataset(summary)

          # Save summary locally
          summary_path = "training_summary_{}_{}.json".format(
@@ -731,7 +766,7 @@ class SmolLM3Monitor:

      def get_experiment_url(self) -> Optional[str]:
          """Get the URL to view the experiment in Trackio"""
-         if self.trackio_client and self.experiment_id:
+         if self.use_trackio and self.trackio_client and self.experiment_id:
              return "{}?tab=view_experiments".format(self.trackio_client.space_url)
          return None

@@ -744,7 +779,7 @@ class SmolLM3Monitor:
          """
          logger.info(f"🔚 Closing monitoring session with status: {final_status}")

-         if self.enable_tracking and self.trackio_client:
+         if self.use_trackio and self.trackio_client:
              try:
                  # Mark experiment as completed in Trackio
                  result = self.trackio_client.update_experiment_status(
@@ -759,7 +794,7 @@ class SmolLM3Monitor:
                  logger.error("❌ Failed to close Trackio monitoring session: %s", e)

          # Final save to HF Dataset with proper status update
-         if self.dataset_manager:
+         if self.use_dataset and self.dataset_manager:
              try:
                  # Update experiment with final status without clobbering metrics
                  final_experiment_data = {
@@ -798,5 +833,6 @@ def create_monitor_from_config(config, experiment_name: Optional[str] = None) ->
          log_metrics=getattr(config, 'log_metrics', True),
          log_config=getattr(config, 'log_config', True),
          hf_token=getattr(config, 'hf_token', None),
-         dataset_repo=getattr(config, 'dataset_repo', None)
+         dataset_repo=getattr(config, 'dataset_repo', None),
+         monitoring_mode=getattr(config, 'monitoring_mode', os.environ.get('MONITORING_MODE', 'both'))
      )
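
The effective backend selection above can be summarized as a pure function; this standalone sketch mirrors the resolution logic in `__init__`:

```python
# Standalone sketch of the monitoring-mode resolution implemented above.
import os

def resolve_backends(monitoring_mode=None, enable_tracking=True,
                     trackio_available=True, hf_token=None):
    mode = (monitoring_mode or os.environ.get("MONITORING_MODE") or "both").strip().lower()
    if mode not in ("both", "dataset", "trackio", "none"):
        mode = "both"  # unknown values fall back to logging everywhere
    use_trackio = mode in ("both", "trackio") and enable_tracking and trackio_available
    use_dataset = mode in ("both", "dataset") and bool(hf_token or os.environ.get("HF_TOKEN"))
    return mode, use_trackio, use_dataset

print(resolve_backends("dataset", hf_token="hf_xxx"))  # ('dataset', False, True)
print(resolve_backends("none"))                        # ('none', False, False)
```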
src/trackio.py CHANGED
@@ -49,6 +49,11 @@ def init(
      trackio_token = kwargs.get('trackio_token') or os.environ.get('TRACKIO_TOKEN')
      hf_token = kwargs.get('hf_token') or os.environ.get('HF_TOKEN')
      dataset_repo = kwargs.get('dataset_repo') or os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
+     monitoring_mode = (
+         kwargs.get('monitoring_mode')
+         or os.environ.get('MONITORING_MODE')
+         or 'both'
+     )

      # Use experiment_name if provided, otherwise use project_name
      exp_name = experiment_name or project_name
@@ -63,7 +68,8 @@ def init(
          log_metrics=True,
          log_config=True,
          hf_token=hf_token,
-         dataset_repo=dataset_repo
+         dataset_repo=dataset_repo,
+         monitoring_mode=monitoring_mode,
      )
      # The monitor constructor creates the experiment remotely and sets
      # `experiment_id`. Do NOT overwrite it with a locally generated ID.
@@ -229,6 +235,7 @@ class TrackioConfig:
          self.trackio_token = os.environ.get('TRACKIO_TOKEN')
          self.hf_token = os.environ.get('HF_TOKEN')
          self.dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
+         self.monitoring_mode = os.environ.get('MONITORING_MODE', 'both')

      def update(self, config_dict: Dict[str, Any] = None, **kwargs):
          """
src/train.py CHANGED
@@ -154,21 +154,25 @@ def main():

      logger.info(f"Output path: {output_path}")

-     # Initialize monitoring
+     # Initialize monitoring (supports local-only mode)
      monitor = None
-     if config.enable_tracking:
-         try:
+     try:
+         monitoring_mode = getattr(config, 'monitoring_mode', os.environ.get('MONITORING_MODE', 'both')).lower()
+         should_create_monitor = (
+             monitoring_mode in ('both', 'dataset', 'trackio', 'none')
+             and (getattr(config, 'enable_tracking', True) or monitoring_mode in ('dataset', 'none'))
+         )
+         if should_create_monitor:
              monitor = create_monitor_from_config(config, args.experiment_name)
              logger.info(f"✅ Monitoring initialized for experiment: {monitor.experiment_name}")
+             logger.info(f"📊 Monitoring mode: {monitor.monitoring_mode}")
              logger.info(f"📊 Dataset repository: {monitor.dataset_repo}")
-
              # Log configuration
              config_dict = {k: v for k, v in vars(config).items() if not k.startswith('_')}
              monitor.log_configuration(config_dict)
-
-         except Exception as e:
-             logger.error(f"Failed to initialize monitoring: {e}")
-             logger.warning("Continuing without monitoring...")
+     except Exception as e:
+         logger.error(f"Failed to initialize monitoring: {e}")
+         logger.warning("Continuing without monitoring...")

      # Initialize model
      model = SmolLM3Model(
templates/model_card.md CHANGED
@@ -11,7 +11,7 @@ tags:
  - text-generation
  - tonic
  - legml
- - {{#if quantized_models}}quantized{{/if}}
+ {{#if quantized_models}}- quantized{{/if}}
  pipeline_tag: text-generation
  base_model: {{base_model}}
  {{#if dataset_name}}
templates/spaces/demo_gpt/app.py CHANGED
@@ -18,6 +18,12 @@ LORA_MODEL_ID = os.getenv('LORA_MODEL_ID', os.getenv('HF_MODEL_ID', 'Tonic/gpt-o
  MODEL_NAME = os.getenv('MODEL_NAME', 'GPT-OSS Multilingual Reasoner')
  MODEL_SUBFOLDER = os.getenv('MODEL_SUBFOLDER', '')

+ # Optional persona and prompts derived from training config
+ MODEL_IDENTITY = os.getenv('MODEL_IDENTITY', '')
+ DEFAULT_SYSTEM_PROMPT = os.getenv('SYSTEM_MESSAGE', MODEL_IDENTITY or 'You are a helpful assistant. Reasoning: medium')
+ DEFAULT_DEVELOPER_PROMPT = os.getenv('DEVELOPER_MESSAGE', '')
+ DEFAULT_REASONING_EFFORT = os.getenv('REASONING_EFFORT', 'medium')
+
  # If the LORA_MODEL_ID is the same as BASE_MODEL_ID, this is a merged model, not LoRA
  USE_LORA = LORA_MODEL_ID != BASE_MODEL_ID and not LORA_MODEL_ID.startswith(BASE_MODEL_ID)

@@ -130,7 +136,7 @@ def format_analysis_response(text):
      return cleaned

  @spaces.GPU(duration=60)
- def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
+ def generate_response(input_data, chat_history, max_new_tokens, model_identity, system_prompt, developer_prompt, reasoning_effort, temperature, top_p, top_k, repetition_penalty):
      if not input_data.strip():
          yield "Please enter a prompt."
          return
@@ -140,14 +146,37 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
      logging.info(f"[System] {system_prompt} | Temp={temperature} | Max tokens={max_new_tokens}")

      new_message = {"role": "user", "content": input_data}
-     system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
+     # Combine model identity with system prompt for a single system message
+     combined_parts = []
+     if model_identity and model_identity.strip():
+         combined_parts.append(model_identity.strip())
+     if system_prompt and system_prompt.strip():
+         combined_parts.append(system_prompt.strip())
+     if reasoning_effort and isinstance(reasoning_effort, str) and reasoning_effort.strip():
+         # Append explicit reasoning directive
+         combined_parts.append(f"Reasoning: {reasoning_effort.strip()}")
+     combined_system = "\n\n".join(combined_parts).strip()
+     system_message = ([{"role": "system", "content": combined_system}] if combined_system else [])
+     developer_message = [{"role": "developer", "content": developer_prompt}] if developer_prompt else []
      processed_history = format_conversation_history(chat_history)
-     messages = system_message + processed_history + [new_message]
-     prompt = tokenizer.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )
+     messages = system_message + developer_message + processed_history + [new_message]
+     try:
+         prompt = tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+     except Exception:
+         # Fallback: merge developer prompt into system prompt if template doesn't support 'developer' role
+         fallback_sys = combined_system
+         if developer_prompt:
+             fallback_sys = (fallback_sys + ("\n\n[Developer]\n" if fallback_sys else "[Developer]\n") + developer_prompt).strip()
+         fallback_messages = ([{"role": "system", "content": fallback_sys}] if fallback_sys else []) + processed_history + [new_message]
+         prompt = tokenizer.apply_chat_template(
+             fallback_messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )

      # Create streamer for proper streaming
      streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
@@ -211,12 +240,30 @@ demo = gr.ChatInterface(
      fn=generate_response,
      additional_inputs=[
          gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
+         gr.Textbox(
+             label="Model Identity",
+             value=MODEL_IDENTITY,
+             lines=3,
+             placeholder="Optional identity/persona for the model"
+         ),
          gr.Textbox(
              label="System Prompt",
-             value="You are a helpful assistant. Reasoning: medium",
+             value=DEFAULT_SYSTEM_PROMPT,
              lines=4,
              placeholder="Change system prompt"
          ),
+         gr.Textbox(
+             label="Developer Prompt",
+             value=DEFAULT_DEVELOPER_PROMPT,
+             lines=4,
+             placeholder="Optional developer instructions"
+         ),
+         gr.Dropdown(
+             label="Reasoning Effort",
+             choices=["low", "medium", "high"],
+             value=DEFAULT_REASONING_EFFORT,
+             interactive=True,
+         ),
          gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
          gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
          gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
templates/spaces/trackio/app.py CHANGED
@@ -1143,33 +1143,63 @@ def create_metrics_plot(experiment_id: str, metric_name: str = "loss") -> go.Fig
1143
  )
1144
  return fig
1145
 
 
 
 
 
 
 
 
 
 
 
1146
  # Ensure steps are numeric and monotonically increasing to avoid zig-zag lines
1147
  try:
1148
  df = df.copy()
1149
- # If step looks constant or missing, try to derive it from a common field
1150
- if 'step' not in df or df['step'].nunique() <= 1:
1151
- for alt in ['train/global_step', 'global_step', 'train/step']:
1152
- if alt in df.columns and df[alt].notna().any():
1153
- df['step'] = pd.to_numeric(df[alt], errors='coerce')
1154
- break
1155
- # If still missing or constant, fallback to an inferred counter by order of arrival
1156
- if 'step' not in df.columns or df['step'].isna().all() or df['step'].nunique() <= 1:
1157
- df['step'] = range(1, len(df) + 1)
 
 
1158
  else:
1159
- df['step'] = pd.to_numeric(df.get('step', -1), errors='coerce').fillna(-1)
1160
- df.sort_values('step', inplace=True)
 
 
 
 
 
 
 
 
 
 
 
1161
  except Exception:
1162
- pass
1163
- fig = px.line(df, x='step', y=metric_name, title=f'{metric_name} over time')
 
 
 
 
 
1164
  fig.update_layout(
1165
- xaxis_title="Training Step",
1166
  yaxis_title=metric_name.title(),
1167
  hovermode='x unified'
1168
  )
1169
- # Avoid interpolating across missing steps which can create odd visuals
1170
  try:
1171
  for trace in fig.data:
1172
- trace.connectgaps = False
 
 
1173
  except Exception:
1174
  pass
1175
  return fig
@@ -1547,6 +1577,16 @@ def create_combined_metrics_plot(experiment_id: str) -> go.Figure:
1547
  # Define colors for different metrics
1548
  colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'cyan', 'magenta']
1549
 
 
 
 
 
 
 
 
 
 
 
1550
  for i, metric in enumerate(numeric_cols):
1551
  if metric in df.columns and not df[metric].isna().all():
1552
  row = (i // n_cols) + 1
@@ -1556,31 +1596,54 @@ def create_combined_metrics_plot(experiment_id: str) -> go.Figure:
1556
  # Clean steps for each subplot too
1557
  try:
1558
  df_sub = df.copy()
1559
- if 'step' not in df_sub or df_sub['step'].nunique() <= 1:
1560
- for alt in ['train/global_step', 'global_step', 'train/step']:
1561
- if alt in df_sub.columns and df_sub[alt].notna().any():
1562
- df_sub['step'] = pd.to_numeric(df_sub[alt], errors='coerce')
1563
- break
1564
- if 'step' not in df_sub.columns or df_sub['step'].isna().all() or df_sub['step'].nunique() <= 1:
1565
- df_sub['step'] = range(1, len(df_sub) + 1)
 
 
 
1566
  else:
1567
- df_sub['step'] = pd.to_numeric(df_sub.get('step', -1), errors='coerce').fillna(-1)
1568
- df_sub.sort_values('step', inplace=True)
 
 
 
 
 
 
 
 
 
 
 
 
1569
  except Exception:
1570
  df_sub = df
 
 
 
1571
  fig.add_trace(
1572
  go.Scatter(
1573
- x=df_sub['step'].tolist(),
1574
- y=df_sub[metric].tolist(),
1575
  mode='lines+markers',
1576
  name=metric,
1577
  line=dict(width=2, color=color),
1578
  marker=dict(size=4, color=color),
1579
  showlegend=False,
1580
- connectgaps=False
1581
  ),
1582
  row=row, col=col
1583
  )
 
 
 
 
 
1584
 
1585
  fig.update_layout(
1586
  title=f"All Metrics for Experiment {experiment_id}",
@@ -1677,7 +1740,7 @@ def create_experiment_comparison_from_selection(selected_experiments: list, sele
1677
  plot_bgcolor='white', paper_bgcolor='white'
1678
  )
1679
  return fig
1680
-
1681
  if not selected_metrics:
1682
  fig = go.Figure()
1683
  fig.add_annotation(
@@ -1691,10 +1754,180 @@ def create_experiment_comparison_from_selection(selected_experiments: list, sele
1691
  plot_bgcolor='white', paper_bgcolor='white'
1692
  )
1693
  return fig
1694
-
1695
- # Use the existing comparison function with comma-separated IDs
1696
- experiment_ids_str = ",".join(selected_experiments)
1697
- return create_experiment_comparison(experiment_ids_str)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1698
 
1699
  except Exception as e:
1700
  logger.error(f"Error creating comparison from selection: {str(e)}")
 
         )
         return fig

+    # Helper predicates
+    def _is_eval_metric(name: str) -> bool:
+        return name.startswith('eval_') or name.startswith('eval/')
+
+    def _is_system_metric(name: str) -> bool:
+        import re
+        if name in ("cpu_percent", "memory_percent"):
+            return True
+        return re.match(r"^gpu_\d+_(memory_allocated|memory_reserved|utilization)$", name) is not None
+
     # Ensure steps are numeric and monotonically increasing to avoid zig-zag lines
     try:
         df = df.copy()
+        # Choose x-axis: time for system metrics, step otherwise
+        use_time_axis = _is_system_metric(metric_name)
+
+        if use_time_axis:
+            # Convert timestamp to datetime for nicer axis rendering
+            df['time'] = pd.to_datetime(df.get('timestamp', ''), errors='coerce')
+            # Fall back to arrival order if timestamps are missing
+            if df['time'].isna().all():
+                df['time'] = range(1, len(df) + 1)
+            df.sort_values('time', inplace=True)
+            x_field = 'time'
         else:
+            # If step looks constant or missing, try to derive it from a common field
+            if 'step' not in df or df['step'].nunique() <= 1:
+                for alt in ['train/global_step', 'global_step', 'train/step']:
+                    if alt in df.columns and df[alt].notna().any():
+                        df['step'] = pd.to_numeric(df[alt], errors='coerce')
+                        break
+                # If still missing or constant, fall back to an inferred counter by order of arrival
+                if 'step' not in df.columns or df['step'].isna().all() or df['step'].nunique() <= 1:
+                    df['step'] = range(1, len(df) + 1)
+            else:
+                df['step'] = pd.to_numeric(df.get('step', -1), errors='coerce').fillna(-1)
+            df.sort_values('step', inplace=True)
+            x_field = 'step'
     except Exception:
+        x_field = 'step'
+    # Filter rows where the metric is present to ensure connected lines
+    try:
+        plot_df = df[[x_field, metric_name]].dropna(subset=[metric_name]).copy()
+    except Exception:
+        plot_df = df
+    fig = px.line(plot_df, x=x_field, y=metric_name, title=f'{metric_name} over time')
     fig.update_layout(
+        xaxis_title="Time" if _is_system_metric(metric_name) else "Training Step",
         yaxis_title=metric_name.title(),
         hovermode='x unified'
     )
+    # Connect points for evaluation metrics, avoid connecting gaps for others
     try:
         for trace in fig.data:
+            trace.connectgaps = True if _is_eval_metric(metric_name) else False
+            # Force line+markers to visually connect points
+            trace.mode = 'lines+markers'
     except Exception:
         pass
     return fig
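
The system-metric branch above coerces the `timestamp` column rather than parsing it strictly, so one malformed entry cannot take down the whole plot. A small sketch of that behavior with toy rows:

import pandas as pd

df = pd.DataFrame({
    'timestamp': ['2025-01-01T10:00:00', 'not-a-date', '2025-01-01T10:02:00'],
    'cpu_percent': [12.5, 14.0, 13.2],
})
# errors='coerce' turns unparseable entries into NaT instead of raising
df['time'] = pd.to_datetime(df.get('timestamp', ''), errors='coerce')
if df['time'].isna().all():
    # No usable timestamps at all: fall back to arrival order
    df['time'] = range(1, len(df) + 1)
print(df.sort_values('time')[['time', 'cpu_percent']])  # NaT rows sort last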
 
     # Define colors for different metrics
     colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'cyan', 'magenta']

+    # Helper predicates
+    def _is_eval_metric(name: str) -> bool:
+        return name.startswith('eval_') or name.startswith('eval/')
+
+    def _is_system_metric(name: str) -> bool:
+        import re
+        if name in ("cpu_percent", "memory_percent"):
+            return True
+        return re.match(r"^gpu_\d+_(memory_allocated|memory_reserved|utilization)$", name) is not None
+
     for i, metric in enumerate(numeric_cols):
         if metric in df.columns and not df[metric].isna().all():
             row = (i // n_cols) + 1

             # Clean steps for each subplot too
             try:
                 df_sub = df.copy()
+                use_time_axis = _is_system_metric(metric)
+                if use_time_axis:
+                    df_sub['time'] = pd.to_datetime(df_sub.get('timestamp', ''), errors='coerce')
+                    if df_sub['time'].isna().all():
+                        df_sub['time'] = range(1, len(df_sub) + 1)
+                    df_sub.sort_values('time', inplace=True)
+                    # Filter to available metric points only to ensure connected lines
+                    metric_mask = df_sub[metric].notna()
+                    x_vals = df_sub.loc[metric_mask, 'time'].tolist()
+                    y_vals = df_sub.loc[metric_mask, metric].tolist()
                 else:
+                    if 'step' not in df_sub or df_sub['step'].nunique() <= 1:
+                        for alt in ['train/global_step', 'global_step', 'train/step']:
+                            if alt in df_sub.columns and df_sub[alt].notna().any():
+                                df_sub['step'] = pd.to_numeric(df_sub[alt], errors='coerce')
+                                break
+                        if 'step' not in df_sub.columns or df_sub['step'].isna().all() or df_sub['step'].nunique() <= 1:
+                            df_sub['step'] = range(1, len(df_sub) + 1)
+                    else:
+                        df_sub['step'] = pd.to_numeric(df_sub.get('step', -1), errors='coerce').fillna(-1)
+                    df_sub.sort_values('step', inplace=True)
+                    # Filter to available metric points only to ensure connected lines
+                    metric_mask = df_sub[metric].notna()
+                    x_vals = df_sub.loc[metric_mask, 'step'].tolist()
+                    y_vals = df_sub.loc[metric_mask, metric].tolist()
             except Exception:
                 df_sub = df
+                metric_mask = df_sub[metric].notna() if metric in df_sub else []
+                # Avoid .tolist() on a plain-list default: DataFrame.get() would return it unchanged
+                x_vals = df_sub['step'].tolist() if 'step' in df_sub.columns else list(range(1, len(df_sub) + 1))
+                y_vals = df_sub[metric].tolist() if metric in df_sub.columns else []
             fig.add_trace(
                 go.Scatter(
+                    x=x_vals,
+                    y=y_vals,
                     mode='lines+markers',
                     name=metric,
                     line=dict(width=2, color=color),
                     marker=dict(size=4, color=color),
                     showlegend=False,
+                    connectgaps=True if _is_eval_metric(metric) else False
                 ),
                 row=row, col=col
             )
+            # Set axis titles per subplot for clarity
+            try:
+                fig.update_xaxes(title_text=("Time" if use_time_axis else "Training Step"), row=row, col=col)
+            except Exception:
+                pass

     fig.update_layout(
         title=f"All Metrics for Experiment {experiment_id}",
 
                 plot_bgcolor='white', paper_bgcolor='white'
             )
             return fig
+
         if not selected_metrics:
             fig = go.Figure()
             fig.add_annotation(

                 plot_bgcolor='white', paper_bgcolor='white'
             )
             return fig
+
+        # Prepare dataframes for each selected experiment once
+        experiment_to_dataframe = {}
+        for experiment_id in selected_experiments:
+            try:
+                experiment_to_dataframe[experiment_id] = get_metrics_dataframe(experiment_id)
+            except Exception:
+                experiment_to_dataframe[experiment_id] = pd.DataFrame()
+
+        # Setup subplots: one subplot per selected metric
+        from plotly.subplots import make_subplots
+
+        num_metrics = len(selected_metrics)
+        num_columns = min(3, num_metrics)
+        num_rows = (num_metrics + num_columns - 1) // num_columns
+
+        fig = make_subplots(
+            rows=num_rows,
+            cols=num_columns,
+            subplot_titles=selected_metrics,
+            vertical_spacing=0.05,
+            horizontal_spacing=0.1
+        )
+
+        # Color palette for experiments (consistent colors across subplots)
+        try:
+            palette = px.colors.qualitative.Plotly
+        except Exception:
+            palette = [
+                'blue', 'red', 'green', 'orange', 'purple', 'brown',
+                'pink', 'gray', 'cyan', 'magenta'
+            ]
+        experiment_to_color = {
+            exp_id: palette[idx % len(palette)] for idx, exp_id in enumerate(selected_experiments)
+        }
+
+        # Helper predicates (match logic used elsewhere in this file)
+        def _is_eval_metric(name: str) -> bool:
+            return name.startswith('eval_') or name.startswith('eval/')
+
+        def _is_system_metric(name: str) -> bool:
+            import re
+            if name in ("cpu_percent", "memory_percent"):
+                return True
+            return re.match(r"^gpu_\d+_(memory_allocated|memory_reserved|utilization)$", name) is not None
+
+        any_trace_added = False
+
+        for metric_index, metric_name in enumerate(selected_metrics):
+            row = (metric_index // num_columns) + 1
+            col = (metric_index % num_columns) + 1
+
+            subplot_has_data = False
+
+            for experiment_id, df in experiment_to_dataframe.items():
+                if df is None or df.empty or metric_name not in df.columns:
+                    continue
+
+                # Build x/y based on metric type
+                try:
+                    df_local = df.copy()
+                    use_time_axis = _is_system_metric(metric_name)
+
+                    if use_time_axis:
+                        # Time axis: use timestamp → datetime
+                        df_local['time'] = pd.to_datetime(df_local.get('timestamp', ''), errors='coerce')
+                        if df_local['time'].isna().all():
+                            df_local['time'] = range(1, len(df_local) + 1)
+                        df_local.sort_values('time', inplace=True)
+                        valid_mask = df_local[metric_name].notna()
+                        x_values = df_local.loc[valid_mask, 'time'].tolist()
+                        y_values = df_local.loc[valid_mask, metric_name].tolist()
+                    else:
+                        # Step axis: ensure a reasonable step column exists
+                        if 'step' not in df_local or df_local['step'].nunique() <= 1:
+                            for alternative in ['train/global_step', 'global_step', 'train/step']:
+                                if alternative in df_local.columns and df_local[alternative].notna().any():
+                                    df_local['step'] = pd.to_numeric(df_local[alternative], errors='coerce')
+                                    break
+                            if 'step' not in df_local.columns or df_local['step'].isna().all() or df_local['step'].nunique() <= 1:
+                                df_local['step'] = range(1, len(df_local) + 1)
+                        else:
+                            df_local['step'] = pd.to_numeric(df_local.get('step', -1), errors='coerce').fillna(-1)
+                        df_local.sort_values('step', inplace=True)
+                        valid_mask = df_local[metric_name].notna()
+                        x_values = df_local.loc[valid_mask, 'step'].tolist()
+                        y_values = df_local.loc[valid_mask, metric_name].tolist()
+                except Exception:
+                    # Fallback to naive arrays; keep x and y the same length
+                    valid_mask = df[metric_name].notna()
+                    x_values = df.loc[valid_mask, 'step'].tolist() if 'step' in df.columns else list(range(1, int(valid_mask.sum()) + 1))
+                    y_values = df.loc[valid_mask, metric_name].tolist() if metric_name in df.columns else []
+
+                if not x_values or not y_values:
+                    continue
+
+                subplot_has_data = True
+                any_trace_added = True
+                color = experiment_to_color.get(experiment_id, 'blue')
+
+                fig.add_trace(
+                    go.Scatter(
+                        x=x_values,
+                        y=y_values,
+                        mode='lines+markers',
+                        name=experiment_id,
+                        line=dict(width=2, color=color),
+                        marker=dict(size=4, color=color),
+                        showlegend=True,
+                        connectgaps=True if _is_eval_metric(metric_name) else False
+                    ),
+                    row=row,
+                    col=col
+                )
+
+            # Axis titles per subplot
+            try:
+                fig.update_xaxes(
+                    title_text=("Time" if _is_system_metric(metric_name) else "Training Step"),
+                    row=row,
+                    col=col
+                )
+                fig.update_yaxes(title_text=metric_name, row=row, col=col)
+            except Exception:
+                pass
+
+            # If no experiment had data for this metric, annotate the subplot
+            if not subplot_has_data:
+                try:
+                    fig.add_annotation(
+                        text=f"No data for metric: {metric_name}",
+                        xref="paper", yref="paper",
+                        x=0.5, y=0.5, showarrow=False,
+                        font=dict(size=12, color="gray"),
+                        row=row, col=col
+                    )
+                except Exception:
+                    fig.add_annotation(
+                        text=f"No data for metric: {metric_name}",
+                        xref="paper", yref="paper",
+                        x=0.5, y=0.5, showarrow=False,
+                        font=dict(size=12, color="gray")
+                    )
+
+        fig.update_layout(
+            title="Experiment Comparison",
+            height=max(350, 320 * num_rows),
+            plot_bgcolor='white',
+            paper_bgcolor='white',
+            hovermode='x unified',
+            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
+        )
+
+        # Grid lines for all subplots
+        for r in range(1, num_rows + 1):
+            for c in range(1, num_columns + 1):
+                fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', row=r, col=c)
+                fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray', row=r, col=c)
+
+        if not any_trace_added:
+            # Overall annotation if literally nothing to plot
+            fig = go.Figure()
+            fig.add_annotation(
+                text="No comparable data available for the selected experiments/metrics",
+                xref="paper", yref="paper",
+                x=0.5, y=0.5, showarrow=False,
+                font=dict(size=16, color="orange")
+            )
+            fig.update_layout(
+                title="No Data",
+                plot_bgcolor='white', paper_bgcolor='white'
+            )
+
+        return fig

     except Exception as e:
         logger.error(f"Error creating comparison from selection: {str(e)}")