Spaces:
Running
Running
adds single token logic read/write , adds gpt-oss demo space , adds spaces refactor , adds new version of track tonic , adds logic in launch.sh
Browse files- docs/datasetflow.svg +1 -0
- launch.sh +18 -28
- scripts/dataset_tonic/setup_hf_dataset.py +86 -80
- scripts/deploy_demo_space.py +133 -30
- scripts/model_tonic/push_gpt_oss_to_huggingface.py +2 -2
- scripts/trackio_tonic/configure_trackio.py +22 -48
- scripts/trackio_tonic/deploy_trackio_space.py +11 -38
- scripts/trackio_tonic/switch_to_read_token.py +14 -11
- src/dataset_utils.py +328 -0
- src/monitoring.py +70 -48
- templates/spaces/demo_gpt/README.md +15 -0
- templates/spaces/demo_gpt/app.py +262 -0
- templates/spaces/demo_gpt/requirements.txt +9 -0
- templates/spaces/{demo β demo_smol}/README.md +0 -0
- templates/spaces/{demo β demo_smol}/app.py +0 -0
- templates/spaces/{demo β demo_smol}/requirements.txt +0 -0
- templates/spaces/{README.md β trackio/README.md} +0 -0
- templates/spaces/{app.py β trackio/app.py} +1154 -303
- templates/spaces/trackio/dataset_utils.py +328 -0
- templates/spaces/{requirements.txt β trackio/requirements.txt} +0 -0
- templates/spaces/trackio/trackio_api_client.py +320 -0
- tests/test_data_preservation.py +187 -0
- tests/test_demo_deployment.py +9 -6
- tests/test_deployment.py +11 -7
- tests/test_hf_datasets.py +2 -2
- tests/test_latest_deployment.py +12 -8
- tests/test_readme_template.py +2 -2
- tests/test_real_dataset_access.py +201 -0
- tests/test_trackio_dataset_fix.py +167 -0
- tests/test_trackio_deployment.py +5 -5
- tests/test_trackio_space_diagnostics.py +191 -0
docs/datasetflow.svg
ADDED
|
launch.sh
CHANGED
@@ -452,8 +452,10 @@ print_step "Step 1: User Authentication"
|
|
452 |
echo "================================"
|
453 |
|
454 |
print_info "You'll need two Hugging Face tokens:"
|
455 |
-
echo "1. Write Token - Used
|
456 |
-
echo "2. Read Token -
|
|
|
|
|
457 |
echo ""
|
458 |
|
459 |
print_info "Getting Write Token (for training operations)..."
|
@@ -489,7 +491,7 @@ else
|
|
489 |
exit 1
|
490 |
fi
|
491 |
|
492 |
-
# Set the main HF_TOKEN to write token for training operations
|
493 |
HF_TOKEN="$HF_WRITE_TOKEN"
|
494 |
|
495 |
# Step 2: Select training configuration
|
@@ -669,8 +671,6 @@ fi
|
|
669 |
|
670 |
# Set environment variables before creating virtual environment
|
671 |
print_info "Setting up environment variables..."
|
672 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
673 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
674 |
export HF_TOKEN="$HF_TOKEN"
|
675 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
676 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
@@ -682,8 +682,6 @@ source smollm3_env/bin/activate
|
|
682 |
|
683 |
# Re-export environment variables in the virtual environment
|
684 |
print_info "Configuring environment variables in virtual environment..."
|
685 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
686 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
687 |
export HF_TOKEN="$HF_TOKEN"
|
688 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
689 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
@@ -712,16 +710,16 @@ print_status "HF token configured for Python API usage"
|
|
712 |
print_info "Username: $HF_USERNAME (auto-detected from token)"
|
713 |
print_info "Token available in environment: ${HF_TOKEN:0:10}...${HF_TOKEN: -4}"
|
714 |
|
715 |
-
# Verify
|
716 |
print_info "Verifying token availability in virtual environment..."
|
717 |
-
if [ -n "$
|
718 |
-
print_status "β
|
719 |
-
print_info "
|
720 |
-
print_info " HF_READ_TOKEN: ${HF_READ_TOKEN:0:10}...${HF_READ_TOKEN: -4}"
|
721 |
print_info " HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN:0:10}...${HUGGING_FACE_HUB_TOKEN: -4}"
|
|
|
722 |
else
|
723 |
-
print_error "β
|
724 |
-
print_error "Please check your
|
725 |
exit 1
|
726 |
fi
|
727 |
|
@@ -771,8 +769,6 @@ print_info "Username will be auto-detected from token"
|
|
771 |
print_info "Secrets will be set automatically via API"
|
772 |
|
773 |
# Ensure environment variables are available for the script
|
774 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
775 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
776 |
export HF_TOKEN="$HF_TOKEN"
|
777 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
778 |
export HF_USERNAME="$HF_USERNAME"
|
@@ -792,8 +788,6 @@ print_info "Username will be auto-detected from token"
|
|
792 |
print_info "Dataset repository: $TRACKIO_DATASET_REPO"
|
793 |
|
794 |
# Ensure environment variables are available for the script
|
795 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
796 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
797 |
export HF_TOKEN="$HF_TOKEN"
|
798 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
799 |
export HF_USERNAME="$HF_USERNAME"
|
@@ -809,8 +803,6 @@ print_info "Configuring Trackio ..."
|
|
809 |
print_info "Username will be auto-detected from token"
|
810 |
|
811 |
# Ensure environment variables are available for the script
|
812 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
813 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
814 |
export HF_TOKEN="$HF_TOKEN"
|
815 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
816 |
export HF_USERNAME="$HF_USERNAME"
|
@@ -920,7 +912,7 @@ fi
|
|
920 |
print_step "Step 16.5: Switching to Read Token for Security"
|
921 |
echo "===================================================="
|
922 |
|
923 |
-
print_info "Switching Trackio Space from write token to read token for security..."
|
924 |
print_info "This ensures the space can only read datasets, not write to repositories"
|
925 |
|
926 |
# Ensure environment variables are available for token switch
|
@@ -928,12 +920,12 @@ export HF_TOKEN="$HF_WRITE_TOKEN" # Use write token to update space
|
|
928 |
export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
|
929 |
export HF_USERNAME="$HF_USERNAME"
|
930 |
|
931 |
-
# Switch
|
932 |
cd scripts/trackio_tonic
|
933 |
python switch_to_read_token.py "$HF_USERNAME/$TRACKIO_SPACE_NAME" "$HF_READ_TOKEN" "$HF_WRITE_TOKEN"
|
934 |
|
935 |
if [ $? -eq 0 ]; then
|
936 |
-
print_status "β
Successfully switched Trackio Space to read token"
|
937 |
print_info "π Space now uses read-only permissions for security"
|
938 |
else
|
939 |
print_warning "β οΈ Failed to switch to read token, but continuing with pipeline"
|
@@ -957,8 +949,6 @@ if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
|
957 |
DEMO_SUBFOLDER=""
|
958 |
|
959 |
# Ensure environment variables are available for demo deployment
|
960 |
-
export HF_WRITE_TOKEN="$HF_WRITE_TOKEN"
|
961 |
-
export HF_READ_TOKEN="$HF_READ_TOKEN"
|
962 |
export HF_TOKEN="$HF_TOKEN"
|
963 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
964 |
export HF_USERNAME="$HF_USERNAME"
|
@@ -999,7 +989,7 @@ cat > training_summary.md << EOF
|
|
999 |
- **HF Dataset**: $TRACKIO_DATASET_REPO
|
1000 |
- **Training Config**: $TRAINING_CONFIG_TYPE
|
1001 |
- **Trainer Type**: $TRAINER_TYPE
|
1002 |
-
- **Security**:
|
1003 |
$(if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
|
1004 |
echo "- **Dataset Sample Size**: ${DATASET_SAMPLE_SIZE:-80000}"
|
1005 |
fi)
|
@@ -1015,7 +1005,7 @@ fi)
|
|
1015 |
- **Model Repository**: https://huggingface.co/$REPO_NAME
|
1016 |
- **Trackio Monitoring**: $TRACKIO_URL
|
1017 |
- **Experiment Data**: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO
|
1018 |
-
- **Security**: Trackio Space switched to read-only token for security
|
1019 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
1020 |
echo "- **Demo Space**: https://huggingface.co/spaces/$HF_USERNAME/${REPO_NAME}-demo"
|
1021 |
fi)
|
@@ -1053,7 +1043,7 @@ echo ""
|
|
1053 |
echo "π Next steps:"
|
1054 |
echo "1. Monitor training progress in your Trackio Space"
|
1055 |
echo "2. Check the model repository on Hugging Face Hub"
|
1056 |
-
echo "3. Your Trackio Space is now secured with read-only permissions"
|
1057 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
1058 |
echo "3. Make your huggingface space a ZeroGPU Space & Test your model"
|
1059 |
fi)
|
|
|
452 |
echo "================================"
|
453 |
|
454 |
print_info "You'll need two Hugging Face tokens:"
|
455 |
+
echo "1. Write Token - Used initially for training and creating repositories"
|
456 |
+
echo "2. Read Token - Will replace the write token in Trackio Space after training for security"
|
457 |
+
echo ""
|
458 |
+
print_info "The pipeline will start with the write token in HF_TOKEN, then switch to read token automatically."
|
459 |
echo ""
|
460 |
|
461 |
print_info "Getting Write Token (for training operations)..."
|
|
|
491 |
exit 1
|
492 |
fi
|
493 |
|
494 |
+
# Set the main HF_TOKEN to write token for training operations (will be switched later)
|
495 |
HF_TOKEN="$HF_WRITE_TOKEN"
|
496 |
|
497 |
# Step 2: Select training configuration
|
|
|
671 |
|
672 |
# Set environment variables before creating virtual environment
|
673 |
print_info "Setting up environment variables..."
|
|
|
|
|
674 |
export HF_TOKEN="$HF_TOKEN"
|
675 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
676 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
|
|
682 |
|
683 |
# Re-export environment variables in the virtual environment
|
684 |
print_info "Configuring environment variables in virtual environment..."
|
|
|
|
|
685 |
export HF_TOKEN="$HF_TOKEN"
|
686 |
export TRACKIO_DATASET_REPO="$TRACKIO_DATASET_REPO"
|
687 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
|
|
710 |
print_info "Username: $HF_USERNAME (auto-detected from token)"
|
711 |
print_info "Token available in environment: ${HF_TOKEN:0:10}...${HF_TOKEN: -4}"
|
712 |
|
713 |
+
# Verify token is available in the virtual environment
|
714 |
print_info "Verifying token availability in virtual environment..."
|
715 |
+
if [ -n "$HF_TOKEN" ] && [ -n "$HUGGING_FACE_HUB_TOKEN" ]; then
|
716 |
+
print_status "β
Token properly configured in virtual environment"
|
717 |
+
print_info " HF_TOKEN: ${HF_TOKEN:0:10}...${HF_TOKEN: -4} (currently using WRITE token)"
|
|
|
718 |
print_info " HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN:0:10}...${HUGGING_FACE_HUB_TOKEN: -4}"
|
719 |
+
print_info " Will be switched to READ token after training for security"
|
720 |
else
|
721 |
+
print_error "β Token not properly configured in virtual environment"
|
722 |
+
print_error "Please check your token and try again"
|
723 |
exit 1
|
724 |
fi
|
725 |
|
|
|
769 |
print_info "Secrets will be set automatically via API"
|
770 |
|
771 |
# Ensure environment variables are available for the script
|
|
|
|
|
772 |
export HF_TOKEN="$HF_TOKEN"
|
773 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
774 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
788 |
print_info "Dataset repository: $TRACKIO_DATASET_REPO"
|
789 |
|
790 |
# Ensure environment variables are available for the script
|
|
|
|
|
791 |
export HF_TOKEN="$HF_TOKEN"
|
792 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
793 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
803 |
print_info "Username will be auto-detected from token"
|
804 |
|
805 |
# Ensure environment variables are available for the script
|
|
|
|
|
806 |
export HF_TOKEN="$HF_TOKEN"
|
807 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
808 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
912 |
print_step "Step 16.5: Switching to Read Token for Security"
|
913 |
echo "===================================================="
|
914 |
|
915 |
+
print_info "Switching Trackio Space HF_TOKEN from write token to read token for security..."
|
916 |
print_info "This ensures the space can only read datasets, not write to repositories"
|
917 |
|
918 |
# Ensure environment variables are available for token switch
|
|
|
920 |
export HUGGING_FACE_HUB_TOKEN="$HF_WRITE_TOKEN"
|
921 |
export HF_USERNAME="$HF_USERNAME"
|
922 |
|
923 |
+
# Switch HF_TOKEN in Trackio Space from write to read token
|
924 |
cd scripts/trackio_tonic
|
925 |
python switch_to_read_token.py "$HF_USERNAME/$TRACKIO_SPACE_NAME" "$HF_READ_TOKEN" "$HF_WRITE_TOKEN"
|
926 |
|
927 |
if [ $? -eq 0 ]; then
|
928 |
+
print_status "β
Successfully switched Trackio Space HF_TOKEN to read token"
|
929 |
print_info "π Space now uses read-only permissions for security"
|
930 |
else
|
931 |
print_warning "β οΈ Failed to switch to read token, but continuing with pipeline"
|
|
|
949 |
DEMO_SUBFOLDER=""
|
950 |
|
951 |
# Ensure environment variables are available for demo deployment
|
|
|
|
|
952 |
export HF_TOKEN="$HF_TOKEN"
|
953 |
export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
|
954 |
export HF_USERNAME="$HF_USERNAME"
|
|
|
989 |
- **HF Dataset**: $TRACKIO_DATASET_REPO
|
990 |
- **Training Config**: $TRAINING_CONFIG_TYPE
|
991 |
- **Trainer Type**: $TRAINER_TYPE
|
992 |
+
- **Security**: Single HF_TOKEN switched from write to read token
|
993 |
$(if [ "$TRAINING_CONFIG_TYPE" = "H100 Lightweight (Rapid)" ]; then
|
994 |
echo "- **Dataset Sample Size**: ${DATASET_SAMPLE_SIZE:-80000}"
|
995 |
fi)
|
|
|
1005 |
- **Model Repository**: https://huggingface.co/$REPO_NAME
|
1006 |
- **Trackio Monitoring**: $TRACKIO_URL
|
1007 |
- **Experiment Data**: https://huggingface.co/datasets/$TRACKIO_DATASET_REPO
|
1008 |
+
- **Security**: Trackio Space HF_TOKEN switched to read-only token for security
|
1009 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
1010 |
echo "- **Demo Space**: https://huggingface.co/spaces/$HF_USERNAME/${REPO_NAME}-demo"
|
1011 |
fi)
|
|
|
1043 |
echo "π Next steps:"
|
1044 |
echo "1. Monitor training progress in your Trackio Space"
|
1045 |
echo "2. Check the model repository on Hugging Face Hub"
|
1046 |
+
echo "3. Your Trackio Space HF_TOKEN is now secured with read-only permissions"
|
1047 |
$(if [ "$DEPLOY_DEMO" = "y" ] || [ "$DEPLOY_DEMO" = "Y" ]; then
|
1048 |
echo "3. Make your huggingface space a ZeroGPU Space & Test your model"
|
1049 |
fi)
|
scripts/dataset_tonic/setup_hf_dataset.py
CHANGED
@@ -145,7 +145,7 @@ def setup_trackio_dataset(dataset_name: str = None, token: str = None) -> bool:
|
|
145 |
|
146 |
def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
147 |
"""
|
148 |
-
Add initial experiment data to the dataset.
|
149 |
|
150 |
Args:
|
151 |
repo_id (str): Dataset repository ID
|
@@ -163,89 +163,95 @@ def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
|
163 |
print("β οΈ No token available for uploading data")
|
164 |
return False
|
165 |
|
166 |
-
#
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
'name': 'smollm3-finetune-demo',
|
171 |
-
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
172 |
-
'created_at': datetime.now().isoformat(),
|
173 |
-
'status': 'completed',
|
174 |
-
'metrics': json.dumps([
|
175 |
-
{
|
176 |
-
'timestamp': datetime.now().isoformat(),
|
177 |
-
'step': 100,
|
178 |
-
'metrics': {
|
179 |
-
'loss': 1.15,
|
180 |
-
'grad_norm': 10.5,
|
181 |
-
'learning_rate': 5e-6,
|
182 |
-
'num_tokens': 1000000.0,
|
183 |
-
'mean_token_accuracy': 0.76,
|
184 |
-
'epoch': 0.1,
|
185 |
-
'total_tokens': 1000000.0,
|
186 |
-
'throughput': 2000000.0,
|
187 |
-
'step_time': 0.5,
|
188 |
-
'batch_size': 2,
|
189 |
-
'seq_len': 4096,
|
190 |
-
'token_acc': 0.76,
|
191 |
-
'gpu_memory_allocated': 15.2,
|
192 |
-
'gpu_memory_reserved': 70.1,
|
193 |
-
'gpu_utilization': 85.2,
|
194 |
-
'cpu_percent': 2.7,
|
195 |
-
'memory_percent': 10.1
|
196 |
-
}
|
197 |
-
}
|
198 |
-
]),
|
199 |
-
'parameters': json.dumps({
|
200 |
-
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
201 |
-
'max_seq_length': 4096,
|
202 |
-
'batch_size': 2,
|
203 |
-
'learning_rate': 5e-6,
|
204 |
-
'epochs': 3,
|
205 |
-
'dataset': 'OpenHermes-FR',
|
206 |
-
'trainer_type': 'SFTTrainer',
|
207 |
-
'hardware': 'GPU (H100/A100)',
|
208 |
-
'mixed_precision': True,
|
209 |
-
'gradient_checkpointing': True,
|
210 |
-
'flash_attention': True
|
211 |
-
}),
|
212 |
-
'artifacts': json.dumps([]),
|
213 |
-
'logs': json.dumps([
|
214 |
-
{
|
215 |
-
'timestamp': datetime.now().isoformat(),
|
216 |
-
'level': 'INFO',
|
217 |
-
'message': 'Training started successfully'
|
218 |
-
},
|
219 |
-
{
|
220 |
-
'timestamp': datetime.now().isoformat(),
|
221 |
-
'level': 'INFO',
|
222 |
-
'message': 'Model loaded and configured'
|
223 |
-
},
|
224 |
-
{
|
225 |
-
'timestamp': datetime.now().isoformat(),
|
226 |
-
'level': 'INFO',
|
227 |
-
'message': 'Dataset loaded and preprocessed'
|
228 |
-
}
|
229 |
-
]),
|
230 |
-
'last_updated': datetime.now().isoformat()
|
231 |
-
}
|
232 |
-
]
|
233 |
|
234 |
-
#
|
235 |
-
|
236 |
|
237 |
-
#
|
238 |
-
|
|
|
|
|
239 |
|
240 |
-
#
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
247 |
|
248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
# Add README template
|
251 |
add_dataset_readme(repo_id, token)
|
|
|
145 |
|
146 |
def add_initial_experiment_data(repo_id: str, token: str = None) -> bool:
|
147 |
"""
|
148 |
+
Add initial experiment data to the dataset using data preservation.
|
149 |
|
150 |
Args:
|
151 |
repo_id (str): Dataset repository ID
|
|
|
163 |
print("β οΈ No token available for uploading data")
|
164 |
return False
|
165 |
|
166 |
+
# Import dataset manager
|
167 |
+
import sys
|
168 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
169 |
+
from dataset_utils import TrackioDatasetManager
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
+
# Initialize dataset manager
|
172 |
+
dataset_manager = TrackioDatasetManager(repo_id, token)
|
173 |
|
174 |
+
# Check if dataset already has data
|
175 |
+
existing_experiments = dataset_manager.load_existing_experiments()
|
176 |
+
if existing_experiments:
|
177 |
+
print(f"βΉοΈ Dataset already contains {len(existing_experiments)} experiments, preserving existing data")
|
178 |
|
179 |
+
# Initial experiment data
|
180 |
+
initial_experiment = {
|
181 |
+
'experiment_id': f'exp_demo_{datetime.now().strftime("%Y%m%d_%H%M%S")}',
|
182 |
+
'name': 'smollm3-finetune-demo',
|
183 |
+
'description': 'SmolLM3 fine-tuning experiment demo with comprehensive metrics tracking',
|
184 |
+
'created_at': datetime.now().isoformat(),
|
185 |
+
'status': 'completed',
|
186 |
+
'metrics': json.dumps([
|
187 |
+
{
|
188 |
+
'timestamp': datetime.now().isoformat(),
|
189 |
+
'step': 100,
|
190 |
+
'metrics': {
|
191 |
+
'loss': 1.15,
|
192 |
+
'grad_norm': 10.5,
|
193 |
+
'learning_rate': 5e-6,
|
194 |
+
'num_tokens': 1000000.0,
|
195 |
+
'mean_token_accuracy': 0.76,
|
196 |
+
'epoch': 0.1,
|
197 |
+
'total_tokens': 1000000.0,
|
198 |
+
'throughput': 2000000.0,
|
199 |
+
'step_time': 0.5,
|
200 |
+
'batch_size': 2,
|
201 |
+
'seq_len': 4096,
|
202 |
+
'token_acc': 0.76,
|
203 |
+
'gpu_memory_allocated': 15.2,
|
204 |
+
'gpu_memory_reserved': 70.1,
|
205 |
+
'gpu_utilization': 85.2,
|
206 |
+
'cpu_percent': 2.7,
|
207 |
+
'memory_percent': 10.1
|
208 |
+
}
|
209 |
+
}
|
210 |
+
]),
|
211 |
+
'parameters': json.dumps({
|
212 |
+
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
213 |
+
'max_seq_length': 4096,
|
214 |
+
'batch_size': 2,
|
215 |
+
'learning_rate': 5e-6,
|
216 |
+
'epochs': 3,
|
217 |
+
'dataset': 'OpenHermes-FR',
|
218 |
+
'trainer_type': 'SFTTrainer',
|
219 |
+
'hardware': 'GPU (H100/A100)',
|
220 |
+
'mixed_precision': True,
|
221 |
+
'gradient_checkpointing': True,
|
222 |
+
'flash_attention': True
|
223 |
+
}),
|
224 |
+
'artifacts': json.dumps([]),
|
225 |
+
'logs': json.dumps([
|
226 |
+
{
|
227 |
+
'timestamp': datetime.now().isoformat(),
|
228 |
+
'level': 'INFO',
|
229 |
+
'message': 'Training started successfully'
|
230 |
+
},
|
231 |
+
{
|
232 |
+
'timestamp': datetime.now().isoformat(),
|
233 |
+
'level': 'INFO',
|
234 |
+
'message': 'Model loaded and configured'
|
235 |
+
},
|
236 |
+
{
|
237 |
+
'timestamp': datetime.now().isoformat(),
|
238 |
+
'level': 'INFO',
|
239 |
+
'message': 'Dataset loaded and preprocessed'
|
240 |
+
}
|
241 |
+
]),
|
242 |
+
'last_updated': datetime.now().isoformat()
|
243 |
+
}
|
244 |
+
|
245 |
+
# Use dataset manager to safely add the experiment
|
246 |
+
success = dataset_manager.upsert_experiment(initial_experiment)
|
247 |
|
248 |
+
if success:
|
249 |
+
print(f"β
Successfully added initial experiment data to {repo_id}")
|
250 |
+
final_count = len(dataset_manager.load_existing_experiments())
|
251 |
+
print(f"π Dataset now contains {final_count} total experiments")
|
252 |
+
else:
|
253 |
+
print(f"β Failed to add initial experiment data to {repo_id}")
|
254 |
+
return False
|
255 |
|
256 |
# Add README template
|
257 |
add_dataset_readme(repo_id, token)
|
scripts/deploy_demo_space.py
CHANGED
@@ -38,7 +38,8 @@ class DemoSpaceDeployer:
|
|
38 |
"""Deploy demo space to Hugging Face Spaces"""
|
39 |
|
40 |
def __init__(self, hf_token: str, hf_username: str, model_id: str,
|
41 |
-
subfolder: str = "int4", space_name: Optional[str] = None
|
|
|
42 |
self.hf_token = hf_token
|
43 |
self.hf_username = hf_username
|
44 |
self.model_id = model_id
|
@@ -47,8 +48,13 @@ class DemoSpaceDeployer:
|
|
47 |
self.space_id = f"{hf_username}/{self.space_name}"
|
48 |
self.space_url = f"https://huggingface.co/spaces/{self.space_id}"
|
49 |
|
50 |
-
#
|
51 |
-
|
|
|
|
|
|
|
|
|
|
|
52 |
self.workspace_dir = Path.cwd()
|
53 |
|
54 |
# Initialize HF API
|
@@ -58,6 +64,107 @@ class DemoSpaceDeployer:
|
|
58 |
self.api = None
|
59 |
logger.warning("huggingface_hub not available, using CLI fallback")
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def validate_model_exists(self) -> bool:
|
62 |
"""Validate that the model exists on Hugging Face Hub"""
|
63 |
try:
|
@@ -187,14 +294,7 @@ class DemoSpaceDeployer:
|
|
187 |
content = f.read()
|
188 |
|
189 |
# Add environment variable setup at the top
|
190 |
-
env_setup =
|
191 |
-
# Environment variables for model configuration
|
192 |
-
import os
|
193 |
-
os.environ['HF_MODEL_ID'] = '{self.model_id}'
|
194 |
-
os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
|
195 |
-
os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'
|
196 |
-
|
197 |
-
"""
|
198 |
|
199 |
# Insert after imports
|
200 |
lines = content.split('\n')
|
@@ -335,24 +435,7 @@ Simply start chatting with the model using the interface below!
|
|
335 |
logger.info("β
Successfully set HF_TOKEN secret via API")
|
336 |
|
337 |
# Set model-specific environment variables
|
338 |
-
self.
|
339 |
-
repo_id=self.space_id,
|
340 |
-
key="HF_MODEL_ID",
|
341 |
-
value=self.model_id,
|
342 |
-
description="Model ID for the demo"
|
343 |
-
)
|
344 |
-
logger.info(f"β
Successfully set HF_MODEL_ID variable: {self.model_id}")
|
345 |
-
|
346 |
-
if self.subfolder and self.subfolder.strip():
|
347 |
-
self.api.add_space_variable(
|
348 |
-
repo_id=self.space_id,
|
349 |
-
key="MODEL_SUBFOLDER",
|
350 |
-
value=self.subfolder,
|
351 |
-
description="Model subfolder for the demo"
|
352 |
-
)
|
353 |
-
logger.info(f"β
Successfully set MODEL_SUBFOLDER variable: {self.subfolder}")
|
354 |
-
else:
|
355 |
-
logger.info("βΉοΈ No subfolder specified, using main model")
|
356 |
|
357 |
return True
|
358 |
|
@@ -375,6 +458,13 @@ Simply start chatting with the model using the interface below!
|
|
375 |
else:
|
376 |
logger.info(" MODEL_SUBFOLDER=(empty - using main model)")
|
377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
378 |
logger.info(f"\nπ§ To set secrets in your Space:")
|
379 |
logger.info(f"1. Go to your Space settings: {self.space_url}/settings")
|
380 |
logger.info("2. Navigate to the 'Repository secrets' section")
|
@@ -389,6 +479,17 @@ Simply start chatting with the model using the interface below!
|
|
389 |
else:
|
390 |
logger.info(" Name: MODEL_SUBFOLDER")
|
391 |
logger.info(" Value: (leave empty)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
logger.info("4. Save the secrets")
|
393 |
|
394 |
return True
|
@@ -471,6 +572,7 @@ def main():
|
|
471 |
parser.add_argument("--model-id", required=True, help="Model ID to deploy demo for")
|
472 |
parser.add_argument("--subfolder", default="int4", help="Model subfolder (default: int4)")
|
473 |
parser.add_argument("--space-name", help="Custom space name (optional)")
|
|
|
474 |
|
475 |
args = parser.parse_args()
|
476 |
|
@@ -479,7 +581,8 @@ def main():
|
|
479 |
hf_username=args.hf_username,
|
480 |
model_id=args.model_id,
|
481 |
subfolder=args.subfolder,
|
482 |
-
space_name=args.space_name
|
|
|
483 |
)
|
484 |
|
485 |
success = deployer.deploy()
|
|
|
38 |
"""Deploy demo space to Hugging Face Spaces"""
|
39 |
|
40 |
def __init__(self, hf_token: str, hf_username: str, model_id: str,
|
41 |
+
subfolder: str = "int4", space_name: Optional[str] = None,
|
42 |
+
demo_type: Optional[str] = None):
|
43 |
self.hf_token = hf_token
|
44 |
self.hf_username = hf_username
|
45 |
self.model_id = model_id
|
|
|
48 |
self.space_id = f"{hf_username}/{self.space_name}"
|
49 |
self.space_url = f"https://huggingface.co/spaces/{self.space_id}"
|
50 |
|
51 |
+
# Determine demo type from model_id if not provided
|
52 |
+
if demo_type is None:
|
53 |
+
demo_type = self._detect_demo_type(model_id)
|
54 |
+
|
55 |
+
# Template paths based on model type
|
56 |
+
self.demo_type = demo_type
|
57 |
+
self.template_dir = Path(__file__).parent.parent / "templates" / "spaces" / f"demo_{demo_type}"
|
58 |
self.workspace_dir = Path.cwd()
|
59 |
|
60 |
# Initialize HF API
|
|
|
64 |
self.api = None
|
65 |
logger.warning("huggingface_hub not available, using CLI fallback")
|
66 |
|
67 |
+
def _detect_demo_type(self, model_id: str) -> str:
|
68 |
+
"""Detect the appropriate demo type based on model ID"""
|
69 |
+
model_id_lower = model_id.lower()
|
70 |
+
|
71 |
+
# Check for GPT-OSS models
|
72 |
+
if "gpt-oss" in model_id_lower or "gpt_oss" in model_id_lower:
|
73 |
+
logger.info(f"Detected GPT-OSS model, using demo_gpt template")
|
74 |
+
return "gpt"
|
75 |
+
|
76 |
+
# Check for SmolLM models (default)
|
77 |
+
elif "smollm" in model_id_lower or "smol" in model_id_lower:
|
78 |
+
logger.info(f"Detected SmolLM model, using demo_smol template")
|
79 |
+
return "smol"
|
80 |
+
|
81 |
+
# Default to SmolLM for unknown models
|
82 |
+
else:
|
83 |
+
logger.info(f"Unknown model type, defaulting to demo_smol template")
|
84 |
+
return "smol"
|
85 |
+
|
86 |
+
def _generate_env_setup(self) -> str:
|
87 |
+
"""Generate environment variable setup based on demo type and model"""
|
88 |
+
if self.demo_type == "gpt":
|
89 |
+
# For GPT-OSS models, we need more sophisticated environment setup
|
90 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
91 |
+
|
92 |
+
env_setup = f"""
|
93 |
+
# Environment variables for GPT-OSS model configuration
|
94 |
+
import os
|
95 |
+
os.environ['HF_MODEL_ID'] = '{self.model_id}'
|
96 |
+
os.environ['LORA_MODEL_ID'] = '{self.model_id}'
|
97 |
+
os.environ['BASE_MODEL_ID'] = 'openai/gpt-oss-20b'
|
98 |
+
os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
|
99 |
+
os.environ['MODEL_NAME'] = '{model_name}'
|
100 |
+
|
101 |
+
"""
|
102 |
+
else:
|
103 |
+
# For SmolLM models, use simpler setup
|
104 |
+
env_setup = f"""
|
105 |
+
# Environment variables for model configuration
|
106 |
+
import os
|
107 |
+
os.environ['HF_MODEL_ID'] = '{self.model_id}'
|
108 |
+
os.environ['MODEL_SUBFOLDER'] = '{self.subfolder if self.subfolder else ""}'
|
109 |
+
os.environ['MODEL_NAME'] = '{self.model_id.split("/")[-1]}'
|
110 |
+
|
111 |
+
"""
|
112 |
+
return env_setup
|
113 |
+
|
114 |
+
def _set_model_variables(self):
|
115 |
+
"""Set model-specific environment variables in the space"""
|
116 |
+
try:
|
117 |
+
# Common variables for all models
|
118 |
+
self.api.add_space_variable(
|
119 |
+
repo_id=self.space_id,
|
120 |
+
key="HF_MODEL_ID",
|
121 |
+
value=self.model_id,
|
122 |
+
description="Model ID for the demo"
|
123 |
+
)
|
124 |
+
logger.info(f"β
Successfully set HF_MODEL_ID variable: {self.model_id}")
|
125 |
+
|
126 |
+
if self.subfolder and self.subfolder.strip():
|
127 |
+
self.api.add_space_variable(
|
128 |
+
repo_id=self.space_id,
|
129 |
+
key="MODEL_SUBFOLDER",
|
130 |
+
value=self.subfolder,
|
131 |
+
description="Model subfolder for the demo"
|
132 |
+
)
|
133 |
+
logger.info(f"β
Successfully set MODEL_SUBFOLDER variable: {self.subfolder}")
|
134 |
+
else:
|
135 |
+
logger.info("βΉοΈ No subfolder specified, using main model")
|
136 |
+
|
137 |
+
# GPT-OSS specific variables
|
138 |
+
if self.demo_type == "gpt":
|
139 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
140 |
+
|
141 |
+
self.api.add_space_variable(
|
142 |
+
repo_id=self.space_id,
|
143 |
+
key="LORA_MODEL_ID",
|
144 |
+
value=self.model_id,
|
145 |
+
description="LoRA/Fine-tuned model ID"
|
146 |
+
)
|
147 |
+
logger.info(f"β
Successfully set LORA_MODEL_ID variable: {self.model_id}")
|
148 |
+
|
149 |
+
self.api.add_space_variable(
|
150 |
+
repo_id=self.space_id,
|
151 |
+
key="BASE_MODEL_ID",
|
152 |
+
value="openai/gpt-oss-20b",
|
153 |
+
description="Base model ID for GPT-OSS"
|
154 |
+
)
|
155 |
+
logger.info("β
Successfully set BASE_MODEL_ID variable: openai/gpt-oss-20b")
|
156 |
+
|
157 |
+
self.api.add_space_variable(
|
158 |
+
repo_id=self.space_id,
|
159 |
+
key="MODEL_NAME",
|
160 |
+
value=model_name,
|
161 |
+
description="Display name for the model"
|
162 |
+
)
|
163 |
+
logger.info(f"β
Successfully set MODEL_NAME variable: {model_name}")
|
164 |
+
|
165 |
+
except Exception as e:
|
166 |
+
logger.error(f"β Failed to set model variables: {e}")
|
167 |
+
|
168 |
def validate_model_exists(self) -> bool:
|
169 |
"""Validate that the model exists on Hugging Face Hub"""
|
170 |
try:
|
|
|
294 |
content = f.read()
|
295 |
|
296 |
# Add environment variable setup at the top
|
297 |
+
env_setup = self._generate_env_setup()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
298 |
|
299 |
# Insert after imports
|
300 |
lines = content.split('\n')
|
|
|
435 |
logger.info("β
Successfully set HF_TOKEN secret via API")
|
436 |
|
437 |
# Set model-specific environment variables
|
438 |
+
self._set_model_variables()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
|
440 |
return True
|
441 |
|
|
|
458 |
else:
|
459 |
logger.info(" MODEL_SUBFOLDER=(empty - using main model)")
|
460 |
|
461 |
+
# GPT-OSS specific variables
|
462 |
+
if self.demo_type == "gpt":
|
463 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
464 |
+
logger.info(f" LORA_MODEL_ID={self.model_id}")
|
465 |
+
logger.info(f" BASE_MODEL_ID=openai/gpt-oss-20b")
|
466 |
+
logger.info(f" MODEL_NAME={model_name}")
|
467 |
+
|
468 |
logger.info(f"\nπ§ To set secrets in your Space:")
|
469 |
logger.info(f"1. Go to your Space settings: {self.space_url}/settings")
|
470 |
logger.info("2. Navigate to the 'Repository secrets' section")
|
|
|
479 |
else:
|
480 |
logger.info(" Name: MODEL_SUBFOLDER")
|
481 |
logger.info(" Value: (leave empty)")
|
482 |
+
|
483 |
+
# GPT-OSS specific variables
|
484 |
+
if self.demo_type == "gpt":
|
485 |
+
model_name = self.model_id.split("/")[-1] if "/" in self.model_id else self.model_id
|
486 |
+
logger.info(f" Name: LORA_MODEL_ID")
|
487 |
+
logger.info(f" Value: {self.model_id}")
|
488 |
+
logger.info(f" Name: BASE_MODEL_ID")
|
489 |
+
logger.info(f" Value: openai/gpt-oss-20b")
|
490 |
+
logger.info(f" Name: MODEL_NAME")
|
491 |
+
logger.info(f" Value: {model_name}")
|
492 |
+
|
493 |
logger.info("4. Save the secrets")
|
494 |
|
495 |
return True
|
|
|
572 |
parser.add_argument("--model-id", required=True, help="Model ID to deploy demo for")
|
573 |
parser.add_argument("--subfolder", default="int4", help="Model subfolder (default: int4)")
|
574 |
parser.add_argument("--space-name", help="Custom space name (optional)")
|
575 |
+
parser.add_argument("--demo-type", choices=["smol", "gpt"], help="Demo type: 'smol' for SmolLM, 'gpt' for GPT-OSS (auto-detected if not specified)")
|
576 |
|
577 |
args = parser.parse_args()
|
578 |
|
|
|
581 |
hf_username=args.hf_username,
|
582 |
model_id=args.model_id,
|
583 |
subfolder=args.subfolder,
|
584 |
+
space_name=args.space_name,
|
585 |
+
demo_type=args.demo_type
|
586 |
)
|
587 |
|
588 |
success = deployer.deploy()
|
scripts/model_tonic/push_gpt_oss_to_huggingface.py
CHANGED
@@ -169,8 +169,8 @@ If you use this model in your research, please cite:
|
|
169 |
author = {{{author_name}}},
|
170 |
title = {{{model_name}}},
|
171 |
year = {{{datetime.now().year}}},
|
172 |
-
publisher = {Hugging Face},
|
173 |
-
journal = {Hugging Face repository},
|
174 |
howpublished = {{\\url{{https://huggingface.co/{model_name}}}}}
|
175 |
}}
|
176 |
```
|
|
|
169 |
author = {{{author_name}}},
|
170 |
title = {{{model_name}}},
|
171 |
year = {{{datetime.now().year}}},
|
172 |
+
publisher = {{Hugging Face}},
|
173 |
+
journal = {{Hugging Face repository}},
|
174 |
howpublished = {{\\url{{https://huggingface.co/{model_name}}}}}
|
175 |
}}
|
176 |
```
|
scripts/trackio_tonic/configure_trackio.py
CHANGED
@@ -79,13 +79,11 @@ def configure_trackio():
|
|
79 |
print("π§ Trackio Configuration")
|
80 |
print("=" * 40)
|
81 |
|
82 |
-
# Get HF
|
83 |
-
|
84 |
-
hf_read_token = os.environ.get('HF_READ_TOKEN')
|
85 |
-
hf_token = os.environ.get('HF_TOKEN') # Legacy support
|
86 |
|
87 |
-
# Use
|
88 |
-
active_token =
|
89 |
|
90 |
if active_token:
|
91 |
username = get_username_from_token(active_token)
|
@@ -102,9 +100,7 @@ def configure_trackio():
|
|
102 |
|
103 |
# Current configuration
|
104 |
current_config = {
|
105 |
-
'
|
106 |
-
'HF_READ_TOKEN': hf_read_token or 'Not set',
|
107 |
-
'HF_TOKEN': hf_token or 'Not set', # Legacy
|
108 |
'TRACKIO_DATASET_REPO': dataset_repo,
|
109 |
'SPACE_ID': os.environ.get('SPACE_ID', 'Not set'),
|
110 |
'TRACKIO_URL': os.environ.get('TRACKIO_URL', 'Not set')
|
@@ -116,12 +112,10 @@ def configure_trackio():
|
|
116 |
print(f" {status} {key}: {value}")
|
117 |
|
118 |
print("\nπ― Configuration Options:")
|
119 |
-
print("1. Set
|
120 |
-
print("2. Set
|
121 |
-
print("3. Set
|
122 |
-
print("4. Set
|
123 |
-
print("5. Set SPACE_ID - HF Space ID (auto-detected)")
|
124 |
-
print("6. Set TRACKIO_URL - Trackio Space URL (auto-detected)")
|
125 |
|
126 |
# Check if running on HF Spaces
|
127 |
if os.environ.get('SPACE_ID'):
|
@@ -131,37 +125,21 @@ def configure_trackio():
|
|
131 |
# Validate configuration
|
132 |
print("\nπ Configuration Validation:")
|
133 |
|
134 |
-
# Check
|
135 |
-
if current_config['HF_WRITE_TOKEN'] != 'Not set':
|
136 |
-
print("β
HF_WRITE_TOKEN is set")
|
137 |
-
print(" This allows training operations and repository creation")
|
138 |
-
else:
|
139 |
-
print("β HF_WRITE_TOKEN is not set")
|
140 |
-
print(" Please set HF_WRITE_TOKEN for training operations")
|
141 |
-
print(" Get your token from: https://huggingface.co/settings/tokens")
|
142 |
-
|
143 |
-
# Check HF_READ_TOKEN
|
144 |
-
if current_config['HF_READ_TOKEN'] != 'Not set':
|
145 |
-
print("β
HF_READ_TOKEN is set")
|
146 |
-
print(" This will be used for Trackio Space security")
|
147 |
-
else:
|
148 |
-
print("β HF_READ_TOKEN is not set")
|
149 |
-
print(" Please set HF_READ_TOKEN for Space security")
|
150 |
-
print(" Get your token from: https://huggingface.co/settings/tokens")
|
151 |
-
|
152 |
-
# Check legacy HF_TOKEN
|
153 |
if current_config['HF_TOKEN'] != 'Not set':
|
154 |
-
print("β
HF_TOKEN
|
155 |
-
print(" This
|
|
|
156 |
else:
|
157 |
-
print("
|
158 |
-
print("
|
|
|
159 |
|
160 |
# Check dataset repository
|
161 |
print(f"π Dataset Repository: {dataset_repo}")
|
162 |
|
163 |
# Test dataset access if token is available
|
164 |
-
test_token = current_config['
|
165 |
if test_token != 'Not set':
|
166 |
print("\nπ§ͺ Testing Dataset Access...")
|
167 |
try:
|
@@ -216,15 +194,13 @@ def configure_trackio():
|
|
216 |
# Generate configuration file
|
217 |
config_file = "trackio_config.json"
|
218 |
config_data = {
|
219 |
-
'
|
220 |
-
'hf_read_token': current_config['HF_READ_TOKEN'],
|
221 |
-
'hf_token': current_config['HF_TOKEN'], # Legacy
|
222 |
'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
|
223 |
'space_id': current_config['SPACE_ID'],
|
224 |
'trackio_url': current_config['TRACKIO_URL'],
|
225 |
'username': username,
|
226 |
'last_updated': datetime.now().isoformat(),
|
227 |
-
'notes': 'Trackio configuration -
|
228 |
}
|
229 |
|
230 |
with open(config_file, 'w') as f:
|
@@ -235,16 +211,14 @@ def configure_trackio():
|
|
235 |
# Show environment variable commands
|
236 |
print("\nπ Environment Variables for HF Space:")
|
237 |
print("=" * 50)
|
238 |
-
print(f"
|
239 |
-
print(f"HF_READ_TOKEN={current_config['HF_READ_TOKEN']}")
|
240 |
-
print(f"HF_TOKEN={current_config['HF_TOKEN']}") # Legacy
|
241 |
print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
|
242 |
if current_config['TRACKIO_URL'] != 'Not set':
|
243 |
print(f"TRACKIO_URL={current_config['TRACKIO_URL']}")
|
244 |
|
245 |
print("\nπ― Next Steps:")
|
246 |
-
print("1.
|
247 |
-
print("2.
|
248 |
print("3. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
|
249 |
print("4. Deploy your updated app.py to the Space")
|
250 |
print("5. Run setup_hf_dataset.py if you haven't created the dataset yet")
|
|
|
79 |
print("π§ Trackio Configuration")
|
80 |
print("=" * 40)
|
81 |
|
82 |
+
# Get HF token (single token approach)
|
83 |
+
hf_token = os.environ.get('HF_TOKEN')
|
|
|
|
|
84 |
|
85 |
+
# Use the single HF_TOKEN
|
86 |
+
active_token = hf_token
|
87 |
|
88 |
if active_token:
|
89 |
username = get_username_from_token(active_token)
|
|
|
100 |
|
101 |
# Current configuration
|
102 |
current_config = {
|
103 |
+
'HF_TOKEN': hf_token or 'Not set',
|
|
|
|
|
104 |
'TRACKIO_DATASET_REPO': dataset_repo,
|
105 |
'SPACE_ID': os.environ.get('SPACE_ID', 'Not set'),
|
106 |
'TRACKIO_URL': os.environ.get('TRACKIO_URL', 'Not set')
|
|
|
112 |
print(f" {status} {key}: {value}")
|
113 |
|
114 |
print("\nπ― Configuration Options:")
|
115 |
+
print("1. Set HF_TOKEN - Main token (starts as write, switches to read after training)")
|
116 |
+
print("2. Set TRACKIO_DATASET_REPO - Dataset repository (optional)")
|
117 |
+
print("3. Set SPACE_ID - HF Space ID (auto-detected)")
|
118 |
+
print("4. Set TRACKIO_URL - Trackio Space URL (auto-detected)")
|
|
|
|
|
119 |
|
120 |
# Check if running on HF Spaces
|
121 |
if os.environ.get('SPACE_ID'):
|
|
|
125 |
# Validate configuration
|
126 |
print("\nπ Configuration Validation:")
|
127 |
|
128 |
+
# Check HF_TOKEN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
if current_config['HF_TOKEN'] != 'Not set':
|
130 |
+
print("β
HF_TOKEN is set")
|
131 |
+
print(" This allows training operations and dataset access")
|
132 |
+
print(" Note: Token will be automatically switched from write to read after training")
|
133 |
else:
|
134 |
+
print("β HF_TOKEN is not set")
|
135 |
+
print(" Please set HF_TOKEN for training operations")
|
136 |
+
print(" Get your token from: https://huggingface.co/settings/tokens")
|
137 |
|
138 |
# Check dataset repository
|
139 |
print(f"π Dataset Repository: {dataset_repo}")
|
140 |
|
141 |
# Test dataset access if token is available
|
142 |
+
test_token = current_config['HF_TOKEN']
|
143 |
if test_token != 'Not set':
|
144 |
print("\nπ§ͺ Testing Dataset Access...")
|
145 |
try:
|
|
|
194 |
# Generate configuration file
|
195 |
config_file = "trackio_config.json"
|
196 |
config_data = {
|
197 |
+
'hf_token': current_config['HF_TOKEN'],
|
|
|
|
|
198 |
'dataset_repo': current_config['TRACKIO_DATASET_REPO'],
|
199 |
'space_id': current_config['SPACE_ID'],
|
200 |
'trackio_url': current_config['TRACKIO_URL'],
|
201 |
'username': username,
|
202 |
'last_updated': datetime.now().isoformat(),
|
203 |
+
'notes': 'Trackio configuration - HF_TOKEN starts as write token, switches to read token after training'
|
204 |
}
|
205 |
|
206 |
with open(config_file, 'w') as f:
|
|
|
211 |
# Show environment variable commands
|
212 |
print("\nπ Environment Variables for HF Space:")
|
213 |
print("=" * 50)
|
214 |
+
print(f"HF_TOKEN={current_config['HF_TOKEN']}")
|
|
|
|
|
215 |
print(f"TRACKIO_DATASET_REPO={current_config['TRACKIO_DATASET_REPO']}")
|
216 |
if current_config['TRACKIO_URL'] != 'Not set':
|
217 |
print(f"TRACKIO_URL={current_config['TRACKIO_URL']}")
|
218 |
|
219 |
print("\nπ― Next Steps:")
|
220 |
+
print("1. HF_TOKEN will be automatically set during deployment (starts as write token)")
|
221 |
+
print("2. HF_TOKEN will be automatically switched to read token after training")
|
222 |
print("3. Optionally set TRACKIO_DATASET_REPO to use a different dataset")
|
223 |
print("4. Deploy your updated app.py to the Space")
|
224 |
print("5. Run setup_hf_dataset.py if you haven't created the dataset yet")
|
scripts/trackio_tonic/deploy_trackio_space.py
CHANGED
@@ -196,16 +196,16 @@ class TrackioSpaceDeployer:
|
|
196 |
|
197 |
# Get the project root directory (3 levels up from this script)
|
198 |
project_root = Path(__file__).parent.parent.parent
|
199 |
-
templates_dir = project_root / "templates" / "spaces"
|
200 |
|
201 |
-
# Files to copy from templates/spaces
|
202 |
files_to_copy = [
|
203 |
"app.py",
|
204 |
"requirements.txt",
|
205 |
"README.md"
|
206 |
]
|
207 |
|
208 |
-
# Copy files from templates/spaces to temp directory
|
209 |
copied_files = []
|
210 |
for file_name in files_to_copy:
|
211 |
source_path = templates_dir / file_name
|
@@ -334,36 +334,16 @@ class TrackioSpaceDeployer:
|
|
334 |
|
335 |
repo_id = f"{self.username}/{self.space_name}"
|
336 |
|
337 |
-
#
|
338 |
-
|
339 |
-
hf_read_token = os.getenv('HF_READ_TOKEN', self.token)
|
340 |
-
hf_token = os.getenv('HF_TOKEN', self.token) # Legacy
|
341 |
|
342 |
-
# Set the
|
343 |
try:
|
344 |
-
self.api.add_space_secret(
|
345 |
-
repo_id=repo_id,
|
346 |
-
key="HF_WRITE_TOKEN",
|
347 |
-
value=hf_write_token,
|
348 |
-
description="Hugging Face write token for training operations"
|
349 |
-
)
|
350 |
-
print("β
Successfully set HF_WRITE_TOKEN secret via API")
|
351 |
-
|
352 |
-
# Set the HF_READ_TOKEN secret for the space using the API
|
353 |
-
self.api.add_space_secret(
|
354 |
-
repo_id=repo_id,
|
355 |
-
key="HF_READ_TOKEN",
|
356 |
-
value=hf_read_token,
|
357 |
-
description="Hugging Face read token for security"
|
358 |
-
)
|
359 |
-
print("β
Successfully set HF_READ_TOKEN secret via API")
|
360 |
-
|
361 |
-
# Set legacy HF_TOKEN secret for backward compatibility
|
362 |
self.api.add_space_secret(
|
363 |
repo_id=repo_id,
|
364 |
key="HF_TOKEN",
|
365 |
value=hf_token,
|
366 |
-
description="Hugging Face token for dataset access (
|
367 |
)
|
368 |
print("β
Successfully set HF_TOKEN secret via API")
|
369 |
|
@@ -401,13 +381,9 @@ class TrackioSpaceDeployer:
|
|
401 |
"""Fallback method for manual secret setup"""
|
402 |
print("π Manual Space Secrets Configuration:")
|
403 |
|
404 |
-
#
|
405 |
-
|
406 |
-
hf_read_token = os.getenv('HF_READ_TOKEN', self.token)
|
407 |
-
hf_token = os.getenv('HF_TOKEN', self.token) # Legacy
|
408 |
|
409 |
-
print(f" HF_WRITE_TOKEN={hf_write_token}")
|
410 |
-
print(f" HF_READ_TOKEN={hf_read_token}")
|
411 |
print(f" HF_TOKEN={hf_token}")
|
412 |
|
413 |
dataset_repo = self.dataset_repo or f"{self.username}/trackio-experiments"
|
@@ -415,13 +391,9 @@ class TrackioSpaceDeployer:
|
|
415 |
print(f" TRACKIO_URL={self.space_url}")
|
416 |
|
417 |
print("\nπ§ To set secrets in your Space:")
|
418 |
-
print("1. Go to your Space settings: {self.space_url}/settings")
|
419 |
print("2. Navigate to the 'Repository secrets' section")
|
420 |
print("3. Add the following secrets:")
|
421 |
-
print(f" Name: HF_WRITE_TOKEN")
|
422 |
-
print(f" Value: {hf_write_token}")
|
423 |
-
print(f" Name: HF_READ_TOKEN")
|
424 |
-
print(f" Value: {hf_read_token}")
|
425 |
print(f" Name: HF_TOKEN")
|
426 |
print(f" Value: {hf_token}")
|
427 |
print(f" Name: TRACKIO_DATASET_REPO")
|
@@ -429,6 +401,7 @@ class TrackioSpaceDeployer:
|
|
429 |
print(f" Name: TRACKIO_URL")
|
430 |
print(f" Value: {self.space_url}")
|
431 |
print("4. Save the secrets")
|
|
|
432 |
|
433 |
return True
|
434 |
|
|
|
196 |
|
197 |
# Get the project root directory (3 levels up from this script)
|
198 |
project_root = Path(__file__).parent.parent.parent
|
199 |
+
templates_dir = project_root / "templates" / "spaces" / "trackio"
|
200 |
|
201 |
+
# Files to copy from templates/spaces/trackio
|
202 |
files_to_copy = [
|
203 |
"app.py",
|
204 |
"requirements.txt",
|
205 |
"README.md"
|
206 |
]
|
207 |
|
208 |
+
# Copy files from templates/spaces/trackio to temp directory
|
209 |
copied_files = []
|
210 |
for file_name in files_to_copy:
|
211 |
source_path = templates_dir / file_name
|
|
|
334 |
|
335 |
repo_id = f"{self.username}/{self.space_name}"
|
336 |
|
337 |
+
# Use the provided token as HF_TOKEN (starts as write token, will be switched to read token later)
|
338 |
+
hf_token = self.token
|
|
|
|
|
339 |
|
340 |
+
# Set the HF_TOKEN secret for the space using the API
|
341 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
342 |
self.api.add_space_secret(
|
343 |
repo_id=repo_id,
|
344 |
key="HF_TOKEN",
|
345 |
value=hf_token,
|
346 |
+
description="Hugging Face token for dataset access (starts as write, switches to read)"
|
347 |
)
|
348 |
print("β
Successfully set HF_TOKEN secret via API")
|
349 |
|
|
|
381 |
"""Fallback method for manual secret setup"""
|
382 |
print("π Manual Space Secrets Configuration:")
|
383 |
|
384 |
+
# Use the provided token as HF_TOKEN
|
385 |
+
hf_token = self.token
|
|
|
|
|
386 |
|
|
|
|
|
387 |
print(f" HF_TOKEN={hf_token}")
|
388 |
|
389 |
dataset_repo = self.dataset_repo or f"{self.username}/trackio-experiments"
|
|
|
391 |
print(f" TRACKIO_URL={self.space_url}")
|
392 |
|
393 |
print("\nπ§ To set secrets in your Space:")
|
394 |
+
print(f"1. Go to your Space settings: {self.space_url}/settings")
|
395 |
print("2. Navigate to the 'Repository secrets' section")
|
396 |
print("3. Add the following secrets:")
|
|
|
|
|
|
|
|
|
397 |
print(f" Name: HF_TOKEN")
|
398 |
print(f" Value: {hf_token}")
|
399 |
print(f" Name: TRACKIO_DATASET_REPO")
|
|
|
401 |
print(f" Name: TRACKIO_URL")
|
402 |
print(f" Value: {self.space_url}")
|
403 |
print("4. Save the secrets")
|
404 |
+
print("\nNote: HF_TOKEN starts as write token and will be switched to read token after training")
|
405 |
|
406 |
return True
|
407 |
|
scripts/trackio_tonic/switch_to_read_token.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
-
Switch Trackio Space from Write Token to Read Token
|
4 |
|
5 |
This script switches the HF_TOKEN secret in a Trackio Space from a write token
|
6 |
to a read token after the experiment is complete, for security purposes.
|
|
|
7 |
"""
|
8 |
|
9 |
import os
|
@@ -61,8 +62,8 @@ def switch_space_token(space_id: str, read_token: str, write_token: str) -> bool
|
|
61 |
|
62 |
Args:
|
63 |
space_id (str): The space ID (username/space-name)
|
64 |
-
read_token (str): The read token to set
|
65 |
-
write_token (str): The write token (for
|
66 |
|
67 |
Returns:
|
68 |
bool: True if successful, False otherwise
|
@@ -93,23 +94,24 @@ def switch_space_token(space_id: str, read_token: str, write_token: str) -> bool
|
|
93 |
# Use the write token to update the space (since we need write access)
|
94 |
api = HfApi(token=write_token)
|
95 |
|
96 |
-
# Update the HF_TOKEN secret in the space
|
97 |
try:
|
98 |
api.add_space_secret(
|
99 |
repo_id=space_id,
|
100 |
key="HF_TOKEN",
|
101 |
value=read_token,
|
102 |
-
description="Hugging Face
|
103 |
)
|
104 |
-
print(f"β
Successfully switched HF_TOKEN to read token in space: {space_id}")
|
|
|
105 |
return True
|
106 |
|
107 |
except Exception as e:
|
108 |
-
print(f"β Failed to update
|
109 |
return False
|
110 |
|
111 |
except Exception as e:
|
112 |
-
print(f"β Error switching
|
113 |
return False
|
114 |
|
115 |
def main():
|
@@ -137,12 +139,13 @@ def main():
|
|
137 |
success = switch_space_token(space_id, read_token, write_token)
|
138 |
|
139 |
if success:
|
140 |
-
print("\nβ
|
141 |
print(f"π Space: {space_id}")
|
142 |
-
print("π HF_TOKEN now uses read-only permissions")
|
143 |
print("π‘ The space can still read datasets but cannot write to repositories")
|
|
|
144 |
else:
|
145 |
-
print("\nβ
|
146 |
print("Please check your tokens and try again.")
|
147 |
sys.exit(1)
|
148 |
|
|
|
1 |
#!/usr/bin/env python3
|
2 |
"""
|
3 |
+
Switch Trackio Space HF_TOKEN from Write Token to Read Token
|
4 |
|
5 |
This script switches the HF_TOKEN secret in a Trackio Space from a write token
|
6 |
to a read token after the experiment is complete, for security purposes.
|
7 |
+
The space uses only HF_TOKEN, which starts as write token and gets switched to read token.
|
8 |
"""
|
9 |
|
10 |
import os
|
|
|
62 |
|
63 |
Args:
|
64 |
space_id (str): The space ID (username/space-name)
|
65 |
+
read_token (str): The read token to set as new HF_TOKEN
|
66 |
+
write_token (str): The write token (for authentication to update the space)
|
67 |
|
68 |
Returns:
|
69 |
bool: True if successful, False otherwise
|
|
|
94 |
# Use the write token to update the space (since we need write access)
|
95 |
api = HfApi(token=write_token)
|
96 |
|
97 |
+
# Update the HF_TOKEN secret in the space from write token to read token
|
98 |
try:
|
99 |
api.add_space_secret(
|
100 |
repo_id=space_id,
|
101 |
key="HF_TOKEN",
|
102 |
value=read_token,
|
103 |
+
description="Hugging Face token for dataset access (switched from write to read for security)"
|
104 |
)
|
105 |
+
print(f"β
Successfully switched HF_TOKEN from write to read token in space: {space_id}")
|
106 |
+
print(f"π Space now uses read-only permissions for enhanced security")
|
107 |
return True
|
108 |
|
109 |
except Exception as e:
|
110 |
+
print(f"β Failed to update HF_TOKEN secret: {e}")
|
111 |
return False
|
112 |
|
113 |
except Exception as e:
|
114 |
+
print(f"β Error switching HF_TOKEN: {e}")
|
115 |
return False
|
116 |
|
117 |
def main():
|
|
|
139 |
success = switch_space_token(space_id, read_token, write_token)
|
140 |
|
141 |
if success:
|
142 |
+
print("\nβ
HF_TOKEN switch completed successfully!")
|
143 |
print(f"π Space: {space_id}")
|
144 |
+
print("π HF_TOKEN now uses read-only permissions for enhanced security")
|
145 |
print("π‘ The space can still read datasets but cannot write to repositories")
|
146 |
+
print("π― Training is complete - space is now secure for monitoring")
|
147 |
else:
|
148 |
+
print("\nβ HF_TOKEN switch failed!")
|
149 |
print("Please check your tokens and try again.")
|
150 |
sys.exit(1)
|
151 |
|
src/dataset_utils.py
ADDED
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Dataset utilities for Trackio experiment data management
|
4 |
+
Provides functions for safe dataset operations with data preservation
|
5 |
+
"""
|
6 |
+
|
7 |
+
import json
|
8 |
+
import logging
|
9 |
+
from datetime import datetime
|
10 |
+
from typing import Dict, Any, List, Optional, Union
|
11 |
+
from datasets import Dataset, load_dataset
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
class TrackioDatasetManager:
    """
    Manager class for Trackio experiment datasets with data preservation.

    This class ensures that existing experiment data is always preserved
    when adding new experiments or updating existing ones: every write path
    first loads the full set of existing experiments and re-pushes them
    together with the new/updated entries.
    """

    def __init__(self, dataset_repo: str, hf_token: str):
        """
        Initialize the dataset manager.

        Args:
            dataset_repo (str): HF dataset repository ID (e.g., "username/dataset-name")
            hf_token (str): Hugging Face token for authentication

        Raises:
            ValueError: If the repository ID is not in "owner/name" form.
        """
        self.dataset_repo = dataset_repo
        self.hf_token = hf_token
        self._validate_repo_format()

    def _validate_repo_format(self):
        """Validate that the dataset repository ID looks like "owner/name"."""
        if not self.dataset_repo or '/' not in self.dataset_repo:
            raise ValueError(f"Invalid dataset repository format: {self.dataset_repo}")

    def check_dataset_exists(self) -> bool:
        """
        Check if the dataset repository exists and is accessible.

        Note: this performs a full ``load_dataset`` call, so it downloads the
        dataset; prefer ``load_existing_experiments`` when you also need the data.

        Returns:
            bool: True if dataset exists and is accessible, False otherwise
        """
        try:
            load_dataset(self.dataset_repo, token=self.hf_token)
            logger.info(f"✅ Dataset {self.dataset_repo} exists and is accessible")
            return True
        except Exception as e:
            logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
            return False

    def load_existing_experiments(self) -> List[Dict[str, Any]]:
        """
        Load all existing experiments from the dataset.

        Returns:
            List[Dict[str, Any]]: List of valid existing experiment dictionaries
            (invalid records are skipped with a warning). Empty list when the
            dataset is missing, inaccessible, or has no 'train' split.
        """
        try:
            # Load once instead of probing with check_dataset_exists() first —
            # that helper performs its own load_dataset call, so probing would
            # download the entire dataset twice per read.
            try:
                dataset = load_dataset(self.dataset_repo, token=self.hf_token)
            except Exception as e:
                logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
                return []

            if 'train' not in dataset:
                logger.info("📊 No 'train' split found in dataset")
                return []

            experiments = list(dataset['train'])
            logger.info(f"📥 Loaded {len(experiments)} existing experiments")

            # Drop records that don't match the expected schema so a single
            # corrupted row cannot poison later save/upsert round-trips.
            valid_experiments = []
            for exp in experiments:
                if self._validate_experiment_structure(exp):
                    valid_experiments.append(exp)
                else:
                    logger.warning(f"⚠️ Skipping invalid experiment: {exp.get('experiment_id', 'unknown')}")

            logger.info(f"📋 {len(valid_experiments)} valid experiments loaded")
            return valid_experiments

        except Exception as e:
            logger.error(f"❌ Failed to load existing experiments: {e}")
            return []

    def _validate_experiment_structure(self, experiment: Dict[str, Any]) -> bool:
        """
        Validate that an experiment has the required structure.

        Args:
            experiment (Dict[str, Any]): Experiment dictionary to validate

        Returns:
            bool: True if all required fields are present and every
            string-valued JSON field parses as JSON.
        """
        required_fields = [
            'experiment_id', 'name', 'description', 'created_at',
            'status', 'metrics', 'parameters', 'artifacts', 'logs'
        ]

        for field in required_fields:
            if field not in experiment:
                logger.warning(f"⚠️ Missing required field '{field}' in experiment")
                return False

        # These fields may be stored as JSON-encoded strings; verify they parse.
        json_fields = ['metrics', 'parameters', 'artifacts', 'logs']
        for field in json_fields:
            if isinstance(experiment[field], str):
                try:
                    json.loads(experiment[field])
                except json.JSONDecodeError:
                    logger.warning(f"⚠️ Invalid JSON in field '{field}' for experiment {experiment.get('experiment_id')}")
                    return False

        return True

    def save_experiments(self, experiments: List[Dict[str, Any]], commit_message: Optional[str] = None) -> bool:
        """
        Save a list of experiments to the dataset, preserving data integrity.

        All experiments are validated before anything is pushed; a single
        invalid record aborts the whole save so no partial state is written.

        Args:
            experiments (List[Dict[str, Any]]): List of experiment dictionaries
            commit_message (Optional[str]): Custom commit message

        Returns:
            bool: True if save was successful, False otherwise
        """
        try:
            if not experiments:
                logger.warning("⚠️ No experiments to save")
                return False

            # Validate all experiments before saving
            valid_experiments = []
            for exp in experiments:
                if self._validate_experiment_structure(exp):
                    # Ensure last_updated is set
                    if 'last_updated' not in exp:
                        exp['last_updated'] = datetime.now().isoformat()
                    valid_experiments.append(exp)
                else:
                    logger.error(f"❌ Invalid experiment structure: {exp.get('experiment_id', 'unknown')}")
                    return False

            # Create dataset
            dataset = Dataset.from_list(valid_experiments)

            # Generate commit message if not provided
            if not commit_message:
                commit_message = f"Update dataset with {len(valid_experiments)} experiments ({datetime.now().isoformat()})"

            # Push to hub (private=True only applies on first creation of the repo)
            dataset.push_to_hub(
                self.dataset_repo,
                token=self.hf_token,
                private=True,
                commit_message=commit_message
            )

            logger.info(f"✅ Successfully saved {len(valid_experiments)} experiments to {self.dataset_repo}")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to save experiments to dataset: {e}")
            return False

    def upsert_experiment(self, experiment: Dict[str, Any]) -> bool:
        """
        Insert a new experiment or update an existing one, preserving all other data.

        Args:
            experiment (Dict[str, Any]): Experiment dictionary to upsert

        Returns:
            bool: True if operation was successful, False otherwise
        """
        try:
            # Validate the experiment structure
            if not self._validate_experiment_structure(experiment):
                logger.error(f"❌ Invalid experiment structure for {experiment.get('experiment_id', 'unknown')}")
                return False

            # Load existing experiments
            existing_experiments = self.load_existing_experiments()

            # Find if experiment already exists
            experiment_id = experiment['experiment_id']
            experiment_found = False
            updated_experiments = []

            for existing_exp in existing_experiments:
                if existing_exp.get('experiment_id') == experiment_id:
                    # Update existing experiment
                    logger.info(f"🔄 Updating existing experiment: {experiment_id}")
                    experiment['last_updated'] = datetime.now().isoformat()
                    updated_experiments.append(experiment)
                    experiment_found = True
                else:
                    # Preserve existing experiment
                    updated_experiments.append(existing_exp)

            # If experiment doesn't exist, add it
            if not experiment_found:
                logger.info(f"➕ Adding new experiment: {experiment_id}")
                experiment['last_updated'] = datetime.now().isoformat()
                updated_experiments.append(experiment)

            # Save all experiments
            commit_message = f"{'Update' if experiment_found else 'Add'} experiment {experiment_id} (preserving {len(existing_experiments)} existing experiments)"

            return self.save_experiments(updated_experiments, commit_message)

        except Exception as e:
            logger.error(f"❌ Failed to upsert experiment: {e}")
            return False

    def get_experiment_by_id(self, experiment_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve a specific experiment by its ID.

        Args:
            experiment_id (str): The experiment ID to search for

        Returns:
            Optional[Dict[str, Any]]: The experiment dictionary if found, None otherwise
        """
        try:
            experiments = self.load_existing_experiments()

            for exp in experiments:
                if exp.get('experiment_id') == experiment_id:
                    logger.info(f"✅ Found experiment: {experiment_id}")
                    return exp

            logger.info(f"🔍 Experiment not found: {experiment_id}")
            return None

        except Exception as e:
            logger.error(f"❌ Failed to get experiment {experiment_id}: {e}")
            return None

    def list_experiments(self, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        List all experiments, optionally filtered by status.

        Args:
            status_filter (Optional[str]): Filter by experiment status (running, completed, failed, paused)

        Returns:
            List[Dict[str, Any]]: List of experiments matching the filter
        """
        try:
            experiments = self.load_existing_experiments()

            if status_filter:
                filtered_experiments = [exp for exp in experiments if exp.get('status') == status_filter]
                logger.info(f"📋 Found {len(filtered_experiments)} experiments with status '{status_filter}'")
                return filtered_experiments

            logger.info(f"📋 Found {len(experiments)} total experiments")
            return experiments

        except Exception as e:
            logger.error(f"❌ Failed to list experiments: {e}")
            return []

    def backup_dataset(self, backup_suffix: Optional[str] = None) -> str:
        """
        Create a backup of the current dataset.

        Args:
            backup_suffix (Optional[str]): Optional suffix for backup repo name
                (defaults to a YYYYMMDD_HHMMSS timestamp)

        Returns:
            str: Backup repository name if successful, empty string otherwise
        """
        try:
            if not backup_suffix:
                backup_suffix = datetime.now().strftime('%Y%m%d_%H%M%S')

            backup_repo = f"{self.dataset_repo}-backup-{backup_suffix}"

            # Load current experiments
            experiments = self.load_existing_experiments()

            if not experiments:
                logger.warning("⚠️ No experiments to backup")
                return ""

            # Create backup dataset manager pointed at the backup repo
            backup_manager = TrackioDatasetManager(backup_repo, self.hf_token)

            # Save to backup
            success = backup_manager.save_experiments(
                experiments,
                f"Backup of {self.dataset_repo} created on {datetime.now().isoformat()}"
            )

            if success:
                logger.info(f"✅ Backup created: {backup_repo}")
                return backup_repo
            else:
                logger.error("❌ Failed to create backup")
                return ""

        except Exception as e:
            logger.error(f"❌ Failed to create backup: {e}")
            return ""
|
315 |
+
|
316 |
+
|
317 |
+
def create_dataset_manager(dataset_repo: str, hf_token: str) -> TrackioDatasetManager:
    """
    Build and return a ready-to-use :class:`TrackioDatasetManager`.

    Args:
        dataset_repo (str): HF dataset repository ID
        hf_token (str): Hugging Face token

    Returns:
        TrackioDatasetManager: Configured dataset manager instance
    """
    manager = TrackioDatasetManager(dataset_repo, hf_token)
    return manager
|
src/monitoring.py
CHANGED
@@ -16,6 +16,7 @@ try:
|
|
16 |
from scripts.trackio_tonic.trackio_api_client import TrackioAPIClient
|
17 |
TRACKIO_AVAILABLE = True
|
18 |
except ImportError:
|
|
|
19 |
TRACKIO_AVAILABLE = False
|
20 |
print("Warning: Trackio API client not available. Install with: pip install requests")
|
21 |
|
@@ -87,20 +88,33 @@ class SmolLM3Monitor:
|
|
87 |
try:
|
88 |
from datasets import Dataset
|
89 |
from huggingface_hub import HfApi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
self.hf_dataset_client = {
|
92 |
'Dataset': Dataset,
|
93 |
'HfApi': HfApi,
|
94 |
'api': HfApi(token=self.hf_token)
|
95 |
}
|
96 |
-
|
|
|
|
|
|
|
97 |
|
98 |
except ImportError:
|
99 |
logger.warning("β οΈ datasets or huggingface-hub not available. Install with: pip install datasets huggingface-hub")
|
100 |
self.hf_dataset_client = None
|
|
|
101 |
except Exception as e:
|
102 |
logger.error("Failed to initialize HF Datasets client: %s", e)
|
103 |
self.hf_dataset_client = None
|
|
|
104 |
|
105 |
def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
|
106 |
"""Setup Trackio API client"""
|
@@ -184,55 +198,38 @@ class SmolLM3Monitor:
|
|
184 |
self.experiment_id = f"exp_{timestamp}"
|
185 |
|
186 |
def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
|
187 |
-
"""Save experiment data to HF Dataset"""
|
188 |
-
if not self.
|
189 |
-
logger.warning("β οΈ
|
190 |
return False
|
191 |
|
192 |
try:
|
193 |
-
#
|
194 |
-
|
195 |
-
logger.error("β Dataset repository is empty")
|
196 |
-
return False
|
197 |
-
|
198 |
-
# Validate dataset repository format
|
199 |
-
if '/' not in self.dataset_repo:
|
200 |
-
logger.error(f"β Invalid dataset repository format: {self.dataset_repo}")
|
201 |
-
return False
|
202 |
-
|
203 |
-
Dataset = self.hf_dataset_client['Dataset']
|
204 |
-
api = self.hf_dataset_client['api']
|
205 |
-
|
206 |
-
# Create dataset from experiment data with correct structure
|
207 |
-
# Match the structure used in setup_hf_dataset.py
|
208 |
-
dataset_data = [{
|
209 |
'experiment_id': self.experiment_id or f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
210 |
'name': self.experiment_name,
|
211 |
'description': "SmolLM3 fine-tuning experiment",
|
212 |
'created_at': self.start_time.isoformat(),
|
213 |
'status': 'running',
|
214 |
-
'metrics': json.dumps(self.metrics_history),
|
215 |
-
'parameters': json.dumps(experiment_data),
|
216 |
-
'artifacts': json.dumps(self.artifacts),
|
217 |
-
'logs': json.dumps([]),
|
218 |
'last_updated': datetime.now().isoformat()
|
219 |
-
}
|
220 |
-
|
221 |
-
# Create dataset from the experiment data
|
222 |
-
dataset = Dataset.from_list(dataset_data)
|
223 |
-
|
224 |
-
# Push to hub
|
225 |
-
dataset.push_to_hub(
|
226 |
-
self.dataset_repo,
|
227 |
-
token=self.hf_token,
|
228 |
-
private=True
|
229 |
-
)
|
230 |
|
231 |
-
|
232 |
-
|
233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
234 |
except Exception as e:
|
235 |
-
logger.error(f"Failed to save to HF Dataset: {e}")
|
236 |
return False
|
237 |
|
238 |
def log_configuration(self, config: Dict[str, Any]):
|
@@ -556,25 +553,50 @@ class SmolLM3Monitor:
|
|
556 |
return "{}?tab=view_experiments".format(self.trackio_client.space_url)
|
557 |
return None
|
558 |
|
559 |
-
def close(self):
|
560 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
561 |
if self.enable_tracking and self.trackio_client:
|
562 |
try:
|
563 |
-
# Mark experiment as completed
|
564 |
result = self.trackio_client.update_experiment_status(
|
565 |
experiment_id=self.experiment_id,
|
566 |
-
status=
|
567 |
)
|
568 |
if "success" in result:
|
569 |
-
logger.info("
|
570 |
else:
|
571 |
-
logger.error("Failed to close monitoring session: %s", result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
572 |
except Exception as e:
|
573 |
-
logger.error("Failed to
|
574 |
|
575 |
-
|
576 |
-
if self.hf_dataset_client:
|
577 |
-
self._save_to_hf_dataset({'status': 'completed'})
|
578 |
|
579 |
# Utility function to create monitor from config
|
580 |
def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
|
|
|
16 |
from scripts.trackio_tonic.trackio_api_client import TrackioAPIClient
|
17 |
TRACKIO_AVAILABLE = True
|
18 |
except ImportError:
|
19 |
+
TrackioAPIClient = None
|
20 |
TRACKIO_AVAILABLE = False
|
21 |
print("Warning: Trackio API client not available. Install with: pip install requests")
|
22 |
|
|
|
88 |
try:
|
89 |
from datasets import Dataset
|
90 |
from huggingface_hub import HfApi
|
91 |
+
try:
|
92 |
+
from .dataset_utils import create_dataset_manager
|
93 |
+
except ImportError:
|
94 |
+
# Try importing from same directory
|
95 |
+
import sys
|
96 |
+
import os
|
97 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
98 |
+
from dataset_utils import create_dataset_manager
|
99 |
|
100 |
self.hf_dataset_client = {
|
101 |
'Dataset': Dataset,
|
102 |
'HfApi': HfApi,
|
103 |
'api': HfApi(token=self.hf_token)
|
104 |
}
|
105 |
+
|
106 |
+
# Initialize dataset manager for safe operations
|
107 |
+
self.dataset_manager = create_dataset_manager(self.dataset_repo, self.hf_token)
|
108 |
+
logger.info("β
HF Datasets client and manager initialized for %s", self.dataset_repo)
|
109 |
|
110 |
except ImportError:
|
111 |
logger.warning("β οΈ datasets or huggingface-hub not available. Install with: pip install datasets huggingface-hub")
|
112 |
self.hf_dataset_client = None
|
113 |
+
self.dataset_manager = None
|
114 |
except Exception as e:
|
115 |
logger.error("Failed to initialize HF Datasets client: %s", e)
|
116 |
self.hf_dataset_client = None
|
117 |
+
self.dataset_manager = None
|
118 |
|
119 |
def _setup_trackio(self, trackio_url: Optional[str], trackio_token: Optional[str]):
|
120 |
"""Setup Trackio API client"""
|
|
|
198 |
self.experiment_id = f"exp_{timestamp}"
|
199 |
|
200 |
def _save_to_hf_dataset(self, experiment_data: Dict[str, Any]):
|
201 |
+
"""Save experiment data to HF Dataset with data preservation using dataset manager"""
|
202 |
+
if not self.dataset_manager:
|
203 |
+
logger.warning("β οΈ Dataset manager not available")
|
204 |
return False
|
205 |
|
206 |
try:
|
207 |
+
# Prepare current experiment data with standardized structure
|
208 |
+
current_experiment = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
'experiment_id': self.experiment_id or f"exp_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
|
210 |
'name': self.experiment_name,
|
211 |
'description': "SmolLM3 fine-tuning experiment",
|
212 |
'created_at': self.start_time.isoformat(),
|
213 |
'status': 'running',
|
214 |
+
'metrics': json.dumps(self.metrics_history, default=str),
|
215 |
+
'parameters': json.dumps(experiment_data, default=str),
|
216 |
+
'artifacts': json.dumps(self.artifacts, default=str),
|
217 |
+
'logs': json.dumps([], default=str),
|
218 |
'last_updated': datetime.now().isoformat()
|
219 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
|
221 |
+
# Use dataset manager to safely upsert the experiment
|
222 |
+
success = self.dataset_manager.upsert_experiment(current_experiment)
|
223 |
|
224 |
+
if success:
|
225 |
+
logger.info(f"β
Experiment data saved to HF Dataset: {self.dataset_repo}")
|
226 |
+
return True
|
227 |
+
else:
|
228 |
+
logger.error(f"β Failed to save experiment data to HF Dataset")
|
229 |
+
return False
|
230 |
+
|
231 |
except Exception as e:
|
232 |
+
logger.error(f"β Failed to save to HF Dataset: {e}")
|
233 |
return False
|
234 |
|
235 |
def log_configuration(self, config: Dict[str, Any]):
|
|
|
553 |
return "{}?tab=view_experiments".format(self.trackio_client.space_url)
|
554 |
return None
|
555 |
|
556 |
+
def close(self, final_status: str = "completed"):
|
557 |
+
"""
|
558 |
+
Close the monitoring session with final status update
|
559 |
+
|
560 |
+
Args:
|
561 |
+
final_status (str): Final status for the experiment (completed, failed, etc.)
|
562 |
+
"""
|
563 |
+
logger.info(f"π Closing monitoring session with status: {final_status}")
|
564 |
+
|
565 |
if self.enable_tracking and self.trackio_client:
|
566 |
try:
|
567 |
+
# Mark experiment as completed in Trackio
|
568 |
result = self.trackio_client.update_experiment_status(
|
569 |
experiment_id=self.experiment_id,
|
570 |
+
status=final_status
|
571 |
)
|
572 |
if "success" in result:
|
573 |
+
logger.info("β
Trackio monitoring session closed")
|
574 |
else:
|
575 |
+
logger.error("β Failed to close Trackio monitoring session: %s", result)
|
576 |
+
except Exception as e:
|
577 |
+
logger.error("β Failed to close Trackio monitoring session: %s", e)
|
578 |
+
|
579 |
+
# Final save to HF Dataset with proper status update
|
580 |
+
if self.dataset_manager:
|
581 |
+
try:
|
582 |
+
# Update experiment with final status
|
583 |
+
final_experiment_data = {
|
584 |
+
'status': final_status,
|
585 |
+
'experiment_end_time': datetime.now().isoformat(),
|
586 |
+
'final_metrics_count': len(self.metrics_history),
|
587 |
+
'total_artifacts': len(self.artifacts)
|
588 |
+
}
|
589 |
+
|
590 |
+
success = self._save_to_hf_dataset(final_experiment_data)
|
591 |
+
if success:
|
592 |
+
logger.info("β
Final experiment data saved to HF Dataset")
|
593 |
+
else:
|
594 |
+
logger.error("β Failed to save final experiment data")
|
595 |
+
|
596 |
except Exception as e:
|
597 |
+
logger.error(f"β Failed to save final experiment data: {e}")
|
598 |
|
599 |
+
logger.info(f"π― Monitoring session closed for experiment: {self.experiment_id}")
|
|
|
|
|
600 |
|
601 |
# Utility function to create monitor from config
|
602 |
def create_monitor_from_config(config, experiment_name: Optional[str] = None) -> SmolLM3Monitor:
|
templates/spaces/demo_gpt/README.md
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: GPT-OSS-20B Multilingual Reasoner Demo
|
3 |
+
emoji: 🚀
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: pink
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.40.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
short_description: GPT-OSS-20B Multilingual Reasoner LoRA adapter
|
11 |
+
---
|
12 |
+
|
13 |
+
This demo showcases the GPT-OSS-20B model fine-tuned with LoRA for enhanced multilingual reasoning capabilities. The model is based on OpenAI's GPT-OSS-20B base model with a LoRA adapter from Tonic.
|
14 |
+
|
15 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
templates/spaces/demo_gpt/app.py
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, pipeline
|
2 |
+
import torch
|
3 |
+
from threading import Thread
|
4 |
+
import gradio as gr
|
5 |
+
import spaces
|
6 |
+
import re
|
7 |
+
import logging
|
8 |
+
import os
|
9 |
+
from peft import PeftModel
|
10 |
+
|
11 |
+
# ----------------------------------------------------------------------
# Environment Variables Configuration
# ----------------------------------------------------------------------

# Get model configuration from environment variables
BASE_MODEL_ID = os.getenv('BASE_MODEL_ID', 'openai/gpt-oss-20b')
# LORA_MODEL_ID falls back to HF_MODEL_ID, then to the default adapter repo.
LORA_MODEL_ID = os.getenv('LORA_MODEL_ID', os.getenv('HF_MODEL_ID', 'Tonic/gpt-oss-20b-multilingual-reasoner'))
MODEL_NAME = os.getenv('MODEL_NAME', 'GPT-OSS Multilingual Reasoner')
MODEL_SUBFOLDER = os.getenv('MODEL_SUBFOLDER', '')

# If the LORA_MODEL_ID is the same as BASE_MODEL_ID, this is a merged model, not LoRA
USE_LORA = LORA_MODEL_ID != BASE_MODEL_ID and not LORA_MODEL_ID.startswith(BASE_MODEL_ID)

print(f"🔧 Configuration:")
print(f"   Base Model: {BASE_MODEL_ID}")
print(f"   Model ID: {LORA_MODEL_ID}")
print(f"   Model Name: {MODEL_NAME}")
print(f"   Model Subfolder: {MODEL_SUBFOLDER}")
print(f"   Use LoRA: {USE_LORA}")

# ----------------------------------------------------------------------
# KaTeX delimiter config for Gradio
# ----------------------------------------------------------------------

LATEX_DELIMS = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
    {"left": "\\[", "right": "\\]", "display": True},
    {"left": "\\(", "right": "\\)", "display": False},
]

# Configure logging
logging.basicConfig(level=logging.INFO)

# Load the model at import time (Space startup). This downloads weights and
# can take minutes on cold start; failures abort the app via the raise below.
try:
    if USE_LORA:
        # Load base model and LoRA adapter separately
        print(f"🚀 Loading base model: {BASE_MODEL_ID}")
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype="auto",
            device_map="auto",
            # NOTE(review): attn_implementation points at a community kernels
            # repo — requires the `kernels` package at runtime; confirm it is
            # supported by the installed transformers version.
            attn_implementation="kernels-community/vllm-flash-attn3"
        )
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

        # Load the LoRA adapter
        try:
            print(f"🚀 Loading LoRA adapter: {LORA_MODEL_ID}")
            if MODEL_SUBFOLDER and MODEL_SUBFOLDER.strip():
                model = PeftModel.from_pretrained(base_model, LORA_MODEL_ID, subfolder=MODEL_SUBFOLDER)
            else:
                model = PeftModel.from_pretrained(base_model, LORA_MODEL_ID)
            print("✅ LoRA model loaded successfully!")
        except Exception as lora_error:
            # Degrade gracefully: serve the base model if the adapter fails.
            print(f"⚠️ LoRA adapter failed to load: {lora_error}")
            print("🔄 Falling back to base model...")
            model = base_model
    else:
        # Load merged/fine-tuned model directly
        print(f"🚀 Loading merged model: {LORA_MODEL_ID}")
        model_kwargs = {
            "torch_dtype": "auto",
            "device_map": "auto",
            "attn_implementation": "kernels-community/vllm-flash-attn3"
        }

        if MODEL_SUBFOLDER and MODEL_SUBFOLDER.strip():
            model = AutoModelForCausalLM.from_pretrained(LORA_MODEL_ID, subfolder=MODEL_SUBFOLDER, **model_kwargs)
            tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_ID, subfolder=MODEL_SUBFOLDER)
        else:
            model = AutoModelForCausalLM.from_pretrained(LORA_MODEL_ID, **model_kwargs)
            tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_ID)
        print("✅ Merged model loaded successfully!")

except Exception as e:
    # No usable model means no usable app — re-raise to fail the Space fast.
    print(f"❌ Error loading model: {e}")
    raise e
|
90 |
+
|
91 |
+
def format_conversation_history(chat_history):
    """Normalize a Gradio chat history into plain role/content message dicts.

    Multimodal entries (content given as a list of parts) are collapsed to
    the text of their first part, falling back to ``str()`` of the list.
    """
    normalized = []
    for entry in chat_history:
        payload = entry["content"]
        if isinstance(payload, list):
            if payload and "text" in payload[0]:
                payload = payload[0]["text"]
            else:
                payload = str(payload)
        normalized.append({"role": entry["role"], "content": payload})
    return normalized
|
100 |
+
|
101 |
+
def format_analysis_response(text):
    """Split a raw "analysis ... assistantfinal ..." completion into a
    formatted reasoning-plus-answer block, balancing LaTeX '$' delimiters.

    When the two sections are not found, the text is returned mostly as-is
    (after stripping a stray leading "analysis" token).
    """
    match = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL | re.IGNORECASE)
    if match is None:
        # Fallback path: no structured sections detected.
        fallback = re.sub(r'^analysis\s*', '', text, flags=re.IGNORECASE).strip()
        return fallback + "$" if fallback.count("$") % 2 else fallback

    reasoning = re.sub(
        r'^analysis\s*', '', match.group(1).strip(), flags=re.IGNORECASE
    ).strip()
    answer = text.split("assistantfinal", 1)[-1].strip()

    body = (
        f"**🤔 Analysis & Reasoning:**\n\n"
        f"*{reasoning}*\n\n"
        f"---\n\n"
        f"**💬 Final Response:**\n\n{answer}"
    )
    # An odd number of "$" would leave KaTeX dangling in math mode.
    if body.count("$") % 2:
        body += "$"
    return body
|
131 |
+
|
132 |
+
@spaces.GPU(duration=60)
def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
    """Stream a chat completion from the loaded model as a Gradio generator.

    Builds the chat-template prompt from the system prompt, prior history and
    the new user message, runs ``model.generate`` on a background thread with
    a ``TextIteratorStreamer``, and yields progressively formatted text.
    Decorated with ``@spaces.GPU`` so HF Spaces allocates a GPU for up to
    60 seconds per call.

    Yields:
        str: The first raw chunk, then periodically re-formatted partial
        text, then the final formatted response (or an error message).
    """
    if not input_data.strip():
        yield "Please enter a prompt."
        return
    
    # Log the request
    logging.info(f"[User] {input_data}")
    logging.info(f"[System] {system_prompt} | Temp={temperature} | Max tokens={max_new_tokens}")
    
    new_message = {"role": "user", "content": input_data}
    system_message = [{"role": "system", "content": system_prompt}] if system_prompt else []
    processed_history = format_conversation_history(chat_history)
    messages = system_message + processed_history + [new_message]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    # Create streamer for proper streaming; skip_prompt drops the echoed input.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    # Prepare generation kwargs
    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
        "use_cache": True
    }
    
    # Tokenize input using the chat template
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    # Start generation in a separate thread so we can consume the streamer here.
    # NOTE(review): the thread is never joined; if the client disconnects
    # mid-stream, generation continues in the background until done.
    thread = Thread(target=model.generate, kwargs={**inputs, **generation_kwargs})
    thread.start()
    
    # Stream the response with enhanced formatting
    collected_text = ""
    buffer = ""
    yielded_once = False
    
    try:
        for chunk in streamer:
            if not chunk:
                continue
            
            collected_text += chunk
            buffer += chunk
            
            # Initial yield to show immediate response (raw, unformatted,
            # so the user sees output with minimal latency).
            if not yielded_once:
                yield chunk
                buffer = ""
                yielded_once = True
                continue
            
            # Yield accumulated text periodically for smooth streaming:
            # flush on newline or once ~150 chars have accumulated.
            if "\n" in buffer or len(buffer) > 150:
                # Use enhanced formatting for partial text
                partial_formatted = format_analysis_response(collected_text)
                yield partial_formatted
                buffer = ""
        
        # Final formatting with complete text
        final_formatted = format_analysis_response(collected_text)
        yield final_formatted
        
    except Exception as e:
        logging.exception("Generation streaming failed")
        yield f"❌ Error during generation: {e}"
|
209 |
+
|
210 |
+
demo = gr.ChatInterface(
|
211 |
+
fn=generate_response,
|
212 |
+
additional_inputs=[
|
213 |
+
gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
|
214 |
+
gr.Textbox(
|
215 |
+
label="System Prompt",
|
216 |
+
value="You are a helpful assistant. Reasoning: medium",
|
217 |
+
lines=4,
|
218 |
+
placeholder="Change system prompt"
|
219 |
+
),
|
220 |
+
gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
|
221 |
+
gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
|
222 |
+
gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
|
223 |
+
gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0)
|
224 |
+
],
|
225 |
+
examples=[
|
226 |
+
[{"text": "Explain Newton's laws clearly and concisely with mathematical formulas"}],
|
227 |
+
[{"text": "Write a Python function to calculate the Fibonacci sequence"}],
|
228 |
+
[{"text": "What are the benefits of open weight AI models? Include analysis."}],
|
229 |
+
[{"text": "Solve this equation: $x^2 + 5x + 6 = 0$"}],
|
230 |
+
],
|
231 |
+
cache_examples=False,
|
232 |
+
type="messages",
|
233 |
+
description=f"""
|
234 |
+
|
235 |
+
# ππ»ββοΈWelcome to π{MODEL_NAME} Demo !
|
236 |
+
|
237 |
+
**Model**: `{LORA_MODEL_ID}`
|
238 |
+
**Base**: `{BASE_MODEL_ID}`
|
239 |
+
|
240 |
+
β¨ **Enhanced Features:**
|
241 |
+
- π§ **Advanced Reasoning**: Detailed analysis and step-by-step thinking
|
242 |
+
- π **LaTeX Support**: Mathematical formulas rendered beautifully (use `$` or `$$`)
|
243 |
+
- π― **Improved Formatting**: Clear separation of reasoning and final responses
|
244 |
+
- π **Smart Logging**: Better error handling and request tracking
|
245 |
+
|
246 |
+
π‘ **Usage Tips:**
|
247 |
+
- Adjust reasoning level in system prompt (e.g., "Reasoning: high")
|
248 |
+
- Use LaTeX for math: `$E = mc^2$` or `$$\\int x^2 dx$$`
|
249 |
+
- Wait a couple of seconds initially for model loading
|
250 |
+
""",
|
251 |
+
fill_height=True,
|
252 |
+
textbox=gr.Textbox(
|
253 |
+
label="Query Input",
|
254 |
+
placeholder="Type your prompt (supports LaTeX: $x^2 + y^2 = z^2$)"
|
255 |
+
),
|
256 |
+
stop_btn="Stop Generation",
|
257 |
+
multimodal=False,
|
258 |
+
theme=gr.themes.Soft()
|
259 |
+
)
|
260 |
+
|
261 |
+
if __name__ == "__main__":
|
262 |
+
demo.launch(share=True)
|
templates/spaces/demo_gpt/requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch
|
2 |
+
git+https://github.com/huggingface/transformers.git
|
3 |
+
peft
|
4 |
+
trl
|
5 |
+
bitsandbytes
|
6 |
+
triton
|
7 |
+
accelerate
|
8 |
+
kernels
|
9 |
+
openai-harmony
|
templates/spaces/{demo β demo_smol}/README.md
RENAMED
File without changes
|
templates/spaces/{demo β demo_smol}/app.py
RENAMED
File without changes
|
templates/spaces/{demo β demo_smol}/requirements.txt
RENAMED
File without changes
|
templates/spaces/{README.md β trackio/README.md}
RENAMED
File without changes
|
templates/spaces/{app.py β trackio/app.py}
RENAMED
@@ -14,6 +14,8 @@ import plotly.graph_objects as go
|
|
14 |
import plotly.express as px
|
15 |
import pandas as pd
|
16 |
import numpy as np
|
|
|
|
|
17 |
|
18 |
# Setup logging
|
19 |
logging.basicConfig(level=logging.INFO)
|
@@ -27,9 +29,24 @@ class TrackioSpace:
|
|
27 |
self.current_experiment = None
|
28 |
|
29 |
# Get dataset repository and HF token from parameters or environment variables
|
30 |
-
self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', '
|
31 |
self.hf_token = hf_token or os.environ.get('HF_TOKEN')
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
logger.info(f"π§ Using dataset repository: {self.dataset_repo}")
|
34 |
|
35 |
if not self.hf_token:
|
@@ -38,47 +55,139 @@ class TrackioSpace:
|
|
38 |
self._load_experiments()
|
39 |
|
40 |
def _load_experiments(self):
|
41 |
-
"""Load experiments from HF Dataset"""
|
42 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
if self.hf_token:
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
if
|
54 |
-
|
55 |
-
|
56 |
-
if exp_id:
|
57 |
-
self.experiments[exp_id] = {
|
58 |
-
'id': exp_id,
|
59 |
-
'name': row.get('name', ''),
|
60 |
-
'description': row.get('description', ''),
|
61 |
-
'created_at': row.get('created_at', ''),
|
62 |
-
'status': row.get('status', 'running'),
|
63 |
-
'metrics': json.loads(row.get('metrics', '[]')),
|
64 |
-
'parameters': json.loads(row.get('parameters', '{}')),
|
65 |
-
'artifacts': json.loads(row.get('artifacts', '[]')),
|
66 |
-
'logs': json.loads(row.get('logs', '[]'))
|
67 |
-
}
|
68 |
|
69 |
-
|
|
|
|
|
|
|
|
|
70 |
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
except Exception as e:
|
80 |
-
logger.
|
81 |
-
|
82 |
|
83 |
def _load_backup_experiments(self):
|
84 |
"""Load backup experiments when dataset is not available"""
|
@@ -312,12 +421,61 @@ class TrackioSpace:
|
|
312 |
logger.info(f"β
Loaded {len(backup_experiments)} backup experiments")
|
313 |
|
314 |
def _save_experiments(self):
|
315 |
-
"""Save experiments to HF Dataset"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
try:
|
317 |
if self.hf_token:
|
318 |
from datasets import Dataset
|
319 |
from huggingface_hub import HfApi
|
320 |
|
|
|
|
|
321 |
# Convert experiments to dataset format
|
322 |
dataset_data = []
|
323 |
for exp_id, exp_data in self.experiments.items():
|
@@ -327,10 +485,10 @@ class TrackioSpace:
|
|
327 |
'description': exp_data.get('description', ''),
|
328 |
'created_at': exp_data.get('created_at', ''),
|
329 |
'status': exp_data.get('status', 'running'),
|
330 |
-
'metrics': json.dumps(exp_data.get('metrics', [])),
|
331 |
-
'parameters': json.dumps(exp_data.get('parameters', {})),
|
332 |
-
'artifacts': json.dumps(exp_data.get('artifacts', [])),
|
333 |
-
'logs': json.dumps(exp_data.get('logs', [])),
|
334 |
'last_updated': datetime.now().isoformat()
|
335 |
})
|
336 |
|
@@ -342,16 +500,17 @@ class TrackioSpace:
|
|
342 |
dataset.push_to_hub(
|
343 |
self.dataset_repo,
|
344 |
token=self.hf_token,
|
345 |
-
private=True
|
|
|
346 |
)
|
347 |
|
348 |
-
logger.info(f"β
Saved {len(dataset_data)} experiments to {self.dataset_repo}")
|
349 |
|
350 |
else:
|
351 |
logger.warning("β οΈ No HF_TOKEN available, experiments not saved to dataset")
|
352 |
|
353 |
except Exception as e:
|
354 |
-
logger.error(f"Failed to save experiments
|
355 |
# Fall back to local file for backup
|
356 |
try:
|
357 |
data = {
|
@@ -363,7 +522,7 @@ class TrackioSpace:
|
|
363 |
json.dump(data, f, indent=2, default=str)
|
364 |
logger.info("β
Saved backup to local file")
|
365 |
except Exception as backup_e:
|
366 |
-
logger.error(f"Failed to save backup: {backup_e}")
|
367 |
|
368 |
def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
|
369 |
"""Create a new experiment"""
|
@@ -483,7 +642,10 @@ def update_trackio_config(hf_token: str, dataset_repo: str) -> str:
|
|
483 |
# Reload experiments with new configuration
|
484 |
trackio_space._load_experiments()
|
485 |
|
486 |
-
|
|
|
|
|
|
|
487 |
|
488 |
except Exception as e:
|
489 |
return f"β Failed to update configuration: {str(e)}"
|
@@ -502,10 +664,42 @@ def test_dataset_connection(hf_token: str, dataset_repo: str) -> str:
|
|
502 |
# Test loading the dataset
|
503 |
dataset = load_dataset(dataset_repo, token=hf_token)
|
504 |
|
505 |
-
# Count experiments
|
506 |
experiment_count = len(dataset['train']) if 'train' in dataset else 0
|
507 |
|
508 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
509 |
|
510 |
except Exception as e:
|
511 |
return f"β Connection failed: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token is correct\n2. Verify the dataset repository exists\n3. Ensure your token has read access to the dataset"
|
@@ -534,12 +728,34 @@ def create_dataset_repository(hf_token: str, dataset_repo: str) -> str:
|
|
534 |
# Check if dataset exists
|
535 |
try:
|
536 |
api.dataset_info(dataset_repo)
|
537 |
-
return f"β
Dataset {dataset_repo} already exists
|
538 |
except:
|
539 |
# Dataset doesn't exist, create it
|
540 |
pass
|
541 |
|
542 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
empty_dataset = Dataset.from_dict({
|
544 |
'experiment_id': [],
|
545 |
'name': [],
|
@@ -557,22 +773,34 @@ def create_dataset_repository(hf_token: str, dataset_repo: str) -> str:
|
|
557 |
empty_dataset.push_to_hub(
|
558 |
dataset_repo,
|
559 |
token=hf_token,
|
560 |
-
private=True
|
|
|
561 |
)
|
562 |
|
563 |
-
return f"β
Dataset {dataset_repo} created successfully!\nπ View at: https://huggingface.co/datasets/{dataset_repo}\nπ Ready to store experiments"
|
564 |
|
565 |
except Exception as e:
|
566 |
-
return f"β Failed to create dataset: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token has write permissions\n2. Verify the username in the repository name\n3. Ensure the dataset name is valid"
|
567 |
|
568 |
# Initialize API client for remote data
|
569 |
api_client = None
|
570 |
try:
|
571 |
from trackio_api_client import TrackioAPIClient
|
572 |
-
|
573 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
574 |
except ImportError:
|
575 |
logger.warning("β οΈ API client not available, using local data only")
|
|
|
|
|
576 |
|
577 |
# Add Hugging Face Spaces compatibility
|
578 |
def is_huggingface_spaces():
|
@@ -616,6 +844,7 @@ def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
|
|
616 |
lines = experiment_details.split('\n')
|
617 |
metrics_data = []
|
618 |
|
|
|
619 |
for line in lines:
|
620 |
if 'Step:' in line and 'Metrics:' in line:
|
621 |
# Extract step and metrics from the line
|
@@ -637,6 +866,11 @@ def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
|
|
637 |
logger.warning(f"Failed to parse metrics line: {line} - {e}")
|
638 |
continue
|
639 |
|
|
|
|
|
|
|
|
|
|
|
640 |
if metrics_data:
|
641 |
return pd.DataFrame(metrics_data)
|
642 |
else:
|
@@ -647,22 +881,65 @@ def parse_remote_metrics_data(experiment_details: str) -> pd.DataFrame:
|
|
647 |
return pd.DataFrame()
|
648 |
|
649 |
def get_metrics_dataframe(experiment_id: str) -> pd.DataFrame:
|
650 |
-
"""Get metrics as a pandas DataFrame for plotting - tries
|
651 |
-
|
652 |
-
|
653 |
-
|
654 |
-
|
655 |
-
|
656 |
-
|
657 |
-
|
658 |
-
|
659 |
-
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
|
664 |
-
|
665 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
666 |
|
667 |
def create_experiment_interface(name: str, description: str) -> str:
|
668 |
"""Create a new experiment"""
|
@@ -919,12 +1196,622 @@ def create_demo_experiment():
|
|
919 |
except Exception as e:
|
920 |
return f"β Error creating demo experiment: {str(e)}"
|
921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
922 |
# Create Gradio interface
|
923 |
with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
|
924 |
gr.Markdown("# π Trackio Experiment Tracking & Monitoring")
|
925 |
gr.Markdown("Monitor and track your ML experiments with real-time visualization!")
|
926 |
|
927 |
with gr.Tabs():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
928 |
# Configuration Tab
|
929 |
with gr.Tab("βοΈ Configuration"):
|
930 |
gr.Markdown("### Configure HF Datasets Connection")
|
@@ -941,7 +1828,7 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
941 |
dataset_repo_input = gr.Textbox(
|
942 |
label="Dataset Repository",
|
943 |
placeholder="your-username/your-dataset-name",
|
944 |
-
value="
|
945 |
info="HF Dataset repository for experiment storage"
|
946 |
)
|
947 |
|
@@ -953,9 +1840,9 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
953 |
gr.Markdown("### Current Configuration")
|
954 |
current_config_output = gr.Textbox(
|
955 |
label="Status",
|
956 |
-
lines=
|
957 |
interactive=False,
|
958 |
-
value=f"π Dataset: {trackio_space.dataset_repo}\nπ HF Token: {'Set' if trackio_space.hf_token else 'Not set'}\nπ Experiments: {len(trackio_space.experiments)}"
|
959 |
)
|
960 |
|
961 |
with gr.Column():
|
@@ -978,12 +1865,204 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
978 |
- `HF_TOKEN`: Your Hugging Face token
|
979 |
- `TRACKIO_DATASET_REPO`: Dataset repository
|
980 |
|
|
|
|
|
|
|
|
|
|
|
981 |
**Actions:**
|
982 |
- **Update Configuration**: Apply new settings and reload experiments
|
983 |
- **Test Connection**: Verify access to the dataset repository
|
984 |
- **Create Dataset**: Create a new dataset repository if it doesn't exist
|
985 |
""")
|
986 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
987 |
update_config_btn.click(
|
988 |
update_trackio_config,
|
989 |
inputs=[hf_token_input, dataset_repo_input],
|
@@ -1001,237 +2080,9 @@ with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as
|
|
1001 |
inputs=[hf_token_input, dataset_repo_input],
|
1002 |
outputs=current_config_output
|
1003 |
)
|
|
|
1004 |
|
1005 |
-
|
1006 |
-
with gr.Tab("Create Experiment"):
|
1007 |
-
gr.Markdown("### Create a New Experiment")
|
1008 |
-
with gr.Row():
|
1009 |
-
with gr.Column():
|
1010 |
-
experiment_name = gr.Textbox(
|
1011 |
-
label="Experiment Name",
|
1012 |
-
placeholder="my_smollm3_finetune",
|
1013 |
-
value="smollm3_finetune"
|
1014 |
-
)
|
1015 |
-
experiment_description = gr.Textbox(
|
1016 |
-
label="Description",
|
1017 |
-
placeholder="Fine-tuning SmolLM3 model on custom dataset",
|
1018 |
-
value="SmolLM3 fine-tuning experiment"
|
1019 |
-
)
|
1020 |
-
create_btn = gr.Button("Create Experiment", variant="primary")
|
1021 |
-
|
1022 |
-
with gr.Column():
|
1023 |
-
create_output = gr.Textbox(
|
1024 |
-
label="Result",
|
1025 |
-
lines=5,
|
1026 |
-
interactive=False
|
1027 |
-
)
|
1028 |
-
|
1029 |
-
create_btn.click(
|
1030 |
-
create_experiment_interface,
|
1031 |
-
inputs=[experiment_name, experiment_description],
|
1032 |
-
outputs=create_output
|
1033 |
-
)
|
1034 |
-
|
1035 |
-
# Log Metrics Tab
|
1036 |
-
with gr.Tab("Log Metrics"):
|
1037 |
-
gr.Markdown("### Log Training Metrics")
|
1038 |
-
with gr.Row():
|
1039 |
-
with gr.Column():
|
1040 |
-
metrics_exp_id = gr.Textbox(
|
1041 |
-
label="Experiment ID",
|
1042 |
-
placeholder="exp_20231201_143022"
|
1043 |
-
)
|
1044 |
-
metrics_json = gr.Textbox(
|
1045 |
-
label="Metrics (JSON)",
|
1046 |
-
placeholder='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5}',
|
1047 |
-
value='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5, "gpu_memory": 22.5}'
|
1048 |
-
)
|
1049 |
-
metrics_step = gr.Textbox(
|
1050 |
-
label="Step (optional)",
|
1051 |
-
placeholder="100"
|
1052 |
-
)
|
1053 |
-
log_metrics_btn = gr.Button("Log Metrics", variant="primary")
|
1054 |
-
|
1055 |
-
with gr.Column():
|
1056 |
-
metrics_output = gr.Textbox(
|
1057 |
-
label="Result",
|
1058 |
-
lines=5,
|
1059 |
-
interactive=False
|
1060 |
-
)
|
1061 |
-
|
1062 |
-
log_metrics_btn.click(
|
1063 |
-
log_metrics_interface,
|
1064 |
-
inputs=[metrics_exp_id, metrics_json, metrics_step],
|
1065 |
-
outputs=metrics_output
|
1066 |
-
)
|
1067 |
-
|
1068 |
-
# Log Parameters Tab
|
1069 |
-
with gr.Tab("Log Parameters"):
|
1070 |
-
gr.Markdown("### Log Experiment Parameters")
|
1071 |
-
with gr.Row():
|
1072 |
-
with gr.Column():
|
1073 |
-
params_exp_id = gr.Textbox(
|
1074 |
-
label="Experiment ID",
|
1075 |
-
placeholder="exp_20231201_143022"
|
1076 |
-
)
|
1077 |
-
parameters_json = gr.Textbox(
|
1078 |
-
label="Parameters (JSON)",
|
1079 |
-
placeholder='{"learning_rate": 2e-5, "batch_size": 4}',
|
1080 |
-
value='{"learning_rate": 3.5e-6, "batch_size": 8, "model_name": "HuggingFaceTB/SmolLM3-3B", "max_iters": 18000, "mixed_precision": "bf16"}'
|
1081 |
-
)
|
1082 |
-
log_params_btn = gr.Button("Log Parameters", variant="primary")
|
1083 |
-
|
1084 |
-
with gr.Column():
|
1085 |
-
params_output = gr.Textbox(
|
1086 |
-
label="Result",
|
1087 |
-
lines=5,
|
1088 |
-
interactive=False
|
1089 |
-
)
|
1090 |
-
|
1091 |
-
log_params_btn.click(
|
1092 |
-
log_parameters_interface,
|
1093 |
-
inputs=[params_exp_id, parameters_json],
|
1094 |
-
outputs=params_output
|
1095 |
-
)
|
1096 |
-
|
1097 |
-
# View Experiments Tab
|
1098 |
-
with gr.Tab("View Experiments"):
|
1099 |
-
gr.Markdown("### View Experiment Details")
|
1100 |
-
with gr.Row():
|
1101 |
-
with gr.Column():
|
1102 |
-
view_exp_id = gr.Textbox(
|
1103 |
-
label="Experiment ID",
|
1104 |
-
placeholder="exp_20231201_143022"
|
1105 |
-
)
|
1106 |
-
view_btn = gr.Button("View Experiment", variant="primary")
|
1107 |
-
list_btn = gr.Button("List All Experiments", variant="secondary")
|
1108 |
-
|
1109 |
-
with gr.Column():
|
1110 |
-
view_output = gr.Textbox(
|
1111 |
-
label="Experiment Details",
|
1112 |
-
lines=20,
|
1113 |
-
interactive=False
|
1114 |
-
)
|
1115 |
-
|
1116 |
-
view_btn.click(
|
1117 |
-
get_experiment_details,
|
1118 |
-
inputs=[view_exp_id],
|
1119 |
-
outputs=view_output
|
1120 |
-
)
|
1121 |
-
|
1122 |
-
list_btn.click(
|
1123 |
-
list_experiments_interface,
|
1124 |
-
inputs=[],
|
1125 |
-
outputs=view_output
|
1126 |
-
)
|
1127 |
-
|
1128 |
-
# Visualization Tab
|
1129 |
-
with gr.Tab("π Visualizations"):
|
1130 |
-
gr.Markdown("### Training Metrics Visualization")
|
1131 |
-
with gr.Row():
|
1132 |
-
with gr.Column():
|
1133 |
-
plot_exp_id = gr.Textbox(
|
1134 |
-
label="Experiment ID",
|
1135 |
-
placeholder="exp_20231201_143022"
|
1136 |
-
)
|
1137 |
-
metric_dropdown = gr.Dropdown(
|
1138 |
-
label="Metric to Plot",
|
1139 |
-
choices=[
|
1140 |
-
"loss", "accuracy", "learning_rate", "gpu_memory", "training_time",
|
1141 |
-
"total_tokens", "truncated_tokens", "padding_tokens", "throughput", "step_time",
|
1142 |
-
"batch_size", "seq_len", "token_acc", "train/gate_ortho", "train/center"
|
1143 |
-
],
|
1144 |
-
value="loss"
|
1145 |
-
)
|
1146 |
-
plot_btn = gr.Button("Create Plot", variant="primary")
|
1147 |
-
|
1148 |
-
with gr.Column():
|
1149 |
-
plot_output = gr.Plot(label="Training Metrics")
|
1150 |
-
|
1151 |
-
plot_btn.click(
|
1152 |
-
create_metrics_plot,
|
1153 |
-
inputs=[plot_exp_id, metric_dropdown],
|
1154 |
-
outputs=plot_output
|
1155 |
-
)
|
1156 |
-
|
1157 |
-
gr.Markdown("### Experiment Comparison")
|
1158 |
-
with gr.Row():
|
1159 |
-
with gr.Column():
|
1160 |
-
comparison_exp_ids = gr.Textbox(
|
1161 |
-
label="Experiment IDs (comma-separated)",
|
1162 |
-
placeholder="exp_1,exp_2,exp_3"
|
1163 |
-
)
|
1164 |
-
comparison_btn = gr.Button("Compare Experiments", variant="primary")
|
1165 |
-
|
1166 |
-
with gr.Column():
|
1167 |
-
comparison_plot = gr.Plot(label="Experiment Comparison")
|
1168 |
-
|
1169 |
-
comparison_btn.click(
|
1170 |
-
create_experiment_comparison,
|
1171 |
-
inputs=[comparison_exp_ids],
|
1172 |
-
outputs=comparison_plot
|
1173 |
-
)
|
1174 |
-
|
1175 |
-
# Demo Data Tab
|
1176 |
-
with gr.Tab("π― Demo Data"):
|
1177 |
-
gr.Markdown("### Generate Demo Training Data")
|
1178 |
-
gr.Markdown("Use this to simulate training data for testing the interface")
|
1179 |
-
with gr.Row():
|
1180 |
-
with gr.Column():
|
1181 |
-
demo_exp_id = gr.Textbox(
|
1182 |
-
label="Experiment ID",
|
1183 |
-
placeholder="exp_20231201_143022"
|
1184 |
-
)
|
1185 |
-
demo_btn = gr.Button("Generate Demo Data", variant="primary")
|
1186 |
-
create_demo_btn = gr.Button("Create Demo Experiment", variant="secondary")
|
1187 |
-
|
1188 |
-
with gr.Column():
|
1189 |
-
demo_output = gr.Textbox(
|
1190 |
-
label="Result",
|
1191 |
-
lines=5,
|
1192 |
-
interactive=False
|
1193 |
-
)
|
1194 |
-
|
1195 |
-
demo_btn.click(
|
1196 |
-
simulate_training_data,
|
1197 |
-
inputs=[demo_exp_id],
|
1198 |
-
outputs=demo_output
|
1199 |
-
)
|
1200 |
-
|
1201 |
-
create_demo_btn.click(
|
1202 |
-
create_demo_experiment,
|
1203 |
-
inputs=[],
|
1204 |
-
outputs=demo_output
|
1205 |
-
)
|
1206 |
-
|
1207 |
-
# Update Status Tab
|
1208 |
-
with gr.Tab("Update Status"):
|
1209 |
-
gr.Markdown("### Update Experiment Status")
|
1210 |
-
with gr.Row():
|
1211 |
-
with gr.Column():
|
1212 |
-
status_exp_id = gr.Textbox(
|
1213 |
-
label="Experiment ID",
|
1214 |
-
placeholder="exp_20231201_143022"
|
1215 |
-
)
|
1216 |
-
status_dropdown = gr.Dropdown(
|
1217 |
-
label="Status",
|
1218 |
-
choices=["running", "completed", "failed", "paused"],
|
1219 |
-
value="running"
|
1220 |
-
)
|
1221 |
-
update_status_btn = gr.Button("Update Status", variant="primary")
|
1222 |
-
|
1223 |
-
with gr.Column():
|
1224 |
-
status_output = gr.Textbox(
|
1225 |
-
label="Result",
|
1226 |
-
lines=3,
|
1227 |
-
interactive=False
|
1228 |
-
)
|
1229 |
-
|
1230 |
-
update_status_btn.click(
|
1231 |
-
update_experiment_status_interface,
|
1232 |
-
inputs=[status_exp_id, status_dropdown],
|
1233 |
-
outputs=status_output
|
1234 |
-
)
|
1235 |
|
1236 |
# Launch the app
|
1237 |
if __name__ == "__main__":
|
|
|
14 |
import plotly.express as px
|
15 |
import pandas as pd
|
16 |
import numpy as np
|
17 |
+
import plotly.io as pio
|
18 |
+
pio.templates.default = "plotly_white"
|
19 |
|
20 |
# Setup logging
|
21 |
logging.basicConfig(level=logging.INFO)
|
|
|
29 |
self.current_experiment = None
|
30 |
|
31 |
# Get dataset repository and HF token from parameters or environment variables
|
32 |
+
self.dataset_repo = dataset_repo or os.environ.get('TRACKIO_DATASET_REPO', 'Tonic/trackio-experiments')
|
33 |
self.hf_token = hf_token or os.environ.get('HF_TOKEN')
|
34 |
|
35 |
+
# Initialize dataset manager for safe operations
|
36 |
+
self.dataset_manager = None
|
37 |
+
if self.hf_token and self.dataset_repo:
|
38 |
+
try:
|
39 |
+
# Import dataset manager
|
40 |
+
import sys
|
41 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
42 |
+
from dataset_utils import TrackioDatasetManager
|
43 |
+
self.dataset_manager = TrackioDatasetManager(self.dataset_repo, self.hf_token)
|
44 |
+
logger.info("β
Dataset manager initialized for safe operations")
|
45 |
+
except ImportError:
|
46 |
+
logger.warning("β οΈ Dataset manager not available, using legacy data handling")
|
47 |
+
except Exception as e:
|
48 |
+
logger.warning(f"β οΈ Failed to initialize dataset manager: {e}")
|
49 |
+
|
50 |
logger.info(f"π§ Using dataset repository: {self.dataset_repo}")
|
51 |
|
52 |
if not self.hf_token:
|
|
|
55 |
self._load_experiments()
|
56 |
|
57 |
def _load_experiments(self):
|
58 |
+
"""Load experiments from HF Dataset with data preservation support"""
|
59 |
try:
|
60 |
+
# Try using dataset manager first for safe operations
|
61 |
+
if self.dataset_manager:
|
62 |
+
logger.info("π Loading experiments using dataset manager")
|
63 |
+
experiments_list = self.dataset_manager.load_existing_experiments()
|
64 |
+
|
65 |
+
# Convert list to dict format expected by the interface
|
66 |
+
self.experiments = {}
|
67 |
+
for exp_data in experiments_list:
|
68 |
+
exp_id = exp_data.get('experiment_id')
|
69 |
+
if exp_id:
|
70 |
+
converted_experiment = self._convert_dataset_row_to_experiment(exp_data)
|
71 |
+
if converted_experiment:
|
72 |
+
self.experiments[exp_id] = converted_experiment
|
73 |
+
|
74 |
+
logger.info(f"β
Loaded {len(self.experiments)} experiments using dataset manager")
|
75 |
+
|
76 |
+
# Sort experiments by creation date (newest first)
|
77 |
+
self.experiments = dict(sorted(
|
78 |
+
self.experiments.items(),
|
79 |
+
key=lambda x: x[1].get('created_at', ''),
|
80 |
+
reverse=True
|
81 |
+
))
|
82 |
+
|
83 |
+
# If no experiments found, use backup
|
84 |
+
if not self.experiments:
|
85 |
+
logger.info("π No experiments found in dataset, using backup data")
|
86 |
+
self._load_backup_experiments()
|
87 |
+
|
88 |
+
return
|
89 |
+
|
90 |
+
# Fallback to direct dataset loading if dataset manager not available
|
91 |
if self.hf_token:
|
92 |
+
success = self._load_experiments_direct()
|
93 |
+
if success:
|
94 |
+
return
|
95 |
+
|
96 |
+
# Final fallback to backup data
|
97 |
+
logger.info("π Using backup data")
|
98 |
+
self._load_backup_experiments()
|
99 |
|
100 |
+
except Exception as e:
|
101 |
+
logger.error(f"β Failed to load experiments: {e}")
|
102 |
+
self._load_backup_experiments()
|
103 |
+
|
104 |
+
def _load_experiments_direct(self) -> bool:
|
105 |
+
"""Load experiments directly from HF Dataset without dataset manager"""
|
106 |
+
try:
|
107 |
+
from datasets import load_dataset
|
108 |
+
|
109 |
+
logger.info(f"π Loading experiments directly from {self.dataset_repo}")
|
110 |
+
dataset = load_dataset(self.dataset_repo, token=self.hf_token)
|
111 |
+
logger.info(f"β
Successfully loaded dataset from {self.dataset_repo}")
|
112 |
+
|
113 |
+
# Convert dataset to experiments dict
|
114 |
+
self.experiments = {}
|
115 |
+
if 'train' in dataset:
|
116 |
+
for row in dataset['train']:
|
117 |
+
exp_id = row.get('experiment_id')
|
118 |
+
if exp_id:
|
119 |
+
converted_experiment = self._convert_dataset_row_to_experiment(row)
|
120 |
+
if converted_experiment:
|
121 |
+
self.experiments[exp_id] = converted_experiment
|
122 |
+
|
123 |
+
logger.info(f"π Successfully loaded {len(self.experiments)} experiments from dataset")
|
124 |
+
|
125 |
+
# Sort experiments by creation date (newest first)
|
126 |
+
self.experiments = dict(sorted(
|
127 |
+
self.experiments.items(),
|
128 |
+
key=lambda x: x[1].get('created_at', ''),
|
129 |
+
reverse=True
|
130 |
+
))
|
131 |
+
|
132 |
+
return True
|
133 |
+
|
134 |
+
except Exception as e:
|
135 |
+
logger.warning(f"β οΈ Failed to load from dataset directly: {e}")
|
136 |
+
return False
|
137 |
+
|
138 |
+
def _convert_dataset_row_to_experiment(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
139 |
+
"""Convert a dataset row to experiment format, handling JSON parsing safely"""
|
140 |
+
try:
|
141 |
+
exp_id = row.get('experiment_id')
|
142 |
+
if not exp_id:
|
143 |
+
return None
|
144 |
+
|
145 |
+
# Parse JSON fields safely
|
146 |
+
try:
|
147 |
+
metrics_raw = row.get('metrics', '[]')
|
148 |
+
if isinstance(metrics_raw, str):
|
149 |
+
metrics = json.loads(metrics_raw) if metrics_raw else []
|
150 |
+
else:
|
151 |
+
metrics = metrics_raw if metrics_raw else []
|
152 |
|
153 |
+
parameters_raw = row.get('parameters', '{}')
|
154 |
+
if isinstance(parameters_raw, str):
|
155 |
+
parameters = json.loads(parameters_raw) if parameters_raw else {}
|
156 |
+
else:
|
157 |
+
parameters = parameters_raw if parameters_raw else {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
+
artifacts_raw = row.get('artifacts', '[]')
|
160 |
+
if isinstance(artifacts_raw, str):
|
161 |
+
artifacts = json.loads(artifacts_raw) if artifacts_raw else []
|
162 |
+
else:
|
163 |
+
artifacts = artifacts_raw if artifacts_raw else []
|
164 |
|
165 |
+
logs_raw = row.get('logs', '[]')
|
166 |
+
if isinstance(logs_raw, str):
|
167 |
+
logs = json.loads(logs_raw) if logs_raw else []
|
168 |
+
else:
|
169 |
+
logs = logs_raw if logs_raw else []
|
170 |
+
|
171 |
+
except json.JSONDecodeError as json_err:
|
172 |
+
logger.warning(f"JSON decode error for experiment {exp_id}: {json_err}")
|
173 |
+
metrics, parameters, artifacts, logs = [], {}, [], []
|
174 |
+
|
175 |
+
return {
|
176 |
+
'id': exp_id,
|
177 |
+
'name': row.get('name', ''),
|
178 |
+
'description': row.get('description', ''),
|
179 |
+
'created_at': row.get('created_at', ''),
|
180 |
+
'status': row.get('status', 'running'),
|
181 |
+
'metrics': metrics,
|
182 |
+
'parameters': parameters,
|
183 |
+
'artifacts': artifacts,
|
184 |
+
'logs': logs,
|
185 |
+
'last_updated': row.get('last_updated', '')
|
186 |
+
}
|
187 |
+
|
188 |
except Exception as e:
|
189 |
+
logger.warning(f"Failed to convert dataset row to experiment: {e}")
|
190 |
+
return None
|
191 |
|
192 |
def _load_backup_experiments(self):
|
193 |
"""Load backup experiments when dataset is not available"""
|
|
|
421 |
logger.info(f"β
Loaded {len(backup_experiments)} backup experiments")
|
422 |
|
423 |
def _save_experiments(self):
|
424 |
+
"""Save experiments to HF Dataset with data preservation"""
|
425 |
+
try:
|
426 |
+
# Use dataset manager for safe operations if available
|
427 |
+
if self.dataset_manager:
|
428 |
+
logger.info("πΎ Saving experiments using dataset manager (data preservation)")
|
429 |
+
|
430 |
+
# Convert current experiments to dataset format
|
431 |
+
experiments_to_save = []
|
432 |
+
for exp_id, exp_data in self.experiments.items():
|
433 |
+
experiment_entry = {
|
434 |
+
'experiment_id': exp_id,
|
435 |
+
'name': exp_data.get('name', ''),
|
436 |
+
'description': exp_data.get('description', ''),
|
437 |
+
'created_at': exp_data.get('created_at', ''),
|
438 |
+
'status': exp_data.get('status', 'running'),
|
439 |
+
'metrics': json.dumps(exp_data.get('metrics', []), default=str),
|
440 |
+
'parameters': json.dumps(exp_data.get('parameters', {}), default=str),
|
441 |
+
'artifacts': json.dumps(exp_data.get('artifacts', []), default=str),
|
442 |
+
'logs': json.dumps(exp_data.get('logs', []), default=str),
|
443 |
+
'last_updated': datetime.now().isoformat()
|
444 |
+
}
|
445 |
+
experiments_to_save.append(experiment_entry)
|
446 |
+
|
447 |
+
# Use dataset manager to save with data preservation
|
448 |
+
success = self.dataset_manager.save_experiments(
|
449 |
+
experiments_to_save,
|
450 |
+
f"Update experiments from Trackio Space ({len(experiments_to_save)} total experiments)"
|
451 |
+
)
|
452 |
+
|
453 |
+
if success:
|
454 |
+
logger.info(f"β
Successfully saved {len(experiments_to_save)} experiments with data preservation")
|
455 |
+
else:
|
456 |
+
logger.error("β Failed to save experiments using dataset manager")
|
457 |
+
# Fallback to legacy method
|
458 |
+
self._save_experiments_legacy()
|
459 |
+
|
460 |
+
return
|
461 |
+
|
462 |
+
# Fallback to legacy method if dataset manager not available
|
463 |
+
self._save_experiments_legacy()
|
464 |
+
|
465 |
+
except Exception as e:
|
466 |
+
logger.error(f"β Failed to save experiments: {e}")
|
467 |
+
# Fallback to legacy method
|
468 |
+
self._save_experiments_legacy()
|
469 |
+
|
470 |
+
def _save_experiments_legacy(self):
|
471 |
+
"""Legacy save method without data preservation (fallback only)"""
|
472 |
try:
|
473 |
if self.hf_token:
|
474 |
from datasets import Dataset
|
475 |
from huggingface_hub import HfApi
|
476 |
|
477 |
+
logger.warning("β οΈ Using legacy save method - data preservation not guaranteed")
|
478 |
+
|
479 |
# Convert experiments to dataset format
|
480 |
dataset_data = []
|
481 |
for exp_id, exp_data in self.experiments.items():
|
|
|
485 |
'description': exp_data.get('description', ''),
|
486 |
'created_at': exp_data.get('created_at', ''),
|
487 |
'status': exp_data.get('status', 'running'),
|
488 |
+
'metrics': json.dumps(exp_data.get('metrics', []), default=str),
|
489 |
+
'parameters': json.dumps(exp_data.get('parameters', {}), default=str),
|
490 |
+
'artifacts': json.dumps(exp_data.get('artifacts', []), default=str),
|
491 |
+
'logs': json.dumps(exp_data.get('logs', []), default=str),
|
492 |
'last_updated': datetime.now().isoformat()
|
493 |
})
|
494 |
|
|
|
500 |
dataset.push_to_hub(
|
501 |
self.dataset_repo,
|
502 |
token=self.hf_token,
|
503 |
+
private=True,
|
504 |
+
commit_message=f"Legacy update: {len(dataset_data)} experiments"
|
505 |
)
|
506 |
|
507 |
+
logger.info(f"β
Saved {len(dataset_data)} experiments to {self.dataset_repo} (legacy method)")
|
508 |
|
509 |
else:
|
510 |
logger.warning("β οΈ No HF_TOKEN available, experiments not saved to dataset")
|
511 |
|
512 |
except Exception as e:
|
513 |
+
logger.error(f"β Failed to save experiments with legacy method: {e}")
|
514 |
# Fall back to local file for backup
|
515 |
try:
|
516 |
data = {
|
|
|
522 |
json.dump(data, f, indent=2, default=str)
|
523 |
logger.info("β
Saved backup to local file")
|
524 |
except Exception as backup_e:
|
525 |
+
logger.error(f"β Failed to save backup: {backup_e}")
|
526 |
|
527 |
def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
|
528 |
"""Create a new experiment"""
|
|
|
642 |
# Reload experiments with new configuration
|
643 |
trackio_space._load_experiments()
|
644 |
|
645 |
+
# Check if dataset manager is available
|
646 |
+
manager_status = "β
Available (data preservation enabled)" if trackio_space.dataset_manager else "β οΈ Not available (legacy mode)"
|
647 |
+
|
648 |
+
return f"β
Configuration updated successfully!\nπ Dataset: {trackio_space.dataset_repo}\nπ HF Token: {'Set' if trackio_space.hf_token else 'Not set'}\nπ‘οΈ Data Manager: {manager_status}\nπ Loaded {len(trackio_space.experiments)} experiments"
|
649 |
|
650 |
except Exception as e:
|
651 |
return f"β Failed to update configuration: {str(e)}"
|
|
|
664 |
# Test loading the dataset
|
665 |
dataset = load_dataset(dataset_repo, token=hf_token)
|
666 |
|
667 |
+
# Count experiments and analyze structure
|
668 |
experiment_count = len(dataset['train']) if 'train' in dataset else 0
|
669 |
|
670 |
+
# Get column information
|
671 |
+
columns = list(dataset['train'].column_names) if 'train' in dataset else []
|
672 |
+
|
673 |
+
# Sample first few experiment IDs
|
674 |
+
sample_experiments = []
|
675 |
+
if 'train' in dataset and experiment_count > 0:
|
676 |
+
for i, row in enumerate(dataset['train']):
|
677 |
+
if i >= 3: # Only show first 3
|
678 |
+
break
|
679 |
+
sample_experiments.append(row.get('experiment_id', 'unknown'))
|
680 |
+
|
681 |
+
result = f"β
Connection successful!\nπ Dataset: {dataset_repo}\nπ Found {experiment_count} experiments\nπ Dataset URL: https://huggingface.co/datasets/{dataset_repo}\n\n"
|
682 |
+
result += f"π Dataset Columns: {', '.join(columns)}\n"
|
683 |
+
if sample_experiments:
|
684 |
+
result += f"π¬ Sample Experiments: {', '.join(sample_experiments)}\n"
|
685 |
+
|
686 |
+
# Test parsing one experiment if available
|
687 |
+
if 'train' in dataset and experiment_count > 0:
|
688 |
+
first_row = dataset['train'][0]
|
689 |
+
exp_id = first_row.get('experiment_id', 'unknown')
|
690 |
+
metrics_raw = first_row.get('metrics', '[]')
|
691 |
+
|
692 |
+
try:
|
693 |
+
if isinstance(metrics_raw, str):
|
694 |
+
metrics = json.loads(metrics_raw)
|
695 |
+
metrics_count = len(metrics) if isinstance(metrics, list) else 0
|
696 |
+
result += f"π First experiment ({exp_id}) metrics: {metrics_count} entries\n"
|
697 |
+
else:
|
698 |
+
result += f"π First experiment ({exp_id}) metrics: Non-string format\n"
|
699 |
+
except json.JSONDecodeError as e:
|
700 |
+
result += f"β οΈ JSON parse error in first experiment: {e}\n"
|
701 |
+
|
702 |
+
return result
|
703 |
|
704 |
except Exception as e:
|
705 |
return f"β Connection failed: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token is correct\n2. Verify the dataset repository exists\n3. Ensure your token has read access to the dataset"
|
|
|
728 |
# Check if dataset exists
|
729 |
try:
|
730 |
api.dataset_info(dataset_repo)
|
731 |
+
return f"β
Dataset {dataset_repo} already exists!\nπ‘οΈ Data preservation is enabled for existing datasets\nπ View at: https://huggingface.co/datasets/{dataset_repo}"
|
732 |
except:
|
733 |
# Dataset doesn't exist, create it
|
734 |
pass
|
735 |
|
736 |
+
# Try to initialize dataset manager to use its repository creation
|
737 |
+
try:
|
738 |
+
# Import dataset manager
|
739 |
+
import sys
|
740 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'src'))
|
741 |
+
from dataset_utils import TrackioDatasetManager
|
742 |
+
|
743 |
+
# Create dataset manager instance
|
744 |
+
dataset_manager = TrackioDatasetManager(dataset_repo, hf_token)
|
745 |
+
|
746 |
+
# Check if dataset exists using the manager
|
747 |
+
exists = dataset_manager.check_dataset_exists()
|
748 |
+
if exists:
|
749 |
+
return f"β
Dataset {dataset_repo} already exists!\nπ‘οΈ Data preservation is enabled\nπ View at: https://huggingface.co/datasets/{dataset_repo}"
|
750 |
+
|
751 |
+
except ImportError:
|
752 |
+
# Dataset manager not available, use legacy method
|
753 |
+
pass
|
754 |
+
except Exception as e:
|
755 |
+
# Dataset manager failed, use legacy method
|
756 |
+
logger.warning(f"Dataset manager failed: {e}, using legacy method")
|
757 |
+
|
758 |
+
# Create empty dataset with proper structure
|
759 |
empty_dataset = Dataset.from_dict({
|
760 |
'experiment_id': [],
|
761 |
'name': [],
|
|
|
773 |
empty_dataset.push_to_hub(
|
774 |
dataset_repo,
|
775 |
token=hf_token,
|
776 |
+
private=True,
|
777 |
+
commit_message="Create Trackio experiment dataset with data preservation support"
|
778 |
)
|
779 |
|
780 |
+
return f"β
Dataset {dataset_repo} created successfully!\nπ‘οΈ Data preservation is now enabled\nπ View at: https://huggingface.co/datasets/{dataset_repo}\nπ Ready to store experiments safely"
|
781 |
|
782 |
except Exception as e:
|
783 |
+
return f"β Failed to create dataset: {str(e)}\n\nπ‘ Troubleshooting:\n1. Check your HF token has write permissions\n2. Verify the username in the repository name\n3. Ensure the dataset name is valid\n4. Check internet connectivity"
|
784 |
|
785 |
# Initialize API client for remote data
|
786 |
api_client = None
|
787 |
try:
|
788 |
from trackio_api_client import TrackioAPIClient
|
789 |
+
# Get Trackio URL from environment or use default
|
790 |
+
trackio_url = os.environ.get('TRACKIO_URL', 'https://tonic-test-trackio-test.hf.space')
|
791 |
+
|
792 |
+
# Clean up URL to avoid double protocol issues
|
793 |
+
if trackio_url.startswith('https://https://'):
|
794 |
+
trackio_url = trackio_url.replace('https://https://', 'https://')
|
795 |
+
elif trackio_url.startswith('http://http://'):
|
796 |
+
trackio_url = trackio_url.replace('http://http://', 'http://')
|
797 |
+
|
798 |
+
api_client = TrackioAPIClient(trackio_url)
|
799 |
+
logger.info(f"β
API client initialized for remote data access: {trackio_url}")
|
800 |
except ImportError:
|
801 |
logger.warning("β οΈ API client not available, using local data only")
|
802 |
+
except Exception as e:
|
803 |
+
logger.warning(f"β οΈ Failed to initialize API client: {e}, using local data only")
|
804 |
|
805 |
# Add Hugging Face Spaces compatibility
|
806 |
def is_huggingface_spaces():
|
|
|
844 |
lines = experiment_details.split('\n')
|
845 |
metrics_data = []
|
846 |
|
847 |
+
# First try to parse the new format with structured experiment details
|
848 |
for line in lines:
|
849 |
if 'Step:' in line and 'Metrics:' in line:
|
850 |
# Extract step and metrics from the line
|
|
|
866 |
logger.warning(f"Failed to parse metrics line: {line} - {e}")
|
867 |
continue
|
868 |
|
869 |
+
# If no metrics found in text format, try to parse from the dataset directly
|
870 |
+
if not metrics_data:
|
871 |
+
logger.info("No metrics found in text format, trying to parse from experiment structure")
|
872 |
+
# This will be handled by the updated get_remote_experiment_data function
|
873 |
+
|
874 |
if metrics_data:
|
875 |
return pd.DataFrame(metrics_data)
|
876 |
else:
|
|
|
881 |
return pd.DataFrame()
|
882 |
|
883 |
def get_metrics_dataframe(experiment_id: str) -> pd.DataFrame:
    """Get metrics as a pandas DataFrame for plotting - tries dataset first, then local backup"""
    try:
        # 1) Preferred source: the HF dataset, via the dataset manager.
        if trackio_space.dataset_manager:
            logger.info(f"Getting metrics for {experiment_id} from dataset")
            record = trackio_space.dataset_manager.get_experiment_by_id(experiment_id)
            if record:
                raw_metrics = record.get('metrics', '[]')
                if isinstance(raw_metrics, str):
                    try:
                        # Each entry carries step/timestamp plus a nested metrics dict;
                        # flatten them into one row per logging step.
                        rows = []
                        for entry in json.loads(raw_metrics):
                            if isinstance(entry, dict):
                                row = {
                                    'step': entry.get('step', 0),
                                    'timestamp': entry.get('timestamp', ''),
                                }
                                row.update(entry.get('metrics', {}))
                                rows.append(row)
                        if rows:
                            logger.info(f"Found {len(rows)} metrics entries from dataset for {experiment_id}")
                            return pd.DataFrame(rows)
                        logger.warning(f"No valid metrics found in dataset for {experiment_id}")
                    except json.JSONDecodeError as e:
                        logger.warning(f"Failed to parse metrics JSON for {experiment_id}: {e}")
                else:
                    logger.warning(f"Metrics data is not a JSON string for {experiment_id}")
            else:
                logger.warning(f"Experiment {experiment_id} not found in dataset")

        # 2) Legacy source: the remote Trackio API.
        remote_data = get_remote_experiment_data(experiment_id)
        if remote_data:
            logger.info(f"Using remote API data for {experiment_id}")
            # Parse the remote experiment details to extract metrics
            frame = parse_remote_metrics_data(remote_data["data"])
            if not frame.empty:
                logger.info(f"Found {len(frame)} metrics entries from remote API")
                return frame
            logger.warning(f"No metrics found in remote API data for {experiment_id}")

        # 3) Last resort: locally stored backup data.
        logger.info(f"Using local backup data for {experiment_id}")
        return trackio_space.get_metrics_dataframe(experiment_id)

    except Exception as e:
        logger.error(f"Error getting metrics dataframe for {experiment_id}: {e}")
        # Fall back to local data
        logger.info(f"Falling back to local data for {experiment_id}")
        return trackio_space.get_metrics_dataframe(experiment_id)
|
943 |
|
944 |
def create_experiment_interface(name: str, description: str) -> str:
|
945 |
"""Create a new experiment"""
|
|
|
1196 |
except Exception as e:
|
1197 |
return f"β Error creating demo experiment: {str(e)}"
|
1198 |
|
1199 |
+
|
1200 |
+
# Helper functions for the new interface
|
1201 |
+
def get_experiment_dropdown_choices() -> list:
    """Get the list of experiments for the dropdown"""
    # An empty experiment store is represented by a single placeholder entry.
    ids = list(trackio_space.experiments.keys())
    return ids or ["No experiments available"]
|
1207 |
+
|
1208 |
+
def refresh_experiment_dropdown() -> gr.Dropdown:
    """Refresh the experiment dropdown and return the updated component.

    Returns a ``gr.Dropdown`` (not a tuple — the previous ``-> tuple``
    annotation was wrong) whose choices reflect the current experiment
    store and whose value is the first real experiment, or ``None`` when
    only the "No experiments available" placeholder is present.
    """
    choices = get_experiment_dropdown_choices()
    # Only preselect a value when a real experiment exists.
    has_real_choice = bool(choices) and choices[0] != "No experiments available"
    current_value = choices[0] if has_real_choice else None
    return gr.Dropdown(choices=choices, value=current_value)
|
1213 |
+
|
1214 |
+
def get_available_metrics_for_experiments(experiment_ids: list) -> list:
    """Get all available metrics across selected experiments"""
    try:
        collected = set()
        for exp_id in experiment_ids:
            frame = get_metrics_dataframe(exp_id)
            if frame.empty:
                continue
            # Only numeric columns are plottable; 'step' is the x-axis, not a metric.
            numeric = frame.select_dtypes(include=[np.number]).columns
            collected.update(col for col in numeric if col != 'step')
        return sorted(collected)
    except Exception as e:
        logger.error(f"Error getting available metrics: {str(e)}")
        # Reasonable defaults when metric discovery fails.
        return ["loss", "accuracy"]
|
1230 |
+
|
1231 |
+
def create_test_plot() -> go.Figure:
    """Create a simple test plot to verify plotly rendering works"""
    try:
        # Fixed sample points — the content is irrelevant, rendering is the test.
        sample_x = [1, 2, 3, 4, 5]
        sample_y = [1, 4, 2, 3, 5]

        figure = go.Figure()
        figure.add_trace(go.Scatter(
            x=sample_x,
            y=sample_y,
            mode='lines+markers',
            name='Test Data',
            line=dict(width=2, color='blue'),
            marker=dict(size=5, color='red'),
            connectgaps=True,
            hovertemplate='<b>X:</b> %{x}<br><b>Y:</b> %{y}<extra></extra>'
        ))
        figure.update_layout(
            title="Test Plot - If you can see this, plotly is working!",
            xaxis_title="X Axis",
            yaxis_title="Y Axis",
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(size=14),
            margin=dict(l=50, r=50, t=80, b=50)
        )
        figure.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')
        figure.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgray')

        logger.info("Test plot created successfully")
        return figure

    except Exception as e:
        logger.error(f"Error creating test plot: {str(e)}")
        # Surface the failure inside the plot area itself.
        fallback = go.Figure()
        fallback.add_annotation(
            text=f"Test plot error: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="red")
        )
        return fallback
|
1276 |
+
|
1277 |
+
def get_experiment_status_summary(experiment_id: str) -> str:
    """Get a formatted summary of experiment status and metadata"""
    try:
        experiment = trackio_space.get_experiment(experiment_id)
        if not experiment:
            return f"Experiment {experiment_id} not found."

        # Build the report as chunks and join once (avoids repeated concatenation).
        parts = [
            f"📊 EXPERIMENT STATUS SUMMARY\n{'='*50}\n",
            f"ID: {experiment['id']}\n",
            f"Name: {experiment['name']}\n",
            f"Description: {experiment['description']}\n",
            f"Status: {experiment['status']}\n",
            f"Created: {experiment['created_at']}\n",
            f"Metrics entries: {len(experiment['metrics'])}\n",
            f"Parameters: {len(experiment['parameters'])}\n",
            f"Artifacts: {len(experiment['artifacts'])}\n",
            f"Logs: {len(experiment['logs'])}\n",
        ]

        # Append the most recent metrics snapshot when any metrics exist.
        if experiment['metrics']:
            latest = experiment['metrics'][-1]
            parts.append(f"\n📈 LATEST METRICS (Step {latest.get('step', 'N/A')}):\n")
            parts.extend(f"  {k}: {v}\n" for k, v in latest.get('metrics', {}).items())

        return "".join(parts)
    except Exception as e:
        return f"Error generating status summary: {str(e)}"
|
1305 |
+
|
1306 |
+
def get_experiment_parameters_summary(experiment_id: str) -> str:
    """Get a formatted summary of experiment parameters"""
    try:
        experiment = trackio_space.get_experiment(experiment_id)
        if not experiment:
            return f"Experiment {experiment_id} not found."

        params = experiment.get('parameters', {})
        if not params:
            return "No parameters logged for this experiment."

        def _contains_any(key: str, needles) -> bool:
            # Case-insensitive substring match against any needle.
            lowered = key.lower()
            return any(n in lowered for n in needles)

        # Group parameters by category. Note: a key may land in more than one
        # of the first three groups (matching the original behavior); 'other'
        # only holds keys that matched none of them.
        model_params = {k: v for k, v in params.items()
                        if 'model' in k.lower() or 'name' in k.lower()}
        training_params = {k: v for k, v in params.items()
                           if _contains_any(k, ['learning', 'batch', 'epoch', 'step', 'iter', 'optimizer'])}
        data_params = {k: v for k, v in params.items()
                       if _contains_any(k, ['data', 'dataset', 'file', 'split'])}
        other_params = {k: v for k, v in params.items()
                        if k not in model_params and k not in training_params and k not in data_params}

        out = [f"🔧 PARAMETERS FOR {experiment_id}\n{'='*50}\n"]

        # The first three sections are followed by a blank line; 'other' is not.
        for heading, group in (
            ("🤖 MODEL PARAMETERS:\n", model_params),
            ("📊 TRAINING PARAMETERS:\n", training_params),
            ("📁 DATA PARAMETERS:\n", data_params),
        ):
            if group:
                out.append(heading)
                out.extend(f"  {k}: {v}\n" for k, v in group.items())
                out.append("\n")

        if other_params:
            out.append("⚙️ OTHER PARAMETERS:\n")
            out.extend(f"  {k}: {v}\n" for k, v in other_params.items())

        return "".join(out)
    except Exception as e:
        return f"Error generating parameters summary: {str(e)}"
|
1351 |
+
|
1352 |
+
def get_experiment_metrics_summary(experiment_id: str) -> str:
    """Get a summary of all metrics for an experiment"""
    try:
        df = get_metrics_dataframe(experiment_id)
        if df.empty:
            return "No metrics data available for this experiment.\n\n💡 This could mean:\n• The experiment hasn't started logging metrics yet\n• The experiment is using a different data format\n• No training has been performed on this experiment"

        # Numeric columns only; 'step' is the x-axis, not a metric.
        metric_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c != 'step']
        if not metric_cols:
            return "No numeric metrics found for this experiment.\n\n💡 This could mean:\n• Only timestamp data is available\n• Metrics are stored in a different format\n• The experiment hasn't logged any numeric metrics yet"

        pieces = [
            f"📊 METRICS SUMMARY FOR {experiment_id}\n{'='*50}\n",
            f"Total data points: {len(df)}\n",
            f"Steps range: {df['step'].min()} - {df['step'].max()}\n",
            f"Available metrics: {', '.join(metric_cols)}\n\n",
        ]

        # Per-metric descriptive statistics (ignoring NaN gaps).
        for col in metric_cols:
            series = df[col].dropna()
            if len(series) > 0:
                pieces.append(f"{col}:\n")
                pieces.append(f"  Min: {series.min():.6f}\n")
                pieces.append(f"  Max: {series.max():.6f}\n")
                pieces.append(f"  Mean: {series.mean():.6f}\n")
                pieces.append(f"  Latest: {series.iloc[-1]:.6f}\n\n")

        return "".join(pieces)
    except Exception as e:
        return f"Error generating metrics summary: {str(e)}"
|
1384 |
+
|
1385 |
+
def create_combined_metrics_plot(experiment_id: str) -> go.Figure:
    """Create a combined plot showing all metrics for an experiment.

    Builds one figure with a grid of subplots (max 3 columns), one subplot
    per numeric metric column. Placeholder figures with a centered
    annotation are returned when no experiment is selected, no data exists,
    or no numeric metrics are present.
    """
    try:
        # Placeholder: nothing selected yet.
        if not experiment_id:
            fig = go.Figure()
            fig.add_annotation(
                text="No experiment selected",
                xref="paper", yref="paper",
                x=0.5, y=0.5, showarrow=False,
                font=dict(size=16, color="gray")
            )
            fig.update_layout(
                title="Select an Experiment",
                plot_bgcolor='white', paper_bgcolor='white'
            )
            return fig

        df = get_metrics_dataframe(experiment_id)
        # Placeholder: experiment exists but has no logged rows.
        if df.empty:
            fig = go.Figure()
            fig.add_annotation(
                text="No metrics data available for this experiment",
                xref="paper", yref="paper",
                x=0.5, y=0.5, showarrow=False,
                font=dict(size=16, color="red")
            )
            fig.update_layout(
                title="No Data Available",
                plot_bgcolor='white', paper_bgcolor='white'
            )
            return fig

        # Get numeric columns (excluding step and timestamp); 'step' is the x-axis.
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        numeric_cols = [col for col in numeric_cols if col not in ['step']]

        # Placeholder: rows exist but contain no plottable numeric columns.
        if not numeric_cols:
            fig = go.Figure()
            fig.add_annotation(
                text="No numeric metrics found for this experiment",
                xref="paper", yref="paper",
                x=0.5, y=0.5, showarrow=False,
                font=dict(size=16, color="orange")
            )
            fig.update_layout(
                title="No Metrics Found",
                plot_bgcolor='white', paper_bgcolor='white'
            )
            return fig

        # Create subplots for multiple metrics.
        from plotly.subplots import make_subplots

        # Grid layout: at most 3 columns, as many rows as needed (ceil division).
        n_metrics = len(numeric_cols)
        n_cols = min(3, n_metrics)  # Max 3 columns
        n_rows = (n_metrics + n_cols - 1) // n_cols

        fig = make_subplots(
            rows=n_rows, cols=n_cols,
            subplot_titles=numeric_cols,
            vertical_spacing=0.05,
            horizontal_spacing=0.1
        )

        # Colors cycle when there are more metrics than entries here.
        colors = ['blue', 'red', 'green', 'orange', 'purple', 'brown', 'pink', 'gray', 'cyan', 'magenta']

        for i, metric in enumerate(numeric_cols):
            # Skip columns that are entirely NaN — nothing to draw.
            if metric in df.columns and not df[metric].isna().all():
                # Map the flat metric index onto the (row, col) grid (1-based).
                row = (i // n_cols) + 1
                col = (i % n_cols) + 1
                color = colors[i % len(colors)]

                fig.add_trace(
                    go.Scatter(
                        x=df['step'].tolist(),
                        y=df[metric].tolist(),
                        mode='lines+markers',
                        name=metric,
                        line=dict(width=2, color=color),
                        marker=dict(size=4, color=color),
                        showlegend=False,
                        connectgaps=True
                    ),
                    row=row, col=col
                )

        fig.update_layout(
            title=f"All Metrics for Experiment {experiment_id}",
            height=350 * n_rows,  # scale figure height with the number of subplot rows
            plot_bgcolor='white',
            paper_bgcolor='white',
            font=dict(size=12),
            margin=dict(l=50, r=50, t=80, b=50)
        )

        # Apply the same grid/zeroline styling to every subplot axis.
        for i in range(1, n_rows + 1):
            for j in range(1, n_cols + 1):
                fig.update_xaxes(
                    showgrid=True, gridwidth=1, gridcolor='lightgray',
                    zeroline=True, zerolinecolor='black',
                    row=i, col=j
                )
                fig.update_yaxes(
                    showgrid=True, gridwidth=1, gridcolor='lightgray',
                    zeroline=True, zerolinecolor='black',
                    row=i, col=j
                )

        return fig

    except Exception as e:
        logger.error(f"Error creating combined metrics plot: {str(e)}")
        # Error placeholder: report the failure inside the plot area.
        fig = go.Figure()
        fig.add_annotation(
            text=f"Error creating combined plot: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="red")
        )
        return fig
|
1508 |
+
|
1509 |
+
def update_dashboard(experiment_id: str) -> tuple:
    """Update all dashboard components for a selected experiment"""
    try:
        # Guard: nothing selected (or only the placeholder entry).
        if not experiment_id or experiment_id == "No experiments available":
            placeholder = "No experiment selected."
            return (
                "Please select an experiment from the dropdown.",
                placeholder,
                placeholder,
                create_combined_metrics_plot(""),
                placeholder,
            )

        # Assemble every dashboard panel from its dedicated builder.
        status = get_experiment_status_summary(experiment_id)
        parameters = get_experiment_parameters_summary(experiment_id)
        metrics = get_experiment_metrics_summary(experiment_id)
        plot = create_combined_metrics_plot(experiment_id)

        # The "complete summary" tab concatenates the three text panels.
        combined = f"{status}\n\n{parameters}\n\n{metrics}"

        return (status, parameters, metrics, plot, combined)
    except Exception as e:
        error_msg = f"Error updating dashboard: {str(e)}"
        return (error_msg, error_msg, error_msg, create_combined_metrics_plot(""), error_msg)
|
1540 |
+
|
1541 |
+
def update_dashboard_metric_plot(experiment_id: str, metric_name: str = "loss") -> go.Figure:
    """Update the dashboard metric plot for a selected experiment and metric"""
    try:
        # The placeholder entry maps to an empty selection.
        invalid = not experiment_id or experiment_id == "No experiments available"
        target = "" if invalid else experiment_id
        return create_metrics_plot(target, metric_name)
    except Exception as e:
        logger.error(f"Error updating dashboard metric plot: {str(e)}")
        return create_metrics_plot("", metric_name)
|
1551 |
+
|
1552 |
+
def create_experiment_comparison_from_selection(selected_experiments: list, selected_metrics: list) -> go.Figure:
    """Create experiment comparison from checkbox selections"""

    def _notice(message: str, title: str) -> go.Figure:
        # Blank figure carrying a centered orange annotation (empty-selection case).
        fig = go.Figure()
        fig.add_annotation(
            text=message,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=16, color="orange")
        )
        fig.update_layout(
            title=title,
            plot_bgcolor='white', paper_bgcolor='white'
        )
        return fig

    try:
        if not selected_experiments:
            return _notice("Please select at least one experiment to compare", "No Experiments Selected")
        if not selected_metrics:
            return _notice("Please select at least one metric to compare", "No Metrics Selected")

        # Delegate to the existing comparison function, which takes
        # a comma-separated string of experiment IDs.
        return create_experiment_comparison(",".join(selected_experiments))

    except Exception as e:
        logger.error(f"Error creating comparison from selection: {str(e)}")
        # Error figure uses a smaller red annotation than the notices above.
        fig = go.Figure()
        fig.add_annotation(
            text=f"Error creating comparison: {str(e)}",
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=14, color="red")
        )
        return fig
|
1597 |
+
|
1598 |
+
def refresh_comparison_options() -> tuple:
    """Refresh the experiment and metric options for comparison"""
    try:
        # Updated experiment list; the placeholder entry means "none".
        experiment_choices = get_experiment_dropdown_choices()
        if experiment_choices == ["No experiments available"]:
            experiment_choices = []

        # Metrics discovered across every known experiment.
        all_experiment_ids = list(trackio_space.experiments.keys())
        available_metrics = get_available_metrics_for_experiments(all_experiment_ids)

        # Prefer common metrics as the default selection, in this order.
        default_metrics = [
            m for m in ("loss", "accuracy", "learning_rate", "gpu_memory")
            if m in available_metrics
        ]
        # Otherwise fall back to the first couple of discovered metrics.
        if not default_metrics and available_metrics:
            default_metrics = available_metrics[:2]

        return (
            gr.CheckboxGroup(choices=experiment_choices, value=[]),
            gr.CheckboxGroup(choices=available_metrics, value=default_metrics),
        )
    except Exception as e:
        logger.error(f"Error refreshing comparison options: {str(e)}")
        return gr.CheckboxGroup(choices=[], value=[]), gr.CheckboxGroup(choices=["loss", "accuracy"], value=[])
|
1625 |
+
|
1626 |
# Create Gradio interface
|
1627 |
with gr.Blocks(title="Trackio - Experiment Tracking", theme=gr.themes.Soft()) as demo:
|
1628 |
gr.Markdown("# π Trackio Experiment Tracking & Monitoring")
|
1629 |
gr.Markdown("Monitor and track your ML experiments with real-time visualization!")
|
1630 |
|
1631 |
with gr.Tabs():
|
1632 |
+
# Dashboard Tab (NEW)
|
1633 |
+
with gr.Tab("π Dashboard"):
|
1634 |
+
gr.Markdown("### Comprehensive Experiment Dashboard")
|
1635 |
+
gr.Markdown("Select an experiment to view all its data, plots, and information in one place.")
|
1636 |
+
|
1637 |
+
# Row 1: Experiment Selection
|
1638 |
+
with gr.Row():
|
1639 |
+
with gr.Column(scale=3):
|
1640 |
+
# Experiment selection dropdown
|
1641 |
+
experiment_dropdown = gr.Dropdown(
|
1642 |
+
label="Select Experiment",
|
1643 |
+
choices=get_experiment_dropdown_choices(),
|
1644 |
+
value=get_experiment_dropdown_choices()[0] if get_experiment_dropdown_choices() and get_experiment_dropdown_choices()[0] != "No experiments available" else None,
|
1645 |
+
info="Choose an experiment to view its dashboard"
|
1646 |
+
)
|
1647 |
+
|
1648 |
+
with gr.Column(scale=1):
|
1649 |
+
with gr.Row():
|
1650 |
+
refresh_dropdown_btn = gr.Button("π Refresh List", variant="secondary", size="sm")
|
1651 |
+
refresh_dashboard_btn = gr.Button("π Refresh Dashboard", variant="primary", size="sm")
|
1652 |
+
|
1653 |
+
# Row 2: All Metrics Plots
|
1654 |
+
with gr.Row():
|
1655 |
+
with gr.Column(scale=3):
|
1656 |
+
with gr.Row():
|
1657 |
+
gr.Markdown("### π All Metrics Plots")
|
1658 |
+
with gr.Row():
|
1659 |
+
with gr.Column(scale=3):
|
1660 |
+
dashboard_plots = gr.Plot(
|
1661 |
+
label="Training Metrics",
|
1662 |
+
container=True,
|
1663 |
+
show_label=True,
|
1664 |
+
elem_classes=["plot-container"]
|
1665 |
+
)
|
1666 |
+
|
1667 |
+
# Row 3: Training Metrics Visualization Accordion
|
1668 |
+
with gr.Row():
|
1669 |
+
with gr.Accordion("π Training Metrics Visualization", open=False):
|
1670 |
+
with gr.Row():
|
1671 |
+
with gr.Column():
|
1672 |
+
metric_dropdown = gr.Dropdown(
|
1673 |
+
label="Metric to Plot",
|
1674 |
+
choices=[
|
1675 |
+
"loss", "accuracy", "learning_rate", "gpu_memory", "training_time",
|
1676 |
+
"total_tokens", "truncated_tokens", "padding_tokens", "throughput", "step_time",
|
1677 |
+
"batch_size", "seq_len", "token_acc", "train/gate_ortho", "train/center"
|
1678 |
+
],
|
1679 |
+
value="loss"
|
1680 |
+
)
|
1681 |
+
plot_btn = gr.Button("Create Plot", variant="primary")
|
1682 |
+
test_plot_btn = gr.Button("Test Plot Rendering", variant="secondary")
|
1683 |
+
|
1684 |
+
with gr.Row():
|
1685 |
+
dashboard_metric_plot = gr.Plot(
|
1686 |
+
label="Training Metrics",
|
1687 |
+
container=True,
|
1688 |
+
show_label=True,
|
1689 |
+
elem_classes=["plot-container"]
|
1690 |
+
)
|
1691 |
+
|
1692 |
+
plot_btn.click(
|
1693 |
+
create_metrics_plot,
|
1694 |
+
inputs=[experiment_dropdown, metric_dropdown],
|
1695 |
+
outputs=dashboard_metric_plot
|
1696 |
+
)
|
1697 |
+
|
1698 |
+
test_plot_btn.click(
|
1699 |
+
create_test_plot,
|
1700 |
+
inputs=[],
|
1701 |
+
outputs=dashboard_metric_plot
|
1702 |
+
)
|
1703 |
+
|
1704 |
+
# Row 4: Accordion with Detailed Information
|
1705 |
+
with gr.Row():
|
1706 |
+
with gr.Accordion("π Experiment Details", open=False):
|
1707 |
+
with gr.Tabs():
|
1708 |
+
with gr.Tab("π Status"):
|
1709 |
+
dashboard_status = gr.Textbox(
|
1710 |
+
label="Experiment Status",
|
1711 |
+
lines=8,
|
1712 |
+
interactive=False
|
1713 |
+
)
|
1714 |
+
|
1715 |
+
with gr.Tab("π§ Parameters"):
|
1716 |
+
dashboard_parameters = gr.Textbox(
|
1717 |
+
label="Experiment Parameters",
|
1718 |
+
lines=12,
|
1719 |
+
interactive=False
|
1720 |
+
)
|
1721 |
+
|
1722 |
+
with gr.Tab("π Metrics Summary"):
|
1723 |
+
dashboard_metrics = gr.Textbox(
|
1724 |
+
label="Metrics Summary",
|
1725 |
+
lines=12,
|
1726 |
+
interactive=False
|
1727 |
+
)
|
1728 |
+
|
1729 |
+
with gr.Tab("π Complete Summary"):
|
1730 |
+
dashboard_summary = gr.Textbox(
|
1731 |
+
label="Full Experiment Summary",
|
1732 |
+
lines=20,
|
1733 |
+
interactive=False
|
1734 |
+
)
|
1735 |
+
|
1736 |
+
# Connect the dashboard update function
|
1737 |
+
experiment_dropdown.change(
|
1738 |
+
update_dashboard,
|
1739 |
+
inputs=[experiment_dropdown],
|
1740 |
+
outputs=[dashboard_status, dashboard_parameters, dashboard_metrics, dashboard_plots, dashboard_summary]
|
1741 |
+
)
|
1742 |
+
|
1743 |
+
refresh_dashboard_btn.click(
|
1744 |
+
update_dashboard,
|
1745 |
+
inputs=[experiment_dropdown],
|
1746 |
+
outputs=[dashboard_status, dashboard_parameters, dashboard_metrics, dashboard_plots, dashboard_summary]
|
1747 |
+
)
|
1748 |
+
|
1749 |
+
# Connect the metric plot update function
|
1750 |
+
metric_dropdown.change(
|
1751 |
+
update_dashboard_metric_plot,
|
1752 |
+
inputs=[experiment_dropdown, metric_dropdown],
|
1753 |
+
outputs=[dashboard_metric_plot]
|
1754 |
+
)
|
1755 |
+
|
1756 |
+
refresh_dropdown_btn.click(
|
1757 |
+
refresh_experiment_dropdown,
|
1758 |
+
inputs=[],
|
1759 |
+
outputs=[experiment_dropdown]
|
1760 |
+
)
|
1761 |
+
|
1762 |
+
|
1763 |
+
# Experiment Comparison Tab
|
1764 |
+
with gr.Tab("π Experiment Comparison"):
|
1765 |
+
gr.Markdown("### Compare Multiple Experiments")
|
1766 |
+
gr.Markdown("Select experiments and metrics to compare from the available options below.")
|
1767 |
+
|
1768 |
+
# Selection controls
|
1769 |
+
with gr.Row():
|
1770 |
+
with gr.Column(scale=2):
|
1771 |
+
gr.Markdown("### Available Experiments")
|
1772 |
+
experiment_checkboxes = gr.CheckboxGroup(
|
1773 |
+
label="Select Experiments to Compare",
|
1774 |
+
choices=get_experiment_dropdown_choices(),
|
1775 |
+
value=[],
|
1776 |
+
info="Choose experiments to include in the comparison"
|
1777 |
+
)
|
1778 |
+
|
1779 |
+
gr.Markdown("### Available Metrics")
|
1780 |
+
metric_checkboxes = gr.CheckboxGroup(
|
1781 |
+
label="Select Metrics to Compare",
|
1782 |
+
choices=get_available_metrics_for_experiments(list(trackio_space.experiments.keys())),
|
1783 |
+
value=["loss", "accuracy"],
|
1784 |
+
info="Choose metrics to include in the comparison"
|
1785 |
+
)
|
1786 |
+
|
1787 |
+
with gr.Row():
|
1788 |
+
comparison_btn = gr.Button("Compare Selected", variant="primary")
|
1789 |
+
refresh_options_btn = gr.Button("π Refresh Options", variant="secondary")
|
1790 |
+
|
1791 |
+
with gr.Column(scale=1):
|
1792 |
+
gr.Markdown("### Comparison Results")
|
1793 |
+
gr.Markdown("The comparison will show subplots for the selected metrics across the selected experiments.")
|
1794 |
+
|
1795 |
+
# Comparison plots as subplots
|
1796 |
+
comparison_plot = gr.Plot(
|
1797 |
+
label="Experiment Comparison Dashboard",
|
1798 |
+
container=True,
|
1799 |
+
show_label=True,
|
1800 |
+
elem_classes=["plot-container"]
|
1801 |
+
)
|
1802 |
+
|
1803 |
+
comparison_btn.click(
|
1804 |
+
create_experiment_comparison_from_selection,
|
1805 |
+
inputs=[experiment_checkboxes, metric_checkboxes],
|
1806 |
+
outputs=comparison_plot
|
1807 |
+
)
|
1808 |
+
|
1809 |
+
refresh_options_btn.click(
|
1810 |
+
refresh_comparison_options,
|
1811 |
+
inputs=[],
|
1812 |
+
outputs=[experiment_checkboxes, metric_checkboxes]
|
1813 |
+
)
|
1814 |
+
|
1815 |
# Configuration Tab
|
1816 |
with gr.Tab("βοΈ Configuration"):
|
1817 |
gr.Markdown("### Configure HF Datasets Connection")
|
|
|
1828 |
dataset_repo_input = gr.Textbox(
|
1829 |
label="Dataset Repository",
|
1830 |
placeholder="your-username/your-dataset-name",
|
1831 |
+
value="Tonic/trackio-experiments",
|
1832 |
info="HF Dataset repository for experiment storage"
|
1833 |
)
|
1834 |
|
|
|
1840 |
gr.Markdown("### Current Configuration")
|
1841 |
current_config_output = gr.Textbox(
|
1842 |
label="Status",
|
1843 |
+
lines=10,
|
1844 |
interactive=False,
|
1845 |
+
value=f"π Dataset: {trackio_space.dataset_repo}\nπ HF Token: {'Set' if trackio_space.hf_token else 'Not set'}\nπ‘οΈ Data Preservation: {'β
Enabled' if trackio_space.dataset_manager else 'β οΈ Legacy Mode'}\nπ Experiments: {len(trackio_space.experiments)}\nπ Available Experiments: {', '.join(list(trackio_space.experiments.keys())[:3])}{'...' if len(trackio_space.experiments) > 3 else ''}"
|
1846 |
)
|
1847 |
|
1848 |
with gr.Column():
|
|
|
1865 |
- `HF_TOKEN`: Your Hugging Face token
|
1866 |
- `TRACKIO_DATASET_REPO`: Dataset repository
|
1867 |
|
1868 |
+
**Data Preservation:**
|
1869 |
+
- β
**Enabled**: All experiment data is preserved when adding/updating experiments
|
1870 |
+
- β οΈ **Legacy Mode**: Data preservation not guaranteed (fallback mode)
|
1871 |
+
- Data preservation requires the dataset management utilities to be available
|
1872 |
+
|
1873 |
**Actions:**
|
1874 |
- **Update Configuration**: Apply new settings and reload experiments
|
1875 |
- **Test Connection**: Verify access to the dataset repository
|
1876 |
- **Create Dataset**: Create a new dataset repository if it doesn't exist
|
1877 |
""")
|
1878 |
|
1879 |
+
# Experiment Management Accordion
|
1880 |
+
with gr.Accordion("π§ Experiment Management", open=False):
|
1881 |
+
with gr.Tabs():
|
1882 |
+
# Create Experiment Tab
|
1883 |
+
with gr.Tab("Create Experiment"):
|
1884 |
+
gr.Markdown("### Create a New Experiment")
|
1885 |
+
with gr.Row():
|
1886 |
+
with gr.Column():
|
1887 |
+
create_exp_name = gr.Textbox(
|
1888 |
+
label="Experiment Name",
|
1889 |
+
placeholder="my_smollm3_finetune",
|
1890 |
+
value="smollm3_finetune"
|
1891 |
+
)
|
1892 |
+
create_exp_description = gr.Textbox(
|
1893 |
+
label="Description",
|
1894 |
+
placeholder="Fine-tuning SmolLM3 model on custom dataset",
|
1895 |
+
value="SmolLM3 fine-tuning experiment"
|
1896 |
+
)
|
1897 |
+
create_exp_btn = gr.Button("Create Experiment", variant="primary")
|
1898 |
+
|
1899 |
+
with gr.Column():
|
1900 |
+
create_exp_output = gr.Textbox(
|
1901 |
+
label="Result",
|
1902 |
+
lines=5,
|
1903 |
+
interactive=False
|
1904 |
+
)
|
1905 |
+
|
1906 |
+
create_exp_btn.click(
|
1907 |
+
create_experiment_interface,
|
1908 |
+
inputs=[create_exp_name, create_exp_description],
|
1909 |
+
outputs=[create_exp_output, experiment_dropdown]
|
1910 |
+
)
|
1911 |
+
|
1912 |
+
# Log Metrics Tab
|
1913 |
+
with gr.Tab("Log Metrics"):
|
1914 |
+
gr.Markdown("### Log Training Metrics")
|
1915 |
+
with gr.Row():
|
1916 |
+
with gr.Column():
|
1917 |
+
log_metrics_exp_id = gr.Textbox(
|
1918 |
+
label="Experiment ID",
|
1919 |
+
placeholder="exp_20231201_143022"
|
1920 |
+
)
|
1921 |
+
log_metrics_json = gr.Textbox(
|
1922 |
+
label="Metrics (JSON)",
|
1923 |
+
placeholder='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5}',
|
1924 |
+
value='{"loss": 0.5, "accuracy": 0.85, "learning_rate": 2e-5, "gpu_memory": 22.5}'
|
1925 |
+
)
|
1926 |
+
log_metrics_step = gr.Textbox(
|
1927 |
+
label="Step (optional)",
|
1928 |
+
placeholder="100"
|
1929 |
+
)
|
1930 |
+
log_metrics_btn = gr.Button("Log Metrics", variant="primary")
|
1931 |
+
|
1932 |
+
with gr.Column():
|
1933 |
+
log_metrics_output = gr.Textbox(
|
1934 |
+
label="Result",
|
1935 |
+
lines=5,
|
1936 |
+
interactive=False
|
1937 |
+
)
|
1938 |
+
|
1939 |
+
log_metrics_btn.click(
|
1940 |
+
log_metrics_interface,
|
1941 |
+
inputs=[log_metrics_exp_id, log_metrics_json, log_metrics_step],
|
1942 |
+
outputs=log_metrics_output
|
1943 |
+
)
|
1944 |
+
|
1945 |
+
# Log Parameters Tab
|
1946 |
+
with gr.Tab("Log Parameters"):
|
1947 |
+
gr.Markdown("### Log Experiment Parameters")
|
1948 |
+
with gr.Row():
|
1949 |
+
with gr.Column():
|
1950 |
+
log_params_exp_id = gr.Textbox(
|
1951 |
+
label="Experiment ID",
|
1952 |
+
placeholder="exp_20231201_143022"
|
1953 |
+
)
|
1954 |
+
log_params_json = gr.Textbox(
|
1955 |
+
label="Parameters (JSON)",
|
1956 |
+
placeholder='{"learning_rate": 2e-5, "batch_size": 4}',
|
1957 |
+
value='{"learning_rate": 3.5e-6, "batch_size": 8, "model_name": "HuggingFaceTB/SmolLM3-3B", "max_iters": 18000, "mixed_precision": "bf16"}'
|
1958 |
+
)
|
1959 |
+
log_params_btn = gr.Button("Log Parameters", variant="primary")
|
1960 |
+
|
1961 |
+
with gr.Column():
|
1962 |
+
log_params_output = gr.Textbox(
|
1963 |
+
label="Result",
|
1964 |
+
lines=5,
|
1965 |
+
interactive=False
|
1966 |
+
)
|
1967 |
+
|
1968 |
+
log_params_btn.click(
|
1969 |
+
log_parameters_interface,
|
1970 |
+
inputs=[log_params_exp_id, log_params_json],
|
1971 |
+
outputs=log_params_output
|
1972 |
+
)
|
1973 |
+
|
1974 |
+
# View Experiments Tab
|
1975 |
+
with gr.Tab("View Experiments"):
|
1976 |
+
gr.Markdown("### View Experiment Details")
|
1977 |
+
with gr.Row():
|
1978 |
+
with gr.Column():
|
1979 |
+
view_exp_id = gr.Textbox(
|
1980 |
+
label="Experiment ID",
|
1981 |
+
placeholder="exp_20231201_143022"
|
1982 |
+
)
|
1983 |
+
view_btn = gr.Button("View Experiment", variant="primary")
|
1984 |
+
list_btn = gr.Button("List All Experiments", variant="secondary")
|
1985 |
+
|
1986 |
+
with gr.Column():
|
1987 |
+
view_output = gr.Textbox(
|
1988 |
+
label="Experiment Details",
|
1989 |
+
lines=20,
|
1990 |
+
interactive=False
|
1991 |
+
)
|
1992 |
+
|
1993 |
+
view_btn.click(
|
1994 |
+
get_experiment_details,
|
1995 |
+
inputs=[view_exp_id],
|
1996 |
+
outputs=view_output
|
1997 |
+
)
|
1998 |
+
|
1999 |
+
list_btn.click(
|
2000 |
+
list_experiments_interface,
|
2001 |
+
inputs=[],
|
2002 |
+
outputs=view_output
|
2003 |
+
)
|
2004 |
+
|
2005 |
+
# Update Status Tab
|
2006 |
+
with gr.Tab("Update Status"):
|
2007 |
+
gr.Markdown("### Update Experiment Status")
|
2008 |
+
with gr.Row():
|
2009 |
+
with gr.Column():
|
2010 |
+
status_exp_id = gr.Textbox(
|
2011 |
+
label="Experiment ID",
|
2012 |
+
placeholder="exp_20231201_143022"
|
2013 |
+
)
|
2014 |
+
status_dropdown = gr.Dropdown(
|
2015 |
+
label="Status",
|
2016 |
+
choices=["running", "completed", "failed", "paused"],
|
2017 |
+
value="running"
|
2018 |
+
)
|
2019 |
+
update_status_btn = gr.Button("Update Status", variant="primary")
|
2020 |
+
|
2021 |
+
with gr.Column():
|
2022 |
+
status_output = gr.Textbox(
|
2023 |
+
label="Result",
|
2024 |
+
lines=3,
|
2025 |
+
interactive=False
|
2026 |
+
)
|
2027 |
+
|
2028 |
+
update_status_btn.click(
|
2029 |
+
update_experiment_status_interface,
|
2030 |
+
inputs=[status_exp_id, status_dropdown],
|
2031 |
+
outputs=status_output
|
2032 |
+
)
|
2033 |
+
|
2034 |
+
# Demo Data Tab
|
2035 |
+
with gr.Tab("Demo Data"):
|
2036 |
+
gr.Markdown("### Generate Demo Training Data")
|
2037 |
+
gr.Markdown("Use this to simulate training data for testing the interface")
|
2038 |
+
with gr.Row():
|
2039 |
+
with gr.Column():
|
2040 |
+
demo_exp_id = gr.Textbox(
|
2041 |
+
label="Experiment ID",
|
2042 |
+
placeholder="exp_20231201_143022"
|
2043 |
+
)
|
2044 |
+
demo_btn = gr.Button("Generate Demo Data", variant="primary")
|
2045 |
+
create_demo_btn = gr.Button("Create Demo Experiment", variant="secondary")
|
2046 |
+
|
2047 |
+
with gr.Column():
|
2048 |
+
demo_output = gr.Textbox(
|
2049 |
+
label="Result",
|
2050 |
+
lines=5,
|
2051 |
+
interactive=False
|
2052 |
+
)
|
2053 |
+
|
2054 |
+
demo_btn.click(
|
2055 |
+
simulate_training_data,
|
2056 |
+
inputs=[demo_exp_id],
|
2057 |
+
outputs=[demo_output, dashboard_status, dashboard_parameters, dashboard_metrics, dashboard_plots, dashboard_summary]
|
2058 |
+
)
|
2059 |
+
|
2060 |
+
create_demo_btn.click(
|
2061 |
+
create_demo_experiment,
|
2062 |
+
inputs=[],
|
2063 |
+
outputs=[demo_output, experiment_dropdown]
|
2064 |
+
)
|
2065 |
+
|
2066 |
update_config_btn.click(
|
2067 |
update_trackio_config,
|
2068 |
inputs=[hf_token_input, dataset_repo_input],
|
|
|
2080 |
inputs=[hf_token_input, dataset_repo_input],
|
2081 |
outputs=current_config_output
|
2082 |
)
|
2083 |
+
|
2084 |
|
2085 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2086 |
|
2087 |
# Launch the app
|
2088 |
if __name__ == "__main__":
|
templates/spaces/trackio/dataset_utils.py
ADDED
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Dataset utilities for Trackio experiment data management
|
4 |
+
Provides functions for safe dataset operations with data preservation
|
5 |
+
"""
|
6 |
+
|
7 |
+
import json
|
8 |
+
import logging
|
9 |
+
from datetime import datetime
|
10 |
+
from typing import Dict, Any, List, Optional, Union
|
11 |
+
from datasets import Dataset, load_dataset
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
class TrackioDatasetManager:
    """
    Manager class for Trackio experiment datasets with data preservation.

    This class ensures that existing experiment data is always preserved
    when adding new experiments or updating existing ones.
    """

    def __init__(self, dataset_repo: str, hf_token: str):
        """
        Initialize the dataset manager.

        Args:
            dataset_repo (str): HF dataset repository ID (e.g., "username/dataset-name")
            hf_token (str): Hugging Face token for authentication

        Raises:
            ValueError: If dataset_repo is not in "owner/name" format.
        """
        self.dataset_repo = dataset_repo
        self.hf_token = hf_token
        self._validate_repo_format()

    def _validate_repo_format(self):
        """Validate dataset repository format ("owner/name")."""
        if not self.dataset_repo or '/' not in self.dataset_repo:
            raise ValueError(f"Invalid dataset repository format: {self.dataset_repo}")

    def check_dataset_exists(self) -> bool:
        """
        Check if the dataset repository exists and is accessible.

        NOTE: this downloads the dataset as a side effect. When the data is
        also needed, call load_existing_experiments() directly instead of
        checking first, to avoid downloading the dataset twice.

        Returns:
            bool: True if dataset exists and is accessible, False otherwise
        """
        try:
            load_dataset(self.dataset_repo, token=self.hf_token)
            logger.info(f"✅ Dataset {self.dataset_repo} exists and is accessible")
            return True
        except Exception as e:
            logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
            return False

    def load_existing_experiments(self) -> List[Dict[str, Any]]:
        """
        Load all existing experiments from the dataset.

        Returns:
            List[Dict[str, Any]]: Structurally valid experiment dictionaries
            (invalid records are skipped); empty list if the dataset is
            missing or cannot be read.
        """
        try:
            # EAFP: load once instead of check_dataset_exists() followed by a
            # second load_dataset() call, which downloaded the dataset twice.
            dataset = load_dataset(self.dataset_repo, token=self.hf_token)
        except Exception as e:
            logger.info(f"📊 Dataset {self.dataset_repo} doesn't exist or isn't accessible: {e}")
            return []

        try:
            if 'train' not in dataset:
                logger.info("📊 No 'train' split found in dataset")
                return []

            experiments = list(dataset['train'])
            logger.info(f"📊 Loaded {len(experiments)} existing experiments")

            # Drop records that don't match the expected schema so one bad
            # row cannot corrupt later merges/saves.
            valid_experiments = []
            for exp in experiments:
                if self._validate_experiment_structure(exp):
                    valid_experiments.append(exp)
                else:
                    logger.warning(f"⚠️ Skipping invalid experiment: {exp.get('experiment_id', 'unknown')}")

            logger.info(f"📊 {len(valid_experiments)} valid experiments loaded")
            return valid_experiments

        except Exception as e:
            logger.error(f"❌ Failed to load existing experiments: {e}")
            return []

    def _validate_experiment_structure(self, experiment: Dict[str, Any]) -> bool:
        """
        Validate that an experiment has the required structure.

        Args:
            experiment (Dict[str, Any]): Experiment dictionary to validate

        Returns:
            bool: True if all required fields are present and every
            string-valued JSON field parses as valid JSON.
        """
        required_fields = [
            'experiment_id', 'name', 'description', 'created_at',
            'status', 'metrics', 'parameters', 'artifacts', 'logs'
        ]

        for field in required_fields:
            if field not in experiment:
                logger.warning(f"⚠️ Missing required field '{field}' in experiment")
                return False

        # Fields stored as JSON strings must actually parse; non-string
        # values (already-deserialized objects) are accepted as-is.
        json_fields = ['metrics', 'parameters', 'artifacts', 'logs']
        for field in json_fields:
            if isinstance(experiment[field], str):
                try:
                    json.loads(experiment[field])
                except json.JSONDecodeError:
                    logger.warning(f"⚠️ Invalid JSON in field '{field}' for experiment {experiment.get('experiment_id')}")
                    return False

        return True

    def save_experiments(self, experiments: List[Dict[str, Any]], commit_message: Optional[str] = None) -> bool:
        """
        Save a list of experiments to the dataset, preserving data integrity.

        All experiments are validated before anything is pushed, so a single
        invalid record aborts the save instead of overwriting good data.

        Args:
            experiments (List[Dict[str, Any]]): List of experiment dictionaries
            commit_message (Optional[str]): Custom commit message

        Returns:
            bool: True if save was successful, False otherwise
        """
        try:
            if not experiments:
                logger.warning("⚠️ No experiments to save")
                return False

            # Validate all experiments before saving
            valid_experiments = []
            for exp in experiments:
                if self._validate_experiment_structure(exp):
                    # Ensure last_updated is set
                    if 'last_updated' not in exp:
                        exp['last_updated'] = datetime.now().isoformat()
                    valid_experiments.append(exp)
                else:
                    logger.error(f"❌ Invalid experiment structure: {exp.get('experiment_id', 'unknown')}")
                    return False

            # Create dataset
            dataset = Dataset.from_list(valid_experiments)

            # Generate commit message if not provided
            if not commit_message:
                commit_message = f"Update dataset with {len(valid_experiments)} experiments ({datetime.now().isoformat()})"

            # Push to hub (kept private: experiment data may contain secrets)
            dataset.push_to_hub(
                self.dataset_repo,
                token=self.hf_token,
                private=True,
                commit_message=commit_message
            )

            logger.info(f"✅ Successfully saved {len(valid_experiments)} experiments to {self.dataset_repo}")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to save experiments to dataset: {e}")
            return False

    def upsert_experiment(self, experiment: Dict[str, Any]) -> bool:
        """
        Insert a new experiment or update an existing one, preserving all other data.

        Args:
            experiment (Dict[str, Any]): Experiment dictionary to upsert

        Returns:
            bool: True if operation was successful, False otherwise
        """
        try:
            # Validate the experiment structure up front
            if not self._validate_experiment_structure(experiment):
                logger.error(f"❌ Invalid experiment structure for {experiment.get('experiment_id', 'unknown')}")
                return False

            # Load existing experiments so they are preserved in the rewrite
            existing_experiments = self.load_existing_experiments()

            experiment_id = experiment['experiment_id']
            experiment_found = False
            updated_experiments = []

            for existing_exp in existing_experiments:
                if existing_exp.get('experiment_id') == experiment_id:
                    # Replace existing experiment with the new version
                    logger.info(f"🔄 Updating existing experiment: {experiment_id}")
                    experiment['last_updated'] = datetime.now().isoformat()
                    updated_experiments.append(experiment)
                    experiment_found = True
                else:
                    # Preserve existing experiment untouched
                    updated_experiments.append(existing_exp)

            # If experiment doesn't exist yet, append it
            if not experiment_found:
                logger.info(f"➕ Adding new experiment: {experiment_id}")
                experiment['last_updated'] = datetime.now().isoformat()
                updated_experiments.append(experiment)

            commit_message = f"{'Update' if experiment_found else 'Add'} experiment {experiment_id} (preserving {len(existing_experiments)} existing experiments)"

            return self.save_experiments(updated_experiments, commit_message)

        except Exception as e:
            logger.error(f"❌ Failed to upsert experiment: {e}")
            return False

    def get_experiment_by_id(self, experiment_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve a specific experiment by its ID.

        Args:
            experiment_id (str): The experiment ID to search for

        Returns:
            Optional[Dict[str, Any]]: The experiment dictionary if found, None otherwise
        """
        try:
            experiments = self.load_existing_experiments()

            for exp in experiments:
                if exp.get('experiment_id') == experiment_id:
                    logger.info(f"✅ Found experiment: {experiment_id}")
                    return exp

            logger.info(f"📊 Experiment not found: {experiment_id}")
            return None

        except Exception as e:
            logger.error(f"❌ Failed to get experiment {experiment_id}: {e}")
            return None

    def list_experiments(self, status_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        List all experiments, optionally filtered by status.

        Args:
            status_filter (Optional[str]): Filter by experiment status
                (running, completed, failed, paused)

        Returns:
            List[Dict[str, Any]]: List of experiments matching the filter
        """
        try:
            experiments = self.load_existing_experiments()

            if status_filter:
                filtered_experiments = [exp for exp in experiments if exp.get('status') == status_filter]
                logger.info(f"📊 Found {len(filtered_experiments)} experiments with status '{status_filter}'")
                return filtered_experiments

            logger.info(f"📊 Found {len(experiments)} total experiments")
            return experiments

        except Exception as e:
            logger.error(f"❌ Failed to list experiments: {e}")
            return []

    def backup_dataset(self, backup_suffix: Optional[str] = None) -> str:
        """
        Create a backup of the current dataset in a sibling repository.

        Args:
            backup_suffix (Optional[str]): Optional suffix for backup repo name
                (defaults to a timestamp).

        Returns:
            str: Backup repository name if successful, empty string otherwise
        """
        try:
            if not backup_suffix:
                backup_suffix = datetime.now().strftime('%Y%m%d_%H%M%S')

            backup_repo = f"{self.dataset_repo}-backup-{backup_suffix}"

            # Load current experiments
            experiments = self.load_existing_experiments()

            if not experiments:
                logger.warning("⚠️ No experiments to backup")
                return ""

            # Reuse the manager machinery pointed at the backup repo
            backup_manager = TrackioDatasetManager(backup_repo, self.hf_token)

            success = backup_manager.save_experiments(
                experiments,
                f"Backup of {self.dataset_repo} created on {datetime.now().isoformat()}"
            )

            if success:
                logger.info(f"✅ Backup created: {backup_repo}")
                return backup_repo
            else:
                logger.error("❌ Failed to create backup")
                return ""

        except Exception as e:
            logger.error(f"❌ Failed to create backup: {e}")
            return ""
return ""
|
315 |
+
|
316 |
+
|
317 |
+
def create_dataset_manager(dataset_repo: str, hf_token: str) -> TrackioDatasetManager:
    """
    Build a configured :class:`TrackioDatasetManager` for the given repository.

    Args:
        dataset_repo (str): HF dataset repository ID
        hf_token (str): Hugging Face token

    Returns:
        TrackioDatasetManager: Configured dataset manager instance
    """
    manager = TrackioDatasetManager(dataset_repo, hf_token)
    return manager
|
templates/spaces/{requirements.txt β trackio/requirements.txt}
RENAMED
File without changes
|
templates/spaces/trackio/trackio_api_client.py
ADDED
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Trackio API Client for Hugging Face Spaces
|
4 |
+
Uses gradio_client for proper API communication with automatic Space URL resolution
|
5 |
+
"""
|
6 |
+
|
7 |
+
import requests
|
8 |
+
import json
|
9 |
+
import time
|
10 |
+
import logging
|
11 |
+
from typing import Dict, Any, Optional
|
12 |
+
from datetime import datetime
|
13 |
+
import os
|
14 |
+
|
15 |
+
# Setup logging
|
16 |
+
logging.basicConfig(level=logging.INFO)
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
try:
|
20 |
+
from gradio_client import Client
|
21 |
+
GRADIO_CLIENT_AVAILABLE = True
|
22 |
+
except ImportError:
|
23 |
+
GRADIO_CLIENT_AVAILABLE = False
|
24 |
+
logger.warning("gradio_client not available. Install with: pip install gradio_client")
|
25 |
+
|
26 |
+
try:
|
27 |
+
from huggingface_hub import HfApi
|
28 |
+
HF_HUB_AVAILABLE = True
|
29 |
+
except ImportError:
|
30 |
+
HF_HUB_AVAILABLE = False
|
31 |
+
logger.warning("huggingface_hub not available. Install with: pip install huggingface-hub")
|
32 |
+
|
33 |
+
class TrackioAPIClient:
    """API client for Trackio Space using gradio_client with automatic Space URL resolution."""

    def __init__(self, space_id: str, hf_token: Optional[str] = None):
        """
        Args:
            space_id: Space ID ("user/space") or a full Space URL.
            hf_token: Optional HF token used for URL resolution and Space info.
        """
        self.space_id = space_id
        self.hf_token = hf_token
        self.client = None

        # Auto-resolve the concrete *.hf.space URL for this Space
        self.space_url = self._resolve_space_url()

        # Initialize gradio client
        if GRADIO_CLIENT_AVAILABLE and self.space_url:
            try:
                self.client = Client(self.space_url)
                logger.info(f"✅ Connected to Trackio Space: {self.space_id}")
            except Exception as e:
                logger.error(f"❌ Failed to connect to Trackio Space: {e}")
                self.client = None
        else:
            logger.error("❌ gradio_client not available. Install with: pip install gradio_client")

    def _resolve_space_url(self) -> Optional[str]:
        """Resolve Space URL using the Hugging Face Hub API.

        Falls back to the conventional "https://{user}-{space}.hf.space"
        format when the Hub API is unavailable or resolution fails.
        """
        try:
            # Clean the space_id - strip any URL prefixes down to "user/space"
            clean_space_id = self.space_id
            if clean_space_id.startswith('http'):
                if '/spaces/' in clean_space_id:
                    clean_space_id = clean_space_id.split('/spaces/')[-1]
                else:
                    # Try to recover the ID from a direct *.hf.space URL
                    clean_space_id = clean_space_id.replace('https://', '').replace('http://', '')
                    if '.hf.space' in clean_space_id:
                        clean_space_id = clean_space_id.replace('.hf.space', '').replace('-', '/')

            logger.info(f"🔧 Resolving Space URL for ID: {clean_space_id}")

            if not HF_HUB_AVAILABLE:
                logger.warning("⚠️ Hugging Face Hub not available, using default URL format")
                # Fallback to default URL format
                space_name = clean_space_id.replace('/', '-')
                return f"https://{space_name}.hf.space"

            # Use Hugging Face Hub API to get Space info
            api = HfApi(token=self.hf_token)
            space_info = api.space_info(clean_space_id)
            if space_info and hasattr(space_info, 'host'):
                # Use the host reported by the Hub directly
                space_url = space_info.host
                logger.info(f"✅ Resolved Space URL: {space_url}")
                return space_url

            # Fallback to default URL format
            space_name = clean_space_id.replace('/', '-')
            space_url = f"https://{space_name}.hf.space"
            logger.info(f"✅ Using fallback Space URL: {space_url}")
            return space_url

        except Exception as e:
            logger.warning(f"⚠️ Failed to resolve Space URL: {e}")
            # Last-resort fallback built from the raw (uncleaned) space_id
            space_name = self.space_id.replace('/', '-')
            space_url = f"https://{space_name}.hf.space"
            logger.info(f"✅ Using fallback Space URL: {space_url}")
            return space_url

    def _make_api_call(self, api_name: str, *args) -> Dict[str, Any]:
        """Make an API call to the Trackio Space using gradio_client.

        Returns {"success": True, "data": ...} on success or
        {"error": ...} on failure; callers test for the "success" key.
        """
        if not self.client:
            return {"error": "Client not available"}

        try:
            logger.debug(f"Making API call to {api_name} with args: {args}")

            result = self.client.predict(*args, api_name=api_name)

            logger.debug(f"API call result: {result}")
            return {"success": True, "data": result}

        except Exception as e:
            logger.error(f"API call failed for {api_name}: {e}")
            return {"error": f"API call failed: {str(e)}"}

    def _logged_call(self, api_name: str, success_msg: str, failure_msg: str,
                     *args, log_data: bool = True) -> Dict[str, Any]:
        """Run _make_api_call and emit uniform success/failure logs.

        Consolidates the identical call-log-return pattern previously
        duplicated in every public endpoint wrapper below.
        """
        result = self._make_api_call(api_name, *args)
        if "success" in result:
            if log_data:
                logger.info(f"{success_msg}: {result['data']}")
            else:
                logger.info(success_msg)
        else:
            logger.error(f"{failure_msg}: {result}")
        return result

    def create_experiment(self, name: str, description: str = "") -> Dict[str, Any]:
        """Create a new experiment."""
        logger.info(f"Creating experiment: {name}")
        return self._logged_call(
            "/create_experiment_interface",
            "Experiment created successfully", "Failed to create experiment",
            name, description,
        )

    def log_metrics(self, experiment_id: str, metrics: Dict[str, Any], step: Optional[int] = None) -> Dict[str, Any]:
        """Log metrics for an experiment."""
        metrics_json = json.dumps(metrics)
        # The Space API expects the step as a string; empty means "no step"
        step_str = str(step) if step is not None else ""
        logger.info(f"Logging metrics for experiment {experiment_id} at step {step}")
        return self._logged_call(
            "/log_metrics_interface",
            "Metrics logged successfully", "Failed to log metrics",
            experiment_id, metrics_json, step_str,
        )

    def log_parameters(self, experiment_id: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
        """Log parameters for an experiment."""
        parameters_json = json.dumps(parameters)
        logger.info(f"Logging parameters for experiment {experiment_id}")
        return self._logged_call(
            "/log_parameters_interface",
            "Parameters logged successfully", "Failed to log parameters",
            experiment_id, parameters_json,
        )

    def get_experiment_details(self, experiment_id: str) -> Dict[str, Any]:
        """Get experiment details."""
        logger.info(f"Getting details for experiment {experiment_id}")
        return self._logged_call(
            "/get_experiment_details",
            "Experiment details retrieved", "Failed to get experiment details",
            experiment_id,
        )

    def list_experiments(self) -> Dict[str, Any]:
        """List all experiments."""
        logger.info("Listing experiments")
        return self._logged_call(
            "/list_experiments_interface",
            "Experiments listed successfully", "Failed to list experiments",
        )

    def update_experiment_status(self, experiment_id: str, status: str) -> Dict[str, Any]:
        """Update experiment status."""
        logger.info(f"Updating experiment {experiment_id} status to {status}")
        return self._logged_call(
            "/update_experiment_status_interface",
            "Experiment status updated successfully", "Failed to update experiment status",
            experiment_id, status,
        )

    def simulate_training_data(self, experiment_id: str) -> Dict[str, Any]:
        """Simulate training data for testing."""
        logger.info(f"Simulating training data for experiment {experiment_id}")
        return self._logged_call(
            "/simulate_training_data",
            "Training data simulated successfully", "Failed to simulate training data",
            experiment_id,
        )

    def get_training_metrics(self, experiment_id: str) -> Dict[str, Any]:
        """Get training metrics for an experiment.

        NOTE: backed by the same Space endpoint as get_experiment_details.
        """
        logger.info(f"Getting training metrics for experiment {experiment_id}")
        return self._logged_call(
            "/get_experiment_details",
            "Training metrics retrieved", "Failed to get training metrics",
            experiment_id,
        )

    def create_metrics_plot(self, experiment_id: str, metric_name: str = "loss") -> Dict[str, Any]:
        """Create a metrics plot for an experiment."""
        logger.info(f"Creating metrics plot for experiment {experiment_id}, metric: {metric_name}")
        # Plot payloads are large; don't echo them into the log
        return self._logged_call(
            "/create_metrics_plot",
            "Metrics plot created successfully", "Failed to create metrics plot",
            experiment_id, metric_name, log_data=False,
        )

    def create_experiment_comparison(self, experiment_ids: str) -> Dict[str, Any]:
        """Compare multiple experiments."""
        logger.info(f"Creating experiment comparison for: {experiment_ids}")
        return self._logged_call(
            "/create_experiment_comparison",
            "Experiment comparison created successfully", "Failed to create experiment comparison",
            experiment_ids, log_data=False,
        )

    def test_connection(self) -> Dict[str, Any]:
        """Test connection to the Trackio Space by listing experiments."""
        logger.info("Testing connection to Trackio Space")

        try:
            result = self.list_experiments()
            if "success" in result:
                return {"success": True, "message": "Connection successful"}
            else:
                return {"error": "Connection failed", "details": result}
        except Exception as e:
            return {"error": f"Connection test failed: {str(e)}"}

    def get_space_info(self) -> Dict[str, Any]:
        """Get information about the Space from the Hugging Face Hub."""
        try:
            if not HF_HUB_AVAILABLE:
                return {"error": "Hugging Face Hub not available"}

            api = HfApi(token=self.hf_token)
            space_info = api.space_info(self.space_id)

            return {
                "success": True,
                "data": {
                    "space_id": self.space_id,
                    "space_url": self.space_url,
                    "space_info": {
                        "title": getattr(space_info, 'title', 'Unknown'),
                        "host": getattr(space_info, 'host', 'Unknown'),
                        "stage": getattr(space_info, 'stage', 'Unknown'),
                        "visibility": getattr(space_info, 'visibility', 'Unknown')
                    }
                }
            }
        except Exception as e:
            return {"error": f"Failed to get Space info: {str(e)}"}
|
294 |
+
|
295 |
+
# Factory function to create client with dynamic configuration
|
296 |
+
def create_trackio_client(space_id: Optional[str] = None, hf_token: Optional[str] = None) -> TrackioAPIClient:
|
297 |
+
"""Create a TrackioAPIClient with dynamic configuration"""
|
298 |
+
|
299 |
+
# Get space_id from environment if not provided
|
300 |
+
if not space_id:
|
301 |
+
space_id = os.environ.get('TRACKIO_URL')
|
302 |
+
if not space_id:
|
303 |
+
# Try to construct from username and space name
|
304 |
+
username = os.environ.get('HF_USERNAME')
|
305 |
+
space_name = os.environ.get('TRACKIO_SPACE_NAME')
|
306 |
+
if username and space_name:
|
307 |
+
space_id = f"https://huggingface.co/spaces/{username}/{space_name}"
|
308 |
+
else:
|
309 |
+
logger.warning("β οΈ No space_id provided and could not determine from environment")
|
310 |
+
return None
|
311 |
+
|
312 |
+
# Get HF token from environment if not provided
|
313 |
+
if not hf_token:
|
314 |
+
hf_token = os.environ.get('HF_TOKEN')
|
315 |
+
|
316 |
+
if not space_id:
|
317 |
+
logger.error("β No space_id available for TrackioAPIClient")
|
318 |
+
return None
|
319 |
+
|
320 |
+
return TrackioAPIClient(space_id, hf_token)
|
tests/test_data_preservation.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script to validate data preservation in Trackio dataset operations
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
import json
|
9 |
+
import tempfile
|
10 |
+
import logging
|
11 |
+
from datetime import datetime
|
12 |
+
from typing import Dict, Any
|
13 |
+
|
14 |
+
# Add src to path for imports
|
15 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
|
16 |
+
|
17 |
+
from dataset_utils import TrackioDatasetManager
|
18 |
+
|
19 |
+
# Setup logging
|
20 |
+
logging.basicConfig(level=logging.INFO)
|
21 |
+
logger = logging.getLogger(__name__)
|
22 |
+
|
23 |
+
def create_sample_experiment(experiment_id: str, name: str, status: str = "running") -> Dict[str, Any]:
|
24 |
+
"""Create a sample experiment for testing"""
|
25 |
+
return {
|
26 |
+
'experiment_id': experiment_id,
|
27 |
+
'name': name,
|
28 |
+
'description': f"Test experiment {name}",
|
29 |
+
'created_at': datetime.now().isoformat(),
|
30 |
+
'status': status,
|
31 |
+
'metrics': json.dumps([
|
32 |
+
{
|
33 |
+
'timestamp': datetime.now().isoformat(),
|
34 |
+
'step': 100,
|
35 |
+
'metrics': {
|
36 |
+
'loss': 1.5,
|
37 |
+
'accuracy': 0.85,
|
38 |
+
'learning_rate': 5e-6
|
39 |
+
}
|
40 |
+
}
|
41 |
+
]),
|
42 |
+
'parameters': json.dumps({
|
43 |
+
'model_name': 'HuggingFaceTB/SmolLM3-3B',
|
44 |
+
'batch_size': 8,
|
45 |
+
'learning_rate': 5e-6
|
46 |
+
}),
|
47 |
+
'artifacts': json.dumps([]),
|
48 |
+
'logs': json.dumps([]),
|
49 |
+
'last_updated': datetime.now().isoformat()
|
50 |
+
}
|
51 |
+
|
52 |
+
def test_data_preservation():
|
53 |
+
"""Test data preservation functionality"""
|
54 |
+
# Get HF token from environment
|
55 |
+
hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
|
56 |
+
|
57 |
+
if not hf_token:
|
58 |
+
logger.error("β HF_TOKEN not found in environment variables")
|
59 |
+
logger.info("Please set HF_TOKEN or HUGGING_FACE_HUB_TOKEN environment variable")
|
60 |
+
return False
|
61 |
+
|
62 |
+
# Use a test dataset repository
|
63 |
+
test_dataset_repo = "tonic/trackio-test-preservation"
|
64 |
+
|
65 |
+
try:
|
66 |
+
logger.info("π§ͺ Starting data preservation test")
|
67 |
+
logger.info(f"π Test dataset: {test_dataset_repo}")
|
68 |
+
|
69 |
+
# Initialize dataset manager
|
70 |
+
dataset_manager = TrackioDatasetManager(test_dataset_repo, hf_token)
|
71 |
+
|
72 |
+
# Test 1: Check if dataset exists
|
73 |
+
logger.info("\nπ Test 1: Checking dataset existence...")
|
74 |
+
exists = dataset_manager.check_dataset_exists()
|
75 |
+
logger.info(f"Dataset exists: {exists}")
|
76 |
+
|
77 |
+
# Test 2: Load existing experiments (should handle empty/non-existent gracefully)
|
78 |
+
logger.info("\nπ Test 2: Loading existing experiments...")
|
79 |
+
existing_experiments = dataset_manager.load_existing_experiments()
|
80 |
+
logger.info(f"Found {len(existing_experiments)} existing experiments")
|
81 |
+
|
82 |
+
# Test 3: Add first experiment
|
83 |
+
logger.info("\nπ Test 3: Adding first experiment...")
|
84 |
+
exp1 = create_sample_experiment("test_exp_001", "First Test Experiment")
|
85 |
+
success = dataset_manager.upsert_experiment(exp1)
|
86 |
+
logger.info(f"First experiment added: {success}")
|
87 |
+
|
88 |
+
if not success:
|
89 |
+
logger.error("β Failed to add first experiment")
|
90 |
+
return False
|
91 |
+
|
92 |
+
# Test 4: Add second experiment (should preserve first)
|
93 |
+
logger.info("\nπ Test 4: Adding second experiment...")
|
94 |
+
exp2 = create_sample_experiment("test_exp_002", "Second Test Experiment")
|
95 |
+
success = dataset_manager.upsert_experiment(exp2)
|
96 |
+
logger.info(f"Second experiment added: {success}")
|
97 |
+
|
98 |
+
if not success:
|
99 |
+
logger.error("β Failed to add second experiment")
|
100 |
+
return False
|
101 |
+
|
102 |
+
# Test 5: Verify both experiments exist
|
103 |
+
logger.info("\nπ Test 5: Verifying both experiments exist...")
|
104 |
+
all_experiments = dataset_manager.load_existing_experiments()
|
105 |
+
logger.info(f"Total experiments after adding two: {len(all_experiments)}")
|
106 |
+
|
107 |
+
exp_ids = [exp.get('experiment_id') for exp in all_experiments]
|
108 |
+
if "test_exp_001" in exp_ids and "test_exp_002" in exp_ids:
|
109 |
+
logger.info("β
Both experiments preserved successfully")
|
110 |
+
else:
|
111 |
+
logger.error(f"β Experiments not preserved. Found IDs: {exp_ids}")
|
112 |
+
return False
|
113 |
+
|
114 |
+
# Test 6: Update existing experiment (should preserve others)
|
115 |
+
logger.info("\nπ Test 6: Updating first experiment...")
|
116 |
+
exp1_updated = create_sample_experiment("test_exp_001", "Updated First Experiment", "completed")
|
117 |
+
success = dataset_manager.upsert_experiment(exp1_updated)
|
118 |
+
logger.info(f"First experiment updated: {success}")
|
119 |
+
|
120 |
+
if not success:
|
121 |
+
logger.error("β Failed to update first experiment")
|
122 |
+
return False
|
123 |
+
|
124 |
+
# Test 7: Verify update preserved other experiments
|
125 |
+
logger.info("\nπ Test 7: Verifying update preserved other experiments...")
|
126 |
+
final_experiments = dataset_manager.load_existing_experiments()
|
127 |
+
logger.info(f"Total experiments after update: {len(final_experiments)}")
|
128 |
+
|
129 |
+
# Check that we still have both experiments
|
130 |
+
if len(final_experiments) != 2:
|
131 |
+
logger.error(f"β Wrong number of experiments after update: {len(final_experiments)}")
|
132 |
+
return False
|
133 |
+
|
134 |
+
# Check that first experiment was updated
|
135 |
+
exp1_final = dataset_manager.get_experiment_by_id("test_exp_001")
|
136 |
+
if exp1_final and exp1_final.get('status') == 'completed':
|
137 |
+
logger.info("β
First experiment successfully updated")
|
138 |
+
else:
|
139 |
+
logger.error("β First experiment update failed")
|
140 |
+
return False
|
141 |
+
|
142 |
+
# Check that second experiment was preserved
|
143 |
+
exp2_final = dataset_manager.get_experiment_by_id("test_exp_002")
|
144 |
+
if exp2_final and exp2_final.get('name') == "Second Test Experiment":
|
145 |
+
logger.info("β
Second experiment successfully preserved")
|
146 |
+
else:
|
147 |
+
logger.error("β Second experiment not preserved")
|
148 |
+
return False
|
149 |
+
|
150 |
+
# Test 8: Test filtering functionality
|
151 |
+
logger.info("\nπ Test 8: Testing filtering functionality...")
|
152 |
+
running_experiments = dataset_manager.list_experiments(status_filter="running")
|
153 |
+
completed_experiments = dataset_manager.list_experiments(status_filter="completed")
|
154 |
+
|
155 |
+
logger.info(f"Running experiments: {len(running_experiments)}")
|
156 |
+
logger.info(f"Completed experiments: {len(completed_experiments)}")
|
157 |
+
|
158 |
+
if len(running_experiments) == 1 and len(completed_experiments) == 1:
|
159 |
+
logger.info("β
Filtering functionality works correctly")
|
160 |
+
else:
|
161 |
+
logger.error("β Filtering functionality failed")
|
162 |
+
return False
|
163 |
+
|
164 |
+
logger.info("\nπ All data preservation tests passed!")
|
165 |
+
logger.info("β
Data preservation functionality is working correctly")
|
166 |
+
return True
|
167 |
+
|
168 |
+
except Exception as e:
|
169 |
+
logger.error(f"β Test failed with exception: {e}")
|
170 |
+
return False
|
171 |
+
|
172 |
+
def main():
|
173 |
+
"""Main test function"""
|
174 |
+
logger.info("Data Preservation Test Suite")
|
175 |
+
logger.info("=" * 50)
|
176 |
+
|
177 |
+
success = test_data_preservation()
|
178 |
+
|
179 |
+
if success:
|
180 |
+
logger.info("\nβ
All tests passed!")
|
181 |
+
sys.exit(0)
|
182 |
+
else:
|
183 |
+
logger.error("\nβ Some tests failed!")
|
184 |
+
sys.exit(1)
|
185 |
+
|
186 |
+
if __name__ == "__main__":
|
187 |
+
main()
|
tests/test_demo_deployment.py
CHANGED
@@ -39,14 +39,17 @@ def test_template_files_exist():
|
|
39 |
"""Test that template files exist"""
|
40 |
print("π§ͺ Testing template files existence...")
|
41 |
|
42 |
-
|
43 |
-
|
44 |
required_files = ["app.py", "requirements.txt"]
|
45 |
|
46 |
-
for
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
50 |
|
51 |
print("β
Template files test passed")
|
52 |
|
|
|
39 |
"""Test that template files exist"""
|
40 |
print("π§ͺ Testing template files existence...")
|
41 |
|
42 |
+
demo_types = ["demo_smol", "demo_gpt"]
|
|
|
43 |
required_files = ["app.py", "requirements.txt"]
|
44 |
|
45 |
+
for demo_type in demo_types:
|
46 |
+
template_dir = Path(__file__).parent.parent / "templates" / "spaces" / demo_type
|
47 |
+
print(f"Checking {demo_type} templates...")
|
48 |
+
|
49 |
+
for file_name in required_files:
|
50 |
+
file_path = template_dir / file_name
|
51 |
+
assert file_path.exists(), f"Required file {file_name} not found in {demo_type} templates"
|
52 |
+
print(f"β
Found {demo_type}/{file_name}")
|
53 |
|
54 |
print("β
Template files test passed")
|
55 |
|
tests/test_deployment.py
CHANGED
@@ -17,15 +17,19 @@ def test_templates_exist():
|
|
17 |
|
18 |
# Check spaces templates
|
19 |
spaces_dir = project_root / "templates" / "spaces"
|
|
|
20 |
spaces_files = ["app.py", "requirements.txt", "README.md"]
|
21 |
|
22 |
-
for
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
29 |
|
30 |
# Check datasets templates
|
31 |
datasets_dir = project_root / "templates" / "datasets"
|
|
|
17 |
|
18 |
# Check spaces templates
|
19 |
spaces_dir = project_root / "templates" / "spaces"
|
20 |
+
demo_types = ["demo_smol", "demo_gpt", "trackio"]
|
21 |
spaces_files = ["app.py", "requirements.txt", "README.md"]
|
22 |
|
23 |
+
for demo_type in demo_types:
|
24 |
+
demo_dir = spaces_dir / demo_type
|
25 |
+
print(f"Checking {demo_type} templates...")
|
26 |
+
for file_name in spaces_files:
|
27 |
+
file_path = demo_dir / file_name
|
28 |
+
if file_path.exists():
|
29 |
+
print(f"β
{file_path}")
|
30 |
+
else:
|
31 |
+
print(f"β {file_path} not found")
|
32 |
+
return False
|
33 |
|
34 |
# Check datasets templates
|
35 |
datasets_dir = project_root / "templates" / "datasets"
|
tests/test_hf_datasets.py
CHANGED
@@ -76,7 +76,7 @@ def test_backup_fallback():
|
|
76 |
|
77 |
try:
|
78 |
# Import and test the TrackioSpace class
|
79 |
-
from templates.spaces.app import TrackioSpace
|
80 |
|
81 |
trackio = TrackioSpace()
|
82 |
experiments = trackio.experiments
|
@@ -105,7 +105,7 @@ def test_metrics_dataframe():
|
|
105 |
print("=" * 40)
|
106 |
|
107 |
try:
|
108 |
-
from templates.spaces.app import TrackioSpace
|
109 |
|
110 |
trackio = TrackioSpace()
|
111 |
|
|
|
76 |
|
77 |
try:
|
78 |
# Import and test the TrackioSpace class
|
79 |
+
from templates.spaces.trackio.app import TrackioSpace
|
80 |
|
81 |
trackio = TrackioSpace()
|
82 |
experiments = trackio.experiments
|
|
|
105 |
print("=" * 40)
|
106 |
|
107 |
try:
|
108 |
+
from templates.spaces.trackio.app import TrackioSpace
|
109 |
|
110 |
trackio = TrackioSpace()
|
111 |
|
tests/test_latest_deployment.py
CHANGED
@@ -158,16 +158,20 @@ def test_template_files():
|
|
158 |
"""Test that all required template files exist"""
|
159 |
print("\nπ Testing template files...")
|
160 |
|
161 |
-
|
|
|
162 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
163 |
|
164 |
-
for
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
|
|
|
|
|
|
171 |
|
172 |
return True
|
173 |
|
|
|
158 |
"""Test that all required template files exist"""
|
159 |
print("\nπ Testing template files...")
|
160 |
|
161 |
+
spaces_dir = project_root / "templates" / "spaces"
|
162 |
+
demo_types = ["demo_smol", "demo_gpt", "trackio"]
|
163 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
164 |
|
165 |
+
for demo_type in demo_types:
|
166 |
+
demo_dir = spaces_dir / demo_type
|
167 |
+
print(f"Checking {demo_type} templates...")
|
168 |
+
for file_name in required_files:
|
169 |
+
file_path = demo_dir / file_name
|
170 |
+
if file_path.exists():
|
171 |
+
print(f"β
{demo_type}/{file_name} exists")
|
172 |
+
else:
|
173 |
+
print(f"β {demo_type}/{file_name} missing")
|
174 |
+
return False
|
175 |
|
176 |
return True
|
177 |
|
tests/test_readme_template.py
CHANGED
@@ -16,8 +16,8 @@ def test_readme_template():
|
|
16 |
print("π Testing README template replacement...")
|
17 |
|
18 |
try:
|
19 |
-
# Get template path
|
20 |
-
templates_dir = project_root / "templates" / "spaces"
|
21 |
readme_template_path = templates_dir / "README.md"
|
22 |
|
23 |
if not readme_template_path.exists():
|
|
|
16 |
print("π Testing README template replacement...")
|
17 |
|
18 |
try:
|
19 |
+
# Get template path (using trackio as example)
|
20 |
+
templates_dir = project_root / "templates" / "spaces" / "trackio"
|
21 |
readme_template_path = templates_dir / "README.md"
|
22 |
|
23 |
if not readme_template_path.exists():
|
tests/test_real_dataset_access.py
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script to verify that the Trackio Space can read from the real Hugging Face dataset
|
4 |
+
This test requires an HF_TOKEN environment variable to access the dataset
|
5 |
+
"""
|
6 |
+
|
7 |
+
import sys
|
8 |
+
import os
|
9 |
+
import json
|
10 |
+
import logging
|
11 |
+
from typing import Dict, Any
|
12 |
+
|
13 |
+
# Setup logging
|
14 |
+
logging.basicConfig(level=logging.INFO)
|
15 |
+
logger = logging.getLogger(__name__)
|
16 |
+
|
17 |
+
def test_direct_dataset_access():
|
18 |
+
"""Test direct access to the Hugging Face dataset"""
|
19 |
+
try:
|
20 |
+
hf_token = os.environ.get('HF_TOKEN')
|
21 |
+
|
22 |
+
if not hf_token:
|
23 |
+
logger.warning("β οΈ No HF_TOKEN found. Skipping real dataset test.")
|
24 |
+
logger.info("π‘ Set HF_TOKEN environment variable to test with real dataset")
|
25 |
+
return False
|
26 |
+
|
27 |
+
from datasets import load_dataset
|
28 |
+
|
29 |
+
dataset_repo = "Tonic/trackio-experiments"
|
30 |
+
logger.info(f"π§ Testing direct access to {dataset_repo}")
|
31 |
+
|
32 |
+
# Load the dataset
|
33 |
+
dataset = load_dataset(dataset_repo, token=hf_token)
|
34 |
+
|
35 |
+
# Check structure
|
36 |
+
experiment_count = len(dataset['train']) if 'train' in dataset else 0
|
37 |
+
logger.info(f"π Dataset contains {experiment_count} experiments")
|
38 |
+
|
39 |
+
if experiment_count == 0:
|
40 |
+
logger.warning("β οΈ No experiments found in dataset")
|
41 |
+
return False
|
42 |
+
|
43 |
+
# Check columns
|
44 |
+
columns = list(dataset['train'].column_names) if 'train' in dataset else []
|
45 |
+
logger.info(f"π Dataset columns: {columns}")
|
46 |
+
|
47 |
+
expected_columns = ['experiment_id', 'name', 'description', 'created_at', 'status', 'metrics', 'parameters', 'artifacts', 'logs', 'last_updated']
|
48 |
+
missing_columns = [col for col in expected_columns if col not in columns]
|
49 |
+
|
50 |
+
if missing_columns:
|
51 |
+
logger.warning(f"β οΈ Missing expected columns: {missing_columns}")
|
52 |
+
else:
|
53 |
+
logger.info("β
All expected columns present")
|
54 |
+
|
55 |
+
# Test parsing a few experiments
|
56 |
+
successful_parses = 0
|
57 |
+
for i, row in enumerate(dataset['train']):
|
58 |
+
if i >= 3: # Test first 3 experiments
|
59 |
+
break
|
60 |
+
|
61 |
+
exp_id = row.get('experiment_id', 'unknown')
|
62 |
+
logger.info(f"\n㪠Testing experiment: {exp_id}")
|
63 |
+
|
64 |
+
# Test metrics parsing
|
65 |
+
metrics_raw = row.get('metrics', '[]')
|
66 |
+
try:
|
67 |
+
if isinstance(metrics_raw, str):
|
68 |
+
metrics = json.loads(metrics_raw)
|
69 |
+
if isinstance(metrics, list):
|
70 |
+
logger.info(f" β
Metrics parsed: {len(metrics)} entries")
|
71 |
+
if metrics:
|
72 |
+
first_metric = metrics[0]
|
73 |
+
if 'metrics' in first_metric:
|
74 |
+
metric_keys = list(first_metric['metrics'].keys())
|
75 |
+
logger.info(f" π Sample metrics: {metric_keys[:5]}...")
|
76 |
+
successful_parses += 1
|
77 |
+
else:
|
78 |
+
logger.warning(f" β οΈ Metrics is not a list: {type(metrics)}")
|
79 |
+
else:
|
80 |
+
logger.warning(f" β οΈ Metrics is not a string: {type(metrics_raw)}")
|
81 |
+
except json.JSONDecodeError as e:
|
82 |
+
logger.warning(f" β Failed to parse metrics JSON: {e}")
|
83 |
+
|
84 |
+
# Test parameters parsing
|
85 |
+
parameters_raw = row.get('parameters', '{}')
|
86 |
+
try:
|
87 |
+
if isinstance(parameters_raw, str):
|
88 |
+
parameters = json.loads(parameters_raw)
|
89 |
+
if isinstance(parameters, dict):
|
90 |
+
logger.info(f" β
Parameters parsed: {len(parameters)} entries")
|
91 |
+
else:
|
92 |
+
logger.warning(f" β οΈ Parameters is not a dict: {type(parameters)}")
|
93 |
+
else:
|
94 |
+
logger.warning(f" β οΈ Parameters is not a string: {type(parameters_raw)}")
|
95 |
+
except json.JSONDecodeError as e:
|
96 |
+
logger.warning(f" β Failed to parse parameters JSON: {e}")
|
97 |
+
|
98 |
+
logger.info(f"\nπ Successfully parsed {successful_parses} out of {min(3, experiment_count)} test experiments")
|
99 |
+
|
100 |
+
return successful_parses > 0
|
101 |
+
|
102 |
+
except Exception as e:
|
103 |
+
logger.error(f"β Error testing direct dataset access: {e}")
|
104 |
+
import traceback
|
105 |
+
traceback.print_exc()
|
106 |
+
return False
|
107 |
+
|
108 |
+
def test_trackio_space_with_real_dataset():
|
109 |
+
"""Test TrackioSpace class with real dataset"""
|
110 |
+
try:
|
111 |
+
hf_token = os.environ.get('HF_TOKEN')
|
112 |
+
|
113 |
+
if not hf_token:
|
114 |
+
logger.warning("β οΈ No HF_TOKEN found. Skipping TrackioSpace test with real dataset.")
|
115 |
+
return False
|
116 |
+
|
117 |
+
# Add the templates/spaces/trackio directory to the path
|
118 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'templates', 'spaces', 'trackio'))
|
119 |
+
|
120 |
+
from app import TrackioSpace
|
121 |
+
|
122 |
+
dataset_repo = "Tonic/trackio-experiments"
|
123 |
+
logger.info(f"π§ Testing TrackioSpace with {dataset_repo}")
|
124 |
+
|
125 |
+
# Create TrackioSpace instance with real credentials
|
126 |
+
trackio_space = TrackioSpace(hf_token=hf_token, dataset_repo=dataset_repo)
|
127 |
+
|
128 |
+
# Check if it loaded experiments from the dataset (not backup)
|
129 |
+
experiments_count = len(trackio_space.experiments)
|
130 |
+
logger.info(f"π TrackioSpace loaded {experiments_count} experiments")
|
131 |
+
|
132 |
+
if experiments_count == 0:
|
133 |
+
logger.warning("β οΈ TrackioSpace loaded no experiments")
|
134 |
+
return False
|
135 |
+
|
136 |
+
# Check if the dataset manager is available
|
137 |
+
if trackio_space.dataset_manager:
|
138 |
+
logger.info("β
Dataset manager is available - data preservation enabled")
|
139 |
+
else:
|
140 |
+
logger.warning("β οΈ Dataset manager not available - using legacy mode")
|
141 |
+
|
142 |
+
# Test loading a specific experiment
|
143 |
+
experiment_ids = list(trackio_space.experiments.keys())
|
144 |
+
if experiment_ids:
|
145 |
+
test_exp_id = experiment_ids[0]
|
146 |
+
logger.info(f"π¬ Testing metrics loading for {test_exp_id}")
|
147 |
+
|
148 |
+
from app import get_metrics_dataframe
|
149 |
+
df = get_metrics_dataframe(test_exp_id)
|
150 |
+
|
151 |
+
if not df.empty:
|
152 |
+
logger.info(f"β
Metrics DataFrame created: {len(df)} rows, {len(df.columns)} columns")
|
153 |
+
logger.info(f"π Available metrics: {list(df.columns)}")
|
154 |
+
return True
|
155 |
+
else:
|
156 |
+
logger.warning(f"β οΈ Metrics DataFrame is empty for {test_exp_id}")
|
157 |
+
return False
|
158 |
+
else:
|
159 |
+
logger.warning("β οΈ No experiments available for testing")
|
160 |
+
return False
|
161 |
+
|
162 |
+
except Exception as e:
|
163 |
+
logger.error(f"β Error testing TrackioSpace with real dataset: {e}")
|
164 |
+
import traceback
|
165 |
+
traceback.print_exc()
|
166 |
+
return False
|
167 |
+
|
168 |
+
if __name__ == "__main__":
|
169 |
+
logger.info("π Starting real dataset access test")
|
170 |
+
|
171 |
+
# Test direct dataset access
|
172 |
+
logger.info("\n" + "="*60)
|
173 |
+
logger.info("TEST 1: Direct Dataset Access")
|
174 |
+
logger.info("="*60)
|
175 |
+
|
176 |
+
direct_test_passed = test_direct_dataset_access()
|
177 |
+
|
178 |
+
# Test TrackioSpace with real dataset
|
179 |
+
logger.info("\n" + "="*60)
|
180 |
+
logger.info("TEST 2: TrackioSpace with Real Dataset")
|
181 |
+
logger.info("="*60)
|
182 |
+
|
183 |
+
trackio_test_passed = test_trackio_space_with_real_dataset()
|
184 |
+
|
185 |
+
# Summary
|
186 |
+
logger.info("\n" + "="*60)
|
187 |
+
logger.info("TEST SUMMARY")
|
188 |
+
logger.info("="*60)
|
189 |
+
|
190 |
+
logger.info(f"Direct Dataset Access: {'β
PASSED' if direct_test_passed else 'β FAILED/SKIPPED'}")
|
191 |
+
logger.info(f"TrackioSpace Integration: {'β
PASSED' if trackio_test_passed else 'β FAILED/SKIPPED'}")
|
192 |
+
|
193 |
+
if direct_test_passed and trackio_test_passed:
|
194 |
+
logger.info("π All tests passed! The dataset integration is working correctly.")
|
195 |
+
sys.exit(0)
|
196 |
+
elif not os.environ.get('HF_TOKEN'):
|
197 |
+
logger.info("βΉοΈ Tests skipped due to missing HF_TOKEN. Set the token to test with real dataset.")
|
198 |
+
sys.exit(0)
|
199 |
+
else:
|
200 |
+
logger.error("β Some tests failed. Please check the implementation.")
|
201 |
+
sys.exit(1)
|
tests/test_trackio_dataset_fix.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script to verify that the Trackio Space can properly read from the actual dataset
|
4 |
+
"""
|
5 |
+
|
6 |
+
import sys
|
7 |
+
import os
|
8 |
+
import json
|
9 |
+
import logging
|
10 |
+
from typing import Dict, Any
|
11 |
+
|
12 |
+
# Add the templates/spaces/trackio directory to the path
|
13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'templates', 'spaces', 'trackio'))
|
14 |
+
|
15 |
+
# Setup logging
|
16 |
+
logging.basicConfig(level=logging.INFO)
|
17 |
+
logger = logging.getLogger(__name__)
|
18 |
+
|
19 |
+
def test_dataset_loading():
|
20 |
+
"""Test loading experiments from the actual dataset"""
|
21 |
+
try:
|
22 |
+
# Import the TrackioSpace class
|
23 |
+
from app import TrackioSpace
|
24 |
+
|
25 |
+
# Create a TrackioSpace instance pointing to the real dataset
|
26 |
+
dataset_repo = "Tonic/trackio-experiments"
|
27 |
+
hf_token = os.environ.get('HF_TOKEN')
|
28 |
+
|
29 |
+
if not hf_token:
|
30 |
+
logger.warning("β οΈ No HF_TOKEN found in environment. Testing with public access.")
|
31 |
+
|
32 |
+
logger.info(f"π§ Testing dataset loading from {dataset_repo}")
|
33 |
+
|
34 |
+
# Create TrackioSpace instance
|
35 |
+
trackio_space = TrackioSpace(hf_token=hf_token, dataset_repo=dataset_repo)
|
36 |
+
|
37 |
+
# Check how many experiments were loaded
|
38 |
+
experiments_count = len(trackio_space.experiments)
|
39 |
+
logger.info(f"π Loaded {experiments_count} experiments")
|
40 |
+
|
41 |
+
if experiments_count == 0:
|
42 |
+
logger.warning("β οΈ No experiments loaded - this might indicate a problem")
|
43 |
+
return False
|
44 |
+
|
45 |
+
# Test specific experiment IDs from the logs
|
46 |
+
test_experiment_ids = [
|
47 |
+
'exp_20250720_130853',
|
48 |
+
'exp_20250720_134319',
|
49 |
+
'exp_20250727_172507',
|
50 |
+
'exp_20250727_172526'
|
51 |
+
]
|
52 |
+
|
53 |
+
found_experiments = []
|
54 |
+
for exp_id in test_experiment_ids:
|
55 |
+
if exp_id in trackio_space.experiments:
|
56 |
+
found_experiments.append(exp_id)
|
57 |
+
experiment = trackio_space.experiments[exp_id]
|
58 |
+
|
59 |
+
logger.info(f"β
Found experiment: {exp_id}")
|
60 |
+
logger.info(f" Name: {experiment.get('name', 'N/A')}")
|
61 |
+
logger.info(f" Status: {experiment.get('status', 'N/A')}")
|
62 |
+
logger.info(f" Metrics count: {len(experiment.get('metrics', []))}")
|
63 |
+
logger.info(f" Parameters count: {len(experiment.get('parameters', {}))}")
|
64 |
+
|
65 |
+
# Test metrics parsing specifically
|
66 |
+
metrics = experiment.get('metrics', [])
|
67 |
+
if metrics:
|
68 |
+
logger.info(f" First metric entry: {metrics[0] if metrics else 'None'}")
|
69 |
+
|
70 |
+
# Test if we can get a DataFrame for this experiment
|
71 |
+
from app import get_metrics_dataframe
|
72 |
+
df = get_metrics_dataframe(exp_id)
|
73 |
+
if not df.empty:
|
74 |
+
logger.info(f" β
DataFrame created successfully: {len(df)} rows, {len(df.columns)} columns")
|
75 |
+
logger.info(f" Available metrics: {list(df.columns)}")
|
76 |
+
else:
|
77 |
+
logger.warning(f" β οΈ DataFrame is empty for {exp_id}")
|
78 |
+
else:
|
79 |
+
logger.warning(f" β οΈ No metrics found for {exp_id}")
|
80 |
+
|
81 |
+
logger.info(f"π Found {len(found_experiments)} out of {len(test_experiment_ids)} test experiments")
|
82 |
+
|
83 |
+
if found_experiments:
|
84 |
+
logger.info("β
Dataset loading appears to be working correctly!")
|
85 |
+
return True
|
86 |
+
else:
|
87 |
+
logger.warning("β οΈ No test experiments found - dataset loading may have issues")
|
88 |
+
return False
|
89 |
+
|
90 |
+
except Exception as e:
|
91 |
+
logger.error(f"β Error testing dataset loading: {e}")
|
92 |
+
import traceback
|
93 |
+
traceback.print_exc()
|
94 |
+
return False
|
95 |
+
|
96 |
+
def test_metrics_parsing():
|
97 |
+
"""Test parsing metrics from the actual dataset format"""
|
98 |
+
try:
|
99 |
+
# Test with actual data structure from the dataset
|
100 |
+
sample_metrics_json = '''[{"timestamp": "2025-07-20T11:20:01.780908", "step": 25, "metrics": {"loss": 1.1659, "grad_norm": 10.3125, "learning_rate": 7e-08, "num_tokens": 1642080.0, "mean_token_accuracy": 0.75923578992486, "epoch": 0.004851130919895701}}, {"timestamp": "2025-07-20T11:26:39.042155", "step": 50, "metrics": {"loss": 1.165, "grad_norm": 10.75, "learning_rate": 1.4291666666666667e-07, "num_tokens": 3324682.0, "mean_token_accuracy": 0.7577659255266189, "epoch": 0.009702261839791402}}]'''
|
101 |
+
|
102 |
+
logger.info("π§ Testing metrics parsing")
|
103 |
+
|
104 |
+
# Parse the JSON
|
105 |
+
metrics_list = json.loads(sample_metrics_json)
|
106 |
+
logger.info(f"π Parsed {len(metrics_list)} metric entries")
|
107 |
+
|
108 |
+
# Convert to DataFrame format (like the app does)
|
109 |
+
import pandas as pd
|
110 |
+
df_data = []
|
111 |
+
for metric_entry in metrics_list:
|
112 |
+
if isinstance(metric_entry, dict):
|
113 |
+
step = metric_entry.get('step', 0)
|
114 |
+
timestamp = metric_entry.get('timestamp', '')
|
115 |
+
metrics = metric_entry.get('metrics', {})
|
116 |
+
|
117 |
+
row = {'step': step, 'timestamp': timestamp}
|
118 |
+
row.update(metrics)
|
119 |
+
df_data.append(row)
|
120 |
+
|
121 |
+
if df_data:
|
122 |
+
df = pd.DataFrame(df_data)
|
123 |
+
logger.info(f"β
DataFrame created: {len(df)} rows, {len(df.columns)} columns")
|
124 |
+
logger.info(f"π Columns: {list(df.columns)}")
|
125 |
+
logger.info(f"π Sample data:\n{df.head()}")
|
126 |
+
return True
|
127 |
+
else:
|
128 |
+
logger.warning("β οΈ No data converted to DataFrame format")
|
129 |
+
return False
|
130 |
+
|
131 |
+
except Exception as e:
|
132 |
+
logger.error(f"β Error testing metrics parsing: {e}")
|
133 |
+
import traceback
|
134 |
+
traceback.print_exc()
|
135 |
+
return False
|
136 |
+
|
137 |
+
if __name__ == "__main__":
|
138 |
+
logger.info("π Starting Trackio dataset fix verification")
|
139 |
+
|
140 |
+
# Test metrics parsing first
|
141 |
+
logger.info("\n" + "="*50)
|
142 |
+
logger.info("TEST 1: Metrics Parsing")
|
143 |
+
logger.info("="*50)
|
144 |
+
|
145 |
+
metrics_test_passed = test_metrics_parsing()
|
146 |
+
|
147 |
+
# Test dataset loading
|
148 |
+
logger.info("\n" + "="*50)
|
149 |
+
logger.info("TEST 2: Dataset Loading")
|
150 |
+
logger.info("="*50)
|
151 |
+
|
152 |
+
dataset_test_passed = test_dataset_loading()
|
153 |
+
|
154 |
+
# Summary
|
155 |
+
logger.info("\n" + "="*50)
|
156 |
+
logger.info("TEST SUMMARY")
|
157 |
+
logger.info("="*50)
|
158 |
+
|
159 |
+
logger.info(f"Metrics Parsing: {'β
PASSED' if metrics_test_passed else 'β FAILED'}")
|
160 |
+
logger.info(f"Dataset Loading: {'β
PASSED' if dataset_test_passed else 'β FAILED'}")
|
161 |
+
|
162 |
+
if metrics_test_passed and dataset_test_passed:
|
163 |
+
logger.info("π All tests passed! The dataset fix should work correctly.")
|
164 |
+
sys.exit(0)
|
165 |
+
else:
|
166 |
+
logger.error("β Some tests failed. Please check the implementation.")
|
167 |
+
sys.exit(1)
|
tests/test_trackio_deployment.py
CHANGED
@@ -17,16 +17,16 @@ def test_templates_structure():
|
|
17 |
"""Test that the templates structure is correct"""
|
18 |
print("π Testing templates structure...")
|
19 |
|
20 |
-
|
21 |
|
22 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
23 |
|
24 |
for file_name in required_files:
|
25 |
-
file_path =
|
26 |
if file_path.exists():
|
27 |
-
print(f"β
{file_name} exists")
|
28 |
else:
|
29 |
-
print(f"β {file_name} missing")
|
30 |
return False
|
31 |
|
32 |
return True
|
@@ -35,7 +35,7 @@ def test_app_py_content():
|
|
35 |
"""Test that app.py has the required structure"""
|
36 |
print("\nπ Testing app.py content...")
|
37 |
|
38 |
-
app_path = project_root / "templates" / "spaces" / "app.py"
|
39 |
|
40 |
try:
|
41 |
with open(app_path, 'r', encoding='utf-8') as f:
|
|
|
17 |
"""Test that the templates structure is correct"""
|
18 |
print("π Testing templates structure...")
|
19 |
|
20 |
+
trackio_dir = project_root / "templates" / "spaces" / "trackio"
|
21 |
|
22 |
required_files = ["app.py", "requirements.txt", "README.md"]
|
23 |
|
24 |
for file_name in required_files:
|
25 |
+
file_path = trackio_dir / file_name
|
26 |
if file_path.exists():
|
27 |
+
print(f"β
trackio/{file_name} exists")
|
28 |
else:
|
29 |
+
print(f"β trackio/{file_name} missing")
|
30 |
return False
|
31 |
|
32 |
return True
|
|
|
35 |
"""Test that app.py has the required structure"""
|
36 |
print("\nπ Testing app.py content...")
|
37 |
|
38 |
+
app_path = project_root / "templates" / "spaces" / "trackio" / "app.py"
|
39 |
|
40 |
try:
|
41 |
with open(app_path, 'r', encoding='utf-8') as f:
|
tests/test_trackio_space_diagnostics.py
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""
Diagnostic script for Trackio Space issues
Helps debug dataset loading and API client issues
"""

import os
import sys
import logging

# Add src directory to path
# NOTE(review): the second entry makes the Space's bundled modules
# (dataset_utils, app) importable from the repo checkout — presumably
# mirroring the layout deployed to HF Spaces; confirm the relative paths
# against the actual repository layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src'))
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'templates', 'spaces', 'trackio'))

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
19 |
+
def test_dataset_manager():
    """Test dataset manager functionality.

    Loads the experiment dataset via ``TrackioDatasetManager``, then upserts
    a uniquely named diagnostic experiment and re-reads the dataset to confirm
    the write took effect.

    Returns:
        bool: True when the round-trip succeeds; False on a missing HF token,
        a failed import of ``dataset_utils``, or any dataset I/O error.
    """
    try:
        from dataset_utils import TrackioDatasetManager

        # Test with environment variables
        hf_token = os.environ.get('HF_TOKEN')
        dataset_repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')

        if not hf_token:
            logger.warning("β οΈ HF_TOKEN not found in environment")
            return False

        logger.info(f"π§ Testing dataset manager with repo: {dataset_repo}")

        # Initialize dataset manager
        manager = TrackioDatasetManager(dataset_repo, hf_token)

        # Test loading experiments
        experiments = manager.load_existing_experiments()
        logger.info(f"π Loaded {len(experiments)} experiments from dataset")

        # Build the sample record. Use the real current time instead of a
        # hardcoded date so successive diagnostic runs are distinguishable in
        # the dataset, and a uuid suffix for a collision-free experiment id.
        import uuid
        from datetime import datetime
        now = datetime.now().isoformat(timespec='seconds')
        sample_experiment = {
            'experiment_id': f'test_diagnostic_{uuid.uuid4().hex[:8]}',
            'name': 'Diagnostic Test Experiment',
            'description': 'Test experiment created by diagnostic script',
            'created_at': now,
            'status': 'completed',
            'metrics': '[]',
            'parameters': '{"test": true}',
            'artifacts': '[]',
            'logs': '[]',
            'last_updated': now
        }

        # Test upsert functionality
        logger.info("π§ͺ Testing experiment upsert...")
        success = manager.upsert_experiment(sample_experiment)

        if not success:
            logger.error("β Failed to upsert test experiment")
            return False

        logger.info("✅ Dataset manager working correctly")

        # Verify the experiment was saved by reloading the dataset.
        experiments_after = manager.load_existing_experiments()
        logger.info(f"π After upsert: {len(experiments_after)} experiments")

        return True

    except ImportError as e:
        logger.error(f"β Failed to import dataset_utils: {e}")
        return False
    except Exception as e:
        logger.error(f"β Dataset manager test failed: {e}")
        return False
def test_trackio_space():
    """Test TrackioSpace initialization"""
    try:
        # Import the TrackioSpace class from the Space's app module.
        from app import TrackioSpace

        logger.info("π§ͺ Testing TrackioSpace initialization...")

        # Construct the Space object and report its key attributes.
        tracker = TrackioSpace()

        experiment_count = len(tracker.experiments)
        manager_state = 'Yes' if tracker.dataset_manager else 'No'
        token_state = 'Yes' if tracker.hf_token else 'No'

        logger.info(f"π TrackioSpace initialized with {experiment_count} experiments")
        logger.info(f"π‘οΈ Dataset manager available: {manager_state}")
        logger.info(f"π HF Token available: {token_state}")
        logger.info(f"π Dataset repo: {tracker.dataset_repo}")

        return True

    except ImportError as e:
        logger.error(f"β Failed to import TrackioSpace: {e}")
        return False
    except Exception as e:
        logger.error(f"β TrackioSpace test failed: {e}")
        return False
def test_environment():
    """Test environment configuration.

    Logs the presence of the environment variables the Space depends on,
    masking any token value so secrets never reach the logs, and reports
    whether the script is running on HF Spaces.

    Returns:
        bool: Always True — the function only reports state.
    """
    logger.info("π Checking environment configuration...")

    # Check required environment variables
    env_vars = {
        'HF_TOKEN': os.environ.get('HF_TOKEN'),
        'TRACKIO_DATASET_REPO': os.environ.get('TRACKIO_DATASET_REPO'),
        'TRACKIO_URL': os.environ.get('TRACKIO_URL'),
        'SPACE_ID': os.environ.get('SPACE_ID')
    }

    for var, value in env_vars.items():
        if value:
            if 'TOKEN' in var:
                # Never log a secret in full: show at most its first 8 chars.
                # (Previously short tokens slipped through unmasked because
                # masking was gated on len(value) > 8.)
                masked_value = value[:8] + '...'
            else:
                masked_value = value
            logger.info(f"✅ {var}: {masked_value}")
        else:
            logger.warning(f"β οΈ {var}: Not set")

    # Check if running on HF Spaces
    is_hf_spaces = bool(os.environ.get('SPACE_ID'))
    logger.info(f"π Running on HF Spaces: {'Yes' if is_hf_spaces else 'No'}")

    return True
def fix_common_issues():
    """Suggest fixes for common issues"""
    logger.info("π‘ Common issue fixes:")

    # A dataset repo must be namespaced as 'username/dataset-name'.
    repo = os.environ.get('TRACKIO_DATASET_REPO', 'tonic/trackio-experiments')
    if '/' in repo:
        logger.info(f"✅ Dataset repo format looks good: {repo}")
    else:
        logger.warning(f"β οΈ Dataset repo format issue: {repo} should be 'username/dataset-name'")

    # Detect a doubled scheme (e.g. 'https://https://...') in the Space URL.
    url = os.environ.get('TRACKIO_URL', 'https://tonic-test-trackio-test.hf.space')
    has_doubled_scheme = url.startswith(('https://https://', 'http://http://'))
    if has_doubled_scheme:
        logger.warning(f"β οΈ URL format issue detected: {url}")
        repaired = url.replace('https://https://', 'https://').replace('http://http://', 'http://')
        logger.info(f"π‘ Fixed URL should be: {repaired}")
    else:
        logger.info(f"✅ Trackio URL format looks good: {url}")
def main():
    """Run all diagnostic tests.

    Executes the environment, dataset-manager and TrackioSpace checks in
    order, prints suggested fixes for common issues, and logs a summary.

    Returns:
        bool: True when the diagnostics ran to completion (even if individual
        checks reported issues), False when the script itself crashed.
    """
    logger.info("π§ Starting Trackio Space diagnostics...")
    logger.info("=" * 60)

    try:
        # Test environment
        test_environment()
        logger.info("-" * 40)

        # Test dataset manager
        dataset_manager_ok = test_dataset_manager()
        logger.info("-" * 40)

        # Test TrackioSpace
        trackio_space_ok = test_trackio_space()
        logger.info("-" * 40)

        # Suggest fixes
        fix_common_issues()
        logger.info("-" * 40)

        # Summary
        logger.info("π DIAGNOSTIC SUMMARY:")
        logger.info(f"Dataset Manager: {'✅ OK' if dataset_manager_ok else 'β Issues'}")
        logger.info(f"TrackioSpace: {'✅ OK' if trackio_space_ok else 'β Issues'}")

        if dataset_manager_ok and trackio_space_ok:
            logger.info("π All systems appear to be working correctly!")
            logger.info("π‘ The issues in the logs might be related to:")
            logger.info(" - Empty dataset (expected for new setup)")
            logger.info(" - API client URL formatting (being auto-fixed)")
            logger.info(" - Remote data access (falling back to local data)")
        else:
            logger.warning("β οΈ Some issues detected. Check the logs above for details.")

    except Exception as e:
        logger.error(f"β Diagnostic script failed: {e}")
        return False

    return True

if __name__ == "__main__":
    # Propagate the result as the process exit code so CI and shell scripts
    # can detect a crashed run (previously main()'s result was discarded and
    # the script always exited 0).
    sys.exit(0 if main() else 1)