---
# metadata
language: en
license: apache-2.0
---
# Model Card

## Metrics

- position: -1
- layer: 30
- refusal_score: -13.144852638244629
- refusal_score_baseline: 4.240177154541016
- steering_score: 3.974774122238159
- steering_score_baseline: -14.775822639465332
- kl_div_score: 0.06443626040695713
- no_filter: 7
- nan_values: 0
- late_layer: 50
- high_kl: 135
- low_refusal: 48