Commit fe843c0
Parent(s): bea1c13
pico-decoder-medium-1 trained to 50k steps
This view is limited to 50 files because it contains too many changes.
- README.md +2 -0
- config.json +22 -0
- eval_results/step_0.json +1 -0
- eval_results/step_1000.json +1 -0
- eval_results/step_10000.json +1 -0
- eval_results/step_11000.json +1 -0
- eval_results/step_12000.json +1 -0
- eval_results/step_13000.json +1 -0
- eval_results/step_14000.json +1 -0
- eval_results/step_15000.json +1 -0
- eval_results/step_16000.json +1 -0
- eval_results/step_17000.json +1 -0
- eval_results/step_18000.json +1 -0
- eval_results/step_19000.json +1 -0
- eval_results/step_2000.json +1 -0
- eval_results/step_20000.json +1 -0
- eval_results/step_21000.json +1 -0
- eval_results/step_22000.json +1 -0
- eval_results/step_23000.json +1 -0
- eval_results/step_24000.json +1 -0
- eval_results/step_25000.json +1 -0
- eval_results/step_26000.json +1 -0
- eval_results/step_27000.json +1 -0
- eval_results/step_28000.json +1 -0
- eval_results/step_29000.json +1 -0
- eval_results/step_3000.json +1 -0
- eval_results/step_30000.json +1 -0
- eval_results/step_31000.json +1 -0
- eval_results/step_32000.json +1 -0
- eval_results/step_33000.json +1 -0
- eval_results/step_34000.json +1 -0
- eval_results/step_35000.json +1 -0
- eval_results/step_36000.json +1 -0
- eval_results/step_37000.json +1 -0
- eval_results/step_38000.json +1 -0
- eval_results/step_39000.json +1 -0
- eval_results/step_4000.json +1 -0
- eval_results/step_40000.json +1 -0
- eval_results/step_41000.json +1 -0
- eval_results/step_42000.json +1 -0
- eval_results/step_43000.json +1 -0
- eval_results/step_44000.json +1 -0
- eval_results/step_45000.json +1 -0
- eval_results/step_46000.json +1 -0
- eval_results/step_47000.json +1 -0
- eval_results/step_48000.json +1 -0
- eval_results/step_49000.json +1 -0
- eval_results/step_5000.json +1 -0
- eval_results/step_50000.json +1 -0
- eval_results/step_6000.json +1 -0
README.md CHANGED
@@ -13,6 +13,8 @@ pipeline_tag: text-generation
 
 **pico-decoder-medium** is a 181M parameter model in the `pico-decoder` suite, balancing scale and analyzability. Built with [`pico-train`](https://github.com/pico-lm) and instrumented with [`pico-analyze`](https://github.com/pico-lm), it enables detailed studies of layer-wise learning behavior during language model pretraining.
 
+> NOTE: The `pico-decoder-medium-1` branch contains the full commit history for the training run.
+
 ## 🔧 Model Details
 
 | Field | Value |
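
The note added to the README points readers at the `pico-decoder-medium-1` branch for the run's full commit history. As a minimal sketch of how one might pull a snapshot of that branch with `huggingface_hub` (the repo id below is an assumption inferred from the model name, not something this diff confirms):

```python
from huggingface_hub import snapshot_download

# Assumed repo id -- substitute the actual repository if it differs.
local_dir = snapshot_download(
    repo_id="pico-lm/pico-decoder-medium",
    revision="pico-decoder-medium-1",  # branch named in the README note
)
print(local_dir)
```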
config.json ADDED
@@ -0,0 +1,22 @@
+{
+  "activation_hidden_dim": 3072,
+  "architectures": [
+    "PicoHF"
+  ],
+  "attention_n_heads": 12,
+  "attention_n_kv_heads": 4,
+  "auto_map": {
+    "AutoConfig": "pico.PicoHFConfig",
+    "AutoModelForCausalLM": "pico.PicoHF"
+  },
+  "batch_size": 1024,
+  "d_model": 768,
+  "max_seq_len": 2048,
+  "model_type": "pico",
+  "n_layers": 12,
+  "norm_eps": 1e-06,
+  "position_emb_theta": 10000.0,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "vocab_size": 50304
+}
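
Because `auto_map` routes `AutoConfig` and `AutoModelForCausalLM` to the custom `pico.PicoHFConfig` and `pico.PicoHF` classes, loading this checkpoint through `transformers` requires `trust_remote_code=True`. A minimal sketch, again assuming a hypothetical repo id:

```python
from transformers import AutoConfig, AutoModelForCausalLM

# trust_remote_code=True is needed because config.json maps the Auto classes
# to the custom pico.PicoHFConfig / pico.PicoHF implementations.
config = AutoConfig.from_pretrained(
    "pico-lm/pico-decoder-medium",  # assumed repo id
    trust_remote_code=True,
)
print(config.d_model, config.n_layers, config.vocab_size)  # 768 12 50304

model = AutoModelForCausalLM.from_pretrained(
    "pico-lm/pico-decoder-medium",
    trust_remote_code=True,
)
```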
eval_results/step_0.json ADDED
@@ -0,0 +1 @@
+{"paloma": 59416.7212543554}

eval_results/step_1000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 499.5738274564311}

eval_results/step_10000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 51.102903796073036}

eval_results/step_11000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 48.70718716470207}

eval_results/step_12000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 48.04790251246728}

eval_results/step_13000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 45.9789450450226}

eval_results/step_14000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 45.154312149988236}

eval_results/step_15000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 44.130179383032}

eval_results/step_16000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 43.383745260105734}

eval_results/step_17000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 42.70362300017154}

eval_results/step_18000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 42.00629499373951}

eval_results/step_19000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 41.85491885225117}

eval_results/step_2000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 177.08616091166638}

eval_results/step_20000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 40.92828142551595}

eval_results/step_21000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 40.3970818064354}

eval_results/step_22000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 40.0735236918054}

eval_results/step_23000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 39.55490014910283}

eval_results/step_24000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 39.20364000381908}

eval_results/step_25000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 38.902612380283635}

eval_results/step_26000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 38.46148998878559}

eval_results/step_27000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 38.17645851178452}

eval_results/step_28000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 37.92720969861393}

eval_results/step_29000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 37.60208778298275}

eval_results/step_3000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 104.02560207121049}

eval_results/step_30000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 37.333580768814485}

eval_results/step_31000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 37.089247246154095}

eval_results/step_32000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 36.82174229962485}

eval_results/step_33000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 36.57921791010202}

eval_results/step_34000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 36.40737543064543}

eval_results/step_35000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 36.21668368017217}

eval_results/step_36000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.997935054028076}

eval_results/step_37000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.9598933787296}

eval_results/step_38000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.7666209674463}

eval_results/step_39000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.641749255914725}

eval_results/step_4000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 85.37458271240939}

eval_results/step_40000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.34004565797201}

eval_results/step_41000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.26165773710723}

eval_results/step_42000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 35.171348174606884}

eval_results/step_43000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.918753832391744}

eval_results/step_44000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.810106504503445}

eval_results/step_45000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.69993601676064}

eval_results/step_46000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.59661542430572}

eval_results/step_47000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.41210062786262}

eval_results/step_48000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.33602121640581}

eval_results/step_49000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.21125884014555}

eval_results/step_5000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 1283.524473396328}

eval_results/step_50000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 34.043001136962545}

eval_results/step_6000.json ADDED
@@ -0,0 +1 @@
+{"paloma": 77.90012125104977}