Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
try
Browse files — This view is limited to 50 files because it contains too many changes.
See raw diff
- eval-results/.gitattributes +55 -0
- eval-results/.idea/open_pl_llm_leaderboard_results.iml +8 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-32-59.263304.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-19.394508.json +105 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-27-04.852309.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-12-58.277105.json +105 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-19.801525.json +106 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-37-36.637222.json +111 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_mc_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-12.455988.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-48.898331.json +109 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-56.871408.json +111 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-52.956955.json +119 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-01.731913.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-18.998768.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-26.656191.json +131 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_first_turn_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-21.291842.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-30.374052.json +105 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-15-22.967823.json +112 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_closed_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-25.200287.json +106 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-58-53.946082.json +106 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-23-40.740746.json +101 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_poquad_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-06-52.670471.json +104 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-02.213869.json +101 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-08.270834.json +111 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-11-55.449227.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-13.495944.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-25-19.492688.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-24-50.869505.json +105 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-20.849828.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-19-39.147509.json +105 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-53-51.017953.json +106 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-01-59.819478.json +111 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_mc_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-54.278792.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-23.654679.json +109 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-52.993123.json +111 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-33-25.750066.json +119 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-08.007826.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-54-27.674557.json +118 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_eq_bench_first_turn_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-22.563512.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-52.497622.json +105 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-18.271570.json +112 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_pes_1723381722/results_2024-08-27T17-50-52.063138.json +0 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_closed_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-23.588300.json +106 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-00.491423.json +106 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-02.859037.json +101 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_poquad_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-09-42.653951.json +104 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-23.877449.json +101 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-30-24.424865.json +111 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-48.485190.json +107 -0
- eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-28.998766.json +118 -0
eval-results/.gitattributes
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.lz4 filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
26 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
27 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
37 |
+
# Audio files - uncompressed
|
38 |
+
*.pcm filter=lfs diff=lfs merge=lfs -text
|
39 |
+
*.sam filter=lfs diff=lfs merge=lfs -text
|
40 |
+
*.raw filter=lfs diff=lfs merge=lfs -text
|
41 |
+
# Audio files - compressed
|
42 |
+
*.aac filter=lfs diff=lfs merge=lfs -text
|
43 |
+
*.flac filter=lfs diff=lfs merge=lfs -text
|
44 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
45 |
+
*.ogg filter=lfs diff=lfs merge=lfs -text
|
46 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
47 |
+
# Image files - uncompressed
|
48 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
49 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
50 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
51 |
+
*.tiff filter=lfs diff=lfs merge=lfs -text
|
52 |
+
# Image files - compressed
|
53 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
54 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
55 |
+
*.webp filter=lfs diff=lfs merge=lfs -text
|
eval-results/.idea/open_pl_llm_leaderboard_results.iml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$" />
|
5 |
+
<orderEntry type="inheritedJdk" />
|
6 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
7 |
+
</component>
|
8 |
+
</module>
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-32-59.263304.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_in": {
|
4 |
+
"exact_match,score-first": 0.7797783933518005,
|
5 |
+
"exact_match_stderr,score-first": 0.01543291377156506,
|
6 |
+
"alias": "polemo2_in"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polemo2_in": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polemo2_in": {
|
14 |
+
"task": "polemo2_in",
|
15 |
+
"group": [
|
16 |
+
"polemo2"
|
17 |
+
],
|
18 |
+
"dataset_path": "allegro/klej-polemo2-in",
|
19 |
+
"training_split": "train",
|
20 |
+
"validation_split": "validation",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii. Mo偶liwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 0,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true,
|
33 |
+
"hf_evaluate": true
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"output_type": "generate_until",
|
37 |
+
"generation_kwargs": {
|
38 |
+
"until": [
|
39 |
+
".",
|
40 |
+
","
|
41 |
+
],
|
42 |
+
"do_sample": false,
|
43 |
+
"temperature": 0.0,
|
44 |
+
"max_gen_toks": 50
|
45 |
+
},
|
46 |
+
"repeats": 1,
|
47 |
+
"filter_list": [
|
48 |
+
{
|
49 |
+
"name": "score-first",
|
50 |
+
"filter": [
|
51 |
+
{
|
52 |
+
"function": "regex",
|
53 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"function": "take_first"
|
57 |
+
}
|
58 |
+
]
|
59 |
+
}
|
60 |
+
],
|
61 |
+
"should_decontaminate": true,
|
62 |
+
"doc_to_decontamination_query": "{{sentence}}",
|
63 |
+
"metadata": {
|
64 |
+
"version": 1.0
|
65 |
+
}
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"versions": {
|
69 |
+
"polemo2_in": 1.0
|
70 |
+
},
|
71 |
+
"n-shot": {
|
72 |
+
"polemo2_in": 0
|
73 |
+
},
|
74 |
+
"higher_is_better": {
|
75 |
+
"polemo2_in": {
|
76 |
+
"exact_match": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polemo2_in": {
|
81 |
+
"original": 722,
|
82 |
+
"effective": 722
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381747.6734786,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.98\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polemo2_in": "287c7460415884286befac7ba8422a32230ec65846799595a6fee727f2d037a5"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2341192.242787643,
|
116 |
+
"end_time": 2342631.868033792,
|
117 |
+
"total_evaluation_time_seconds": "1439.6252461490221"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-19.394508.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_in_multiple_choice": {
|
4 |
+
"acc,none": 0.7714681440443213,
|
5 |
+
"acc_stderr,none": 0.015637406997304655,
|
6 |
+
"acc_norm,none": 0.7742382271468145,
|
7 |
+
"acc_norm_stderr,none": 0.015570224561219015,
|
8 |
+
"alias": "polemo2_in_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polemo2_in_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polemo2_in_multiple_choice": {
|
16 |
+
"task": "polemo2_in_multiple_choice",
|
17 |
+
"group": [
|
18 |
+
"polemo2_mc"
|
19 |
+
],
|
20 |
+
"dataset_path": "allegro/klej-polemo2-in",
|
21 |
+
"training_split": "train",
|
22 |
+
"validation_split": "validation",
|
23 |
+
"test_split": "test",
|
24 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
|
25 |
+
"doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
|
26 |
+
"doc_to_choice": [
|
27 |
+
"Neutralny",
|
28 |
+
"Negatywny",
|
29 |
+
"Pozytywny",
|
30 |
+
"Niejednoznaczny"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 0,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"output_type": "multiple_choice",
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": true,
|
51 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"versions": {
|
55 |
+
"polemo2_in_multiple_choice": "Yaml"
|
56 |
+
},
|
57 |
+
"n-shot": {
|
58 |
+
"polemo2_in_multiple_choice": 0
|
59 |
+
},
|
60 |
+
"higher_is_better": {
|
61 |
+
"polemo2_in_multiple_choice": {
|
62 |
+
"acc": true,
|
63 |
+
"acc_norm": true
|
64 |
+
}
|
65 |
+
},
|
66 |
+
"n-samples": {
|
67 |
+
"polemo2_in_multiple_choice": {
|
68 |
+
"original": 722,
|
69 |
+
"effective": 722
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"config": {
|
73 |
+
"model": "hf",
|
74 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
75 |
+
"batch_size": "1",
|
76 |
+
"batch_sizes": [],
|
77 |
+
"device": "cuda:0",
|
78 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_multiple_choice/",
|
79 |
+
"limit": null,
|
80 |
+
"bootstrap_iters": 100000,
|
81 |
+
"gen_kwargs": null,
|
82 |
+
"random_seed": 0,
|
83 |
+
"numpy_seed": 1234,
|
84 |
+
"torch_seed": 1234,
|
85 |
+
"fewshot_seed": 1234
|
86 |
+
},
|
87 |
+
"git_hash": "2132286",
|
88 |
+
"date": 1723381748.8968034,
|
89 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
90 |
+
"transformers_version": "4.43.1",
|
91 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
92 |
+
"task_hashes": {
|
93 |
+
"polemo2_in_multiple_choice": "6cade7fdeb7a53de3a966bebb7fe941479487faada4badf2831b62d7bb426916"
|
94 |
+
},
|
95 |
+
"model_source": "hf",
|
96 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
97 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
98 |
+
"system_instruction": null,
|
99 |
+
"system_instruction_sha": null,
|
100 |
+
"chat_template": null,
|
101 |
+
"chat_template_sha": null,
|
102 |
+
"start_time": 2669732.988216114,
|
103 |
+
"end_time": 2670111.950610943,
|
104 |
+
"total_evaluation_time_seconds": "378.96239482890815"
|
105 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-27-04.852309.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_out": {
|
4 |
+
"exact_match,score-first": 0.7530364372469636,
|
5 |
+
"exact_match_stderr,score-first": 0.0194223142525205,
|
6 |
+
"alias": "polemo2_out"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polemo2_out": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polemo2_out": {
|
14 |
+
"task": "polemo2_out",
|
15 |
+
"group": [
|
16 |
+
"polemo2"
|
17 |
+
],
|
18 |
+
"dataset_path": "allegro/klej-polemo2-out",
|
19 |
+
"training_split": "train",
|
20 |
+
"validation_split": "validation",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii. Mo偶liwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 0,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true,
|
33 |
+
"hf_evaluate": true
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"output_type": "generate_until",
|
37 |
+
"generation_kwargs": {
|
38 |
+
"until": [
|
39 |
+
".",
|
40 |
+
","
|
41 |
+
],
|
42 |
+
"do_sample": false,
|
43 |
+
"temperature": 0.0,
|
44 |
+
"max_gen_toks": 50
|
45 |
+
},
|
46 |
+
"repeats": 1,
|
47 |
+
"filter_list": [
|
48 |
+
{
|
49 |
+
"name": "score-first",
|
50 |
+
"filter": [
|
51 |
+
{
|
52 |
+
"function": "regex",
|
53 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"function": "take_first"
|
57 |
+
}
|
58 |
+
]
|
59 |
+
}
|
60 |
+
],
|
61 |
+
"should_decontaminate": true,
|
62 |
+
"doc_to_decontamination_query": "{{sentence}}",
|
63 |
+
"metadata": {
|
64 |
+
"version": 1.0
|
65 |
+
}
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"versions": {
|
69 |
+
"polemo2_out": 1.0
|
70 |
+
},
|
71 |
+
"n-shot": {
|
72 |
+
"polemo2_out": 0
|
73 |
+
},
|
74 |
+
"higher_is_better": {
|
75 |
+
"polemo2_out": {
|
76 |
+
"exact_match": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polemo2_out": {
|
81 |
+
"original": 494,
|
82 |
+
"effective": 494
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.629631,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polemo2_out": "bf931755699911cb191ed108ec01aa6c9695552185da1ccb8f6c40c22db028b6"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2577223.678777486,
|
116 |
+
"end_time": 2578308.421439366,
|
117 |
+
"total_evaluation_time_seconds": "1084.7426618798636"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-12-58.277105.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_out_multiple_choice": {
|
4 |
+
"acc,none": 0.742914979757085,
|
5 |
+
"acc_stderr,none": 0.019682691432000205,
|
6 |
+
"acc_norm,none": 0.7672064777327935,
|
7 |
+
"acc_norm_stderr,none": 0.019033476340855917,
|
8 |
+
"alias": "polemo2_out_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polemo2_out_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polemo2_out_multiple_choice": {
|
16 |
+
"task": "polemo2_out_multiple_choice",
|
17 |
+
"group": [
|
18 |
+
"polemo2_mc"
|
19 |
+
],
|
20 |
+
"dataset_path": "allegro/klej-polemo2-out",
|
21 |
+
"training_split": "train",
|
22 |
+
"validation_split": "validation",
|
23 |
+
"test_split": "test",
|
24 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
|
25 |
+
"doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
|
26 |
+
"doc_to_choice": [
|
27 |
+
"Neutralny",
|
28 |
+
"Negatywny",
|
29 |
+
"Pozytywny",
|
30 |
+
"Niejednoznaczny"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 0,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"output_type": "multiple_choice",
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": true,
|
51 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"versions": {
|
55 |
+
"polemo2_out_multiple_choice": "Yaml"
|
56 |
+
},
|
57 |
+
"n-shot": {
|
58 |
+
"polemo2_out_multiple_choice": 0
|
59 |
+
},
|
60 |
+
"higher_is_better": {
|
61 |
+
"polemo2_out_multiple_choice": {
|
62 |
+
"acc": true,
|
63 |
+
"acc_norm": true
|
64 |
+
}
|
65 |
+
},
|
66 |
+
"n-samples": {
|
67 |
+
"polemo2_out_multiple_choice": {
|
68 |
+
"original": 494,
|
69 |
+
"effective": 494
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"config": {
|
73 |
+
"model": "hf",
|
74 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
75 |
+
"batch_size": "1",
|
76 |
+
"batch_sizes": [],
|
77 |
+
"device": "cuda:0",
|
78 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_multiple_choice/",
|
79 |
+
"limit": null,
|
80 |
+
"bootstrap_iters": 100000,
|
81 |
+
"gen_kwargs": null,
|
82 |
+
"random_seed": 0,
|
83 |
+
"numpy_seed": 1234,
|
84 |
+
"torch_seed": 1234,
|
85 |
+
"fewshot_seed": 1234
|
86 |
+
},
|
87 |
+
"git_hash": "2132286",
|
88 |
+
"date": 1723381748.8963165,
|
89 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
90 |
+
"transformers_version": "4.43.1",
|
91 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
92 |
+
"task_hashes": {
|
93 |
+
"polemo2_out_multiple_choice": "63ec4fc12bc668a566b3f91378159707f11e63ac52ded120b65fa3dd6a1b9979"
|
94 |
+
},
|
95 |
+
"model_source": "hf",
|
96 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
97 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
98 |
+
"system_instruction": null,
|
99 |
+
"system_instruction_sha": null,
|
100 |
+
"chat_template": null,
|
101 |
+
"chat_template_sha": null,
|
102 |
+
"start_time": 2669732.987871353,
|
103 |
+
"end_time": 2669970.833388602,
|
104 |
+
"total_evaluation_time_seconds": "237.8455172488466"
|
105 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-19.801525.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_8tags_multiple_choice": {
|
4 |
+
"acc,none": 0.785224153705398,
|
5 |
+
"acc_stderr,none": 0.006211537927009462,
|
6 |
+
"acc_norm,none": 0.7829368709972553,
|
7 |
+
"acc_norm_stderr,none": 0.006235424129675317,
|
8 |
+
"alias": "polish_8tags_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_8tags_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_8tags_multiple_choice": {
|
16 |
+
"task": "polish_8tags_multiple_choice",
|
17 |
+
"dataset_path": "sdadas/8tags",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"fewshot_split": "train",
|
21 |
+
"doc_to_text": "Tytu艂: \"{{sentence}}\"\nDo podanego tytu艂u przyporz膮dkuj jedn膮 najlepiej pasuj膮c膮 kategori臋 z podanych: Film, Historia, Jedzenie, Medycyna, Motoryzacja, Praca, Sport, Technologie.\nKategoria:",
|
22 |
+
"doc_to_target": "{{label|int}}",
|
23 |
+
"doc_to_choice": [
|
24 |
+
"Film",
|
25 |
+
"Historia",
|
26 |
+
"Jedzenie",
|
27 |
+
"Medycyna",
|
28 |
+
"Motoryzacja",
|
29 |
+
"Praca",
|
30 |
+
"Sport",
|
31 |
+
"Technologie"
|
32 |
+
],
|
33 |
+
"description": "",
|
34 |
+
"target_delimiter": " ",
|
35 |
+
"fewshot_delimiter": "\n\n",
|
36 |
+
"num_fewshot": 0,
|
37 |
+
"metric_list": [
|
38 |
+
{
|
39 |
+
"metric": "acc",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "acc_norm",
|
45 |
+
"aggregation": "mean",
|
46 |
+
"higher_is_better": true
|
47 |
+
}
|
48 |
+
],
|
49 |
+
"output_type": "multiple_choice",
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_8tags_multiple_choice": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_8tags_multiple_choice": 0
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_8tags_multiple_choice": {
|
63 |
+
"acc": true,
|
64 |
+
"acc_norm": true
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"n-samples": {
|
68 |
+
"polish_8tags_multiple_choice": {
|
69 |
+
"original": 4372,
|
70 |
+
"effective": 4372
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"config": {
|
74 |
+
"model": "hf",
|
75 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
76 |
+
"batch_size": "1",
|
77 |
+
"batch_sizes": [],
|
78 |
+
"device": "cuda:0",
|
79 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_multiple_choice/",
|
80 |
+
"limit": null,
|
81 |
+
"bootstrap_iters": 100000,
|
82 |
+
"gen_kwargs": null,
|
83 |
+
"random_seed": 0,
|
84 |
+
"numpy_seed": 1234,
|
85 |
+
"torch_seed": 1234,
|
86 |
+
"fewshot_seed": 1234
|
87 |
+
},
|
88 |
+
"git_hash": "2132286",
|
89 |
+
"date": 1723381748.8961906,
|
90 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
91 |
+
"transformers_version": "4.43.1",
|
92 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
93 |
+
"task_hashes": {
|
94 |
+
"polish_8tags_multiple_choice": "97e61e52772af016579422421c750a76a73c5aa55b81bd957c03e5fe7ca43b9b"
|
95 |
+
},
|
96 |
+
"model_source": "hf",
|
97 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
98 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
99 |
+
"system_instruction": null,
|
100 |
+
"system_instruction_sha": null,
|
101 |
+
"chat_template": null,
|
102 |
+
"chat_template_sha": null,
|
103 |
+
"start_time": 2669732.988419174,
|
104 |
+
"end_time": 2671312.355425343,
|
105 |
+
"total_evaluation_time_seconds": "1579.3670061687008"
|
106 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-37-36.637222.json
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_8tags_regex": {
|
4 |
+
"exact_match,score-first": 0.7509149130832571,
|
5 |
+
"exact_match_stderr,score-first": 0.006541522277132546,
|
6 |
+
"alias": "polish_8tags_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_8tags_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_8tags_regex": {
|
14 |
+
"task": "polish_8tags_regex",
|
15 |
+
"dataset_path": "sdadas/8tags",
|
16 |
+
"training_split": "train",
|
17 |
+
"validation_split": "validation",
|
18 |
+
"test_split": "test",
|
19 |
+
"doc_to_text": "Tytu艂: \"{{sentence}}\"\nPytanie: jaka kategoria najlepiej pasuje do podanego tytu艂u?\nMo偶liwe odpowiedzi:\nA - film\nB - historia\nC - jedzenie\nD - medycyna\nE - motoryzacja\nF - praca\nG - sport\nH - technologie\nPrawid艂owa odpowied藕:",
|
20 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H'}.get(label)}}",
|
21 |
+
"description": "",
|
22 |
+
"target_delimiter": " ",
|
23 |
+
"fewshot_delimiter": "\n\n",
|
24 |
+
"num_fewshot": 0,
|
25 |
+
"metric_list": [
|
26 |
+
{
|
27 |
+
"metric": "exact_match",
|
28 |
+
"aggregation": "mean",
|
29 |
+
"higher_is_better": true
|
30 |
+
}
|
31 |
+
],
|
32 |
+
"output_type": "generate_until",
|
33 |
+
"generation_kwargs": {
|
34 |
+
"until": [
|
35 |
+
".",
|
36 |
+
","
|
37 |
+
],
|
38 |
+
"do_sample": false,
|
39 |
+
"temperature": 0.0,
|
40 |
+
"max_gen_toks": 50
|
41 |
+
},
|
42 |
+
"repeats": 1,
|
43 |
+
"filter_list": [
|
44 |
+
{
|
45 |
+
"name": "score-first",
|
46 |
+
"filter": [
|
47 |
+
{
|
48 |
+
"function": "regex",
|
49 |
+
"regex_pattern": "(\\b[ABCDEFGH]\\b)"
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"function": "take_first"
|
53 |
+
}
|
54 |
+
]
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"should_decontaminate": true,
|
58 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"versions": {
|
62 |
+
"polish_8tags_regex": "Yaml"
|
63 |
+
},
|
64 |
+
"n-shot": {
|
65 |
+
"polish_8tags_regex": 0
|
66 |
+
},
|
67 |
+
"higher_is_better": {
|
68 |
+
"polish_8tags_regex": {
|
69 |
+
"exact_match": true
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"n-samples": {
|
73 |
+
"polish_8tags_regex": {
|
74 |
+
"original": 4372,
|
75 |
+
"effective": 4372
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"config": {
|
79 |
+
"model": "hf",
|
80 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
81 |
+
"batch_size": "1",
|
82 |
+
"batch_sizes": [],
|
83 |
+
"device": "cuda:0",
|
84 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_regex/",
|
85 |
+
"limit": null,
|
86 |
+
"bootstrap_iters": 100000,
|
87 |
+
"gen_kwargs": null,
|
88 |
+
"random_seed": 0,
|
89 |
+
"numpy_seed": 1234,
|
90 |
+
"torch_seed": 1234,
|
91 |
+
"fewshot_seed": 1234
|
92 |
+
},
|
93 |
+
"git_hash": "2132286",
|
94 |
+
"date": 1723381748.6299846,
|
95 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
96 |
+
"transformers_version": "4.43.1",
|
97 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
98 |
+
"task_hashes": {
|
99 |
+
"polish_8tags_regex": "65692e40c28addb981c1eb0f272d45d3abf7b640c98f72a2acf9de48677c436e"
|
100 |
+
},
|
101 |
+
"model_source": "hf",
|
102 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
103 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
104 |
+
"system_instruction": null,
|
105 |
+
"system_instruction_sha": null,
|
106 |
+
"chat_template": null,
|
107 |
+
"chat_template_sha": null,
|
108 |
+
"start_time": 2577223.709303458,
|
109 |
+
"end_time": 2586140.204190591,
|
110 |
+
"total_evaluation_time_seconds": "8916.494887132663"
|
111 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_mc_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-12.455988.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_belebele_mc": {
|
4 |
+
"acc,none": 0.8755555555555555,
|
5 |
+
"acc_stderr,none": 0.011009047987347446,
|
6 |
+
"acc_norm,none": 0.8755555555555555,
|
7 |
+
"acc_norm_stderr,none": 0.011009047987347446,
|
8 |
+
"alias": "polish_belebele_mc"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_belebele_mc": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_belebele_mc": {
|
16 |
+
"task": "polish_belebele_mc",
|
17 |
+
"dataset_path": "facebook/belebele",
|
18 |
+
"test_split": "pol_Latn",
|
19 |
+
"fewshot_split": "pol_Latn",
|
20 |
+
"doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMo偶liwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawid艂owa odpowied藕:",
|
21 |
+
"doc_to_target": "{{['1', '2', '3', '4'].index(correct_answer_num)}}",
|
22 |
+
"doc_to_choice": [
|
23 |
+
"A",
|
24 |
+
"B",
|
25 |
+
"C",
|
26 |
+
"D"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"fewshot_config": {
|
32 |
+
"sampler": "first_n"
|
33 |
+
},
|
34 |
+
"num_fewshot": 0,
|
35 |
+
"metric_list": [
|
36 |
+
{
|
37 |
+
"metric": "acc",
|
38 |
+
"aggregation": "mean",
|
39 |
+
"higher_is_better": true
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"metric": "acc_norm",
|
43 |
+
"aggregation": "mean",
|
44 |
+
"higher_is_better": true
|
45 |
+
}
|
46 |
+
],
|
47 |
+
"output_type": "multiple_choice",
|
48 |
+
"repeats": 1,
|
49 |
+
"should_decontaminate": true,
|
50 |
+
"doc_to_decontamination_query": "{{question}}",
|
51 |
+
"metadata": {
|
52 |
+
"version": 0.0
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"versions": {
|
57 |
+
"polish_belebele_mc": 0.0
|
58 |
+
},
|
59 |
+
"n-shot": {
|
60 |
+
"polish_belebele_mc": 0
|
61 |
+
},
|
62 |
+
"higher_is_better": {
|
63 |
+
"polish_belebele_mc": {
|
64 |
+
"acc": true,
|
65 |
+
"acc_norm": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_belebele_mc": {
|
70 |
+
"original": 900,
|
71 |
+
"effective": 900
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_mc/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381748.8964837,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_belebele_mc": "e575c2bfe123497ebf8be109e92bdcb84761ff1f7ebc06ee26942cfec0914841"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 2669732.988173293,
|
105 |
+
"end_time": 2669985.011977567,
|
106 |
+
"total_evaluation_time_seconds": "252.023804273922"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-48.898331.json
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_belebele_regex": {
|
4 |
+
"exact_match,score-first": 0.8622222222222222,
|
5 |
+
"exact_match_stderr,score-first": 0.011495274539524291,
|
6 |
+
"alias": "polish_belebele_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_belebele_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_belebele_regex": {
|
14 |
+
"task": "polish_belebele_regex",
|
15 |
+
"dataset_path": "facebook/belebele",
|
16 |
+
"test_split": "pol_Latn",
|
17 |
+
"doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMo偶liwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawid艂owa odpowied藕:",
|
18 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(correct_answer_num|int - 1)}}",
|
19 |
+
"description": "",
|
20 |
+
"target_delimiter": " ",
|
21 |
+
"fewshot_delimiter": "\n\n",
|
22 |
+
"num_fewshot": 0,
|
23 |
+
"metric_list": [
|
24 |
+
{
|
25 |
+
"metric": "exact_match",
|
26 |
+
"aggregation": "mean",
|
27 |
+
"higher_is_better": true
|
28 |
+
}
|
29 |
+
],
|
30 |
+
"output_type": "generate_until",
|
31 |
+
"generation_kwargs": {
|
32 |
+
"until": [
|
33 |
+
".",
|
34 |
+
","
|
35 |
+
],
|
36 |
+
"do_sample": false,
|
37 |
+
"temperature": 0.0,
|
38 |
+
"max_gen_toks": 50
|
39 |
+
},
|
40 |
+
"repeats": 1,
|
41 |
+
"filter_list": [
|
42 |
+
{
|
43 |
+
"name": "score-first",
|
44 |
+
"filter": [
|
45 |
+
{
|
46 |
+
"function": "regex",
|
47 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"function": "take_first"
|
51 |
+
}
|
52 |
+
]
|
53 |
+
}
|
54 |
+
],
|
55 |
+
"should_decontaminate": true,
|
56 |
+
"doc_to_decontamination_query": "{{flores_passage}} {{question}} {{mc_answer1}} {{mc_answer2}} {{mc_answer3}} {{mc_answer4}}"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"versions": {
|
60 |
+
"polish_belebele_regex": "Yaml"
|
61 |
+
},
|
62 |
+
"n-shot": {
|
63 |
+
"polish_belebele_regex": 0
|
64 |
+
},
|
65 |
+
"higher_is_better": {
|
66 |
+
"polish_belebele_regex": {
|
67 |
+
"exact_match": true
|
68 |
+
}
|
69 |
+
},
|
70 |
+
"n-samples": {
|
71 |
+
"polish_belebele_regex": {
|
72 |
+
"original": 900,
|
73 |
+
"effective": 900
|
74 |
+
}
|
75 |
+
},
|
76 |
+
"config": {
|
77 |
+
"model": "hf",
|
78 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
79 |
+
"batch_size": "1",
|
80 |
+
"batch_sizes": [],
|
81 |
+
"device": "cuda:0",
|
82 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_regex/",
|
83 |
+
"limit": null,
|
84 |
+
"bootstrap_iters": 100000,
|
85 |
+
"gen_kwargs": null,
|
86 |
+
"random_seed": 0,
|
87 |
+
"numpy_seed": 1234,
|
88 |
+
"torch_seed": 1234,
|
89 |
+
"fewshot_seed": 1234
|
90 |
+
},
|
91 |
+
"git_hash": "2132286",
|
92 |
+
"date": 1723381748.62987,
|
93 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
94 |
+
"transformers_version": "4.43.1",
|
95 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
96 |
+
"task_hashes": {
|
97 |
+
"polish_belebele_regex": "27d3ad975a6f34d19e414caf684c5c66f47347a7e3f05c8420cd085a341dcbe7"
|
98 |
+
},
|
99 |
+
"model_source": "hf",
|
100 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
101 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
102 |
+
"system_instruction": null,
|
103 |
+
"system_instruction_sha": null,
|
104 |
+
"chat_template": null,
|
105 |
+
"chat_template_sha": null,
|
106 |
+
"start_time": 2577223.702730047,
|
107 |
+
"end_time": 2578892.467186104,
|
108 |
+
"total_evaluation_time_seconds": "1668.7644560569897"
|
109 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-56.871408.json
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_cbd_multiple_choice": {
|
4 |
+
"acc,none": 0.232,
|
5 |
+
"acc_stderr,none": 0.01335493745228157,
|
6 |
+
"f1,none": 0.19798644869999346,
|
7 |
+
"f1_stderr,none": "N/A",
|
8 |
+
"acc_norm,none": 0.254,
|
9 |
+
"acc_norm_stderr,none": 0.013772206565168544,
|
10 |
+
"alias": "polish_cbd_multiple_choice"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"group_subtasks": {
|
14 |
+
"polish_cbd_multiple_choice": []
|
15 |
+
},
|
16 |
+
"configs": {
|
17 |
+
"polish_cbd_multiple_choice": {
|
18 |
+
"task": "polish_cbd_multiple_choice",
|
19 |
+
"dataset_path": "ptaszynski/PolishCyberbullyingDataset",
|
20 |
+
"training_split": "train",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Wypowied藕: \"{{TEXT}}\"\nDo podanej wypowiedzi przyporz膮dkuj jedn膮, najlepiej pasuj膮c膮 kategori臋 z podanych: nieszkodliwa, szyderstwo, obelga, insynuacja, gro藕ba, molestowanie.\nKategoria:",
|
23 |
+
"doc_to_target": "{{{'szyderstwo': 1, 'obelga': 2, 'insynuacja': 3, 'grozba': 4, 'molestowanie': 5}.get(CATEGORIES, 0)}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"nieszkodliwa",
|
26 |
+
"szyderstwo",
|
27 |
+
"obelga",
|
28 |
+
"insynuacja",
|
29 |
+
"gro藕ba",
|
30 |
+
"molestowanie"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 0,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
49 |
+
"aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
|
50 |
+
"higher_is_better": true
|
51 |
+
}
|
52 |
+
],
|
53 |
+
"output_type": "multiple_choice",
|
54 |
+
"repeats": 1,
|
55 |
+
"should_decontaminate": true,
|
56 |
+
"doc_to_decontamination_query": "{{TEXT}}"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"versions": {
|
60 |
+
"polish_cbd_multiple_choice": "Yaml"
|
61 |
+
},
|
62 |
+
"n-shot": {
|
63 |
+
"polish_cbd_multiple_choice": 0
|
64 |
+
},
|
65 |
+
"higher_is_better": {
|
66 |
+
"polish_cbd_multiple_choice": {
|
67 |
+
"acc": true,
|
68 |
+
"acc_norm": true,
|
69 |
+
"f1": true
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"n-samples": {
|
73 |
+
"polish_cbd_multiple_choice": {
|
74 |
+
"original": 1000,
|
75 |
+
"effective": 1000
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"config": {
|
79 |
+
"model": "hf",
|
80 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
81 |
+
"batch_size": "1",
|
82 |
+
"batch_sizes": [],
|
83 |
+
"device": "cuda:0",
|
84 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_multiple_choice/",
|
85 |
+
"limit": null,
|
86 |
+
"bootstrap_iters": 100000,
|
87 |
+
"gen_kwargs": null,
|
88 |
+
"random_seed": 0,
|
89 |
+
"numpy_seed": 1234,
|
90 |
+
"torch_seed": 1234,
|
91 |
+
"fewshot_seed": 1234
|
92 |
+
},
|
93 |
+
"git_hash": "2132286",
|
94 |
+
"date": 1723381748.896205,
|
95 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
96 |
+
"transformers_version": "4.43.1",
|
97 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
98 |
+
"task_hashes": {
|
99 |
+
"polish_cbd_multiple_choice": "56be7d38fd3346cc3ebad202ec8c0365fc6a9b7c3b60b7c527ec0cf16db2c0df"
|
100 |
+
},
|
101 |
+
"model_source": "hf",
|
102 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
103 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
104 |
+
"system_instruction": null,
|
105 |
+
"system_instruction_sha": null,
|
106 |
+
"chat_template": null,
|
107 |
+
"chat_template_sha": null,
|
108 |
+
"start_time": 2669732.987932883,
|
109 |
+
"end_time": 2670209.427766158,
|
110 |
+
"total_evaluation_time_seconds": "476.4398332745768"
|
111 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-52.956955.json
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_cbd_regex": {
|
4 |
+
"exact_match,score-first": 0.389,
|
5 |
+
"exact_match_stderr,score-first": 0.015424555647308496,
|
6 |
+
"f1,score-first": 0.24472868820966764,
|
7 |
+
"f1_stderr,score-first": "N/A",
|
8 |
+
"alias": "polish_cbd_regex"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_cbd_regex": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_cbd_regex": {
|
16 |
+
"task": "polish_cbd_regex",
|
17 |
+
"dataset_path": "ptaszynski/PolishCyberbullyingDataset",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"doc_to_text": "Wypowied藕: \"{{TEXT}}\"\nPytanie: Jaka kategoria najlepiej pasuje do podanej wypowiedzi?\nMo偶liwe odpowiedzi:\nA - nieszkodliwa\nB - szyderstwo\nC - obelga\nD - insynuacja\nE - gro藕ba\nF - molestowanie\nPrawid艂owa odpowied藕:",
|
21 |
+
"doc_to_target": "{{{'szyderstwo': 'B', 'obelga': 'C', 'insynuacja': 'D', 'grozba': 'E', 'molestowanie': 'F'}.get(CATEGORIES, 'A')}}",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 0,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
34 |
+
"aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
".",
|
42 |
+
",",
|
43 |
+
";"
|
44 |
+
],
|
45 |
+
"do_sample": false,
|
46 |
+
"temperature": 0.0,
|
47 |
+
"max_gen_toks": 50
|
48 |
+
},
|
49 |
+
"repeats": 1,
|
50 |
+
"filter_list": [
|
51 |
+
{
|
52 |
+
"name": "score-first",
|
53 |
+
"filter": [
|
54 |
+
{
|
55 |
+
"function": "regex",
|
56 |
+
"regex_pattern": "(\\b[ABCDEF]\\b)"
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"function": "take_first"
|
60 |
+
}
|
61 |
+
]
|
62 |
+
}
|
63 |
+
],
|
64 |
+
"should_decontaminate": true,
|
65 |
+
"doc_to_decontamination_query": "{{TEXT}}"
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"versions": {
|
69 |
+
"polish_cbd_regex": "Yaml"
|
70 |
+
},
|
71 |
+
"n-shot": {
|
72 |
+
"polish_cbd_regex": 0
|
73 |
+
},
|
74 |
+
"higher_is_better": {
|
75 |
+
"polish_cbd_regex": {
|
76 |
+
"exact_match": true,
|
77 |
+
"f1": true
|
78 |
+
}
|
79 |
+
},
|
80 |
+
"n-samples": {
|
81 |
+
"polish_cbd_regex": {
|
82 |
+
"original": 1000,
|
83 |
+
"effective": 1000
|
84 |
+
}
|
85 |
+
},
|
86 |
+
"config": {
|
87 |
+
"model": "hf",
|
88 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
89 |
+
"batch_size": "1",
|
90 |
+
"batch_sizes": [],
|
91 |
+
"device": "cuda:0",
|
92 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_regex/",
|
93 |
+
"limit": null,
|
94 |
+
"bootstrap_iters": 100000,
|
95 |
+
"gen_kwargs": null,
|
96 |
+
"random_seed": 0,
|
97 |
+
"numpy_seed": 1234,
|
98 |
+
"torch_seed": 1234,
|
99 |
+
"fewshot_seed": 1234
|
100 |
+
},
|
101 |
+
"git_hash": "2132286",
|
102 |
+
"date": 1723381748.6293602,
|
103 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
104 |
+
"transformers_version": "4.43.1",
|
105 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
106 |
+
"task_hashes": {
|
107 |
+
"polish_cbd_regex": "d924b7270ebed050a040882627c2c9edeabe16833fce05d56d9afd2bdd04ab67"
|
108 |
+
},
|
109 |
+
"model_source": "hf",
|
110 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
111 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
112 |
+
"system_instruction": null,
|
113 |
+
"system_instruction_sha": null,
|
114 |
+
"chat_template": null,
|
115 |
+
"chat_template_sha": null,
|
116 |
+
"start_time": 2577223.672784959,
|
117 |
+
"end_time": 2579376.526070405,
|
118 |
+
"total_evaluation_time_seconds": "2152.8532854458317"
|
119 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-01.731913.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_dyk_multiple_choice": {
|
4 |
+
"acc,none": 0.8571428571428571,
|
5 |
+
"acc_stderr,none": 0.010913926579250558,
|
6 |
+
"f1,none": 0.6508313539192399,
|
7 |
+
"f1_stderr,none": "N/A",
|
8 |
+
"acc_norm,none": 0.8571428571428571,
|
9 |
+
"acc_norm_stderr,none": 0.010913926579250558,
|
10 |
+
"alias": "polish_dyk_multiple_choice"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"group_subtasks": {
|
14 |
+
"polish_dyk_multiple_choice": []
|
15 |
+
},
|
16 |
+
"configs": {
|
17 |
+
"polish_dyk_multiple_choice": {
|
18 |
+
"task": "polish_dyk_multiple_choice",
|
19 |
+
"dataset_path": "allegro/klej-dyk",
|
20 |
+
"training_split": "train",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowied藕: \"{{answer}}\"\nPytanie: Czy sugerowana odpowied藕 na zadane pytanie jest poprawna?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{target|int}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Nie",
|
26 |
+
"Tak"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 0,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
45 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
46 |
+
"higher_is_better": true
|
47 |
+
}
|
48 |
+
],
|
49 |
+
"output_type": "multiple_choice",
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{question}} {{answer}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_dyk_multiple_choice": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_dyk_multiple_choice": 0
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_dyk_multiple_choice": {
|
63 |
+
"acc": true,
|
64 |
+
"acc_norm": true,
|
65 |
+
"f1": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_dyk_multiple_choice": {
|
70 |
+
"original": 1029,
|
71 |
+
"effective": 1029
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_multiple_choice/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381748.8968518,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_dyk_multiple_choice": "614bab79a1ec3b666218bb65089e147f8ed82ac0a4d10ab14a57ffcc73379688"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 2669732.988483474,
|
105 |
+
"end_time": 2669974.28807118,
|
106 |
+
"total_evaluation_time_seconds": "241.29958770610392"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-18.998768.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_dyk_regex": {
|
4 |
+
"exact_match,score-first": 0.8532555879494655,
|
5 |
+
"exact_match_stderr,score-first": 0.01103630767704879,
|
6 |
+
"f1,score-first": 0.6591422121896162,
|
7 |
+
"f1_stderr,score-first": "N/A",
|
8 |
+
"alias": "polish_dyk_regex"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_dyk_regex": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_dyk_regex": {
|
16 |
+
"task": "polish_dyk_regex",
|
17 |
+
"dataset_path": "allegro/klej-dyk",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowied藕: \"{{answer}}\"\nCzy sugerowana odpowied藕 na zadane pytanie jest poprawna? Mo偶liwe opcje:\nA - brakuje sugerowanej odpowiedzi\nB - nie, sugerowana odpowied藕 nie jest poprawna\nC - tak, sugerowana odpowied藕 jest poprawna\nD - brakuje pytania\nPrawid艂owa opcja:",
|
21 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(target|int + 1)}}",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 0,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
34 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
".",
|
42 |
+
","
|
43 |
+
],
|
44 |
+
"do_sample": false,
|
45 |
+
"temperature": 0.0,
|
46 |
+
"max_gen_toks": 50
|
47 |
+
},
|
48 |
+
"repeats": 1,
|
49 |
+
"filter_list": [
|
50 |
+
{
|
51 |
+
"name": "score-first",
|
52 |
+
"filter": [
|
53 |
+
{
|
54 |
+
"function": "regex",
|
55 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"function": "take_first"
|
59 |
+
}
|
60 |
+
]
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"should_decontaminate": true,
|
64 |
+
"doc_to_decontamination_query": "{{question}} {{answer}}"
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"versions": {
|
68 |
+
"polish_dyk_regex": "Yaml"
|
69 |
+
},
|
70 |
+
"n-shot": {
|
71 |
+
"polish_dyk_regex": 0
|
72 |
+
},
|
73 |
+
"higher_is_better": {
|
74 |
+
"polish_dyk_regex": {
|
75 |
+
"exact_match": true,
|
76 |
+
"f1": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polish_dyk_regex": {
|
81 |
+
"original": 1029,
|
82 |
+
"effective": 1029
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_regex/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.6296444,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polish_dyk_regex": "2649d9ae76c76684ce97aa5028f8024f8eda160a26db26b547c0abf175fb2de1"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2577223.673779933,
|
116 |
+
"end_time": 2577542.567465127,
|
117 |
+
"total_evaluation_time_seconds": "318.89368519419804"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-26.656191.json
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_eq_bench": {
|
4 |
+
"first_eqbench,none": 48.33189616455903,
|
5 |
+
"first_eqbench_stderr,none": 2.573616088487117,
|
6 |
+
"first_percent_parseable,none": 100.0,
|
7 |
+
"first_percent_parseable_stderr,none": 0.0,
|
8 |
+
"revised_eqbench,none": 63.019918840007705,
|
9 |
+
"revised_eqbench_stderr,none": 2.3758111655038587,
|
10 |
+
"revised_percent_parseable,none": 99.41520467836257,
|
11 |
+
"revised_percent_parseable_stderr,none": 0.5847953216374274,
|
12 |
+
"average_eqbench,none": 55.67590750228339,
|
13 |
+
"average_eqbench_stderr,none": 2.1636830548973527,
|
14 |
+
"alias": "polish_eq_bench"
|
15 |
+
}
|
16 |
+
},
|
17 |
+
"group_subtasks": {
|
18 |
+
"polish_eq_bench": []
|
19 |
+
},
|
20 |
+
"configs": {
|
21 |
+
"polish_eq_bench": {
|
22 |
+
"task": "polish_eq_bench",
|
23 |
+
"dataset_path": "speakleash/EQ-Bench-PL",
|
24 |
+
"validation_split": "validation",
|
25 |
+
"doc_to_text": "{{prompt}}\nPierwsze oceny:\n",
|
26 |
+
"doc_to_target": "reference_answer_fullscale",
|
27 |
+
"process_results": "def score(docs, results):\n first_pass_answers, revised_answers = parse(results[0])\n reference = eval(docs[\"reference_answer\"])\n reference_fullscale = eval(docs[\"reference_answer_fullscale\"])\n first_pass_score = calculate_score(reference, first_pass_answers)\n revised_pass_score = calculate_score(reference_fullscale, revised_answers)\n scores= {'first_'+k: v for k, v in first_pass_score.items()}\n scores.update({'revised_'+k: v for k, v in revised_pass_score.items()})\n #add average score\n scores['average_eqbench'] = (scores['first_eqbench'] + scores['revised_eqbench']) / 2\n return scores\n",
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 0,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "first_eqbench",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "first_percent_parseable",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "revised_eqbench",
|
45 |
+
"aggregation": "mean",
|
46 |
+
"higher_is_better": true
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"metric": "revised_percent_parseable",
|
50 |
+
"aggregation": "mean",
|
51 |
+
"higher_is_better": true
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"metric": "average_eqbench",
|
55 |
+
"aggregation": "mean",
|
56 |
+
"higher_is_better": true
|
57 |
+
}
|
58 |
+
],
|
59 |
+
"output_type": "generate_until",
|
60 |
+
"generation_kwargs": {
|
61 |
+
"max_gen_toks": 512,
|
62 |
+
"do_sample": false,
|
63 |
+
"temperature": 0.0,
|
64 |
+
"until": [
|
65 |
+
"</s>",
|
66 |
+
"[Koniec odpowiedzi]",
|
67 |
+
"Masz za zadanie"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
"repeats": 1,
|
71 |
+
"should_decontaminate": false,
|
72 |
+
"metadata": {
|
73 |
+
"version": 2.4
|
74 |
+
}
|
75 |
+
}
|
76 |
+
},
|
77 |
+
"versions": {
|
78 |
+
"polish_eq_bench": 2.4
|
79 |
+
},
|
80 |
+
"n-shot": {
|
81 |
+
"polish_eq_bench": 0
|
82 |
+
},
|
83 |
+
"higher_is_better": {
|
84 |
+
"polish_eq_bench": {
|
85 |
+
"first_eqbench": true,
|
86 |
+
"first_percent_parseable": true,
|
87 |
+
"revised_eqbench": true,
|
88 |
+
"revised_percent_parseable": true,
|
89 |
+
"average_eqbench": true
|
90 |
+
}
|
91 |
+
},
|
92 |
+
"n-samples": {
|
93 |
+
"polish_eq_bench": {
|
94 |
+
"original": 171,
|
95 |
+
"effective": 171
|
96 |
+
}
|
97 |
+
},
|
98 |
+
"config": {
|
99 |
+
"model": "hf",
|
100 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
101 |
+
"batch_size": "1",
|
102 |
+
"batch_sizes": [],
|
103 |
+
"device": "cuda:0",
|
104 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench/",
|
105 |
+
"limit": null,
|
106 |
+
"bootstrap_iters": 100000,
|
107 |
+
"gen_kwargs": null,
|
108 |
+
"random_seed": 0,
|
109 |
+
"numpy_seed": 1234,
|
110 |
+
"torch_seed": 1234,
|
111 |
+
"fewshot_seed": 1234
|
112 |
+
},
|
113 |
+
"git_hash": "2132286",
|
114 |
+
"date": 1723381748.6044407,
|
115 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
116 |
+
"transformers_version": "4.43.1",
|
117 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
118 |
+
"task_hashes": {
|
119 |
+
"polish_eq_bench": "18b3ee14b53fb2aaee4430e37609e64896598b0efa26dc7ecf4e483eece3a6b3"
|
120 |
+
},
|
121 |
+
"model_source": "hf",
|
122 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
123 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
124 |
+
"system_instruction": null,
|
125 |
+
"system_instruction_sha": null,
|
126 |
+
"chat_template": null,
|
127 |
+
"chat_template_sha": null,
|
128 |
+
"start_time": 780407.632073815,
|
129 |
+
"end_time": 783314.157407221,
|
130 |
+
"total_evaluation_time_seconds": "2906.5253334060544"
|
131 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_first_turn_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-21.291842.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_eq_bench_first_turn": {
|
4 |
+
"first_eqbench,none": 46.996619012784315,
|
5 |
+
"first_eqbench_stderr,none": 2.655038142048486,
|
6 |
+
"first_percent_parseable,none": 100.0,
|
7 |
+
"first_percent_parseable_stderr,none": 0.0,
|
8 |
+
"alias": "polish_eq_bench_first_turn"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_eq_bench_first_turn": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_eq_bench_first_turn": {
|
16 |
+
"task": "polish_eq_bench_first_turn",
|
17 |
+
"dataset_path": "speakleash/EQ-Bench-PL-first-turn",
|
18 |
+
"validation_split": "validation",
|
19 |
+
"doc_to_text": "{{prompt}}\nOceny:\n",
|
20 |
+
"doc_to_target": "def doc_to_target(doc):\n reference = eval(doc[\"reference_answer\"])\n\n target = \"\"\n for i in range(1, 5):\n emotion = reference[f\"emotion{i}\"]\n emotion_score = reference[f\"emotion{i}_score\"]\n target += f\"{emotion}: {emotion_score}\\n\"\n target += \"\\n\"\n\n return target\n",
|
21 |
+
"process_results": "def score_first(docs, results):\n first_pass_answers = dict(list(re.findall(r'(\\w+(?: \\w+)*):\\s+(\\d+)', results[0]))[:4])\n reference = eval(docs[\"reference_answer\"])\n first_pass_score = calculate_score(reference, first_pass_answers)\n scores= {'first_'+k: v for k, v in first_pass_score.items()}\n return scores\n",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 0,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "first_eqbench",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "first_percent_parseable",
|
34 |
+
"aggregation": "mean",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"max_gen_toks": 512,
|
41 |
+
"do_sample": false,
|
42 |
+
"temperature": 0.0,
|
43 |
+
"until": [
|
44 |
+
"</s>",
|
45 |
+
"[Koniec odpowiedzi]",
|
46 |
+
"Masz za zadanie"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": false,
|
51 |
+
"metadata": {
|
52 |
+
"version": 2.4
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"versions": {
|
57 |
+
"polish_eq_bench_first_turn": 2.4
|
58 |
+
},
|
59 |
+
"n-shot": {
|
60 |
+
"polish_eq_bench_first_turn": 0
|
61 |
+
},
|
62 |
+
"higher_is_better": {
|
63 |
+
"polish_eq_bench_first_turn": {
|
64 |
+
"first_eqbench": true,
|
65 |
+
"first_percent_parseable": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_eq_bench_first_turn": {
|
70 |
+
"original": 171,
|
71 |
+
"effective": 171
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_first_turn/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381747.7569976,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_eq_bench_first_turn": "0e253a32b5915f6d9cff628bdffb1f234618238d116e0a34217ec48916ba0a49"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 2270424.400063744,
|
105 |
+
"end_time": 2272005.840581135,
|
106 |
+
"total_evaluation_time_seconds": "1581.4405173910782"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-30.374052.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_klej_ner_multiple_choice": {
|
4 |
+
"acc,none": 0.46987366375121475,
|
5 |
+
"acc_stderr,none": 0.011004317088597403,
|
6 |
+
"acc_norm,none": 0.5092322643343051,
|
7 |
+
"acc_norm_stderr,none": 0.011022467118497213,
|
8 |
+
"alias": "polish_klej_ner_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_klej_ner_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_klej_ner_multiple_choice": {
|
16 |
+
"task": "polish_klej_ner_multiple_choice",
|
17 |
+
"dataset_path": "allegro/klej-nkjp-ner",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"fewshot_split": "train",
|
22 |
+
"doc_to_text": "Zdanie: \"{{sentence}}\"\nJakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi: Brak nazwanej jednostki, Nazwa miejsca, Nazwa osoby, Nazwa organizacji, Czas, Nazwa geograficzna.\nRodzaj:",
|
23 |
+
"doc_to_target": "{{{'noEntity': 0, 'placeName': 1, 'persName': 2, 'orgName': 3, 'time': 4, 'geogName': 5}.get(target)}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Brak nazwanej jednostki",
|
26 |
+
"Nazwa miejsca",
|
27 |
+
"Nazwa osoby",
|
28 |
+
"Nazwa organizacji",
|
29 |
+
"Czas",
|
30 |
+
"Nazwa geograficzna"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 0,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"output_type": "multiple_choice",
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": true,
|
51 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"versions": {
|
55 |
+
"polish_klej_ner_multiple_choice": "Yaml"
|
56 |
+
},
|
57 |
+
"n-shot": {
|
58 |
+
"polish_klej_ner_multiple_choice": 0
|
59 |
+
},
|
60 |
+
"higher_is_better": {
|
61 |
+
"polish_klej_ner_multiple_choice": {
|
62 |
+
"acc": true,
|
63 |
+
"acc_norm": true
|
64 |
+
}
|
65 |
+
},
|
66 |
+
"n-samples": {
|
67 |
+
"polish_klej_ner_multiple_choice": {
|
68 |
+
"original": 2058,
|
69 |
+
"effective": 2058
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"config": {
|
73 |
+
"model": "hf",
|
74 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
75 |
+
"batch_size": "1",
|
76 |
+
"batch_sizes": [],
|
77 |
+
"device": "cuda:0",
|
78 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_multiple_choice/",
|
79 |
+
"limit": null,
|
80 |
+
"bootstrap_iters": 100000,
|
81 |
+
"gen_kwargs": null,
|
82 |
+
"random_seed": 0,
|
83 |
+
"numpy_seed": 1234,
|
84 |
+
"torch_seed": 1234,
|
85 |
+
"fewshot_seed": 1234
|
86 |
+
},
|
87 |
+
"git_hash": "2132286",
|
88 |
+
"date": 1723381747.6734633,
|
89 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.98\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
90 |
+
"transformers_version": "4.43.1",
|
91 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
92 |
+
"task_hashes": {
|
93 |
+
"polish_klej_ner_multiple_choice": "09f6e903dc9fc050951f2c84685c285da57a8ddba1ff829ebb489df1ba737161"
|
94 |
+
},
|
95 |
+
"model_source": "hf",
|
96 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
97 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
98 |
+
"system_instruction": null,
|
99 |
+
"system_instruction_sha": null,
|
100 |
+
"chat_template": null,
|
101 |
+
"chat_template_sha": null,
|
102 |
+
"start_time": 2341192.242899954,
|
103 |
+
"end_time": 2341942.977311477,
|
104 |
+
"total_evaluation_time_seconds": "750.7344115232117"
|
105 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-15-22.967823.json
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_klej_ner_regex": {
|
4 |
+
"exact_match,score-first": 0.5388726919339164,
|
5 |
+
"exact_match_stderr,score-first": 0.010990978618734456,
|
6 |
+
"alias": "polish_klej_ner_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_klej_ner_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_klej_ner_regex": {
|
14 |
+
"task": "polish_klej_ner_regex",
|
15 |
+
"dataset_path": "allegro/klej-nkjp-ner",
|
16 |
+
"training_split": "train",
|
17 |
+
"validation_split": "validation",
|
18 |
+
"test_split": "test",
|
19 |
+
"doc_to_text": "Zdanie: \"{{sentence}}\"\nPytanie: Jakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi:\nA - Brak nazwanej jednostki\nB - Nazwa miejsca\nC - Nazwa osoby\nD - Nazwa organizacji\nE - Czas\nF - Nazwa geograficzna\nPrawid艂owa odpowied藕:",
|
20 |
+
"doc_to_target": "{{{'noEntity': 'A', 'placeName': 'B', 'persName': 'C', 'orgName': 'D', 'time': 'E', 'geogName': 'F'}.get(target)}}",
|
21 |
+
"description": "",
|
22 |
+
"target_delimiter": " ",
|
23 |
+
"fewshot_delimiter": "\n\n",
|
24 |
+
"num_fewshot": 0,
|
25 |
+
"metric_list": [
|
26 |
+
{
|
27 |
+
"metric": "exact_match",
|
28 |
+
"aggregation": "mean",
|
29 |
+
"higher_is_better": true
|
30 |
+
}
|
31 |
+
],
|
32 |
+
"output_type": "generate_until",
|
33 |
+
"generation_kwargs": {
|
34 |
+
"until": [
|
35 |
+
".",
|
36 |
+
",",
|
37 |
+
";"
|
38 |
+
],
|
39 |
+
"do_sample": false,
|
40 |
+
"temperature": 0.0,
|
41 |
+
"max_gen_toks": 50
|
42 |
+
},
|
43 |
+
"repeats": 1,
|
44 |
+
"filter_list": [
|
45 |
+
{
|
46 |
+
"name": "score-first",
|
47 |
+
"filter": [
|
48 |
+
{
|
49 |
+
"function": "regex",
|
50 |
+
"regex_pattern": "(\\b[ABCDEF]\\b)"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"function": "take_first"
|
54 |
+
}
|
55 |
+
]
|
56 |
+
}
|
57 |
+
],
|
58 |
+
"should_decontaminate": true,
|
59 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
60 |
+
}
|
61 |
+
},
|
62 |
+
"versions": {
|
63 |
+
"polish_klej_ner_regex": "Yaml"
|
64 |
+
},
|
65 |
+
"n-shot": {
|
66 |
+
"polish_klej_ner_regex": 0
|
67 |
+
},
|
68 |
+
"higher_is_better": {
|
69 |
+
"polish_klej_ner_regex": {
|
70 |
+
"exact_match": true
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"n-samples": {
|
74 |
+
"polish_klej_ner_regex": {
|
75 |
+
"original": 2058,
|
76 |
+
"effective": 2058
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"config": {
|
80 |
+
"model": "hf",
|
81 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
82 |
+
"batch_size": "1",
|
83 |
+
"batch_sizes": [],
|
84 |
+
"device": "cuda:0",
|
85 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_regex/",
|
86 |
+
"limit": null,
|
87 |
+
"bootstrap_iters": 100000,
|
88 |
+
"gen_kwargs": null,
|
89 |
+
"random_seed": 0,
|
90 |
+
"numpy_seed": 1234,
|
91 |
+
"torch_seed": 1234,
|
92 |
+
"fewshot_seed": 1234
|
93 |
+
},
|
94 |
+
"git_hash": "2132286",
|
95 |
+
"date": 1723381748.6294396,
|
96 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
97 |
+
"transformers_version": "4.43.1",
|
98 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
99 |
+
"task_hashes": {
|
100 |
+
"polish_klej_ner_regex": "73b98cc9f2e2b0a3c1be3efc063d3765b29cbbfcadaa6952ff0b16c2aeca4784"
|
101 |
+
},
|
102 |
+
"model_source": "hf",
|
103 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
104 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
105 |
+
"system_instruction": null,
|
106 |
+
"system_instruction_sha": null,
|
107 |
+
"chat_template": null,
|
108 |
+
"chat_template_sha": null,
|
109 |
+
"start_time": 2577223.695462814,
|
110 |
+
"end_time": 2581206.536121368,
|
111 |
+
"total_evaluation_time_seconds": "3982.840658553876"
|
112 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_closed_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-25.200287.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_polqa_closed_book": {
|
4 |
+
"exact_match,none": 0.09034267912772585,
|
5 |
+
"exact_match_stderr,none": 0.009242678703782942,
|
6 |
+
"levenshtein,none": 0.3904465212876428,
|
7 |
+
"levenshtein_stderr,none": "N/A",
|
8 |
+
"alias": "polish_polqa_closed_book"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_polqa_closed_book": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_polqa_closed_book": {
|
16 |
+
"task": "polish_polqa_closed_book",
|
17 |
+
"dataset_path": "ipipan/polqa",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"process_docs": "def process_docs_closed(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and example['question'] not in used and (used.add(example['question']) or True)).map(_helper)\n",
|
22 |
+
"doc_to_text": "Pytanie: {{question}} \n Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "answers",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 0,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
|
36 |
+
"aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
|
37 |
+
"higher_is_better": true
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"output_type": "generate_until",
|
41 |
+
"generation_kwargs": {
|
42 |
+
"until": [
|
43 |
+
"\n",
|
44 |
+
"</s>"
|
45 |
+
],
|
46 |
+
"do_sample": false,
|
47 |
+
"temperature": 0.0,
|
48 |
+
"max_gen_toks": 50
|
49 |
+
},
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{question}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_polqa_closed_book": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_polqa_closed_book": 0
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_polqa_closed_book": {
|
63 |
+
"exact_match": true,
|
64 |
+
"levenshtein": true
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"n-samples": {
|
68 |
+
"polish_polqa_closed_book": {
|
69 |
+
"original": 963,
|
70 |
+
"effective": 963
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"config": {
|
74 |
+
"model": "hf",
|
75 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
76 |
+
"batch_size": "1",
|
77 |
+
"batch_sizes": [],
|
78 |
+
"device": "cuda:0",
|
79 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_closed_book/",
|
80 |
+
"limit": null,
|
81 |
+
"bootstrap_iters": 100000,
|
82 |
+
"gen_kwargs": null,
|
83 |
+
"random_seed": 0,
|
84 |
+
"numpy_seed": 1234,
|
85 |
+
"torch_seed": 1234,
|
86 |
+
"fewshot_seed": 1234
|
87 |
+
},
|
88 |
+
"git_hash": "2132286",
|
89 |
+
"date": 1723381747.7568066,
|
90 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
91 |
+
"transformers_version": "4.43.1",
|
92 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
93 |
+
"task_hashes": {
|
94 |
+
"polish_polqa_closed_book": "0c5507d60ba16e4142471afab656e1a5d591a0227302e05fdfbce5cc9f087079"
|
95 |
+
},
|
96 |
+
"model_source": "hf",
|
97 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
98 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
99 |
+
"system_instruction": null,
|
100 |
+
"system_instruction_sha": null,
|
101 |
+
"chat_template": null,
|
102 |
+
"chat_template_sha": null,
|
103 |
+
"start_time": 2270424.400038904,
|
104 |
+
"end_time": 2271229.748561826,
|
105 |
+
"total_evaluation_time_seconds": "805.3485229220241"
|
106 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-58-53.946082.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_polqa_open_book": {
|
4 |
+
"exact_match,none": 0.23734817813765183,
|
5 |
+
"exact_match_stderr,none": 0.005526353270874367,
|
6 |
+
"levenshtein,none": 0.5875506072874493,
|
7 |
+
"levenshtein_stderr,none": "N/A",
|
8 |
+
"alias": "polish_polqa_open_book"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_polqa_open_book": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_polqa_open_book": {
|
16 |
+
"task": "polish_polqa_open_book",
|
17 |
+
"dataset_path": "ipipan/polqa",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"process_docs": "def process_docs_open(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
|
22 |
+
"doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "answers",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 0,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
|
36 |
+
"aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
|
37 |
+
"higher_is_better": true
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"output_type": "generate_until",
|
41 |
+
"generation_kwargs": {
|
42 |
+
"until": [
|
43 |
+
"\n",
|
44 |
+
"</s>"
|
45 |
+
],
|
46 |
+
"do_sample": false,
|
47 |
+
"temperature": 0.0,
|
48 |
+
"max_gen_toks": 50
|
49 |
+
},
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{passage_text}} {{question}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_polqa_open_book": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_polqa_open_book": 0
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_polqa_open_book": {
|
63 |
+
"exact_match": true,
|
64 |
+
"levenshtein": true
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"n-samples": {
|
68 |
+
"polish_polqa_open_book": {
|
69 |
+
"original": 5928,
|
70 |
+
"effective": 5928
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"config": {
|
74 |
+
"model": "hf",
|
75 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
76 |
+
"batch_size": "1",
|
77 |
+
"batch_sizes": [],
|
78 |
+
"device": "cuda:0",
|
79 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_open_book/",
|
80 |
+
"limit": null,
|
81 |
+
"bootstrap_iters": 100000,
|
82 |
+
"gen_kwargs": null,
|
83 |
+
"random_seed": 0,
|
84 |
+
"numpy_seed": 1234,
|
85 |
+
"torch_seed": 1234,
|
86 |
+
"fewshot_seed": 1234
|
87 |
+
},
|
88 |
+
"git_hash": "2132286",
|
89 |
+
"date": 1723381747.756693,
|
90 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
91 |
+
"transformers_version": "4.43.1",
|
92 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
93 |
+
"task_hashes": {
|
94 |
+
"polish_polqa_open_book": "605bac12835fc2014ee7398cc41fe38316bab9148d464cc12f8034038e6dd744"
|
95 |
+
},
|
96 |
+
"model_source": "hf",
|
97 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
98 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
99 |
+
"system_instruction": null,
|
100 |
+
"system_instruction_sha": null,
|
101 |
+
"chat_template": null,
|
102 |
+
"chat_template_sha": null,
|
103 |
+
"start_time": 2270424.414197628,
|
104 |
+
"end_time": 2273418.492015983,
|
105 |
+
"total_evaluation_time_seconds": "2994.0778183550574"
|
106 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-23-40.740746.json
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_polqa_reranking_multiple_choice": {
|
4 |
+
"acc,none": 0.8055708552993941,
|
5 |
+
"acc_stderr,none": 0.0035107018856493904,
|
6 |
+
"acc_norm,none": 0.8055708552993941,
|
7 |
+
"acc_norm_stderr,none": 0.0035107018856493904,
|
8 |
+
"alias": "polish_polqa_reranking_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_polqa_reranking_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_polqa_reranking_multiple_choice": {
|
16 |
+
"task": "polish_polqa_reranking_multiple_choice",
|
17 |
+
"dataset_path": "ipipan/polqa",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"process_docs": "def process_docs(dataset: datasets.Dataset):\n def _helper(doc):\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
|
22 |
+
"doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Czy kontekst jest relewantny dla pytania? \n Odpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{relevant|int}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Nie",
|
26 |
+
"Tak"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 0,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"output_type": "multiple_choice",
|
45 |
+
"repeats": 1,
|
46 |
+
"should_decontaminate": true,
|
47 |
+
"doc_to_decontamination_query": "{{passage_text}} {{question}}"
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"versions": {
|
51 |
+
"polish_polqa_reranking_multiple_choice": "Yaml"
|
52 |
+
},
|
53 |
+
"n-shot": {
|
54 |
+
"polish_polqa_reranking_multiple_choice": 0
|
55 |
+
},
|
56 |
+
"higher_is_better": {
|
57 |
+
"polish_polqa_reranking_multiple_choice": {
|
58 |
+
"acc": true,
|
59 |
+
"acc_norm": true
|
60 |
+
}
|
61 |
+
},
|
62 |
+
"n-samples": {
|
63 |
+
"polish_polqa_reranking_multiple_choice": {
|
64 |
+
"original": 12709,
|
65 |
+
"effective": 12709
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"config": {
|
69 |
+
"model": "hf",
|
70 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
71 |
+
"batch_size": "1",
|
72 |
+
"batch_sizes": [],
|
73 |
+
"device": "cuda:0",
|
74 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_reranking_multiple_choice/",
|
75 |
+
"limit": null,
|
76 |
+
"bootstrap_iters": 100000,
|
77 |
+
"gen_kwargs": null,
|
78 |
+
"random_seed": 0,
|
79 |
+
"numpy_seed": 1234,
|
80 |
+
"torch_seed": 1234,
|
81 |
+
"fewshot_seed": 1234
|
82 |
+
},
|
83 |
+
"git_hash": "2132286",
|
84 |
+
"date": 1723381747.6733267,
|
85 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.98\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
86 |
+
"transformers_version": "4.43.1",
|
87 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
88 |
+
"task_hashes": {
|
89 |
+
"polish_polqa_reranking_multiple_choice": "284e872060f899232535470a606d94a217d950995b140caeb313a8887ea3f0b4"
|
90 |
+
},
|
91 |
+
"model_source": "hf",
|
92 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
93 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
94 |
+
"system_instruction": null,
|
95 |
+
"system_instruction_sha": null,
|
96 |
+
"chat_template": null,
|
97 |
+
"chat_template_sha": null,
|
98 |
+
"start_time": 2341192.242885584,
|
99 |
+
"end_time": 2342073.337279379,
|
100 |
+
"total_evaluation_time_seconds": "881.0943937948905"
|
101 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_poquad_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-06-52.670471.json
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_poquad_open_book": {
|
4 |
+
"exact_match,none": 0.0,
|
5 |
+
"exact_match_stderr,none": 0.0,
|
6 |
+
"levenshtein,none": 0.18771686328938236,
|
7 |
+
"levenshtein_stderr,none": "N/A",
|
8 |
+
"alias": "polish_poquad_open_book"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_poquad_open_book": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_poquad_open_book": {
|
16 |
+
"task": "polish_poquad_open_book",
|
17 |
+
"dataset_path": "clarin-pl/poquad",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "validation",
|
20 |
+
"doc_to_text": "Tytu艂: {{title}} \n Kontekst: {{context}} \n Pytanie: {{question}} \n Prawid艂owa odpowied藕 (kr贸tki cytat z Kontekstu):",
|
21 |
+
"doc_to_target": "def doc_to_target(doc):\n answer_list = doc[\"answers\"][\"text\"]\n if len(answer_list) > 0:\n answer = answer_list[0]\n else:\n answer = \"bez odpowiedzi\"\n return \" \" + answer\n",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 0,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def levenshtein(predictions, references):\n _prediction = predictions[0].lower().lstrip()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('.? ?(</s>)* ?$', '', _prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower().lstrip())\n if ld < len(reference)/2:\n return 1\n return 0\n",
|
34 |
+
"aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
"\n",
|
42 |
+
"</s>"
|
43 |
+
],
|
44 |
+
"do_sample": false,
|
45 |
+
"temperature": 0.0,
|
46 |
+
"max_gen_toks": 50
|
47 |
+
},
|
48 |
+
"repeats": 1,
|
49 |
+
"should_decontaminate": true,
|
50 |
+
"doc_to_decontamination_query": "{{context}} {{question}}"
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"versions": {
|
54 |
+
"polish_poquad_open_book": "Yaml"
|
55 |
+
},
|
56 |
+
"n-shot": {
|
57 |
+
"polish_poquad_open_book": 0
|
58 |
+
},
|
59 |
+
"higher_is_better": {
|
60 |
+
"polish_poquad_open_book": {
|
61 |
+
"exact_match": true,
|
62 |
+
"levenshtein": true
|
63 |
+
}
|
64 |
+
},
|
65 |
+
"n-samples": {
|
66 |
+
"polish_poquad_open_book": {
|
67 |
+
"original": 5764,
|
68 |
+
"effective": 5764
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"config": {
|
72 |
+
"model": "hf",
|
73 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
74 |
+
"batch_size": "1",
|
75 |
+
"batch_sizes": [],
|
76 |
+
"device": "cuda:0",
|
77 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_poquad_open_book/",
|
78 |
+
"limit": null,
|
79 |
+
"bootstrap_iters": 100000,
|
80 |
+
"gen_kwargs": null,
|
81 |
+
"random_seed": 0,
|
82 |
+
"numpy_seed": 1234,
|
83 |
+
"torch_seed": 1234,
|
84 |
+
"fewshot_seed": 1234
|
85 |
+
},
|
86 |
+
"git_hash": "2132286",
|
87 |
+
"date": 1723381747.7568138,
|
88 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
89 |
+
"transformers_version": "4.43.1",
|
90 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
91 |
+
"task_hashes": {
|
92 |
+
"polish_poquad_open_book": "19564a782a5615c456e7084c72f26ca5fb6bc601f54dc4bfc5dec174c4e06e50"
|
93 |
+
},
|
94 |
+
"model_source": "hf",
|
95 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
96 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
97 |
+
"system_instruction": null,
|
98 |
+
"system_instruction_sha": null,
|
99 |
+
"chat_template": null,
|
100 |
+
"chat_template_sha": null,
|
101 |
+
"start_time": 2270424.41309478,
|
102 |
+
"end_time": 2273897.216942648,
|
103 |
+
"total_evaluation_time_seconds": "3472.8038478679955"
|
104 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-02.213869.json
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_ppc_multiple_choice": {
|
4 |
+
"acc,none": 0.74,
|
5 |
+
"acc_stderr,none": 0.013877773329774164,
|
6 |
+
"acc_norm,none": 0.74,
|
7 |
+
"acc_norm_stderr,none": 0.013877773329774164,
|
8 |
+
"alias": "polish_ppc_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_ppc_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_ppc_multiple_choice": {
|
16 |
+
"task": "polish_ppc_multiple_choice",
|
17 |
+
"dataset_path": "sdadas/ppc",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy zdaniami A i B? Mo偶liwe odpowiedzi:\nA - znacz膮 dok艂adnie to samo\nB - maj膮 podobne znaczenie\nC - maj膮 r贸偶ne znaczenie\nPrawid艂owa odpowied藕:",
|
22 |
+
"doc_to_target": "{{label|int - 1}}",
|
23 |
+
"doc_to_choice": [
|
24 |
+
"A",
|
25 |
+
"B",
|
26 |
+
"C"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 0,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"output_type": "multiple_choice",
|
45 |
+
"repeats": 1,
|
46 |
+
"should_decontaminate": true,
|
47 |
+
"doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"versions": {
|
51 |
+
"polish_ppc_multiple_choice": "Yaml"
|
52 |
+
},
|
53 |
+
"n-shot": {
|
54 |
+
"polish_ppc_multiple_choice": 0
|
55 |
+
},
|
56 |
+
"higher_is_better": {
|
57 |
+
"polish_ppc_multiple_choice": {
|
58 |
+
"acc": true,
|
59 |
+
"acc_norm": true
|
60 |
+
}
|
61 |
+
},
|
62 |
+
"n-samples": {
|
63 |
+
"polish_ppc_multiple_choice": {
|
64 |
+
"original": 1000,
|
65 |
+
"effective": 1000
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"config": {
|
69 |
+
"model": "hf",
|
70 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
71 |
+
"batch_size": "1",
|
72 |
+
"batch_sizes": [],
|
73 |
+
"device": "cuda:0",
|
74 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_multiple_choice/",
|
75 |
+
"limit": null,
|
76 |
+
"bootstrap_iters": 100000,
|
77 |
+
"gen_kwargs": null,
|
78 |
+
"random_seed": 0,
|
79 |
+
"numpy_seed": 1234,
|
80 |
+
"torch_seed": 1234,
|
81 |
+
"fewshot_seed": 1234
|
82 |
+
},
|
83 |
+
"git_hash": "2132286",
|
84 |
+
"date": 1723381748.8964221,
|
85 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
86 |
+
"transformers_version": "4.43.1",
|
87 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
88 |
+
"task_hashes": {
|
89 |
+
"polish_ppc_multiple_choice": "8747fd84df6316b9938ebd5eafb0d8cedcf82a0c184659a0eea7878c4dacdafa"
|
90 |
+
},
|
91 |
+
"model_source": "hf",
|
92 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
93 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
94 |
+
"system_instruction": null,
|
95 |
+
"system_instruction_sha": null,
|
96 |
+
"chat_template": null,
|
97 |
+
"chat_template_sha": null,
|
98 |
+
"start_time": 2669732.988217134,
|
99 |
+
"end_time": 2669974.769807927,
|
100 |
+
"total_evaluation_time_seconds": "241.78159079281613"
|
101 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-08.270834.json
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_ppc_regex": {
|
4 |
+
"exact_match,score-first": 0.703,
|
5 |
+
"exact_match_stderr,score-first": 0.014456832294801106,
|
6 |
+
"alias": "polish_ppc_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_ppc_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_ppc_regex": {
|
14 |
+
"task": "polish_ppc_regex",
|
15 |
+
"dataset_path": "sdadas/ppc",
|
16 |
+
"training_split": "train",
|
17 |
+
"validation_split": "validation",
|
18 |
+
"test_split": "test",
|
19 |
+
"doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy zdaniami A i B? Mo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - znacz膮 dok艂adnie to samo\nC - maj膮 podobne znaczenie\nD - maj膮 r贸偶ne znaczenie\nPrawid艂owa odpowied藕:",
|
20 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(label|int)}}",
|
21 |
+
"description": "",
|
22 |
+
"target_delimiter": " ",
|
23 |
+
"fewshot_delimiter": "\n\n",
|
24 |
+
"num_fewshot": 0,
|
25 |
+
"metric_list": [
|
26 |
+
{
|
27 |
+
"metric": "exact_match",
|
28 |
+
"aggregation": "mean",
|
29 |
+
"higher_is_better": true
|
30 |
+
}
|
31 |
+
],
|
32 |
+
"output_type": "generate_until",
|
33 |
+
"generation_kwargs": {
|
34 |
+
"until": [
|
35 |
+
".",
|
36 |
+
","
|
37 |
+
],
|
38 |
+
"do_sample": false,
|
39 |
+
"temperature": 0.0,
|
40 |
+
"max_gen_toks": 50
|
41 |
+
},
|
42 |
+
"repeats": 1,
|
43 |
+
"filter_list": [
|
44 |
+
{
|
45 |
+
"name": "score-first",
|
46 |
+
"filter": [
|
47 |
+
{
|
48 |
+
"function": "regex",
|
49 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"function": "take_first"
|
53 |
+
}
|
54 |
+
]
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"should_decontaminate": true,
|
58 |
+
"doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"versions": {
|
62 |
+
"polish_ppc_regex": "Yaml"
|
63 |
+
},
|
64 |
+
"n-shot": {
|
65 |
+
"polish_ppc_regex": 0
|
66 |
+
},
|
67 |
+
"higher_is_better": {
|
68 |
+
"polish_ppc_regex": {
|
69 |
+
"exact_match": true
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"n-samples": {
|
73 |
+
"polish_ppc_regex": {
|
74 |
+
"original": 1000,
|
75 |
+
"effective": 1000
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"config": {
|
79 |
+
"model": "hf",
|
80 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
81 |
+
"batch_size": "1",
|
82 |
+
"batch_sizes": [],
|
83 |
+
"device": "cuda:0",
|
84 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_regex/",
|
85 |
+
"limit": null,
|
86 |
+
"bootstrap_iters": 100000,
|
87 |
+
"gen_kwargs": null,
|
88 |
+
"random_seed": 0,
|
89 |
+
"numpy_seed": 1234,
|
90 |
+
"torch_seed": 1234,
|
91 |
+
"fewshot_seed": 1234
|
92 |
+
},
|
93 |
+
"git_hash": "2132286",
|
94 |
+
"date": 1723381748.6299686,
|
95 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
96 |
+
"transformers_version": "4.43.1",
|
97 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
98 |
+
"task_hashes": {
|
99 |
+
"polish_ppc_regex": "55067cab325af5b68fb5e678581d4bfe4d45a600032d75381bf251f3aa9d8c91"
|
100 |
+
},
|
101 |
+
"model_source": "hf",
|
102 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
103 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
104 |
+
"system_instruction": null,
|
105 |
+
"system_instruction_sha": null,
|
106 |
+
"chat_template": null,
|
107 |
+
"chat_template_sha": null,
|
108 |
+
"start_time": 2577223.665428845,
|
109 |
+
"end_time": 2579331.839649978,
|
110 |
+
"total_evaluation_time_seconds": "2108.174221132882"
|
111 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-11-55.449227.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_psc_multiple_choice": {
|
4 |
+
"acc,none": 0.9656771799628943,
|
5 |
+
"acc_stderr,none": 0.005547529422575579,
|
6 |
+
"f1,none": 0.9428129829984544,
|
7 |
+
"f1_stderr,none": "N/A",
|
8 |
+
"acc_norm,none": 0.9656771799628943,
|
9 |
+
"acc_norm_stderr,none": 0.005547529422575579,
|
10 |
+
"alias": "polish_psc_multiple_choice"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"group_subtasks": {
|
14 |
+
"polish_psc_multiple_choice": []
|
15 |
+
},
|
16 |
+
"configs": {
|
17 |
+
"polish_psc_multiple_choice": {
|
18 |
+
"task": "polish_psc_multiple_choice",
|
19 |
+
"dataset_path": "allegro/klej-psc",
|
20 |
+
"training_split": "train",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Tekst: \"{{extract_text}}\"\nPodsumowanie: \"{{summary_text}}\"\nPytanie: Czy podsumowanie dla podanego tekstu jest poprawne?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{label|int}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Nie",
|
26 |
+
"Tak"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 0,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
45 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
46 |
+
"higher_is_better": true
|
47 |
+
}
|
48 |
+
],
|
49 |
+
"output_type": "multiple_choice",
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_psc_multiple_choice": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_psc_multiple_choice": 0
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_psc_multiple_choice": {
|
63 |
+
"acc": true,
|
64 |
+
"acc_norm": true,
|
65 |
+
"f1": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_psc_multiple_choice": {
|
70 |
+
"original": 1078,
|
71 |
+
"effective": 1078
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_multiple_choice/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381748.8966577,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_psc_multiple_choice": "53dfd060110a8ece3c4bf785c368bbf87ac6ae87f1d76781f1ffac90beb47879"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 2669732.988093483,
|
105 |
+
"end_time": 2669908.004738389,
|
106 |
+
"total_evaluation_time_seconds": "175.01664490625262"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-13.495944.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_psc_regex": {
|
4 |
+
"exact_match,score-first": 0.7588126159554731,
|
5 |
+
"exact_match_stderr,score-first": 0.01303577072183474,
|
6 |
+
"f1,score-first": 0.8123167155425219,
|
7 |
+
"f1_stderr,score-first": "N/A",
|
8 |
+
"alias": "polish_psc_regex"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_psc_regex": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_psc_regex": {
|
16 |
+
"task": "polish_psc_regex",
|
17 |
+
"dataset_path": "allegro/klej-psc",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"doc_to_text": "Fragment 1: \"{{extract_text}}\"\nFragment 2: \"{{summary_text}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy fragmentami 1 i 2?\nMo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - dotycz膮 tego samego artyku艂u\nC - dotycz膮 r贸偶nych artyku艂贸w\nD - brak poprawnej odpowiedzi\nPrawid艂owa odpowied藕:",
|
21 |
+
"doc_to_target": "{{{0: 'A', 1: 'C', 2: 'B', 3: 'D'}.get(label|int + 1)}}",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 0,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
34 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
".",
|
42 |
+
","
|
43 |
+
],
|
44 |
+
"do_sample": false,
|
45 |
+
"temperature": 0.0,
|
46 |
+
"max_gen_toks": 50
|
47 |
+
},
|
48 |
+
"repeats": 1,
|
49 |
+
"filter_list": [
|
50 |
+
{
|
51 |
+
"name": "score-first",
|
52 |
+
"filter": [
|
53 |
+
{
|
54 |
+
"function": "regex",
|
55 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"function": "take_first"
|
59 |
+
}
|
60 |
+
]
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"should_decontaminate": true,
|
64 |
+
"doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"versions": {
|
68 |
+
"polish_psc_regex": "Yaml"
|
69 |
+
},
|
70 |
+
"n-shot": {
|
71 |
+
"polish_psc_regex": 0
|
72 |
+
},
|
73 |
+
"higher_is_better": {
|
74 |
+
"polish_psc_regex": {
|
75 |
+
"exact_match": true,
|
76 |
+
"f1": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polish_psc_regex": {
|
81 |
+
"original": 1078,
|
82 |
+
"effective": 1078
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_regex/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.6296773,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polish_psc_regex": "0065cab6bd75fa16d7b0b782973d6452c76ac6f272bfcb4e037049c1d2a420a5"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2577223.665712506,
|
116 |
+
"end_time": 2579697.064617748,
|
117 |
+
"total_evaluation_time_seconds": "2473.398905241862"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-25-19.492688.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_in": {
|
4 |
+
"exact_match,score-first": 0.8559556786703602,
|
5 |
+
"exact_match_stderr,score-first": 0.01307693837899346,
|
6 |
+
"alias": "polemo2_in"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polemo2_in": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polemo2_in": {
|
14 |
+
"task": "polemo2_in",
|
15 |
+
"group": [
|
16 |
+
"polemo2"
|
17 |
+
],
|
18 |
+
"dataset_path": "allegro/klej-polemo2-in",
|
19 |
+
"training_split": "train",
|
20 |
+
"validation_split": "validation",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii. Mo偶liwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 5,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true,
|
33 |
+
"hf_evaluate": true
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"output_type": "generate_until",
|
37 |
+
"generation_kwargs": {
|
38 |
+
"until": [
|
39 |
+
".",
|
40 |
+
","
|
41 |
+
],
|
42 |
+
"do_sample": false,
|
43 |
+
"temperature": 0.0,
|
44 |
+
"max_gen_toks": 50
|
45 |
+
},
|
46 |
+
"repeats": 1,
|
47 |
+
"filter_list": [
|
48 |
+
{
|
49 |
+
"name": "score-first",
|
50 |
+
"filter": [
|
51 |
+
{
|
52 |
+
"function": "regex",
|
53 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"function": "take_first"
|
57 |
+
}
|
58 |
+
]
|
59 |
+
}
|
60 |
+
],
|
61 |
+
"should_decontaminate": true,
|
62 |
+
"doc_to_decontamination_query": "{{sentence}}",
|
63 |
+
"metadata": {
|
64 |
+
"version": 1.0
|
65 |
+
}
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"versions": {
|
69 |
+
"polemo2_in": 1.0
|
70 |
+
},
|
71 |
+
"n-shot": {
|
72 |
+
"polemo2_in": 5
|
73 |
+
},
|
74 |
+
"higher_is_better": {
|
75 |
+
"polemo2_in": {
|
76 |
+
"exact_match": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polemo2_in": {
|
81 |
+
"original": 722,
|
82 |
+
"effective": 722
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.8777437,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polemo2_in": "311cf476a99939086a838a34ed5ebef9530cbeea1609d0919757a7dd473b40d1"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2272911.893989815,
|
116 |
+
"end_time": 2273890.992245757,
|
117 |
+
"total_evaluation_time_seconds": "979.09825594211"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-24-50.869505.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_in_multiple_choice": {
|
4 |
+
"acc,none": 0.871191135734072,
|
5 |
+
"acc_stderr,none": 0.012475615091746169,
|
6 |
+
"acc_norm,none": 0.8725761772853186,
|
7 |
+
"acc_norm_stderr,none": 0.012418220256560223,
|
8 |
+
"alias": "polemo2_in_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polemo2_in_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polemo2_in_multiple_choice": {
|
16 |
+
"task": "polemo2_in_multiple_choice",
|
17 |
+
"group": [
|
18 |
+
"polemo2_mc"
|
19 |
+
],
|
20 |
+
"dataset_path": "allegro/klej-polemo2-in",
|
21 |
+
"training_split": "train",
|
22 |
+
"validation_split": "validation",
|
23 |
+
"test_split": "test",
|
24 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
|
25 |
+
"doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
|
26 |
+
"doc_to_choice": [
|
27 |
+
"Neutralny",
|
28 |
+
"Negatywny",
|
29 |
+
"Pozytywny",
|
30 |
+
"Niejednoznaczny"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 5,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"output_type": "multiple_choice",
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": true,
|
51 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"versions": {
|
55 |
+
"polemo2_in_multiple_choice": "Yaml"
|
56 |
+
},
|
57 |
+
"n-shot": {
|
58 |
+
"polemo2_in_multiple_choice": 5
|
59 |
+
},
|
60 |
+
"higher_is_better": {
|
61 |
+
"polemo2_in_multiple_choice": {
|
62 |
+
"acc": true,
|
63 |
+
"acc_norm": true
|
64 |
+
}
|
65 |
+
},
|
66 |
+
"n-samples": {
|
67 |
+
"polemo2_in_multiple_choice": {
|
68 |
+
"original": 722,
|
69 |
+
"effective": 722
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"config": {
|
73 |
+
"model": "hf",
|
74 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
75 |
+
"batch_size": "1",
|
76 |
+
"batch_sizes": [],
|
77 |
+
"device": "cuda:0",
|
78 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_multiple_choice/",
|
79 |
+
"limit": null,
|
80 |
+
"bootstrap_iters": 100000,
|
81 |
+
"gen_kwargs": null,
|
82 |
+
"random_seed": 0,
|
83 |
+
"numpy_seed": 1234,
|
84 |
+
"torch_seed": 1234,
|
85 |
+
"fewshot_seed": 1234
|
86 |
+
},
|
87 |
+
"git_hash": "2132286",
|
88 |
+
"date": 1723381734.9804056,
|
89 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
90 |
+
"transformers_version": "4.43.1",
|
91 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
92 |
+
"task_hashes": {
|
93 |
+
"polemo2_in_multiple_choice": "721bf5bd2111822d757513497aaacb13ff7172a1c79e8d903e554ae7db248670"
|
94 |
+
},
|
95 |
+
"model_source": "hf",
|
96 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
97 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
98 |
+
"system_instruction": null,
|
99 |
+
"system_instruction_sha": null,
|
100 |
+
"chat_template": null,
|
101 |
+
"chat_template_sha": null,
|
102 |
+
"start_time": 2270387.730317351,
|
103 |
+
"end_time": 2271351.381778121,
|
104 |
+
"total_evaluation_time_seconds": "963.6514607700519"
|
105 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-20.849828.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_out": {
|
4 |
+
"exact_match,score-first": 0.7550607287449392,
|
5 |
+
"exact_match_stderr,score-first": 0.01936853142177567,
|
6 |
+
"alias": "polemo2_out"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polemo2_out": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polemo2_out": {
|
14 |
+
"task": "polemo2_out",
|
15 |
+
"group": [
|
16 |
+
"polemo2"
|
17 |
+
],
|
18 |
+
"dataset_path": "allegro/klej-polemo2-out",
|
19 |
+
"training_split": "train",
|
20 |
+
"validation_split": "validation",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii. Mo偶liwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 5,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true,
|
33 |
+
"hf_evaluate": true
|
34 |
+
}
|
35 |
+
],
|
36 |
+
"output_type": "generate_until",
|
37 |
+
"generation_kwargs": {
|
38 |
+
"until": [
|
39 |
+
".",
|
40 |
+
","
|
41 |
+
],
|
42 |
+
"do_sample": false,
|
43 |
+
"temperature": 0.0,
|
44 |
+
"max_gen_toks": 50
|
45 |
+
},
|
46 |
+
"repeats": 1,
|
47 |
+
"filter_list": [
|
48 |
+
{
|
49 |
+
"name": "score-first",
|
50 |
+
"filter": [
|
51 |
+
{
|
52 |
+
"function": "regex",
|
53 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
54 |
+
},
|
55 |
+
{
|
56 |
+
"function": "take_first"
|
57 |
+
}
|
58 |
+
]
|
59 |
+
}
|
60 |
+
],
|
61 |
+
"should_decontaminate": true,
|
62 |
+
"doc_to_decontamination_query": "{{sentence}}",
|
63 |
+
"metadata": {
|
64 |
+
"version": 1.0
|
65 |
+
}
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"versions": {
|
69 |
+
"polemo2_out": 1.0
|
70 |
+
},
|
71 |
+
"n-shot": {
|
72 |
+
"polemo2_out": 5
|
73 |
+
},
|
74 |
+
"higher_is_better": {
|
75 |
+
"polemo2_out": {
|
76 |
+
"exact_match": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polemo2_out": {
|
81 |
+
"original": 494,
|
82 |
+
"effective": 494
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.8777425,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polemo2_out": "f4c38529c6c2d9871f34d315f5afa8b183cba25e628c029de45011230d53fac1"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2272911.893717436,
|
116 |
+
"end_time": 2273712.349220811,
|
117 |
+
"total_evaluation_time_seconds": "800.4555033748038"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-19-39.147509.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polemo2_out_multiple_choice": {
|
4 |
+
"acc,none": 0.7753036437246964,
|
5 |
+
"acc_stderr,none": 0.018797949035330906,
|
6 |
+
"acc_norm,none": 0.7854251012145749,
|
7 |
+
"acc_norm_stderr,none": 0.01848921134882508,
|
8 |
+
"alias": "polemo2_out_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polemo2_out_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polemo2_out_multiple_choice": {
|
16 |
+
"task": "polemo2_out_multiple_choice",
|
17 |
+
"group": [
|
18 |
+
"polemo2_mc"
|
19 |
+
],
|
20 |
+
"dataset_path": "allegro/klej-polemo2-out",
|
21 |
+
"training_split": "train",
|
22 |
+
"validation_split": "validation",
|
23 |
+
"test_split": "test",
|
24 |
+
"doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
|
25 |
+
"doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
|
26 |
+
"doc_to_choice": [
|
27 |
+
"Neutralny",
|
28 |
+
"Negatywny",
|
29 |
+
"Pozytywny",
|
30 |
+
"Niejednoznaczny"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 5,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"output_type": "multiple_choice",
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": true,
|
51 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"versions": {
|
55 |
+
"polemo2_out_multiple_choice": "Yaml"
|
56 |
+
},
|
57 |
+
"n-shot": {
|
58 |
+
"polemo2_out_multiple_choice": 5
|
59 |
+
},
|
60 |
+
"higher_is_better": {
|
61 |
+
"polemo2_out_multiple_choice": {
|
62 |
+
"acc": true,
|
63 |
+
"acc_norm": true
|
64 |
+
}
|
65 |
+
},
|
66 |
+
"n-samples": {
|
67 |
+
"polemo2_out_multiple_choice": {
|
68 |
+
"original": 494,
|
69 |
+
"effective": 494
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"config": {
|
73 |
+
"model": "hf",
|
74 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
75 |
+
"batch_size": "1",
|
76 |
+
"batch_sizes": [],
|
77 |
+
"device": "cuda:0",
|
78 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_multiple_choice/",
|
79 |
+
"limit": null,
|
80 |
+
"bootstrap_iters": 100000,
|
81 |
+
"gen_kwargs": null,
|
82 |
+
"random_seed": 0,
|
83 |
+
"numpy_seed": 1234,
|
84 |
+
"torch_seed": 1234,
|
85 |
+
"fewshot_seed": 1234
|
86 |
+
},
|
87 |
+
"git_hash": "2132286",
|
88 |
+
"date": 1723381734.9805498,
|
89 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
90 |
+
"transformers_version": "4.43.1",
|
91 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
92 |
+
"task_hashes": {
|
93 |
+
"polemo2_out_multiple_choice": "45b774f8cfb07b51343dc4aba756739ac8f3ad9410eae31ce9abcab2243c33c6"
|
94 |
+
},
|
95 |
+
"model_source": "hf",
|
96 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
97 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
98 |
+
"system_instruction": null,
|
99 |
+
"system_instruction_sha": null,
|
100 |
+
"chat_template": null,
|
101 |
+
"chat_template_sha": null,
|
102 |
+
"start_time": 2270387.730003106,
|
103 |
+
"end_time": 2271039.659552081,
|
104 |
+
"total_evaluation_time_seconds": "651.9295489750803"
|
105 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-53-51.017953.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_8tags_multiple_choice": {
|
4 |
+
"acc,none": 0.7936870997255261,
|
5 |
+
"acc_stderr,none": 0.006120648645628871,
|
6 |
+
"acc_norm,none": 0.7881976212259836,
|
7 |
+
"acc_norm_stderr,none": 0.0061800583187814695,
|
8 |
+
"alias": "polish_8tags_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_8tags_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_8tags_multiple_choice": {
|
16 |
+
"task": "polish_8tags_multiple_choice",
|
17 |
+
"dataset_path": "sdadas/8tags",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"fewshot_split": "train",
|
21 |
+
"doc_to_text": "Tytu艂: \"{{sentence}}\"\nDo podanego tytu艂u przyporz膮dkuj jedn膮 najlepiej pasuj膮c膮 kategori臋 z podanych: Film, Historia, Jedzenie, Medycyna, Motoryzacja, Praca, Sport, Technologie.\nKategoria:",
|
22 |
+
"doc_to_target": "{{label|int}}",
|
23 |
+
"doc_to_choice": [
|
24 |
+
"Film",
|
25 |
+
"Historia",
|
26 |
+
"Jedzenie",
|
27 |
+
"Medycyna",
|
28 |
+
"Motoryzacja",
|
29 |
+
"Praca",
|
30 |
+
"Sport",
|
31 |
+
"Technologie"
|
32 |
+
],
|
33 |
+
"description": "",
|
34 |
+
"target_delimiter": " ",
|
35 |
+
"fewshot_delimiter": "\n\n",
|
36 |
+
"num_fewshot": 5,
|
37 |
+
"metric_list": [
|
38 |
+
{
|
39 |
+
"metric": "acc",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "acc_norm",
|
45 |
+
"aggregation": "mean",
|
46 |
+
"higher_is_better": true
|
47 |
+
}
|
48 |
+
],
|
49 |
+
"output_type": "multiple_choice",
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_8tags_multiple_choice": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_8tags_multiple_choice": 5
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_8tags_multiple_choice": {
|
63 |
+
"acc": true,
|
64 |
+
"acc_norm": true
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"n-samples": {
|
68 |
+
"polish_8tags_multiple_choice": {
|
69 |
+
"original": 4372,
|
70 |
+
"effective": 4372
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"config": {
|
74 |
+
"model": "hf",
|
75 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
76 |
+
"batch_size": "1",
|
77 |
+
"batch_sizes": [],
|
78 |
+
"device": "cuda:0",
|
79 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_multiple_choice/",
|
80 |
+
"limit": null,
|
81 |
+
"bootstrap_iters": 100000,
|
82 |
+
"gen_kwargs": null,
|
83 |
+
"random_seed": 0,
|
84 |
+
"numpy_seed": 1234,
|
85 |
+
"torch_seed": 1234,
|
86 |
+
"fewshot_seed": 1234
|
87 |
+
},
|
88 |
+
"git_hash": "2132286",
|
89 |
+
"date": 1723381736.832911,
|
90 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
91 |
+
"transformers_version": "4.43.1",
|
92 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
93 |
+
"task_hashes": {
|
94 |
+
"polish_8tags_multiple_choice": "73f7a912bc6b67622aaf742339f1fd7d8c602e2bba1d366f9084ffdcd115da22"
|
95 |
+
},
|
96 |
+
"model_source": "hf",
|
97 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
98 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
99 |
+
"system_instruction": null,
|
100 |
+
"system_instruction_sha": null,
|
101 |
+
"chat_template": null,
|
102 |
+
"chat_template_sha": null,
|
103 |
+
"start_time": 779444.242893573,
|
104 |
+
"end_time": 782146.665026425,
|
105 |
+
"total_evaluation_time_seconds": "2702.4221328520216"
|
106 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-01-59.819478.json
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_8tags_regex": {
|
4 |
+
"exact_match,score-first": 0.780192131747484,
|
5 |
+
"exact_match_stderr,score-first": 0.006263715115123265,
|
6 |
+
"alias": "polish_8tags_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_8tags_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_8tags_regex": {
|
14 |
+
"task": "polish_8tags_regex",
|
15 |
+
"dataset_path": "sdadas/8tags",
|
16 |
+
"training_split": "train",
|
17 |
+
"validation_split": "validation",
|
18 |
+
"test_split": "test",
|
19 |
+
"doc_to_text": "Tytu艂: \"{{sentence}}\"\nPytanie: jaka kategoria najlepiej pasuje do podanego tytu艂u?\nMo偶liwe odpowiedzi:\nA - film\nB - historia\nC - jedzenie\nD - medycyna\nE - motoryzacja\nF - praca\nG - sport\nH - technologie\nPrawid艂owa odpowied藕:",
|
20 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H'}.get(label)}}",
|
21 |
+
"description": "",
|
22 |
+
"target_delimiter": " ",
|
23 |
+
"fewshot_delimiter": "\n\n",
|
24 |
+
"num_fewshot": 5,
|
25 |
+
"metric_list": [
|
26 |
+
{
|
27 |
+
"metric": "exact_match",
|
28 |
+
"aggregation": "mean",
|
29 |
+
"higher_is_better": true
|
30 |
+
}
|
31 |
+
],
|
32 |
+
"output_type": "generate_until",
|
33 |
+
"generation_kwargs": {
|
34 |
+
"until": [
|
35 |
+
".",
|
36 |
+
","
|
37 |
+
],
|
38 |
+
"do_sample": false,
|
39 |
+
"temperature": 0.0,
|
40 |
+
"max_gen_toks": 50
|
41 |
+
},
|
42 |
+
"repeats": 1,
|
43 |
+
"filter_list": [
|
44 |
+
{
|
45 |
+
"name": "score-first",
|
46 |
+
"filter": [
|
47 |
+
{
|
48 |
+
"function": "regex",
|
49 |
+
"regex_pattern": "(\\b[ABCDEFGH]\\b)"
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"function": "take_first"
|
53 |
+
}
|
54 |
+
]
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"should_decontaminate": true,
|
58 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"versions": {
|
62 |
+
"polish_8tags_regex": "Yaml"
|
63 |
+
},
|
64 |
+
"n-shot": {
|
65 |
+
"polish_8tags_regex": 5
|
66 |
+
},
|
67 |
+
"higher_is_better": {
|
68 |
+
"polish_8tags_regex": {
|
69 |
+
"exact_match": true
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"n-samples": {
|
73 |
+
"polish_8tags_regex": {
|
74 |
+
"original": 4372,
|
75 |
+
"effective": 4372
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"config": {
|
79 |
+
"model": "hf",
|
80 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
81 |
+
"batch_size": "1",
|
82 |
+
"batch_sizes": [],
|
83 |
+
"device": "cuda:0",
|
84 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_regex/",
|
85 |
+
"limit": null,
|
86 |
+
"bootstrap_iters": 100000,
|
87 |
+
"gen_kwargs": null,
|
88 |
+
"random_seed": 0,
|
89 |
+
"numpy_seed": 1234,
|
90 |
+
"torch_seed": 1234,
|
91 |
+
"fewshot_seed": 1234
|
92 |
+
},
|
93 |
+
"git_hash": "2132286",
|
94 |
+
"date": 1723381748.877596,
|
95 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
96 |
+
"transformers_version": "4.43.1",
|
97 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
98 |
+
"task_hashes": {
|
99 |
+
"polish_8tags_regex": "db46138093af0d6032c98a8689c46f46e11c222dafe4ae0444f5c2f86b97dde9"
|
100 |
+
},
|
101 |
+
"model_source": "hf",
|
102 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
103 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
104 |
+
"system_instruction": null,
|
105 |
+
"system_instruction_sha": null,
|
106 |
+
"chat_template": null,
|
107 |
+
"chat_template_sha": null,
|
108 |
+
"start_time": 2272911.894059325,
|
109 |
+
"end_time": 2279691.316861562,
|
110 |
+
"total_evaluation_time_seconds": "6779.4228022368625"
|
111 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_mc_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-54.278792.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_belebele_mc": {
|
4 |
+
"acc,none": 0.8855555555555555,
|
5 |
+
"acc_stderr,none": 0.010617576963634284,
|
6 |
+
"acc_norm,none": 0.8855555555555555,
|
7 |
+
"acc_norm_stderr,none": 0.010617576963634284,
|
8 |
+
"alias": "polish_belebele_mc"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_belebele_mc": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_belebele_mc": {
|
16 |
+
"task": "polish_belebele_mc",
|
17 |
+
"dataset_path": "facebook/belebele",
|
18 |
+
"test_split": "pol_Latn",
|
19 |
+
"fewshot_split": "pol_Latn",
|
20 |
+
"doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMo偶liwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawid艂owa odpowied藕:",
|
21 |
+
"doc_to_target": "{{['1', '2', '3', '4'].index(correct_answer_num)}}",
|
22 |
+
"doc_to_choice": [
|
23 |
+
"A",
|
24 |
+
"B",
|
25 |
+
"C",
|
26 |
+
"D"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"fewshot_config": {
|
32 |
+
"sampler": "first_n"
|
33 |
+
},
|
34 |
+
"num_fewshot": 5,
|
35 |
+
"metric_list": [
|
36 |
+
{
|
37 |
+
"metric": "acc",
|
38 |
+
"aggregation": "mean",
|
39 |
+
"higher_is_better": true
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"metric": "acc_norm",
|
43 |
+
"aggregation": "mean",
|
44 |
+
"higher_is_better": true
|
45 |
+
}
|
46 |
+
],
|
47 |
+
"output_type": "multiple_choice",
|
48 |
+
"repeats": 1,
|
49 |
+
"should_decontaminate": true,
|
50 |
+
"doc_to_decontamination_query": "{{question}}",
|
51 |
+
"metadata": {
|
52 |
+
"version": 0.0
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"versions": {
|
57 |
+
"polish_belebele_mc": 0.0
|
58 |
+
},
|
59 |
+
"n-shot": {
|
60 |
+
"polish_belebele_mc": 5
|
61 |
+
},
|
62 |
+
"higher_is_better": {
|
63 |
+
"polish_belebele_mc": {
|
64 |
+
"acc": true,
|
65 |
+
"acc_norm": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_belebele_mc": {
|
70 |
+
"original": 900,
|
71 |
+
"effective": 900
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_mc/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381736.8325982,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_belebele_mc": "3617d71c141947146b1331680272d92dc45753002d91f496be692e189d2c3338"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 779444.243477263,
|
105 |
+
"end_time": 779869.928050127,
|
106 |
+
"total_evaluation_time_seconds": "425.6845728639746"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-23.654679.json
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_belebele_regex": {
|
4 |
+
"exact_match,score-first": 0.8888888888888888,
|
5 |
+
"exact_match_stderr,score-first": 0.010481480680812841,
|
6 |
+
"alias": "polish_belebele_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_belebele_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_belebele_regex": {
|
14 |
+
"task": "polish_belebele_regex",
|
15 |
+
"dataset_path": "facebook/belebele",
|
16 |
+
"test_split": "pol_Latn",
|
17 |
+
"doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMo偶liwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawid艂owa odpowied藕:",
|
18 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(correct_answer_num|int - 1)}}",
|
19 |
+
"description": "",
|
20 |
+
"target_delimiter": " ",
|
21 |
+
"fewshot_delimiter": "\n\n",
|
22 |
+
"num_fewshot": 5,
|
23 |
+
"metric_list": [
|
24 |
+
{
|
25 |
+
"metric": "exact_match",
|
26 |
+
"aggregation": "mean",
|
27 |
+
"higher_is_better": true
|
28 |
+
}
|
29 |
+
],
|
30 |
+
"output_type": "generate_until",
|
31 |
+
"generation_kwargs": {
|
32 |
+
"until": [
|
33 |
+
".",
|
34 |
+
","
|
35 |
+
],
|
36 |
+
"do_sample": false,
|
37 |
+
"temperature": 0.0,
|
38 |
+
"max_gen_toks": 50
|
39 |
+
},
|
40 |
+
"repeats": 1,
|
41 |
+
"filter_list": [
|
42 |
+
{
|
43 |
+
"name": "score-first",
|
44 |
+
"filter": [
|
45 |
+
{
|
46 |
+
"function": "regex",
|
47 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"function": "take_first"
|
51 |
+
}
|
52 |
+
]
|
53 |
+
}
|
54 |
+
],
|
55 |
+
"should_decontaminate": true,
|
56 |
+
"doc_to_decontamination_query": "{{flores_passage}} {{question}} {{mc_answer1}} {{mc_answer2}} {{mc_answer3}} {{mc_answer4}}"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"versions": {
|
60 |
+
"polish_belebele_regex": "Yaml"
|
61 |
+
},
|
62 |
+
"n-shot": {
|
63 |
+
"polish_belebele_regex": 5
|
64 |
+
},
|
65 |
+
"higher_is_better": {
|
66 |
+
"polish_belebele_regex": {
|
67 |
+
"exact_match": true
|
68 |
+
}
|
69 |
+
},
|
70 |
+
"n-samples": {
|
71 |
+
"polish_belebele_regex": {
|
72 |
+
"original": 900,
|
73 |
+
"effective": 900
|
74 |
+
}
|
75 |
+
},
|
76 |
+
"config": {
|
77 |
+
"model": "hf",
|
78 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
79 |
+
"batch_size": "1",
|
80 |
+
"batch_sizes": [],
|
81 |
+
"device": "cuda:0",
|
82 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_regex/",
|
83 |
+
"limit": null,
|
84 |
+
"bootstrap_iters": 100000,
|
85 |
+
"gen_kwargs": null,
|
86 |
+
"random_seed": 0,
|
87 |
+
"numpy_seed": 1234,
|
88 |
+
"torch_seed": 1234,
|
89 |
+
"fewshot_seed": 1234
|
90 |
+
},
|
91 |
+
"git_hash": "2132286",
|
92 |
+
"date": 1723381748.8774083,
|
93 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
94 |
+
"transformers_version": "4.43.1",
|
95 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
96 |
+
"task_hashes": {
|
97 |
+
"polish_belebele_regex": "f24c47726a598a1d1eea361393c09e061f3bbf93fc16ed74e92c70bd969e71f2"
|
98 |
+
},
|
99 |
+
"model_source": "hf",
|
100 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
101 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
102 |
+
"system_instruction": null,
|
103 |
+
"system_instruction_sha": null,
|
104 |
+
"chat_template": null,
|
105 |
+
"chat_template_sha": null,
|
106 |
+
"start_time": 2272911.893726246,
|
107 |
+
"end_time": 2274555.15416838,
|
108 |
+
"total_evaluation_time_seconds": "1643.260442133993"
|
109 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-52.993123.json
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_cbd_multiple_choice": {
|
4 |
+
"acc,none": 0.74,
|
5 |
+
"acc_stderr,none": 0.013877773329774166,
|
6 |
+
"f1,none": 0.3516898467962298,
|
7 |
+
"f1_stderr,none": "N/A",
|
8 |
+
"acc_norm,none": 0.747,
|
9 |
+
"acc_norm_stderr,none": 0.01375427861358708,
|
10 |
+
"alias": "polish_cbd_multiple_choice"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"group_subtasks": {
|
14 |
+
"polish_cbd_multiple_choice": []
|
15 |
+
},
|
16 |
+
"configs": {
|
17 |
+
"polish_cbd_multiple_choice": {
|
18 |
+
"task": "polish_cbd_multiple_choice",
|
19 |
+
"dataset_path": "ptaszynski/PolishCyberbullyingDataset",
|
20 |
+
"training_split": "train",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Wypowied藕: \"{{TEXT}}\"\nDo podanej wypowiedzi przyporz膮dkuj jedn膮, najlepiej pasuj膮c膮 kategori臋 z podanych: nieszkodliwa, szyderstwo, obelga, insynuacja, gro藕ba, molestowanie.\nKategoria:",
|
23 |
+
"doc_to_target": "{{{'szyderstwo': 1, 'obelga': 2, 'insynuacja': 3, 'grozba': 4, 'molestowanie': 5}.get(CATEGORIES, 0)}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"nieszkodliwa",
|
26 |
+
"szyderstwo",
|
27 |
+
"obelga",
|
28 |
+
"insynuacja",
|
29 |
+
"gro藕ba",
|
30 |
+
"molestowanie"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 5,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
49 |
+
"aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
|
50 |
+
"higher_is_better": true
|
51 |
+
}
|
52 |
+
],
|
53 |
+
"output_type": "multiple_choice",
|
54 |
+
"repeats": 1,
|
55 |
+
"should_decontaminate": true,
|
56 |
+
"doc_to_decontamination_query": "{{TEXT}}"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"versions": {
|
60 |
+
"polish_cbd_multiple_choice": "Yaml"
|
61 |
+
},
|
62 |
+
"n-shot": {
|
63 |
+
"polish_cbd_multiple_choice": 5
|
64 |
+
},
|
65 |
+
"higher_is_better": {
|
66 |
+
"polish_cbd_multiple_choice": {
|
67 |
+
"acc": true,
|
68 |
+
"acc_norm": true,
|
69 |
+
"f1": true
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"n-samples": {
|
73 |
+
"polish_cbd_multiple_choice": {
|
74 |
+
"original": 1000,
|
75 |
+
"effective": 1000
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"config": {
|
79 |
+
"model": "hf",
|
80 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
81 |
+
"batch_size": "1",
|
82 |
+
"batch_sizes": [],
|
83 |
+
"device": "cuda:0",
|
84 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_multiple_choice/",
|
85 |
+
"limit": null,
|
86 |
+
"bootstrap_iters": 100000,
|
87 |
+
"gen_kwargs": null,
|
88 |
+
"random_seed": 0,
|
89 |
+
"numpy_seed": 1234,
|
90 |
+
"torch_seed": 1234,
|
91 |
+
"fewshot_seed": 1234
|
92 |
+
},
|
93 |
+
"git_hash": "2132286",
|
94 |
+
"date": 1723381736.832983,
|
95 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
96 |
+
"transformers_version": "4.43.1",
|
97 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
98 |
+
"task_hashes": {
|
99 |
+
"polish_cbd_multiple_choice": "7f04a198edb8f2a8d7c7854adaca6f42c6ab2547d80482066cd86becf9e6cd6c"
|
100 |
+
},
|
101 |
+
"model_source": "hf",
|
102 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
103 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
104 |
+
"system_instruction": null,
|
105 |
+
"system_instruction_sha": null,
|
106 |
+
"chat_template": null,
|
107 |
+
"chat_template_sha": null,
|
108 |
+
"start_time": 779444.243050853,
|
109 |
+
"end_time": 780228.642777935,
|
110 |
+
"total_evaluation_time_seconds": "784.3997270819964"
|
111 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-33-25.750066.json
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_cbd_regex": {
|
4 |
+
"exact_match,score-first": 0.75,
|
5 |
+
"exact_match_stderr,score-first": 0.013699915608779773,
|
6 |
+
"f1,score-first": 0.3634343551926929,
|
7 |
+
"f1_stderr,score-first": "N/A",
|
8 |
+
"alias": "polish_cbd_regex"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_cbd_regex": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_cbd_regex": {
|
16 |
+
"task": "polish_cbd_regex",
|
17 |
+
"dataset_path": "ptaszynski/PolishCyberbullyingDataset",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"doc_to_text": "Wypowied藕: \"{{TEXT}}\"\nPytanie: Jaka kategoria najlepiej pasuje do podanej wypowiedzi?\nMo偶liwe odpowiedzi:\nA - nieszkodliwa\nB - szyderstwo\nC - obelga\nD - insynuacja\nE - gro藕ba\nF - molestowanie\nPrawid艂owa odpowied藕:",
|
21 |
+
"doc_to_target": "{{{'szyderstwo': 'B', 'obelga': 'C', 'insynuacja': 'D', 'grozba': 'E', 'molestowanie': 'F'}.get(CATEGORIES, 'A')}}",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 5,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
34 |
+
"aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
".",
|
42 |
+
",",
|
43 |
+
";"
|
44 |
+
],
|
45 |
+
"do_sample": false,
|
46 |
+
"temperature": 0.0,
|
47 |
+
"max_gen_toks": 50
|
48 |
+
},
|
49 |
+
"repeats": 1,
|
50 |
+
"filter_list": [
|
51 |
+
{
|
52 |
+
"name": "score-first",
|
53 |
+
"filter": [
|
54 |
+
{
|
55 |
+
"function": "regex",
|
56 |
+
"regex_pattern": "(\\b[ABCDEF]\\b)"
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"function": "take_first"
|
60 |
+
}
|
61 |
+
]
|
62 |
+
}
|
63 |
+
],
|
64 |
+
"should_decontaminate": true,
|
65 |
+
"doc_to_decontamination_query": "{{TEXT}}"
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"versions": {
|
69 |
+
"polish_cbd_regex": "Yaml"
|
70 |
+
},
|
71 |
+
"n-shot": {
|
72 |
+
"polish_cbd_regex": 5
|
73 |
+
},
|
74 |
+
"higher_is_better": {
|
75 |
+
"polish_cbd_regex": {
|
76 |
+
"exact_match": true,
|
77 |
+
"f1": true
|
78 |
+
}
|
79 |
+
},
|
80 |
+
"n-samples": {
|
81 |
+
"polish_cbd_regex": {
|
82 |
+
"original": 1000,
|
83 |
+
"effective": 1000
|
84 |
+
}
|
85 |
+
},
|
86 |
+
"config": {
|
87 |
+
"model": "hf",
|
88 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
89 |
+
"batch_size": "1",
|
90 |
+
"batch_sizes": [],
|
91 |
+
"device": "cuda:0",
|
92 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_regex/",
|
93 |
+
"limit": null,
|
94 |
+
"bootstrap_iters": 100000,
|
95 |
+
"gen_kwargs": null,
|
96 |
+
"random_seed": 0,
|
97 |
+
"numpy_seed": 1234,
|
98 |
+
"torch_seed": 1234,
|
99 |
+
"fewshot_seed": 1234
|
100 |
+
},
|
101 |
+
"git_hash": "2132286",
|
102 |
+
"date": 1723381748.6048338,
|
103 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
104 |
+
"transformers_version": "4.43.1",
|
105 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
106 |
+
"task_hashes": {
|
107 |
+
"polish_cbd_regex": "71dc0083f6f8b533188cbedcb2ea9d61ba63ef8ff3f6bb1c08f1844c9335ddf4"
|
108 |
+
},
|
109 |
+
"model_source": "hf",
|
110 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
111 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
112 |
+
"system_instruction": null,
|
113 |
+
"system_instruction_sha": null,
|
114 |
+
"chat_template": null,
|
115 |
+
"chat_template_sha": null,
|
116 |
+
"start_time": 780407.632129005,
|
117 |
+
"end_time": 781873.251077335,
|
118 |
+
"total_evaluation_time_seconds": "1465.6189483299386"
|
119 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-08.007826.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_dyk_multiple_choice": {
|
4 |
+
"acc,none": 0.8794946550048591,
|
5 |
+
"acc_stderr,none": 0.010153673638096375,
|
6 |
+
"f1,none": 0.7004830917874396,
|
7 |
+
"f1_stderr,none": "N/A",
|
8 |
+
"acc_norm,none": 0.8794946550048591,
|
9 |
+
"acc_norm_stderr,none": 0.010153673638096375,
|
10 |
+
"alias": "polish_dyk_multiple_choice"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"group_subtasks": {
|
14 |
+
"polish_dyk_multiple_choice": []
|
15 |
+
},
|
16 |
+
"configs": {
|
17 |
+
"polish_dyk_multiple_choice": {
|
18 |
+
"task": "polish_dyk_multiple_choice",
|
19 |
+
"dataset_path": "allegro/klej-dyk",
|
20 |
+
"training_split": "train",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowied藕: \"{{answer}}\"\nPytanie: Czy sugerowana odpowied藕 na zadane pytanie jest poprawna?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{target|int}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Nie",
|
26 |
+
"Tak"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 5,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
45 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
46 |
+
"higher_is_better": true
|
47 |
+
}
|
48 |
+
],
|
49 |
+
"output_type": "multiple_choice",
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{question}} {{answer}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_dyk_multiple_choice": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_dyk_multiple_choice": 5
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_dyk_multiple_choice": {
|
63 |
+
"acc": true,
|
64 |
+
"acc_norm": true,
|
65 |
+
"f1": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_dyk_multiple_choice": {
|
70 |
+
"original": 1029,
|
71 |
+
"effective": 1029
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_multiple_choice/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381736.8329315,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_dyk_multiple_choice": "90a835c3521affda43e1b7e595ec145d189a8781186b0d67f0a20cbb60069d75"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 779444.242757443,
|
105 |
+
"end_time": 779763.657474826,
|
106 |
+
"total_evaluation_time_seconds": "319.41471738298424"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-54-27.674557.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_dyk_regex": {
|
4 |
+
"exact_match,score-first": 0.8785228377065112,
|
5 |
+
"exact_match_stderr,score-first": 0.010188899761066529,
|
6 |
+
"f1,score-first": 0.7126436781609196,
|
7 |
+
"f1_stderr,score-first": "N/A",
|
8 |
+
"alias": "polish_dyk_regex"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_dyk_regex": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_dyk_regex": {
|
16 |
+
"task": "polish_dyk_regex",
|
17 |
+
"dataset_path": "allegro/klej-dyk",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowied藕: \"{{answer}}\"\nCzy sugerowana odpowied藕 na zadane pytanie jest poprawna? Mo偶liwe opcje:\nA - brakuje sugerowanej odpowiedzi\nB - nie, sugerowana odpowied藕 nie jest poprawna\nC - tak, sugerowana odpowied藕 jest poprawna\nD - brakuje pytania\nPrawid艂owa opcja:",
|
21 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(target|int + 1)}}",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 5,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
34 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
".",
|
42 |
+
","
|
43 |
+
],
|
44 |
+
"do_sample": false,
|
45 |
+
"temperature": 0.0,
|
46 |
+
"max_gen_toks": 50
|
47 |
+
},
|
48 |
+
"repeats": 1,
|
49 |
+
"filter_list": [
|
50 |
+
{
|
51 |
+
"name": "score-first",
|
52 |
+
"filter": [
|
53 |
+
{
|
54 |
+
"function": "regex",
|
55 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"function": "take_first"
|
59 |
+
}
|
60 |
+
]
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"should_decontaminate": true,
|
64 |
+
"doc_to_decontamination_query": "{{question}} {{answer}}"
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"versions": {
|
68 |
+
"polish_dyk_regex": "Yaml"
|
69 |
+
},
|
70 |
+
"n-shot": {
|
71 |
+
"polish_dyk_regex": 5
|
72 |
+
},
|
73 |
+
"higher_is_better": {
|
74 |
+
"polish_dyk_regex": {
|
75 |
+
"exact_match": true,
|
76 |
+
"f1": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polish_dyk_regex": {
|
81 |
+
"original": 1029,
|
82 |
+
"effective": 1029
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_regex/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.877224,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polish_dyk_regex": "ff511210f55c111bbc6d0c4cd80c3d7b334eaf5227fb2ed749d0a0530e518b27"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 2272911.894157125,
|
116 |
+
"end_time": 2275639.174286722,
|
117 |
+
"total_evaluation_time_seconds": "2727.2801295970567"
|
118 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_eq_bench_first_turn_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-22.563512.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_eq_bench_first_turn": {
|
4 |
+
"first_eqbench,none": 70.08076901067246,
|
5 |
+
"first_eqbench_stderr,none": 2.1051510636673663,
|
6 |
+
"first_percent_parseable,none": 100.0,
|
7 |
+
"first_percent_parseable_stderr,none": 0.0,
|
8 |
+
"alias": "polish_eq_bench_first_turn"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_eq_bench_first_turn": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_eq_bench_first_turn": {
|
16 |
+
"task": "polish_eq_bench_first_turn",
|
17 |
+
"dataset_path": "speakleash/EQ-Bench-PL-first-turn",
|
18 |
+
"validation_split": "validation",
|
19 |
+
"doc_to_text": "{{prompt}}\nOceny:\n",
|
20 |
+
"doc_to_target": "def doc_to_target(doc):\n reference = eval(doc[\"reference_answer\"])\n\n target = \"\"\n for i in range(1, 5):\n emotion = reference[f\"emotion{i}\"]\n emotion_score = reference[f\"emotion{i}_score\"]\n target += f\"{emotion}: {emotion_score}\\n\"\n target += \"\\n\"\n\n return target\n",
|
21 |
+
"process_results": "def score_first(docs, results):\n first_pass_answers = dict(list(re.findall(r'(\\w+(?: \\w+)*):\\s+(\\d+)', results[0]))[:4])\n reference = eval(docs[\"reference_answer\"])\n first_pass_score = calculate_score(reference, first_pass_answers)\n scores= {'first_'+k: v for k, v in first_pass_score.items()}\n return scores\n",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 5,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "first_eqbench",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "first_percent_parseable",
|
34 |
+
"aggregation": "mean",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"max_gen_toks": 512,
|
41 |
+
"do_sample": false,
|
42 |
+
"temperature": 0.0,
|
43 |
+
"until": [
|
44 |
+
"</s>",
|
45 |
+
"[Koniec odpowiedzi]",
|
46 |
+
"Masz za zadanie"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": false,
|
51 |
+
"metadata": {
|
52 |
+
"version": 2.4
|
53 |
+
}
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"versions": {
|
57 |
+
"polish_eq_bench_first_turn": 2.4
|
58 |
+
},
|
59 |
+
"n-shot": {
|
60 |
+
"polish_eq_bench_first_turn": 5
|
61 |
+
},
|
62 |
+
"higher_is_better": {
|
63 |
+
"polish_eq_bench_first_turn": {
|
64 |
+
"first_eqbench": true,
|
65 |
+
"first_percent_parseable": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_eq_bench_first_turn": {
|
70 |
+
"original": 171,
|
71 |
+
"effective": 171
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_eq_bench_first_turn/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381748.6045775,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_eq_bench_first_turn": "80a40657adcfe9c62884d65078de0204ecd846ef1614217065f11a87cbb0ad87"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 780407.631921335,
|
105 |
+
"end_time": 780970.064839895,
|
106 |
+
"total_evaluation_time_seconds": "562.4329185599927"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-52.497622.json
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_klej_ner_multiple_choice": {
|
4 |
+
"acc,none": 0.5383867832847424,
|
5 |
+
"acc_stderr,none": 0.010991808831354909,
|
6 |
+
"acc_norm,none": 0.5291545189504373,
|
7 |
+
"acc_norm_stderr,none": 0.011005589555788344,
|
8 |
+
"alias": "polish_klej_ner_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_klej_ner_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_klej_ner_multiple_choice": {
|
16 |
+
"task": "polish_klej_ner_multiple_choice",
|
17 |
+
"dataset_path": "allegro/klej-nkjp-ner",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"fewshot_split": "train",
|
22 |
+
"doc_to_text": "Zdanie: \"{{sentence}}\"\nJakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi: Brak nazwanej jednostki, Nazwa miejsca, Nazwa osoby, Nazwa organizacji, Czas, Nazwa geograficzna.\nRodzaj:",
|
23 |
+
"doc_to_target": "{{{'noEntity': 0, 'placeName': 1, 'persName': 2, 'orgName': 3, 'time': 4, 'geogName': 5}.get(target)}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Brak nazwanej jednostki",
|
26 |
+
"Nazwa miejsca",
|
27 |
+
"Nazwa osoby",
|
28 |
+
"Nazwa organizacji",
|
29 |
+
"Czas",
|
30 |
+
"Nazwa geograficzna"
|
31 |
+
],
|
32 |
+
"description": "",
|
33 |
+
"target_delimiter": " ",
|
34 |
+
"fewshot_delimiter": "\n\n",
|
35 |
+
"num_fewshot": 5,
|
36 |
+
"metric_list": [
|
37 |
+
{
|
38 |
+
"metric": "acc",
|
39 |
+
"aggregation": "mean",
|
40 |
+
"higher_is_better": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"metric": "acc_norm",
|
44 |
+
"aggregation": "mean",
|
45 |
+
"higher_is_better": true
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"output_type": "multiple_choice",
|
49 |
+
"repeats": 1,
|
50 |
+
"should_decontaminate": true,
|
51 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
52 |
+
}
|
53 |
+
},
|
54 |
+
"versions": {
|
55 |
+
"polish_klej_ner_multiple_choice": "Yaml"
|
56 |
+
},
|
57 |
+
"n-shot": {
|
58 |
+
"polish_klej_ner_multiple_choice": 5
|
59 |
+
},
|
60 |
+
"higher_is_better": {
|
61 |
+
"polish_klej_ner_multiple_choice": {
|
62 |
+
"acc": true,
|
63 |
+
"acc_norm": true
|
64 |
+
}
|
65 |
+
},
|
66 |
+
"n-samples": {
|
67 |
+
"polish_klej_ner_multiple_choice": {
|
68 |
+
"original": 2058,
|
69 |
+
"effective": 2058
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"config": {
|
73 |
+
"model": "hf",
|
74 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
75 |
+
"batch_size": "1",
|
76 |
+
"batch_sizes": [],
|
77 |
+
"device": "cuda:0",
|
78 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_multiple_choice/",
|
79 |
+
"limit": null,
|
80 |
+
"bootstrap_iters": 100000,
|
81 |
+
"gen_kwargs": null,
|
82 |
+
"random_seed": 0,
|
83 |
+
"numpy_seed": 1234,
|
84 |
+
"torch_seed": 1234,
|
85 |
+
"fewshot_seed": 1234
|
86 |
+
},
|
87 |
+
"git_hash": "2132286",
|
88 |
+
"date": 1723381748.8774905,
|
89 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
90 |
+
"transformers_version": "4.43.1",
|
91 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
92 |
+
"task_hashes": {
|
93 |
+
"polish_klej_ner_multiple_choice": "382e085067293307f61df6d4b8dde438e9a35b2296d59d664ba9e1861a8fb319"
|
94 |
+
},
|
95 |
+
"model_source": "hf",
|
96 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
97 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
98 |
+
"system_instruction": null,
|
99 |
+
"system_instruction_sha": null,
|
100 |
+
"chat_template": null,
|
101 |
+
"chat_template_sha": null,
|
102 |
+
"start_time": 2272911.893820256,
|
103 |
+
"end_time": 2274523.996078617,
|
104 |
+
"total_evaluation_time_seconds": "1612.1022583609447"
|
105 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-18.271570.json
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_klej_ner_regex": {
|
4 |
+
"exact_match,score-first": 0.5515063168124392,
|
5 |
+
"exact_match_stderr,score-first": 0.010965697594667088,
|
6 |
+
"alias": "polish_klej_ner_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_klej_ner_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_klej_ner_regex": {
|
14 |
+
"task": "polish_klej_ner_regex",
|
15 |
+
"dataset_path": "allegro/klej-nkjp-ner",
|
16 |
+
"training_split": "train",
|
17 |
+
"validation_split": "validation",
|
18 |
+
"test_split": "test",
|
19 |
+
"doc_to_text": "Zdanie: \"{{sentence}}\"\nPytanie: Jakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi:\nA - Brak nazwanej jednostki\nB - Nazwa miejsca\nC - Nazwa osoby\nD - Nazwa organizacji\nE - Czas\nF - Nazwa geograficzna\nPrawid艂owa odpowied藕:",
|
20 |
+
"doc_to_target": "{{{'noEntity': 'A', 'placeName': 'B', 'persName': 'C', 'orgName': 'D', 'time': 'E', 'geogName': 'F'}.get(target)}}",
|
21 |
+
"description": "",
|
22 |
+
"target_delimiter": " ",
|
23 |
+
"fewshot_delimiter": "\n\n",
|
24 |
+
"num_fewshot": 5,
|
25 |
+
"metric_list": [
|
26 |
+
{
|
27 |
+
"metric": "exact_match",
|
28 |
+
"aggregation": "mean",
|
29 |
+
"higher_is_better": true
|
30 |
+
}
|
31 |
+
],
|
32 |
+
"output_type": "generate_until",
|
33 |
+
"generation_kwargs": {
|
34 |
+
"until": [
|
35 |
+
".",
|
36 |
+
",",
|
37 |
+
";"
|
38 |
+
],
|
39 |
+
"do_sample": false,
|
40 |
+
"temperature": 0.0,
|
41 |
+
"max_gen_toks": 50
|
42 |
+
},
|
43 |
+
"repeats": 1,
|
44 |
+
"filter_list": [
|
45 |
+
{
|
46 |
+
"name": "score-first",
|
47 |
+
"filter": [
|
48 |
+
{
|
49 |
+
"function": "regex",
|
50 |
+
"regex_pattern": "(\\b[ABCDEF]\\b)"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"function": "take_first"
|
54 |
+
}
|
55 |
+
]
|
56 |
+
}
|
57 |
+
],
|
58 |
+
"should_decontaminate": true,
|
59 |
+
"doc_to_decontamination_query": "{{sentence}}"
|
60 |
+
}
|
61 |
+
},
|
62 |
+
"versions": {
|
63 |
+
"polish_klej_ner_regex": "Yaml"
|
64 |
+
},
|
65 |
+
"n-shot": {
|
66 |
+
"polish_klej_ner_regex": 5
|
67 |
+
},
|
68 |
+
"higher_is_better": {
|
69 |
+
"polish_klej_ner_regex": {
|
70 |
+
"exact_match": true
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"n-samples": {
|
74 |
+
"polish_klej_ner_regex": {
|
75 |
+
"original": 2058,
|
76 |
+
"effective": 2058
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"config": {
|
80 |
+
"model": "hf",
|
81 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
82 |
+
"batch_size": "1",
|
83 |
+
"batch_sizes": [],
|
84 |
+
"device": "cuda:0",
|
85 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_regex/",
|
86 |
+
"limit": null,
|
87 |
+
"bootstrap_iters": 100000,
|
88 |
+
"gen_kwargs": null,
|
89 |
+
"random_seed": 0,
|
90 |
+
"numpy_seed": 1234,
|
91 |
+
"torch_seed": 1234,
|
92 |
+
"fewshot_seed": 1234
|
93 |
+
},
|
94 |
+
"git_hash": "2132286",
|
95 |
+
"date": 1723381748.60427,
|
96 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
97 |
+
"transformers_version": "4.43.1",
|
98 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
99 |
+
"task_hashes": {
|
100 |
+
"polish_klej_ner_regex": "ab6f4267720bbc662460a7390651b5fc2a339d12301ae3ba0cba80f4ffe4fe5f"
|
101 |
+
},
|
102 |
+
"model_source": "hf",
|
103 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
104 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
105 |
+
"system_instruction": null,
|
106 |
+
"system_instruction_sha": null,
|
107 |
+
"chat_template": null,
|
108 |
+
"chat_template_sha": null,
|
109 |
+
"start_time": 780407.631669254,
|
110 |
+
"end_time": 783305.771917303,
|
111 |
+
"total_evaluation_time_seconds": "2898.140248048934"
|
112 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_pes_1723381722/results_2024-08-27T17-50-52.063138.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_closed_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-23.588300.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_polqa_closed_book": {
|
4 |
+
"exact_match,none": 0.7144340602284528,
|
5 |
+
"exact_match_stderr,none": 0.014562862295117392,
|
6 |
+
"levenshtein,none": 0.8328141225337488,
|
7 |
+
"levenshtein_stderr,none": "N/A",
|
8 |
+
"alias": "polish_polqa_closed_book"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_polqa_closed_book": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_polqa_closed_book": {
|
16 |
+
"task": "polish_polqa_closed_book",
|
17 |
+
"dataset_path": "ipipan/polqa",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"process_docs": "def process_docs_closed(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and example['question'] not in used and (used.add(example['question']) or True)).map(_helper)\n",
|
22 |
+
"doc_to_text": "Pytanie: {{question}} \n Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "answers",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 5,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
|
36 |
+
"aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
|
37 |
+
"higher_is_better": true
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"output_type": "generate_until",
|
41 |
+
"generation_kwargs": {
|
42 |
+
"until": [
|
43 |
+
"\n",
|
44 |
+
"</s>"
|
45 |
+
],
|
46 |
+
"do_sample": false,
|
47 |
+
"temperature": 0.0,
|
48 |
+
"max_gen_toks": 50
|
49 |
+
},
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{question}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_polqa_closed_book": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_polqa_closed_book": 5
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_polqa_closed_book": {
|
63 |
+
"exact_match": true,
|
64 |
+
"levenshtein": true
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"n-samples": {
|
68 |
+
"polish_polqa_closed_book": {
|
69 |
+
"original": 963,
|
70 |
+
"effective": 963
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"config": {
|
74 |
+
"model": "hf",
|
75 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
76 |
+
"batch_size": "1",
|
77 |
+
"batch_sizes": [],
|
78 |
+
"device": "cuda:0",
|
79 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_closed_book/",
|
80 |
+
"limit": null,
|
81 |
+
"bootstrap_iters": 100000,
|
82 |
+
"gen_kwargs": null,
|
83 |
+
"random_seed": 0,
|
84 |
+
"numpy_seed": 1234,
|
85 |
+
"torch_seed": 1234,
|
86 |
+
"fewshot_seed": 1234
|
87 |
+
},
|
88 |
+
"git_hash": "2132286",
|
89 |
+
"date": 1723381748.604107,
|
90 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
91 |
+
"transformers_version": "4.43.1",
|
92 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
93 |
+
"task_hashes": {
|
94 |
+
"polish_polqa_closed_book": "87d8cfbe97dc8a4ad77df54784eea533389ce029734e03de52acd682a4293a8e"
|
95 |
+
},
|
96 |
+
"model_source": "hf",
|
97 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
98 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
99 |
+
"system_instruction": null,
|
100 |
+
"system_instruction_sha": null,
|
101 |
+
"chat_template": null,
|
102 |
+
"chat_template_sha": null,
|
103 |
+
"start_time": 780407.631446274,
|
104 |
+
"end_time": 780851.088669831,
|
105 |
+
"total_evaluation_time_seconds": "443.4572235570522"
|
106 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-00.491423.json
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_polqa_open_book": {
|
4 |
+
"exact_match,none": 0.803306342780027,
|
5 |
+
"exact_match_stderr,none": 0.005163192439920857,
|
6 |
+
"levenshtein,none": 0.9239203778677463,
|
7 |
+
"levenshtein_stderr,none": "N/A",
|
8 |
+
"alias": "polish_polqa_open_book"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_polqa_open_book": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_polqa_open_book": {
|
16 |
+
"task": "polish_polqa_open_book",
|
17 |
+
"dataset_path": "ipipan/polqa",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"process_docs": "def process_docs_open(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
|
22 |
+
"doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "answers",
|
24 |
+
"description": "",
|
25 |
+
"target_delimiter": " ",
|
26 |
+
"fewshot_delimiter": "\n\n",
|
27 |
+
"num_fewshot": 5,
|
28 |
+
"metric_list": [
|
29 |
+
{
|
30 |
+
"metric": "exact_match",
|
31 |
+
"aggregation": "mean",
|
32 |
+
"higher_is_better": true
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
|
36 |
+
"aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
|
37 |
+
"higher_is_better": true
|
38 |
+
}
|
39 |
+
],
|
40 |
+
"output_type": "generate_until",
|
41 |
+
"generation_kwargs": {
|
42 |
+
"until": [
|
43 |
+
"\n",
|
44 |
+
"</s>"
|
45 |
+
],
|
46 |
+
"do_sample": false,
|
47 |
+
"temperature": 0.0,
|
48 |
+
"max_gen_toks": 50
|
49 |
+
},
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{passage_text}} {{question}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_polqa_open_book": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_polqa_open_book": 5
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_polqa_open_book": {
|
63 |
+
"exact_match": true,
|
64 |
+
"levenshtein": true
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"n-samples": {
|
68 |
+
"polish_polqa_open_book": {
|
69 |
+
"original": 5928,
|
70 |
+
"effective": 5928
|
71 |
+
}
|
72 |
+
},
|
73 |
+
"config": {
|
74 |
+
"model": "hf",
|
75 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
76 |
+
"batch_size": "1",
|
77 |
+
"batch_sizes": [],
|
78 |
+
"device": "cuda:0",
|
79 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_open_book/",
|
80 |
+
"limit": null,
|
81 |
+
"bootstrap_iters": 100000,
|
82 |
+
"gen_kwargs": null,
|
83 |
+
"random_seed": 0,
|
84 |
+
"numpy_seed": 1234,
|
85 |
+
"torch_seed": 1234,
|
86 |
+
"fewshot_seed": 1234
|
87 |
+
},
|
88 |
+
"git_hash": "2132286",
|
89 |
+
"date": 1723381748.6044483,
|
90 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
91 |
+
"transformers_version": "4.43.1",
|
92 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
93 |
+
"task_hashes": {
|
94 |
+
"polish_polqa_open_book": "1b7dbda5fd3d68d2b8f1d9ca3aecb84324d7c15639dca3a82f584f73f81e734f"
|
95 |
+
},
|
96 |
+
"model_source": "hf",
|
97 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
98 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
99 |
+
"system_instruction": null,
|
100 |
+
"system_instruction_sha": null,
|
101 |
+
"chat_template": null,
|
102 |
+
"chat_template_sha": null,
|
103 |
+
"start_time": 780407.632126545,
|
104 |
+
"end_time": 782687.990413396,
|
105 |
+
"total_evaluation_time_seconds": "2280.3582868510857"
|
106 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-02.859037.json
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_polqa_reranking_multiple_choice": {
|
4 |
+
"acc,none": 0.8563222912896372,
|
5 |
+
"acc_stderr,none": 0.0031115351999876245,
|
6 |
+
"acc_norm,none": 0.8563222912896372,
|
7 |
+
"acc_norm_stderr,none": 0.0031115351999876245,
|
8 |
+
"alias": "polish_polqa_reranking_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_polqa_reranking_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_polqa_reranking_multiple_choice": {
|
16 |
+
"task": "polish_polqa_reranking_multiple_choice",
|
17 |
+
"dataset_path": "ipipan/polqa",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"process_docs": "def process_docs(dataset: datasets.Dataset):\n def _helper(doc):\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
|
22 |
+
"doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Czy kontekst jest relewantny dla pytania? \n Odpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{relevant|int}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Nie",
|
26 |
+
"Tak"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 5,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"output_type": "multiple_choice",
|
45 |
+
"repeats": 1,
|
46 |
+
"should_decontaminate": true,
|
47 |
+
"doc_to_decontamination_query": "{{passage_text}} {{question}}"
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"versions": {
|
51 |
+
"polish_polqa_reranking_multiple_choice": "Yaml"
|
52 |
+
},
|
53 |
+
"n-shot": {
|
54 |
+
"polish_polqa_reranking_multiple_choice": 5
|
55 |
+
},
|
56 |
+
"higher_is_better": {
|
57 |
+
"polish_polqa_reranking_multiple_choice": {
|
58 |
+
"acc": true,
|
59 |
+
"acc_norm": true
|
60 |
+
}
|
61 |
+
},
|
62 |
+
"n-samples": {
|
63 |
+
"polish_polqa_reranking_multiple_choice": {
|
64 |
+
"original": 12709,
|
65 |
+
"effective": 12709
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"config": {
|
69 |
+
"model": "hf",
|
70 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
71 |
+
"batch_size": "1",
|
72 |
+
"batch_sizes": [],
|
73 |
+
"device": "cuda:0",
|
74 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_reranking_multiple_choice/",
|
75 |
+
"limit": null,
|
76 |
+
"bootstrap_iters": 100000,
|
77 |
+
"gen_kwargs": null,
|
78 |
+
"random_seed": 0,
|
79 |
+
"numpy_seed": 1234,
|
80 |
+
"torch_seed": 1234,
|
81 |
+
"fewshot_seed": 1234
|
82 |
+
},
|
83 |
+
"git_hash": "2132286",
|
84 |
+
"date": 1723381748.8773608,
|
85 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
86 |
+
"transformers_version": "4.43.1",
|
87 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
88 |
+
"task_hashes": {
|
89 |
+
"polish_polqa_reranking_multiple_choice": "81b0a5c9f7c49792c084d2efb013d9475b0a80d66176de68f5f7c09c2464494a"
|
90 |
+
},
|
91 |
+
"model_source": "hf",
|
92 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
93 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
94 |
+
"system_instruction": null,
|
95 |
+
"system_instruction_sha": null,
|
96 |
+
"chat_template": null,
|
97 |
+
"chat_template_sha": null,
|
98 |
+
"start_time": 2272911.894071405,
|
99 |
+
"end_time": 2275374.35065494,
|
100 |
+
"total_evaluation_time_seconds": "2462.4565835352987"
|
101 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_poquad_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-09-42.653951.json
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_poquad_open_book": {
|
4 |
+
"exact_match,none": 0.37682165163081194,
|
5 |
+
"exact_match_stderr,none": 0.0063833666826593255,
|
6 |
+
"levenshtein,none": 0.6878903539208883,
|
7 |
+
"levenshtein_stderr,none": "N/A",
|
8 |
+
"alias": "polish_poquad_open_book"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_poquad_open_book": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_poquad_open_book": {
|
16 |
+
"task": "polish_poquad_open_book",
|
17 |
+
"dataset_path": "clarin-pl/poquad",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "validation",
|
20 |
+
"doc_to_text": "Tytu艂: {{title}} \n Kontekst: {{context}} \n Pytanie: {{question}} \n Prawid艂owa odpowied藕 (kr贸tki cytat z Kontekstu):",
|
21 |
+
"doc_to_target": "def doc_to_target(doc):\n answer_list = doc[\"answers\"][\"text\"]\n if len(answer_list) > 0:\n answer = answer_list[0]\n else:\n answer = \"bez odpowiedzi\"\n return \" \" + answer\n",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 5,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def levenshtein(predictions, references):\n _prediction = predictions[0].lower().lstrip()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('.? ?(</s>)* ?$', '', _prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower().lstrip())\n if ld < len(reference)/2:\n return 1\n return 0\n",
|
34 |
+
"aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
"\n",
|
42 |
+
"</s>"
|
43 |
+
],
|
44 |
+
"do_sample": false,
|
45 |
+
"temperature": 0.0,
|
46 |
+
"max_gen_toks": 50
|
47 |
+
},
|
48 |
+
"repeats": 1,
|
49 |
+
"should_decontaminate": true,
|
50 |
+
"doc_to_decontamination_query": "{{context}} {{question}}"
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"versions": {
|
54 |
+
"polish_poquad_open_book": "Yaml"
|
55 |
+
},
|
56 |
+
"n-shot": {
|
57 |
+
"polish_poquad_open_book": 5
|
58 |
+
},
|
59 |
+
"higher_is_better": {
|
60 |
+
"polish_poquad_open_book": {
|
61 |
+
"exact_match": true,
|
62 |
+
"levenshtein": true
|
63 |
+
}
|
64 |
+
},
|
65 |
+
"n-samples": {
|
66 |
+
"polish_poquad_open_book": {
|
67 |
+
"original": 5764,
|
68 |
+
"effective": 5764
|
69 |
+
}
|
70 |
+
},
|
71 |
+
"config": {
|
72 |
+
"model": "hf",
|
73 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
74 |
+
"batch_size": "1",
|
75 |
+
"batch_sizes": [],
|
76 |
+
"device": "cuda:0",
|
77 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_poquad_open_book/",
|
78 |
+
"limit": null,
|
79 |
+
"bootstrap_iters": 100000,
|
80 |
+
"gen_kwargs": null,
|
81 |
+
"random_seed": 0,
|
82 |
+
"numpy_seed": 1234,
|
83 |
+
"torch_seed": 1234,
|
84 |
+
"fewshot_seed": 1234
|
85 |
+
},
|
86 |
+
"git_hash": "2132286",
|
87 |
+
"date": 1723381748.6042013,
|
88 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
89 |
+
"transformers_version": "4.43.1",
|
90 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
91 |
+
"task_hashes": {
|
92 |
+
"polish_poquad_open_book": "4052fd29bcd59435f258c0169cde1f29c3f22a618395f32cebf32e166bd3bf38"
|
93 |
+
},
|
94 |
+
"model_source": "hf",
|
95 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
96 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
97 |
+
"system_instruction": null,
|
98 |
+
"system_instruction_sha": null,
|
99 |
+
"chat_template": null,
|
100 |
+
"chat_template_sha": null,
|
101 |
+
"start_time": 780407.631651954,
|
102 |
+
"end_time": 787650.152180919,
|
103 |
+
"total_evaluation_time_seconds": "7242.520528964931"
|
104 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-23.877449.json
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_ppc_multiple_choice": {
|
4 |
+
"acc,none": 0.789,
|
5 |
+
"acc_stderr,none": 0.012909130321042095,
|
6 |
+
"acc_norm,none": 0.789,
|
7 |
+
"acc_norm_stderr,none": 0.012909130321042095,
|
8 |
+
"alias": "polish_ppc_multiple_choice"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_ppc_multiple_choice": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_ppc_multiple_choice": {
|
16 |
+
"task": "polish_ppc_multiple_choice",
|
17 |
+
"dataset_path": "sdadas/ppc",
|
18 |
+
"training_split": "train",
|
19 |
+
"validation_split": "validation",
|
20 |
+
"test_split": "test",
|
21 |
+
"doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy zdaniami A i B? Mo偶liwe odpowiedzi:\nA - znacz膮 dok艂adnie to samo\nB - maj膮 podobne znaczenie\nC - maj膮 r贸偶ne znaczenie\nPrawid艂owa odpowied藕:",
|
22 |
+
"doc_to_target": "{{label|int - 1}}",
|
23 |
+
"doc_to_choice": [
|
24 |
+
"A",
|
25 |
+
"B",
|
26 |
+
"C"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 5,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
}
|
43 |
+
],
|
44 |
+
"output_type": "multiple_choice",
|
45 |
+
"repeats": 1,
|
46 |
+
"should_decontaminate": true,
|
47 |
+
"doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
|
48 |
+
}
|
49 |
+
},
|
50 |
+
"versions": {
|
51 |
+
"polish_ppc_multiple_choice": "Yaml"
|
52 |
+
},
|
53 |
+
"n-shot": {
|
54 |
+
"polish_ppc_multiple_choice": 5
|
55 |
+
},
|
56 |
+
"higher_is_better": {
|
57 |
+
"polish_ppc_multiple_choice": {
|
58 |
+
"acc": true,
|
59 |
+
"acc_norm": true
|
60 |
+
}
|
61 |
+
},
|
62 |
+
"n-samples": {
|
63 |
+
"polish_ppc_multiple_choice": {
|
64 |
+
"original": 1000,
|
65 |
+
"effective": 1000
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"config": {
|
69 |
+
"model": "hf",
|
70 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
71 |
+
"batch_size": "1",
|
72 |
+
"batch_sizes": [],
|
73 |
+
"device": "cuda:0",
|
74 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_multiple_choice/",
|
75 |
+
"limit": null,
|
76 |
+
"bootstrap_iters": 100000,
|
77 |
+
"gen_kwargs": null,
|
78 |
+
"random_seed": 0,
|
79 |
+
"numpy_seed": 1234,
|
80 |
+
"torch_seed": 1234,
|
81 |
+
"fewshot_seed": 1234
|
82 |
+
},
|
83 |
+
"git_hash": "2132286",
|
84 |
+
"date": 1723381736.8326252,
|
85 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
86 |
+
"transformers_version": "4.43.1",
|
87 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
88 |
+
"task_hashes": {
|
89 |
+
"polish_ppc_multiple_choice": "c3554bdb1ae93597ea2150e4ff1a633019458db699b0cb1639d96dd3970b6939"
|
90 |
+
},
|
91 |
+
"model_source": "hf",
|
92 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
93 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
94 |
+
"system_instruction": null,
|
95 |
+
"system_instruction_sha": null,
|
96 |
+
"chat_template": null,
|
97 |
+
"chat_template_sha": null,
|
98 |
+
"start_time": 779444.242971983,
|
99 |
+
"end_time": 779719.526449868,
|
100 |
+
"total_evaluation_time_seconds": "275.2834778849501"
|
101 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-30-24.424865.json
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_ppc_regex": {
|
4 |
+
"exact_match,score-first": 0.793,
|
5 |
+
"exact_match_stderr,score-first": 0.01281855355784399,
|
6 |
+
"alias": "polish_ppc_regex"
|
7 |
+
}
|
8 |
+
},
|
9 |
+
"group_subtasks": {
|
10 |
+
"polish_ppc_regex": []
|
11 |
+
},
|
12 |
+
"configs": {
|
13 |
+
"polish_ppc_regex": {
|
14 |
+
"task": "polish_ppc_regex",
|
15 |
+
"dataset_path": "sdadas/ppc",
|
16 |
+
"training_split": "train",
|
17 |
+
"validation_split": "validation",
|
18 |
+
"test_split": "test",
|
19 |
+
"doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy zdaniami A i B? Mo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - znacz膮 dok艂adnie to samo\nC - maj膮 podobne znaczenie\nD - maj膮 r贸偶ne znaczenie\nPrawid艂owa odpowied藕:",
|
20 |
+
"doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(label|int)}}",
|
21 |
+
"description": "",
|
22 |
+
"target_delimiter": " ",
|
23 |
+
"fewshot_delimiter": "\n\n",
|
24 |
+
"num_fewshot": 5,
|
25 |
+
"metric_list": [
|
26 |
+
{
|
27 |
+
"metric": "exact_match",
|
28 |
+
"aggregation": "mean",
|
29 |
+
"higher_is_better": true
|
30 |
+
}
|
31 |
+
],
|
32 |
+
"output_type": "generate_until",
|
33 |
+
"generation_kwargs": {
|
34 |
+
"until": [
|
35 |
+
".",
|
36 |
+
","
|
37 |
+
],
|
38 |
+
"do_sample": false,
|
39 |
+
"temperature": 0.0,
|
40 |
+
"max_gen_toks": 50
|
41 |
+
},
|
42 |
+
"repeats": 1,
|
43 |
+
"filter_list": [
|
44 |
+
{
|
45 |
+
"name": "score-first",
|
46 |
+
"filter": [
|
47 |
+
{
|
48 |
+
"function": "regex",
|
49 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"function": "take_first"
|
53 |
+
}
|
54 |
+
]
|
55 |
+
}
|
56 |
+
],
|
57 |
+
"should_decontaminate": true,
|
58 |
+
"doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"versions": {
|
62 |
+
"polish_ppc_regex": "Yaml"
|
63 |
+
},
|
64 |
+
"n-shot": {
|
65 |
+
"polish_ppc_regex": 5
|
66 |
+
},
|
67 |
+
"higher_is_better": {
|
68 |
+
"polish_ppc_regex": {
|
69 |
+
"exact_match": true
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"n-samples": {
|
73 |
+
"polish_ppc_regex": {
|
74 |
+
"original": 1000,
|
75 |
+
"effective": 1000
|
76 |
+
}
|
77 |
+
},
|
78 |
+
"config": {
|
79 |
+
"model": "hf",
|
80 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
81 |
+
"batch_size": "1",
|
82 |
+
"batch_sizes": [],
|
83 |
+
"device": "cuda:0",
|
84 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_regex/",
|
85 |
+
"limit": null,
|
86 |
+
"bootstrap_iters": 100000,
|
87 |
+
"gen_kwargs": null,
|
88 |
+
"random_seed": 0,
|
89 |
+
"numpy_seed": 1234,
|
90 |
+
"torch_seed": 1234,
|
91 |
+
"fewshot_seed": 1234
|
92 |
+
},
|
93 |
+
"git_hash": "2132286",
|
94 |
+
"date": 1723381748.8771076,
|
95 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
96 |
+
"transformers_version": "4.43.1",
|
97 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
98 |
+
"task_hashes": {
|
99 |
+
"polish_ppc_regex": "a218e651c94f2f850a86a4c0b91c5b5a37007e52526c54eff802a95592defbe3"
|
100 |
+
},
|
101 |
+
"model_source": "hf",
|
102 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
103 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
104 |
+
"system_instruction": null,
|
105 |
+
"system_instruction_sha": null,
|
106 |
+
"chat_template": null,
|
107 |
+
"chat_template_sha": null,
|
108 |
+
"start_time": 2272911.893985835,
|
109 |
+
"end_time": 2274195.92427883,
|
110 |
+
"total_evaluation_time_seconds": "1284.030292995274"
|
111 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-48.485190.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_psc_multiple_choice": {
|
4 |
+
"acc,none": 0.9461966604823747,
|
5 |
+
"acc_stderr,none": 0.006875233780063374,
|
6 |
+
"f1,none": 0.9042904290429042,
|
7 |
+
"f1_stderr,none": "N/A",
|
8 |
+
"acc_norm,none": 0.9461966604823747,
|
9 |
+
"acc_norm_stderr,none": 0.006875233780063374,
|
10 |
+
"alias": "polish_psc_multiple_choice"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"group_subtasks": {
|
14 |
+
"polish_psc_multiple_choice": []
|
15 |
+
},
|
16 |
+
"configs": {
|
17 |
+
"polish_psc_multiple_choice": {
|
18 |
+
"task": "polish_psc_multiple_choice",
|
19 |
+
"dataset_path": "allegro/klej-psc",
|
20 |
+
"training_split": "train",
|
21 |
+
"test_split": "test",
|
22 |
+
"doc_to_text": "Tekst: \"{{extract_text}}\"\nPodsumowanie: \"{{summary_text}}\"\nPytanie: Czy podsumowanie dla podanego tekstu jest poprawne?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
|
23 |
+
"doc_to_target": "{{label|int}}",
|
24 |
+
"doc_to_choice": [
|
25 |
+
"Nie",
|
26 |
+
"Tak"
|
27 |
+
],
|
28 |
+
"description": "",
|
29 |
+
"target_delimiter": " ",
|
30 |
+
"fewshot_delimiter": "\n\n",
|
31 |
+
"num_fewshot": 5,
|
32 |
+
"metric_list": [
|
33 |
+
{
|
34 |
+
"metric": "acc",
|
35 |
+
"aggregation": "mean",
|
36 |
+
"higher_is_better": true
|
37 |
+
},
|
38 |
+
{
|
39 |
+
"metric": "acc_norm",
|
40 |
+
"aggregation": "mean",
|
41 |
+
"higher_is_better": true
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
45 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
46 |
+
"higher_is_better": true
|
47 |
+
}
|
48 |
+
],
|
49 |
+
"output_type": "multiple_choice",
|
50 |
+
"repeats": 1,
|
51 |
+
"should_decontaminate": true,
|
52 |
+
"doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
|
53 |
+
}
|
54 |
+
},
|
55 |
+
"versions": {
|
56 |
+
"polish_psc_multiple_choice": "Yaml"
|
57 |
+
},
|
58 |
+
"n-shot": {
|
59 |
+
"polish_psc_multiple_choice": 5
|
60 |
+
},
|
61 |
+
"higher_is_better": {
|
62 |
+
"polish_psc_multiple_choice": {
|
63 |
+
"acc": true,
|
64 |
+
"acc_norm": true,
|
65 |
+
"f1": true
|
66 |
+
}
|
67 |
+
},
|
68 |
+
"n-samples": {
|
69 |
+
"polish_psc_multiple_choice": {
|
70 |
+
"original": 1078,
|
71 |
+
"effective": 1078
|
72 |
+
}
|
73 |
+
},
|
74 |
+
"config": {
|
75 |
+
"model": "hf",
|
76 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
77 |
+
"batch_size": "1",
|
78 |
+
"batch_sizes": [],
|
79 |
+
"device": "cuda:0",
|
80 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_multiple_choice/",
|
81 |
+
"limit": null,
|
82 |
+
"bootstrap_iters": 100000,
|
83 |
+
"gen_kwargs": null,
|
84 |
+
"random_seed": 0,
|
85 |
+
"numpy_seed": 1234,
|
86 |
+
"torch_seed": 1234,
|
87 |
+
"fewshot_seed": 1234
|
88 |
+
},
|
89 |
+
"git_hash": "2132286",
|
90 |
+
"date": 1723381736.8328693,
|
91 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
92 |
+
"transformers_version": "4.43.1",
|
93 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
94 |
+
"task_hashes": {
|
95 |
+
"polish_psc_multiple_choice": "20f66e13606e4708007e9a49fc374f8348f8309b56413b4ea31956ce9f49c601"
|
96 |
+
},
|
97 |
+
"model_source": "hf",
|
98 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
99 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
100 |
+
"system_instruction": null,
|
101 |
+
"system_instruction_sha": null,
|
102 |
+
"chat_template": null,
|
103 |
+
"chat_template_sha": null,
|
104 |
+
"start_time": 779444.242721803,
|
105 |
+
"end_time": 780044.13483785,
|
106 |
+
"total_evaluation_time_seconds": "599.8921160469763"
|
107 |
+
}
|
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-28.998766.json
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"results": {
|
3 |
+
"polish_psc_regex": {
|
4 |
+
"exact_match,score-first": 0.8942486085343229,
|
5 |
+
"exact_match_stderr,score-first": 0.00937053376963659,
|
6 |
+
"f1,score-first": 0.9228687415426252,
|
7 |
+
"f1_stderr,score-first": "N/A",
|
8 |
+
"alias": "polish_psc_regex"
|
9 |
+
}
|
10 |
+
},
|
11 |
+
"group_subtasks": {
|
12 |
+
"polish_psc_regex": []
|
13 |
+
},
|
14 |
+
"configs": {
|
15 |
+
"polish_psc_regex": {
|
16 |
+
"task": "polish_psc_regex",
|
17 |
+
"dataset_path": "allegro/klej-psc",
|
18 |
+
"training_split": "train",
|
19 |
+
"test_split": "test",
|
20 |
+
"doc_to_text": "Fragment 1: \"{{extract_text}}\"\nFragment 2: \"{{summary_text}}\"\nPytanie: jaka jest zależność między fragmentami 1 i 2?\nMożliwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - dotyczą tego samego artykułu\nC - dotyczą różnych artykułów\nD - brak poprawnej odpowiedzi\nPrawidłowa odpowiedź:",
|
21 |
+
"doc_to_target": "{{{0: 'A', 1: 'C', 2: 'B', 3: 'D'}.get(label|int + 1)}}",
|
22 |
+
"description": "",
|
23 |
+
"target_delimiter": " ",
|
24 |
+
"fewshot_delimiter": "\n\n",
|
25 |
+
"num_fewshot": 5,
|
26 |
+
"metric_list": [
|
27 |
+
{
|
28 |
+
"metric": "exact_match",
|
29 |
+
"aggregation": "mean",
|
30 |
+
"higher_is_better": true
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
|
34 |
+
"aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
|
35 |
+
"higher_is_better": true
|
36 |
+
}
|
37 |
+
],
|
38 |
+
"output_type": "generate_until",
|
39 |
+
"generation_kwargs": {
|
40 |
+
"until": [
|
41 |
+
".",
|
42 |
+
","
|
43 |
+
],
|
44 |
+
"do_sample": false,
|
45 |
+
"temperature": 0.0,
|
46 |
+
"max_gen_toks": 50
|
47 |
+
},
|
48 |
+
"repeats": 1,
|
49 |
+
"filter_list": [
|
50 |
+
{
|
51 |
+
"name": "score-first",
|
52 |
+
"filter": [
|
53 |
+
{
|
54 |
+
"function": "regex",
|
55 |
+
"regex_pattern": "(\\b[ABCD]\\b)"
|
56 |
+
},
|
57 |
+
{
|
58 |
+
"function": "take_first"
|
59 |
+
}
|
60 |
+
]
|
61 |
+
}
|
62 |
+
],
|
63 |
+
"should_decontaminate": true,
|
64 |
+
"doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
|
65 |
+
}
|
66 |
+
},
|
67 |
+
"versions": {
|
68 |
+
"polish_psc_regex": "Yaml"
|
69 |
+
},
|
70 |
+
"n-shot": {
|
71 |
+
"polish_psc_regex": 5
|
72 |
+
},
|
73 |
+
"higher_is_better": {
|
74 |
+
"polish_psc_regex": {
|
75 |
+
"exact_match": true,
|
76 |
+
"f1": true
|
77 |
+
}
|
78 |
+
},
|
79 |
+
"n-samples": {
|
80 |
+
"polish_psc_regex": {
|
81 |
+
"original": 1078,
|
82 |
+
"effective": 1078
|
83 |
+
}
|
84 |
+
},
|
85 |
+
"config": {
|
86 |
+
"model": "hf",
|
87 |
+
"model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
|
88 |
+
"batch_size": "1",
|
89 |
+
"batch_sizes": [],
|
90 |
+
"device": "cuda:0",
|
91 |
+
"use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_regex/",
|
92 |
+
"limit": null,
|
93 |
+
"bootstrap_iters": 100000,
|
94 |
+
"gen_kwargs": null,
|
95 |
+
"random_seed": 0,
|
96 |
+
"numpy_seed": 1234,
|
97 |
+
"torch_seed": 1234,
|
98 |
+
"fewshot_seed": 1234
|
99 |
+
},
|
100 |
+
"git_hash": "2132286",
|
101 |
+
"date": 1723381748.6041322,
|
102 |
+
"pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt 
lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
|
103 |
+
"transformers_version": "4.43.1",
|
104 |
+
"upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
|
105 |
+
"task_hashes": {
|
106 |
+
"polish_psc_regex": "62e1c0c7b4494ec1f99bb0c7eeaad898f5b3e48f9263f8212e2a9759d5499045"
|
107 |
+
},
|
108 |
+
"model_source": "hf",
|
109 |
+
"model_name": "speakleash/Bielik-11B-v2.1-Instruct",
|
110 |
+
"model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
|
111 |
+
"system_instruction": null,
|
112 |
+
"system_instruction_sha": null,
|
113 |
+
"chat_template": null,
|
114 |
+
"chat_template_sha": null,
|
115 |
+
"start_time": 780407.631957345,
|
116 |
+
"end_time": 782716.499648762,
|
117 |
+
"total_evaluation_time_seconds": "2308.8676914171083"
|
118 |
+
}
|