djstrong committed on
Commit
02311bf
1 Parent(s): a2eeaa1
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. eval-results/.gitattributes +55 -0
  2. eval-results/.idea/open_pl_llm_leaderboard_results.iml +8 -0
  3. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-32-59.263304.json +118 -0
  4. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-19.394508.json +105 -0
  5. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-27-04.852309.json +118 -0
  6. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-12-58.277105.json +105 -0
  7. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-19.801525.json +106 -0
  8. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-37-36.637222.json +111 -0
  9. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_mc_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-12.455988.json +107 -0
  10. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-48.898331.json +109 -0
  11. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-56.871408.json +111 -0
  12. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-52.956955.json +119 -0
  13. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-01.731913.json +107 -0
  14. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-18.998768.json +118 -0
  15. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-26.656191.json +131 -0
  16. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_first_turn_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-21.291842.json +107 -0
  17. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-30.374052.json +105 -0
  18. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-15-22.967823.json +112 -0
  19. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_closed_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-25.200287.json +106 -0
  20. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-58-53.946082.json +106 -0
  21. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-23-40.740746.json +101 -0
  22. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_poquad_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-06-52.670471.json +104 -0
  23. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-02.213869.json +101 -0
  24. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-08.270834.json +111 -0
  25. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-11-55.449227.json +107 -0
  26. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-13.495944.json +118 -0
  27. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-25-19.492688.json +118 -0
  28. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-24-50.869505.json +105 -0
  29. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-20.849828.json +118 -0
  30. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-19-39.147509.json +105 -0
  31. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-53-51.017953.json +106 -0
  32. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-01-59.819478.json +111 -0
  33. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_mc_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-54.278792.json +107 -0
  34. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-23.654679.json +109 -0
  35. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-52.993123.json +111 -0
  36. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-33-25.750066.json +119 -0
  37. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-08.007826.json +107 -0
  38. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-54-27.674557.json +118 -0
  39. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_eq_bench_first_turn_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-22.563512.json +107 -0
  40. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-52.497622.json +105 -0
  41. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-18.271570.json +112 -0
  42. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_pes_1723381722/results_2024-08-27T17-50-52.063138.json +0 -0
  43. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_closed_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-23.588300.json +106 -0
  44. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-00.491423.json +106 -0
  45. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-02.859037.json +101 -0
  46. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_poquad_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-09-42.653951.json +104 -0
  47. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-23.877449.json +101 -0
  48. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-30-24.424865.json +111 -0
  49. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-48.485190.json +107 -0
  50. eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-28.998766.json +118 -0
eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
12
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
13
+ *.model filter=lfs diff=lfs merge=lfs -text
14
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
15
+ *.npy filter=lfs diff=lfs merge=lfs -text
16
+ *.npz filter=lfs diff=lfs merge=lfs -text
17
+ *.onnx filter=lfs diff=lfs merge=lfs -text
18
+ *.ot filter=lfs diff=lfs merge=lfs -text
19
+ *.parquet filter=lfs diff=lfs merge=lfs -text
20
+ *.pb filter=lfs diff=lfs merge=lfs -text
21
+ *.pickle filter=lfs diff=lfs merge=lfs -text
22
+ *.pkl filter=lfs diff=lfs merge=lfs -text
23
+ *.pt filter=lfs diff=lfs merge=lfs -text
24
+ *.pth filter=lfs diff=lfs merge=lfs -text
25
+ *.rar filter=lfs diff=lfs merge=lfs -text
26
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
27
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar filter=lfs diff=lfs merge=lfs -text
30
+ *.tflite filter=lfs diff=lfs merge=lfs -text
31
+ *.tgz filter=lfs diff=lfs merge=lfs -text
32
+ *.wasm filter=lfs diff=lfs merge=lfs -text
33
+ *.xz filter=lfs diff=lfs merge=lfs -text
34
+ *.zip filter=lfs diff=lfs merge=lfs -text
35
+ *.zst filter=lfs diff=lfs merge=lfs -text
36
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ # Audio files - uncompressed
38
+ *.pcm filter=lfs diff=lfs merge=lfs -text
39
+ *.sam filter=lfs diff=lfs merge=lfs -text
40
+ *.raw filter=lfs diff=lfs merge=lfs -text
41
+ # Audio files - compressed
42
+ *.aac filter=lfs diff=lfs merge=lfs -text
43
+ *.flac filter=lfs diff=lfs merge=lfs -text
44
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
45
+ *.ogg filter=lfs diff=lfs merge=lfs -text
46
+ *.wav filter=lfs diff=lfs merge=lfs -text
47
+ # Image files - uncompressed
48
+ *.bmp filter=lfs diff=lfs merge=lfs -text
49
+ *.gif filter=lfs diff=lfs merge=lfs -text
50
+ *.png filter=lfs diff=lfs merge=lfs -text
51
+ *.tiff filter=lfs diff=lfs merge=lfs -text
52
+ # Image files - compressed
53
+ *.jpg filter=lfs diff=lfs merge=lfs -text
54
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
55
+ *.webp filter=lfs diff=lfs merge=lfs -text
eval-results/.idea/open_pl_llm_leaderboard_results.iml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ </module>
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-32-59.263304.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_in": {
4
+ "exact_match,score-first": 0.7797783933518005,
5
+ "exact_match_stderr,score-first": 0.01543291377156506,
6
+ "alias": "polemo2_in"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polemo2_in": []
11
+ },
12
+ "configs": {
13
+ "polemo2_in": {
14
+ "task": "polemo2_in",
15
+ "group": [
16
+ "polemo2"
17
+ ],
18
+ "dataset_path": "allegro/klej-polemo2-in",
19
+ "training_split": "train",
20
+ "validation_split": "validation",
21
+ "test_split": "test",
22
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii. Możliwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawidłowa odpowiedź:",
23
+ "doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 0,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true,
33
+ "hf_evaluate": true
34
+ }
35
+ ],
36
+ "output_type": "generate_until",
37
+ "generation_kwargs": {
38
+ "until": [
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0,
44
+ "max_gen_toks": 50
45
+ },
46
+ "repeats": 1,
47
+ "filter_list": [
48
+ {
49
+ "name": "score-first",
50
+ "filter": [
51
+ {
52
+ "function": "regex",
53
+ "regex_pattern": "(\\b[ABCD]\\b)"
54
+ },
55
+ {
56
+ "function": "take_first"
57
+ }
58
+ ]
59
+ }
60
+ ],
61
+ "should_decontaminate": true,
62
+ "doc_to_decontamination_query": "{{sentence}}",
63
+ "metadata": {
64
+ "version": 1.0
65
+ }
66
+ }
67
+ },
68
+ "versions": {
69
+ "polemo2_in": 1.0
70
+ },
71
+ "n-shot": {
72
+ "polemo2_in": 0
73
+ },
74
+ "higher_is_better": {
75
+ "polemo2_in": {
76
+ "exact_match": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polemo2_in": {
81
+ "original": 722,
82
+ "effective": 722
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381747.6734786,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.98\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polemo2_in": "287c7460415884286befac7ba8422a32230ec65846799595a6fee727f2d037a5"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2341192.242787643,
116
+ "end_time": 2342631.868033792,
117
+ "total_evaluation_time_seconds": "1439.6252461490221"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-19.394508.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_in_multiple_choice": {
4
+ "acc,none": 0.7714681440443213,
5
+ "acc_stderr,none": 0.015637406997304655,
6
+ "acc_norm,none": 0.7742382271468145,
7
+ "acc_norm_stderr,none": 0.015570224561219015,
8
+ "alias": "polemo2_in_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polemo2_in_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polemo2_in_multiple_choice": {
16
+ "task": "polemo2_in_multiple_choice",
17
+ "group": [
18
+ "polemo2_mc"
19
+ ],
20
+ "dataset_path": "allegro/klej-polemo2-in",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "test_split": "test",
24
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
25
+ "doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
26
+ "doc_to_choice": [
27
+ "Neutralny",
28
+ "Negatywny",
29
+ "Pozytywny",
30
+ "Niejednoznaczny"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": true,
51
+ "doc_to_decontamination_query": "{{sentence}}"
52
+ }
53
+ },
54
+ "versions": {
55
+ "polemo2_in_multiple_choice": "Yaml"
56
+ },
57
+ "n-shot": {
58
+ "polemo2_in_multiple_choice": 0
59
+ },
60
+ "higher_is_better": {
61
+ "polemo2_in_multiple_choice": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "polemo2_in_multiple_choice": {
68
+ "original": 722,
69
+ "effective": 722
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": "cuda:0",
78
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_in_multiple_choice/",
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "2132286",
88
+ "date": 1723381748.8968034,
89
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
90
+ "transformers_version": "4.43.1",
91
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
92
+ "task_hashes": {
93
+ "polemo2_in_multiple_choice": "6cade7fdeb7a53de3a966bebb7fe941479487faada4badf2831b62d7bb426916"
94
+ },
95
+ "model_source": "hf",
96
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
97
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
98
+ "system_instruction": null,
99
+ "system_instruction_sha": null,
100
+ "chat_template": null,
101
+ "chat_template_sha": null,
102
+ "start_time": 2669732.988216114,
103
+ "end_time": 2670111.950610943,
104
+ "total_evaluation_time_seconds": "378.96239482890815"
105
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-27-04.852309.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_out": {
4
+ "exact_match,score-first": 0.7530364372469636,
5
+ "exact_match_stderr,score-first": 0.0194223142525205,
6
+ "alias": "polemo2_out"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polemo2_out": []
11
+ },
12
+ "configs": {
13
+ "polemo2_out": {
14
+ "task": "polemo2_out",
15
+ "group": [
16
+ "polemo2"
17
+ ],
18
+ "dataset_path": "allegro/klej-polemo2-out",
19
+ "training_split": "train",
20
+ "validation_split": "validation",
21
+ "test_split": "test",
22
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii. Możliwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawidłowa odpowiedź:",
23
+ "doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 0,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true,
33
+ "hf_evaluate": true
34
+ }
35
+ ],
36
+ "output_type": "generate_until",
37
+ "generation_kwargs": {
38
+ "until": [
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0,
44
+ "max_gen_toks": 50
45
+ },
46
+ "repeats": 1,
47
+ "filter_list": [
48
+ {
49
+ "name": "score-first",
50
+ "filter": [
51
+ {
52
+ "function": "regex",
53
+ "regex_pattern": "(\\b[ABCD]\\b)"
54
+ },
55
+ {
56
+ "function": "take_first"
57
+ }
58
+ ]
59
+ }
60
+ ],
61
+ "should_decontaminate": true,
62
+ "doc_to_decontamination_query": "{{sentence}}",
63
+ "metadata": {
64
+ "version": 1.0
65
+ }
66
+ }
67
+ },
68
+ "versions": {
69
+ "polemo2_out": 1.0
70
+ },
71
+ "n-shot": {
72
+ "polemo2_out": 0
73
+ },
74
+ "higher_is_better": {
75
+ "polemo2_out": {
76
+ "exact_match": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polemo2_out": {
81
+ "original": 494,
82
+ "effective": 494
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.629631,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polemo2_out": "bf931755699911cb191ed108ec01aa6c9695552185da1ccb8f6c40c22db028b6"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2577223.678777486,
116
+ "end_time": 2578308.421439366,
117
+ "total_evaluation_time_seconds": "1084.7426618798636"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-12-58.277105.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_out_multiple_choice": {
4
+ "acc,none": 0.742914979757085,
5
+ "acc_stderr,none": 0.019682691432000205,
6
+ "acc_norm,none": 0.7672064777327935,
7
+ "acc_norm_stderr,none": 0.019033476340855917,
8
+ "alias": "polemo2_out_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polemo2_out_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polemo2_out_multiple_choice": {
16
+ "task": "polemo2_out_multiple_choice",
17
+ "group": [
18
+ "polemo2_mc"
19
+ ],
20
+ "dataset_path": "allegro/klej-polemo2-out",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "test_split": "test",
24
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkreśl sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
25
+ "doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
26
+ "doc_to_choice": [
27
+ "Neutralny",
28
+ "Negatywny",
29
+ "Pozytywny",
30
+ "Niejednoznaczny"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": true,
51
+ "doc_to_decontamination_query": "{{sentence}}"
52
+ }
53
+ },
54
+ "versions": {
55
+ "polemo2_out_multiple_choice": "Yaml"
56
+ },
57
+ "n-shot": {
58
+ "polemo2_out_multiple_choice": 0
59
+ },
60
+ "higher_is_better": {
61
+ "polemo2_out_multiple_choice": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "polemo2_out_multiple_choice": {
68
+ "original": 494,
69
+ "effective": 494
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": "cuda:0",
78
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polemo2_out_multiple_choice/",
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "2132286",
88
+ "date": 1723381748.8963165,
89
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
90
+ "transformers_version": "4.43.1",
91
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
92
+ "task_hashes": {
93
+ "polemo2_out_multiple_choice": "63ec4fc12bc668a566b3f91378159707f11e63ac52ded120b65fa3dd6a1b9979"
94
+ },
95
+ "model_source": "hf",
96
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
97
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
98
+ "system_instruction": null,
99
+ "system_instruction_sha": null,
100
+ "chat_template": null,
101
+ "chat_template_sha": null,
102
+ "start_time": 2669732.987871353,
103
+ "end_time": 2669970.833388602,
104
+ "total_evaluation_time_seconds": "237.8455172488466"
105
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-19.801525.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_8tags_multiple_choice": {
4
+ "acc,none": 0.785224153705398,
5
+ "acc_stderr,none": 0.006211537927009462,
6
+ "acc_norm,none": 0.7829368709972553,
7
+ "acc_norm_stderr,none": 0.006235424129675317,
8
+ "alias": "polish_8tags_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_8tags_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_8tags_multiple_choice": {
16
+ "task": "polish_8tags_multiple_choice",
17
+ "dataset_path": "sdadas/8tags",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "fewshot_split": "train",
21
+ "doc_to_text": "Tytuł: \"{{sentence}}\"\nDo podanego tytułu przyporządkuj jedną najlepiej pasującą kategorię z podanych: Film, Historia, Jedzenie, Medycyna, Motoryzacja, Praca, Sport, Technologie.\nKategoria:",
22
+ "doc_to_target": "{{label|int}}",
23
+ "doc_to_choice": [
24
+ "Film",
25
+ "Historia",
26
+ "Jedzenie",
27
+ "Medycyna",
28
+ "Motoryzacja",
29
+ "Praca",
30
+ "Sport",
31
+ "Technologie"
32
+ ],
33
+ "description": "",
34
+ "target_delimiter": " ",
35
+ "fewshot_delimiter": "\n\n",
36
+ "num_fewshot": 0,
37
+ "metric_list": [
38
+ {
39
+ "metric": "acc",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "acc_norm",
45
+ "aggregation": "mean",
46
+ "higher_is_better": true
47
+ }
48
+ ],
49
+ "output_type": "multiple_choice",
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{sentence}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_8tags_multiple_choice": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_8tags_multiple_choice": 0
60
+ },
61
+ "higher_is_better": {
62
+ "polish_8tags_multiple_choice": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "polish_8tags_multiple_choice": {
69
+ "original": 4372,
70
+ "effective": 4372
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
76
+ "batch_size": "1",
77
+ "batch_sizes": [],
78
+ "device": "cuda:0",
79
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_multiple_choice/",
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "2132286",
89
+ "date": 1723381748.8961906,
90
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
91
+ "transformers_version": "4.43.1",
92
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
93
+ "task_hashes": {
94
+ "polish_8tags_multiple_choice": "97e61e52772af016579422421c750a76a73c5aa55b81bd957c03e5fe7ca43b9b"
95
+ },
96
+ "model_source": "hf",
97
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
98
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
99
+ "system_instruction": null,
100
+ "system_instruction_sha": null,
101
+ "chat_template": null,
102
+ "chat_template_sha": null,
103
+ "start_time": 2669732.988419174,
104
+ "end_time": 2671312.355425343,
105
+ "total_evaluation_time_seconds": "1579.3670061687008"
106
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-37-36.637222.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_8tags_regex": {
4
+ "exact_match,score-first": 0.7509149130832571,
5
+ "exact_match_stderr,score-first": 0.006541522277132546,
6
+ "alias": "polish_8tags_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_8tags_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_8tags_regex": {
14
+ "task": "polish_8tags_regex",
15
+ "dataset_path": "sdadas/8tags",
16
+ "training_split": "train",
17
+ "validation_split": "validation",
18
+ "test_split": "test",
19
+ "doc_to_text": "Tytuł: \"{{sentence}}\"\nPytanie: jaka kategoria najlepiej pasuje do podanego tytułu?\nMożliwe odpowiedzi:\nA - film\nB - historia\nC - jedzenie\nD - medycyna\nE - motoryzacja\nF - praca\nG - sport\nH - technologie\nPrawidłowa odpowiedź:",
20
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H'}.get(label)}}",
21
+ "description": "",
22
+ "target_delimiter": " ",
23
+ "fewshot_delimiter": "\n\n",
24
+ "num_fewshot": 0,
25
+ "metric_list": [
26
+ {
27
+ "metric": "exact_match",
28
+ "aggregation": "mean",
29
+ "higher_is_better": true
30
+ }
31
+ ],
32
+ "output_type": "generate_until",
33
+ "generation_kwargs": {
34
+ "until": [
35
+ ".",
36
+ ","
37
+ ],
38
+ "do_sample": false,
39
+ "temperature": 0.0,
40
+ "max_gen_toks": 50
41
+ },
42
+ "repeats": 1,
43
+ "filter_list": [
44
+ {
45
+ "name": "score-first",
46
+ "filter": [
47
+ {
48
+ "function": "regex",
49
+ "regex_pattern": "(\\b[ABCDEFGH]\\b)"
50
+ },
51
+ {
52
+ "function": "take_first"
53
+ }
54
+ ]
55
+ }
56
+ ],
57
+ "should_decontaminate": true,
58
+ "doc_to_decontamination_query": "{{sentence}}"
59
+ }
60
+ },
61
+ "versions": {
62
+ "polish_8tags_regex": "Yaml"
63
+ },
64
+ "n-shot": {
65
+ "polish_8tags_regex": 0
66
+ },
67
+ "higher_is_better": {
68
+ "polish_8tags_regex": {
69
+ "exact_match": true
70
+ }
71
+ },
72
+ "n-samples": {
73
+ "polish_8tags_regex": {
74
+ "original": 4372,
75
+ "effective": 4372
76
+ }
77
+ },
78
+ "config": {
79
+ "model": "hf",
80
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
81
+ "batch_size": "1",
82
+ "batch_sizes": [],
83
+ "device": "cuda:0",
84
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_8tags_regex/",
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": "2132286",
94
+ "date": 1723381748.6299846,
95
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
96
+ "transformers_version": "4.43.1",
97
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
98
+ "task_hashes": {
99
+ "polish_8tags_regex": "65692e40c28addb981c1eb0f272d45d3abf7b640c98f72a2acf9de48677c436e"
100
+ },
101
+ "model_source": "hf",
102
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
103
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
104
+ "system_instruction": null,
105
+ "system_instruction_sha": null,
106
+ "chat_template": null,
107
+ "chat_template_sha": null,
108
+ "start_time": 2577223.709303458,
109
+ "end_time": 2586140.204190591,
110
+ "total_evaluation_time_seconds": "8916.494887132663"
111
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_mc_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-12.455988.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_belebele_mc": {
4
+ "acc,none": 0.8755555555555555,
5
+ "acc_stderr,none": 0.011009047987347446,
6
+ "acc_norm,none": 0.8755555555555555,
7
+ "acc_norm_stderr,none": 0.011009047987347446,
8
+ "alias": "polish_belebele_mc"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_belebele_mc": []
13
+ },
14
+ "configs": {
15
+ "polish_belebele_mc": {
16
+ "task": "polish_belebele_mc",
17
+ "dataset_path": "facebook/belebele",
18
+ "test_split": "pol_Latn",
19
+ "fewshot_split": "pol_Latn",
20
+ "doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMożliwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawidłowa odpowiedź:",
21
+ "doc_to_target": "{{['1', '2', '3', '4'].index(correct_answer_num)}}",
22
+ "doc_to_choice": [
23
+ "A",
24
+ "B",
25
+ "C",
26
+ "D"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "fewshot_config": {
32
+ "sampler": "first_n"
33
+ },
34
+ "num_fewshot": 0,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": true,
50
+ "doc_to_decontamination_query": "{{question}}",
51
+ "metadata": {
52
+ "version": 0.0
53
+ }
54
+ }
55
+ },
56
+ "versions": {
57
+ "polish_belebele_mc": 0.0
58
+ },
59
+ "n-shot": {
60
+ "polish_belebele_mc": 0
61
+ },
62
+ "higher_is_better": {
63
+ "polish_belebele_mc": {
64
+ "acc": true,
65
+ "acc_norm": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_belebele_mc": {
70
+ "original": 900,
71
+ "effective": 900
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_mc/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381748.8964837,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_belebele_mc": "e575c2bfe123497ebf8be109e92bdcb84761ff1f7ebc06ee26942cfec0914841"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 2669732.988173293,
105
+ "end_time": 2669985.011977567,
106
+ "total_evaluation_time_seconds": "252.023804273922"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-48.898331.json ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_belebele_regex": {
4
+ "exact_match,score-first": 0.8622222222222222,
5
+ "exact_match_stderr,score-first": 0.011495274539524291,
6
+ "alias": "polish_belebele_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_belebele_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_belebele_regex": {
14
+ "task": "polish_belebele_regex",
15
+ "dataset_path": "facebook/belebele",
16
+ "test_split": "pol_Latn",
17
+ "doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMożliwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawidłowa odpowiedź:",
18
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(correct_answer_num|int - 1)}}",
19
+ "description": "",
20
+ "target_delimiter": " ",
21
+ "fewshot_delimiter": "\n\n",
22
+ "num_fewshot": 0,
23
+ "metric_list": [
24
+ {
25
+ "metric": "exact_match",
26
+ "aggregation": "mean",
27
+ "higher_is_better": true
28
+ }
29
+ ],
30
+ "output_type": "generate_until",
31
+ "generation_kwargs": {
32
+ "until": [
33
+ ".",
34
+ ","
35
+ ],
36
+ "do_sample": false,
37
+ "temperature": 0.0,
38
+ "max_gen_toks": 50
39
+ },
40
+ "repeats": 1,
41
+ "filter_list": [
42
+ {
43
+ "name": "score-first",
44
+ "filter": [
45
+ {
46
+ "function": "regex",
47
+ "regex_pattern": "(\\b[ABCD]\\b)"
48
+ },
49
+ {
50
+ "function": "take_first"
51
+ }
52
+ ]
53
+ }
54
+ ],
55
+ "should_decontaminate": true,
56
+ "doc_to_decontamination_query": "{{flores_passage}} {{question}} {{mc_answer1}} {{mc_answer2}} {{mc_answer3}} {{mc_answer4}}"
57
+ }
58
+ },
59
+ "versions": {
60
+ "polish_belebele_regex": "Yaml"
61
+ },
62
+ "n-shot": {
63
+ "polish_belebele_regex": 0
64
+ },
65
+ "higher_is_better": {
66
+ "polish_belebele_regex": {
67
+ "exact_match": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "polish_belebele_regex": {
72
+ "original": 900,
73
+ "effective": 900
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
79
+ "batch_size": "1",
80
+ "batch_sizes": [],
81
+ "device": "cuda:0",
82
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_belebele_regex/",
83
+ "limit": null,
84
+ "bootstrap_iters": 100000,
85
+ "gen_kwargs": null,
86
+ "random_seed": 0,
87
+ "numpy_seed": 1234,
88
+ "torch_seed": 1234,
89
+ "fewshot_seed": 1234
90
+ },
91
+ "git_hash": "2132286",
92
+ "date": 1723381748.62987,
93
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
94
+ "transformers_version": "4.43.1",
95
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
96
+ "task_hashes": {
97
+ "polish_belebele_regex": "27d3ad975a6f34d19e414caf684c5c66f47347a7e3f05c8420cd085a341dcbe7"
98
+ },
99
+ "model_source": "hf",
100
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
101
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
102
+ "system_instruction": null,
103
+ "system_instruction_sha": null,
104
+ "chat_template": null,
105
+ "chat_template_sha": null,
106
+ "start_time": 2577223.702730047,
107
+ "end_time": 2578892.467186104,
108
+ "total_evaluation_time_seconds": "1668.7644560569897"
109
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-56.871408.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_cbd_multiple_choice": {
4
+ "acc,none": 0.232,
5
+ "acc_stderr,none": 0.01335493745228157,
6
+ "f1,none": 0.19798644869999346,
7
+ "f1_stderr,none": "N/A",
8
+ "acc_norm,none": 0.254,
9
+ "acc_norm_stderr,none": 0.013772206565168544,
10
+ "alias": "polish_cbd_multiple_choice"
11
+ }
12
+ },
13
+ "group_subtasks": {
14
+ "polish_cbd_multiple_choice": []
15
+ },
16
+ "configs": {
17
+ "polish_cbd_multiple_choice": {
18
+ "task": "polish_cbd_multiple_choice",
19
+ "dataset_path": "ptaszynski/PolishCyberbullyingDataset",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "doc_to_text": "Wypowiedź: \"{{TEXT}}\"\nDo podanej wypowiedzi przyporządkuj jedną, najlepiej pasującą kategorię z podanych: nieszkodliwa, szyderstwo, obelga, insynuacja, groźba, molestowanie.\nKategoria:",
23
+ "doc_to_target": "{{{'szyderstwo': 1, 'obelga': 2, 'insynuacja': 3, 'grozba': 4, 'molestowanie': 5}.get(CATEGORIES, 0)}}",
24
+ "doc_to_choice": [
25
+ "nieszkodliwa",
26
+ "szyderstwo",
27
+ "obelga",
28
+ "insynuacja",
29
+ "groźba",
30
+ "molestowanie"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ },
47
+ {
48
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
49
+ "aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
50
+ "higher_is_better": true
51
+ }
52
+ ],
53
+ "output_type": "multiple_choice",
54
+ "repeats": 1,
55
+ "should_decontaminate": true,
56
+ "doc_to_decontamination_query": "{{TEXT}}"
57
+ }
58
+ },
59
+ "versions": {
60
+ "polish_cbd_multiple_choice": "Yaml"
61
+ },
62
+ "n-shot": {
63
+ "polish_cbd_multiple_choice": 0
64
+ },
65
+ "higher_is_better": {
66
+ "polish_cbd_multiple_choice": {
67
+ "acc": true,
68
+ "acc_norm": true,
69
+ "f1": true
70
+ }
71
+ },
72
+ "n-samples": {
73
+ "polish_cbd_multiple_choice": {
74
+ "original": 1000,
75
+ "effective": 1000
76
+ }
77
+ },
78
+ "config": {
79
+ "model": "hf",
80
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
81
+ "batch_size": "1",
82
+ "batch_sizes": [],
83
+ "device": "cuda:0",
84
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_multiple_choice/",
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": "2132286",
94
+ "date": 1723381748.896205,
95
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
96
+ "transformers_version": "4.43.1",
97
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
98
+ "task_hashes": {
99
+ "polish_cbd_multiple_choice": "56be7d38fd3346cc3ebad202ec8c0365fc6a9b7c3b60b7c527ec0cf16db2c0df"
100
+ },
101
+ "model_source": "hf",
102
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
103
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
104
+ "system_instruction": null,
105
+ "system_instruction_sha": null,
106
+ "chat_template": null,
107
+ "chat_template_sha": null,
108
+ "start_time": 2669732.987932883,
109
+ "end_time": 2670209.427766158,
110
+ "total_evaluation_time_seconds": "476.4398332745768"
111
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-52.956955.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_cbd_regex": {
4
+ "exact_match,score-first": 0.389,
5
+ "exact_match_stderr,score-first": 0.015424555647308496,
6
+ "f1,score-first": 0.24472868820966764,
7
+ "f1_stderr,score-first": "N/A",
8
+ "alias": "polish_cbd_regex"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_cbd_regex": []
13
+ },
14
+ "configs": {
15
+ "polish_cbd_regex": {
16
+ "task": "polish_cbd_regex",
17
+ "dataset_path": "ptaszynski/PolishCyberbullyingDataset",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "doc_to_text": "Wypowiedź: \"{{TEXT}}\"\nPytanie: Jaka kategoria najlepiej pasuje do podanej wypowiedzi?\nMożliwe odpowiedzi:\nA - nieszkodliwa\nB - szyderstwo\nC - obelga\nD - insynuacja\nE - groźba\nF - molestowanie\nPrawidłowa odpowiedź:",
21
+ "doc_to_target": "{{{'szyderstwo': 'B', 'obelga': 'C', 'insynuacja': 'D', 'grozba': 'E', 'molestowanie': 'F'}.get(CATEGORIES, 'A')}}",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 0,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
34
+ "aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ ".",
42
+ ",",
43
+ ";"
44
+ ],
45
+ "do_sample": false,
46
+ "temperature": 0.0,
47
+ "max_gen_toks": 50
48
+ },
49
+ "repeats": 1,
50
+ "filter_list": [
51
+ {
52
+ "name": "score-first",
53
+ "filter": [
54
+ {
55
+ "function": "regex",
56
+ "regex_pattern": "(\\b[ABCDEF]\\b)"
57
+ },
58
+ {
59
+ "function": "take_first"
60
+ }
61
+ ]
62
+ }
63
+ ],
64
+ "should_decontaminate": true,
65
+ "doc_to_decontamination_query": "{{TEXT}}"
66
+ }
67
+ },
68
+ "versions": {
69
+ "polish_cbd_regex": "Yaml"
70
+ },
71
+ "n-shot": {
72
+ "polish_cbd_regex": 0
73
+ },
74
+ "higher_is_better": {
75
+ "polish_cbd_regex": {
76
+ "exact_match": true,
77
+ "f1": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "polish_cbd_regex": {
82
+ "original": 1000,
83
+ "effective": 1000
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
89
+ "batch_size": "1",
90
+ "batch_sizes": [],
91
+ "device": "cuda:0",
92
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_cbd_regex/",
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "2132286",
102
+ "date": 1723381748.6293602,
103
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
104
+ "transformers_version": "4.43.1",
105
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
106
+ "task_hashes": {
107
+ "polish_cbd_regex": "d924b7270ebed050a040882627c2c9edeabe16833fce05d56d9afd2bdd04ab67"
108
+ },
109
+ "model_source": "hf",
110
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
111
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
112
+ "system_instruction": null,
113
+ "system_instruction_sha": null,
114
+ "chat_template": null,
115
+ "chat_template_sha": null,
116
+ "start_time": 2577223.672784959,
117
+ "end_time": 2579376.526070405,
118
+ "total_evaluation_time_seconds": "2152.8532854458317"
119
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-01.731913.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_dyk_multiple_choice": {
4
+ "acc,none": 0.8571428571428571,
5
+ "acc_stderr,none": 0.010913926579250558,
6
+ "f1,none": 0.6508313539192399,
7
+ "f1_stderr,none": "N/A",
8
+ "acc_norm,none": 0.8571428571428571,
9
+ "acc_norm_stderr,none": 0.010913926579250558,
10
+ "alias": "polish_dyk_multiple_choice"
11
+ }
12
+ },
13
+ "group_subtasks": {
14
+ "polish_dyk_multiple_choice": []
15
+ },
16
+ "configs": {
17
+ "polish_dyk_multiple_choice": {
18
+ "task": "polish_dyk_multiple_choice",
19
+ "dataset_path": "allegro/klej-dyk",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowiedź: \"{{answer}}\"\nPytanie: Czy sugerowana odpowiedź na zadane pytanie jest poprawna?\nOdpowiedz krótko \"Tak\" lub \"Nie\". Prawidłowa odpowiedź:",
23
+ "doc_to_target": "{{target|int}}",
24
+ "doc_to_choice": [
25
+ "Nie",
26
+ "Tak"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
45
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
46
+ "higher_is_better": true
47
+ }
48
+ ],
49
+ "output_type": "multiple_choice",
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{question}} {{answer}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_dyk_multiple_choice": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_dyk_multiple_choice": 0
60
+ },
61
+ "higher_is_better": {
62
+ "polish_dyk_multiple_choice": {
63
+ "acc": true,
64
+ "acc_norm": true,
65
+ "f1": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_dyk_multiple_choice": {
70
+ "original": 1029,
71
+ "effective": 1029
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_multiple_choice/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381748.8968518,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_dyk_multiple_choice": "614bab79a1ec3b666218bb65089e147f8ed82ac0a4d10ab14a57ffcc73379688"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 2669732.988483474,
105
+ "end_time": 2669974.28807118,
106
+ "total_evaluation_time_seconds": "241.29958770610392"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-18.998768.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_dyk_regex": {
4
+ "exact_match,score-first": 0.8532555879494655,
5
+ "exact_match_stderr,score-first": 0.01103630767704879,
6
+ "f1,score-first": 0.6591422121896162,
7
+ "f1_stderr,score-first": "N/A",
8
+ "alias": "polish_dyk_regex"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_dyk_regex": []
13
+ },
14
+ "configs": {
15
+ "polish_dyk_regex": {
16
+ "task": "polish_dyk_regex",
17
+ "dataset_path": "allegro/klej-dyk",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowiedź: \"{{answer}}\"\nCzy sugerowana odpowiedź na zadane pytanie jest poprawna? Możliwe opcje:\nA - brakuje sugerowanej odpowiedzi\nB - nie, sugerowana odpowiedź nie jest poprawna\nC - tak, sugerowana odpowiedź jest poprawna\nD - brakuje pytania\nPrawidłowa opcja:",
21
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(target|int + 1)}}",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 0,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
34
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ ".",
42
+ ","
43
+ ],
44
+ "do_sample": false,
45
+ "temperature": 0.0,
46
+ "max_gen_toks": 50
47
+ },
48
+ "repeats": 1,
49
+ "filter_list": [
50
+ {
51
+ "name": "score-first",
52
+ "filter": [
53
+ {
54
+ "function": "regex",
55
+ "regex_pattern": "(\\b[ABCD]\\b)"
56
+ },
57
+ {
58
+ "function": "take_first"
59
+ }
60
+ ]
61
+ }
62
+ ],
63
+ "should_decontaminate": true,
64
+ "doc_to_decontamination_query": "{{question}} {{answer}}"
65
+ }
66
+ },
67
+ "versions": {
68
+ "polish_dyk_regex": "Yaml"
69
+ },
70
+ "n-shot": {
71
+ "polish_dyk_regex": 0
72
+ },
73
+ "higher_is_better": {
74
+ "polish_dyk_regex": {
75
+ "exact_match": true,
76
+ "f1": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polish_dyk_regex": {
81
+ "original": 1029,
82
+ "effective": 1029
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_dyk_regex/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.6296444,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polish_dyk_regex": "2649d9ae76c76684ce97aa5028f8024f8eda160a26db26b547c0abf175fb2de1"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2577223.673779933,
116
+ "end_time": 2577542.567465127,
117
+ "total_evaluation_time_seconds": "318.89368519419804"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-26.656191.json ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_eq_bench": {
4
+ "first_eqbench,none": 48.33189616455903,
5
+ "first_eqbench_stderr,none": 2.573616088487117,
6
+ "first_percent_parseable,none": 100.0,
7
+ "first_percent_parseable_stderr,none": 0.0,
8
+ "revised_eqbench,none": 63.019918840007705,
9
+ "revised_eqbench_stderr,none": 2.3758111655038587,
10
+ "revised_percent_parseable,none": 99.41520467836257,
11
+ "revised_percent_parseable_stderr,none": 0.5847953216374274,
12
+ "average_eqbench,none": 55.67590750228339,
13
+ "average_eqbench_stderr,none": 2.1636830548973527,
14
+ "alias": "polish_eq_bench"
15
+ }
16
+ },
17
+ "group_subtasks": {
18
+ "polish_eq_bench": []
19
+ },
20
+ "configs": {
21
+ "polish_eq_bench": {
22
+ "task": "polish_eq_bench",
23
+ "dataset_path": "speakleash/EQ-Bench-PL",
24
+ "validation_split": "validation",
25
+ "doc_to_text": "{{prompt}}\nPierwsze oceny:\n",
26
+ "doc_to_target": "reference_answer_fullscale",
27
+ "process_results": "def score(docs, results):\n first_pass_answers, revised_answers = parse(results[0])\n reference = eval(docs[\"reference_answer\"])\n reference_fullscale = eval(docs[\"reference_answer_fullscale\"])\n first_pass_score = calculate_score(reference, first_pass_answers)\n revised_pass_score = calculate_score(reference_fullscale, revised_answers)\n scores= {'first_'+k: v for k, v in first_pass_score.items()}\n scores.update({'revised_'+k: v for k, v in revised_pass_score.items()})\n #add average score\n scores['average_eqbench'] = (scores['first_eqbench'] + scores['revised_eqbench']) / 2\n return scores\n",
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "first_eqbench",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "first_percent_parseable",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "revised_eqbench",
45
+ "aggregation": "mean",
46
+ "higher_is_better": true
47
+ },
48
+ {
49
+ "metric": "revised_percent_parseable",
50
+ "aggregation": "mean",
51
+ "higher_is_better": true
52
+ },
53
+ {
54
+ "metric": "average_eqbench",
55
+ "aggregation": "mean",
56
+ "higher_is_better": true
57
+ }
58
+ ],
59
+ "output_type": "generate_until",
60
+ "generation_kwargs": {
61
+ "max_gen_toks": 512,
62
+ "do_sample": false,
63
+ "temperature": 0.0,
64
+ "until": [
65
+ "</s>",
66
+ "[Koniec odpowiedzi]",
67
+ "Masz za zadanie"
68
+ ]
69
+ },
70
+ "repeats": 1,
71
+ "should_decontaminate": false,
72
+ "metadata": {
73
+ "version": 2.4
74
+ }
75
+ }
76
+ },
77
+ "versions": {
78
+ "polish_eq_bench": 2.4
79
+ },
80
+ "n-shot": {
81
+ "polish_eq_bench": 0
82
+ },
83
+ "higher_is_better": {
84
+ "polish_eq_bench": {
85
+ "first_eqbench": true,
86
+ "first_percent_parseable": true,
87
+ "revised_eqbench": true,
88
+ "revised_percent_parseable": true,
89
+ "average_eqbench": true
90
+ }
91
+ },
92
+ "n-samples": {
93
+ "polish_eq_bench": {
94
+ "original": 171,
95
+ "effective": 171
96
+ }
97
+ },
98
+ "config": {
99
+ "model": "hf",
100
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
101
+ "batch_size": "1",
102
+ "batch_sizes": [],
103
+ "device": "cuda:0",
104
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench/",
105
+ "limit": null,
106
+ "bootstrap_iters": 100000,
107
+ "gen_kwargs": null,
108
+ "random_seed": 0,
109
+ "numpy_seed": 1234,
110
+ "torch_seed": 1234,
111
+ "fewshot_seed": 1234
112
+ },
113
+ "git_hash": "2132286",
114
+ "date": 1723381748.6044407,
115
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
116
+ "transformers_version": "4.43.1",
117
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
118
+ "task_hashes": {
119
+ "polish_eq_bench": "18b3ee14b53fb2aaee4430e37609e64896598b0efa26dc7ecf4e483eece3a6b3"
120
+ },
121
+ "model_source": "hf",
122
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
123
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
124
+ "system_instruction": null,
125
+ "system_instruction_sha": null,
126
+ "chat_template": null,
127
+ "chat_template_sha": null,
128
+ "start_time": 780407.632073815,
129
+ "end_time": 783314.157407221,
130
+ "total_evaluation_time_seconds": "2906.5253334060544"
131
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_first_turn_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-21.291842.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_eq_bench_first_turn": {
4
+ "first_eqbench,none": 46.996619012784315,
5
+ "first_eqbench_stderr,none": 2.655038142048486,
6
+ "first_percent_parseable,none": 100.0,
7
+ "first_percent_parseable_stderr,none": 0.0,
8
+ "alias": "polish_eq_bench_first_turn"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_eq_bench_first_turn": []
13
+ },
14
+ "configs": {
15
+ "polish_eq_bench_first_turn": {
16
+ "task": "polish_eq_bench_first_turn",
17
+ "dataset_path": "speakleash/EQ-Bench-PL-first-turn",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "{{prompt}}\nOceny:\n",
20
+ "doc_to_target": "def doc_to_target(doc):\n reference = eval(doc[\"reference_answer\"])\n\n target = \"\"\n for i in range(1, 5):\n emotion = reference[f\"emotion{i}\"]\n emotion_score = reference[f\"emotion{i}_score\"]\n target += f\"{emotion}: {emotion_score}\\n\"\n target += \"\\n\"\n\n return target\n",
21
+ "process_results": "def score_first(docs, results):\n first_pass_answers = dict(list(re.findall(r'(\\w+(?: \\w+)*):\\s+(\\d+)', results[0]))[:4])\n reference = eval(docs[\"reference_answer\"])\n first_pass_score = calculate_score(reference, first_pass_answers)\n scores= {'first_'+k: v for k, v in first_pass_score.items()}\n return scores\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 0,
26
+ "metric_list": [
27
+ {
28
+ "metric": "first_eqbench",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "first_percent_parseable",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "max_gen_toks": 512,
41
+ "do_sample": false,
42
+ "temperature": 0.0,
43
+ "until": [
44
+ "</s>",
45
+ "[Koniec odpowiedzi]",
46
+ "Masz za zadanie"
47
+ ]
48
+ },
49
+ "repeats": 1,
50
+ "should_decontaminate": false,
51
+ "metadata": {
52
+ "version": 2.4
53
+ }
54
+ }
55
+ },
56
+ "versions": {
57
+ "polish_eq_bench_first_turn": 2.4
58
+ },
59
+ "n-shot": {
60
+ "polish_eq_bench_first_turn": 0
61
+ },
62
+ "higher_is_better": {
63
+ "polish_eq_bench_first_turn": {
64
+ "first_eqbench": true,
65
+ "first_percent_parseable": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_eq_bench_first_turn": {
70
+ "original": 171,
71
+ "effective": 171
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_eq_bench_first_turn/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381747.7569976,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_eq_bench_first_turn": "0e253a32b5915f6d9cff628bdffb1f234618238d116e0a34217ec48916ba0a49"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 2270424.400063744,
105
+ "end_time": 2272005.840581135,
106
+ "total_evaluation_time_seconds": "1581.4405173910782"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-30.374052.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_klej_ner_multiple_choice": {
4
+ "acc,none": 0.46987366375121475,
5
+ "acc_stderr,none": 0.011004317088597403,
6
+ "acc_norm,none": 0.5092322643343051,
7
+ "acc_norm_stderr,none": 0.011022467118497213,
8
+ "alias": "polish_klej_ner_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_klej_ner_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_klej_ner_multiple_choice": {
16
+ "task": "polish_klej_ner_multiple_choice",
17
+ "dataset_path": "allegro/klej-nkjp-ner",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "fewshot_split": "train",
22
+ "doc_to_text": "Zdanie: \"{{sentence}}\"\nJakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi: Brak nazwanej jednostki, Nazwa miejsca, Nazwa osoby, Nazwa organizacji, Czas, Nazwa geograficzna.\nRodzaj:",
23
+ "doc_to_target": "{{{'noEntity': 0, 'placeName': 1, 'persName': 2, 'orgName': 3, 'time': 4, 'geogName': 5}.get(target)}}",
24
+ "doc_to_choice": [
25
+ "Brak nazwanej jednostki",
26
+ "Nazwa miejsca",
27
+ "Nazwa osoby",
28
+ "Nazwa organizacji",
29
+ "Czas",
30
+ "Nazwa geograficzna"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": true,
51
+ "doc_to_decontamination_query": "{{sentence}}"
52
+ }
53
+ },
54
+ "versions": {
55
+ "polish_klej_ner_multiple_choice": "Yaml"
56
+ },
57
+ "n-shot": {
58
+ "polish_klej_ner_multiple_choice": 0
59
+ },
60
+ "higher_is_better": {
61
+ "polish_klej_ner_multiple_choice": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "polish_klej_ner_multiple_choice": {
68
+ "original": 2058,
69
+ "effective": 2058
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": "cuda:0",
78
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_multiple_choice/",
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "2132286",
88
+ "date": 1723381747.6734633,
89
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.98\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
90
+ "transformers_version": "4.43.1",
91
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
92
+ "task_hashes": {
93
+ "polish_klej_ner_multiple_choice": "09f6e903dc9fc050951f2c84685c285da57a8ddba1ff829ebb489df1ba737161"
94
+ },
95
+ "model_source": "hf",
96
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
97
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
98
+ "system_instruction": null,
99
+ "system_instruction_sha": null,
100
+ "chat_template": null,
101
+ "chat_template_sha": null,
102
+ "start_time": 2341192.242899954,
103
+ "end_time": 2341942.977311477,
104
+ "total_evaluation_time_seconds": "750.7344115232117"
105
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-15-22.967823.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_klej_ner_regex": {
4
+ "exact_match,score-first": 0.5388726919339164,
5
+ "exact_match_stderr,score-first": 0.010990978618734456,
6
+ "alias": "polish_klej_ner_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_klej_ner_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_klej_ner_regex": {
14
+ "task": "polish_klej_ner_regex",
15
+ "dataset_path": "allegro/klej-nkjp-ner",
16
+ "training_split": "train",
17
+ "validation_split": "validation",
18
+ "test_split": "test",
19
+ "doc_to_text": "Zdanie: \"{{sentence}}\"\nPytanie: Jakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi:\nA - Brak nazwanej jednostki\nB - Nazwa miejsca\nC - Nazwa osoby\nD - Nazwa organizacji\nE - Czas\nF - Nazwa geograficzna\nPrawid艂owa odpowied藕:",
20
+ "doc_to_target": "{{{'noEntity': 'A', 'placeName': 'B', 'persName': 'C', 'orgName': 'D', 'time': 'E', 'geogName': 'F'}.get(target)}}",
21
+ "description": "",
22
+ "target_delimiter": " ",
23
+ "fewshot_delimiter": "\n\n",
24
+ "num_fewshot": 0,
25
+ "metric_list": [
26
+ {
27
+ "metric": "exact_match",
28
+ "aggregation": "mean",
29
+ "higher_is_better": true
30
+ }
31
+ ],
32
+ "output_type": "generate_until",
33
+ "generation_kwargs": {
34
+ "until": [
35
+ ".",
36
+ ",",
37
+ ";"
38
+ ],
39
+ "do_sample": false,
40
+ "temperature": 0.0,
41
+ "max_gen_toks": 50
42
+ },
43
+ "repeats": 1,
44
+ "filter_list": [
45
+ {
46
+ "name": "score-first",
47
+ "filter": [
48
+ {
49
+ "function": "regex",
50
+ "regex_pattern": "(\\b[ABCDEF]\\b)"
51
+ },
52
+ {
53
+ "function": "take_first"
54
+ }
55
+ ]
56
+ }
57
+ ],
58
+ "should_decontaminate": true,
59
+ "doc_to_decontamination_query": "{{sentence}}"
60
+ }
61
+ },
62
+ "versions": {
63
+ "polish_klej_ner_regex": "Yaml"
64
+ },
65
+ "n-shot": {
66
+ "polish_klej_ner_regex": 0
67
+ },
68
+ "higher_is_better": {
69
+ "polish_klej_ner_regex": {
70
+ "exact_match": true
71
+ }
72
+ },
73
+ "n-samples": {
74
+ "polish_klej_ner_regex": {
75
+ "original": 2058,
76
+ "effective": 2058
77
+ }
78
+ },
79
+ "config": {
80
+ "model": "hf",
81
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
82
+ "batch_size": "1",
83
+ "batch_sizes": [],
84
+ "device": "cuda:0",
85
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_klej_ner_regex/",
86
+ "limit": null,
87
+ "bootstrap_iters": 100000,
88
+ "gen_kwargs": null,
89
+ "random_seed": 0,
90
+ "numpy_seed": 1234,
91
+ "torch_seed": 1234,
92
+ "fewshot_seed": 1234
93
+ },
94
+ "git_hash": "2132286",
95
+ "date": 1723381748.6294396,
96
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
97
+ "transformers_version": "4.43.1",
98
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
99
+ "task_hashes": {
100
+ "polish_klej_ner_regex": "73b98cc9f2e2b0a3c1be3efc063d3765b29cbbfcadaa6952ff0b16c2aeca4784"
101
+ },
102
+ "model_source": "hf",
103
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
104
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
105
+ "system_instruction": null,
106
+ "system_instruction_sha": null,
107
+ "chat_template": null,
108
+ "chat_template_sha": null,
109
+ "start_time": 2577223.695462814,
110
+ "end_time": 2581206.536121368,
111
+ "total_evaluation_time_seconds": "3982.840658553876"
112
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_closed_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-25.200287.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_polqa_closed_book": {
4
+ "exact_match,none": 0.09034267912772585,
5
+ "exact_match_stderr,none": 0.009242678703782942,
6
+ "levenshtein,none": 0.3904465212876428,
7
+ "levenshtein_stderr,none": "N/A",
8
+ "alias": "polish_polqa_closed_book"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_polqa_closed_book": []
13
+ },
14
+ "configs": {
15
+ "polish_polqa_closed_book": {
16
+ "task": "polish_polqa_closed_book",
17
+ "dataset_path": "ipipan/polqa",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs_closed(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and example['question'] not in used and (used.add(example['question']) or True)).map(_helper)\n",
22
+ "doc_to_text": "Pytanie: {{question}} \n Prawid艂owa odpowied藕:",
23
+ "doc_to_target": "answers",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 0,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true
33
+ },
34
+ {
35
+ "metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
36
+ "aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
37
+ "higher_is_better": true
38
+ }
39
+ ],
40
+ "output_type": "generate_until",
41
+ "generation_kwargs": {
42
+ "until": [
43
+ "\n",
44
+ "</s>"
45
+ ],
46
+ "do_sample": false,
47
+ "temperature": 0.0,
48
+ "max_gen_toks": 50
49
+ },
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{question}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_polqa_closed_book": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_polqa_closed_book": 0
60
+ },
61
+ "higher_is_better": {
62
+ "polish_polqa_closed_book": {
63
+ "exact_match": true,
64
+ "levenshtein": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "polish_polqa_closed_book": {
69
+ "original": 963,
70
+ "effective": 963
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
76
+ "batch_size": "1",
77
+ "batch_sizes": [],
78
+ "device": "cuda:0",
79
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_closed_book/",
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "2132286",
89
+ "date": 1723381747.7568066,
90
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
91
+ "transformers_version": "4.43.1",
92
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
93
+ "task_hashes": {
94
+ "polish_polqa_closed_book": "0c5507d60ba16e4142471afab656e1a5d591a0227302e05fdfbce5cc9f087079"
95
+ },
96
+ "model_source": "hf",
97
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
98
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
99
+ "system_instruction": null,
100
+ "system_instruction_sha": null,
101
+ "chat_template": null,
102
+ "chat_template_sha": null,
103
+ "start_time": 2270424.400038904,
104
+ "end_time": 2271229.748561826,
105
+ "total_evaluation_time_seconds": "805.3485229220241"
106
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-58-53.946082.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_polqa_open_book": {
4
+ "exact_match,none": 0.23734817813765183,
5
+ "exact_match_stderr,none": 0.005526353270874367,
6
+ "levenshtein,none": 0.5875506072874493,
7
+ "levenshtein_stderr,none": "N/A",
8
+ "alias": "polish_polqa_open_book"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_polqa_open_book": []
13
+ },
14
+ "configs": {
15
+ "polish_polqa_open_book": {
16
+ "task": "polish_polqa_open_book",
17
+ "dataset_path": "ipipan/polqa",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs_open(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
22
+ "doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Prawidłowa odpowiedź:",
23
+ "doc_to_target": "answers",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 0,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true
33
+ },
34
+ {
35
+ "metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
36
+ "aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
37
+ "higher_is_better": true
38
+ }
39
+ ],
40
+ "output_type": "generate_until",
41
+ "generation_kwargs": {
42
+ "until": [
43
+ "\n",
44
+ "</s>"
45
+ ],
46
+ "do_sample": false,
47
+ "temperature": 0.0,
48
+ "max_gen_toks": 50
49
+ },
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{passage_text}} {{question}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_polqa_open_book": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_polqa_open_book": 0
60
+ },
61
+ "higher_is_better": {
62
+ "polish_polqa_open_book": {
63
+ "exact_match": true,
64
+ "levenshtein": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "polish_polqa_open_book": {
69
+ "original": 5928,
70
+ "effective": 5928
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
76
+ "batch_size": "1",
77
+ "batch_sizes": [],
78
+ "device": "cuda:0",
79
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_open_book/",
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "2132286",
89
+ "date": 1723381747.756693,
90
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
91
+ "transformers_version": "4.43.1",
92
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
93
+ "task_hashes": {
94
+ "polish_polqa_open_book": "605bac12835fc2014ee7398cc41fe38316bab9148d464cc12f8034038e6dd744"
95
+ },
96
+ "model_source": "hf",
97
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
98
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
99
+ "system_instruction": null,
100
+ "system_instruction_sha": null,
101
+ "chat_template": null,
102
+ "chat_template_sha": null,
103
+ "start_time": 2270424.414197628,
104
+ "end_time": 2273418.492015983,
105
+ "total_evaluation_time_seconds": "2994.0778183550574"
106
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-23-40.740746.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_polqa_reranking_multiple_choice": {
4
+ "acc,none": 0.8055708552993941,
5
+ "acc_stderr,none": 0.0035107018856493904,
6
+ "acc_norm,none": 0.8055708552993941,
7
+ "acc_norm_stderr,none": 0.0035107018856493904,
8
+ "alias": "polish_polqa_reranking_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_polqa_reranking_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_polqa_reranking_multiple_choice": {
16
+ "task": "polish_polqa_reranking_multiple_choice",
17
+ "dataset_path": "ipipan/polqa",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs(dataset: datasets.Dataset):\n def _helper(doc):\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
22
+ "doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Czy kontekst jest relewantny dla pytania? \n Odpowiedz krótko \"Tak\" lub \"Nie\". Prawidłowa odpowiedź:",
23
+ "doc_to_target": "{{relevant|int}}",
24
+ "doc_to_choice": [
25
+ "Nie",
26
+ "Tak"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "{{passage_text}} {{question}}"
48
+ }
49
+ },
50
+ "versions": {
51
+ "polish_polqa_reranking_multiple_choice": "Yaml"
52
+ },
53
+ "n-shot": {
54
+ "polish_polqa_reranking_multiple_choice": 0
55
+ },
56
+ "higher_is_better": {
57
+ "polish_polqa_reranking_multiple_choice": {
58
+ "acc": true,
59
+ "acc_norm": true
60
+ }
61
+ },
62
+ "n-samples": {
63
+ "polish_polqa_reranking_multiple_choice": {
64
+ "original": 12709,
65
+ "effective": 12709
66
+ }
67
+ },
68
+ "config": {
69
+ "model": "hf",
70
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
71
+ "batch_size": "1",
72
+ "batch_sizes": [],
73
+ "device": "cuda:0",
74
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_polqa_reranking_multiple_choice/",
75
+ "limit": null,
76
+ "bootstrap_iters": 100000,
77
+ "gen_kwargs": null,
78
+ "random_seed": 0,
79
+ "numpy_seed": 1234,
80
+ "torch_seed": 1234,
81
+ "fewshot_seed": 1234
82
+ },
83
+ "git_hash": "2132286",
84
+ "date": 1723381747.6733267,
85
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.98\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
86
+ "transformers_version": "4.43.1",
87
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
88
+ "task_hashes": {
89
+ "polish_polqa_reranking_multiple_choice": "284e872060f899232535470a606d94a217d950995b140caeb313a8887ea3f0b4"
90
+ },
91
+ "model_source": "hf",
92
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
93
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
94
+ "system_instruction": null,
95
+ "system_instruction_sha": null,
96
+ "chat_template": null,
97
+ "chat_template_sha": null,
98
+ "start_time": 2341192.242885584,
99
+ "end_time": 2342073.337279379,
100
+ "total_evaluation_time_seconds": "881.0943937948905"
101
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_poquad_open_book_1723381723/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T16-06-52.670471.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_poquad_open_book": {
4
+ "exact_match,none": 0.0,
5
+ "exact_match_stderr,none": 0.0,
6
+ "levenshtein,none": 0.18771686328938236,
7
+ "levenshtein_stderr,none": "N/A",
8
+ "alias": "polish_poquad_open_book"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_poquad_open_book": []
13
+ },
14
+ "configs": {
15
+ "polish_poquad_open_book": {
16
+ "task": "polish_poquad_open_book",
17
+ "dataset_path": "clarin-pl/poquad",
18
+ "training_split": "train",
19
+ "test_split": "validation",
20
+ "doc_to_text": "Tytuł: {{title}} \n Kontekst: {{context}} \n Pytanie: {{question}} \n Prawidłowa odpowiedź (krótki cytat z Kontekstu):",
21
+ "doc_to_target": "def doc_to_target(doc):\n answer_list = doc[\"answers\"][\"text\"]\n if len(answer_list) > 0:\n answer = answer_list[0]\n else:\n answer = \"bez odpowiedzi\"\n return \" \" + answer\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 0,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def levenshtein(predictions, references):\n _prediction = predictions[0].lower().lstrip()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('.? ?(</s>)* ?$', '', _prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower().lstrip())\n if ld < len(reference)/2:\n return 1\n return 0\n",
34
+ "aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ "\n",
42
+ "</s>"
43
+ ],
44
+ "do_sample": false,
45
+ "temperature": 0.0,
46
+ "max_gen_toks": 50
47
+ },
48
+ "repeats": 1,
49
+ "should_decontaminate": true,
50
+ "doc_to_decontamination_query": "{{context}} {{question}}"
51
+ }
52
+ },
53
+ "versions": {
54
+ "polish_poquad_open_book": "Yaml"
55
+ },
56
+ "n-shot": {
57
+ "polish_poquad_open_book": 0
58
+ },
59
+ "higher_is_better": {
60
+ "polish_poquad_open_book": {
61
+ "exact_match": true,
62
+ "levenshtein": true
63
+ }
64
+ },
65
+ "n-samples": {
66
+ "polish_poquad_open_book": {
67
+ "original": 5764,
68
+ "effective": 5764
69
+ }
70
+ },
71
+ "config": {
72
+ "model": "hf",
73
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
74
+ "batch_size": "1",
75
+ "batch_sizes": [],
76
+ "device": "cuda:0",
77
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_poquad_open_book/",
78
+ "limit": null,
79
+ "bootstrap_iters": 100000,
80
+ "gen_kwargs": null,
81
+ "random_seed": 0,
82
+ "numpy_seed": 1234,
83
+ "torch_seed": 1234,
84
+ "fewshot_seed": 1234
85
+ },
86
+ "git_hash": "2132286",
87
+ "date": 1723381747.7568138,
88
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
89
+ "transformers_version": "4.43.1",
90
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
91
+ "task_hashes": {
92
+ "polish_poquad_open_book": "19564a782a5615c456e7084c72f26ca5fb6bc601f54dc4bfc5dec174c4e06e50"
93
+ },
94
+ "model_source": "hf",
95
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
96
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
97
+ "system_instruction": null,
98
+ "system_instruction_sha": null,
99
+ "chat_template": null,
100
+ "chat_template_sha": null,
101
+ "start_time": 2270424.41309478,
102
+ "end_time": 2273897.216942648,
103
+ "total_evaluation_time_seconds": "3472.8038478679955"
104
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-02.213869.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_ppc_multiple_choice": {
4
+ "acc,none": 0.74,
5
+ "acc_stderr,none": 0.013877773329774164,
6
+ "acc_norm,none": 0.74,
7
+ "acc_norm_stderr,none": 0.013877773329774164,
8
+ "alias": "polish_ppc_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_ppc_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_ppc_multiple_choice": {
16
+ "task": "polish_ppc_multiple_choice",
17
+ "dataset_path": "sdadas/ppc",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zależność między zdaniami A i B? Możliwe odpowiedzi:\nA - znaczą dokładnie to samo\nB - mają podobne znaczenie\nC - mają różne znaczenie\nPrawidłowa odpowiedź:",
22
+ "doc_to_target": "{{label|int - 1}}",
23
+ "doc_to_choice": [
24
+ "A",
25
+ "B",
26
+ "C"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
48
+ }
49
+ },
50
+ "versions": {
51
+ "polish_ppc_multiple_choice": "Yaml"
52
+ },
53
+ "n-shot": {
54
+ "polish_ppc_multiple_choice": 0
55
+ },
56
+ "higher_is_better": {
57
+ "polish_ppc_multiple_choice": {
58
+ "acc": true,
59
+ "acc_norm": true
60
+ }
61
+ },
62
+ "n-samples": {
63
+ "polish_ppc_multiple_choice": {
64
+ "original": 1000,
65
+ "effective": 1000
66
+ }
67
+ },
68
+ "config": {
69
+ "model": "hf",
70
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
71
+ "batch_size": "1",
72
+ "batch_sizes": [],
73
+ "device": "cuda:0",
74
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_multiple_choice/",
75
+ "limit": null,
76
+ "bootstrap_iters": 100000,
77
+ "gen_kwargs": null,
78
+ "random_seed": 0,
79
+ "numpy_seed": 1234,
80
+ "torch_seed": 1234,
81
+ "fewshot_seed": 1234
82
+ },
83
+ "git_hash": "2132286",
84
+ "date": 1723381748.8964221,
85
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
86
+ "transformers_version": "4.43.1",
87
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
88
+ "task_hashes": {
89
+ "polish_ppc_multiple_choice": "8747fd84df6316b9938ebd5eafb0d8cedcf82a0c184659a0eea7878c4dacdafa"
90
+ },
91
+ "model_source": "hf",
92
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
93
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
94
+ "system_instruction": null,
95
+ "system_instruction_sha": null,
96
+ "chat_template": null,
97
+ "chat_template_sha": null,
98
+ "start_time": 2669732.988217134,
99
+ "end_time": 2669974.769807927,
100
+ "total_evaluation_time_seconds": "241.78159079281613"
101
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-44-08.270834.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_ppc_regex": {
4
+ "exact_match,score-first": 0.703,
5
+ "exact_match_stderr,score-first": 0.014456832294801106,
6
+ "alias": "polish_ppc_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_ppc_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_ppc_regex": {
14
+ "task": "polish_ppc_regex",
15
+ "dataset_path": "sdadas/ppc",
16
+ "training_split": "train",
17
+ "validation_split": "validation",
18
+ "test_split": "test",
19
+ "doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy zdaniami A i B? Mo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - znacz膮 dok艂adnie to samo\nC - maj膮 podobne znaczenie\nD - maj膮 r贸偶ne znaczenie\nPrawid艂owa odpowied藕:",
20
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(label|int)}}",
21
+ "description": "",
22
+ "target_delimiter": " ",
23
+ "fewshot_delimiter": "\n\n",
24
+ "num_fewshot": 0,
25
+ "metric_list": [
26
+ {
27
+ "metric": "exact_match",
28
+ "aggregation": "mean",
29
+ "higher_is_better": true
30
+ }
31
+ ],
32
+ "output_type": "generate_until",
33
+ "generation_kwargs": {
34
+ "until": [
35
+ ".",
36
+ ","
37
+ ],
38
+ "do_sample": false,
39
+ "temperature": 0.0,
40
+ "max_gen_toks": 50
41
+ },
42
+ "repeats": 1,
43
+ "filter_list": [
44
+ {
45
+ "name": "score-first",
46
+ "filter": [
47
+ {
48
+ "function": "regex",
49
+ "regex_pattern": "(\\b[ABCD]\\b)"
50
+ },
51
+ {
52
+ "function": "take_first"
53
+ }
54
+ ]
55
+ }
56
+ ],
57
+ "should_decontaminate": true,
58
+ "doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
59
+ }
60
+ },
61
+ "versions": {
62
+ "polish_ppc_regex": "Yaml"
63
+ },
64
+ "n-shot": {
65
+ "polish_ppc_regex": 0
66
+ },
67
+ "higher_is_better": {
68
+ "polish_ppc_regex": {
69
+ "exact_match": true
70
+ }
71
+ },
72
+ "n-samples": {
73
+ "polish_ppc_regex": {
74
+ "original": 1000,
75
+ "effective": 1000
76
+ }
77
+ },
78
+ "config": {
79
+ "model": "hf",
80
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
81
+ "batch_size": "1",
82
+ "batch_sizes": [],
83
+ "device": "cuda:0",
84
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_ppc_regex/",
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": "2132286",
94
+ "date": 1723381748.6299686,
95
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
96
+ "transformers_version": "4.43.1",
97
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
98
+ "task_hashes": {
99
+ "polish_ppc_regex": "55067cab325af5b68fb5e678581d4bfe4d45a600032d75381bf251f3aa9d8c91"
100
+ },
101
+ "model_source": "hf",
102
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
103
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
104
+ "system_instruction": null,
105
+ "system_instruction_sha": null,
106
+ "chat_template": null,
107
+ "chat_template_sha": null,
108
+ "start_time": 2577223.665428845,
109
+ "end_time": 2579331.839649978,
110
+ "total_evaluation_time_seconds": "2108.174221132882"
111
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-11-55.449227.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_psc_multiple_choice": {
4
+ "acc,none": 0.9656771799628943,
5
+ "acc_stderr,none": 0.005547529422575579,
6
+ "f1,none": 0.9428129829984544,
7
+ "f1_stderr,none": "N/A",
8
+ "acc_norm,none": 0.9656771799628943,
9
+ "acc_norm_stderr,none": 0.005547529422575579,
10
+ "alias": "polish_psc_multiple_choice"
11
+ }
12
+ },
13
+ "group_subtasks": {
14
+ "polish_psc_multiple_choice": []
15
+ },
16
+ "configs": {
17
+ "polish_psc_multiple_choice": {
18
+ "task": "polish_psc_multiple_choice",
19
+ "dataset_path": "allegro/klej-psc",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "doc_to_text": "Tekst: \"{{extract_text}}\"\nPodsumowanie: \"{{summary_text}}\"\nPytanie: Czy podsumowanie dla podanego tekstu jest poprawne?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
23
+ "doc_to_target": "{{label|int}}",
24
+ "doc_to_choice": [
25
+ "Nie",
26
+ "Tak"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
45
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
46
+ "higher_is_better": true
47
+ }
48
+ ],
49
+ "output_type": "multiple_choice",
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_psc_multiple_choice": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_psc_multiple_choice": 0
60
+ },
61
+ "higher_is_better": {
62
+ "polish_psc_multiple_choice": {
63
+ "acc": true,
64
+ "acc_norm": true,
65
+ "f1": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_psc_multiple_choice": {
70
+ "original": 1078,
71
+ "effective": 1078
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_multiple_choice/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381748.8966577,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_psc_multiple_choice": "53dfd060110a8ece3c4bf785c368bbf87ac6ae87f1d76781f1ffac90beb47879"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 2669732.988093483,
105
+ "end_time": 2669908.004738389,
106
+ "total_evaluation_time_seconds": "175.01664490625262"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-13.495944.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_psc_regex": {
4
+ "exact_match,score-first": 0.7588126159554731,
5
+ "exact_match_stderr,score-first": 0.01303577072183474,
6
+ "f1,score-first": 0.8123167155425219,
7
+ "f1_stderr,score-first": "N/A",
8
+ "alias": "polish_psc_regex"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_psc_regex": []
13
+ },
14
+ "configs": {
15
+ "polish_psc_regex": {
16
+ "task": "polish_psc_regex",
17
+ "dataset_path": "allegro/klej-psc",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "doc_to_text": "Fragment 1: \"{{extract_text}}\"\nFragment 2: \"{{summary_text}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy fragmentami 1 i 2?\nMo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - dotycz膮 tego samego artyku艂u\nC - dotycz膮 r贸偶nych artyku艂贸w\nD - brak poprawnej odpowiedzi\nPrawid艂owa odpowied藕:",
21
+ "doc_to_target": "{{{0: 'A', 1: 'C', 2: 'B', 3: 'D'}.get(label|int + 1)}}",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 0,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
34
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ ".",
42
+ ","
43
+ ],
44
+ "do_sample": false,
45
+ "temperature": 0.0,
46
+ "max_gen_toks": 50
47
+ },
48
+ "repeats": 1,
49
+ "filter_list": [
50
+ {
51
+ "name": "score-first",
52
+ "filter": [
53
+ {
54
+ "function": "regex",
55
+ "regex_pattern": "(\\b[ABCD]\\b)"
56
+ },
57
+ {
58
+ "function": "take_first"
59
+ }
60
+ ]
61
+ }
62
+ ],
63
+ "should_decontaminate": true,
64
+ "doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
65
+ }
66
+ },
67
+ "versions": {
68
+ "polish_psc_regex": "Yaml"
69
+ },
70
+ "n-shot": {
71
+ "polish_psc_regex": 0
72
+ },
73
+ "higher_is_better": {
74
+ "polish_psc_regex": {
75
+ "exact_match": true,
76
+ "f1": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polish_psc_regex": {
81
+ "original": 1078,
82
+ "effective": 1078
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-0_polish_psc_regex/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.6296773,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polish_psc_regex": "0065cab6bd75fa16d7b0b782973d6452c76ac6f272bfcb4e037049c1d2a420a5"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2577223.665712506,
116
+ "end_time": 2579697.064617748,
117
+ "total_evaluation_time_seconds": "2473.398905241862"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-25-19.492688.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_in": {
4
+ "exact_match,score-first": 0.8559556786703602,
5
+ "exact_match_stderr,score-first": 0.01307693837899346,
6
+ "alias": "polemo2_in"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polemo2_in": []
11
+ },
12
+ "configs": {
13
+ "polemo2_in": {
14
+ "task": "polemo2_in",
15
+ "group": [
16
+ "polemo2"
17
+ ],
18
+ "dataset_path": "allegro/klej-polemo2-in",
19
+ "training_split": "train",
20
+ "validation_split": "validation",
21
+ "test_split": "test",
22
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii. Mo偶liwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawid艂owa odpowied藕:",
23
+ "doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 5,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true,
33
+ "hf_evaluate": true
34
+ }
35
+ ],
36
+ "output_type": "generate_until",
37
+ "generation_kwargs": {
38
+ "until": [
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0,
44
+ "max_gen_toks": 50
45
+ },
46
+ "repeats": 1,
47
+ "filter_list": [
48
+ {
49
+ "name": "score-first",
50
+ "filter": [
51
+ {
52
+ "function": "regex",
53
+ "regex_pattern": "(\\b[ABCD]\\b)"
54
+ },
55
+ {
56
+ "function": "take_first"
57
+ }
58
+ ]
59
+ }
60
+ ],
61
+ "should_decontaminate": true,
62
+ "doc_to_decontamination_query": "{{sentence}}",
63
+ "metadata": {
64
+ "version": 1.0
65
+ }
66
+ }
67
+ },
68
+ "versions": {
69
+ "polemo2_in": 1.0
70
+ },
71
+ "n-shot": {
72
+ "polemo2_in": 5
73
+ },
74
+ "higher_is_better": {
75
+ "polemo2_in": {
76
+ "exact_match": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polemo2_in": {
81
+ "original": 722,
82
+ "effective": 722
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.8777437,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polemo2_in": "311cf476a99939086a838a34ed5ebef9530cbeea1609d0919757a7dd473b40d1"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2272911.893989815,
116
+ "end_time": 2273890.992245757,
117
+ "total_evaluation_time_seconds": "979.09825594211"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-24-50.869505.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_in_multiple_choice": {
4
+ "acc,none": 0.871191135734072,
5
+ "acc_stderr,none": 0.012475615091746169,
6
+ "acc_norm,none": 0.8725761772853186,
7
+ "acc_norm_stderr,none": 0.012418220256560223,
8
+ "alias": "polemo2_in_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polemo2_in_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polemo2_in_multiple_choice": {
16
+ "task": "polemo2_in_multiple_choice",
17
+ "group": [
18
+ "polemo2_mc"
19
+ ],
20
+ "dataset_path": "allegro/klej-polemo2-in",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "test_split": "test",
24
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
25
+ "doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
26
+ "doc_to_choice": [
27
+ "Neutralny",
28
+ "Negatywny",
29
+ "Pozytywny",
30
+ "Niejednoznaczny"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 5,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": true,
51
+ "doc_to_decontamination_query": "{{sentence}}"
52
+ }
53
+ },
54
+ "versions": {
55
+ "polemo2_in_multiple_choice": "Yaml"
56
+ },
57
+ "n-shot": {
58
+ "polemo2_in_multiple_choice": 5
59
+ },
60
+ "higher_is_better": {
61
+ "polemo2_in_multiple_choice": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "polemo2_in_multiple_choice": {
68
+ "original": 722,
69
+ "effective": 722
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": "cuda:0",
78
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_in_multiple_choice/",
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "2132286",
88
+ "date": 1723381734.9804056,
89
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
90
+ "transformers_version": "4.43.1",
91
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
92
+ "task_hashes": {
93
+ "polemo2_in_multiple_choice": "721bf5bd2111822d757513497aaacb13ff7172a1c79e8d903e554ae7db248670"
94
+ },
95
+ "model_source": "hf",
96
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
97
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
98
+ "system_instruction": null,
99
+ "system_instruction_sha": null,
100
+ "chat_template": null,
101
+ "chat_template_sha": null,
102
+ "start_time": 2270387.730317351,
103
+ "end_time": 2271351.381778121,
104
+ "total_evaluation_time_seconds": "963.6514607700519"
105
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-22-20.849828.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_out": {
4
+ "exact_match,score-first": 0.7550607287449392,
5
+ "exact_match_stderr,score-first": 0.01936853142177567,
6
+ "alias": "polemo2_out"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polemo2_out": []
11
+ },
12
+ "configs": {
13
+ "polemo2_out": {
14
+ "task": "polemo2_out",
15
+ "group": [
16
+ "polemo2"
17
+ ],
18
+ "dataset_path": "allegro/klej-polemo2-out",
19
+ "training_split": "train",
20
+ "validation_split": "validation",
21
+ "test_split": "test",
22
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii. Mo偶liwe odpowiedzi:\nA - Neutralny\nB - Negatywny\nC - Pozytywny\nD - Niejednoznaczny\nPrawid艂owa odpowied藕:",
23
+ "doc_to_target": "{{{'__label__meta_zero': 'A', '__label__meta_minus_m': 'B', '__label__meta_plus_m': 'C', '__label__meta_amb': 'D'}.get(target)}}",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 5,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true,
33
+ "hf_evaluate": true
34
+ }
35
+ ],
36
+ "output_type": "generate_until",
37
+ "generation_kwargs": {
38
+ "until": [
39
+ ".",
40
+ ","
41
+ ],
42
+ "do_sample": false,
43
+ "temperature": 0.0,
44
+ "max_gen_toks": 50
45
+ },
46
+ "repeats": 1,
47
+ "filter_list": [
48
+ {
49
+ "name": "score-first",
50
+ "filter": [
51
+ {
52
+ "function": "regex",
53
+ "regex_pattern": "(\\b[ABCD]\\b)"
54
+ },
55
+ {
56
+ "function": "take_first"
57
+ }
58
+ ]
59
+ }
60
+ ],
61
+ "should_decontaminate": true,
62
+ "doc_to_decontamination_query": "{{sentence}}",
63
+ "metadata": {
64
+ "version": 1.0
65
+ }
66
+ }
67
+ },
68
+ "versions": {
69
+ "polemo2_out": 1.0
70
+ },
71
+ "n-shot": {
72
+ "polemo2_out": 5
73
+ },
74
+ "higher_is_better": {
75
+ "polemo2_out": {
76
+ "exact_match": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polemo2_out": {
81
+ "original": 494,
82
+ "effective": 494
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.8777425,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polemo2_out": "f4c38529c6c2d9871f34d315f5afa8b183cba25e628c029de45011230d53fac1"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2272911.893717436,
116
+ "end_time": 2273712.349220811,
117
+ "total_evaluation_time_seconds": "800.4555033748038"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-19-39.147509.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polemo2_out_multiple_choice": {
4
+ "acc,none": 0.7753036437246964,
5
+ "acc_stderr,none": 0.018797949035330906,
6
+ "acc_norm,none": 0.7854251012145749,
7
+ "acc_norm_stderr,none": 0.01848921134882508,
8
+ "alias": "polemo2_out_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polemo2_out_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polemo2_out_multiple_choice": {
16
+ "task": "polemo2_out_multiple_choice",
17
+ "group": [
18
+ "polemo2_mc"
19
+ ],
20
+ "dataset_path": "allegro/klej-polemo2-out",
21
+ "training_split": "train",
22
+ "validation_split": "validation",
23
+ "test_split": "test",
24
+ "doc_to_text": "Opinia: \"{{sentence}}\"\nOkre艣l sentyment podanej opinii: Neutralny, Negatywny, Pozytywny, Niejednoznaczny.\nSentyment:",
25
+ "doc_to_target": "{{['__label__meta_zero', '__label__meta_minus_m', '__label__meta_plus_m', '__label__meta_amb'].index(target)}}",
26
+ "doc_to_choice": [
27
+ "Neutralny",
28
+ "Negatywny",
29
+ "Pozytywny",
30
+ "Niejednoznaczny"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 5,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": true,
51
+ "doc_to_decontamination_query": "{{sentence}}"
52
+ }
53
+ },
54
+ "versions": {
55
+ "polemo2_out_multiple_choice": "Yaml"
56
+ },
57
+ "n-shot": {
58
+ "polemo2_out_multiple_choice": 5
59
+ },
60
+ "higher_is_better": {
61
+ "polemo2_out_multiple_choice": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "polemo2_out_multiple_choice": {
68
+ "original": 494,
69
+ "effective": 494
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": "cuda:0",
78
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polemo2_out_multiple_choice/",
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "2132286",
88
+ "date": 1723381734.9805498,
89
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4499.90\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
90
+ "transformers_version": "4.43.1",
91
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
92
+ "task_hashes": {
93
+ "polemo2_out_multiple_choice": "45b774f8cfb07b51343dc4aba756739ac8f3ad9410eae31ce9abcab2243c33c6"
94
+ },
95
+ "model_source": "hf",
96
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
97
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
98
+ "system_instruction": null,
99
+ "system_instruction_sha": null,
100
+ "chat_template": null,
101
+ "chat_template_sha": null,
102
+ "start_time": 2270387.730003106,
103
+ "end_time": 2271039.659552081,
104
+ "total_evaluation_time_seconds": "651.9295489750803"
105
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-53-51.017953.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_8tags_multiple_choice": {
4
+ "acc,none": 0.7936870997255261,
5
+ "acc_stderr,none": 0.006120648645628871,
6
+ "acc_norm,none": 0.7881976212259836,
7
+ "acc_norm_stderr,none": 0.0061800583187814695,
8
+ "alias": "polish_8tags_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_8tags_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_8tags_multiple_choice": {
16
+ "task": "polish_8tags_multiple_choice",
17
+ "dataset_path": "sdadas/8tags",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "fewshot_split": "train",
21
+ "doc_to_text": "Tytu艂: \"{{sentence}}\"\nDo podanego tytu艂u przyporz膮dkuj jedn膮 najlepiej pasuj膮c膮 kategori臋 z podanych: Film, Historia, Jedzenie, Medycyna, Motoryzacja, Praca, Sport, Technologie.\nKategoria:",
22
+ "doc_to_target": "{{label|int}}",
23
+ "doc_to_choice": [
24
+ "Film",
25
+ "Historia",
26
+ "Jedzenie",
27
+ "Medycyna",
28
+ "Motoryzacja",
29
+ "Praca",
30
+ "Sport",
31
+ "Technologie"
32
+ ],
33
+ "description": "",
34
+ "target_delimiter": " ",
35
+ "fewshot_delimiter": "\n\n",
36
+ "num_fewshot": 5,
37
+ "metric_list": [
38
+ {
39
+ "metric": "acc",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "acc_norm",
45
+ "aggregation": "mean",
46
+ "higher_is_better": true
47
+ }
48
+ ],
49
+ "output_type": "multiple_choice",
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{sentence}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_8tags_multiple_choice": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_8tags_multiple_choice": 5
60
+ },
61
+ "higher_is_better": {
62
+ "polish_8tags_multiple_choice": {
63
+ "acc": true,
64
+ "acc_norm": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "polish_8tags_multiple_choice": {
69
+ "original": 4372,
70
+ "effective": 4372
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
76
+ "batch_size": "1",
77
+ "batch_sizes": [],
78
+ "device": "cuda:0",
79
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_multiple_choice/",
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "2132286",
89
+ "date": 1723381736.832911,
90
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
91
+ "transformers_version": "4.43.1",
92
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
93
+ "task_hashes": {
94
+ "polish_8tags_multiple_choice": "73f7a912bc6b67622aaf742339f1fd7d8c602e2bba1d366f9084ffdcd115da22"
95
+ },
96
+ "model_source": "hf",
97
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
98
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
99
+ "system_instruction": null,
100
+ "system_instruction_sha": null,
101
+ "chat_template": null,
102
+ "chat_template_sha": null,
103
+ "start_time": 779444.242893573,
104
+ "end_time": 782146.665026425,
105
+ "total_evaluation_time_seconds": "2702.4221328520216"
106
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-01-59.819478.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_8tags_regex": {
4
+ "exact_match,score-first": 0.780192131747484,
5
+ "exact_match_stderr,score-first": 0.006263715115123265,
6
+ "alias": "polish_8tags_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_8tags_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_8tags_regex": {
14
+ "task": "polish_8tags_regex",
15
+ "dataset_path": "sdadas/8tags",
16
+ "training_split": "train",
17
+ "validation_split": "validation",
18
+ "test_split": "test",
19
+ "doc_to_text": "Tytu艂: \"{{sentence}}\"\nPytanie: jaka kategoria najlepiej pasuje do podanego tytu艂u?\nMo偶liwe odpowiedzi:\nA - film\nB - historia\nC - jedzenie\nD - medycyna\nE - motoryzacja\nF - praca\nG - sport\nH - technologie\nPrawid艂owa odpowied藕:",
20
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H'}.get(label)}}",
21
+ "description": "",
22
+ "target_delimiter": " ",
23
+ "fewshot_delimiter": "\n\n",
24
+ "num_fewshot": 5,
25
+ "metric_list": [
26
+ {
27
+ "metric": "exact_match",
28
+ "aggregation": "mean",
29
+ "higher_is_better": true
30
+ }
31
+ ],
32
+ "output_type": "generate_until",
33
+ "generation_kwargs": {
34
+ "until": [
35
+ ".",
36
+ ","
37
+ ],
38
+ "do_sample": false,
39
+ "temperature": 0.0,
40
+ "max_gen_toks": 50
41
+ },
42
+ "repeats": 1,
43
+ "filter_list": [
44
+ {
45
+ "name": "score-first",
46
+ "filter": [
47
+ {
48
+ "function": "regex",
49
+ "regex_pattern": "(\\b[ABCDEFGH]\\b)"
50
+ },
51
+ {
52
+ "function": "take_first"
53
+ }
54
+ ]
55
+ }
56
+ ],
57
+ "should_decontaminate": true,
58
+ "doc_to_decontamination_query": "{{sentence}}"
59
+ }
60
+ },
61
+ "versions": {
62
+ "polish_8tags_regex": "Yaml"
63
+ },
64
+ "n-shot": {
65
+ "polish_8tags_regex": 5
66
+ },
67
+ "higher_is_better": {
68
+ "polish_8tags_regex": {
69
+ "exact_match": true
70
+ }
71
+ },
72
+ "n-samples": {
73
+ "polish_8tags_regex": {
74
+ "original": 4372,
75
+ "effective": 4372
76
+ }
77
+ },
78
+ "config": {
79
+ "model": "hf",
80
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
81
+ "batch_size": "1",
82
+ "batch_sizes": [],
83
+ "device": "cuda:0",
84
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_8tags_regex/",
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": "2132286",
94
+ "date": 1723381748.877596,
95
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
96
+ "transformers_version": "4.43.1",
97
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
98
+ "task_hashes": {
99
+ "polish_8tags_regex": "db46138093af0d6032c98a8689c46f46e11c222dafe4ae0444f5c2f86b97dde9"
100
+ },
101
+ "model_source": "hf",
102
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
103
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
104
+ "system_instruction": null,
105
+ "system_instruction_sha": null,
106
+ "chat_template": null,
107
+ "chat_template_sha": null,
108
+ "start_time": 2272911.894059325,
109
+ "end_time": 2279691.316861562,
110
+ "total_evaluation_time_seconds": "6779.4228022368625"
111
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_mc_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-15-54.278792.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_belebele_mc": {
4
+ "acc,none": 0.8855555555555555,
5
+ "acc_stderr,none": 0.010617576963634284,
6
+ "acc_norm,none": 0.8855555555555555,
7
+ "acc_norm_stderr,none": 0.010617576963634284,
8
+ "alias": "polish_belebele_mc"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_belebele_mc": []
13
+ },
14
+ "configs": {
15
+ "polish_belebele_mc": {
16
+ "task": "polish_belebele_mc",
17
+ "dataset_path": "facebook/belebele",
18
+ "test_split": "pol_Latn",
19
+ "fewshot_split": "pol_Latn",
20
+ "doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMo偶liwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawid艂owa odpowied藕:",
21
+ "doc_to_target": "{{['1', '2', '3', '4'].index(correct_answer_num)}}",
22
+ "doc_to_choice": [
23
+ "A",
24
+ "B",
25
+ "C",
26
+ "D"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "fewshot_config": {
32
+ "sampler": "first_n"
33
+ },
34
+ "num_fewshot": 5,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": true,
50
+ "doc_to_decontamination_query": "{{question}}",
51
+ "metadata": {
52
+ "version": 0.0
53
+ }
54
+ }
55
+ },
56
+ "versions": {
57
+ "polish_belebele_mc": 0.0
58
+ },
59
+ "n-shot": {
60
+ "polish_belebele_mc": 5
61
+ },
62
+ "higher_is_better": {
63
+ "polish_belebele_mc": {
64
+ "acc": true,
65
+ "acc_norm": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_belebele_mc": {
70
+ "original": 900,
71
+ "effective": 900
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_mc/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381736.8325982,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_belebele_mc": "3617d71c141947146b1331680272d92dc45753002d91f496be692e189d2c3338"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 779444.243477263,
105
+ "end_time": 779869.928050127,
106
+ "total_evaluation_time_seconds": "425.6845728639746"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-36-23.654679.json ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_belebele_regex": {
4
+ "exact_match,score-first": 0.8888888888888888,
5
+ "exact_match_stderr,score-first": 0.010481480680812841,
6
+ "alias": "polish_belebele_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_belebele_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_belebele_regex": {
14
+ "task": "polish_belebele_regex",
15
+ "dataset_path": "facebook/belebele",
16
+ "test_split": "pol_Latn",
17
+ "doc_to_text": "Fragment: \"{{flores_passage}}\"\nPytanie: \"{{question}}\"\nMo偶liwe odpowiedzi:\nA - {{mc_answer1}}\nB - {{mc_answer2}}\nC - {{mc_answer3}}\nD - {{mc_answer4}}\nPrawid艂owa odpowied藕:",
18
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(correct_answer_num|int - 1)}}",
19
+ "description": "",
20
+ "target_delimiter": " ",
21
+ "fewshot_delimiter": "\n\n",
22
+ "num_fewshot": 5,
23
+ "metric_list": [
24
+ {
25
+ "metric": "exact_match",
26
+ "aggregation": "mean",
27
+ "higher_is_better": true
28
+ }
29
+ ],
30
+ "output_type": "generate_until",
31
+ "generation_kwargs": {
32
+ "until": [
33
+ ".",
34
+ ","
35
+ ],
36
+ "do_sample": false,
37
+ "temperature": 0.0,
38
+ "max_gen_toks": 50
39
+ },
40
+ "repeats": 1,
41
+ "filter_list": [
42
+ {
43
+ "name": "score-first",
44
+ "filter": [
45
+ {
46
+ "function": "regex",
47
+ "regex_pattern": "(\\b[ABCD]\\b)"
48
+ },
49
+ {
50
+ "function": "take_first"
51
+ }
52
+ ]
53
+ }
54
+ ],
55
+ "should_decontaminate": true,
56
+ "doc_to_decontamination_query": "{{flores_passage}} {{question}} {{mc_answer1}} {{mc_answer2}} {{mc_answer3}} {{mc_answer4}}"
57
+ }
58
+ },
59
+ "versions": {
60
+ "polish_belebele_regex": "Yaml"
61
+ },
62
+ "n-shot": {
63
+ "polish_belebele_regex": 5
64
+ },
65
+ "higher_is_better": {
66
+ "polish_belebele_regex": {
67
+ "exact_match": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "polish_belebele_regex": {
72
+ "original": 900,
73
+ "effective": 900
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "hf",
78
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
79
+ "batch_size": "1",
80
+ "batch_sizes": [],
81
+ "device": "cuda:0",
82
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_belebele_regex/",
83
+ "limit": null,
84
+ "bootstrap_iters": 100000,
85
+ "gen_kwargs": null,
86
+ "random_seed": 0,
87
+ "numpy_seed": 1234,
88
+ "torch_seed": 1234,
89
+ "fewshot_seed": 1234
90
+ },
91
+ "git_hash": "2132286",
92
+ "date": 1723381748.8774083,
93
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
94
+ "transformers_version": "4.43.1",
95
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
96
+ "task_hashes": {
97
+ "polish_belebele_regex": "f24c47726a598a1d1eea361393c09e061f3bbf93fc16ed74e92c70bd969e71f2"
98
+ },
99
+ "model_source": "hf",
100
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
101
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
102
+ "system_instruction": null,
103
+ "system_instruction_sha": null,
104
+ "chat_template": null,
105
+ "chat_template_sha": null,
106
+ "start_time": 2272911.893726246,
107
+ "end_time": 2274555.15416838,
108
+ "total_evaluation_time_seconds": "1643.260442133993"
109
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-21-52.993123.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_cbd_multiple_choice": {
4
+ "acc,none": 0.74,
5
+ "acc_stderr,none": 0.013877773329774166,
6
+ "f1,none": 0.3516898467962298,
7
+ "f1_stderr,none": "N/A",
8
+ "acc_norm,none": 0.747,
9
+ "acc_norm_stderr,none": 0.01375427861358708,
10
+ "alias": "polish_cbd_multiple_choice"
11
+ }
12
+ },
13
+ "group_subtasks": {
14
+ "polish_cbd_multiple_choice": []
15
+ },
16
+ "configs": {
17
+ "polish_cbd_multiple_choice": {
18
+ "task": "polish_cbd_multiple_choice",
19
+ "dataset_path": "ptaszynski/PolishCyberbullyingDataset",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "doc_to_text": "Wypowied藕: \"{{TEXT}}\"\nDo podanej wypowiedzi przyporz膮dkuj jedn膮, najlepiej pasuj膮c膮 kategori臋 z podanych: nieszkodliwa, szyderstwo, obelga, insynuacja, gro藕ba, molestowanie.\nKategoria:",
23
+ "doc_to_target": "{{{'szyderstwo': 1, 'obelga': 2, 'insynuacja': 3, 'grozba': 4, 'molestowanie': 5}.get(CATEGORIES, 0)}}",
24
+ "doc_to_choice": [
25
+ "nieszkodliwa",
26
+ "szyderstwo",
27
+ "obelga",
28
+ "insynuacja",
29
+ "gro藕ba",
30
+ "molestowanie"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 5,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ },
47
+ {
48
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
49
+ "aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
50
+ "higher_is_better": true
51
+ }
52
+ ],
53
+ "output_type": "multiple_choice",
54
+ "repeats": 1,
55
+ "should_decontaminate": true,
56
+ "doc_to_decontamination_query": "{{TEXT}}"
57
+ }
58
+ },
59
+ "versions": {
60
+ "polish_cbd_multiple_choice": "Yaml"
61
+ },
62
+ "n-shot": {
63
+ "polish_cbd_multiple_choice": 5
64
+ },
65
+ "higher_is_better": {
66
+ "polish_cbd_multiple_choice": {
67
+ "acc": true,
68
+ "acc_norm": true,
69
+ "f1": true
70
+ }
71
+ },
72
+ "n-samples": {
73
+ "polish_cbd_multiple_choice": {
74
+ "original": 1000,
75
+ "effective": 1000
76
+ }
77
+ },
78
+ "config": {
79
+ "model": "hf",
80
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
81
+ "batch_size": "1",
82
+ "batch_sizes": [],
83
+ "device": "cuda:0",
84
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_multiple_choice/",
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": "2132286",
94
+ "date": 1723381736.832983,
95
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
96
+ "transformers_version": "4.43.1",
97
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
98
+ "task_hashes": {
99
+ "polish_cbd_multiple_choice": "7f04a198edb8f2a8d7c7854adaca6f42c6ab2547d80482066cd86becf9e6cd6c"
100
+ },
101
+ "model_source": "hf",
102
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
103
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
104
+ "system_instruction": null,
105
+ "system_instruction_sha": null,
106
+ "chat_template": null,
107
+ "chat_template_sha": null,
108
+ "start_time": 779444.243050853,
109
+ "end_time": 780228.642777935,
110
+ "total_evaluation_time_seconds": "784.3997270819964"
111
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-33-25.750066.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_cbd_regex": {
4
+ "exact_match,score-first": 0.75,
5
+ "exact_match_stderr,score-first": 0.013699915608779773,
6
+ "f1,score-first": 0.3634343551926929,
7
+ "f1_stderr,score-first": "N/A",
8
+ "alias": "polish_cbd_regex"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_cbd_regex": []
13
+ },
14
+ "configs": {
15
+ "polish_cbd_regex": {
16
+ "task": "polish_cbd_regex",
17
+ "dataset_path": "ptaszynski/PolishCyberbullyingDataset",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "doc_to_text": "Wypowied藕: \"{{TEXT}}\"\nPytanie: Jaka kategoria najlepiej pasuje do podanej wypowiedzi?\nMo偶liwe odpowiedzi:\nA - nieszkodliwa\nB - szyderstwo\nC - obelga\nD - insynuacja\nE - gro藕ba\nF - molestowanie\nPrawid艂owa odpowied藕:",
21
+ "doc_to_target": "{{{'szyderstwo': 'B', 'obelga': 'C', 'insynuacja': 'D', 'grozba': 'E', 'molestowanie': 'F'}.get(CATEGORIES, 'A')}}",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
34
+ "aggregation": "def agg_f1_macro(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions, average='macro')\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ ".",
42
+ ",",
43
+ ";"
44
+ ],
45
+ "do_sample": false,
46
+ "temperature": 0.0,
47
+ "max_gen_toks": 50
48
+ },
49
+ "repeats": 1,
50
+ "filter_list": [
51
+ {
52
+ "name": "score-first",
53
+ "filter": [
54
+ {
55
+ "function": "regex",
56
+ "regex_pattern": "(\\b[ABCDEF]\\b)"
57
+ },
58
+ {
59
+ "function": "take_first"
60
+ }
61
+ ]
62
+ }
63
+ ],
64
+ "should_decontaminate": true,
65
+ "doc_to_decontamination_query": "{{TEXT}}"
66
+ }
67
+ },
68
+ "versions": {
69
+ "polish_cbd_regex": "Yaml"
70
+ },
71
+ "n-shot": {
72
+ "polish_cbd_regex": 5
73
+ },
74
+ "higher_is_better": {
75
+ "polish_cbd_regex": {
76
+ "exact_match": true,
77
+ "f1": true
78
+ }
79
+ },
80
+ "n-samples": {
81
+ "polish_cbd_regex": {
82
+ "original": 1000,
83
+ "effective": 1000
84
+ }
85
+ },
86
+ "config": {
87
+ "model": "hf",
88
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
89
+ "batch_size": "1",
90
+ "batch_sizes": [],
91
+ "device": "cuda:0",
92
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_cbd_regex/",
93
+ "limit": null,
94
+ "bootstrap_iters": 100000,
95
+ "gen_kwargs": null,
96
+ "random_seed": 0,
97
+ "numpy_seed": 1234,
98
+ "torch_seed": 1234,
99
+ "fewshot_seed": 1234
100
+ },
101
+ "git_hash": "2132286",
102
+ "date": 1723381748.6048338,
103
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
104
+ "transformers_version": "4.43.1",
105
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
106
+ "task_hashes": {
107
+ "polish_cbd_regex": "71dc0083f6f8b533188cbedcb2ea9d61ba63ef8ff3f6bb1c08f1844c9335ddf4"
108
+ },
109
+ "model_source": "hf",
110
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
111
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
112
+ "system_instruction": null,
113
+ "system_instruction_sha": null,
114
+ "chat_template": null,
115
+ "chat_template_sha": null,
116
+ "start_time": 780407.632129005,
117
+ "end_time": 781873.251077335,
118
+ "total_evaluation_time_seconds": "1465.6189483299386"
119
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-14-08.007826.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_dyk_multiple_choice": {
4
+ "acc,none": 0.8794946550048591,
5
+ "acc_stderr,none": 0.010153673638096375,
6
+ "f1,none": 0.7004830917874396,
7
+ "f1_stderr,none": "N/A",
8
+ "acc_norm,none": 0.8794946550048591,
9
+ "acc_norm_stderr,none": 0.010153673638096375,
10
+ "alias": "polish_dyk_multiple_choice"
11
+ }
12
+ },
13
+ "group_subtasks": {
14
+ "polish_dyk_multiple_choice": []
15
+ },
16
+ "configs": {
17
+ "polish_dyk_multiple_choice": {
18
+ "task": "polish_dyk_multiple_choice",
19
+ "dataset_path": "allegro/klej-dyk",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowied藕: \"{{answer}}\"\nPytanie: Czy sugerowana odpowied藕 na zadane pytanie jest poprawna?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
23
+ "doc_to_target": "{{target|int}}",
24
+ "doc_to_choice": [
25
+ "Nie",
26
+ "Tak"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 5,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
45
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
46
+ "higher_is_better": true
47
+ }
48
+ ],
49
+ "output_type": "multiple_choice",
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{question}} {{answer}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_dyk_multiple_choice": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_dyk_multiple_choice": 5
60
+ },
61
+ "higher_is_better": {
62
+ "polish_dyk_multiple_choice": {
63
+ "acc": true,
64
+ "acc_norm": true,
65
+ "f1": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_dyk_multiple_choice": {
70
+ "original": 1029,
71
+ "effective": 1029
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_multiple_choice/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381736.8329315,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_dyk_multiple_choice": "90a835c3521affda43e1b7e595ec145d189a8781186b0d67f0a20cbb60069d75"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 779444.242757443,
105
+ "end_time": 779763.657474826,
106
+ "total_evaluation_time_seconds": "319.41471738298424"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-54-27.674557.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_dyk_regex": {
4
+ "exact_match,score-first": 0.8785228377065112,
5
+ "exact_match_stderr,score-first": 0.010188899761066529,
6
+ "f1,score-first": 0.7126436781609196,
7
+ "f1_stderr,score-first": "N/A",
8
+ "alias": "polish_dyk_regex"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_dyk_regex": []
13
+ },
14
+ "configs": {
15
+ "polish_dyk_regex": {
16
+ "task": "polish_dyk_regex",
17
+ "dataset_path": "allegro/klej-dyk",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "doc_to_text": "Pytanie: \"{{question}}\"\nSugerowana odpowied藕: \"{{answer}}\"\nCzy sugerowana odpowied藕 na zadane pytanie jest poprawna? Mo偶liwe opcje:\nA - brakuje sugerowanej odpowiedzi\nB - nie, sugerowana odpowied藕 nie jest poprawna\nC - tak, sugerowana odpowied藕 jest poprawna\nD - brakuje pytania\nPrawid艂owa opcja:",
21
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(target|int + 1)}}",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
34
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ ".",
42
+ ","
43
+ ],
44
+ "do_sample": false,
45
+ "temperature": 0.0,
46
+ "max_gen_toks": 50
47
+ },
48
+ "repeats": 1,
49
+ "filter_list": [
50
+ {
51
+ "name": "score-first",
52
+ "filter": [
53
+ {
54
+ "function": "regex",
55
+ "regex_pattern": "(\\b[ABCD]\\b)"
56
+ },
57
+ {
58
+ "function": "take_first"
59
+ }
60
+ ]
61
+ }
62
+ ],
63
+ "should_decontaminate": true,
64
+ "doc_to_decontamination_query": "{{question}} {{answer}}"
65
+ }
66
+ },
67
+ "versions": {
68
+ "polish_dyk_regex": "Yaml"
69
+ },
70
+ "n-shot": {
71
+ "polish_dyk_regex": 5
72
+ },
73
+ "higher_is_better": {
74
+ "polish_dyk_regex": {
75
+ "exact_match": true,
76
+ "f1": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polish_dyk_regex": {
81
+ "original": 1029,
82
+ "effective": 1029
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_dyk_regex/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.877224,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polish_dyk_regex": "ff511210f55c111bbc6d0c4cd80c3d7b334eaf5227fb2ed749d0a0530e518b27"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 2272911.894157125,
116
+ "end_time": 2275639.174286722,
117
+ "total_evaluation_time_seconds": "2727.2801295970567"
118
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_eq_bench_first_turn_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-22.563512.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_eq_bench_first_turn": {
4
+ "first_eqbench,none": 70.08076901067246,
5
+ "first_eqbench_stderr,none": 2.1051510636673663,
6
+ "first_percent_parseable,none": 100.0,
7
+ "first_percent_parseable_stderr,none": 0.0,
8
+ "alias": "polish_eq_bench_first_turn"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_eq_bench_first_turn": []
13
+ },
14
+ "configs": {
15
+ "polish_eq_bench_first_turn": {
16
+ "task": "polish_eq_bench_first_turn",
17
+ "dataset_path": "speakleash/EQ-Bench-PL-first-turn",
18
+ "validation_split": "validation",
19
+ "doc_to_text": "{{prompt}}\nOceny:\n",
20
+ "doc_to_target": "def doc_to_target(doc):\n reference = eval(doc[\"reference_answer\"])\n\n target = \"\"\n for i in range(1, 5):\n emotion = reference[f\"emotion{i}\"]\n emotion_score = reference[f\"emotion{i}_score\"]\n target += f\"{emotion}: {emotion_score}\\n\"\n target += \"\\n\"\n\n return target\n",
21
+ "process_results": "def score_first(docs, results):\n first_pass_answers = dict(list(re.findall(r'(\\w+(?: \\w+)*):\\s+(\\d+)', results[0]))[:4])\n reference = eval(docs[\"reference_answer\"])\n first_pass_score = calculate_score(reference, first_pass_answers)\n scores= {'first_'+k: v for k, v in first_pass_score.items()}\n return scores\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "first_eqbench",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "first_percent_parseable",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "max_gen_toks": 512,
41
+ "do_sample": false,
42
+ "temperature": 0.0,
43
+ "until": [
44
+ "</s>",
45
+ "[Koniec odpowiedzi]",
46
+ "Masz za zadanie"
47
+ ]
48
+ },
49
+ "repeats": 1,
50
+ "should_decontaminate": false,
51
+ "metadata": {
52
+ "version": 2.4
53
+ }
54
+ }
55
+ },
56
+ "versions": {
57
+ "polish_eq_bench_first_turn": 2.4
58
+ },
59
+ "n-shot": {
60
+ "polish_eq_bench_first_turn": 5
61
+ },
62
+ "higher_is_better": {
63
+ "polish_eq_bench_first_turn": {
64
+ "first_eqbench": true,
65
+ "first_percent_parseable": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_eq_bench_first_turn": {
70
+ "original": 171,
71
+ "effective": 171
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_eq_bench_first_turn/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381748.6045775,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_eq_bench_first_turn": "80a40657adcfe9c62884d65078de0204ecd846ef1614217065f11a87cbb0ad87"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 780407.631921335,
105
+ "end_time": 780970.064839895,
106
+ "total_evaluation_time_seconds": "562.4329185599927"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-35-52.497622.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_klej_ner_multiple_choice": {
4
+ "acc,none": 0.5383867832847424,
5
+ "acc_stderr,none": 0.010991808831354909,
6
+ "acc_norm,none": 0.5291545189504373,
7
+ "acc_norm_stderr,none": 0.011005589555788344,
8
+ "alias": "polish_klej_ner_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_klej_ner_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_klej_ner_multiple_choice": {
16
+ "task": "polish_klej_ner_multiple_choice",
17
+ "dataset_path": "allegro/klej-nkjp-ner",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "fewshot_split": "train",
22
+ "doc_to_text": "Zdanie: \"{{sentence}}\"\nJakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi: Brak nazwanej jednostki, Nazwa miejsca, Nazwa osoby, Nazwa organizacji, Czas, Nazwa geograficzna.\nRodzaj:",
23
+ "doc_to_target": "{{{'noEntity': 0, 'placeName': 1, 'persName': 2, 'orgName': 3, 'time': 4, 'geogName': 5}.get(target)}}",
24
+ "doc_to_choice": [
25
+ "Brak nazwanej jednostki",
26
+ "Nazwa miejsca",
27
+ "Nazwa osoby",
28
+ "Nazwa organizacji",
29
+ "Czas",
30
+ "Nazwa geograficzna"
31
+ ],
32
+ "description": "",
33
+ "target_delimiter": " ",
34
+ "fewshot_delimiter": "\n\n",
35
+ "num_fewshot": 5,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "acc_norm",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ }
47
+ ],
48
+ "output_type": "multiple_choice",
49
+ "repeats": 1,
50
+ "should_decontaminate": true,
51
+ "doc_to_decontamination_query": "{{sentence}}"
52
+ }
53
+ },
54
+ "versions": {
55
+ "polish_klej_ner_multiple_choice": "Yaml"
56
+ },
57
+ "n-shot": {
58
+ "polish_klej_ner_multiple_choice": 5
59
+ },
60
+ "higher_is_better": {
61
+ "polish_klej_ner_multiple_choice": {
62
+ "acc": true,
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "polish_klej_ner_multiple_choice": {
68
+ "original": 2058,
69
+ "effective": 2058
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "hf",
74
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": "cuda:0",
78
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_multiple_choice/",
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "2132286",
88
+ "date": 1723381748.8774905,
89
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
90
+ "transformers_version": "4.43.1",
91
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
92
+ "task_hashes": {
93
+ "polish_klej_ner_multiple_choice": "382e085067293307f61df6d4b8dde438e9a35b2296d59d664ba9e1861a8fb319"
94
+ },
95
+ "model_source": "hf",
96
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
97
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
98
+ "system_instruction": null,
99
+ "system_instruction_sha": null,
100
+ "chat_template": null,
101
+ "chat_template_sha": null,
102
+ "start_time": 2272911.893820256,
103
+ "end_time": 2274523.996078617,
104
+ "total_evaluation_time_seconds": "1612.1022583609447"
105
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-57-18.271570.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_klej_ner_regex": {
4
+ "exact_match,score-first": 0.5515063168124392,
5
+ "exact_match_stderr,score-first": 0.010965697594667088,
6
+ "alias": "polish_klej_ner_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_klej_ner_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_klej_ner_regex": {
14
+ "task": "polish_klej_ner_regex",
15
+ "dataset_path": "allegro/klej-nkjp-ner",
16
+ "training_split": "train",
17
+ "validation_split": "validation",
18
+ "test_split": "test",
19
+ "doc_to_text": "Zdanie: \"{{sentence}}\"\nPytanie: Jakiego rodzaju jest nazwana jednostka, je偶eli wyst臋puje w podanym zdaniu?\nMo偶liwe odpowiedzi:\nA - Brak nazwanej jednostki\nB - Nazwa miejsca\nC - Nazwa osoby\nD - Nazwa organizacji\nE - Czas\nF - Nazwa geograficzna\nPrawid艂owa odpowied藕:",
20
+ "doc_to_target": "{{{'noEntity': 'A', 'placeName': 'B', 'persName': 'C', 'orgName': 'D', 'time': 'E', 'geogName': 'F'}.get(target)}}",
21
+ "description": "",
22
+ "target_delimiter": " ",
23
+ "fewshot_delimiter": "\n\n",
24
+ "num_fewshot": 5,
25
+ "metric_list": [
26
+ {
27
+ "metric": "exact_match",
28
+ "aggregation": "mean",
29
+ "higher_is_better": true
30
+ }
31
+ ],
32
+ "output_type": "generate_until",
33
+ "generation_kwargs": {
34
+ "until": [
35
+ ".",
36
+ ",",
37
+ ";"
38
+ ],
39
+ "do_sample": false,
40
+ "temperature": 0.0,
41
+ "max_gen_toks": 50
42
+ },
43
+ "repeats": 1,
44
+ "filter_list": [
45
+ {
46
+ "name": "score-first",
47
+ "filter": [
48
+ {
49
+ "function": "regex",
50
+ "regex_pattern": "(\\b[ABCDEF]\\b)"
51
+ },
52
+ {
53
+ "function": "take_first"
54
+ }
55
+ ]
56
+ }
57
+ ],
58
+ "should_decontaminate": true,
59
+ "doc_to_decontamination_query": "{{sentence}}"
60
+ }
61
+ },
62
+ "versions": {
63
+ "polish_klej_ner_regex": "Yaml"
64
+ },
65
+ "n-shot": {
66
+ "polish_klej_ner_regex": 5
67
+ },
68
+ "higher_is_better": {
69
+ "polish_klej_ner_regex": {
70
+ "exact_match": true
71
+ }
72
+ },
73
+ "n-samples": {
74
+ "polish_klej_ner_regex": {
75
+ "original": 2058,
76
+ "effective": 2058
77
+ }
78
+ },
79
+ "config": {
80
+ "model": "hf",
81
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
82
+ "batch_size": "1",
83
+ "batch_sizes": [],
84
+ "device": "cuda:0",
85
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_klej_ner_regex/",
86
+ "limit": null,
87
+ "bootstrap_iters": 100000,
88
+ "gen_kwargs": null,
89
+ "random_seed": 0,
90
+ "numpy_seed": 1234,
91
+ "torch_seed": 1234,
92
+ "fewshot_seed": 1234
93
+ },
94
+ "git_hash": "2132286",
95
+ "date": 1723381748.60427,
96
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
97
+ "transformers_version": "4.43.1",
98
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
99
+ "task_hashes": {
100
+ "polish_klej_ner_regex": "ab6f4267720bbc662460a7390651b5fc2a339d12301ae3ba0cba80f4ffe4fe5f"
101
+ },
102
+ "model_source": "hf",
103
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
104
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
105
+ "system_instruction": null,
106
+ "system_instruction_sha": null,
107
+ "chat_template": null,
108
+ "chat_template_sha": null,
109
+ "start_time": 780407.631669254,
110
+ "end_time": 783305.771917303,
111
+ "total_evaluation_time_seconds": "2898.140248048934"
112
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_pes_1723381722/results_2024-08-27T17-50-52.063138.json ADDED
The diff for this file is too large to render. See raw diff
 
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_closed_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-16-23.588300.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_polqa_closed_book": {
4
+ "exact_match,none": 0.7144340602284528,
5
+ "exact_match_stderr,none": 0.014562862295117392,
6
+ "levenshtein,none": 0.8328141225337488,
7
+ "levenshtein_stderr,none": "N/A",
8
+ "alias": "polish_polqa_closed_book"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_polqa_closed_book": []
13
+ },
14
+ "configs": {
15
+ "polish_polqa_closed_book": {
16
+ "task": "polish_polqa_closed_book",
17
+ "dataset_path": "ipipan/polqa",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs_closed(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and example['question'] not in used and (used.add(example['question']) or True)).map(_helper)\n",
22
+ "doc_to_text": "Pytanie: {{question}} \n Prawidłowa odpowiedź:",
23
+ "doc_to_target": "answers",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 5,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true
33
+ },
34
+ {
35
+ "metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
36
+ "aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
37
+ "higher_is_better": true
38
+ }
39
+ ],
40
+ "output_type": "generate_until",
41
+ "generation_kwargs": {
42
+ "until": [
43
+ "\n",
44
+ "</s>"
45
+ ],
46
+ "do_sample": false,
47
+ "temperature": 0.0,
48
+ "max_gen_toks": 50
49
+ },
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{question}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_polqa_closed_book": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_polqa_closed_book": 5
60
+ },
61
+ "higher_is_better": {
62
+ "polish_polqa_closed_book": {
63
+ "exact_match": true,
64
+ "levenshtein": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "polish_polqa_closed_book": {
69
+ "original": 963,
70
+ "effective": 963
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
76
+ "batch_size": "1",
77
+ "batch_sizes": [],
78
+ "device": "cuda:0",
79
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_closed_book/",
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "2132286",
89
+ "date": 1723381748.604107,
90
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
91
+ "transformers_version": "4.43.1",
92
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
93
+ "task_hashes": {
94
+ "polish_polqa_closed_book": "87d8cfbe97dc8a4ad77df54784eea533389ce029734e03de52acd682a4293a8e"
95
+ },
96
+ "model_source": "hf",
97
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
98
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
99
+ "system_instruction": null,
100
+ "system_instruction_sha": null,
101
+ "chat_template": null,
102
+ "chat_template_sha": null,
103
+ "start_time": 780407.631446274,
104
+ "end_time": 780851.088669831,
105
+ "total_evaluation_time_seconds": "443.4572235570522"
106
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-00.491423.json ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_polqa_open_book": {
4
+ "exact_match,none": 0.803306342780027,
5
+ "exact_match_stderr,none": 0.005163192439920857,
6
+ "levenshtein,none": 0.9239203778677463,
7
+ "levenshtein_stderr,none": "N/A",
8
+ "alias": "polish_polqa_open_book"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_polqa_open_book": []
13
+ },
14
+ "configs": {
15
+ "polish_polqa_open_book": {
16
+ "task": "polish_polqa_open_book",
17
+ "dataset_path": "ipipan/polqa",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs_open(dataset: datasets.Dataset):\n def _helper(doc):\n doc[\"answers\"] = ast.literal_eval(doc['answers'])\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: example[\"relevant\"] and (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
22
+ "doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Prawidłowa odpowiedź:",
23
+ "doc_to_target": "answers",
24
+ "description": "",
25
+ "target_delimiter": " ",
26
+ "fewshot_delimiter": "\n\n",
27
+ "num_fewshot": 5,
28
+ "metric_list": [
29
+ {
30
+ "metric": "exact_match",
31
+ "aggregation": "mean",
32
+ "higher_is_better": true
33
+ },
34
+ {
35
+ "metric": "def levenshtein(predictions, references):\n _prediction = predictions[0][0].lower()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('\\.? ?(</s>)* ?$','',_prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower())\n if ld<len(reference)/2:\n return 1\n return 0\n",
36
+ "aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
37
+ "higher_is_better": true
38
+ }
39
+ ],
40
+ "output_type": "generate_until",
41
+ "generation_kwargs": {
42
+ "until": [
43
+ "\n",
44
+ "</s>"
45
+ ],
46
+ "do_sample": false,
47
+ "temperature": 0.0,
48
+ "max_gen_toks": 50
49
+ },
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{passage_text}} {{question}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_polqa_open_book": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_polqa_open_book": 5
60
+ },
61
+ "higher_is_better": {
62
+ "polish_polqa_open_book": {
63
+ "exact_match": true,
64
+ "levenshtein": true
65
+ }
66
+ },
67
+ "n-samples": {
68
+ "polish_polqa_open_book": {
69
+ "original": 5928,
70
+ "effective": 5928
71
+ }
72
+ },
73
+ "config": {
74
+ "model": "hf",
75
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
76
+ "batch_size": "1",
77
+ "batch_sizes": [],
78
+ "device": "cuda:0",
79
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_open_book/",
80
+ "limit": null,
81
+ "bootstrap_iters": 100000,
82
+ "gen_kwargs": null,
83
+ "random_seed": 0,
84
+ "numpy_seed": 1234,
85
+ "torch_seed": 1234,
86
+ "fewshot_seed": 1234
87
+ },
88
+ "git_hash": "2132286",
89
+ "date": 1723381748.6044483,
90
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
91
+ "transformers_version": "4.43.1",
92
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
93
+ "task_hashes": {
94
+ "polish_polqa_open_book": "1b7dbda5fd3d68d2b8f1d9ca3aecb84324d7c15639dca3a82f584f73f81e734f"
95
+ },
96
+ "model_source": "hf",
97
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
98
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
99
+ "system_instruction": null,
100
+ "system_instruction_sha": null,
101
+ "chat_template": null,
102
+ "chat_template_sha": null,
103
+ "start_time": 780407.632126545,
104
+ "end_time": 782687.990413396,
105
+ "total_evaluation_time_seconds": "2280.3582868510857"
106
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_reranking_multiple_choice_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-50-02.859037.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_polqa_reranking_multiple_choice": {
4
+ "acc,none": 0.8563222912896372,
5
+ "acc_stderr,none": 0.0031115351999876245,
6
+ "acc_norm,none": 0.8563222912896372,
7
+ "acc_norm_stderr,none": 0.0031115351999876245,
8
+ "alias": "polish_polqa_reranking_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_polqa_reranking_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_polqa_reranking_multiple_choice": {
16
+ "task": "polish_polqa_reranking_multiple_choice",
17
+ "dataset_path": "ipipan/polqa",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "process_docs": "def process_docs(dataset: datasets.Dataset):\n def _helper(doc):\n return doc\n\n used = set()\n\n return dataset.remove_columns(COLUMNS_TO_REMOVE).filter(lambda example: (example['passage_text'],example['question']) not in used and (used.add((example['passage_text'],example['question'])) or True)).map(_helper)\n",
22
+ "doc_to_text": "Kontekst: {{passage_text}} \n Pytanie: {{question}} \n Czy kontekst jest relewantny dla pytania? \n Odpowiedz krótko \"Tak\" lub \"Nie\". Prawidłowa odpowiedź:",
23
+ "doc_to_target": "{{relevant|int}}",
24
+ "doc_to_choice": [
25
+ "Nie",
26
+ "Tak"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 5,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "{{passage_text}} {{question}}"
48
+ }
49
+ },
50
+ "versions": {
51
+ "polish_polqa_reranking_multiple_choice": "Yaml"
52
+ },
53
+ "n-shot": {
54
+ "polish_polqa_reranking_multiple_choice": 5
55
+ },
56
+ "higher_is_better": {
57
+ "polish_polqa_reranking_multiple_choice": {
58
+ "acc": true,
59
+ "acc_norm": true
60
+ }
61
+ },
62
+ "n-samples": {
63
+ "polish_polqa_reranking_multiple_choice": {
64
+ "original": 12709,
65
+ "effective": 12709
66
+ }
67
+ },
68
+ "config": {
69
+ "model": "hf",
70
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
71
+ "batch_size": "1",
72
+ "batch_sizes": [],
73
+ "device": "cuda:0",
74
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_polqa_reranking_multiple_choice/",
75
+ "limit": null,
76
+ "bootstrap_iters": 100000,
77
+ "gen_kwargs": null,
78
+ "random_seed": 0,
79
+ "numpy_seed": 1234,
80
+ "torch_seed": 1234,
81
+ "fewshot_seed": 1234
82
+ },
83
+ "git_hash": "2132286",
84
+ "date": 1723381748.8773608,
85
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
86
+ "transformers_version": "4.43.1",
87
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
88
+ "task_hashes": {
89
+ "polish_polqa_reranking_multiple_choice": "81b0a5c9f7c49792c084d2efb013d9475b0a80d66176de68f5f7c09c2464494a"
90
+ },
91
+ "model_source": "hf",
92
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
93
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
94
+ "system_instruction": null,
95
+ "system_instruction_sha": null,
96
+ "chat_template": null,
97
+ "chat_template_sha": null,
98
+ "start_time": 2272911.894071405,
99
+ "end_time": 2275374.35065494,
100
+ "total_evaluation_time_seconds": "2462.4565835352987"
101
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_poquad_open_book_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T17-09-42.653951.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_poquad_open_book": {
4
+ "exact_match,none": 0.37682165163081194,
5
+ "exact_match_stderr,none": 0.0063833666826593255,
6
+ "levenshtein,none": 0.6878903539208883,
7
+ "levenshtein_stderr,none": "N/A",
8
+ "alias": "polish_poquad_open_book"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_poquad_open_book": []
13
+ },
14
+ "configs": {
15
+ "polish_poquad_open_book": {
16
+ "task": "polish_poquad_open_book",
17
+ "dataset_path": "clarin-pl/poquad",
18
+ "training_split": "train",
19
+ "test_split": "validation",
20
+ "doc_to_text": "Tytuł: {{title}} \n Kontekst: {{context}} \n Pytanie: {{question}} \n Prawidłowa odpowiedź (krótki cytat z Kontekstu):",
21
+ "doc_to_target": "def doc_to_target(doc):\n answer_list = doc[\"answers\"][\"text\"]\n if len(answer_list) > 0:\n answer = answer_list[0]\n else:\n answer = \"bez odpowiedzi\"\n return \" \" + answer\n",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def levenshtein(predictions, references):\n _prediction = predictions[0].lower().lstrip()\n prediction_number = get_number(_prediction)\n\n _prediction = re.sub('.? ?(</s>)* ?$', '', _prediction)\n\n for reference in references:\n reference_number = get_number(reference)\n\n if reference_number is not None:\n if reference_number == prediction_number:\n return 1\n else:\n ld = distance(_prediction, reference.lower().lstrip())\n if ld < len(reference)/2:\n return 1\n return 0\n",
34
+ "aggregation": "def agg_levenshtein(items):\n return sum(items)/len(items)\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ "\n",
42
+ "</s>"
43
+ ],
44
+ "do_sample": false,
45
+ "temperature": 0.0,
46
+ "max_gen_toks": 50
47
+ },
48
+ "repeats": 1,
49
+ "should_decontaminate": true,
50
+ "doc_to_decontamination_query": "{{context}} {{question}}"
51
+ }
52
+ },
53
+ "versions": {
54
+ "polish_poquad_open_book": "Yaml"
55
+ },
56
+ "n-shot": {
57
+ "polish_poquad_open_book": 5
58
+ },
59
+ "higher_is_better": {
60
+ "polish_poquad_open_book": {
61
+ "exact_match": true,
62
+ "levenshtein": true
63
+ }
64
+ },
65
+ "n-samples": {
66
+ "polish_poquad_open_book": {
67
+ "original": 5764,
68
+ "effective": 5764
69
+ }
70
+ },
71
+ "config": {
72
+ "model": "hf",
73
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
74
+ "batch_size": "1",
75
+ "batch_sizes": [],
76
+ "device": "cuda:0",
77
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_poquad_open_book/",
78
+ "limit": null,
79
+ "bootstrap_iters": 100000,
80
+ "gen_kwargs": null,
81
+ "random_seed": 0,
82
+ "numpy_seed": 1234,
83
+ "torch_seed": 1234,
84
+ "fewshot_seed": 1234
85
+ },
86
+ "git_hash": "2132286",
87
+ "date": 1723381748.6042013,
88
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
89
+ "transformers_version": "4.43.1",
90
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
91
+ "task_hashes": {
92
+ "polish_poquad_open_book": "4052fd29bcd59435f258c0169cde1f29c3f22a618395f32cebf32e166bd3bf38"
93
+ },
94
+ "model_source": "hf",
95
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
96
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
97
+ "system_instruction": null,
98
+ "system_instruction_sha": null,
99
+ "chat_template": null,
100
+ "chat_template_sha": null,
101
+ "start_time": 780407.631651954,
102
+ "end_time": 787650.152180919,
103
+ "total_evaluation_time_seconds": "7242.520528964931"
104
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-13-23.877449.json ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_ppc_multiple_choice": {
4
+ "acc,none": 0.789,
5
+ "acc_stderr,none": 0.012909130321042095,
6
+ "acc_norm,none": 0.789,
7
+ "acc_norm_stderr,none": 0.012909130321042095,
8
+ "alias": "polish_ppc_multiple_choice"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_ppc_multiple_choice": []
13
+ },
14
+ "configs": {
15
+ "polish_ppc_multiple_choice": {
16
+ "task": "polish_ppc_multiple_choice",
17
+ "dataset_path": "sdadas/ppc",
18
+ "training_split": "train",
19
+ "validation_split": "validation",
20
+ "test_split": "test",
21
+ "doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zależność między zdaniami A i B? Możliwe odpowiedzi:\nA - znaczą dokładnie to samo\nB - mają podobne znaczenie\nC - mają różne znaczenie\nPrawidłowa odpowiedź:",
22
+ "doc_to_target": "{{label|int - 1}}",
23
+ "doc_to_choice": [
24
+ "A",
25
+ "B",
26
+ "C"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 5,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ }
43
+ ],
44
+ "output_type": "multiple_choice",
45
+ "repeats": 1,
46
+ "should_decontaminate": true,
47
+ "doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
48
+ }
49
+ },
50
+ "versions": {
51
+ "polish_ppc_multiple_choice": "Yaml"
52
+ },
53
+ "n-shot": {
54
+ "polish_ppc_multiple_choice": 5
55
+ },
56
+ "higher_is_better": {
57
+ "polish_ppc_multiple_choice": {
58
+ "acc": true,
59
+ "acc_norm": true
60
+ }
61
+ },
62
+ "n-samples": {
63
+ "polish_ppc_multiple_choice": {
64
+ "original": 1000,
65
+ "effective": 1000
66
+ }
67
+ },
68
+ "config": {
69
+ "model": "hf",
70
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
71
+ "batch_size": "1",
72
+ "batch_sizes": [],
73
+ "device": "cuda:0",
74
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_multiple_choice/",
75
+ "limit": null,
76
+ "bootstrap_iters": 100000,
77
+ "gen_kwargs": null,
78
+ "random_seed": 0,
79
+ "numpy_seed": 1234,
80
+ "torch_seed": 1234,
81
+ "fewshot_seed": 1234
82
+ },
83
+ "git_hash": "2132286",
84
+ "date": 1723381736.8326252,
85
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
86
+ "transformers_version": "4.43.1",
87
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
88
+ "task_hashes": {
89
+ "polish_ppc_multiple_choice": "c3554bdb1ae93597ea2150e4ff1a633019458db699b0cb1639d96dd3970b6939"
90
+ },
91
+ "model_source": "hf",
92
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
93
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
94
+ "system_instruction": null,
95
+ "system_instruction_sha": null,
96
+ "chat_template": null,
97
+ "chat_template_sha": null,
98
+ "start_time": 779444.242971983,
99
+ "end_time": 779719.526449868,
100
+ "total_evaluation_time_seconds": "275.2834778849501"
101
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-30-24.424865.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_ppc_regex": {
4
+ "exact_match,score-first": 0.793,
5
+ "exact_match_stderr,score-first": 0.01281855355784399,
6
+ "alias": "polish_ppc_regex"
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "polish_ppc_regex": []
11
+ },
12
+ "configs": {
13
+ "polish_ppc_regex": {
14
+ "task": "polish_ppc_regex",
15
+ "dataset_path": "sdadas/ppc",
16
+ "training_split": "train",
17
+ "validation_split": "validation",
18
+ "test_split": "test",
19
+ "doc_to_text": "Zdanie A: \"{{sentence_A}}\"\nZdanie B: \"{{sentence_B}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy zdaniami A i B? Mo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - znacz膮 dok艂adnie to samo\nC - maj膮 podobne znaczenie\nD - maj膮 r贸偶ne znaczenie\nPrawid艂owa odpowied藕:",
20
+ "doc_to_target": "{{{0: 'A', 1: 'B', 2: 'C', 3: 'D'}.get(label|int)}}",
21
+ "description": "",
22
+ "target_delimiter": " ",
23
+ "fewshot_delimiter": "\n\n",
24
+ "num_fewshot": 5,
25
+ "metric_list": [
26
+ {
27
+ "metric": "exact_match",
28
+ "aggregation": "mean",
29
+ "higher_is_better": true
30
+ }
31
+ ],
32
+ "output_type": "generate_until",
33
+ "generation_kwargs": {
34
+ "until": [
35
+ ".",
36
+ ","
37
+ ],
38
+ "do_sample": false,
39
+ "temperature": 0.0,
40
+ "max_gen_toks": 50
41
+ },
42
+ "repeats": 1,
43
+ "filter_list": [
44
+ {
45
+ "name": "score-first",
46
+ "filter": [
47
+ {
48
+ "function": "regex",
49
+ "regex_pattern": "(\\b[ABCD]\\b)"
50
+ },
51
+ {
52
+ "function": "take_first"
53
+ }
54
+ ]
55
+ }
56
+ ],
57
+ "should_decontaminate": true,
58
+ "doc_to_decontamination_query": "{{sentence_A}} {{sentence_B}}"
59
+ }
60
+ },
61
+ "versions": {
62
+ "polish_ppc_regex": "Yaml"
63
+ },
64
+ "n-shot": {
65
+ "polish_ppc_regex": 5
66
+ },
67
+ "higher_is_better": {
68
+ "polish_ppc_regex": {
69
+ "exact_match": true
70
+ }
71
+ },
72
+ "n-samples": {
73
+ "polish_ppc_regex": {
74
+ "original": 1000,
75
+ "effective": 1000
76
+ }
77
+ },
78
+ "config": {
79
+ "model": "hf",
80
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
81
+ "batch_size": "1",
82
+ "batch_sizes": [],
83
+ "device": "cuda:0",
84
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_ppc_regex/",
85
+ "limit": null,
86
+ "bootstrap_iters": 100000,
87
+ "gen_kwargs": null,
88
+ "random_seed": 0,
89
+ "numpy_seed": 1234,
90
+ "torch_seed": 1234,
91
+ "fewshot_seed": 1234
92
+ },
93
+ "git_hash": "2132286",
94
+ "date": 1723381748.8771076,
95
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
96
+ "transformers_version": "4.43.1",
97
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
98
+ "task_hashes": {
99
+ "polish_ppc_regex": "a218e651c94f2f850a86a4c0b91c5b5a37007e52526c54eff802a95592defbe3"
100
+ },
101
+ "model_source": "hf",
102
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
103
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
104
+ "system_instruction": null,
105
+ "system_instruction_sha": null,
106
+ "chat_template": null,
107
+ "chat_template_sha": null,
108
+ "start_time": 2272911.893985835,
109
+ "end_time": 2274195.92427883,
110
+ "total_evaluation_time_seconds": "1284.030292995274"
111
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_multiple_choice_1723381711/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-18-48.485190.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_psc_multiple_choice": {
4
+ "acc,none": 0.9461966604823747,
5
+ "acc_stderr,none": 0.006875233780063374,
6
+ "f1,none": 0.9042904290429042,
7
+ "f1_stderr,none": "N/A",
8
+ "acc_norm,none": 0.9461966604823747,
9
+ "acc_norm_stderr,none": 0.006875233780063374,
10
+ "alias": "polish_psc_multiple_choice"
11
+ }
12
+ },
13
+ "group_subtasks": {
14
+ "polish_psc_multiple_choice": []
15
+ },
16
+ "configs": {
17
+ "polish_psc_multiple_choice": {
18
+ "task": "polish_psc_multiple_choice",
19
+ "dataset_path": "allegro/klej-psc",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "doc_to_text": "Tekst: \"{{extract_text}}\"\nPodsumowanie: \"{{summary_text}}\"\nPytanie: Czy podsumowanie dla podanego tekstu jest poprawne?\nOdpowiedz kr贸tko \"Tak\" lub \"Nie\". Prawid艂owa odpowied藕:",
23
+ "doc_to_target": "{{label|int}}",
24
+ "doc_to_choice": [
25
+ "Nie",
26
+ "Tak"
27
+ ],
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 5,
32
+ "metric_list": [
33
+ {
34
+ "metric": "acc",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true
37
+ },
38
+ {
39
+ "metric": "acc_norm",
40
+ "aggregation": "mean",
41
+ "higher_is_better": true
42
+ },
43
+ {
44
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
45
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
46
+ "higher_is_better": true
47
+ }
48
+ ],
49
+ "output_type": "multiple_choice",
50
+ "repeats": 1,
51
+ "should_decontaminate": true,
52
+ "doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
53
+ }
54
+ },
55
+ "versions": {
56
+ "polish_psc_multiple_choice": "Yaml"
57
+ },
58
+ "n-shot": {
59
+ "polish_psc_multiple_choice": 5
60
+ },
61
+ "higher_is_better": {
62
+ "polish_psc_multiple_choice": {
63
+ "acc": true,
64
+ "acc_norm": true,
65
+ "f1": true
66
+ }
67
+ },
68
+ "n-samples": {
69
+ "polish_psc_multiple_choice": {
70
+ "original": 1078,
71
+ "effective": 1078
72
+ }
73
+ },
74
+ "config": {
75
+ "model": "hf",
76
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
77
+ "batch_size": "1",
78
+ "batch_sizes": [],
79
+ "device": "cuda:0",
80
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_multiple_choice/",
81
+ "limit": null,
82
+ "bootstrap_iters": 100000,
83
+ "gen_kwargs": null,
84
+ "random_seed": 0,
85
+ "numpy_seed": 1234,
86
+ "torch_seed": 1234,
87
+ "fewshot_seed": 1234
88
+ },
89
+ "git_hash": "2132286",
90
+ "date": 1723381736.8328693,
91
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
92
+ "transformers_version": "4.43.1",
93
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
94
+ "task_hashes": {
95
+ "polish_psc_multiple_choice": "20f66e13606e4708007e9a49fc374f8348f8309b56413b4ea31956ce9f49c601"
96
+ },
97
+ "model_source": "hf",
98
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
99
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
100
+ "system_instruction": null,
101
+ "system_instruction_sha": null,
102
+ "chat_template": null,
103
+ "chat_template_sha": null,
104
+ "start_time": 779444.242721803,
105
+ "end_time": 780044.13483785,
106
+ "total_evaluation_time_seconds": "599.8921160469763"
107
+ }
eval-results/bielik2/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_regex_1723381722/__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2/results_2024-08-11T15-47-28.998766.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "polish_psc_regex": {
4
+ "exact_match,score-first": 0.8942486085343229,
5
+ "exact_match_stderr,score-first": 0.00937053376963659,
6
+ "f1,score-first": 0.9228687415426252,
7
+ "f1_stderr,score-first": "N/A",
8
+ "alias": "polish_psc_regex"
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "polish_psc_regex": []
13
+ },
14
+ "configs": {
15
+ "polish_psc_regex": {
16
+ "task": "polish_psc_regex",
17
+ "dataset_path": "allegro/klej-psc",
18
+ "training_split": "train",
19
+ "test_split": "test",
20
+ "doc_to_text": "Fragment 1: \"{{extract_text}}\"\nFragment 2: \"{{summary_text}}\"\nPytanie: jaka jest zale偶no艣膰 mi臋dzy fragmentami 1 i 2?\nMo偶liwe odpowiedzi:\nA - wszystkie odpowiedzi poprawne\nB - dotycz膮 tego samego artyku艂u\nC - dotycz膮 r贸偶nych artyku艂贸w\nD - brak poprawnej odpowiedzi\nPrawid艂owa odpowied藕:",
21
+ "doc_to_target": "{{{0: 'A', 1: 'C', 2: 'B', 3: 'D'}.get(label|int + 1)}}",
22
+ "description": "",
23
+ "target_delimiter": " ",
24
+ "fewshot_delimiter": "\n\n",
25
+ "num_fewshot": 5,
26
+ "metric_list": [
27
+ {
28
+ "metric": "exact_match",
29
+ "aggregation": "mean",
30
+ "higher_is_better": true
31
+ },
32
+ {
33
+ "metric": "def f1(predictions, references):\n _prediction = predictions[0]\n _reference = references[0]\n string_label = [\"B\", \"C\"]\n reference = string_label.index(_reference)\n prediction = (\n string_label.index(_prediction)\n if _prediction in string_label\n else 0\n )\n\n return (prediction, reference)\n",
34
+ "aggregation": "def agg_f1(items):\n predictions, references = zip(*items)\n references, predictions = np.asarray(references), np.asarray(predictions)\n\n return sklearn.metrics.f1_score(references, predictions)\n",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ ".",
42
+ ","
43
+ ],
44
+ "do_sample": false,
45
+ "temperature": 0.0,
46
+ "max_gen_toks": 50
47
+ },
48
+ "repeats": 1,
49
+ "filter_list": [
50
+ {
51
+ "name": "score-first",
52
+ "filter": [
53
+ {
54
+ "function": "regex",
55
+ "regex_pattern": "(\\b[ABCD]\\b)"
56
+ },
57
+ {
58
+ "function": "take_first"
59
+ }
60
+ ]
61
+ }
62
+ ],
63
+ "should_decontaminate": true,
64
+ "doc_to_decontamination_query": "{{extract_text}} {{summary_text}}"
65
+ }
66
+ },
67
+ "versions": {
68
+ "polish_psc_regex": "Yaml"
69
+ },
70
+ "n-shot": {
71
+ "polish_psc_regex": 5
72
+ },
73
+ "higher_is_better": {
74
+ "polish_psc_regex": {
75
+ "exact_match": true,
76
+ "f1": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "polish_psc_regex": {
81
+ "original": 1078,
82
+ "effective": 1078
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "hf",
87
+ "model_args": "pretrained=speakleash/Bielik-11B-v2.1-Instruct,dtype=bfloat16,trust_remote_code=True",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": "cuda:0",
91
+ "use_cache": "sqlite_caches/plgchriso/models/bielik_11B-v2_dpo/dpo5-001_e2_-5_polish_psc_regex/",
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "2132286",
101
+ "date": 1723381748.6041322,
102
+ "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Rocky Linux 9.3 (Blue Onyx) (x86_64)\nGCC version: (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2)\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.34\n\nPython version: 3.10.4 (main, Dec 14 2022, 11:01:42) [GCC 11.3.0] (64-bit runtime)\nPython platform: Linux-5.14.0-362.24.1.el9_3.x86_64-x86_64-with-glibc2.34\nIs CUDA available: True\nCUDA runtime version: 12.1.105\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-40GB\nNvidia driver version: 550.54.15\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4500.15\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat 
npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 4\nNUMA node0 CPU(s): 0-31\nNUMA node1 CPU(s): 32-63\nNUMA node2 CPU(s): 64-95\nNUMA node3 CPU(s): 96-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled\nVulnerability Spec rstack overflow: Mitigation; SMT disabled\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] Could not collect",
103
+ "transformers_version": "4.43.1",
104
+ "upper_git_hash": "2132286315025b3abd7a22b7309f7052be200287",
105
+ "task_hashes": {
106
+ "polish_psc_regex": "62e1c0c7b4494ec1f99bb0c7eeaad898f5b3e48f9263f8212e2a9759d5499045"
107
+ },
108
+ "model_source": "hf",
109
+ "model_name": "speakleash/Bielik-11B-v2.1-Instruct",
110
+ "model_name_sanitized": "__net__pr2__projects__plgrid__plggspkl__plgchriso__models__bielik_11B-v2_dpo__dpo5-001_e2",
111
+ "system_instruction": null,
112
+ "system_instruction_sha": null,
113
+ "chat_template": null,
114
+ "chat_template_sha": null,
115
+ "start_time": 780407.631957345,
116
+ "end_time": 782716.499648762,
117
+ "total_evaluation_time_seconds": "2308.8676914171083"
118
+ }