Spaces:
Build error
Build error
ready for final run
Browse files
- data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv +3 -0
- data/Qwen2.5-0.5B-Instruct_metrics.csv +12 -0
- data/Qwen2.5-0.5B-Instruct_shots_metrics.csv +8 -0
- data/Qwen2.5-1.5B-Instruct_metrics.csv +12 -0
- data/Qwen2.5-1.5B-Instruct_results.csv +0 -0
- data/Qwen2.5-1.5B-Instruct_shots_metrics.csv +8 -0
- data/Qwen2.5-3B-Instruct_metrics.csv +11 -3
- data/Qwen2.5-3B-Instruct_results.csv +0 -0
- data/Qwen2.5-3B-Instruct_shots_metrics.csv +6 -0
- data/Qwen2.5-72B-Instruct_metrics.csv +5 -0
- data/Qwen2.5-72B-Instruct_shots_metrics.csv +1 -0
- data/Qwen2.5-7B-Instruct_metrics.csv +12 -0
- data/Qwen2.5-7B-Instruct_shots_metrics.csv +8 -0
- data/internlm2_5-7b-chat-1m_shots_metrics.csv +4 -0
- data/internlm2_5-7b-chat_shots_metrics.csv +3 -0
- llm_toolkit/logical_reasoning_utils.py +32 -11
- notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb +0 -0
- notebooks/01a_internlm2_5-7b-chat_analysis.ipynb +0 -0
- notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb +0 -0
- notebooks/02c_Qwen2.5-3B-Instruct_analysis.ipynb +0 -0
- notebooks/02d_Qwen2.5-7B-Instruct_analysis.ipynb +0 -0
- notebooks/02e_Qwen2.5-1.5B-Instruct_analysis.ipynb +0 -0
- notebooks/02f_Qwen2.5-0.5B-Instruct_analysis.ipynb +0 -0
- notebooks/02g_Qwen2.5-72B-Instruct_analysis.ipynb +0 -0
- notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb +0 -0
- scripts/eval-epochs.sh +1 -1
- scripts/eval-mgtv-qwen2.5_4bit.sh +7 -0
- scripts/eval-shots.sh +6 -0
data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv
CHANGED
@@ -1,3 +1,6 @@
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
|
|
|
3 |
10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334
|
|
|
|
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
|
3 |
+
5,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-05,0.7056666666666667,0.7605745196939752,0.7056666666666667,0.7269189565098723,0.9886666666666667
|
4 |
10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334
|
5 |
+
20,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-20,0.767,0.7690587905035869,0.767,0.7661695279121855,0.979
|
6 |
+
30,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-30,0.7693333333333333,0.7765844200886581,0.7693333333333333,0.7697325957683855,0.7326666666666667
|
data/Qwen2.5-0.5B-Instruct_metrics.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct_torch.float16_lf,0.443,0.5490534863315207,0.443,0.43178235266224163,0.594
|
3 |
+
0.2,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-35_torch.float16_lf,0.525,0.5819221558338251,0.525,0.4586682135998428,1.0
|
4 |
+
0.4,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-70_torch.float16_lf,0.54,0.6445255881472232,0.54,0.5293020271128788,1.0
|
5 |
+
0.6,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-105_torch.float16_lf,0.43766666666666665,0.6565760150511494,0.43766666666666665,0.49167707971005714,1.0
|
6 |
+
0.8,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-140_torch.float16_lf,0.49933333333333335,0.6513093602943617,0.49933333333333335,0.49913143191054443,1.0
|
7 |
+
1.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-175_torch.float16_lf,0.5523333333333333,0.6622075519433389,0.5523333333333333,0.5627283867177305,1.0
|
8 |
+
1.2,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-210_torch.float16_lf,0.5403333333333333,0.64319564963495,0.5403333333333333,0.5598419070210608,1.0
|
9 |
+
1.4,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-245_torch.float16_lf,0.5843333333333334,0.6559808590166016,0.5843333333333334,0.6086767064128167,1.0
|
10 |
+
1.6,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-280_torch.float16_lf,0.5216666666666666,0.6604678981061621,0.5216666666666666,0.5615446578399996,1.0
|
11 |
+
1.8,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-315_torch.float16_lf,0.524,0.6673441240188523,0.524,0.5607458201939703,1.0
|
12 |
+
2.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-350_torch.float16_lf,0.507,0.6597337077954278,0.5070000000000001,0.5492280882625964,1.0
|
data/Qwen2.5-0.5B-Instruct_shots_metrics.csv
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-00,0.443,0.5490534863315207,0.443,0.43178235266224163,0.594
|
3 |
+
5,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-05,0.177,0.49074939459487404,0.177,0.2155165894788838,0.004
|
4 |
+
10,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-10,0.35433333333333333,0.5213384036972462,0.35433333333333333,0.39783362635065245,0.068
|
5 |
+
20,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-20,0.43666666666666665,0.5234006681691764,0.43666666666666665,0.4691719255495575,0.37266666666666665
|
6 |
+
30,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-30,0.39066666666666666,0.5462493905687185,0.39066666666666666,0.4339604066000981,0.07566666666666666
|
7 |
+
40,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-40,0.4653333333333333,0.5468189581246721,0.4653333333333333,0.49752341605759137,0.324
|
8 |
+
50,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-50,0.5026666666666667,0.5610230233594029,0.5026666666666667,0.5163435163649445,0.24333333333333335
|
data/Qwen2.5-1.5B-Instruct_metrics.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct_torch.float16_lf,0.19966666666666666,0.5279959815418013,0.19966666666666666,0.23918953371981191,0.9223333333333333
|
3 |
+
0.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-35_torch.float16_lf,0.481,0.6625717555914767,0.481,0.5396575906071639,0.9996666666666667
|
4 |
+
0.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-70_torch.float16_lf,0.5653333333333334,0.711044032475149,0.5653333333333334,0.6130876130683667,0.9996666666666667
|
5 |
+
0.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-105_torch.float16_lf,0.5303333333333333,0.7229828883930918,0.5303333333333333,0.5954306316407808,1.0
|
6 |
+
0.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-140_torch.float16_lf,0.6423333333333333,0.7326944345439858,0.6423333333333333,0.6760588124127741,1.0
|
7 |
+
1.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-175_torch.float16_lf,0.6266666666666667,0.716073329097764,0.6266666666666667,0.6524988509397216,1.0
|
8 |
+
1.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-210_torch.float16_lf,0.5773333333333334,0.7309423620832619,0.5773333333333334,0.6328015564736814,1.0
|
9 |
+
1.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-245_torch.float16_lf,0.6403333333333333,0.749792626106991,0.6403333333333333,0.679795778108406,1.0
|
10 |
+
1.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-280_torch.float16_lf,0.6233333333333333,0.7415417300032008,0.6233333333333333,0.6642642786690383,1.0
|
11 |
+
1.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-315_torch.float16_lf,0.5903333333333334,0.7358743162328453,0.5903333333333334,0.6381733773475835,1.0
|
12 |
+
2.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-350_torch.float16_lf,0.5966666666666667,0.7358100917578044,0.5966666666666667,0.6407733961630157,1.0
|
data/Qwen2.5-1.5B-Instruct_results.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/Qwen2.5-1.5B-Instruct_shots_metrics.csv
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-00,0.19966666666666666,0.5279959815418013,0.19966666666666666,0.23918953371981191,0.9223333333333333
|
3 |
+
5,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-05,0.3913333333333333,0.5906409192176565,0.3913333333333333,0.4387379376697362,0.8283333333333334
|
4 |
+
10,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-10,0.405,0.5886453916977137,0.405,0.46059038959324416,0.9156666666666666
|
5 |
+
20,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-20,0.228,0.5255112437643187,0.228,0.30386597855848074,0.676
|
6 |
+
30,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-30,0.23033333333333333,0.55368556787824,0.23033333333333333,0.3067125355762305,0.661
|
7 |
+
40,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-40,0.292,0.5667420801465655,0.292,0.375496356843247,0.5206666666666667
|
8 |
+
50,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-50,0.2876666666666667,0.5660207537890989,0.2876666666666667,0.36627420118815035,0.4603333333333333
|
data/Qwen2.5-3B-Instruct_metrics.csv
CHANGED
@@ -1,4 +1,12 @@
|
|
1 |
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
-
0.0,Qwen2.5-3B-Instruct,
|
3 |
-
0.2,Qwen2.5-3B-Instruct,
|
4 |
-
0.4,Qwen2.5-3B-Instruct,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct_torch.float16_lf,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
|
3 |
+
0.2,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-35_torch.float16_lf,0.689,0.7450174119748659,0.689,0.709114466474576,0.9986666666666667
|
4 |
+
0.4,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-70_torch.float16_lf,0.6556666666666666,0.7590430811422313,0.6556666666666666,0.6934194398116857,1.0
|
5 |
+
0.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-105_torch.float16_lf,0.6963333333333334,0.7550938479315918,0.6963333333333334,0.71844324172961,1.0
|
6 |
+
0.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-140_torch.float16_lf,0.6853333333333333,0.7542524799326954,0.6853333333333333,0.7128732915785243,1.0
|
7 |
+
1.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-175_torch.float16_lf,0.6846666666666666,0.7564071354272528,0.6846666666666666,0.7125676758538035,1.0
|
8 |
+
1.2,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-210_torch.float16_lf,0.6896666666666667,0.7690917466956201,0.6896666666666667,0.720231747443145,1.0
|
9 |
+
1.4,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-245_torch.float16_lf,0.7256666666666667,0.7753705482689578,0.7256666666666667,0.7440390153124937,1.0
|
10 |
+
1.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-280_torch.float16_lf,0.708,0.7659638403826392,0.708,0.7293997518219294,1.0
|
11 |
+
1.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-315_torch.float16_lf,0.7056666666666667,0.7717562122699148,0.7056666666666667,0.729817759784445,1.0
|
12 |
+
2.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-350_torch.float16_lf,0.7003333333333334,0.7698824212888824,0.7003333333333334,0.726563613830647,1.0
|
data/Qwen2.5-3B-Instruct_results.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/Qwen2.5-3B-Instruct_shots_metrics.csv
CHANGED
@@ -1,2 +1,8 @@
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
|
3 |
+
5,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-05,0.639,0.7226431221398603,0.639,0.641568790114368,0.9973333333333333
|
4 |
+
10,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-10,0.625,0.7164154004131771,0.625,0.6402584852791593,0.995
|
5 |
+
20,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-20,0.4666666666666667,0.6987641430848737,0.46666666666666673,0.5265074036660548,0.9316666666666666
|
6 |
+
30,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-30,0.475,0.6880994914236809,0.475,0.5310948082593374,0.904
|
7 |
+
40,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-40,0.584,0.7065303262365236,0.584,0.6214992664375876,0.7173333333333334
|
8 |
+
50,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-50,0.6093333333333333,0.7120506480394511,0.6093333333333333,0.6451959368825358,0.574
|
data/Qwen2.5-72B-Instruct_metrics.csv
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0.0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct_torch.bfloat16_4bit,0.755,0.7861877119461959,0.755,0.7540930716916622,0.5573333333333333
|
3 |
+
0.2,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-35_torch.bfloat16_4bit,0.7776666666666666,0.8064344404751805,0.7776666666666666,0.7902083134269027,0.5623333333333334
|
4 |
+
0.4,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-70_torch.bfloat16_4bit,0.7346666666666667,0.7919767732613179,0.7346666666666667,0.7595614261349122,0.5626666666666666
|
5 |
+
0.6,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-105_torch.bfloat16_4bit,0.739,0.8045199868378529,0.739,0.7672258374793208,0.5626666666666666
|
data/Qwen2.5-72B-Instruct_shots_metrics.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
shots,model,run,accuracy,precision,recall,f1
|
data/Qwen2.5-7B-Instruct_metrics.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
|
3 |
+
0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.745,0.7643041174791825,0.745,0.7482828029872421,0.998
|
4 |
+
0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.7446666666666667,0.7800215227839997,0.7446666666666667,0.7576550061479678,0.9996666666666667
|
5 |
+
0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7513333333333333,0.7996792149630704,0.7513333333333333,0.7693730206330721,0.9996666666666667
|
6 |
+
0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.75,0.7923028105975739,0.75,0.7665531868559959,1.0
|
7 |
+
1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333333
|
8 |
+
1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7443333333333333,0.79978900243777,0.7443333333333333,0.7660506505481828,1.0
|
9 |
+
1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.7486666666666667,0.7974562319123832,0.7486666666666667,0.7655275916268014,0.9993333333333333
|
10 |
+
1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7566666666666667,0.7939852407869384,0.7566666666666667,0.7689495073735431,0.9996666666666667
|
11 |
+
1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.755,0.7940575522966016,0.755,0.7681326415137147,0.9993333333333333
|
12 |
+
2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.756,0.7982464722401461,0.756,0.7704035278260453,0.9996666666666667
|
data/Qwen2.5-7B-Instruct_shots_metrics.csv
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
+
0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
|
3 |
+
5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.6346666666666667,0.7653343185471776,0.6346666666666667,0.6219419633691871,0.998
|
4 |
+
10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.678,0.7675951017673515,0.678,0.6790860659550377,0.9796666666666667
|
5 |
+
20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7353333333333333,0.7702034737275962,0.7353333333333333,0.7278047438569933,0.807
|
6 |
+
30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.7646666666666667,0.7787918401418651,0.7646666666666667,0.7527649874769439,0.805
|
7 |
+
40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.759,0.7736852689131295,0.759,0.7472252604775926,0.8546666666666667
|
8 |
+
50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.7586666666666667,0.7640431634617543,0.7586666666666667,0.7414332963557551,0.7563333333333333
|
data/internlm2_5-7b-chat-1m_shots_metrics.csv
CHANGED
@@ -1,3 +1,7 @@
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
|
|
|
3 |
10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667
|
|
|
|
|
|
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
|
3 |
+
5,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-05,0.7763333333333333,0.7640598325070357,0.7763333333333333,0.7700878172419743,0.9453333333333334
|
4 |
10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667
|
5 |
+
20,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-20,0.6733333333333333,0.7314610506764355,0.6733333333333333,0.6764198712634657,0.8213333333333334
|
6 |
+
30,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-30,0.6736666666666666,0.7482542000402412,0.6736666666666666,0.6810446770610585,0.8236666666666667
|
7 |
+
40,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-40,0.724,0.7567654663125225,0.724,0.712500180941536,0.8336666666666667
|
data/internlm2_5-7b-chat_shots_metrics.csv
CHANGED
@@ -1,3 +1,6 @@
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
|
|
|
3 |
10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333
|
|
|
|
|
|
1 |
shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
|
2 |
0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
|
3 |
+
5,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-05,0.7476666666666667,0.746806876028684,0.7476666666666667,0.7270588443494302,0.999
|
4 |
10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333
|
5 |
+
20,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-20,0.647,0.721136036365055,0.647,0.6769738108371004,0.9473333333333334
|
6 |
+
30,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-30,0.6263333333333333,0.7256804685839701,0.6263333333333333,0.6534519727626863,0.9403333333333334
|
llm_toolkit/logical_reasoning_utils.py
CHANGED
@@ -207,8 +207,30 @@ def extract_answer(text, debug=False):
|
|
207 |
|
208 |
return ""
|
209 |
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
assert len(references) == len(
|
213 |
predictions
|
214 |
), f"lengths are difference: {len(references)} != {len(predictions)}"
|
@@ -216,7 +238,7 @@ def calc_metrics(references, predictions, debug=False):
|
|
216 |
labels = np.unique(references)
|
217 |
valid_classifications = [1 if p in labels else 0 for p in predictions]
|
218 |
|
219 |
-
predictions = [extract_answer(text) for text in predictions]
|
220 |
|
221 |
accuracy = accuracy_score(references, predictions)
|
222 |
|
@@ -350,12 +372,9 @@ def get_metrics(df):
|
|
350 |
metrics_df = metrics_df.drop(columns=["index"])
|
351 |
|
352 |
accuracy = []
|
353 |
-
meteor = []
|
354 |
-
bleu_1 = []
|
355 |
-
rouge_l = []
|
356 |
all_metrics = []
|
357 |
for col in df.columns[2:]:
|
358 |
-
metrics = calc_metrics(df["label"], df[col], debug=True)
|
359 |
print(f"{col}: {metrics}")
|
360 |
|
361 |
accuracy.append(metrics["accuracy"])
|
@@ -414,7 +433,9 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
|
|
414 |
# )
|
415 |
if preprocess_func:
|
416 |
df["backup"] = df[column_name]
|
417 |
-
df[column_name] = df[column_name].apply(preprocess_func)
|
|
|
|
|
418 |
|
419 |
plt.figure(figsize=(8, 4))
|
420 |
|
@@ -438,8 +459,8 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
|
|
438 |
df.drop(columns=["backup"], inplace=True)
|
439 |
|
440 |
|
441 |
-
def calc_metrics_for_col(df, col):
|
442 |
-
metrics = calc_metrics(df["label"], df[col], debug=
|
443 |
return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
|
444 |
|
445 |
|
@@ -460,7 +481,7 @@ def get_metrics_df(df, variant="epoch", sort_columns=True):
|
|
460 |
columns = sorted(columns, key=lambda x: int(x.lower().replace("-1m", "").replace("chat", "0").replace("instruct", "0").split("-")[-1].split("_")[0]))
|
461 |
print("columns:", columns)
|
462 |
for i, col in enumerate(columns):
|
463 |
-
metrics = calc_metrics(df["label"], df[col], debug=False)
|
464 |
new_model_metrics = {
|
465 |
variant: i / 5 if variant == "epoch" else i + 1,
|
466 |
"model": col if "/" not in col else col.split("/")[1].split("_torch")[0],
|
|
|
207 |
|
208 |
return ""
|
209 |
|
210 |
+
def extract_answer_from_text(text, question):
|
211 |
+
labels = ['不是', '是', '不重要', '回答正确', '问法错误']
|
212 |
+
original_text = text
|
213 |
+
text = text.split("回答:")[-1]
|
214 |
+
found_question = False
|
215 |
+
for line in text.split("\n"):
|
216 |
+
if question in line:
|
217 |
+
found_question = True
|
218 |
+
elif found_question:
|
219 |
+
text = line
|
220 |
+
break
|
221 |
+
|
222 |
+
text = extract_answer(text)
|
223 |
+
if text in labels:
|
224 |
+
return text
|
225 |
+
|
226 |
+
text = text.replace("Human: ", "")
|
227 |
+
text = text.replace("Assistant: ", "")
|
228 |
+
if text in labels:
|
229 |
+
return text
|
230 |
+
# print(f"not found: {question} | {original_text} | {text}")
|
231 |
+
return text
|
232 |
+
|
233 |
+
def calc_metrics(references, predictions, questions=None, debug=False):
|
234 |
assert len(references) == len(
|
235 |
predictions
|
236 |
), f"lengths are difference: {len(references)} != {len(predictions)}"
|
|
|
238 |
labels = np.unique(references)
|
239 |
valid_classifications = [1 if p in labels else 0 for p in predictions]
|
240 |
|
241 |
+
predictions = [extract_answer(text) for text in predictions] if questions is None else [extract_answer_from_text(text, question) for text, question in zip(predictions, questions)]
|
242 |
|
243 |
accuracy = accuracy_score(references, predictions)
|
244 |
|
|
|
372 |
metrics_df = metrics_df.drop(columns=["index"])
|
373 |
|
374 |
accuracy = []
|
|
|
|
|
|
|
375 |
all_metrics = []
|
376 |
for col in df.columns[2:]:
|
377 |
+
metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=True)
|
378 |
print(f"{col}: {metrics}")
|
379 |
|
380 |
accuracy.append(metrics["accuracy"])
|
|
|
433 |
# )
|
434 |
if preprocess_func:
|
435 |
df["backup"] = df[column_name]
|
436 |
+
df[column_name] = df[column_name].apply(preprocess_func) if preprocess_func == extract_answer else df.apply(
|
437 |
+
preprocess_func, axis=1
|
438 |
+
)
|
439 |
|
440 |
plt.figure(figsize=(8, 4))
|
441 |
|
|
|
459 |
df.drop(columns=["backup"], inplace=True)
|
460 |
|
461 |
|
462 |
+
def calc_metrics_for_col(df, col, debug=True):
|
463 |
+
metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=debug)
|
464 |
return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
|
465 |
|
466 |
|
|
|
481 |
columns = sorted(columns, key=lambda x: int(x.lower().replace("-1m", "").replace("chat", "0").replace("instruct", "0").split("-")[-1].split("_")[0]))
|
482 |
print("columns:", columns)
|
483 |
for i, col in enumerate(columns):
|
484 |
+
metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=False)
|
485 |
new_model_metrics = {
|
486 |
variant: i / 5 if variant == "epoch" else i + 1,
|
487 |
"model": col if "/" not in col else col.split("/")[1].split("_torch")[0],
|
notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/01a_internlm2_5-7b-chat_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02c_Qwen2.5-3B-Instruct_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02d_Qwen2.5-7B-Instruct_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02e_Qwen2.5-1.5B-Instruct_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02f_Qwen2.5-0.5B-Instruct_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/02g_Qwen2.5-72B-Instruct_analysis.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
scripts/eval-epochs.sh
CHANGED
@@ -7,7 +7,7 @@ pwd
|
|
7 |
|
8 |
export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
|
9 |
export RESIZE_TOKEN_EMBEDDINGS=true
|
10 |
-
|
11 |
export USING_P1_PROMPT_TEMPLATE=false
|
12 |
|
13 |
export ORG_NAME=$1
|
|
|
7 |
|
8 |
export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
|
9 |
export RESIZE_TOKEN_EMBEDDINGS=true
|
10 |
+
export USING_LLAMA_FACTORY=true
|
11 |
export USING_P1_PROMPT_TEMPLATE=false
|
12 |
|
13 |
export ORG_NAME=$1
|
scripts/eval-mgtv-qwen2.5_4bit.sh
CHANGED
@@ -19,5 +19,12 @@ $BASEDIR/scripts/eval-epochs.sh Qwen Qwen2.5-72B-Instruct
|
|
19 |
export START_NUM_SHOTS=5
|
20 |
$BASEDIR/scripts/eval-shots_4bit.sh Qwen Qwen2.5-72B-Instruct
|
21 |
|
|
|
22 |
export START_NUM_SHOTS=40
|
23 |
$BASEDIR/scripts/eval-shots_4bit.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
export START_NUM_SHOTS=5
|
20 |
$BASEDIR/scripts/eval-shots_4bit.sh Qwen Qwen2.5-72B-Instruct
|
21 |
|
22 |
+
|
23 |
export START_NUM_SHOTS=40
|
24 |
$BASEDIR/scripts/eval-shots_4bit.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
|
25 |
+
|
26 |
+
export LOAD_IN_4BIT=false
|
27 |
+
$BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat
|
28 |
+
|
29 |
+
export START_NUM_SHOTS=50
|
30 |
+
$BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat-1m
|
scripts/eval-shots.sh
CHANGED
@@ -17,5 +17,11 @@ export MODEL_NAME=$ORG_NAME/$MODEL
|
|
17 |
|
18 |
export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
echo Evaluating $MODEL_NAME with few-shot learning
|
21 |
python llm_toolkit/eval_shots.py
|
|
|
17 |
|
18 |
export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
|
19 |
|
20 |
+
if [ "$MODEL" == "Qwen2.5-3B-Instruct" ];
|
21 |
+
then
|
22 |
+
echo "Skipping Qwen2.5-3B-Instruct"
|
23 |
+
exit 0
|
24 |
+
fi
|
25 |
+
|
26 |
echo Evaluating $MODEL_NAME with few-shot learning
|
27 |
python llm_toolkit/eval_shots.py
|