dh-mc committed on
Commit
687f68b
·
1 Parent(s): 18dda7e

ready for final run

Browse files
data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv CHANGED
@@ -1,3 +1,6 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
 
3
  10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334
 
 
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
3
+ 5,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-05,0.7056666666666667,0.7605745196939752,0.7056666666666667,0.7269189565098723,0.9886666666666667
4
  10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334
5
+ 20,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-20,0.767,0.7690587905035869,0.767,0.7661695279121855,0.979
6
+ 30,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-30,0.7693333333333333,0.7765844200886581,0.7693333333333333,0.7697325957683855,0.7326666666666667
data/Qwen2.5-0.5B-Instruct_metrics.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct_torch.float16_lf,0.443,0.5490534863315207,0.443,0.43178235266224163,0.594
3
+ 0.2,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-35_torch.float16_lf,0.525,0.5819221558338251,0.525,0.4586682135998428,1.0
4
+ 0.4,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-70_torch.float16_lf,0.54,0.6445255881472232,0.54,0.5293020271128788,1.0
5
+ 0.6,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-105_torch.float16_lf,0.43766666666666665,0.6565760150511494,0.43766666666666665,0.49167707971005714,1.0
6
+ 0.8,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-140_torch.float16_lf,0.49933333333333335,0.6513093602943617,0.49933333333333335,0.49913143191054443,1.0
7
+ 1.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-175_torch.float16_lf,0.5523333333333333,0.6622075519433389,0.5523333333333333,0.5627283867177305,1.0
8
+ 1.2,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-210_torch.float16_lf,0.5403333333333333,0.64319564963495,0.5403333333333333,0.5598419070210608,1.0
9
+ 1.4,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-245_torch.float16_lf,0.5843333333333334,0.6559808590166016,0.5843333333333334,0.6086767064128167,1.0
10
+ 1.6,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-280_torch.float16_lf,0.5216666666666666,0.6604678981061621,0.5216666666666666,0.5615446578399996,1.0
11
+ 1.8,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-315_torch.float16_lf,0.524,0.6673441240188523,0.524,0.5607458201939703,1.0
12
+ 2.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-350_torch.float16_lf,0.507,0.6597337077954278,0.5070000000000001,0.5492280882625964,1.0
data/Qwen2.5-0.5B-Instruct_shots_metrics.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-00,0.443,0.5490534863315207,0.443,0.43178235266224163,0.594
3
+ 5,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-05,0.177,0.49074939459487404,0.177,0.2155165894788838,0.004
4
+ 10,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-10,0.35433333333333333,0.5213384036972462,0.35433333333333333,0.39783362635065245,0.068
5
+ 20,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-20,0.43666666666666665,0.5234006681691764,0.43666666666666665,0.4691719255495575,0.37266666666666665
6
+ 30,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-30,0.39066666666666666,0.5462493905687185,0.39066666666666666,0.4339604066000981,0.07566666666666666
7
+ 40,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-40,0.4653333333333333,0.5468189581246721,0.4653333333333333,0.49752341605759137,0.324
8
+ 50,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-50,0.5026666666666667,0.5610230233594029,0.5026666666666667,0.5163435163649445,0.24333333333333335
data/Qwen2.5-1.5B-Instruct_metrics.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct_torch.float16_lf,0.19966666666666666,0.5279959815418013,0.19966666666666666,0.23918953371981191,0.9223333333333333
3
+ 0.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-35_torch.float16_lf,0.481,0.6625717555914767,0.481,0.5396575906071639,0.9996666666666667
4
+ 0.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-70_torch.float16_lf,0.5653333333333334,0.711044032475149,0.5653333333333334,0.6130876130683667,0.9996666666666667
5
+ 0.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-105_torch.float16_lf,0.5303333333333333,0.7229828883930918,0.5303333333333333,0.5954306316407808,1.0
6
+ 0.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-140_torch.float16_lf,0.6423333333333333,0.7326944345439858,0.6423333333333333,0.6760588124127741,1.0
7
+ 1.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-175_torch.float16_lf,0.6266666666666667,0.716073329097764,0.6266666666666667,0.6524988509397216,1.0
8
+ 1.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-210_torch.float16_lf,0.5773333333333334,0.7309423620832619,0.5773333333333334,0.6328015564736814,1.0
9
+ 1.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-245_torch.float16_lf,0.6403333333333333,0.749792626106991,0.6403333333333333,0.679795778108406,1.0
10
+ 1.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-280_torch.float16_lf,0.6233333333333333,0.7415417300032008,0.6233333333333333,0.6642642786690383,1.0
11
+ 1.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-315_torch.float16_lf,0.5903333333333334,0.7358743162328453,0.5903333333333334,0.6381733773475835,1.0
12
+ 2.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-350_torch.float16_lf,0.5966666666666667,0.7358100917578044,0.5966666666666667,0.6407733961630157,1.0
data/Qwen2.5-1.5B-Instruct_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/Qwen2.5-1.5B-Instruct_shots_metrics.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-00,0.19966666666666666,0.5279959815418013,0.19966666666666666,0.23918953371981191,0.9223333333333333
3
+ 5,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-05,0.3913333333333333,0.5906409192176565,0.3913333333333333,0.4387379376697362,0.8283333333333334
4
+ 10,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-10,0.405,0.5886453916977137,0.405,0.46059038959324416,0.9156666666666666
5
+ 20,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-20,0.228,0.5255112437643187,0.228,0.30386597855848074,0.676
6
+ 30,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-30,0.23033333333333333,0.55368556787824,0.23033333333333333,0.3067125355762305,0.661
7
+ 40,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-40,0.292,0.5667420801465655,0.292,0.375496356843247,0.5206666666666667
8
+ 50,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-50,0.2876666666666667,0.5660207537890989,0.2876666666666667,0.36627420118815035,0.4603333333333333
data/Qwen2.5-3B-Instruct_metrics.csv CHANGED
@@ -1,4 +1,12 @@
1
  epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
- 0.0,Qwen2.5-3B-Instruct,qwen/Qwen2.5-3B-Instruct/checkpoint-35_torch.bfloat16_lf,0.7033333333333334,0.7493686353899274,0.7033333333333334,0.7196581245915875,1.0
3
- 0.2,Qwen2.5-3B-Instruct,qwen/Qwen2.5-3B-Instruct/checkpoint-70_torch.bfloat16_lf,0.664,0.7490874767990094,0.664,0.6954540806463714,1.0
4
- 0.4,Qwen2.5-3B-Instruct,qwen/Qwen2.5-3B-Instruct/checkpoint-88_torch.bfloat16_lf,0.6743333333333333,0.7591682267298503,0.6743333333333333,0.7069378240575964,1.0
 
 
 
 
 
 
 
 
 
1
  epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct_torch.float16_lf,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
3
+ 0.2,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-35_torch.float16_lf,0.689,0.7450174119748659,0.689,0.709114466474576,0.9986666666666667
4
+ 0.4,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-70_torch.float16_lf,0.6556666666666666,0.7590430811422313,0.6556666666666666,0.6934194398116857,1.0
5
+ 0.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-105_torch.float16_lf,0.6963333333333334,0.7550938479315918,0.6963333333333334,0.71844324172961,1.0
6
+ 0.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-140_torch.float16_lf,0.6853333333333333,0.7542524799326954,0.6853333333333333,0.7128732915785243,1.0
7
+ 1.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-175_torch.float16_lf,0.6846666666666666,0.7564071354272528,0.6846666666666666,0.7125676758538035,1.0
8
+ 1.2,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-210_torch.float16_lf,0.6896666666666667,0.7690917466956201,0.6896666666666667,0.720231747443145,1.0
9
+ 1.4,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-245_torch.float16_lf,0.7256666666666667,0.7753705482689578,0.7256666666666667,0.7440390153124937,1.0
10
+ 1.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-280_torch.float16_lf,0.708,0.7659638403826392,0.708,0.7293997518219294,1.0
11
+ 1.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-315_torch.float16_lf,0.7056666666666667,0.7717562122699148,0.7056666666666667,0.729817759784445,1.0
12
+ 2.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-350_torch.float16_lf,0.7003333333333334,0.7698824212888824,0.7003333333333334,0.726563613830647,1.0
data/Qwen2.5-3B-Instruct_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/Qwen2.5-3B-Instruct_shots_metrics.csv CHANGED
@@ -1,2 +1,8 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
 
 
 
 
 
 
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
3
+ 5,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-05,0.639,0.7226431221398603,0.639,0.641568790114368,0.9973333333333333
4
+ 10,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-10,0.625,0.7164154004131771,0.625,0.6402584852791593,0.995
5
+ 20,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-20,0.4666666666666667,0.6987641430848737,0.46666666666666673,0.5265074036660548,0.9316666666666666
6
+ 30,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-30,0.475,0.6880994914236809,0.475,0.5310948082593374,0.904
7
+ 40,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-40,0.584,0.7065303262365236,0.584,0.6214992664375876,0.7173333333333334
8
+ 50,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-50,0.6093333333333333,0.7120506480394511,0.6093333333333333,0.6451959368825358,0.574
data/Qwen2.5-72B-Instruct_metrics.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct_torch.bfloat16_4bit,0.755,0.7861877119461959,0.755,0.7540930716916622,0.5573333333333333
3
+ 0.2,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-35_torch.bfloat16_4bit,0.7776666666666666,0.8064344404751805,0.7776666666666666,0.7902083134269027,0.5623333333333334
4
+ 0.4,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-70_torch.bfloat16_4bit,0.7346666666666667,0.7919767732613179,0.7346666666666667,0.7595614261349122,0.5626666666666666
5
+ 0.6,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-105_torch.bfloat16_4bit,0.739,0.8045199868378529,0.739,0.7672258374793208,0.5626666666666666
data/Qwen2.5-72B-Instruct_shots_metrics.csv ADDED
@@ -0,0 +1 @@
 
 
1
+ shots,model,run,accuracy,precision,recall,f1
data/Qwen2.5-7B-Instruct_metrics.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
3
+ 0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.745,0.7643041174791825,0.745,0.7482828029872421,0.998
4
+ 0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.7446666666666667,0.7800215227839997,0.7446666666666667,0.7576550061479678,0.9996666666666667
5
+ 0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7513333333333333,0.7996792149630704,0.7513333333333333,0.7693730206330721,0.9996666666666667
6
+ 0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.75,0.7923028105975739,0.75,0.7665531868559959,1.0
7
+ 1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333333
8
+ 1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7443333333333333,0.79978900243777,0.7443333333333333,0.7660506505481828,1.0
9
+ 1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.7486666666666667,0.7974562319123832,0.7486666666666667,0.7655275916268014,0.9993333333333333
10
+ 1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7566666666666667,0.7939852407869384,0.7566666666666667,0.7689495073735431,0.9996666666666667
11
+ 1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.755,0.7940575522966016,0.755,0.7681326415137147,0.9993333333333333
12
+ 2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.756,0.7982464722401461,0.756,0.7704035278260453,0.9996666666666667
data/Qwen2.5-7B-Instruct_shots_metrics.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
3
+ 5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.6346666666666667,0.7653343185471776,0.6346666666666667,0.6219419633691871,0.998
4
+ 10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.678,0.7675951017673515,0.678,0.6790860659550377,0.9796666666666667
5
+ 20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7353333333333333,0.7702034737275962,0.7353333333333333,0.7278047438569933,0.807
6
+ 30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.7646666666666667,0.7787918401418651,0.7646666666666667,0.7527649874769439,0.805
7
+ 40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.759,0.7736852689131295,0.759,0.7472252604775926,0.8546666666666667
8
+ 50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.7586666666666667,0.7640431634617543,0.7586666666666667,0.7414332963557551,0.7563333333333333
data/internlm2_5-7b-chat-1m_shots_metrics.csv CHANGED
@@ -1,3 +1,7 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
 
3
  10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667
 
 
 
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
3
+ 5,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-05,0.7763333333333333,0.7640598325070357,0.7763333333333333,0.7700878172419743,0.9453333333333334
4
  10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667
5
+ 20,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-20,0.6733333333333333,0.7314610506764355,0.6733333333333333,0.6764198712634657,0.8213333333333334
6
+ 30,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-30,0.6736666666666666,0.7482542000402412,0.6736666666666666,0.6810446770610585,0.8236666666666667
7
+ 40,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-40,0.724,0.7567654663125225,0.724,0.712500180941536,0.8336666666666667
data/internlm2_5-7b-chat_shots_metrics.csv CHANGED
@@ -1,3 +1,6 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
 
3
  10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333
 
 
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
3
+ 5,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-05,0.7476666666666667,0.746806876028684,0.7476666666666667,0.7270588443494302,0.999
4
  10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333
5
+ 20,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-20,0.647,0.721136036365055,0.647,0.6769738108371004,0.9473333333333334
6
+ 30,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-30,0.6263333333333333,0.7256804685839701,0.6263333333333333,0.6534519727626863,0.9403333333333334
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -207,8 +207,30 @@ def extract_answer(text, debug=False):
207
 
208
  return ""
209
 
210
-
211
- def calc_metrics(references, predictions, debug=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  assert len(references) == len(
213
  predictions
214
  ), f"lengths are difference: {len(references)} != {len(predictions)}"
@@ -216,7 +238,7 @@ def calc_metrics(references, predictions, debug=False):
216
  labels = np.unique(references)
217
  valid_classifications = [1 if p in labels else 0 for p in predictions]
218
 
219
- predictions = [extract_answer(text) for text in predictions]
220
 
221
  accuracy = accuracy_score(references, predictions)
222
 
@@ -350,12 +372,9 @@ def get_metrics(df):
350
  metrics_df = metrics_df.drop(columns=["index"])
351
 
352
  accuracy = []
353
- meteor = []
354
- bleu_1 = []
355
- rouge_l = []
356
  all_metrics = []
357
  for col in df.columns[2:]:
358
- metrics = calc_metrics(df["label"], df[col], debug=True)
359
  print(f"{col}: {metrics}")
360
 
361
  accuracy.append(metrics["accuracy"])
@@ -414,7 +433,9 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
414
  # )
415
  if preprocess_func:
416
  df["backup"] = df[column_name]
417
- df[column_name] = df[column_name].apply(preprocess_func)
 
 
418
 
419
  plt.figure(figsize=(8, 4))
420
 
@@ -438,8 +459,8 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
438
  df.drop(columns=["backup"], inplace=True)
439
 
440
 
441
- def calc_metrics_for_col(df, col):
442
- metrics = calc_metrics(df["label"], df[col], debug=True)
443
  return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
444
 
445
 
@@ -460,7 +481,7 @@ def get_metrics_df(df, variant="epoch", sort_columns=True):
460
  columns = sorted(columns, key=lambda x: int(x.lower().replace("-1m", "").replace("chat", "0").replace("instruct", "0").split("-")[-1].split("_")[0]))
461
  print("columns:", columns)
462
  for i, col in enumerate(columns):
463
- metrics = calc_metrics(df["label"], df[col], debug=False)
464
  new_model_metrics = {
465
  variant: i / 5 if variant == "epoch" else i + 1,
466
  "model": col if "/" not in col else col.split("/")[1].split("_torch")[0],
 
207
 
208
  return ""
209
 
210
+ def extract_answer_from_text(text, question):
211
+ labels = ['不是', '是', '不重要', '回答正确', '问法错误']
212
+ original_text = text
213
+ text = text.split("回答:")[-1]
214
+ found_question = False
215
+ for line in text.split("\n"):
216
+ if question in line:
217
+ found_question = True
218
+ elif found_question:
219
+ text = line
220
+ break
221
+
222
+ text = extract_answer(text)
223
+ if text in labels:
224
+ return text
225
+
226
+ text = text.replace("Human: ", "")
227
+ text = text.replace("Assistant: ", "")
228
+ if text in labels:
229
+ return text
230
+ # print(f"not found: {question} | {original_text} | {text}")
231
+ return text
232
+
233
+ def calc_metrics(references, predictions, questions=None, debug=False):
234
  assert len(references) == len(
235
  predictions
236
  ), f"lengths are difference: {len(references)} != {len(predictions)}"
 
238
  labels = np.unique(references)
239
  valid_classifications = [1 if p in labels else 0 for p in predictions]
240
 
241
+ predictions = [extract_answer(text) for text in predictions] if questions is None else [extract_answer_from_text(text, question) for text, question in zip(predictions, questions)]
242
 
243
  accuracy = accuracy_score(references, predictions)
244
 
 
372
  metrics_df = metrics_df.drop(columns=["index"])
373
 
374
  accuracy = []
 
 
 
375
  all_metrics = []
376
  for col in df.columns[2:]:
377
+ metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=True)
378
  print(f"{col}: {metrics}")
379
 
380
  accuracy.append(metrics["accuracy"])
 
433
  # )
434
  if preprocess_func:
435
  df["backup"] = df[column_name]
436
+ df[column_name] = df[column_name].apply(preprocess_func) if preprocess_func == extract_answer else df.apply(
437
+ preprocess_func, axis=1
438
+ )
439
 
440
  plt.figure(figsize=(8, 4))
441
 
 
459
  df.drop(columns=["backup"], inplace=True)
460
 
461
 
462
+ def calc_metrics_for_col(df, col, debug=True):
463
+ metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=debug)
464
  return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
465
 
466
 
 
481
  columns = sorted(columns, key=lambda x: int(x.lower().replace("-1m", "").replace("chat", "0").replace("instruct", "0").split("-")[-1].split("_")[0]))
482
  print("columns:", columns)
483
  for i, col in enumerate(columns):
484
+ metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=False)
485
  new_model_metrics = {
486
  variant: i / 5 if variant == "epoch" else i + 1,
487
  "model": col if "/" not in col else col.split("/")[1].split("_torch")[0],
notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/01a_internlm2_5-7b-chat_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/02c_Qwen2.5-3B-Instruct_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/02d_Qwen2.5-7B-Instruct_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/02e_Qwen2.5-1.5B-Instruct_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/02f_Qwen2.5-0.5B-Instruct_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/02g_Qwen2.5-72B-Instruct_analysis.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/eval-epochs.sh CHANGED
@@ -7,7 +7,7 @@ pwd
7
 
8
  export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
9
  export RESIZE_TOKEN_EMBEDDINGS=true
10
- #export USING_LLAMA_FACTORY=true
11
  export USING_P1_PROMPT_TEMPLATE=false
12
 
13
  export ORG_NAME=$1
 
7
 
8
  export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
9
  export RESIZE_TOKEN_EMBEDDINGS=true
10
+ export USING_LLAMA_FACTORY=true
11
  export USING_P1_PROMPT_TEMPLATE=false
12
 
13
  export ORG_NAME=$1
scripts/eval-mgtv-qwen2.5_4bit.sh CHANGED
@@ -19,5 +19,12 @@ $BASEDIR/scripts/eval-epochs.sh Qwen Qwen2.5-72B-Instruct
19
  export START_NUM_SHOTS=5
20
  $BASEDIR/scripts/eval-shots_4bit.sh Qwen Qwen2.5-72B-Instruct
21
 
 
22
  export START_NUM_SHOTS=40
23
  $BASEDIR/scripts/eval-shots_4bit.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
 
 
 
 
 
 
 
19
  export START_NUM_SHOTS=5
20
  $BASEDIR/scripts/eval-shots_4bit.sh Qwen Qwen2.5-72B-Instruct
21
 
22
+
23
  export START_NUM_SHOTS=40
24
  $BASEDIR/scripts/eval-shots_4bit.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
25
+
26
+ export LOAD_IN_4BIT=false
27
+ $BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat
28
+
29
+ export START_NUM_SHOTS=50
30
+ $BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat-1m
scripts/eval-shots.sh CHANGED
@@ -17,5 +17,11 @@ export MODEL_NAME=$ORG_NAME/$MODEL
17
 
18
  export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
19
 
 
 
 
 
 
 
20
  echo Evaluating $MODEL_NAME with few-shot learning
21
  python llm_toolkit/eval_shots.py
 
17
 
18
  export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
19
 
20
+ if [ "$MODEL" == "Qwen2.5-3B-Instruct" ];
21
+ then
22
+ echo "Skipping Qwen2.5-3B-Instruct"
23
+ exit 0
24
+ fi
25
+
26
  echo Evaluating $MODEL_NAME with few-shot learning
27
  python llm_toolkit/eval_shots.py