Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Community

dh-mc committed on Sep 24, 2024

Commit

687f68b

1 Parent(s): 18dda7e

ready for final run

Browse files

Files changed (28) hide show

data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv +3 -0
data/Qwen2.5-0.5B-Instruct_metrics.csv +12 -0
data/Qwen2.5-0.5B-Instruct_shots_metrics.csv +8 -0
data/Qwen2.5-1.5B-Instruct_metrics.csv +12 -0
data/Qwen2.5-1.5B-Instruct_results.csv +0 -0
data/Qwen2.5-1.5B-Instruct_shots_metrics.csv +8 -0
data/Qwen2.5-3B-Instruct_metrics.csv +11 -3
data/Qwen2.5-3B-Instruct_results.csv +0 -0
data/Qwen2.5-3B-Instruct_shots_metrics.csv +6 -0
data/Qwen2.5-72B-Instruct_metrics.csv +5 -0
data/Qwen2.5-72B-Instruct_shots_metrics.csv +1 -0
data/Qwen2.5-7B-Instruct_metrics.csv +12 -0
data/Qwen2.5-7B-Instruct_shots_metrics.csv +8 -0
data/internlm2_5-7b-chat-1m_shots_metrics.csv +4 -0
data/internlm2_5-7b-chat_shots_metrics.csv +3 -0
llm_toolkit/logical_reasoning_utils.py +32 -11
notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb +0 -0
notebooks/01a_internlm2_5-7b-chat_analysis.ipynb +0 -0
notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb +0 -0
notebooks/02c_Qwen2.5-3B-Instruct_analysis.ipynb +0 -0
notebooks/02d_Qwen2.5-7B-Instruct_analysis.ipynb +0 -0
notebooks/02e_Qwen2.5-1.5B-Instruct_analysis.ipynb +0 -0
notebooks/02f_Qwen2.5-0.5B-Instruct_analysis.ipynb +0 -0
notebooks/02g_Qwen2.5-72B-Instruct_analysis.ipynb +0 -0
notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb +0 -0
scripts/eval-epochs.sh +1 -1
scripts/eval-mgtv-qwen2.5_4bit.sh +7 -0
scripts/eval-shots.sh +6 -0

data/Llama3.1-8B-Chinese-Chat_shots_metrics.csv CHANGED Viewed

@@ -1,3 +1,6 @@
 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
 10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334

 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-00,0.742,0.7477056799746837,0.742,0.7371050181385632,0.8033333333333333
+5,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-05,0.7056666666666667,0.7605745196939752,0.7056666666666667,0.7269189565098723,0.9886666666666667
 10,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-10,0.6676666666666666,0.7834080522821993,0.6676666666666666,0.7082605860921491,0.9623333333333334
+20,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-20,0.767,0.7690587905035869,0.767,0.7661695279121855,0.979
+30,Llama3.1-8B-Chinese-Chat,shenzhi-wang/Llama3.1-8B-Chinese-Chat/shots-30,0.7693333333333333,0.7765844200886581,0.7693333333333333,0.7697325957683855,0.7326666666666667

data/Qwen2.5-0.5B-Instruct_metrics.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct_torch.float16_lf,0.443,0.5490534863315207,0.443,0.43178235266224163,0.594
+0.2,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-35_torch.float16_lf,0.525,0.5819221558338251,0.525,0.4586682135998428,1.0
+0.4,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-70_torch.float16_lf,0.54,0.6445255881472232,0.54,0.5293020271128788,1.0
+0.6,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-105_torch.float16_lf,0.43766666666666665,0.6565760150511494,0.43766666666666665,0.49167707971005714,1.0
+0.8,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-140_torch.float16_lf,0.49933333333333335,0.6513093602943617,0.49933333333333335,0.49913143191054443,1.0
+1.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-175_torch.float16_lf,0.5523333333333333,0.6622075519433389,0.5523333333333333,0.5627283867177305,1.0
+1.2,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-210_torch.float16_lf,0.5403333333333333,0.64319564963495,0.5403333333333333,0.5598419070210608,1.0
+1.4,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-245_torch.float16_lf,0.5843333333333334,0.6559808590166016,0.5843333333333334,0.6086767064128167,1.0
+1.6,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-280_torch.float16_lf,0.5216666666666666,0.6604678981061621,0.5216666666666666,0.5615446578399996,1.0
+1.8,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-315_torch.float16_lf,0.524,0.6673441240188523,0.524,0.5607458201939703,1.0
+2.0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/checkpoint-350_torch.float16_lf,0.507,0.6597337077954278,0.5070000000000001,0.5492280882625964,1.0

data/Qwen2.5-0.5B-Instruct_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,8 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-00,0.443,0.5490534863315207,0.443,0.43178235266224163,0.594
+5,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-05,0.177,0.49074939459487404,0.177,0.2155165894788838,0.004
+10,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-10,0.35433333333333333,0.5213384036972462,0.35433333333333333,0.39783362635065245,0.068
+20,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-20,0.43666666666666665,0.5234006681691764,0.43666666666666665,0.4691719255495575,0.37266666666666665
+30,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-30,0.39066666666666666,0.5462493905687185,0.39066666666666666,0.4339604066000981,0.07566666666666666
+40,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-40,0.4653333333333333,0.5468189581246721,0.4653333333333333,0.49752341605759137,0.324
+50,Qwen2.5-0.5B-Instruct,Qwen/Qwen2.5-0.5B-Instruct/shots-50,0.5026666666666667,0.5610230233594029,0.5026666666666667,0.5163435163649445,0.24333333333333335

data/Qwen2.5-1.5B-Instruct_metrics.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct_torch.float16_lf,0.19966666666666666,0.5279959815418013,0.19966666666666666,0.23918953371981191,0.9223333333333333
+0.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-35_torch.float16_lf,0.481,0.6625717555914767,0.481,0.5396575906071639,0.9996666666666667
+0.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-70_torch.float16_lf,0.5653333333333334,0.711044032475149,0.5653333333333334,0.6130876130683667,0.9996666666666667
+0.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-105_torch.float16_lf,0.5303333333333333,0.7229828883930918,0.5303333333333333,0.5954306316407808,1.0
+0.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-140_torch.float16_lf,0.6423333333333333,0.7326944345439858,0.6423333333333333,0.6760588124127741,1.0
+1.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-175_torch.float16_lf,0.6266666666666667,0.716073329097764,0.6266666666666667,0.6524988509397216,1.0
+1.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-210_torch.float16_lf,0.5773333333333334,0.7309423620832619,0.5773333333333334,0.6328015564736814,1.0
+1.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-245_torch.float16_lf,0.6403333333333333,0.749792626106991,0.6403333333333333,0.679795778108406,1.0
+1.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-280_torch.float16_lf,0.6233333333333333,0.7415417300032008,0.6233333333333333,0.6642642786690383,1.0
+1.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-315_torch.float16_lf,0.5903333333333334,0.7358743162328453,0.5903333333333334,0.6381733773475835,1.0
+2.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-350_torch.float16_lf,0.5966666666666667,0.7358100917578044,0.5966666666666667,0.6407733961630157,1.0

data/Qwen2.5-1.5B-Instruct_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/Qwen2.5-1.5B-Instruct_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,8 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-00,0.19966666666666666,0.5279959815418013,0.19966666666666666,0.23918953371981191,0.9223333333333333
+5,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-05,0.3913333333333333,0.5906409192176565,0.3913333333333333,0.4387379376697362,0.8283333333333334
+10,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-10,0.405,0.5886453916977137,0.405,0.46059038959324416,0.9156666666666666
+20,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-20,0.228,0.5255112437643187,0.228,0.30386597855848074,0.676
+30,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-30,0.23033333333333333,0.55368556787824,0.23033333333333333,0.3067125355762305,0.661
+40,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-40,0.292,0.5667420801465655,0.292,0.375496356843247,0.5206666666666667
+50,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-50,0.2876666666666667,0.5660207537890989,0.2876666666666667,0.36627420118815035,0.4603333333333333

data/Qwen2.5-3B-Instruct_metrics.csv CHANGED Viewed

@@ -1,4 +1,12 @@
 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
-0.0,Qwen2.5-3B-Instruct,qwen/Qwen2.5-3B-Instruct/checkpoint-35_torch.bfloat16_lf,0.7033333333333334,0.7493686353899274,0.7033333333333334,0.7196581245915875,1.0
-0.2,Qwen2.5-3B-Instruct,qwen/Qwen2.5-3B-Instruct/checkpoint-70_torch.bfloat16_lf,0.664,0.7490874767990094,0.664,0.6954540806463714,1.0
-0.4,Qwen2.5-3B-Instruct,qwen/Qwen2.5-3B-Instruct/checkpoint-88_torch.bfloat16_lf,0.6743333333333333,0.7591682267298503,0.6743333333333333,0.7069378240575964,1.0

 epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct_torch.float16_lf,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
+0.2,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-35_torch.float16_lf,0.689,0.7450174119748659,0.689,0.709114466474576,0.9986666666666667
+0.4,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-70_torch.float16_lf,0.6556666666666666,0.7590430811422313,0.6556666666666666,0.6934194398116857,1.0
+0.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-105_torch.float16_lf,0.6963333333333334,0.7550938479315918,0.6963333333333334,0.71844324172961,1.0
+0.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-140_torch.float16_lf,0.6853333333333333,0.7542524799326954,0.6853333333333333,0.7128732915785243,1.0
+1.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-175_torch.float16_lf,0.6846666666666666,0.7564071354272528,0.6846666666666666,0.7125676758538035,1.0
+1.2,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-210_torch.float16_lf,0.6896666666666667,0.7690917466956201,0.6896666666666667,0.720231747443145,1.0
+1.4,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-245_torch.float16_lf,0.7256666666666667,0.7753705482689578,0.7256666666666667,0.7440390153124937,1.0
+1.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-280_torch.float16_lf,0.708,0.7659638403826392,0.708,0.7293997518219294,1.0
+1.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-315_torch.float16_lf,0.7056666666666667,0.7717562122699148,0.7056666666666667,0.729817759784445,1.0
+2.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-350_torch.float16_lf,0.7003333333333334,0.7698824212888824,0.7003333333333334,0.726563613830647,1.0

data/Qwen2.5-3B-Instruct_results.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/Qwen2.5-3B-Instruct_shots_metrics.csv CHANGED Viewed

@@ -1,2 +1,8 @@
 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0

 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
+5,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-05,0.639,0.7226431221398603,0.639,0.641568790114368,0.9973333333333333
+10,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-10,0.625,0.7164154004131771,0.625,0.6402584852791593,0.995
+20,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-20,0.4666666666666667,0.6987641430848737,0.46666666666666673,0.5265074036660548,0.9316666666666666
+30,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-30,0.475,0.6880994914236809,0.475,0.5310948082593374,0.904
+40,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-40,0.584,0.7065303262365236,0.584,0.6214992664375876,0.7173333333333334
+50,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-50,0.6093333333333333,0.7120506480394511,0.6093333333333333,0.6451959368825358,0.574

data/Qwen2.5-72B-Instruct_metrics.csv ADDED Viewed

	@@ -0,0 +1,5 @@

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct_torch.bfloat16_4bit,0.755,0.7861877119461959,0.755,0.7540930716916622,0.5573333333333333
+0.2,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-35_torch.bfloat16_4bit,0.7776666666666666,0.8064344404751805,0.7776666666666666,0.7902083134269027,0.5623333333333334
+0.4,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-70_torch.bfloat16_4bit,0.7346666666666667,0.7919767732613179,0.7346666666666667,0.7595614261349122,0.5626666666666666
+0.6,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-105_torch.bfloat16_4bit,0.739,0.8045199868378529,0.739,0.7672258374793208,0.5626666666666666

data/Qwen2.5-72B-Instruct_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1 @@


+shots,model,run,accuracy,precision,recall,f1

data/Qwen2.5-7B-Instruct_metrics.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
+0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.745,0.7643041174791825,0.745,0.7482828029872421,0.998
+0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.7446666666666667,0.7800215227839997,0.7446666666666667,0.7576550061479678,0.9996666666666667
+0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7513333333333333,0.7996792149630704,0.7513333333333333,0.7693730206330721,0.9996666666666667
+0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.75,0.7923028105975739,0.75,0.7665531868559959,1.0
+1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333333
+1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7443333333333333,0.79978900243777,0.7443333333333333,0.7660506505481828,1.0
+1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.7486666666666667,0.7974562319123832,0.7486666666666667,0.7655275916268014,0.9993333333333333
+1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7566666666666667,0.7939852407869384,0.7566666666666667,0.7689495073735431,0.9996666666666667
+1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.755,0.7940575522966016,0.755,0.7681326415137147,0.9993333333333333
+2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.756,0.7982464722401461,0.756,0.7704035278260453,0.9996666666666667

data/Qwen2.5-7B-Instruct_shots_metrics.csv ADDED Viewed

	@@ -0,0 +1,8 @@

+shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
+0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
+5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.6346666666666667,0.7653343185471776,0.6346666666666667,0.6219419633691871,0.998
+10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.678,0.7675951017673515,0.678,0.6790860659550377,0.9796666666666667
+20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7353333333333333,0.7702034737275962,0.7353333333333333,0.7278047438569933,0.807
+30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.7646666666666667,0.7787918401418651,0.7646666666666667,0.7527649874769439,0.805
+40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.759,0.7736852689131295,0.759,0.7472252604775926,0.8546666666666667
+50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.7586666666666667,0.7640431634617543,0.7586666666666667,0.7414332963557551,0.7563333333333333

data/internlm2_5-7b-chat-1m_shots_metrics.csv CHANGED Viewed

@@ -1,3 +1,7 @@
 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
 10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667

 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-00,0.48133333333333334,0.7605248207587668,0.48133333333333334,0.5244515621126862,0.9986666666666667
+5,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-05,0.7763333333333333,0.7640598325070357,0.7763333333333333,0.7700878172419743,0.9453333333333334
 10,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-10,0.6473333333333333,0.7282065610714444,0.6473333333333333,0.665824871588245,0.8866666666666667
+20,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-20,0.6733333333333333,0.7314610506764355,0.6733333333333333,0.6764198712634657,0.8213333333333334
+30,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-30,0.6736666666666666,0.7482542000402412,0.6736666666666666,0.6810446770610585,0.8236666666666667
+40,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-40,0.724,0.7567654663125225,0.724,0.712500180941536,0.8336666666666667

data/internlm2_5-7b-chat_shots_metrics.csv CHANGED Viewed

@@ -1,3 +1,6 @@
 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
 10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333

 shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
 0,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-00,0.705,0.7398041613378253,0.705,0.6906357423169466,1.0
+5,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-05,0.7476666666666667,0.746806876028684,0.7476666666666667,0.7270588443494302,0.999
 10,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-10,0.5533333333333333,0.7301739373336078,0.5533333333333333,0.625097481985829,0.9883333333333333
+20,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-20,0.647,0.721136036365055,0.647,0.6769738108371004,0.9473333333333334
+30,internlm2_5-7b-chat,internlm/internlm2_5-7b-chat/shots-30,0.6263333333333333,0.7256804685839701,0.6263333333333333,0.6534519727626863,0.9403333333333334

llm_toolkit/logical_reasoning_utils.py CHANGED Viewed

@@ -207,8 +207,30 @@ def extract_answer(text, debug=False):
     return ""
-def calc_metrics(references, predictions, debug=False):
     assert len(references) == len(
         predictions
     ), f"lengths are difference: {len(references)} != {len(predictions)}"
@@ -216,7 +238,7 @@ def calc_metrics(references, predictions, debug=False):
     labels = np.unique(references)
     valid_classifications = [1 if p in labels else 0 for p in predictions]
-    predictions = [extract_answer(text) for text in predictions]
     accuracy = accuracy_score(references, predictions)
@@ -350,12 +372,9 @@ def get_metrics(df):
     metrics_df = metrics_df.drop(columns=["index"])
     accuracy = []
-    meteor = []
-    bleu_1 = []
-    rouge_l = []
     all_metrics = []
     for col in df.columns[2:]:
-        metrics = calc_metrics(df["label"], df[col], debug=True)
         print(f"{col}: {metrics}")
         accuracy.append(metrics["accuracy"])
@@ -414,7 +433,9 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
     # )
     if preprocess_func:
         df["backup"] = df[column_name]
-        df[column_name] = df[column_name].apply(preprocess_func)
     plt.figure(figsize=(8, 4))
@@ -438,8 +459,8 @@ def plot_value_counts(df, column_name, offset=0.1, title=None, preprocess_func=N
         df.drop(columns=["backup"], inplace=True)
-def calc_metrics_for_col(df, col):
-    metrics = calc_metrics(df["label"], df[col], debug=True)
     return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
@@ -460,7 +481,7 @@ def get_metrics_df(df, variant="epoch", sort_columns=True):
         columns = sorted(columns, key=lambda x: int(x.lower().replace("-1m", "").replace("chat", "0").replace("instruct", "0").split("-")[-1].split("_")[0]))
     print("columns:", columns)
     for i, col in enumerate(columns):
-        metrics = calc_metrics(df["label"], df[col], debug=False)
         new_model_metrics = {
             variant: i / 5 if variant == "epoch" else i + 1,
             "model": col if "/" not in col else col.split("/")[1].split("_torch")[0],

     return ""
+def extract_answer_from_text(text, question):
+    labels = ['不是', '是', '不重要', '回答正确', '问法错误']
+    original_text = text
+    text = text.split("回答:")[-1]
+    found_question = False
+    for line in text.split("\n"):
+        if question in line:
+            found_question = True
+        elif found_question:
+            text = line
+            break
+    text = extract_answer(text)
+    if text in labels:
+        return text
+    text = text.replace("Human: ", "")
+    text = text.replace("Assistant: ", "")
+    if text in labels:
+        return text
+    # print(f"not found: {question} | {original_text} | {text}")
+    return text
+def calc_metrics(references, predictions, questions=None, debug=False):
     assert len(references) == len(
         predictions
     ), f"lengths are difference: {len(references)} != {len(predictions)}"
     labels = np.unique(references)
     valid_classifications = [1 if p in labels else 0 for p in predictions]
+    predictions = [extract_answer(text) for text in predictions] if questions is None else [extract_answer_from_text(text, question) for text, question in zip(predictions, questions)]
     accuracy = accuracy_score(references, predictions)
     metrics_df = metrics_df.drop(columns=["index"])
     accuracy = []
     all_metrics = []
     for col in df.columns[2:]:
+        metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=True)
         print(f"{col}: {metrics}")
         accuracy.append(metrics["accuracy"])
     # )
     if preprocess_func:
         df["backup"] = df[column_name]
+        df[column_name] = df[column_name].apply(preprocess_func) if preprocess_func == extract_answer else df.apply(
+            preprocess_func, axis=1
+        )
     plt.figure(figsize=(8, 4))
         df.drop(columns=["backup"], inplace=True)
+def calc_metrics_for_col(df, col, debug=True):
+    metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=debug)
     return metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1"]
         columns = sorted(columns, key=lambda x: int(x.lower().replace("-1m", "").replace("chat", "0").replace("instruct", "0").split("-")[-1].split("_")[0]))
     print("columns:", columns)
     for i, col in enumerate(columns):
+        metrics = calc_metrics(df["label"], df[col], questions=df["text"], debug=False)
         new_model_metrics = {
             variant: i / 5 if variant == "epoch" else i + 1,
             "model": col if "/" not in col else col.split("/")[1].split("_torch")[0],

notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/01a_internlm2_5-7b-chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02c_Qwen2.5-3B-Instruct_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02d_Qwen2.5-7B-Instruct_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02e_Qwen2.5-1.5B-Instruct_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02f_Qwen2.5-0.5B-Instruct_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/02g_Qwen2.5-72B-Instruct_analysis.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

scripts/eval-epochs.sh CHANGED Viewed

@@ -7,7 +7,7 @@ pwd
 export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
 export RESIZE_TOKEN_EMBEDDINGS=true
-#export USING_LLAMA_FACTORY=true
 export USING_P1_PROMPT_TEMPLATE=false
 export ORG_NAME=$1

 export LOGICAL_REASONING_DATA_PATH=datasets/mgtv
 export RESIZE_TOKEN_EMBEDDINGS=true
+export USING_LLAMA_FACTORY=true
 export USING_P1_PROMPT_TEMPLATE=false
 export ORG_NAME=$1

scripts/eval-mgtv-qwen2.5_4bit.sh CHANGED Viewed

@@ -19,5 +19,12 @@ $BASEDIR/scripts/eval-epochs.sh Qwen Qwen2.5-72B-Instruct
 export START_NUM_SHOTS=5
 $BASEDIR/scripts/eval-shots_4bit.sh Qwen Qwen2.5-72B-Instruct
 export START_NUM_SHOTS=40
 $BASEDIR/scripts/eval-shots_4bit.sh shenzhi-wang Llama3.1-70B-Chinese-Chat

 export START_NUM_SHOTS=5
 $BASEDIR/scripts/eval-shots_4bit.sh Qwen Qwen2.5-72B-Instruct
 export START_NUM_SHOTS=40
 $BASEDIR/scripts/eval-shots_4bit.sh shenzhi-wang Llama3.1-70B-Chinese-Chat
+export LOAD_IN_4BIT=false
+$BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat
+export START_NUM_SHOTS=50
+$BASEDIR/scripts/eval-shots.sh internlm internlm2_5-7b-chat-1m

scripts/eval-shots.sh CHANGED Viewed

@@ -17,5 +17,11 @@ export MODEL_NAME=$ORG_NAME/$MODEL
 export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
 echo Evaluating $MODEL_NAME with few-shot learning
 python llm_toolkit/eval_shots.py

 export LOGICAL_REASONING_RESULTS_PATH=data/${MODEL}_results.csv
+if [ "$MODEL" == "Qwen2.5-3B-Instruct" ];
+then
+    echo "Skipping Qwen2.5-3B-Instruct"
+    exit 0
+fi
 echo Evaluating $MODEL_NAME with few-shot learning
 python llm_toolkit/eval_shots.py