"""
Script for an iterative scheme.

Assumptions:
- complete pariwise comparisons available, i.e. evaluations are cheap
-
"""

import pandas as pd
import numpy as np
from tqdm import tqdm
from selfrank.algos.metrics import mapk, rank_biased_overlap
from selfrank.algos.plots import plot_ranks
import logging
from typing import List, Callable, Optional
import random

logger = logging.getLogger(__name__)
tol = 0.001


class LLM_Model:
    """Lightweight wrapper around a model name; comparable and sortable by name."""

    def __init__(self, model_name, all_model_data):
        # all_model_data is accepted but not stored; only the name is needed here
        self.model_name = model_name

    def name(self):
        return self.model_name

    def __eq__(self, other):
        return self.name() == other.name()

    def __lt__(self, other):
        return self.name() < other.name()



class SelfRankGreedy:

    def __init__(self, MODELS: List, evaluator: Callable, true_ranking: Optional[List]=None, show_progress: Optional[bool]=False):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = evaluator
        self.true_ranking = true_ranking
        self.show_progress = show_progress
        self.df = None
        self.DEBUG = False
        self.model_eval = None
        self.cnt=0
        
    def getEvaluation(self, a, b, c, df, eval_arr, modelsList):
        '''
        Model c evaluates a against b.
        Checks whether the comparison is already cached in eval_arr
        (eval_arr[c, a, b] holds c's preference for a over b); if not,
        evaluates it, stores both orderings (val and 1 - val), and returns it.
        '''
        idx_a = modelsList.index(a)
        idx_b = modelsList.index(b)
        idx_c = modelsList.index(c)
        val = eval_arr[idx_c, idx_a, idx_b]  # stores c evaluating a to b
        if val > -1:
            return val
        else:
            val = self.evaluate(a, b, c, df)
            eval_arr[idx_c, idx_a, idx_b] = val
            eval_arr[idx_c, idx_b, idx_a] = 1 - val
            return val

    def __evaluateModelTriplet(self, df, triplet, eval_arr, modelsList):
        model1 = triplet[0]
        model2 = triplet[1]
        model3 = triplet[2]
        res = np.array([0, 0, 0])
        m1_cmp_2_3 = self.getEvaluation(a=model2.name(), b=model3.name(), c=model1.name(), df=df, eval_arr=eval_arr, modelsList=modelsList)  #model1.compareModels(model2, model3)
        m2_cmp_1_3 = self.getEvaluation(a=model1.name(), b=model3.name(), c=model2.name(), df=df, eval_arr=eval_arr, modelsList=modelsList)  #model2.compareModels(model1, model3)
        m3_cmp_1_2 = self.getEvaluation(a=model1.name(), b=model2.name(), c=model3.name(), df=df, eval_arr=eval_arr, modelsList=modelsList)  #model3.compareModels(model1, model2)
        if m1_cmp_2_3 >= 0.5:
            res[1]+=1
        else:
            res[2]+=1

        if m2_cmp_1_3 >= 0.5:
            res[0]+=1
        else:
            res[2]+=1

        if m3_cmp_1_2 >= 0.5:
            res[0]+=1
        else:
            res[1]+=1

        #print(res)
        #print(res.tolist())
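        # Rank the triplet by votes received (descending): the returned list holds
        # (model, votes, name) tuples, best model first.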
        zipped_pairs = zip(res.tolist(), triplet)
        z = [(x,y, x.name()) for y, x in sorted(zipped_pairs, reverse=True)]
        return z

    def __printNames(self, ll):
        print([i.name() for i in ll])

    def __evaluateModels(self, df, evaluators, modelsToBeEvaluated, eval_arr, modelsList):
        # rewritten method to allow usage with the updated code
        # modelsToBeEvaluated can have 2 or 3 models only; evaluators must contain exactly 1 model.
        # The single evaluator ranks the models in modelsToBeEvaluated; the list is returned best first.
        if len(evaluators) > 1:
            raise Exception("Expected exactly one evaluator model.")
        if len(modelsToBeEvaluated) > 3 or len(modelsToBeEvaluated) < 2:
            raise Exception("modelsToBeEvaluated must contain 2 or 3 models.")
        if len(modelsToBeEvaluated) == 2:
            r = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[1].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            if r >= 0.5:
                return [modelsToBeEvaluated[0],modelsToBeEvaluated[1]]
            else:
                return [modelsToBeEvaluated[1],modelsToBeEvaluated[0]]
        if len(modelsToBeEvaluated) == 3:
            r01 = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[1].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            r12 = self.getEvaluation(a=modelsToBeEvaluated[1].name(), b=modelsToBeEvaluated[2].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            r02 = self.getEvaluation(a=modelsToBeEvaluated[0].name(), b=modelsToBeEvaluated[2].name(), c=evaluators[0].name(), df=df, eval_arr=eval_arr, modelsList=modelsList)
            res = np.array([0, 0, 0])
            if r01 >= 0.5:
                res[0]+=1
            else:
                res[1]+=1

            if r12 >= 0.5:
                res[1]+=1
            else:
                res[2]+=1

            if r02 >= 0.5:
                res[0]+=1
            else:
                res[2]+=1

            zipped_pairs = zip(res.tolist(), modelsToBeEvaluated)
            z = [x for y, x in sorted(zipped_pairs, reverse=True)]
            return z


    def __rankModels(self, df, eval_arr, modelsList, triplet, prev_model_ranking, unrankedModelList, rankedModelList, bottomModelList):

        if len(triplet) < 3:
            return [], list(triplet), []
        self.cnt = self.cnt + 1
        model_ranking = self.__evaluateModelTriplet(df, triplet, eval_arr, modelsList)
        if self.DEBUG:
            print("Cnt: ", self.cnt)
            print("\n\n\nFIRST")
            self.__printNames(triplet)
            self.__printNames(unrankedModelList)
            self.__printNames(rankedModelList)
            self.__printNames(bottomModelList)
            print(model_ranking)
            print(prev_model_ranking)
            print("END FIRST")

        first_rank = model_ranking[0][1]
        second_rank = model_ranking[1][1]
        third_rank = model_ranking[2][1]
        if first_rank  == 2:  # first model is better than the other two

            if len(unrankedModelList) == 0 and len(bottomModelList) == 0:                                 # CASE 1
                # no more unranked models left to consider and none in bottomModels,
                # so add the models in rank order to rankedModelList
                if second_rank == 1 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 1a')
                    rankedModelList.extend([model_ranking[0][0], model_ranking[1][0], model_ranking[2][0]])
                elif second_rank == 0 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 1b')
                    rankedModelList.append(model_ranking[0][0])
                    #use current best model to rank the bottom 2 and add to rankedList in order
                    z = self.__evaluateModels(df, [rankedModelList[0]],[model_ranking[1][0], model_ranking[2][0]], eval_arr, modelsList)
                    rankedModelList.extend(z)
                else:
                    raise Exception("Error: Should not have occurred CASE 1")
                if self.DEBUG:
                    self.__printNames(rankedModelList)
                return [], rankedModelList, []

            if len(unrankedModelList) == 0 and len(bottomModelList) == 1:                                 # CASE 2
                # no more unranked models left to consider and only 1 bottomModels in all,
                if second_rank == 1 and third_rank == 0:
                    # so add the models in rank order to rankedModelList
                    if self.DEBUG:
                        print('CASE 2a')
                    rankedModelList.extend([model_ranking[0][0], model_ranking[1][0]])
                    #TODO Use top model in rankedModelList to rank the two models below and then add them according to ranking
                    z = self.__evaluateModels(df, [rankedModelList[0]],[model_ranking[2][0], bottomModelList[0]], eval_arr, modelsList)
                    rankedModelList.extend(z)
                    if self.DEBUG:
                        self.__printNames(rankedModelList)
                    return [], rankedModelList, []
                elif second_rank == 0 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 2b')

                    rankedModelList.append(model_ranking[0][0])
                    modelsToCompare = [model_ranking[1][0], model_ranking[2][0], bottomModelList[0]]

                    if self.DEBUG:
                        self.__printNames(tuple(modelsToCompare))
                        self.__printNames(rankedModelList)
                    return self.__rankModels(df, eval_arr, modelsList, tuple(modelsToCompare), model_ranking, [], rankedModelList, [])
                else:
                    raise Exception("Error: Should not have occurred CASE 2")


            if len(unrankedModelList) == 0 and len(bottomModelList) > 1:                                  # CASE 3
                # no more unranked models left to consider but there are at least 2 models in bottomModelList
                if second_rank == 1 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 3a')
                    rankedModelList.extend([model_ranking[0][0], model_ranking[1][0]]) # add top two models to ranked list
                    bottomModelList.append(model_ranking[2][0]) # add worst model to bottomModelList
                elif second_rank == 0 and third_rank == 0:
                    if self.DEBUG:
                        print('CASE 3b')
                    rankedModelList.append(model_ranking[0][0]) # add top model to ranked list
                    bottomModelList.extend([model_ranking[1][0], model_ranking[2][0]]) # add bottom two model to bottomModelList
                else:
                    raise Exception("Error: Should not have occurred CASE 3")

                modelsToCompare = random.sample(bottomModelList, 3)
                bottomModelList = [i for i in bottomModelList if i not in modelsToCompare]
                if self.DEBUG:
                    self.__printNames(tuple(modelsToCompare))
                    self.__printNames(bottomModelList)
                    self.__printNames(rankedModelList)
                    print([])
                return self.__rankModels(df, eval_arr, modelsList, tuple(modelsToCompare), model_ranking, bottomModelList, rankedModelList, [])

            # CASE 4 len(unrankedModelList) > 0

            #check the previous model ranking and model ranking. if either first or second ranked model previously is now the bottom ranked model,
            # move all bottom to unranked and call with new triple
            #if (prev_model_ranking is not None) and ((prev_model_ranking[0][0] == model_ranking[2][0]) or (prev_model_ranking[1][0] == model_ranking[2][0])):
            #    unrankedModelList.extend(bottomModelList)
            #    if self.DEBUG:
            #        print('Case 4a NEW ONE')
            #        self.__printNames(triplet)
            #        self.__printNames(unrankedModelList)
            #        self.__printNames(rankedModelList)
            #        self.__printNames([])
            #    return self.__rankModels(df, (triplet, None, unrankedModelList, rankedModelList, [])
            if second_rank == 1 and third_rank == 0:
                if self.DEBUG:
                    print('CASE 4a')
                bottomModelList.append(model_ranking[2][0]) # add worst model to bottomModelList

                newModel = random.sample(unrankedModelList, 1)
                unrankedModelList.remove(newModel[0])
                triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])

                if self.DEBUG:
                    self.__printNames(triplet)
                    self.__printNames(unrankedModelList)
                    self.__printNames(rankedModelList)
                    self.__printNames(bottomModelList)
                return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
            elif second_rank == 0 and third_rank == 0:

                # if unrankedModelList has 2 or more elements, put both 2nd and 3rd model into bottom; if unrankedModelList has only one,
                # then randomly choose one of the two and put in bottom
                if len(unrankedModelList) > 1:
                    if self.DEBUG:
                        print('CASE 4b')
                    bottomModelList.append(model_ranking[2][0])
                    bottomModelList.append(model_ranking[1][0])
                    newModels = random.sample(unrankedModelList, 2)
                    triplet = (model_ranking[0][0],) + tuple(newModels)
                    unrankedModelList.remove(newModels[0])
                    unrankedModelList.remove(newModels[1])

                    if self.DEBUG:
                        self.__printNames(triplet)
                        self.__printNames(unrankedModelList)
                        self.__printNames(rankedModelList)
                        self.__printNames(bottomModelList)
                    return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)
                else:
                    if self.DEBUG:
                        print('CASE 4c')
                    #200, UR==1
                    #add third model to bottom. replace in tuple with one from unranked. and rank
                    #newModel = random.sample(unrankedModelList, 1)
                    #unrankedModelList.remove(newModel[0])
                    #bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
                    #triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])

                    #add both 0s to bottom. Create tuple with 2, the one from UR and 1 from B. Call self.__rankModels(df, (triple,B,R,[])
                    newModel = random.sample(unrankedModelList, 1)
                    unrankedModelList.remove(newModel[0])
                    bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
                    bottomModelList.append(model_ranking[1][0]) # add second model to bottomModelList
                    newBottomModel = random.sample(bottomModelList, 1)
                    bottomModelList.remove(newBottomModel[0])
                    triplet = (model_ranking[0][0], newModel[0], newBottomModel[0])
                    if self.DEBUG:
                        self.__printNames(triplet)
                        self.__printNames(unrankedModelList)
                        self.__printNames(rankedModelList)
                        self.__printNames(bottomModelList)
                    return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, bottomModelList, rankedModelList, [])
            else:
                raise Exception("Error: Should not have occurred CASE 4")

        else:
            # some problem with ranking all three models
            if len(unrankedModelList) == 0 and len(bottomModelList) == 0:                                 # CASE 1
                #use top model from rankedlist to rank the three and append to ranked list in order
                if self.DEBUG:
                    print('CASE ELSE_1')
                z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                if self.DEBUG:
                    self.__printNames(z)
                rankedModelList.extend(z)
                if self.DEBUG:
                    self.__printNames(rankedModelList)
                return [], rankedModelList, []

            if len(unrankedModelList) == 0 and len(bottomModelList) == 1:                                 # CASE 2
                if self.DEBUG:
                    print('CASE ELSE_2')

                #ALTERNATIVE
                ##use top model from rankedlist to rank the three and append to ranked list in order; THEN, add the sole model from bottom list
                if len(rankedModelList) > 0:
                    z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                else:
                    z = list(triplet)
                if self.DEBUG:
                    self.__printNames(z)
                rankedModelList.extend(z)
                rankedModelList.append(bottomModelList[0])
                if self.DEBUG:
                    self.__printNames(rankedModelList)
                return [], rankedModelList, []


            if len(unrankedModelList) == 0 and len(bottomModelList) > 1:                                  # CASE 3
                # ranks are 1xx or 000
                if self.DEBUG:
                    print('CASE ELSE_3')

                ##use top model from rankedlist to rank the three and add top 2 to ranked list in order;
                if len(rankedModelList) > 0:
                    z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                else:
                    z = list(triplet)
                if self.DEBUG:
                    self.__printNames(z)
                rankedModelList.append(z[0])
                rankedModelList.append(z[1])

                bottomModelList.append(z[2])
                #Sample 3 from bottom to create triple. call self.__rankModels(df, (tripler, B, R, [])
                newModels = random.sample(bottomModelList, 3)
                for mod in newModels:
                    bottomModelList.remove(mod)
                if self.DEBUG:
                    self.__printNames(tuple(newModels))
                    self.__printNames(unrankedModelList)
                    self.__printNames(rankedModelList)
                    self.__printNames(bottomModelList)
                return self.__rankModels(df, eval_arr, modelsList, tuple(newModels), model_ranking, bottomModelList, rankedModelList, [])


            # CASE 4 len(unrankedModelList) > 0

            # if the three models are 1,1,1 or 0,0,0 i.e. indistinguishable

            #check the previous model ranking and model ranking. if either first or second ranked model previously is now the bottom ranked model,
            # move all bottom to unranked and call with new triple
            #if (prev_model_ranking is not None) and ((prev_model_ranking[0][0] == model_ranking[2][0]) or (prev_model_ranking[1][0] == model_ranking[2][0])):
            #    unrankedModelList.extend(bottomModelList)
            #    if self.DEBUG:
            #        print('Case ELSE_4 NEW ONE')
            #        self.__printNames(triplet)
            #        self.__printNames(unrankedModelList)
            #        self.__printNames(rankedModelList)
            #        self.__printNames([])
            #    return self.__rankModels(df, (triplet, None, unrankedModelList, rankedModelList, [])

            # choose one of the tuple models and add to unrankedlIst. Remove random model from unrankedList and add to tuple. rank again
            if first_rank == second_rank and first_rank == third_rank:
                if self.DEBUG:
                    print('CASE ELSE_4a')
                ##use top model from rankedlist to rank the three and add third one to Bottomlist ;
                ##then create tuple with top 2 and one from unranked
                
                if len(rankedModelList) > 0:
                    z = self.__evaluateModels(df, [rankedModelList[0]], list(triplet), eval_arr, modelsList)
                else:
                    z = list(triplet)
                if self.DEBUG:
                    print('z: ', z)
                    self.__printNames(z)

                bottomModelList.append(z[2])
                newModel = random.sample(unrankedModelList, 1)
                unrankedModelList.remove(newModel[0])
                triplet = (z[0], z[1], newModel[0])
                if self.DEBUG:
                    print(1)
                    print('triplet:', triplet)
                    self.__printNames(triplet)
                    print(2)
                    self.__printNames(unrankedModelList)
                    print(3)
                    self.__printNames(rankedModelList)
                    print(4)
                    self.__printNames(bottomModelList)
                    print(5)

            else: # there are one or two models with 0
                # if only 1, add to bottom and replace with one from unranked
                # if two are 0, then both replace with unranked if unranked has more than 1
                # otherwise randomly add one of the 0s to bottom and replace with unranked.
                if second_rank == 1: # then only third is 0
                    if self.DEBUG:
                        print('CASE ELSE_4b')
                    newModel = random.sample(unrankedModelList, 1)
                    unrankedModelList.remove(newModel[0])

                    bottomModelList.append(model_ranking[2][0])
                    triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])
                else: # both second and third are zero
                    if len(unrankedModelList) > 1:
                        if self.DEBUG:
                            print('CASE ELSE_4c')
                        bottomModelList.append(model_ranking[2][0])
                        bottomModelList.append(model_ranking[1][0])
                        newModels = random.sample(unrankedModelList, 2)
                        triplet = (model_ranking[0][0],) + tuple(newModels)
                        unrankedModelList.remove(newModels[0])
                        unrankedModelList.remove(newModels[1])
                    else:
                        if self.DEBUG:
                            print('CASE ELSE_4d')
                        #add third model to bottom. replace in tuple with one from unranked. and rank
                        #newModel = random.sample(unrankedModelList, 1)
                        #unrankedModelList.remove(newModel[0])
                        #bottomModelList.append(model_ranking[2][0]) # add third model to bottomModelList
                        #triplet = (model_ranking[0][0], model_ranking[1][0], newModel[0])

                        # UR==1,  100
                        #Add both 0s to Bottom. Create tuple from the 1, one from UR, and one from Bottom
                        #Call self.__rankModels(df, (triple, B, R, [])
                        bottomModelList.append(model_ranking[2][0])
                        bottomModelList.append(model_ranking[1][0])
                        newModels = random.sample(unrankedModelList, 1)
                        unrankedModelList.remove(newModels[0])
                        newBottomModels = random.sample(bottomModelList, 1)
                        bottomModelList.remove(newBottomModels[0])
                        triplet = (model_ranking[0][0], newModels[0], newBottomModels[0])
                        if self.DEBUG:
                            self.__printNames(triplet)
                            self.__printNames(unrankedModelList)
                            self.__printNames(rankedModelList)
                            self.__printNames(bottomModelList)
                        return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, bottomModelList, rankedModelList, [])


            if self.DEBUG:
                self.__printNames(triplet)
                self.__printNames(unrankedModelList)
                self.__printNames(rankedModelList)
                self.__printNames(bottomModelList)
            return self.__rankModels(df, eval_arr, modelsList, triplet, model_ranking, unrankedModelList, rankedModelList, bottomModelList)


    def __printRanks(self, ll):
        print([{i.name(): r} for r,i in enumerate(ll)])

    def __estimate_rankings(self, df, numIter=1, modelSubset=None, numModels=None):
        rankedLists = []
        if modelSubset is not None:
            model_list = modelSubset
        elif numModels is not None:
            model_list = self.MODELS.copy() #df.columns.tolist() #list(df['model'].unique())
            model_list = random.sample(model_list, numModels)
        else:
            model_list = self.MODELS.copy() #df.columns.tolist() #list(df['model'].unique())

        nModels = len(model_list)
        # use a float array so fractional preference scores from the evaluator are not truncated
        self.model_eval = np.full((nModels, nModels, nModels), -1.0)
        
        for it in tqdm(range(numIter)):
            shuffled_list = model_list.copy()
            random.shuffle(shuffled_list)

            t = random.sample(shuffled_list, 3)
            u = [i for i in shuffled_list if i not in t]

            t = [LLM_Model(i, df) for i in t]
            u = [LLM_Model(i, df) for i in u]

            _,rankedList,_ = self.__rankModels(df, self.model_eval, model_list, tuple(t), None, u, [], [])
            rankedLists.append(rankedList)

        estimated_ranking_lists = []
        ranks = []
        for rl in rankedLists:
            estimated_ranking = {i.name(): r+1 for r,i in enumerate(rl)}
            rank = [estimated_ranking[name] for name in model_list] #sorted(model_list)]
            estimated_ranking_lists.append(estimated_ranking)
            ranks.append(rank)

        # average each model's rank across iterations; sorting ascending puts the
        # best (lowest mean rank) model first
        average_estimated_scores = sorted(zip(np.mean(np.array(ranks), axis=0), model_list))
        average_estimated_ranking = [mod for rnk, mod in average_estimated_scores]
        #average_scores = [rnk for rnk, mod in zipped]

        return model_list, estimated_ranking_lists, average_estimated_ranking, average_estimated_scores


    def fit(self, df: pd.DataFrame):
        """
        df: Dataframe where each row is a benchmark instance,
        and there is a column with the output for each Model
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        #process the dataset
        self.df = df #self.__process_dataset(df)
        # Build a pairwise preference matrix
        #if self.show_progress:
        #    pbar = tqdm(total=self.N**3, position=0, leave=False, desc="Evaluations")

        #if self.show_progress: pbar.update(1)

        # Estimate the ranks
        _, _, average_estimated_ranking, _ = self.__estimate_rankings(self.df, numIter=1)


        self.ranking = average_estimated_ranking

        logger.info(f"Estimated 'greedy' ranks (best to worst): {self.ranking}")

        return self.ranking # Best to worst

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report metrics related to self-rank
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")


    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            return plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
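

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only): the synthetic data, the model
# names, and the agreement-based toy_evaluator below are hypothetical. They
# assume the evaluator contract used above: evaluator(a, b, c, df) returns a
# value in [0, 1], where >= 0.5 means judge c prefers model a over model b.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    random.seed(0)
    np.random.seed(0)

    # Synthetic benchmark: 200 instances with a hidden reference answer in {0..4};
    # better models reproduce the reference more often.
    n_items = 200
    reference = np.random.randint(0, 5, size=n_items)
    accuracies = {"m1": 0.95, "m2": 0.85, "m3": 0.75, "m4": 0.65, "m5": 0.55, "m6": 0.45}

    data = {}
    for name, acc in accuracies.items():
        correct = np.random.rand(n_items) < acc
        noise = np.random.randint(0, 5, size=n_items)
        data[name] = np.where(correct, reference, noise)
    df = pd.DataFrame(data)

    def toy_evaluator(a, b, c, df):
        """Judge c scores a vs. b by agreement with its own answers."""
        agree_a = float((df[a] == df[c]).mean())
        agree_b = float((df[b] == df[c]).mean())
        if agree_a + agree_b == 0:
            return 0.5
        return agree_a / (agree_a + agree_b)

    true_ranking = list(accuracies.keys())  # best to worst by construction
    ranker = SelfRankGreedy(list(df.columns), toy_evaluator, true_ranking=true_ranking)
    print("Estimated ranking:", ranker.fit(df))
    print("RBO vs. true ranking:", ranker.measure(metric='rbo'))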