import logging
import random
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from genai import Client, Credentials
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from tqdm import tqdm

from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks

logger = logging.getLogger(__name__)

load_dotenv()

credentials = Credentials.from_env()
client = Client(credentials=credentials)

_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"
_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.
###Instruction:
{instruction}
###Response 1:
{response_1}
###Response 2:
{response_2}
###Score Rubric:
{rubric}
###Feedback: 
"""

template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)

class LLMJudge:
    """
    Competing method based on an LLM judge (Prometheus): every pair of models is
    compared head-to-head on the benchmark responses, and models are ranked by
    their number of pairwise wins.
    """

    def __init__(
        self,
        MODELS: List,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = True,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = prometheus
        self.true_ranking = true_ranking
        self.show_progress = show_progress


    def fit(self, df: pd.DataFrame):
        """
        Estimate the ranking from pairwise LLM-judge comparisons.

        df: DataFrame where each row is a benchmark instance and each column
        holds the corresponding output of one of the models in `self.MODELS`.
        """

        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
        self.N = len(self.MODELS)

        y = np.empty((self.N, self.N))

        if self.show_progress:
            pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")

        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                if a == b:
                    y[i, j] = 0
                else:
                    y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
                    
                if self.show_progress: pbar.update(1)

        logger.debug(f"Win matrix:\n{y}")
        # Aggregate by win rate: more pairwise wins means a higher rank
        wins_df = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        wins_df = wins_df.sort_values(by='wins', ascending=False)
        self.ranking = wins_df.index.to_list()

        return self.ranking    

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report a metric ('rbo' or 'mapk') comparing the estimated ranking
        against the true ranking.
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")


    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)


def format_instruction(x, a, b):
    """Build the Prometheus pairwise instruction for one benchmark row `x`."""

    response1 = f"{x[a]}"
    response2 = f"{x[b]}"

    instruction = template.format(
        instruction=_INSTRUCTION,
        response_1=response1,
        response_2=response2,
        rubric=_RUBRIC,
    )
    return instruction
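
# A quick illustration with hypothetical data: `format_instruction` expects a
# row (pandas Series) indexed by model name, so it is typically applied
# row-wise, e.g.
#
#   row = pd.Series({"model_a": "Paris.", "model_b": "It is Paris."})
#   prompt = format_instruction(row, "model_a", "model_b")
#
# which yields the filled Prometheus prompt comparing the two responses.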


def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
    """
    Query the LLM-as-a-judge model Prometheus to compare the responses of model `a`
    and model `b` on every benchmark instance; returns 1 if `a` wins the majority
    of instances, 0 otherwise.

    client: the `genai` client (using BAM).
    formatter: function that takes a row of model outputs and builds the Prometheus instruction
    a: name of model `a` to be evaluated (column in `df` with its responses)
    b: name of model `b` to be evaluated
    df: DataFrame with responses
    """

    parameters = TextGenerationParameters(
        max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
    )
    
    # Build one Prometheus prompt per benchmark instance
    inst = df.apply(formatter, axis=1, args=(a, b))
    adf = df.copy(deep=True)

    results = []
    for response in client.text.generation.create(
                model_id="kaist-ai/prometheus-8x7b-v2",
                inputs=inst.values.tolist(),
                execution_options={"ordered": True, 'concurrency_limit': 10},
                parameters=parameters,
            ):
        results.append(response.results[0])
    
    adf["generated_text"] = [r.generated_text for r in results]

    def _parse_verdict(x):
        """Extract the verdict: 1 if Response 1 (model `a`) won, 0 if Response 2 won."""
        try:
            result = int(x.split("[RESULT]")[1].strip())
            return 1 if result == 1 else 0
        except (IndexError, ValueError):
            # Unparseable output: fall back to a coin flip
            return random.choice([0, 1])

    adf['A'] = adf["generated_text"].apply(_parse_verdict)

    n = adf.shape[0]
    a_wins = adf['A'].sum()
    b_wins = n - a_wins

    # Model `a` is the overall winner if it wins at least half of the instances
    if a_wins >= b_wins:
        return 1
    else:
        return 0
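

# A minimal usage sketch (hypothetical model names and toy data), kept behind a
# __main__ guard so it does not run on import. Executing it still requires valid
# BAM credentials in the environment, since the module-level client queries the
# Prometheus judge model.
if __name__ == "__main__":
    models = ["model_a", "model_b"]
    benchmark = pd.DataFrame({
        "model_a": ["Paris is the capital of France.", "2 + 2 = 4"],
        "model_b": ["France's capital is Paris.", "2 + 2 equals 4"],
    })

    judge = LLMJudge(MODELS=models, true_ranking=models)
    estimated = judge.fit(benchmark)  # runs the pairwise Prometheus comparisons
    print("Estimated ranking:", estimated)
    print("RBO vs. true ranking:", judge.measure(metric='rbo'))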