import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from typing import Callable, List, Optional
from genai import Client, Credentials
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks
import logging
import random
logger = logging.getLogger(__name__)
load_dotenv()
credentials = Credentials.from_env()
client = Client(credentials=credentials)
_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"
_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.
###Instruction:
{instruction}
###Response 1:
{response_1}
###Response 2:
{response_2}
###Score Rubric:
{rubric}
###Feedback:
"""
template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)
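
# Illustrative note (example completion is made up, not from the source): per the
# output format requested in the prompt above, a Prometheus completion should end
# with a verdict token, e.g.
#   "Feedback: Response 1 follows the rubric more closely ... [RESULT] 1"
# The "[RESULT]" delimiter is what `prometheus` (below) parses to decide the winner.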
class LLMJudge:
"""
Competing method based on an LLM-Judge (Prometheus)
"""
def __init__(
self,
MODELS: List,
true_ranking: Optional[List] = None,
show_progress: Optional[bool] = True,
):
self.MODELS = MODELS
self.N = len(MODELS)
self.evaluate = prometheus
self.true_ranking = true_ranking
self.show_progress = show_progress
def fit(self, df: pd.DataFrame):
"""
        df: DataFrame where each row is a benchmark instance
            and each column holds the outputs of one model.
"""
assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
self.N = len(self.MODELS)
y = np.empty((self.N, self.N))
if self.show_progress:
pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")
for i, a in enumerate(self.MODELS):
for j, b in enumerate(self.MODELS):
if a == b:
y[i, j] = 0
else:
y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
if self.show_progress: pbar.update(1)
logger.debug(f"Win matrix:\n{y}")
        # Just aggregate the pairwise results based on win rates
        wins = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        wins = wins.sort_values(by='wins', ascending=False)
        self.ranking = wins.index.to_list()
return self.ranking
def measure(self, metric='rbo', k=5, p=0.95) -> float:
"""
Report metric related to self-rank
"""
if metric not in ['rbo', 'mapk']:
raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
if hasattr(self, 'ranking'):
if self.true_ranking is not None:
if metric == 'mapk':
if k > len(self.true_ranking):
logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
actual = [self.true_ranking[:k]]
pred = [self.ranking[:k]]
return mapk(actual, pred, k=k)
elif metric == 'rbo':
return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
else:
raise ValueError(f"Metric {metric} not understood.")
else:
raise ValueError("True ranking not available for metric calculation.")
else:
raise ValueError("Ranking not estimated. Run 'fit' first.")
def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)
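
# Module-level helpers used by LLMJudge.fit (via `self.evaluate`):
# `format_instruction` builds the Prometheus prompt for a single benchmark row, and
# `prometheus` runs the pairwise comparison of two models over the whole DataFrame.
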
def format_instruction(x, a, b):
"""instruction to score with Prometheus"""
# Build the instruction
    response1 = f"{x[a]}"
    response2 = f"{x[b]}"
instruction = _INSTRUCTION
rubric = _RUBRIC
instruction = template.format(
        instruction=instruction, response_1=response1, response_2=response2, rubric=rubric
)
return instruction
def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
"""
Query the LLM-as-a-judge model Prometheus to compare responses from model "a" and model "b"
    client: the `genai` client (using BAM)
    formatter: function that takes a row of model outputs and builds the Prometheus instruction
    a: name of model `a` to be evaluated (column in `df` with its responses)
    b: name of model `b` to be evaluated
df: DataFrame with responses
"""
parameters = TextGenerationParameters(
max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
)
# Get the correct prompts
    inst = df.apply(formatter, axis=1, args=(a, b))
adf = df.copy(deep=True)
results = []
for response in client.text.generation.create(
model_id="kaist-ai/prometheus-8x7b-v2",
inputs=inst.values.tolist(),
        execution_options={"ordered": True, "concurrency_limit": 10},
parameters=parameters,
):
results.append(response.results[0])
adf["generated_text"] = [r.generated_text for r in results]
    def _helper(x):
        # Map the Prometheus verdict to a win indicator: 1 if Response 1 (model `a`) wins, else 0.
        try:
            return 1 if int(x.split("[RESULT]")[1].strip()) == 1 else 0
        except (IndexError, ValueError):
            # Verdict could not be parsed; fall back to a random winner.
            return random.choice([0, 1])
adf['A'] = adf["generated_text"].apply(_helper)
n = adf.shape[0]
a_wins = sum(adf['A'])
b_wins = n - a_wins
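    # Declare model `a` the overall winner when it wins at least half of the instances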
if a_wins >= b_wins:
return 1
else:
return 0
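

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original pipeline):
# the model names and the toy benchmark DataFrame below are made up, and
# running this requires valid BAM credentials loaded from the environment.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    models = ["model_a", "model_b", "model_c"]
    # One column of generated responses per model, one row per benchmark item.
    bench = pd.DataFrame(
        {m: [f"answer {i} from {m}" for i in range(3)] for m in models}
    )
    judge = LLMJudge(MODELS=models, true_ranking=models)
    print("Estimated ranking:", judge.fit(bench))
    print("RBO vs. true ranking:", judge.measure(metric="rbo"))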