import logging
import random
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from genai import Client, Credentials
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from tqdm import tqdm

from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks

logger = logging.getLogger(__name__)

load_dotenv()
credentials = Credentials.from_env()
client = Client(credentials=credentials)

_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"

# Prometheus pairwise-ranking prompt; kept verbatim so the judge model sees its expected format.
_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.

###Instruction:
{instruction}

###Response 1:
{response_1}

###Response 2:
{response_2}

###Score Rubric:
{rubric}

###Feedback: """

template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)


class LLMJudge:
    """
    Competing method based on an LLM-as-a-judge (Prometheus).
    """

    def __init__(
        self,
        MODELS: List,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = True,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = prometheus
        self.true_ranking = true_ranking
        self.show_progress = show_progress

    def fit(self, df: pd.DataFrame):
        """
        df: DataFrame where each row is a benchmark instance and each column
            holds the corresponding model's output.
        """
        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."

        self.N = len(self.MODELS)
        y = np.empty((self.N, self.N))

        if self.show_progress:
            pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")

        # Pairwise tournament: y[i, j] = 1 if model `a` beats model `b` on the majority of instances.
        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                if a == b:
                    y[i, j] = 0
                else:
                    y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
                if self.show_progress:
                    pbar.update(1)

        logger.debug(f"Win matrix:\n{y}")

        # Aggregate by win rate: models with more pairwise wins rank higher.
        wins = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        wins = wins.sort_values(by='wins', ascending=False)
        self.ranking = wins.index.to_list()
        return self.ranking

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report a metric comparing the estimated ranking to the true ranking.
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")
        if not hasattr(self, 'ranking'):
            raise ValueError("Ranking not estimated. Run 'fit' first.")
        if self.true_ranking is None:
            raise ValueError("True ranking not available for metric calculation.")

        if metric == 'mapk':
            if k > len(self.true_ranking):
                logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
            actual = [self.true_ranking[:k]]
            pred = [self.ranking[:k]]
            return mapk(actual, pred, k=k)
        else:  # 'rbo'
            return rank_biased_overlap(self.true_ranking, self.ranking, p=p)

    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') and (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)


def format_instruction(x, a, b):
    """Build the Prometheus judging prompt for one benchmark row, comparing models `a` and `b`."""
    response1 = f"{x[a]}"
    response2 = f"{x[b]}"
    instruction = template.format(
        instruction=_INSTRUCTION,
        response_1=response1,
        response_2=response2,
        rubric=_RUBRIC,
    )
    return instruction


def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
    """
    Query the LLM-as-a-judge model Prometheus to compare responses from model `a` and model `b`.

    client: the `genai` client (using BAM).
    formatter: function that takes a row of model outputs and builds the Prometheus instruction.
    a: name of model `a` to be evaluated (column in `df` with its responses).
    b: name of model `b` to be evaluated (column in `df` with its responses).
    df: DataFrame with responses.

    Returns 1 if model `a` wins at least as many instances as model `b`, else 0.
    """
    parameters = TextGenerationParameters(
        max_new_tokens=500,
        return_options=TextGenerationReturnOptions(),
        random_seed=42,
    )

    # Build one Prometheus prompt per benchmark instance.
    inst = df.apply(formatter, axis=1, args=(a, b))
    adf = df.copy(deep=True)

    results = []
    for response in client.text.generation.create(
        model_id="kaist-ai/prometheus-8x7b-v2",
        inputs=inst.values.tolist(),
        execution_options={"ordered": True, "concurrency_limit": 10},
        parameters=parameters,
    ):
        results.append(response.results[0])

    adf["generated_text"] = [r.generated_text for r in results]

    def _helper(x):
        # Parse the "[RESULT] 1" / "[RESULT] 2" verdict; fall back to a random pick when the output is malformed.
        try:
            return int(x.split("[RESULT]")[1])
        except (IndexError, ValueError):
            return random.choice([1, 2])

    adf['A'] = adf["generated_text"].apply(_helper)
    n = adf.shape[0]
    a_wins = int((adf['A'] == 1).sum())  # verdict 1 means Response 1 (model `a`) was preferred
    b_wins = n - a_wins

    if a_wins >= b_wins:
        return 1
    else:
        return 0
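

# Minimal usage sketch, assuming valid GENAI/BAM credentials are available in the
# environment (picked up at import time via load_dotenv / Credentials.from_env).
# The model names and toy responses below are hypothetical placeholders for
# illustration only; a real benchmark DataFrame would hold each model's actual outputs.
if __name__ == "__main__":
    models = ["model_a", "model_b", "model_c"]  # hypothetical model/column names
    toy_bench = pd.DataFrame(
        {m: [f"{m} answer to benchmark question {i}" for i in range(4)] for m in models}
    )

    judge = LLMJudge(MODELS=models, true_ranking=models, show_progress=False)
    ranking = judge.fit(toy_bench)  # queries Prometheus once per ordered model pair
    print("Estimated ranking:", ranking)
    print("RBO vs. assumed true ranking:", judge.measure(metric="rbo"))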