import logging
import random
from typing import Callable, List, Optional

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from genai import Client, Credentials
from genai.text.generation import TextGenerationParameters, TextGenerationReturnOptions
from langchain.prompts import PromptTemplate
from tqdm import tqdm

from .metrics import mapk, rank_biased_overlap
from .plots import plot_ranks

logger = logging.getLogger(__name__)

load_dotenv()

credentials = Credentials.from_env()
client = Client(credentials=credentials)

_INSTRUCTION = "Compare the two responses."
_RUBRIC = "Which is the better response?"
_PROMETHEUS_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of two responses strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, choose a better response between Response 1 and Response 2. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (1 or 2)"
4. Please do not generate any other opening, closing, and explanations.
###Instruction:
{instruction}
###Response 1:
{response_1}
###Response 2:
{response_2}
###Score Rubric:
{rubric}
###Feedback: 
"""

template = PromptTemplate.from_template(_PROMETHEUS_PROMPT)

class LLMJudge:
    """
    Competing method based on an LLM judge (Prometheus): every pair of models is
    compared head-to-head on the benchmark responses, and models are ranked by
    their number of pairwise wins.
    """

    def __init__(
        self,
        MODELS: List,
        true_ranking: Optional[List] = None,
        show_progress: Optional[bool] = True,
    ):
        self.MODELS = MODELS
        self.N = len(MODELS)
        self.evaluate = prometheus
        self.true_ranking = true_ranking
        self.show_progress = show_progress


    def fit(self, df: pd.DataFrame):
        """
        Estimate the ranking from pairwise LLM-judge comparisons.

        df: DataFrame where each row is a benchmark instance and each column
        holds the corresponding output of one of the models in `self.MODELS`.
        """

        assert set(self.MODELS) == set(df.columns), "Benchmark data models inconsistent with models to be ranked."
        self.N = len(self.MODELS)

        y = np.empty((self.N, self.N))

        if self.show_progress:
            pbar = tqdm(total=self.N**2, position=0, leave=False, desc="Evaluations")

        for i, a in enumerate(self.MODELS):
            for j, b in enumerate(self.MODELS):
                if a == b:
                    y[i, j] = 0
                else:
                    y[i, j] = self.evaluate(client, format_instruction, a=a, b=b, df=df)
                    
                if self.show_progress: pbar.update(1)

        logger.debug(f"Win matrix:\n{y}")
        # Aggregate by win rate: more pairwise wins means a higher rank
        wins_df = pd.DataFrame({'wins': y.sum(axis=1)}, index=self.MODELS)
        wins_df = wins_df.sort_values(by='wins', ascending=False)
        self.ranking = wins_df.index.to_list()

        return self.ranking    

    def measure(self, metric='rbo', k=5, p=0.95) -> float:
        """
        Report a metric ('rbo' or 'mapk') comparing the estimated ranking
        against the true ranking.
        """
        if metric not in ['rbo', 'mapk']:
            raise ValueError(f"Metric {metric} not supported (use 'rbo'/'mapk').")

        if hasattr(self, 'ranking'):
            if self.true_ranking is not None:
                if metric == 'mapk':
                    if k > len(self.true_ranking):
                        logger.warning(f"MAPk metric is for k={len(self.true_ranking)}, and not k={k}.")
                    actual = [self.true_ranking[:k]]
                    pred = [self.ranking[:k]]
                    return mapk(actual, pred, k=k)
                elif metric == 'rbo':
                    return rank_biased_overlap(self.true_ranking, self.ranking, p=p)
                else:
                    raise ValueError(f"Metric {metric} not understood.")
            else:
                raise ValueError("True ranking not available for metric calculation.")
        else:
            raise ValueError("Ranking not estimated. Run 'fit' first.")


    def plot(self, caselabel="output"):
        if hasattr(self, 'ranking') & (self.true_ranking is not None):
            plot_ranks(self.true_ranking, self.ranking, "actual", "estimated", caselabel)


def format_instruction(x, a, b):
    """Build the Prometheus pairwise instruction for one benchmark row `x`."""

    response1 = f"{x[a]}"
    response2 = f"{x[b]}"

    instruction = template.format(
        instruction=_INSTRUCTION,
        response_1=response1,
        response_2=response2,
        rubric=_RUBRIC,
    )
    return instruction
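
# A quick illustration with hypothetical data: `format_instruction` expects a
# row (pandas Series) indexed by model name, so it is typically applied
# row-wise, e.g.
#
#   row = pd.Series({"model_a": "Paris.", "model_b": "It is Paris."})
#   prompt = format_instruction(row, "model_a", "model_b")
#
# which yields the filled Prometheus prompt comparing the two responses.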


def prometheus(client: Client, formatter: Callable, a: str, b: str, df: pd.DataFrame) -> int:
    """
    Query the LLM-as-a-judge model Prometheus to compare the responses of model `a`
    and model `b` on every benchmark instance; returns 1 if `a` wins the majority
    of instances, 0 otherwise.

    client: the `genai` client (using BAM).
    formatter: function that takes a row of model outputs and builds the Prometheus instruction
    a: name of model `a` to be evaluated (column in `df` with its responses)
    b: name of model `b` to be evaluated
    df: DataFrame with responses
    """

    parameters = TextGenerationParameters(
        max_new_tokens=500, return_options=TextGenerationReturnOptions(), random_seed=42
    )
    
    # Build one Prometheus prompt per benchmark instance
    inst = df.apply(formatter, axis=1, args=(a, b))
    adf = df.copy(deep=True)

    results = []
    for response in client.text.generation.create(
                model_id="kaist-ai/prometheus-8x7b-v2",
                inputs=inst.values.tolist(),
                execution_options={"ordered": True, 'concurrency_limit': 10},
                parameters=parameters,
            ):
        results.append(response.results[0])
    
    adf["generated_text"] = [r.generated_text for r in results]

    def _parse_verdict(x):
        """Extract the verdict: 1 if Response 1 (model `a`) won, 0 if Response 2 won."""
        try:
            result = int(x.split("[RESULT]")[1].strip())
            return 1 if result == 1 else 0
        except (IndexError, ValueError):
            # Unparseable output: fall back to a coin flip
            return random.choice([0, 1])

    adf['A'] = adf["generated_text"].apply(_parse_verdict)

    n = adf.shape[0]
    a_wins = adf['A'].sum()
    b_wins = n - a_wins

    # Model `a` is the overall winner if it wins at least half of the instances
    if a_wins >= b_wins:
        return 1
    else:
        return 0
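

# A minimal usage sketch (hypothetical model names and toy data), kept behind a
# __main__ guard so it does not run on import. Executing it still requires valid
# BAM credentials in the environment, since the module-level client queries the
# Prometheus judge model.
if __name__ == "__main__":
    models = ["model_a", "model_b"]
    benchmark = pd.DataFrame({
        "model_a": ["Paris is the capital of France.", "2 + 2 = 4"],
        "model_b": ["France's capital is Paris.", "2 + 2 equals 4"],
    })

    judge = LLMJudge(MODELS=models, true_ranking=models)
    estimated = judge.fit(benchmark)  # runs the pairwise Prometheus comparisons
    print("Estimated ranking:", estimated)
    print("RBO vs. true ranking:", judge.measure(metric='rbo'))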