rahulnair23 committed • d39c67a • 1 parent: 771a43c

instructions

Files changed:
- app.py (+32 -23)
- assets/header.md (+6 -0)
- assets/instructions.md (+74 -0)
- css.css → style.css (+0 -0)
app.py
CHANGED
@@ -14,32 +14,29 @@ class UI:
 
     def __init__(self):
         """Load any static assets"""
-
+        self.load_css()
 
     def header_block(self):
         """Title/description"""
 
-
-
-
-        gr.Markdown(
-            "This space demonstrates reference-free ranking of large language models describe in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>"
-            "Inspired by real life where both an expert and a knowledgeable person can identify a novice the main idea is to consider triplets of models, where each one of them evaluates the other two, correctly identifying the worst model in the triplet with high probability. Iteratively performing such evaluations yields a estimated ranking that doesn't require ground truth/reference data which can be expensive to gather. The methods are a viable low-resource ranking mechanism for practical use.<br>"
-            "[Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main).<br>"
-        )
+        with open("assets/header.md", 'r') as f:
+            content = f.read()
+
+        gr.Markdown(content)
         gr.Markdown('---')
         gr.Markdown('<br>')
-
+
 
     def selection_panel(self):
         """user selections"""
-        gr.Markdown("""<
+        gr.Markdown("""<h1 style='color: purple;'> Ranking with benchmarks </h1> """)
+        gr.Markdown("""Using inference data gathered from [HELM](https://crfm.stanford.edu/helm/classic/latest/) we first show how our estimated rankings compare to rankings derived from using ground-truth or reference data.""")
         with gr.Column(variant='compact'):
             self.data = gr.Dropdown(
                 choices=["CNN/DM", "XSUM", "MMLU"],
                 multiselect=False, value='CNN/DM',
                 label="Choose a dataset.",
-                info="The dataset describes a task",
+                info="The dataset describes a specific task, either summarization (CNN/DM, XSUM) or multiple choice (MMLU).",
                 interactive=True,
             )
             self.evaluation = gr.Dropdown(
@@ -50,14 +47,14 @@ class UI:
                 info="How should the Judge model decide the winner? Demo limited to use 'Rouge' for generative tasks like summarization, and 'equality' for multiple choice or classification tasks. In practice you can use any function that compares judge responses to the contestant models.",
             )
             self.nmodels = gr.Dropdown(
-                choices=[
+                choices=["All", 10, 20, 30],
                 label="Number of models",
                 info="Sample a subset of LLMs to rank.",
                 value=10,
                 interactive=True,
             )
             self.nrows = gr.Dropdown(
-                choices=[
+                choices=["All", 10, 20, 30],
                 label="Number of instances",
                 info="Sample a subset of instances to evaluate (smaller is faster).",
                 value=10,
@@ -89,14 +86,21 @@ class UI:
         """ Synthetic data experiments """
         gr.Markdown('<br>')
         gr.Markdown('---')
-        gr.Markdown("""<
+        gr.Markdown("""<h1 style='color: purple;'>Synthetic multiple choice </h1> """)
+        gr.Markdown("Coming soon.")
 
     def byod_panel(self):
-        """
+        """ Instructions panel """
         gr.Markdown('<br>')
         gr.Markdown('---')
-
+        with open("assets/instructions.md", 'r') as f:
+            content = f.read()
+        gr.Markdown(content)
+        gr.Markdown('---')
 
+    def load_css(self):
+        with open('style.css', 'r') as file:
+            self.css = file.read()
 
     def layout(self):
         """ Assemble the overall layout """
@@ -113,7 +117,7 @@ class UI:
         # Output panel/leaderboard
         self.output_panel()
 
-        self.synth_panel()
+        #TODO: self.synth_panel()
         self.byod_panel()
 
         # Register event listeners
@@ -128,6 +132,8 @@ class UI:
         """ Main execution flow for benchmarks """
 
         #gr.Info(f"Loaded run config: {data}, {evaluation}, {nmodels}.")
+        seed = 40
+        np.random.seed(seed)
 
         match data:
             case 'MMLU':
@@ -145,7 +151,7 @@ class UI:
                 raise ValueError(f"'{data}' not understood.")
 
         # Sample fewer models if so needed
-        if nmodels
+        if nmodels != "All":
            if nmodels < len(MODELS):
 
                MODELS = np.random.choice(MODELS, nmodels, replace=False).tolist()
@@ -168,9 +174,9 @@ class UI:
 
        # Filter by number of rows
        df.dropna(inplace=True)
-        if nrows
+        if nrows != "All":
            if nrows < df.shape[0]:
-                df = df.sample(nrows)
+                df = df.sample(nrows, random_state=seed)
 
        # Compute true ranking
        adf = adf.set_index(keys).loc[df.index].reset_index()
@@ -227,7 +233,10 @@ class UI:
 
        # generate outputs
        ranker.fit(df)
-
+        ranks = ranker.ranking
+        from itertools import zip_longest
+        ranks = [j + i for i, j in zip_longest(ranks, ["🥇 ", "🥈 ", "🥉 "], fillvalue='')]
+        out_df = pd.DataFrame({'rank': range(1, len(true_ranking)+1), 'model': ranks})
 
        out_metrics = {"rbo": ranker.measure(metric="rbo"),
                       "map-1": ranker.measure(metric="mapk", k=1),
@@ -236,7 +245,7 @@ class UI:
                       "map-10": ranker.measure(metric="mapk", k=10),
                       "evaluations": evaluator.calls
                       }
-        eval_metrics = (f"Evaluation measures
+        eval_metrics = (f"<h2> Evaluation measures </h2>"
                        f"Rank-Biased Overlap: {out_metrics['rbo']:0.3f}<br>"
                        f"MAP-3 : {out_metrics['map-3']:0.3f}<br>"
                        f"MAP-5 : {out_metrics['map-5']:0.3f}<br>"
assets/header.md
ADDED
@@ -0,0 +1,6 @@
<h1 style='text-align: center; color: black;'>🥇 Ranking LLMs without ground truth </h1>


This space demonstrates reference-free ranking of large language models, described in our ACL Findings paper [Ranking Large Language Models without Ground Truth](https://arxiv.org/abs/2402.14860). <br>

Inspired by real life, where both an expert and a knowledgeable person can identify a novice, the main idea is to consider triplets of models, where each one evaluates the other two and correctly identifies the worst model in the triplet with high probability. Iteratively performing such evaluations yields an estimated ranking that doesn't require ground-truth/reference data, which can be expensive to gather. This makes the approach a viable low-resource ranking mechanism for practical use. [Source code](https://huggingface.co/spaces/ibm/llm-rank-themselves/tree/main) is included as part of this space. Installation and usage instructions are provided below.<br>
assets/instructions.md
ADDED
@@ -0,0 +1,74 @@
<h1 style='color: purple;'> Using with your own data </h1>

The source code is available as a pip-installable Python package.

## Installation

Use of a virtual environment is recommended.
```bash
$ conda create -n selfrank python=3.10
```

To install:
```bash
$ conda activate selfrank
$ pip install git+https://huggingface.co/spaces/ibm/llm-rank-themselves.git
```

## Usage

Start by gathering model inferences for the same question/prompt across all models you want to rank. The ranking method expects a pandas DataFrame with a row for each prompt and a column for each model, i.e.

|            | M1   | M2   | M3   | ...   |
|:-----------|:-----|:-----|:-----|:------|
| Q1         | a    | a    | b    | ...   |
| Q2         | a    | b    | b    | ...   |
| ...        | ...  | ...  | ...  | ...   |

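For concreteness, a toy table of this shape can be built directly in pandas; the model names and answers below are made-up placeholders, not real inferences:

```python
import pandas as pd

# One row per prompt/question, one column per model being ranked.
df = pd.DataFrame(
    {
        "M1": ["a", "a", "c"],
        "M2": ["a", "b", "c"],
        "M3": ["b", "b", "c"],
    },
    index=["Q1", "Q2", "Q3"],
)
print(df)
```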
With this data, the self-ranking procedure can be invoked as follows:

```python
import pandas as pd
from algos.iterative import SelfRank        # The full ranking algorithm
from algos.greedy import SelfRankGreedy     # The greedy version
from algos.triplet import rouge, equality

f = "inferences.csv"
df = pd.read_csv(f)

models_to_rank = df.columns.tolist()
evaluator = rouge
true_ranking = None

r = SelfRank(models_to_rank, evaluator, true_ranking)
# or, for the greedy version:
# r = SelfRankGreedy(models_to_rank, evaluator, true_ranking)
r.fit(df)
print(r.ranking)
```

This should output the estimated ranking (best to worst): `['M5', 'M2', 'M1', ...]`. If true rankings are known, evaluation measures can be computed with `r.measure(metric='rbo')` (for rank-biased overlap) or `r.measure(metric='mapk')` (for mean average precision).
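As a minimal sketch of that last point, assuming a reference ordering is available (the ranking below is a made-up placeholder):

```python
# Hypothetical ground-truth ordering, best to worst, over the same models.
true_ranking = ["M5", "M2", "M1", "M4", "M3"]

r = SelfRank(models_to_rank, evaluator, true_ranking)
r.fit(df)

print(r.measure(metric="rbo"))        # rank-biased overlap against true_ranking
print(r.measure(metric="mapk", k=3))  # mean average precision at k=3
```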

We provide implementations of a few evaluation functions, i.e. the function the judge model uses to evaluate the contestant models. While `rouge` is recommended for generative tasks like summarization, `equality` is more appropriate for multiple-choice settings (like MMLU) or classification tasks with a discrete set of outcomes.

You can also pass an arbitrary function to the ranker, as long as it has the following signature:
```python
def user_function(a: str, b: str, c: str, df: pd.DataFrame) -> int:
    """
    Use model c to evaluate a vs. b.
    df: a dataframe with the inferences of all models
    returns 1 if a is preferred, or 0 if b is preferred
    """

    # In this example, we count the number of times a/b gives the same answer as c
    ties = df[a] == df[b]
    a_wins = sum((df[a] == df[c]) & ~(ties))
    b_wins = sum((df[b] == df[c]) & ~(ties))

    if a_wins >= b_wins:
        return 1
    else:
        return 0

```
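A judge function like this can then be passed to the ranker in place of the built-in evaluators, e.g. continuing the setup above:

```python
# Rank using the custom judge; no reference ranking is supplied here.
r = SelfRank(models_to_rank, user_function, None)
r.fit(df)
print(r.ranking)
```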
<br>
css.css → style.css
RENAMED
File without changes