File size: 8,656 Bytes
f9a609b
 
 
 
 
 
 
 
 
5e3dbdf
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e3dbdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9a609b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
import os
import re
import io
import json
from typing import List, Tuple, Union
from pathlib import Path
import gradio as gr
from leptonai import Client

# API credentials pulled from the environment; both default to None when unset.
# HF_TOKEN is read but not used elsewhere in this file.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)

# Client for the hosted GLIDER deployment ("glider" workspace) on Lepton AI.
client = Client("https://yb15a7dy-glider.tin.lepton.run", "glider", LEPTON_API_TOKEN)

# Evaluation prompt template sent to GLIDER. The three placeholders are filled
# by model_call(): {user_input} (the tag-wrapped data built by format_string),
# {pass_criteria}, and {rubric}. The model is instructed to answer with
# <reasoning>, <highlight> and <score> sections, which extract_spans() parses.
PROMPT = """Analyze the following pass criteria carefully and score the text based on the rubric defined below.

To perform this evaluation, you must:

1. Understand the text tags, pass criteria and rubric thoroughly.
2. Review the finer details of the text and the rubric.
3. Compare the tags to be evaluated to the score descriptions in the rubric.
4. Pay close attention to small details that might impact the final score and form accurate associations between tags and pass criteria.
5. Write a detailed reasoning justifying your evaluation in a bullet point format. 
6. The reasoning must summarize the overall strengths and weaknesses of the output while quoting exact phrases from the output wherever required.
7. Output a list of words or phrases that you believe are the most important in determining the score.
8. Assign a final score based on the scoring rubric.

Data to evaluate:
{user_input}

Pass Criteria:
{pass_criteria}

Rubric:
{rubric}

Your output must in the following format:
<reasoning>
[Detailed reasoning justifying your evaluation in a bullet point format according to the specifics defined above]
</reasoning>
<highlight>
[List of words or phrases that you believe are the most important in determining the score]
</highlight>
<score>
[The final integer score assigned based on the scoring rubric]
</score>
"""

# Pre-filled example evaluations rendered as clickable buttons in the UI.
# Each dict's keys (minus "emoji") map one-to-one onto the six input textboxes
# via select_template().
EXAMPLES = [
    {
        "emoji": "🌁",
        "model_output": "The sky is green.",
        "user_input": "What color is the sky?",
        "gold_answer": "",
        "retrieved_context": "The sky is blue.",
        "pass_criteria": "Is the MODEL OUTPUT grounded in the CONTEXT?",
        "rubric": "0. The pass criteria is not satisfied and not accurately followed\n1. The pass criteria is satisfied and accurately followed",
    }

]

# Page header: logo/banner HTML plus a short usage guide, rendered as the
# first Markdown element of the demo.
HEADER = """
<div style="width: 100%; display: flex; flex-direction: column; gap: 24px; padding-top: 24px; position: relative">
    <img src="https://postimage.me/image/ICONGLIDER.U5e6TO" width="175" style="position: absolute; top: 0; right: 48px">
    <div style="display: flex; justify-content: space-between; z-index: 1;">
        <a href="https://www.patronus.ai">
            <img src="https://postimage.me/images/2024/07/31/FullLogo_ColorDefault.png" width="250">
        </a>
        <div style="display: flex; gap: 12px;">
            <a href="https://huggingface.co/PatronusAI/glider">
                <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange" height="20">
            </a>
            <a href="https://github.com/patronus-ai/glider">
                <img src="https://img.shields.io/badge/GitHub-Glider-indigo" height="20">
            </a>
            <a href="https://arxiv.org/abs/2412.14140">
                <img src="https://img.shields.io/badge/arXiv-2412.14140-b31b1b.svg" height="20">
            </a>
        </div>
    </div>
    <div>
        <h1>GLIDER: Grading LLM Interactions and Decisions using Explainable Ranking</h1>
        <h2>Patronus GLIDER Demo</h2>
    </div>
</div>

**GLIDER** is a powerful 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria.

**Getting Started**: First, provide a model output (text generated by your model) and user input (text used to prompt your model) and optionally a gold answer (label or gold answer to the prompt) and retrieved context (context used for text generated by your model). Next, provide a pass criteria (description of a passing evaluation). Finally, provide an optional rubric (scoring scales with explanations) and then click submit. The GLIDER Output panel will provide a score and reasoning which is a human readable explanation of the score.

"""

# Section title shown above the example buttons.
EXAMPLES_HEADER = """
# Try it Yourself!
"""

# Custom CSS applied to the Blocks app; .example-button styles the template
# buttons created from EXAMPLES.
css = """
.example-button {
    width: fit-content;
    font-size: 1rem;
    font-weight: 400 !important;
    padding: .5rem 1rem;
    text-align: start;
}
.fixed-height-button {
    height: fit-content;
    word-break: break-all;
    font-size: .85rem;
}
"""

def format_string(retrieved_context, user_input, model_output, gold_answer):
    """Wrap each non-empty field in its GLIDER tag and join with newlines.

    Empty (falsy) fields are omitted entirely, so the assembled prompt only
    contains the sections the user actually filled in. Section order is
    fixed: CONTEXT, USER INPUT, MODEL OUTPUT, GOLD ANSWER.
    """
    sections = (
        ("CONTEXT", retrieved_context),
        ("USER INPUT", user_input),
        ("MODEL OUTPUT", model_output),
        ("GOLD ANSWER", gold_answer),
    )
    return "\n".join(
        f"<{tag}>\n{text}\n</{tag}>" for tag, text in sections if text
    )

def extract_spans(input_string):
    """Parse GLIDER's tagged response into ``(score, reasoning, highlight)``.

    The model is prompted to reply with <reasoning>, <highlight> and <score>
    sections. Any section that cannot be found yields None in its slot.

    Fix: the highlight search previously ran without re.DOTALL, so a
    multi-line highlight list (which the prompt explicitly requests) never
    matched and was silently dropped. Both free-text sections now use DOTALL.
    """
    # DOTALL lets '.' cross newlines — both sections are bullet lists in practice.
    reasoning_match = re.search(
        r"<reasoning>\s*(.*?)\s*</reasoning>", input_string, re.DOTALL
    )
    highlight_match = re.search(
        r"<highlight>\s*(.*?)\s*</highlight>", input_string, re.DOTALL
    )
    # The closing '>' is optional so a generation truncated at "</score"
    # still yields a score (preserves the original pattern's tolerance).
    score_match = re.search(r"<score>\s*(\d+)\s*</score>?", input_string)

    reasoning = reasoning_match.group(1) if reasoning_match else None
    highlight = highlight_match.group(1).strip() if highlight_match else None
    score = int(score_match.group(1)) if score_match else None
    return score, reasoning, highlight

def model_call(model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric):
    """Run one GLIDER evaluation and return ``(score, reasoning, highlights)``.

    model_output, user_input and pass_criteria are required; if any is
    missing the remote call is skipped and three empty strings are returned
    so the output panels simply stay blank. Optional fields (gold_answer,
    retrieved_context) are included in the prompt only when non-empty.

    Fix: required-field checks use truthiness instead of ``== ""`` so a
    None value (not just the empty string) is also treated as missing.
    """
    if not model_output or not user_input or not pass_criteria:
        return "", "", ""
    combined_user_input = format_string(retrieved_context, user_input, model_output, gold_answer)
    prompt = PROMPT.format(
        user_input=combined_user_input, pass_criteria=pass_criteria, rubric=rubric
    )
    # temperature=0 for deterministic scoring; response follows the
    # OpenAI-style chat-completions shape exposed by the Lepton client.
    response = client.api.v1.chat.completions(
        model="glider",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        top_p=0.999,
        max_tokens=2048,
        stream=False,
    )
    return extract_spans(response["choices"][0]["message"]["content"])

def select_template(template):
    """Expand an EXAMPLES entry into the six values the input textboxes take."""
    field_order = (
        "model_output",
        "user_input",
        "gold_answer",
        "retrieved_context",
        "pass_criteria",
        "rubric",
    )
    return tuple(template[field] for field in field_order)

# Build and launch the Gradio UI: a two-column layout (inputs on the left,
# GLIDER outputs on the right) followed by one button per EXAMPLES entry.
with gr.Blocks(css=css, theme=gr.themes.Default(spacing_size="sm", font=[gr.themes.GoogleFont("Plus Jakarta Sans"), "Arial", "sans-serif"], primary_hue="indigo", secondary_hue="purple")) as demo:
    gr.Markdown(HEADER)
    with gr.Row(equal_height=True):
        # Left column: the six evaluation inputs plus clear/submit controls.
        with gr.Column(scale=1):
            gr.Markdown("**Your Inputs**")
            model_output = gr.Textbox(label="MODEL OUTPUT (required)")
            user_input = gr.Textbox(label="USER INPUT (required)")
            gold_answer = gr.Textbox(label="GOLD ANSWER")
            retrieved_context = gr.Textbox(label="RETRIEVED CONTEXT")
            pass_criteria = gr.Textbox(label="Pass Criteria (required)")
            rubric = gr.Textbox(label="Rubric")
            with gr.Row():
                clear_btn = gr.ClearButton([model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric])
                submit_button = gr.Button("Submit", variant="primary")
        # Right column: the parsed GLIDER response (score / reasoning / highlights).
        with gr.Column(scale=1):
            gr.Markdown("**GLIDER Output**")
            score = gr.Textbox(label="Score")
            reasoning = gr.Textbox(label="Reasoning")
            highlights = gr.Textbox(label="Highlights")
    gr.Markdown("&nbsp;")
    gr.Markdown(EXAMPLES_HEADER)
    with gr.Row():
        with gr.Column():
            # One button per example; clicking populates all six input fields.
            for _, example in enumerate(EXAMPLES):
                template_btn = gr.Button(f"{example['emoji']} {example['model_output']}", elem_classes="example-button")
                # gr.State(example) binds this iteration's dict, avoiding the
                # late-binding-closure pitfall when EXAMPLES has multiple entries.
                template_btn.click(
                    fn=select_template,
                    inputs=[gr.State(example)],
                    outputs=[model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric]
                    )

    # Submit runs the remote GLIDER evaluation and fills the output panels.
    submit_button.click(fn=model_call, inputs=[model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric], outputs=[score, reasoning, highlights])
demo.launch()