Spaces:
Running
Running
File size: 9,223 Bytes
f9a609b 5e3dbdf f9a609b 8836ef0 735951e f9a609b 06d6973 f9a609b 76957ec f9a609b 735951e b7ca860 735951e b7ca860 f9a609b 76957ec f9a609b e8798dd 5ae1b7f e8798dd 549c114 e8798dd f9a609b 5e3dbdf f9a609b e8798dd f9a609b aef8eda f9a609b aef8eda f9a609b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
import os
import re
import io
import json
from typing import List, Tuple, Union
from pathlib import Path
import gradio as gr
from leptonai import Client
# Credentials come from the environment; each is None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")
LEPTON_API_TOKEN = os.getenv("LEPTON_API_TOKEN")

# Lepton client used by model_call to request chat completions.
client = Client(
    "https://yb15a7dy-glider.tin.lepton.run",
    "glider",
    LEPTON_API_TOKEN,
)
# Evaluation prompt template sent to the model.
# model_call fills the three placeholders with str.format:
#   {user_input}, {pass_criteria} and {rubric}.
# The output format requested here (<reasoning>, <highlight>, <score> tags)
# is what extract_spans parses out of the response.
# NOTE(review): this text is sent to the model verbatim; confirm against the
# prompt the model expects before rewording any part of it.
PROMPT = """Analyze the following pass criteria carefully and score the text based on the rubric defined below.
To perform this evaluation, you must:
1. Understand the text tags, pass criteria and rubric thoroughly.
2. Review the finer details of the text and the rubric.
3. Compare the tags to be evaluated to the score descriptions in the rubric.
4. Pay close attention to small details that might impact the final score and form accurate associations between tags and pass criteria.
5. Write a detailed reasoning justifying your evaluation in a bullet point format.
6. The reasoning must summarize the overall strengths and weaknesses of the output while quoting exact phrases from the output wherever required.
7. Output a list of words or phrases that you believe are the most important in determining the score.
8. Assign a final score based on the scoring rubric.
Data to evaluate:
{user_input}
Pass Criteria:
{pass_criteria}
Rubric:
{rubric}
Your output must in the following format:
<reasoning>
[Detailed reasoning justifying your evaluation in a bullet point format according to the specifics defined above]
</reasoning>
<highlight>
[List of words or phrases that you believe are the most important in determining the score]
</highlight>
<score>
[The final integer score assigned based on the scoring rubric]
</score>
"""
# Pre-filled example inputs shown as buttons in the UI.
# Each entry supplies the button emoji plus one value per input textbox;
# select_template reads every key except "emoji".
EXAMPLES = [
    {
        "emoji": "🌁",
        "model_output": "The sky is green.",
        "user_input": "What color is the sky?",
        "gold_answer": "",
        "retrieved_context": "The sky is blue.",
        "pass_criteria": "Is the MODEL OUTPUT grounded in the CONTEXT?",
        "rubric": "0. The pass criteria is not satisfied and not accurately followed\n1. The pass criteria is satisfied and accurately followed",
    },
]
# HTML banner rendered at the top of the page through gr.Markdown(HEADER):
# logo images, badge links (model card, GitHub, arXiv), the title, and the
# "Getting Started" instructions. The text colours are forced to white with
# inline styles.
HEADER = """
<div style="width: 100%; display: flex; flex-direction: column; gap: 24px; padding-top: 24px; position: relative">
<img src="https://postimage.me/images/2024/12/19/ICONGLIDER.md.png" width="350" style="position: absolute; top: 0; right: 36px">
<div style="display: flex; justify-content: space-between; z-index: 1;">
<a href="https://www.patronus.ai">
<img src="https://postimage.me/images/2024/12/19/patronuslogo-white.png" width="250">
</a>
<div style="display: flex; gap: 12px;">
<a href="https://huggingface.co/PatronusAI/glider">
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Model_Card-Huggingface-orange" height="20">
</a>
<a href="https://github.com/patronus-ai/glider">
<img src="https://img.shields.io/badge/GitHub-Glider-indigo" height="20">
</a>
<a href="https://arxiv.org/abs/2412.14140">
<img src="https://img.shields.io/badge/arXiv-2412.14140-b31b1b.svg" height="20">
</a>
</div>
</div>
<div>
<h1 style="color: #fff !important">GLIDER: Grading LLM Interactions and Decisions using Explainable Ranking</h1>
<h2 style="color: #fff !important">Patronus GLIDER Demo</h2>
</div>
</div>
<br><br>
<div style="color: #fff !important"><span style="color: inherit; font-weight: 600">GLIDER</span> is a powerful 3B evaluator LLM that can score any text input and associated context on arbitrary user defined criteria.</div>
<br><br>
<div style="color: #fff !important"><span style="color: inherit; font-weight: 600">Getting Started</span>: First, provide a model output (text generated by your model) and user input (text used to prompt your model) and optionally a gold answer (label or gold answer to the prompt) and retrieved context (context used for text generated by your model). Next, provide a pass criteria (description of a passing evaluation). Finally, provide an optional rubric (scoring scales with explanations) and then click submit. The GLIDER Output panel will provide a score and reasoning which is a human readable explanation of the score.</div>
"""
# HTML heading rendered above the example buttons through
# gr.Markdown(EXAMPLES_HEADER).
EXAMPLES_HEADER = """
<h1 style="color: #fff !important">
Try it Yourself!
</h1>
"""
# Custom stylesheet passed to gr.Blocks(css=css).
# ".example-button" is attached to the example buttons via elem_classes;
# ".fixed-height-button" is not referenced by any component in this file.
css = """
.example-button {
width: fit-content;
font-size: 1rem;
font-weight: 400 !important;
padding: .5rem 1rem;
text-align: start;
}
.fixed-height-button {
height: fit-content;
word-break: break-all;
font-size: .85rem;
}
"""
# Gradio theme passed to gr.Blocks(theme=theme): the Default theme with
# indigo/purple hues, small spacing, a custom font stack, and a radial
# gradient used as the primary background fill.
theme = gr.themes.Default(
    primary_hue="indigo",
    secondary_hue="purple",
    spacing_size="sm",
    font=[gr.themes.GoogleFont("Plus Jakarta Sans"), "Arial", "sans-serif"],
).set(
    background_fill_primary="radial-gradient(circle at 90% 0%, rgba(255,255,255,0.25), #000000 35%)",
)
def format_string(retrieved_context, user_input, model_output, gold_answer):
    """Wrap each non-empty field in its tag and join the sections with newlines.

    Sections are emitted in a fixed order: CONTEXT, USER INPUT, MODEL OUTPUT,
    GOLD ANSWER. A field that is falsy (empty string, None) is left out
    entirely. Returns an empty string when every field is falsy.
    """
    tagged_fields = (
        ("CONTEXT", retrieved_context),
        ("USER INPUT", user_input),
        ("MODEL OUTPUT", model_output),
        ("GOLD ANSWER", gold_answer),
    )
    return "\n".join(
        f"<{tag}>\n{text}\n</{tag}>" for tag, text in tagged_fields if text
    )
def extract_spans(input_string):
    """Pull the score, reasoning and highlights out of a tagged model response.

    The response is expected to contain <reasoning>...</reasoning>,
    <highlight>...</highlight> and <score>...</score> sections.

    Args:
        input_string: Raw response text. None or an empty string is treated
            as a response with no tags.

    Returns:
        A (score, reasoning, highlight) tuple. score is an int; reasoning and
        highlight are strings with surrounding whitespace removed. Any element
        whose tag is not found is None.
    """
    if not input_string:
        return None, None, None

    reasoning_pattern = r"<reasoning>\s*(.*?)\s*</reasoning>"
    highlight_pattern = r"<highlight>\s*(.*?)\s*</highlight>"
    # The closing ">" is optional so a response cut off at "</score" still parses.
    score_pattern = r"<score>\s*(\d+)\s*</score>?"

    # DOTALL lets "." cross newlines: both reasoning and highlights may span
    # several lines, and without it a multi-line section would not match.
    reasoning_match = re.search(reasoning_pattern, input_string, re.DOTALL)
    highlight_match = re.search(highlight_pattern, input_string, re.DOTALL)
    score_match = re.search(score_pattern, input_string)

    reasoning = reasoning_match.group(1) if reasoning_match else None
    highlight = highlight_match.group(1).strip() if highlight_match else None
    score = int(score_match.group(1)) if score_match else None

    return score, reasoning, highlight
def model_call(model_output, user_input, gold_answer, retrieved_context, pass_criteria, rubric):
    """Score one evaluation request with the remote model.

    Builds the prompt from PROMPT and the tagged inputs, sends it as a single
    user message to the "glider" model through the module-level client, and
    parses the reply with extract_spans.

    Args:
        model_output: Text produced by the model under evaluation (required).
        user_input: Prompt that produced model_output (required).
        gold_answer: Optional reference answer.
        retrieved_context: Optional context given to the evaluated model.
        pass_criteria: Description of a passing evaluation (required).
        rubric: Optional scoring rubric text.

    Returns:
        ("", "", "") when any required field is missing; otherwise the
        (score, reasoning, highlight) tuple produced by extract_spans.
    """
    # Treat None and whitespace-only values as missing, not just "".
    # NOTE(review): presumably the UI can hand back None for a cleared
    # textbox -- confirm against the Gradio version in use.
    required_fields = (model_output, user_input, pass_criteria)
    if any(not (value or "").strip() for value in required_fields):
        return "", "", ""

    combined_user_input = format_string(retrieved_context, user_input, model_output, gold_answer)
    prompt = PROMPT.format(
        user_input=combined_user_input,
        pass_criteria=pass_criteria,
        # Avoid formatting a missing rubric into the prompt as the text "None".
        rubric=rubric or "",
    )
    response = client.api.v1.chat.completions(
        model="glider",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        top_p=0.999,
        max_tokens=2048,
        stream=False,
    )
    content = response["choices"][0]["message"]["content"]
    score, reasoning, highlight_spans = extract_spans(content)
    return score, reasoning, highlight_spans
def select_template(template):
    """Return the example's field values as a tuple, in input-textbox order.

    The order is model_output, user_input, gold_answer, retrieved_context,
    pass_criteria, rubric. Raises KeyError if the template lacks any of them.
    """
    field_order = (
        "model_output",
        "user_input",
        "gold_answer",
        "retrieved_context",
        "pass_criteria",
        "rubric",
    )
    return tuple(template[key] for key in field_order)
# Build the demo page: an inputs column and an outputs column side by side,
# followed by the example buttons, then start the app.
with gr.Blocks(css=css, theme=theme) as demo:
    gr.Markdown(HEADER)
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.Markdown("<div style='color: #fff !important; font-weight: 600'>Your Inputs</div>")
            model_output = gr.Textbox(label="MODEL OUTPUT (required)")
            user_input = gr.Textbox(label="USER INPUT (required)")
            gold_answer = gr.Textbox(label="GOLD ANSWER")
            retrieved_context = gr.Textbox(label="RETRIEVED CONTEXT")
            pass_criteria = gr.Textbox(label="Pass Criteria (required)")
            rubric = gr.Textbox(label="Rubric")
            # Single source of truth for the input components. The order must
            # match both the positional parameters of model_call and the tuple
            # returned by select_template.
            input_components = [
                model_output,
                user_input,
                gold_answer,
                retrieved_context,
                pass_criteria,
                rubric,
            ]
            with gr.Row():
                clear_btn = gr.ClearButton(input_components)
                submit_button = gr.Button("Submit", variant="primary")
        with gr.Column(scale=1):
            gr.Markdown("<div style='color: #fff !important; font-weight: 600'>GLIDER Output</div>")
            score = gr.Textbox(label="Score")
            reasoning = gr.Textbox(label="Reasoning")
            highlights = gr.Textbox(label="Highlights")
    gr.Markdown(" ")
    gr.Markdown(EXAMPLES_HEADER)
    with gr.Row():
        with gr.Column():
            # One button per example; clicking it copies that example's values
            # into the input textboxes.
            for example in EXAMPLES:
                template_btn = gr.Button(
                    f"{example['emoji']} {example['model_output']}",
                    elem_classes="example-button",
                )
                template_btn.click(
                    fn=select_template,
                    inputs=[gr.State(example)],
                    outputs=input_components,
                )
    submit_button.click(
        fn=model_call,
        inputs=input_components,
        outputs=[score, reasoning, highlights],
    )

demo.launch()
|