Spaces:

root-signals
/

CustomJudgeDemo

Running

File size: 5,040 Bytes

8628f17
 
 
e333fa4
 
ebeb9b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3e0b87
ebeb9b4
8628f17
 
 
 
 
d3e0b87
8628f17
d3e0b87
 
 
8628f17
 
 
 
 
 
d3e0b87
 
 
8628f17
 
d3e0b87
8628f17
d3e0b87
8628f17
d3e0b87
 
 
8628f17
d3e0b87
8628f17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e984be7
8628f17
 
 
 
 
 
 
 
ebeb9b4
 
d3e0b87
ebeb9b4
d3e0b87
ebeb9b4
 
 
 
d3e0b87
ebeb9b4
 
 
 
 
 
 
 
 
8628f17
ebeb9b4
8628f17
d3e0b87
ebeb9b4
 
d3e0b87
 
ebeb9b4
 
 
 
 
8628f17
 
ebeb9b4
d3e0b87
8628f17
 
 
 
 
 
d3e0b87
8628f17
 
 
 
 
d3e0b87
8628f17
 
 
d3e0b87
8628f17

import gradio as gr
from root import RootSignals

# Process-wide state shared by the Gradio callbacks below.
# `client` holds the RootSignals SDK client once a judge has been requested;
# `custom_judge` holds the evaluator returned by `client.evaluators.create`.
# NOTE(review): being module-level, both are shared by every browser session
# served by this process.
client = None
custom_judge = None

# Model identifiers.
# NOTE(review): this list is not referenced anywhere in the visible code;
# `create_judge` hard-codes "gemini-2.0-flash", which is not listed here.
# Presumably left over from a model-picker dropdown -- confirm before removing.
MODELS = [
    "claude-3-5-sonnet",
    "claude-3-haiku-20240307",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "codestral",
    "command-r",
    "command-r-plus",
    "fireworks_ai/llama-v3-70b-instruct",
    "gpt-4",
    "gpt-4o",
    "gpt-4o-mini",
    "gpt-4-turbo",
    "groq/llama3-70b-8192",
    "mistral-large-latest",
    "mistral-medium",
    "o1-mini",
    "o1-preview",
    "open-codestral-mamba",
    "RootJudge",
]

def initialize_client(api_key: str) -> "RootSignals":
    """Build a Root Signals SDK client for the given API key.

    This function only constructs and returns the client; it does not touch
    any module-level state. Callers decide where to keep the returned object.

    Args:
        api_key: Root Signals API key, forwarded as ``RootSignals(api_key=...)``.

    Returns:
        The newly constructed ``RootSignals`` client.
    """
    # The original declared `global client` here without ever assigning it;
    # that dead declaration is dropped because it wrongly implied a side effect.
    return RootSignals(api_key=api_key)

def create_judge(api_key, judge_name, judge_prompt):
    """Create a custom evaluator ("judge") and store it for later evaluation.

    On success the module-level ``client`` and ``custom_judge`` are replaced
    with the freshly built client and the evaluator it created.

    Args:
        api_key: Root Signals API key entered in the UI. If falsy, nothing is
            created and the user is asked to enter a key.
        judge_name: Name given to the evaluator; also embedded in its intent.
        judge_prompt: Judge instructions. ``"\\n\\nTEXT: {{response}}"`` is
            appended to form the predicate (presumably the SDK's template
            variable for the evaluated text -- confirm against the SDK docs).

    Returns:
        The result of the ``gr.Info`` call used to notify the user; the call
        is made for its alert side effect.
    """
    global client, custom_judge
    if not api_key:
        return gr.Info("🔑 Please enter your Root Signals API key first!")

    # Always build the client from the key supplied with THIS request.
    # Reusing a previously cached client would keep sending the first key ever
    # entered, so a corrected key (or a different user's key) would be ignored.
    client = initialize_client(api_key)

    # Create custom judge
    custom_judge = client.evaluators.create(
        name=judge_name,
        # Quadruple braces in the f-string emit a literal "{{response}}".
        predicate=f'{judge_prompt}\n\nTEXT: {{{{response}}}}',
        intent=f"Intent: {judge_name}",
        model="gemini-2.0-flash",
    )

    return gr.Info(f"Your custom LLM-Judge '{judge_name}' is created successfully!")

def evaluate_response(api_key, llm_response):
    """Score a piece of text with the previously created custom judge.

    Args:
        api_key: Root Signals API key from the UI. Only checked for presence
            here; the evaluation itself goes through the stored judge.
        llm_response: Text to evaluate, passed to the judge as ``response``.

    Returns:
        A ``(score, justification)`` pair taken from the evaluation result.
        When the key is missing or no judge exists yet, the first element is
        the result of the ``gr.Info`` alert call and the second is ``""``.
    """
    # `client` and `custom_judge` are module-level and only read here.
    if not api_key:
        missing_key_notice = gr.Info("🔑 Please enter your Root Signals API key first!")
        return missing_key_notice, ""

    if not (client and custom_judge):
        no_judge_notice = gr.Info("Please create a judge first")
        return no_judge_notice, ""

    # Run evaluation using custom judge
    outcome = custom_judge.run(response=llm_response)
    return outcome.score, outcome.justification

# Build the Gradio UI: API-key entry, judge creation form, and an
# evaluate/results pair of columns, then wire the two buttons to the callbacks.
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
    # Visitor counter badge.
    # NOTE(review): the badge path names "RootEvaluatorsDemo" while this app is
    # titled "Custom Judge Demo" -- looks copied from another Space; confirm
    # whether visits should be counted under this Space's own path.
    gr.HTML("""<a href="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/root-signals/RootEvaluatorsDemo">
               <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/root-signals/RootEvaluatorsDemo" />
               </a>""")

    with gr.Row():
        gr.Image(value="https://app.rootsignals.ai/images/root-signals-color.svg", height=70)
        gr.Markdown("<div>&nbsp;</div>")  # Add some space below the image
    gr.Markdown("# Custom Judge Demo by Root Signals")

    gr.Markdown("[Sign-up](https://app.rootsignals.ai/register) to create your API key or [create a temporary one](https://app.rootsignals.ai/demo-user)!")

    api_key = gr.Textbox(
        label="🔑 Root Signals API Key",
        placeholder="Enter your Root Signals API key...",
        type="password",
        show_label=True,
    )

    gr.Markdown("---")  # Divider

    gr.Markdown("## Create Custom Judge")
    with gr.Row():
        judge_name = gr.Textbox(label="👨‍⚖️ Judge Name", value="Medical Jargon Judge", placeholder="Enter a name for your custom judge...", interactive=True)
    with gr.Row():
        judge_prompt = gr.Textbox(
            label="📝 Custom Judge Prompt",
            placeholder="Enter the custom judge prompt...",
            value="Evaluate the medical jargon use of a text. Higher scores mean the text include a lot of technical jargon such as drug names and very specific medical terminology.",
            interactive=True,
            lines=5,
            max_lines=10
        )
        create_judge_btn = gr.Button("✨ CREATE JUDGE", variant="primary")
    # The original created `info_message = gr.Info()` here and used it as the
    # click output. `gr.Info` is an alert helper, not a layout component, so
    # that call only emitted a contentless info message at build time.
    # `create_judge` reports to the user by calling `gr.Info` itself, so the
    # click handler needs no output component.

    gr.Markdown("---")  # Divider

    with gr.Row():
        # Left column - Evaluation
        with gr.Column():
            gr.Markdown("## Execute")
            llm_response = gr.Textbox(
                label="🤖 LLM Response",
                placeholder="Enter the LLM response to be evaluated...",
                value="This CCR5 co-receptor is used by almost all primary HIV-1 isolates regardless of viral genetic subtype.",
                interactive=True,
                lines=5,
                max_lines=10
            )
            evaluate_btn = gr.Button("🧐 EVALUATE", variant="primary", visible=True)

        # Right column - Results
        with gr.Column():
            gr.Markdown("## Results")
            score = gr.Textbox(label="📊 Score (between 0 and 1)", interactive=False)
            justification = gr.TextArea(label="💬 Justification", interactive=False)

    # Button click events
    create_judge_btn.click(
        fn=create_judge,
        inputs=[api_key, judge_name, judge_prompt],
        outputs=None
    )

    evaluate_btn.click(
        fn=evaluate_response,
        inputs=[api_key, llm_response],
        outputs=[score, justification]
    )

    # NOTE(review): the "Github Repo" link below points at the same URL as the
    # "Python SDK Docs" link -- confirm the intended repository URL.
    gr.Markdown("[🌐 Homepage](https://www.rootsignals.ai/) | [🤖 Github Repo](https://sdk.rootsignals.ai/en/latest/) | [🐍 Python SDK Docs](https://sdk.rootsignals.ai/en/latest/) | [💬 Discord](https://discord.gg/EhazTQsFnj)")

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()