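"""
Model Benchmark Suite: a Gradio app that runs simulated benchmark tests for a
named model, aggregates timing and token metrics, and displays the results as
a text summary plus a detailed per-response table.
"""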
import gradio as gr
import time
import json
import pandas as pd
from typing import List, Dict, Any
class BenchmarkSystem:
    def __init__(self):
        self.results = {}

    def run_benchmark(self,
                      model_name: str,
                      test_cases: List[str],
                      system_prompt: str = "") -> Dict[str, Any]:
        """
        Run benchmark tests and measure performance metrics.

        Note: system_prompt is accepted for future use but is not applied by
        this simulated run.
        """
        results = {
            "model_name": model_name,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "total_tokens": 0,
            "total_time": 0,
            "responses": [],
            "metrics": {}
        }

        start_time = time.time()

        # Simulate processing test cases. Replace this loop body with actual
        # model inference for a real benchmark.
        for test in test_cases:
            time.sleep(0.5)  # Simulate processing time
            results["responses"].append({
                "input": test,
                "output": f"Sample response for: {test}",
                "tokens": len(test.split()),
                "time": 0.5
            })

        results["total_time"] = time.time() - start_time
        results["total_tokens"] = sum(r["tokens"] for r in results["responses"])

        # Calculate aggregate metrics
        results["metrics"] = {
            "avg_response_time": results["total_time"] / len(test_cases),
            "avg_tokens_per_response": results["total_tokens"] / len(test_cases)
        }

        self.results[model_name] = results
        return results
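
# A minimal sketch of using BenchmarkSystem programmatically, outside the UI;
# the model name and test cases below are illustrative placeholders:
#
#   bench = BenchmarkSystem()
#   results = bench.run_benchmark(
#       model_name="demo-model",
#       test_cases=["What is 2 + 2?", "Summarize this paragraph."],
#   )
#   print(format_results(results))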

def format_results(results: Dict[str, Any]) -> str:
    """Format benchmark results for display"""
    output = f"Model: {results['model_name']}\n"
    output += f"Timestamp: {results['timestamp']}\n"
    output += f"Total Time: {results['total_time']:.2f}s\n"
    output += f"Total Tokens: {results['total_tokens']}\n\n"
    output += "Metrics:\n"
    for metric, value in results["metrics"].items():
        output += f"- {metric}: {value:.2f}\n"
    return output


def save_results(results: Dict[str, Any], filename: str = "benchmark_results.json"):
    """Save benchmark results to a JSON file"""
    with open(filename, "w") as f:
        json.dump(results, f, indent=2)
    return f"Results saved to {filename}"
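
# A hypothetical follow-up step (not wired into the UI): reload the saved JSON
# for offline analysis or cross-model comparison, e.g. in a notebook.
#
#   with open("benchmark_results.json") as f:
#       saved = json.load(f)
#   responses_df = pd.DataFrame(saved["responses"])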

def run_benchmark_interface(model_name: str,
                            test_cases: str,
                            system_prompt: str) -> tuple[str, pd.DataFrame]:
    """
    Gradio interface function for running benchmarks
    """
    benchmark = BenchmarkSystem()

    # Parse test cases (one per line), dropping blank lines
    test_cases_list = [t.strip() for t in test_cases.split("\n") if t.strip()]
    if not test_cases_list:
        # Guard against empty input, which would otherwise cause a
        # division by zero when averaging the metrics
        return "No test cases provided. Enter at least one per line.", pd.DataFrame()

    # Run benchmark
    results = benchmark.run_benchmark(
        model_name=model_name,
        test_cases=test_cases_list,
        system_prompt=system_prompt
    )

    # Create DataFrame for response details
    df = pd.DataFrame([
        {
            "Input": r["input"],
            "Output": r["output"],
            "Tokens": r["tokens"],
            "Time (s)": r["time"]
        }
        for r in results["responses"]
    ])

    # Save results
    save_results(results)

    return format_results(results), df
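
# Quick sanity check without launching the UI (illustrative values only):
#
#   text, table = run_benchmark_interface(
#       model_name="demo-model",
#       test_cases="case one\ncase two",
#       system_prompt="",
#   )
#   print(text)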

# Create Gradio interface
with gr.Blocks(title="Model Benchmark Suite") as demo:
    gr.Markdown("# Model Benchmark Suite")
    gr.Markdown("Test and compare model performance across different scenarios")

    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="Enter model name or identifier"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Enter system prompt if applicable",
                lines=2
            )
            test_cases = gr.Textbox(
                label="Test Cases",
                placeholder="Enter test cases (one per line)",
                lines=5
            )
            run_button = gr.Button("Run Benchmark")

        with gr.Column():
            results_text = gr.Textbox(
                label="Benchmark Results",
                lines=10,
                interactive=False  # gr.Textbox has no `readonly` argument; use interactive=False
            )
            results_table = gr.DataFrame(
                label="Detailed Results",
                headers=["Input", "Output", "Tokens", "Time (s)"]
            )

    run_button.click(
        fn=run_benchmark_interface,
        inputs=[model_name, test_cases, system_prompt],
        outputs=[results_text, results_table]
    )

if __name__ == "__main__":
    demo.launch()