"""Batch evaluation page: sample dialogues from the dataset, generate a
summary for each with the selected model, and report ROUGE scores."""

import random

import gradio as gr
import numpy as np
from dotenv import load_dotenv

from utils.model import Model
from utils.data import dataset
from utils.metric import metric_rouge_score
from pages.summarization_playground import generate_answer

load_dotenv()
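
# For reference, a minimal sketch of what utils.metric.metric_rouge_score
# might look like, assuming it wraps Google's rouge_score package; the real
# implementation lives in utils/metric.py and may differ.
#
#   from rouge_score import rouge_scorer
#
#   def metric_rouge_score(prediction, reference):
#       scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
#       return scorer.score(reference, prediction)['rougeL'].fmeasure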

def display_results(response_list):
    """Render per-sample results plus the mean ROUGE score as collapsible HTML."""
    overall_score = np.mean([r['metric_score']['rouge_score'] for r in response_list])
    
    html_output = f"<h2>Overall Score: {overall_score:.2f}</h2>"
    
    for i, item in enumerate(response_list, 1):
        dialogue = item['dialogue']
        summary = item['summary']
        response = item['response']
        rouge_score = item['metric_score']['rouge_score']
        
        html_output += f"""
        <details>
        <summary>Response {i} (Rouge Score: {rouge_score:.2f})</summary>
        <div style="display: flex; justify-content: space-between;">
            <div style="width: 30%;">
                <h3>Dialogue</h3>
                {dialogue}
            </div>
            <div style="width: 30%;">
                <h3>Summary</h3>
                {summary}
            </div>
            <div style="width: 30%;">
                <h3>Response</h3>
                {response}
            </div>
        </div>
        </details>
        """
    
    return html_output
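
# Example input shape for display_results (illustrative values only; this is
# the structure that process() below actually builds):
#
#   display_results([{
#       'dialogue': 'A: Hi!  B: Hello!',
#       'summary': 'A greets B.',
#       'response': 'A and B exchange greetings.',
#       'metric_score': {'rouge_score': 0.67},
#   }])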

def process(seed, model_selection, prompt, num=10):
    """Sample `num` dialogues, summarize each with the chosen model, and score with ROUGE."""
    random.seed(seed)
    response_list = []

    # Sample indices without replacement so no dialogue is evaluated twice
    # (random.choices sampled with replacement and could return duplicates).
    for idx in random.sample(range(len(dataset)), k=num):
        data = dataset[idx]
        dialogue = data['dialogue']
        summary = data['summary']
        response = generate_answer(dialogue, model_selection, prompt)

        rouge_score = metric_rouge_score(response, summary)

        response_list.append(
            {
                'dialogue': dialogue,
                'summary': summary,
                'response': response,
                'metric_score': {
                    'rouge_score': rouge_score
                }
            }
        )

    return display_results(response_list)
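
# Illustrative direct call (hypothetical values; the UI normally drives this):
#
#   html = process(seed=8,
#                  model_selection=Model.__model_list__[0],
#                  prompt="Summarize the following dialogue")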


def create_batch_evaluation_interface():
    with gr.Blocks() as demo:
        gr.Markdown("## Here are evaluation setups. It will randomly sample 10 data points to generate and evaluate. Show results once finished.")
        
        with gr.Row():
            seed = gr.Number(value=8, label="Random seed", info="pick your favorite random seed", precision=0)
            model_dropdown = gr.Dropdown(choices=Model.__model_list__, label="Choose a model", value=Model.__model_list__[0])
        template_text = gr.Textbox(value="Summarize the following dialogue", label='Input Prompting Template', lines=8, placeholder='Enter your prompt')
        submit_button = gr.Button("✨ Submit ✨")
        output = gr.HTML(label="Results")

        submit_button.click(
            process,
            inputs=[seed, model_dropdown, template_text],
            outputs=output
        )

    return demo

if __name__ == "__main__":
    demo = create_batch_evaluation_interface()
    demo.launch()