BounharAbdelaziz committed
Commit e9a40a3 · verified · 1 Parent(s): 5f36137

Update human_eval.py

Files changed (1)
  1. human_eval.py +203 -203
human_eval.py CHANGED
@@ -1,204 +1,204 @@
- import gradio as gr
- from collections import defaultdict
- import os
- import base64
- import torch
- from datasets import (
-     Dataset,
-     load_dataset,
- )
- import random
- import pandas as pd
- from collections import defaultdict
-
- def encode_image_to_base64(image_path):
-     """Encode an image or GIF file to base64."""
-     with open(image_path, "rb") as file:
-         encoded_string = base64.b64encode(file.read()).decode()
-     return encoded_string
-
- def create_html_media(media_path, is_gif=False):
-     """Create HTML for displaying an image or GIF."""
-     media_base64 = encode_image_to_base64(media_path)
-     media_type = "gif" if is_gif else "jpeg"
-
-     html_string = f"""
-     <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
-         <div style="max-width: 450px; margin: auto;">
-             <img src="data:image/{media_type};base64,{media_base64}"
-                  style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
-                  alt="Displayed Media">
-         </div>
-     </div>
-     """
-     return html_string
-
- class LMBattleArena:
-     def __init__(self, dataset_path):
-         """Initialize battle arena with dataset"""
-         self.df = pd.read_csv(dataset_path)
-         print(self.df.head())
-         self.current_index = 0
-         self.saving_freq = 10  # save the results in csv/push to hub every 10 evaluations
-         self.evaluation_results = []
-         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
-
-     def get_next_battle_pair(self):
-         """Retrieve next pair of summaries for comparison"""
-         if self.current_index >= len(self.df):
-             return None
-
-         row = self.df.iloc[self.current_index]
-         model_summary_cols = [
-             col
-             for col in row.index
-             if col.upper() != 'PROMPT'
-         ]
-         selected_models = random.sample(model_summary_cols, 2)
-         battle_data = {
-             'prompt': row['prompt'],
-             'model_1': row[selected_models[0]],
-             'model_2': row[selected_models[1]],
-             'model1_name': selected_models[0],
-             'model2_name': selected_models[1]
-         }
-         self.current_index += 1
-         return battle_data
-
-     def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
-         """Record user's model preference and update scores"""
-         self.model_scores[model1_name]['total_comparisons'] += 1
-         self.model_scores[model2_name]['total_comparisons'] += 1
-
-         if preferred_models == "Both Good":
-             self.model_scores[model1_name]['wins'] += 1
-             self.model_scores[model2_name]['wins'] += 1
-         elif preferred_models == "Model A":  # Maps to first model
-             self.model_scores[model1_name]['wins'] += 1
-         elif preferred_models == "Model B":  # Maps to second model
-             self.model_scores[model2_name]['wins'] += 1
-         # "Both Bad" case - no wins recorded
-
-         evaluation = {
-             'input_text': input_text,
-             'output1': output1,
-             'output2': output2,
-             'model1_name': model1_name,
-             'model2_name': model2_name,
-             'preferred_models': preferred_models
-         }
-         self.evaluation_results.append(evaluation)
-
-         return self.get_model_scores_df()
-
-     def get_model_scores_df(self):
-         """Convert model scores to DataFrame"""
-         scores_data = []
-         for model, stats in self.model_scores.items():
-             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
-             scores_data.append({
-                 'Model': model,
-                 'Wins': stats['wins'],
-                 'Total Comparisons': stats['total_comparisons'],
-                 'Win Rate (%)': round(win_rate, 2)
-             })
-         results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
-
-         # save the results in a huggingface dataset
-         if self.current_index % self.saving_freq == 0 and self.current_index > 0:
-             results_dataset = Dataset.from_pandas(results_df)
-             results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
-             results_df.to_csv('human_eval_results.csv')
-
-         return results_df
-
-
- def create_battle_arena(dataset_path, is_gif):
-     arena = LMBattleArena(dataset_path)
-
-     def battle_round():
-         battle_data = arena.get_next_battle_pair()
-
-         if battle_data is None:
-             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
-
-         return (
-             battle_data['prompt'],
-             battle_data['model_1'],
-             battle_data['model_2'],
-             battle_data['model1_name'],
-             battle_data['model2_name'],
-             gr.DataFrame(visible=True)
-         )
-
-     def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
-         scores_df = arena.record_evaluation(
-             preferred_models, input_text, output_1, output_2, model1_name, model2_name
-         )
-         next_battle = battle_round()
-         return (*next_battle[:-1], scores_df)
-
-     with gr.Blocks(css="footer{display:none !important}") as demo:
-
-         base_path = os.path.dirname(__file__)
-         local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
-         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
-
-         with gr.Tabs():
-             with gr.Tab("Battle Arena"):
-                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
-
-                 input_text = gr.Textbox(
-                     label="Input prompt",
-                     interactive=False,
-                 )
-
-                 with gr.Row():
-                     output_1 = gr.Textbox(
-                         label="Model A",
-                         interactive=False
-                     )
-                     model1_name = gr.State()  # Hidden state for model1 name
-
-                 with gr.Row():
-                     output_2 = gr.Textbox(
-                         label="Model B",
-                         interactive=False
-                     )
-                     model2_name = gr.State()  # Hidden state for model2 name
-
-                 preferred_models = gr.Radio(
-                     label="Which model is better?",
-                     choices=["Model A", "Model B", "Both Good", "Both Bad"]
-                 )
-                 submit_btn = gr.Button("Vote", variant="primary")
-
-                 scores_table = gr.DataFrame(
-                     headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
-                     label="🏆 Leaderboard"
-                 )
-
-                 submit_btn.click(
-                     submit_preference,
-                     inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
-                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
-                 )
-
-         demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
-
-     return demo
-
- if __name__ == "__main__":
-
-     # load the existing dataset that contains outputs of the LMs
-     human_eval_dataset = load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')
-
-     # precision
-     torch_dtype = torch.float16
-
-     # inference device
-     device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"
-     dataset_path = 'human_eval_dataset.csv'
-     is_gif = True
-     demo = create_battle_arena(dataset_path, is_gif)

+ import gradio as gr
+ from collections import defaultdict
+ import os
+ import base64
+ import torch
+ from datasets import (
+     Dataset,
+     load_dataset,
+ )
+ import random
+ import pandas as pd
+ from collections import defaultdict
+
+ def encode_image_to_base64(image_path):
+     """Encode an image or GIF file to base64."""
+     with open(image_path, "rb") as file:
+         encoded_string = base64.b64encode(file.read()).decode()
+     return encoded_string
+
+ def create_html_media(media_path, is_gif=False):
+     """Create HTML for displaying an image or GIF."""
+     media_base64 = encode_image_to_base64(media_path)
+     media_type = "gif" if is_gif else "jpeg"
+
+     html_string = f"""
+     <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
+         <div style="max-width: 450px; margin: auto;">
+             <img src="data:image/{media_type};base64,{media_base64}"
+                  style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
+                  alt="Displayed Media">
+         </div>
+     </div>
+     """
+     return html_string
+
+ class LMBattleArena:
+     def __init__(self, dataset_path):
+         """Initialize battle arena with dataset"""
+         self.df = pd.read_csv(dataset_path)
+         print(self.df.head())
+         self.current_index = 0
+         self.saving_freq = 10  # save the results in csv/push to hub every 10 evaluations
+         self.evaluation_results = []
+         self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
+
+     def get_next_battle_pair(self):
+         """Retrieve next pair of summaries for comparison"""
+         if self.current_index >= len(self.df):
+             return None
+
+         row = self.df.iloc[self.current_index]
+         model_summary_cols = [
+             col
+             for col in row.index
+             if col.upper() != 'PROMPT'
+         ]
+         selected_models = random.sample(model_summary_cols, 2)
+         battle_data = {
+             'prompt': row['prompt'],
+             'model_1': row[selected_models[0]],
+             'model_2': row[selected_models[1]],
+             'model1_name': selected_models[0],
+             'model2_name': selected_models[1]
+         }
+         self.current_index += 1
+         return battle_data
+
+     def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
+         """Record user's model preference and update scores"""
+         self.model_scores[model1_name]['total_comparisons'] += 1
+         self.model_scores[model2_name]['total_comparisons'] += 1
+
+         if preferred_models == "Both Good":
+             self.model_scores[model1_name]['wins'] += 1
+             self.model_scores[model2_name]['wins'] += 1
+         elif preferred_models == "Model A":  # Maps to first model
+             self.model_scores[model1_name]['wins'] += 1
+         elif preferred_models == "Model B":  # Maps to second model
+             self.model_scores[model2_name]['wins'] += 1
+         # "Both Bad" case - no wins recorded
+
+         evaluation = {
+             'input_text': input_text,
+             'output1': output1,
+             'output2': output2,
+             'model1_name': model1_name,
+             'model2_name': model2_name,
+             'preferred_models': preferred_models
+         }
+         self.evaluation_results.append(evaluation)
+
+         return self.get_model_scores_df()
+
+     def get_model_scores_df(self):
+         """Convert model scores to DataFrame"""
+         scores_data = []
+         for model, stats in self.model_scores.items():
+             win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
+             scores_data.append({
+                 'Model': model,
+                 'Wins': stats['wins'],
+                 'Total Comparisons': stats['total_comparisons'],
+                 'Win Rate (%)': round(win_rate, 2)
+             })
+         results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
+
+         # save the results in a huggingface dataset
+         if self.current_index % self.saving_freq == 0 and self.current_index > 0:
+             # results_dataset = Dataset.from_pandas(results_df)
+             # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
+             results_df.to_csv('human_eval_results.csv')
+
+         return results_df
+
+
+ def create_battle_arena(dataset_path, is_gif):
+     arena = LMBattleArena(dataset_path)
+
+     def battle_round():
+         battle_data = arena.get_next_battle_pair()
+
+         if battle_data is None:
+             return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
+
+         return (
+             battle_data['prompt'],
+             battle_data['model_1'],
+             battle_data['model_2'],
+             battle_data['model1_name'],
+             battle_data['model2_name'],
+             gr.DataFrame(visible=True)
+         )
+
+     def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
+         scores_df = arena.record_evaluation(
+             preferred_models, input_text, output_1, output_2, model1_name, model2_name
+         )
+         next_battle = battle_round()
+         return (*next_battle[:-1], scores_df)
+
+     with gr.Blocks(css="footer{display:none !important}") as demo:
+
+         base_path = os.path.dirname(__file__)
+         local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
+         gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
+
+         with gr.Tabs():
+             with gr.Tab("Battle Arena"):
+                 gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
+
+                 input_text = gr.Textbox(
+                     label="Input prompt",
+                     interactive=False,
+                 )
+
+                 with gr.Row():
+                     output_1 = gr.Textbox(
+                         label="Model A",
+                         interactive=False
+                     )
+                     model1_name = gr.State()  # Hidden state for model1 name
+
+                 with gr.Row():
+                     output_2 = gr.Textbox(
+                         label="Model B",
+                         interactive=False
+                     )
+                     model2_name = gr.State()  # Hidden state for model2 name
+
+                 preferred_models = gr.Radio(
+                     label="Which model is better?",
+                     choices=["Model A", "Model B", "Both Good", "Both Bad"]
+                 )
+                 submit_btn = gr.Button("Vote", variant="primary")
+
+                 scores_table = gr.DataFrame(
+                     headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
+                     label="🏆 Leaderboard"
+                 )
+
+                 submit_btn.click(
+                     submit_preference,
+                     inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
+                     outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
+                 )
+
+         demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
+
+     return demo
+
+ if __name__ == "__main__":
+
+     # load the existing dataset that contains outputs of the LMs
+     human_eval_dataset = load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')
+
+     # precision
+     torch_dtype = torch.float16
+
+     # inference device
+     device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"
+     dataset_path = 'human_eval_dataset.csv'
+     is_gif = True
+     demo = create_battle_arena(dataset_path, is_gif)
  demo.launch(debug=True)
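
Note on the change: the only functional edit in this commit is inside get_model_scores_df, where the periodic Hub upload (Dataset.from_pandas + push_to_hub every saving_freq evaluations) is commented out, so the leaderboard is now persisted only to human_eval_results.csv. If the Hub mirror is wanted again, a minimal sketch along these lines could work; the save_leaderboard helper, the PUSH_RESULTS_TO_HUB flag, and the HF_TOKEN lookup are illustrative assumptions, not part of the original script.

    import os

    import pandas as pd
    from datasets import Dataset

    def save_leaderboard(results_df: pd.DataFrame, csv_path: str = "human_eval_results.csv") -> None:
        """Save the leaderboard locally and, optionally, mirror it to the Hub.

        The push_to_hub call reproduces the two lines this commit comments out;
        the environment-variable gating is an assumption for illustration.
        """
        results_df.to_csv(csv_path)
        if os.getenv("PUSH_RESULTS_TO_HUB") == "1":  # hypothetical opt-in flag
            Dataset.from_pandas(results_df).push_to_hub(
                "atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas",
                private=True,
                token=os.getenv("HF_TOKEN"),  # assumes a write token is configured
            )

Gating the push behind an explicit flag would avoid failed uploads when the app runs without write credentials, while keeping the every-saving_freq cadence of the original code.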