feat: add TG and PP scores

Files changed:
- src/components/visualizations.py  (+71, -59)
- src/core/scoring.py  (+26, -12)
src/components/visualizations.py
CHANGED
@@ -9,6 +9,13 @@ from typing import Optional, Dict, List, Set
 import plotly.graph_objects as go
 
 
+def clean_device_id(device_id: str) -> str:
+    """Extract clean device name from normalized ID by removing platform prefix"""
+    if device_id.startswith("iOS/"):
+        return device_id[4:]  # Remove "iOS/"
+    return device_id
+
+
 def create_performance_plot(
     df: pd.DataFrame, metric: str, title: str, hover_data: List[str] = None
 ):
@@ -106,13 +113,13 @@ def filter_dataframe(df: pd.DataFrame, filters: Dict) -> pd.DataFrame:
     return filtered_df
 
 
-def create_model_size_performance_plot(df: pd.DataFrame,
+def create_model_size_performance_plot(df: pd.DataFrame, device_id: str, title: str):
     """Create a plot showing model size vs performance metrics for a specific device"""
     if df.empty:
         return None
 
     # Filter for the selected device
-    device_df = df[df["Device"] ==
+    device_df = df[df["Normalized Device ID"] == device_id].copy()
     if device_df.empty:
         return None
 
@@ -237,14 +244,24 @@ def render_model_size_performance(df: pd.DataFrame, filters: Dict):
         return
 
     # Get the device with highest performance score
-
-
-
+    top_device_id = size_perf_df.loc[size_perf_df["performance_score"].idxmax()][
+        "Normalized Device ID"
+    ]
+    device_ids = sorted(size_perf_df["Normalized Device ID"].unique())
+    default_index = device_ids.index(top_device_id)
+
+    # Create mapping of normalized IDs to display names
+    device_display_names = {
+        device_id: clean_device_id(device_id) for device_id in device_ids
+    }
 
     # Device selector for size vs performance plots
-
+    selected_device_id = st.selectbox(
         "Select Device",
-        options=
+        options=device_ids,
+        format_func=lambda x: device_display_names[
+            x
+        ],  # Display clean names in dropdown
         help="Select a device to view its performance across different model sizes",
         key="size_perf_device_selector",
         placeholder="Search for a device...",
@@ -254,8 +271,8 @@ def render_model_size_performance(df: pd.DataFrame, filters: Dict):
     # Create and display the model size vs performance plot
     size_perf_fig = create_model_size_performance_plot(
         size_perf_df,
-
-        f"Model Size vs Performance Metrics for {
+        selected_device_id,
+        f"Model Size vs Performance Metrics for {device_display_names[selected_device_id]}",
     )
 
     if size_perf_fig:
@@ -489,12 +506,6 @@ def render_device_rankings(df: pd.DataFrame):
         st.warning("No data available for device rankings.")
         return
 
-    def clean_device_id(device_id: str) -> str:
-        """Extract clean device name from normalized ID by removing platform prefix"""
-        if device_id.startswith("iOS/"):
-            return device_id[4:]  # Remove "iOS/"
-        return device_id
-
     # Create device summary
     device_summary = (
         df.groupby(["Normalized Device ID", "Platform"])
@@ -502,8 +513,8 @@ def render_device_rankings(df: pd.DataFrame):
             {
                 "performance_score": "max",  # Best score achieved
                 "Model Size": ["min", "max"],  # Size range
-                "
-                "
+                "tg_score": "max",  # Use normalized TG score
+                "pp_score": "max",  # Use normalized PP score
                 "Model ID": lambda x: ", ".join(sorted(set(x))),  # All models tested
                 "quant_factor": lambda x: sorted(set(x)),  # Quantization levels tested
             }
@@ -518,8 +529,8 @@ def render_device_rankings(df: pd.DataFrame):
         "Best Score",
         "Min Model Size",
         "Max Model Size",
-        "
-        "
+        "TG Score",
+        "PP Score",
         "Tested Models",
         "Tested Quantizations",
     ]
@@ -545,19 +556,20 @@ def render_device_rankings(df: pd.DataFrame):
     # Format the display columns
     display_df = overall_rankings.copy()
     display_df["Best Score"] = display_df["Best Score"].round(2)
-    display_df["
-    display_df["
+    display_df["TG Score"] = display_df["TG Score"].round(2)
+    display_df["PP Score"] = display_df["PP Score"].round(2)
+
     display_df["Model Size Range"] = display_df.apply(
         lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1
     )
 
     # Select and reorder columns for display
     display_cols = [
-        "Device",
+        "Device",  # Use clean device name for display
         "Platform",
         "Best Score",
-        "
-        "
+        "TG Score",
+        "PP Score",
         "Model Size Range",
     ]
 
@@ -580,14 +592,14 @@ def render_device_rankings(df: pd.DataFrame):
             "Best Score": st.column_config.NumberColumn(
                 "Score", help="Overall performance score (0-100)", format="%.2f"
            ),
-            "
-                "
-                help="
+            "TG Score": st.column_config.NumberColumn(
+                "TG Score",
+                help="Normalized Token Generation score (0-100)",
                 format="%.2f",
             ),
-            "
-                "
-                help="
+            "PP Score": st.column_config.NumberColumn(
+                "PP Score",
+                help="Normalized Prompt Processing score (0-100)",
                 format="%.2f",
             ),
         },
@@ -620,8 +632,8 @@ def render_device_rankings(df: pd.DataFrame):
         .agg(
             {
                 "performance_score": ["max", "mean"],
-                "
-                "
+                "tg_score": "max",  # Use normalized scores
+                "pp_score": "max",  # Use normalized scores
                 "Model ID": lambda x: ", ".join(sorted(set(x))),
             }
         )
@@ -635,8 +647,8 @@ def render_device_rankings(df: pd.DataFrame):
         "Size Category",
         "Best Score",
         "Avg Score",
-        "
-        "
+        "TG Score",
+        "PP Score",
         "Models",
     ]
 
@@ -657,16 +669,16 @@ def render_device_rankings(df: pd.DataFrame):
     # Format scores
     cat_data["Best Score"] = cat_data["Best Score"].round(2)
     cat_data["Avg Score"] = cat_data["Avg Score"].round(2)
-    cat_data["
-    cat_data["
+    cat_data["TG Score"] = cat_data["TG Score"].round(2)
+    cat_data["PP Score"] = cat_data["PP Score"].round(2)
 
     display_cols = [
-        "Device",
+        "Device",  # Use clean device name for display
         "Platform",
         "Best Score",
         "Avg Score",
-        "
-        "
+        "TG Score",
+        "PP Score",
     ]
 
     st.dataframe(
@@ -693,14 +705,14 @@ def render_device_rankings(df: pd.DataFrame):
             "Avg Score": st.column_config.NumberColumn(
                 "Avg Score", help="Average performance score", format="%.2f"
             ),
-            "
-                "
-                help="
+            "TG Score": st.column_config.NumberColumn(
+                "TG Score",
+                help="Normalized Token Generation score (0-100)",
                 format="%.2f",
             ),
-            "
-                "
-                help="
+            "PP Score": st.column_config.NumberColumn(
+                "PP Score",
+                help="Normalized Prompt Processing score (0-100)",
                 format="%.2f",
             ),
         },
@@ -731,8 +743,8 @@ def render_device_rankings(df: pd.DataFrame):
         .agg(
             {
                 "performance_score": ["max", "mean"],
-                "
-                "
+                "tg_score": "max",
+                "pp_score": "max",
                 "Model ID": lambda x: ", ".join(sorted(set(x))),
             }
         )
@@ -746,8 +758,8 @@ def render_device_rankings(df: pd.DataFrame):
         "Quant Factor",
         "Best Score",
         "Avg Score",
-        "
-        "
+        "TG Score",
+        "PP Score",
         "Models",
     ]
 
@@ -771,16 +783,16 @@ def render_device_rankings(df: pd.DataFrame):
     # Format scores
    quant_data["Best Score"] = quant_data["Best Score"].round(2)
    quant_data["Avg Score"] = quant_data["Avg Score"].round(2)
-    quant_data["
-    quant_data["
+    quant_data["TG Score"] = quant_data["TG Score"].round(2)
+    quant_data["PP Score"] = quant_data["PP Score"].round(2)
 
     display_cols = [
         "Device",
         "Platform",
         "Best Score",
         "Avg Score",
-        "
-        "
+        "TG Score",
+        "PP Score",
     ]
 
     st.dataframe(
@@ -807,14 +819,14 @@ def render_device_rankings(df: pd.DataFrame):
             "Avg Score": st.column_config.NumberColumn(
                 "Avg Score", help="Average performance score", format="%.2f"
             ),
-            "
-                "
-                help="
+            "TG Score": st.column_config.NumberColumn(
+                "TG Score",
+                help="Normalized Token Generation score (0-100)",
                 format="%.2f",
             ),
-            "
-                "
-                help="
+            "PP Score": st.column_config.NumberColumn(
+                "PP Score",
+                help="Normalized Prompt Processing score (0-100)",
                 format="%.2f",
             ),
         },
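For context, below is a small illustrative sketch (not part of this commit) of how clean_device_id and the device_display_names mapping behave; the sample device IDs and the standalone DataFrame are hypothetical stand-ins for size_perf_df, and only the "iOS/" prefix is stripped, exactly as in the function above.

    # Hypothetical usage sketch of clean_device_id / device_display_names
    import pandas as pd

    def clean_device_id(device_id: str) -> str:
        """Extract clean device name from normalized ID by removing platform prefix"""
        if device_id.startswith("iOS/"):
            return device_id[4:]  # Remove "iOS/"
        return device_id

    # Stand-in for size_perf_df (sample values, not real benchmark data)
    size_perf_df = pd.DataFrame(
        {
            "Normalized Device ID": ["iOS/iPhone15,2", "Android/Pixel 8"],
            "performance_score": [87.3, 91.0],
        }
    )

    device_ids = sorted(size_perf_df["Normalized Device ID"].unique())
    device_display_names = {d: clean_device_id(d) for d in device_ids}

    print(device_display_names)
    # {'Android/Pixel 8': 'Android/Pixel 8', 'iOS/iPhone15,2': 'iPhone15,2'}
    # In the app, st.selectbox(options=device_ids, format_func=lambda x: device_display_names[x])
    # shows the cleaned names in the dropdown while still returning the normalized ID.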
src/core/scoring.py
CHANGED
@@ -110,6 +110,8 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
         )
         # Return original dataframe with zero scores to avoid breaking the app
         df["performance_score"] = 0
+        df["tg_score"] = 0
+        df["pp_score"] = 0
         df["quant_factor"] = df["Model ID"].apply(
             lambda x: get_quantization_tier(x, std)
         )
@@ -124,22 +126,30 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
         lambda x: get_quantization_tier(x, std)
     )
 
-    #
-    standard_df["
-
-        standard_df["normalized_tg"] * std.TG_WEIGHT
-        + standard_df["normalized_pp"] * std.PP_WEIGHT
-        )
+    # Calculate individual TG and PP scores
+    standard_df["tg_score"] = (
+        standard_df["normalized_tg"]
         * standard_df["Model Size"]  # Direct size multiplier
         * standard_df["quant_factor"]  # Apply quantization penalty
     )
 
-
-
-
-    standard_df["
-
-
+    standard_df["pp_score"] = (
+        standard_df["normalized_pp"]
+        * standard_df["Model Size"]  # Direct size multiplier
+        * standard_df["quant_factor"]  # Apply quantization penalty
+    )
+
+    # Combined performance score using weighted TG and PP scores
+    standard_df["performance_score"] = (
+        standard_df["tg_score"] * std.TG_WEIGHT
+        + standard_df["pp_score"] * std.PP_WEIGHT
+    )
+
+    # Normalize all scores to 0-100 range
+    for score_col in ["performance_score", "tg_score", "pp_score"]:
+        max_score = standard_df[score_col].max()
+        if max_score > 0:
+            standard_df[score_col] = 100 * (standard_df[score_col] / max_score)
 
     # Merge scores back into original dataframe
     df = df.merge(
@@ -149,6 +159,8 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
                 "Platform",
                 "Model ID",
                 "performance_score",
+                "tg_score",
+                "pp_score",
                 "quant_factor",
             ]
         ],
@@ -158,6 +170,8 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
 
     # Fill missing scores with 0
     df["performance_score"] = df["performance_score"].fillna(0)
+    df["tg_score"] = df["tg_score"].fillna(0)
+    df["pp_score"] = df["pp_score"].fillna(0)
 
     return df
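A rough standalone sketch of the new scoring flow follows, assuming normalized_tg and normalized_pp have already been computed per row; the weight values are assumptions for illustration (the real TG_WEIGHT / PP_WEIGHT come from the std scoring configuration, which this diff does not show).

    # Illustrative sketch of the scoring math added in this commit (weights assumed)
    import pandas as pd

    TG_WEIGHT = 0.6  # assumed value, not taken from this diff
    PP_WEIGHT = 0.4  # assumed value, not taken from this diff

    # Stand-in for standard_df with normalized throughput metrics already computed
    standard_df = pd.DataFrame(
        {
            "normalized_tg": [0.8, 0.5],
            "normalized_pp": [0.7, 0.9],
            "Model Size": [7.0, 3.0],    # billions of parameters
            "quant_factor": [0.9, 1.0],  # quantization penalty
        }
    )

    # Per-metric scores: normalized metric scaled by model size and quantization
    standard_df["tg_score"] = (
        standard_df["normalized_tg"] * standard_df["Model Size"] * standard_df["quant_factor"]
    )
    standard_df["pp_score"] = (
        standard_df["normalized_pp"] * standard_df["Model Size"] * standard_df["quant_factor"]
    )

    # Combined score is the weighted sum of the two per-metric scores
    standard_df["performance_score"] = (
        standard_df["tg_score"] * TG_WEIGHT + standard_df["pp_score"] * PP_WEIGHT
    )

    # Each score is rescaled so the best row lands at 100
    for score_col in ["performance_score", "tg_score", "pp_score"]:
        max_score = standard_df[score_col].max()
        if max_score > 0:
            standard_df[score_col] = 100 * standard_df[score_col] / max_score

    print(standard_df[["tg_score", "pp_score", "performance_score"]].round(2))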