agh123 committed
Commit 0202f73 · Parent(s): 98965db

feat: add TG and PP scores
src/components/visualizations.py CHANGED
@@ -9,6 +9,13 @@ from typing import Optional, Dict, List, Set
 import plotly.graph_objects as go
 
 
+def clean_device_id(device_id: str) -> str:
+    """Extract clean device name from normalized ID by removing platform prefix"""
+    if device_id.startswith("iOS/"):
+        return device_id[4:]  # Remove "iOS/"
+    return device_id
+
+
 def create_performance_plot(
     df: pd.DataFrame, metric: str, title: str, hover_data: List[str] = None
 ):
@@ -106,13 +113,13 @@ def filter_dataframe(df: pd.DataFrame, filters: Dict) -> pd.DataFrame:
     return filtered_df
 
 
-def create_model_size_performance_plot(df: pd.DataFrame, device: str, title: str):
+def create_model_size_performance_plot(df: pd.DataFrame, device_id: str, title: str):
     """Create a plot showing model size vs performance metrics for a specific device"""
     if df.empty:
         return None
 
     # Filter for the selected device
-    device_df = df[df["Device"] == device].copy()
+    device_df = df[df["Normalized Device ID"] == device_id].copy()
     if device_df.empty:
         return None
 
@@ -237,14 +244,24 @@ def render_model_size_performance(df: pd.DataFrame, filters: Dict):
         return
 
     # Get the device with highest performance score
-    top_device = size_perf_df.loc[size_perf_df["performance_score"].idxmax()]["Device"]
-    devices = sorted(size_perf_df["Device"].unique())
-    default_index = devices.index(top_device)
+    top_device_id = size_perf_df.loc[size_perf_df["performance_score"].idxmax()][
+        "Normalized Device ID"
+    ]
+    device_ids = sorted(size_perf_df["Normalized Device ID"].unique())
+    default_index = device_ids.index(top_device_id)
+
+    # Create mapping of normalized IDs to display names
+    device_display_names = {
+        device_id: clean_device_id(device_id) for device_id in device_ids
+    }
 
     # Device selector for size vs performance plots
-    selected_device = st.selectbox(
+    selected_device_id = st.selectbox(
         "Select Device",
-        options=devices,
+        options=device_ids,
+        format_func=lambda x: device_display_names[
+            x
+        ],  # Display clean names in dropdown
         help="Select a device to view its performance across different model sizes",
         key="size_perf_device_selector",
         placeholder="Search for a device...",
@@ -254,8 +271,8 @@ def render_model_size_performance(df: pd.DataFrame, filters: Dict):
     # Create and display the model size vs performance plot
     size_perf_fig = create_model_size_performance_plot(
         size_perf_df,
-        selected_device,
-        f"Model Size vs Performance Metrics for {selected_device}",
+        selected_device_id,
+        f"Model Size vs Performance Metrics for {device_display_names[selected_device_id]}",
     )
 
     if size_perf_fig:
@@ -489,12 +506,6 @@ def render_device_rankings(df: pd.DataFrame):
         st.warning("No data available for device rankings.")
         return
 
-    def clean_device_id(device_id: str) -> str:
-        """Extract clean device name from normalized ID by removing platform prefix"""
-        if device_id.startswith("iOS/"):
-            return device_id[4:]  # Remove "iOS/"
-        return device_id
-
     # Create device summary
     device_summary = (
         df.groupby(["Normalized Device ID", "Platform"])
@@ -502,8 +513,8 @@ def render_device_rankings(df: pd.DataFrame):
             {
                 "performance_score": "max",  # Best score achieved
                 "Model Size": ["min", "max"],  # Size range
-                "Token Generation": "max",  # Best token generation speed
-                "Prompt Processing": "max",  # Best prompt processing speed
+                "tg_score": "max",  # Use normalized TG score
+                "pp_score": "max",  # Use normalized PP score
                 "Model ID": lambda x: ", ".join(sorted(set(x))),  # All models tested
                 "quant_factor": lambda x: sorted(set(x)),  # Quantization levels tested
             }
@@ -518,8 +529,8 @@ def render_device_rankings(df: pd.DataFrame):
         "Best Score",
         "Min Model Size",
         "Max Model Size",
-        "Best TG Speed",
-        "Best PP Speed",
+        "TG Score",
+        "PP Score",
         "Tested Models",
         "Tested Quantizations",
     ]
@@ -545,19 +556,20 @@ def render_device_rankings(df: pd.DataFrame):
     # Format the display columns
     display_df = overall_rankings.copy()
     display_df["Best Score"] = display_df["Best Score"].round(2)
-    display_df["Best TG Speed"] = display_df["Best TG Speed"].round(2)
-    display_df["Best PP Speed"] = display_df["Best PP Speed"].round(2)
+    display_df["TG Score"] = display_df["TG Score"].round(2)
+    display_df["PP Score"] = display_df["PP Score"].round(2)
+
     display_df["Model Size Range"] = display_df.apply(
         lambda x: f"{x['Min Model Size']:.1f}B - {x['Max Model Size']:.1f}B", axis=1
     )
 
     # Select and reorder columns for display
     display_cols = [
-        "Device",
+        "Device",  # Use clean device name for display
         "Platform",
         "Best Score",
-        "Best TG Speed",
-        "Best PP Speed",
+        "TG Score",
+        "PP Score",
         "Model Size Range",
     ]
 
@@ -580,14 +592,14 @@ def render_device_rankings(df: pd.DataFrame):
             "Best Score": st.column_config.NumberColumn(
                 "Score", help="Overall performance score (0-100)", format="%.2f"
             ),
-            "Best TG Speed": st.column_config.NumberColumn(
-                "Best TG Speed (t/s)",
-                help="Best token generation speed",
+            "TG Score": st.column_config.NumberColumn(
+                "TG Score",
+                help="Normalized Token Generation score (0-100)",
                 format="%.2f",
             ),
-            "Best PP Speed": st.column_config.NumberColumn(
-                "Best PP Speed (t/s)",
-                help="Best prompt processing speed",
+            "PP Score": st.column_config.NumberColumn(
+                "PP Score",
+                help="Normalized Prompt Processing score (0-100)",
                 format="%.2f",
             ),
         },
@@ -620,8 +632,8 @@ def render_device_rankings(df: pd.DataFrame):
         .agg(
             {
                 "performance_score": ["max", "mean"],
-                "Token Generation": "max",
-                "Prompt Processing": "max",
+                "tg_score": "max",  # Use normalized scores
+                "pp_score": "max",  # Use normalized scores
                 "Model ID": lambda x: ", ".join(sorted(set(x))),
             }
         )
@@ -635,8 +647,8 @@ def render_device_rankings(df: pd.DataFrame):
         "Size Category",
         "Best Score",
         "Avg Score",
-        "Best TG Speed",
-        "Best PP Speed",
+        "TG Score",
+        "PP Score",
         "Models",
     ]
 
@@ -657,16 +669,16 @@ def render_device_rankings(df: pd.DataFrame):
     # Format scores
     cat_data["Best Score"] = cat_data["Best Score"].round(2)
     cat_data["Avg Score"] = cat_data["Avg Score"].round(2)
-    cat_data["Best TG Speed"] = cat_data["Best TG Speed"].round(2)
-    cat_data["Best PP Speed"] = cat_data["Best PP Speed"].round(2)
+    cat_data["TG Score"] = cat_data["TG Score"].round(2)
+    cat_data["PP Score"] = cat_data["PP Score"].round(2)
 
     display_cols = [
-        "Device",
+        "Device",  # Use clean device name for display
         "Platform",
         "Best Score",
         "Avg Score",
-        "Best TG Speed",
-        "Best PP Speed",
+        "TG Score",
+        "PP Score",
     ]
 
     st.dataframe(
@@ -693,14 +705,14 @@ def render_device_rankings(df: pd.DataFrame):
             "Avg Score": st.column_config.NumberColumn(
                 "Avg Score", help="Average performance score", format="%.2f"
            ),
-            "Best TG Speed": st.column_config.NumberColumn(
-                "Best TG (t/s)",
-                help="Best token generation speed",
+            "TG Score": st.column_config.NumberColumn(
+                "TG Score",
+                help="Normalized Token Generation score (0-100)",
                 format="%.2f",
             ),
-            "Best PP Speed": st.column_config.NumberColumn(
-                "Best PP (t/s)",
-                help="Best prompt processing speed",
+            "PP Score": st.column_config.NumberColumn(
+                "PP Score",
+                help="Normalized Prompt Processing score (0-100)",
                 format="%.2f",
             ),
         },
@@ -731,8 +743,8 @@ def render_device_rankings(df: pd.DataFrame):
         .agg(
             {
                 "performance_score": ["max", "mean"],
-                "Token Generation": "max",
-                "Prompt Processing": "max",
+                "tg_score": "max",
+                "pp_score": "max",
                 "Model ID": lambda x: ", ".join(sorted(set(x))),
             }
         )
@@ -746,8 +758,8 @@ def render_device_rankings(df: pd.DataFrame):
         "Quant Factor",
         "Best Score",
         "Avg Score",
-        "Best TG Speed",
-        "Best PP Speed",
+        "TG Score",
+        "PP Score",
         "Models",
     ]
 
@@ -771,16 +783,16 @@ def render_device_rankings(df: pd.DataFrame):
     # Format scores
     quant_data["Best Score"] = quant_data["Best Score"].round(2)
     quant_data["Avg Score"] = quant_data["Avg Score"].round(2)
-    quant_data["Best TG Speed"] = quant_data["Best TG Speed"].round(2)
-    quant_data["Best PP Speed"] = quant_data["Best PP Speed"].round(2)
+    quant_data["TG Score"] = quant_data["TG Score"].round(2)
+    quant_data["PP Score"] = quant_data["PP Score"].round(2)
 
     display_cols = [
         "Device",
         "Platform",
         "Best Score",
         "Avg Score",
-        "Best TG Speed",
-        "Best PP Speed",
+        "TG Score",
+        "PP Score",
     ]
 
     st.dataframe(
@@ -807,14 +819,14 @@ def render_device_rankings(df: pd.DataFrame):
             "Avg Score": st.column_config.NumberColumn(
                 "Avg Score", help="Average performance score", format="%.2f"
             ),
-            "Best TG Speed": st.column_config.NumberColumn(
-                "Best TG (t/s)",
-                help="Best token generation speed",
+            "TG Score": st.column_config.NumberColumn(
+                "TG Score",
+                help="Normalized Token Generation score (0-100)",
                 format="%.2f",
            ),
-            "Best PP Speed": st.column_config.NumberColumn(
-                "Best PP (t/s)",
-                help="Best prompt processing speed",
+            "PP Score": st.column_config.NumberColumn(
+                "PP Score",
+                help="Normalized Prompt Processing score (0-100)",
                 format="%.2f",
             ),
         },
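For reference, a minimal sketch of how the new display-name mapping behaves outside Streamlit. `clean_device_id` is copied from the hunk above; the device IDs below are hypothetical examples, not data from this repo. In the app, `st.selectbox` keeps the normalized ID as the option value and only `format_func` changes what the user sees in the dropdown.

```python
def clean_device_id(device_id: str) -> str:
    """Extract clean device name from normalized ID by removing platform prefix"""
    if device_id.startswith("iOS/"):
        return device_id[4:]  # Remove "iOS/"
    return device_id


# Hypothetical normalized IDs; only the "iOS/" prefix is stripped for display.
device_ids = sorted(["iOS/iPhone15,2", "Android/Pixel 8"])
device_display_names = {d: clean_device_id(d) for d in device_ids}
print(device_display_names)
# {'Android/Pixel 8': 'Android/Pixel 8', 'iOS/iPhone15,2': 'iPhone15,2'}
```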
src/core/scoring.py CHANGED
@@ -110,6 +110,8 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
         )
         # Return original dataframe with zero scores to avoid breaking the app
         df["performance_score"] = 0
+        df["tg_score"] = 0
+        df["pp_score"] = 0
         df["quant_factor"] = df["Model ID"].apply(
             lambda x: get_quantization_tier(x, std)
         )
@@ -124,22 +126,30 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
         lambda x: get_quantization_tier(x, std)
     )
 
-    # Combined performance score using model size as direct multiplier
-    standard_df["performance_score"] = (
-        (
-            standard_df["normalized_tg"] * std.TG_WEIGHT
-            + standard_df["normalized_pp"] * std.PP_WEIGHT
-        )
+    # Calculate individual TG and PP scores
+    standard_df["tg_score"] = (
+        standard_df["normalized_tg"]
         * standard_df["Model Size"]  # Direct size multiplier
         * standard_df["quant_factor"]  # Apply quantization penalty
     )
 
-    # Normalize final score to 0-100 range
-    max_score = standard_df["performance_score"].max()
-    if max_score > 0:
-        standard_df["performance_score"] = 100 * (
-            standard_df["performance_score"] / max_score
-        )
+    standard_df["pp_score"] = (
+        standard_df["normalized_pp"]
+        * standard_df["Model Size"]  # Direct size multiplier
+        * standard_df["quant_factor"]  # Apply quantization penalty
+    )
+
+    # Combined performance score using weighted TG and PP scores
+    standard_df["performance_score"] = (
+        standard_df["tg_score"] * std.TG_WEIGHT
+        + standard_df["pp_score"] * std.PP_WEIGHT
+    )
+
+    # Normalize all scores to 0-100 range
+    for score_col in ["performance_score", "tg_score", "pp_score"]:
+        max_score = standard_df[score_col].max()
+        if max_score > 0:
+            standard_df[score_col] = 100 * (standard_df[score_col] / max_score)
 
     # Merge scores back into original dataframe
     df = df.merge(
@@ -149,6 +159,8 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
             "Platform",
             "Model ID",
             "performance_score",
+            "tg_score",
+            "pp_score",
             "quant_factor",
         ]
     ],
@@ -158,6 +170,8 @@ def calculate_performance_score(df: pd.DataFrame) -> pd.DataFrame:
 
     # Fill missing scores with 0
     df["performance_score"] = df["performance_score"].fillna(0)
+    df["tg_score"] = df["tg_score"].fillna(0)
+    df["pp_score"] = df["pp_score"].fillna(0)
 
     return df
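A small worked example of the new scoring columns, using toy numbers. The `TG_WEIGHT`/`PP_WEIGHT` values and the input rows below are assumptions for illustration only (the real weights live on the `std` config object, which this commit does not show); the arithmetic mirrors the hunk above: per-metric scores scaled by model size and quantization factor, a weighted sum for the combined score, then each column normalized to 0-100 against its own maximum.

```python
import pandas as pd

TG_WEIGHT, PP_WEIGHT = 0.6, 0.4  # assumed values for illustration only

# Hypothetical benchmark rows with already-normalized TG/PP throughput
standard_df = pd.DataFrame(
    {
        "normalized_tg": [0.8, 0.5],
        "normalized_pp": [0.9, 0.4],
        "Model Size": [3.0, 7.0],    # billions of parameters
        "quant_factor": [0.9, 1.0],  # quantization penalty
    }
)

# Individual TG and PP scores, scaled by size and quantization
standard_df["tg_score"] = (
    standard_df["normalized_tg"] * standard_df["Model Size"] * standard_df["quant_factor"]
)
standard_df["pp_score"] = (
    standard_df["normalized_pp"] * standard_df["Model Size"] * standard_df["quant_factor"]
)

# Combined score as a weighted sum of the two
standard_df["performance_score"] = (
    standard_df["tg_score"] * TG_WEIGHT + standard_df["pp_score"] * PP_WEIGHT
)

# Normalize each score column to 0-100, as in calculate_performance_score
for col in ["performance_score", "tg_score", "pp_score"]:
    max_score = standard_df[col].max()
    if max_score > 0:
        standard_df[col] = 100 * standard_df[col] / max_score

print(standard_df[["tg_score", "pp_score", "performance_score"]].round(2))
```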