ash-98 committed · Commit b5117fc · 1 Parent(s): b8f4692

cybermetric80

Files changed (2)
  1. app.py +27 -13
  2. metric.csv +6 -0
app.py CHANGED
@@ -15,13 +15,18 @@ with st.sidebar:
     selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
 
     datasets_by_category = {
-        "Multiple Choice": ["secQA"],
+        "Multiple Choice": ["secQA","CyberMetric80"],
     }
     dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
 
     st.divider()
     st.header("Filters & Options")
-    dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    #dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    if dataset_choice == "secQA":
+        dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    else:
+        st.markdown("**Note:** Only CyberMetric80 has been evaluated")
+        dataset_version = "v1"
    # For filtering the leaderboard by model type
    # Note: The available model types will come from the CSV, once loaded.
    # We'll load the CSV later and then update this filter accordingly.
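This gating guarantees `dataset_version` is defined on every path before the leaderboard code reads it. A minimal standalone sketch of the same pattern, with illustrative labels rather than the app's full sidebar:

```python
import streamlit as st

# Minimal repro of the pattern above: the version radio is only rendered
# for datasets that actually ship multiple versions; everything else gets
# a pinned default so downstream version checks still work.
choice = st.selectbox("Select Dataset", ["secQA", "CyberMetric80"])
if choice == "secQA":
    version = st.radio("Select Dataset Version", ["v1", "v2"])
else:
    st.markdown("**Note:** Only CyberMetric80 has been evaluated")
    version = "v1"
st.write(f"Showing {choice} ({version})")
```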
@@ -69,8 +74,8 @@ random_accuracyv2 = estimate_random_accuracy(questionnairev2)
 # For now, if dataset_choice is "secQA", we use "Benchmark.csv"
 if dataset_choice == "secQA":
     file_path = "Benchmark.csv" # Ensure this file is uploaded in your Hugging Face Space
-else:
-    file_path = "Benchmark.csv" # Placeholder: update with actual file paths for future datasets
+elif dataset_choice == "CyberMetric80":
+    file_path = "metric.csv" # Placeholder: update with actual file paths for future datasets
 
 # Function to load and clean CSV data
 @st.cache_data
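As the Placeholder comment hints, each new dataset adds another elif arm here. A table-driven lookup is one possible refactor once more files land (a sketch, not the committed code; the dict name is made up):

```python
# Hypothetical mapping -- only the two files in this commit actually exist.
FILE_PATHS = {
    "secQA": "Benchmark.csv",
    "CyberMetric80": "metric.csv",
}
file_path = FILE_PATHS.get(dataset_choice, "Benchmark.csv")  # fall back to secQA's file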
@@ -91,8 +96,9 @@ def load_data(file_path):
 
     # Convert percentage strings to floats (e.g., "100%" → 1.0)
     for col in ["V1 Accuracy", "V2 Accuracy"]:
-        df[col] = df[col].astype(str).str.replace("%", "").str.strip()
-        df[col] = pd.to_numeric(df[col], errors='coerce') / 100
+        if col in df.columns:
+            df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+            df[col] = pd.to_numeric(df[col], errors='coerce') / 100
 
     return df
 
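The new `if col in df.columns` guard is what lets load_data serve both files, since metric.csv has no V2 column; values that fail to parse become NaN via `errors='coerce'` and are dropped later by `.dropna()`. A quick illustration on made-up values:

```python
import pandas as pd

# Made-up sample values, not taken from either CSV.
df = pd.DataFrame({"V1 Accuracy": ["100%", " 95.40% ", "n/a"]})
for col in ["V1 Accuracy", "V2 Accuracy"]:
    if col in df.columns:  # "V2 Accuracy" is skipped, as it is for metric.csv
        df[col] = df[col].astype(str).str.replace("%", "").str.strip()
        df[col] = pd.to_numeric(df[col], errors="coerce") / 100
print(df["V1 Accuracy"].tolist())  # [1.0, 0.954, nan]
```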
 
@@ -110,7 +116,12 @@ source_filter = source_filter_placeholder.multiselect(
 df_filtered = df[df["Type"].isin(source_filter)] if source_filter else df
 
 # Choose the correct metric version and compute Accuracy
-df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+#df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+if dataset_choice == "CyberMetric80":
+    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"]
+else:
+    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+
 df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna() # Drop rows with errors
 
 # Sort by Accuracy descending
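Since the sidebar change already pins `dataset_version = "v1"` whenever the choice is not secQA, both branches resolve to "V1 Accuracy" for CyberMetric80; the explicit branch mostly documents intent. An equivalent two-liner, as a sketch:

```python
# Relies on dataset_version being pinned to "v1" for CyberMetric80 in the sidebar.
version_col = "V1 Accuracy" if dataset_version == "v1" else "V2 Accuracy"
df_filtered["Accuracy"] = df_filtered[version_col]
```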
@@ -125,7 +136,10 @@ df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
 tab1, tab2 = st.tabs(["Leaderboard", "About"])
 
 with tab1:
-    st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+    if dataset_choice == "secQA":
+        st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+    elif dataset_choice == "CyberMetric80":
+        st.markdown("#### [View the CyberMetric Dataset](https://github.com/cybermetric/CyberMetric)")
 
     # Use columns to display leaderboard and model details side-by-side
     col1, col2 = st.columns([2, 1])
@@ -145,10 +159,10 @@ with tab1:
 
     st.divider()
     # Display the random baseline accuracy above the leaderboard
-    st.markdown("### Random Baseline Accuracy")
-    st.markdown("**{:.2%}** (computed with random guessing on SECQAv1)".format(random_accuracy))
-    st.markdown("**{:.2%}** (computed with random guessing on SECQAv2)".format(random_accuracyv2))
-
+    if dataset_choice == "secQA":
+        st.markdown("### Random Baseline Accuracy")
+        st.markdown("**{:.2%}** (computed with random guessing on SECQAv1)".format(random_accuracy))
+        st.markdown("**{:.2%}** (computed with random guessing on SECQAv2)".format(random_accuracyv2))
 
     # Footer
     st.markdown("---")
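`estimate_random_accuracy` itself is outside this diff (it appears only in a hunk header above). A plausible reconstruction, assuming each questionnaire entry carries its answer options under a key like "options": uniform guessing answers a question correctly with probability 1/len(options), so the baseline is the average of those probabilities:

```python
# Hypothetical sketch -- the real implementation is not part of this commit,
# and the "options" key is an assumption about the questionnaire schema.
def estimate_random_accuracy(questionnaire):
    return sum(1 / len(q["options"]) for q in questionnaire) / len(questionnaire)

# An all-4-option MCQ set yields 0.25, which "{:.2%}" renders as "25.00%".
```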
@@ -185,4 +199,4 @@ with tab2:
     [Priam.ai](https://www.priam.ai/)
 
     *This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
-    """) # Replace with actual random_accuracy values if available
+    """)
metric.csv ADDED
@@ -0,0 +1,6 @@
+model name, source, v1 metric
+Google: Gemma 3 27B ,Open Source,95.4022988%
+Google: Gemini Flash 2.0,Proprietary Model,97.7011494%
+Google: Gemini 2.0 Flash Lite,Proprietary Model,95.4022988%
+DeepSeek: R1,Open Source,96.5517241%
+Qwen: QwQ 32B,Open Source,94.2528735%
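The header names (`model name, source, v1 metric`) differ from the columns app.py reads (`Model`, `Type`, `V1 Accuracy`), and the file has stray spaces after delimiters plus a trailing space in `Gemma 3 27B `; presumably load_data normalizes this in code outside the hunks shown. A sketch of one way to do it, with the rename map as an assumption:

```python
import pandas as pd

# Assumed normalization -- the actual cleanup lives in load_data(), outside this diff.
df = pd.read_csv("metric.csv", skipinitialspace=True)  # drop spaces after commas
df.columns = df.columns.str.strip()
df = df.rename(columns={"model name": "Model", "source": "Type", "v1 metric": "V1 Accuracy"})
df["Model"] = df["Model"].str.strip()  # "Google: Gemma 3 27B " -> "Google: Gemma 3 27B"
```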