Initial
- .gitignore +2 -0
- .streamlit/config.toml +5 -0
- Benchmark.csv +24 -0
- app.py +87 -0
- requirements.txt +37 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+venv
+Dockerfile
.streamlit/config.toml
ADDED
@@ -0,0 +1,5 @@
+[theme]
+primaryColor="#01d2fc"
+backgroundColor="#252040"
+secondaryBackgroundColor="#262626"
+textColor="#f4f4f4"
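
Note (outside the diff): Streamlit reads .streamlit/config.toml automatically at startup, so these [theme] keys restyle the whole app with no code changes.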
Benchmark.csv
ADDED
@@ -0,0 +1,24 @@
+model name, source, v1 metric, v2 metric
+OpenAI: GPT-4.5 (Preview),Proprietary Model,100%,97%
+OpenAI: o3 Mini High,Proprietary Model,100%,96%
+OpenAI: o3 Mini,Proprietary Model,100%,96%
+OpenAI: GPT-4o,Proprietary Model,99.09%,95%
+OpenAI: GPT-4o-mini,Proprietary Model,99.09%,97%
+Anthropic: Claude 3.5 Sonnet,Proprietary Model,99.09%,97%
+Anthropic: Claude 3.5 Haiku,Proprietary Model,100%,97%
+Anthropic: Claude 3.7 Sonnet,Proprietary Model,99.09%,98%
+Google: Gemma 3 27B,Open Source,98.18%,95%
+Google: Gemini Flash 2.0,Proprietary Model,100%,99%
+Google: Gemini 2.0 Flash Lite,Proprietary Model,100%,97%
+DeepSeek: R1,Open Source,100%,98%
+DeepSeek: DeepSeek V3,Open Source,100%,97%
+Mistral: Mistral Small 3.1 24B,Open Source,100%,97%
+Mistral: Mistral Small 3,Open Source,99.09%,97%
+Mistral: Mistral Large 2411,Open Source,99.09%,96%
+Meta: Llama 3.3 70B Instruct,Open Source,100%,97%
+Meta: Llama 3.2 3B Instruct,Open Source,78.18%,75%
+Qwen: QwQ 32B,Open Source,100.00%,96%
+Microsoft: Phi 4,Proprietary Model,100%,97%
+Microsoft: Phi-3.5 Mini 128K Instruct,Open Source,99.09%,97%
+Microsoft: Phi-3 Mini 128K Instruct,Open Source,98.18%,98%
+
app.py
ADDED
@@ -0,0 +1,87 @@
+import streamlit as st
+import pandas as pd
+
+# Set page configuration
+st.set_page_config(page_title="Cyber Benchmark Hub: SECQA Leaderboard", layout="wide")
+
+# Main title (ensures it's displayed on the main page)
+st.title("Cyber Benchmark Hub: SECQA Leaderboard")
+st.markdown("## Powered by **Priam Cyber AI**")
+st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+
+# Function to load and clean CSV data
+@st.cache_data
+def load_data(file_path):
+    df = pd.read_csv(file_path)
+
+    # Remove any unnamed columns (caused by trailing commas)
+    df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
+
+    # Standardize column names
+    df.columns = df.columns.str.strip()
+    df.rename(columns={
+        "model name": "Model",
+        "source": "Type",
+        "v1 metric": "V1 Accuracy",
+        "v2 metric": "V2 Accuracy"
+    }, inplace=True)
+
+    # Convert percentage strings to floats (e.g., "100%" → 1.0)
+    for col in ["V1 Accuracy", "V2 Accuracy"]:
+        df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+        df[col] = pd.to_numeric(df[col], errors='coerce') / 100
+
+    return df
+
+# Load dataset
+file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
+df = load_data(file_path)
+
+# Sidebar: logo, then filters and options
+with st.sidebar:
+    st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
+    st.divider()
+    st.header("Filters & Options")
+    dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+    source_filter = st.multiselect(
+        "Select Model Type",
+        options=df["Type"].unique().tolist(),
+        default=df["Type"].unique().tolist()
+    )
+    st.markdown("---")
+    st.header("Test Parameters")
+    test_params = pd.DataFrame({
+        "Value": [0, 1, 0, 1, 0]
+    }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
+    st.table(test_params)
+
+# Apply sidebar filtering; .copy() avoids SettingWithCopyWarning when adding Accuracy below
+df_filtered = df[df["Type"].isin(source_filter)].copy() if source_filter else df.copy()
+
+# Choose the correct metric version and compute Accuracy
+df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows whose metrics failed to parse
+
+# Sort by Accuracy descending and add a Rank column starting from 1
+df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)
+df_filtered.insert(0, "Rank", range(1, len(df_filtered) + 1))
+
+# Use columns to display leaderboard and model details side by side
+col1, col2 = st.columns([2, 1])
+
+with col1:
+    st.subheader(f"Leaderboard for SECQA Version {dataset_version}")
+    st.dataframe(df_filtered.reset_index(drop=True))
+
+with col2:
+    st.subheader("Model Details")
+    selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
+    model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
+    st.write(f"**Model:** {model_details['Model']}")
+    st.write(f"**Type:** {model_details['Type']}")
+    st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
+    st.write(f"**Rank:** {model_details['Rank']}")
+
+# Footer
+st.markdown("---")
+st.info("More dataset benchmarks will be added to this hub in the future.")
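
Note (outside the diff): a minimal sketch for sanity-checking load_data's percentage parsing without launching Streamlit, assuming Benchmark.csv sits in the working directory. It replicates the cleaning steps rather than importing app.py, so none of the st.* calls run:

    import pandas as pd

    # Replicate the cleaning steps from load_data
    df = pd.read_csv("Benchmark.csv")
    df.columns = df.columns.str.strip()
    v1 = pd.to_numeric(df["v1 metric"].str.replace("%", ""), errors="coerce") / 100

    # Every parsed value should be a fraction, e.g. "99.09%" -> 0.9909
    assert v1.between(0, 1).all()
    print(v1.describe())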
requirements.txt
ADDED
@@ -0,0 +1,37 @@
+altair==5.5.0
+attrs==25.3.0
+blinker==1.9.0
+cachetools==5.5.2
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+gitdb==4.0.12
+GitPython==3.1.44
+idna==3.10
+Jinja2==3.1.6
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+MarkupSafe==3.0.2
+narwhals==1.31.0
+numpy==2.2.4
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+protobuf==5.29.4
+pyarrow==19.0.1
+pydeck==0.9.1
+python-dateutil==2.9.0.post0
+pytz==2025.1
+referencing==0.36.2
+requests==2.32.3
+rpds-py==0.23.1
+six==1.17.0
+smmap==5.0.2
+streamlit==1.43.2
+tenacity==9.0.0
+toml==0.10.2
+tornado==6.4.2
+typing_extensions==4.12.2
+tzdata==2025.2
+urllib3==2.3.0
+watchdog==6.0.0
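
Note (outside the diff): to reproduce locally, install the pinned dependencies and launch the app with `pip install -r requirements.txt` followed by `streamlit run app.py`.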