ash-98 committed on
Commit 45e320b · 1 Parent(s): 11e7d15
Files changed (5)
  1. .gitignore +2 -0
  2. .streamlit/config.toml +5 -0
  3. Benchmark.csv +24 -0
  4. app.py +87 -0
  5. requirements.txt +37 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ Dockerfile
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
+ [theme]
+ primaryColor="#01d2fc"
+ backgroundColor="#252040"
+ secondaryBackgroundColor="#262626"
+ textColor="#f4f4f4"
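For reference, Streamlit reads `.streamlit/config.toml` automatically at startup, so the theme above applies without any code changes. A minimal sketch for sanity-checking it from inside the app, assuming `st.get_option` exposes theme keys (it does in recent Streamlit releases, including the pinned 1.43.2):

```python
# Sketch: confirm the custom theme was picked up from .streamlit/config.toml.
# Assumption: st.get_option("theme.primaryColor") returns the configured value.
import streamlit as st

st.caption(f"Active primary color: {st.get_option('theme.primaryColor')}")  # expect "#01d2fc"
```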
Benchmark.csv ADDED
@@ -0,0 +1,24 @@
+ model name, source, v1 metric, v2 metric
+ OpenAI: GPT-4.5 (Preview),Proprietary Model,100%,97%
+ OpenAI: o3 Mini High,Proprietary Model,100%,96%
+ OpenAI: o3 Mini,Proprietary Model,100%,96%
+ OpenAI: GPT-4o,Proprietary Model,99.09%,95%
+ OpenAI: GPT-4o-mini,Proprietary Model,99.09%,97%
+ Anthropic: Claude 3.5 Sonnet,Proprietary Model,99.09%,97%
+ Anthropic: Claude 3.5 Haiku,Proprietary Model,100%,97%
+ Anthropic: Claude 3.7 Sonnet,Proprietary Model,99.09%,98%
+ Google: Gemma 3 27B ,Open Source,98.18%,95%
+ Google: Gemini Flash 2.0,Proprietary Model,100%,99%
+ Google: Gemini 2.0 Flash Lite,Proprietary Model,100%,97%
+ DeepSeek: R1,Open Source,100%,98%
+ DeepSeek: DeepSeek V3,Open Source,100%,97%
+ Mistral: Mistral Small 3.1 24B,Open Source,100%,97%
+ Mistral: Mistral Small 3,Open Source,99.09%,97%
+ Mistral Large 2411,Open Source,99.09%,96%
+ Meta: Llama 3.3 70B Instruct,Open Source,100%,97%
+ Meta: Llama 3.2 3B Instruct,Open Source,78.18%,75%
+ Qwen: QwQ 32B,Open Source,100.00%,96%
+ Microsoft: Phi 4,Proprietary Model,100%,97%
+ Microsoft: Phi-3.5 Mini 128K Instruct,Open Source,99.09%,97%
+ Microsoft: Phi-3 Mini 128K Instruct,Open Source,98.18%,98%
+
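Note the mixed percentage formats above ("100%", "99.09%", "100.00%") and the stray spaces in the header row; `load_data` in app.py normalizes both. A standalone sketch of that cleaning step on two sample rows, assuming only pandas:

```python
# Sketch of the normalization app.py applies to Benchmark.csv, on an in-memory sample.
import io
import pandas as pd

sample = io.StringIO(
    "model name, source, v1 metric, v2 metric\n"
    "OpenAI: GPT-4o,Proprietary Model,99.09%,95%\n"
    "Qwen: QwQ 32B,Open Source,100.00%,96%\n"
)
df = pd.read_csv(sample)
df.columns = df.columns.str.strip()  # header has stray spaces after commas
for col in ["v1 metric", "v2 metric"]:
    df[col] = pd.to_numeric(df[col].astype(str).str.replace("%", ""), errors="coerce") / 100
print(df)  # v1 metric -> 0.9909 and 1.0; v2 metric -> 0.95 and 0.96
```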
app.py ADDED
@@ -0,0 +1,87 @@
+ import streamlit as st
+ import pandas as pd
+
+ # Set page configuration
+ st.set_page_config(page_title="Cyber Benchmark Hub: SECQA Leaderboard", layout="wide")
+
+ # Main Title (ensures it's displayed on the main page)
+ st.title("Cyber Benchmark Hub: SECQA Leaderboard")
+ st.markdown("## Powered by **Priam Cyber AI**")
+ st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
+
+ # Function to load and clean CSV data
+ @st.cache_data
+ def load_data(file_path):
+     df = pd.read_csv(file_path)
+
+     # Remove any unnamed columns (caused by trailing commas)
+     df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
+
+     # Standardize column names
+     df.columns = df.columns.str.strip()
+     df.rename(columns={
+         "model name": "Model",
+         "source": "Type",
+         "v1 metric": "V1 Accuracy",
+         "v2 metric": "V2 Accuracy"
+     }, inplace=True)
+
+     # Convert percentage strings to floats (e.g., "100%" → 1.0)
+     for col in ["V1 Accuracy", "V2 Accuracy"]:
+         df[col] = df[col].astype(str).str.replace("%", "").str.strip()
+         df[col] = pd.to_numeric(df[col], errors='coerce') / 100
+
+     return df
+
+ # Load dataset
+ file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
+ df = load_data(file_path)
+
+ # Sidebar: Logo, then Filters and Options
+ with st.sidebar:
+     st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
+     st.divider()
+     st.header("Filters & Options")
+     dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
+     source_filter = st.multiselect(
+         "Select Model Type",
+         options=df["Type"].unique().tolist(),
+         default=df["Type"].unique().tolist()
+     )
+     st.markdown("---")
+     st.header("Test Parameters")
+     test_params = pd.DataFrame({
+         "Value": [0, 1, 0, 1, 0]
+     }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
+     st.table(test_params)
+
+ # Apply filtering; copy so later column assignments don't touch the cached frame
+ df_filtered = (df[df["Type"].isin(source_filter)] if source_filter else df).copy()
+
+ # Choose the correct metric version and compute Accuracy
+ df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
+ df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows with errors
+
+ # Sort by Accuracy descending and add a Rank column starting from 1
+ df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)
+ df_filtered.insert(0, "Rank", range(1, len(df_filtered) + 1))
+
+ # Use columns to display leaderboard and model details side-by-side
+ col1, col2 = st.columns([2, 1])
+
+ with col1:
+     st.subheader(f"Leaderboard for SECQA Version {dataset_version}")
+     st.dataframe(df_filtered.reset_index(drop=True))
+
+ with col2:
+     st.subheader("Model Details")
+     selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
+     model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
+     st.write(f"**Model:** {model_details['Model']}")
+     st.write(f"**Type:** {model_details['Type']}")
+     st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
+     st.write(f"**Rank:** {model_details['Rank']}")
+
+ # Footer
+ st.markdown("---")
+ st.info("More dataset benchmarks will be added to this hub in the future.")
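The leaderboard order comes from the sort-and-rank step above (the `sort_values` and `insert` lines). A self-contained sketch of just that step, on made-up, pre-cleaned rows:

```python
# Sketch of app.py's ranking step on hypothetical, already-cleaned data.
import pandas as pd

df = pd.DataFrame({
    "Model": ["Model A", "Model B", "Model C"],  # hypothetical names
    "Type": ["Open Source", "Proprietary Model", "Open Source"],
    "Accuracy": [0.97, 1.00, 0.75],
})
df = df.sort_values("Accuracy", ascending=False).reset_index(drop=True)
df.insert(0, "Rank", range(1, len(df) + 1))  # Rank 1 = highest accuracy
print(df)  # Model B first, then Model A, then Model C
```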
requirements.txt ADDED
@@ -0,0 +1,37 @@
+ altair==5.5.0
+ attrs==25.3.0
+ blinker==1.9.0
+ cachetools==5.5.2
+ certifi==2025.1.31
+ charset-normalizer==3.4.1
+ click==8.1.8
+ gitdb==4.0.12
+ GitPython==3.1.44
+ idna==3.10
+ Jinja2==3.1.6
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ MarkupSafe==3.0.2
+ narwhals==1.31.0
+ numpy==2.2.4
+ packaging==24.2
+ pandas==2.2.3
+ pillow==11.1.0
+ protobuf==5.29.4
+ pyarrow==19.0.1
+ pydeck==0.9.1
+ python-dateutil==2.9.0.post0
+ pytz==2025.1
+ referencing==0.36.2
+ requests==2.32.3
+ rpds-py==0.23.1
+ six==1.17.0
+ smmap==5.0.2
+ streamlit==1.43.2
+ tenacity==9.0.0
+ toml==0.10.2
+ tornado==6.4.2
+ typing_extensions==4.12.2
+ tzdata==2025.2
+ urllib3==2.3.0
+ watchdog==6.0.0
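To reproduce the Space locally (a sketch, assuming Python 3.9+ and pip): install the pinned dependencies with `pip install -r requirements.txt`, then launch with `streamlit run app.py`. The app expects `Benchmark.csv` in the working directory, as noted in app.py.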