Browse files- +98 -0
- +123 -0
- +40 -0
- +66 -0
@@ -0,0 +1,98 @@
1 |
import pandas as pd
import plotly.express as px
import streamlit as st

from data_preprocessing import mainDataWrangling
from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
from pca import perform_pca, plot_pca, plot_pca_3D
7 |
8 |
9 |
10 |
def convert_df(df):
    """Serialise a DataFrame to UTF-8 encoded CSV bytes.

    Args:
        df: The DataFrame to export.

    Returns:
        The CSV text produced by ``df.to_csv()`` (index included), encoded as
        UTF-8 bytes.
    """
    return df.to_csv().encode('utf-8')
12 |
13 |
# Streamlit code
# NOTE(review): this option is only known to older Streamlit releases -
# confirm it is still accepted by the version this app is deployed with.
st.set_option('deprecation.showPyplotGlobalUse', False)

st.title('📊 Holistic AI: Risk Mapping Data study: Optimal Cluster Analysis with PCA Visualization')
uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    raw_data = mainDataWrangling(df)
    raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)
    project_names = raw_data["projectName"]
    information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation", "Robustness",
                           "Efficacy", "Privacy", "Bias", "Explainability"]

    # Only the encoded questionnaire answers are clustered; the project name
    # and the risk verdicts are kept aside for display.
    data = raw_data.drop(columns=information_columns)

    # NOTE(review): the display calls under the two preview headings were
    # missing from this copy of the file; showing the head of each frame is an
    # assumption - confirm against the original.
    st.subheader('🔍 Data Preview')
    st.write(df.head())

    st.subheader('🔍 Preprocessed Data')
    st.write(data.head())

    # Step 1: Plot Elbow Method and Silhouette Scores
    wcss = calculate_wcss(data)
    silhouette_scores = calculate_silhouette_scores(data)
    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

    # Silhouette scores start at 2 clusters.
    fig = px.line(x=list(range(2, len(silhouette_scores) + 2)), y=silhouette_scores,
                  labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'}, title='Silhouette Scores')
    # Each figure has to be rendered before `fig` is rebound, otherwise it is lost.
    st.plotly_chart(fig)

    # WCSS is computed starting at 1 cluster, so the axis starts at 1 as well
    # (starting it at 2 shifted the elbow one cluster to the right).
    fig = px.line(x=list(range(1, len(wcss) + 1)), y=wcss, labels={'x': 'Number of Clusters', 'y': 'WCSS'},
                  title='Elbow Method')
    st.plotly_chart(fig)

    # Built line by line so no source indentation leaks into the markdown.
    st.markdown("\n".join([
        "- Select the optimum number of clusters based on Silhouette Scores and ELBOW Graph.",
        "",
        "- For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.",
        "",
        "- For the Elbow graph, we can see that the graph will rapidly change at a point and thus creating an elbow shape.",
        "From this point, the graph moves almost parallel to the X-axis. The K value corresponding to this point is the optimal",
        "value of K or an optimal number of clusters.",
    ]))

    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')

    # The label promises the silhouette optimum as default, so use it instead
    # of a fixed 2. The upper bound is the largest cluster count evaluated.
    optimal_clusters_elbow = st.slider('Number of clusters (Default to optimal number from Silhouette Scores )',
                                       min_value=2, max_value=len(wcss),
                                       value=optimal_clusters_silhouette, step=1)

    # Step 2: KMeans fitting and PCA
    st.header('KMeans Clustering and PCA')
    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')
    kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

    # Add project names back to the data
    display_data = pd.concat([project_names, clustered_data], axis=1)

    st.subheader('📌 Clustered Data')
    st.write(display_data[["projectName", "cluster"]])

    principalDf = perform_pca(clustered_data, 2)

    st.subheader('📊 2D PCA Plot')
    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
    st.plotly_chart(fig2D)

    principalDf_3D = perform_pca(clustered_data, 3)
    st.subheader('📊 3D PCA Plot')
    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
    st.plotly_chart(fig3D)

    st.subheader('📩 Data Download')
    csv = convert_df(display_data)

    # NOTE(review): only the label survived in this copy of the file; the file
    # name and MIME type are assumptions - confirm against the original.
    st.download_button(
        label="Download clustered data as CSV",
        data=csv,
        file_name='clustered_data.csv',
        mime='text/csv',
    )
@@ -0,0 +1,123 @@
1 |
import pandas as pd
2 |
3 |
# Groups used by generateOneHot: a plain string becomes its own indicator
# column, a nested list becomes one shared indicator column for all of its
# members.
enterpriseGroups = ['facialRecognition', ['safetySecurity', 'recruitment', 'biometricData']]
societyGroups = [['policing', 'controlAccessToServices']]
# The spelling 'dataTypeSensistivePersonal' is matched against the data, so it
# must stay exactly as it is.
dataTypeGroups = [['dataTypePersonal', 'dataTypeSensistivePersonal'], ['dataTypeRestricted']]
capabilitiesGroups = ['decisionSupportSystems']

# Names of the risk columns, 'Overall' first.
technicalRisks = ['Robustness', 'Efficacy',
                  'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
riskVerticals = ['Overall'] + governanceRisks + technicalRisks
12 |
13 |
14 |
def mergeCostColumns(home, commisioned, licensed):
    """Collapse the three development-cost answers into one flag.

    Args:
        home: Value of the home-built cost column for one row.
        commisioned: Value of the commissioned cost column for the same row.
        licensed: Value of the licensed cost column for the same row.

    Returns:
        1 if any of the three values equals ``'insignificant'``, otherwise 0.
    """
    # Two exclusive outcomes: without the else branch the flag was always
    # overwritten with 0.
    if 'insignificant' in (home, commisioned, licensed):
        return 1
    return 0
20 |
21 |
22 |
def generateUniqueEntries(targetColumn):
    """List the distinct comma-separated entries found in a column.

    Args:
        targetColumn: A pandas Series whose cells are comma-separated strings.

    Returns:
        A list holding every distinct entry once. The list is sorted so the
        result is the same from run to run (plain set order is not).
    """
    uniqueEntries = set()
    for cell in targetColumn.values:
        uniqueEntries.update(cell.split(','))
    return sorted(uniqueEntries)
28 |
29 |
30 |
def generateOneHot(dataframe, targetColumn, groups):
    """Add one 0/1 indicator column per group to ``dataframe``, in place.

    Each cell of ``targetColumn`` is a comma-separated list of entries. For
    every group an indicator column is created that is 1 when the cell contains
    at least one member of the group and 0 otherwise. Each new column is moved
    to the far left, so the column of the last group ends up first.

    Args:
        dataframe: DataFrame to modify in place.
        targetColumn: Name of the column holding the comma-separated entries.
        groups: Iterable of groups. A string is a group of its own and yields
            the column ``<targetColumn>_<group>``; a list of strings yields a
            single shared column ``<targetColumn>_<a>_<b>_...``.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    # Split every cell once instead of once per group and element.
    tokenSets = [set(cell.split(',')) for cell in dataframe[targetColumn].values]

    for group in groups:
        members = [group] if isinstance(group, str) else list(group)
        groupColumnName = '_'.join([targetColumn] + members) if members else ''

        # Assigning a plain list is positional, so the flags land on the right
        # rows whatever the index labels are (writing through .loc with the
        # enumerate position only worked on a freshly reset index).
        dataframe[groupColumnName] = [int(any(member in tokens for member in members))
                                      for tokens in tokenSets]

        dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))  # move the new column to the far left

    # NOTE(review): the tail of this function was missing from this copy of the
    # file. Dropping the raw text column is reconstructed from how the result
    # is used (it is clustered, which needs numeric columns only) - confirm
    # against the original.
    dataframe.drop(columns=[targetColumn], inplace=True)
57 |
58 |
59 |
def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
    """Replace a column by a 0/1 flag, in place.

    Args:
        dataframe: DataFrame to modify in place.
        targetColumn: Name of the column to convert.
        positiveGroup: Container of values that count as positive. Anything in
            the positive group gets assigned 1, everything else 0.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    # Replacing the whole column at once is positional, so it does not depend
    # on the index labels being 0..n-1, and it avoids the SettingWithCopy
    # warning of cell-by-cell assignment.
    dataframe[targetColumn] = [1 if value in positiveGroup else 0
                               for value in dataframe[targetColumn].values]
66 |
67 |
68 |
def mainDataWrangling(data):
    """Turn the raw questionnaire export into the encoded table used downstream.

    Args:
        data: DataFrame as read from the export. The first value row holds the
            compact column names; the three cost columns and the risk columns
            already carry their final names in the header.

    Returns:
        A new DataFrame with 'projectName' as first column, the encoded
        questionnaire answers, the ten risk columns and, for each risk column,
        a ``_binaryLow`` and a ``_binaryHigh`` indicator column.
    """
    # 1. Throw away the columns that we don't need
    columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56,
                     57]  # only keep the columns that we think are pertinent to scoring
    # Copy, so the assignments below never write into a slice of the caller's frame.
    data = data.iloc[:, columnsToKeep].copy()

    # 2. Merge the three development cost columns to get a single column for insignificant cost
    data['insignificant'] = data.apply(
        lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']), axis=1)
    data.drop(columns=data.columns[1:4], inplace=True)

    # 3. Replace headers with their compact forms
    # The compact names sit in the first value row. They are collected in a
    # list rather than written back into that row, because the freshly merged
    # column is numeric and cannot hold its own name.
    compactNames = data.iloc[0, :].tolist()
    compactNames[-1] = 'insignificant'  # name for the newly created column
    compactNames[0] = 'projectName'
    compactNames[-11:-1] = data.columns[-11:-1]  # the risk vertical names are already in the header
    data.columns = compactNames
    data = data.iloc[1:].reset_index(drop=True)  # remove the first value row and reset the row indices
    data.insert(0, 'insignificant', data.pop('insignificant'))  # move the insignificant column to the far left

    # 4. Remove/replace missing data
    # drop all the samples for which risk scoring hasn't yet been done
    data = data.dropna(subset=list(data.columns[-10:])).reset_index(drop=True)
    # replace NaNs for contingent question with 'low'; assigned back because an
    # in-place fillna on a selected column is not guaranteed to reach the frame
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].fillna('low')

    # 5. Perform one hot encoding and other encoding
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # convert risk level columns to numbers
    # The risk columns are the last ten columns at this point; remember them
    # once, since every new indicator column is appended on the right.
    riskColumns = list(data.columns[-10:])
    # creation of binary columns for Low
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
    # creation of binary columns for High
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])

    return data
@@ -0,0 +1,40 @@
1 |
from matplotlib import pyplot as plt
2 |
from sklearn.cluster import KMeans
3 |
from sklearn.metrics import silhouette_score
4 |
5 |
6 |
def calculate_wcss(data, max_clusters=10):
    """Compute the within-cluster sum of squares for 1..max_clusters clusters.

    Args:
        data: Samples to cluster, in any form accepted by ``KMeans.fit``.
        max_clusters: Largest number of clusters to try. Defaults to 10.

    Returns:
        A list whose entry ``i`` is the inertia of a KMeans model fitted with
        ``i + 1`` clusters (the first entry belongs to a single cluster).
    """
    wcss = []
    for n_clusters in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
        # NOTE(review): the fit/append pair was missing from this copy of the
        # file (the list was returned empty); reconstructed from the function's
        # name and return value - confirm against the original.
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    return wcss
13 |
14 |
def calculate_silhouette_scores(data, max_clusters=10):
    """Compute the silhouette score for 2..max_clusters clusters.

    Args:
        data: Samples to cluster, in any form accepted by ``KMeans.fit``.
        max_clusters: Largest number of clusters to try. Defaults to 10.

    Returns:
        A list whose entry ``i`` is the euclidean silhouette score of a KMeans
        model fitted with ``i + 2`` clusters (the score needs at least two
        clusters, so the first entry belongs to two clusters).
    """
    scores = []
    for n_clusters in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
        # The model must be fitted before labels_ exists, and every score has
        # to be collected - neither step was present in this copy of the file.
        kmeans.fit(data)
        score = silhouette_score(data, kmeans.labels_, metric='euclidean')
        scores.append(score)
    return scores
23 |
24 |
def plot_elbow(wcss):
    """Draw the elbow curve for a list of WCSS values with matplotlib.

    Args:
        wcss: WCSS values, the first one belonging to a single cluster.

    Returns:
        None. The curve is drawn on the current matplotlib figure.
    """
    # Derive the axis from the data instead of a fixed 1..10, so a list of any
    # length can be plotted.
    plt.plot(range(1, len(wcss) + 1), wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    # NOTE(review): the last two statements were missing from this copy of the
    # file; the y-axis label and the show call are assumptions - confirm.
    plt.ylabel('WCSS')
    plt.show()
30 |
31 |
def get_optimal_clusters_silhouette(scores):
    """Return the number of clusters with the highest silhouette score.

    Args:
        scores: Sequence of silhouette scores, the first one belonging to two
            clusters.

    Returns:
        The cluster count of the best score. On a tie the smallest count wins.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    scores = list(scores)  # accept any sequence, not only lists
    if not scores:
        raise ValueError("scores must contain at least one silhouette score")
    optimal_clusters = scores.index(max(scores)) + 2  # +2 because range_values starts from 2
    print(f"Optimal number of clusters: {optimal_clusters}")
    return optimal_clusters
35 |
36 |
def fit_kmeans(data, n_clusters):
    """Cluster the samples and attach the cluster label to each of them.

    Args:
        data: DataFrame of samples to cluster.
        n_clusters: Number of clusters to fit.

    Returns:
        A tuple ``(kmeans, clustered)``: the fitted KMeans model and a copy of
        ``data`` with an added 'cluster' column holding each row's label.
    """
    # Same initialisation settings as the helpers that score the candidate
    # cluster counts, so the final model matches what was evaluated.
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
    clusters = kmeans.fit_predict(data)
    # Label a copy: writing into the caller's frame meant a second call would
    # cluster on the previous 'cluster' column as if it were a feature.
    clustered = data.copy()
    clustered['cluster'] = clusters
    return kmeans, clustered
@@ -0,0 +1,66 @@
1 |
import pandas as pd
2 |
from sklearn.decomposition import PCA
3 |
import plotly.express as px
4 |
5 |
6 |
def perform_pca(data, n_components):
    """Project the samples onto their first principal components.

    Args:
        data: Samples to transform, in any form accepted by
            ``PCA.fit_transform``. Every column passed in takes part in the
            projection.
        n_components: Number of principal components to keep.

    Returns:
        A DataFrame with one row per sample and the columns
        'principal component 1' ... 'principal component <n_components>'.
    """
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(data)
    columnNames = [f'principal component {i+1}' for i in range(n_components)]
    return pd.DataFrame(data=principalComponents, columns=columnNames)
12 |
13 |
def plot_pca(clustered_data, principalDf, df, information_columns):
    """Build the 2D scatter plot of the first two principal components.

    Args:
        clustered_data: DataFrame with a 'cluster' column.
        principalDf: DataFrame holding 'principal component 1' and
            'principal component 2', one row per sample.
        df: DataFrame with the descriptive columns of the same samples.
        information_columns: Names of the columns of ``df`` to show on hover.

    Returns:
        The plotly figure.
    """
    clustered_data = clustered_data.reset_index()  # To make sure indices match with df
    finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
    finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
    finalDf = finalDf.drop(columns=['index'])

    # NOTE(review): the `color` and `hover_data` arguments were missing from
    # this copy of the file. They are reconstructed from the 'cluster' column
    # merged in above and from the otherwise unused `information_columns`
    # parameter - confirm against the original.
    fig = px.scatter(finalDf,
                     x='principal component 1',
                     y='principal component 2',
                     color='cluster',
                     hover_data=information_columns,
                     title='2 Component PCA',
                     labels={'principal component 1': 'Principal Component 1',
                             'principal component 2': 'Principal Component 2'},
                     )
    return fig
29 |
30 |
# def get_common_features(data):
31 |
# common_features = {}
32 |
# for cluster in data['cluster'].unique():
33 |
# cluster_data = data[data['cluster'] == cluster]
34 |
# cluster_features_counts = {}
35 |
# for column in cluster_data.drop(columns=["cluster"]).columns:
36 |
# top_feature = cluster_data[column].mode()[0] # Use mode to find the most common category
37 |
# if top_feature != 0: # Add only if most common feature is not 0
38 |
# cluster_features_counts[column] = top_feature
39 |
# common_features[cluster] = cluster_features_counts
40 |
# return common_features
41 |
42 |
43 |
def plot_pca_3D(clustered_data, principalDf, df, information_columns):
    """Build the 3D scatter plot of the first three principal components.

    Args:
        clustered_data: DataFrame with a 'cluster' column.
        principalDf: DataFrame holding 'principal component 1' to
            'principal component 3', one row per sample.
        df: DataFrame with the descriptive columns of the same samples.
        information_columns: Names of the columns of ``df`` to show on hover.

    Returns:
        The plotly figure.
    """
    clustered_data = clustered_data.reset_index()  # To make sure indices match with df
    finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
    finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
    finalDf = finalDf.drop(columns=['index'])

    # common_features = get_common_features(clustered_data)
    # # Add most common features to finalDf
    # for cluster, features in common_features.items():
    #     for feature, value in features.items():
    #         finalDf.loc[finalDf['cluster'] == cluster, f"Most common {feature}"] = value

    hover_data = information_columns  # + [col for col in finalDf.columns if "Most common" in col]
    # NOTE(review): the `color` and `hover_data` arguments were missing from
    # this copy of the file. They are reconstructed from the 'cluster' column
    # merged in above and from the `hover_data` variable prepared for the call -
    # confirm against the original.
    fig = px.scatter_3d(finalDf,
                        x='principal component 1',
                        y='principal component 2',
                        z='principal component 3',
                        color='cluster',
                        hover_data=hover_data,
                        title='3 Component PCA',
                        labels={f'principal component {i+1}': f'Principal Component {i+1}' for i in range(3)},
                        )
    return fig