# File size: 3,085 Bytes
# 760a88c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px


def perform_pca(data, n_components):
    """Project ``data`` onto its principal components.

    Args:
        data: Input passed straight to
            ``sklearn.decomposition.PCA.fit_transform``.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.
            Besides an integer, PCA also accepts ``None``, a float
            variance fraction, or ``'mle'``.

    Returns:
        A ``pandas.DataFrame`` with one column per retained component,
        named ``'principal component 1'``, ``'principal component 2'``, ...
        No index is passed to the constructor, so the result carries a
        default index rather than the index of ``data``.
    """
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(data)
    # Count the columns from the transformed array instead of from
    # ``n_components``: for None / float / 'mle' the number of retained
    # components is only known after fitting, and ``range(n_components)``
    # would raise TypeError. For a valid integer the two are the same.
    n_retained = principalComponents.shape[1]
    column_names = [f'principal component {i + 1}' for i in range(n_retained)]
    principalDf = pd.DataFrame(data=principalComponents, columns=column_names)
    return principalDf

def plot_pca(clustered_data, principalDf, df, information_columns):
    """Build a 2-D scatter plot of the first two principal components.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column. Its index
            labels are matched against the index of ``df``.
        principalDf: DataFrame holding at least the columns
            ``'principal component 1'`` and ``'principal component 2'``.
            It is joined on index.
            NOTE(review): this presumably relies on ``principalDf`` having
            a default 0..n-1 index in the same row order as
            ``clustered_data`` -- confirm against the caller.
        df: DataFrame whose columns are carried into the plotted frame.
        information_columns: Passed to plotly as ``hover_data``; presumably
            column names of ``df`` -- verify against caller.

    Returns:
        The figure created by ``plotly.express.scatter``. It is returned,
        not shown; displaying it is left to the caller.
    """
    # Expose the index of ``clustered_data`` under an explicit, private
    # column name. Relying on the default ``'index'`` name from
    # ``reset_index()`` breaks when the index is named (the column then takes
    # that name) or when a frame already contains an ``'index'`` column.
    row_key = '__pca_plot_row_key__'
    cluster_lookup = clustered_data[['cluster']].rename_axis(row_key).reset_index()

    # Both merges use the default inner join, so rows without a match in
    # every frame are dropped.
    finalDf = df.merge(cluster_lookup, left_index=True, right_on=row_key)
    finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)

    # The helper key is only needed for the join.
    finalDf = finalDf.drop(columns=[row_key])

    fig = px.scatter(
        finalDf,
        x='principal component 1',
        y='principal component 2',
        color='cluster',
        hover_data=information_columns,
        title='2 Component PCA',
        labels={
            'principal component 1': 'Principal Component 1',
            'principal component 2': 'Principal Component 2',
        },
        color_continuous_scale='viridis',
    )
    return fig

# def get_common_features(data):
#     common_features = {}
#     for cluster in data['cluster'].unique():
#         cluster_data = data[data['cluster'] == cluster]
#         cluster_features_counts = {}
#         for column in cluster_data.drop(columns=["cluster"]).columns:
#             top_feature = cluster_data[column].mode()[0]  # Use mode to find the most common category
#             if top_feature != 0:  # Add only if most common feature is not 0
#                 cluster_features_counts[column] = top_feature
#         common_features[cluster] = cluster_features_counts
#     return common_features


def plot_pca_3D(clustered_data, principalDf, df, information_columns):
    """Build a 3-D scatter plot of the first three principal components.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column. Its index
            labels are matched against the index of ``df``.
        principalDf: DataFrame holding at least the columns
            ``'principal component 1'`` through ``'principal component 3'``.
            It is joined on index.
            NOTE(review): this presumably relies on ``principalDf`` having
            a default 0..n-1 index in the same row order as
            ``clustered_data`` -- confirm against the caller.
        df: DataFrame whose columns are carried into the plotted frame.
        information_columns: Passed to plotly as ``hover_data``; presumably
            column names of ``df`` -- verify against caller.

    Returns:
        The figure created by ``plotly.express.scatter_3d``. It is
        returned, not shown; displaying it is left to the caller.
    """
    # Expose the index of ``clustered_data`` under an explicit, private
    # column name. Relying on the default ``'index'`` name from
    # ``reset_index()`` breaks when the index is named (the column then takes
    # that name) or when a frame already contains an ``'index'`` column.
    row_key = '__pca_plot_row_key__'
    cluster_lookup = clustered_data[['cluster']].rename_axis(row_key).reset_index()

    # Both merges use the default inner join, so rows without a match in
    # every frame are dropped.
    finalDf = df.merge(cluster_lookup, left_index=True, right_on=row_key)
    finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)

    # The helper key is only needed for the join.
    finalDf = finalDf.drop(columns=[row_key])

    # common_features = get_common_features(clustered_data)
    # # Add most common features to finalDf
    # for cluster, features in common_features.items():
    #     for feature, value in features.items():
    #         finalDf.loc[finalDf['cluster'] == cluster, f"Most common {feature}"] = value

    fig = px.scatter_3d(
        finalDf,
        x='principal component 1',
        y='principal component 2',
        z='principal component 3',
        color='cluster',
        hover_data=information_columns,  # + [col for col in finalDf.columns if "Most common" in col]
        title='3 Component PCA',
        labels={f'principal component {i+1}': f'Principal Component {i+1}' for i in range(3)},
        color_continuous_scale='viridis',
    )
    return fig