import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

import time
from multiprocessing import Pool


embed_type = 'SBERT'  # Change this to 'MLFPA' or 'BERT' as needed

# Load the cached parquet of embeddings; if it does not exist yet, build it
# from the raw .npz export.
try:
    embeddings_df = pd.read_parquet(f'{embed_type} embeddings/{embed_type.lower().replace("-", "")}_embeddings.parquet')
except OSError:
    # No parquet cache yet: fall back to the raw .npz file
    # (e.g. 'SBERT embeddings/sbert_embedding.npz')
    npz = np.load(f'{embed_type} embeddings/{embed_type.lower().replace("-", "")}_embedding.npz')
    embeddings_df = npz[f'{embed_type.lower().replace("-", "")}_embedding']
    print(embeddings_df.shape)  # sanity-check the shape of the embeddings
    print(type(embeddings_df))  # should be a numpy.ndarray

    # Convert to a DataFrame and cache it as parquet so later runs take the fast path
    embeddings_df = pd.DataFrame(embeddings_df)
    embeddings_df.to_parquet(f'{embed_type} embeddings/{embed_type.lower().replace("-", "")}_embeddings.parquet')
    embeddings_df = pd.read_parquet(f'{embed_type} embeddings/{embed_type.lower().replace("-", "")}_embeddings.parquet')

# Standardize the embeddings and project onto 3 principal components before clustering
def scale_and_pca(embeddings_df):
    # Standardize the data
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings_df)
    # Perform PCA to reduce dimensionality
    pca = PCA(n_components=3)
    embeddings_pca = pca.fit_transform(embeddings_scaled)
    return embeddings_pca
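
# --- Added sketch (not in the original script): report how much variance the
# retained components explain, as a sanity check on the n_components=3 choice.
# The helper name is hypothetical; it re-fits the scaler and PCA, so the call
# is left commented out.
def report_explained_variance(df, n_components=3):
    scaled = StandardScaler().fit_transform(df)
    pca = PCA(n_components=n_components).fit(scaled)
    print('Explained variance per PC:', pca.explained_variance_ratio_)
    print('Cumulative explained variance:', pca.explained_variance_ratio_.sum())

# report_explained_variance(embeddings_df)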

embeddings_pca = scale_and_pca(embeddings_df)
#remove embeddings_df from memory
del embeddings_df
# Create a 3D scatter plot of the PCA results

def plot_3d_scatter(embeddings_pca):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2], s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(f'3D PCA of {embed_type} Embeddings')
    plt.show()

# plot_3d_scatter(embeddings_pca)


# def compute_silhouette(n_clusters, data):
#     kmeans = KMeans(n_clusters=n_clusters, random_state=420)
#     labels = kmeans.fit_predict(data)
#     silhouette_avg = silhouette_score(data, labels)
#     print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
#     return silhouette_avg


# silhouette_scores = []
# for i in range(2, 10):
#     start_time = time.time()
#     silhouette_scores.append(compute_silhouette(i, embeddings_pca))
#     end_time = time.time()
#     print(f"Time taken for n_clusters = {i}: {end_time - start_time} seconds")
    
# # Plot silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 10), silhouette_scores, marker='o')
# plt.title('Silhouette Scores for Different Cluster Sizes')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.xticks(range(2, 10))
# plt.grid()
# plt.show()
# # Save silhouette scores to CSV
# silhouette_df = pd.DataFrame({'n_clusters': range(2, 10), 'silhouette_score': silhouette_scores})
# silhouette_df.to_csv('MLFPA_project-main/Raf_scores/silhouette_scores.csv', index=False)
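
# --- Added sketch (not in the original script): one way the imported Pool could
# parallelize the silhouette sweep sketched above; the helper names and worker
# count are assumptions. Left uncalled so the script's behavior is unchanged.
def compute_silhouette(n_clusters, data):
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    score = silhouette_score(data, labels)
    print(f"For n_clusters = {n_clusters}, the silhouette score is: {score}")
    return score

def parallel_silhouette_sweep(data, cluster_range=range(2, 10), processes=4):
    # Each worker fits one KMeans model and scores it; results come back in order.
    with Pool(processes=processes) as pool:
        return pool.starmap(compute_silhouette, [(k, data) for k in cluster_range])

# With the 'spawn' start method the workers re-import this module, so the call
# (and ideally the heavy top-level work) belongs under a __main__ guard:
# if __name__ == '__main__':
#     silhouette_scores = parallel_silhouette_sweep(embeddings_pca)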
    
# Save the cluster labels for n_clusters = 5
def save_cluster_labels(n_clusters, data):
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    labels_df = pd.DataFrame(labels, columns=['cluster_label'])
    labels_df.to_csv(f'raf_clusters/cluster_labels_{embed_type.lower()}.csv', index=False)
    return labels_df
    
save_cluster_labels(5, embeddings_pca)

# plot_3d_scatter(embeddings_pca)
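
# --- Added sketch (not in the original script): reuse the PCA coordinates and
# the saved labels to color the 3D scatter by cluster. The CSV path and the
# 'cluster_label' column mirror save_cluster_labels; the rest is an assumption.
def plot_clusters_3d(embeddings_pca, labels):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    scatter = ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1],
                         embeddings_pca[:, 2], c=labels, s=1, cmap='tab10')
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    fig.colorbar(scatter, ax=ax, label='cluster')
    plt.title(f'3D PCA of {embed_type} Embeddings, Colored by Cluster')
    plt.show()

# labels_df = pd.read_csv(f'raf_clusters/cluster_labels_{embed_type.lower()}.csv')
# plot_clusters_3d(embeddings_pca, labels_df['cluster_label'])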