File size: 4,023 Bytes
7db94f3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import load_npz
import time
from multiprocessing import Pool
embed_type = 'SBERT'  # Change this to 'MLFPA' or 'BERT' as needed

# Embedding name as used in file names: lower-cased with hyphens removed.
embed_key = embed_type.lower().replace("-", "")

# Load the cached parquet copy of the embeddings; if it does not exist yet,
# build it from the raw .npz archive and cache it as parquet.
try:
    embeddings_df = pd.read_parquet(f'{embed_type} embeddings/{embed_key}_embeddings.parquet')
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild. Any other failure
    # (corrupt file, missing parquet engine, Ctrl-C) propagates instead of
    # being silently swallowed by a bare ``except``.
    #
    # NOTE(review): the fallback paths below are hard-coded to the SBERT
    # archive inside 'BERT embeddings/' and do not follow ``embed_type``.
    # The parquet cache is also written to 'BERT embeddings/', which is not
    # the '<embed_type> embeddings/' directory probed above, so with
    # embed_type = 'SBERT' the cache written here is not the file the ``try``
    # looks for. Confirm the intended directory layout before changing this.
    with np.load('BERT embeddings/sbert_embedding.npz') as archive:
        # Indexing reads the array into memory, so it stays valid after the
        # archive is closed. At this point the name holds a NumPy array.
        embeddings_df = archive['sbert_embedding']
    print(embeddings_df.shape)  # Check the shape of the embeddings
    print(type(embeddings_df))  # Check the type of the embeddings
    # Convert to a pandas DataFrame and cache it as parquet.
    embeddings_df = pd.DataFrame(embeddings_df)
    embeddings_df.to_parquet(f'BERT embeddings/{embed_key}_embeddings.parquet')
    # Reload from the cache that was just written.
    embeddings_df = pd.read_parquet(f'BERT embeddings/{embed_key}_embeddings.parquet')
# Dimensionality reduction (scaling + PCA) followed by k-means clustering
def scale_and_pca(embeddings_df, n_components=3):
    """Standardise the embeddings and project them onto principal components.

    Args:
        embeddings_df: Embedding table passed straight to
            ``StandardScaler.fit_transform`` (the script passes a pandas
            DataFrame).
        n_components: Number of principal components to keep. Defaults to 3,
            the value the script's 3D scatter plot relies on.

    Returns:
        The result of ``PCA.fit_transform`` on the standardised data.
    """
    # Standardise first so the scale of individual features does not dominate
    # the principal components.
    scaled = StandardScaler().fit_transform(embeddings_df)
    # Reduce dimensionality with PCA.
    return PCA(n_components=n_components).fit_transform(scaled)
# Scale the embeddings and reduce them with PCA; everything below works on
# this projection rather than on the full embedding table.
embeddings_pca = scale_and_pca(embeddings_df)
# The full embedding table is not used after this point, so drop the
# reference to let its memory be reclaimed.
del embeddings_df
# Create a 3D scatter plot of the PCA results
def plot_3d_scatter(embeddings_pca, title='3D PCA of BERT Embeddings'):
    """Show a 3D scatter plot of the first three PCA components.

    Args:
        embeddings_pca: 2-D array indexed as ``[:, 0]``, ``[:, 1]`` and
            ``[:, 2]``, so it needs at least three columns.
        title: Plot title. Defaults to the original hard-coded text; pass a
            different string when plotting another embedding type.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    # s=1 keeps the markers small.
    ax.scatter(
        embeddings_pca[:, 0],
        embeddings_pca[:, 1],
        embeddings_pca[:, 2],
        s=1,
    )
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    ax.set_title(title)
    plt.show()
# plot_3d_scatter(embeddings_pca)
# def compute_silhouette(n_clusters, data):
# kmeans = KMeans(n_clusters=n_clusters, random_state=420)
# labels = kmeans.fit_predict(data)
# silhouette_avg = silhouette_score(data, labels)
# print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
# return silhouette_avg
# silhouette_scores = []
# for i in range(2, 10):
# start_time = time.time()
# silhouette_scores.append(compute_silhouette(i, embeddings_pca))
# end_time = time.time()
# print(f"Time taken for n_clusters = {i}: {end_time - start_time} seconds")
# # Plot silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 10), silhouette_scores, marker='o')
# plt.title('Silhouette Scores for Different Cluster Sizes')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.xticks(range(2, 10))
# plt.grid()
# plt.show()
# # Save silhouette scores to CSV
# silhouette_df = pd.DataFrame({'n_clusters': range(2, 10), 'silhouette_score': silhouette_scores})
# silhouette_df.to_csv('MLFPA_project-main/Raf_scores/silhouette_scores.csv', index=False)
# Save the cluster labels for n_clusters = 5
def save_cluster_labels(n_clusters, data, output_path='raf_clusters/cluster_labels_sbert.csv'):
    """Cluster ``data`` with k-means and write the labels to a CSV file.

    Args:
        n_clusters: Number of clusters passed to ``KMeans``.
        data: Feature matrix passed to ``KMeans.fit_predict`` (the script
            passes the PCA projection).
        output_path: Destination CSV file. Defaults to the path the script
            originally hard-coded. Missing parent directories are created.

    Returns:
        A DataFrame with a single ``cluster_label`` column holding the labels
        returned by ``fit_predict``; the same frame is written to
        ``output_path`` without the index.
    """
    from pathlib import Path

    # Fixed seed so repeated runs use the same k-means initialisation.
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    labels_df = pd.DataFrame(labels, columns=['cluster_label'])

    # Create the output directory first; otherwise to_csv fails when it is
    # missing and the clustering work is lost.
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    labels_df.to_csv(destination, index=False)
    return labels_df
# Cluster the PCA projection into 5 groups and write the labels to CSV.
save_cluster_labels(5, embeddings_pca)
# Optional visual check of the projection (disabled).
# plot_3d_scatter(embeddings_pca)
|