import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import time
from multiprocessing import Pool

embed_type = 'SBERT'  # change to 'MLFPA' or 'BERT' as needed
embed_name = embed_type.lower().replace("-", "")

# Load the cached parquet if it exists; otherwise build it from the raw .npz file
# (e.g. 'SBERT embeddings/sbert_embedding.npz' when embed_type = 'SBERT').
parquet_path = f'{embed_type} embeddings/{embed_name}_embeddings.parquet'
try:
    embeddings_df = pd.read_parquet(parquet_path)
except FileNotFoundError:
    embeddings = np.load(f'{embed_type} embeddings/{embed_name}_embedding.npz')[f'{embed_name}_embedding']
    print(embeddings.shape)  # sanity check: (n_samples, embedding_dim)
    print(type(embeddings))
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df.to_parquet(parquet_path)  # cache for future runs


def scale_and_pca(embeddings_df):
    """Standardise the embeddings, then reduce them to 3 principal components."""
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings_df)
    pca = PCA(n_components=3)
    embeddings_pca = pca.fit_transform(embeddings_scaled)
    return embeddings_pca


embeddings_pca = scale_and_pca(embeddings_df)
del embeddings_df  # free the full-dimensional embeddings from memory


def plot_3d_scatter(embeddings_pca):
    """3D scatter plot of the first three principal components."""
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2], s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(f'3D PCA of {embed_type} Embeddings')
    plt.show()


# plot_3d_scatter(embeddings_pca)

# Silhouette sweep over candidate cluster counts (kept commented out; it is slow).
# def compute_silhouette(n_clusters, data):
#     kmeans = KMeans(n_clusters=n_clusters, random_state=420)
#     labels = kmeans.fit_predict(data)
#     silhouette_avg = silhouette_score(data, labels)
#     print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
#     return silhouette_avg

# silhouette_scores = []
# for i in range(2, 10):
#     start_time = time.time()
#     silhouette_scores.append(compute_silhouette(i, embeddings_pca))
#     end_time = time.time()
#     print(f"Time taken for n_clusters = {i}: {end_time - start_time} seconds")

# # Plot silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 10), silhouette_scores, marker='o')
# plt.title('Silhouette Scores for Different Cluster Sizes')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.xticks(range(2, 10))
# plt.grid()
# plt.show()

# # Save silhouette scores to CSV
# silhouette_df = pd.DataFrame({'n_clusters': range(2, 10), 'silhouette_score': silhouette_scores})
# silhouette_df.to_csv('MLFPA_project-main/Raf_scores/silhouette_scores.csv', index=False)


# Save the cluster labels for n_clusters = 5.
def save_cluster_labels(n_clusters, data):
    """Fit k-means and write the per-row cluster labels to CSV."""
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    labels_df = pd.DataFrame(labels, columns=['cluster_label'])
    labels_df.to_csv(f'raf_clusters/cluster_labels_{embed_name}.csv', index=False)
    return labels_df


save_cluster_labels(5, embeddings_pca)
# plot_3d_scatter(embeddings_pca)
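
# Sketch: PCA variance check (an added illustration, not in the original
# script). Reducing high-dimensional sentence embeddings to 3 components can
# discard a large share of their variance, so it is worth reporting how much
# the retained components actually explain. This variant of scale_and_pca
# prints that figure alongside the reduction.
def scale_and_pca_with_report(embeddings, n_components=3):
    scaled = StandardScaler().fit_transform(embeddings)
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(scaled)
    # explained_variance_ratio_ sums to the fraction of total variance kept
    print(f'Variance retained by {n_components} components: '
          f'{pca.explained_variance_ratio_.sum():.1%}')
    return reduced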
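
# Sketch: parallel silhouette sweep. The script imports multiprocessing.Pool
# but never uses it; one plausible use is spreading the (slow) silhouette
# sweep above across worker processes. This is an illustrative sketch, not
# part of the original pipeline: each worker receives its own copy of the
# data, and on Windows/macOS it must be called under an
# `if __name__ == '__main__':` guard.
def _silhouette_for_k(args):
    # Fit k-means for a single candidate k and return the mean silhouette score.
    n_clusters, data = args
    labels = KMeans(n_clusters=n_clusters, random_state=420).fit_predict(data)
    return silhouette_score(data, labels)


def parallel_silhouette_sweep(data, k_range=range(2, 10), processes=4):
    with Pool(processes=processes) as pool:
        return pool.map(_silhouette_for_k, [(k, data) for k in k_range])

# Usage (commented out, like the serial sweep above):
# scores = parallel_silhouette_sweep(embeddings_pca)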
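
# Sketch: visualising the clusters (an added illustration, not in the original
# script). save_cluster_labels returns the labels, so they can be overlaid on
# the PCA scatter to eyeball how well the clusters separate in 3D.
def plot_clusters_3d(embeddings_pca, labels):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2],
               c=labels, cmap='tab10', s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(f'3D PCA of {embed_type} Embeddings, coloured by cluster')
    plt.show()

# labels_df = save_cluster_labels(5, embeddings_pca)
# plot_clusters_3d(embeddings_pca, labels_df['cluster_label'])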