#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import time
import os.path
# from sklearnex import patch_sklearn, unpatch_sklearn
# patch_sklearn()
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
# values_to_cluster: a two-dimensional array of shape (m, n), i.e. m
# n-dimensional vectors to be clustered.
def modified_kmeans_cluster(values_to_cluster, threshold, k_start, n_clusters=None):
    # If n_clusters is given, perform a single KMeans fit; threshold and
    # k_start are ignored.
    if n_clusters is not None:
        kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=0).fit(values_to_cluster)
        return kmeans.labels_
    # Otherwise search for the number of clusters: start at k_start and keep
    # incrementing k until the relative inertia improvement falls below threshold.
    n_clusters = k_start
    n_values = len(values_to_cluster)
    assert n_values > 0
    kmeans = KMeans(n_clusters=n_clusters, n_init="auto", random_state=0).fit(values_to_cluster)
    inertias = [kmeans.inertia_]
    while n_values > n_clusters:
        n_clusters_new = n_clusters + 1
        kmeans_new = KMeans(n_clusters=n_clusters_new, n_init="auto", random_state=0).fit(values_to_cluster)
        inertias.append(kmeans_new.inertia_)
        if terminate_clustering(inertias, threshold):
            # The last increment did not improve enough; keep the previous fit.
            break
        kmeans = kmeans_new
        n_clusters += 1
    return kmeans.labels_
def terminate_clustering(inertias, threshold):
    # Elbow-style criterion: stop once the relative improvement over the
    # previous step falls below the threshold.
    assert len(inertias) > 1
    improvement = 1 - (inertias[-1] / inertias[-2])
    return improvement < threshold
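# A minimal usage sketch for the two helpers above (an illustrative addition,
# not part of the original pipeline): three well-separated synthetic blobs
# should make the search settle close to k = 3 once the relative inertia
# improvement drops below the threshold. The helper name and the data are
# assumptions for demonstration only.
def _demo_modified_kmeans_cluster():
    rng = np.random.default_rng(0)
    # 3 Gaussian blobs of 50 two-dimensional points each, centred on the diagonal
    blobs = np.vstack([rng.normal(loc=c, scale=0.1, size=(50, 2))
                       for c in (0.0, 5.0, 10.0)])
    labels = modified_kmeans_cluster(blobs, threshold=0.05, k_start=1)
    print("clusters found:", max(labels) + 1)  # expected: 3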
def cluster_existed_features(network_folder_path, classes, layers_indexes, taus):
    appendixes = ["_correctly_classified_features.csv", "_incorrectly_classified_features.csv"]
    product = ((i, y, appendix) for i in layers_indexes for y in classes for appendix in appendixes)
    for i, y, appendix in product:
        start_time = time.time()
        # load the features of class y at layer minus i
        features_file_path = network_folder_path + "Layer_minus_" + str(i) + "/class_" + str(y) + appendix
        df = pd.read_csv(features_file_path)
        index_values = df["index"].to_numpy()
        values_to_cluster = df[df.columns[3:]].to_numpy()
        if len(values_to_cluster):
            # build the path of the results file, then load any existing clustering results
            k_and_taus = dict()
            taus_existed = []
            clustering_results = pd.DataFrame(df, columns=["index", "true_label", "pred_label"])
            clustering_results_path = network_folder_path + "Layer_minus_" + str(i) + "/clustering_results_class_" + str(y) + appendix
            if os.path.exists(clustering_results_path):
                clustering_results = pd.read_csv(clustering_results_path)
                for col in clustering_results.columns[3:]:
                    k_and_taus[col] = clustering_results[col].max() + 1
                # record the values of tau that have already been processed
                taus_existed = [float(key) for key in k_and_taus.keys()]
            # keep only the values of tau without existing results
            taus_new = [tau for tau in taus if tau not in taus_existed]
            # cluster the given data once for every remaining tau
            for tau in taus_new:
                # fix the starting point of the search: the final k of the
                # closest bigger tau is a valid lower bound, since a bigger
                # threshold terminates the search earlier
                k_start = 1
                bigger_taus = [x for x in taus_existed if x > tau]
                if len(bigger_taus):
                    tau_closest = min(bigger_taus)
                    k_start = k_and_taus[str(tau_closest)]
                # start to cluster
                cluster_labels = modified_kmeans_cluster(values_to_cluster, tau, k_start)
                clustering_results[str(tau)] = cluster_labels
                taus_existed.append(tau)
                k_and_taus[str(tau)] = max(cluster_labels) + 1
            clustering_results.to_csv(clustering_results_path, index=False)
        elapsed_time = time.time() - start_time
        print("file: " + "Layer_minus_" + str(i) + "_class_" + str(y) + appendix + ",", "elapsed time:", elapsed_time, "seconds")
def features_clustering(features, taus, nb_clusters):
    start_time = time.time()
    values_to_cluster = features
    # initialise here so that an empty feature array returns an empty dict
    clustering_results = dict()
    if len(values_to_cluster):
        # placeholders for resuming from previously saved clustering results
        k_and_taus = dict()
        taus_existed = []
        # if os.path.exists(clustering_results_path):
        #     clustering_results = pd.read_csv(clustering_results_path)
        #     for col in clustering_results.columns[3:]:
        #         k_and_taus[col] = clustering_results[col].max() + 1
        # else:
        #     clustering_results = pd.DataFrame()
        # record the values of tau that have already been processed
        taus_existed = [float(key) for key in k_and_taus.keys()]
        # keep only the values of tau without existing results
        taus_new = [tau for tau in taus if tau not in taus_existed]
        # cluster the given data once for every remaining tau
        for tau in taus_new:
            # fix the starting point of the search (see cluster_existed_features)
            k_start = 1
            bigger_taus = [x for x in taus_existed if x > tau]
            if len(bigger_taus):
                tau_closest = min(bigger_taus)
                k_start = k_and_taus[str(tau_closest)]
            # start to cluster
            cluster_labels = modified_kmeans_cluster(values_to_cluster, tau, k_start, nb_clusters)
            clustering_results[str(tau)] = cluster_labels
            taus_existed.append(tau)
            k_and_taus[str(tau)] = max(cluster_labels) + 1
        # clustering_results.to_csv(clustering_results_path, index=False)
    elapsed_time = time.time() - start_time
    # print("clustering time:", elapsed_time, "seconds")
    return clustering_results
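# Hedged, self-contained demo of features_clustering (an illustrative
# assumption, not original pipeline code): the function returns a dict mapping
# each tau, as a string, to its array of cluster labels. Passing nb_clusters
# bypasses the search and fixes k directly.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    demo_features = rng.normal(size=(60, 8))  # 60 random 8-dimensional vectors
    results = features_clustering(demo_features, taus=[0.1, 0.05], nb_clusters=None)
    for tau_key, labels in results.items():
        print("tau =", tau_key, "-> number of clusters =", max(labels) + 1)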