File size: 3,835 Bytes
a7b2523 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import numpy as np
class KMeans:
    """K-means clustering with Euclidean distance.

    Attributes:
        centroids: Cluster centres, one row per cluster, written by ``fit``.
            Before ``fit`` is called it holds an uninitialised one-element
            placeholder array.
    """

    # Initialization and properties
    def __init__(self):
        self.centroids = np.empty(1)

    # Fit method
    def fit(self, data, clusters, epochs=1, random_seed=42):
        """Cluster ``data`` and store the resulting centroids.

        Initial centroids are ``clusters`` distinct rows of ``data`` drawn with
        NumPy's global random generator, which is reseeded with
        ``random_seed``. Assignment and centroid-update steps are repeated
        until the labels stop changing between two consecutive passes.

        Args:
            data: 2-D array-like indexed as (sample, feature).
            clusters: Number of clusters; must not exceed the number of rows.
            epochs: Ignored. Kept only so existing callers keep working.
            random_seed: Seed passed to ``np.random.seed``.

        Returns:
            Integer array with the index of the closest centroid for each row.
        """
        np.random.seed(random_seed)
        data = np.asarray(data)
        N = len(data)

        # Fancy indexing returns a copy, so ``data`` itself is never modified.
        centroids = data[np.random.choice(N, clusters, replace=False), :]
        if not np.issubdtype(centroids.dtype, np.inexact):
            # Integer (or bool) centroids would truncate every mean that is
            # assigned into them below.
            centroids = centroids.astype(float)

        old_labels = None
        while True:
            distances = np.linalg.norm(data[:, None, :] - centroids, axis=2)
            labels = np.argmin(distances, axis=1)
            for j in range(clusters):
                members = data[labels == j]
                # An empty cluster keeps its previous centre; averaging zero
                # rows would turn the centroid into NaN.
                if len(members):
                    centroids[j] = members.mean(axis=0)
            if old_labels is not None and np.array_equal(labels, old_labels):
                break
            old_labels = labels

        self.centroids = centroids
        return labels

    # Predict method
    def predict(self, data):
        """Return the index of the nearest stored centroid for each row.

        Args:
            data: 2-D array-like indexed as (sample, feature).

        Returns:
            Integer array of centroid indices, one per row of ``data``.
        """
        data = np.asarray(data)
        distances = np.linalg.norm(data[:, None, :] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)
class KMedoids:
    """K-medoids clustering with Manhattan (L1) distance and greedy swaps.

    Attributes:
        medoids: Rows of ``data`` chosen as cluster representatives, written
            by ``fit``. Before ``fit`` it is an uninitialised one-element
            placeholder array.
        data: The data set given to the constructor.
        N: Number of rows in ``data``.
        clusters: Number of clusters.
    """

    # Initialization and properties
    def __init__(self, data, clusters):
        self.medoids = np.empty(1)
        self.data = data
        self.N = len(data)
        self.clusters = clusters

    # Fit
    def fit(self, random_seed=42):
        """Select medoids for the stored data and return cluster labels.

        Initial medoids are ``clusters`` distinct rows drawn with NumPy's
        global random generator, which is reseeded with ``random_seed``. Each
        round evaluates replacing one cluster's medoid by a non-medoid row,
        scoring the change in L1 cost over that cluster's current members
        only, and applies the single best strictly-improving swap. Fitting
        stops when no swap lowers the cost.

        Args:
            random_seed: Seed passed to ``np.random.seed``.

        Returns:
            Integer array with the index of the closest medoid for each row.
        """
        np.random.seed(random_seed)
        data = np.asarray(self.data)
        N = self.N
        clusters = self.clusters

        medoids_idx = np.random.choice(N, clusters, replace=False)
        medoids = data[medoids_idx].copy()

        # distances[p, c] is the L1 distance from row p to medoid c.
        distances = np.zeros((N, clusters))
        for i in range(clusters):
            distances[:, i] = np.sum(np.abs(data - medoids[i]), axis=1)
        labels = np.argmin(distances, axis=1)

        while True:
            best_swap = None
            best_change = 0
            best_distances = None

            # Rebuilt every round from the *current* medoid indices so that a
            # replaced medoid becomes a candidate again and an active medoid
            # is never offered as a replacement.
            is_medoid = np.zeros(N, dtype=bool)
            is_medoid[medoids_idx] = True
            candidates = np.flatnonzero(~is_medoid)

            for i in range(clusters):
                members = labels == i
                current_cost = np.sum(distances[members, i])
                for j in candidates:
                    new_distances = np.sum(np.abs(data - data[j]), axis=1)
                    cost_change = np.sum(new_distances[members]) - current_cost
                    # Strict comparison: on ties the first swap found wins.
                    if cost_change < best_change:
                        best_swap = (i, j)
                        best_change = cost_change
                        best_distances = new_distances

            if best_swap is None:
                break

            i, j = best_swap
            distances[:, i] = best_distances
            medoids[i] = data[j]
            medoids_idx[i] = j
            labels = np.argmin(distances, axis=1)

        self.medoids = medoids
        return labels

    # Predict
    def predict(self, data):
        """Return the index of the nearest stored medoid (L1) for each row.

        Args:
            data: 2-D array-like indexed as (sample, feature).

        Returns:
            Integer array of medoid indices, one per row of ``data``.
        """
        data = np.asarray(data)
        distances = np.zeros((len(data), self.clusters))
        for i in range(self.clusters):
            distances[:, i] = np.sum(np.abs(data - self.medoids[i]), axis=1)
        return np.argmin(distances, axis=1)
class EnsembleClustering:
    """Combine a KMeans model and a KMedoids model by per-sample voting.

    NOTE(review): votes are counted on the raw label ids returned by each
    model; nothing in this class aligns the label numbering of the two
    models. Confirm that this is intended.
    """

    def __init__(self, data, clusters):
        """Remember the data set and cluster count; models are built in ``fit``."""
        self.data, self.clusters = data, clusters
        self.kmeans = self.kmedoids = None

    def fit(self):
        """Fit both models on the stored data and return the voted labels."""
        first_model = KMeans()
        first_labels = first_model.fit(self.data, self.clusters)
        self.kmeans = first_model

        second_model = KMedoids(data=self.data, clusters=self.clusters)
        second_labels = second_model.fit()
        self.kmedoids = second_model

        return self.maximumVoting(first_labels, second_labels)

    def maximumVoting(self, labels1, labels2):
        """Pick, for every sample, the label with the most votes.

        Each of the two label sequences casts one vote per sample. When the
        two votes differ, both labels have one vote and ``argmax`` returns
        the lower label id.

        Args:
            labels1: Integer labels from the first model.
            labels2: Integer labels from the second model.

        Returns:
            Integer array of winning labels, one per sample.
        """
        count = len(labels1)
        sample_ids = np.arange(count)
        # One row of vote counters per sample, one column per cluster id.
        tally = np.zeros((count, self.clusters), dtype=int)
        tally[sample_ids, labels1] += 1
        tally[sample_ids, labels2] += 1
        return tally.argmax(axis=1).astype(int)

    def predict(self, data):
        """Label new samples with both fitted models and vote on the result."""
        return self.maximumVoting(
            self.kmeans.predict(data),
            self.kmedoids.predict(data),
        )
|