|
import numpy as np |
|
|
|
class KMeans:
    """Lloyd's k-means clustering using Euclidean distance.

    Attributes:
        centroids: Array with one row per cluster after ``fit``; a
            placeholder ``np.empty(1)`` before the model has been fitted.
    """

    def __init__(self):
        self.centroids = np.empty(1)

    def fit(self, data, clusters, epochs=1, random_seed=42):
        """Cluster ``data`` and return the cluster index of every sample.

        Initial centroids are ``clusters`` distinct rows of ``data`` drawn
        with ``np.random.choice``. Assignment and centroid update then
        alternate until two consecutive assignments are identical.

        Args:
            data: 2-D array-like, one sample per row.
            clusters: Number of clusters; must not exceed ``len(data)``.
            epochs: Accepted for interface compatibility only. It is not
                used; iteration always runs until the labels stop changing.
            random_seed: Seed passed to ``np.random.seed``. Note that this
                reseeds NumPy's global random generator.

        Returns:
            Integer array of length ``len(data)`` with the index of the
            nearest centroid for each sample.

        Raises:
            ValueError: Propagated from ``np.random.choice`` when
                ``clusters`` is larger than the number of samples.
        """
        data = np.asarray(data)
        # Means of integer samples are generally fractional; storing them in
        # an integer centroid array would silently truncate them.
        if not np.issubdtype(data.dtype, np.floating):
            data = data.astype(float)

        np.random.seed(random_seed)
        n_samples = len(data)
        # Fancy indexing returns a copy, so updating centroids in place below
        # never writes into the caller's data.
        centroids = data[np.random.choice(n_samples, clusters, replace=False), :]

        # None (rather than an uninitialised np.empty buffer) guarantees the
        # first convergence check cannot match by accident.
        previous_labels = None

        while True:
            distances = np.linalg.norm(data[:, None, :] - centroids, axis=2)
            labels = np.argmin(distances, axis=1)

            for j in range(clusters):
                members = data[labels == j]
                # An empty cluster has no mean (np.mean would give NaN and
                # corrupt every later distance); keep its previous centroid.
                if len(members) > 0:
                    centroids[j] = members.mean(axis=0)

            if previous_labels is not None and np.array_equal(labels, previous_labels):
                break
            previous_labels = labels

        self.centroids = centroids
        return labels

    def predict(self, data):
        """Return the index of the nearest fitted centroid for each row.

        Args:
            data: 2-D array-like with the same number of columns as the
                data passed to ``fit``.

        Returns:
            Integer array of length ``len(data)``.
        """
        data = np.asarray(data)
        distances = np.linalg.norm(data[:, None, :] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)
|
|
|
|
|
class KMedoids:
    """K-medoids clustering with Manhattan (L1) distance and greedy swaps.

    Attributes:
        medoids: Array with one row per cluster after ``fit``; a placeholder
            ``np.empty(1)`` before the model has been fitted.
        data: The training samples supplied to the constructor.
        N: Number of training samples.
        clusters: Number of clusters.
    """

    def __init__(self, data, clusters):
        """Store the training data and the requested number of clusters.

        Args:
            data: 2-D array-like, one sample per row.
            clusters: Number of medoids to select; must not exceed
                ``len(data)``.
        """
        self.medoids = np.empty(1)
        self.data = data
        self.N = len(data)
        self.clusters = clusters

    def fit(self, random_seed=42):
        """Select medoids by repeated best-improvement swaps.

        Starting from ``clusters`` distinct random samples, each round
        evaluates replacing every medoid with every non-medoid sample. The
        score of a swap is the change in summed L1 distance of the samples
        currently assigned to that medoid. The swap with the most negative
        score is applied and labels are recomputed; fitting stops when no
        swap has a negative score.

        Args:
            random_seed: Seed passed to ``np.random.seed``. Note that this
                reseeds NumPy's global random generator.

        Returns:
            Integer array of length ``N`` with the index of the nearest
            medoid for each training sample.

        Raises:
            ValueError: Propagated from ``np.random.choice`` when
                ``clusters`` is larger than the number of samples.
        """
        np.random.seed(random_seed)
        data = np.asarray(self.data)
        n_samples = self.N
        clusters = self.clusters

        medoids_idx = np.random.choice(n_samples, clusters, replace=False)
        medoids = data[medoids_idx].copy()

        # distances[:, i] always holds the L1 distance of every sample to
        # the current medoid i.
        distances = np.zeros((n_samples, clusters))
        for i in range(clusters):
            distances[:, i] = np.sum(np.abs(data - medoids[i]), axis=1)
        labels = np.argmin(distances, axis=1)

        all_idxs = np.arange(n_samples)

        while True:
            # Recomputed every round from the *current* medoids so that a
            # sample that stopped being a medoid becomes a candidate again.
            non_medoids_idx = all_idxs[~np.isin(all_idxs, medoids_idx)]

            # Per-cluster membership and baseline cost do not depend on the
            # candidate, so compute them once per round.
            members = [labels == i for i in range(clusters)]
            current_cost = [np.sum(distances[members[i], i]) for i in range(clusters)]

            best_swap = None
            best_change = 0
            best_distances = None

            for j in non_medoids_idx:
                # One distance vector per candidate serves every cluster.
                candidate_distances = np.sum(np.abs(data - data[j]), axis=1)
                for i in range(clusters):
                    cost_change = (
                        np.sum(candidate_distances[members[i]]) - current_cost[i]
                    )
                    swap = (i, int(j))
                    # Among equally good swaps prefer the smallest
                    # (cluster, sample) pair, i.e. the one a cluster-major
                    # scan would have met first.
                    if cost_change < best_change or (
                        best_swap is not None
                        and cost_change == best_change
                        and swap < best_swap
                    ):
                        best_swap = swap
                        best_change = cost_change
                        best_distances = candidate_distances

            if best_swap is None:
                break

            i, j = best_swap
            distances[:, i] = best_distances
            medoids[i] = data[j]
            medoids_idx[i] = j
            labels = np.argmin(distances, axis=1)

        self.medoids = medoids
        return labels

    def predict(self, data):
        """Return the index of the nearest fitted medoid (L1) for each row.

        Args:
            data: 2-D array-like with the same number of columns as the
                training data.

        Returns:
            Integer array of length ``len(data)``.
        """
        data = np.asarray(data)
        distances = np.zeros((len(data), self.clusters))
        for i in range(self.clusters):
            distances[:, i] = np.sum(np.abs(data - self.medoids[i]), axis=1)
        return np.argmin(distances, axis=1)
|
|
|
|
|
class EnsembleClustering:
    """Combine KMeans and KMedoids labels by per-sample voting.

    The vote does not re-align cluster indices between the two models; it
    compares the raw indices each model returns. Both models are fitted
    with their default seed, so they draw their initial centres through
    the same seeded ``np.random.choice`` call.

    Attributes:
        data: Training samples supplied to the constructor.
        clusters: Number of clusters requested from both models.
        kmeans: Fitted ``KMeans`` instance, or ``None`` before ``fit``.
        kmedoids: Fitted ``KMedoids`` instance, or ``None`` before ``fit``.
    """

    def __init__(self, data, clusters):
        self.data = data
        self.clusters = clusters
        self.kmeans = None
        self.kmedoids = None

    def fit(self):
        """Fit both base models on the stored data and vote on their labels.

        Returns:
            Integer array with one combined label per training sample.
        """
        kmeans = KMeans()
        kmeans_labels = kmeans.fit(self.data, self.clusters)
        self.kmeans = kmeans

        kmedoids = KMedoids(data=self.data, clusters=self.clusters)
        kmedoids_labels = kmedoids.fit()
        self.kmedoids = kmedoids

        return self.maximumVoting(kmeans_labels, kmedoids_labels)

    def maximumVoting(self, labels1, labels2):
        """Pick, for every sample, the label with the most votes.

        Each of the two inputs contributes one vote per sample. When the
        two votes differ, both labels have one vote and the smaller label
        index wins (first maximum of ``argmax``).

        Args:
            labels1: Integer labels in ``[0, clusters)``, one per sample.
            labels2: Integer labels in ``[0, clusters)``, same length.

        Returns:
            Integer array of the winning label per sample.

        Raises:
            ValueError: If the two label arrays differ in shape.
            IndexError: If a label is not a valid index below ``clusters``.
        """
        first = np.asarray(labels1)
        second = np.asarray(labels2)
        if first.shape != second.shape:
            raise ValueError(
                "label arrays must have the same shape, got "
                f"{first.shape} and {second.shape}"
            )

        n_samples = len(first)
        if n_samples == 0:
            return np.zeros(0, dtype=int)

        # votes[s, c] counts how many models assigned sample s to cluster c.
        votes = np.zeros((n_samples, self.clusters), dtype=int)
        rows = np.arange(n_samples)
        # Each row appears once per statement, so in-place fancy-index
        # increments cannot collide.
        votes[rows, first] += 1
        votes[rows, second] += 1
        return votes.argmax(axis=1).astype(int)

    def predict(self, data):
        """Label new samples by voting over both fitted base models.

        ``fit`` must have been called first; otherwise ``self.kmeans`` is
        still ``None`` and the attribute access fails.

        Args:
            data: Samples to label, passed unchanged to both models'
                ``predict`` methods.

        Returns:
            Integer array with one combined label per sample.
        """
        kmeans_labels = self.kmeans.predict(data)
        kmedoids_labels = self.kmedoids.predict(data)
        return self.maximumVoting(kmeans_labels, kmedoids_labels)
|
|