# nelbarman053's picture
# Clustering model deployment
# a7b2523
import numpy as np
class KMeans:
    """K-means clustering with Euclidean distance.

    Centroids are seeded from randomly chosen rows of the training data and
    refined until the label assignment stops changing.
    """

    # Initialization and properties
    def __init__(self):
        # Placeholder; replaced by the fitted centroids in ``fit``.
        self.centroids = np.empty(1)

    # Fit method
    def fit(self, data, clusters, epochs=1, random_seed=42):
        """Fit centroids to ``data`` and return the label of every row.

        Args:
            data: 2-D array-like, one sample per row.
            clusters: Number of centroids; must not exceed the number of rows
                (``np.random.choice`` raises ``ValueError`` otherwise).
            epochs: Unused. Kept only so existing callers keep working.
            random_seed: Seed passed to ``np.random.seed``. Note that this
                reseeds NumPy's *global* random state.

        Returns:
            1-D integer array holding the centroid index assigned to each row.
        """
        np.random.seed(random_seed)
        data = np.asarray(data)
        N = len(data)

        # Fancy indexing returns a copy, so updating centroids never touches
        # the caller's data.
        centroids = data[np.random.choice(N, clusters, replace=False), :]
        # Integer input would otherwise truncate every mean written below.
        if not np.issubdtype(centroids.dtype, np.floating):
            centroids = centroids.astype(float)

        # ``None`` (instead of uninitialised memory) guarantees the first
        # pass can never be mistaken for convergence.
        old_labels = None
        while True:
            distances = np.linalg.norm(data[:, None, :] - centroids, axis=2)
            labels = np.argmin(distances, axis=1)

            for j in range(clusters):
                members = data[labels == j]
                # An empty cluster keeps its previous centroid. Averaging an
                # empty slice would yield NaN, and argmin then picks the NaN
                # column, which can make the labels oscillate forever.
                if len(members):
                    centroids[j] = members.mean(axis=0)

            if old_labels is not None and np.array_equal(labels, old_labels):
                break
            old_labels = labels

        self.centroids = centroids
        return labels

    # Predict method
    def predict(self, data):
        """Return the index of the nearest fitted centroid for each row."""
        data = np.asarray(data)
        distances = np.linalg.norm(data[:, None, :] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)
class KMedoids:
    """K-medoids clustering with Manhattan (L1) distance.

    Medoids are always rows of the training data. Starting from a random
    selection, the single medoid replacement that most reduces a cluster's
    internal cost is applied repeatedly until no replacement helps.
    """

    # Initialization and properties
    def __init__(self, data, clusters):
        # Placeholder; replaced by the fitted medoids in ``fit``.
        self.medoids = np.empty(1)
        self.data = data
        self.N = len(data)
        self.clusters = clusters

    # Fit
    def fit(self, random_seed=42):
        """Fit medoids to the stored data and return the label of every row.

        Args:
            random_seed: Seed passed to ``np.random.seed``. Note that this
                reseeds NumPy's *global* random state.

        Returns:
            1-D integer array holding the medoid index assigned to each row.
        """
        np.random.seed(random_seed)
        data = np.asarray(self.data)
        N = self.N
        clusters = self.clusters

        medoids_idx = np.random.choice(N, clusters, replace=False)
        medoids = data[medoids_idx].copy()

        # distances[p, i] is the L1 distance from row p to medoid i.
        distances = np.zeros((N, clusters))
        for i in range(clusters):
            distances[:, i] = np.sum(np.abs(data - medoids[i]), axis=1)
        labels = np.argmin(distances, axis=1)

        all_idxs = np.arange(N)
        while True:
            best_swap = None
            best_change = 0
            best_distances = None

            # Candidates are the rows that are not *currently* medoids. The
            # list only changes when a swap is applied, so build it once per
            # pass rather than once per cluster.
            candidates = all_idxs[~np.isin(all_idxs, medoids_idx)]

            for i in range(clusters):
                members = labels == i
                current_cost = np.sum(distances[members, i])
                for j in candidates:
                    new_distances = np.sum(np.abs(data - data[j]), axis=1)
                    cost_change = np.sum(new_distances[members]) - current_cost
                    # Strict '<' keeps the first best candidate on ties and
                    # only accepts swaps that really lower the cost.
                    if cost_change < best_change:
                        best_swap = (i, j)
                        best_change = cost_change
                        best_distances = new_distances

            if best_swap is None:
                break

            i, j = best_swap
            distances[:, i] = best_distances
            medoids[i] = data[j]
            # Keep the index list in sync with the medoids; otherwise the
            # replaced row would stay excluded from every later search.
            medoids_idx[i] = j
            labels = np.argmin(distances, axis=1)

        self.medoids = medoids
        return labels

    # Predict
    def predict(self, data):
        """Return the index of the nearest fitted medoid (L1) for each row."""
        data = np.asarray(data)
        distances = np.zeros((len(data), self.clusters))
        for i in range(self.clusters):
            distances[:, i] = np.sum(np.abs(data - self.medoids[i]), axis=1)
        return np.argmin(distances, axis=1)
class EnsembleClustering:
    """Combine a KMeans and a KMedoids model by per-sample label voting."""

    # Initialization
    def __init__(self, data, clusters):
        self.data = data
        self.clusters = clusters
        # Fitted sub-models; both stay None until ``fit`` has run.
        self.kmeans = None
        self.kmedoids = None

    # Fit method
    def fit(self):
        """Fit both sub-models on the stored data and return the voted labels."""
        kmeans = KMeans()
        kmeans_labels = kmeans.fit(self.data, self.clusters)
        self.kmeans = kmeans

        kmedoids = KMedoids(data=self.data, clusters=self.clusters)
        kmedoids_labels = kmedoids.fit()
        self.kmedoids = kmedoids

        return self.maximumVoting(kmeans_labels, kmedoids_labels)

    # Maximum voting method
    def maximumVoting(self, labels1, labels2):
        """Merge two label vectors by majority vote.

        With exactly two voters the outcome per sample is:
        * both agree    -> that label;
        * they disagree -> the smaller label (each gets one vote and the tie
          goes to the lowest index).
        Both cases equal the element-wise minimum, so the vote is computed in
        one vectorised step instead of a per-sample Python loop.

        NOTE(review): the vote compares raw label ids from the two models, so
        it presumably relies on both models numbering clusters the same way -
        confirm against how the sub-models are seeded.

        Args:
            labels1: 1-D array-like of integer labels.
            labels2: 1-D array-like of integer labels, same length.

        Returns:
            1-D integer array of combined labels.
        """
        first = np.asarray(labels1)
        second = np.asarray(labels2)
        return np.minimum(first, second).astype(int)

    # Predict method
    def predict(self, data):
        """Label ``data`` with both fitted sub-models and return the vote."""
        kmeans_labels = self.kmeans.predict(data)
        kmedoids_labels = self.kmedoids.predict(data)
        return self.maximumVoting(kmeans_labels, kmedoids_labels)