File size: 3,835 Bytes
a7b2523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import numpy as np

class KMeans:
    """K-means clustering (Lloyd's algorithm) with Euclidean distance.

    Attributes:
        centroids: Array of cluster centres, one row per cluster, set by
            ``fit``. Holds a placeholder array until ``fit`` has been called.
    """

    def __init__(self):
        # Placeholder; replaced by the fitted centroids in fit().
        self.centroids = np.empty(1)

    def fit(self, data, clusters, epochs=1, random_seed=42):
        """Cluster ``data`` and store the resulting centroids.

        Args:
            data: 2-D array-like, one row per sample.
            clusters: Number of clusters; must not exceed the number of rows
                because the initial centroids are drawn without replacement.
            epochs: Accepted for interface compatibility; currently unused.
                Iteration runs until the assignments stop changing.
            random_seed: Seed passed to ``np.random.seed`` (this reseeds
                NumPy's global random state).

        Returns:
            Integer array with the cluster index assigned to each sample.
        """
        data = np.asarray(data)
        # Centroids are means, so they must be floating point; with integer
        # data the in-place centroid update would silently truncate.
        if not np.issubdtype(data.dtype, np.floating):
            data = data.astype(float)

        np.random.seed(random_seed)
        n_samples = len(data)
        # Fancy indexing returns a copy, so updating centroids never
        # modifies the caller's data.
        centroids = data[np.random.choice(n_samples, clusters, replace=False), :]

        previous_labels = None
        while True:
            distances = np.linalg.norm(data[:, None, :] - centroids, axis=2)
            labels = np.argmin(distances, axis=1)
            for j in range(clusters):
                members = data[labels == j]
                # An empty cluster keeps its previous centroid; averaging an
                # empty selection would yield NaN, and NaN distances make
                # argmin flip labels forever.
                if len(members):
                    centroids[j] = members.mean(axis=0)

            # Converged once an iteration leaves every assignment unchanged.
            if previous_labels is not None and np.array_equal(labels, previous_labels):
                break
            previous_labels = labels

        self.centroids = centroids
        return labels

    def predict(self, data):
        """Return the index of the nearest fitted centroid for each row."""
        data = np.asarray(data)
        distances = np.linalg.norm(data[:, None, :] - self.centroids, axis=2)
        return np.argmin(distances, axis=1)
  

class KMedoids:
    """K-medoids clustering with Manhattan (L1) distance.

    Starting from randomly chosen medoids, ``fit`` repeatedly applies the
    single (cluster, candidate point) swap that most reduces the summed
    distance of that cluster's current members to its medoid, and stops when
    no swap lowers that cost.

    Attributes:
        medoids: Array of medoid points, one row per cluster, set by ``fit``.
        data: The samples being clustered (2-D array).
        N: Number of samples.
        clusters: Number of clusters.
    """

    def __init__(self, data, clusters):
        # Placeholder; replaced by the fitted medoids in fit().
        self.medoids = np.empty(1)
        self.data = np.asarray(data)
        self.N = len(self.data)
        self.clusters = clusters

    def fit(self, random_seed=42):
        """Cluster the stored data and remember the resulting medoids.

        Args:
            random_seed: Seed passed to ``np.random.seed`` (this reseeds
                NumPy's global random state).

        Returns:
            Integer array with the cluster index assigned to each sample.
        """
        np.random.seed(random_seed)
        data = self.data
        n_samples = self.N
        clusters = self.clusters

        medoids_idx = np.random.choice(n_samples, clusters, replace=False)
        medoids = data[medoids_idx].copy()

        # distances[p, c] is the L1 distance from sample p to medoid c.
        distances = np.zeros((n_samples, clusters))
        for i in range(clusters):
            distances[:, i] = np.sum(np.abs(data - medoids[i]), axis=1)
        labels = np.argmin(distances, axis=1)
        all_idxs = np.arange(n_samples)

        while True:
            best_swap = None
            best_change = 0
            best_distances = None
            # The candidate set does not depend on the cluster being
            # examined, so compute it once per pass from the *current*
            # medoid indices.
            candidates = all_idxs[~np.isin(all_idxs, medoids_idx)]

            for i in range(clusters):
                members = labels == i
                current_cost = np.sum(distances[members, i])
                for j in candidates:
                    new_distances = np.sum(np.abs(data - data[j]), axis=1)
                    cost_change = np.sum(new_distances[members]) - current_cost
                    # Strict comparison: on ties the first swap found wins.
                    if cost_change < best_change:
                        best_swap = (i, j)
                        best_change = cost_change
                        best_distances = new_distances

            if best_swap is None:
                break

            i, j = best_swap
            distances[:, i] = best_distances
            medoids[i] = data[j]
            # Keep the index list in sync with the medoids; otherwise the
            # replaced medoid could never be selected again and the new one
            # would still be offered as a candidate.
            medoids_idx[i] = j
            labels = np.argmin(distances, axis=1)

        self.medoids = medoids
        return labels

    def predict(self, data):
        """Return the index of the nearest fitted medoid (L1) for each row."""
        data = np.asarray(data)
        distances = np.zeros((len(data), self.clusters))
        for i in range(self.clusters):
            distances[:, i] = np.sum(np.abs(data - self.medoids[i]), axis=1)
        return np.argmin(distances, axis=1)
  
  
class EnsembleClustering:
    """Combine a KMeans and a KMedoids clustering of the same data by voting.

    Attributes:
        data: Samples handed to both underlying models in ``fit``.
        clusters: Number of clusters requested from both models.
        kmeans: The fitted KMeans model, or None before ``fit``.
        kmedoids: The fitted KMedoids model, or None before ``fit``.
    """

    def __init__(self, data, clusters):
        self.clusters = clusters
        self.data = data
        # Both models are created and stored by fit().
        self.kmedoids = None
        self.kmeans = None

    def fit(self):
        """Fit both models on the stored data and return the voted labels."""
        mean_model = KMeans()
        mean_labels = mean_model.fit(self.data, self.clusters)
        self.kmeans = mean_model

        medoid_model = KMedoids(data=self.data, clusters=self.clusters)
        medoid_labels = medoid_model.fit()
        self.kmedoids = medoid_model

        return self.maximumVoting(mean_labels, medoid_labels)

    def maximumVoting(self, labels1, labels2):
        """Pick, per sample, the label with the most votes of the two inputs.

        Each sample gets one vote from ``labels1`` and one from ``labels2``.
        ``argmax`` returns the first maximum, so when the two labels differ
        the smaller label index is chosen.

        Returns:
            Integer array with one label per entry of ``labels1``.
        """
        winners = np.zeros(len(labels1), dtype=int)
        for pos, first_vote in enumerate(labels1):
            tally = np.zeros(self.clusters, dtype=int)
            tally[first_vote] += 1
            tally[labels2[pos]] += 1
            winners[pos] = np.argmax(tally)
        return winners

    def predict(self, data):
        """Predict with both fitted models and return the voted labels."""
        mean_labels = self.kmeans.predict(data)
        medoid_labels = self.kmedoids.predict(data)
        return self.maximumVoting(mean_labels, medoid_labels)