# Assign each point to the cluster with the closest centroid (Python)
def kmeans(X, k, maxiter, seed=None):
    """
    Cluster the rows of X into k groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : 2-D array, one data point per row
    k : number of clusters; must satisfy 1 <= k <= number of rows in X
    maxiter : maximum number of assignment/update iterations; must be >= 1
    seed : optional seed; when given, NumPy's global random state is
        re-seeded with it before the initial centroids are drawn

    Returns
    -------
    centroids : array with one row per cluster
    cluster_assignment : for every row of X, the index of the closest
        centroid as computed in the last iteration that ran (if maxiter is
        exhausted before convergence, the returned centroids are one update
        step ahead of these assignments)

    Raises
    ------
    ValueError
        If k is outside [1, number of rows] or maxiter is smaller than 1.
    """
    n_row, n_col = X.shape
    if k < 1 or k > n_row:
        raise ValueError(
            "k must be between 1 and the number of data points "
            "({}), got {}".format(n_row, k)
        )
    if maxiter < 1:
        raise ValueError("maxiter must be at least 1, got {}".format(maxiter))

    # randomly choose k *distinct* data points as initial centroids;
    # sampling with replacement could pick the same row twice, which
    # leaves one of the duplicated clusters empty from the very first pass
    if seed is not None:
        np.random.seed(seed)
    rand_indices = np.random.choice(n_row, size=k, replace=False)
    centroids = X[rand_indices]

    for _ in range(maxiter):
        # compute distances between each data point and the set of centroids
        # and assign each data point to the closest centroid
        distances_to_centroids = pairwise_distances(X, centroids, metric='euclidean')
        cluster_assignment = np.argmin(distances_to_centroids, axis=1)

        # the new centroid of cluster i is the per-feature mean of its members;
        # a cluster can still end up empty (e.g. duplicated rows in X), in
        # which case its previous centroid is kept, because the mean of an
        # empty selection is NaN and would also defeat the convergence check
        updated = []
        for i in range(k):
            members = X[cluster_assignment == i]
            if len(members) > 0:
                updated.append(members.mean(axis=0))
            else:
                updated.append(centroids[i])
        new_centroids = np.array(updated)

        # if the updated centroids are still the same,
        # then the algorithm converged
        if np.array_equal(centroids, new_centroids):
            break
        centroids = new_centroids

    return centroids, cluster_assignment