# K-means clustering with a k-means++ like initialization mode
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.linalg import Vectors
# Four 2-D points forming two obvious clusters: two near (0, 0), two near (8.5, 8.5).
data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
        (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
# NOTE(review): assumes an active SparkSession bound to `spark` and a writable
# directory path in `temp_path`, both supplied by the surrounding doctest
# fixture — confirm against the caller/test harness.
df = spark.createDataFrame(data, ["features"])

# Fit k-means with k=2; a fixed seed makes the clustering deterministic.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)
centers = model.clusterCenters()
len(centers)
# 2
model.computeCost(df)
# 2.000...

# Assign each point to a cluster; points within the same group must agree.
transformed = model.transform(df).select("features", "prediction")
rows = transformed.collect()
rows[0].prediction == rows[1].prediction
# True
rows[2].prediction == rows[3].prediction
# True

# A freshly fitted model carries a training summary.
model.hasSummary
# True
summary = model.summary  # FIX: `summary` was used below without ever being bound
summary.k
# 2
summary.clusterSizes
# [2, 2]

# Round-trip the estimator through save/load; parameters survive persistence.
kmeans_path = temp_path + "/kmeans"
kmeans.save(kmeans_path)
kmeans2 = KMeans.load(kmeans_path)
kmeans2.getK()
# 2

# Round-trip the fitted model; the summary is not persisted, but the learned
# cluster centers are identical after reload.
model_path = temp_path + "/kmeans_model"
model.save(model_path)
model2 = KMeansModel.load(model_path)
model2.hasSummary
# False
model.clusterCenters()[0] == model2.clusterCenters()[0]
# array([ True, True], dtype=bool)
model.clusterCenters()[1] == model2.clusterCenters()[1]  # FIX: was `clustersCenters`, an AttributeError
# array([ True, True], dtype=bool)