# Clustering in Python
# HIERARCHICAL CLUSTERING
# Import the linkage, dendrogram and fcluster functions from SciPy
from scipy.cluster.hierarchy import dendrogram,linkage,fcluster
import matplotlib.pyplot as plt
import numpy as np
# Build the linkage from the observations in X; Z holds all the information
# about the successive joins performed while clustering.
# NOTE(review): X is not defined in this part of the file; it is assumed to be
# a 2-D array of observations created earlier -- confirm.
# "ward" selects the linkage method (alternatives include "single",
# "average", ...).
Z = linkage(X, method="ward")

# Plot the dendrogram on a 25 x 30 figure.
plt.figure(figsize=(25, 30))
# color_threshold is the distance cutoff used when coloring the clusters.
dendrogram(
    Z,
    color_threshold=10,
    leaf_font_size=8,
)
plt.show()
# fcluster returns an array with one cluster label per observation (as long
# as the data set). The hierarchy can be cut using different "criterion"
# values. Two examples follow; each result is stored under its own name so
# the second call does not silently discard the first.

# Example 1: you want at most k = 4 flat clusters.
k = 4
clusters_maxclust = fcluster(Z, k, criterion="maxclust")

# Example 2: you want the max (cophenetic) distance inside a cluster to be
# max_d = 20. `clusters` keeps these distance-based labels, as before.
max_d = 20
clusters = fcluster(Z, max_d, criterion="distance")
# Visualize the clustering in 2-D using the first two columns of X.
plt.figure(figsize=(10, 8))
# Each point is colored by its label in `clusters`; cmap selects the color
# palette used to map labels to colors.
plt.scatter(
    x=X[:, 0],
    y=X[:, 1],
    c=clusters,
    cmap="brg",
)
plt.show()
