1. K-Means Clustering

1.1 import modules

    #show the figures buildin the notebook
    %matplotlib inline 
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.io #Used to load the OCTAVE *.mat files
    import scipy.misc #Used to show matrix as an image
    from scipy import optimize
    import random

1.2 import sample data

    fileName = 'ex7/data/ex7data2.mat'
    mat = scipy.io.loadmat(fileName)
    X = mat['X']

1.3 find the closest for each sample

the algorithm as the following:

for i in xrange(iteration):
    idx = findClosedCentroids(X,centroids)
    centroids = computeMeans(X, idx, K)

    def findClosedCentroid(X, centroids):
        """
        return the indexs of the centroids for each sample in X
        """
        ids = np.zeros((X.shape[0],1)) * (-1)
        for i in xrange(X.shape[0]):
            ids[i] = findClosest(X[i],centroids)
        return ids
    
    def findClosest(point,centroids):
        """
        find the closest one in centroids array for the given sample
        return the index of the closed one
        """
        result = np.zeros((len(centroids),1))
        for i in xrange(len(centroids)):
            result[i] = distSquared(point, centroids[i])
        return np.argmin(result,axis=0)
    
    def distSquared(point1, point2):
        """
        return the distance of these 2 points
        """
        assert point1.shape == point2.shape
        return np.sum(np.square(point1 - point2))
    
    def computeCentroids(X, ids):
        """
        first, according the distinguish values in ids to get the value:K
        then, make good use of K to compute the K-centroids points(mean value)
        """
        k = len(np.unique(ids))
        
        result = np.empty((k,X.shape[1]))
        #print result.shape
        for i in xrange(k):
            result[i]=np.mean(X[ids.flatten()==i],axis=0)
        return result
    
    initial_centroids = np.array([[3,3],[6,2],[8,5]]);
    ids = findClosedCentroid(X,initial_centroids)
    centroids = computeCentroids(X,ids)
    print "the index for the init centroids:%s" % ids[0:3].flatten()
    print "the first centroids we compute by the init centroids is:%s" % centroids

the output is:

the index for the init centroids:[ 0. 2. 1.] the first centroids we compute by the init centroids is:[[ 2.42830111 3.15792418] [ 5.81350331 2.63365645] [ 7.11938687 3.6166844 ]]

    def calMeanK(X, centroids, maxIter=100):
        cent_history=[]
        for i in xrange(maxIter):
            cent_history.append(centroids)
            ids = findClosedCentroid(X,centroids)
            cent = computeCentroids(X,ids)
            if np.sum(cent!=centroids) == 0:
                break
            else:
                centroids = cent
        cent_history.append(cent)
        return ids,cent_history
    ids,cent_history=calMeanK(X,initial_centroids,100)
    print cent_history

The output is:

    [array([[3, 3],
           [6, 2],
           [8, 5]]), array([[ 2.42830111,  3.15792418],
           [ 5.81350331,  2.63365645],
           [ 7.11938687,  3.6166844 ]]), array([[ 2.31325526,  3.22830617],
           [ 5.33273768,  2.43159599],
           [ 6.8653618 ,  3.23293995]]), array([[ 2.19692479,  3.42136707],
           [ 4.83555397,  2.12976745],
           [ 6.6560054 ,  3.0751355 ]]), array([[ 1.98241171,  4.0250785 ],
           [ 3.91150763,  1.47060546],
           [ 6.34008592,  3.05366642]]), array([[ 1.95399466,  5.02557006],
           [ 3.12663743,  1.1121712 ],
           [ 6.12919526,  3.01606258]]), array([[ 1.95399466,  5.02557006],
           [ 3.04367119,  1.01541041],
           [ 6.03366736,  3.00052511]]), array([[ 1.95399466,  5.02557006],
           [ 3.04367119,  1.01541041],
           [ 6.03366736,  3.00052511]])]

compress the image

load and display

    fileName = 'ex7/data/bird_small.png'
    A = scipy.misc.imread(fileName)
    print "the loaded image's data shape:",A.shape
    plt.figure()
    dummy = plt.imshow(A)
    plt.show()

the output is:

the loaded image’s data shape: (128, 128, 3)

png

normalization

    A = A / 255.0 # after the normalization, then the feature is in the range[0,1]
    print A.shape
    B = A.reshape((A.shape[0] * A.shape[1],3)) # can also use (-1,3)
    
    myK = 16 # use the 16 centroid point to stand for all the points in the image
    init_centroids = random.sample(B, myK)
    print init_centroids

the output is:

    (128, 128, 3)
    [array([ 0.47058824,  0.31764706,  0.21568627]), array([ 0.89803922,  0.76470588,  0.4745098 ]), array([ 0.57254902,  0.50196078,  0.17647059]), array([ 0.09411765,  0.10980392,  0.12156863]), array([ 0.06666667,  0.07058824,  0.05882353]), array([ 0.84313725,  0.69019608,  0.39215686]), array([ 0.69019608,  0.54901961,  0.56862745]), array([ 0.36078431,  0.22745098,  0.21960784]), array([ 0.43921569,  0.3254902 ,  0.33333333]), array([ 0.23529412,  0.21960784,  0.21568627]), array([ 0.76470588,  0.6745098 ,  0.41176471]), array([ 0.99607843,  0.95294118,  0.74509804]), array([ 0.11764706,  0.1254902 ,  0.1254902 ]), array([ 0.85882353,  0.76078431,  0.52941176]), array([ 0.92156863,  0.86666667,  0.5254902 ]), array([ 0.82745098,  0.70196078,  0.45490196])]

cal the centroids

    idxs, centroid_history = calMeanK(B,init_centroids,10)
    print idxs

    [[  1.]
     [  1.]
     [  1.]
     ..., 
     [  9.]
     [ 12.]
     [ 12.]]

compress and display

    final_cent = centroid_history[-1]
    cover_A = np.array([final_cent[int(i)] for i in idxs.flatten()])
    cover_B = cover_A.reshape(A.shape)
    plt.figure()
    plt.imshow(cover_B)
    plt.show()

png

Coursera:Machine Learning by Andrew Ng(7) - Compression with K-means

1. K-Means Clustering

1.1 import modules

1.2 import sample data

1.3 find the closest for each sample

compress the image

load and display

normalization

cal the centroids

compress and display

Comments