LW Senior Architect

Coursera: Machine Learning by Andrew Ng (3) - One-vs-All with Logistic Regression


Logistic regression on its own only solves binary classification problems, so what can we do when there are multiple output labels?

There are two solutions:

  • one-vs-all
  • neural network

load and visualize the data

import the needed modules

    # show the figures inline in the notebook
    %matplotlib inline
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.io #Used to load the OCTAVE *.mat files
    from scipy import optimize
    import matplotlib.cm as cm #Used to display images in a specific colormap
    import random

load the data from the .mat file as matrices

    fileName='ex3/data/ex3data1.mat'
    samples = scipy.io.loadmat(fileName)


    # extract the (X, y) samples
    X = samples['X']
    X = np.insert(X, 0, 1, axis=1)  # prepend a column of 1s for the bias term
    Y = samples['y']
    # the handwritten digit 0 is labeled as 10 in the data set, so remap 10 -> 0
    Y = Y % 10

    print "X's shape is: %s, X[0]'s shape is: %s" % (X.shape, X[0:1].shape)
    print "y's shape is: %s, unique elements in y: %s" % (Y.shape, np.unique(Y))

X's shape is: (5000, 401), X[0]'s shape is: (1, 401)
y's shape is: (5000, 1), unique elements in y: [0 1 2 3 4 5 6 7 8 9]
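
As a sanity check on the remapping (a minimal sketch), we can count how many samples carry each label; the counts should come out balanced across the ten digits:

    # count samples per label after the 10 -> 0 remapping
    print np.bincount(Y.flatten())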

visualize the samples

    def myPlotImg(myX):
        xPixels, yPixels = (20, 20)  # each sample is a 20 * 20 pixel image
        nrows, ncols = (10, 10)      # show a 10 * 10 grid of samples
        pad = 1                      # one blank pixel around each tile
        
        # canvas that holds the 100 randomly chosen samples to visualize
        big_pic = np.zeros((nrows * (xPixels + pad), ncols * (yPixels + pad)))
        
        for i in xrange(nrows):
            for j in xrange(ncols):
                # randomly pick a sample and reshape it to a 20 * 20 matrix,
                # skipping the first column that holds the bias value 1
                sample = myX[random.randint(0, myX.shape[0] - 1)]
                sample = sample[1:].reshape(xPixels, yPixels)
                
                r = pad + i * (xPixels + pad)
                c = pad + j * (yPixels + pad)
                big_pic[r:r + xPixels, c:c + yPixels] = sample.T
        
        plt.figure(figsize=(6,6))
        plt.imshow(big_pic, cmap=cm.Greys_r)
        plt.show()
                
    myPlotImg(X)

(figure: a 10 × 10 grid of randomly selected handwritten digit samples)

one-vs-all

This solution reuses logistic regression directly.

Mechanism

We treat one label as the positive class (y=1) and all the other labels as the negative class (y=0), and train one such binary classifier for every label. To classify a new sample, we run all of the classifiers and pick the label whose classifier outputs the highest probability, as the formulas below show.
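
In the course's notation (with g denoting the sigmoid function), for each label k we fit a separate parameter vector \theta^{(k)}, and the final prediction takes the most confident classifier:

    h^{(k)}_\theta(x) = g\big(\theta^{(k)\top} x\big) = P(y = k \mid x;\ \theta^{(k)}),
    \qquad
    \hat{y} = \arg\max_{k}\, h^{(k)}_\theta(x)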

implementation

implement the cost and gradient functions
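
For reference, the regularized cost function and its gradient that the code below implements are (note that the bias parameter \theta_0 is not regularized):

    J(\theta) = -\frac{1}{m} \sum_{i=1}^{m} \Big[ y^{(i)} \log h_\theta(x^{(i)})
                + \big(1 - y^{(i)}\big) \log\big(1 - h_\theta(x^{(i)})\big) \Big]
                + \frac{\lambda}{2m} \sum_{j=1}^{n} \theta_j^2

    \frac{\partial J}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^{m} \big(h_\theta(x^{(i)}) - y^{(i)}\big)\, x^{(i)}_j
                + \frac{\lambda}{m}\, \theta_j \qquad (j \geq 1)

In code: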

    # define the sigmoid function
    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))
    
    # define the cost function with regularization
    def costFun(theta, myX, myY, mylambda=0.0):
        theta = theta.reshape(-1,1)
        m = len(myX)
        T = sigmoid(np.dot(myX, theta))  # m * 1 hypothesis values
        result = -np.sum(myY * np.log(T) + (1 - myY) * np.log(1 - T)) / m
        theta2 = theta[1:,:]  # skip theta_0: it is not regularized
        result = result + (mylambda / (2.0 * m)) * np.sum(theta2 ** 2)
        if np.isnan(result):  # log(0) yields nan; treat it as infinite cost
            return np.inf
        return result
    
    # define the gradient of the cost function
    def gradientFun(theta, myX, myY, mylambda=0.0):
        theta = theta.reshape(-1,1)
        m = len(myX)
        Z = np.dot(myX, theta)  # m * 1
        T = sigmoid(Z)
        result = np.dot(myX.T, T - myY) / m  # (n+1) * 1
        reg = theta.copy()  # work on a copy so the optimizer's theta is not mutated
        reg[0] = 0          # do not regularize theta_0
        result = result + (mylambda / m) * reg
        return result.flatten()
    
    def oneVSAll(myX, myY, mylambda=0.0):
        # get all the possible labels
        labels = np.unique(myY)
        
        theta = np.empty((len(labels), myX.shape[1]))  # num_labels * (n+1)
        for xclass in labels:
            # one vs all: label xclass is positive, every other label is negative
            print "optimizing for the handwritten digit: %d" % xclass
            tY = np.array([1 if v == xclass else 0 for v in myY.flatten()]).reshape(-1,1)  # m * 1
            res = optimize.minimize(costFun, np.zeros(myX.shape[1]), args=(myX, tY, mylambda),
                                    method='BFGS', jac=gradientFun,
                                    options={'maxiter':100, 'disp':True})
            theta[xclass] = res.x
        
        print "one vs all Done"
        
        return theta
    
    theta = oneVSAll(X, Y, mylambda=0.1)

The optimizer prints the following log:

optimizing for the handwritten digit: 0
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.011179
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwritten digit: 1
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.016092
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103
optimizing for the handwritten digit: 2
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.057727
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwritten digit: 3
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.063839
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwritten digit: 4
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.039164
         Iterations: 100
         Function evaluations: 105
         Gradient evaluations: 105
optimizing for the handwritten digit: 5
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.062603
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwritten digit: 6
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.023639
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwritten digit: 7
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.036296
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103
optimizing for the handwritten digit: 8
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.084091
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwritten digit: 9
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.077913
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
one vs all Done
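
Before trusting these parameters, it is worth sanity-checking the analytic gradient. Below is a minimal sketch using scipy.optimize.check_grad, which compares gradientFun against a finite-difference approximation of costFun; the small random theta_test and the "digit 1 vs the rest" labels are arbitrary choices for illustration:

    # compare the analytic gradient with a numerical approximation;
    # a small value (e.g. below 1e-4) suggests the gradient is correct
    theta_test = np.random.randn(X.shape[1]) * 0.01
    tY = (Y == 1).astype(int)  # binary labels: digit 1 vs the rest
    err = optimize.check_grad(costFun, gradientFun, theta_test, X, tY, 0.1)
    print "gradient check error: %e" % err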

measure the accuracy on the training set

    # define the prediction function: return the predicted digit for each sample
    def predictOneVSAll(all_theta, myX):
        z = np.dot(myX, all_theta.T)  # m * num_labels
        t = sigmoid(z)
        # pick the label whose classifier is most confident
        result = np.argmax(t, axis=1).reshape(-1,1)
        return result
    
    predict = predictOneVSAll(theta, X)
    accuracy = float(np.sum(predict == Y)) / len(Y) * 100
    print "training set accuracy: %0.2f" % accuracy

training set accuracy: 94.96
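
As a quick spot check (the sample index is picked at random, so the output will vary), we can compare a single prediction against its true label:

    # compare one randomly chosen sample's prediction with the ground truth
    i = random.randint(0, X.shape[0] - 1)
    print "sample %d: predicted %d, actual %d" % (i, predict[i][0], Y[i][0])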

