Logistic regression on its own only solves binary classification problems, so what can we do when there are multiple output labels?
There are two solutions:
- one-vs-all
- neural network
 
load the data and visualize it
import the needed modules
    # show the figures inline in the notebook
    %matplotlib inline 
    import numpy as np
    import matplotlib.pyplot as plt
    import scipy.io #Used to load the OCTAVE *.mat files
    import scipy.misc #Used to show matrix as an image
    from scipy import optimize
    import matplotlib.cm as cm #Used to display images in a specific colormap
    import random
load the data from the .mat file as matrices
    fileName = 'ex3/data/ex3data1.mat'
    samples = scipy.io.loadmat(fileName)
    # extract the (X, y) samples
    X = samples['X']
    X = np.insert(X, 0, 1, axis=1)  # prepend a column of ones (the bias feature)
    Y = samples['y']
    # the digit 0 is labeled as 10 in the original data, so map it back to 0
    Y = np.array([i % 10 for i in Y])

    print "X's shape is: %s, X[0]'s shape is: %s" % (X.shape, X[0:1].shape)
    print "y's shape is: %s, unique elements in y: %s" % (Y.shape, np.unique(Y))
X's shape is: (5000, 401), X[0]'s shape is: (1, 401)
y's shape is: (5000, 1), unique elements in y: [0 1 2 3 4 5 6 7 8 9]

Each of the 5000 samples is a 20 x 20 grayscale image unrolled into 400 pixel features, plus the bias column we inserted, giving 401 features per sample.
visualize the samples
    def myPlotImg(myX):
        xPixes, yPixes = (20, 20)  # each sample is a 20 x 20 pixel image
        nrows, ncols = (10, 10)    # display a 10 x 10 grid of samples
        pad = 1                    # one blank pixel between neighboring tiles

        # big_pic stores the 100 randomly chosen samples used for visualization
        big_pic = np.zeros((nrows * (xPixes + pad), ncols * (yPixes + pad)))

        for i in xrange(nrows):
            for j in xrange(ncols):
                # randomly pick a sample and reshape it to a 20 * 20 matrix,
                # ignoring the first column that holds the bias value 1
                sample = myX[random.randint(0, myX.shape[0] - 1)]
                sample = sample[1:].reshape(xPixes, yPixes)

                rs = pad + i * (xPixes + pad)  # accumulate the padding per tile
                cs = pad + j * (yPixes + pad)
                big_pic[rs:rs + xPixes, cs:cs + yPixes] = sample.T

        plt.figure(figsize=(6, 6))
        img = scipy.misc.toimage(big_pic)  # toimage exists in older SciPy releases
        plt.xlabel('x coordinate')
        plt.ylabel('y coordinate')
        plt.imshow(img, cmap=cm.Greys_r)
        plt.show()

    myPlotImg(X)
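For a closer look at an individual example, the same reshape-and-transpose trick works on a single row (the transpose is needed because the .mat file stores each image column-major, which is also why the code above uses sample.T). A minimal sketch, with an arbitrarily chosen row index:

    # display a single training example at full size (illustrative sketch)
    idx = 2500                            # any row index in [0, 4999]
    digit = X[idx, 1:].reshape(20, 20).T  # drop the bias column, transpose to display upright
    plt.imshow(digit, cmap=cm.Greys_r)
    plt.title('label: %d' % Y[idx, 0])
    plt.show()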

one-vs-all
For this solution, we reuse logistic regression directly, reducing the multi-class problem to several binary ones.
Mechanism
For each label in turn, we treat it as the positive class (y=1) and all the remaining labels as the negative class (y=0), then train one binary classifier per label.
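Roughly, in the course's notation: for each label k we fit its own parameter vector, and at prediction time we pick the label whose classifier reports the highest probability:

    h_\theta^{(k)}(x) = g\left(\theta^{(k)\top} x\right) = P(y = k \mid x;\, \theta^{(k)}), \qquad k = 0, 1, \dots, 9

    \hat{y} = \arg\max_k \, h_\theta^{(k)}(x)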
implementation
implement the cost and gradient functions
    # define the sigmoid function
    def sigmoid(z):
        # np.exp(-z) may emit a harmless overflow warning for very negative z
        return 1.0 / (1.0 + np.exp(-z))

    # define the cost function with regularization
    def costFun(theta, myX, myY, mylambda=0.0):
        theta = theta.reshape(-1, 1)

        z = np.dot(myX, theta)  # use the myX argument, not the global X
        T = sigmoid(z)

        m = len(myX)
        result = -np.sum(myY * np.log(T) + (1 - myY) * np.log(1 - T)) / m
        theta2 = theta[1:, :]  # the bias term is not regularized
        result = result + (mylambda / (2.0 * m)) * float(np.dot(theta2.T, theta2))
        if np.isnan(result):
            return np.inf
        return result

    # define the gradient function of the cost function
    def gradientFun(theta, myX, myY, mylambda=0.0):
        theta = theta.reshape(-1, 1)
        Z = np.dot(myX, theta)  # m * 1
        T = sigmoid(Z)
        m = len(myX)
        result = np.dot(myX.T, T - myY) / m  # n * 1
        reg = np.copy(theta)  # copy so we do not mutate the optimizer's theta in place
        reg[0] = 0            # the bias term is not regularized
        result = result + (mylambda / m) * reg
        return result.flatten()
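Before training all ten classifiers, it is worth sanity-checking these two functions. With theta = 0 the unregularized cost should be log(2) ≈ 0.693, since the sigmoid outputs 0.5 for every sample, and the analytic gradient should agree with a finite-difference estimate. A minimal sketch; numericalGradient is a helper defined here for the check, not a library function:

    # sanity checks for costFun and gradientFun (illustrative sketch)
    tY = (Y == 3).astype(int)  # any one-vs-all target will do
    print "cost at theta = 0: %f (expect ~0.693)" % costFun(np.zeros(X.shape[1]), X, tY)

    # compare the analytic gradient against central finite differences
    def numericalGradient(costF, theta, args, eps=1e-4):
        grad = np.zeros_like(theta)
        for i in xrange(len(theta)):
            e = np.zeros_like(theta)
            e[i] = eps
            grad[i] = (costF(theta + e, *args) - costF(theta - e, *args)) / (2 * eps)
        return grad

    theta0 = np.random.rand(X.shape[1]) * 0.01
    numGrad = numericalGradient(costFun, theta0, (X, tY, 1.0))
    anaGrad = gradientFun(theta0, X, tY, 1.0)
    print "max gradient difference: %g" % np.max(np.abs(numGrad - anaGrad))

With the gradient verified, we can train one classifier per digit: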
    
    def oneVSAll(myX, myY, mylambda=0.0):
        # get all the possible labels
        labels = np.unique(myY)

        theta = np.empty((len(labels), myX.shape[1]))  # num_labels * (n+1)
        for xclass in labels:
            # treat xclass as the positive label and every other digit as negative
            print "optimizing for the handwriting number:%d" % xclass
            tY = np.array([1 if v == xclass else 0 for v in myY.flatten()]).reshape(-1, 1)  # m * 1
            res = optimize.minimize(costFun, np.zeros(myX.shape[1]), args=(myX, tY, mylambda),
                                    method='BFGS', jac=gradientFun,
                                    options={'maxiter': 100, 'disp': True})
            theta[xclass] = res.x

        print "one vs all Done"

        return theta

    theta = oneVSAll(X, Y, mylambda=0.1)
The following is the optimization log. Every classifier hits the 100-iteration cap; the final cost differs per digit, with 8 and 9 ending highest, which suggests those two digits are the hardest to separate:
optimizing for the handwriting number:0
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.011179
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwriting number:1
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.016092
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103
optimizing for the handwriting number:2
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.057727
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwriting number:3
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.063839
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwriting number:4
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.039164
         Iterations: 100
         Function evaluations: 105
         Gradient evaluations: 105
optimizing for the handwriting number:5
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.062603
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwriting number:6
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.023639
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwriting number:7
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.036296
         Iterations: 100
         Function evaluations: 103
         Gradient evaluations: 103
optimizing for the handwriting number:8
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.084091
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
optimizing for the handwriting number:9
Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.077913
         Iterations: 100
         Function evaluations: 102
         Gradient evaluations: 102
one vs all Done
measure the prediction accuracy on the training set
    # predict a digit for each sample by picking the most confident classifier
    def predictOneVSAll(all_theta, myX):
        z = np.dot(myX, all_theta.T)  # m * num_labels
        t = sigmoid(z)
        # argmax returns the column index, which equals the predicted digit
        # because row k of all_theta is the classifier trained for digit k
        result = np.argmax(t, axis=1).reshape(-1, 1)
        return result

    predict = predictOneVSAll(theta, X)
    accuracy = float(np.sum(predict == Y)) / len(Y) * 100
    print "training set accuracy:%0.2f" % accuracy
training set accuracy:94.96
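The classifiers are unlikely to be equally good on every digit. A quick per-digit breakdown (a sketch reusing predict and Y from above) shows which digits are hardest:

    # per-digit accuracy on the training set (illustrative sketch)
    for digit in np.unique(Y):
        mask = (Y == digit).flatten()
        acc = float(np.sum(predict[mask] == Y[mask])) / np.sum(mask) * 100
        print "digit %d: %0.2f%% correct" % (digit, acc)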