Homework no.3 Machine Learning#

Stu. Name: Mohammad Amin Dadgar

Stu. Id: 4003624016

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

Q1#

## read the images of the handwritten digits 0 to 4
image_no0 = plt.imread('datasets/usps_0.jpg')
image_no1 = plt.imread('datasets/usps_1.jpg')
image_no2 = plt.imread('datasets/usps_2.jpg')
image_no3 = plt.imread('datasets/usps_3.jpg')
image_no4 = plt.imread('datasets/usps_4.jpg')
## show one of the images
plt.imshow(image_no0)
plt.show()
[Figure: the full USPS image containing the digit-0 samples]
## take a look at one of the characters
plt.imshow(image_no0[:16, :16])
[Figure: a single 16x16 digit patch]
## gather the loaded images; each file is a grid of 16x16-pixel digit patches
images_arr = [image_no0, image_no1, image_no2, image_no3, image_no4]

## each image is 16 by 16 pixels
IMAGE_SIZE_X = 16
IMAGE_SIZE_Y = 16

## feature space size is the multiplication of width and height
FEATURE_SPACE_SIZE = IMAGE_SIZE_X * IMAGE_SIZE_Y

## create pandas columns
cols = []
for i in range(0, FEATURE_SPACE_SIZE):
    cols.append(f"feature_{i}")
## there must be a label for each image
cols.append('label')
## collect one row per 16x16 patch, then build the DataFrame in one go
dataset_rows = []

images = []
## the label of each handwritten digit is the index of its image in images_arr
for label, image in enumerate(images_arr):
    images.append([])
    ## iterate over the image columns in 16-pixel steps
    for y_idx in np.arange(0, image.shape[1] - IMAGE_SIZE_Y + 1, IMAGE_SIZE_Y):
        ## iterate over the image rows in 16-pixel steps
        for x_idx in np.arange(0, image.shape[0] - IMAGE_SIZE_X + 1, IMAGE_SIZE_X):
            ## cut out one 16x16 patch and keep it under its label
            img = np.array(image[x_idx: x_idx + IMAGE_SIZE_X, y_idx: y_idx + IMAGE_SIZE_Y])
            images[label].append(img)

            ## flatten the patch to 256 features and append the label as the last value
            row = np.append(img.flatten(), label)
            dataset_rows.append(pd.Series(row, index=cols))

## DataFrame.append was removed in recent pandas versions, so the rows are collected first
dataset_df = pd.DataFrame(dataset_rows, columns=cols)
## save the images dataset into a csv file
dataset_df.to_csv('datasets/usps_images.csv', index=False)

Now we can view one sample patch from each of the digit arrays.

fig, axes = plt.subplots(1,5, figsize=(30,5))
axes[0].imshow(images[0][0])
axes[0].set_title('digit 0')

axes[1].imshow(images[1][0])
axes[1].set_title('digit 1')

axes[2].imshow(images[2][0])
axes[2].set_title('digit 2')

axes[3].imshow(images[3][0])
axes[3].set_title('digit 3')

axes[4].imshow(images[4][0])
axes[4].set_title('digit 4')

plt.show()
[Figure: one sample patch from each of the five digit classes]
## if the dataset CSV already exists, start from here

dataset_df = pd.read_csv('datasets/usps_images.csv')
dataset_df.head()
feature_0 feature_1 feature_2 feature_3 feature_4 feature_5 feature_6 feature_7 feature_8 feature_9 ... feature_247 feature_248 feature_249 feature_250 feature_251 feature_252 feature_253 feature_254 feature_255 label
0 0 3 0 1 0 1 4 94 97 0 ... 73 10 106 4 3 0 9 0 0 0
1 4 0 0 5 0 10 1 0 1 24 ... 41 38 0 0 0 0 16 0 5 0
2 0 12 0 3 5 0 0 3 0 47 ... 128 116 51 17 2 0 8 5 0 0
3 8 0 0 3 0 0 0 0 76 123 ... 173 89 24 7 0 13 0 9 2 0
4 0 14 0 4 7 0 8 6 5 0 ... 1 9 0 6 0 0 0 10 0 0

5 rows × 257 columns
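
As a quick sanity check on the assembled dataset we can look at its shape and at the number of patches per digit (a minimal sketch; it assumes `dataset_df` was built or loaded as above):

## 256 pixel features plus one label column
print(dataset_df.shape)

## the label column should contain the five classes 0..4 with roughly equal counts
print(dataset_df['label'].value_counts().sort_index())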

Q2#

Apply a Naive Bayes model to the dataset. Split the dataset into train and test sets 10 times at random and train and evaluate the model on each split.
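
As a reminder of what the code below implements: with the naive (class-conditional independence) assumption and one univariate Gaussian per feature, the predicted class is the one that maximizes the product of the per-feature likelihoods. The class prior is dropped, as in the code, since the five digit classes are roughly balanced:

$$\hat{y} = \arg\max_{c} \prod_{j=1}^{256} \mathcal{N}\!\left(x_j \mid \mu_{j,c}, \sigma_{j,c}\right), \qquad \mathcal{N}(x \mid \mu, \sigma) = \frac{1}{\sqrt{2\pi}\,\sigma}\, e^{-\frac{(x-\mu)^2}{2\sigma^2}}$$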

X_train, X_test, Y_train, Y_test = train_test_split(dataset_df[dataset_df.columns[:-1]], 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=123)
## using the code we've written previously in Homework no.2

def probability_normal_distribution(X, mu, sigma):
    """
    The probability value for normal distribution function
    
    Parameters:
    ------------
    X : array_like
        the input data
    mu : float
        the given mean value
    sigma : float
        the given standard deviation
        
    Returns:
    --------
    probability : float
        the probability value for the x input values 
    """
    ## we've divided the equation in two parts
    p1 = 1 / (np.sqrt(np.pi * 2) * sigma)
    p2 = np.exp(-0.5 * ((X-mu) / sigma)**2 )
    
    probability = p1 * p2
    
    return probability
    
def find_MLE_Normal_distro(X):
    """
    the maximum likelihood estimation of the parameters of a univariate normal distribution
    (the mean and the standard deviation)
    
    Parameters:
    ------------
    X : array_like
        the X input data vectors
    
    Returns:
    ---------
    mu : float
        the sample mean
    sigma : float
        the standard deviation (square root of the ML variance estimate)

    """
    X = np.array(X)
    
    mu = (1 / len(X)) * np.sum(X)
    
    ## the square root is taken so that the function returns the standard deviation,
    ## which is what probability_normal_distribution expects as sigma
    sigma = np.sqrt((1 / len(X)) * np.sum((X - mu)**2))

    return mu, sigma

## divide the dataset into 0 and 1 labels
def estimate_MLE_NB(X, Y, features_arr):
    """
    estimate the Maximum likelihood parameters for naive bayes method
    in detail: in naive bayes we have a parameter for each dimension and each class

    Parameters:
    ------------
    X : array_like
        the input data (a pandas dataframe is preferred)
    Y : array_like
        the labels for each row of `X`
    features_arr : array_like
        the names of the feature (dimension) columns in the training data

    Returns:
    --------
    MLE_estimates : dictionary 
        the estimated parameters as a dictionary
    """
    ## dictionary of maximum likelihood estimations
    MLE_estimates = {}
    for feature in features_arr:
        for label in [0, 1, 2, 3, 4]:
            mu, var = find_MLE_Normal_distro(X[Y == label][feature])
            ## each feature of class estimation
            MLE_estimates[f'{feature},{label}'] = [mu, var]
    return MLE_estimates
mle_estimates = estimate_MLE_NB(X_train, Y_train, X_train.columns)
## for each feature and each class there is a mean and variance
len(mle_estimates)
1280
def predict_NB(X, MLE_estimations, features_arr):
    """
    predict the class Using Naive bayes algorithm
    
    Parameters:
    ------------
    X : pandas dataframe
        input data containing only the feature columns
    MLE_estimations : dictionary
        maximum likelihood estimates for each feature/class pair, keyed like `feature_0,0`
        (feature name first, then class label)
    features_arr : array_like
        the names of the feature columns to use for prediction
        
    Returns:
    ---------
    prediction : array_like
        the array representing the probability of each class for data
    """
    ## the predicted value for each data
    prediction = []
    for idx in range(len(X)):
        ## initialize Class probability array
        class_p_arr = []
        for i in [0, 1, 2, 3, 4]:
            ## multiply probability for each dimension
            p = 1
            for feature in features_arr:
                mu, var = MLE_estimations[f'{feature},{i}']
                ## scale each factor by 500 so the product of 256 small probabilities does not underflow to zero
                class_prob = probability_normal_distribution(X.iloc[idx][feature],mu, var) * 500
                p = p * class_prob
            class_p_arr.append(p)

        ## save each class probability of each data
        prediction.append(class_p_arr)
    
    ## for ease of use convert to numpy
    prediction = np.array(prediction)
    return prediction
def report_model(confusion_matrix):
    """
    Find accuracy, precision and recall of a model using its confusion matrix
    
    Parameters:
    ------------
    confusion_matrix : matrix_like
        the confusion matrix of the result
        
    Returns:
    ---------
    accuracy : float
        the accuracy of the model
    precision : float
        micro-averaged precision (for a multiclass confusion matrix this equals the accuracy)
    recall : float
        micro-averaged recall (for a multiclass confusion matrix this equals the accuracy)
    """
    ## False Positive
    FP = 0
    ## False Negative
    FN = 0
    ## True Positive
    TP = 0
    
    ## iterate the matrix
    for i in range(len(confusion_matrix)):
        TP += confusion_matrix[i, i]
        for j in range(len(confusion_matrix)):
            ## Skip True positive values
            if i != j:
                ## use the row of the matrix 
                FP += confusion_matrix[i, j]
                ## use the column of the matrix 
                FN += confusion_matrix[j, i]
    
    accuracy = TP / np.sum(confusion_matrix)
    precision =  TP / (TP + FP)
    recall = TP / (TP + FN)
    
    return accuracy, precision, recall
test_NB_results = predict_NB(X_test, mle_estimates, X_test.columns)
NB_test_class_pred = np.argmax(test_NB_results, axis=1)
NB_test_pred_confusion_mat = confusion_matrix(Y_test, NB_test_class_pred)
NB_test_pred_confusion_mat
array([[208,   7,   6,   2,   7],
       [  0, 192,  15,   2,   8],
       [  3,   7, 219,   3,   6],
       [  2,  10,  18, 178,   4],
       [  1,  16,   5,   0, 203]])
print('Accuracy,\tPrecision,\tRecall')
report_model(NB_test_pred_confusion_mat)
Accuracy,	Precision,	Recall
(0.8912655971479501, 0.8912655971479501, 0.8912655971479501)

We’ve achieved about 89% accuracy. Note that `report_model` micro-averages over all classes, so precision and recall coincide with the accuracy (the summed false positives and the summed false negatives are both equal to the total off-diagonal count). The question asks us to repeat the experiment 10 times with different dataset splits.
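
If a per-digit breakdown is of interest, macro (per-class) precision and recall can be read off the same confusion matrix. A minimal sketch (the helper `report_model_per_class` is hypothetical, not part of the original homework; it assumes sklearn's convention of rows = true labels, columns = predictions):

def report_model_per_class(conf_mat):
    """Per-class precision and recall from a confusion matrix (rows = true, columns = predicted)."""
    conf_mat = np.asarray(conf_mat, dtype=float)
    tp = np.diag(conf_mat)
    precision = tp / conf_mat.sum(axis=0)  ## column sums: everything predicted as that class
    recall = tp / conf_mat.sum(axis=1)     ## row sums: everything truly in that class
    return precision, recall

per_class_precision, per_class_recall = report_model_per_class(NB_test_pred_confusion_mat)
print('per-class precision:', np.round(per_class_precision, 3))
print('per-class recall:   ', np.round(per_class_recall, 3))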

## save the results of 10 run confusion matrix in an array
model_NB_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(dataset_df[dataset_df.columns[:-1]], 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    
    mle_estimates = estimate_MLE_NB(X_train, Y_train, X_train.columns)    
    test_NB_results = predict_NB(X_test, mle_estimates, X_test.columns)
    
    pred_result = np.argmax(test_NB_results, axis=1)
    
    conf_matrix = confusion_matrix(Y_test, pred_result)
    model_NB_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'Naive Bayes model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
Naive Bayes model,RUN 0
Accuracy: 0.8912655971479501
Precision: 0.8912655971479501
Recall: 0.8912655971479501
Naive Bayes model,RUN 1
Accuracy: 0.9180035650623886
Precision: 0.9180035650623886
Recall: 0.9180035650623886
Naive Bayes model,RUN 2
Accuracy: 0.9135472370766489
Precision: 0.9135472370766489
Recall: 0.9135472370766489
Naive Bayes model,RUN 3
Accuracy: 0.9126559714795008
Precision: 0.9126559714795008
Recall: 0.9126559714795008
Naive Bayes model,RUN 4
Accuracy: 0.9010695187165776
Precision: 0.9010695187165776
Recall: 0.9010695187165776
Naive Bayes model,RUN 5
Accuracy: 0.9251336898395722
Precision: 0.9251336898395722
Recall: 0.9251336898395722
Naive Bayes model,RUN 6
Accuracy: 0.9197860962566845
Precision: 0.9197860962566845
Recall: 0.9197860962566845
Naive Bayes model,RUN 7
Accuracy: 0.9081996434937611
Precision: 0.9081996434937611
Recall: 0.9081996434937611
Naive Bayes model,RUN 8
Accuracy: 0.9117647058823529
Precision: 0.9117647058823529
Recall: 0.9117647058823529
Naive Bayes model,RUN 9
Accuracy: 0.9028520499108734
Precision: 0.9028520499108734
Recall: 0.9028520499108734

Q3#

We wrote the QDA class in Homework no.2, so that code is reused here with some changes for the multiclass classification task.
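
For reference, QDA fits a separate mean vector $\mu_c$ and covariance matrix $\Sigma_c$ for every class and assigns a sample $x$ to the class with the largest Gaussian density; up to the numerical adjustments noted in the code, the quantity being compared per class is

$$\log p(x \mid c) = -\tfrac{1}{2}\log\lvert\Sigma_c\rvert \;-\; \tfrac{1}{2}(x-\mu_c)^{\top}\Sigma_c^{-1}(x-\mu_c) \;+\; \mathrm{const}$$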

class QDA():
    """
    Quadratic Discriminant Analysis Class
    """
    def __init__(self):
        self.hyperparameters = None
    
    def predict(self, X, classes):
        """
        Predict the output for the X input
        
        Parameters:
        ------------
        X : pandas dataframe
            the data to predict labels for
        classes : array_like
            array of class labels
        """
        hyperparameters = self.hyperparameters
        ## check that the model has been fitted before predicting
        ## checking one attribute is enough because fit() assigns all of them
        if hyperparameters is None:
            raise ValueError("Error! First fit the model on a dataset, then try to predict the values!")
        


        ## find the probability of each class for every sample
        ## and save them in an array for further comparison
        prediction = []
        for i in range(len(X)):
            ## Find the predicted Class of each data
            probabilities = []
            for label in classes:
                mu, sigma = hyperparameters[str(label)]
                p = self.__probability_multivariate_normal_distribution(X.iloc[i]
                                                                        ,mu,sigma)
                probabilities.append(p)

            ## Compare and set the class with the highest probability
            P = np.argmax(probabilities)
            ## Append the number of Class
            prediction.append(int(P))
        
        return prediction
    def fit(self, X, Y):
        """
        Learn the parameters of the model (one mean vector and one covariance matrix per class)
        
        Parameters:
        -----------
        X : pandas dataframe
            the training inputs (feature columns only)
        Y : array_like
            the labels of the training data
        """
        ## we need to find the mean and covariance of each class

        ## find hyperparameters of each class 
        hyperparameters = {}
        for label in np.unique(Y):
            mu, sigma = self.__find_MLE_Normal_distro(X[Y == label])
            hyperparameters[f'{label}'] = mu, sigma
        
        ## Save the parameters of the model
        self.hyperparameters = hyperparameters
            
    def __probability_multivariate_normal_distribution(self, X, mu, sigma):
        """
        The probability value for multivariate normal distribution function

        Parameters:
        ------------
        x : array_like
            the input data
        mu : array_like
            the means vector
        sigma : matrix_like
            the matrix representing the covariance

        Returns:
        --------
        probability : float
            the probability value for the x input values 
        """
        dimension = len(mu)

        ## divide the formula into 2 parts
        ## slogdet is used because of overflow/underflow problems
        p1 = 1 / np.sqrt(((2*np.pi)**dimension) * np.linalg.slogdet(sigma)[1])

        ## some changes were made to the equation because of the dataset shape
        p2 = np.exp(-1/2 * (np.dot(X-mu, np.linalg.inv(sigma) @ (X-mu).T)))


        probability = p1 * p2

        return probability

    def __find_MLE_Normal_distro(self, X):
        """
        the maximum likelihood estimation for the parameters of a multivariate normal distribution
        the parameters for normal distribution is covariance matrix and mean vector

        Parameters:
        ------------
        X : array_like
            the X input data vectors

        Returns:
        ---------
        mu : array_like
            the means vector
        covariance : matrix_like
            the matrix representing the covariance

        """

        mu = (1 / len(X.T)) * np.sum(X, axis=0)

        ## some changes were made to the ML estimation of the covariance because of the dataset shape
        covariance = (1 / len(X.T)) * ((X-mu).T @ (X-mu))    

        return mu, covariance
   
model_QDA = QDA()
model_QDA.fit(X_train, Y_train)
model_QDA_test_results = model_QDA.predict(X_test, ['0', '1', '2', '3', '4'])
model_QDA_confusion_mat = confusion_matrix(Y_test, model_QDA_test_results)
model_QDA_confusion_mat
array([[192,   6,   8,   2,   0],
       [  0, 170,  74,   2,   2],
       [  0,   7, 222,   0,   1],
       [  0,   6,   8, 209,   0],
       [  0,   9,   2,   0, 202]])
acc_QDA, precision_QDA, recall_QDA = report_model(model_QDA_confusion_mat)
print(f'QDA Report:\nAccuracy: {acc_QDA}\nPrecision: {precision_QDA}\nRecall: {recall_QDA}')
QDA Report:
Accuracy: 0.8868092691622104
Precision: 0.8868092691622104
Recall: 0.8868092691622104

We achieved about 89 percent accuracy in a single run. The question asks us to repeat this 10 times on different splits of the dataset and report the results.

## save the results of 10 run confusion matrix in an array
model_QDA_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(dataset_df[dataset_df.columns[:-1]], 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    model_QDA = QDA()
    model_QDA.fit(X_train, Y_train)
    results = model_QDA.predict(X_test, ['0', '1', '2', '3', '4'])
    conf_matrix = confusion_matrix(Y_test, results)
    model_QDA_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'QDA model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
QDA model,RUN 0
Accuracy: 0.20320855614973263
Precision: 0.20320855614973263
Recall: 0.20320855614973263
QDA model,RUN 1
Accuracy: 0.20677361853832443
Precision: 0.20677361853832443
Recall: 0.20677361853832443
QDA model,RUN 2
Accuracy: 0.21390374331550802
Precision: 0.21390374331550802
Recall: 0.21390374331550802
QDA model,RUN 3
Accuracy: 0.20766488413547238
Precision: 0.20766488413547238
Recall: 0.20766488413547238
QDA model,RUN 4
Accuracy: 0.18449197860962566
Precision: 0.18449197860962566
Recall: 0.18449197860962566
QDA model,RUN 5
Accuracy: 0.20053475935828877
Precision: 0.20053475935828877
Recall: 0.20053475935828877
QDA model,RUN 6
Accuracy: 0.20320855614973263
Precision: 0.20320855614973263
Recall: 0.20320855614973263
QDA model,RUN 7
Accuracy: 0.22103386809269163
Precision: 0.22103386809269163
Recall: 0.22103386809269163
QDA model,RUN 8
Accuracy: 0.19875222816399288
Precision: 0.19875222816399288
Recall: 0.19875222816399288
QDA model,RUN 9
Accuracy: 0.8868092691622104
Precision: 0.8868092691622104
Recall: 0.8868092691622104

Q4#

Now we use Linear Discriminant Analysis (a variant of QDA) that shares a single covariance matrix across all classes.

Accordingly, we estimate one covariance matrix from all of the data and a separate mean vector for each class.
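
Because the covariance matrix $\Sigma$ is shared, the quadratic term $x^{\top}\Sigma^{-1}x$ is identical for every class and cancels out of the comparison, leaving a linear discriminant per class (the class priors are ignored, as in the code):

$$\delta_c(x) = x^{\top}\Sigma^{-1}\mu_c \;-\; \tfrac{1}{2}\,\mu_c^{\top}\Sigma^{-1}\mu_c$$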

class LDA():
    """
    Linear Discriminant Analysis Class
    """
    def __init__(self):
        self.mu_vectors = None
        self.covariance = None
    
    def predict(self, X):
        """
        Predict the output for the X input
        
        Parameters:
        ------------
        X : pandas dataframe
            the data to predict labels for
        """
        mu_vectors = self.mu_vectors
        covariance = self.covariance
        
        ## check that the model has been fitted before predicting
        ## checking one attribute is enough because fit() assigns all of them
        if mu_vectors is None:
            raise ValueError("Error! First fit the model on a dataset, then try to predict the values!")

        ## find the probability of each class for every sample
        ## and save them in an array for further comparison
        prediction = []
        for i in range(len(X)):
            ## Find the predicted Class of each data
            probabilities = []
            
            for label in range(len(mu_vectors)):
                mu = mu_vectors[label]
                p = self.__probability_multivariate_normal_distribution(X.iloc[i]
                                                                        ,mu,covariance)
                probabilities.append(p)

            ## Compare and set the class with the highest probability
            P = np.argmax(probabilities)
            ## Append the number of Class
            prediction.append(int(P))
        
        return prediction
    def fit(self, X, Y):
        """
        Learn the parameters of the model (one shared covariance matrix and one mean vector per class)
        
        Parameters:
        -----------
        X : pandas dataframe
            the training inputs (feature columns only)
        Y : array_like
            the labels of the training data
        """
        ## we need to find the mean of each class and a covariance matrix for all of the data
        _, covariance = self.__find_MLE_Normal_distro(X) 
        
        ## find mu vectors of each class 
        mu_vectors = [] 
        for label in np.unique(Y):
            mu, _ = self.__find_MLE_Normal_distro(X[Y == label])
            mu_vectors.append(mu)
        
        ## Save the parameters of the model
        self.mu_vectors = mu_vectors
        self.covariance = covariance
            
    def __probability_multivariate_normal_distribution(self, X, mu, sigma):
        """
        The probability value for multivariate normal distribution function

        Parameters:
        ------------
        x : array_like
            the input data
        mu : array_like
            the means vector
        sigma : matrix_like
            the matrix representing the covariance

        Returns:
        --------
        probability : float
            the probability value for the x input values 
        """
        dimension = len(mu)

        ## divide the formula into 2 parts
        ## slogdet is used because of overflow/underflow problems
        p1 = 1 / np.sqrt(((2*np.pi)**dimension) * np.linalg.slogdet(sigma)[1])

        ## some changes were made to the equation because of the dataset shape
        p2 = np.exp(-1/2 * (np.dot(X-mu, np.linalg.inv(sigma) @ (X-mu).T)))


        probability = p1 * p2

        return probability

    def __find_MLE_Normal_distro(self, X):
        """
        the maximum likelihood estimation for the parameters of a multivariate normal distribution
        the parameters for normal distribution is covariance matrix and mean vector

        Parameters:
        ------------
        X : array_like
            the X input data vectors

        Returns:
        ---------
        mu : array_like
            the means vector
        covariance : matrix_like
            the matrix representing the covariance

        """

        mu = (1 / len(X.T)) * np.sum(X, axis=0)

        ## some changes were made to the ML estimation of the covariance because of the dataset shape
        covariance = (1 / len(X.T)) * ((X-mu).T @ (X-mu))    

        return mu, covariance
   
model_LDA = LDA()
model_LDA.fit(X_train, Y_train)
model_LDA_results = model_LDA.predict(X_test)
model_LDA_confusion_mat = confusion_matrix(Y_test ,model_LDA_results)
model_LDA_confusion_mat
array([[195,   9,   2,   2,   0],
       [  0, 241,   1,   4,   2],
       [  2,  18, 204,   1,   5],
       [  1,  12,   8, 201,   1],
       [  0,  20,   4,   0, 189]])
acc_LDA, precision_LDA, recall_LDA = report_model(model_LDA_confusion_mat)
print(f'LDA Report:\nAccuracy: {acc_LDA}\nPrecision: {precision_LDA}\nRecall: {recall_LDA}')
LDA Report:
Accuracy: 0.9180035650623886
Precision: 0.9180035650623886
Recall: 0.9180035650623886

We’ve achieved almost 92% accuracy. This shows that LDA can outperform the more complex QDA as well as the Naive Bayes model on this dataset.

Now we run the LDA model 10 times with different data splits.

## save the results of 10 run confusion matrix in an array
model_LDA_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(dataset_df[dataset_df.columns[:-1]], 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    model_LDA = LDA()
    model_LDA.fit(X_train, Y_train)
    results = model_LDA.predict(X_test)
    conf_matrix = confusion_matrix(Y_test, results)
    model_LDA_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'LDA model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
LDA model,RUN 0
Accuracy: 0.9171122994652406
Precision: 0.9171122994652406
Recall: 0.9171122994652406
LDA model,RUN 1
Accuracy: 0.9340463458110517
Precision: 0.9340463458110517
Recall: 0.9340463458110517
LDA model,RUN 2
Accuracy: 0.9206773618538324
Precision: 0.9206773618538324
Recall: 0.9206773618538324
LDA model,RUN 3
Accuracy: 0.9313725490196079
Precision: 0.9313725490196079
Recall: 0.9313725490196079
LDA model,RUN 4
Accuracy: 0.9215686274509803
Precision: 0.9215686274509803
Recall: 0.9215686274509803
LDA model,RUN 5
Accuracy: 0.9331550802139037
Precision: 0.9331550802139037
Recall: 0.9331550802139037
LDA model,RUN 6
Accuracy: 0.9340463458110517
Precision: 0.9340463458110517
Recall: 0.9340463458110517
LDA model,RUN 7
Accuracy: 0.9278074866310161
Precision: 0.9278074866310161
Recall: 0.9278074866310161
LDA model,RUN 8
Accuracy: 0.93048128342246
Precision: 0.93048128342246
Recall: 0.93048128342246
LDA model,RUN 9
Accuracy: 0.9180035650623886
Precision: 0.9180035650623886
Recall: 0.9180035650623886

Q5#

Now we apply a decision tree (DT) classifier to the characters dataset.

model_tree = tree.DecisionTreeClassifier(criterion='gini')
model_tree.fit(X_train, Y_train)
DecisionTreeClassifier()

Some parameters are set as default, so let’s have a look at them.

model_tree.get_params()
{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}
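
If the fully grown tree were to overfit, a few of these defaults could be overridden when constructing the classifier. A minimal sketch with illustrative (untuned) values for `max_depth` and `min_samples_leaf`:

## a shallower, more regularized tree; the chosen values are only examples
model_tree_pruned = tree.DecisionTreeClassifier(criterion='gini',
                                                max_depth=10,
                                                min_samples_leaf=5,
                                                random_state=0)
model_tree_pruned.fit(X_train, Y_train)
pruned_conf_mat = confusion_matrix(Y_test, model_tree_pruned.predict(X_test))
print(report_model(pruned_conf_mat))
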
## plot the tree down to a maximum depth of 5
## (the full tree is too large to display in one figure)
plt.figure(figsize=(40, 40))
tree.plot_tree(model_tree,
               feature_names=X_train.columns,
               filled=True,
               impurity=True,
               rounded=True,
               fontsize=20,
               max_depth=5)
plt.savefig('Q5_tree_plot.png')
[Figure: the decision tree plotted down to depth 5]
model_tree_test_pred = model_tree.predict(X_test)
model_tree_conf_mat = confusion_matrix(Y_test, model_tree_test_pred)

print('Accuracy,\tPrecision,\tRecall')
report_model(model_tree_conf_mat)
Accuracy,	Precision,	Recall
(0.8877005347593583, 0.8877005347593583, 0.8877005347593583)
## running the model 10 times

## save the results of 10 run confusion matrix in an array
model_tree_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(dataset_df[dataset_df.columns[:-1]], 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    model_tree = tree.DecisionTreeClassifier()
    model_tree.fit(X_train, Y_train)
    results = model_tree.predict(X_test)
    conf_matrix = confusion_matrix(Y_test, results)
    model_tree_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'Decision Tree model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
Decision Tree model,RUN 0
Accuracy: 0.8841354723707665
Precision: 0.8841354723707665
Recall: 0.8841354723707665
Decision Tree model,RUN 1
Accuracy: 0.8787878787878788
Precision: 0.8787878787878788
Recall: 0.8787878787878788
Decision Tree model,RUN 2
Accuracy: 0.9099821746880571
Precision: 0.9099821746880571
Recall: 0.9099821746880571
Decision Tree model,RUN 3
Accuracy: 0.8894830659536542
Precision: 0.8894830659536542
Recall: 0.8894830659536542
Decision Tree model,RUN 4
Accuracy: 0.8868092691622104
Precision: 0.8868092691622104
Recall: 0.8868092691622104
Decision Tree model,RUN 5
Accuracy: 0.8921568627450981
Precision: 0.8921568627450981
Recall: 0.8921568627450981
Decision Tree model,RUN 6
Accuracy: 0.8761140819964349
Precision: 0.8761140819964349
Recall: 0.8761140819964349
Decision Tree model,RUN 7
Accuracy: 0.8832442067736186
Precision: 0.8832442067736186
Recall: 0.8832442067736186
Decision Tree model,RUN 8
Accuracy: 0.8859180035650623
Precision: 0.8859180035650623
Recall: 0.8859180035650623
Decision Tree model,RUN 9
Accuracy: 0.8832442067736186
Precision: 0.8832442067736186
Recall: 0.8832442067736186

Across the 10 runs, the decision tree achieved lower performance than the LDA model.

Q6#

Now we apply an SVM model to the dataset.

model_svm = svm.SVC()
## having a look at default parameters 
model_svm.get_params()
{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}
model_svm.fit(X_train, Y_train)
model_svm_test_pred = model_svm.predict(X_test)
svm_confusion_mat = confusion_matrix(Y_test, model_svm_test_pred)
svm_confusion_mat
array([[202,   6,   0,   0,   0],
       [  0, 248,   0,   0,   0],
       [  1,   7, 220,   0,   2],
       [  0,   7,   0, 216,   0],
       [  0,   9,   0,   0, 204]])
print('Accuracy,\tPrecision,\tRecall')
report_model(svm_confusion_mat)
Accuracy,	Precision,	Recall
(0.9714795008912656, 0.9714795008912656, 0.9714795008912656)

We can see that the SVM achieved much higher performance on the test set than the other models we tried in these exercises.
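
The defaults (RBF kernel, `C=1`, `gamma='scale'`) already perform well; if we wanted to check whether tuning helps further, a small grid search is one option. A minimal sketch with illustrative parameter values (`GridSearchCV` is imported here because it is not in the import cell at the top):

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.001, 0.01]}
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
grid_search.fit(X_train, Y_train)
print(grid_search.best_params_, grid_search.best_score_)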

## running the model 10 times

## save the results of 10 run confusion matrix in an array
model_svm_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(dataset_df[dataset_df.columns[:-1]], 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    model_svm = svm.SVC()
    model_svm.fit(X_train, Y_train)
    results = model_svm.predict(X_test)
    conf_matrix = confusion_matrix(Y_test, results)
    model_svm_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'SVM model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
SVM model,RUN 0
Accuracy: 0.9750445632798574
Precision: 0.9750445632798574
Recall: 0.9750445632798574
SVM model,RUN 1
Accuracy: 0.9812834224598931
Precision: 0.9812834224598931
Recall: 0.9812834224598931
SVM model,RUN 2
Accuracy: 0.9812834224598931
Precision: 0.9812834224598931
Recall: 0.9812834224598931
SVM model,RUN 3
Accuracy: 0.9786096256684492
Precision: 0.9786096256684492
Recall: 0.9786096256684492
SVM model,RUN 4
Accuracy: 0.9714795008912656
Precision: 0.9714795008912656
Recall: 0.9714795008912656
SVM model,RUN 5
Accuracy: 0.9759358288770054
Precision: 0.9759358288770054
Recall: 0.9759358288770054
SVM model,RUN 6
Accuracy: 0.9786096256684492
Precision: 0.9786096256684492
Recall: 0.9786096256684492
SVM model,RUN 7
Accuracy: 0.9803921568627451
Precision: 0.9803921568627451
Recall: 0.9803921568627451
SVM model,RUN 8
Accuracy: 0.9795008912655971
Precision: 0.9795008912655971
Recall: 0.9795008912655971
SVM model,RUN 9
Accuracy: 0.9714795008912656
Precision: 0.9714795008912656
Recall: 0.9714795008912656

Q7#

Apply PCA and Fisher's dimension-reduction methods to the data, and then train the LDA model on the reduced features.

(a) Fisher dimension reduction method#
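
Fisher's method builds a between-class scatter matrix from the class means and a within-class scatter matrix from the spread of each class around its own mean, then projects the data onto the leading eigenvectors of their ratio. The class below follows this recipe:

$$S_B = \sum_{c} (\mu_c - \mu)(\mu_c - \mu)^{\top}, \qquad S_W = \sum_{c}\sum_{x \in c} (x - \mu_c)(x - \mu_c)^{\top}, \qquad S_W^{-1} S_B\, u = \lambda\, u$$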

X_dataset = dataset_df[dataset_df.columns[:-1]]
Y_dataset = dataset_df['label']
class Fishers_dimension_reduction():
    """
    reduce dimension of dataset using fishers method (Or LDA in other names)
    
    """
    def __init__(self, n_components=0.95):
        """
        Parameters:
        ------------
        n_components : float or int
            a value between 0 and 1 is the fraction of variance to keep (the default `0.95` keeps 95% and discards 5%)
            a value greater than 1 is the number of dimensions to keep
        """
        self.n_components = n_components
        
        ## the matrix that transform data into new dimensionality
        self.transformation_matrix = None
        
    def __find_most_discriminator_eigenvectors(self, eigvalues, eigvectors):
        """
        find the most discriminative directions by sorting the eigenvalues and returning the eigenvectors that correspond to the largest ones
        `n_components` controls how many dimensions are kept

        Parameters:
        ------------
        eigvalues : 1D array
            array of eigen values
        eigvectors : 2D array
            array of eigen vectors
            
        Returns:
        ---------
        eig_vectors : matrix
            the most discriminative eigen vectors
        """
        if (0 <= self.n_components) and (self.n_components <= 1):
            eig_vectors = self.__find_the_most_discriminator_floating(eigvalues, eigvectors)
        else:
            eig_vectors = self.__sort_eigvalues_corresponding_eigenvectors(eigvalues, eigvectors)
            eig_vectors = eig_vectors[0:int(self.n_components)]
        
        return eig_vectors
    
    def __find_the_most_discriminator_floating(self, eigvalues, eigvectors):
        """
        find the most discriminator dimension when the n_components is a floating point between 0 and 1
        """
        ## apply pandas dataframe to have an index corresponding to each row
        eigvalues_df = pd.DataFrame(eigvalues)
        sorted_indexes = eigvalues_df.sort_values(by=0, ascending=False).index.values


        
        sorted_eigvectors_df = self.__sort_eigvalues_corresponding_eigenvectors(eigvalues, eigvectors)
        ## iterate over data until it reached the threshold
        threshold = 0
        ## initialize the range of data 
        dimension_range = None

        for idx in range(len(sorted_eigvectors_df) - 1):
            threshold = abs(np.sum(eigvalues_df.loc[sorted_indexes[0:idx+1]])) / abs(np.sum(eigvalues))
            if threshold.values >= self.n_components:
                dimension_range = idx
                break
        return sorted_eigvectors_df.iloc[0:dimension_range]
    
    def __sort_eigvalues_corresponding_eigenvectors(self, eigvalues, eigvectors):
        """
        sort the eigenvectors by the highest eigenvalues
        
        Parameters:
        ------------
        eigvalues : array
            array of eigenvalues
        eigvectors : matrix
            array of eigenvectors corresponding to each eigenvalues
            
        Returns:
        ---------
        sorted_eigenvectors : pandas dataframe
            dataframe of eigenvectors related to Descending sorted eigenvalues
        """
        ## apply pandas dataframe to have an index corresponding to each row
        eigvalues_df = pd.DataFrame(eigvalues)
        
        ## get the sorted eigen values indexes
        sorted_indexes = eigvalues_df.sort_values(by=0, ascending=False).index.values


        ## convert eigenvectors to pandas to find the best of it corresponding to highest eigenvalues
        eigvectors_df = pd.DataFrame(eigvectors)
        ## and then find the corresponding eigen vectors
        sorted_eigvectors_df = eigvectors_df.reindex(sorted_indexes)
        
        return sorted_eigvectors_df

        
    def fit(self, X_data, classes_name):
        """
        fit the parameters for dimensionality reduction
        
        Parameters:
        -----------
        X_data : matrix or array
            the X classes as a matrix or array
            array can only represent one feature but matrix would represent more than one feature
        classes_name : array
            array of classes label (must be unique)
        """
        
        ## find the between-class and within-class scatter matrices

        ## initialize both matrices with zeros
        n_features = X_data.shape[1]
        between_class_variation = np.zeros((n_features, n_features))
        within_class_variation = np.zeros((n_features, n_features))

        ## find the global mean
        dataset_mean = X_data.mean()

        ## iterate over each class
        ## (note: the class membership is looked up in the module-level `Y_dataset`)
        for class_num in classes_name:
            X_class = X_data[Y_dataset == class_num]
            class_mean = X_class.mean()

            ## between-class scatter: spread of the class means around the global mean
            difference = np.matrix(class_mean - dataset_mean)
            between_class_variation += difference.T @ difference

            ## within-class scatter: spread of the class samples around their own class mean
            centered = np.matrix(X_class - class_mean)
            within_class_variation += centered.T @ centered

        ## combine both matrices into the Fisher criterion matrix
        J = np.linalg.inv(within_class_variation) @ between_class_variation

        eig_values, eig_vectors = np.linalg.eig(J)
        
        ## find the transformation matrix 
        U = self.__find_most_discriminator_eigenvectors(eig_values, eig_vectors)
        
        self.transformation_matrix = U.T
        
    def transform(self, X_data):
        """
        transform dataset into new reduced dimensionality
        
        Parameters:
        ------------
        X_data : matrix_like
            the X classes as a matrix or array
            matrix would represent more than one feature 
        
        Returns:
        ---------
        X_reduced : matrix_like
            the reduced dimension X_data
        """
        X_reduced = np.dot(self.transformation_matrix.T, X_data.T).T
        
        return X_reduced
    
    def fit_transform(self, X_data, classes_name):
        """
        fit on data and then return the reduced version of X_data
        
        Parameters:
        -----------
        X_data : matrix or array
            the X classes as a matrix or array
            array can only represent one feature but matrix would represent more than one feature
        classes_name : array
            array of classes label (must be unique)
            
        Returns:
        ---------
        X_reduced : matrix_like
            the reduced dimension X_data
        """
        self.fit(X_data, classes_name)
        X_reduced = self.transform(X_data)
        
        return X_reduced
fisher_reduction = Fishers_dimension_reduction()
X_reduced_dataset = fisher_reduction.fit_transform(X_dataset, np.unique(Y_dataset))
X_reduced_dataset.shape

X_train, X_test, Y_train, Y_test = train_test_split(X_reduced_dataset,
                                                    Y_dataset, 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=123)
## reduce the test data
X_reduced_test = fisher_reduction.transform(X_test)
X_reduced_test.shape
(1122, 3)
model_LDA = LDA()
model_LDA.fit(X_train, Y_train)
model_LDA_results = model_LDA.predict(pd.DataFrame(X_test))
model_LDA_confusion_mat = confusion_matrix(Y_test ,model_LDA_results)
model_LDA_confusion_mat
array([[  0,   2, 228,   0,   0],
       [  2,  12, 203,   0,   0],
       [  0,   5, 233,   0,   0],
       [  5,  13, 194,   0,   0],
       [  1,  30, 194,   0,   0]])
acc_LDA, precision_LDA, recall_LDA = report_model(model_LDA_confusion_mat)
print(f'LDA Report:\nAccuracy: {acc_LDA}\nPrecision: {precision_LDA}\nRecall: {recall_LDA}')
LDA Report:
Accuracy: 0.21836007130124777
Precision: 0.21836007130124777
Recall: 0.21836007130124777

With only 3 dimensions extracted by Fisher's method, the results are very poor!

Now we explore the dataset a bit more and take a deeper look at the performance of Fisher's method.
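
One quick way to see why the reduced features separate the digits so poorly is to scatter-plot the first two Fisher components coloured by label. A minimal sketch (`np.real` is taken because the eigendecomposition of the non-symmetric criterion matrix can return complex vectors):

X_vis = np.real(np.asarray(X_reduced_dataset))

plt.figure(figsize=(6, 5))
scatter = plt.scatter(X_vis[:, 0], X_vis[:, 1], c=Y_dataset, cmap='tab10', s=5)
plt.colorbar(scatter, label='digit')
plt.xlabel('Fisher component 1')
plt.ylabel('Fisher component 2')
plt.show()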

## save the results of 10 run confusion matrix in an array
model_LDA_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(X_reduced_dataset, 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    model_LDA = LDA()
    model_LDA.fit(X_train, Y_train)
    results = model_LDA.predict(pd.DataFrame(X_test))
    conf_matrix = confusion_matrix(Y_test, results)
    model_LDA_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'LDA model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
LDA model,RUN 0
Accuracy: 0.21836007130124777
Precision: 0.21836007130124777
Recall: 0.21836007130124777
LDA model,RUN 1
Accuracy: 0.3324420677361854
Precision: 0.3324420677361854
Recall: 0.3324420677361854
LDA model,RUN 2
Accuracy: 0.30124777183600715
Precision: 0.30124777183600715
Recall: 0.30124777183600715
LDA model,RUN 3
Accuracy: 0.2103386809269162
Precision: 0.2103386809269162
Recall: 0.2103386809269162
LDA model,RUN 4
Accuracy: 0.1836007130124777
Precision: 0.1836007130124777
Recall: 0.1836007130124777
LDA model,RUN 5
Accuracy: 0.2014260249554367
Precision: 0.2014260249554367
Recall: 0.2014260249554367
LDA model,RUN 6
Accuracy: 0.19875222816399288
Precision: 0.19875222816399288
Recall: 0.19875222816399288
LDA model,RUN 7
Accuracy: 0.20409982174688057
Precision: 0.20409982174688057
Recall: 0.20409982174688057
LDA model,RUN 8
Accuracy: 0.25668449197860965
Precision: 0.25668449197860965
Recall: 0.25668449197860965
LDA model,RUN 9
Accuracy: 0.20677361853832443
Precision: 0.20677361853832443
Recall: 0.20677361853832443

It seems that Fisher's method is not working well here at all: it reaches only about 33% accuracy in the best run.

(b) PCA dimension reduction method#

Keeping 95% of the variance and whitening the dataset.

X_dataset = dataset_df[dataset_df.columns[:-1]]
Y_dataset = dataset_df['label']

pca = PCA(n_components=0.95,whiten=True, svd_solver='full')
pca.fit(X_dataset)
X_dataset_reduced = pca.transform(X_dataset)

It seems that 95% of the data variance is captured by the first 100 principal components; the remaining 5% is spread over the other 156 components.

X_dataset_reduced.shape
(5610, 100)
## have a look at variances
pca.explained_variance_
array([293331.87410841, 158119.05900527, 137937.08914476, 126798.433732  ,
       103117.73500377,  89887.35327351,  77749.6914955 ,  50172.87297051,
        45807.33462414,  38951.06624323,  35248.39082542,  32714.42323582,
        30933.88036519,  30225.16754704,  26959.18443678,  25179.5259855 ,
        23453.45575095,  22159.41485451,  20569.87854983,  18919.04164789,
        18335.37124556,  17894.17986046,  16274.90525257,  15626.73732931,
        14930.51616494,  14487.07813265,  13935.53954212,  12847.4915951 ,
        12200.14742319,  11851.5244597 ,  11608.67280995,  11027.93554689,
        10438.56099896,   9918.99381511,   9825.26443153,   9610.42316895,
         9026.15107666,   8475.19580363,   8112.47976397,   7909.35264363,
         7549.8558876 ,   7382.37198072,   7030.60232858,   6771.14100226,
         6573.08342079,   6535.76291499,   6405.82355752,   6026.2193113 ,
         5781.87826222,   5656.70352764,   5575.56618965,   5327.63682127,
         5253.24969877,   4977.51338608,   4706.2775176 ,   4633.88445415,
         4486.97080191,   4467.75060286,   4204.5685125 ,   4162.9440353 ,
         4060.79606155,   4009.33727359,   3846.56066471,   3825.70407122,
         3559.88023113,   3541.97802725,   3315.23366661,   3258.86044577,
         3220.69698483,   3103.2676193 ,   3066.28810666,   2964.10719027,
         2889.14618471,   2880.95858029,   2865.29297507,   2768.93135882,
         2674.50137337,   2604.48325481,   2566.29760112,   2474.23296026,
         2414.15770602,   2333.88972106,   2320.19246632,   2276.48904847,
         2209.24235496,   2158.48712296,   2127.15032751,   2086.65998292,
         2053.00585354,   2034.42923726,   1961.99080637,   1959.03813319,
         1860.68198017,   1845.81583079,   1823.90922216,   1789.15784318,
         1745.06494381,   1722.43788405,   1710.24871538,   1665.05464137])
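
To see where the 95% cutoff falls, the cumulative explained-variance ratio of the fitted `pca` object can be plotted (a minimal sketch):

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.axhline(0.95, color='red', linestyle='--')
plt.xlabel('number of principal components')
plt.ylabel('cumulative explained variance ratio')
plt.show()
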
X_train, X_test, Y_train, Y_test = train_test_split(X_dataset_reduced,
                                                    Y_dataset, 
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=123)
model_LDA = LDA()
model_LDA.fit(X_train, Y_train)
model_LDA_results = model_LDA.predict(pd.DataFrame(X_test))
model_LDA_confusion_mat = confusion_matrix(Y_test ,model_LDA_results)
model_LDA_confusion_mat
array([[194,   7,  24,   1,   4],
       [  0, 204,   9,   2,   2],
       [  1,   6, 231,   0,   0],
       [  1,   5,  25, 181,   0],
       [  0,  32,  15,   0, 178]], dtype=int64)
acc_LDA, precision_LDA, recall_LDA = report_model(model_LDA_confusion_mat)
print(f'LDA Report:\nAccuracy: {acc_LDA}\nPrecision: {precision_LDA}\nRecall: {recall_LDA}')
LDA Report:
Accuracy: 0.8805704099821747
Precision: 0.8805704099821747
Recall: 0.8805704099821747
## save the results of 10 run confusion matrix in an array
model_LDA_resuls_confusion_matrix = []
## run count
N = 10

for i in range(N):
    X_train, X_test, Y_train, Y_test = train_test_split(X_dataset_reduced, 
                                                    dataset_df['label'],
                                                    test_size=0.2,
                                                    random_state=(123 + i))
    model_LDA = LDA()
    model_LDA.fit(X_train, Y_train)
    results = model_LDA.predict(pd.DataFrame(X_test))
    conf_matrix = confusion_matrix(Y_test, results)
    model_LDA_resuls_confusion_matrix.append(conf_matrix)
    
    acc, precision, recall = report_model(conf_matrix)
    print(f'LDA model,RUN {i}\nAccuracy: {acc}\nPrecision: {precision}\nRecall: {recall}')
LDA model,RUN 0
Accuracy: 0.8805704099821747
Precision: 0.8805704099821747
Recall: 0.8805704099821747
LDA model,RUN 1
Accuracy: 0.9126559714795008
Precision: 0.9126559714795008
Recall: 0.9126559714795008
LDA model,RUN 2
Accuracy: 0.8636363636363636
Precision: 0.8636363636363636
Recall: 0.8636363636363636
LDA model,RUN 3
Accuracy: 0.9046345811051694
Precision: 0.9046345811051694
Recall: 0.9046345811051694
LDA model,RUN 4
Accuracy: 0.893048128342246
Precision: 0.893048128342246
Recall: 0.893048128342246
LDA model,RUN 5
Accuracy: 0.8983957219251337
Precision: 0.8983957219251337
Recall: 0.8983957219251337
LDA model,RUN 6
Accuracy: 0.910873440285205
Precision: 0.910873440285205
Recall: 0.910873440285205
LDA model,RUN 7
Accuracy: 0.9090909090909091
Precision: 0.9090909090909091
Recall: 0.9090909090909091
LDA model,RUN 8
Accuracy: 0.9162210338680927
Precision: 0.9162210338680927
Recall: 0.9162210338680927
LDA model,RUN 9
Accuracy: 0.875222816399287
Precision: 0.875222816399287
Recall: 0.875222816399287