In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

Load Training and Test Data

In [30]:
# load the training data: in this example, we consider binary classification between
# digit 3 and digit 8. We refer to the digit-3 examples as positive examples and
# the digit-8 examples as negative examples.

pos_digit = pd.read_csv('../digits/train/digit_3_train.csv',header=None)
neg_digit = pd.read_csv('../digits/train/digit_8_train.csv',header=None)

# get the number of positive and negative examples
n_pos = pos_digit.shape[0]
n_neg = neg_digit.shape[0]

# concatenate the positive and negative examples into a single data matrix
X_train = pd.concat([pos_digit, neg_digit], ignore_index=True)

# total number of training examples
n_digits = n_pos + n_neg

# construct the label vector: 0 for positive examples (digit 3), 1 for negative examples (digit 8)
y_train = np.ravel(np.vstack((np.zeros((n_pos,1)),np.ones((n_neg,1)))))
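
An equivalent and slightly more direct way to build the same label vector is to concatenate two 1-D arrays instead of stacking column vectors and flattening. This is a sketch only; the cell above is what the rest of the notebook uses.

In [ ]:
# equivalent label construction: digit 3 (positive) -> 0, digit 8 (negative) -> 1
y_train_alt = np.concatenate([np.zeros(n_pos), np.ones(n_neg)])
assert np.array_equal(y_train_alt, y_train)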
In [31]:
# load the test data
pos_digit_t = pd.read_csv('../digits/test/digit_3_test.csv',header=None)
neg_digit_t = pd.read_csv('../digits/test/digit_8_test.csv',header=None)

# get the number of positive and negative examples
n_pos_t = pos_digit_t.shape[0]
n_neg_t = neg_digit_t.shape[0]

# concatenate the positive and negative examples into a single data matrix
X_test_org = pd.concat([pos_digit_t, neg_digit_t], ignore_index=True)

# total number of test examples
n_digits_t = n_pos_t + n_neg_t

# construct the label vector for the test set: 0 for positive (digit 3), 1 for negative (digit 8)
y_test = np.ravel(np.vstack((np.zeros((n_pos_t,1)),np.ones((n_neg_t,1)))))
In [32]:
# visualize some positive and negative examples

n_show = 4 # number of digits to be shown in each class

pos_idx = np.random.permutation(n_pos)
neg_idx = np.random.permutation(n_neg)

fig,ax = plt.subplots(2,n_show)

for i in range(n_show):
    ax[0,i].imshow(np.reshape(pos_digit.values[pos_idx[i],:],(16,16)).T,cmap=plt.get_cmap('Greys'));
    ax[1,i].imshow(np.reshape(neg_digit.values[neg_idx[i],:],(16,16)).T,cmap=plt.get_cmap('Greys'));
    ax[0,i].set_xticks([]), ax[0,i].set_yticks([])
    ax[1,i].set_xticks([]), ax[1,i].set_yticks([])

Preprocess Data

We will preprocess the data by standardizing each feature: after standardization, each feature has zero mean and unit variance. Linear scaling to a fixed range could also be used here.

In [33]:
from sklearn.preprocessing import scale
In [34]:
X_train = scale(X_train)
X_test = scale(X_test_org)
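
Note that scale standardizes the training and test sets independently, so the test features are centered and scaled with test-set statistics. A common alternative is to fit the scaler on the training data only and reuse those statistics for the test data. Below is a minimal sketch of that approach, assuming the unscaled training matrix is still available under a hypothetical name X_train_org; StandardScaler gives standardization, and MinMaxScaler would give the linear scaling mentioned above.

In [ ]:
# sketch: fit scaling parameters on the training data only, then apply the
# same transformation to the test data (avoids using test-set statistics)
from sklearn.preprocessing import StandardScaler  # or MinMaxScaler for linear scaling

scaler = StandardScaler().fit(X_train_org)   # X_train_org: unscaled training matrix (hypothetical name)
X_train_std = scaler.transform(X_train_org)
X_test_std = scaler.transform(X_test_org)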
In [35]:
# data before standardization
X_test_org.head()
Out[35]:
0 1 2 3 4 5 6 7 8 9 ... 246 247 248 249 250 251 252 253 254 255
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 11 86 108 142 74 11 0 0 0
3 0 0 65 138 45 0 0 0 0 0 ... 0 0 0 76 137 137 24 0 0 0
4 0 0 0 0 0 30 173 7 0 21 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 256 columns

In [36]:
# data after standardization
X_test
Out[36]:
array([[-0.15947259, -0.28916053, -0.34878234, ..., -0.28714556,
        -0.18093055, -0.0744889 ],
       [-0.15947259, -0.28916053, -0.34878234, ..., -0.28714556,
        -0.18093055, -0.0744889 ],
       [-0.15947259, -0.28916053, -0.34878234, ..., -0.28714556,
        -0.18093055, -0.0744889 ],
       ..., 
       [-0.15947259, -0.28916053, -0.34878234, ..., -0.28714556,
        -0.18093055, -0.0744889 ],
       [-0.15947259, -0.28916053, -0.34878234, ..., -0.28714556,
        -0.18093055, -0.0744889 ],
       [-0.15947259, -0.28916053, -0.34878234, ..., -0.28714556,
        -0.18093055, -0.0744889 ]])

Cross Validation

Perform K-fold cross validation to select the best parameter values.
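
To see what a single round of K-fold cross validation looks like for one fixed parameter setting (before searching over the whole grid), something like the following could be used. This is only a sketch, using cross_val_score from the same scikit-learn generation as the rest of this notebook.

In [ ]:
# sketch: 5-fold cross-validation accuracy for one fixed SVM configuration
from sklearn import svm
from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer versions

cv_scores = cross_val_score(svm.SVC(kernel='rbf', C=10, gamma=0.001), X_train, y_train, cv=5)
cv_scores.mean(), cv_scores.std()   # displayed by the notebook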

In [37]:
from __future__ import print_function  # so print() produces a blank line under Python 2 as well

from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
In [38]:
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # perform grid search to find the best parameter values
    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5,
                       scoring='%s_weighted' % score)
    
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, cv_scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, cv_scores.std() * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'kernel': 'rbf', 'C': 10, 'gamma': 0.001}

Grid scores on development set:

0.982 (+/-0.010) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.973 (+/-0.007) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.986 (+/-0.009) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.984 (+/-0.012) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.986 (+/-0.009) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.982 (+/-0.009) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.986 (+/-0.009) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.982 (+/-0.012) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.980 (+/-0.014) for {'kernel': 'linear', 'C': 1}
0.980 (+/-0.014) for {'kernel': 'linear', 'C': 10}
0.980 (+/-0.014) for {'kernel': 'linear', 'C': 100}
0.980 (+/-0.014) for {'kernel': 'linear', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

        0.0       0.99      0.97      0.98       330
        1.0       0.97      0.99      0.98       330

avg / total       0.98      0.98      0.98       660

# Tuning hyper-parameters for recall

Best parameters set found on development set:

{'kernel': 'rbf', 'C': 10, 'gamma': 0.001}

Grid scores on development set:

0.982 (+/-0.010) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.973 (+/-0.007) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.986 (+/-0.010) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.984 (+/-0.012) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.986 (+/-0.010) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.982 (+/-0.009) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.986 (+/-0.010) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.981 (+/-0.013) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.979 (+/-0.015) for {'kernel': 'linear', 'C': 1}
0.979 (+/-0.015) for {'kernel': 'linear', 'C': 10}
0.979 (+/-0.015) for {'kernel': 'linear', 'C': 100}
0.979 (+/-0.015) for {'kernel': 'linear', 'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

        0.0       0.99      0.97      0.98       330
        1.0       0.97      0.99      0.98       330

avg / total       0.98      0.98      0.98       660

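Note that the cell above relies on sklearn.grid_search and the grid_scores_ attribute, both of which were removed in later scikit-learn releases. Under a newer scikit-learn (roughly 0.18 and up), the equivalent search reads its per-candidate results from cv_results_; a minimal sketch, reusing tuned_parameters, X_train, and y_train from the cells above:

In [ ]:
# sketch for newer scikit-learn versions, where GridSearchCV lives in
# sklearn.model_selection and per-candidate results are exposed via cv_results_
from sklearn import svm
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(), tuned_parameters, cv=5, scoring='precision_weighted')
clf.fit(X_train, y_train)

for mean, std, params in zip(clf.cv_results_['mean_test_score'],
                             clf.cv_results_['std_test_score'],
                             clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))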
In [39]:
# the best parameter values obtained from cross-validation

clf.best_params_
Out[39]:
{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
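
Because GridSearchCV refits the best parameter setting on the full training set by default (refit=True), the selected SVM can be inspected directly. For example, the number of support vectors per class gives a rough sense of the model's complexity; a quick sketch:

In [ ]:
# sketch: inspect the refitted best SVM (available because refit=True by default)
best_svm = clf.best_estimator_
print(best_svm.n_support_)              # number of support vectors for each class
print(best_svm.support_vectors_.shape)  # (total support vectors, number of features)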
In [40]:
# evaluate the classifier on the test data

clf.score(X_test,y_test)
Out[40]:
0.98333333333333328
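
A single accuracy number hides how the errors split between the two digits; a confusion matrix makes that explicit. A minimal sketch using sklearn.metrics.confusion_matrix:

In [ ]:
# sketch: confusion matrix on the test set (rows: true class, columns: predicted class)
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test, clf.predict(X_test)))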
In [41]:
# compute the prediction vector

y_predicted = clf.predict(X_test)
In [42]:
# find the test digits that were misclassified

misclassified_X = X_test_org.values[y_predicted != y_test,:]
In [43]:
misclassified_X.shape[0] # the number of misclassified examples
Out[43]:
11
In [44]:
# visualize misclassified examples

fig,ax = plt.subplots(1,misclassified_X.shape[0],figsize=(15,5))

for i in range(misclassified_X.shape[0]):
    ax[i].imshow(np.reshape(misclassified_X[i,:],(16,16)).T,cmap=plt.get_cmap('Greys'))
    ax[i].set_xticks([]), ax[i].set_yticks([])
In [ ]: