from sklearn.datasets import fetch_openml
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X.shape
(70000, 784)
y.shape
(70000,)
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
some_digit = X[36000]
some_digit_image = some_digit.reshape(28,28)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary,
           interpolation="nearest")
plt.axis("off")
plt.show()
y[36000]
'9'
The MNIST dataset is already split into a training set (the first 60,000 images) and a test set (the last 10,000).
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Shuffling the training set guarantees that the cross-validation folds will all contain a similar mix of digits, and avoids long runs of similar instances (some learning algorithms are sensitive to instance order).
import numpy as np
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
Train a binary classifier that distinguishes 9s from non-9s:
y_train_9 = (y_train == '9')
y_test_9 = (y_test == '9')
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_9)
SGDClassifier(random_state=42)
sgd_clf.predict([some_digit])
array([False])
Accuracy is usually a poor performance measure for classifiers, especially on skewed datasets like this one (only about 10% of the images are 9s).
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_9, cv=3, scoring="accuracy")
array([0.94675, 0.955 , 0.9435 ])
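The ~95% above looks impressive until you compare it to a dumb baseline. A minimal sketch (the Never9Classifier class below is illustrative, not from the original notes):
from sklearn.base import BaseEstimator

class Never9Classifier(BaseEstimator):
    def fit(self, X, y=None):
        return self
    def predict(self, X):
        return np.zeros((len(X),), dtype=bool)  # always predicts "not a 9"

never_9_clf = Never9Classifier()
cross_val_score(never_9_clf, X_train, y_train_9, cv=3, scoring="accuracy")
# expect roughly 0.90: only about 10% of the images are 9s, so always
# guessing "not 9" is ~90% accurate despite being useless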
cross_val_predict also performs K-fold cross-validation, but returns the predictions made on each test fold instead of the evaluation scores. Each prediction is "clean": it was made by a model that never saw that instance during training.
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_9, cv=3)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_9, y_train_pred)
# first row: actual negatives (non-9s): 53,290 true negatives, 761 false positives
# second row: actual positives (9s): 2,334 false negatives, 3,615 true positives
array([[53290,   761],
       [ 2334,  3615]])
precision_score() returns TP / (TP + FP)
When the classifier claims an image is a 9, it is correct only about 83% of the time
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_9, y_train_pred) # == 3615 / (3615 + 761)
0.8260968921389397
recall_score() returns TP / (TP + FN)
It only detects about 61% of the 9s
recall_score(y_train_9, y_train_pred) # == 3615 / (3615 + 2334)
0.6076651538073626
The F1 score is the harmonic mean of precision and recall: F1 = 2 * precision * recall / (precision + recall)
from sklearn.metrics import f1_score
f1_score(y_train_9, y_train_pred)
0.7002421307506054
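As a sanity check, all three metrics can be recomputed by hand from the confusion matrix above (sklearn's ravel() returns TN, FP, FN, TP in that order):
tn, fp, fn, tp = confusion_matrix(y_train_9, y_train_pred).ravel()
precision = tp / (tp + fp)                          # 3615 / (3615 + 761)  ~ 0.826
recall = tp / (tp + fn)                             # 3615 / (3615 + 2334) ~ 0.608
f1 = 2 * precision * recall / (precision + recall)  # ~ 0.700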
Precision and recall trade off against each other:
raising the decision threshold increases precision but lowers recall, and vice versa.
Scikit-Learn does not let you set the threshold directly, but it exposes the decision scores via decision_function(), so you can apply any threshold yourself:
y_scores = sgd_clf.decision_function([some_digit])
threshold = 0
y_some_digit_pred = (y_scores > threshold)  # threshold 0 reproduces predict()
y_scores = cross_val_predict(sgd_clf, X_train, y_train_9, cv=3, method='decision_function')
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_9, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    plt.xlabel('Threshold')
    plt.legend(loc='center left')
    plt.ylim([0, 1])
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, 'g-')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
plot_precision_vs_recall(precisions, recalls)
plt.show()
y_train_pred90 = (y_scores > 5000)  # threshold read off the plot to target ~90% precision
precision_score(y_train_9, y_train_pred90)
0.9201934703748489
recall_score(y_train_9, y_train_pred90)
0.12792065893427468
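The threshold of 5000 above was eyeballed from the plot. A more systematic sketch: take the lowest threshold that reaches the target precision (np.argmax returns the index of the first True):
# assumes 90% precision is reached before the end of the curve
# (precisions has one more entry than thresholds)
threshold_90 = thresholds[np.argmax(precisions >= 0.90)]
y_train_pred90 = (y_scores >= threshold_90)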
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_9, y_scores)
The ROC curve plots the true positive rate (recall) against the false positive rate. The dashed diagonal is a completely random classifier; you want your classifier's curve to be as close to the top-left corner as possible.
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal: random classifier
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
plot_roc_curve(fpr, tpr)
plt.show()
You want the area under the curve (AUC) to be as close to 1 as possible; a purely random classifier has an AUC of 0.5.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_9, y_scores)
0.9608004989615918
Rule of thumb: use the PR curve when the positive class is rare, or when you care more about false positives than false negatives; use the ROC curve otherwise.
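An aside not in the original notes: if you want a single-number summary of the PR curve analogous to ROC AUC, average precision is one option:
from sklearn.metrics import average_precision_score
average_precision_score(y_train_9, y_scores)  # approximates the area under the PR curve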
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_9, cv=3, method='predict_proba')
y_scores_forest = y_probas_forest[:, 1] # score = probability of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_9, y_scores_forest)
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='lower right')
plt.show()
roc_auc_score(y_train_9, y_scores_forest)
0.9951605460783337
precisions, recalls, thresholds = precision_recall_curve(y_train_9, y_scores_forest)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
y_train_forest_pred90 = (y_scores_forest > .4)  # probability threshold picked from the curve
precision_score(y_train_9, y_train_forest_pred90)
0.959927797833935
recall_score(y_train_9, y_train_forest_pred90)
0.893931753235838
One vs All (OvA): one binary classifier per class; pick the class with the highest score.
One vs One (OvO): one binary classifier per pair of classes, N * (N - 1) / 2 in total.
Scikit-Learn defaults to OvA for most algorithms, but uses OvO for SVMs. You can also force either strategy manually, as shown below.
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
array(['3'], dtype='<U1')
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
array([[-35174.9477605 , -20628.00288562, -23841.9225113 ,
           -73.13490557,  -2951.34057527,  -7734.13773636,
        -29138.40714959, -12521.89468369,  -5878.18191885,
         -4169.64674913]])
np.argmax(some_digit_scores)
3
sgd_clf.classes_
array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U1')
sgd_clf.classes_[3]
'3'
To force OvO instead of the default OvA:
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
array(['4'], dtype=object)
len(ovo_clf.estimators_)  # 10 * 9 / 2 = 45 pairwise classifiers
45
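Symmetrically, you can force OvA/OvR with OneVsRestClassifier; a minimal sketch:
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SGDClassifier(random_state=42))
ovr_clf.fit(X_train, y_train)
len(ovr_clf.estimators_)  # 10: one binary classifier per class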
Random Forest classifiers handle multiple classes natively, so neither OvO nor OvA is needed:
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
array(['9'], dtype=object)
forest_clf.predict_proba([some_digit])
array([[0.01, 0. , 0. , 0.01, 0.08, 0.03, 0. , 0. , 0. , 0.87]])
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
array([0.84205, 0.85795, 0.86885])
Normally the inputs would have been scaled before any training; we scale them now:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
array([0.89955, 0.8959 , 0.89925])
Error analysis: normally we would have spent time optimizing the model by now, so assume we already have a promising one and want to understand the kinds of errors it makes.
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
array([[5574,    0,   15,    6,    8,   38,   32,    5,  244,    1],
       [   1, 6415,   44,   21,    3,   43,    4,    9,  192,   10],
       [  26,   26, 5238,   83,   71,   27,   68,   35,  375,    9],
       [  27,   18,  110, 5205,    0,  196,   24,   47,  433,   71],
       [  10,   14,   34,    7, 5228,    6,   32,   19,  344,  148],
       [  28,   19,   28,  151,   54, 4427,   78,   16,  550,   70],
       [  25,   17,   47,    2,   42,   85, 5554,    5,  141,    0],
       [  18,   13,   47,   24,   49,    9,    4, 5713,  181,  207],
       [  13,   67,   40,   93,    1,  125,   33,    7, 5429,   43],
       [  20,   22,   30,   52,  118,   35,    1,  181,  379, 5111]])
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
Divide each value by the number of images in the corresponding class, so we compare error rates rather than absolute numbers of errors (which would unfairly penalize the more abundant classes):
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
# rows represent actual classes
# columns represent predicted classes
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))  # pad the last row with blanks
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")
cl_a , cl_b = '3', '5'
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]
plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()
# left column: images classified as 3s; right column: images classified as 5s
# top row: actual 3s; bottom row: actual 5s
Multilabel classification: output several binary labels for each instance.
from sklearn.neighbors import KNeighborsClassifier
y_train_large = np.array([(i in '789') for i in y_train])    # is the digit large (7, 8, or 9)?
y_train_odd = np.array([(i in '13579') for i in y_train])    # is the digit odd?
y_multilabel = np.c_[y_train_large, y_train_odd]
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
KNeighborsClassifier()
knn_clf.predict([some_digit])
array([[ True, True]])
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average='macro')
# set average='weighted' to weight each label by its support, when labels are not equally important
0.977819997174431
Multioutput classification: add noise to the images, then train a system that outputs the clean image (one label per pixel, each with 256 possible intensity values).
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train  # targets are the original, clean images
y_test_mod = X_test
peek = [X_train_mod[0], y_train_mod[0]]
plt.subplot(221); plot_digits(peek, images_per_row=2)
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[3600]])
plot_digits([clean_digit])
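To eyeball the result, one option is plotting the reconstruction next to the clean target (3600 is the same arbitrary test index used above):
plot_digits([clean_digit.ravel(), X_test[3600]], images_per_row=2)  # cleaned vs. original
plt.show()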