from sklearn.datasets import fetch_openml
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X.shape
(70000, 784)
y.shape
(70000,)
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
some_digit = X[36000]
some_digit_image = some_digit.reshape(28,28)
plt.imshow(some_digit_image, cmap=matplotlib.cm.binary,
           interpolation="nearest")
plt.axis("off")
plt.show()
y[36000]
'9'
The MNIST dataset is already split into a training set (the first 60,000 images) and a test set (the last 10,000).
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
Shuffling the training set guarantees that the cross-validation folds will all contain a similar mix of digits, and avoids long runs of similar instances (some learning algorithms are sensitive to instance order).
import numpy as np
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
Train a binary classifier that distinguishes 9s from non-9s:
y_train_9 = (y_train == '9')
y_test_9 = (y_test == '9')
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_9)
SGDClassifier(random_state=42)
sgd_clf.predict([some_digit])
array([False])
Accuracy is usually a poor performance measure for classifiers, especially on skewed datasets like this one (only about 10% of the images are 9s).
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_9, cv=3, scoring="accuracy")
array([0.94675, 0.955 , 0.9435 ])
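The ~95% above looks impressive until you compare it to a dumb baseline. A minimal sketch (the Never9Classifier class below is illustrative, not from the original notes):
from sklearn.base import BaseEstimator

class Never9Classifier(BaseEstimator):
    def fit(self, X, y=None):
        return self
    def predict(self, X):
        return np.zeros((len(X),), dtype=bool)  # always predicts "not a 9"

never_9_clf = Never9Classifier()
cross_val_score(never_9_clf, X_train, y_train_9, cv=3, scoring="accuracy")
# expect roughly 0.90: only about 10% of the images are 9s, so always
# guessing "not 9" is ~90% accurate despite being useless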
cross_val_predict also performs K-fold cross-validation, but returns the predictions made on each test fold instead of the evaluation scores. Each prediction is "clean": it was made by a model that never saw that instance during training.
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_9, cv=3)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_9, y_train_pred)
# first row: actual negatives (non-9s): 53,290 true negatives, 761 false positives
# second row: actual positives (9s): 2,334 false negatives, 3,615 true positives
array([[53290,   761],
       [ 2334,  3615]])
precision_score() returns TP / (TP + FP)
When the classifier claims an image is a 9, it is correct only about 83% of the time
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_9, y_train_pred) # == 3615 / (3615 + 761)
0.8260968921389397
recall_score() returns TP / (TP + FN)
It only detects about 61% of the 9s
recall_score(y_train_9, y_train_pred) # == 3615 / (3615 + 2334)
0.6076651538073626
The F1 score is the harmonic mean of precision and recall: F1 = 2 * precision * recall / (precision + recall)
from sklearn.metrics import f1_score
f1_score(y_train_9, y_train_pred)
0.7002421307506054
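As a sanity check, all three metrics can be recomputed by hand from the confusion matrix above (sklearn's ravel() returns TN, FP, FN, TP in that order):
tn, fp, fn, tp = confusion_matrix(y_train_9, y_train_pred).ravel()
precision = tp / (tp + fp)                          # 3615 / (3615 + 761)  ~ 0.826
recall = tp / (tp + fn)                             # 3615 / (3615 + 2334) ~ 0.608
f1 = 2 * precision * recall / (precision + recall)  # ~ 0.700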
Precision and recall trade off against each other:
raising the decision threshold increases precision but lowers recall, and vice versa.
Scikit-Learn does not let you set the threshold directly, but it exposes the decision scores via decision_function(), so you can apply any threshold yourself:
y_scores = sgd_clf.decision_function([some_digit])
threshold = 0
y_some_digit_pred = (y_scores > threshold)  # threshold 0 reproduces predict()
y_scores = cross_val_predict(sgd_clf, X_train, y_train_9, cv=3, method='decision_function')
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_9, y_scores)
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
    plt.xlabel('Threshold')
    plt.legend(loc='center left')
    plt.ylim([0, 1])
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, 'g-')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
plot_precision_vs_recall(precisions, recalls)
plt.show()
y_train_pred90 = (y_scores > 5000)  # threshold read off the plot to target ~90% precision
precision_score(y_train_9, y_train_pred90)
0.9201934703748489
recall_score(y_train_9, y_train_pred90)
0.12792065893427468
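The threshold of 5000 above was eyeballed from the plot. A more systematic sketch: take the lowest threshold that reaches the target precision (np.argmax returns the index of the first True):
# assumes 90% precision is reached before the end of the curve
# (precisions has one more entry than thresholds)
threshold_90 = thresholds[np.argmax(precisions >= 0.90)]
y_train_pred90 = (y_scores >= threshold_90)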
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_9, y_scores)
The ROC curve plots the true positive rate (recall) against the false positive rate. The dashed diagonal is a completely random classifier; you want your classifier's curve to be as close to the top-left corner as possible.
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal: random classifier
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
plot_roc_curve(fpr, tpr)
plt.show()
You want the area under the curve (AUC) to be as close to 1 as possible; a purely random classifier has an AUC of 0.5.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_9, y_scores)
0.9608004989615918
Rule of thumb: use the PR curve when the positive class is rare, or when you care more about false positives than false negatives; use the ROC curve otherwise.
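An aside not in the original notes: if you want a single-number summary of the PR curve analogous to ROC AUC, average precision is one option:
from sklearn.metrics import average_precision_score
average_precision_score(y_train_9, y_scores)  # approximates the area under the PR curve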
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_9, cv=3, method='predict_proba')
y_scores_forest = y_probas_forest[:, 1] # score = probability of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_9, y_scores_forest)
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='lower right')
plt.show()
roc_auc_score(y_train_9, y_scores_forest)
0.9951605460783337
precisions, recalls, thresholds = precision_recall_curve(y_train_9, y_scores_forest)
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()
y_train_forest_pred90 = (y_scores_forest > .4)  # probability threshold picked from the curve
precision_score(y_train_9, y_train_forest_pred90)
0.959927797833935
recall_score(y_train_9, y_train_forest_pred90)
0.893931753235838
One vs All (OvA): one binary classifier per class; pick the class with the highest score.
One vs One (OvO): one binary classifier per pair of classes, N * (N - 1) / 2 in total.
Scikit-Learn defaults to OvA for most algorithms, but uses OvO for SVMs. You can also force either strategy manually, as shown below.
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
array(['3'], dtype='<U1')
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores
array([[-35174.9477605 , -20628.00288562, -23841.9225113 ,
           -73.13490557,  -2951.34057527,  -7734.13773636,
        -29138.40714959, -12521.89468369,  -5878.18191885,
         -4169.64674913]])
np.argmax(some_digit_scores)
3
sgd_clf.classes_
array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U1')
sgd_clf.classes_[3]
'3'
To force OvO instead of the default OvA:
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
array(['4'], dtype=object)
len(ovo_clf.estimators_)  # 10 * 9 / 2 = 45 pairwise classifiers
45
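Symmetrically, you can force OvA/OvR with OneVsRestClassifier; a minimal sketch:
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SGDClassifier(random_state=42))
ovr_clf.fit(X_train, y_train)
len(ovr_clf.estimators_)  # 10: one binary classifier per class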
Random Forest classifiers handle multiple classes natively, so neither OvO nor OvA is needed:
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
array(['9'], dtype=object)
forest_clf.predict_proba([some_digit])
array([[0.01, 0. , 0. , 0.01, 0.08, 0.03, 0. , 0. , 0. , 0.87]])
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
array([0.84205, 0.85795, 0.86885])
Normally the inputs would have been scaled before any training; we scale them now:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
array([0.89955, 0.8959 , 0.89925])
Error analysis: normally we would have spent time optimizing the model by now, so assume we already have a promising one and want to understand the kinds of errors it makes.
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx
array([[5574,    0,   15,    6,    8,   38,   32,    5,  244,    1],
       [   1, 6415,   44,   21,    3,   43,    4,    9,  192,   10],
       [  26,   26, 5238,   83,   71,   27,   68,   35,  375,    9],
       [  27,   18,  110, 5205,    0,  196,   24,   47,  433,   71],
       [  10,   14,   34,    7, 5228,    6,   32,   19,  344,  148],
       [  28,   19,   28,  151,   54, 4427,   78,   16,  550,   70],
       [  25,   17,   47,    2,   42,   85, 5554,    5,  141,    0],
       [  18,   13,   47,   24,   49,    9,    4, 5713,  181,  207],
       [  13,   67,   40,   93,    1,  125,   33,    7, 5429,   43],
       [  20,   22,   30,   52,  118,   35,    1,  181,  379, 5111]])
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
Divide each value by the number of images in the corresponding class, so we compare error rates rather than absolute numbers of errors (which would unfairly penalize the more abundant classes):
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()
# rows represent actual classes
# columns represent predicted classes
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))  # pad the last row with blanks
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=matplotlib.cm.binary, **options)
    plt.axis("off")
cl_a , cl_b = '3', '5'
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]
plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()
# left column: images classified as 3s; right column: images classified as 5s
# top row: actual 3s; bottom row: actual 5s
Multilabel classification: output several binary labels for each instance.
from sklearn.neighbors import KNeighborsClassifier
y_train_large = np.array([(i in '789') for i in y_train])    # is the digit large (7, 8, or 9)?
y_train_odd = np.array([(i in '13579') for i in y_train])    # is the digit odd?
y_multilabel = np.c_[y_train_large, y_train_odd]
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)
KNeighborsClassifier()
knn_clf.predict([some_digit])
array([[ True, True]])
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)
f1_score(y_multilabel, y_train_knn_pred, average='macro')
# set average='weighted' to weight each label by its support, when labels are not equally important
0.977819997174431
Multioutput classification: add noise to the images, then train a system that outputs the clean image (one label per pixel, each with 256 possible intensity values).
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train  # targets are the original, clean images
y_test_mod = X_test
peek = [X_train_mod[0], y_train_mod[0]]
plt.subplot(221); plot_digits(peek, images_per_row=2)
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[3600]])
plot_digits([clean_digit])
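To eyeball the result, one option is plotting the reconstruction next to the clean target (3600 is the same arbitrary test index used above):
plot_digits([clean_digit.ravel(), X_test[3600]], images_per_row=2)  # cleaned vs. original
plt.show()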