Machine Learning in Python: scikit-learn

Demonstrated by:

Avisek Gupta, SRF in CS, ECSU

avisek003@gmail.com


In [1]:
import numpy as np
import matplotlib.pyplot as plt

Convention:

A 'data matrix' contains:

• data points in its rows

• features in its columns
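For example (a toy illustration, not part of the Iris demo below), a data matrix with 3 data points and 2 features is a 3×2 array:

import numpy as np

# 3 data points (rows), 2 features (columns)
X = np.array([[5.1, 3.5],
              [4.9, 3.0],
              [4.7, 3.2]])
print(X.shape)  # (3, 2) = (number of data points, number of features)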

In [2]:
from sklearn.datasets import load_iris
iris = load_iris().data

print('The Iris data set contains the data of 150 flowers.')

print('\nThe Iris data set has 50 flowers of each of the following 3 species:')
tn = load_iris().target_names
for i in tn:
    print('\tIris', i)

print('''
For each flower, the following four features are measured:
(1) sepal length, (2) sepal width, (3) petal length, (4) petal width.
\n''')

print('Dimensions of the data matrix =', iris.shape)

print('\nNumber of data points =', iris.shape[0])
print('Number of features =', iris.shape[1])

fn = load_iris().feature_names
print('\nNames of the features: ')
for i in fn:
    print('\t',i)

print('\nThe Iris Data Set:')
print(iris)
The Iris data set contains the data of 150 flowers.

The Iris data set has 50 flowers of each of the following 3 species:
	Iris setosa
	Iris versicolor
	Iris virginica

For each flower, the following four features are measured:
(1) sepal length, (2) sepal width, (3) petal length, (4) petal width.


Dimensions of the data matrix = (150, 4)

Number of data points = 150
Number of features = 4

Names of the features: 
	 sepal length (cm)
	 sepal width (cm)
	 petal length (cm)
	 petal width (cm)

The Iris Data Set:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]
 [5.7 2.8 4.5 1.3]
 [6.3 3.3 4.7 1.6]
 [4.9 2.4 3.3 1. ]
 [6.6 2.9 4.6 1.3]
 [5.2 2.7 3.9 1.4]
 [5.  2.  3.5 1. ]
 [5.9 3.  4.2 1.5]
 [6.  2.2 4.  1. ]
 [6.1 2.9 4.7 1.4]
 [5.6 2.9 3.6 1.3]
 [6.7 3.1 4.4 1.4]
 [5.6 3.  4.5 1.5]
 [5.8 2.7 4.1 1. ]
 [6.2 2.2 4.5 1.5]
 [5.6 2.5 3.9 1.1]
 [5.9 3.2 4.8 1.8]
 [6.1 2.8 4.  1.3]
 [6.3 2.5 4.9 1.5]
 [6.1 2.8 4.7 1.2]
 [6.4 2.9 4.3 1.3]
 [6.6 3.  4.4 1.4]
 [6.8 2.8 4.8 1.4]
 [6.7 3.  5.  1.7]
 [6.  2.9 4.5 1.5]
 [5.7 2.6 3.5 1. ]
 [5.5 2.4 3.8 1.1]
 [5.5 2.4 3.7 1. ]
 [5.8 2.7 3.9 1.2]
 [6.  2.7 5.1 1.6]
 [5.4 3.  4.5 1.5]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [6.3 2.3 4.4 1.3]
 [5.6 3.  4.1 1.3]
 [5.5 2.5 4.  1.3]
 [5.5 2.6 4.4 1.2]
 [6.1 3.  4.6 1.4]
 [5.8 2.6 4.  1.2]
 [5.  2.3 3.3 1. ]
 [5.6 2.7 4.2 1.3]
 [5.7 3.  4.2 1.2]
 [5.7 2.9 4.2 1.3]
 [6.2 2.9 4.3 1.3]
 [5.1 2.5 3.  1.1]
 [5.7 2.8 4.1 1.3]
 [6.3 3.3 6.  2.5]
 [5.8 2.7 5.1 1.9]
 [7.1 3.  5.9 2.1]
 [6.3 2.9 5.6 1.8]
 [6.5 3.  5.8 2.2]
 [7.6 3.  6.6 2.1]
 [4.9 2.5 4.5 1.7]
 [7.3 2.9 6.3 1.8]
 [6.7 2.5 5.8 1.8]
 [7.2 3.6 6.1 2.5]
 [6.5 3.2 5.1 2. ]
 [6.4 2.7 5.3 1.9]
 [6.8 3.  5.5 2.1]
 [5.7 2.5 5.  2. ]
 [5.8 2.8 5.1 2.4]
 [6.4 3.2 5.3 2.3]
 [6.5 3.  5.5 1.8]
 [7.7 3.8 6.7 2.2]
 [7.7 2.6 6.9 2.3]
 [6.  2.2 5.  1.5]
 [6.9 3.2 5.7 2.3]
 [5.6 2.8 4.9 2. ]
 [7.7 2.8 6.7 2. ]
 [6.3 2.7 4.9 1.8]
 [6.7 3.3 5.7 2.1]
 [7.2 3.2 6.  1.8]
 [6.2 2.8 4.8 1.8]
 [6.1 3.  4.9 1.8]
 [6.4 2.8 5.6 2.1]
 [7.2 3.  5.8 1.6]
 [7.4 2.8 6.1 1.9]
 [7.9 3.8 6.4 2. ]
 [6.4 2.8 5.6 2.2]
 [6.3 2.8 5.1 1.5]
 [6.1 2.6 5.6 1.4]
 [7.7 3.  6.1 2.3]
 [6.3 3.4 5.6 2.4]
 [6.4 3.1 5.5 1.8]
 [6.  3.  4.8 1.8]
 [6.9 3.1 5.4 2.1]
 [6.7 3.1 5.6 2.4]
 [6.9 3.1 5.1 2.3]
 [5.8 2.7 5.1 1.9]
 [6.8 3.2 5.9 2.3]
 [6.7 3.3 5.7 2.5]
 [6.7 3.  5.2 2.3]
 [6.3 2.5 5.  1.9]
 [6.5 3.  5.2 2. ]
 [6.2 3.4 5.4 2.3]
 [5.9 3.  5.1 1.8]]
In [3]:
from sklearn.datasets import load_iris
iris = load_iris().data
labels = load_iris().target

fig, ax = plt.subplots(4, 4, figsize=(24,22))
for i in range(4):
    for j in range(4):
        if i != j:
            ax[i,j].scatter(iris[:,i], iris[:,j], c=labels, marker='x')
            ax[i,j].set_xlabel('Feature '+str(i))
            ax[i,j].set_ylabel('Feature '+str(j))
        else:
            ax[i,j].text(0.5, 0.5, load_iris().feature_names[i], horizontalalignment='center',
                     verticalalignment='center', size=20)
            ax[i,j].set_xticks([])
            ax[i,j].set_yticks([])
plt.show()

Unsupervised Learning

k-Means Clustering

In [4]:
###########################################################
##                  k-Means Clustering                   ##
###########################################################


# Load the data set
from sklearn import datasets
iris = datasets.load_iris().data

# Run k-Means clustering on the data
from sklearn.cluster import KMeans
km1 = KMeans(n_clusters=3).fit(iris)

# Plot the data and clusters
fig, ax = plt.subplots(1,2,figsize=(16,6))
ax[0].scatter(iris[:,2], iris[:,3], marker='x', c='k')
ax[0].set_xlabel('Feature 2')
ax[0].set_ylabel('Feature 3')
ax[0].set_title('Iris Data Set', size=20)
col = ['y', 'g', 'b']
for i in range(3):
    ax[1].scatter(
        iris[km1.labels_==i,2], iris[km1.labels_==i,3], 
        marker='x', c=col[i]
    )
ax[1].scatter(
    km1.cluster_centers_[:,2],km1.cluster_centers_[:,3],
    s=120, marker='o', c=col, edgecolor='k'
)
ax[1].set_xlabel('Feature 2')
ax[1].set_ylabel('Feature 3')
ax[1].set_title('kMeans Clusters', size=20)
plt.show()
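A natural follow-up question is how many clusters to ask for. One quick, informal check (a sketch, not part of the demo above) is to compare the within-cluster sum of squared distances, exposed by KMeans as the inertia_ attribute, for a few values of k and look for where it stops dropping sharply:

# Compare k-Means inertia for several choices of k
from sklearn.cluster import KMeans
for k in range(2, 7):
    km = KMeans(n_clusters=k).fit(iris)
    # inertia_ = sum of squared distances of points to their nearest center
    print('k =', k, ' inertia =', km.inertia_)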
In [5]:
###########################################################
##        Step-by-step Demo of k-Means Clustering        ##
###########################################################


# Load the data
from sklearn import datasets
iris = datasets.load_iris().data

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot the data
fig, ax = plt.subplots(4,2,figsize=(16,28))
ax[0,0].scatter(iris[:,2], iris[:,3], marker='x', c='k')
ax[0,0].set_xlabel('Feature 2')
ax[0,0].set_ylabel('Feature 3')
ax[0,0].set_title('Iris Data', size=20)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Run k-Means clustering on the data
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
# Randomly select cluster centers
centers = iris[np.random.choice(
            iris.shape[0], size=3, replace=False
        ),:]
# Compute cluster memberships
labels = cdist(centers, iris).argmin(axis=0)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot clusters
col = ['y', 'g', 'b']
for i in range(3):
    ax[0,1].scatter(
        iris[labels==i,2], iris[labels==i,3], 
        marker='x', c=col[i]
    )
ax[0,1].scatter(
    centers[:,2], centers[:,3],
    s=120, marker='o', c=col, edgecolor='k'
)
ax[0,1].set_xlabel('Feature 2')
ax[0,1].set_ylabel('Feature 3')
ax[0,1].set_title('kMeans Iteration 1', size=20)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Iterate 6 more times
for i in range(6):
    # Run 1 iteration of kMeans
    km1 = KMeans(
            n_clusters=3, init=centers, max_iter=1
    ).fit(iris)
    
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Plot the clusters
    centers = km1.cluster_centers_
    labels = km1.labels_
    for j in range(3):
        ax[1+i//2,i%2].scatter(
            iris[labels==j,2], iris[labels==j,3], 
            marker='x', c=col[j]
        )
    ax[1+i//2,i%2].scatter(
        centers[:,2], centers[:,3],
        s=120, marker='o', c=col, edgecolor='k'
    )
    ax[1+i//2,i%2].set_xlabel('Feature 2')
    ax[1+i//2,i%2].set_ylabel('Feature 3')
    ax[1+i//2,i%2].set_title(
        'kMeans Iteration ' + str(i+2), size=20
    )
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

plt.show()
<ipython-input-5-41f9a4bd95f2>:49: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  km1 = KMeans(
(the warning above is emitted once per iteration of the loop, 6 times in total)
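The RuntimeWarning above is expected: when explicit initial centers are passed, k-Means performs a single initialization rather than the default n_init restarts. Passing n_init=1 along with the explicit centers should silence it (a sketch of the call inside the loop, not re-run here):

km1 = KMeans(
        n_clusters=3, init=centers, n_init=1, max_iter=1
).fit(iris)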

Supervised Learning

Gaussian Mixture Models classifier

In [6]:
import matplotlib as mpl
###########################################################
##              Function to draw ellipses                ##
###########################################################


def make_ellipses(gmm, ax):
    for n, color in enumerate('rgb'):
        v, w = np.linalg.eigh(gmm.covariances_[n][2:, 2:])
        u = w[0] / np.linalg.norm(w[0])
        angle = (180 / np.pi) * np.arctan2(u[1], u[0])
        # Boost magnitudes for visualization
        v = (v ** 0.4) * 3
        ell = mpl.patches.Ellipse(
            gmm.means_[n, 2:], v[0], v[1], 
            angle=180 + angle, color=color
        )
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.25)
        ax.add_artist(ell)


###########################################################
##        Gaussian Mixture Models Classification        ##
###########################################################
        
    
from sklearn.mixture import GaussianMixture as GMM
from sklearn import datasets
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()
# Split the data into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4
)
n_classes = len(np.unique(y_train))

# Initialize the Gaussian Mixture Model classifier object
gmm_classifier = GMM(
    n_components=n_classes, covariance_type='full'
)
gmm_classifier.means_init = np.array([
        X_train[y_train == i].mean(axis=0) 
        for i in range(n_classes)
])

# Train the classifier using the EM algorithm.
gmm_classifier.fit(X_train)

# Print the accuracy on the training data set
y_train_pred = gmm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

# Print the accuracy on the testing data set
y_test_pred = gmm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)



###########################################################
##               Plot classifier Gaussians               ##
###########################################################


fig, ax = plt.subplots(figsize=(8,6))
make_ellipses(gmm_classifier, ax)
for n, color in enumerate('rgb'):
    data = iris.data[iris.target == n]
    plt.scatter(
        data[:, 2], data[:, 3], s=10, 
        c=color, label=iris.target_names[n]
    )
# Plot the test data with crosses
for n, color in enumerate('rgb'):
    data = X_test[y_test == n]
    plt.scatter(
        data[:, 2], data[:, 3], s=80, 
        marker='x', c=color
    )
plt.xticks(()) 
plt.yticks(())
plt.xlabel('Feature 2')
plt.ylabel('Feature 3')
plt.legend(loc='lower right', prop=dict(size=16))
plt.title(
    'Fitting Gaussian Mixture Models on the Iris data set',
    size=16
)
plt.show()
Training Accuracy = 97.77777777777777
Testing Accuracy = 96.66666666666667
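Besides hard labels, the fitted mixture also gives the posterior probability of each component via predict_proba. Since the components were initialized from the per-class means, they line up with the classes here; a minimal sketch:

# Posterior probabilities of the 3 components for the first 5 test points
probs = gmm_classifier.predict_proba(X_test[:5])
print(np.round(probs, 3))
print('Predicted classes:', gmm_classifier.predict(X_test[:5]))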

Support Vector Machines

In [7]:
###########################################################
##               Generate 2 random classes               ##
###########################################################


from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=100, n_features=2, centers=2)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
plt.figure(figsize=(8,6))
plt.scatter(X[:,0], X[:,1], marker='x', c='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('2 classes of random data', size=16)
plt.axis('tight')
plt.show()
In [8]:
###########################################################
##                      Linear SVM                       ##
###########################################################


# Split the data into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4
)

# Train the Linear SVM classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Print the accuracy on the training data set
y_train_pred = svm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

# Print the accuracy on the testing data set
y_test_pred = svm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)

# Plots of the classified data and the maximal margin
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
plt.figure(figsize=(8,6))
plt.clf()
plt.scatter(
    X[:, 0], X[:, 1], c=y, zorder=10, 
    cmap=plt.cm.Paired, edgecolor='k', s=20
)
# Circle out the test data
plt.scatter(
    X_test[:, 0], X_test[:, 1], s=80, 
    facecolors='none', zorder=10, edgecolor='k'
)
plt.axis('tight')
x_min = X[:, 0].min()
x_max = X[:, 0].max()
y_min = X[:, 1].min()
y_max = X[:, 1].max()
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
Z = svm_classifier.decision_function(
    np.c_[XX.ravel(), YY.ravel()]
)
# Put the result into a color plot
Z = Z.reshape(XX.shape)
plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
            linestyles=['--', '-', '--'], levels=[-1, 0, 1])
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Linear SVM', size=20)
plt.show()
Training Accuracy = 100.0
Testing Accuracy = 100.0
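The fitted SVC object also exposes the points that define the margin; a quick sketch of how to inspect them:

# Support vectors found by the linear SVM
print('Support vectors per class:', svm_classifier.n_support_)
print('Support vectors:\n', svm_classifier.support_vectors_)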
In [9]:
from sklearn.datasets import make_circles
X, y = make_circles()

# Plot the data
plt.figure(figsize=(8,6))
plt.scatter(X[:,0], X[:,1], marker='x', c=y)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Concentric Circles data set', size=20)
plt.show()
In [10]:
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split

X, y = make_circles()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4
)

# Train the Linear SVM classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)


print('On the concentric circles data set, performance of Linear SVM:')
# Print the accuracy on the training data set
y_train_pred = svm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

# Print the accuracy on the testing data set
y_test_pred = svm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
On the concentric circles data set, performance of Linear SVM:
Training Accuracy = 48.333333333333336
Testing Accuracy = 45.0
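A linear boundary cannot separate concentric circles in the original two features. To see what a kernel effectively buys us, one can append the squared radius x1² + x2² as a third feature, after which the two circles become linearly separable (a sketch under that assumption):

# Map the data to 3 dimensions by appending the squared radius as a feature
X3 = np.c_[X, (X ** 2).sum(axis=1)]
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y, test_size=0.4
)
svm_lin3 = SVC(kernel='linear').fit(X3_train, y3_train)
y3_pred = svm_lin3.predict(X3_test)
print('Testing Accuracy with the extra feature =', (y3_pred == y3_test).mean() * 100)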
In [11]:
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
X, y = make_circles()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4
)


# Train the SVM classifier using the RBF Kernel
from sklearn.svm import SVC
svm_classifier = SVC(kernel='rbf')
svm_classifier.fit(X_train, y_train)


# Print the accuracy on the training data set
y_train_pred = svm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

# Print the accuracy on the testing data set
y_test_pred = svm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot the data
fig, ax = plt.subplots(1, 2, figsize=(16,6))
ax[0].scatter(X[:,0], X[:,1], marker='x', c=y)
ax[0].set_xlabel('Feature 1')
ax[0].set_ylabel('Feature 2')
ax[0].set_title('Concentric Circles data set', size=20)
# Plots of the classified data
y_plot = svm_classifier.predict(X)
x_min = X[:, 0].min()
x_max = X[:, 0].max()
y_min = X[:, 1].min()
y_max = X[:, 1].max()
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
Z = svm_classifier.decision_function(
    np.c_[XX.ravel(), YY.ravel()]
)
# Put the result into a color plot
Z = Z.reshape(XX.shape)
ax[1].pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
ax[1].scatter(
    X[:, 0], X[:, 1], c = y_plot, marker='x'
)
ax[1].set_title('SVM using RBF Kernels', size=20)
plt.show()
Training Accuracy = 100.0
Testing Accuracy = 100.0
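In practice the RBF kernel's hyperparameters C and gamma are tuned rather than left at their defaults; a minimal GridSearchCV sketch (the parameter grid below is only an example):

from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.1, 1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X_train, y_train)
print('Best parameters:', grid.best_params_)
print('Best cross-validated accuracy =', grid.best_score_ * 100)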

kNN classifier

In [12]:
from sklearn import datasets
iris = datasets.load_iris()
# taking only the last two features for easy visualization
X = iris.data[:, 2:]
y = iris.target

# Train the kNN classifier
from sklearn import neighbors
n_neighbors = 5
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors)
knn_classifier.fit(X, y)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from matplotlib.colors import ListedColormap
h = .02  # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Plot the decision boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h))
Z = knn_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8,6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
    edgecolor='k', s=20)
plt.title("3-Class classification with k = %i"
    % (n_neighbors), size=20)
plt.show()
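The choice of k controls how smooth the decision boundary is: very small k tends to overfit, very large k washes out the class structure. A quick cross-validated comparison on the same two features (a sketch):

from sklearn.model_selection import cross_val_score
for k in [1, 5, 15, 50]:
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=5)
    print('k =', k, ' mean CV accuracy =', scores.mean() * 100)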

Decision Trees

In [13]:
from sklearn.datasets import load_iris
iris = load_iris()

from sklearn import tree
decision_trees = tree.DecisionTreeClassifier()
decision_trees = decision_trees.fit(iris.data, iris.target)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~
import graphviz
dot_data = tree.export_graphviz(decision_trees, out_file=None)
graph = graphviz.Source(dot_data) 
graph.render("iris") 
dot_data = tree.export_graphviz(
    decision_trees, out_file=None, 
    feature_names=iris.feature_names,  
    class_names=iris.target_names,  
    filled=True, rounded=True,  
    special_characters=True
)
graph = graphviz.Source(dot_data)  
graph 
Out[13]:
[Graphviz rendering of the fitted decision tree: the root splits on petal length (cm) ≤ 2.45 to separate setosa, and further splits on petal width and petal length separate versicolor from virginica.]
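The fitted tree also summarizes how much each feature contributes to its splits through feature_importances_; a small sketch:

# Relative importance of each feature in the fitted tree
for name, importance in zip(iris.feature_names, decision_trees.feature_importances_):
    print(name, ':', round(importance, 3))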
In [14]:
from sklearn.datasets import load_breast_cancer
wdbc = load_breast_cancer()

from sklearn import tree
decision_trees = tree.DecisionTreeClassifier()
decision_trees = decision_trees.fit(wdbc.data, wdbc.target)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~
import graphviz
dot_data = tree.export_graphviz(decision_trees, out_file=None)
graph = graphviz.Source(dot_data) 
graph.render("wdbc") 
dot_data = tree.export_graphviz(
    decision_trees, out_file=None, 
    feature_names=wdbc.feature_names,  
    class_names=wdbc.target_names,  
    filled=True, rounded=True,  
    special_characters=True
)
graph = graphviz.Source(dot_data)  
graph 
Out[14]:
[Graphviz rendering of the fitted decision tree for the breast cancer (WDBC) data: the root splits on worst radius ≤ 16.795, with further splits on worst concave points, mean texture, and related features separating the benign and malignant classes.]
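A fully grown tree like the one above can overfit; limiting its depth is a common remedy. A minimal sketch (max_depth=3 is an arbitrary choice for illustration):

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    wdbc.data, wdbc.target, test_size=0.4
)
shallow_tree = tree.DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
print('Training Accuracy =', (shallow_tree.predict(X_train) == y_train).mean() * 100)
print('Testing Accuracy =', (shallow_tree.predict(X_test) == y_test).mean() * 100)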

Random Forest, Multi-Layered Perceptrons

In [15]:
from sklearn.datasets import load_digits
data = load_digits().data
labels = load_digits().target

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Display random data points (which are numeric digits)
rand_idx = np.random.choice(
            data.shape[0], size=5, replace=False
        )
fig, ax = plt.subplots(1,5,figsize=(16,6))
for i in range(len(rand_idx)):
    ax[i].imshow(data[rand_idx[i],:].reshape(8,8), cmap='Greys')
    ax[i].set_title('Digit '+str(labels[rand_idx[i]]), size=14)
plt.show()
In [16]:
from sklearn.datasets import load_digits
data = load_digits().data
labels = load_digits().target
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.4
)


# Train the Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(
    n_estimators=20, max_features=int(X_train.shape[1] ** 0.5)
)
random_forest.fit(X_train, y_train)

print('Random Forest:')
y_train_pred = random_forest.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

y_test_pred = random_forest.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)


# Train the Multi-Layered Perceptrons
from sklearn.neural_network import MLPClassifier
neural_networks = MLPClassifier(hidden_layer_sizes=(100,))
neural_networks.fit(X_train, y_train)

print('\nMulti-Layered Perceptron:')
y_train_pred = neural_networks.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

y_test_pred = neural_networks.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)


# Train Linear Support Vector Machines
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

print('\nSupport Vector Machines:')
y_train_pred = svm.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)

y_test_pred = svm.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
Random Forest:
Training Accuracy = 100.0
Testing Accuracy = 95.54937413073714

Multi-Layered Perceptron:
Training Accuracy = 100.0
Testing Accuracy = 97.07927677329624

Support Vector Machines:
Training Accuracy = 100.0
Testing Accuracy = 97.77468706536857

10-Fold Cross-Validation

In [17]:
from sklearn.datasets import load_digits
X = load_digits().data
y = load_digits().target

from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True)

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

rf_accuracy = 0
mlp_accuracy = 0
svm_accuracy = 0

for train_idx, test_idx in kf.split(X):
    X_train, X_test, y_train, y_test = \
        X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    
    random_forest = RandomForestClassifier(
        n_estimators=20, max_features=int(X_train.shape[1] ** 0.5)
    ).fit(X_train, y_train)
    y_test_pred = random_forest.predict(X_test)
    rf_accuracy += (y_test_pred == y_test).mean() * 100

    neural_network = MLPClassifier(
        hidden_layer_sizes=(100,)
    ).fit(X_train, y_train)
    y_test_pred = neural_network.predict(X_test)
    mlp_accuracy += (y_test_pred == y_test).mean() * 100

    svm = SVC(kernel='linear').fit(X_train, y_train)
    y_test_pred = svm.predict(X_test)
    svm_accuracy += (y_test_pred == y_test).mean() * 100

print('Random Forest accuracy =', rf_accuracy / 10)
print(
    'Multi-Layered Perceptron accuracy =', mlp_accuracy / 10
)
print('Linear SVM accuracy =', svm_accuracy / 10)
Random Forest accuracy = 96.49472377405338
Multi-Layered Perceptron accuracy = 97.71942892613284
Linear SVM accuracy = 98.10676598386097
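scikit-learn can also wrap the fold loop above in a single call with cross_val_score; a sketch for the linear SVM:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(SVC(kernel='linear'), X, y, cv=10)
print('Linear SVM 10-fold accuracy =', scores.mean() * 100)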
In [18]:
from IPython.display import Image
Image("sklearn.png")
Out[18]:
[image output: sklearn.png]