Ensemble Learning - Decision Trees, Boosting (AdaBoost), Bagging (Random Forests)

Avisek Gupta, Senior Research Fellow, ECSU

Dr. Swagatam Das, Associate Professor, ECSU

A Short Course on Machine Learning for Practitioners

Organized by Centre for Artificial Intelligence and Machine Learning

Indian Statistical Institute, Kolkata.

November 22, 2019

1. Training Decision Trees:

(i) Generating random data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Synthetic 2-D data: three Gaussian blobs with unit standard deviation,
# 100 points each, centred at (0, 0), (10, 0) and (5, 6).
X = np.vstack((
    np.random.normal(loc=[0, 0], scale=1, size=(100, 2)),
    np.random.normal(loc=[10, 0], scale=1, size=(100, 2)),
    np.random.normal(loc=[5, 6], scale=1, size=(100, 2)),
))
# Class labels 0, 1 and 2, in the same order as the blobs stacked above.
y = np.hstack((
    np.zeros((100)),
    np.zeros((100)) + 1,
    np.zeros((100)) + 2,
))
n_classes = 3

# One scatter call per class so that each class is drawn in its own colour.
for j in range(n_classes):
    plt.scatter(X[y == j, 0], X[y == j, 1], marker='x')

(ii) Decision Tree Classification

In [3]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier

# Train a Decision Tree Classifier on the full data set
clf = DecisionTreeClassifier().fit(X, y)

# Accuracy is measured on the training data itself (no held-out split here)
y_pred = clf.predict(X)
from sklearn.metrics import accuracy_score
print('Training Accuracy =', accuracy_score(y, y_pred))

# Plot the decision surface: classify every point of a dense grid that
# covers the data (with a margin of 1 on each side) and colour by class.
plot_colors = "ryb"
plot_step = 0.02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))
plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')

# Overlay the training points, one colour per class.  No cmap is passed:
# matplotlib ignores it (with a warning) when c is a literal colour.
for i, color in zip(range(n_classes), plot_colors):
    mask = y == i
    plt.scatter(X[mask, 0], X[mask, 1], c=color,
                edgecolor='black', s=15)
plt.title("Decision surface of a decision tree")
Training Accuracy = 1.0
In [27]:
In [21]:
from sklearn.datasets import load_iris

# Load the Iris data set once and take features and labels from the same
# returned object.
iris = load_iris()
X = iris.data
# Report the data matrix shape as (n_samples, n_features).
print(X.shape)

y = iris.target
n_classes = len(np.unique(y))
print('#Classes =', n_classes)
(150, 4)
#Classes = 3
In [22]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

n_classes = 3
plot_colors = "ryb"
plot_step = 0.02

# Load the data set once, outside the loop; this also defines `iris`, which
# the legend labels below rely on.
iris = load_iris()

# One subplot per unordered pair of the four features.
for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # Only take two features
    X = iris.data[:, pair]
    y = iris.target

    # Train
    clf = DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary: classify every point of a dense grid that
    # covers the two selected features (with a margin of 1 on each side).
    plt.subplot(2, 3, pairidx + 1)
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    # Plot the training points, one colour per class.  No cmap is passed:
    # matplotlib ignores it (with a warning) when c is a literal colour.
    for i, color in zip(range(n_classes), plot_colors):
        mask = y == i
        plt.scatter(X[mask, 0], X[mask, 1], c=color,
                    label=iris.target_names[i],
                    edgecolor='black', s=15)
plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(borderpad=0, handletextpad=0, bbox_to_anchor=(1.05, 1.05))

2. Decision Trees are interpretable classification models

In [40]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Load data
iris = load_iris()

# Fit on all features, then draw the learned tree.  Feature names come from
# the object that is already loaded rather than from a second load_iris().
clf = DecisionTreeClassifier().fit(iris.data, iris.target)
plot_tree(clf, feature_names=iris.feature_names, filled=True)

3. Application of interpretable classification models: Medical Diagnosis

In [42]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.datasets import load_breast_cancer
# Load the data set once; features, labels and feature names all come from
# the same returned object.
cancer = load_breast_cancer()
X = cancer.data
y = cancer.target

# Second line prints the number of distinct target labels.
print('Dsta size and dimensions =', X.shape)
print('Number of clusters =', len(np.unique(y)))

# Fit a tree on all features and draw it with the real feature names so the
# split rules can be read directly.
clf = DecisionTreeClassifier().fit(X, y)
plot_tree(clf, feature_names=cancer.feature_names, filled=True)
Dsta size and dimensions = (569, 30)
Number of clusters = 2

4. Boosting: Training the AdaBoost Classifier

In [32]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets

# Draw the synthetic Hastie 10.2 benchmark with a fixed seed and report its
# size and the number of distinct labels.
X, y = datasets.make_hastie_10_2(random_state=1, n_samples=12000)
class_labels = np.unique(y)
n_classes = len(class_labels)

print('data shape =', X.shape)
print('#Classes =', n_classes)
data shape = (12000, 10)
#Classes = 2
In [35]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import zero_one_loss
from sklearn.ensemble import AdaBoostClassifier

X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
# First 2000 samples are used for training, the remaining ones for testing.
X_test, y_test = X[2000:], y[2000:]
X_train, y_train = X[:2000], y[:2000]

# Baseline 1: a single decision stump (tree of depth 1).
dt_stump = DecisionTreeClassifier(max_depth=1)
dt_stump.fit(X_train, y_train)
dt_stump_err = 1.0 - dt_stump.score(X_test, y_test)

# Baseline 2: a single deeper tree.
dt = DecisionTreeClassifier(max_depth=9)
dt.fit(X_train, y_train)
dt_err = 1.0 - dt.score(X_test, y_test)

n_estimators = 400
learning_rate = 1.

# Boost decision stumps.  The weak learner is passed positionally because
# its keyword name differs between scikit-learn releases
# (base_estimator vs. estimator).
# NOTE(review): the original argument list was lost; this follows the
# scikit-learn AdaBoost example minus the `algorithm` option -- confirm.
ada_real = AdaBoostClassifier(
    dt_stump,
    n_estimators=n_estimators,
    learning_rate=learning_rate)
ada_real.fit(X_train, y_train)

fig = plt.figure(dpi=200)
ax = fig.add_subplot(111)

# The two baselines are constant, so they are drawn as horizontal lines.
ax.plot([1, n_estimators], [dt_stump_err] * 2, 'b-',
        label='Decision Stump Error')
ax.plot([1, n_estimators], [dt_err] * 2, 'b--',
        label='Decision Tree Error')

# Error after each boosting stage.  The arrays are sized by the number of
# stages actually produced, so the x axis always matches the data.
ada_real_err = np.array([
    zero_one_loss(y_test, y_pred)
    for y_pred in ada_real.staged_predict(X_test)])
ada_real_err_train = np.array([
    zero_one_loss(y_train, y_pred)
    for y_pred in ada_real.staged_predict(X_train)])

# NOTE(review): the original colour arguments were lost; orange/green are a
# reconstruction chosen to stand apart from the blue baselines.
ax.plot(np.arange(len(ada_real_err)) + 1, ada_real_err,
        label='AdaBoost Test Error',
        color='orange')
ax.plot(np.arange(len(ada_real_err_train)) + 1, ada_real_err_train,
        label='AdaBoost Train Error',
        color='green')
ax.set_ylim((0.0, 0.5))
ax.set_xlabel('n_estimators')
ax.set_ylabel('error rate')
leg = ax.legend(loc='upper right', fancybox=True)

5. Bagging: Training Random Forest of Decision Trees

In [55]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

# Load data
iris = load_iris()
X = iris.data
y = iris.target

clf = RandomForestClassifier(n_estimators=10).fit(X, y)

print('Training Accuracy =', accuracy_score(y, clf.predict(X)))

# Draw every tree of the forest in its own figure; without a fresh figure
# each plot_tree call would target the same current axes.
for i, dtree in enumerate(clf.estimators_, start=1):
    print('Tree #'+str(i),':')
    plt.figure()
    plot_tree(dtree, feature_names=iris.feature_names, filled=True)
    # Show now so that the figure appears right after its label.
    plt.show()
Training Accuracy = 1.0
Tree #1 :
Tree #2 :
Tree #3 :
Tree #4 :
Tree #5 :
Tree #6 :
Tree #7 :
Tree #8 :
Tree #9 :
Tree #10 :

6. Competing on the Digits data set:

(i) The Digits data set

In [38]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, digits.target

# Randomly select 10 images to be shown
rnd_idx = np.random.randint(0, X.shape[0], 10)

# 2 x 5 grid of panels; walking the flattened grid in row-major order pairs
# panel (i, j) with the selected sample number i*5 + j.
fig, ax = plt.subplots(2, 5, dpi=150)
for panel, sample in zip(ax.ravel(), rnd_idx):
    # Each row of X is reshaped to an 8 x 8 image for display.
    panel.imshow(X[sample].reshape(8, 8), cmap='gray')
    panel.set_title('Digit ' + str(y[sample]))

(ii) Visualizing the data set with TSNE

In [8]:
from sklearn.datasets import load_digits

digits = load_digits()
X, y = digits.data, digits.target
n_classes = len(np.unique(y))

from sklearn.manifold import TSNE

# Embed the digits into two dimensions for plotting.
embedder = TSNE(n_components=2)
projX = embedder.fit_transform(X)

# One scatter call per class so that every digit gets its own colour.
for label in range(n_classes):
    members = y == label
    plt.scatter(projX[members, 0], projX[members, 1], marker='x')
plt.title('TSNE projection of the digits data set')

(iii) Decision Trees vs. AdaBoost vs. Random Forest on the Digits data set

In [49]:
import time
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
X = digits.data
y = digits.target
n_classes = len(np.unique(y))

X_train, X_test, y_train, y_test = train_test_split(X, y)

results = []
train_time = []
test_time = []

clf1 = DecisionTreeClassifier()
# NOTE(review): the original AdaBoostClassifier arguments were lost; library
# defaults are used here -- confirm the intended weak learner and
# n_estimators.
clf2 = AdaBoostClassifier()
clf3 = RandomForestClassifier(n_estimators=30)

# Fit and evaluate every model the same way, timing each phase separately.
predictions = []
for clf in (clf1, clf2, clf3):
    start = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time.append(time.perf_counter() - start)

    # Restart the clock so that prediction time does not include fitting.
    start = time.perf_counter()
    y_pred = clf.predict(X_test)
    test_time.append(time.perf_counter() - start)

    predictions.append(y_pred)
    results.append(accuracy_score(y_test, y_pred))
y_pred1, y_pred2, y_pred3 = predictions

# Scale timings to [0, 1] so they share an axis with the accuracy scores.
train_time = np.array(train_time)
train_time = train_time / train_time.max()
test_time = np.array(test_time)
test_time = test_time / test_time.max()

# Three horizontal bars per classifier: accuracy, relative training time
# and relative prediction time.
indices = np.arange(len(results))
plt.barh(indices, results, .2, label="results", color='navy')
plt.barh(indices + .3, train_time, .2, label="train time",
         color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
# The bar labels only become visible once a legend is drawn.
plt.legend(loc='best')

# Write each classifier's name to the left of its group of bars.
clf_names = ['Decision Tree', 'AdaBoost', 'Random Forest']
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

In [51]:
# Pick 10 random test samples (drawn with replacement) to inspect.
test_idx = np.random.randint(0, X_test.shape[0], 10)

# Predictions of the three fitted models on the selected samples only, so
# they are indexed by position 0..9 rather than by test_idx.
y_pred1 = clf1.predict(X_test[test_idx])
y_pred2 = clf2.predict(X_test[test_idx])
y_pred3 = clf3.predict(X_test[test_idx])

fig, ax = plt.subplots(2,5, dpi=150)
for i in range(2):
    for j in range(5):
        k = i*5 + j  # position of this panel within the 10 selected samples
        ax[i,j].imshow(X_test[test_idx[k]].reshape(8,8), cmap='gray')
        # Each model's own prediction is shown under the true digit.
        ax[i,j].set_xlabel('Digit '+str(y_test[test_idx[k]])+'\n'
            +'DT pred: '+str(y_pred1[k])+'\n'
            +'AdB pred: '+str(y_pred2[k])+'\n'
            +'RF pred: '+str(y_pred3[k]))


1. The scikit-learn documentation: https://scikit-learn.org/stable/user_guide.html

For Queries: avisek003@gmail.com (Avisek Gupta)