import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
iris = load_iris().data
print('The Iris data set contains measurements of 150 flowers.')
print('\nThe Iris data set contains 50 flowers of each of the following 3 species:')
tn = load_iris().target_names
for i in tn:
    print('\tIris', i)
print('''
For each flower, the following four features are measured:
(1) sepal length, (2) sepal width, (3) petal length, (4) petal width.
\n''')
print('Dimensions of the data matrix =', iris.shape)
print('\nNumber of data points =', iris.shape[0])
print('Number of features =', iris.shape[1])
fn = load_iris().feature_names
print('\nNames of the features: ')
for i in fn:
    print('\t', i)
print('\nThe Iris Data Set:')
print(iris)
from sklearn.datasets import load_iris
iris = load_iris().data
labels = load_iris().target
fig, ax = plt.subplots(4, 4, figsize=(24,22))
for i in range(4):
    for j in range(4):
        if i != j:
            # Off-diagonal panels: scatter plot of one feature against another
            ax[i,j].scatter(iris[:,i], iris[:,j], c=labels, marker='x')
            ax[i,j].set_xlabel('Feature '+str(i))
            ax[i,j].set_ylabel('Feature '+str(j))
        else:
            # Diagonal panels: display the feature name instead of a plot
            ax[i,j].text(0.5, 0.5, load_iris().feature_names[i],
                         horizontalalignment='center',
                         verticalalignment='center', size=20)
            ax[i,j].set_xticks([])
            ax[i,j].set_yticks([])
plt.show()
###########################################################
## k-Means Clustering ##
###########################################################
# Load the data set
from sklearn import datasets
iris = datasets.load_iris().data
# Run k-Means clustering on the data
from sklearn.cluster import KMeans
km1 = KMeans(n_clusters=3).fit(iris)
# Plot the data and clusters
fig, ax = plt.subplots(1,2,figsize=(16,6))
ax[0].scatter(iris[:,2], iris[:,3], marker='x', c='k')
ax[0].set_xlabel('Feature 2')
ax[0].set_ylabel('Feature 3')
ax[0].set_title('Iris Data Set', size=20)
col = ['y', 'g', 'b']
for i in range(3):
    ax[1].scatter(
        iris[km1.labels_==i,2], iris[km1.labels_==i,3],
        marker='x', c=col[i]
    )
ax[1].scatter(
    km1.cluster_centers_[:,2], km1.cluster_centers_[:,3],
    s=120, marker='o', c=col, edgecolor='k'
)
ax[1].set_xlabel('Feature 2')
ax[1].set_ylabel('Feature 3')
ax[1].set_title('kMeans Clusters', size=20)
plt.show()
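# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch (not part of the demo above): one common way to
# sanity-check the choice of k is to compare the within-cluster sum of
# squares (exposed by KMeans as inertia_) and the silhouette score
# across a few candidate values of k.
from sklearn.metrics import silhouette_score
for k in range(2, 6):
    km = KMeans(n_clusters=k).fit(iris)
    print('k =', k,
          '| inertia =', round(km.inertia_, 2),
          '| silhouette =', round(silhouette_score(iris, km.labels_), 3))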
###########################################################
## Step-by-step Demo of k-Means Clustering ##
###########################################################
# Load the data
from sklearn import datasets
iris = datasets.load_iris().data
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot the data
fig, ax = plt.subplots(4,2,figsize=(16,28))
ax[0,0].scatter(iris[:,2], iris[:,3], marker='x', c='k')
ax[0,0].set_xlabel('Feature 2')
ax[0,0].set_ylabel('Feature 3')
ax[0,0].set_title('Iris Data', size=20)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Run k-Means clustering on the data
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
# Randomly select cluster centers
centers = iris[np.random.choice(
    iris.shape[0], size=3, replace=False
), :]
# Compute cluster memberships
labels = cdist(centers, iris).argmin(axis=0)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot clusters
col = ['y', 'g', 'b']
for i in range(3):
    ax[0,1].scatter(
        iris[labels==i,2], iris[labels==i,3],
        marker='x', c=col[i]
    )
ax[0,1].scatter(
    centers[:,2], centers[:,3],
    s=120, marker='o', c=col, edgecolor='k'
)
ax[0,1].set_xlabel('Feature 2')
ax[0,1].set_ylabel('Feature 3')
ax[0,1].set_title('kMeans Iteration 1', size=20)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Iterate 6 more times
for i in range(6):
    # Run 1 iteration of k-Means, starting from the current centers
    # (n_init=1 because an explicit initialization is supplied)
    km1 = KMeans(
        n_clusters=3, init=centers, max_iter=1, n_init=1
    ).fit(iris)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # Plot the clusters
    centers = km1.cluster_centers_
    labels = km1.labels_
    for j in range(3):
        ax[1+i//2,i%2].scatter(
            iris[labels==j,2], iris[labels==j,3],
            marker='x', c=col[j]
        )
    ax[1+i//2,i%2].scatter(
        centers[:,2], centers[:,3],
        s=120, marker='o', c=col, edgecolor='k'
    )
    ax[1+i//2,i%2].set_xlabel('Feature 2')
    ax[1+i//2,i%2].set_ylabel('Feature 3')
    ax[1+i//2,i%2].set_title(
        'kMeans Iteration ' + str(i+2), size=20
    )
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
plt.show()
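# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional NumPy-only sketch of what each KMeans(max_iter=1) call above
# is doing: alternate between (1) assigning every point to its nearest
# center and (2) moving each center to the mean of its assigned points
# (assumes no cluster ever becomes empty).
centers_np = iris[np.random.choice(iris.shape[0], size=3, replace=False), :]
for _ in range(7):
    labels_np = cdist(centers_np, iris).argmin(axis=0)  # assignment step
    centers_np = np.array([iris[labels_np == j].mean(axis=0)
                           for j in range(3)])          # update step
print('Final centers from the NumPy sketch:\n', centers_np)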
import matplotlib as mpl
###########################################################
## Function to draw ellipses ##
###########################################################
def make_ellipses(gmm, ax):
    for n, color in enumerate('rgb'):
        # Use only the last two features (petal length and width)
        v, w = np.linalg.eigh(gmm.covariances_[n][2:, 2:])
        u = w[0] / np.linalg.norm(w[0])
        angle = (180 / np.pi) * np.arctan2(u[1], u[0])
        # Boost magnitudes for visualization
        v = (v ** 0.4) * 3
        ell = mpl.patches.Ellipse(
            gmm.means_[n, 2:], v[0], v[1],
            angle=180 + angle, color=color
        )
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.25)
        ax.add_artist(ell)
###########################################################
## Gaussian Mixture Models Classification ##
###########################################################
from sklearn.mixture import GaussianMixture as GMM
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
# Split the data into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4
)
n_classes = len(np.unique(y_train))
# Initialize the Gaussian Mixture Model classifier object
gmm_classifier = GMM(
    n_components=n_classes, covariance_type='full'
)
# Initialize each component's mean to the per-class mean of the training
# data, so that component i corresponds to class i
gmm_classifier.means_init = np.array([
    X_train[y_train == i].mean(axis=0)
    for i in range(n_classes)
])
# Train the classifier using the EM algorithm.
gmm_classifier.fit(X_train)
# Print the accuracy on the training data set
y_train_pred = gmm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
# Print the accuracy on the testing data set
y_test_pred = gmm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
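# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: unlike k-Means, a Gaussian mixture gives soft
# assignments. Print the posterior probability of each component for
# the first 5 test points (each row sums to 1).
print('\nPosterior probabilities of the first 5 test points:')
print(np.round(gmm_classifier.predict_proba(X_test[:5]), 3))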
###########################################################
## Plot classifier Gaussians ##
###########################################################
fig, ax = plt.subplots(figsize=(8,6))
make_ellipses(gmm_classifier, ax)
for n, color in enumerate('rgb'):
    data = iris.data[iris.target == n]
    plt.scatter(
        data[:, 2], data[:, 3], s=10,
        c=color, label=iris.target_names[n]
    )
# Plot the test data with crosses
for n, color in enumerate('rgb'):
    data = X_test[y_test == n]
    plt.scatter(
        data[:, 2], data[:, 3], s=80,
        marker='x', c=color
    )
plt.xticks(())
plt.yticks(())
plt.xlabel('Feature 2')
plt.ylabel('Feature 3')
plt.legend(loc='lower right', prop=dict(size=16))
plt.title(
    'Fitting Gaussian Mixture Models on the Iris data set',
    size=16
)
plt.show()
###########################################################
## Generate 2 random classes ##
###########################################################
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=100, n_features=2, centers=2)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
plt.figure(figsize=(8,6))
plt.scatter(X[:,0], X[:,1], marker='x', c='k')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('2 classes of random data', size=16)
plt.axis('tight')
plt.show()
###########################################################
## Linear SVM ##
###########################################################
# Split the data into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4
)
# Train the Linear SVM classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
# Print the accuracy on the training data set
y_train_pred = svm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
# Print the accuracy on the testing data set
y_test_pred = svm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
# Plots of the classified data and the maximal margin
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
plt.figure(figsize=(8,6))
plt.clf()
plt.scatter(
    X[:, 0], X[:, 1], c=y, zorder=10,
    cmap=plt.cm.Paired, edgecolor='k', s=20
)
# Circle out the test data
plt.scatter(
    X_test[:, 0], X_test[:, 1], s=80,
    facecolors='none', zorder=10, edgecolor='k'
)
plt.axis('tight')
x_min = X[:, 0].min()
x_max = X[:, 0].max()
y_min = X[:, 1].min()
y_max = X[:, 1].max()
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
Z = svm_classifier.decision_function(
    np.c_[XX.ravel(), YY.ravel()]
)
# Put the result into a color plot
Z = Z.reshape(XX.shape)
plt.pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
plt.contour(XX, YY, Z, colors=['k', 'k', 'k'],
            linestyles=['--', '-', '--'], levels=[-1, 0, 1])
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Linear SVM', size=20)
plt.show()
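# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: for a linear kernel the separating hyperplane
# w.x + b = 0 can be read directly off the fitted classifier.
w = svm_classifier.coef_[0]
b = svm_classifier.intercept_[0]
print('Decision boundary: %.3f*x1 + %.3f*x2 + %.3f = 0' % (w[0], w[1], b))
print('Number of support vectors per class:', svm_classifier.n_support_)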
from sklearn.datasets import make_circles
X, y = make_circles()
# Plot the data
plt.figure(figsize=(8,6))
plt.scatter(X[:,0], X[:,1], marker='x', c=y)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Concentric Circles data set', size=20)
plt.show()
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
X, y = make_circles()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4
)
# Train the Linear SVM classifier
from sklearn.svm import SVC
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)
print('On the concentric circles data set, performance of Linear SVM:')
# Print the accuracy on the training data set
y_train_pred = svm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
# Print the accuracy on the testing data set
y_test_pred = svm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
X, y = make_circles()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4
)
# Train the SVM classifier using the RBF Kernel
from sklearn.svm import SVC
svm_classifier = SVC(kernel='rbf')
svm_classifier.fit(X_train, y_train)
# Print the accuracy on the training data set
y_train_pred = svm_classifier.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
# Print the accuracy on the testing data set
y_test_pred = svm_classifier.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Plot the data
fig, ax = plt.subplots(1, 2, figsize=(16,6))
ax[0].scatter(X[:,0], X[:,1], marker='x', c=y)
ax[0].set_xlabel('Feature 1')
ax[0].set_ylabel('Feature 2')
ax[0].set_title('Concentric Circles data set', size=20)
# Plots of the classified data
y_plot = svm_classifier.predict(X)
x_min = X[:, 0].min()
x_max = X[:, 0].max()
y_min = X[:, 1].min()
y_max = X[:, 1].max()
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
Z = svm_classifier.decision_function(
    np.c_[XX.ravel(), YY.ravel()]
)
# Put the result into a color plot
Z = Z.reshape(XX.shape)
ax[1].pcolormesh(XX, YY, Z > 0, cmap=plt.cm.Paired)
ax[1].scatter(
    X[:, 0], X[:, 1], c=y_plot, marker='x'
)
ax[1].set_title('SVM using RBF Kernels', size=20)
plt.show()
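# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: the RBF kernel's flexibility is governed by gamma
# (and the penalty parameter C). A quick sweep over gamma on the same
# train/test split; exact numbers vary from run to run.
for g in [0.01, 0.1, 1, 10]:
    clf = SVC(kernel='rbf', gamma=g).fit(X_train, y_train)
    print('gamma =', g, '-> test accuracy =', clf.score(X_test, y_test) * 100)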
from sklearn import datasets
iris = datasets.load_iris()
# taking only the last two features for easy visualization
X = iris.data[:, 2:]
y = iris.target
# Train the kNN classifier
from sklearn import neighbors
n_neighbors = 5
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
knn_classifier.fit(X, y)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from matplotlib.colors import ListedColormap
h = .02 # step size in the mesh
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Plot the decision boundary
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = knn_classifier.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(figsize=(8,6))
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold,
            edgecolor='k', s=20)
plt.title("3-Class classification with k = %i"
          % (n_neighbors), size=20)
plt.show()
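# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: the choice of k trades off flexibility against
# smoothness of the decision boundary. Compare test accuracy for a few
# values of k on a fresh train/test split; numbers vary from run to run.
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.4)
for k in [1, 5, 15]:
    clf = neighbors.KNeighborsClassifier(n_neighbors=k).fit(X_tr, y_tr)
    print('k = %2d -> test accuracy = %.1f%%' % (k, clf.score(X_te, y_te) * 100))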
from sklearn.datasets import load_iris
iris = load_iris()
from sklearn import tree
decision_trees = tree.DecisionTreeClassifier()
decision_trees = decision_trees.fit(iris.data, iris.target)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
import graphviz
dot_data = tree.export_graphviz(decision_trees, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("iris")
dot_data = tree.export_graphviz(
    decision_trees, out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True, rounded=True,
    special_characters=True
)
graph = graphviz.Source(dot_data)
graph
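# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: if graphviz is not available, sklearn can also print
# the learned rules as plain text (requires sklearn >= 0.21).
print(tree.export_text(decision_trees, feature_names=iris.feature_names))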
from sklearn.datasets import load_breast_cancer
wdbc = load_breast_cancer()
from sklearn import tree
decision_trees = tree.DecisionTreeClassifier()
decision_trees = decision_trees.fit(wdbc.data, wdbc.target)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
import graphviz
dot_data = tree.export_graphviz(decision_trees, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("wdbc")
dot_data = tree.export_graphviz(
    decision_trees, out_file=None,
    feature_names=wdbc.feature_names,
    class_names=wdbc.target_names,
    filled=True, rounded=True,
    special_characters=True
)
graph = graphviz.Source(dot_data)
graph
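# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: impurity-based feature importances show which
# measurements the fitted tree actually splits on (top 5 shown; the
# ranking can vary between runs).
top5 = np.argsort(decision_trees.feature_importances_)[::-1][:5]
for i in top5:
    print('%-25s %.3f' % (wdbc.feature_names[i],
                          decision_trees.feature_importances_[i]))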
from sklearn.datasets import load_digits
data = load_digits().data
labels = load_digits().target
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Display random data points (which are numeric digits)
rand_idx = np.random.choice(
    data.shape[0], size=5, replace=False
)
fig, ax = plt.subplots(1, 5, figsize=(16,6))
for i in range(len(rand_idx)):
    ax[i].imshow(data[rand_idx[i],:].reshape(8,8), cmap='Greys')
    ax[i].set_title('Digit '+str(labels[rand_idx[i]]), size=14)
plt.show()
from sklearn.datasets import load_digits
data = load_digits().data
labels = load_digits().target
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.4
)
# Train the Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(
    n_estimators=20, max_features=int(X_train.shape[1] ** 0.5)
)
random_forest.fit(X_train, y_train)
print('Random Forest:')
y_train_pred = random_forest.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
y_test_pred = random_forest.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
# Train the Multi-Layered Perceptrons
from sklearn.neural_network import MLPClassifier
neural_networks = MLPClassifier(hidden_layer_sizes=(100,))
neural_networks.fit(X_train, y_train)
print('\nMulti-Layered Perceptron:')
y_train_pred = neural_networks.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
y_test_pred = neural_networks.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
# Train Linear Support Vector Machines
from sklearn.svm import SVC
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
print('\nSupport Vector Machines:')
y_train_pred = svm.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean() * 100
print('Training Accuracy =', train_accuracy)
y_test_pred = svm.predict(X_test)
test_accuracy = (y_test_pred == y_test).mean() * 100
print('Testing Accuracy =', test_accuracy)
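# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: accuracy alone hides which digits get confused with
# which, so print the confusion matrix of the linear SVM on the test
# set (rows are true digits, columns are predicted digits).
from sklearn.metrics import confusion_matrix
print('\nLinear SVM confusion matrix:')
print(confusion_matrix(y_test, y_test_pred))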
from sklearn.datasets import load_digits
X = load_digits().data
y = load_digits().target
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
rf_accuracy = 0
mlp_accuracy = 0
svm_accuracy = 0
for train_idx, test_idx in kf.split(X):
    X_train, X_test, y_train, y_test = \
        X[train_idx], X[test_idx], y[train_idx], y[test_idx]
    random_forest = RandomForestClassifier(
        n_estimators=20, max_features=int(X_train.shape[1] ** 0.5)
    ).fit(X_train, y_train)
    y_test_pred = random_forest.predict(X_test)
    rf_accuracy += (y_test_pred == y_test).mean() * 100
    neural_network = MLPClassifier(
        hidden_layer_sizes=(100,)
    ).fit(X_train, y_train)
    y_test_pred = neural_network.predict(X_test)
    mlp_accuracy += (y_test_pred == y_test).mean() * 100
    svm = SVC(kernel='linear').fit(X_train, y_train)
    y_test_pred = svm.predict(X_test)
    svm_accuracy += (y_test_pred == y_test).mean() * 100
print('Random Forest accuracy =', rf_accuracy / 10)
print(
    'Multi-Layered Perceptron accuracy =', mlp_accuracy / 10
)
print('Linear SVM accuracy =', svm_accuracy / 10)
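# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Optional sketch: the manual loop above can be replaced by
# cross_val_score, which performs the splitting, fitting, and scoring
# itself (it returns fractions, not percentages). Shown here for the
# linear SVM only.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(SVC(kernel='linear'), X, y, cv=kf)
print('Linear SVM 10-fold accuracy via cross_val_score =',
      scores.mean() * 100)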
from IPython.display import Image
Image("sklearn.png")