# Gradient Descent using torch.optim
import torch
import torch.optim as optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
x = torch.randn(1, requires_grad=True, device=device)
print('Initial x:', x)
max_iter = 100
learning_rate = 0.1
eps = 1e-6
# (a) Initialize Optimizer
optimizer = optim.SGD([x], lr=learning_rate)
for _ in range(max_iter):
    optimizer.zero_grad()            # (b) i. Explicitly set the gradients to zero
    y = (x - 4) ** 2
    y.backward()                     # (b) ii. Compute the gradients
    prev_x = x.detach().clone()      # snapshot outside the autograd graph
    optimizer.step()                 # (b) iii. Update the parameters using the computed gradients
    if torch.norm(prev_x - x) < eps: # stop once the update is negligibly small
        break
print('Soln x:', x)
Initial x: tensor([1.6230], requires_grad=True)
Soln x: tensor([4.0000], requires_grad=True)
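# Aside: for this one-variable objective, optimizer.step() with plain SGD reduces to
# the update x <- x - lr * dy/dx. A minimal hand-rolled sketch of the same loop,
# shown only to illustrate what the optimizer abstracts away:
x = torch.randn(1, requires_grad=True)
for _ in range(max_iter):
    y = (x - 4) ** 2
    y.backward()                         # dy/dx = 2 * (x - 4)
    with torch.no_grad():
        x -= learning_rate * x.grad      # the update optimizer.step() performs
    x.grad.zero_()                       # the reset optimizer.zero_grad() performs
print('Soln x:', x)                      # also converges towards 4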
import torch
import torch.optim as optim
# Load in the digits data set from scikit-learn
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelBinarizer
import numpy as np
X = load_digits().data
X = X / X.max()
y = LabelBinarizer().fit_transform(load_digits().target) # Convert each label to one-hot vectors
# Split the data into training and test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# Convert numpy arrays to torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))
# Multi-Layered Perceptron Architecture
# Input dim: X.shape[1]
# Hidden Layer dim: 200
# Output Layer dim: n_classes
input_dim = X.shape[1]
hidden_layer_size = 200
n_classes = y.shape[1]
# Initialize network parameters uniformly in [-1, 1); the extra first row in
# each matrix holds the bias term (see the column of ones prepended below)
W1 = torch.rand(input_dim + 1, hidden_layer_size) * 2 - 1
W2 = torch.rand(hidden_layer_size + 1, n_classes) * 2 - 1
# Push everything on to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
y_train = y_train.to(device)
W1 = W1.to(device)
W2 = W2.to(device)
W1.requires_grad = True
W2.requires_grad = True
const_ones = torch.ones(X_train.shape[0], 1).to(device)  # bias column for the forward pass
const_ones.requires_grad = False  # constant input, not a trainable parameter
# Training parameters
max_epochs = 10000
learning_rate = 1e-4
optimizer = optim.SGD([W1, W2], lr=learning_rate) # Initialize Optimizer
for n_epoch in range(max_epochs):
    optimizer.zero_grad()  # i. Explicitly set the gradients to zero
    # Forward pass: prepend the bias column, then apply the sigmoid 1/(1+exp(-t))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, X_train], dim=1), W1)))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, z], dim=1), W2)))
    z = torch.clamp(z, min=1e-6, max=1 - 1e-6)  # keep log() away from 0 and 1
    # Binary cross-entropy, summed over all outputs
    z = -torch.sum((y_train * torch.log(z)) + ((1 - y_train) * torch.log(1 - z)))
    with torch.no_grad():
        cost = z.clone()
    z.backward()      # ii. Compute the gradients
    optimizer.step()  # iii. Update the parameters using the computed gradients
    if n_epoch % 1000 == 0 or (n_epoch + 1) == max_epochs:
        print('epoch #' + str(n_epoch) + ':', cost.item())
epoch #0: 44920.55078125
epoch #1000: 399.1661376953125
epoch #2000: 269.6180419921875
epoch #3000: 206.77101135253906
epoch #4000: 167.23831176757812
epoch #5000: 139.4957275390625
epoch #6000: 118.78943634033203
epoch #7000: 102.71863555908203
epoch #8000: 89.90544128417969
epoch #9000: 79.48570251464844
epoch #9999: 70.889892578125
# Compute training and test accuracy
with torch.no_grad():
    # Training-set predictions
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, X_train], dim=1), W1)))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, z], dim=1), W2)))
    y_train_pred = torch.argmax(z, 1)
    y_train_true = torch.argmax(y_train, 1)
    # Test-set predictions (the bias column must match the test batch size)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([torch.ones(X_test.shape[0], 1).to(device), X_test], dim=1), W1)))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([torch.ones(X_test.shape[0], 1).to(device), z], dim=1), W2)))
    y_test_pred = torch.argmax(z, 1)
    y_test_true = torch.argmax(y_test, 1)
print('Training Accuracy =', (y_train_true == y_train_pred).sum().item() * 100 / y_train_true.shape[0], '%')
print('Test Accuracy =', (y_test_true == y_test_pred).sum().item() * 100 / y_test_true.shape[0], '%')
Training Accuracy = 99.92044550517105 %
Test Accuracy = 97.5925925925926 %
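# Note on the torch.cat([const_ones, ...]) pattern above: prepending a column of
# ones folds the bias into the first row of each weight matrix, which is why W1
# and W2 have input_dim+1 and hidden_layer_size+1 rows. A small sketch (with
# hypothetical shapes) checking the equivalence:
a = torch.randn(5, 3)
W = torch.randn(4, 2)  # row 0 plays the role of the bias
lhs = torch.matmul(torch.cat([torch.ones(5, 1), a], dim=1), W)
rhs = torch.matmul(a, W[1:]) + W[0]
print(torch.allclose(lhs, rhs))  # True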
import torch
import torch.optim as optim
import torch.nn as nn
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layer_size, n_classes):
        super(MLP, self).__init__()
        # nn.Parameter registers the tensors so .parameters() and .to(device) handle them
        self.W1 = nn.Parameter(torch.rand(input_dim + 1, hidden_layer_size) * 2 - 1)
        self.W2 = nn.Parameter(torch.rand(hidden_layer_size + 1, n_classes) * 2 - 1)

    def forward(self, X):
        # Bias column created on the same device as the input
        const_ones = torch.ones(X.shape[-2], 1, device=X.device)
        z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, X], dim=1), self.W1)))
        z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, z], dim=1), self.W2)))
        z = torch.clamp(z, min=1e-6, max=1 - 1e-6)  # keep outputs in (0, 1) for BCELoss
        return z
# Load in the digits data set from scikit-learn
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelBinarizer
import numpy as np
X = load_digits().data
X = X / X.max()
y = LabelBinarizer().fit_transform(load_digits().target) # Convert each label to one-hot vectors
# Split the data into training and test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# Convert numpy arrays to torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))
# Initialize network parameters
input_dim = X.shape[1]
hidden_layer_size = 200
n_classes = y.shape[1]
# Initialize network
mlp = MLP(input_dim=input_dim, hidden_layer_size=hidden_layer_size, n_classes=n_classes)
# Push everything on to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
y_train = y_train.to(device)
mlp.to(device)
# Training parameters
max_epochs = 10000
learning_rate = 1e-4
loss = nn.BCELoss(reduction='sum')
optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)
for n_epoch in range(max_epochs):
    optimizer.zero_grad()  # i. Explicitly set the gradients to zero
    z = mlp(X_train)       # forward pass
    z = loss(z, y_train)   # summed binary cross-entropy
    with torch.no_grad():
        cost = z.clone()
    z.backward()           # ii. Compute the gradients
    optimizer.step()       # iii. Update the parameters using the computed gradients
    if n_epoch % 1000 == 0 or (n_epoch + 1) == max_epochs:
        print('epoch #' + str(n_epoch) + ':', cost.item())
epoch #0: 32940.52734375
epoch #1000: 411.0254821777344
epoch #2000: 277.697265625
epoch #3000: 213.82894897460938
epoch #4000: 173.45681762695312
epoch #5000: 144.90814208984375
epoch #6000: 123.4655990600586
epoch #7000: 106.75452423095703
epoch #8000: 93.408447265625
epoch #9000: 82.55364990234375
epoch #9999: 73.60090637207031
# Compute training and test accuracy
with torch.no_grad():
    z = mlp(X_train)
    y_train_pred = torch.argmax(z, 1)
    y_train_true = torch.argmax(y_train, 1)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    z = mlp(X_test)
    y_test_pred = torch.argmax(z, 1)
    y_test_true = torch.argmax(y_test, 1)
print('Training Accuracy =', (y_train_true == y_train_pred).sum().item() * 100 / y_train_true.shape[0], '%')
print('Test Accuracy =', (y_test_true == y_test_pred).sum().item() * 100 / y_test_true.shape[0], '%')
Training Accuracy = 99.84089101034209 %
Test Accuracy = 97.4074074074074 %
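# Because W1 and W2 were wrapped in nn.Parameter, they are registered with the
# module: mlp.to(device) moves them and mlp.parameters() exposes them to the
# optimizer. A quick sanity check:
for name, p in mlp.named_parameters():
    print(name, tuple(p.shape), p.requires_grad)
# W1 (65, 200) True
# W2 (201, 10) True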
import torch
import torch.optim as optim
import torch.nn as nn
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layer_size, n_classes):
        super(MLP, self).__init__()
        # nn.Linear includes the bias term, so no column of ones is needed
        self.fc1 = nn.Linear(input_dim, hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, n_classes)

    def forward(self, X):
        z = torch.sigmoid(self.fc1(X))
        z = self.fc2(z)  # raw logits; BCEWithLogitsLoss applies the sigmoid internally
        return z
# Load in the digits data set from scikit-learn
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelBinarizer
import numpy as np
X = load_digits().data
X = X / X.max()
y = LabelBinarizer().fit_transform(load_digits().target) # Convert each label to one-hot vectors
# Split the data into training and test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# Convert numpy arrays to torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))
# Initialize network parameters
input_dim = X.shape[1]
hidden_layer_size = 200
n_classes = y.shape[1]
# Initialize network
mlp = MLP(input_dim=input_dim, hidden_layer_size=hidden_layer_size, n_classes=n_classes)
# Push everything on to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
y_train = y_train.to(device)
mlp.to(device)
# Training parameters
max_epochs = 10000
learning_rate = 1e-4
loss = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)
for n_epoch in range(max_epochs):
    optimizer.zero_grad()  # i. Explicitly set the gradients to zero
    z = mlp(X_train)       # forward pass (logits)
    z = loss(z, y_train)   # summed binary cross-entropy on logits
    with torch.no_grad():
        cost = z.clone()
    z.backward()           # ii. Compute the gradients
    optimizer.step()       # iii. Update the parameters using the computed gradients
    if n_epoch % 1000 == 0 or (n_epoch + 1) == max_epochs:
        print('epoch #' + str(n_epoch) + ':', cost.item())
epoch #0: 8958.8310546875
epoch #1000: 904.8087158203125
epoch #2000: 527.9207763671875
epoch #3000: 409.3244323730469
epoch #4000: 342.25213623046875
epoch #5000: 295.15966796875
epoch #6000: 258.5216979980469
epoch #7000: 228.5030059814453
epoch #8000: 203.29234313964844
epoch #9000: 181.77488708496094
epoch #9999: 163.18670654296875
# Compute training and test accuracy
with torch.no_grad():
    z = mlp(X_train)
    y_train_pred = torch.argmax(z, 1)
    y_train_true = torch.argmax(y_train, 1)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    z = mlp(X_test)
    y_test_pred = torch.argmax(z, 1)
    y_test_true = torch.argmax(y_test, 1)
print('Training Accuracy =', (y_train_true == y_train_pred).sum().item() * 100 / y_train_true.shape[0], '%')
print('Test Accuracy =', (y_test_true == y_test_pred).sum().item() * 100 / y_test_true.shape[0], '%')
Training Accuracy = 99.28400954653938 %
Test Accuracy = 97.5925925925926 %
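# The same model could be written more compactly with nn.Sequential -- an
# equivalent sketch, not the code that produced the results above. Note that
# fc2 returns raw logits on purpose: nn.BCEWithLogitsLoss fuses the sigmoid
# with the loss, which is numerically more stable than Sigmoid + nn.BCELoss.
mlp_seq = nn.Sequential(
    nn.Linear(input_dim, hidden_layer_size),
    nn.Sigmoid(),
    nn.Linear(hidden_layer_size, n_classes),  # logits
)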
import torch.nn.functional as F
a1 = torch.rand(4, 1, 28, 28)
print('a1.shape', a1.shape)
a2 = F.relu(nn.Conv2d(1, 6, 3)(a1))
print('a2.shape', a2.shape)
a3 = nn.MaxPool2d(2, 2)(a2)
print('a3.shape', a3.shape)
a4 = F.relu(nn.Conv2d(6, 16, 3)(a3))
print('a4.shape', a4.shape)
a5 = nn.MaxPool2d(2, 2)(a4)
print('a5.shape', a5.shape)
a6 = a5.view(-1, 16 * 5 * 5)
print('a6.shape', a6.shape)
a7 = nn.Linear(16 * 5 * 5, 128)(a6)
print('a7.shape', a7.shape)
a8 = nn.Linear(128, 10)(a7)
print('a8.shape', a8.shape)
a1.shape torch.Size([4, 1, 28, 28])
a2.shape torch.Size([4, 6, 26, 26])
a3.shape torch.Size([4, 6, 13, 13])
a4.shape torch.Size([4, 16, 11, 11])
a5.shape torch.Size([4, 16, 5, 5])
a6.shape torch.Size([4, 400])
a7.shape torch.Size([4, 128])
a8.shape torch.Size([4, 10])
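# The shapes above follow from the standard size formulas: a k x k convolution
# with stride 1 and no padding maps n -> n - k + 1, and 2 x 2 max pooling maps
# n -> n // 2. A small illustrative helper tracing the spatial size:
def trace_spatial_size(n, kernel=3):
    n = n - kernel + 1  # conv1: 28 -> 26
    n = n // 2          # pool:  26 -> 13
    n = n - kernel + 1  # conv2: 13 -> 11
    n = n // 2          # pool:  11 -> 5
    return n
print(trace_spatial_size(28))  # 5, hence the 16 * 5 * 5 flatten below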
import torchvision
import torchvision.transforms as transforms
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
print('Training data size:', trainset.data.shape)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=256, shuffle=True, num_workers=2)
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)       # 1x28x28 -> 6x26x26
        self.pool = nn.MaxPool2d(2, 2)        # halves each spatial dimension
        self.conv2 = nn.Conv2d(6, 16, 3)      # 6x13x13 -> 16x11x11
        self.fc1 = nn.Linear(16 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)            # flatten to (batch, 400)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)                       # raw logits for CrossEntropyLoss
        return x
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = Net()
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
max_epochs = 100
for epoch in range(max_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()  # zero the parameter gradients
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # accumulate statistics
        running_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print('[epoch %d] loss: %.3f' % (epoch + 1, running_loss))
print('Finished Training')
PATH = './mnist_net.pth'
torch.save(net.state_dict(), PATH)
Training data size: torch.Size([60000, 28, 28])
[epoch 10] loss: 55.086
[epoch 20] loss: 28.516
[epoch 30] loss: 20.077
[epoch 40] loss: 15.824
[epoch 50] loss: 13.531
[epoch 60] loss: 11.561
[epoch 70] loss: 10.326
[epoch 80] loss: 9.081
[epoch 90] loss: 8.197
[epoch 100] loss: 7.455
Finished Training
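# The printed value is the sum of per-batch losses accumulated over an epoch.
# To report an average batch loss instead, one could divide by the number of
# batches -- a minor variant of the print statement above:
# print('[epoch %d] loss: %.3f' % (epoch + 1, running_loss / len(trainloader)))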
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Re-declare the same architecture so the saved weights can be loaded into it
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.fc1 = nn.Linear(16 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
PATH = './mnist_net.pth'
net = Net()
net.load_state_dict(torch.load(PATH, map_location=device))  # map_location handles CPU-only machines
net.to(device)
net.eval()  # evaluation mode; a no-op for this architecture, but good practice before inference
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())
print('Test data size:', testset.data.shape)
testloader = torch.utils.data.DataLoader(testset, batch_size=256, shuffle=False, num_workers=2)
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = net(images)
        predicted = torch.argmax(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy =', 100 * correct / total)
Test data size: torch.Size([10000, 28, 28])
Accuracy = 98.53
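# A possible follow-up (sketch): per-class accuracy over the same test loader,
# using the network and device defined above.
class_correct = [0] * 10
class_total = [0] * 10
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        predicted = torch.argmax(net(images), 1)
        for label, pred in zip(labels, predicted):
            class_total[label.item()] += 1
            class_correct[label.item()] += int(label.item() == pred.item())
for d in range(10):
    print('digit %d accuracy: %.2f%%' % (d, 100 * class_correct[d] / class_total[d]))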
import torchvision
import torch.nn as nn
model = torchvision.models.resnet18(pretrained=True)  # Load the ResNet-18 model with pre-trained layer weights
# Freeze the existing layers so their weights are not updated during training
for param in model.parameters():
    param.requires_grad = False
# Replace the last fully-connected layer with a new 100-class head
model.fc = nn.Linear(512, 100)  # Parameters of newly constructed modules have requires_grad=True by default
import torch.optim as optim
# Optimize only the classifier
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)
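# With the backbone frozen, gradients flow only into model.fc, so a training
# step looks the same as usual. A sketch with a hypothetical random batch
# (real use would draw normalized 224 x 224 images from a 100-class dataset):
import torch
criterion = nn.CrossEntropyLoss()
inputs = torch.randn(8, 3, 224, 224)  # hypothetical batch of 8 RGB images
labels = torch.randint(0, 100, (8,))  # hypothetical labels for 100 classes
optimizer.zero_grad()
outputs = model(inputs)               # frozen backbone + trainable fc head
loss = criterion(outputs, labels)
loss.backward()                       # only model.fc accumulates gradients
optimizer.step()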