# Gradient Descent using torch.optim
import torch
import torch.optim as optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
x = torch.randn(1, requires_grad=True, device=device)
print('Initial x:', x)
max_iter = 100
learning_rate = 0.1
eps = 1e-6
# (a) Initialize Optimizer
optimizer = optim.SGD([x], lr=learning_rate)
for _ in range(max_iter):
    optimizer.zero_grad()            # (b) i. Explicitly set the gradients to zero
    y = (x - 4) ** 2
    y.backward()                     # (b) ii. Compute the gradients
    prev_x = x.detach().clone()      # snapshot outside the autograd graph
    optimizer.step()                 # (b) iii. Update the parameters using the computed gradients
    if torch.norm(prev_x - x) < eps: # stop once the update is negligibly small
        break
print('Soln x:', x)
Initial x: tensor([1.6230], requires_grad=True)
Soln x: tensor([4.0000], requires_grad=True)
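# Aside: for this one-variable objective, optimizer.step() with plain SGD reduces to
# the update x <- x - lr * dy/dx. A minimal hand-rolled sketch of the same loop,
# shown only to illustrate what the optimizer abstracts away:
x = torch.randn(1, requires_grad=True)
for _ in range(max_iter):
    y = (x - 4) ** 2
    y.backward()                         # dy/dx = 2 * (x - 4)
    with torch.no_grad():
        x -= learning_rate * x.grad      # the update optimizer.step() performs
    x.grad.zero_()                       # the reset optimizer.zero_grad() performs
print('Soln x:', x)                      # also converges towards 4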
import torch
import torch.optim as optim
# Load in the digits data set from scikit-learn
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelBinarizer
import numpy as np
X = load_digits().data
X = X / X.max()
y = LabelBinarizer().fit_transform(load_digits().target) # Convert each label to one-hot vectors
# Split the data into training and test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# Convert numpy arrays to torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))
# Multi-Layered Perceptron Architecture
# Input dim: X.shape[1]
# Hidden Layer dim: 200
# Output Layer dim: n_classes
input_dim = X.shape[1]
hidden_layer_size = 200
n_classes = y.shape[1]
# Initialize network parameters uniformly in [-1, 1); the extra first row in
# each matrix holds the bias term (see the column of ones prepended below)
W1 = torch.rand(input_dim + 1, hidden_layer_size) * 2 - 1
W2 = torch.rand(hidden_layer_size + 1, n_classes) * 2 - 1
# Push everything on to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
y_train = y_train.to(device)
W1 = W1.to(device)
W2 = W2.to(device)
W1.requires_grad = True
W2.requires_grad = True
const_ones = torch.ones(X_train.shape[0], 1).to(device)  # bias column for the forward pass
const_ones.requires_grad = False  # constant input, not a trainable parameter
# Training parameters
max_epochs = 10000
learning_rate = 1e-4
optimizer = optim.SGD([W1, W2], lr=learning_rate) # Initialize Optimizer
for n_epoch in range(max_epochs):
    optimizer.zero_grad()  # i. Explicitly set the gradients to zero
    # Forward pass: prepend the bias column, then apply the sigmoid 1/(1+exp(-t))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, X_train], dim=1), W1)))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, z], dim=1), W2)))
    z = torch.clamp(z, min=1e-6, max=1 - 1e-6)  # keep log() away from 0 and 1
    # Binary cross-entropy, summed over all outputs
    z = -torch.sum((y_train * torch.log(z)) + ((1 - y_train) * torch.log(1 - z)))
    with torch.no_grad():
        cost = z.clone()
    z.backward()      # ii. Compute the gradients
    optimizer.step()  # iii. Update the parameters using the computed gradients
    if n_epoch % 1000 == 0 or (n_epoch + 1) == max_epochs:
        print('epoch #' + str(n_epoch) + ':', cost.item())
epoch #0: 44920.55078125
epoch #1000: 399.1661376953125
epoch #2000: 269.6180419921875
epoch #3000: 206.77101135253906
epoch #4000: 167.23831176757812
epoch #5000: 139.4957275390625
epoch #6000: 118.78943634033203
epoch #7000: 102.71863555908203
epoch #8000: 89.90544128417969
epoch #9000: 79.48570251464844
epoch #9999: 70.889892578125
# Compute training and test accuracy
with torch.no_grad():
    # Training-set predictions
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, X_train], dim=1), W1)))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, z], dim=1), W2)))
    y_train_pred = torch.argmax(z, 1)
    y_train_true = torch.argmax(y_train, 1)
    # Test-set predictions (the bias column must match the test batch size)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([torch.ones(X_test.shape[0], 1).to(device), X_test], dim=1), W1)))
    z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([torch.ones(X_test.shape[0], 1).to(device), z], dim=1), W2)))
    y_test_pred = torch.argmax(z, 1)
    y_test_true = torch.argmax(y_test, 1)
print('Training Accuracy =', (y_train_true == y_train_pred).sum().item() * 100 / y_train_true.shape[0], '%')
print('Test Accuracy =', (y_test_true == y_test_pred).sum().item() * 100 / y_test_true.shape[0], '%')
Training Accuracy = 99.92044550517105 %
Test Accuracy = 97.5925925925926 %
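# Note on the torch.cat([const_ones, ...]) pattern above: prepending a column of
# ones folds the bias into the first row of each weight matrix, which is why W1
# and W2 have input_dim+1 and hidden_layer_size+1 rows. A small sketch (with
# hypothetical shapes) checking the equivalence:
a = torch.randn(5, 3)
W = torch.randn(4, 2)  # row 0 plays the role of the bias
lhs = torch.matmul(torch.cat([torch.ones(5, 1), a], dim=1), W)
rhs = torch.matmul(a, W[1:]) + W[0]
print(torch.allclose(lhs, rhs))  # True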
import torch
import torch.optim as optim
import torch.nn as nn
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layer_size, n_classes):
        super(MLP, self).__init__()
        # nn.Parameter registers the tensors so .parameters() and .to(device) handle them
        self.W1 = nn.Parameter(torch.rand(input_dim + 1, hidden_layer_size) * 2 - 1)
        self.W2 = nn.Parameter(torch.rand(hidden_layer_size + 1, n_classes) * 2 - 1)

    def forward(self, X):
        # Bias column created on the same device as the input
        const_ones = torch.ones(X.shape[-2], 1, device=X.device)
        z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, X], dim=1), self.W1)))
        z = 1 / (1 + torch.exp(-torch.matmul(torch.cat([const_ones, z], dim=1), self.W2)))
        z = torch.clamp(z, min=1e-6, max=1 - 1e-6)  # keep outputs in (0, 1) for BCELoss
        return z
# Load in the digits data set from scikit-learn
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelBinarizer
import numpy as np
X = load_digits().data
X = X / X.max()
y = LabelBinarizer().fit_transform(load_digits().target) # Convert each label to one-hot vectors
# Split the data into training and test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# Convert numpy arrays to torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))
# Initialize network parameters
input_dim = X.shape[1]
hidden_layer_size = 200
n_classes = y.shape[1]
# Initialize network
mlp = MLP(input_dim=input_dim, hidden_layer_size=hidden_layer_size, n_classes=n_classes)
# Push everything on to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
y_train = y_train.to(device)
mlp.to(device)
# Training parameters
max_epochs = 10000
learning_rate = 1e-4
loss = nn.BCELoss(reduction='sum')
optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)
for n_epoch in range(max_epochs):
    optimizer.zero_grad()  # i. Explicitly set the gradients to zero
    z = mlp(X_train)       # forward pass
    z = loss(z, y_train)   # summed binary cross-entropy
    with torch.no_grad():
        cost = z.clone()
    z.backward()           # ii. Compute the gradients
    optimizer.step()       # iii. Update the parameters using the computed gradients
    if n_epoch % 1000 == 0 or (n_epoch + 1) == max_epochs:
        print('epoch #' + str(n_epoch) + ':', cost.item())
epoch #0: 32940.52734375
epoch #1000: 411.0254821777344
epoch #2000: 277.697265625
epoch #3000: 213.82894897460938
epoch #4000: 173.45681762695312
epoch #5000: 144.90814208984375
epoch #6000: 123.4655990600586
epoch #7000: 106.75452423095703
epoch #8000: 93.408447265625
epoch #9000: 82.55364990234375
epoch #9999: 73.60090637207031
# Compute training and test accuracy
with torch.no_grad():
    z = mlp(X_train)
    y_train_pred = torch.argmax(z, 1)
    y_train_true = torch.argmax(y_train, 1)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    z = mlp(X_test)
    y_test_pred = torch.argmax(z, 1)
    y_test_true = torch.argmax(y_test, 1)
print('Training Accuracy =', (y_train_true == y_train_pred).sum().item() * 100 / y_train_true.shape[0], '%')
print('Test Accuracy =', (y_test_true == y_test_pred).sum().item() * 100 / y_test_true.shape[0], '%')
Training Accuracy = 99.84089101034209 %
Test Accuracy = 97.4074074074074 %
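# Because W1 and W2 were wrapped in nn.Parameter, they are registered with the
# module: mlp.to(device) moves them and mlp.parameters() exposes them to the
# optimizer. A quick sanity check:
for name, p in mlp.named_parameters():
    print(name, tuple(p.shape), p.requires_grad)
# W1 (65, 200) True
# W2 (201, 10) True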
import torch
import torch.optim as optim
import torch.nn as nn
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_layer_size, n_classes):
        super(MLP, self).__init__()
        # nn.Linear includes the bias term, so no column of ones is needed
        self.fc1 = nn.Linear(input_dim, hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, n_classes)

    def forward(self, X):
        z = torch.sigmoid(self.fc1(X))
        z = self.fc2(z)  # raw logits; BCEWithLogitsLoss applies the sigmoid internally
        return z
# Load in the digits data set from scikit-learn
from sklearn.datasets import load_digits
from sklearn.preprocessing import LabelBinarizer
import numpy as np
X = load_digits().data
X = X / X.max()
y = LabelBinarizer().fit_transform(load_digits().target) # Convert each label to one-hot vectors
# Split the data into training and test data sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
# Convert numpy arrays to torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.astype(np.float32))
y_test = torch.from_numpy(y_test.astype(np.float32))
# Initialize network parameters
input_dim = X.shape[1]
hidden_layer_size = 200
n_classes = y.shape[1]
# Initialize network
mlp = MLP(input_dim=input_dim, hidden_layer_size=hidden_layer_size, n_classes=n_classes)
# Push everything on to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
X_train = X_train.to(device)
y_train = y_train.to(device)
mlp.to(device)
# Training parameters
max_epochs = 10000
learning_rate = 1e-4
loss = nn.BCEWithLogitsLoss(reduction='sum')
optimizer = optim.SGD(mlp.parameters(), lr=learning_rate)
for n_epoch in range(max_epochs):
    optimizer.zero_grad()  # i. Explicitly set the gradients to zero
    z = mlp(X_train)       # forward pass (logits)
    z = loss(z, y_train)   # summed binary cross-entropy on logits
    with torch.no_grad():
        cost = z.clone()
    z.backward()           # ii. Compute the gradients
    optimizer.step()       # iii. Update the parameters using the computed gradients
    if n_epoch % 1000 == 0 or (n_epoch + 1) == max_epochs:
        print('epoch #' + str(n_epoch) + ':', cost.item())
epoch #0: 8958.8310546875
epoch #1000: 904.8087158203125
epoch #2000: 527.9207763671875
epoch #3000: 409.3244323730469
epoch #4000: 342.25213623046875
epoch #5000: 295.15966796875
epoch #6000: 258.5216979980469
epoch #7000: 228.5030059814453
epoch #8000: 203.29234313964844
epoch #9000: 181.77488708496094
epoch #9999: 163.18670654296875
# Compute training and test accuracy
with torch.no_grad():
    z = mlp(X_train)
    y_train_pred = torch.argmax(z, 1)
    y_train_true = torch.argmax(y_train, 1)
    X_test = X_test.to(device)
    y_test = y_test.to(device)
    z = mlp(X_test)
    y_test_pred = torch.argmax(z, 1)
    y_test_true = torch.argmax(y_test, 1)
print('Training Accuracy =', (y_train_true == y_train_pred).sum().item() * 100 / y_train_true.shape[0], '%')
print('Test Accuracy =', (y_test_true == y_test_pred).sum().item() * 100 / y_test_true.shape[0], '%')
Training Accuracy = 99.28400954653938 %
Test Accuracy = 97.5925925925926 %
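# The same model could be written more compactly with nn.Sequential -- an
# equivalent sketch, not the code that produced the results above. Note that
# fc2 returns raw logits on purpose: nn.BCEWithLogitsLoss fuses the sigmoid
# with the loss, which is numerically more stable than Sigmoid + nn.BCELoss.
mlp_seq = nn.Sequential(
    nn.Linear(input_dim, hidden_layer_size),
    nn.Sigmoid(),
    nn.Linear(hidden_layer_size, n_classes),  # logits
)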
import torch.nn.functional as F
a1 = torch.rand(4, 1, 28, 28)
print('a1.shape', a1.shape)
a2 = F.relu(nn.Conv2d(1, 6, 3)(a1))
print('a2.shape', a2.shape)
a3 = nn.MaxPool2d(2, 2)(a2)
print('a3.shape', a3.shape)
a4 = F.relu(nn.Conv2d(6, 16, 3)(a3))
print('a4.shape', a4.shape)
a5 = nn.MaxPool2d(2, 2)(a4)
print('a5.shape', a5.shape)
a6 = a5.view(-1, 16 * 5 * 5)
print('a6.shape', a6.shape)
a7 = nn.Linear(16 * 5 * 5, 128)(a6)
print('a7.shape', a7.shape)
a8 = nn.Linear(128, 10)(a7)
print('a8.shape', a8.shape)
a1.shape torch.Size([4, 1, 28, 28])
a2.shape torch.Size([4, 6, 26, 26])
a3.shape torch.Size([4, 6, 13, 13])
a4.shape torch.Size([4, 16, 11, 11])
a5.shape torch.Size([4, 16, 5, 5])
a6.shape torch.Size([4, 400])
a7.shape torch.Size([4, 128])
a8.shape torch.Size([4, 10])
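# The shapes above follow from the standard size formulas: a k x k convolution
# with stride 1 and no padding maps n -> n - k + 1, and 2 x 2 max pooling maps
# n -> n // 2. A small illustrative helper tracing the spatial size:
def trace_spatial_size(n, kernel=3):
    n = n - kernel + 1  # conv1: 28 -> 26
    n = n // 2          # pool:  26 -> 13
    n = n - kernel + 1  # conv2: 13 -> 11
    n = n // 2          # pool:  11 -> 5
    return n
print(trace_spatial_size(28))  # 5, hence the 16 * 5 * 5 flatten below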
import torchvision
import torchvision.transforms as transforms
trainset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
print('Training data size:', trainset.data.shape)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=256, shuffle=True, num_workers=2)
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)       # 1x28x28 -> 6x26x26
        self.pool = nn.MaxPool2d(2, 2)        # halves each spatial dimension
        self.conv2 = nn.Conv2d(6, 16, 3)      # 6x13x13 -> 16x11x11
        self.fc1 = nn.Linear(16 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)            # flatten to (batch, 400)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)                       # raw logits for CrossEntropyLoss
        return x
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net = Net()
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
max_epochs = 100
for epoch in range(max_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()  # zero the parameter gradients
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # accumulate statistics
        running_loss += loss.item()
    if (epoch + 1) % 10 == 0:
        print('[epoch %d] loss: %.3f' % (epoch + 1, running_loss))
print('Finished Training')
PATH = './mnist_net.pth'
torch.save(net.state_dict(), PATH)
Training data size: torch.Size([60000, 28, 28])
[epoch 10] loss: 55.086
[epoch 20] loss: 28.516
[epoch 30] loss: 20.077
[epoch 40] loss: 15.824
[epoch 50] loss: 13.531
[epoch 60] loss: 11.561
[epoch 70] loss: 10.326
[epoch 80] loss: 9.081
[epoch 90] loss: 8.197
[epoch 100] loss: 7.455
Finished Training
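# The printed value is the sum of per-batch losses accumulated over an epoch.
# To report an average batch loss instead, one could divide by the number of
# batches -- a minor variant of the print statement above:
# print('[epoch %d] loss: %.3f' % (epoch + 1, running_loss / len(trainloader)))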
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Re-declare the same architecture so the saved weights can be loaded into it
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.fc1 = nn.Linear(16 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
PATH = './mnist_net.pth'
net = Net()
net.load_state_dict(torch.load(PATH, map_location=device))  # map_location handles CPU-only machines
net.to(device)
net.eval()  # evaluation mode; a no-op for this architecture, but good practice before inference
testset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())
print('Test data size:', testset.data.shape)
testloader = torch.utils.data.DataLoader(testset, batch_size=256, shuffle=False, num_workers=2)
correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        outputs = net(images)
        predicted = torch.argmax(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print('Accuracy =', 100 * correct / total)
Test data size: torch.Size([10000, 28, 28])
Accuracy = 98.53
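# A possible follow-up (sketch): per-class accuracy over the same test loader,
# using the network and device defined above.
class_correct = [0] * 10
class_total = [0] * 10
with torch.no_grad():
    for data in testloader:
        images, labels = data[0].to(device), data[1].to(device)
        predicted = torch.argmax(net(images), 1)
        for label, pred in zip(labels, predicted):
            class_total[label.item()] += 1
            class_correct[label.item()] += int(label.item() == pred.item())
for d in range(10):
    print('digit %d accuracy: %.2f%%' % (d, 100 * class_correct[d] / class_total[d]))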
import torchvision
import torch.nn as nn
model = torchvision.models.resnet18(pretrained=True)  # Load the ResNet-18 model with pre-trained layer weights
# Freeze the existing layers so their weights are not updated during training
for param in model.parameters():
    param.requires_grad = False
# Replace the last fully-connected layer with a new 100-class head
model.fc = nn.Linear(512, 100)  # Parameters of newly constructed modules have requires_grad=True by default
import torch.optim as optim
# Optimize only the classifier
optimizer = optim.SGD(model.fc.parameters(), lr=1e-2, momentum=0.9)
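# With the backbone frozen, gradients flow only into model.fc, so a training
# step looks the same as usual. A sketch with a hypothetical random batch
# (real use would draw normalized 224 x 224 images from a 100-class dataset):
import torch
criterion = nn.CrossEntropyLoss()
inputs = torch.randn(8, 3, 224, 224)  # hypothetical batch of 8 RGB images
labels = torch.randint(0, 100, (8,))  # hypothetical labels for 100 classes
optimizer.zero_grad()
outputs = model(inputs)               # frozen backbone + trainable fc head
loss = criterion(outputs, labels)
loss.backward()                       # only model.fc accumulates gradients
optimizer.step()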