Machine Learning Singapore
23 Feb 2023
Vivek Kalyan
hello@vivekkalyan.com
@vivekkalyansk
AI Research @ Handshakes
.zero_grad()
.train() / .eval()
.no_grad()
.to(device)
Model
Data
Loss + Optimizer
Engineering Code
Train/Val/Test Loops
Everything else ...
import os

import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST
from pytorch_lightning import LightningModule, Trainer


class Net(LightningModule):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

    def train_dataloader(self):
        mnist_train = MNIST(os.getcwd(), train=True,
                            download=True, transform=transforms.ToTensor())
        return DataLoader(mnist_train, batch_size=64)

    def configure_optimizers(self):
        optimizer = Adam(self.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, batch, batch_idx):
        data, target = batch
        output = self.forward(data)
        loss = F.nll_loss(output, target)
        return {"loss": loss}


if __name__ == "__main__":
    net = Net()
    trainer = Trainer(accelerator="gpu", max_epochs=10)
    trainer.fit(net)
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


mnist_train = MNIST(os.getcwd(), train=True, download=True,
                    transform=transforms.ToTensor())
train_loader = DataLoader(mnist_train, batch_size=64)

net = Net().to(device)
optimizer = Adam(net.parameters(), lr=1e-3)

for epoch in range(1, 11):
    net.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = net(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 50 == 0:
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100 * batch_idx / len(train_loader), loss.item()))
PyTorch
PyTorch Lightning
self.log
class Net(LightningModule):
    def __init__(self):
        super().__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        # ... forward

    def train_dataloader(self):
        # ... train_dataloader

    def configure_optimizers(self):
        # ... configure_optimizers

    def training_step(self, batch, batch_idx):
        data, target = batch
        output = self.forward(data)
        loss = F.nll_loss(output, target)
        self.log("loss", loss)
        return {"loss": loss}


if __name__ == "__main__":
    net = Net()
    logger = TensorBoardLogger(save_dir=".")
    trainer = Trainer(accelerator="gpu", max_epochs=10, logger=logger)
    trainer.fit(net)
$ tensorboard --logdir lightning_logs --port 8888
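self.log also takes flags that control when and where a metric is recorded; a small sketch of commonly used options:

# log per-step and per-epoch, and show the value in the progress bar
self.log("loss", loss, on_step=True, on_epoch=True, prog_bar=True)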
class Net(LightningModule):
    def __init__(self):
        # ... init

    def forward(self, x):
        # ... forward

    def train_dataloader(self):
        # ... train_dataloader

    def val_dataloader(self):
        mnist_val = MNIST(os.getcwd(), train=False, download=True,
                          transform=transforms.ToTensor())
        return DataLoader(mnist_val, batch_size=self.batch_size)

    def configure_optimizers(self):
        # ... configure_optimizers

    def training_step(self, batch, batch_idx):
        # ... training_step

    def validation_step(self, batch, batch_idx):
        data, target = batch
        output = self.forward(data)
        loss = F.nll_loss(output, target)
        pred = output.argmax(dim=1, keepdim=True)
        correct = pred.squeeze(1).eq(target).sum().item()
        self.log("val/loss", loss)
        return {"loss": loss, "correct": correct, "total": len(target)}

    def validation_epoch_end(self, outs):
        num_correct = sum(map(lambda x: x["correct"], outs), 0)
        num_total = sum(map(lambda x: x["total"], outs), 0)
        self.log("val/accuracy", num_correct / num_total)


if __name__ == "__main__":
    # ...
class Net(LightningModule):
    def __init__(self):
        # ... init

    def forward(self, x):
        # ... forward

    def train_dataloader(self):
        # ... train_dataloader

    def val_dataloader(self):
        # ... val_dataloader

    def configure_optimizers(self):
        # ... configure_optimizers

    def on_train_start(self):
        self.logger.log_hyperparams(self.hparams, {"val/accuracy": 0})

    def training_step(self, batch, batch_idx):
        # ... training_step

    def validation_step(self, batch, batch_idx):
        # ... validation_step

    def validation_epoch_end(self, outs):
        num_correct = sum(map(lambda x: x["correct"], outs), 0)
        num_total = sum(map(lambda x: x["total"], outs), 0)
        self.log("val/accuracy", num_correct / num_total)


if __name__ == "__main__":
    # ...
Custom Checkpointing
class Net(LightningModule):
    # ... net


if __name__ == "__main__":
    net = Net()
    logger = TensorBoardLogger(save_dir=".")
    checkpoint_callback = ModelCheckpoint(monitor="val/accuracy", mode="max", verbose=True)
    trainer = Trainer(callbacks=[checkpoint_callback], accelerator="gpu", max_epochs=10, logger=logger)
    trainer.fit(net)
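After training, the best-scoring weights can be restored from the callback; a short sketch using ModelCheckpoint's best_model_path:

# reload the checkpoint that scored highest on val/accuracy
best_net = Net.load_from_checkpoint(checkpoint_callback.best_model_path)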
Early Stopping
class Net(LightningModule):
    # ... net


if __name__ == "__main__":
    net = Net()
    logger = TensorBoardLogger(save_dir=".")
    checkpoint_callback = ModelCheckpoint(monitor="val/accuracy", mode="max", verbose=True)
    early_stopping_callback = EarlyStopping(monitor="val/accuracy", mode="max", patience=2)
    trainer = Trainer(callbacks=[early_stopping_callback, checkpoint_callback],
                      accelerator="gpu", max_epochs=10, logger=logger)
    trainer.fit(net)
class Net(LightningModule):
    def __init__(self, batch_size, hidden_size, learning_rate, **kwargs):
        # ... init

    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = parent_parser.add_argument_group("Net")
        parser.add_argument("--batch_size", type=int, default=64)
        parser.add_argument("--hidden_size", type=int, default=128)
        parser.add_argument("--learning_rate", type=float, default=1e-3)
        return parent_parser

    def forward(self, x):
        # ... forward


if __name__ == "__main__":
    # ...
    parser = ArgumentParser()
    parser = Net.add_model_specific_args(parser)
    parser = Trainer.add_argparse_args(parser)
    args = parser.parse_args()
    net = Net(**vars(args))
    trainer = Trainer.from_argparse_args(args)
    trainer.fit(net)
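With the parser wired up, model hyperparameters and Trainer options can be overridden from the command line; a hypothetical invocation (flag names follow the argparse code above):

$ python train.py --batch_size 128 --hidden_size 256 --learning_rate 3e-4 --accelerator gpu --max_epochs 10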
$ python train.py --fast_dev_run
$ python train.py --overfit_batches 0.01
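These debugging switches are ordinary Trainer arguments, so they can also be set in code; a minimal sketch:

trainer = Trainer(fast_dev_run=True)       # run a single train/val batch as a smoke test
trainer = Trainer(overfit_batches=0.01)    # overfit 1% of the data to sanity-check the model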
class Net(LightningModule):
    # ... net


if __name__ == "__main__":
    net = Net()
    trainer = Trainer(accelerator="gpu", max_epochs=10)
    # Run learning rate finder
    lr_finder = trainer.tuner.lr_find(net)
    # Plot with
    fig = lr_finder.plot(suggest=True)
    fig.show()
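The suggested learning rate can then be applied before fitting; a sketch assuming the model exposes a learning_rate hyperparameter:

    # pick the suggested learning rate and train with it
    new_lr = lr_finder.suggestion()
    net.hparams.learning_rate = new_lr
    trainer.fit(net)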
Emergent Abilities of Large Language Models (Wei et al., 2022)
$ python train.py --precision 16
$ python train.py --accumulate_grad_batches 4
$ python train.py --gradient_clip_val 1
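The same flags map directly to Trainer arguments; equivalently, in code:

trainer = Trainer(precision=16,              # mixed-precision training
                  accumulate_grad_batches=4,  # accumulate gradients over 4 batches
                  gradient_clip_val=1)        # clip gradient norm at 1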
class Net(LightningModule):
    # ... net


if __name__ == "__main__":
    net = Net()
    logger = TensorBoardLogger(save_dir=".")
    swa_callback = StochasticWeightAveraging(swa_lrs=1e-2)
    trainer = Trainer(callbacks=[swa_callback],
                      accelerator="gpu", max_epochs=10, logger=logger)
    trainer.fit(net)
pl.LightningModule -> nn.Module
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


model = Net()
checkpoint = torch.load("path/to/lightning/checkpoint.ckpt")
model.load_state_dict(checkpoint["state_dict"])
model.eval()
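From there, inference is plain PyTorch; a minimal sketch assuming a batch x of MNIST images with shape (N, 1, 28, 28):

with torch.no_grad():
    logits = model(x)             # log-probabilities, shape (N, 10)
    preds = logits.argmax(dim=1)  # predicted digit per image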