from .casia_webface import CasiaDataset
from .casia_webface import CasiaWebFaceDataset
from .data_folder import DataFolder
from .data_folder_generic import DataFolderGeneric
# transforms
from .utils import FaceCropper
#!/usr/bin/env python
# encoding: utf-8
""" Train a Generic Net
Usage:
%(prog)s <configuration>
[--model=<string>] [--batch-size=<int>] [--num-workers=<int>][--epochs=<int>] [--save-interval=<int>]
[--learning-rate=<float>][--do-crossvalidation][--seed=<int>]
[--output-dir=<path>] [--use-gpu] [--verbose ...]
Arguments:
<configuration> A configuration file, defining the dataset and the network
Options:
-h, --help Shows this help message and exits
--model=<string> Filename of the model to load (if any).
--batch-size=<int> Batch size [default: 64]
--num-workers=<int> Number subprocesses to use for data loading [default: 0]
--epochs=<int> Number of training epochs [default: 20]
--save-interval=<int> Interval between saving epochs [default: 5]
--learning-rate=<float> Learning rate [default: 0.01]
--do-crossvalidation Whether to perform cross validation [default: False]
-S, --seed=<int> The random seed [default: 3]
-o, --output-dir=<path> Dir to save stuff [default: training]
-g, --use-gpu Use the GPU
-v, --verbose Increase the verbosity (may appear multiple times).
Note that arguments provided directly by command-line will override the ones in the configuration file.
Example:
To run the training process
$ %(prog)s config.py
See '%(prog)s --help' for more information.
"""
import os, sys
import pkg_resources

import torch

from docopt import docopt

import bob.core
logger = bob.core.log.setup("bob.learn.pytorch")

import bob.io.base  # used below to create the output directory

from bob.extension.config import load
from bob.learn.pytorch.trainers import GenericTrainer
from bob.learn.pytorch.utils import get_parameter

version = pkg_resources.require('bob.learn.pytorch')[0].version
def main(user_input=None):

    # parse the command-line arguments
    if user_input is not None:
        arguments = user_input
    else:
        arguments = sys.argv[1:]

    prog = os.path.basename(sys.argv[0])
    completions = dict(prog=prog, version=version)
    args = docopt(__doc__ % completions, argv=arguments, version='Train a Generic Network (%s)' % version)

    # load the configuration file
    configuration = load([os.path.join(args['<configuration>'])])

    # get the pre-trained model file, if any
    model = args['--model']
    if hasattr(configuration, 'model'):
        model = configuration.model

    # get the various parameters, either from the config file or the command line
    batch_size = get_parameter(args, configuration, 'batch_size', 64)
    num_workers = get_parameter(args, configuration, 'num_workers', 0)
    epochs = get_parameter(args, configuration, 'epochs', 20)
    save_interval = get_parameter(args, configuration, 'save_interval', 5)
    learning_rate = get_parameter(args, configuration, 'learning_rate', 0.01)
    seed = get_parameter(args, configuration, 'seed', 3)
    output_dir = get_parameter(args, configuration, 'output_dir', 'training')
    use_gpu = get_parameter(args, configuration, 'use_gpu', False)
    verbosity_level = get_parameter(args, configuration, 'verbose', 0)
    do_crossvalidation = get_parameter(args, configuration, 'do_crossvalidation', False)

    bob.core.log.set_verbosity_level(logger, verbosity_level)
    bob.io.base.create_directories_safe(output_dir)

    # print the parameters
    logger.debug("Model file = {}".format(model))
    logger.debug("Batch size = {}".format(batch_size))
    logger.debug("Num workers = {}".format(num_workers))
    logger.debug("Epochs = {}".format(epochs))
    logger.debug("Save interval = {}".format(save_interval))
    logger.debug("Learning rate = {}".format(learning_rate))
    logger.debug("Seed = {}".format(seed))
    logger.debug("Output directory = {}".format(output_dir))
    logger.debug("Use GPU = {}".format(use_gpu))
    logger.debug("Perform cross-validation = {}".format(do_crossvalidation))

    # seed the random number generators
    torch.manual_seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)

    # figure out the device; once it is set, the --use-gpu flag is not needed anymore
    device = torch.device("cuda:0" if (torch.cuda.is_available() and use_gpu) else "cpu")
    if torch.cuda.is_available() and not use_gpu:
        logger.warning("You have a CUDA device, so you should probably run with --use-gpu")
    logger.debug("Device used for training = {}".format(device))

    # get the data
    if hasattr(configuration, 'dataset'):
        dataloader = {}
        dataloader['train'] = torch.utils.data.DataLoader(
            configuration.dataset['train'], batch_size=batch_size,
            num_workers=num_workers, shuffle=True)
        logger.info("There are {} training samples".format(len(configuration.dataset['train'])))
        if do_crossvalidation:
            dataloader['val'] = torch.utils.data.DataLoader(
                configuration.dataset['val'], batch_size=batch_size,
                num_workers=num_workers, shuffle=True)
            logger.info("There are {} validation samples".format(len(configuration.dataset['val'])))
    else:
        logger.error("Please provide a dataset in your configuration file!")
        sys.exit(1)

    assert hasattr(configuration, 'optimizer'), "Please provide an optimizer in your configuration file!"

    # train the network
    if hasattr(configuration, 'network'):
        trainer = GenericTrainer(
            configuration.network, configuration.optimizer, configuration.compute_loss,
            learning_rate=learning_rate, device=device, verbosity_level=verbosity_level,
            tf_logdir=os.path.join(output_dir, 'tf_logs'),
            do_crossvalidation=do_crossvalidation, save_interval=save_interval)
        trainer.train(dataloader, n_epochs=epochs, output_dir=output_dir, model=model)
    else:
        logger.error("Please provide a network in your configuration file!")
        sys.exit(1)
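For reference, here is a minimal configuration sketch providing everything the script reads: `dataset`, `network`, `optimizer` and `compute_loss`. The file name `config.py` and the model/data below are placeholders invented for illustration; only the attribute names are what `train_generic.py` expects.

# config.py -- a hypothetical, minimal configuration sketch
import torch
import torch.nn as nn

# 'dataset' is a dict with a 'train' entry, and a 'val' entry when --do-crossvalidation is used
dataset = {
    'train': torch.utils.data.TensorDataset(torch.randn(128, 3, 32, 32),
                                            torch.randint(0, 2, (128,))),
    'val': torch.utils.data.TensorDataset(torch.randn(32, 3, 32, 32),
                                          torch.randint(0, 2, (32,))),
}

# 'network' is any torch.nn.Module (placeholder model here)
network = nn.Linear(3 * 32 * 32, 2)

# 'optimizer' is initialized in the configuration file, as GenericTrainer requires
optimizer = torch.optim.Adam(network.parameters(), lr=0.01)

# 'compute_loss' is called by the trainer as compute_loss(network, img, labels, device)
criterion = nn.CrossEntropyLoss()

def compute_loss(network, img, labels, device):
    img, labels = img.to(device), labels.to(device)
    logits = network(img.view(img.size(0), -1))
    return criterion(logits, labels)

With such a file in place, `train_generic.py config.py -vv` should run end to end.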
#!/usr/bin/env python
# encoding: utf-8

import copy
import os
import time

import numpy as np
import torch

from .tflog import Logger

import bob.core
logger = bob.core.log.setup("bob.learn.pytorch")
class GenericTrainer(object):
    """
    Class to train a generic network; all the parameters are provided in the configuration file.

    Attributes
    ----------
    network: :py:class:`torch.nn.Module`
        The network to train
    optimizer: :py:class:`torch.optim.Optimizer`
        Optimizer object to be used. Initialized in the configuration file.
    compute_loss: callable
        Function computing the loss, called as ``compute_loss(network, img, labels, device)``
    device: str
        Device which will be used for training the model
    verbosity_level: int
        The level of verbosity output to stdout
    """

    def __init__(self, network, optimizer, compute_loss, learning_rate=0.0001,
                 device='cpu', verbosity_level=2, tf_logdir='tf_logs',
                 do_crossvalidation=False, save_interval=5):
        """Init function.

        Parameters
        ----------
        network: :py:class:`torch.nn.Module`
            The network to train
        optimizer: :py:class:`torch.optim.Optimizer`
            Optimizer object to be used. Initialized in the configuration file.
        compute_loss: callable
            Function computing the loss, called as ``compute_loss(network, img, labels, device)``
        learning_rate: float
            The learning rate
        device: str
            Device which will be used for training the model
        verbosity_level: int
            The level of verbosity output to stdout
        tf_logdir: str
            Directory for the Tensorboard logs
        do_crossvalidation: bool
            If set to `True`, performs validation in each epoch and stores the best model based on validation loss.
        save_interval: int
            Interval (in epochs) between model saves
        """
        self.network = network
        self.optimizer = optimizer
        self.compute_loss = compute_loss
        self.device = device
        self.learning_rate = learning_rate
        self.save_interval = save_interval
        self.do_crossvalidation = do_crossvalidation

        if self.do_crossvalidation:
            self.phases = ['train', 'val']
        else:
            self.phases = ['train']

        # move the network to the selected device
        self.network.to(self.device)

        bob.core.log.set_verbosity_level(logger, verbosity_level)
        self.tf_logger = Logger(tf_logdir)
    def load_model(self, model_filename):
        """Loads an existing model

        Parameters
        ----------
        model_filename: str
            The filename of the model to load

        Returns
        -------
        start_epoch: int
            The epoch to start with
        start_iteration: int
            The iteration to start with
        losses: list(float)
            The list of losses from previous training

        """
        cp = torch.load(model_filename)
        self.network.load_state_dict(cp['state_dict'])
        start_epoch = cp['epoch']
        start_iter = cp['iteration']
        losses = cp['loss']
        return start_epoch, start_iter, losses
    def save_model(self, output_dir, epoch=0, iteration=0, losses=None):
        """Save the trained network

        Parameters
        ----------
        output_dir: str
            The directory to write the models to
        epoch: int
            the current epoch
        iteration: int
            the current (last) iteration
        losses: list(float)
            The list of losses since the beginning of training

        """
        saved_filename = 'model_{}_{}.pth'.format(epoch, iteration)
        saved_path = os.path.join(output_dir, saved_filename)
        logger.info('Saving model to {}'.format(saved_path))

        cp = {'epoch': epoch,
              'iteration': iteration,
              'loss': losses,
              'state_dict': self.network.cpu().state_dict()}
        torch.save(cp, saved_path)

        # the network was moved to the CPU for saving; move it back to the training device
        self.network.to(self.device)
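    # (Note: the checkpoint written above -- and read back by load_model -- is a plain
    #  dict, so it can be inspected directly. A sketch, assuming a file saved under the
    #  hypothetical path 'training/model_20_0.pth', i.e. the model_{epoch}_{iteration}.pth
    #  naming scheme:
    #
    #      cp = torch.load('training/model_20_0.pth', map_location='cpu')
    #      sorted(cp.keys())                           # ['epoch', 'iteration', 'loss', 'state_dict']
    #      cp['epoch'], cp['iteration'], len(cp['loss'])  # resume point and loss history
    # )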
    def train(self, dataloader, n_epochs=25, output_dir='out', model=None):
        """Performs the training.

        Parameters
        ----------
        dataloader: dict of :py:class:`torch.utils.data.DataLoader`
            The dataloaders for your data, one entry per phase ('train', and 'val' when cross-validating)
        n_epochs: int
            The number of epochs you would like to train for
        output_dir: str
            The directory where you would like to save models
        model: str
            The path to a pretrained model file to start training from; this is the PAD model, not the LightCNN model

        """
        # if a model exists, load it
        if model is not None:
            start_epoch, start_iter, losses = self.load_model(model)
            logger.info('Starting training at epoch {}, iteration {} - last loss value is {}'.format(
                start_epoch, start_iter, losses[-1]))
        else:
            start_epoch = 0
            start_iter = 0
            losses = []
            logger.info('Starting training from scratch')

        # log the layers for which gradients are computed
        for name, param in self.network.named_parameters():
            if param.requires_grad:
                logger.info('Layer to be adapted from grad check: {}'.format(name))

        self.network.train(True)

        best_model_wts = copy.deepcopy(self.network.state_dict())
        best_loss = float("inf")

        # let's go
        for epoch in range(start_epoch, n_epochs):

            train_loss_history = []
            val_loss_history = []

            for phase in self.phases:

                if phase == 'train':
                    self.network.train()  # set the model to training mode
                else:
                    self.network.eval()   # set the model to evaluation mode

                for i, data in enumerate(dataloader[phase], 0):

                    if i >= start_iter:

                        start = time.time()

                        # get data from the dataset
                        img, labels = data

                        self.optimizer.zero_grad()

                        # gradients are only computed in the training phase
                        with torch.set_grad_enabled(phase == 'train'):

                            loss = self.compute_loss(self.network, img, labels, self.device)

                            if phase == 'train':
                                loss.backward()
                                self.optimizer.step()
                                train_loss_history.append(loss.item())
                            else:
                                val_loss_history.append(loss.item())

                        end = time.time()
                        logger.info("[{}/{}][{}/{}] => Loss = {} (time spent: {}), Phase {}".format(
                            epoch, n_epochs, i, len(dataloader[phase]), loss.item(), (end - start), phase))
                        losses.append(loss.item())

            # iterations are only skipped when resuming the very first epoch
            start_iter = 0

            epoch_train_loss = np.mean(train_loss_history)
            logger.info("Train loss: {}, epoch: {}".format(epoch_train_loss, epoch))

            if self.do_crossvalidation:

                epoch_val_loss = np.mean(val_loss_history)
                logger.info("Val loss: {}, epoch: {}".format(epoch_val_loss, epoch))

                # keep a copy of the weights yielding the lowest validation loss
                if epoch_val_loss < best_loss:
                    logger.debug("New val loss: {} is better than old: {}, copying over the new weights".format(
                        epoch_val_loss, best_loss))
                    best_loss = epoch_val_loss
                    best_model_wts = copy.deepcopy(self.network.state_dict())

            ######################################## <Logging> ########################################

            if self.do_crossvalidation:
                info = {'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
            else:
                info = {'train_loss': epoch_train_loss}

            # scalar logs
            for tag, value in info.items():
                self.tf_logger.scalar_summary(tag, value, epoch + 1)

            # log values and gradients of the parameters (histogram summary)
            for tag, value in self.network.named_parameters():
                tag = tag.replace('.', '/')
                try:
                    self.tf_logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
                    self.tf_logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)
                except AttributeError:
                    # parameters without gradients (e.g. frozen layers) are skipped
                    pass

            ######################################## </Logging> #######################################

            logger.info("EPOCH {} DONE".format(epoch + 1))

            # save the last model, and the ones in the specified interval
            if (epoch + 1) == n_epochs or epoch % self.save_interval == 0:
                self.save_model(output_dir, epoch=(epoch + 1), iteration=0, losses=losses)

        # load the best weights (lowest validation loss) and save them as 'epoch 0'
        self.network.load_state_dict(best_model_wts)
        self.save_model(output_dir, epoch=0, iteration=0, losses=losses)
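Used outside the `train_generic.py` script, the trainer only needs a network, an initialized optimizer, a loss callable and a dict of dataloaders. A minimal sketch; the network, loss and data below are placeholders, not part of the package:

import torch
import torch.nn as nn
from bob.learn.pytorch.trainers import GenericTrainer

network = nn.Linear(10, 2)  # placeholder model
optimizer = torch.optim.Adam(network.parameters(), lr=1e-4)

def compute_loss(network, img, labels, device):
    # signature expected by GenericTrainer: (network, img, labels, device) -> scalar loss
    return nn.functional.cross_entropy(network(img.to(device)), labels.to(device))

data = torch.utils.data.TensorDataset(torch.randn(64, 10), torch.randint(0, 2, (64,)))
dataloader = {'train': torch.utils.data.DataLoader(data, batch_size=16, shuffle=True)}

trainer = GenericTrainer(network, optimizer, compute_loss, device='cpu')
trainer.train(dataloader, n_epochs=2, output_dir='out')

With do_crossvalidation=True, the dict would also need a 'val' dataloader, matching the 'val' phase in the loop above.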
@@ -3,6 +3,7 @@ from .MCCNNTrainer import MCCNNTrainer
from .DCGANTrainer import DCGANTrainer
from .ConditionalGANTrainer import ConditionalGANTrainer
from .FASNetTrainer import FASNetTrainer
from .GenericTrainer import GenericTrainer
# gets sphinx autodoc done right - don't remove it
__all__ = [_ for _ in dir() if not _.startswith('_')]
@@ -10,6 +10,7 @@ build:
entry_points:
- train_cnn.py = bob.learn.pytorch.scripts.train_cnn:main
- train_mccnn.py = bob.learn.pytorch.scripts.train_mccnn:main
- train_generic.py = bob.learn.pytorch.scripts.train_generic:main
- train_fasnet.py = bob.learn.pytorch.scripts.train_fasnet:main
- train_dcgan.py = bob.learn.pytorch.scripts.train_dcgan:main
- train_conditionalgan.py = bob.learn.pytorch.scripts.train_conditionalgan:main
@@ -57,6 +58,7 @@ test:
commands:
- train_cnn.py --help
- train_mccnn.py --help
- train_generic.py --help
- train_fasnet.py --help
- train_dcgan.py --help
- train_conditionalgan.py --help
@@ -71,6 +71,7 @@ setup(
'console_scripts' : [
'train_cnn.py = bob.learn.pytorch.scripts.train_cnn:main',
'train_mccnn.py = bob.learn.pytorch.scripts.train_mccnn:main',
'train_generic.py = bob.learn.pytorch.scripts.train_generic:main',
'train_fasnet.py = bob.learn.pytorch.scripts.train_fasnet:main',
'train_dcgan.py = bob.learn.pytorch.scripts.train_dcgan:main',
'train_conditionalgan.py = bob.learn.pytorch.scripts.train_conditionalgan:main',