Commit ca371ff4 authored by Hatef OTROSHI

[ADD] package

parent 622c9036
Showing 5544 additions and 0 deletions
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
# see https://docs.python.org/3/library/pkgutil.html
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)
Part of the scripts are taken from the FaceXZoo repository, which is licensed under Apache 2.0.
For more details, please check the [LICENSE](LICENSE.txt) file.
"""
@author: Jun Wang
@date: 20201019
@contact: jun21wangustc@gmail.com
"""
# based on:
# https://github.com/tengshaofeng/ResidualAttentionNetwork-pytorch/tree/master/Residual-Attention-Network/model
import torch
import torch.nn as nn
from torch.nn import init
import functools
from torch.autograd import Variable
import numpy as np
class Flatten(nn.Module):
def forward(self, x):
return x.reshape(x.size(0), -1)
class ResidualBlock(nn.Module):
def __init__(self, input_channels, output_channels, stride=1):
super(ResidualBlock, self).__init__()
self.input_channels = input_channels
self.output_channels = output_channels
self.stride = stride
self.bn1 = nn.BatchNorm2d(input_channels)
self.relu = nn.ReLU(inplace=True)
self.conv1 = nn.Conv2d(input_channels, output_channels//4, 1, 1, bias = False)
self.bn2 = nn.BatchNorm2d(output_channels//4)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(output_channels//4, output_channels//4, 3, stride, padding = 1, bias = False)
self.bn3 = nn.BatchNorm2d(output_channels//4)
self.relu = nn.ReLU(inplace=True)
self.conv3 = nn.Conv2d(output_channels//4, output_channels, 1, 1, bias = False)
self.conv4 = nn.Conv2d(input_channels, output_channels , 1, stride, bias = False)
def forward(self, x):
residual = x
out = self.bn1(x)
out1 = self.relu(out)
out = self.conv1(out1)
out = self.bn2(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn3(out)
out = self.relu(out)
out = self.conv3(out)
if (self.input_channels != self.output_channels) or (self.stride !=1 ):
residual = self.conv4(out1)
out += residual
return out
class AttentionModule_stage1(nn.Module):
# input size is 56*56
def __init__(self, in_channels, out_channels, size1=(56, 56), size2=(28, 28), size3=(14, 14)):
super(AttentionModule_stage1, self).__init__()
self.first_residual_blocks = ResidualBlock(in_channels, out_channels)
self.trunk_branches = nn.Sequential(
ResidualBlock(in_channels, out_channels),
ResidualBlock(in_channels, out_channels)
)
self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.softmax1_blocks = ResidualBlock(in_channels, out_channels)
self.skip1_connection_residual_block = ResidualBlock(in_channels, out_channels)
self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.softmax2_blocks = ResidualBlock(in_channels, out_channels)
self.skip2_connection_residual_block = ResidualBlock(in_channels, out_channels)
self.mpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.softmax3_blocks = nn.Sequential(
ResidualBlock(in_channels, out_channels),
ResidualBlock(in_channels, out_channels)
)
self.interpolation3 = nn.UpsamplingBilinear2d(size=size3)
self.softmax4_blocks = ResidualBlock(in_channels, out_channels)
self.interpolation2 = nn.UpsamplingBilinear2d(size=size2)
self.softmax5_blocks = ResidualBlock(in_channels, out_channels)
self.interpolation1 = nn.UpsamplingBilinear2d(size=size1)
self.softmax6_blocks = nn.Sequential(
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels , kernel_size = 1, stride = 1, bias = False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels , kernel_size = 1, stride = 1, bias = False),
nn.Sigmoid()
)
self.last_blocks = ResidualBlock(in_channels, out_channels)
def forward(self, x):
x = self.first_residual_blocks(x)
out_trunk = self.trunk_branches(x)
out_mpool1 = self.mpool1(x)
out_softmax1 = self.softmax1_blocks(out_mpool1)
out_skip1_connection = self.skip1_connection_residual_block(out_softmax1)
out_mpool2 = self.mpool2(out_softmax1)
out_softmax2 = self.softmax2_blocks(out_mpool2)
out_skip2_connection = self.skip2_connection_residual_block(out_softmax2)
out_mpool3 = self.mpool3(out_softmax2)
out_softmax3 = self.softmax3_blocks(out_mpool3)
#
out_interp3 = self.interpolation3(out_softmax3) + out_softmax2
# print(out_skip2_connection.data)
# print(out_interp3.data)
out = out_interp3 + out_skip2_connection
out_softmax4 = self.softmax4_blocks(out)
out_interp2 = self.interpolation2(out_softmax4) + out_softmax1
out = out_interp2 + out_skip1_connection
out_softmax5 = self.softmax5_blocks(out)
out_interp1 = self.interpolation1(out_softmax5) + out_trunk
out_softmax6 = self.softmax6_blocks(out_interp1)
out = (1 + out_softmax6) * out_trunk
out_last = self.last_blocks(out)
return out_last
class AttentionModule_stage2(nn.Module):
# input image size is 28*28
def __init__(self, in_channels, out_channels, size1=(28, 28), size2=(14, 14)):
super(AttentionModule_stage2, self).__init__()
self.first_residual_blocks = ResidualBlock(in_channels, out_channels)
self.trunk_branches = nn.Sequential(
ResidualBlock(in_channels, out_channels),
ResidualBlock(in_channels, out_channels)
)
self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.softmax1_blocks = ResidualBlock(in_channels, out_channels)
self.skip1_connection_residual_block = ResidualBlock(in_channels, out_channels)
self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.softmax2_blocks = nn.Sequential(
ResidualBlock(in_channels, out_channels),
ResidualBlock(in_channels, out_channels)
)
self.interpolation2 = nn.UpsamplingBilinear2d(size=size2)
self.softmax3_blocks = ResidualBlock(in_channels, out_channels)
self.interpolation1 = nn.UpsamplingBilinear2d(size=size1)
self.softmax4_blocks = nn.Sequential(
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, bias=False),
nn.Sigmoid()
)
self.last_blocks = ResidualBlock(in_channels, out_channels)
def forward(self, x):
x = self.first_residual_blocks(x)
out_trunk = self.trunk_branches(x)
out_mpool1 = self.mpool1(x)
out_softmax1 = self.softmax1_blocks(out_mpool1)
out_skip1_connection = self.skip1_connection_residual_block(out_softmax1)
out_mpool2 = self.mpool2(out_softmax1)
out_softmax2 = self.softmax2_blocks(out_mpool2)
out_interp2 = self.interpolation2(out_softmax2) + out_softmax1
# print(out_skip2_connection.data)
# print(out_interp3.data)
out = out_interp2 + out_skip1_connection
out_softmax3 = self.softmax3_blocks(out)
out_interp1 = self.interpolation1(out_softmax3) + out_trunk
out_softmax4 = self.softmax4_blocks(out_interp1)
out = (1 + out_softmax4) * out_trunk
out_last = self.last_blocks(out)
return out_last
class AttentionModule_stage3(nn.Module):
# input image size is 14*14
def __init__(self, in_channels, out_channels, size1=(14, 14)):
super(AttentionModule_stage3, self).__init__()
self.first_residual_blocks = ResidualBlock(in_channels, out_channels)
self.trunk_branches = nn.Sequential(
ResidualBlock(in_channels, out_channels),
ResidualBlock(in_channels, out_channels)
)
self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.softmax1_blocks = nn.Sequential(
ResidualBlock(in_channels, out_channels),
ResidualBlock(in_channels, out_channels)
)
self.interpolation1 = nn.UpsamplingBilinear2d(size=size1)
self.softmax2_blocks = nn.Sequential(
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, bias=False),
nn.Sigmoid()
)
self.last_blocks = ResidualBlock(in_channels, out_channels)
def forward(self, x):
x = self.first_residual_blocks(x)
out_trunk = self.trunk_branches(x)
out_mpool1 = self.mpool1(x)
out_softmax1 = self.softmax1_blocks(out_mpool1)
out_interp1 = self.interpolation1(out_softmax1) + out_trunk
out_softmax2 = self.softmax2_blocks(out_interp1)
out = (1 + out_softmax2) * out_trunk
out_last = self.last_blocks(out)
return out_last
class ResidualAttentionNet(nn.Module):
def __init__(self, stage1_modules, stage2_modules, stage3_modules, feat_dim, out_h, out_w):
super(ResidualAttentionNet, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias = False),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True)
)
attention_modules = []
attention_modules.append(ResidualBlock(64, 256))
# stage 1
for i in range(stage1_modules):
attention_modules.append(AttentionModule_stage1(256, 256))
attention_modules.append(ResidualBlock(256, 512, 2))
# stage2
for i in range(stage2_modules):
attention_modules.append(AttentionModule_stage2(512, 512))
attention_modules.append(ResidualBlock(512, 1024, 2))
# stage3
for i in range(stage3_modules):
attention_modules.append(AttentionModule_stage3(1024, 1024))
# final residual
attention_modules.append(ResidualBlock(1024, 2048, 2))
attention_modules.append(ResidualBlock(2048, 2048))
attention_modules.append(ResidualBlock(2048, 2048))
self.attention_body = nn.Sequential(*attention_modules)
# output layer
self.output_layer = nn.Sequential(
Flatten(),
nn.Linear(2048 * out_h * out_w, feat_dim, False),
nn.BatchNorm1d(feat_dim))
def forward(self, x):
out = self.conv1(x)
out = self.attention_body(out)
out = self.output_layer(out)
return out
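# Hypothetical smoke test, not part of the original file: the module counts follow the
# AttentionNet56 entry in the bundled config (1/1/1, feat_dim=512, out_h=out_w=7), and a
# 112x112 input is assumed so the stride-2 stem yields the 56x56 maps expected by stage 1.
if __name__ == '__main__':
    net = ResidualAttentionNet(stage1_modules=1, stage2_modules=1, stage3_modules=1,
                               feat_dim=512, out_h=7, out_w=7)
    net.eval()
    features = net(torch.randn(2, 3, 112, 112))
    print(features.shape)  # expected: torch.Size([2, 512])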
"""
@author: Jun Wang
@date: 20210121
@contact: jun21wangustc@gmail.com
"""
# based on:
# https://github.com/huawei-noah/ghostnet/blob/master/ghostnet_pytorch/ghostnet.py
# 2020.06.09-Changed for building GhostNet
# Huawei Technologies Co., Ltd. <foss@huawei.com>
"""
Creates a GhostNet Model as defined in:
GhostNet: More Features from Cheap Operations By Kai Han, Yunhe Wang, Qi Tian, Jianyuan Guo, Chunjing Xu, Chang Xu.
https://arxiv.org/abs/1911.11907
Modified from https://github.com/d-li14/mobilenetv3.pytorch and https://github.com/rwightman/pytorch-image-models
"""
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Sequential, BatchNorm2d, Dropout, Module, Linear, BatchNorm1d
__all__ = ['ghost_net']
class Flatten(Module):
def forward(self, input):
return input.reshape(input.size(0), -1)
def _make_divisible(v, divisor, min_value=None):
"""
This function is taken from the original tf repo.
It ensures that all layers have a channel number that is divisible by 8
It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
"""
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that round down does not go down by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
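# Worked illustration (assumed values, not from the original source):
#   _make_divisible(16 * 1.3, 4) -> 20 (20.8 rounded to the nearest multiple of 4)
#   _make_divisible(27, 8)       -> 32 (plain rounding would give 24, more than 10% below 27)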
def hard_sigmoid(x, inplace: bool = False):
if inplace:
return x.add_(3.).clamp_(0., 6.).div_(6.)
else:
return F.relu6(x + 3.) / 6.
class SqueezeExcite(nn.Module):
def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None,
act_layer=nn.ReLU, gate_fn=hard_sigmoid, divisor=4, **_):
super(SqueezeExcite, self).__init__()
self.gate_fn = gate_fn
reduced_chs = _make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor)
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True)
self.act1 = act_layer(inplace=True)
self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True)
def forward(self, x):
x_se = self.avg_pool(x)
x_se = self.conv_reduce(x_se)
x_se = self.act1(x_se)
x_se = self.conv_expand(x_se)
x = x * self.gate_fn(x_se)
return x
class ConvBnAct(nn.Module):
def __init__(self, in_chs, out_chs, kernel_size,
stride=1, act_layer=nn.ReLU):
super(ConvBnAct, self).__init__()
self.conv = nn.Conv2d(in_chs, out_chs, kernel_size, stride, kernel_size//2, bias=False)
self.bn1 = nn.BatchNorm2d(out_chs)
self.act1 = act_layer(inplace=True)
def forward(self, x):
x = self.conv(x)
x = self.bn1(x)
x = self.act1(x)
return x
class GhostModule(nn.Module):
def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True):
super(GhostModule, self).__init__()
self.oup = oup
init_channels = math.ceil(oup / ratio)
new_channels = init_channels*(ratio-1)
self.primary_conv = nn.Sequential(
nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False),
nn.BatchNorm2d(init_channels),
nn.ReLU(inplace=True) if relu else nn.Sequential(),
)
self.cheap_operation = nn.Sequential(
nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False),
nn.BatchNorm2d(new_channels),
nn.ReLU(inplace=True) if relu else nn.Sequential(),
)
def forward(self, x):
x1 = self.primary_conv(x)
x2 = self.cheap_operation(x1)
out = torch.cat([x1,x2], dim=1)
return out[:,:self.oup,:,:]
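# Illustration with assumed numbers: GhostModule(16, 32, ratio=2) computes 16 "primary"
# channels with a dense 1x1 conv, 16 cheap "ghost" channels with a depth-wise 3x3 conv,
# and concatenates them into the 32 requested output channels.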
class GhostBottleneck(nn.Module):
""" Ghost bottleneck w/ optional SE"""
def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3,
stride=1, act_layer=nn.ReLU, se_ratio=0.):
super(GhostBottleneck, self).__init__()
has_se = se_ratio is not None and se_ratio > 0.
self.stride = stride
# Point-wise expansion
self.ghost1 = GhostModule(in_chs, mid_chs, relu=True)
# Depth-wise convolution
if self.stride > 1:
self.conv_dw = nn.Conv2d(mid_chs, mid_chs, dw_kernel_size, stride=stride,
padding=(dw_kernel_size-1)//2,
groups=mid_chs, bias=False)
self.bn_dw = nn.BatchNorm2d(mid_chs)
# Squeeze-and-excitation
if has_se:
self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio)
else:
self.se = None
# Point-wise linear projection
self.ghost2 = GhostModule(mid_chs, out_chs, relu=False)
# shortcut
if (in_chs == out_chs and self.stride == 1):
self.shortcut = nn.Sequential()
else:
self.shortcut = nn.Sequential(
nn.Conv2d(in_chs, in_chs, dw_kernel_size, stride=stride,
padding=(dw_kernel_size-1)//2, groups=in_chs, bias=False),
nn.BatchNorm2d(in_chs),
nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False),
nn.BatchNorm2d(out_chs),
)
def forward(self, x):
residual = x
# 1st ghost bottleneck
x = self.ghost1(x)
# Depth-wise convolution
if self.stride > 1:
x = self.conv_dw(x)
x = self.bn_dw(x)
# Squeeze-and-excitation
if self.se is not None:
x = self.se(x)
# 2nd ghost bottleneck
x = self.ghost2(x)
x += self.shortcut(residual)
return x
class GhostNet(nn.Module):
def __init__(self, width=1.0, drop_ratio=0.2, feat_dim=512, out_h=7, out_w=7):
super(GhostNet, self).__init__()
# setting of inverted residual blocks
self.cfgs = [
# k, t, c, SE, s
# stage1
[[3, 16, 16, 0, 1]],
# stage2
[[3, 48, 24, 0, 2]],
[[3, 72, 24, 0, 1]],
# stage3
[[5, 72, 40, 0.25, 2]],
[[5, 120, 40, 0.25, 1]],
# stage4
[[3, 240, 80, 0, 2]],
[[3, 200, 80, 0, 1],
[3, 184, 80, 0, 1],
[3, 184, 80, 0, 1],
[3, 480, 112, 0.25, 1],
[3, 672, 112, 0.25, 1]
],
# stage5
[[5, 672, 160, 0.25, 2]],
[[5, 960, 160, 0, 1],
[5, 960, 160, 0.25, 1],
[5, 960, 160, 0, 1],
[5, 960, 160, 0.25, 1]
]
]
# building first layer
output_channel = _make_divisible(16 * width, 4)
#self.conv_stem = nn.Conv2d(3, output_channel, 3, 2, 1, bias=False)
self.conv_stem = nn.Conv2d(3, output_channel, 3, 1, 1, bias=False)
self.bn1 = nn.BatchNorm2d(output_channel)
self.act1 = nn.ReLU(inplace=True)
input_channel = output_channel
# building inverted residual blocks
stages = []
block = GhostBottleneck
for cfg in self.cfgs:
layers = []
for k, exp_size, c, se_ratio, s in cfg:
output_channel = _make_divisible(c * width, 4)
hidden_channel = _make_divisible(exp_size * width, 4)
layers.append(block(input_channel, hidden_channel, output_channel, k, s,
se_ratio=se_ratio))
input_channel = output_channel
stages.append(nn.Sequential(*layers))
output_channel = _make_divisible(exp_size * width, 4)
stages.append(nn.Sequential(ConvBnAct(input_channel, output_channel, 1)))
input_channel = output_channel
self.blocks = nn.Sequential(*stages)
self.output_layer = Sequential(BatchNorm2d(960),
Dropout(drop_ratio),
Flatten(),
Linear(960 * out_h * out_w, feat_dim), # for eye
BatchNorm1d(feat_dim))
def forward(self, x):
x = self.conv_stem(x)
x = self.bn1(x)
x = self.act1(x)
x = self.blocks(x)
x = self.output_layer(x)
return x
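# Hypothetical smoke test (assumption: 112x112 face crops, so the stride-1 stem plus four
# stride-2 stages leave the 7x7 maps expected by the 960 * out_h * out_w output layer).
if __name__ == '__main__':
    model = GhostNet(width=1.0, drop_ratio=0.2, feat_dim=512, out_h=7, out_w=7)
    model.eval()
    embedding = model(torch.randn(2, 3, 112, 112))
    print(embedding.shape)  # expected: torch.Size([2, 512])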
'''
implement Light CNN
@author: Alfred Xiang Wu
@date: 2017.07.04
'''
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
class Flatten(nn.Module):
def forward(self, input):
return input.reshape(input.size(0), -1)
class mfm(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, type=1):
super(mfm, self).__init__()
self.out_channels = out_channels
if type == 1:
self.filter = nn.Conv2d(in_channels, 2*out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
else:
self.filter = nn.Linear(in_channels, 2*out_channels)
def forward(self, x):
x = self.filter(x)
out = torch.split(x, self.out_channels, 1)
return torch.max(out[0], out[1])
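# Illustration with assumed numbers: mfm(1, 48, 5, 1, 2) convolves to 2*48 = 96 channels and
# returns the element-wise maximum of the two 48-channel halves (the Max-Feature-Map activation).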
class group(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
super(group, self).__init__()
self.conv_a = mfm(in_channels, in_channels, 1, 1, 0)
self.conv = mfm(in_channels, out_channels, kernel_size, stride, padding)
def forward(self, x):
x = self.conv_a(x)
x = self.conv(x)
return x
class resblock(nn.Module):
def __init__(self, in_channels, out_channels):
super(resblock, self).__init__()
self.conv1 = mfm(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.conv2 = mfm(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
def forward(self, x):
res = x
out = self.conv1(x)
out = self.conv2(out)
out = out + res
return out
class network_9layers(nn.Module):
def __init__(self, num_classes=79077):
super(network_9layers, self).__init__()
self.features = nn.Sequential(
mfm(1, 48, 5, 1, 2),
nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True),
group(48, 96, 3, 1, 1),
nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True),
group(96, 192, 3, 1, 1),
nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True),
group(192, 128, 3, 1, 1),
group(128, 128, 3, 1, 1),
nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True),
)
self.fc1 = mfm(8*8*128, 256, type=0)
self.fc2 = nn.Linear(256, num_classes)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.fc1(x)
x = F.dropout(x, training=self.training)
out = self.fc2(x)
return out, x
class network_29layers(nn.Module):
def __init__(self, block, layers, num_classes=79077):
super(network_29layers, self).__init__()
#self.conv1 = mfm(1, 48, 5, 1, 2)
self.conv1 = mfm(3, 48, 5, 1, 2)
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
self.block1 = self._make_layer(block, layers[0], 48, 48)
self.group1 = group(48, 96, 3, 1, 1)
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
self.block2 = self._make_layer(block, layers[1], 96, 96)
self.group2 = group(96, 192, 3, 1, 1)
self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
self.block3 = self._make_layer(block, layers[2], 192, 192)
self.group3 = group(192, 128, 3, 1, 1)
self.block4 = self._make_layer(block, layers[3], 128, 128)
self.group4 = group(128, 128, 3, 1, 1)
self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)
self.fc = mfm(8*8*128, 256, type=0)
self.fc2 = nn.Linear(256, num_classes)
def _make_layer(self, block, num_blocks, in_channels, out_channels):
layers = []
for i in range(0, num_blocks):
layers.append(block(in_channels, out_channels))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.pool1(x)
x = self.block1(x)
x = self.group1(x)
x = self.pool2(x)
x = self.block2(x)
x = self.group2(x)
x = self.pool3(x)
x = self.block3(x)
x = self.group3(x)
x = self.block4(x)
x = self.group4(x)
x = self.pool4(x)
x = x.view(x.size(0), -1)
fc = self.fc(x)
fc = F.dropout(fc, training=self.training)
out = self.fc2(fc)
return out, fc
class network_29layers_v2(nn.Module):
def __init__(self, block, layers, drop_ratio, out_h, out_w, feat_dim):
super(network_29layers_v2, self).__init__()
#self.conv1 = mfm(1, 48, 5, 1, 2)
self.conv1 = mfm(3, 48, 5, 1, 2)
self.block1 = self._make_layer(block, layers[0], 48, 48)
self.group1 = group(48, 96, 3, 1, 1)
self.block2 = self._make_layer(block, layers[1], 96, 96)
self.group2 = group(96, 192, 3, 1, 1)
self.block3 = self._make_layer(block, layers[2], 192, 192)
self.group3 = group(192, 128, 3, 1, 1)
self.block4 = self._make_layer(block, layers[3], 128, 128)
self.group4 = group(128, 128, 3, 1, 1)
#self.fc = nn.Linear(8*8*128, 256)
#self.fc2 = nn.Linear(256, num_classes, bias=False)
self.output_layer = nn.Sequential(nn.BatchNorm2d(128),
nn.Dropout(drop_ratio),
Flatten(),
nn.Linear(128 * out_h * out_w, feat_dim),
nn.BatchNorm1d(feat_dim))
def _make_layer(self, block, num_blocks, in_channels, out_channels):
layers = []
for i in range(0, num_blocks):
layers.append(block(in_channels, out_channels))
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
x = self.block1(x)
x = self.group1(x)
x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
x = self.block2(x)
x = self.group2(x)
x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)
x = self.block3(x)
x = self.group3(x)
x = self.block4(x)
x = self.group4(x)
x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2) # 7*7
#x = x.view(x.size(0), -1)
#fc = self.fc(x)
#x = F.dropout(fc, training=self.training)
#out = self.fc2(x)
#return out, fc
x = self.output_layer(x)
return x
# NOTE: network_9layers keeps its original (num_classes) signature, so this 9-layer wrapper
# does not match it; only the 29-layer v2 variant below is wired up consistently.
def LightCNN_9Layers(drop_ratio, out_h, out_w, feat_dim):
model = network_9layers(drop_ratio, out_h, out_w, feat_dim)
return model
def LightCNN_29Layers(drop_ratio, out_h, out_w, feat_dim):
model = network_29layers(resblock, [1, 2, 3, 4], drop_ratio, out_h, out_w, feat_dim)
return model
def LightCNN_29Layers_v2(drop_ratio, out_h, out_w, feat_dim):
model = network_29layers_v2(resblock, [1, 2, 3, 4], drop_ratio, out_h, out_w, feat_dim)
return model
def LightCNN(depth, drop_ratio, out_h, out_w, feat_dim):
if depth == 9:
return LightCNN_9Layers(drop_ratio, out_h, out_w, feat_dim)
elif depth == 29:
return LightCNN_29Layers_v2(drop_ratio, out_h, out_w, feat_dim)
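# Hypothetical usage sketch (assumption: 112x112 RGB input, so the four pooling steps leave
# the 7x7 maps noted above; only the 29-layer v2 path matches this factory signature).
if __name__ == '__main__':
    model = LightCNN(29, drop_ratio=0.2, out_h=7, out_w=7, feat_dim=512)
    model.eval()
    embedding = model(torch.randn(2, 3, 112, 112))
    print(embedding.shape)  # expected: torch.Size([2, 512])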
"""
@author: Jun Wang
@date: 20201019
@contact: jun21wangustc@gmail.com
"""
# based on:
# https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py
from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Sequential, Module
import torch
class Flatten(Module):
def forward(self, input):
return input.view(input.size(0), -1)
class Conv_block(Module):
def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
super(Conv_block, self).__init__()
self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False)
self.bn = BatchNorm2d(out_c)
self.prelu = PReLU(out_c)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.prelu(x)
return x
class Linear_block(Module):
def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
super(Linear_block, self).__init__()
self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False)
self.bn = BatchNorm2d(out_c)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
return x
class Depth_Wise(Module):
def __init__(self, in_c, out_c, residual = False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
super(Depth_Wise, self).__init__()
self.conv = Conv_block(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
self.conv_dw = Conv_block(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride)
self.project = Linear_block(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
self.residual = residual
def forward(self, x):
if self.residual:
short_cut = x
x = self.conv(x)
x = self.conv_dw(x)
x = self.project(x)
if self.residual:
output = short_cut + x
else:
output = x
return output
class Residual(Module):
def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
super(Residual, self).__init__()
modules = []
for _ in range(num_block):
modules.append(Depth_Wise(c, c, residual=True, kernel=kernel, padding=padding, stride=stride, groups=groups))
self.model = Sequential(*modules)
def forward(self, x):
return self.model(x)
class MobileFaceNet(Module):
def __init__(self, embedding_size, out_h, out_w):
super(MobileFaceNet, self).__init__()
self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
self.conv2_dw = Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
self.conv_23 = Depth_Wise(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128)
self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
self.conv_34 = Depth_Wise(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
self.conv_45 = Depth_Wise(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512)
self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
self.conv_6_sep = Conv_block(128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
#self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(7,7), stride=(1, 1), padding=(0, 0))
#self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(4,7), stride=(1, 1), padding=(0, 0))
self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(out_h, out_w), stride=(1, 1), padding=(0, 0))
self.conv_6_flatten = Flatten()
self.linear = Linear(512, embedding_size, bias=False)
self.bn = BatchNorm1d(embedding_size)
def forward(self, x):
out = self.conv1(x)
out = self.conv2_dw(out)
out = self.conv_23(out)
out = self.conv_3(out)
out = self.conv_34(out)
out = self.conv_4(out)
out = self.conv_45(out)
out = self.conv_5(out)
out = self.conv_6_sep(out)
out = self.conv_6_dw(out)
out = self.conv_6_flatten(out)
out = self.linear(out)
out = self.bn(out)
return out
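# Hypothetical sanity check (assumption: 112x112 aligned faces, so the four stride-2 stages
# leave 7x7 maps for the (out_h, out_w) depth-wise kernel in conv_6_dw).
if __name__ == '__main__':
    net = MobileFaceNet(embedding_size=512, out_h=7, out_w=7)
    net.eval()
    emb = net(torch.randn(2, 3, 112, 112))
    print(emb.shape)  # expected: torch.Size([2, 512])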
"""
@author: Jun Wang
@date: 20210322
@contact: jun21wangustc@gmail.com
"""
# based on:
# https://github.com/clovaai/rexnet/blob/master/rexnetv1.py
"""
ReXNet
Copyright (c) 2020-present NAVER Corp.
MIT license
"""
import torch
import torch.nn as nn
from math import ceil
class Flatten(nn.Module):
def forward(self, input):
return input.reshape(input.size(0), -1)
# Memory-efficient Swish using torch.jit.script borrowed from the code in (https://twitter.com/jeremyphoward/status/1188251041835315200)
# Currently use memory-efficient Swish as default:
USE_MEMORY_EFFICIENT_SWISH = True
if USE_MEMORY_EFFICIENT_SWISH:
@torch.jit.script
def swish_fwd(x):
return x.mul(torch.sigmoid(x))
@torch.jit.script
def swish_bwd(x, grad_output):
x_sigmoid = torch.sigmoid(x)
return grad_output * (x_sigmoid * (1. + x * (1. - x_sigmoid)))
class SwishJitImplementation(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return swish_fwd(x)
@staticmethod
def backward(ctx, grad_output):
x = ctx.saved_tensors[0]
return swish_bwd(x, grad_output)
def swish(x, inplace=False):
return SwishJitImplementation.apply(x)
else:
def swish(x, inplace=False):
return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid())
class Swish(nn.Module):
def __init__(self, inplace=True):
super(Swish, self).__init__()
self.inplace = inplace
def forward(self, x):
return swish(x, self.inplace)
def ConvBNAct(out, in_channels, channels, kernel=1, stride=1, pad=0,
num_group=1, active=True, relu6=False):
out.append(nn.Conv2d(in_channels, channels, kernel,
stride, pad, groups=num_group, bias=False))
out.append(nn.BatchNorm2d(channels))
if active:
out.append(nn.ReLU6(inplace=True) if relu6 else nn.ReLU(inplace=True))
def ConvBNSwish(out, in_channels, channels, kernel=1, stride=1, pad=0, num_group=1):
out.append(nn.Conv2d(in_channels, channels, kernel,
stride, pad, groups=num_group, bias=False))
out.append(nn.BatchNorm2d(channels))
out.append(Swish())
class SE(nn.Module):
def __init__(self, in_channels, channels, se_ratio=12):
super(SE, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Conv2d(in_channels, channels // se_ratio, kernel_size=1, padding=0),
nn.BatchNorm2d(channels // se_ratio),
nn.ReLU(inplace=True),
nn.Conv2d(channels // se_ratio, channels, kernel_size=1, padding=0),
nn.Sigmoid()
)
def forward(self, x):
y = self.avg_pool(x)
y = self.fc(y)
return x * y
class LinearBottleneck(nn.Module):
def __init__(self, in_channels, channels, t, stride, use_se=True, se_ratio=12,
**kwargs):
super(LinearBottleneck, self).__init__(**kwargs)
self.use_shortcut = stride == 1 and in_channels <= channels
self.in_channels = in_channels
self.out_channels = channels
out = []
if t != 1:
dw_channels = in_channels * t
ConvBNSwish(out, in_channels=in_channels, channels=dw_channels)
else:
dw_channels = in_channels
ConvBNAct(out, in_channels=dw_channels, channels=dw_channels, kernel=3, stride=stride, pad=1,
num_group=dw_channels, active=False)
if use_se:
out.append(SE(dw_channels, dw_channels, se_ratio))
out.append(nn.ReLU6())
ConvBNAct(out, in_channels=dw_channels, channels=channels, active=False, relu6=True)
self.out = nn.Sequential(*out)
def forward(self, x):
out = self.out(x)
if self.use_shortcut:
out[:, 0:self.in_channels] += x
return out
class ReXNetV1(nn.Module):
def __init__(self, input_ch=16, final_ch=180, width_mult=1.0, depth_mult=1.0,
use_se=True, se_ratio=12, out_h=7, out_w=7, feat_dim=512,
dropout_ratio=0.2, bn_momentum=0.9):
super(ReXNetV1, self).__init__()
layers = [1, 2, 2, 3, 3, 5]
strides = [1, 2, 2, 2, 1, 2]
use_ses = [False, False, True, True, True, True]
layers = [ceil(element * depth_mult) for element in layers]
strides = sum([[element] + [1] * (layers[idx] - 1)
for idx, element in enumerate(strides)], [])
if use_se:
use_ses = sum([[element] * layers[idx] for idx, element in enumerate(use_ses)], [])
else:
use_ses = [False] * sum(layers[:])
ts = [1] * layers[0] + [6] * sum(layers[1:])
self.depth = sum(layers[:]) * 3
stem_channel = 32 / width_mult if width_mult < 1.0 else 32
inplanes = input_ch / width_mult if width_mult < 1.0 else input_ch
features = []
in_channels_group = []
channels_group = []
# The following channel configuration is a simple instance to make each layer become an expand layer.
for i in range(self.depth // 3):
if i == 0:
in_channels_group.append(int(round(stem_channel * width_mult)))
channels_group.append(int(round(inplanes * width_mult)))
else:
in_channels_group.append(int(round(inplanes * width_mult)))
inplanes += final_ch / (self.depth // 3 * 1.0)
channels_group.append(int(round(inplanes * width_mult)))
#ConvBNSwish(features, 3, int(round(stem_channel * width_mult)), kernel=3, stride=2, pad=1)
ConvBNSwish(features, 3, int(round(stem_channel * width_mult)), kernel=3, stride=1, pad=1)
for block_idx, (in_c, c, t, s, se) in enumerate(zip(in_channels_group, channels_group, ts, strides, use_ses)):
features.append(LinearBottleneck(in_channels=in_c,
channels=c,
t=t,
stride=s,
use_se=se, se_ratio=se_ratio))
#pen_channels = int(1280 * width_mult)
pen_channels = int(512 * width_mult)
ConvBNSwish(features, c, pen_channels)
#features.append(nn.AdaptiveAvgPool2d(1))
self.features = nn.Sequential(*features)
self.output_layer = nn.Sequential(nn.BatchNorm2d(512),
nn.Dropout(dropout_ratio),
Flatten(),
nn.Linear(512 * out_h * out_w, feat_dim),
nn.BatchNorm1d(feat_dim))
def forward(self, x):
x = self.features(x)
x = self.output_layer(x)
return x
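# Hypothetical smoke test (assumption: 112x112 input; with the stride-1 stem used above, the
# four stride-2 bottleneck stages give the 7x7 maps matching out_h=out_w=7).
if __name__ == '__main__':
    net = ReXNetV1(width_mult=1.0, depth_mult=1.0, out_h=7, out_w=7, feat_dim=512)
    net.eval()
    emb = net(torch.randn(2, 3, 112, 112))
    print(emb.shape)  # expected: torch.Size([2, 512])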
# Sources
- https://github.com/JDAI-CV/FaceX-Zoo/tree/main/backbone
- https://gitlab.idiap.ch/bob/bob.learn.pytorch/-/tree/master/bob/learn/pytorch/architectures/facexzoo
"""
@author: Jun Wang
@date: 20210910
@contact: jun21wangustc@gmail.com
"""
# based on:
# https://github.com/DingXiaoH/RepVGG/edit/main/repvgg.py
import torch.nn as nn
import numpy as np
import torch
import copy
class Flatten(nn.Module):
def forward(self, input):
return input.reshape(input.size(0), -1)
def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1):
result = nn.Sequential()
result.add_module('conv', nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False))
result.add_module('bn', nn.BatchNorm2d(num_features=out_channels))
return result
class RepVGGBlock(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size,
stride=1, padding=0, dilation=1, groups=1, padding_mode='zeros', deploy=False, use_se=False):
super(RepVGGBlock, self).__init__()
self.deploy = deploy
self.groups = groups
self.in_channels = in_channels
assert kernel_size == 3
assert padding == 1
padding_11 = padding - kernel_size // 2
self.nonlinearity = nn.ReLU()
'''
if use_se:
self.se = SEBlock(out_channels, internal_neurons=out_channels // 16)
else:
self.se = nn.Identity()
'''
self.se = nn.Identity()
if deploy:
self.rbr_reparam = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=groups, bias=True, padding_mode=padding_mode)
else:
self.rbr_identity = nn.BatchNorm2d(num_features=in_channels) if out_channels == in_channels and stride == 1 else None
self.rbr_dense = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups)
self.rbr_1x1 = conv_bn(in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=stride, padding=padding_11, groups=groups)
#print('RepVGG Block, identity = ', self.rbr_identity)
def forward(self, inputs):
if hasattr(self, 'rbr_reparam'):
return self.nonlinearity(self.se(self.rbr_reparam(inputs)))
if self.rbr_identity is None:
id_out = 0
else:
id_out = self.rbr_identity(inputs)
return self.nonlinearity(self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out))
# Optional. This improves the accuracy and facilitates quantization.
# 1. Cancel the original weight decay on rbr_dense.conv.weight and rbr_1x1.conv.weight.
# 2. Use like this.
# loss = criterion(....)
# for every RepVGGBlock blk:
# loss += weight_decay_coefficient * 0.5 * blk.get_custom_L2()
# optimizer.zero_grad()
# loss.backward()
def get_custom_L2(self):
K3 = self.rbr_dense.conv.weight
K1 = self.rbr_1x1.conv.weight
t3 = (self.rbr_dense.bn.weight / ((self.rbr_dense.bn.running_var + self.rbr_dense.bn.eps).sqrt())).reshape(-1, 1, 1, 1).detach()
t1 = (self.rbr_1x1.bn.weight / ((self.rbr_1x1.bn.running_var + self.rbr_1x1.bn.eps).sqrt())).reshape(-1, 1, 1, 1).detach()
l2_loss_circle = (K3 ** 2).sum() - (K3[:, :, 1:2, 1:2] ** 2).sum() # The L2 loss of the "circle" of weights in 3x3 kernel. Use regular L2 on them.
eq_kernel = K3[:, :, 1:2, 1:2] * t3 + K1 * t1 # The equivalent resultant central point of 3x3 kernel.
l2_loss_eq_kernel = (eq_kernel ** 2 / (t3 ** 2 + t1 ** 2)).sum() # Normalize for an L2 coefficient comparable to regular L2.
return l2_loss_eq_kernel + l2_loss_circle
# This func derives the equivalent kernel and bias in a DIFFERENTIABLE way.
# You can get the equivalent kernel and bias at any time and do whatever you want,
# for example, apply some penalties or constraints during training, just like you do to the other models.
# May be useful for quantization or pruning.
def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity)
return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return torch.nn.functional.pad(kernel1x1, [1,1,1,1])
def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
if isinstance(branch, nn.Sequential):
kernel = branch.conv.weight
running_mean = branch.bn.running_mean
running_var = branch.bn.running_var
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn.eps
else:
assert isinstance(branch, nn.BatchNorm2d)
if not hasattr(self, 'id_tensor'):
input_dim = self.in_channels // self.groups
kernel_value = np.zeros((self.in_channels, input_dim, 3, 3), dtype=np.float32)
for i in range(self.in_channels):
kernel_value[i, i % input_dim, 1, 1] = 1
self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
kernel = self.id_tensor
running_mean = branch.running_mean
running_var = branch.running_var
gamma = branch.weight
beta = branch.bias
eps = branch.eps
std = (running_var + eps).sqrt()
t = (gamma / std).reshape(-1, 1, 1, 1)
return kernel * t, beta - running_mean * gamma / std
def switch_to_deploy(self):
if hasattr(self, 'rbr_reparam'):
return
kernel, bias = self.get_equivalent_kernel_bias()
self.rbr_reparam = nn.Conv2d(in_channels=self.rbr_dense.conv.in_channels, out_channels=self.rbr_dense.conv.out_channels,
kernel_size=self.rbr_dense.conv.kernel_size, stride=self.rbr_dense.conv.stride,
padding=self.rbr_dense.conv.padding, dilation=self.rbr_dense.conv.dilation, groups=self.rbr_dense.conv.groups, bias=True)
self.rbr_reparam.weight.data = kernel
self.rbr_reparam.bias.data = bias
for para in self.parameters():
para.detach_()
self.__delattr__('rbr_dense')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity'):
self.__delattr__('rbr_identity')
if hasattr(self, 'id_tensor'):
self.__delattr__('id_tensor')
self.deploy = True
class RepVGG(nn.Module):
def __init__(self, num_blocks, width_multiplier, feat_dim=512, out_h=7, out_w=7, override_groups_map=None, deploy=False, use_se=False):
super(RepVGG, self).__init__()
assert len(width_multiplier) == 4
self.deploy = deploy
self.override_groups_map = override_groups_map or dict()
self.use_se = use_se
assert 0 not in self.override_groups_map
self.in_planes = min(64, int(64 * width_multiplier[0]))
#self.stage0 = RepVGGBlock(in_channels=3, out_channels=self.in_planes, kernel_size=3, stride=2, padding=1, deploy=self.deploy, use_se=self.use_se)
self.stage0 = RepVGGBlock(in_channels=3, out_channels=self.in_planes, kernel_size=3, stride=1, padding=1, deploy=self.deploy, use_se=self.use_se)
self.cur_layer_idx = 1
self.stage1 = self._make_stage(int(64 * width_multiplier[0]), num_blocks[0], stride=2)
self.stage2 = self._make_stage(int(128 * width_multiplier[1]), num_blocks[1], stride=2)
self.stage3 = self._make_stage(int(256 * width_multiplier[2]), num_blocks[2], stride=2)
self.stage4 = self._make_stage(int(512 * width_multiplier[3]), num_blocks[3], stride=2)
self.output_layer = nn.Sequential(nn.BatchNorm2d(int(512*width_multiplier[3])),
Flatten(),
nn.Linear(int(512 * width_multiplier[3]) * out_h * out_w, feat_dim), # for eye
nn.BatchNorm1d(feat_dim))
def _make_stage(self, planes, num_blocks, stride):
strides = [stride] + [1]*(num_blocks-1)
blocks = []
for stride in strides:
cur_groups = self.override_groups_map.get(self.cur_layer_idx, 1)
blocks.append(RepVGGBlock(in_channels=self.in_planes, out_channels=planes, kernel_size=3,
stride=stride, padding=1, groups=cur_groups, deploy=self.deploy, use_se=self.use_se))
self.in_planes = planes
self.cur_layer_idx += 1
return nn.Sequential(*blocks)
def forward(self, x):
out = self.stage0(x)
out = self.stage1(out)
out = self.stage2(out)
out = self.stage3(out)
out = self.stage4(out)
out = self.output_layer(out)
return out
optional_groupwise_layers = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26]
g2_map = {l: 2 for l in optional_groupwise_layers}
g4_map = {l: 4 for l in optional_groupwise_layers}
def create_RepVGG_A0(deploy=False):
return RepVGG(num_blocks=[2, 4, 14, 1],
width_multiplier=[0.75, 0.75, 0.75, 2.5], override_groups_map=None, deploy=deploy)
def create_RepVGG_A1(deploy=False):
return RepVGG(num_blocks=[2, 4, 14, 1],
width_multiplier=[1, 1, 1, 2.5], override_groups_map=None, deploy=deploy)
def create_RepVGG_A2(deploy=False):
return RepVGG(num_blocks=[2, 4, 14, 1],
width_multiplier=[1.5, 1.5, 1.5, 2.75], override_groups_map=None, deploy=deploy)
def create_RepVGG_B0(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[1, 1, 1, 2.5], override_groups_map=None, deploy=deploy)
def create_RepVGG_B1(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[2, 2, 2, 4], override_groups_map=None, deploy=deploy)
def create_RepVGG_B1g2(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[2, 2, 2, 4], override_groups_map=g2_map, deploy=deploy)
def create_RepVGG_B1g4(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[2, 2, 2, 4], override_groups_map=g4_map, deploy=deploy)
def create_RepVGG_B2(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=None, deploy=deploy)
def create_RepVGG_B2g2(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=g2_map, deploy=deploy)
def create_RepVGG_B2g4(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=g4_map, deploy=deploy)
def create_RepVGG_B3(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[3, 3, 3, 5], override_groups_map=None, deploy=deploy)
def create_RepVGG_B3g2(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[3, 3, 3, 5], override_groups_map=g2_map, deploy=deploy)
def create_RepVGG_B3g4(deploy=False):
return RepVGG(num_blocks=[4, 6, 16, 1],
width_multiplier=[3, 3, 3, 5], override_groups_map=g4_map, deploy=deploy)
def create_RepVGG_D2se(deploy=False):
return RepVGG(num_blocks=[8, 14, 24, 1],
width_multiplier=[2.5, 2.5, 2.5, 5], override_groups_map=None, deploy=deploy, use_se=True)
func_dict = {
'RepVGG-A0': create_RepVGG_A0,
'RepVGG-A1': create_RepVGG_A1,
'RepVGG-A2': create_RepVGG_A2,
'RepVGG-B0': create_RepVGG_B0,
'RepVGG-B1': create_RepVGG_B1,
'RepVGG-B1g2': create_RepVGG_B1g2,
'RepVGG-B1g4': create_RepVGG_B1g4,
'RepVGG-B2': create_RepVGG_B2,
'RepVGG-B2g2': create_RepVGG_B2g2,
'RepVGG-B2g4': create_RepVGG_B2g4,
'RepVGG-B3': create_RepVGG_B3,
'RepVGG-B3g2': create_RepVGG_B3g2,
'RepVGG-B3g4': create_RepVGG_B3g4,
'RepVGG-D2se': create_RepVGG_D2se, # Updated at April 25, 2021. This is not reported in the CVPR paper.
}
def get_RepVGG_func_by_name(name):
return func_dict[name]
# Use this for converting a RepVGG model or a bigger model with RepVGG as its component
# Use like this
# model = create_RepVGG_A0(deploy=False)
# train model or load weights
# repvgg_model_convert(model, save_path='repvgg_deploy.pth')
# If you want to preserve the original model, call with do_copy=True
# ====================== for using RepVGG as the backbone of a bigger model, e.g., PSPNet, the pseudo code will be like
# train_backbone = create_RepVGG_B2(deploy=False)
# train_backbone.load_state_dict(torch.load('RepVGG-B2-train.pth'))
# train_pspnet = build_pspnet(backbone=train_backbone)
# segmentation_train(train_pspnet)
# deploy_pspnet = repvgg_model_convert(train_pspnet)
# segmentation_test(deploy_pspnet)
# ===================== example_pspnet.py shows an example
def repvgg_model_convert(model:torch.nn.Module, save_path=None, do_copy=True):
if do_copy:
model = copy.deepcopy(model)
for module in model.modules():
if hasattr(module, 'switch_to_deploy'):
module.switch_to_deploy()
if save_path is not None:
torch.save(model.state_dict(), save_path)
return model
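# A concrete, hypothetical version of the pseudo-code above: once switch_to_deploy() has
# fused the branches, the train-time and deploy-time models should agree up to float error
# (112x112 input assumed; do_copy=True keeps the original model intact).
if __name__ == '__main__':
    train_model = create_RepVGG_A0(deploy=False)
    train_model.eval()
    x = torch.randn(2, 3, 112, 112)
    y_train = train_model(x)
    deploy_model = repvgg_model_convert(train_model, save_path=None, do_copy=True)
    y_deploy = deploy_model(x)
    print((y_train - y_deploy).abs().max())  # expected to be near zero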
"""
@author: Jun Wang
@date: 20201019
@contact: jun21wangustc@gmail.com
"""
# based on:
# https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py
from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, ReLU, Sigmoid, Dropout2d, Dropout, AvgPool2d, MaxPool2d, AdaptiveAvgPool2d, Sequential, Module, Parameter
import torch.nn.functional as F
import torch
from collections import namedtuple
class Flatten(Module):
def forward(self, input):
return input.view(input.size(0), -1)
class SEModule(Module):
def __init__(self, channels, reduction):
super(SEModule, self).__init__()
self.avg_pool = AdaptiveAvgPool2d(1)
self.fc1 = Conv2d(
channels, channels // reduction, kernel_size=1, padding=0 ,bias=False)
self.relu = ReLU(inplace=True)
self.fc2 = Conv2d(
channels // reduction, channels, kernel_size=1, padding=0 ,bias=False)
self.sigmoid = Sigmoid()
def forward(self, x):
module_input = x
x = self.avg_pool(x)
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
x = self.sigmoid(x)
return module_input * x
class bottleneck_IR(Module):
def __init__(self, in_channel, depth, stride):
super(bottleneck_IR, self).__init__()
if in_channel == depth:
self.shortcut_layer = MaxPool2d(1, stride)
else:
self.shortcut_layer = Sequential(
Conv2d(in_channel, depth, (1, 1), stride ,bias=False), BatchNorm2d(depth))
self.res_layer = Sequential(
BatchNorm2d(in_channel),
Conv2d(in_channel, depth, (3, 3), (1, 1), 1 ,bias=False), PReLU(depth),
Conv2d(depth, depth, (3, 3), stride, 1 ,bias=False), BatchNorm2d(depth))
def forward(self, x):
shortcut = self.shortcut_layer(x)
res = self.res_layer(x)
return res + shortcut
class bottleneck_IR_SE(Module):
def __init__(self, in_channel, depth, stride):
super(bottleneck_IR_SE, self).__init__()
if in_channel == depth:
self.shortcut_layer = MaxPool2d(1, stride)
else:
self.shortcut_layer = Sequential(
Conv2d(in_channel, depth, (1, 1), stride ,bias=False),
BatchNorm2d(depth))
self.res_layer = Sequential(
BatchNorm2d(in_channel),
Conv2d(in_channel, depth, (3,3), (1,1),1 ,bias=False),
PReLU(depth),
Conv2d(depth, depth, (3,3), stride, 1 ,bias=False),
BatchNorm2d(depth),
SEModule(depth,16)
)
def forward(self,x):
shortcut = self.shortcut_layer(x)
res = self.res_layer(x)
return res + shortcut
class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
'''A named tuple describing a ResNet block.'''
def get_block(in_channel, depth, num_units, stride = 2):
return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units-1)]
def get_blocks(num_layers):
if num_layers == 50:
blocks = [
get_block(in_channel=64, depth=64, num_units = 3),
get_block(in_channel=64, depth=128, num_units=4),
get_block(in_channel=128, depth=256, num_units=14),
get_block(in_channel=256, depth=512, num_units=3)
]
elif num_layers == 100:
blocks = [
get_block(in_channel=64, depth=64, num_units=3),
get_block(in_channel=64, depth=128, num_units=13),
get_block(in_channel=128, depth=256, num_units=30),
get_block(in_channel=256, depth=512, num_units=3)
]
elif num_layers == 152:
blocks = [
get_block(in_channel=64, depth=64, num_units=3),
get_block(in_channel=64, depth=128, num_units=8),
get_block(in_channel=128, depth=256, num_units=36),
get_block(in_channel=256, depth=512, num_units=3)
]
return blocks
#class Backbone(Module):
class Resnet(Module):
def __init__(self, num_layers, drop_ratio, mode='ir', feat_dim=512, out_h=7, out_w=7):
super(Resnet, self).__init__()
assert num_layers in [50, 100, 152], 'num_layers should be 50, 100, or 152'
assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se'
blocks = get_blocks(num_layers)
if mode == 'ir':
unit_module = bottleneck_IR
elif mode == 'ir_se':
unit_module = bottleneck_IR_SE
self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1 ,bias=False),
BatchNorm2d(64),
PReLU(64))
self.output_layer = Sequential(BatchNorm2d(512),
Dropout(drop_ratio),
Flatten(),
Linear(512 * out_h * out_w, feat_dim), # for eye
BatchNorm1d(feat_dim))
modules = []
for block in blocks:
for bottleneck in block:
modules.append(
unit_module(bottleneck.in_channel,
bottleneck.depth,
bottleneck.stride))
self.body = Sequential(*modules)
def forward(self,x):
x = self.input_layer(x)
x = self.body(x)
x = self.output_layer(x)
return x
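# Hypothetical usage matching the ResNet50_ir entry in the bundled config (assumption:
# 112x112 input, which the stride-1 input layer and four stride-2 stages reduce to the
# 7x7 maps expected by the output layer).
if __name__ == '__main__':
    net = Resnet(num_layers=50, drop_ratio=0.4, mode='ir', feat_dim=512, out_h=7, out_w=7)
    net.eval()
    emb = net(torch.randn(2, 3, 112, 112))
    print(emb.shape)  # expected: torch.Size([2, 512])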
from .models import FaceXZooModelFactory
MobileFaceNet:
feat_dim: 512
#out_h: 4
out_h: 7
out_w: 7
# ResNet:
# depth: 152
# drop_ratio: 0.4
# net_mode: ir_se
# feat_dim: 512
# out_h: 7
# out_w: 7
# according to the log of model:
ResNet50_ir:
depth: 50
drop_ratio: 0.4
net_mode: ir
feat_dim: 512
out_h: 7
out_w: 7
# according to the log of model:
ResNet152_irse:
depth: 152
drop_ratio: 0.4
net_mode: ir_se
feat_dim: 512
out_h: 7
out_w: 7
EfficientNet_B0:
width: 1.0
depth: 1.0
image_size: 110
drop_ratio: 0.2
out_h: 7
out_w: 7
feat_dim: 512
HRNet:
NAME: cls_hrnet
out_h: 7
out_w: 7
feat_dim: 512
IMAGE_SIZE:
- 112
- 112
EXTRA:
STAGE1:
NUM_MODULES: 1
NUM_BRANCHES: 1
BLOCK: BOTTLENECK
NUM_BLOCKS:
- 4
NUM_CHANNELS:
- 64
FUSE_METHOD: SUM
STAGE2:
NUM_MODULES: 1
NUM_BRANCHES: 2
BLOCK: BASIC
NUM_BLOCKS:
- 4
- 4
NUM_CHANNELS:
- 18
- 36
FUSE_METHOD: SUM
STAGE3:
NUM_MODULES: 4
NUM_BRANCHES: 3
BLOCK: BASIC
NUM_BLOCKS:
- 4
- 4
- 4
NUM_CHANNELS:
- 18
- 36
- 72
FUSE_METHOD: SUM
STAGE4:
NUM_MODULES: 3
NUM_BRANCHES: 4
BLOCK: BASIC
NUM_BLOCKS:
- 4
- 4
- 4
- 4
NUM_CHANNELS:
- 18
- 36
- 72
- 144
FUSE_METHOD: SUM
GhostNet:
width: 1.0
drop_ratio: 0.2
out_h: 7
out_w: 7
feat_dim: 512
# AttentionNet:
# stage1_modules: 1
# stage2_modules: 2
# stage3_modules: 3
# feat_dim: 512
# out_h: 7
# out_w: 7
# https://github.com/JDAI-CV/FaceX-Zoo/issues/96#issuecomment-929808352
AttentionNet56:
#AttentionNet:
stage1_modules: 1
stage2_modules: 1
stage3_modules: 1
feat_dim: 512
out_h: 7
out_w: 7
# https://github.com/JDAI-CV/FaceX-Zoo/issues/96#issuecomment-929808352
AttentionNet92:
#AttentionNet:
stage1_modules: 1
stage2_modules: 2
stage3_modules: 3
feat_dim: 512
out_h: 7
out_w: 7
TF_NAS_A:
feat_dim: 512
drop_ratio: 0.2
out_h: 7
out_w: 7
ResNeSt50:
depth: 50
drop_ratio: 0.4
feat_dim: 512
out_h: 7
out_w: 7
ReXNet_1:
input_ch: 16
final_ch: 180
width_mult: 1.0
depth_mult: 1.0
use_se: 0
se_ratio: 12
out_h: 7
out_w: 7
feat_dim: 512
dropout_ratio: 0.2
LightCNN29:
depth: 29
out_h: 7
out_w: 7
feat_dim: 512
dropout_ratio: 0.2
# RepVGG:
# blocks1: 4
# blocks2: 6
# blocks3: 16
# blocks4: 1
# width1: 2
# width2: 2
# width3: 2
# width4: 4
# out_h: 7
# out_w: 7
# feat_dim: 512
# according to the log of model:
RepVGG_A0:
blocks1: 2
blocks2: 4
blocks3: 14
blocks4: 1
width1: 0.75
width2: 0.75
width3: 0.75
width4: 2.5
out_h: 7
out_w: 7
feat_dim: 512
# according to the log of model:
RepVGG_B0:
blocks1: 4
blocks2: 6
blocks3: 16
blocks4: 1
width1: 1
width2: 1
width3: 1
width4: 2.5
out_h: 7
out_w: 7
feat_dim: 512
# according to the log of model:
RepVGG_B1:
blocks1: 4
blocks2: 6
blocks3: 16
blocks4: 1
width1: 2
width2: 2
width3: 2
width4: 4
out_h: 7
out_w: 7
feat_dim: 512
# SwinTransformer:
# img_size: 224
# patch_size: 4
# in_chans: 3
# embed_dim: 96
# depths:
# - 2
# - 2
# - 18
# - 2
# num_heads:
# - 3
# - 6
# - 12
# - 24
# window_size: 7
# mlp_ratio: 4.0
# drop_rate: 0.0
# drop_path_rate: 0.3
# according to the log of model:
SwinTransformer_S:
img_size: 224
patch_size: 4
in_chans: 3
embed_dim: 96
depths:
- 2
- 2
- 18
- 2
num_heads:
- 3
- 6
- 12
- 24
window_size: 7
mlp_ratio: 4.0
drop_rate: 0.0
drop_path_rate: 0.3
SwinTransformer_T:
img_size: 224
patch_size: 4
in_chans: 3
embed_dim: 96
depths:
- 2
- 2
- 6
- 2
num_heads:
- 3
- 6
- 12
- 24
window_size: 7
mlp_ratio: 4.0
drop_rate: 0.0
drop_path_rate: 0.2