towhee
/
distill-and-select
copied
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Readme
Files and versions
214 lines
7.6 KiB
214 lines
7.6 KiB
import math
|
|
import torch
|
|
import numpy as np
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
|
|
from .constraints import L2Constrain
|
|
|
|
|
|
class VideoNormalizer(nn.Module):
    """Normalize raw uint8 video frames for an ImageNet-pretrained backbone.

    Scales pixel values from [0, 255] to [0, 1], applies the standard
    ImageNet channel mean/std, and moves channels to dim 1.

    Input:  (N, H, W, C) tensor with values in [0, 255].
    Output: (N, C, H, W) float tensor.
    """

    def __init__(self):
        super(VideoNormalizer, self).__init__()
        # Stored as non-trainable parameters so they follow the module's
        # device/dtype and appear in the state dict.
        self.scale = nn.Parameter(torch.Tensor([255.]), requires_grad=False)
        self.mean = nn.Parameter(torch.Tensor([0.485, 0.456, 0.406]), requires_grad=False)
        self.std = nn.Parameter(torch.Tensor([0.229, 0.224, 0.225]), requires_grad=False)

    def forward(self, video):
        # (x / 255 - mean) / std, broadcasting over the trailing channel dim.
        frames = video.float()
        frames = frames.div(self.scale).sub(self.mean).div(self.std)
        # NHWC -> NCHW for convolutional backbones.
        return frames.permute(0, 3, 1, 2)
|
|
|
|
|
|
class RMAC(nn.Module):
    """Regional max pooling over a convolutional feature map.

    For each scale level ``l`` in ``L``, a grid of square regions is placed
    over the (H, W) plane and each region is max-pooled to a single
    C-dimensional vector; the pooled regions are tiled back into a small
    spatial grid in the returned tensor.

    NOTE(review): the mutable default ``L=[3]`` is never mutated here, so it
    is harmless, but callers should still prefer passing their own list.
    """

    def __init__(self, L=[3]):
        # L: list of scale levels; level l yields roughly l regions along
        # the short side of the feature map.
        super(RMAC,self).__init__()
        self.L = L

    def forward(self, x):
        # x: feature map of shape (N, C, H, W)
        return self.region_pooling(x, L=self.L)

    def region_pooling(self, x, L=[3]):
        ovr = 0.4 # desired overlap of neighboring regions
        steps = torch.Tensor([2, 3, 4, 5, 6, 7]) # possible regions for the long dimension

        W = x.shape[3]
        H = x.shape[2]

        # region side length is governed by the short side of the map
        w = min(W, H)
        w2 = math.floor(w / 2.0 - 1)

        # candidate strides along the long dimension; choose the step count
        # whose resulting region overlap is closest to the target `ovr`
        b = (max(H, W) - w) / (steps - 1)
        (tmp, idx) = torch.min(torch.abs(((w ** 2 - w * b) / w ** 2) - ovr), 0) # steps(idx) regions for long dimension

        # region overplus per dimension
        # (extra regions are added along the longer dimension only)
        Wd = 0
        Hd = 0
        if H < W:
            Wd = idx.item() + 1
        elif H > W:
            Hd = idx.item() + 1

        vecs = []
        for l in L:
            # region side and half-side for this scale level
            wl = math.floor(2 * w / (l + 1))
            wl2 = math.floor(wl / 2 - 1)

            # horizontal stride; b == 0 when a single region spans the axis
            if l + Wd == 1:
                b = 0
            else:
                b = (W - wl) / (l + Wd - 1)
            cenW = torch.floor(wl2 + torch.tensor(range(l - 1 + Wd + 1)) * b) - wl2 # center coordinates
            # vertical stride, same construction as above
            if l + Hd == 1:
                b = 0
            else:
                b = (H - wl) / (l + Hd - 1)
            cenH = torch.floor(wl2 + torch.tensor(range(l - 1 + Hd + 1)) * b) - wl2 # center coordinates

            # max-pool each region to 1x1; regions in the same row are
            # concatenated along dim 3, rows along dim 2
            for i in cenH.long().tolist():
                v = []
                for j in cenW.long().tolist():
                    if wl == 0:
                        # degenerate region size -- skip
                        # NOTE(review): if wl == 0, `v` stays empty and the
                        # torch.cat below receives an empty list; confirm
                        # upstream feature maps are large enough.
                        continue
                    R = x[:, :, i: i+wl, j: j+wl]
                    v.append(F.adaptive_max_pool2d(R, (1, 1)))
                vecs.append(torch.cat(v, dim=3))
        return torch.cat(vecs, dim=2)
|
|
|
|
|
|
class PCA(nn.Module):
    """PCA whitening layer initialised from pretrained statistics.

    Downloads a precomputed mean and eigendecomposition (keys ``mean``,
    ``d`` eigenvalues, ``V`` eigenvectors), keeps the ``n_components``
    largest components, and applies centering, whitening and L2
    normalisation to incoming feature vectors.
    """

    def __init__(self, n_components=None):
        # n_components: number of principal components to keep
        # (None keeps all of them).
        super(PCA, self).__init__()
        pretrained_url = 'http://ndd.iti.gr/visil/pca_resnet50_vcdb_1M.pth'
        white = torch.hub.load_state_dict_from_url(pretrained_url)
        # Select the components with the largest eigenvalues.
        idx = torch.argsort(white['d'], descending=True)[: n_components]
        d = white['d'][idx]
        V = white['V'][:, idx]
        # Whitening matrix: scale each component by 1/sqrt(eigenvalue),
        # with a small epsilon for numerical stability.
        D = torch.diag(1. / torch.sqrt(d + 1e-7))
        self.mean = nn.Parameter(white['mean'], requires_grad=False)
        self.DVt = nn.Parameter(torch.mm(D, V.T).T, requires_grad=False)

    def forward(self, logits):
        # Out-of-place subtraction: the original used `logits -= ...`,
        # which mutated the caller's tensor in place (and would raise on a
        # leaf tensor that requires grad).
        logits = logits - self.mean.expand_as(logits)
        logits = torch.matmul(logits, self.DVt)
        logits = F.normalize(logits, p=2, dim=-1)
        return logits
|
|
|
|
|
|
class Attention(nn.Module):
    """Scalar gating attention over feature vectors.

    Each input vector is multiplied by a scalar weight produced by a learned
    context vector. With ``norm=True`` the weight is an affine-rescaled raw
    projection kept on the unit sphere by an L2 constraint; otherwise it is
    a sigmoid over a tanh-transformed projection.
    """

    def __init__(self, dims, norm=False):
        # dims: dimensionality of the input vectors
        # norm: use the L2-constrained linear variant instead of tanh+sigmoid
        super(Attention, self).__init__()
        self.norm = norm
        # NOTE: submodule creation order is kept identical to preserve
        # parameter registration and RNG consumption order.
        if self.norm:
            self.constrain = L2Constrain()
        else:
            self.transform = nn.Linear(dims, dims)
        self.context_vector = nn.Linear(dims, 1, bias=False)
        self.reset_parameters()

    def forward(self, x):
        if self.norm:
            # Affine rescale of the raw projection: w/2 + 0.5.
            weights = self.context_vector(x).div(2.).add(.5)
        else:
            weights = torch.sigmoid(self.context_vector(torch.tanh(self.transform(x))))
        # Gate the inputs and also expose the weights to the caller.
        return x * weights, weights

    def reset_parameters(self):
        if self.norm:
            nn.init.normal_(self.context_vector.weight)
            self.constrain(self.context_vector)
            return
        nn.init.xavier_uniform_(self.context_vector.weight)
        nn.init.xavier_uniform_(self.transform.weight)
        nn.init.zeros_(self.transform.bias)

    def apply_contraint(self):
        # Name kept as-is (including the typo) for caller compatibility.
        if self.norm:
            self.constrain(self.context_vector)
|
|
|
|
|
|
class BinarizationLayer(nn.Module):
    """Hashing layer mapping L2-normalised inputs to binary codes.

    During training the sign function is relaxed to a scaled ``erf`` so the
    layer remains differentiable; in eval mode it emits hard sign codes.
    """

    def __init__(self, dims, bits=None, sigma=1e-6, ITQ_init=True):
        # dims: input dimensionality
        # bits: code length (defaults to dims when not given)
        # sigma: temperature of the erf relaxation of sign()
        # ITQ_init: load the pretrained ITQ projection instead of random init
        super(BinarizationLayer, self).__init__()
        self.sigma = sigma
        if ITQ_init:
            pretrained_url = 'https://mever.iti.gr/distill-and-select/models/itq_resnet50W_dns100k_1M.pth'
            self.W = nn.Parameter(torch.hub.load_state_dict_from_url(pretrained_url)['proj'])
        else:
            self.W = nn.Parameter(torch.rand(dims, dims if bits is None else bits))

    def forward(self, x):
        # Project the unit-normalised input onto the hashing directions.
        projected = torch.matmul(F.normalize(x, p=2, dim=-1), self.W)
        if self.training:
            # Smooth surrogate of sign() so gradients can flow.
            return torch.erf(projected / np.sqrt(2 * self.sigma))
        return torch.sign(projected)

    def __repr__(self):
        return '{}(dims={}, bits={}, sigma={})'.format(
            self.__class__.__name__, self.W.shape[0], self.W.shape[1], self.sigma)
|
|
|
|
|
|
class NetVLAD(nn.Module):
    """NetVLAD aggregation layer: soft-assigns local features to learned
    centroids and aggregates the residuals into one L2-normalised
    descriptor, optionally projected to ``outdims`` and LayerNorm-ed.

    Acknowledgement to @lyakaap and @Nanne for providing their implementations"""

    def __init__(self, dims, num_clusters, outdims=None):
        # dims: channel dimension C of the input feature map
        # num_clusters: number of VLAD centroids
        # outdims: optional projection size for the flattened VLAD vector
        super(NetVLAD, self).__init__()
        self.num_clusters = num_clusters
        self.dims = dims

        # Centroids scaled by 1/sqrt(dims) at init.
        self.centroids = nn.Parameter(torch.randn(num_clusters, dims) / math.sqrt(self.dims))
        # 1x1 conv produces per-location cluster similarity logits.
        self.conv = nn.Conv2d(dims, num_clusters, kernel_size=1, bias=False)

        if outdims is not None:
            self.outdims = outdims
            self.reduction_layer = nn.Linear(self.num_clusters * self.dims, self.outdims, bias=False)
        else:
            self.outdims = self.num_clusters * self.dims
        self.norm = nn.LayerNorm(self.outdims)
        self.reset_parameters()

    def reset_parameters(self):
        # Initialise the assignment conv from the centroids themselves.
        self.conv.weight = nn.Parameter(self.centroids.detach().clone().unsqueeze(-1).unsqueeze(-1))
        if hasattr(self, 'reduction_layer'):
            nn.init.normal_(self.reduction_layer.weight, std=1 / math.sqrt(self.num_clusters * self.dims))

    def forward(self, x, mask=None):
        # x: (N, C, T, R) feature map -- presumably T time steps x R regions;
        # TODO(review): confirm axis semantics against the caller.
        # mask: optional (N, T) validity mask -- inferred from the unsqueeze
        # pattern below; verify against the caller.
        N, C, T, R = x.shape

        # soft-assignment
        soft_assign = self.conv(x).view(N, self.num_clusters, -1)
        soft_assign = F.softmax(soft_assign, dim=1).view(N, self.num_clusters, T, R)

        x_flatten = x.view(N, C, -1)

        vlad = torch.zeros([N, self.num_clusters, C], dtype=x.dtype, layout=x.layout, device=x.device)
        for cluster in range(self.num_clusters): # slower than non-looped, but lower memory usage
            # Residual of every local feature w.r.t. this centroid,
            # weighted by its soft-assignment to the cluster.
            residual = x_flatten.unsqueeze(0).permute(1, 0, 2, 3) - self.centroids[cluster:cluster + 1, :].\
                expand(x_flatten.size(-1), -1, -1).permute(1, 2, 0).unsqueeze(0)
            residual = residual.view(N, C, T, R)
            residual *= soft_assign[:, cluster:cluster + 1, :]
            if mask is not None:
                # Zero out contributions from masked-out time steps.
                residual = residual.masked_fill((1 - mask.unsqueeze(1).unsqueeze(-1)).bool(), 0.0)
            vlad[:, cluster:cluster+1, :] = residual.sum([-2, -1]).unsqueeze(1)

        vlad = F.normalize(vlad, p=2, dim=2) # intra-normalization
        vlad = vlad.view(x.size(0), -1) # flatten
        vlad = F.normalize(vlad, p=2, dim=1) # L2 normalize

        if hasattr(self, 'reduction_layer'):
            vlad = self.reduction_layer(vlad)
        return self.norm(vlad)
|