# dolg/dolg_impl.py

import timm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class GeM(nn.Module):
    """Generalized-mean (GeM) pooling over the two trailing dimensions.

    Values are clamped from below at ``eps``, raised to the power ``p``,
    averaged over the full extent of the last two dimensions and then raised
    to ``1 / p``.

    Args:
        p: Initial pooling exponent; stored as a one-element ``nn.Parameter``.
        eps: Lower clamp bound applied before the power is taken.
        requires_grad: Whether the exponent ``p`` is trainable.
    """

    def __init__(self, p=3, eps=1e-6, requires_grad=False):
        super().__init__()
        exponent = torch.ones(1) * p
        self.p = nn.Parameter(exponent, requires_grad=requires_grad)
        self.eps = eps

    def forward(self, x):
        """Pool ``x`` using the module's own exponent and clamp bound."""
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        """Return the GeM pooling of ``x``; the last two dimensions become 1x1."""
        height, width = x.size(-2), x.size(-1)
        powered = x.clamp(min=eps).pow(p)
        # A pooling window as large as the map itself is a global average.
        pooled = F.avg_pool2d(powered, (height, width))
        return pooled.pow(1. / p)

    def __repr__(self):
        exponent = self.p.data.tolist()[0]
        return f"{self.__class__.__name__}(p={exponent:.4f}, eps={self.eps!s})"

class MultiAtrous(nn.Module):
    """Parallel dilated 3x3 convolutions plus a global-average-pooling branch.

    Each dilated convolution uses ``padding == dilation`` so it keeps the
    spatial size of its input. The extra branch pools the input to 1x1,
    applies a 1x1 convolution and ReLU, and upsamples the result back to a
    map. All branch outputs are concatenated along the channel dimension.

    Every branch emits ``out_channel // 4`` channels, so with the default three
    dilation rates the concatenated output has ``4 * (out_channel // 4)``
    channels.

    Args:
        in_channel: Number of channels of the input feature map.
        out_channel: Nominal output width; each branch gets a quarter of it.
        size: Side length the pooled branch is upsampled to by ``gap_branch``.
            ``forward`` additionally resizes any branch output whose spatial
            size differs from the input, so inputs that are not exactly
            ``size x size`` are accepted as well.
        dilation_rates: Dilation (and padding) of each 3x3 convolution branch.
    """

    def __init__(self, in_channel, out_channel, size, dilation_rates=(3, 6, 9)):
        super().__init__()
        # Integer division avoids the float round-trip of ``int(x / 4)``.
        branch_channel = int(out_channel // 4)
        branches = [
            nn.Conv2d(in_channel, branch_channel,
                      kernel_size=3, dilation=rate, padding=rate)
            for rate in dilation_rates
        ]
        self.gap_branch = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channel, branch_channel, kernel_size=1),
            nn.ReLU(),
            # align_corners=False is the documented default for bilinear mode;
            # it is spelled out here to make the choice explicit.
            nn.Upsample(size=(size, size), mode='bilinear', align_corners=False),
        )
        branches.append(self.gap_branch)
        self.dilated_convs = nn.ModuleList(branches)

    def forward(self, x):
        """Run every branch on ``x`` and concatenate the results on dim 1."""
        target_size = tuple(x.shape[-2:])
        branch_outputs = []
        for branch in self.dilated_convs:
            out = branch(x)
            # Only the pooled branch can disagree with the input size (it is
            # upsampled to the fixed ``size``). Its map is spatially constant,
            # so resizing it again does not change its values.
            if tuple(out.shape[-2:]) != target_size:
                out = F.interpolate(out, size=target_size,
                                    mode='bilinear', align_corners=False)
            branch_outputs.append(out)
        return torch.cat(branch_outputs, dim=1)


class DolgLocalBranch(nn.Module):
    """Local-feature branch: multi-atrous features gated by an attention map.

    The input is expanded to ``hidden_channel`` channels by ``MultiAtrous``,
    reduced to ``out_channel`` channels by a 1x1 convolution with ReLU, then
    passed through a bias-free 1x1 convolution and batch normalization. An
    attention map (ReLU -> 1x1 convolution -> Softplus) is computed from those
    features and multiplied with their channel-wise L2-normalized version.

    Args:
        img_size: Side length of the network input image; ``MultiAtrous`` is
            configured for a map of side ``int(img_size / 8)``.
        in_channel: Number of channels of the incoming feature map.
        out_channel: Number of channels of the returned local features.
        hidden_channel: Width requested from the multi-atrous stage.
    """

    def __init__(self, img_size, in_channel, out_channel, hidden_channel=2048):
        super().__init__()
        feature_map_size = int(img_size / 8)
        self.multi_atrous = MultiAtrous(
            in_channel, hidden_channel, size=feature_map_size)
        self.conv1x1_1 = nn.Conv2d(hidden_channel, out_channel, kernel_size=1)
        self.conv1x1_2 = nn.Conv2d(out_channel, out_channel, kernel_size=1, bias=False)
        self.conv1x1_3 = nn.Conv2d(out_channel, out_channel, kernel_size=1)

        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(out_channel)
        self.softplus = nn.Softplus()

    def forward(self, x):
        """Return the attention-weighted, L2-normalized local feature map."""
        reduced = self.relu(self.conv1x1_1(self.multi_atrous(x)))
        features = self.bn(self.conv1x1_2(reduced))

        # The attention map is derived from the same batch-normalized features.
        attention = self.softplus(self.conv1x1_3(self.relu(features)))

        unit_features = F.normalize(features, p=2, dim=1)
        return unit_features * attention

class OrthogonalFusion(nn.Module):
    """Fuse a local feature map with a global descriptor.

    At every spatial location the local feature vector is split into its
    projection onto the global descriptor and the remainder orthogonal to it.
    The orthogonal remainder is concatenated (along the channel dimension)
    with the global descriptor broadcast over all locations.

    Args:
        eps: Lower bound for the squared norm of the global descriptor used as
            the projection denominator. It keeps the result finite when the
            descriptor is all zeros; for any descriptor whose squared norm is
            at least ``eps`` it does not alter the denominator.
    """

    def __init__(self, eps=1e-12):
        super().__init__()
        self.eps = eps

    def forward(self, local_feat, global_feat):
        """Return ``cat([global broadcast, orthogonal component], dim=1)``.

        Args:
            local_feat: Tensor of shape ``(B, C, H, W)``.
            global_feat: Tensor of shape ``(B, C)``.

        Returns:
            Tensor of shape ``(B, 2 * C, H, W)``.
        """
        flat_local = torch.flatten(local_feat, start_dim=2)  # (B, C, H*W)

        # Dot product <local, global> per location: (B,1,C) @ (B,C,HW) -> (B,1,HW)
        dots = torch.bmm(global_feat.unsqueeze(1), flat_local)
        # Scale the global vector by each dot product: (B,C,1) @ (B,1,HW) -> (B,C,HW)
        projection = torch.bmm(global_feat.unsqueeze(2), dots).view(local_feat.size())

        # Squared L2 norm of the global descriptor, clamped so an all-zero
        # descriptor yields a zero projection rather than NaN from 0 / 0.
        squared_norm = global_feat.pow(2).sum(dim=1).clamp(min=self.eps)
        projection = projection / squared_norm.view(-1, 1, 1, 1)

        orthogonal_comp = local_feat - projection
        global_map = global_feat.unsqueeze(-1).unsqueeze(-1)
        return torch.cat(
            [global_map.expand(orthogonal_comp.size()), orthogonal_comp], dim=1)

class DolgNet(nn.Module):
    """DOLG embedding network: backbone + local branch + orthogonal fusion.

    A timm ``tv_resnet101`` backbone (``features_only=True``,
    ``out_indices=(2, 3)``) yields two feature maps. The first goes through
    ``DolgLocalBranch``; the second is GeM-pooled and projected by ``fc_1``
    into a global descriptor. Both are combined by ``OrthogonalFusion``,
    average-pooled over space and projected by ``fc_2``.

    The channel counts 512 and 1024 are hard-coded for the two selected
    feature maps of this backbone.

    Args:
        img_size: Side length of the input image, forwarded to the local
            branch (which sizes its pooled branch as ``int(img_size / 8)``).
        input_dim: Number of input image channels (``in_chans`` of the backbone).
        hidden_dim: Width of both the local feature map and the global descriptor.
        output_dim: Size of the returned embedding.
        pretrained: Whether timm should load pretrained backbone weights.
            Defaults to ``True``, matching the previous hard-coded behavior.
    """

    def __init__(self, img_size, input_dim, hidden_dim, output_dim, pretrained=True):
        super().__init__()
        self.cnn = timm.create_model(
            'tv_resnet101',
            pretrained=pretrained,
            features_only=True,
            in_chans=input_dim,
            out_indices=(2, 3)
        )
        self.orthogonal_fusion = OrthogonalFusion()
        self.local_branch = DolgLocalBranch(img_size, 512, hidden_dim)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.gem_pool = GeM()
        self.fc_1 = nn.Linear(1024, hidden_dim)
        # Fusion concatenates the global and orthogonal parts: 2 * hidden_dim.
        self.fc_2 = nn.Linear(2 * hidden_dim, output_dim)
        # NOTE: no training criterion is attached here. The original source
        # carried a commented-out ArcFace head (in_features=output_dim,
        # out_features=num_of_classes, scale_factor=30, margin=0.15,
        # criterion=nn.CrossEntropyLoss()).

    def forward(self, x):
        """Return the embedding of shape ``(B, output_dim)`` for an image batch."""
        features = self.cnn(x)

        local_feat = self.local_branch(features[0])  # (B, hidden_dim, H, W)
        # GeM pooling leaves a 1x1 map; flatten(1) drops only those two dims.
        global_feat = self.fc_1(self.gem_pool(features[1]).flatten(1))  # (B, hidden_dim)

        fused = self.orthogonal_fusion(local_feat, global_feat)  # (B, 2*hidden_dim, H, W)
        # flatten(1) instead of a bare squeeze(): squeeze() would also remove
        # the batch dimension when B == 1 and return a 1-D embedding.
        fused = self.gap(fused).flatten(1)  # (B, 2*hidden_dim)
        return self.fc_2(fused)
init the dolg. Signed-off-by: wxywb <xy.wang@zilliz.com> 4 years ago			`import timm`
			`import torch`
			`import torch.nn as nn`
			`import torch.nn.functional as F`
			`import torch.optim as optim`

			`class GeM(nn.Module):`
			`def __init__(self, p=3, eps=1e-6, requires_grad=False):`
			`super(GeM, self).__init__()`
			`self.p = nn.Parameter(torch.ones(1)*p, requires_grad=requires_grad)`
			`self.eps = eps`

			`def forward(self, x):`
			`return self.gem(x, p=self.p, eps=self.eps)`

			`def gem(self, x, p=3, eps=1e-6):`
			`return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1./p)`

			`def __repr__(self):`
			`return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', ' + 'eps=' + str(self.eps) + ')'`

			`class MultiAtrous(nn.Module):`
			`def __init__(self, in_channel, out_channel, size, dilation_rates=[3, 6, 9]):`
			`super().__init__()`
			`self.dilated_convs = [`
			`nn.Conv2d(in_channel, int(out_channel/4),`
			`kernel_size=3, dilation=rate, padding=rate)`
			`for rate in dilation_rates`
			`]`
			`self.gap_branch = nn.Sequential(`
			`nn.AdaptiveAvgPool2d(1),`
			`nn.Conv2d(in_channel, int(out_channel/4), kernel_size=1),`
			`nn.ReLU(),`
			`nn.Upsample(size=(size, size), mode='bilinear')`
			`)`
			`self.dilated_convs.append(self.gap_branch)`
			`self.dilated_convs = nn.ModuleList(self.dilated_convs)`

			`def forward(self, x):`
			`local_feat = []`
			`for dilated_conv in self.dilated_convs:`
			`local_feat.append(dilated_conv(x))`
			`local_feat = torch.cat(local_feat, dim=1)`
			`return local_feat`


			`class DolgLocalBranch(nn.Module):`
			`def __init__(self, img_size, in_channel, out_channel, hidden_channel=2048):`
			`super().__init__()`
			`self.multi_atrous = MultiAtrous(in_channel, hidden_channel, size=int(img_size/8))`
			`self.conv1x1_1 = nn.Conv2d(hidden_channel, out_channel, kernel_size=1)`
			`self.conv1x1_2 = nn.Conv2d(`
			`out_channel, out_channel, kernel_size=1, bias=False)`
			`self.conv1x1_3 = nn.Conv2d(out_channel, out_channel, kernel_size=1)`

			`self.relu = nn.ReLU()`
			`self.bn = nn.BatchNorm2d(out_channel)`
			`self.softplus = nn.Softplus()`

			`def forward(self, x):`
			`local_feat = self.multi_atrous(x)`

			`local_feat = self.conv1x1_1(local_feat)`
			`local_feat = self.relu(local_feat)`
			`local_feat = self.conv1x1_2(local_feat)`
			`local_feat = self.bn(local_feat)`

			`attention_map = self.relu(local_feat)`
			`attention_map = self.conv1x1_3(attention_map)`
			`attention_map = self.softplus(attention_map)`

			`local_feat = F.normalize(local_feat, p=2, dim=1)`
			`local_feat = local_feat * attention_map`

			`return local_feat`

			`class OrthogonalFusion(nn.Module):`
			`def __init__(self):`
			`super().__init__()`

			`def forward(self, local_feat, global_feat):`
			`global_feat_norm = torch.norm(global_feat, p=2, dim=1)`
			`projection = torch.bmm(global_feat.unsqueeze(1), torch.flatten(`
			`local_feat, start_dim=2))`
			`projection = torch.bmm(global_feat.unsqueeze(`
			`2), projection).view(local_feat.size())`
			`projection = projection / \`
			`(global_feat_norm * global_feat_norm).view(-1, 1, 1, 1)`
			`orthogonal_comp = local_feat - projection`
			`global_feat = global_feat.unsqueeze(-1).unsqueeze(-1)`
			`return torch.cat([global_feat.expand(orthogonal_comp.size()), orthogonal_comp], dim=1)`

			`class DolgNet(nn.Module):`
			`def __init__(self, img_size, input_dim, hidden_dim, output_dim):`
			`super().__init__()`
			`self.cnn = timm.create_model(`
			`'tv_resnet101',`
			`pretrained=True,`
			`features_only=True,`
			`in_chans=input_dim,`
			`out_indices=(2, 3)`
			`)`
			`self.orthogonal_fusion = OrthogonalFusion()`
			`self.local_branch = DolgLocalBranch(img_size, 512, hidden_dim)`
			`self.gap = nn.AdaptiveAvgPool2d(1)`
			`self.gem_pool = GeM()`
			`self.fc_1 = nn.Linear(1024, hidden_dim)`
			`self.fc_2 = nn.Linear(int(2*hidden_dim), output_dim)`
			`#`
			`# self.criterion = ArcFace(`
			`# in_features=output_dim,`
			`# out_features=num_of_classes,`
			`# scale_factor=30,`
			`# margin=0.15,`
			`# criterion=nn.CrossEntropyLoss()`
			`# )`
			`#`
			`def forward(self, x):`
			`output = self.cnn(x)`

			`local_feat = self.local_branch(output[0]) # ,hidden_channel,16,16`
			`global_feat = self.fc_1(self.gem_pool(output[1]).squeeze(3).squeeze(2)) # ,1024`

			`feat = self.orthogonal_fusion(local_feat, global_feat)`
			`feat = self.gap(feat).squeeze()`
			`feat = self.fc_2(feat)`

			`return feat`