# lightningdot/uniter_model/model/layer.py

"""
BERT layers from the huggingface implementation
(https://github.com/huggingface/transformers)
"""
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math

import torch
from torch import nn
# The commented-out import below shows apex's FusedLayerNorm as an alternative
# binding for the ``BertLayerNorm`` name.
#from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm

# Layer-normalisation class used by the layers in this module; currently the
# stock PyTorch implementation.
BertLayerNorm = torch.nn.LayerNorm 


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def gelu(x):
    """Apply the exact (erf-based) Gaussian Error Linear Unit element-wise.

    Computes ``x * Phi(x)`` where ``Phi`` is the standard normal CDF, written
    here as ``0.5 * (1 + erf(x / sqrt(2)))``.

    For information: OpenAI GPT's gelu is a tanh approximation and gives
    slightly different results:
    ``0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))``

    Also see https://arxiv.org/abs/1606.08415
    """
    normal_cdf = 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return x * normal_cdf


def swish(x):
    """Swish activation: multiply each element of ``x`` by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate


# Lookup from activation name to callable; used in this module to resolve a
# string-valued ``config.hidden_act``.
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class GELU(nn.Module):
    """``nn.Module`` wrapper that applies the module-level ``gelu`` function."""

    def forward(self, input_):
        """Return ``gelu(input_)``."""
        return gelu(input_)


class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        config: object exposing ``hidden_size``, ``num_attention_heads`` and
            ``attention_probs_dropout_prob``.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        # Divisibility was checked above, so floor division is exact and
        # avoids the float round-trip of ``int(a / b)``.
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and move the head axis forward.

        The last dimension (``all_head_size``) is viewed as
        ``(num_attention_heads, attention_head_size)`` and dimensions 1 and 2
        of the resulting rank-4 tensor are swapped.
        """
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        """Compute the attention context for ``hidden_states``.

        Args:
            hidden_states: rank-3 tensor whose last dimension is
                ``hidden_size`` (conventionally ``(batch, seq_len, hidden)``).
            attention_mask: optional additive mask that must broadcast
                against the raw score tensor; it is added to the scores
                before the softmax. Presumably 0 for visible positions and a
                large negative value for masked ones (confirm with the
                caller). ``None`` applies no mask.

        Returns:
            Tensor with the same leading dimensions as ``hidden_states`` and
            ``all_head_size`` as its last dimension.
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Take the dot product between "query" and "key" to get the raw
        # attention scores, scaled by sqrt(head size).
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in the
        # BertModel forward() function), if one was given.
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities. The functional
        # form avoids building a new nn.Softmax module on every call.
        attention_probs = torch.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        # Move the head axis back next to the per-head features and merge them.
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        return context_layer.view(*new_context_layer_shape)


class BertSelfOutput(nn.Module):
    """Output projection of the attention sub-layer.

    Applies a ``hidden_size -> hidden_size`` linear layer and dropout, adds
    the residual input, then layer-normalises the sum.

    Args:
        config: object exposing ``hidden_size`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


class BertAttention(nn.Module):
    """Attention sub-layer: self-attention followed by its output block.

    Args:
        config: configuration object forwarded unchanged to
            ``BertSelfAttention`` and ``BertSelfOutput``.
    """

    def __init__(self, config):
        super().__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        """Attend over ``input_tensor`` and combine the result with it.

        ``input_tensor`` is passed both to the self-attention module and, as
        the second argument, to the output block.
        """
        context = self.self(input_tensor, attention_mask)
        return self.output(context, input_tensor)


class BertIntermediate(nn.Module):
    """Feed-forward expansion: linear layer followed by an activation.

    Args:
        config: object exposing ``hidden_size``, ``intermediate_size`` and
            ``hidden_act``. A string ``hidden_act`` is looked up in
            ``ACT2FN``; any other value is used directly as the activation.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        self.intermediate_act_fn = (
            ACT2FN[activation] if isinstance(activation, str) else activation
        )

    def forward(self, hidden_states):
        """Return ``intermediate_act_fn(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))


class BertOutput(nn.Module):
    """Feed-forward contraction with residual connection and layer norm.

    Projects from ``intermediate_size`` back to ``hidden_size``, applies
    dropout, adds the residual input and layer-normalises the sum.

    Args:
        config: object exposing ``intermediate_size``, ``hidden_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, width)
        self.LayerNorm = BertLayerNorm(width, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)


class BertLayer(nn.Module):
    """One encoder layer: attention, intermediate and output blocks in sequence.

    Args:
        config: configuration object forwarded unchanged to
            ``BertAttention``, ``BertIntermediate`` and ``BertOutput``.
    """

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        """Run the layer on ``hidden_states`` using ``attention_mask``.

        The attention output feeds the intermediate block and is also passed
        to the output block as its second argument.
        """
        attended = self.attention(hidden_states, attention_mask)
        expanded = self.intermediate(attended)
        return self.output(expanded, attended)


class BertPooler(nn.Module):
    """Pool a sequence by transforming its first position.

    Args:
        config: object exposing ``hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return ``tanh(dense(hidden_states[:, 0]))``.

        "Pooling" here simply means taking the hidden state at index 0 along
        dimension 1, i.e. the one corresponding to the first token.
        """
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))


class BertPredictionHeadTransform(nn.Module):
    """Linear layer, activation and layer norm applied before the LM decoder.

    Args:
        config: object exposing ``hidden_size`` and ``hidden_act``. A string
            ``hidden_act`` is looked up in ``ACT2FN``; any other value is
            used directly as the activation.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        activation = config.hidden_act
        self.transform_act_fn = (
            ACT2FN[activation] if isinstance(activation, str) else activation
        )
        self.LayerNorm = BertLayerNorm(width, eps=1e-12)

    def forward(self, hidden_states):
        """Return ``LayerNorm(transform_act_fn(dense(hidden_states)))``."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)


class BertLMPredictionHead(nn.Module):
    """Language-model head whose decoder reuses the input embedding weights.

    Args:
        config: configuration object forwarded to
            ``BertPredictionHeadTransform``.
        bert_model_embedding_weights: embedding weight that is assigned
            directly as the decoder weight (the same object, not a copy).
            Its size along dimension 0 is the decoder's output width and its
            size along dimension 1 the input width.
    """

    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)

        out_features = bert_model_embedding_weights.size(0)
        in_features = bert_model_embedding_weights.size(1)

        # The output weights are the same as the input embeddings, but there
        # is an output-only bias for each token.
        self.decoder = nn.Linear(in_features, out_features, bias=False)
        self.decoder.weight = bert_model_embedding_weights
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, hidden_states):
        """Return ``decoder(transform(hidden_states)) + bias``."""
        transformed = self.transform(hidden_states)
        return self.decoder(transformed) + self.bias


class BertOnlyMLMHead(nn.Module):
    """Thin wrapper holding a ``BertLMPredictionHead`` under ``predictions``.

    Args:
        config: configuration object forwarded to ``BertLMPredictionHead``.
        bert_model_embedding_weights: embedding weight forwarded to
            ``BertLMPredictionHead``.
    """

    def __init__(self, config, bert_model_embedding_weights):
        super().__init__()
        self.predictions = BertLMPredictionHead(
            config, bert_model_embedding_weights)

    def forward(self, sequence_output):
        """Return the prediction scores computed by ``self.predictions``."""
        return self.predictions(sequence_output)
update the operator. Signed-off-by: wxywb <xy.wang@zilliz.com> 2 years ago			`"""`
			`BERT layers from the huggingface implementation`
			`(https://github.com/huggingface/transformers)`
			`"""`
			`# coding=utf-8`
			`# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.`
			`# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`import logging`
			`import math`

			`import torch`
			`from torch import nn`
			`#from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm`

			`BertLayerNorm = torch.nn.LayerNorm`


			`logger = logging.getLogger(__name__)`


			`def gelu(x):`
			`"""Implementation of the gelu activation function.`
			`For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):`
			`0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))`
			`Also see https://arxiv.org/abs/1606.08415`
			`"""`
			`return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))`


			`def swish(x):`
			`return x * torch.sigmoid(x)`


			`ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}`


			`class GELU(nn.Module):`
			`def forward(self, input_):`
			`output = gelu(input_)`
			`return output`


			`class BertSelfAttention(nn.Module):`
			`def __init__(self, config):`
			`super(BertSelfAttention, self).__init__()`
			`if config.hidden_size % config.num_attention_heads != 0:`
			`raise ValueError(`
			`"The hidden size (%d) is not a multiple of the number of attention "`
			`"heads (%d)" % (config.hidden_size, config.num_attention_heads))`
			`self.num_attention_heads = config.num_attention_heads`
			`self.attention_head_size = int(config.hidden_size / config.num_attention_heads)`
			`self.all_head_size = self.num_attention_heads * self.attention_head_size`

			`self.query = nn.Linear(config.hidden_size, self.all_head_size)`
			`self.key = nn.Linear(config.hidden_size, self.all_head_size)`
			`self.value = nn.Linear(config.hidden_size, self.all_head_size)`

			`self.dropout = nn.Dropout(config.attention_probs_dropout_prob)`

			`def transpose_for_scores(self, x):`
			`new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)`
			`x = x.view(*new_x_shape)`
			`return x.permute(0, 2, 1, 3)`

			`def forward(self, hidden_states, attention_mask):`
			`mixed_query_layer = self.query(hidden_states)`
			`mixed_key_layer = self.key(hidden_states)`
			`mixed_value_layer = self.value(hidden_states)`

			`query_layer = self.transpose_for_scores(mixed_query_layer)`
			`key_layer = self.transpose_for_scores(mixed_key_layer)`
			`value_layer = self.transpose_for_scores(mixed_value_layer)`

			`# Take the dot product between "query" and "key" to get the raw attention scores.`
			`attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))`
			`attention_scores = attention_scores / math.sqrt(self.attention_head_size)`
			`# Apply the attention mask is (precomputed for all layers in BertModel forward() function)`
			`attention_scores = attention_scores + attention_mask`

			`# Normalize the attention scores to probabilities.`
			`attention_probs = nn.Softmax(dim=-1)(attention_scores)`

			`# This is actually dropping out entire tokens to attend to, which might`
			`# seem a bit unusual, but is taken from the original Transformer paper.`
			`attention_probs = self.dropout(attention_probs)`

			`context_layer = torch.matmul(attention_probs, value_layer)`
			`context_layer = context_layer.permute(0, 2, 1, 3).contiguous()`
			`new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)`
			`context_layer = context_layer.view(*new_context_layer_shape)`
			`return context_layer`


			`class BertSelfOutput(nn.Module):`
			`def __init__(self, config):`
			`super(BertSelfOutput, self).__init__()`
			`self.dense = nn.Linear(config.hidden_size, config.hidden_size)`
			`self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)`
			`self.dropout = nn.Dropout(config.hidden_dropout_prob)`

			`def forward(self, hidden_states, input_tensor):`
			`hidden_states = self.dense(hidden_states)`
			`hidden_states = self.dropout(hidden_states)`
			`hidden_states = self.LayerNorm(hidden_states + input_tensor)`
			`return hidden_states`


			`class BertAttention(nn.Module):`
			`def __init__(self, config):`
			`super(BertAttention, self).__init__()`
			`self.self = BertSelfAttention(config)`
			`self.output = BertSelfOutput(config)`

			`def forward(self, input_tensor, attention_mask):`
			`self_output = self.self(input_tensor, attention_mask)`
			`attention_output = self.output(self_output, input_tensor)`
			`return attention_output`


			`class BertIntermediate(nn.Module):`
			`def __init__(self, config):`
			`super(BertIntermediate, self).__init__()`
			`self.dense = nn.Linear(config.hidden_size, config.intermediate_size)`
			`if isinstance(config.hidden_act, str):`
			`self.intermediate_act_fn = ACT2FN[config.hidden_act]`
			`else:`
			`self.intermediate_act_fn = config.hidden_act`

			`def forward(self, hidden_states):`
			`hidden_states = self.dense(hidden_states)`
			`hidden_states = self.intermediate_act_fn(hidden_states)`
			`return hidden_states`


			`class BertOutput(nn.Module):`
			`def __init__(self, config):`
			`super(BertOutput, self).__init__()`
			`self.dense = nn.Linear(config.intermediate_size, config.hidden_size)`
			`self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)`
			`self.dropout = nn.Dropout(config.hidden_dropout_prob)`

			`def forward(self, hidden_states, input_tensor):`
			`hidden_states = self.dense(hidden_states)`
			`hidden_states = self.dropout(hidden_states)`
			`hidden_states = self.LayerNorm(hidden_states + input_tensor)`
			`return hidden_states`


			`class BertLayer(nn.Module):`
			`def __init__(self, config):`
			`super(BertLayer, self).__init__()`
			`self.attention = BertAttention(config)`
			`self.intermediate = BertIntermediate(config)`
			`self.output = BertOutput(config)`

			`def forward(self, hidden_states, attention_mask):`
			`attention_output = self.attention(hidden_states, attention_mask)`
			`intermediate_output = self.intermediate(attention_output)`
			`layer_output = self.output(intermediate_output, attention_output)`
			`return layer_output`


			`class BertPooler(nn.Module):`
			`def __init__(self, config):`
			`super(BertPooler, self).__init__()`
			`self.dense = nn.Linear(config.hidden_size, config.hidden_size)`
			`self.activation = nn.Tanh()`

			`def forward(self, hidden_states):`
			`# We "pool" the model by simply taking the hidden state corresponding`
			`# to the first token.`
			`first_token_tensor = hidden_states[:, 0]`
			`pooled_output = self.dense(first_token_tensor)`
			`pooled_output = self.activation(pooled_output)`
			`return pooled_output`


			`class BertPredictionHeadTransform(nn.Module):`
			`def __init__(self, config):`
			`super(BertPredictionHeadTransform, self).__init__()`
			`self.dense = nn.Linear(config.hidden_size, config.hidden_size)`
			`if isinstance(config.hidden_act, str):`
			`self.transform_act_fn = ACT2FN[config.hidden_act]`
			`else:`
			`self.transform_act_fn = config.hidden_act`
			`self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)`

			`def forward(self, hidden_states):`
			`hidden_states = self.dense(hidden_states)`
			`hidden_states = self.transform_act_fn(hidden_states)`
			`hidden_states = self.LayerNorm(hidden_states)`
			`return hidden_states`


			`class BertLMPredictionHead(nn.Module):`
			`def __init__(self, config, bert_model_embedding_weights):`
			`super(BertLMPredictionHead, self).__init__()`
			`self.transform = BertPredictionHeadTransform(config)`

			`# The output weights are the same as the input embeddings, but there is`
			`# an output-only bias for each token.`
			`self.decoder = nn.Linear(bert_model_embedding_weights.size(1),`
			`bert_model_embedding_weights.size(0),`
			`bias=False)`
			`self.decoder.weight = bert_model_embedding_weights`
			`self.bias = nn.Parameter(`
			`torch.zeros(bert_model_embedding_weights.size(0)))`

			`def forward(self, hidden_states):`
			`hidden_states = self.transform(hidden_states)`
			`hidden_states = self.decoder(hidden_states) + self.bias`
			`return hidden_states`


			`class BertOnlyMLMHead(nn.Module):`
			`def __init__(self, config, bert_model_embedding_weights):`
			`super(BertOnlyMLMHead, self).__init__()`
			`self.predictions = BertLMPredictionHead(config,`
			`bert_model_embedding_weights)`

			`def forward(self, sequence_output):`
			`prediction_scores = self.predictions(sequence_output)`
			`return prediction_scores`