Source code for deepke.attribution_extraction.standard.module.Transformer

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from .Attention import MultiHeadAttention


def gelu(x):
    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
    return x * torch.sigmoid(x)

ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new}
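
# Illustrative sketch, not part of the original module: ACT2FN lets config.hidden_act
# select one of the activations above by name. The erf-based gelu and the tanh
# approximation gelu_new agree closely, so either name yields essentially the same
# model behaviour. The helper name below is hypothetical.
def _gelu_variants_sketch():
    act = ACT2FN["gelu_new"]                             # lookup by the name stored in config.hidden_act
    x = torch.linspace(-3.0, 3.0, steps=13)
    y = act(x)                                           # same shape as x
    gap = torch.max(torch.abs(gelu(x) - gelu_new(x)))    # small gap between the two gelu variants
    return y, gap
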
class TransformerAttention(nn.Module):
    def __init__(self, config):
        super(TransformerAttention, self).__init__()
        # self.xxx = config.xxx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_heads
        self.dropout = config.dropout
        self.output_attentions = config.output_attentions
        self.layer_norm_eps = config.layer_norm_eps

        self.multihead_attention = MultiHeadAttention(self.hidden_size, self.num_heads, self.dropout,
                                                      self.output_attentions)
        self.dense = nn.Linear(self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.layerNorm = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)

    def forward(self, x, key_padding_mask=None, attention_mask=None, head_mask=None):
        """
        :param x: [B, L, Hs]
        :param key_padding_mask: [B, L] the padded tail of each sentence is filled with zeros; padded positions are True, real tokens are False
        :param attention_mask: [S] / [L, S] mask out the specified positions; positions that are 1/True are masked
        :param head_mask: [L] / [N, L]
        :return:
        """
        attention_outputs = self.multihead_attention(x, x, x, key_padding_mask, attention_mask, head_mask)
        attention_output = attention_outputs[0]
        attention_output = self.dense(attention_output)
        attention_output = self.dropout(attention_output)
        attention_output = self.layerNorm(attention_output + x)
        outputs = (attention_output, ) + attention_outputs[1:]    # remaining entries are the attention weights
        return outputs
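
# Illustrative sketch, not part of the original module: how the inputs to
# TransformerAttention.forward are typically shaped. The helper name and `lengths`
# are hypothetical; only the [B, L, Hs] input and the boolean key_padding_mask
# convention come from the docstring above.
def _attention_mask_sketch(batch_size=2, seq_len=5, hidden_size=8):
    x = torch.randn(batch_size, seq_len, hidden_size)         # [B, L, Hs]
    lengths = torch.tensor([5, 3])                            # real token count per sentence
    positions = torch.arange(seq_len).unsqueeze(0)            # [1, L]
    key_padding_mask = positions >= lengths.unsqueeze(1)      # [B, L], True where padded
    return x, key_padding_mask
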

class TransformerOutput(nn.Module):
    def __init__(self, config):
        super(TransformerOutput, self).__init__()
        # self.xxx = config.xxx
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.dropout = config.dropout
        self.layer_norm_eps = config.layer_norm_eps

        self.zoom_in = nn.Linear(self.hidden_size, self.intermediate_size)
        self.intermediate_act_fn = ACT2FN[config.hidden_act]
        self.zoom_out = nn.Linear(self.intermediate_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.layerNorm = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)

    def forward(self, input_tensor):
        hidden_states = self.zoom_in(input_tensor)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.zoom_out(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.layerNorm(hidden_states + input_tensor)
        return hidden_states
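
# Illustrative sketch, not part of the original module: TransformerOutput is the
# position-wise feed-forward block (Hs -> intermediate_size -> Hs with a residual
# connection and LayerNorm). The SimpleNamespace is a hypothetical stand-in for the
# real config; its attribute names are exactly the ones this file reads.
def _transformer_output_sketch():
    from types import SimpleNamespace
    cfg = SimpleNamespace(hidden_size=8, intermediate_size=32, dropout=0.1,
                          layer_norm_eps=1e-12, hidden_act="gelu")
    ffn = TransformerOutput(cfg)
    x = torch.randn(2, 5, cfg.hidden_size)    # [B, L, Hs]
    out = ffn(x)                              # residual + LayerNorm keep the shape
    return out.shape                          # torch.Size([2, 5, 8])
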

class TransformerLayer(nn.Module):
    def __init__(self, config):
        super(TransformerLayer, self).__init__()
        self.attention = TransformerAttention(config)
        self.output = TransformerOutput(config)

    def forward(self, hidden_states, key_padding_mask=None, attention_mask=None, head_mask=None):
        attention_outputs = self.attention(hidden_states, key_padding_mask, attention_mask, head_mask)
        attention_output = attention_outputs[0]
        layer_output = self.output(attention_output)
        outputs = (layer_output, ) + attention_outputs[1:]
        return outputs

class Transformer(nn.Module):
    def __init__(self, config):
        super(Transformer, self).__init__()
        # self.xxx = config.xxx
        self.num_hidden_layers = config.num_hidden_layers
        self.output_attentions = config.output_attentions
        self.output_hidden_states = config.output_hidden_states

        self.layer = nn.ModuleList([TransformerLayer(config) for _ in range(self.num_hidden_layers)])

    def forward(self, hidden_states, key_padding_mask=None, attention_mask=None, head_mask=None):
        """
        :param hidden_states: [B, L, Hs]
        :param key_padding_mask: [B, S] positions that are 1/True are masked
        :param attention_mask: [S] / [L, S] mask out the specified positions; positions that are 1/True are masked
        :param head_mask: [N] / [L, N] mask out the specified heads; positions that are 1/True are masked
        """
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.expand((self.num_hidden_layers, ) + head_mask.shape)
        else:
            head_mask = [None] * self.num_hidden_layers

        all_hidden_states = ()
        all_attentions = ()
        for i, layer_module in enumerate(self.layer):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states, )

            layer_outputs = layer_module(hidden_states, key_padding_mask, attention_mask, head_mask[i])
            hidden_states = layer_outputs[0]

            if self.output_attentions:
                all_attentions = all_attentions + (layer_outputs[1], )

        # Add last layer
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states, )

        outputs = (hidden_states, )
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states, )
        if self.output_attentions:
            outputs = outputs + (all_attentions, )
        return outputs    # last-layer hidden state, (all hidden states), (all attentions)
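
# Illustrative sketch, not part of the original module: stacking a small Transformer
# and running one forward pass. The SimpleNamespace is a hypothetical stand-in for the
# real config, and this assumes MultiHeadAttention from .Attention accepts the
# constructor/forward arguments used above and returns attention weights when
# output_attentions is True.
def _transformer_forward_sketch():
    from types import SimpleNamespace
    cfg = SimpleNamespace(hidden_size=8, num_heads=2, dropout=0.1,
                          output_attentions=True, output_hidden_states=True,
                          layer_norm_eps=1e-12, intermediate_size=32,
                          hidden_act="gelu", num_hidden_layers=2)
    model = Transformer(cfg)
    x = torch.randn(2, 5, cfg.hidden_size)                    # [B, L, Hs]
    key_padding_mask = torch.zeros(2, 5, dtype=torch.bool)    # no padded positions
    outputs = model(x, key_padding_mask=key_padding_mask)
    # outputs[0]: last-layer hidden state, [B, L, Hs]
    # outputs[1]: tuple of num_hidden_layers + 1 hidden states (output_hidden_states=True)
    # outputs[2]: tuple of per-layer attention weights (output_attentions=True)
    return outputs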