BERT Model Source Code Analysis (Part 10)


to_seq_length: (Optional) If the input is 2D, this might be the seq length
    of the 3D version of the `to_tensor`.

Returns:
  float Tensor of shape [batch_size, from_seq_length,
  num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
  true, this will be of shape [batch_size * from_seq_length,
  num_attention_heads * size_per_head]).

Raises:
  ValueError: Any of the arguments or tensor shapes are invalid.
"""
# Reshape + transpose: rearrange the projection output so per-head
# attention scores can be computed with a single batched matmul.
def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         seq_length, width):
  output_tensor = tf.reshape(
      input_tensor, [batch_size, seq_length, num_attention_heads, width])

  # In tf.transpose, `perm` lists the axes in their new order (for a 3-D
  # array, axis 0 is the outermost dimension, 1 the rows, 2 the columns;
  # perm=[1, 0, 2] would swap the first two axes). Here [0, 2, 1, 3]
  # swaps the sequence axis with the heads axis: [B, F, N, H] -> [B, N, F, H].
  output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
  return output_tensor
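To see what the helper does, it helps to trace a concrete shape. The following standalone sketch (made-up sizes, plain TensorFlow 1.x) feeds it a [B*F, N*H] projection output:

    import tensorflow as tf

    batch_size, seq_length = 2, 128          # B, F
    num_attention_heads, width = 12, 64      # N, H

    # Projection output as produced by the dense layers below: [B*F, N*H]
    flat = tf.zeros([batch_size * seq_length, num_attention_heads * width])

    out = transpose_for_scores(
        flat, batch_size, num_attention_heads, seq_length, width)
    print(out.shape)  # (2, 12, 128, 64), i.e. [B, N, F, H]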
# Get the shapes of the inputs (rank must be 2 or 3).
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

if len(from_shape) != len(to_shape):
  raise ValueError(
      "The rank of `from_tensor` must match the rank of `to_tensor`.")

if len(from_shape) == 3:  # rank-3 input: read the dimensions off the shape
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]
  to_seq_length = to_shape[1]
elif len(from_shape) == 2:  # rank-2 input: the caller must supply them
  if (batch_size is None or from_seq_length is None or to_seq_length is None):
    raise ValueError(
        "When passing in rank 2 tensors to attention_layer, the values "
        "for `batch_size`, `from_seq_length`, and `to_seq_length` "
        "must all be specified.")
# Scalar dimensions referenced here:
#   B = batch size (number of sequences)
#   F = `from_tensor` sequence length
#   T = `to_tensor` sequence length
#   N = `num_attention_heads`
#   H = `size_per_head`
from_tensor_2d = reshape_to_matrix(from_tensor)  # flatten to a rank-2 matrix
to_tensor_2d = reshape_to_matrix(to_tensor)      # flatten to a rank-2 matrix
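`reshape_to_matrix` (also from `modeling.py`) collapses every leading axis into the first one, so a single dense layer can project all positions of all sequences at once; its effect is equivalent to this sketch:

    # [B, F, hidden] -> [B*F, hidden]
    x = tf.zeros([2, 128, 768])
    flat = tf.reshape(x, [-1, 768])  # same result as reshape_to_matrix(x)
    print(flat.shape)  # (256, 768)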
# `query_layer` = [B*F, N*H]: a dense (fully connected) layer projects the
# flattened `from_tensor` into the query space for all heads at once.
query_layer = tf.layers.dense(
    from_tensor_2d,
    num_attention_heads * size_per_head,
    activation=query_act,
    name="query",
    kernel_initializer=create_initializer(initializer_range))
# `key_layer` = [B*T, N*H]: an analogous dense layer (with its own weights)
# projects the flattened `to_tensor` into the key space.
key_layer = tf.layers.dense(
    to_tensor_2d,
    num_attention_heads * size_per_head,
    activation=key_act,
    name="key",
    kernel_initializer=create_initializer(initializer_range))
# `value_layer` = [B*T, N*H]: a third dense layer produces the values.
value_layer = tf.layers.dense(
    to_tensor_2d,
    num_attention_heads * size_per_head,
    activation=value_act,
    name="value",
    kernel_initializer=create_initializer(initializer_range))
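Note that each projection is a single dense layer of width `num_attention_heads * size_per_head`: all heads are computed in one matrix multiply, and the split into heads happens only later in `transpose_for_scores`. A quick numpy sketch (illustrative sizes) shows that the joint kernel is equivalent to concatenating per-head projections:

    import numpy as np

    rng = np.random.RandomState(0)
    hidden = rng.randn(4, 768).astype(np.float32)        # [B*T, hidden]
    kernel = rng.randn(768, 12 * 64).astype(np.float32)  # one joint kernel

    joint = hidden @ kernel  # all 12 heads at once: [4, 768]
    per_head = np.concatenate(  # 64-column slices = individual heads
        [hidden @ kernel[:, h * 64:(h + 1) * 64] for h in range(12)],
        axis=-1)
    assert np.allclose(joint, per_head, atol=1e-4)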
# `query_layer` = [B, N, F, H]: reshape + transpose for score computation
query_layer = transpose_for_scores(query_layer, batch_size,
                                   num_attention_heads, from_seq_length,
                                   size_per_head)

# `key_layer` = [B, N, T, H]: the same rearrangement applied to the keys
key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                 to_seq_length, size_per_head)
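With both tensors in [B, N, seq, H] layout, the next step in the original code takes the raw attention scores for every head with one batched matmul and scales them by 1/sqrt(size_per_head), as in the Transformer paper (this uses `import math`):

    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))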
