  to_seq_length: (Optional) If the input is 2D, this might be the seq length
    of the 3D version of the `to_tensor`.

Returns:
  float Tensor of shape [batch_size, from_seq_length,
  num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
  true, this will be of shape [batch_size * from_seq_length,
  num_attention_heads * size_per_head]).

Raises:
  ValueError: Any of the arguments or tensor shapes are invalid.
"""
# Reshape + transpose, to prepare the tensor for computing attention scores.
def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         seq_length, width):
  output_tensor = tf.reshape(
      input_tensor, [batch_size, seq_length, num_attention_heads, width])

  # The second argument of tf.transpose, `perm`, lists the output order of the
  # input axes. For a 3-D array, axis 0 is the outermost dimension (i.e. the
  # number of 2-D arrays), axis 1 the rows, and axis 2 the columns; for
  # example, tf.transpose(x, perm=[1, 0, 2]) swaps the outermost axis with the
  # rows. Here perm=[0, 2, 1, 3] swaps the sequence and head axes:
  # [B, S, N, W] -> [B, N, S, W].
  output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
  return output_tensor
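To make the axis swap concrete, here is a minimal NumPy sketch with made-up sizes (batch 2, sequence length 4, 3 heads of width 5); np.transpose uses the same `perm` semantics as tf.transpose:

import numpy as np

# Made-up sizes: B=2, S=4, N=3, W=5.
x = np.arange(2 * 4 * 3 * 5).reshape(2 * 4, 3 * 5)  # [B*S, N*W], as the dense layers emit
x = x.reshape(2, 4, 3, 5)                           # [B, S, N, W]
x = x.transpose(0, 2, 1, 3)                         # perm=[0, 2, 1, 3]: swap S and N
print(x.shape)                                      # (2, 3, 4, 5) == [B, N, S, W]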
# Get the shapes of the two input tensors; each must be rank 2 or rank 3.
from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

if len(from_shape) != len(to_shape):
  raise ValueError(
      "The rank of `from_tensor` must match the rank of `to_tensor`.")
if len(from_shape) == 3:  # 3-D tensor: read B, F, T off the shapes
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]
  to_seq_length = to_shape[1]
elif len(from_shape) == 2:  # 2-D tensor: the caller must supply B, F, T
  if (batch_size is None or from_seq_length is None or to_seq_length is None):
    raise ValueError(
        "When passing in rank 2 tensors to attention_layer, the values "
        "for `batch_size`, `from_seq_length`, and `to_seq_length` "
        "must all be specified.")
# Scalar dimensions referenced here:
#   B = batch size (number of sequences)
#   F = `from_tensor` sequence length
#   T = `to_tensor` sequence length
#   N = `num_attention_heads`
#   H = `size_per_head`
from_tensor_2d = reshape_to_matrix(from_tensor)  # flatten to a 2-D matrix
to_tensor_2d = reshape_to_matrix(to_tensor)      # flatten to a 2-D matrix
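`reshape_to_matrix` is defined elsewhere in modeling.py; conceptually it collapses the leading axes so a rank-3 [B, F, W] tensor becomes a rank-2 [B*F, W] matrix. A minimal sketch of that behavior (not necessarily the exact library code):

import tensorflow as tf

def reshape_to_matrix_sketch(input_tensor):
  # Collapse every leading axis into one: [B, F, W] -> [B*F, W].
  width = input_tensor.shape[-1]
  return tf.reshape(input_tensor, [-1, width])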
# `query_layer` = [B*F, N*H]: project `from_tensor` through a dense
# (fully connected) layer.
query_layer = tf.layers.dense(
    from_tensor_2d,
    num_attention_heads * size_per_head,
    activation=query_act,
    name="query",
    kernel_initializer=create_initializer(initializer_range))
# `key_layer` = [B*T, N*H]: project `to_tensor` through a dense layer.
key_layer = tf.layers.dense(
    to_tensor_2d,
    num_attention_heads * size_per_head,
    activation=key_act,
    name="key",
    kernel_initializer=create_initializer(initializer_range))
# `value_layer` = [B*T, N*H]: project `to_tensor` through a dense layer.
value_layer = tf.layers.dense(
    to_tensor_2d,
    num_attention_heads * size_per_head,
    activation=value_act,
    name="value",
    kernel_initializer=create_initializer(initializer_range))
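All three projections share the same output width, num_attention_heads * size_per_head. A tiny TF 1.x sketch with illustrative BERT-base-style sizes (hidden width 768, 12 heads of size 64, not taken from this excerpt) shows the shape each dense layer produces:

import tensorflow as tf  # TF 1.x, matching the tf.layers API used above

x = tf.placeholder(tf.float32, [None, 768])      # stands in for to_tensor_2d, [B*T, W]
k = tf.layers.dense(x, 12 * 64, name="key_demo")
print(k.shape)                                   # (?, 768) == [B*T, N*H]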
# `query_layer` = [B, N, F, H]: reshape + transpose for score computation.
query_layer = transpose_for_scores(query_layer, batch_size,
                                   num_attention_heads, from_seq_length,
                                   size_per_head)
# `key_layer` = [B, N, T, H]: reshape + transpose for score computation.
key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                 to_seq_length, size_per_head)
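This [B, N, seq, H] layout is what makes the score computation a single batched matmul: later in the function (beyond this excerpt), multiplying `query_layer` by the transposed `key_layer` yields the [B, N, F, T] attention-score matrix. A NumPy sketch with made-up sizes:

import numpy as np

B, N, F, T, H = 2, 3, 4, 5, 6                   # made-up sizes
q = np.random.rand(B, N, F, H)                  # query_layer after transpose_for_scores
k = np.random.rand(B, N, T, H)                  # key_layer after transpose_for_scores
scores = np.matmul(q, k.transpose(0, 1, 3, 2))  # [B, N, F, T] raw attention scores
print(scores.shape)                             # (2, 3, 4, 5)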