to_seq_length, size_per_head)
# Take the dot product between "query" and "key" to get the raw
# attention scores.
The query is dotted with every key, giving one raw score per (query position, key position) pair.
# `attention_scores` = [B, N, F, T]
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
attention_scores = tf.multiply(attention_scores,
                               1.0 / math.sqrt(float(size_per_head)))
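Together with the softmax and the multiplication by value_layer further below, these two lines implement the scaled dot-product attention from the paper, with d_k = size_per_head:

Attention(Q, K, V) = softmax(Q * K^T / sqrt(d_k)) * V

The 1/sqrt(d_k) factor keeps the dot products from growing with the head size, which would otherwise push the softmax into a region with very small gradients.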
if attention_mask is not None:  # if an attention mask was provided
  # `attention_mask` = [B, 1, F, T]
  attention_mask = tf.expand_dims(attention_mask, axis=[1])  # add a head dimension
The mask is 1 at the positions we care about and 0 everywhere else; the next line
creates a new tensor that is 0 at the attended positions and -10000 at the masked positions.
  # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
  # masked positions, this operation will create a tensor which is 0.0 for
  # positions we want to attend and -10000.0 for masked positions.
  adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
  # Since we are adding it to the raw scores before the softmax, this is
  # effectively the same as removing these entirely.
  attention_scores += adder
Adding this to the scores before the softmax is effectively the same as removing
the masked positions entirely.
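To see the effect on toy numbers, here is a minimal NumPy sketch (not part of the BERT source) of the additive-mask trick:

import numpy as np

# One query attending over four key positions; the last two are padding.
scores = np.array([2.0, 1.0, 0.5, 0.3])
mask = np.array([1.0, 1.0, 0.0, 0.0])    # 1 = attend, 0 = masked

adder = (1.0 - mask) * -10000.0          # 0.0 for real tokens, -10000.0 for padding
masked_scores = scores + adder

probs = np.exp(masked_scores) / np.exp(masked_scores).sum()
print(probs)  # approximately [0.731, 0.269, 0.0, 0.0]

The masked positions end up with essentially zero probability, which is exactly the "removal" described above.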
Normalizing the attention scores turns them into probabilities (a softmax over the key positions for each query).
# Normalize the attention scores to probabilities.
# `attention_probs` = [B, N, F, T]  (probs = probabilities)
attention_probs = tf.nn.softmax(attention_scores)
This really drops whole tokens that we would attend to, which may look a bit unusual,
but it follows the original Transformer paper.
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = dropout(attention_probs, attention_probs_dropout_prob)
Dropout discards part of the attention weights to help prevent overfitting.
# `value_layer` = [B, T, N, H]
value_layer = tf.reshape(
    value_layer,
    [batch_size, to_seq_length, num_attention_heads, size_per_head])
# `value_layer` = [B, N, T, H]  (the second and third dimensions are swapped: [B, T, N, H] -> [B, N, T, H])
value_layer = tf.transpose(value_layer, [0, 2, 1, 3])
# `context_layer` = [B, N, F, H]
context_layer = tf.matmul(attention_probs, value_layer)
Multiplying the attention matrix by the value matrix yields the context matrix: each output position is a weighted sum of value vectors.
# `context_layer` = [B, F, N, H]  (the context matrix is transposed back)
context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
if do_return_2d_tensor:  # if a 2-D tensor was requested, flatten the context layer to 2-D
  # `context_layer` = [B*F, N*H]
  context_layer = tf.reshape(
      context_layer,
      [batch_size * from_seq_length, num_attention_heads * size_per_head])
else:  # otherwise reshape the context layer into a 3-D tensor
  # `context_layer` = [B, F, N*H]
  context_layer = tf.reshape(
      context_layer,
      [batch_size, from_seq_length, num_attention_heads * size_per_head])
return context_layer
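For orientation, a minimal self-attention call on toy shapes might look like the sketch below. It assumes that modeling.py from google-research/bert is importable and that the argument names match its attention_layer definition; the shapes are illustrative only.

import tensorflow as tf   # TF 1.x, as used by the original BERT code
import modeling           # modeling.py from google-research/bert (assumed importable)

batch_size, seq_length = 2, 8
num_heads, size_per_head = 12, 64        # 12 * 64 = 768

# Self-attention: from_tensor and to_tensor are the same flattened sequence.
input_2d = tf.placeholder(tf.float32,
                          [batch_size * seq_length, num_heads * size_per_head])

context = modeling.attention_layer(
    from_tensor=input_2d,
    to_tensor=input_2d,
    num_attention_heads=num_heads,
    size_per_head=size_per_head,
    do_return_2d_tensor=True,
    batch_size=batch_size,
    from_seq_length=seq_length,
    to_seq_length=seq_length)

# `context` = [B*F, N*H] = [16, 768], i.e. the 2-D branch above.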
The Transformer model
■ The transformer_model function
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
"""Multi-headed, multi-layer Transformer from "Attention is All You Need".
That is, a multi-headed, multi-layer Transformer model built on the idea that "attention is all you need".
This is almost an exact implementation of the original Transformer encoder.
See the original paper (linked below):
https://arxiv.org/abs/1706.03762
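As with attention_layer, a minimal call with the BERT-Base defaults might look like the following sketch (again assuming modeling.py is importable; shapes are illustrative):

import tensorflow as tf   # TF 1.x
import modeling           # modeling.py from google-research/bert (assumed importable)

batch_size, seq_length, hidden_size = 2, 16, 768

input_tensor = tf.placeholder(tf.float32, [batch_size, seq_length, hidden_size])
# [B, F, T] mask of ones: every position may attend to every other position.
attention_mask = tf.ones([batch_size, seq_length, seq_length], dtype=tf.int32)

# BERT-Base: 12 layers, 12 heads, hidden size 768, feed-forward size 3072.
final_layer = modeling.transformer_model(
    input_tensor=input_tensor,
    attention_mask=attention_mask,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072)

# `final_layer` = [batch_size, seq_length, hidden_size]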