我已经在数据集上训练了一个关于语言建模的Transformer模型(即在给定上下文的情况下预测下一个字符)。CONTEXT_LENGTH = 200
,我希望模型预测输入的长度不是CONTEXT_LENGTH
,那么我如何修改我的代码,以便我可以预测不同的输入形状,并帮助我编写生成下一个字符的函数的代码。
class Embed(keras.layers.Layer):
    """Token embedding + learned positional embedding.

    Maps integer token ids of shape (B, T) to (B, T, d_model).

    FIX: the original read ``B, T = inputs.shape`` (the *static* shape).
    Under a dynamic input — ``keras.Input(shape=(None,))`` — the static T is
    ``None`` and ``tf.range(None)`` fails. Reading the runtime length with
    ``tf.shape`` lets this layer handle any T <= CONTEXT_LENGTH, which is
    what enables variable-length inference (positions beyond
    CONTEXT_LENGTH - 1 would index past the positional table).
    """

    def __init__(self):
        super().__init__()
        # (B, T) -> (B, T, d_model)
        self.word_embed = keras.layers.Embedding(VOCAB_SIZE, d_model)
        # position ids 0 .. CONTEXT_LENGTH-1 -> (T, d_model)
        self.position_embed = keras.layers.Embedding(CONTEXT_LENGTH, d_model)

    def call(self, inputs):
        # Dynamic (runtime) sequence length — works for any T, including
        # graphs traced with an unknown static length.
        T = tf.shape(inputs)[1]
        tok_embed = self.word_embed(inputs)           # (B, T, d_model)
        pos_embed = self.position_embed(tf.range(T))  # (T, d_model)
        # (T, d_model) broadcasts over the batch axis.
        return tok_embed + pos_embed                  # (B, T, d_model)

    def get_config(self):
        # No extra constructor args to serialize; sizes come from globals.
        base_config = super().get_config()
        return {**base_config}
class MultiHeadAttention(keras.layers.Layer):
    """Multi-head self-attention with optional causal (look-ahead) mask.

    BUG FIX: the original built ONE (Q, K, V) projection triple and called
    it h times — every "head" computed the same thing, so the layer was a
    single head repeated h times. Each head now owns independent
    projections, as in the Transformer paper.

    Args:
        mask: if True, apply a causal mask so position i only attends to
            positions <= i (required for autoregressive language modeling).
    """

    def __init__(self, mask: bool):
        super().__init__()
        self.mask = mask
        self.linear = keras.layers.Dense(d_model, use_bias=False)  # output projection W^O
        # One independent (W^Q, W^K, W^V) triple per head.
        self.qkv_per_head = [
            (keras.layers.Dense(d_k, use_bias=False),
             keras.layers.Dense(d_k, use_bias=False),
             keras.layers.Dense(d_v, use_bias=False))
            for _ in range(h)
        ]
        # Kept for backward compatibility with code that referenced the old
        # single-triple attribute.
        self.linearqkv = self.qkv_per_head[0]
        self.dropout = keras.layers.Dropout(0.1)

    def attention(self, Q, K, V):
        """Scaled dot-product attention; causal-masked when self.mask."""
        def mask_tensor(x):
            # Lower-triangular ones: keep (i, j) for j <= i, -inf elsewhere
            # so softmax assigns those positions zero weight. Built from
            # ones_like(x), so it follows the runtime (B, T, T) shape.
            tril = tf.experimental.numpy.tril(tf.ones_like(x))
            return tf.where(tril == 0, float('-inf'), x)
        # (B, T, d_k) @ (B, d_k, T) -> (B, T, T), scaled by sqrt(d_k).
        scores = Q @ tf.transpose(K, perm=[0, 2, 1]) / K.shape[-1] ** 0.5
        if self.mask:
            scores = mask_tensor(scores)
        return tf.nn.softmax(scores, axis=-1) @ V  # (B, T, d_v)

    def head(self, X, qkv=None):
        # Backward-compatible default: with no triple given, behaves like
        # the original single-projection head.
        wq, wk, wv = qkv if qkv is not None else self.qkv_per_head[0]
        return self.attention(wq(X), wk(X), wv(X))

    def call(self, X):
        # Each head uses its own projections; concat along the feature axis
        # gives (B, T, h * d_v), then project back to d_model.
        heads = tf.concat([self.head(X, qkv) for qkv in self.qkv_per_head], axis=-1)
        return self.dropout(self.linear(heads))

    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "mask": self.mask}
def FeedForward():
    """Build the position-wise feed-forward sub-block.

    Dense(d_in) -> ReLU -> Dense(d_model) -> Dropout(0.2), applied
    independently at every sequence position.
    """
    block = keras.Sequential()
    block.add(keras.layers.Dense(d_in))
    block.add(keras.layers.ReLU())
    block.add(keras.layers.Dense(d_model))
    block.add(keras.layers.Dropout(0.2))
    return block
# Build the decoder-only Transformer (post-LN residual blocks, N layers).
# NOTE(review): shape=(200,) pins the sequence length to CONTEXT_LENGTH at
# inference time. To accept variable-length input, change this to
# keras.Input(shape=(None,)) AND make Embed.call read the length dynamically
# (tf.shape(inputs)[1] instead of inputs.shape) — with a static read, a None
# length crashes tf.range during graph building.
inputs = keras.Input(shape=(200,))
X = Embed()(inputs)
for _ in range(N):
    # Residual masked self-attention, then residual feed-forward, each
    # followed by LayerNormalization (post-norm Transformer block).
    Z = MultiHeadAttention(mask=True)(X)
    X = keras.layers.LayerNormalization()(Z + X)
    Z = FeedForward()(X)
    X = keras.layers.LayerNormalization()(Z + X)
# Per-position next-character distribution: (B, T, VOCAB_SIZE).
outputs = keras.layers.Dense(VOCAB_SIZE, activation="softmax")(X)
model = keras.Model(inputs=inputs, outputs=outputs, name="transformer")
字符串
我想可能是Embed
层有问题,当添加tok_embed
和pos_embed
时。我认为它可以修改,以便它可以接受不同长度的输入。填充会影响模型性能,那么还有其他方法吗?
请帮帮忙,谢谢。
编辑:训练中没有问题,准确性很好。
1条答案(按热度排序)
我已经改变了代码,所以Transformer模型可以接受不同长度的输入。
字符串
生成时不要使用 model.predict(...),请改用 model(..., training=False),以确保 Dropout 等层处于推理模式并避免逐次调用 predict 的开销。