keras - I have trained a custom Transformer model for language modeling, how do I now use it for prediction?

eblbsuwk  posted 2023-08-06  in: Other

I have trained a Transformer model for language modeling on my dataset (i.e. predicting the next character given a context). The model was built with CONTEXT_LENGTH = 200, but I want it to make predictions on inputs whose length is not CONTEXT_LENGTH. How can I modify my code so that it accepts inputs of a different shape, and could you also help me write the function that generates the next character?

import tensorflow as tf
from tensorflow import keras  # the snippet uses both tf ops and the Keras layer API

class Embed(keras.layers.Layer):
    """word_embedding + positional_embedding """
    def __init__(self):
        super().__init__()
        self.word_embed = keras.layers.Embedding(VOCAB_SIZE, d_model) # (B, T) =(vocab_size, d_model)=> (B, T, d_model)
        self.position_embed = keras.layers.Embedding(CONTEXT_LENGTH, d_model) # (B, T) =(CONTEXT_LENGTH, d_model)=> (B, T, d_model)

    def call(self, inputs):
        B, T = inputs.shape # when training CONTEXT_LENGTH = T
        tok_embed = self.word_embed(inputs) # (B, T, d_model)
        pos_embed = self.position_embed(tf.range(T)) # (T, d_model)
        return  tok_embed + pos_embed # (B, T, d_model)
    
    def get_config(self):
        base_config = super().get_config()
        return {**base_config}

class MultiHeadAttention(keras.layers.Layer):
    def __init__(self, mask: bool):
        super().__init__()
        self.mask = mask
        self.linear = keras.layers.Dense(d_model, use_bias=False)
        self.linearqkv = keras.layers.Dense(d_k, use_bias=False), keras.layers.Dense(d_k, use_bias=False), keras.layers.Dense(d_v, use_bias=False)
        self.dropout = keras.layers.Dropout(0.1)
    
    def attention(self, Q, K, V):
        def mask_tensor(x):
            tril = tf.experimental.numpy.tril(tf.ones_like(x))
            return tf.where(tril==0, float('-inf'), x)
        scores = Q @ tf.transpose(K, perm=[0, 2, 1])/K.shape[-1]**0.5 # (B, T, T)
        scores = mask_tensor(scores) if self.mask else scores
        return tf.nn.softmax(scores, axis=-1) @ V # (B, T, d_v)

    def head(self, X):
        Q, K, V = self.linearqkv[0](X), self.linearqkv[1](X), self.linearqkv[2](X)
        return self.attention(Q, K, V)

    def call(self, X):
        heads = tf.concat([self.head(X) for _ in range(h)], axis=-1)  # note: the same Q/K/V projections are reused, so all h heads compute the same thing
        output = self.linear(heads)
        output = self.dropout(output)
        return output
    
    def get_config(self):
        base_config = super().get_config()
        return {**base_config, "mask": self.mask}

def FeedForward():
    return keras.Sequential([
        keras.layers.Dense(d_in),
        keras.layers.ReLU(),
        keras.layers.Dense(d_model),
        keras.layers.Dropout(0.2)
    ])

inputs = keras.Input(shape=(200,))
X = Embed()(inputs)
for _ in range(N):
    Z = MultiHeadAttention(mask=True)(X)
    X = keras.layers.LayerNormalization()(Z + X)

    Z = FeedForward()(X)
    X = keras.layers.LayerNormalization()(Z + X)
outputs = keras.layers.Dense(VOCAB_SIZE, activation="softmax")(X) # (B, T, VOCAB_SIZE)

model = keras.Model(inputs=inputs, outputs=outputs, name="transformer")

I suspect the problem is in the Embed layer, at the point where tok_embed and pos_embed are added. I think it can be modified so that it accepts inputs of different lengths. Padding the input would hurt model performance, so is there another way?
Any help is appreciated, thanks.
Edit: training itself is fine, the accuracy is good.
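
For context, a minimal sketch of how the failure shows up with the model above. It assumes encode maps a string to a list of token ids, as in the generation code further down; the exact exception depends on the Keras version.

import numpy as np

prompt_ids = np.array(encode("hello"))[np.newaxis]  # shape (1, 5), far shorter than CONTEXT_LENGTH = 200

try:
    model(prompt_ids, training=False)
except Exception as e:
    # Either the Input layer rejects the shape (it expects (None, 200)), or the
    # addition of tok_embed (1, 5, d_model) and pos_embed (200, d_model) fails
    # to broadcast. Both come from the hard-coded sequence length.
    print(type(e).__name__, e)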


6l7fqoea1#

I have changed the code so that the Transformer model can accept inputs of varying length.

def transformer():
    class Embed(keras.layers.Layer):
        """word_embedding + positional_embedding """
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.word_embed = keras.layers.Embedding(VOCAB_SIZE, d_model) # (B, T) =(vocab_size, d_model)=> (B, T, d_model)
            self.position_embed = keras.layers.Embedding(MAX_LENGTH, d_model) # (B, T) =(MAX_LENGTH, d_model)=> (B, T, d_model)

        def call(self, inputs):
            B, T = inputs.shape # T = MAX_LENGTH during training; T must be statically known here, which is why the model is called eagerly at inference (see the note below)
            tok_embed = self.word_embed(inputs) # (B, T, d_model)
            pos_embed = self.position_embed(tf.range(MAX_LENGTH)) # (MAX_LENGTH, d_model) =[:T, :]=> (T, d_model)
            return tok_embed + pos_embed[:T, :] # (B, T, d_model) + (T, d_model) ==> (B, T, d_model)

        def get_config(self):
            base_config = super().get_config()
            return {**base_config}
       
    class MultiHeadAttention(keras.layers.Layer):
        def __init__(self, causal: bool, **kwargs):
            super().__init__(**kwargs)
            self.causal = causal
            self.linear = keras.layers.Dense(d_model, use_bias=False)
            self.linearqkv = [keras.layers.Dense(d_k, use_bias=False),
                              keras.layers.Dense(d_k, use_bias=False),
                              keras.layers.Dense(d_v, use_bias=False)]
            self.dropout = keras.layers.Dropout(0.1)

        def attention(self, Q, K, V):
            def mask_tensor(x):
                tril = tf.experimental.numpy.tril(tf.ones_like(x))
                return tf.where(tril==0, float('-inf'), x)
            scores = Q @ tf.transpose(K, perm=[0, 2, 1])/K.shape[-1]**0.5 # (B, T, T)
            scores = mask_tensor(scores) if self.causal else scores
            return tf.nn.softmax(scores, axis=-1) @ V # (B, T, d_v)

        def head(self, X):
            Q, K, V = self.linearqkv[0](X), self.linearqkv[1](X), self.linearqkv[2](X)
            return self.attention(Q, K, V)

        def call(self, X):
            heads = tf.concat([self.head(X) for _ in range(h)], axis=-1)
            output = self.linear(heads)
            return self.dropout(output)

        def get_config(self):
            base_config = super().get_config()
            return {**base_config, "causal": self.causal}
        
    def FeedForward():
        return keras.Sequential([
            keras.layers.Dense(d_in),
            keras.layers.ReLU(),
            keras.layers.Dense(d_model),
            keras.layers.Dropout(0.1)
        ])
    
    inputs = keras.Input(shape=(None,)) # so can take inputs of varied length
    x = Embed()(inputs)
    for _ in range(N): # transformer's decoder
        z = MultiHeadAttention(causal=True)(x)
        x = keras.layers.LayerNormalization()(keras.layers.Add()([z, x]))

        z = FeedForward()(x)
        x = keras.layers.LayerNormalization()(keras.layers.Add()([z, x]))
    outputs = keras.layers.Dense(VOCAB_SIZE, activation="softmax")(x) # (B, T, VOCAB_SIZE)

    model = keras.Model(inputs=inputs, outputs=outputs, name="transformer")
    print("number of parameters in the model", model.count_params())
    return model

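As a quick sanity check, the rebuilt model can be called directly on batches of two different lengths. This sketch assumes VOCAB_SIZE, MAX_LENGTH and the other hyperparameters (d_model, d_k, d_v, d_in, h, N) are already defined as during training, and uses untrained weights, so only the output shapes matter.

import numpy as np

model = transformer()

short_batch = np.random.randint(0, VOCAB_SIZE, size=(2, 50))         # T = 50
full_batch = np.random.randint(0, VOCAB_SIZE, size=(2, MAX_LENGTH))  # T = MAX_LENGTH

print(model(short_batch, training=False).shape)  # (2, 50, VOCAB_SIZE)
print(model(full_batch, training=False).shape)   # (2, MAX_LENGTH, VOCAB_SIZE)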

Do not use model.predict(...) for generation; call the model directly with model(..., training=False). (predict wraps the forward pass in a tf.function, where the sequence length seen by Embed.call may no longer be a concrete Python int.)

import numpy as np  # used to batch the encoded prompt

def generate_file(prompt: str, num_char: int, temperature=1):
    def next_char(seq):
        # encode the context, add a batch dimension and run the model eagerly
        probs = model(np.array(encode(seq))[np.newaxis], training=False)  # (1, T, VOCAB_SIZE)
        # note: dividing the softmax output by temperature does not change the argmax,
        # so temperature currently has no effect on the generated text
        ids = tf.argmax(probs / temperature, axis=-1)[0].numpy().tolist()  # (T,)
        return sentence(ids)[-1]  # decoded character predicted for the last position

    seq = prompt
    for _ in range(num_char):
        if len(seq) >= MAX_LENGTH:
            seq += next_char(seq[-(MAX_LENGTH-1):])  # last MAX_LENGTH-1 characters so the next char is still within the positional range
        else:
            seq += next_char(seq)
    print(seq)
    with open("machine_generated_text.txt", "w") as f:
        f.write(seq)
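
A hypothetical call, assuming the trained weights are available on disk (the checkpoint path below is a placeholder) and that encode and sentence are the same encode/decode helpers used during training:

model = transformer()
model.load_weights("transformer_weights.h5")  # placeholder path, use your own checkpoint

generate_file("Once upon a time", num_char=500)  # writes the result to machine_generated_text.txt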
