keras: why doesn't an Attention layer improve the model's performance?

0sgqnhkj · asked on 2022-11-13

I am working on a multi-label text classification problem; here is a description of my data:

x_train shape: (8066, 3000)
x_test shape: (1729, 3000)
x_valid shape: (1573, 3000)

I implemented an RCNN model that gives me a MiF (micro-averaged F1) score of 60 on this data, but when I add an attention layer to the RCNN it still gives me the same score of 60 (a minimal sketch of how such a MiF score can be computed follows below). I have tried almost every possible remedy, including tuning the batch size, the number of epochs, dropout, batch normalization, and different hidden-unit sizes, but nothing improves the score of the attention-based model. P.S. I also tried adding the attention layer before the RNN; adding the attention layer either before or after the RNN actually lowers the model's performance.
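(Not part of the original post: a minimal sketch of how a micro-averaged F1 / MiF score can be computed with scikit-learn, assuming the 50 sigmoid outputs are thresholded at 0.5 and that y_valid is a hypothetical binary label matrix with 50 columns.)

# MiF sketch (assumption, not the asker's exact evaluation code)
from sklearn.metrics import f1_score

y_prob = model.predict(x_valid)          # (n_samples, 50) sigmoid probabilities
y_pred = (y_prob >= 0.5).astype(int)     # threshold each label independently
print('MiF: %.2f' % (100 * f1_score(y_valid, y_pred, average='micro')))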
Below I share my model architectures, both without attention and with attention.

Model without attention

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM
maxlen=3000
max_features= 50000
embedding_dims=100
input_length=maxlen

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(maxlen,), dtype="int32")
# Embed each integer in a 100-dimensional vector
# embedding_matrix: a precomputed (max_features x embedding_dims) matrix of pretrained
# word vectors, built elsewhere from the training vocabulary
x = layers.Embedding(max_features,
                            embedding_dims,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)(inputs)
x = layers.Dropout(0.5)(x)

# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)
b = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)

# parallel Conv1D branches with kernel sizes 1-5, each reduced by average and max pooling
c1 = layers.Conv1D(128, 1, activation='relu')(b)
A1 = layers.GlobalAveragePooling1D()(c1)
M1 = layers.GlobalMaxPooling1D()(c1)

c2 = layers.Conv1D(128, 2, activation='relu')(b)
A2 = layers.GlobalAveragePooling1D()(c2)
M2 = layers.GlobalMaxPooling1D()(c2)

c3 = layers.Conv1D(128, 3, activation='relu')(b)
A3 = layers.GlobalAveragePooling1D()(c3)
M3 = layers.GlobalMaxPooling1D()(c3)

c4 = layers.Conv1D(128, 4, activation='relu')(b)
A4 = layers.GlobalAveragePooling1D()(c4)
M4 = layers.GlobalMaxPooling1D()(c4)

c5 = layers.Conv1D(128, 5, activation='relu')(b)
A5 = layers.GlobalAveragePooling1D()(c5)
M5 = layers.GlobalMaxPooling1D()(c5)

concat = layers.concatenate([A1, M1, A2, M2, A3, M3, A4, M4, A5, M5])

outputs = layers.Dense(50, activation="sigmoid")(concat)
model = keras.Model(inputs, outputs)
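(Not part of the original post: a minimal sketch of how this kind of multi-label model is typically compiled and trained; y_train and y_valid are hypothetical binary label matrices with 50 columns, matching the sigmoid output layer.)

# Training sketch (assumption): multi-label targets with sigmoid outputs
# are usually trained with per-label binary cross-entropy.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])
model.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          validation_data=(x_valid, y_valid))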

Attention layer

from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer

def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)

class Attention(Layer):
    def __init__(self,
                 kernel_regularizer=None, bias_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Note: The layer has been tested with Keras 1.x
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(kernel_regularizer)
        self.b_regularizer = regularizers.get(bias_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_attention = return_attention
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.built = True

    def compute_mask(self, inputs, mask=None):
        # do not pass the mask to the next layers
        if self.return_attention:
            return [None, None]
        return None

    def call(self, x, mask=None):
        eij = dot_product(x, self.W)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        result = K.sum(weighted_input, axis=1)

        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        else:
            return input_shape[0], input_shape[-1]

    def get_config(self):
        # serialize the simple constructor flags so the layer can be reloaded;
        # regularizers/constraints are not round-tripped here
        config = super().get_config()
        config.update({'bias': self.bias,
                       'return_attention': self.return_attention})
        return config
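As the docstring notes, the layer is meant to sit directly on top of a recurrent layer that returns the full sequence. A minimal usage sketch in the functional API (a toy wiring under the question's settings, not the asker's full model):

# Usage sketch for the Attention layer above (toy example, not the full RCNN)
seq_in = keras.Input(shape=(maxlen,), dtype="int32")
h = layers.Embedding(max_features, embedding_dims)(seq_in)
h = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(h)   # (batch, steps, 256)
h = Attention()(h)                                                      # (batch, 256)
toy_out = layers.Dense(50, activation="sigmoid")(h)
toy_model = keras.Model(seq_in, toy_out)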

Model with attention

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM
maxlen=3000
max_features= 50000
embedding_dims=100
input_length=maxlen

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(maxlen,), dtype="int32")
# Embed each integer in a 100-dimensional vector
x = layers.Embedding(max_features,
                            embedding_dims,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)(inputs)
x = layers.Dropout(0.5)(x)

# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)
b = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)

# same Conv1D + pooling branches as in the model without attention
c1 = layers.Conv1D(128, 1, activation='relu')(b)
A1 = layers.GlobalAveragePooling1D()(c1)
M1 = layers.GlobalMaxPooling1D()(c1)

c2 = layers.Conv1D(128, 2, activation='relu')(b)
A2 = layers.GlobalAveragePooling1D()(c2)
M2 = layers.GlobalMaxPooling1D()(c2)

c3 = layers.Conv1D(128, 3, activation='relu')(b)
A3 = layers.GlobalAveragePooling1D()(c3)
M3 = layers.GlobalMaxPooling1D()(c3)

c4 = layers.Conv1D(128, 4, activation='relu')(b)
A4 = layers.GlobalAveragePooling1D()(c4)
M4 = layers.GlobalMaxPooling1D()(c4)

c5 = layers.Conv1D(128, 5, activation='relu')(b)
A5 = layers.GlobalAveragePooling1D()(c5)
M5 = layers.GlobalMaxPooling1D()(c5)

concat = layers.concatenate([A1, M1, A2, M2, A3, M3, A4, M4, A5, M5])
# reshape the pooled 1280-dim feature vector to (1, 1280), i.e. a single timestep, before the attention layer
reshape_layer = layers.Reshape([1, 1280])(concat)

att = Attention()(reshape_layer)
batch= layers.BatchNormalization()(att)
outputs = layers.Dense(50, activation="sigmoid")(batch)
model = keras.Model(inputs, outputs)

Please guide me; I would appreciate any help.


ffx8fchx · answer #1

I made some changes. I removed most of it and then added a few things. A bidirectional LSTM performs about the same as a plain LSTM most of the time in classification, so I removed it and added a multi-head attention layer instead. I ran this model and it works fine. One thing to note: your input shape should be (batch-size, sequence-length, embedding-dims), and the embedding dimension should be divisible by the number of heads.

# Here one thing to notice: your data should have the shape -> (batch-size, sequence-length, embedding-dims),
# so `inputs` is assumed to already be a 3D tensor (see the note after this block).
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

x = tf.keras.layers.MultiHeadAttention(8, key_dim=100, dropout=0.1,
                                       kernel_regularizer=tf.keras.regularizers.l1(0.1))(inputs, inputs)
x = layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid')(x)
b = layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid')(x)

c1 = layers.Conv1D(128, 1)(b)
c1 = tf.keras.layers.BatchNormalization()(c1)
c1 = tf.keras.activations.relu(c1)
z1 = tf.keras.layers.add([b , c1])

c2 = layers.Conv1D(128, 1)(z1)
c2 = tf.keras.layers.BatchNormalization()(c2)
c2 = tf.keras.activations.relu(c2)

z2 = tf.keras.layers.add([z1 , c2])

c3 = layers.Conv1D(128, 1)(z2)
c3 = tf.keras.layers.BatchNormalization()(c3)
c3 = tf.keras.activations.relu(c3)

z3 = tf.keras.layers.add([z2 , c3])

c4 = layers.Conv1D(128, 1)(z3)
c4 = tf.keras.layers.BatchNormalization()(c4)
c4 = tf.keras.activations.relu(c4)

z4 = tf.keras.layers.add([z3 , c4])

c5 = layers.Conv1D(128, 1)(z4)
c5 = tf.keras.layers.BatchNormalization()(c5)
c5 = tf.keras.activations.relu(c5)

z5 = tf.keras.layers.add([z4 , c5])

z5 = tf.keras.layers.MaxPooling1D()(z5)
z5 = tf.keras.layers.GlobalAveragePooling1D()(z5)
z5 = tf.keras.layers.Dropout(0.5)(z5)

outputs = tf.keras.layers.Dense(50, activation="sigmoid" , kernel_regularizer=tf.keras.regularizers.l1(0.1))(z5)
model = keras.Model(inputs, outputs)
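(Not part of the answer: one way to define the `inputs` tensor the snippet above expects. Since the model should receive data of shape (batch-size, sequence-length, embedding-dims), `inputs` can simply be a 3D Input layer, with the pretrained embeddings applied as a preprocessing step outside the model; the names below reuse the question's maxlen and embedding_dims.)

# Hypothetical definition of `inputs` for the answer's snippet:
# the model consumes already-embedded sequences.
inputs = keras.Input(shape=(maxlen, embedding_dims), dtype="float32")

Alternatively, one could keep an integer Input plus the frozen Embedding layer from the question and feed its output into the MultiHeadAttention call, building the final keras.Model from that integer Input instead.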
