I am working on a multi-label text classification problem. Here is a description of my data:
x_train shape: (8066, 3000)
x_test shape: (1729, 3000)
x_valid shape: (1573, 3000)
I implemented an RCNN model that gives me a micro-F1 (MiF) score of 60, but when I add an attention layer to the RCNN model it still gives me the same score of 60. I have tried almost every possible fix, including tuning the batch size, number of epochs, dropout, batch normalization, and different hidden-unit sizes, but nothing improves the score of the attention-based model. P.S. I also tried adding the attention layer before the RNN; adding attention either before or after the RNN decreases the model's performance.
Next I will share my model architectures, both without attention and with attention.
Model without attention
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM
maxlen=3000
max_features= 50000
embedding_dims=100
input_length=maxlen
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(maxlen,), dtype="int32")
# Embed each integer in a 100-dimensional vector
x = layers.Embedding(max_features,
                     embedding_dims,
                     weights=[embedding_matrix],
                     input_length=maxlen,
                     trainable=False)(inputs)
x = layers.Dropout(0.5)(x)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)
b = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)
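# Multi-scale Conv1D branches (kernel sizes 1-5) over the BiLSTM outputs,
# each reduced to a fixed-size vector by global average and global max pooling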
c1 = layers.Conv1D(128, 1, activation='relu')(b)
A1 = layers.GlobalAveragePooling1D()(c1)
M1= layers.GlobalMaxPooling1D()(c1)
c2 = layers.Conv1D(128, 2, activation='relu')(b)
A2 = layers.GlobalAveragePooling1D()(c2)
M2= layers.GlobalMaxPooling1D()(c2)
c3 = layers.Conv1D(128, 3, activation='relu')(b)
A3 = layers.GlobalAveragePooling1D()(c3)
M3= layers.GlobalMaxPooling1D()(c3)
c4 = layers.Conv1D(128, 4, activation='relu')(b)
A4 = layers.GlobalAveragePooling1D()(c4)
M4= layers.GlobalMaxPooling1D()(c4)
c5 = layers.Conv1D(128, 5, activation='relu')(b)
A5 = layers.GlobalAveragePooling1D()(c5)
M5 = layers.GlobalMaxPooling1D()(c5)
concat = layers.concatenate([A1, M1, A2, M2, A3, M3, A4, M4, A5, M5])
outputs = layers.Dense(50, activation="sigmoid")(concat)
model = keras.Model(inputs, outputs)
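The compile and evaluation step is not shown above; here is a minimal sketch, assuming binary cross-entropy loss, the Adam optimizer, label matrices y_train/y_valid/y_test, and a 0.5 decision threshold for the micro-F1 (MiF) score:
from sklearn.metrics import f1_score

# assumed multi-label setup: one independent sigmoid per label + binary cross-entropy
model.compile(optimizer="adam", loss="binary_crossentropy")
model.fit(x_train, y_train,
          validation_data=(x_valid, y_valid),
          epochs=10, batch_size=64)

# micro-F1 (MiF), thresholding the sigmoid outputs at an assumed 0.5
y_pred = (model.predict(x_test) >= 0.5).astype("int32")
print("MiF:", f1_score(y_test, y_pred, average="micro"))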
Attention layer
from tensorflow.keras import backend as K
from tensorflow.keras import initializers, regularizers, constraints
from tensorflow.keras.layers import Layer


def dot_product(x, kernel):
    """
    Wrapper for the dot product operation, in order to be compatible with both
    Theano and TensorFlow.
    Args:
        x (): input
        kernel (): weights
    Returns:
        The dot product of the input with the kernel.
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


class Attention(Layer):
    def __init__(self,
                 kernel_regularizer=None, bias_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True,
                 return_attention=False,
                 **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Note: The layer has been tested with Keras 1.x
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(kernel_regularizer)
        self.b_regularizer = regularizers.get(bias_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.return_attention = return_attention
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        self.built = True

    def compute_mask(self, inputs, mask=None):
        # do not pass the mask to the next layers
        if self.return_attention:
            return [None, None]
        return None

    def call(self, x, mask=None):
        # score each time step: e_t = tanh(x_t . W + b_t)
        eij = dot_product(x, self.W)
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        # apply the mask after the exp; the scores are re-normalized next
        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting in Theano
            a *= K.cast(mask, K.floatx())
        # in some cases, especially early in training, the sum may be almost zero
        # and this results in NaNs; add a very small positive epsilon to the sum
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # weight the time steps and sum them into a single feature vector
        a = K.expand_dims(a)
        weighted_input = x * a
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, a]
        return result

    def compute_output_shape(self, input_shape):
        if self.return_attention:
            return [(input_shape[0], input_shape[-1]),
                    (input_shape[0], input_shape[1])]
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        config = super().get_config()
        return config
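As the docstring notes, this layer is meant to sit directly on top of an RNN that returns the full sequence, collapsing (samples, steps, features) to (samples, features). A minimal usage sketch under that assumption, reusing maxlen, max_features and embedding_dims from above:
# attention applied over the time dimension of the BiLSTM outputs
seq_in = keras.Input(shape=(maxlen,), dtype="int32")
h = layers.Embedding(max_features, embedding_dims)(seq_in)
h = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(h)
h = Attention()(h)                                  # -> (batch, 256)
demo_out = layers.Dense(50, activation="sigmoid")(h)
demo_model = keras.Model(seq_in, demo_out)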
Model with attention
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Bidirectional, LSTM
maxlen=3000
max_features= 50000
embedding_dims=100
input_length=maxlen
# Input for variable-length sequences of integers
inputs = keras.Input(shape=(maxlen,), dtype="int32")
# Embed each integer in a 100-dimensional vector
x = layers.Embedding(max_features,
                     embedding_dims,
                     weights=[embedding_matrix],
                     input_length=maxlen,
                     trainable=False)(inputs)
x = layers.Dropout(0.5)(x)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)
b = layers.Bidirectional(layers.LSTM(128, return_sequences=True, recurrent_activation='sigmoid'))(x)
c1 = layers.Conv1D(128, 1, activation='relu')(b)
A1 = layers.GlobalAveragePooling1D()(c1)
M1= layers.GlobalMaxPooling1D()(c1)
c2 = layers.Conv1D(128, 2, activation='relu')(b)
A2 = layers.GlobalAveragePooling1D()(c2)
M2= layers.GlobalMaxPooling1D()(c2)
c3 = layers.Conv1D(128, 3, activation='relu')(b)
A3 = layers.GlobalAveragePooling1D()(c3)
M3= layers.GlobalMaxPooling1D()(c3)
c4 = layers.Conv1D(128, 4, activation='relu')(b)
A4 = layers.GlobalAveragePooling1D()(c4)
M4= layers.GlobalMaxPooling1D()(c4)
c5 = layers.Conv1D(128, 5, activation='relu')(b)
A5 = layers.GlobalAveragePooling1D()(c5)
M5 = layers.GlobalMaxPooling1D()(c5)
concat = layers.concatenate([A1, M1, A2, M2, A3, M3, A4, M4, A5, M5])
reshape_layer = layers.Reshape((1, 1280))(concat)
att = Attention()(reshape_layer)
batch= layers.BatchNormalization()(att)
outputs = layers.Dense(50, activation="sigmoid")(batch)
model = keras.Model(inputs, outputs)
Please guide me; I would appreciate it.
1 Answer
I made some changes: I removed most of the architecture and added a few things. In most classification tasks a bidirectional LSTM performs about the same as a plain LSTM, so I removed it and added a multi-head attention layer instead. I ran this model and it works fine. One thing to keep in mind: the input to the attention layer should have shape (batch size, sequence length, embedding dimension), and the embedding dimension must be divisible by the number of heads.
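The answer's code itself is not included here; the sketch below is only one reading of that description, assuming keras.layers.MultiHeadAttention with 4 heads (embedding_dims = 100 is divisible by 4), key_dim=25, and global average pooling before the output layer:
# no BiLSTM: self-attention directly over the embedded sequence,
# whose shape is (batch size, sequence length, embedding dimension)
inputs = keras.Input(shape=(maxlen,), dtype="int32")
x = layers.Embedding(max_features, embedding_dims,
                     weights=[embedding_matrix],
                     trainable=False)(inputs)                    # (batch, 3000, 100)
x = layers.MultiHeadAttention(num_heads=4, key_dim=25)(x, x)     # 100 dims / 4 heads = 25 per head
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(50, activation="sigmoid")(x)
mha_model = keras.Model(inputs, outputs)
Note that with maxlen = 3000 the full self-attention matrix is large, so a shorter (truncated) sequence length may be needed in practice to keep memory use manageable.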