我正在尝试使用BERT、Transformers和Tensorflow对电影评论进行情感分类。这是我目前拥有的代码:
def read_dataset(filename, model_name="bert-base-uncased"):
"""Reads a dataset from the specified path and returns sentences and labels"""
tokenizer = BertTokenizer.from_pretrained(model_name)
with open(filename, "r", encoding="utf-8") as f:
lines = f.readlines()
# preallocate memory for the data
sents, labels = list(), np.empty((len(lines), 1), dtype=int)
for i, line in enumerate(lines):
text, str_label, _ = line.split("\t")
labels[i] = int(str_label.split("=")[1] == "POS")
sents.append(text)
return dict(tokenizer(sents, padding=True, truncation=True, return_tensors="tf")), labels
class BertMLP(tf.keras.Model):
def __init__(self, embed_batch_size=100, model_name="bert-base-cased"):
super(BertMLP, self).__init__()
self.bs = embed_batch_size
self.model = TFBertModel.from_pretrained(model_name)
self.classification_head = tf.keras.models.Sequential(
layers = [
tf.keras.Input(shape=(self.model.config.hidden_size,)),
tf.keras.layers.Dense(350, activation="tanh"),
tf.keras.layers.Dense(200, activation="tanh"),
tf.keras.layers.Dense(50, activation="tanh"),
tf.keras.layers.Dense(1, activation="sigmoid", use_bias=False)
]
)
def call(self, inputs):
outputs = self.model(inputs)
return outputs
def evaluate(model, inputs, labels, loss_func):
mean_loss = tf.keras.metrics.Mean(name="train_loss")
accuracy = tf.keras.metrics.BinaryAccuracy(name="train_accuracy")
predictions = model(inputs)
mean_loss(loss_func(labels, predictions))
accuracy(labels, predictions)
return mean_loss.result(), accuracy.result() * 100
if __name__ == "__main__":
train = read_dataset("datasets/rt-polarity.train.vecs")
dev = read_dataset("datasets/rt-polarity.dev.vecs")
test = read_dataset("datasets/rt-polarity.test.vecs")
mlp = BertMLP()
mlp.compile(tf.keras.optimizers.SGD(learning_rate=0.01), loss='mse')
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
print("Before training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
mlp.fit(*train, epochs=10, batch_size=10)
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
print("After training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
然而,当我运行这段代码时,我得到一个错误:
Traceback (most recent call last):
File "C:\Users\home\anaconda3\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\users\home\downloads\mlp.py", line 60, in <module>
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
File "c:\users\home\downloads\mlp.py", line 46, in evaluate
predictions = model(inputs)
File "C:\Users\home\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "c:\users\home\downloads\mlp.py", line 39, in call
outputs = self.model(inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
return func(self, **unpacked_inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1108, in call
outputs = self.bert(
File "C:\Users\home\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
return func(self, **unpacked_inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 781, in call
embedding_output = self.embeddings(
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 203, in call
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
InvalidArgumentError: Exception encountered when calling layer "embeddings" (type TFBertEmbeddings).
indices[1174,8] = 29550 is not in [0, 28996) [Op:ResourceGather]
Call arguments received:
• input_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
• position_ids=None
• token_type_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
• inputs_embeds=None
• past_key_values_length=0
• training=False
我在谷歌上搜索了一会儿,没有找到任何确凿的证据,我很确定这和这部分有关:
def call(self, inputs):
outputs = self.model(inputs)
return outputs
但是,我已经尝试了很多不同的方法,包括限制数据集的大小和安装不同版本的Transformers和Tensorflow,但是都没有用。请告诉我我做错了什么。谢谢!
1条答案
按热度按时间8mmmxcuj1#
OP使用
bert-base-cased
作为模型,使用bert-base-uncased
作为标记器,当模型和标记化数据的词汇大小不同时,会在训练过程中产生问题。