我试图建立一个多标签文本分类模型来分类有毒评论。我从这个链接中找到了一篇文章:Multi-label Text Classification with BERT using Pytorch
我还使用了来自Kaggle的数据集:jigsaw-toxic-comment-classification-challenge。
我使用谷歌colab运行我的模型与V100 gpu运行时设置。
不幸的是,经过几个小时的训练(4个epoch),我的f1分数只有0.04214842148421484。我的最终损失分数是0.00354736
我知道损失值和f1得分是两个不同的指标,但就我的理解而言,较低的损失值通常应该对应较高的f1得分。我哪里做错了?
下面是代码:
import torch
import numpy as np
import pandas as pd
import shutil, sys
import transformers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
# Module-level accumulators for validation targets/predictions.
# NOTE(review): these appear unused -- train_model and do_validation
# build their own local lists and never touch these globals.
val_targets=[]
val_outputs=[]
class CustomDataset(Dataset):
    """Wrap a DataFrame of comments for multi-label classification.

    Each item is a dict of tensors: token ids, attention mask,
    token type ids, and the float multi-label target vector taken
    from the DataFrame's ``target_list`` column.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe['comment_text']
        self.targets = self.data['target_list']
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # Positional access (.iloc) instead of label-based [index]:
        # a DataFrame sliced by train_test_split keeps its original
        # index, so label lookups would raise KeyError.
        title = str(self.title.iloc[index])
        # Collapse runs of whitespace; the original
        # `" ".join(title.split(" "))` was effectively a no-op.
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }
class BERTClass(torch.nn.Module):
    """BERT encoder with dropout and a linear head for 6 toxicity labels."""

    def __init__(self):
        super(BERTClass, self).__init__()
        # Attribute names l1/l2/l3 are kept so existing checkpoints
        # (state_dict keys) remain loadable.
        self.l1 = transformers.BertModel.from_pretrained(
            'bert-base-uncased', return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        # return_dict=False yields (sequence_output, pooled_output);
        # keep only the pooled [CLS] representation.
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        dropped = self.l2(pooled)
        # Raw logits -- pair with BCEWithLogitsLoss downstream.
        return self.l3(dropped)
def loss_fn(outputs, targets):
    """Mean sigmoid + binary cross-entropy over raw logits (multi-label)."""
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)
def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a training checkpoint to disk.

    state: checkpoint dict to serialize.
    is_best: when True, also mirror the checkpoint to best_model_path
        (i.e. this epoch achieved the minimum validation loss so far).
    checkpoint_path: destination for the regular checkpoint.
    best_model_path: destination for the best-model copy.
    """
    torch.save(state, checkpoint_path)
    if is_best:
        # The best model is simply a byte-for-byte copy of the checkpoint.
        shutil.copyfile(checkpoint_path, best_model_path)
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore training state from a checkpoint file.

    checkpoint_fpath: path of the checkpoint written by save_ckp.
    model: model whose parameters are loaded in place.
    optimizer: optimizer whose state is loaded in place.
    Returns (model, optimizer, epoch, valid_loss_min).
    """
    ckpt = torch.load(checkpoint_fpath)
    model.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer'])
    # Older checkpoints may hold the loss as a 0-d tensor; normalize to float.
    min_loss = ckpt['valid_loss_min']
    if isinstance(min_loss, torch.Tensor):
        min_loss = min_loss.item()
    return model, optimizer, ckpt['epoch'], min_loss
def train_model(start_epochs, n_epochs, valid_loss_min_input,
                training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Train `model`, validate after each epoch, and checkpoint.

    start_epochs / n_epochs: inclusive epoch range to run.
    valid_loss_min_input: initial best validation loss (e.g. np.inf).
    Returns the trained model. Relies on module-level `device`,
    `loss_fn`, `do_validation` and `save_ckp`.

    Fixes vs. the original:
      * train_loss was an incremental running mean but was divided by
        len(training_loader) again before printing, so the reported
        average training loss was wrong by a factor of the batch count.
      * valid_loss was never accumulated (always 0), so every epoch
        looked like a new "best" and valid_loss_min was meaningless;
        it is now computed from the validation predictions.
      * removed the per-batch debug print of outputs.shape.
    """
    valid_loss_min = valid_loss_min_input
    for epoch in range(start_epochs, n_epochs + 1):
        train_loss = 0.0
        model.train()
        print('############# Epoch {}: Training Start #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            if batch_idx % 100 == 0:
                print(f'Epoch: {epoch}, Training Loss: {loss.item()}')
            loss.backward()
            optimizer.step()
            # Incremental mean: already an average, do NOT divide again later.
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))
        print('############# Epoch {}: Training End #############'.format(epoch))
        print('############# Epoch {}: Validation Start #############'.format(epoch))
        model.eval()
        outputs, targets = do_validation(validation_loader)
        # do_validation returns sigmoid probabilities, so use plain BCE
        # here (BCEWithLogitsLoss would apply sigmoid a second time).
        probs = torch.tensor(outputs, dtype=torch.float)
        labels = torch.tensor(targets, dtype=torch.float)
        valid_loss = torch.nn.functional.binary_cross_entropy(probs, labels).item()
        val_preds = (np.array(outputs) > 0.5).astype(int)
        val_labels = (np.array(targets) > 0.5).astype(int)
        accuracy = metrics.accuracy_score(val_labels, val_preds)
        f1_score_micro = metrics.f1_score(val_labels, val_preds, average='micro')
        f1_score_macro = metrics.f1_score(val_labels, val_preds, average='macro')
        print(f"Accuracy Score = {accuracy}")
        print(f"F1 Score (Micro) = {f1_score_micro}")
        print(f"F1 Score (Macro) = {f1_score_macro}")
        print('############# Epoch {}: Validation End #############'.format(epoch))
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch, train_loss, valid_loss))
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
        }
        # Always save the rolling checkpoint; promote to "best" only on improvement.
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(
                valid_loss_min, valid_loss))
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
        print('############# Epoch {} Done #############\n'.format(epoch))
    return model
def do_validation(dataloader):
    """Run the module-level `model` over `dataloader` without gradients.

    Returns (outputs, targets) where outputs are per-label sigmoid
    probabilities and targets the ground-truth vectors, both as nested
    Python lists. Relies on module-level `model` and `device`.
    """
    model.eval()
    all_targets = []
    all_outputs = []
    with torch.no_grad():
        for data in dataloader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask, token_type_ids)
            all_targets.extend(targets.detach().cpu().numpy().tolist())
            all_outputs.extend(torch.sigmoid(logits).detach().cpu().numpy().tolist())
    return all_outputs, all_targets
if __name__ == '__main__':
    # Pick the training device: prefer CUDA when available.
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")
    # NOTE(review): train_data_location / test_data_location are never
    # defined anywhere in this file -- this raises NameError as written.
    # Assign the jigsaw train/test CSV paths before running.
    train_df = pd.read_csv(train_data_location, on_bad_lines='skip')
    test_df = pd.read_csv(test_data_location, on_bad_lines='skip')
    # Columns after id/comment_text are the per-class toxicity labels.
    select_labels = train_df.columns.values.tolist()[2:]
    train_df['target_list'] = train_df[select_labels].values.tolist()
    # NOTE(review): the Kaggle jigsaw test split contains unscored rows
    # (labels of -1) or lacks label columns entirely -- presumably this
    # is why the validation F1 is near zero. Validate on a held-out
    # slice of train_df (train_test_split is imported but never used).
    test_df['target_list'] = test_df[select_labels].values.tolist()
    MAX_LEN = 64
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 8
    EPOCHS = 10
    LEARNING_RATE = 1e-05
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    training_set = CustomDataset(train_df, tokenizer, MAX_LEN)
    validation_set = CustomDataset(test_df, tokenizer, MAX_LEN)
    train_params = {'batch_size': TRAIN_BATCH_SIZE,
                    'shuffle': True,
                    'num_workers': 0
                    }
    test_params = {'batch_size': VALID_BATCH_SIZE,
                   'shuffle': False,
                   'num_workers': 0
                   }
    training_loader = DataLoader(training_set, **train_params)
    validation_loader = DataLoader(validation_set, **test_params)
    model = BERTClass()
    model.to(device)
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    checkpoint_path = '/content/checkpoints/current_checkpoint.pt'
    best_model = '/content/checkpoints/best_model.pt'
    # np.Inf was removed in NumPy 2.0; np.inf is the supported spelling.
    trained_model = train_model(1, EPOCHS, np.inf, training_loader, validation_loader,
                                model, optimizer, checkpoint_path, best_model)
以下是一条回答:
F1分数是精确度和召回率的调和平均值。它允许程序员在一个数字中看到精确度和召回率。损失分数与其他性能指标不直接相关。
对于多标签文本分类,accuracy, precision, and recall是重要的度量。具体来说,检查你的总体准确率得分,然后是每个类的精确率和召回率得分。在研究和商业目的之外,F1分数并不是特别有用。
至于为什么你一开始就得到了一个很低的F1分数,你有没有split your dataset?我看到你导入了Sklearn的train_test_split库,但你从来没有在代码中调用它。看起来你只是把整个原始数据集传递给你的训练函数。