PyTorch模型在保存并重新加载后准确率几乎为零

egmofgnx  于 2024-01-09  发布在  其他
关注(0)|答案(2)|浏览(238)

我构建了一个PyTorch模型来执行命名实体识别任务。训练40个epoch后,它在验证集上的准确率约为90%,在测试数据上约为87%。然而,当我保存模型并重新加载后,准确率几乎为零。
模型定义如下:

class NERModel(nn.Module):
  """Two-branch (span + sentence) classifier used for named entity recognition.

  Each branch is an embedding followed by a bidirectional LSTM and a small MLP
  head. The two 1280-wide branch outputs are concatenated and sent through a
  final MLP that ends in a sigmoid with ``len(label_idx)`` outputs.

  The class reads the module-level names ``char2idx``, ``EMBEDDING_DIM``,
  ``embed_matrix`` and ``label_idx`` at construction time.
  NOTE(review): the output width and the meaning of every output column depend
  on ``label_idx``; it presumably has to be identical between the session that
  trained a checkpoint and the session that loads it -- confirm.
  """

  def __init__(self):
    super().__init__()

    vocab_size = len(char2idx)
    lstm_hidden = 1800
    # Forward and backward final hidden states are stacked side by side.
    branch_width = 3600

    self.base_count = 128
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()
    # Placeholder width; forward() never calls this layer, but it is a registered
    # sub-module, so its tensors are part of the state_dict.
    self.BatchNorm = nn.BatchNorm1d(num_features=100)

    # Both embeddings start from the same pretrained matrix.
    self.SpanEmbed = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=EMBEDDING_DIM,
        _weight=torch.from_numpy(embed_matrix),
    ).type(torch.float)
    self.SentEmbed = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=EMBEDDING_DIM,
        _weight=torch.from_numpy(embed_matrix),
    ).type(torch.float)

    # Span branch: embedding -> BiLSTM. The LSTM is last in the Sequential, so
    # calling the stack returns the LSTM's (output, (h_n, c_n)) tuple.
    self.Span_Layer_Stack_PreS = nn.Sequential(
        self.SpanEmbed,
        nn.LSTM(
            input_size=EMBEDDING_DIM,
            hidden_size=lstm_hidden,
            bidirectional=True,
            batch_first=True,
        ).type(torch.float),
    )

    # Span head (no batch norm on this branch).
    self.Span_Layer_Stack_PostS = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(in_features=branch_width, out_features=1920),
        nn.ReLU(),
        nn.Linear(in_features=1920, out_features=640),
        nn.ReLU(),
        nn.Linear(in_features=640, out_features=1280),
    )

    # Sentence branch: embedding -> BiLSTM.
    self.Sent_Layer_Stack_PreS = nn.Sequential(
        self.SentEmbed,
        nn.LSTM(
            input_size=EMBEDDING_DIM,
            hidden_size=lstm_hidden,
            bidirectional=True,
            batch_first=True,
        ).type(torch.float),
    )

    # Sentence head: same MLP as the span head, preceded by batch norm.
    self.Sent_Layer_Stack_PostS = nn.Sequential(
        nn.BatchNorm1d(num_features=branch_width),
        nn.Dropout(0.5),
        nn.Linear(in_features=branch_width, out_features=1920),
        nn.ReLU(),
        nn.Linear(in_features=1920, out_features=640),
        nn.ReLU(),
        nn.Linear(in_features=640, out_features=1280),
    )

    # Classifier over the concatenated branch outputs (1280 + 1280 = 2560).
    self.Comb_Layer_Stack = nn.Sequential(
        nn.Linear(in_features=2560, out_features=2048),
        nn.ReLU(),
        nn.Linear(in_features=2048, out_features=1024),
        nn.ReLU(),
        nn.Linear(in_features=1024, out_features=512),
        nn.ReLU(),
        nn.Linear(in_features=512, out_features=512),
        nn.ReLU(),
        nn.Linear(in_features=512, out_features=256),
        nn.ReLU(),
        nn.Linear(in_features=256, out_features=128),
        nn.ReLU(),
        nn.Linear(in_features=128, out_features=len(label_idx)),
        nn.Sigmoid(),
    )

  def forward(self,x:torch.Tensor) -> torch.Tensor:
    """Return per-label sigmoid scores for each row of ``x``.

    ``x`` is split in half along dim 1: the first half feeds the span branch,
    the second half the sentence branch. Both halves are cast to ``long``
    before the embedding lookup.
    """
    half = x.shape[1] // 2
    span_ids, sent_ids = torch.split(x, [half, half], dim=1)

    # Only the final hidden states (h_n) of each BiLSTM are used.
    _, (span_hidden, _) = self.Span_Layer_Stack_PreS(span_ids.type(torch.long))
    _, (sent_hidden, _) = self.Sent_Layer_Stack_PreS(sent_ids.type(torch.long))

    # Put the two directions' final states next to each other.
    span_features = torch.hstack([span_hidden[0], span_hidden[1]])
    sent_features = torch.hstack([sent_hidden[0], sent_hidden[1]])

    span_features = self.Span_Layer_Stack_PostS(span_features.type(torch.float))
    sent_features = self.Sent_Layer_Stack_PostS(sent_features.type(torch.float))

    span_label = self.Comb_Layer_Stack(torch.hstack([span_features, sent_features]))

    return(span_label)

我的训练代码(其中也包含在检查点处保存模型的代码)如下:

def train_model(model:NERModel,num_epochs = 10,lrate = 0.005, save_freq = 2):
  """Train ``model`` and write a checkpoint every ``save_freq`` epochs.

  Uses the module-level ``train_dataloader`` / ``val_dataloader``, ``device``,
  ``get_accuracy`` and ``label_idx``.

  Args:
    model: network to optimise in place.
    num_epochs: number of passes over ``train_dataloader``.
    lrate: initial Adam learning rate; reduced by ReduceLROnPlateau when the
      validation loss stops improving.
    save_freq: save a checkpoint every this many epochs. A falsy value (0 or
      None) disables checkpointing instead of raising.

  Each checkpoint holds the epoch, the model ``state_dict``, the validation
  loss and accuracy as plain floats, and the ``label_idx`` mapping used during
  training.
  """
  torch.manual_seed(42)

  loss_fn = nn.BCELoss()
  opt = torch.optim.Adam(params=[param for param in model.parameters() if param.requires_grad],lr=lrate)
  scheduler = lr_scheduler.ReduceLROnPlateau(opt,mode='min',factor=0.8,patience=2,verbose=True)

  mb = master_bar(range(num_epochs))

  for epoch in mb:

    train_loss = 0.0
    train_acc = 0.0

    model.train()
    print(f'Epoch {epoch + 1}')
    start_time = timer.perf_counter()

    for _, (X,y) in progress_bar(enumerate(train_dataloader),total = len(train_dataloader),parent=mb):

      y_pred = model(X).to(device)

      loss = loss_fn(y_pred.type(torch.float),y.type(torch.float))

      opt.zero_grad()
      loss.backward()
      opt.step()

      # Accumulate a Python float, not the tensor: summing the loss tensor
      # itself keeps autograd history alive across the whole epoch.
      train_loss += loss.item()
      train_acc += get_accuracy(torch.where(y_pred>0.5,1.0,0.0),y)

    model.eval()

    with torch.inference_mode():
      total_val_loss, total_val_acc = 0.0, 0.0
      for X_val,y_val in val_dataloader:
        y_val_pred = model(X_val)
        total_val_loss += loss_fn(y_val_pred.type(torch.float),y_val.type(torch.float)).item()
        total_val_acc += get_accuracy(torch.where(y_val_pred>0.5,1.0,0.0),y_val)
      val_loss = total_val_loss/len(val_dataloader)
      val_acc = total_val_acc/len(val_dataloader)
      scheduler.step(val_loss)
      print('Avg. Training Loss:{:.4f} | Val Loss:{:.4f} | Avg. Training Accuracy:{:.4f} | Val Accuracy:{:.4f}'\
      .format(train_loss/len(train_dataloader),val_loss,train_acc/len(train_dataloader),val_acc))

    end_time = timer.perf_counter()
    print('Epoch {:2d} | Elapsed Time:{:.3f}s'.format(epoch + 1, end_time - start_time))
    mb.write('Epoch {:2d} completed | Elapsed Time:{:.3f}s'.format(epoch + 1, end_time - start_time))

    if save_freq and (epoch + 1) % save_freq == 0:
      filepath = config.environ_path[config.environ]['save'] + "Char_Tok_Models/Models/cp{:02d}.pth".format(epoch + 1)
      print('Saving model to:',filepath)
      torch.save({
              'epoch': epoch,
              'model_state_dict': model.state_dict(),
              # Plain floats keep tensor / NumPy scalar objects out of the
              # checkpoint metadata.
              'loss': float(val_loss),
              'accuracy': float(val_acc),
              # The weights are only meaningful together with the label ->
              # index mapping they were trained with. Storing it lets a later
              # session restore the same numbering instead of regenerating it.
              'label_idx': label_idx,
              }, filepath)

      print('Save completed')

def test_model(model_file:str,d_loader:DataLoader):
  """Load a saved checkpoint and print its average loss and accuracy on ``d_loader``.

  Args:
    model_file: checkpoint file name inside the "Char_Tok_Models/Models/"
      directory under the configured save path.
    d_loader: loader yielding ``(X, y)`` batches to evaluate on.

  The averages are taken over the number of batches in ``d_loader``.
  """
  torch.manual_seed(42)

  # Named ``eval_net`` so the local does not shadow this function's own name.
  eval_net = NERModel().to(device)

  loss_fn = nn.BCELoss()

  # map_location lets a checkpoint saved on one device be loaded on another,
  # matching what init_model does.
  checkpoint = torch.load(config.environ_path[config.environ]['save'] + "Char_Tok_Models/Models/" + model_file,map_location=torch.device(device))

  eval_net.load_state_dict(checkpoint['model_state_dict'])

  eval_net.eval()

  with torch.inference_mode():
    total_test_loss, total_test_acc = 0, 0
    for _,(X_test,y_test) in progress_bar(enumerate(d_loader),total=len(d_loader)):
      y_test_pred = eval_net(X_test)
      total_test_loss += loss_fn(y_test_pred.type(torch.float),y_test.type(torch.float))
      total_test_acc += get_accuracy(torch.where(y_test_pred>0.5,1.0,0.0),y_test)
    # Divide by the loader that was actually iterated, not a global one.
    test_loss = total_test_loss/len(d_loader)
    test_acc = total_test_acc/len(d_loader)
    print('Test Loss:{:.4f} | Test Accuracy:{:.4f}'.format(test_loss,test_acc))


进行预测的代码如下:

def init_model(model_file:str):
  """Create a new NERModel on ``device`` and load the weights stored in ``model_file``.

  ``model_file`` is a file name inside the "Char_Tok_Models/Models/" directory
  under the configured save path. The returned model is left in whatever mode
  a freshly constructed module is in; this function does not call ``eval()``.
  """
  torch.manual_seed(42)

  restored = NERModel().to(device)

  checkpoint_path = config.environ_path[config.environ]['save'] + "Char_Tok_Models/Models/" + model_file
  saved = torch.load(checkpoint_path, map_location=torch.device(device))
  restored.load_state_dict(saved['model_state_dict'])

  return restored

def predict(input_sent:str,model:NERModel):
  """Score every candidate span of ``input_sent`` with ``model``.

  Builds one input row per candidate span (encoded span followed by the encoded
  sentence), runs the model in eval mode without gradients and thresholds the
  scores at 0.5. Prints how many rows went in and how many are left after
  dropping rows whose ``label_idx['']`` column is 1.

  Returns the thresholded prediction tensor for all rows (unfiltered).
  """
  torch.manual_seed(42)

  # Index 2 of each candidate is the item passed on to the encoder.
  span_texts = [candidate[2] for candidate in get_can_spans(input_sent)]
  encoded_spans = pad_encode_text(span_texts)
  encoded_sentence = pad_encode_text([input_sent])[0]

  # Repeat the sentence encoding once per span so rows line up for concatenation.
  sentence_rows = np.repeat([np.array(encoded_sentence)], len(encoded_spans), axis=0)
  span_rows = np.array(encoded_spans)
  pred_input = torch.from_numpy(np.concatenate([span_rows, sentence_rows], axis=1)).to(device)

  model.eval()

  with torch.no_grad():
    pred_out = torch.where(model(pred_input) > 0.5, 1.0, 0.0)

  print('initial outcome length',len(pred_input))

  filt_outcome = [pair for pair in zip(pred_input, pred_out) if pair[1][label_idx['']] != 1]

  print('final outcome length',len(filt_outcome))

  return pred_out


我用来计算准确率指标的函数如下:

def get_accuracy(y_pred,y_true):
  """Return the fraction of rows where ``y_pred`` equals ``y_true`` in every column.

  Both arguments are tensors; they are moved to the CPU and compared as NumPy
  arrays. A row counts as correct only when the number of matching entries
  reaches the column count of ``y_true``.
  """
  predicted = y_pred.cpu().numpy()
  expected = y_true.cpu().numpy()

  matches_per_row = (predicted == expected).sum(axis=1)
  row_is_exact = matches_per_row >= expected.shape[1]

  return row_is_exact.sum() / row_is_exact.shape[0]


我也已经固定了 Torch 的随机种子,但没有效果。可能是什么问题?任何帮助我都将不胜感激。谢谢。
我期望重新加载后的模型能给出与测试期间接近的准确率,但事实并非如此。

ngynwnxp

ngynwnxp1#

您可能需要先把模型输出转换为类别索引,而不是直接使用原始输出:

# torch.argmax operates on tensors, so take the argmax first and convert the
# resulting class indices to a NumPy array afterwards.
y_pred_arr = torch.argmax(y_pred, dim=-1).cpu().numpy()

使用NumPy时做法相同。

2w2cym1i

2w2cym1i2#

解决了。每次我重启内核时,标签编码都会重新生成,每个标签每次都会被分配新的编号。我通过硬编码标签解决了这个问题。是否有更优雅的解决方案?请告诉我。

相关问题