PyTorch: I have already moved the input data to the GPU, but I still can't train the model

h7wcgrx3 asked on 2022-11-09

Error: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same, or input should be a MKLDNN tensor and weight is a dense tensor

I tried checking where the model and the input data live: the model's parameters report device='cuda:0', but when I print get_device() on the inputs I get -1, and I don't know whether that is the problem.
I also tried .to(device) and .cuda(), but neither works.
Here is a link to the code: https://www.kaggle.com/code/dongjj/dataloader/edit/run/106753596 Thanks in advance!!!
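Roughly the check I ran (a minimal sketch with a made-up tensor name, not the exact notebook code):

import torch

x = torch.randn(2, 3)     # a fresh tensor is created on the CPU
print(x.get_device())     # -1: get_device() returns -1 for CPU tensors,
                          # and the GPU index (e.g. 0) for CUDA tensors
print(x.device)           # device(type='cpu')

And this is the training code from the notebook: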

from datetime import datetime

import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

model = base_model
model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

loss_fn = F.cross_entropy
def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    for i, data in enumerate(train_loader):
        # Every data instance is an input + label pair
        inputs, labels = data
        inputs.cuda()
        print(inputs.get_device())
        labels.cuda()

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 5

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    # We don't need gradients on to do reporting
    model.train(False)

    running_vloss = 0.0
    for i, vdata in enumerate(test_loader):
        vinputs, vlabels = vdata
        voutputs = model(vinputs)
        vloss = loss_fn(voutputs, vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    epoch_number += 1

EPOCH 1:
-1
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_17/1115936933.py in <module>
     55     print('EPOCH {}:'.format(epoch_number + 1))
     56     model.train(True)
---> 57     avg_loss = train_one_epoch(epoch_number, writer)
     58 
     59     # We don't need gradients on to do reporting

/tmp/ipykernel_17/1115936933.py in train_one_epoch(epoch_index, tb_writer)
     25 
     26         # Make predictions for this batch
---> 27         outputs = model(inputs)
     28 
     29         # Compute the loss and its gradients

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input,**kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input,**kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/torchvision/models/resnet.py in forward(self, x)
    281 
    282     def forward(self, x: Tensor) -> Tensor:
--> 283         return self._forward_impl(x)
    284 
    285 

/opt/conda/lib/python3.7/site-packages/torchvision/models/resnet.py in _forward_impl(self, x)
    264     def _forward_impl(self, x: Tensor) -> Tensor:
    265         # See note [TorchScript super()]
--> 266         x = self.conv1(x)
    267         x = self.bn1(x)
    268         x = self.relu(x)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _call_impl(self, *input,**kwargs)
   1108         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110             return forward_call(*input,**kwargs)
   1111         # Do not call functions when jit is used
   1112         full_backward_hooks, non_full_backward_hooks = [], []

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/conv.py in forward(self, input)
    445 
    446     def forward(self, input: Tensor) -> Tensor:
--> 447         return self._conv_forward(input, self.weight, self.bias)
    448 
    449 class Conv3d(_ConvNd):

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/conv.py in _conv_forward(self, input, weight, bias)
    442                             _pair(0), self.dilation, self.groups)
    443         return F.conv2d(input, weight, bias, self.stride,
--> 444                         self.padding, self.dilation, self.groups)
    445 
    446     def forward(self, input: Tensor) -> Tensor:

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

3vpjnl9f #1

You are assuming that .cuda() works in place, but it actually returns a copy of the data on the GPU. In other words, you need to reassign inputs and labels:

inputs = inputs.cuda()
labels = labels.cuda()
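Note that Tensor.cuda() / Tensor.to() are out-of-place for tensors, while Module.cuda() / Module.to() move a module's parameters in place, which is why model.cuda() worked but inputs.cuda() silently did nothing useful. The validation loop has the same problem (vinputs and vlabels also stay on the CPU). A minimal sketch of the device-agnostic pattern, reusing the model, optimizer, loss_fn, and loaders from your code:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)          # moves the module's parameters in place

for inputs, labels in train_loader:
    inputs = inputs.to(device)    # .to() returns a new tensor; assign it back
    labels = labels.to(device)
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), labels)
    loss.backward()
    optimizer.step()

with torch.no_grad():             # also saves memory during validation
    for vinputs, vlabels in test_loader:
        vinputs = vinputs.to(device)
        vlabels = vlabels.to(device)
        vloss = loss_fn(model(vinputs), vlabels)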
