pytorch updated: RuntimeError - element 0 of tensors does not require grad and does not have a grad_fn

Asked by dxxyhpgq on 2023-08-05 in: Other

I am trying to re-create a CNN model from Keras and I am running into a RuntimeError at loss.backward().
I have checked the model summary, and the channels and tensor shapes in the summary look fine, but with my limited knowledge I cannot find the error. Could anyone give me a suggestion?
Preprocessing:

class SeqDatasetOHE(Dataset):
    '''
    Dataset for one-hot-encoded sequences
    '''
    def __init__(self,
                 df,
                 seq_col='Seq',
                 target_col='boxcox_exp_mean'
                ):
        # +--------------------+
        # | Get the X examples |
        # +--------------------+
        # extract the DNA from the appropriate column in the df
        self.seqs = list(df[seq_col].values)
        self.seq_len = len(self.seqs[0])

        # one-hot encode sequences, then stack in a torch tensor
        self.ohe_seqs = torch.stack([torch.tensor(one_hot_encode(x)) for x in self.seqs])

        # +------------------+
        # | Get the Y labels |
        # +------------------+
        self.labels = torch.tensor(list(df[target_col].values)).unsqueeze(1)

    def __len__(self): return len(self.seqs)

    def __getitem__(self, idx):
        # Given an index, return a tuple of an X with its associated Y
        # This is called inside DataLoader
        seq = self.ohe_seqs[idx]
        label = self.labels[idx]

        return seq, label
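(one_hot_encode is not shown in the post; a minimal sketch of what such a helper might look like, assuming a plain A/C/G/T alphabet and a seq_len x 4 float output, is below.)

import numpy as np

def one_hot_encode(seq, alphabet='ACGT'):
    # map each base to a one-hot row; unknown bases (e.g. N) stay all-zero
    mapping = {base: i for i, base in enumerate(alphabet)}
    ohe = np.zeros((len(seq), len(alphabet)), dtype=np.float32)
    for i, base in enumerate(seq.upper()):
        j = mapping.get(base)
        if j is not None:
            ohe[i, j] = 1.0
    return ohe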

#### building dataloader - batch size setting
Currently the batch size is 4096 and it is a fastai dataloader.

## construct DataLoaders from Datasets
def build_dataloaders(train_df,
                      test_df,
                      seq_col='Seq',
                      target_col='boxcox_exp_mean',
                      batch_size=512,
                      shuffle=True
                     ):
    # Batch size – the number of samples in each batch.
    # Shuffle – whether you want the data to be reshuffled or not.
    '''
    Given a train and test df with some batch construction
    details, put them into custom SeqDatasetOHE() objects.
    Give the Datasets to the DataLoaders and return.
    '''
    # create Datasets
    train_ds = SeqDatasetOHE(train_df, seq_col=seq_col, target_col=target_col)
    test_ds = SeqDatasetOHE(test_df, seq_col=seq_col, target_col=target_col)

    # Put Datasets into DataLoaders
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=shuffle)
    test_dl = DataLoader(test_ds, batch_size=batch_size)

    return train_dl, test_dl

train_dl, val_dl = build_dataloaders(train_df, val_df)
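As a quick sanity check (a sketch, assuming the DataLoaders built just above), one batch can be pulled to confirm the shapes and dtypes that reach the model:

# pull one batch and inspect it
xb, yb = next(iter(train_dl))
print(xb.shape, xb.dtype)   # expected: (batch_size, seq_len, 4)
print(yb.shape, yb.dtype)   # expected: (batch_size, 1)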

Model:

from typing import List
class DNA_CNN_test2(nn.Module): # deepcre model  
    def __init__(self,
                 seq_len: int =1000,
                 #num_filters: List[int] = [64, 128, 64],
                 kernel_size: int = 8,
                 p = 0.25): # drop out value 
        super().__init__()
        self.seq_len = seq_len
       
        window_size = int(seq_len*(8/3000))
        # CNN module
        self.conv_net = nn.Sequential() # sequential container; the forward() method of Sequential accepts any input and forwards it to the first module it contains
        #num_filters = [4] + num_filters
     
        self.model = nn.Sequential(
            # conv block 1
            nn.Conv1d(4,64,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True), 
            nn.Conv1d(64,64,kernel_size=kernel_size, padding='same'), 
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            # conv block 2
            nn.Conv1d(64,128,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(128,128,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            # conv block 3
            nn.Conv1d(128,64,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.Conv1d(64,64,kernel_size=kernel_size, padding='same'),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=window_size),
            nn.Dropout(p),
            nn.Flatten(),
        
            nn.Linear(64*(seq_len//window_size**3), 1))
            #nn.ReLU(inplace=True),
            #nn.Dropout(p),
            #nn.Linear(128, 64),
            #nn.ReLU(inplace=True),
            #nn.Linear(64*seq_len, 1))

    def forward(self, xb: torch.Tensor):
        """Forward pass."""
        # reshape view to batch_size x 4 channels x seq_len
        # permute to put the channel dimension in the correct order,
        # i.e. (batch_size, 4 channels - OHE(DNA), seq_len)
        xb = xb.permute(0, 2, 1).mean(dim=[1, 2], keepdim=True).squeeze(dim=-1)
        out = self.conv_net(xb)
        return out
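To double-check the layer sizes before training (a debugging sketch, assuming seq_len and device are defined as elsewhere in this post), a dummy batch can be pushed through the model:

# debugging sketch: run a dummy batch through the model
model_check = DNA_CNN_test2(seq_len).to(device)
dummy = torch.randn(2, seq_len, 4, device=device)  # (batch, seq_len, 4), like SeqDatasetOHE returns
print(model_check(dummy).shape)                     # expected: (2, 1) for this regression head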


loss_batch, train and test steps:

# +--------------------------------+
# | Training and fitting functions |
# +--------------------------------+

def loss_batch(model, loss_func, xb, yb, opt=None,verbose=False):
    '''
    Apply loss function to a batch of inputs. If no optimizer
    is provided, skip the back prop step.
    '''
    if verbose:
        print('loss batch ****')
        print("xb shape:",xb.shape)
        print("yb shape:",yb.shape)
        print("yb shape:",yb.squeeze(1).shape)
        #print("yb",yb)

    # get the batch output from the model given your input batch 
    # ** This is the model's prediction for the y labels! **
    xb_out = model(xb.float())
    
    if verbose:
        print("model out pre loss", xb_out.shape)
        #print('xb_out', xb_out)
        print("xb_out:",xb_out.shape)
        print("yb:",yb.shape)
        print("yb.long:",yb.long().shape)
    
    loss = loss_func(xb_out, yb.float()) # for MSE/regression
    # __FOOTNOTE 2__
    
    if opt is not None: # if opt
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)

def train_step(model, train_dl, loss_func, device, opt):
    '''
    Execute 1 set of batched training within an epoch
    '''
    # Set model to Training mode
    model.train()
    tl = [] # train losses
    ns = [] # batch sizes, n
    
    # loop through train DataLoader
    for xb, yb in train_dl:
        # put on GPU
        xb, yb = xb.to(device),yb.to(device)
        
        # provide opt so backprop happens
        t, n = loss_batch(model, loss_func, xb, yb, opt=opt)
        
        # collect train loss and batch sizes
        tl.append(t)
        ns.append(n)
    
    # average the losses over all batches    
    train_loss = np.sum(np.multiply(tl, ns)) / np.sum(ns)
    
    return train_loss

def val_step(model, val_dl, loss_func, device):
    '''
    Execute 1 set of batched validation within an epoch
    '''
    # Set model to Evaluation mode
    model.eval()
    with torch.no_grad():
        vl = [] # val losses
        ns = [] # batch sizes, n
        
        # loop through validation DataLoader
        for xb, yb in val_dl:
            # put on GPU
            xb, yb = xb.to(device),yb.to(device)

            # Do NOT provide opt here, so backprop does not happen
            v, n = loss_batch(model, loss_func, xb, yb)

            # collect val loss and batch sizes
            vl.append(v)
            ns.append(n)

    # average the losses over all batches
    val_loss = np.sum(np.multiply(vl, ns)) / np.sum(ns)
    
    return val_loss

def fit(epochs, model, loss_func, opt, train_dl, val_dl, device, patience=1000):
    '''
    Fit the model params to the training data, eval on unseen data.
    Loop for a number of epochs and keep track of train and val losses
    along the way
    '''
    # keep track of losses
    train_losses = []    
    val_losses = []
    
    # loop through epochs
    for epoch in range(epochs):
        # take a training step
        train_loss = train_step(model,train_dl,loss_func,device,opt)
        train_losses.append(train_loss)

        # take a validation step
        val_loss = val_step(model,val_dl,loss_func,device)
        val_losses.append(val_loss)
        
        print(f"E{epoch} | train loss: {train_loss:.3f} | val loss: {val_loss:.3f}")

    return train_losses, val_losses

def run_model(train_dl,val_dl,model,device,
              lr=1e-2, epochs=50, 
              lossf=None,opt=None
             ):
    '''
    Given train and val DataLoaders and a NN model, fit the model to the training
    data. By default, use MSE loss and an SGD optimizer
    '''
    # define optimizer
    if opt:
        optimizer = opt
    else: # if no opt provided, just use SGD
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    
    # define loss function
    if lossf:
        loss_func = lossf
    else: # if no loss function provided, just use MSE
        loss_func = torch.nn.MSELoss()
    
    # run the training loop
    train_losses, val_losses = fit(
                                epochs, 
                                model, 
                                loss_func, 
                                optimizer, 
                                train_dl, 
                                val_dl, 
                                device)

    return train_losses, val_losses
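Before calling run_model, one way to see where the graph breaks is to replay a single loss_batch step by hand and inspect requires_grad / grad_fn (a diagnostic sketch, assuming the DataLoaders and model class defined above and an MSE loss):

# diagnostic sketch: check that the loss is connected to the autograd graph
model = DNA_CNN_test2(seq_len).to(device)
loss_func = torch.nn.MSELoss()
xb, yb = next(iter(train_dl))
xb, yb = xb.to(device), yb.to(device)
out = model(xb.float())
loss = loss_func(out, yb.float())
print(out.requires_grad, out.grad_fn)    # should be True and a non-None grad_fn
print(loss.requires_grad, loss.grad_fn)  # False / None here is exactly what makes backward() raise this error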


Running the model:

# deeper CNN model from paper 
DNA_CNN_test2 = DNA_CNN_test2(seq_len)
DNA_CNN_test2.to(device)

DNA_CNN_test2_train_losses_lr4, DNA_CNN_test2_val_losses_lr4 = run_model(
    train_dl, 
    val_dl, 
    DNA_CNN_test2,
    device,
    epochs=100,
    lr= 1e-2
)


Error output:

RuntimeError                              Traceback (most recent call last)
Cell In[51], line 5
      2 DNA_CNN_test2 = DNA_CNN_test2(seq_len)
      3 DNA_CNN_test2.to(device)
----> 5 DNA_CNN_test2_train_losses_lr4, DNA_CNN_test2_val_losses_lr4 = run_model(
      6     train_dl, 
      7     val_dl, 
      8     DNA_CNN_test2,
      9     device,
     10     epochs=100,
     11     lr= 1e-2
     12 )

Cell In[42], line 139, in run_model(train_dl, val_dl, model, device, lr, epochs, lossf, opt)
    136     loss_func = torch.nn.MSELoss()
    138 # run the training loop
--> 139 train_losses, val_losses = fit(
    140                             epochs, 
    141                             model, 
    142                             loss_func, 
    143                             optimizer, 
    144                             train_dl, 
    145                             val_dl, 
    146                             device)
    148 return train_losses, val_losses

Cell In[42], line 106, in fit(epochs, model, loss_func, opt, train_dl, val_dl, device, patience)
    103 # loop through epochs
    104 for epoch in range(epochs):
    105     # take a training step
--> 106     train_loss = train_step(model,train_dl,loss_func,device,opt)
    107     train_losses.append(train_loss)
    109     # take a validation step

Cell In[42], line 54, in train_step(model, train_dl, loss_func, device, opt)
     51 xb, yb = xb.to(device),yb.to(device)
     53 # provide opt so backprop happens
---> 54 t, n = loss_batch(model, loss_func, xb, yb, opt=opt)
     56 # collect train loss and batch sizes
     57 tl.append(t)

Cell In[42], line 32, in loss_batch(model, loss_func, xb, yb, opt, verbose)
     29 # __FOOTNOTE 2__
     31 if opt is not None: # if opt
---> 32     loss.backward()
     33     opt.step()
     34     opt.zero_grad()

File /mnt/biostat/environments/parkj/dna2rna/lib/python3.11/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    477 if has_torch_function_unary(self):
    478     return handle_torch_function(
    479         Tensor.backward,
    480         (self,),
   (...)
    485         inputs=inputs,
    486     )
--> 487 torch.autograd.backward(
    488     self, gradient, retain_graph, create_graph, inputs=inputs
    489 )

File /mnt/biostat/environments/parkj/dna2rna/lib/python3.11/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    195     retain_graph = create_graph
    197 # The reason we repeat same the comment below is that
    198 # some Python versions print out the first line of a multi-line function
    199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202     allow_unreachable=True, accumulate_grad=True)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Answer 1, by uyhoqukh:

Many things could be the problem here, but without the data we cannot debug much further.
1. You can create a new variable with requires_grad=True (see also the note after this list). For example:

var_xs_h = Variable(xs_h.data, requires_grad=True)

2. If the code is wrapped in:

with torch.no_grad():
    # some code


then removing that block should make it work.
3. If the above does not apply, try:

torch.set_grad_enabled(True)
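As a note on suggestion 1: Variable is deprecated in current PyTorch, so the equivalent with a plain tensor (a sketch, with xs_h standing in for the answerer's placeholder tensor) would be:

# modern equivalent of Variable(xs_h.data, requires_grad=True)
var_xs_h = xs_h.detach().clone().requires_grad_(True)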


You can also read more here: https://discuss.pytorch.org/t/runtimeerror-element-0-of-variables-does-not-require-grad-and-does-not-have-a-grad-fn/11074/34
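In the same spirit as suggestions 2 and 3, it can also help to confirm that gradient tracking is globally enabled and that the model parameters actually require grad (a diagnostic sketch, using the model instance from the question):

print(torch.is_grad_enabled())                                    # should be True during training
print(all(p.requires_grad for p in DNA_CNN_test2.parameters()))   # should be True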
