我在pytorch有一个深度学习模型。我训练了100个纪元的模型,随着纪元的增加,它需要更多的内存。你能帮我一下吗?据我所知,内存使用量应该是恒定的。我说错了吗?或者我在下面的代码中犯了一个错误?下面是内存使用情况的图表。还有我用来训练和验证迷你批处理和批处理的函数。- 谢谢你-谢谢
注:我的模型是一个条件变分自动编码器,损耗与kl损耗等有关。
def runs_for_validate(test_it, num_samples = 100):
len_test_it = len(test_it)
# set the evaluation mode
model.eval()
# add for validate
L_loss_b, L_loss, test_loss = 0, 0, 0
# test loss for the data
samples = {}
num_batch = 0
with torch.no_grad():
for local_batch, local_labels in test_it:
length = local_batch.size(0)
local_batch, local_labels = local_batch.to(device), local_labels.to(device)
samples, mu_par, log_var_par, mu_r, log_var_r = model.test_for_validate(local_batch,num_samples)
samples = np.transpose(np.array(samples),(1,0,2))
truths = local_labels
num_batch+=1
#break
L_loss_b = log_lik(local_labels, mu_par, log_var_par)
L_loss += (torch.sum(L_loss_b))/length
test_loss = L_loss/num_batch
return samples,truths.cpu().numpy(), test_loss
def train_batch(model, optimizer, device, batch, labels):
model.train()
optimizer.zero_grad()
length = float(batch.size(0))
mu_x, log_var_x, mu_q, log_var_q, mu_r, log_var_r = model(batch,labels)
kl_loss_b = KL(mu_r,log_var_r,mu_q,log_var_q)
L_loss_b = log_lik(labels, mu_x, log_var_x)
#print("Size of list3: " + str(sys.getsizeof(train_losses)) + "bytes")
L_loss = torch.sum(L_loss_b)
kl_loss = torch.sum(kl_loss_b)
loss = -(L_loss - kl_loss)/length
loss.backward()
# update the weights
optimizer.step()
# add for validation
return loss, kl_loss/length, L_loss/length
### when we want to check with validate data
def trainv(model, device, epochs, train_iterator, optimizer, validate_iterator):
n_samples = 100
train_losses, kl_losses, lik_losses, test_losses = [], [],[], []
for epoch in range(epochs):
ep_tr, ep_kl,ep_l, num_batch, iterator = 0,0,0, 0, 0
for local_batch, local_labels in train_iterator:
local_batch, local_labels = local_batch.to(device), local_labels.to(device)
train_loss, kl_loss, lik_loss = train_batch(model, optimizer, device, local_batch,local_labels)
ep_tr += train_loss
ep_kl += kl_loss
ep_l += lik_loss
num_batch+= 1
iterator+=1
del local_batch, local_labels
train_losses.append(ep_tr/num_batch)
#kl_losses.append(kl_loss/num_batch)
#lik_losses.append(-lik_loss/num_batch)
run_validate_flag = 0
if run_validate_flag ==1:
samples, truths, test_loss = runs_for_validate(validate_iterator, n_samples)
test_losses.append(-test_loss)
else:
test_losses = f'run_validate_flag;{0}'
if epoch % int(epochs/5) == 0:
samples, truths, test_loss = runs_for_validate(validate_iterator, n_samples)
print("Epoch: {}, Training loss: {}, val_loss : {} ".format(epoch,train_losses[epoch], -1*test_loss))
#return train_losses, kl_losses, lik_losses, test_losses
return train_losses, test_losses
字符串
x1c 0d1x的数据
1条答案
按热度按时间8qgya5xd1#
我找到了这个解决方案,它对我很有效https://discuss.pytorch.org/t/gpu-memory-consumption-increases-while-training/2770基于这个解决方案,我在每次迭代中分离损失。