I am using two Nvidia Quadro 1200 (4 GB) GPUs to run inference on large images (1024*1792) with a UNet segmentation model in PyTorch, using DataParallel. The code starts inference, but it only ever uses one GPU while the other stays idle. Since everything lands on a single GPU, I get a CUDA OOM error every time. I cannot compromise on the image size, because that would defeat our purpose. I am adding the code below. Any help would be greatly appreciated.
import os

import cv2
import numpy as np
import torch
from torch.nn.parallel import DataParallel
from tqdm import tqdm

from model import build_unet

torch.cuda.empty_cache()

checkpoint_path = "Weights/best_model.pth"


def mask_parse(mask):
    # Expand the single-channel mask to 3 channels for overlaying on the image.
    mask = np.expand_dims(mask, axis=-1)
    mask = np.concatenate([mask, mask, mask], axis=-1)
    return mask


def inference_with_dataparallel(model, cv_img, imname):
    image = cv_img
    imageCopy = image.copy()
    x = np.transpose(image, (2, 0, 1))  # HWC -> CHW
    x = x / 255.0
    x = np.expand_dims(x, axis=0)       # add batch dimension (batch size 1)
    x = x.astype(np.float32)
    x = torch.from_numpy(x)
    x = x.to(device)
    with torch.no_grad():
        pred_y = model(x)
        pred_y = torch.sigmoid(pred_y)
        pred_y = pred_y[0].cpu().numpy()
        pred_y = np.squeeze(pred_y, axis=0)
        pred_y = pred_y > 0.1  # 0.15
        pred_y = np.array(pred_y, dtype=np.uint8)
        pred_y = mask_parse(pred_y)
        out = pred_y * 255
    imageCopy = cv2.resize(imageCopy, (mWidth, mHeight))
    out = cv2.resize(out, (mWidth, mHeight))
    finalOut = cv2.addWeighted(imageCopy, 0.6, out, 0.4, 0)
    cv2.imwrite('Output/' + imname, finalOut)


if __name__ == "__main__":
    device_ids = [0, 1]
    model = build_unet()
    device = torch.device("cuda")
    model = model.to(device)
    model = DataParallel(model, device_ids=device_ids)
    # The checkpoint keys must carry the "module." prefix here, since the
    # model is already wrapped in DataParallel at load time.
    model.load_state_dict(torch.load(checkpoint_path, map_location="cuda"))

    dirListCSV = os.listdir('TestImages')
    for allCSV in tqdm(dirListCSV):
        imgName = 'TestImages/' + allCSV
        cv_img = cv2.imread(imgName)
        mHeight, mWidth = cv_img.shape[0], cv_img.shape[1]
        cv_img = cv2.resize(cv_img, (1024, 1792))
        inference_with_dataparallel(model, cv_img, allCSV)
    print('Gpu0:', torch.cuda.max_memory_allocated(device=device_ids[0]))
    print('Gpu1:', torch.cuda.max_memory_allocated(device=device_ids[1]))
1 Answer
DataParallel splits the input along the batch dimension (see the docs). This means that if you run inference on a single image at a time, the entire input goes to one GPU. To split the model itself across multiple GPUs, you need FSDP.
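A minimal sketch of what that could look like, assuming a single node with both GPUs visible and a launch via torchrun --nproc_per_node=2 (build_unet and the checkpoint path are taken from the question; the checkpoint is assumed to have been saved from the bare model, so strip the "module." prefix first if it came from a DataParallel wrapper):

# Minimal FSDP inference sketch. Assumptions: single node, 2 GPUs,
# launched with `torchrun --nproc_per_node=2 infer.py`.
import os

import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

from model import build_unet

dist.init_process_group("nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

model = build_unet()
model.load_state_dict(torch.load("Weights/best_model.pth", map_location="cpu"))
# FSDP shards the parameters across both GPUs instead of replicating them,
# so each GPU holds only its slice of the model at rest.
model = FSDP(model, device_id=local_rank)
model.eval()

with torch.no_grad():
    # Every rank must participate in the forward pass; a dummy tensor with
    # the question's input shape stands in for a preprocessed image here.
    x = torch.randn(1, 3, 1024, 1792, device="cuda")
    pred_y = torch.sigmoid(model(x))

dist.destroy_process_group()

Note that FSDP mainly saves parameter memory; if the OOM is driven by the activations of the 1024*1792 input rather than by the weights, activation checkpointing (torch.utils.checkpoint) or mixed-precision inference (torch.autocast) may help more. Alternatively, if you can batch two images per forward pass, DataParallel itself will place one image on each GPU.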