bug描述 Describe the Bug
这是我们的模型训练代码。在软件环境相同、仅显卡型号不同的情况下,它的输出存在较大差异。我们进一步与 PyTorch 进行了对比,PyTorch 的输出结果与其中一张显卡上的结果几乎一致。
class Model_1715506898(nn.Layer):
    """Small convolutional classifier used to reproduce a cross-GPU output mismatch.

    The pipeline is: transposed conv -> ReLU -> max-pool -> strided conv -> ELU
    -> max-pool -> flatten -> linear -> ReLU -> linear -> floor -> flatten
    -> linear (10 outputs).

    NOTE(review): the ``_mutated`` suffixes presumably mark layers that a
    model-mutation tool replaced in a LeNet-like baseline -- confirm against
    the generator that produced this class.

    NOTE(review): ``linear1_mutated`` takes ``in_features=16`` while
    ``conv2_mutated`` emits 16 channels, so the input size must be chosen such
    that ``pool2`` yields a 1x1 spatial map; the expected input shape is not
    visible here.
    """

    def __init__(self):
        super().__init__()
        self.conv1_mutated = paddle.nn.Conv2DTranspose(
            in_channels=1,
            out_channels=6,
            kernel_size=[5, 5],
            stride=[1, 1],
            padding=[0, 0],
            output_padding=[0, 0],
            dilation=[1, 1],
            groups=1,
            bias_attr=None,
        )
        self.relu1 = paddle.nn.ReLU()
        self.pool1 = paddle.nn.MaxPool2D(
            kernel_size=[2, 2], stride=[2, 2], padding=[0, 0], ceil_mode=False
        )
        # Asymmetric stride: 7 along height, 5 along width.
        self.conv2_mutated = paddle.nn.Conv2D(
            in_channels=6,
            out_channels=16,
            kernel_size=[5, 5],
            stride=[7, 5],
            padding=[0, 0],
            dilation=[1, 1],
            groups=1,
            bias_attr=None,
        )
        self.relu2_mutated = paddle.nn.ELU(alpha=0.1)
        self.pool2 = paddle.nn.MaxPool2D(
            kernel_size=[2, 2], stride=[2, 2], padding=[0, 0], ceil_mode=False
        )
        self.flatten = paddle.nn.Flatten()
        self.linear1_mutated = paddle.nn.Linear(in_features=16, out_features=120)
        self.relu3 = paddle.nn.ReLU()
        self.linear2_mutated = paddle.nn.Linear(in_features=120, out_features=84)
        # A plain function, not a Layer instance; forward() calls it directly.
        # floor is discontinuous: a value sitting near an integer can move by a
        # whole unit (1.0) when the upstream result differs only by float
        # rounding, so small device-dependent differences in the preceding
        # linear layer are amplified here.
        self.relu4_mutated = paddle.floor
        self.tail_flatten = paddle.nn.Flatten()
        self.tail_fc = paddle.nn.Linear(in_features=84, out_features=10)

    def forward(self, input):
        """Run the network on ``input`` and return the output of ``tail_fc``.

        Each intermediate result is kept in its own named local so that every
        stage can be inspected or dumped individually.
        """
        conv1_output = self.conv1_mutated(input)
        relu1_output = self.relu1(conv1_output)
        maxpool1_output = self.pool1(relu1_output)
        conv2_output = self.conv2_mutated(maxpool1_output)
        relu2_output = self.relu2_mutated(conv2_output)
        maxpool2_output = self.pool2(relu2_output)
        flatten_output = self.flatten(maxpool2_output)
        fc1_output = self.linear1_mutated(flatten_output)
        relu3_output = self.relu3(fc1_output)
        fc2_output = self.linear2_mutated(relu3_output)
        relu4_output = self.relu4_mutated(fc2_output)
        tail_flatten_output = self.tail_flatten(relu4_output)
        tail_fc_output = self.tail_fc(tail_flatten_output)
        return tail_fc_output
复现代码
https://github.com/PhyllisJi/MoCoDiff_Bug/tree/paddle-issue%2364591
其中有详细的复现步骤
输出差异
# W0525 05:58:23.551180 1942 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 8.6, Driver API Version: 12.2, Runtime API Version: 11.8
# W0525 05:58:23.552209 1942 gpu_resources.cc:164] device: 0, cuDNN Version: 8.9.
relu2_output.npz 1.1175870895385742e-08
maxpool2_output.npz 1.1175870895385742e-08
conv1_output.npz 0.0
relu4_output.npz 1.0
fc2_output.npz 0.0001531541347503662
conv2_output.npz 0.0
relu3_output.npz 2.9802322387695312e-08
flatten_output.npz 1.1175870895385742e-08
output.npz 0.248759925365448
relu1_output.npz 0.0
fc1_output.npz 5.960464477539063e-08
maxpool1_output.npz 0.0
# W0525 05:54:59.668944 515 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.5, Driver API Version: 12.2, Runtime API Version: 11.8
# W0525 05:54:59.670363 515 gpu_resources.cc:164] device: 0, cuDNN Version: 8.9.
relu2_output.npz 1.1175870895385742e-08
maxpool2_output.npz 1.1175870895385742e-08
conv1_output.npz 0.0
relu4_output.npz 0.0
fc2_output.npz 2.086162567138672e-07
conv2_output.npz 0.0
relu3_output.npz 2.9802322387695312e-08
flatten_output.npz 1.1175870895385742e-08
output.npz 0.0
relu1_output.npz 0.0
fc1_output.npz 5.960464477539063e-08
maxpool1_output.npz 0.0
其他补充信息 Additional Supplementary Information
paddle版本 2.6.1
2条答案
按热度按时间hivapdat1#
我按照你github上的readme 执行,怎么跑不出你贴图的结果?
gblwokeq2#
> 我按照你github上的readme 执行,怎么跑不出你贴图的结果?
我们分别在 RTX 3080 Ti(Compute Capability 8.6)和 RTX 2080 Ti(Compute Capability 7.5)两张显卡上运行了代码。