Introduction to and Implementation of Batch Normalization in Deep Neural Networks


    When DenseNet was introduced earlier at https://blog.csdn.net/fengbingchun/article/details/114493591, the network contained BN layers, i.e. Batch Normalization; BN takes part in the computation of every Dense Block. Below, BN is introduced and C++ and PyTorch implementations are given.

    Batch Normalization was proposed by Sergey Ioffe et al. in 2015 in the paper "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", available at: https://arxiv.org/pdf/1502.03167.pdf

    Batch Normalization is an algorithmic method that makes the training of deep neural networks faster and more stable. It can be applied either before or after the activation function. It depends on the batch size: when the batch size is small, performance degrades severely. Its computation differs between the training and inference phases.

    For CNNs, BN works better; for RNNs, LN (Layer Normalization) works better.
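
    As a quick illustration with PyTorch's built-in layers (a minimal sketch; the tensor shapes below are chosen arbitrarily for the example):

  import torch
  from torch import nn

  x = torch.randn(8, 16, 32, 32)  # [N, C, H, W], e.g. CNN feature maps
  bn = nn.BatchNorm2d(16)         # one mean/variance per channel, computed over N, H, W
  bn.train()                      # training mode: batch statistics are used, running statistics are updated
  y_train = bn(x)
  bn.eval()                       # inference mode: the stored running mean/variance are used
  y_eval = bn(x)

  h = torch.randn(8, 10, 16)      # [N, seq_len, features], e.g. RNN hidden states
  ln = nn.LayerNorm(16)           # normalizes each sample over its feature dimension, independent of batch size
  y_ln = ln(h)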

    Training deep neural networks is complicated by the fact that the distribution of each layer's inputs changes during training as the parameters of the previous layers change. This slows down training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. The paper refers to this phenomenon as internal covariate shift and addresses the problem by normalizing layer inputs.

    Batch Normalization is performed over mini-batches of training samples. It allows us to use much higher learning rates and to be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout.

    The Batch Normalization algorithm is as follows (Algorithm 1 in the original paper):
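
    For a mini-batch B = {x_1, ..., x_m}, with learnable parameters \gamma, \beta and a small constant \epsilon for numerical stability, the transform computes:

  \mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i

  \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i - \mu_B)^2

  \hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}

  y_i = \gamma \hat{x}_i + \beta \equiv \mathrm{BN}_{\gamma,\beta}(x_i)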

    Within a mini-batch, each BN layer computes the mean and variance over the samples for the same channel, normalizes the data to a distribution with mean 0 and standard deviation 1, and finally scales and shifts the normalized data with the two learnable parameters gamma and beta. In addition, during training the mean and variance of every mini-batch are stored for each BN layer, and at the end the expectation over all mini-batch means and variances is taken to serve as that BN layer's mean and variance during inference.
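
    A minimal NumPy sketch of this bookkeeping (the random data, shapes and three-way mini-batch split below are made up purely for illustration):

  import numpy as np

  # toy data: 12 samples with 4 features, split into 3 mini-batches of 4 samples
  samples = np.random.randn(12, 4).astype(np.float32)
  batch_means, batch_vars = [], []
  for batch in np.split(samples, 3):
      batch_means.append(batch.mean(axis=0))  # mu_B, saved during training
      batch_vars.append(batch.var(axis=0))    # sigma_B^2 (biased), saved during training

  # expectation over all mini-batch statistics, used at inference time;
  # the paper additionally applies the unbiased m/(m-1) correction to the variance
  inference_mean = np.mean(batch_means, axis=0)
  inference_var = np.mean(batch_vars, axis=0)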

    Advantages of Batch Normalization:

    (1). A larger learning rate can be used without affecting convergence, making training faster and more stable;

    (2). It has a regularizing effect and helps prevent overfitting, allowing Dropout and Local Response Normalization (LRN) to be removed;

    (3). Because the training data is shuffled, the mini-batches differ from epoch to epoch; normalizing over different mini-batches acts as a form of data augmentation;

    (4). It alleviates exploding and vanishing gradients.

    The C++ implementation is given below:

    batch_normalization.hpp:

  #ifndef FBC_SRC_NN_BATCH_NORM_HPP_
  #define FBC_SRC_NN_BATCH_NORM_HPP_

  #include <vector>
  #include <memory>

  // Blog:

  namespace ANN {

  class BatchNorm {
  public:
      BatchNorm(int number, int channels, int height, int width) : number_(number), channels_(channels), height_(height), width_(width) {}

      int LoadData(const float* data, int length);
      std::unique_ptr<float[]> Run();

      void SetGamma(float gamma) { gamma_ = gamma; }
      float GetGamma() const { return gamma_; }
      void SetBeta(float beta) { beta_ = beta; }
      float GetBeta() const { return beta_; }
      void SetMean(std::vector<float> mean) { mean_ = mean; }
      std::vector<float> GetMean() const { return mean_; }
      void SetVariance(std::vector<float> variance) { variance_ = variance; }
      std::vector<float> GetVariance() const { return variance_; }
      void SetEpsilon(float epsilon) { epsilon_ = epsilon; }

  private:
      int number_; // mini-batch
      int channels_;
      int height_;
      int width_;
      std::vector<float> mean_;
      std::vector<float> variance_;
      float gamma_ = 1.;
      float beta_ = 0.;
      float epsilon_ = 1e-5;
      std::vector<float> data_;
  };

  } // namespace ANN

  #endif // FBC_SRC_NN_BATCH_NORM_HPP_

    batch_normalization.cpp:

  #include "batch_normalization.hpp"
  #include <string.h>
  #include <vector>
  #include <cmath>
  #include "common.hpp"

  namespace ANN {

  int BatchNorm::LoadData(const float* data, int length)
  {
      CHECK(number_ * channels_ * height_ * width_ == length);

      data_.resize(length);
      memcpy(data_.data(), data, length * sizeof(float));
      return 0;
  }

  std::unique_ptr<float[]> BatchNorm::Run()
  {
      // per-position mean over the mini-batch
      mean_.resize(channels_ * height_ * width_);
      memset(mean_.data(), 0, mean_.size() * sizeof(float));

      for (int n = 0; n < number_; ++n) {
          const float* p = data_.data() + n * (channels_ * height_ * width_);

          for (int c = 0; c < channels_; ++c) {
              for (int h = 0; h < height_; ++h) {
                  for (int w = 0; w < width_; ++w) {
                      const int idx = c * height_ * width_ + h * width_ + w;
                      mean_[idx] += p[idx];
                  }
              }
          }
      }

      for (int len = 0; len < channels_ * height_ * width_; ++len) {
          mean_[len] /= number_;
      }

      // per-position (biased) variance over the mini-batch
      variance_.resize(channels_ * height_ * width_);
      memset(variance_.data(), 0, variance_.size() * sizeof(float));

      for (int n = 0; n < number_; ++n) {
          const float* p = data_.data() + n * (channels_ * height_ * width_);

          for (int c = 0; c < channels_; ++c) {
              for (int h = 0; h < height_; ++h) {
                  for (int w = 0; w < width_; ++w) {
                      const int idx = c * height_ * width_ + h * width_ + w;
                      variance_[idx] += std::pow(p[idx] - mean_[idx], 2.);
                  }
              }
          }
      }

      for (int len = 0; len < channels_ * height_ * width_; ++len) {
          variance_[len] /= number_;
      }

      // normalize, then scale and shift with the learnable parameters gamma and beta
      std::unique_ptr<float[]> output(new float[number_ * channels_ * height_ * width_]);

      for (int n = 0; n < number_; ++n) {
          const float* p1 = data_.data() + n * (channels_ * height_ * width_);
          float* p2 = output.get() + n * (channels_ * height_ * width_);

          for (int c = 0; c < channels_; ++c) {
              for (int h = 0; h < height_; ++h) {
                  for (int w = 0; w < width_; ++w) {
                      const int idx = c * height_ * width_ + h * width_ + w;
                      p2[idx] = gamma_ * ((p1[idx] - mean_[idx]) / std::sqrt(variance_[idx] + epsilon_)) + beta_;
                  }
              }
          }
      }

      return output;
  }

  } // namespace ANN

    funset.cpp:

  int test_batch_normalization()
  {
      const std::vector<float> data = { 11.1, -2.2, 23.3, 54.4, 58.5, -16.6,
                                        -97.7, -28.8, 49.9, -61.3, 52.6, -33.9,
                                        -2.45, -15.7, 72.4, 9.1, 47.2, 21.7 };
      const int number = 3, channels = 1, height = 1, width = 6;

      ANN::BatchNorm bn(number, channels, height, width);
      bn.LoadData(data.data(), data.size());
      std::unique_ptr<float[]> output = bn.Run();

      fprintf(stdout, "result:\n");
      for (int n = 0; n < number; ++n) {
          const float* p = output.get() + n * (channels * height * width);

          for (int c = 0; c < channels; ++c) {
              for (int h = 0; h < height; ++h) {
                  for (int w = 0; w < width; ++w) {
                      fprintf(stdout, "%f, ", p[c * (height * width) + h * width + w]);
                  }
                  fprintf(stdout, "\n");
              }
          }
      }

      return 0;
  }

    The execution result is as follows:

    The following is an implementation that calls the PyTorch interface: test_batch_normalization.py

  import torch
  from torch import nn
  import numpy as np

  # reference: https://github.com/Johann-Huber/batchnorm_pytorch/blob/main/batch_normalization_in_pytorch.ipynb

  # BatchNorm reimplementation
  class myBatchNorm2d(nn.Module):
      def __init__(self, input_size=None, epsilon=1e-5, momentum=0.99):
          super(myBatchNorm2d, self).__init__()
          assert input_size, print('Missing input_size parameter.')

          # Batch mean & var must be defined during training
          self.mu = torch.zeros(1, input_size)
          self.var = torch.ones(1, input_size)

          # For numerical stability
          self.epsilon = epsilon

          # Exponential moving average for mu & var update
          self.it_call = 0  # training iterations
          self.momentum = momentum  # EMA smoothing

          # Trainable parameters
          self.beta = torch.nn.Parameter(torch.zeros(1, input_size))
          self.gamma = torch.nn.Parameter(torch.ones(1, input_size))

          # Batch size on which the normalization is computed
          self.batch_size = 0

      def forward(self, x):
          # [batch_size, input_size]
          self.it_call += 1

          if self.training:
              print("Info: training ...")
              if self.batch_size == 0:
                  # First iteration: save batch_size
                  self.batch_size = x.shape[0]

              # Training: compute BN pass
              #batch_mu = (x.sum(dim=0)/x.shape[0]).unsqueeze(0) # [1, input_size]
              batch_mu = torch.mean(x, dim=0)
              #batch_var = (x.var(dim=0)/x.shape[0]).unsqueeze(0)*2 # [1, input_size]
              batch_var = torch.var(x, unbiased=False, dim=0)
              #print("batch_mu:", batch_mu)
              #print("batch_var:", batch_var)

              x_normalized = (x - batch_mu) / torch.sqrt(batch_var + self.epsilon)  # [batch_size, input_size]
              x_bn = self.gamma * x_normalized + self.beta  # [batch_size, input_size]

              # Update mu & var
              if x.shape[0] == self.batch_size:
                  running_mu = batch_mu
                  running_var = batch_var
              else:
                  running_mu = batch_mu * self.batch_size / x.shape[0]
                  running_var = batch_var * self.batch_size / x.shape[0]

              self.mu = running_mu * (self.momentum / self.it_call) + \
                        self.mu * (1 - (self.momentum / self.it_call))
              self.var = running_var * (self.momentum / self.it_call) + \
                         self.var * (1 - (self.momentum / self.it_call))
          else:
              print("Info: inference ...")
              # Inference: compute BN pass using estimated mu & var
              if x.shape[0] == self.batch_size:
                  estimated_mu = self.mu
                  estimated_var = self.var
              else:
                  estimated_mu = self.mu * x.shape[0] / self.batch_size
                  estimated_var = self.var * x.shape[0] / self.batch_size

              x_normalized = (x - estimated_mu) / torch.sqrt(estimated_var + self.epsilon)  # [batch_size, input_size]
              x_bn = self.gamma * x_normalized + self.beta  # [batch_size, input_size]

          return x_bn  # [batch_size, output_size=input_size]

  # N = 3, C = 1, H = 1, W = 6
  input_size = 1  # channel
  bn = myBatchNorm2d(input_size)

  data = [[[[11.1, -2.2, 23.3, 54.4, 58.5, -16.6]]],
          [[[-97.7, -28.8, 49.9, -61.3, 52.6, -33.9]]],
          [[[-2.45, -15.7, 72.4, 9.1, 47.2, 21.7]]]]
  input = torch.FloatTensor(data)  # [N, C, H, W]
  print("input:", input)

  output = bn.forward(input)
  print("output:", output)

  '''
  print("######################")
  a = np.array(data)
  print(np.mean(a, axis=0))
  print(np.var(a, axis=0))
  '''

    The execution result is as follows. As can be seen, the C++ implementation and the PyTorch implementation produce the same results.
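
    For reference only, a sketch using PyTorch's built-in layer on the same input. Note that nn.BatchNorm2d computes one mean and variance per channel over (N, H, W), whereas the two implementations above compute per-position statistics over the batch dimension, so its output is not expected to be identical:

  import torch
  from torch import nn

  x = torch.FloatTensor([[[[11.1, -2.2, 23.3, 54.4, 58.5, -16.6]]],
                         [[[-97.7, -28.8, 49.9, -61.3, 52.6, -33.9]]],
                         [[[-2.45, -15.7, 72.4, 9.1, 47.2, 21.7]]]])  # [N, C, H, W] = [3, 1, 1, 6]
  builtin_bn = nn.BatchNorm2d(num_features=1, eps=1e-5)
  builtin_bn.train()  # use batch statistics, as in the implementations above
  print(builtin_bn(x))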

    GitHub:

    https://github.com/fengbingchun/NN_Test

    https://github.com/fengbingchun/PyTorch_Test
