Introduction to and Implementation of Batch Normalization in Deep Neural Networks


    When DenseNet was introduced earlier at https://blog.csdn.net/fengbingchun/article/details/114493591, the network contained BN layers, i.e. Batch Normalization; BN takes part in the computation of every Dense Block. Below, BN is introduced and C++ and PyTorch implementations are given.

    Batch Normalization was proposed by Sergey Ioffe et al. in 2015 in the paper "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift", available at: https://arxiv.org/pdf/1502.03167.pdf

    Batch Normalization is an algorithmic method that makes the training of deep neural networks faster and more stable. It can be applied either before or after the activation function. It depends on the batch size: when the batch size is small, performance degrades severely. Its computation differs between the training and inference phases.

    For CNNs, BN works better; for RNNs, LN (Layer Normalization) works better.
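
    As a quick illustration with PyTorch's built-in layers (a minimal sketch; the tensor shapes below are chosen arbitrarily for the example):

  import torch
  from torch import nn

  x = torch.randn(8, 16, 32, 32)  # [N, C, H, W], e.g. CNN feature maps
  bn = nn.BatchNorm2d(16)         # one mean/variance per channel, computed over N, H, W
  bn.train()                      # training mode: batch statistics are used, running statistics are updated
  y_train = bn(x)
  bn.eval()                       # inference mode: the stored running mean/variance are used
  y_eval = bn(x)

  h = torch.randn(8, 10, 16)      # [N, seq_len, features], e.g. RNN hidden states
  ln = nn.LayerNorm(16)           # normalizes each sample over its feature dimension, independent of batch size
  y_ln = ln(h)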

    Training deep neural networks is complicated by the fact that the distribution of each layer's inputs changes during training as the parameters of the previous layers change. This slows down training by requiring lower learning rates and careful parameter initialization, and makes it notoriously hard to train models with saturating nonlinearities. The paper refers to this phenomenon as internal covariate shift and addresses the problem by normalizing layer inputs.

    Batch Normalization is performed over mini-batches of training samples. It allows us to use much higher learning rates and to be less careful about initialization. It also acts as a regularizer, in some cases eliminating the need for Dropout.

    The Batch Normalization algorithm is as follows (Algorithm 1 in the original paper):
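
    For a mini-batch B = {x_1, ..., x_m}, with learnable parameters \gamma, \beta and a small constant \epsilon for numerical stability, the transform computes:

  \mu_B = \frac{1}{m}\sum_{i=1}^{m} x_i

  \sigma_B^2 = \frac{1}{m}\sum_{i=1}^{m}(x_i - \mu_B)^2

  \hat{x}_i = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}

  y_i = \gamma \hat{x}_i + \beta \equiv \mathrm{BN}_{\gamma,\beta}(x_i)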

    Within a mini-batch, each BN layer computes the mean and variance over the samples for the same channel, normalizes the data to a distribution with mean 0 and standard deviation 1, and finally scales and shifts the normalized data with the two learnable parameters gamma and beta. In addition, during training the mean and variance of every mini-batch are stored for each BN layer, and at the end the expectation over all mini-batch means and variances is taken to serve as that BN layer's mean and variance during inference.
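
    A minimal NumPy sketch of this bookkeeping (the random data, shapes and three-way mini-batch split below are made up purely for illustration):

  import numpy as np

  # toy data: 12 samples with 4 features, split into 3 mini-batches of 4 samples
  samples = np.random.randn(12, 4).astype(np.float32)
  batch_means, batch_vars = [], []
  for batch in np.split(samples, 3):
      batch_means.append(batch.mean(axis=0))  # mu_B, saved during training
      batch_vars.append(batch.var(axis=0))    # sigma_B^2 (biased), saved during training

  # expectation over all mini-batch statistics, used at inference time;
  # the paper additionally applies the unbiased m/(m-1) correction to the variance
  inference_mean = np.mean(batch_means, axis=0)
  inference_var = np.mean(batch_vars, axis=0)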

    Advantages of Batch Normalization:

    (1). A larger learning rate can be used without affecting convergence, making training faster and more stable;

    (2). It has a regularizing effect and helps prevent overfitting, allowing Dropout and Local Response Normalization (LRN) to be removed;

    (3). Because the training data is shuffled, the mini-batches differ from epoch to epoch; normalizing over different mini-batches acts as a form of data augmentation;

    (4). It alleviates exploding and vanishing gradients.

    The C++ implementation is given below:

    batch_normalization.hpp:

  #ifndef FBC_SRC_NN_BATCH_NORM_HPP_
  #define FBC_SRC_NN_BATCH_NORM_HPP_

  #include <vector>
  #include <memory>

  // Blog:

  namespace ANN {

  class BatchNorm {
  public:
      BatchNorm(int number, int channels, int height, int width) : number_(number), channels_(channels), height_(height), width_(width) {}

      int LoadData(const float* data, int length);
      std::unique_ptr<float[]> Run();

      void SetGamma(float gamma) { gamma_ = gamma; }
      float GetGamma() const { return gamma_; }
      void SetBeta(float beta) { beta_ = beta; }
      float GetBeta() const { return beta_; }
      void SetMean(std::vector<float> mean) { mean_ = mean; }
      std::vector<float> GetMean() const { return mean_; }
      void SetVariance(std::vector<float> variance) { variance_ = variance; }
      std::vector<float> GetVariance() const { return variance_; }
      void SetEpsilon(float epsilon) { epsilon_ = epsilon; }

  private:
      int number_; // mini-batch
      int channels_;
      int height_;
      int width_;
      std::vector<float> mean_;
      std::vector<float> variance_;
      float gamma_ = 1.;
      float beta_ = 0.;
      float epsilon_ = 1e-5;
      std::vector<float> data_;
  };

  } // namespace ANN

  #endif // FBC_SRC_NN_BATCH_NORM_HPP_

    batch_normalization.cpp:

  #include "batch_normalization.hpp"
  #include <string.h>
  #include <vector>
  #include <cmath>
  #include "common.hpp"

  namespace ANN {

  int BatchNorm::LoadData(const float* data, int length)
  {
      CHECK(number_ * channels_ * height_ * width_ == length);

      data_.resize(length);
      memcpy(data_.data(), data, length * sizeof(float));
      return 0;
  }

  std::unique_ptr<float[]> BatchNorm::Run()
  {
      // per-position mean over the mini-batch
      mean_.resize(channels_ * height_ * width_);
      memset(mean_.data(), 0, mean_.size() * sizeof(float));

      for (int n = 0; n < number_; ++n) {
          const float* p = data_.data() + n * (channels_ * height_ * width_);

          for (int c = 0; c < channels_; ++c) {
              for (int h = 0; h < height_; ++h) {
                  for (int w = 0; w < width_; ++w) {
                      const int idx = c * height_ * width_ + h * width_ + w;
                      mean_[idx] += p[idx];
                  }
              }
          }
      }

      for (int len = 0; len < channels_ * height_ * width_; ++len) {
          mean_[len] /= number_;
      }

      // per-position (biased) variance over the mini-batch
      variance_.resize(channels_ * height_ * width_);
      memset(variance_.data(), 0, variance_.size() * sizeof(float));

      for (int n = 0; n < number_; ++n) {
          const float* p = data_.data() + n * (channels_ * height_ * width_);

          for (int c = 0; c < channels_; ++c) {
              for (int h = 0; h < height_; ++h) {
                  for (int w = 0; w < width_; ++w) {
                      const int idx = c * height_ * width_ + h * width_ + w;
                      variance_[idx] += std::pow(p[idx] - mean_[idx], 2.);
                  }
              }
          }
      }

      for (int len = 0; len < channels_ * height_ * width_; ++len) {
          variance_[len] /= number_;
      }

      // normalize, then scale and shift with the learnable parameters gamma and beta
      std::unique_ptr<float[]> output(new float[number_ * channels_ * height_ * width_]);

      for (int n = 0; n < number_; ++n) {
          const float* p1 = data_.data() + n * (channels_ * height_ * width_);
          float* p2 = output.get() + n * (channels_ * height_ * width_);

          for (int c = 0; c < channels_; ++c) {
              for (int h = 0; h < height_; ++h) {
                  for (int w = 0; w < width_; ++w) {
                      const int idx = c * height_ * width_ + h * width_ + w;
                      p2[idx] = gamma_ * ((p1[idx] - mean_[idx]) / std::sqrt(variance_[idx] + epsilon_)) + beta_;
                  }
              }
          }
      }

      return output;
  }

  } // namespace ANN

    funset.cpp:

  int test_batch_normalization()
  {
      const std::vector<float> data = { 11.1, -2.2, 23.3, 54.4, 58.5, -16.6,
                                        -97.7, -28.8, 49.9, -61.3, 52.6, -33.9,
                                        -2.45, -15.7, 72.4, 9.1, 47.2, 21.7 };
      const int number = 3, channels = 1, height = 1, width = 6;

      ANN::BatchNorm bn(number, channels, height, width);
      bn.LoadData(data.data(), data.size());
      std::unique_ptr<float[]> output = bn.Run();

      fprintf(stdout, "result:\n");
      for (int n = 0; n < number; ++n) {
          const float* p = output.get() + n * (channels * height * width);

          for (int c = 0; c < channels; ++c) {
              for (int h = 0; h < height; ++h) {
                  for (int w = 0; w < width; ++w) {
                      fprintf(stdout, "%f, ", p[c * (height * width) + h * width + w]);
                  }
                  fprintf(stdout, "\n");
              }
          }
      }

      return 0;
  }

    The execution result is as follows:

    The following is an implementation that calls the PyTorch interface: test_batch_normalization.py

  import torch
  from torch import nn
  import numpy as np

  # reference: https://github.com/Johann-Huber/batchnorm_pytorch/blob/main/batch_normalization_in_pytorch.ipynb

  # BatchNorm reimplementation
  class myBatchNorm2d(nn.Module):
      def __init__(self, input_size=None, epsilon=1e-5, momentum=0.99):
          super(myBatchNorm2d, self).__init__()
          assert input_size, print('Missing input_size parameter.')

          # Batch mean & var must be defined during training
          self.mu = torch.zeros(1, input_size)
          self.var = torch.ones(1, input_size)

          # For numerical stability
          self.epsilon = epsilon

          # Exponential moving average for mu & var update
          self.it_call = 0  # training iterations
          self.momentum = momentum  # EMA smoothing

          # Trainable parameters
          self.beta = torch.nn.Parameter(torch.zeros(1, input_size))
          self.gamma = torch.nn.Parameter(torch.ones(1, input_size))

          # Batch size on which the normalization is computed
          self.batch_size = 0

      def forward(self, x):
          # [batch_size, input_size]
          self.it_call += 1

          if self.training:
              print("Info: training ...")
              if self.batch_size == 0:
                  # First iteration: save batch_size
                  self.batch_size = x.shape[0]

              # Training: compute BN pass
              #batch_mu = (x.sum(dim=0)/x.shape[0]).unsqueeze(0) # [1, input_size]
              batch_mu = torch.mean(x, dim=0)
              #batch_var = (x.var(dim=0)/x.shape[0]).unsqueeze(0)*2 # [1, input_size]
              batch_var = torch.var(x, unbiased=False, dim=0)
              #print("batch_mu:", batch_mu)
              #print("batch_var:", batch_var)

              x_normalized = (x - batch_mu) / torch.sqrt(batch_var + self.epsilon)  # [batch_size, input_size]
              x_bn = self.gamma * x_normalized + self.beta  # [batch_size, input_size]

              # Update mu & var
              if x.shape[0] == self.batch_size:
                  running_mu = batch_mu
                  running_var = batch_var
              else:
                  running_mu = batch_mu * self.batch_size / x.shape[0]
                  running_var = batch_var * self.batch_size / x.shape[0]

              self.mu = running_mu * (self.momentum / self.it_call) + \
                        self.mu * (1 - (self.momentum / self.it_call))
              self.var = running_var * (self.momentum / self.it_call) + \
                         self.var * (1 - (self.momentum / self.it_call))
          else:
              print("Info: inference ...")
              # Inference: compute BN pass using estimated mu & var
              if x.shape[0] == self.batch_size:
                  estimated_mu = self.mu
                  estimated_var = self.var
              else:
                  estimated_mu = self.mu * x.shape[0] / self.batch_size
                  estimated_var = self.var * x.shape[0] / self.batch_size

              x_normalized = (x - estimated_mu) / torch.sqrt(estimated_var + self.epsilon)  # [batch_size, input_size]
              x_bn = self.gamma * x_normalized + self.beta  # [batch_size, input_size]

          return x_bn  # [batch_size, output_size=input_size]

  # N = 3, C = 1, H = 1, W = 6
  input_size = 1  # channel
  bn = myBatchNorm2d(input_size)

  data = [[[[11.1, -2.2, 23.3, 54.4, 58.5, -16.6]]],
          [[[-97.7, -28.8, 49.9, -61.3, 52.6, -33.9]]],
          [[[-2.45, -15.7, 72.4, 9.1, 47.2, 21.7]]]]
  input = torch.FloatTensor(data)  # [N, C, H, W]
  print("input:", input)

  output = bn.forward(input)
  print("output:", output)

  '''
  print("######################")
  a = np.array(data)
  print(np.mean(a, axis=0))
  print(np.var(a, axis=0))
  '''

    The execution result is as follows. As can be seen, the C++ implementation and the PyTorch implementation produce the same results.
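
    For reference only, a sketch using PyTorch's built-in layer on the same input. Note that nn.BatchNorm2d computes one mean and variance per channel over (N, H, W), whereas the two implementations above compute per-position statistics over the batch dimension, so its output is not expected to be identical:

  import torch
  from torch import nn

  x = torch.FloatTensor([[[[11.1, -2.2, 23.3, 54.4, 58.5, -16.6]]],
                         [[[-97.7, -28.8, 49.9, -61.3, 52.6, -33.9]]],
                         [[[-2.45, -15.7, 72.4, 9.1, 47.2, 21.7]]]])  # [N, C, H, W] = [3, 1, 1, 6]
  builtin_bn = nn.BatchNorm2d(num_features=1, eps=1e-5)
  builtin_bn.train()  # use batch statistics, as in the implementations above
  print(builtin_bn(x))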

    GitHub:

    https://github.com/fengbingchun/NN_Test

    https://github.com/fengbingchun/PyTorch_Test
