Optimization Algorithms in Deep Learning: Adam


    The Adadelta optimization algorithm was previously introduced at https://blog.csdn.net/fengbingchun/article/details/124909910. This post introduces another optimization algorithm used in deep learning: Adam. The paper is "ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION", available at https://arxiv.org/pdf/1412.6980.pdf

    Adam (Adaptive Moment Estimation) computes an adaptive learning rate for each parameter (weight). In addition to storing an exponentially decaying average of past squared gradients, as Adadelta and RMSProp do, Adam also keeps an exponentially decaying average of past gradients, similar to momentum. This is shown in the figure below, a screenshot from https://arxiv.org/pdf/1609.04747.pdf
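
    For reference, these are the update rules Adam applies to each weight at step t, written in the paper's notation; they correspond directly to the "formula 19", "formula 20", and "formula 21" comments in the code further below, with g_t the gradient, alpha the learning rate, and eps a small constant for numerical stability:

    \begin{aligned}
    m_t &= \beta_1 m_{t-1} + (1 - \beta_1)\, g_t \\
    v_t &= \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2 \\
    \hat{m}_t &= m_t / (1 - \beta_1^t), \qquad \hat{v}_t = v_t / (1 - \beta_2^t) \\
    w_t &= w_{t-1} - \alpha\, \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)
    \end{aligned}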

    Adam is an extension of the gradient descent optimization algorithm and is very effective on problems involving large amounts of data or large numbers of parameters. Intuitively, it combines gradient descent with momentum and RMSProp.

    Adam aims both to accelerate the optimization process, for example by reducing the number of iterations needed to reach the optimum, and to improve the optimizer's capability, for example by reaching a better final result.

    The code snippets below show what differs from the Adadelta implementation:

    1. Add Adam to the existing enum class Optimization:

enum class Optimization {
    BGD, // Batch Gradient Descent
    SGD, // Stochastic Gradient Descent
    MBGD, // Mini-batch Gradient Descent
    SGD_Momentum, // SGD with Momentum
    AdaGrad, // Adaptive Gradient
    RMSProp, // Root Mean Square Propagation
    Adadelta, // an adaptive learning rate method
    Adam // Adaptive Moment Estimation
};

    2. The calculate_gradient_descent function:

void LogisticRegression2::calculate_gradient_descent(int start, int end)
{
    switch (optim_) {
        case Optimization::Adam: {
            int len = end - start;
            std::vector<float> m(feature_length_, 0.), v(feature_length_, 0.), mhat(feature_length_, 0.), vhat(feature_length_, 0.);
            std::vector<float> z(len, 0.), dz(len, 0.);
            float beta1t = 1., beta2t = 1.;
            for (int i = start, x = 0; i < end; ++i, ++x) {
                z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
                dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
                beta1t *= beta1_;
                beta2t *= beta2_;
                for (int j = 0; j < feature_length_; ++j) {
                    float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
                    m[j] = beta1_ * m[j] + (1. - beta1_) * dw; // formula 19
                    v[j] = beta2_ * v[j] + (1. - beta2_) * (dw * dw); // formula 19
                    mhat[j] = m[j] / (1. - beta1t); // formula 20
                    vhat[j] = v[j] / (1. - beta2t); // formula 20
                    w_[j] = w_[j] - alpha_ * mhat[j] / (std::sqrt(vhat[j]) + eps_); // formula 21
                }
                b_ -= (alpha_ * dz[x]);
            }
        }
        break;
        case Optimization::Adadelta: {
            int len = end - start;
            std::vector<float> g(feature_length_, 0.), p(feature_length_, 0.);
            std::vector<float> z(len, 0.), dz(len, 0.);
            for (int i = start, x = 0; i < end; ++i, ++x) {
                z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
                dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
                for (int j = 0; j < feature_length_; ++j) {
                    float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
                    g[j] = mu_ * g[j] + (1. - mu_) * (dw * dw); // formula 10
                    float alpha = (eps_ + std::sqrt(p[j])) / (eps_ + std::sqrt(g[j]));
                    float change = alpha * dw;
                    p[j] = mu_ * p[j] + (1. - mu_) * (change * change); // formula 15
                    w_[j] = w_[j] - change;
                }
                b_ -= (eps_ * dz[x]);
            }
        }
        break;
        case Optimization::RMSProp: {
            int len = end - start;
            std::vector<float> g(feature_length_, 0.);
            std::vector<float> z(len, 0), dz(len, 0);
            for (int i = start, x = 0; i < end; ++i, ++x) {
                z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
                dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
                for (int j = 0; j < feature_length_; ++j) {
                    float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
                    g[j] = mu_ * g[j] + (1. - mu_) * (dw * dw); // formula 18
                    w_[j] = w_[j] - alpha_ * dw / (std::sqrt(g[j]) + eps_);
                }
                b_ -= (alpha_ * dz[x]);
            }
        }
        break;
        case Optimization::AdaGrad: {
            int len = end - start;
            std::vector<float> g(feature_length_, 0.);
            std::vector<float> z(len, 0), dz(len, 0);
            for (int i = start, x = 0; i < end; ++i, ++x) {
                z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
                dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
                for (int j = 0; j < feature_length_; ++j) {
                    float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
                    g[j] += dw * dw;
                    w_[j] = w_[j] - alpha_ * dw / (std::sqrt(g[j]) + eps_);
                }
                b_ -= (alpha_ * dz[x]);
            }
        }
        break;
        case Optimization::SGD_Momentum: {
            int len = end - start;
            std::vector<float> change(feature_length_, 0.);
            std::vector<float> z(len, 0), dz(len, 0);
            for (int i = start, x = 0; i < end; ++i, ++x) {
                z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
                dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
                for (int j = 0; j < feature_length_; ++j) {
                    float new_change = mu_ * change[j] - alpha_ * (data_->samples[random_shuffle_[i]][j] * dz[x]);
                    w_[j] += new_change;
                    change[j] = new_change;
                }
                b_ -= (alpha_ * dz[x]);
            }
        }
        break;
        case Optimization::SGD:
        case Optimization::MBGD: {
            int len = end - start;
            std::vector<float> z(len, 0), dz(len, 0);
            for (int i = start, x = 0; i < end; ++i, ++x) {
                z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
                dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
                for (int j = 0; j < feature_length_; ++j) {
                    w_[j] = w_[j] - alpha_ * (data_->samples[random_shuffle_[i]][j] * dz[x]);
                }
                b_ -= (alpha_ * dz[x]);
            }
        }
        break;
        case Optimization::BGD:
        default: // BGD
            std::vector<float> z(m_, 0), dz(m_, 0);
            float db = 0.;
            std::vector<float> dw(feature_length_, 0.);
            for (int i = 0; i < m_; ++i) {
                z[i] = calculate_z(data_->samples[i]);
                o_[i] = calculate_activation_function(z[i]);
                dz[i] = calculate_loss_function_derivative(o_[i], data_->labels[i]);
                for (int j = 0; j < feature_length_; ++j) {
                    dw[j] += data_->samples[i][j] * dz[i]; // dw(i) += x(i)(j) * dz(i)
                }
                db += dz[i]; // db += dz(i)
            }
            for (int j = 0; j < feature_length_; ++j) {
                dw[j] /= m_;
                w_[j] -= alpha_ * dw[j];
            }
            b_ -= alpha_ * (db / m_);
    }
}
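
    To see the bias-corrected Adam update in isolation from the logistic-regression code, below is a minimal, self-contained sketch (not part of the repository) that minimizes the toy objective f(w) = (w - 3)^2. The hyperparameters here (alpha = 0.1, plus the paper's default beta1 = 0.9 and beta2 = 0.999) are chosen only for illustration and differ from the settings used in the test described next:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // toy objective: f(w) = (w - 3)^2, with gradient f'(w) = 2 * (w - 3)
        float w = 0.f;                     // parameter to optimize
        float m = 0.f, v = 0.f;            // first and second moment estimates
        const float alpha = 0.1f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
        float beta1t = 1.f, beta2t = 1.f;  // running beta1^t and beta2^t for bias correction

        for (int t = 1; t <= 200; ++t) {
            float g = 2.f * (w - 3.f);                   // gradient at the current w
            beta1t *= beta1;
            beta2t *= beta2;
            m = beta1 * m + (1.f - beta1) * g;           // formula 19
            v = beta2 * v + (1.f - beta2) * g * g;       // formula 19
            float mhat = m / (1.f - beta1t);             // formula 20
            float vhat = v / (1.f - beta2t);             // formula 20
            w -= alpha * mhat / (std::sqrt(vhat) + eps); // formula 21
        }

        std::printf("w after 200 Adam steps: %f (optimum is w = 3)\n", w);
        return 0;
    }

    Because m and v start at zero, they are biased toward zero during the first iterations; dividing by (1 - beta1^t) and (1 - beta2^t) counteracts this, so even the first update moves w by roughly alpha rather than by an amount that depends on the zero initialization.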

    The execution results are shown in the screenshot below. The test function is test_logistic_regression2_gradient_descent; each configuration was run several times and always produced the same final result. The image set is MNIST: 10,000 training images in total (5,000 each of the digits 0 and 1), all taken from the MNIST training set, and 1,800 prediction images (900 each of 0 and 1), all taken from the MNIST test set. With eps set to 1e-3, Adadelta took 26 seconds; with eps set to 1e-8, a learning rate of 0.001, beta1 of 0.8, and beta2 of 0.888, Adam took 27 seconds. Both achieved a recognition rate of 100%.

    GitHub: https://github.com/fengbingchun/NN_Test
