Optimization Algorithms in Deep Learning: AdaMax

The Adam optimizer was introduced earlier at https://blog.csdn.net/fengbingchun/article/details/125018001. This post covers another deep learning optimization algorithm, AdaMax. AdaMax and Adam come from the same paper, "ADAM: A METHOD FOR STOCHASTIC OPTIMIZATION", available at https://arxiv.org/pdf/1412.6980.pdf

AdaMax is an extension of gradient-based optimization: a variant of Adam based on the infinity norm. It provides a simpler bound on the effective learning rate (the magnitude of each parameter update) and may optimize some problems more effectively. Its update rules are summarized in https://arxiv.org/pdf/1609.04747.pdf; the original screenshot of those formulas is not reproduced here, so they are written out below.
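
For reference, my transcription of the AdaMax update rules, in the notation of the Adam paper (g_t is the gradient at step t, alpha the learning rate, beta1 and beta2 the decay rates; the formula numbers match the comments in the code below):

    m_t = \beta_1 \, m_{t-1} + (1 - \beta_1) \, g_t    (formula 19)
    u_t = \max(\beta_2 \cdot u_{t-1}, \; |g_t|)    (formula 24)
    \theta_{t+1} = \theta_t - \dfrac{\alpha}{1 - \beta_1^t} \cdot \dfrac{m_t}{u_t}    (formula 25)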

Difference between AdaMax and Adam: essentially, the former generalizes the L2 norm to the L-infinity norm. In the final update formulas, the two differ only in how the denominator is computed: AdaMax uses formula 24, while Adam uses formula 20.
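
Side by side, writing \hat{m}_t = m_t / (1 - \beta_1^t) and \hat{v}_t = v_t / (1 - \beta_2^t) for the bias-corrected moments, the two parameter updates are:

    Adam (formulas 20, 21):    \theta_{t+1} = \theta_t - \alpha \cdot \hat{m}_t / (\sqrt{\hat{v}_t} + \epsilon)
    AdaMax (formulas 24, 25):  \theta_{t+1} = \theta_t - \alpha \cdot \hat{m}_t / u_t

Because u_t = max(beta2 * u_{t-1}, |g_t|) is never smaller than the most recent |g_t|, AdaMax drops the epsilon term; the implementation below simply keeps u away from zero by initializing it to 1e-8.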

The code snippets below show what differs from the Adam implementation:

1. Add AdaMax to the existing Optimization enum class:

enum class Optimization {
	BGD, // Batch Gradient Descent
	SGD, // Stochastic Gradient Descent
	MBGD, // Mini-batch Gradient Descent
	SGD_Momentum, // SGD with Momentum
	AdaGrad, // Adaptive Gradient
	RMSProp, // Root Mean Square Propagation
	Adadelta, // an adaptive learning rate method
	Adam, // Adaptive Moment Estimation
	AdaMax // a variant of Adam based on the infinity norm
};

2. The calculate_gradient_descent function:

void LogisticRegression2::calculate_gradient_descent(int start, int end)
{
	switch (optim_) {
	case Optimization::AdaMax: {
		int len = end - start;
		std::vector<float> m(feature_length_, 0.), u(feature_length_, 1e-8), mhat(feature_length_, 0.);
		std::vector<float> z(len, 0.), dz(len, 0.);
		float beta1t = 1.;
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			beta1t *= beta1_;
			for (int j = 0; j < feature_length_; ++j) {
				float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
				m[j] = beta1_ * m[j] + (1. - beta1_) * dw; // formula 19
				u[j] = std::max(beta2_ * u[j], std::fabs(dw)); // formula 24
				mhat[j] = m[j] / (1. - beta1t); // formula 20
				// Note: need to ensure that u[j] can never be 0; either
				// (1) initialize u[j] to 1e-8, or
				// (2) if u[j] is initialized to 0., divide by (u[j] + 1e-8) instead
				w_[j] = w_[j] - alpha_ * mhat[j] / u[j]; // formula 25
			}
			b_ -= (alpha_ * dz[x]);
		}
	}
	break;
	case Optimization::Adam: {
		int len = end - start;
		std::vector<float> m(feature_length_, 0.), v(feature_length_, 0.), mhat(feature_length_, 0.), vhat(feature_length_, 0.);
		std::vector<float> z(len, 0.), dz(len, 0.);
		float beta1t = 1., beta2t = 1.;
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			beta1t *= beta1_;
			beta2t *= beta2_;
			for (int j = 0; j < feature_length_; ++j) {
				float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
				m[j] = beta1_ * m[j] + (1. - beta1_) * dw; // formula 19
				v[j] = beta2_ * v[j] + (1. - beta2_) * (dw * dw); // formula 19
				mhat[j] = m[j] / (1. - beta1t); // formula 20
				vhat[j] = v[j] / (1. - beta2t); // formula 20
				w_[j] = w_[j] - alpha_ * mhat[j] / (std::sqrt(vhat[j]) + eps_); // formula 21
			}
			b_ -= (alpha_ * dz[x]);
		}
	}
	break;
	case Optimization::Adadelta: {
		int len = end - start;
		std::vector<float> g(feature_length_, 0.), p(feature_length_, 0.);
		std::vector<float> z(len, 0.), dz(len, 0.);
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			for (int j = 0; j < feature_length_; ++j) {
				float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
				g[j] = mu_ * g[j] + (1. - mu_) * (dw * dw); // formula 10
				float alpha = (eps_ + std::sqrt(p[j])) / (eps_ + std::sqrt(g[j]));
				float change = alpha * dw;
				p[j] = mu_ * p[j] + (1. - mu_) * (change * change); // formula 15
				w_[j] = w_[j] - change;
			}
			b_ -= (eps_ * dz[x]);
		}
	}
	break;
	case Optimization::RMSProp: {
		int len = end - start;
		std::vector<float> g(feature_length_, 0.);
		std::vector<float> z(len, 0), dz(len, 0);
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			for (int j = 0; j < feature_length_; ++j) {
				float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
				g[j] = mu_ * g[j] + (1. - mu_) * (dw * dw); // formula 18
				w_[j] = w_[j] - alpha_ * dw / (std::sqrt(g[j]) + eps_);
			}
			b_ -= (alpha_ * dz[x]);
		}
	}
	break;
	case Optimization::AdaGrad: {
		int len = end - start;
		std::vector<float> g(feature_length_, 0.);
		std::vector<float> z(len, 0), dz(len, 0);
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			for (int j = 0; j < feature_length_; ++j) {
				float dw = data_->samples[random_shuffle_[i]][j] * dz[x];
				g[j] += dw * dw;
				w_[j] = w_[j] - alpha_ * dw / (std::sqrt(g[j]) + eps_);
			}
			b_ -= (alpha_ * dz[x]);
		}
	}
	break;
	case Optimization::SGD_Momentum: {
		int len = end - start;
		std::vector<float> change(feature_length_, 0.);
		std::vector<float> z(len, 0), dz(len, 0);
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			for (int j = 0; j < feature_length_; ++j) {
				float new_change = mu_ * change[j] - alpha_ * (data_->samples[random_shuffle_[i]][j] * dz[x]);
				w_[j] += new_change;
				change[j] = new_change;
			}
			b_ -= (alpha_ * dz[x]);
		}
	}
	break;
	case Optimization::SGD:
	case Optimization::MBGD: {
		int len = end - start;
		std::vector<float> z(len, 0), dz(len, 0);
		for (int i = start, x = 0; i < end; ++i, ++x) {
			z[x] = calculate_z(data_->samples[random_shuffle_[i]]);
			dz[x] = calculate_loss_function_derivative(calculate_activation_function(z[x]), data_->labels[random_shuffle_[i]]);
			for (int j = 0; j < feature_length_; ++j) {
				w_[j] = w_[j] - alpha_ * (data_->samples[random_shuffle_[i]][j] * dz[x]);
			}
			b_ -= (alpha_ * dz[x]);
		}
	}
	break;
	case Optimization::BGD:
	default: { // BGD
		std::vector<float> z(m_, 0), dz(m_, 0);
		float db = 0.;
		std::vector<float> dw(feature_length_, 0.);
		for (int i = 0; i < m_; ++i) {
			z[i] = calculate_z(data_->samples[i]);
			o_[i] = calculate_activation_function(z[i]);
			dz[i] = calculate_loss_function_derivative(o_[i], data_->labels[i]);
			for (int j = 0; j < feature_length_; ++j) {
				dw[j] += data_->samples[i][j] * dz[i]; // dw(i)+=x(i)(j)*dz(i)
			}
			db += dz[i]; // db+=dz(i)
		}
		for (int j = 0; j < feature_length_; ++j) {
			dw[j] /= m_;
			w_[j] -= alpha_ * dw[j];
		}
		b_ -= alpha_ * (db / m_);
	}
	}
}

The execution results are as follows. The test function is test_logistic_regression2_gradient_descent; each configuration was run multiple times and always produced identical final results. The images come from MNIST: 10,000 training images in total (5,000 each of digit 0 and digit 1, all from the MNIST training set) and 1,800 prediction images (900 each of 0 and 1, all from the MNIST test set). With identical configuration parameters for Adam and AdaMax (eps = 1e-8, learning rate = 0.002, beta1 = 0.9, beta2 = 0.999), Adam took 30 seconds and AdaMax took 25 seconds; both reached a recognition rate of 100%.
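
As an independent sanity check (not part of the repository above, with names of my own choosing), the following self-contained sketch applies the same AdaMax update, using the hyperparameters quoted above (learning rate 0.002, beta1 0.9, beta2 0.999), to the one-dimensional function f(w) = (w - 3)^2:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Minimal stand-alone AdaMax sketch on f(w) = (w - 3)^2, gradient g = 2*(w - 3).
// Hyperparameters mirror the ones quoted above; this is an illustrative toy,
// not the test program from the repository.
int main()
{
	const float alpha = 0.002f, beta1 = 0.9f, beta2 = 0.999f;
	float w = 0.f;      // parameter to optimize
	float m = 0.f;      // first moment (formula 19)
	float u = 1e-8f;    // infinity-norm accumulator (formula 24), kept non-zero
	float beta1t = 1.f; // running beta1^t for bias correction

	for (int t = 1; t <= 5000; ++t) {
		float g = 2.f * (w - 3.f);             // gradient of (w - 3)^2
		beta1t *= beta1;
		m = beta1 * m + (1.f - beta1) * g;     // formula 19
		u = std::max(beta2 * u, std::fabs(g)); // formula 24
		float mhat = m / (1.f - beta1t);       // formula 20
		w -= alpha * mhat / u;                 // formula 25
	}
	std::printf("w after 5000 AdaMax steps: %f (expected to approach 3)\n", w);
	return 0;
}

In this toy, each step moves w by at most about alpha, which illustrates the simpler step-size bound mentioned above.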

   GitHub: https://github.com/fengbingchun/NN_Test
