Commit 55585f5b authored by Evan Shelhamer's avatar Evan Shelhamer
Browse files

adjust local learning rate and decay according to gradient accumulation

Divide local rate by `iter_size` to normalize the gradient according to
the full minibatch size and not only the computational batch size.

Multiply the local decay by `iter_size` to counter the division of the
local learning rate, since the decay term is multiplied by the rate in
the update equation (leaving the effective decay unchanged).
parent 67b1ff31
......@@ -488,7 +488,7 @@ void SGDSolver<Dtype>::ApplyUpdate() {
for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
ComputeUpdateValue(param_id, rate);
ComputeUpdateValue(param_id, rate / this->param_.iter_size());
......@@ -500,7 +500,8 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
Dtype weight_decay = this->param_.weight_decay();
string regularization_type = this->param_.regularization_type();
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
Dtype local_decay = weight_decay * net_params_weight_decay[param_id]
* this->param_.iter_size();
switch (Caffe::mode()) {
case Caffe::CPU: {
if (local_decay) {
Markdown is supported
Attach a file by drag &amp; drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment