Commit 41cf06cc authored by Jonathan L Long, committed by Evan Shelhamer

zero-init param diffs and accumulate gradients

With layers whose backward pass accumulates gradients, this effectively
decouples the computational batch from the SGD minibatch. Each iteration
accumulates gradients over iter_size batches, and only then are the
parameters updated.
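For example, with a data layer batch_size of 32 and iter_size set to 4
(hypothetical values), each parameter update reflects gradients accumulated
over 4 x 32 = 128 instances, while only 32 instances need to be held in
memory for any single forward/backward pass.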
parent b12c1710
@@ -96,7 +96,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: iter_size)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -149,6 +149,8 @@ message SolverParameter {
   // Display the loss averaged over the last average_loss iterations
   optional int32 average_loss = 33 [default = 1];
   optional int32 max_iter = 7; // the maximum number of iterations
+  // accumulate gradients over `iter_size` x `batch_size` instances
+  optional int32 iter_size = 36 [default = 1];
   optional string lr_policy = 8; // The learning rate decay policy.
   optional float gamma = 9; // The parameter to compute the learning rate.
   optional float power = 10; // The parameter to compute the learning rate.
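With this field in place, gradient accumulation can be enabled purely from the
solver definition, for example by adding iter_size: 2 (a hypothetical value) to
a solver prototxt alongside max_iter and the learning-rate policy fields;
leaving it at the default of 1 keeps the previous single-pass behavior.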
@@ -168,6 +168,25 @@ void Solver<Dtype>::Step(int iters) {
   Dtype smoothed_loss = 0;
   while (iter_ < stop_iter) {
+    // zero-init the params
+    for (int i = 0; i < net_->params().size(); ++i) {
+      shared_ptr<Blob<Dtype> > blob = net_->params()[i];
+      switch (Caffe::mode()) {
+      case Caffe::CPU:
+        caffe_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_cpu_diff());
+        break;
+      case Caffe::GPU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
+#else
+        NO_GPU;
+#endif
+        break;
+      }
+    }
+
     if (param_.test_interval() && iter_ % param_.test_interval() == 0
         && (iter_ > 0 || param_.test_initialization())) {
       TestAll();
@@ -175,7 +194,13 @@ void Solver<Dtype>::Step(int iters) {
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    // accumulate the loss and gradient
+    Dtype loss = 0;
+    for (int i = 0; i < param_.iter_size(); ++i) {
+      loss += net_->ForwardBackward(bottom_vec);
+    }
+    loss /= param_.iter_size();
+    // average the loss across iterations for smoothed reporting
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
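Taken together, the two solver.cpp hunks implement the accumulate-then-update
pattern the commit message describes: the parameter diffs are zeroed once per
iteration, iter_size forward/backward passes each add their gradients into
those diffs, and the parameters are updated only afterwards. The following is a
minimal, self-contained C++ sketch of that pattern on a toy objective; the
AccumulateBatchGradient helper, the explicit division by iter_size, and all
numeric values are illustrative assumptions rather than Caffe code.

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for one computational batch's backward pass: it
// *adds* its gradient into diff rather than overwriting it, mirroring
// layers whose backward accumulates gradients.
void AccumulateBatchGradient(const std::vector<double>& param,
                             std::vector<double>* diff) {
  for (std::size_t i = 0; i < param.size(); ++i) {
    (*diff)[i] += 2.0 * param[i];  // toy gradient of sum_i param[i]^2
  }
}

// One solver iteration: zero the diffs, accumulate over iter_size batches,
// then apply a single SGD update from the accumulated gradient.
void Step(std::vector<double>* param, int iter_size, double lr) {
  std::vector<double> diff(param->size(), 0.0);  // zero-init the diffs
  for (int i = 0; i < iter_size; ++i) {
    AccumulateBatchGradient(*param, &diff);  // gradients add up across batches
  }
  for (std::size_t j = 0; j < param->size(); ++j) {
    // Dividing by iter_size averages the accumulated gradient so the step
    // stays comparable to one pass over a single, larger minibatch.
    (*param)[j] -= lr * diff[j] / static_cast<double>(iter_size);
  }
}

int main() {
  std::vector<double> param = {1.0, -2.0, 0.5};
  Step(&param, /*iter_size=*/4, /*lr=*/0.1);     // one update, four "batches"
  for (double p : param) std::cout << p << " ";  // prints 0.8 -1.6 0.4
  std::cout << std::endl;
  return 0;
}

Whether the accumulated gradient is averaged explicitly, as in this sketch, or
the scaling is folded into the learning rate is a separate design choice;
either way, each update depends on iter_size x batch_size instances.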