# 梯度累积算法

## 梯度累积原理

$Loss(\theta)=\frac{1}{2}\left(h(x^{k})-y^{k}\right)^{2}$

$\theta_{i}=\theta_{i-1}-lr * grad_{i}$

$accumulated=\sum_{i=0}^{N} grad_{i}$

$\theta_{i}=\theta_{i-1}-lr * \sum_{k=0}^{N} grad_{k}$

1. 学习率 learning rate：一定条件下，Batch size越大训练效果越好，梯度累积则模拟了Batch size增大的效果，如果accumulation steps为4，则Batch size增大了4倍，根据经验，使用梯度累积的时候需要把学习率适当放大。

2. 归一化 Batch Norm：accumulation steps为4时进行Batch size模拟放大的效果，与真实Batch size相比，数据的分布其实并不完全相同，4倍Batch size的BN计算出来的均值和方差与实际数据均值和方差不太相同，因此有些实现中会使用Group Norm来代替Batch Norm。

## 梯度累积实现

### 单机模式

git clone https://gitee.com/mindspore/models.git


import sys
sys.path.append("path/to/models")  # replace with the local path of the cloned models repository


#### 导入需要的库文件

import argparse
import os
from collections.abc import Iterable

import mindspore.nn as nn
from mindspore import ParameterTuple
from mindspore import context, DatasetHelper, save_checkpoint
from mindspore.nn import Cell
import mindspore.ops as ops
from models.official.cv.lenet.src.dataset import create_dataset
from models.official.cv.lenet.src.lenet import LeNet5


#### 定义训练流程

• TrainForwardBackward 计算loss和梯度，利用grad_sum实现梯度累加。

• TrainOptim 实现参数更新。

• TrainClear 实现对梯度累加变量grad_sum清零。

_sum_op = ops.MultitypeFuncGraph("grad_sum_op")
_clear_op = ops.MultitypeFuncGraph("clear_op")


@_sum_op.register("Tensor", "Tensor")
def _cumulative_grad(grad_sum, grad):
    """Add the newly computed gradient into the running accumulator in place."""
    add = ops.AssignAdd()
    return add(grad_sum, grad)


@_clear_op.register("Tensor", "Tensor")
def _clear_grad_sum(grad_sum, zero):
    """Reset one accumulated-gradient parameter to its zero tensor."""
    success = True
    # ops.depend forces the assign to execute before `success` is returned
    # inside MindSpore's graph mode.
    success = ops.depend(success, ops.assign(grad_sum, zero))
    return success

class TrainForwardBackward(Cell):
    """Run forward + backward and fold the gradients into `grad_sum`.

    Does NOT update the weights; `TrainOptim` applies the accumulated
    gradients and `TrainClear` zeroes the accumulator afterwards.
    """

    def __init__(self, network, optimizer, grad_sum, sens=1.0):
        super(TrainForwardBackward, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad_sum = grad_sum
        self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.hyper_map = ops.HyperMap()

    def construct(self, *inputs):
        weights = self.weights
        loss = self.network(*inputs)
        # Sensitivity tensor (initial dL/dL) shaped like the loss.
        sens = ops.Fill()(ops.DType()(loss), ops.Shape()(loss), self.sens)
        grads = self.grad(self.network, weights)(*inputs, sens)
        # Accumulate each gradient into grad_sum; depend() ties the side
        # effect to the returned loss so graph mode cannot prune it.
        return ops.depend(loss, self.hyper_map(ops.partial(_sum_op), self.grad_sum, grads))

class TrainOptim(Cell):
    """Apply the accumulated gradients `grad_sum` via the optimizer."""

    def __init__(self, optimizer, grad_sum):
        super(TrainOptim, self).__init__(auto_prefix=False)
        self.optimizer = optimizer
        self.grad_sum = grad_sum

    def construct(self):
        return self.optimizer(self.grad_sum)

class TrainClear(Cell):
    """Zero every accumulated-gradient parameter in `grad_sum`."""

    def __init__(self, grad_sum, zeros):
        super(TrainClear, self).__init__(auto_prefix=False)
        self.grad_sum = grad_sum
        self.zeros = zeros
        self.hyper_map = ops.HyperMap()

    def construct(self):
        # Element-wise assign of the pre-built zero tensors over grad_sum.
        success = self.hyper_map(ops.partial(_clear_op), self.grad_sum, self.zeros)
        return success


#### 定义训练模型

class GradientAccumulation:
    """Mini-batch training driver that applies gradients every `mini_steps` batches.

    Wires together three cells: forward/backward with accumulation,
    optimizer application, and accumulator clearing.
    """

    def __init__(self, network, loss_fn, optimizer):
        self._network = network
        self._loss_fn = loss_fn
        self._optimizer = optimizer

        params = self._optimizer.parameters
        # One clone holds the running gradient sums, the other the constant
        # zeros used to reset them after each optimizer step.
        self._grad_sum = params.clone(prefix="grad_sum", init='zeros')
        self._zeros = params.clone(prefix="zeros", init='zeros')
        self._train_forward_backward = self._build_train_forward_backward_network()
        self._train_optim = self._build_train_optim()
        self._train_clear = self._build_train_clear()

    @staticmethod
    def _transform_callbacks(callbacks):
        """Transform callback to a list."""
        if callbacks is None:
            return []

        if isinstance(callbacks, Iterable):
            return list(callbacks)

        return [callbacks]

    def _build_train_forward_backward_network(self):
        """Build forward and backward network"""
        network = self._network
        network = nn.WithLossCell(network, self._loss_fn)
        loss_scale = 1.0
        network = TrainForwardBackward(network, self._optimizer, self._grad_sum, loss_scale).set_train()
        return network

    def _build_train_optim(self):
        """Build optimizer network"""
        network = TrainOptim(self._optimizer, self._grad_sum).set_train()
        return network

    def _build_train_clear(self):
        """Build clear network"""
        network = TrainClear(self._grad_sum, self._zeros).set_train()
        return network

    def train_process(self, epoch, train_dataset, mini_steps=None):
        """
        Training process. The data would be passed to network directly.
        """
        dataset_helper = DatasetHelper(train_dataset, dataset_sink_mode=False, epoch_num=epoch)

        for i in range(epoch):
            step = 0
            for k, next_element in enumerate(dataset_helper):
                loss = self._train_forward_backward(*next_element)
                # Only every mini_steps batches: apply the accumulated
                # gradients, then reset the accumulator.
                if (k + 1) % mini_steps == 0:
                    step += 1
                    print("epoch:", i + 1, "step:", step, "loss is ", loss)
                    self._train_optim()
                    self._train_clear()

            train_dataset.reset()

        save_checkpoint(self._train_forward_backward, "gradient_accumulation.ckpt")



#### 训练并保存模型

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='MindSpore Grad Cumulative Example')
    parser.add_argument('--device_target', type=str, default="GPU", choices=['GPU'],
                        help='device where the code will be implemented (default: GPU)')
    parser.add_argument('--data_path', type=str, default="./Data",
                        help='path where the dataset is saved')
    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    ds_train = create_dataset(os.path.join(args.data_path, "train"), 32)

    net = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(net.trainable_params(), 0.01, 0.9)
    model = GradientAccumulation(net, net_loss, net_opt)

    print("============== Starting Training ==============")
    # mini_steps=4 accumulates 4 mini-batches before each optimizer step,
    # emulating a 4x larger batch size.
    model.train_process(10, ds_train, mini_steps=4)


#### 实验结果

1. 运行训练代码，查看运行结果。

python train.py --data_path=./MNIST_Data


输出如下，可以看到loss值随着训练逐步降低：

epoch: 1 step: 27 loss is  0.3660637
epoch: 1 step: 28 loss is  0.25238192
...
epoch: 3 step: 2 loss is  0.12296932
epoch: 3 step: 3 loss is  0.15799297
...
epoch: 10 step: 448 loss is  0.06443884
epoch: 10 step: 449 loss is  0.0067842817

2. 查看保存的CheckPoint文件。

训练过程中保存了CheckPoint文件gradient_accumulation.ckpt，即模型文件。

python eval.py --data_path=./MNIST_Data --ckpt_path=./gradient_accumulation.ckpt --device_target=GPU


============== Starting Testing ==============
============== {'Accuracy': 0.9631730769230769} ==============


### Boost模式

#### 导入需要的库文件

import argparse
import os

import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Model, context
from mindspore.boost import GradientAccumulation
from mindspore.nn import WithLossCell, TrainOneStepCell, Accuracy
from mindspore.train.callback import LossMonitor, TimeMonitor

from models.official.cv.lenet.src.dataset import create_dataset
from models.official.cv.lenet.src.lenet import LeNet5


#### 定义训练模型

class TrainGradAccumulationStepsCell(TrainOneStepCell):
    """construct train accu step cell

    Wraps TrainOneStepCell so that gradients are accumulated for
    `max_accumulation_step` steps before the optimizer is applied.
    """

    def __init__(self, network, optimizer, sens=1.0, max_accumulation_step=1):
        super(TrainGradAccumulationStepsCell, self).__init__(network, optimizer, sens)
        self.max_accumulation_step = max_accumulation_step
        # NOTE(review): reconstructed to use mindspore.boost.GradientAccumulation,
        # which buffers gradients and steps the optimizer every
        # max_accumulation_step calls — confirm against the installed
        # MindSpore version's boost API.
        self.grad_accumulation = GradientAccumulation(self.max_accumulation_step, self.optimizer)

    def construct(self, *inputs):
        loss = self.network(*inputs)
        sens = ops.fill(loss.dtype, loss.shape, self.sens)
        # self.grad / self.weights / self.grad_reducer come from TrainOneStepCell.
        grads = self.grad(self.network, self.weights)(*inputs, sens)
        grads = self.grad_reducer(grads)
        loss = self.grad_accumulation(loss, grads)
        return loss


#### 训练模型并进行推理

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='MindSpore Grad Cumulative Example')
    parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented (default: Ascend)')
    parser.add_argument('--data_path', type=str, default="./Data",
                        help='path where the dataset is saved')
    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    ds_train = create_dataset(os.path.join(args.data_path, "train"), 32)

    net = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(net.trainable_params(), 0.01, 0.9)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

    # Accumulate 5 mini-batches per optimizer step.
    train_net = nn.WithLossCell(net, net_loss)
    train_net = TrainGradAccumulationStepsCell(train_net, net_opt, 1.0, 5)
    model = Model(train_net)

    print("============== Starting Training ==============")
    model.train(10, ds_train, callbacks=[time_cb, LossMonitor()])

    print("============== Starting Testing ==============")
    # `net` already holds the trained weights; rebuild a Model with metrics
    # to run evaluation.
    model = Model(net, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    ds_eval = create_dataset(os.path.join(args.data_path, "test"), 32, 1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))



#### 实验结果

1. 运行训练与推理代码，查看运行结果。

python train_and_eval_boost.py --data_path=./MNIST_Data


输出如下，可以看到loss值随着训练逐步降低：

epoch: 1 step: 1875 loss is  0.1889342879
...
epoch: 5 step: 1875 loss is  0.11749879342
...
epoch: 10 step: 1875 loss is  0.00029468764328

2. 查看推理精度，代码中会将checkpoint保存到当前目录，随后会加载该checkpoint推理。

============== Starting Testing ==============
============== {'Accuracy': 0.983072916666} ==============


## 参考文献

• [1] Hermans, Joeri R., Gerasimos Spanakis, and Rico Möckel. “Accumulated gradient normalization.” Asian Conference on Machine Learning. PMLR, 2017.

• [2] Lin, Yujun, et al. “Deep gradient compression: Reducing the communication bandwidth for distributed training.” arXiv preprint arXiv:1712.01887 (2017).