# 自适应梯度求和算法

## 概述

\begin{split} \begin{aligned} w^{\prime} &= w_0 - \alpha \cdot [(1 - \frac{g^T_2 \cdot g_1}{2 \cdot ||g_1||^2}) \cdot g_1 + (1 - \frac{g^T_2 \cdot g_1}{2 \cdot ||g_2||^2}) \cdot g_2] \\ &= w_0 - \alpha \cdot Adasum(g_1,g_2) \end{aligned} \end{split}

$Adasum(g_{[0,n]}) = Adasum(Adasum(g_{[0, n/2]}), Adasum(g_{[n/2, n]}))$

## 准备环节

└─sample_code
    ├─ rank_table_16pcs.json
    ├─ resnet.py
    ├─ training.py
    ├─ run_node1.sh
    └─ run_node2.sh

### 配置分布式环境变量

{
"version": "1.0",
"server_count": "2",
"server_list": [
{
"server_id": "10.*.*.*",
"device": [
{"device_id": "0","device_ip": "192.1.27.6","rank_id": "0"},
{"device_id": "1","device_ip": "192.2.27.6","rank_id": "1"},
{"device_id": "2","device_ip": "192.3.27.6","rank_id": "2"},
{"device_id": "3","device_ip": "192.4.27.6","rank_id": "3"},
{"device_id": "4","device_ip": "192.1.27.7","rank_id": "4"},
{"device_id": "5","device_ip": "192.2.27.7","rank_id": "5"},
{"device_id": "6","device_ip": "192.3.27.7","rank_id": "6"},
{"device_id": "7","device_ip": "192.4.27.7","rank_id": "7"}],
"host_nic_ip": "reserve"
},
{
"server_id": "10.*.*.*",
"device": [
{"device_id": "0","device_ip": "192.1.27.8","rank_id": "8"},
{"device_id": "1","device_ip": "192.2.27.8","rank_id": "9"},
{"device_id": "2","device_ip": "192.3.27.8","rank_id": "10"},
{"device_id": "3","device_ip": "192.4.27.8","rank_id": "11"},
{"device_id": "4","device_ip": "192.1.27.9","rank_id": "12"},
{"device_id": "5","device_ip": "192.2.27.9","rank_id": "13"},
{"device_id": "6","device_ip": "192.3.27.9","rank_id": "14"},
{"device_id": "7","device_ip": "192.4.27.9","rank_id": "15"}],
"host_nic_ip": "reserve"
}
],
"status": "completed"
}

### 数据集准备

• 数据集大小：共1000个类、224*224彩色图像

• 训练集：共1,281,167张图像

• 测试集：共50,000张图像

• 数据格式：JPEG

• 下载数据集，目录结构如下：

└─dataset
├─train                 # 训练数据集
└─validation_preprocess # 评估数据集

## 运行模式配置

# Configure graph-mode execution on the Ascend backend, bind this process to its
# local NPU, then initialize the distributed communication collective.
set_context(mode=GRAPH_MODE, device_target="Ascend")
# NOTE(review): os.getenv returns None when DEVICE_ID is unset, so int() would
# raise TypeError here — the launch scripts are expected to export DEVICE_ID.
device_id = int(os.getenv('DEVICE_ID'))
set_context(device_id=device_id)
init()

## 数据并行模式加载数据集

# define train dataset
# Build the distributed (data-parallel) training dataset from <data_path>/train,
# batched at 256 with 224x224 images on Ascend.
train_data_path = os.path.join(args.data_path, "train")
ds_train = create_dataset(dataset_path=train_data_path, do_train=True, batch_size=256, train_image_size=224,
eval_image_size=224, target="Ascend", distribute=True)
# Number of batches per epoch; reused later for the LR schedule, the loss/time
# monitors and the dataset sink size.
step_size = ds_train.get_dataset_size()

## 定义网络

ResNet-50网络的构建代码由resnet.py导入。

# define net
# ResNet-50 (imported from resnet.py) with 1001 output classes — presumably the
# 1000 ImageNet classes plus one extra; confirm against the dataset labels.
net = resnet(num_classes=1001)
# Custom weight initialization supplied by the training script.
init_weight(net=net)

## 定义训练模型

# define loss
# Label-smoothed cross entropy (smooth_factor=0.1) over 1001 classes. A fixed
# loss scale of 1024 is used; drop_overflow_update=False keeps updates applied
# even on overflow steps.
loss = CrossEntropySmooth(sparse=True, reduction="mean", smooth_factor=0.1, num_classes=1001)
loss_scale = FixedLossScaleManager(1024, drop_overflow_update=False)

# define optimizer
# Momentum SGD (momentum 0.9) driven by a linearly decayed LR schedule with
# 5 warmup epochs out of 90 total. The optimizer's loss_scale must match the
# FixedLossScaleManager value above so gradients are unscaled correctly.
group_params = init_group_params(net)
lr = get_lr(lr_init=0, lr_end=0.0, lr_max=0.8, warmup_epochs=5, total_epochs=90, steps_per_epoch=step_size,
            lr_decay_mode="linear")
lr = Tensor(lr)
opt = Momentum(group_params, lr, 0.9, loss_scale=1024)

# define eval_network
# Distributed evaluation cell wrapping the net for cross-device accuracy counts.
dist_eval_network = ClassifyCorrectCell(net)

# define boost config dictionary
# Manual boost mode with less_bn and dim_reduce disabled — presumably this
# leaves only the Adasum gradient fusion active under boost_level="O2";
# confirm against the MindSpore boost documentation.
boost_dict = {
    "boost": {
        "mode": "manual",
        "less_bn": False,
        "dim_reduce": False
    }
}

# define model
# NOTE: the original snippet first constructed a Model without the boost
# configuration and immediately discarded it by rebinding `model` below; that
# dead construction is removed — only this final Model is ever used.
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, amp_level="O2", boost_level="O2",
              keep_batchnorm_fp32=False, boost_config_dict=boost_dict, eval_network=dist_eval_network)

## 训练模型

# define callback
# TimeMonitor reports per-step/per-epoch timing; LossMonitor prints the loss.
cb = [TimeMonitor(data_size=step_size), LossMonitor()]

print("============== Starting Training ==============")
# Train for 90 epochs with dataset sinking enabled; sink_size equal to
# step_size sinks exactly one full epoch of batches per iteration.
model.train(90, ds_train, callbacks=cb, sink_size=step_size, dataset_sink_mode=True)

## 运行脚本

2机16卡训练模型，在机器1上运行脚本run_node1.sh，在机器2上运行脚本run_node2.sh。

bash run_node{i}.sh ./imagenet

# Distributed environment for one node of the 2-node x 8-device (16-rank) job.
# The original extraction fused these statements onto a single line, which a
# shell would mis-parse; they must be separate statements.
export RANK_TABLE_FILE=${EXEC_PATH}/rank_table_16pcs.json
export RANK_SIZE=16
export DEVICE_NUM=8
export SERVER_ID=0
# First global rank hosted on this node.
rank_start=$((DEVICE_NUM * SERVER_ID))

============== Starting Training ==============
epoch: 1 step: 312 loss is  5.5303826
...
epoch: 10 step: 312 loss is  3.3762435
...
...
...