[ "MindSpore Made Easy" ]

[ "MindSpore Made Easy" ]

MindSpore Made Easy: Conversion from PyTorch Source Code into MindSpore Low-Level API Code and Single-Server Single-Device Training on Ascend Processors

August 12, 2022

1. Overview

This blog describes how to convert PyTorch source code into MindSpore low-level API code and how to implement single-server single-device training on Ascend processors. The following figure shows the differences between the training processes of MindSpore high-level APIs, MindSpore low-level APIs, and PyTorch.

[Figure: comparison of the MindSpore high-level API, MindSpore low-level API, and PyTorch training processes]
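For orientation, here is a minimal sketch of the high-level API flow that the rest of this blog replaces with explicit low-level code. The run configuration (graph mode on a single Ascend device) applies to both flows; the network, loss function, and optimizer below are placeholders for illustration only.

import mindspore.nn as nn
from mindspore import Model, context

# Run configuration: graph mode on a single Ascend device.
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=0)

# Placeholder network, loss, and optimizer for illustration only.
net = nn.Dense(32, 10)
loss_function = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)

# High-level flow: Model fuses network, loss, and optimizer, and
# Model.train drives the loop (optionally with data sinking). The
# rest of this blog builds the equivalent training step by hand.
model = Model(net, loss_fn=loss_function, optimizer=opt, metrics={"acc"})
# model.train(epoch_num, ds_train, dataset_sink_mode=True)  # ds_train: a MindSpore dataset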

Similar to MindSpore high-level APIs, low-level API training also requires run configuration, data reading and preprocessing, a network definition, a loss function, and an optimizer.

2. Model Construction (Low-Level APIs)

During model construction, the network prototype and the loss function are encapsulated first. The combined cell is then encapsulated together with an optimizer to form a network that can be trained. Because validation during training needs the accuracy on the training set, the training cell must also return the output of the network.

import numpy as np

import mindspore
import mindspore.nn as nn
from mindspore import Model, Tensor
from mindspore.nn import TrainOneStepCell
from mindspore.ops import functional as F
from mindspore.ops import operations as P

class BuildTrainNetwork(nn.Cell):
    '''Combine the network with its loss function.'''
    def __init__(self, my_network, my_criterion, train_batch_size, class_num):
        super(BuildTrainNetwork, self).__init__()
        self.network = my_network
        self.criterion = my_criterion
        self.print = P.Print()
        # Initialize self.output so the latest network output can be read back
        self.output = mindspore.Parameter(
            Tensor(np.ones((train_batch_size, class_num)), mindspore.float32),
            requires_grad=False)

    def construct(self, input_data, label):
        output = self.network(input_data)
        # Store the network output so TrainOneStepCellV2 can return it
        self.output = output
        loss0 = self.criterion(output, label)
        return loss0
class TrainOneStepCellV2(TrainOneStepCell):
    '''One-step training cell that also returns the network output.'''
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainOneStepCellV2, self).__init__(network, optimizer, sens)

    def construct(self, *inputs):
        weights = self.weights
        loss = self.network(*inputs)
        # Read back the output stored by BuildTrainNetwork
        output = self.network.output
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        # Compute the gradients of the network parameters
        grads = self.grad(self.network, weights)(*inputs, sens)
        grads = self.grad_reducer(grads)
        # Update the model parameters
        loss = F.depend(loss, self.optimizer(grads))
        return loss, output

# Construct the model: wrap network + loss, then add the optimizer
model_constructed = BuildTrainNetwork(net, loss_function, TRAIN_BATCH_SIZE, CLASS_NUM)
model_constructed = TrainOneStepCellV2(model_constructed, opt)
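The names net, loss_function, opt, TRAIN_BATCH_SIZE, and CLASS_NUM come from earlier in the original script and are not part of this excerpt. Below is a minimal sketch of plausible definitions, assuming the ResNet/CIFAR-10 setup implied by the checkpoint prefix in Section 3; resnet50 is a hypothetical network constructor and the hyperparameter values are assumptions.

import mindspore.nn as nn

TRAIN_BATCH_SIZE = 128   # assumed batch size
CLASS_NUM = 10           # CIFAR-10 classes

# Hypothetical constructor for the converted ResNet; the actual
# MindSpore network definition is not shown in this excerpt.
net = resnet50(class_num=CLASS_NUM)

loss_function = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01,
                  momentum=0.9, weight_decay=1e-4)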

3. Training and Validation (Low-Level APIs)

With low-level APIs, the training and validation loops are written out explicitly, much as they would be in PyTorch.

from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
from mindspore.train.callback import RunContext, _InternalCallbackParam

class CorrectLabelNum(nn.Cell):
    '''Count the correctly classified samples in a batch.'''
    def __init__(self):
        super(CorrectLabelNum, self).__init__()
        self.print = P.Print()
        self.argmax = mindspore.ops.Argmax(axis=1)
        self.sum = mindspore.ops.ReduceSum()

    def construct(self, output, target):
        output = self.argmax(output)
        correct = self.sum((output == target).astype(mindspore.dtype.float32))
        return correct

def train_net(model, network, criterion,
              epoch_max, train_path, val_path,
              train_batch_size, val_batch_size,
              repeat_size):
    """Define the training method."""
    # Create the training and validation datasets
    ds_train, steps_per_epoch_train = create_dataset(train_path,
        do_train=True, batch_size=train_batch_size, repeat_num=repeat_size)
    ds_val, steps_per_epoch_val = create_dataset(val_path, do_train=False,
        batch_size=val_batch_size, repeat_num=repeat_size)

    # Checkpoint callback definition
    config_ck = CheckpointConfig(save_checkpoint_steps=steps_per_epoch_train,
                                 keep_checkpoint_max=epoch_max)
    ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10",
                                 directory="./", config=config_ck)

    # Create a dict to save the internal callback object's parameters
    cb_params = _InternalCallbackParam()
    cb_params.train_network = model
    cb_params.epoch_num = epoch_max
    cb_params.batch_num = steps_per_epoch_train
    cb_params.cur_epoch_num = 0
    cb_params.cur_step_num = 0
    run_context = RunContext(cb_params)
    ckpoint_cb.begin(run_context)

    print("============== Starting Training ==============")
    correct_num = CorrectLabelNum()
    correct_num.set_train(False)

    for epoch in range(epoch_max):
        print("\nEpoch:", epoch + 1, "/", epoch_max)
        train_loss = 0
        train_correct = 0
        train_total = 0
        for _, (data, gt_classes) in enumerate(ds_train):
            model.set_train()
            loss, output = model(data, gt_classes)
            train_loss += loss
            correct = correct_num(output, gt_classes)
            correct = correct.asnumpy()
            train_correct += correct.sum()
            # Update the current step number
            cb_params.cur_step_num += 1
            # Let the callback decide whether to save a checkpoint
            ckpoint_cb.step_end(run_context)

        cb_params.cur_epoch_num += 1
        my_train_loss = train_loss / steps_per_epoch_train
        my_train_accuracy = 100 * train_correct / (train_batch_size *
                                                   steps_per_epoch_train)
        print('Train Loss:', my_train_loss)
        print('Train Accuracy:', my_train_accuracy, '%')

        print('evaluating {}/{} ...'.format(epoch + 1, epoch_max))
        val_loss = 0
        val_correct = 0
        for _, (data, gt_classes) in enumerate(ds_val):
            network.set_train(False)
            output = network(data)
            loss = criterion(output, gt_classes)
            val_loss += loss
            correct = correct_num(output, gt_classes)
            correct = correct.asnumpy()
            val_correct += correct.sum()

        my_val_loss = val_loss / steps_per_epoch_val
        my_val_accuracy = 100 * val_correct / (val_batch_size * steps_per_epoch_val)
        print('Validation Loss:', my_val_loss)
        print('Validation Accuracy:', my_val_accuracy, '%')

    print("--------- Training Finished ---------")

4. Script Running

Run the command:

python MindSpore_1P_low_API.py --data_path=xxx --epoch_num=xxx

Run the script in a terminal in the development environment; the training and validation output is printed.
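The script's entry point is not shown in this excerpt. A minimal sketch of how the two flags might be parsed and wired into train_net; the argument names are taken from the command above, while the defaults, VAL_BATCH_SIZE, and the use of one path for both training and validation data are assumptions.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Low-level API training")
    parser.add_argument("--data_path", type=str, required=True,
                        help="Root directory of the dataset")
    parser.add_argument("--epoch_num", type=int, default=90,
                        help="Number of training epochs (assumed default)")
    args = parser.parse_args()

    # train_net is defined in Section 3; model_constructed, net,
    # loss_function, and the batch sizes come from Section 2.
    train_net(model_constructed, net, loss_function,
              args.epoch_num, args.data_path, args.data_path,
              TRAIN_BATCH_SIZE, VAL_BATCH_SIZE, repeat_size=1)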

Note: High-level APIs support model training in data offloading (dataset sink) mode, which low-level APIs do not. Training with high-level APIs is therefore faster. Performance comparison on this workload: low-level APIs, about 2,000 images/sec; high-level APIs, about 2,200 images/sec.