求导

Ascend GPU CPU 模型开发

一阶求导

MindSpore计算一阶导数的方法为mindspore.ops.GradOperation(get_all=False, get_by_list=False, sens_param=False)。其中get_all为False时，只会对第一个输入求导，为True时，会对所有输入求导；get_by_list为False时，不会对权重求导，为True时，会对权重求导；sens_param参数对网络的输出值做缩放以改变最终梯度，故其维度与输出维度保持一致。下面用MatMul算子的一阶求导做深入分析。

输入求导

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from mindspore import ParameterTuple, Parameter
from mindspore import dtype as mstype
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
class Net(nn.Cell):
    """MatMul network with a trainable scaling weight ``z``."""

    def __init__(self):
        super(Net, self).__init__()
        self.matmul = ops.MatMul()
        self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

    def construct(self, x, y):
        # Scale the first input by the weight, then matrix-multiply.
        scaled = x * self.z
        return self.matmul(scaled, y)

class GradNetWrtX(nn.Cell):
    """Wrapper cell computing d(sum(net(x, y)))/dx with ops.GradOperation.

    NOTE(review): the class header, GradOperation setup and construct body
    were lost in extraction; reconstructed from the surrounding tutorial
    (default GradOperation differentiates w.r.t. the first input only).
    """

    def __init__(self, net):
        super(GradNetWrtX, self).__init__()
        self.net = net
        self.grad_op = ops.GradOperation()

    def construct(self, x, y):
        gradient_function = self.grad_op(self.net)
        return gradient_function(x, y)

x = Tensor([[0.8, 0.6, 0.2], [1.8, 1.3, 1.1]], dtype=mstype.float32)
y = Tensor([[0.11, 3.3, 1.1], [1.1, 0.2, 1.4], [1.1, 2.2, 0.3]], dtype=mstype.float32)
output = GradNetWrtX(Net())(x, y)
print(output)


[[4.5099998 2.7 3.6000001]
[4.5099998 2.7 3.6000001]]


x = Tensor([[x1, x2, x3], [x4, x5, x6]])
y = Tensor([[y1, y2, y3], [y4, y5, y6], [y7, y8, y9]])
z = Tensor([z])


$$output = [[(x1 \cdot y1 + x2 \cdot y4 + x3 \cdot y7) \cdot z, (x1 \cdot y2 + x2 \cdot y5 + x3 \cdot y8) \cdot z, (x1 \cdot y3 + x2 \cdot y6 + x3 \cdot y9) \cdot z],$$

$$[(x4 \cdot y1 + x5 \cdot y4 + x6 \cdot y7) \cdot z, (x4 \cdot y2 + x5 \cdot y5 + x6 \cdot y8) \cdot z, (x4 \cdot y3 + x5 \cdot y6 + x6 \cdot y9) \cdot z]]$$

(1) 求和公式：

$$\sum{output} = [(x1 \cdot y1 + x2 \cdot y4 + x3 \cdot y7) + (x1 \cdot y2 + x2 \cdot y5 + x3 \cdot y8) + (x1 \cdot y3 + x2 \cdot y6 + x3 \cdot y9) +$$

$$(x4 \cdot y1 + x5 \cdot y4 + x6 \cdot y7) + (x4 \cdot y2 + x5 \cdot y5 + x6 \cdot y8) + (x4 \cdot y3 + x5 \cdot y6 + x6 \cdot y9)] \cdot z$$

(2) 求导公式：

$$\frac{\mathrm{d}(\sum{output})}{\mathrm{d}x} = [[(y1 + y2 + y3) \cdot z, (y4 + y5 + y6) \cdot z, (y7 + y8 + y9) \cdot z], [(y1 + y2 + y3) \cdot z, (y4 + y5 + y6) \cdot z, (y7 + y8 + y9) \cdot z]]$$

(3) 计算结果：

$$\frac{\mathrm{d}(\sum{output})}{\mathrm{d}x} = [[4.5099998 \quad 2.7 \quad 3.6000001] [4.5099998 \quad 2.7 \quad 3.6000001]]$$

权重求导

class GradNetWrtX(nn.Cell):
    """Wrapper cell differentiating ``net`` w.r.t. its trainable weights.

    NOTE(review): the super().__init__ call, the GradOperation setup and
    the construct body were lost in extraction; reconstructed so the
    snippet matches the printed result (gradient w.r.t. parameter ``z``).
    """

    def __init__(self, net):
        super(GradNetWrtX, self).__init__()
        self.net = net
        self.params = ParameterTuple(net.trainable_params())
        # get_by_list=True: differentiate w.r.t. the supplied weight list
        # instead of the network inputs.
        self.grad_op = ops.GradOperation(get_by_list=True)

    def construct(self, x, y):
        gradient_function = self.grad_op(self.net, self.params)
        return gradient_function(x, y)

output = GradNetWrtX(Net())(x, y)
print(output)


(Tensor(shape=[1], dtype=Float32, value= [ 2.15359993e+01]),)


$$\frac{\mathrm{d}(\sum{output})}{\mathrm{d}z} = (x1 \cdot y1 + x2 \cdot y4 + x3 \cdot y7) + (x1 \cdot y2 + x2 \cdot y5 + x3 \cdot y8) + (x1 \cdot y3 + x2 \cdot y6 + x3 \cdot y9) +$$

$$(x4 \cdot y1 + x5 \cdot y4 + x6 \cdot y7) + (x4 \cdot y2 + x5 \cdot y5 + x6 \cdot y8) + (x4 \cdot y3 + x5 \cdot y6 + x6 \cdot y9)$$

$$\frac{\mathrm{d}(\sum{output})}{\mathrm{d}z} = [2.15359993e+01]$$

梯度值缩放

class GradNetWrtX(nn.Cell):
    """Wrapper cell that scales the output gradient with a sens tensor.

    NOTE(review): the GradOperation(sens_param=True) setup and the
    construct body were lost in extraction; reconstructed per the
    section text (sens tensor must match the network output's shape).
    """

    def __init__(self, net):
        super(GradNetWrtX, self).__init__()
        self.net = net
        # sens_param=True: the gradient function takes an extra tensor
        # that scales the network output before back-propagation.
        self.grad_op = ops.GradOperation(sens_param=True)
        self.grad_wrt_output = Tensor([[0.1, 0.6, 0.2], [0.8, 1.3, 1.1]], dtype=mstype.float32)

    def construct(self, x, y):
        gradient_function = self.grad_op(self.net)
        return gradient_function(x, y, self.grad_wrt_output)

output = GradNetWrtX(Net())(x, y)
print(output)


[[2.211 0.51 1.49 ]
[5.588 2.68 4.07 ]]


self.grad_wrt_output可以记作如下形式：

self.grad_wrt_output = Tensor([[s1, s2, s3], [s4, s5, s6]])


$$output = [[(x1 \cdot y1 + x2 \cdot y4 + x3 \cdot y7) \cdot z \cdot s1, (x1 \cdot y2 + x2 \cdot y5 + x3 \cdot y8) \cdot z \cdot s2, (x1 \cdot y3 + x2 \cdot y6 + x3 \cdot y9) \cdot z \cdot s3],$$

$$[(x4 \cdot y1 + x5 \cdot y4 + x6 \cdot y7) \cdot z \cdot s4, (x4 \cdot y2 + x5 \cdot y5 + x6 \cdot y8) \cdot z \cdot s5, (x4 \cdot y3 + x5 \cdot y6 + x6 \cdot y9) \cdot z \cdot s6]]$$

$$\frac{\mathrm{d}(\sum{output})}{\mathrm{d}x} = [[(s1 \cdot y1 + s2 \cdot y2 + s3 \cdot y3) \cdot z, (s1 \cdot y4 + s2 \cdot y5 + s3 \cdot y6) \cdot z, (s1 \cdot y7 + s2 \cdot y8 + s3 \cdot y9) \cdot z],$$

$$[(s4 \cdot y1 + s5 \cdot y2 + s6 \cdot y3) \cdot z, (s4 \cdot y4 + s5 \cdot y5 + s6 \cdot y6) \cdot z, (s4 \cdot y7 + s5 \cdot y8 + s6 \cdot y9) \cdot z]]$$

class Net(nn.Cell):
    """Same scaled-MatMul network, returning only element [0][0].

    Restricting the output to a single element means only that element
    contributes to the gradient, as the printed result shows.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.matmul = ops.MatMul()
        self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

    def construct(self, x, y):
        scaled = x * self.z
        product = self.matmul(scaled, y)
        return product[0][0]

output = GradNetWrtX(Net())(x, y)
print(output)


[[0.11 1.1 1.1]
[0.   0.  0. ]]


停止计算梯度

import numpy as np
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from mindspore import ParameterTuple, Parameter
from mindspore import dtype as mstype

class Net(nn.Cell):
    """Network with two identical branches; the second is cut from autodiff.

    NOTE(review): this is the "stop gradient" section, and the first
    printed result ([[4.5, 2.7, 3.6], ...]) only matches when out2 is
    excluded from differentiation — the stop_gradient call was lost in
    extraction and is restored here. Removing it reproduces the second
    printed result ([[9.0, 5.4, 7.2], ...]).
    """

    def __init__(self):
        super(Net, self).__init__()
        self.matmul = ops.MatMul()

    def construct(self, x, y):
        out1 = self.matmul(x, y)
        out2 = self.matmul(x, y)
        # Block back-propagation through out2 so only out1 contributes
        # to the gradient.
        out2 = ops.stop_gradient(out2)
        out = out1 + out2
        return out

class GradNetWrtX(nn.Cell):
    """Wrapper cell computing d(sum(net(x, y)))/dx.

    NOTE(review): class header, GradOperation setup, construct body and
    the output line were lost in extraction; reconstructed to match the
    earlier input-gradient example.
    """

    def __init__(self, net):
        super(GradNetWrtX, self).__init__()
        self.net = net
        self.grad_op = ops.GradOperation()

    def construct(self, x, y):
        gradient_function = self.grad_op(self.net)
        return gradient_function(x, y)

x = Tensor([[0.8, 0.6, 0.2], [1.8, 1.3, 1.1]], dtype=mstype.float32)
y = Tensor([[0.11, 3.3, 1.1], [1.1, 0.2, 1.4], [1.1, 2.2, 0.3]], dtype=mstype.float32)
output = GradNetWrtX(Net())(x, y)
print(output)

    [[4.5, 2.7, 3.6],
[4.5, 2.7, 3.6]]


    [[9.0, 5.4, 7.2],
[9.0, 5.4, 7.2]]


高阶求导

MindSpore可通过多次求导的方式支持高阶导数，下面通过几类例子展开阐述。

单输入单输出高阶导数

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

class Net(nn.Cell):
    """Single-input, single-output network: out = sin(x)."""

    def __init__(self):
        super(Net, self).__init__()
        self.sin = ops.Sin()

    def construct(self, x):
        return self.sin(x)

class Grad(nn.Cell):
    """First-order derivative wrapper: returns d(network(x))/dx.

    NOTE(review): class headers, GradOperation setup, construct bodies
    and the driver lines were lost in extraction; reconstructed from the
    standard nested-GradOperation higher-order-derivative pattern.
    """

    def __init__(self, network):
        super(Grad, self).__init__()
        self.grad = ops.GradOperation()
        self.network = network

    def construct(self, x):
        gout = self.grad(self.network)(x)
        return gout

class GradSec(nn.Cell):
    """Second-order derivative wrapper: differentiates a Grad cell."""

    def __init__(self, network):
        super(GradSec, self).__init__()
        self.grad = ops.GradOperation()
        self.network = network

    def construct(self, x):
        gout = self.grad(self.network)(x)
        return gout

net = Net()
firstgrad = Grad(net)            # d(sin x)/dx = cos(x)
secondgrad = GradSec(firstgrad)  # d2(sin x)/dx2 = -sin(x)
x_train = Tensor(np.array([1.0], dtype=np.float32))
output = secondgrad(x_train)
print(output)


[-0.841471]


单输入多输出高阶导数

import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from mindspore import dtype as mstype
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

class Net(nn.Cell):
    """Elementwise square network: out = x * x."""

    def __init__(self):
        super(Net, self).__init__()
        self.mul = ops.Mul()

    def construct(self, x):
        return self.mul(x, x)

class Grad(nn.Cell):
    """First-order derivative wrapper: returns d(network(x))/dx.

    NOTE(review): class headers, GradOperation setup, construct bodies
    and the driver lines were lost in extraction; reconstructed from the
    standard nested-GradOperation pattern (d2(x*x)/dx2 = 2, matching the
    printed [2. 2. 2.]).
    """

    def __init__(self, network):
        super(Grad, self).__init__()
        self.grad = ops.GradOperation()
        self.network = network

    def construct(self, x):
        gout = self.grad(self.network)(x)
        return gout

class GradSec(nn.Cell):
    """Second-order derivative wrapper: differentiates a Grad cell."""

    def __init__(self, network):
        super(GradSec, self).__init__()
        self.grad = ops.GradOperation()
        self.network = network

    def construct(self, x):
        gout = self.grad(self.network)(x)
        return gout

net = Net()
firstgrad = Grad(net)
secondgrad = GradSec(firstgrad)
x = Tensor([0.1, 0.2, 0.3], dtype=mstype.float32)
output = secondgrad(x)
print(output)


[2. 2. 2.]


多输入多输出高阶导数

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

class Net(nn.Cell):
    """Two-input network: out = x^2 * y."""

    def __init__(self):
        super(Net, self).__init__()
        self.mul = ops.Mul()

    def construct(self, x, y):
        x_square = self.mul(x, x)
        return self.mul(x_square, y)

class Grad(nn.Cell):
    """First-order wrapper returning gradients w.r.t. all inputs (dx, dy).

    NOTE(review): class headers and GradOperation setup lines were lost
    in extraction; reconstructed with get_all=True (construct unpacks
    both input gradients) and, for GradSec, sens_param=True (construct
    passes a sens tuple), consistent with the surviving bodies.
    """

    def __init__(self, network):
        super(Grad, self).__init__()
        self.grad = ops.GradOperation(get_all=True)
        self.network = network

    def construct(self, x, y):
        gout = self.grad(self.network)(x, y)  # return dx, dy
        return gout

class GradSec(nn.Cell):
    """Second-order wrapper: selects rows of the Hessian via sens tuples."""

    def __init__(self, network):
        super(GradSec, self).__init__()
        self.grad = ops.GradOperation(get_all=True, sens_param=True)
        self.network = network
        # One-hot sens tensors pick out d(dx)/d* and d(dy)/d* separately.
        self.sens1 = Tensor(np.array([1]).astype('float32'))
        self.sens2 = Tensor(np.array([0]).astype('float32'))

    def construct(self, x, y):
        dxdx, dxdy = self.grad(self.network)(x, y, (self.sens1, self.sens2))
        dydx, dydy = self.grad(self.network)(x, y, (self.sens2, self.sens1))
        return dxdx, dxdy, dydx, dydy

net = Net()
firstgrad = Grad(net)
secondgrad = GradSec(firstgrad)
x_train = Tensor(np.array([4], dtype=np.float32))
y_train = Tensor(np.array([5], dtype=np.float32))
dxdx, dxdy, dydx, dydy = secondgrad(x_train, y_train)
print(dxdx, dxdy, dydx, dydy)


[10] [8.] [8.] [0.]


二阶微分算子支持情况

CPU支持算子：Square、Exp、Neg、Mul、MatMul。

GPU支持算子：Pow、Log、Square、Exp、Neg、Mul、Div、MatMul、Sin、Cos、Tan、Atanh。

Ascend支持算子：Pow、Log、Square、Exp、Neg、Mul、Div、MatMul、Sin、Cos、Tan、Sinh、Cosh、Atanh。

Jvp与Vjp接口

Jvp

Jvp(Jacobian-vector-product)对应的是前向模式的自动微分，适用在输入的维度小于输出的维度的网络中。Jvp会将输入网络的正向运行结果以及微分结果返回出来。不同于反向自动微分，前向自动微分可以在求取网络的原本输出的同时求取其梯度，不需要像反向微分一样保存太多的中间结果，因此前向自动微分相比于反向自动微分往往会节省一定的内存。反向微分与正向微分的区别可以详见自动微分设计

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from mindspore import dtype as mstype
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
class Net(nn.Cell):
    """Two-input network: out = sin(x) + cos(y)."""

    def __init__(self):
        super(Net, self).__init__()
        self.sin = ops.Sin()
        self.cos = ops.Cos()

    def construct(self, x, y):
        left = self.sin(x)
        right = self.cos(y)
        return left + right

class GradNet(nn.Cell):
    """Forward-mode wrapper: nn.Jvp returns (net output, jvp result).

    NOTE(review): the class header, nn.Jvp setup and output line were
    lost in extraction; reconstructed from the Jvp section text and the
    surviving construct body (tangent v applied to both inputs).
    """

    def __init__(self, net):
        super(GradNet, self).__init__()
        self.net = net
        self.grad_op = nn.Jvp(net)

    def construct(self, x, y, v):
        output = self.grad_op(x, y, (v, v))
        return output

x = Tensor([0.8, 0.6, 0.2], dtype=mstype.float32)
y = Tensor([0.7, 0.4, 0.3], dtype=mstype.float32)
v = Tensor([1, 1, 1], dtype=mstype.float32)
output = GradNet(Net())(x, y, v)
print(output)


(Tensor(shape=[3], dtype=Float32, value= [ 1.48219836e+00, 1.48570347e+00, 1.15400589e+00]), Tensor(shape=[3], dtype=Float32, value= [ 5.24890423e-02,
4.35917288e-01, 6.84546351e-01]))


Vjp

Vjp(Vector-jacobian-product), 运行的是反向模式的自动微分。Vjp会将输入网络的前向结果以及微分结果一并输出出来。 反向微分更加适用在输入的维度大于输出维度的网络中，具体内容详见自动微分设计

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore import Tensor
from mindspore import dtype as mstype
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
class Net(nn.Cell):
    """Two-input network: out = sin(x) + cos(y)."""

    def __init__(self):
        super(Net, self).__init__()
        self.sin = ops.Sin()
        self.cos = ops.Cos()

    def construct(self, x, y):
        sine_part = self.sin(x)
        cosine_part = self.cos(y)
        return sine_part + cosine_part

class GradNet(nn.Cell):
    """Reverse-mode wrapper: nn.Vjp returns (net output, vjp result).

    NOTE(review): the class header, nn.Vjp setup, construct body and
    output line were lost in extraction; reconstructed from the Vjp
    section text (cotangent v matches the network output's shape).
    """

    def __init__(self, net):
        super(GradNet, self).__init__()
        self.net = net
        self.grad_op = nn.Vjp(net)

    def construct(self, x, y, v):
        output = self.grad_op(x, y, v)
        return output

x = Tensor([0.8, 0.6, 0.2], dtype=mstype.float32)
y = Tensor([0.7, 0.4, 0.3], dtype=mstype.float32)
v = Tensor([1, 1, 1], dtype=mstype.float32)
output = GradNet(Net())(x, y, v)
print(output)


(Tensor(shape=[3], dtype=Float32, value= [ 1.48219836e+00, 1.48570347e+00, 1.15400589e+00]), (Tensor(shape=[3], dtype=Float32, value= [ 6.96706712e-01,
8.25335622e-01, 9.80066597e-01]), Tensor(shape=[3], dtype=Float32, value= [-6.44217670e-01, -3.89418334e-01, -2.95520216e-01])))


grad接口用于生成输入函数的梯度。利用grad_position和sens_param参数控制梯度的计算方式。grad_position默认是0，表示只对第一个输入求导；为非零int类型时，会对该数字对应索引位置的输入求导；为tuple时，会对tuple内数字索引位置的输入求导。sens_param默认是False，表示不对网络输出值缩放；当sens_param为True时，对网络输出值进行缩放。

grad_position参数控制对特定输入求导。

import numpy as np
import mindspore.nn as nn
import mindspore.context as context
from mindspore import Tensor
context.set_context(mode=context.GRAPH_MODE)
class Net(nn.Cell):
    """Three-input network: out = x * y * z."""

    def construct(self, x, y, z):
        return x * y * z

x = Tensor(np.array([[1, 2], [3, 4]]).astype(np.float32))
y = Tensor(np.array([[-2, 3], [-1, 2]]).astype(np.float32))
z = Tensor(np.array([[0, 3], [5, -1]]).astype(np.float32))
net = Net()
# NOTE(review): the grad call was lost in extraction. Reconstructed from
# the printed output: the two returned tensors equal x*z and x*y, i.e.
# d(out)/dy and d(out)/dz, so grad_position selects inputs 1 and 2.
from mindspore.ops import grad
output = grad(net, grad_position=(1, 2))(x, y, z)
print(output)


(Tensor(shape=[2, 2], dtype=Float32, value=
[[ 0.00000000e+00,  6.00000000e+00],
[ 1.50000000e+01, -4.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
[[-2.00000000e+00,  6.00000000e+00],
[-3.00000000e+00,  8.00000000e+00]]))


sens_param参数控制对网络输出进行缩放。

import numpy as np
import mindspore.nn as nn
import mindspore.context as context
from mindspore import Tensor
context.set_context(mode=context.GRAPH_MODE)
class Net(nn.Cell):
    """Multi-output network: returns (x^2 + y^2 + z^2, x*y*z)."""

    def construct(self, x, y, z):
        sum_of_squares = x**2 + y**2 + z**2
        product = x * y * z
        return sum_of_squares, product

x = Tensor(np.array([[1, 2], [3, 4]]).astype(np.float32))
y = Tensor(np.array([[-2, 3], [-1, 2]]).astype(np.float32))
z = Tensor(np.array([[0, 3], [5, -1]]).astype(np.float32))
v = Tensor(np.array([[-1, 3], [2, 1]]).astype(np.float32))
net = Net()
# NOTE(review): the grad call was lost in extraction. Reconstructed from
# the printed output: the tensors equal v*(2y + x*z) and v*(2z + x*y),
# i.e. gradients w.r.t. inputs 1 and 2 with sens (v, v) scaling the two
# network outputs before back-propagation.
from mindspore.ops import grad
output = grad(net, grad_position=(1, 2), sens_param=True)(x, y, z, (v, v))
print(output)


(Tensor(shape=[2, 2], dtype=Float32, value=
[[ 4.00000000e+00,  3.60000000e+01],
[ 2.60000000e+01,  0.00000000e+00]]), Tensor(shape=[2, 2], dtype=Float32, value=
[[ 2.00000000e+00,  3.60000000e+01],
[ 1.40000000e+01,  6.00000000e+00]]))


函数式jvp

jvp对应前向模式的自动微分，输出网络的正向运算结果和正向微分结果。输出的元组的第一个元素为网络的正向运行结果，第二个元素为网络的正向微分结果。

import numpy as np
import mindspore.nn as nn
import mindspore.context as context
from mindspore.ops import jvp
from mindspore import Tensor
context.set_context(mode=context.GRAPH_MODE)
class Net(nn.Cell):
    """Two-input network for the functional jvp demo: out = x^3 + y."""

    def construct(self, x, y):
        return x**3 + y

x = Tensor(np.array([[1, 2], [3, 4]]).astype(np.float32))
y = Tensor(np.array([[1, 2], [3, 4]]).astype(np.float32))
v = Tensor(np.array([[1, 1], [1, 1]]).astype(np.float32))
# jvp returns (forward result, forward-mode derivative) for tangents (v, v).
output = jvp(Net(), (x, y), (v, v))
print(output)


(Tensor(shape=[2, 2], dtype=Float32, value=
[[ 2.00000000e+00, 1.00000000e+01],
[ 3.00000000e+01, 6.80000000e+01]]), Tensor(shape=[2, 2], dtype=Float32, value=
[[ 4.00000000e+00, 1.30000000e+01],
[ 2.80000000e+01, 4.90000000e+01]]))


函数式vjp

vjp对应反向模式的自动微分，输出网络的正向运算结果和反向微分结果。输出的元组的第一个元素为网络的正向运行结果，第二个元素为网络的反向微分结果。

import numpy as np
import mindspore.nn as nn
import mindspore.context as context
from mindspore.ops import vjp
from mindspore import Tensor
context.set_context(mode=context.GRAPH_MODE)
class Net(nn.Cell):
    """Two-input network for the functional vjp demo: out = x^3 + y."""

    def construct(self, x, y):
        return x**3 + y

x = Tensor(np.array([[1, 2], [3, 4]]).astype(np.float32))
y = Tensor(np.array([[1, 2], [3, 4]]).astype(np.float32))
v = Tensor(np.array([[1, 1], [1, 1]]).astype(np.float32))
# vjp returns (forward result, reverse-mode derivatives) for cotangent v.
output = vjp(Net(), (x, y), v)
print(output)


(Tensor(shape=[2, 2], dtype=Float32, value=
[[ 2.00000000e+00, 1.00000000e+01],
[ 3.00000000e+01, 6.80000000e+01]]), (Tensor(shape=[2, 2], dtype=Float32, value=
[[ 3.00000000e+00, 1.20000000e+01],
[ 2.70000000e+01, 4.80000000e+01]]), Tensor(shape=[2, 2], dtype=Float32, value=
[[ 1.00000000e+00, 1.00000000e+00],
[ 1.00000000e+00, 1.00000000e+00]])))


引用

[1] Zhang L, Han J, Wang H, et al. Deep potential molecular dynamics: a scalable model with the accuracy of quantum mechanics[J]. Physical review letters, 2018, 120(14): 143001.

[2] Raissi M, Perdikaris P, Karniadakis G E. Physics informed deep learning (part i): Data-driven solutions of nonlinear partial differential equations[J]. arXiv preprint arXiv:1711.10561, 2017.

[3] Baydin A G, Pearlmutter B A, Radul A A, et al. Automatic differentiation in machine learning: a survey[J]. The Journal of Machine Learning Research, 2017, 18(1): 5595-5637.