第四天 实现自动求导机制


前3天,我们已经实现了基本的数据结构无缝切换与CPU与GPU,现在实现自动求导机制。

第一步 建立计算节点。

1
2
3
4
5
6
class Node(object):
    def __init__(self):
        self.inputs = []
        self.op = None
        self.const_attr = None
        self.name = ""

每个节点有输入 操作符 常量值 以及 名称

定义操作运算符

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
class Op(object):
    def __call__(self):
        new_node = Node()
        new_node.op = self
        return new_node

    def compute(self, node, input_vals, output_val, use_numpy=True):
        raise NotImplementedError

    def gradient(self, node, output_grad):
        raise NotImplementedError

    def infer_shape(self, node, input_shapes):
        raise NotImplementedError

接下来的各种操作都是继承该运算符 比如 加法运算符

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
class AddOp(Op):
    def __call__(self, node_A, node_B):
        new_node = Op.__call__(self)
        new_node.inputs = [node_A, node_B]
        new_node.name = "(%s+%s)" % (node_A.name, node_B.name)
        return new_node

    def compute(self, node, input_vals, output_val, use_numpy=True):
        assert len(input_vals) == 2
        if use_numpy:
            # output_val[:] allows modify in-place
            output_val[:] = input_vals[0] + input_vals[1]
        else:
            if input_vals[0].shape == input_vals[1].shape:
                gpu_op.matrix_elementwise_add(
                    input_vals[0], input_vals[1], output_val)
            else:
                if input_vals[1].shape == (1,):
                    const_val = input_vals[1].asnumpy()[0]
                    gpu_op.matrix_elementwise_add_by_const(
                        input_vals[0], const_val, output_val)
                elif input_vals[0].shape == (1,):
                    const_val = input_vals[0].asnumpy()[0]
                    gpu_op.matrix_elementwise_add_by_const(
                        input_vals[1], const_val, output_val)

    def gradient(self, node, output_grad):
        return [output_grad, output_grad]

    def infer_shape(self, node, input_shapes):
        assert len(input_shapes) == 2
        assert input_shapes[0] == input_shapes[1]

同时,将Node类添加各种运算符

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
class Node(object):
    def __init__(self):
        self.inputs = []
        self.op = None
        self.const_attr = None
        self.name = ""

    def __add__(self, other):
        if isinstance(other, Node):
            new_node = add_op(self, other)
        else:
            # Add by a constant stores the constant in new node's const_attr
            # 'other' argument is a constant
            new_node = add_byconst_op(self, other)
        return new_node

    def __mul__(self, other):
        if isinstance(other, Node):
            new_node = mul_op(self, other)
        else:
            new_node = mul_byconst_op(self, other)
        return new_node

    __radd__ = __add__
    __rmul__ = __mul__

    def __str__(self):
        return self.name

同时定义Varibale

1
2
3
4
5
6
7
def Variable(name):
    """
        x = Variable(name = "x")
    """
    placeholder_node = placeholder_op()
    placeholder_node.name = name
    return placeholder_node

定义placeholder

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
class PlaceholderOp(Op):
    def __call__(self):
        new_node = Op.__call__(self)
        return new_node

    def compute(self, node, input_vals, output_val, use_numpy=True):
        assert False, "placeholder %s values provided by feed_dict" % node.name

    def gradient(self, node, output_grad):
        return None

    def infer_shape(self, node, input_shapes):
        assert False, "placeholder %s shape provided by feed_shape" % node.name

自动求导机制

在过去的神经网络计算中,都是采用的反向求导算法,但是反向求导有一个不好的地方在于对于内存消耗特别厉害,原因在于反向求导需要记住已经运算过的数值。而本文采用的为逆向自动求导。

如表所示

前向传播完成以后,反向求导直接逆向即可。

那么具体的算法为

那么如何得到计算的次序呢?

答: 拓扑排序。

采用拓扑排序得到没有输入值得节点(即placeholder节点)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
def find_topo_sort(node_list):
    visited = set()
    topo_order = []
    for node in node_list:
        topo_sort_dfs(node, visited, topo_order)
    return topo_order


def topo_sort_dfs(node, visited, topo_order):
    if node in visited:
        return
    visited.add(node)
    for n in node.inputs:
        topo_sort_dfs(n, visited, topo_order)
    topo_order.append(node)

当获得节点依赖信息后,先进行前向传播

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
for node in self.topo_order:
    if node in node_to_val_map:
        # Skip placeholder nodes. Values already provided by feed_dict.
        continue
    input_vals = [node_to_val_map[n] for n in node.inputs]
    if use_numpy:
        node_val = np.empty(shape=self.node_to_shape_map[node])
    else:
        node_val = self.node_to_arr_map[node]
    # node_val is modified in-place whether np.ndarray or NDArray
    node.op.compute(node, input_vals, node_val, use_numpy)
    node_to_val_map[node] = node_val

自动求导反向传播

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
def gradients(output_node, node_list):
    node_to_output_grads_list = {}
    node_to_output_grads_list[output_node] = [oneslike_op(output_node)]
    node_to_output_grad = {}
    reverse_topo_order = reversed(find_topo_sort([output_node]))
    for node in reverse_topo_order:
        node_to_output_grad[node] = sum_node_list(node_to_output_grads_list[node])
        grads = node.op.gradient(node, node_to_output_grad[node])
        for ind, in_node in enumerate(node.inputs):
            if in_node in node_to_output_grads_list:
                node_to_output_grads_list[in_node].append(grads[ind])
            else:
                node_to_output_grads_list[in_node] = [grads[ind]]
    grad_node_list = [node_to_output_grad[node] for node in node_list]
    return grad_node_list

具体实现方式我会另开一篇文章来完成Autodiff的实现方式。

完成

到现在为止,已经实现了基本的功能,接下来进行测试

测试MNIST的效果

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def mnist_mlp(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False):
    W1 = ad.Variable(name="W1")
    W2 = ad.Variable(name="W2")
    W3 = ad.Variable(name="W3")
    b1 = ad.Variable(name="b1")
    b2 = ad.Variable(name="b2")
    b3 = ad.Variable(name="b3")
    X = ad.Variable(name="X")
    y_ = ad.Variable(name="y_")

    # relu(X W1+b1)
    z1 = ad.matmul_op(X, W1)
    z2 = z1 + ad.broadcastto_op(b1, z1)
    z3 = ad.relu_op(z2)

    # relu(z3 W2+b2)
    z4 = ad.matmul_op(z3, W2)
    z5 = z4 + ad.broadcastto_op(b2, z4)
    z6 = ad.relu_op(z5)

    # softmax(z5 W2+b2)
    z7 = ad.matmul_op(z6, W3)
    y = z7 + ad.broadcastto_op(b3, z7)

    loss = ad.softmaxcrossentropy_op(y, y_)

    grad_W1, grad_W2, grad_W3, grad_b1, grad_b2, grad_b3 = ad.gradients(
        loss, [W1, W2, W3, b1, b2, b3])
    executor = ad.Executor(
        [loss, grad_W1, grad_W2, grad_W3, grad_b1, grad_b2, grad_b3, y],
        ctx=executor_ctx)

    datasets = load_mnist_data("mnist.pkl.gz")
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    batch_size = 1000
    n_train_batches = train_set_x.shape[0] // batch_size
    n_valid_batches = valid_set_x.shape[0] // batch_size

    print("Start training loop...")

    W2_val = rand.normal(scale=0.1, size=(256, 100))
    W3_val = rand.normal(scale=0.1, size=(100, 10))
    b1_val = rand.normal(scale=0.1, size=(256))
    b2_val = rand.normal(scale=0.1, size=(100))
    b3_val = rand.normal(scale=0.1, size=(10))
    X_val = np.empty(shape=(batch_size, 784), dtype=np.float32)
    y_val = np.empty(shape=(batch_size, 10), dtype=np.float32)
    valid_X_val = np.empty(shape=(batch_size, 784), dtype=np.float32)
    valid_y_val = np.empty(shape=(batch_size, 10), dtype=np.float32)
    if ndarray.is_gpu_ctx(executor_ctx):
        W1_val = ndarray.array(W1_val, ctx=executor_ctx)
        W2_val = ndarray.array(W2_val, ctx=executor_ctx)
        W3_val = ndarray.array(W3_val, ctx=executor_ctx)
        b1_val = ndarray.array(b1_val, ctx=executor_ctx)
        b2_val = ndarray.array(b2_val, ctx=executor_ctx)
        b3_val = ndarray.array(b3_val, ctx=executor_ctx)
        X_val = ndarray.array(X_val, ctx=executor_ctx)
        y_val = ndarray.array(y_val, ctx=executor_ctx)

    lr = 1.0e-3
    for i in range(num_epochs):
        print("epoch %d" % i)
        for minibatch_index in range(n_train_batches):
            minibatch_start = minibatch_index * batch_size
            minibatch_end = (minibatch_index + 1) * batch_size
            X_val[:] = train_set_x[minibatch_start:minibatch_end]
            y_val[:] = convert_to_one_hot(
                train_set_y[minibatch_start:minibatch_end])
            loss_val, grad_W1_val, grad_W2_val, grad_W3_val, \
                grad_b1_val, grad_b2_val, grad_b3_val, _ = executor.run(
                    feed_dict={
                        X: X_val,
                        y_: y_val,
                        W1: W1_val,
                        W2: W2_val,
                        W3: W3_val,
                        b1: b1_val,
                        b2: b2_val,
                        b3: b3_val})
            # SGD update
            if (executor_ctx is None):
                W1_val = W1_val - lr * grad_W1_val
                W2_val = W2_val - lr * grad_W2_val
                W3_val = W3_val - lr * grad_W3_val
                b1_val = b1_val - lr * grad_b1_val
                b2_val = b2_val - lr * grad_b2_val
                b3_val = b3_val - lr * grad_b3_val
            else:
                sgd_update_gpu(W1_val, grad_W1_val, lr)
                sgd_update_gpu(W2_val, grad_W2_val, lr)
                sgd_update_gpu(W3_val, grad_W3_val, lr)
                sgd_update_gpu(b1_val, grad_b1_val, lr)
                sgd_update_gpu(b2_val, grad_b2_val, lr)
                sgd_update_gpu(b3_val, grad_b3_val, lr)
        if print_loss_val_each_epoch:
            if isinstance(loss_val, ndarray.NDArray):
                print(loss_val.asnumpy())
            else:
                print(loss_val)

    correct_predictions = []
    for minibatch_index in range(n_valid_batches):
        minibatch_start = minibatch_index * batch_size
        minibatch_end = (minibatch_index + 1) * batch_size
        valid_X_val[:] = valid_set_x[minibatch_start:minibatch_end]
        valid_y_val[:] = convert_to_one_hot(
            valid_set_y[minibatch_start:minibatch_end])
        _, _, _, _, _, _, _, valid_y_predicted = executor.run(
            feed_dict={
                X: valid_X_val,
                y_: valid_y_val,
                W1: W1_val,
                W2: W2_val,
                W3: W3_val,
                b1: b1_val,
                b2: b2_val,
                b3: b3_val},
            convert_to_numpy_ret_vals=True)
        correct_prediction = np.equal(
            np.argmax(valid_y_val, 1),
            np.argmax(valid_y_predicted, 1)).astype(np.float)
        correct_predictions.extend(correct_prediction)
    accuracy = np.mean(correct_predictions)
    print("validation set accuracy=%f" % accuracy)

最后的效果在 97%。

总结

    通过这个系列的深度学习框架的搭建,我们学会了基本的深度学习框架的构建流程。首先定义无缝切换的数据结构能够同时运行与CPU与GPU,当将各个操作实现,将其封装为动态链接库,在windows下为DLL文件,在Linux下为.so文件。     同时将C语言对应的深度学习数据结构以Python语言构建,使用ctypes标准库,将Python数据结构转换为C类型的数据结构。     最后实现自动求导的图模型,使用拓扑排序将图模型节点进行排序,通过Op数据结构进行计算,整个的深度学习框架即完成。

不足

    没有实现CUDNN的卷积神经网络,未来有时间会实现CUDA方式的卷积,同时Cython方式的卷积速度也可以,未来会看着方面的资料。完成这个系列,花费了我一个多星期的时间,但是里面肯定还有很多不足,希望大家多提意见。

最终的github为Thunder

参考资料

  1. cse599
  2. Pytorch
  3. Mxnet
  4. autodiff
  5. cs231n
  6. cs224n