[{"data":1,"prerenderedAt":684},["ShallowReactive",2],{"content-query-rrqaiQ4XTC":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":678,"_id":679,"_source":680,"_file":681,"_stem":682,"_extension":683},"/technology-blogs/zh/1533","zh",false,"","【MindSpore开发者分享】模型训练之断点续训初体验","MindSpore提供了断点续训的功能，当用户开启该功能时，如果在训练过程中发生了异常，那么MindSpore会自动保存异常发生时的CheckPoint文件(临终CheckPoint)。","2022-06-02","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/13/5a176f91adef483c924586561c40682c.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":662},"root",[17,25,31,44,50,59,78,83,93,98,131,143,155,160,165,173,184,196,210,219,224,232,243,256,269,277,300,305,316,321,329,334,339,344,352,365,373,385,393,420,431,444,452,457,465,477,482,489,499,507,517,525,536,540,548,568,580,588,598,606],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore开发者分享模型训练之断点续训初体验",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：kaierlong",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"来源：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":19,"props":45,"children":47},{"id":46},"the-first-experience-of-exception_save-for-model-training",[48],{"type":24,"value":49},"the first experience of exception_save for model training",{"type":18,"tag":51,"props":52,"children":53},"blockquote",{},[54],{"type":18,"tag":26,"props":55,"children":56},{},[57],{"type":24,"value":58},"模型训练之断点续训初体验",{"type":18,"tag":26,"props":60,"children":61},{},[62,69,71],{"type":18,"tag":37,"props":63,"children":66},{"href":64,"rel":65},"https://bbs.huaweicloud.com/forumreview/thread-187445-1-1.html",[41],[67],{"type":24,"value":68},"模型训练之断点续训初体验之一",{"type":24,"value":70}," ",{"type":18,"tag":37,"props":72,"children":75},{"href":73,"rel":74},"https://bbs.huaweicloud.com/forumreview/thread-187447-1-1.html",[41],[76],{"type":24,"value":77},"模型训练之断点续训初体验之二",{"type":18,"tag":26,"props":79,"children":80},{},[81],{"type":24,"value":82},"本文开发环境：",{"type":18,"tag":84,"props":85,"children":86},"ul",{},[87],{"type":18,"tag":88,"props":89,"children":90},"li",{},[91],{"type":24,"value":92},"MindSpore 1.7.0",{"type":18,"tag":26,"props":94,"children":95},{},[96],{"type":24,"value":97},"本文内容提要：",{"type":18,"tag":84,"props":99,"children":100},{},[101,106,111,116,121,126],{"type":18,"tag":88,"props":102,"children":103},{},[104],{"type":24,"value":105},"文档示例",{"type":18,"tag":88,"props":107,"children":108},{},[109],{"type":24,"value":110},"几种尝试",{"type":18,"tag":88,"props":112,"children":113},{},[114],{"type":24,"value":115},"源码探究",{"type":18,"tag":88,"props":117,"children":118},{},[119],{"type":24,"value":120},"案例介绍",{"type":18,"tag":88,"props":122,"children":123},{},[124],{"type":24,"value":125},"本文总结",{"type":18,"tag":88,"props":127,"children":128},{},[129],{"type":24,"value":130},"本文参考",{"type":18,"tag":132,"props":133,"children":135},"h2",{"id":134},"_1-文档示例",[136,141],{"type":18,"tag":37,"props":137,"children":140},{"href":138,"rel":139},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#1.-%E6%96%87%E6%A1%A3%E7%A4%BA%E4%BE%8B",[41],[],{"type":24,"value":142},"1. 文档示例",{"type":18,"tag":144,"props":145,"children":147},"h3",{"id":146},"_11-官方文档",[148,153],{"type":18,"tag":37,"props":149,"children":152},{"href":150,"rel":151},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#1.1-%E5%AE%98%E6%96%B9%E6%96%87%E6%A1%A3",[41],[],{"type":24,"value":154},"1.1 官方文档",{"type":18,"tag":26,"props":156,"children":157},{},[158],{"type":24,"value":159},"老传统，先看官方文档说明，说明如下：",{"type":18,"tag":26,"props":161,"children":162},{},[163],{"type":24,"value":164},"笔者解读：exception_save参数是1.7.0版本新加的功能，该参数为bool数据类型，但是官方文档对该参数的使用场景没有明确说明。",{"type":18,"tag":26,"props":166,"children":167},{},[168],{"type":18,"tag":169,"props":170,"children":172},"img",{"alt":7,"src":171},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20225/11/1652252912509431689.png",[],{"type":18,"tag":144,"props":174,"children":176},{"id":175},"_12-官方示例",[177,182],{"type":18,"tag":37,"props":178,"children":181},{"href":179,"rel":180},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#1.2-%E5%AE%98%E6%96%B9%E7%A4%BA%E4%BE%8B",[41],[],{"type":24,"value":183},"1.2 官方示例",{"type":18,"tag":26,"props":185,"children":186},{},[187,189],{"type":24,"value":188},"官方给出的示例如下：",{"type":18,"tag":37,"props":190,"children":193},{"href":191,"rel":192},"https://gitee.com/mindspore/docs/blob/r1.7/tutorials/source_zh_cn/advanced/train/save.ipynb",[41],[194],{"type":24,"value":195},"代码链接",{"type":18,"tag":26,"props":197,"children":198},{},[199,201,208],{"type":24,"value":200},"MindSpore提供了断点续训的功能，当用户开启该功能时，如果在训练过程中发生了异常，那么MindSpore会自动保存异常发生时的CheckPoint文件(临终CheckPoint)。断点续训的功能通过CheckpointConfig中的",{"type":18,"tag":202,"props":203,"children":205},"code",{"className":204},[],[206],{"type":24,"value":207},"exception_save",{"type":24,"value":209},"参数(bool类型)控制，设置为True时开启该功能，False关闭该功能，默认为False。断点续训功能保存的临终CheckPoint文件与正常流程保存的CheckPoint互不影响，命名机制和保存路径与正常流程设置保持一致，唯一不同之处在于会在临终CheckPoint文件名最后加上’_breakpoint’进行区分。其用法如下：",{"type":18,"tag":211,"props":212,"children":214},"pre",{"code":213},"from mindspore.train.callback import ModelCheckpoint, CheckpointConfig\n\n# 配置断点续训功能开启\nconfig_ck = CheckpointConfig(save_checkpoint_steps=32, keep_checkpoint_max=10, exception_save=True)\n",[215],{"type":18,"tag":202,"props":216,"children":217},{"__ignoreMap":7},[218],{"type":24,"value":213},{"type":18,"tag":26,"props":220,"children":221},{},[222],{"type":24,"value":223},"如果在训练过程中发生了异常，那么会自动保存临终CheckPoint，假如在训练中的第10个epoch的第10个step中发生异常，保存的临终CheckPoint文件如下。",{"type":18,"tag":211,"props":225,"children":227},{"code":226},"# 临终CheckPoint文件名最后会加上'_breakpoint'与正常流程CheckPoint区分开\nresnet50-10_10_breakpoint.ckpt\n",[228],{"type":18,"tag":202,"props":229,"children":230},{"__ignoreMap":7},[231],{"type":24,"value":226},{"type":18,"tag":132,"props":233,"children":235},{"id":234},"_2-几种尝试",[236,241],{"type":18,"tag":37,"props":237,"children":240},{"href":238,"rel":239},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#2.-%E5%87%A0%E7%A7%8D%E5%B0%9D%E8%AF%95",[41],[],{"type":24,"value":242},"2. 几种尝试",{"type":18,"tag":26,"props":244,"children":245},{},[246,248,254],{"type":24,"value":247},"在",{"type":18,"tag":202,"props":249,"children":251},{"className":250},[],[252],{"type":24,"value":253},"1.1",{"type":24,"value":255},"中笔者谈到了官方并没有给出使用场景说明，笔者先按照自己的猜测来进行尝试。",{"type":18,"tag":26,"props":257,"children":258},{},[259],{"type":18,"tag":260,"props":261,"children":262},"em",{},[263],{"type":18,"tag":264,"props":265,"children":266},"strong",{},[267],{"type":24,"value":268},"猜测：训练过程中手动终止训练，是否会触发该参数生效。",{"type":18,"tag":26,"props":270,"children":271},{},[272],{"type":18,"tag":264,"props":273,"children":274},{},[275],{"type":24,"value":276},"下面进行代码验证",{"type":18,"tag":26,"props":278,"children":279},{},[280,282,289,291,298],{"type":24,"value":281},"本文使用代码取自笔者之前开源案例",{"type":18,"tag":37,"props":283,"children":286},{"href":284,"rel":285},"https://gitee.com/kaierlong/fashion_mnist_classification_with_cnn_by_mindspore",[41],[287],{"type":24,"value":288},"fashion_mnist_classification_with_cnn_by_mindspore",{"type":24,"value":290},"，并在",{"type":18,"tag":37,"props":292,"children":295},{"href":293,"rel":294},"https://gitee.com/kaierlong/fashion_mnist_classification_with_cnn_by_mindspore/blob/master/main.py",[41],[296],{"type":24,"value":297},"原始代码",{"type":24,"value":299},"基础上进行适当修改。",{"type":18,"tag":26,"props":301,"children":302},{},[303],{"type":24,"value":304},"该案例相关数据处理和运行请参考案例readme。",{"type":18,"tag":144,"props":306,"children":308},{"id":307},"_21-exception_save设置为false",[309,314],{"type":18,"tag":37,"props":310,"children":313},{"href":311,"rel":312},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#2.1-exception_save%E8%AE%BE%E7%BD%AE%E4%B8%BAfalse",[41],[],{"type":24,"value":315},"2.1 exception_save设置为False",{"type":18,"tag":26,"props":317,"children":318},{},[319],{"type":24,"value":320},"测试代码如下：",{"type":18,"tag":211,"props":322,"children":324},{"code":323},"#!/usr/bin/env python3\n# -*- coding: utf-8 -*-\n# -------------------\n# @Version : 1.0\n# @Author : xingchaolong\n# @For : MindSpore FashionMnist LeNet Example.\n# -------------------\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom __future__ import print_function\n\nimport argparse\n\nimport mindspore.dataset as ds\nimport mindspore.nn as nn\nimport mindspore.dataset.transforms.c_transforms as C\nimport mindspore.dataset.vision.c_transforms as CV\n\nfrom mindspore import context\nfrom mindspore import dtype as mstype\nfrom mindspore import Model\nfrom mindspore.common.initializer import Normal\nfrom mindspore.dataset.vision import Inter\nfrom mindspore.nn import Accuracy\nfrom mindspore.train.callback import CheckpointConfig, LossMonitor, ModelCheckpoint\n\n\ndef create_dataset(data_path, usage=\"train\", batch_size=32, repeat_size=1, num_parallel_workers=1):\n    # 定义数据集\n    fashion_mnist_ds = ds.FashionMnistDataset(data_path, usage=usage)\n    resize_height, resize_width = 28, 28\n    rescale = 1.0 / 255.0\n    shift = 0.0\n    rescale_nml = 1 / 0.3081\n    shift_nml = -1 * 0.1307 / 0.3081\n\n    # 定义所需要操作的map映射\n    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)\n    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)\n    rescale_op = CV.Rescale(rescale, shift)\n    hwc2chw_op = CV.HWC2CHW()\n    type_cast_op = C.TypeCast(mstype.int32)\n\n    # 使用map映射函数，将数据操作应用到数据集\n    fashion_mnist_ds = fashion_mnist_ds.map(\n        operations=type_cast_op, input_columns=\"label\", num_parallel_workers=num_parallel_workers)\n    fashion_mnist_ds = fashion_mnist_ds.map(\n        operations=[resize_op, rescale_op, rescale_nml_op, hwc2chw_op],\n        input_columns=\"image\", num_parallel_workers=num_parallel_workers)\n\n    # 进行shuffle、batch、repeat操作\n    buffer_size = 10000\n    fashion_mnist_ds = fashion_mnist_ds.shuffle(buffer_size=buffer_size)\n    fashion_mnist_ds = fashion_mnist_ds.batch(batch_size, drop_remainder=True)\n    fashion_mnist_ds = fashion_mnist_ds.repeat(count=repeat_size)\n\n    return fashion_mnist_ds\n\n\nclass LeNet5(nn.Cell):\n    \"\"\"\n    Lenet网络结构\n    \"\"\"\n    def __init__(self, num_class=10, num_channel=1):\n        super(LeNet5, self).__init__()\n        # 定义所需要的运算\n        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')\n        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')\n        self.fc1 = nn.Dense(16 * 4 * 4, 256, weight_init=Normal(0.02))\n        self.fc2 = nn.Dense(256, 128, weight_init=Normal(0.02))\n        self.fc3 = nn.Dense(128, num_class, weight_init=Normal(0.02))\n        self.relu = nn.ReLU()\n        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)\n        self.flatten = nn.Flatten()\n\n    def construct(self, x):\n        # 使用定义好的运算构建前向网络\n        x = self.conv1(x)\n        x = self.relu(x)\n        x = self.max_pool2d(x)\n        x = self.conv2(x)\n        x = self.relu(x)\n        x = self.max_pool2d(x)\n        x = self.flatten(x)\n        x = self.fc1(x)\n        x = self.relu(x)\n        x = self.fc2(x)\n        x = self.relu(x)\n        x = self.fc3(x)\n        return x\n\n\ndef train_net(model, epoch_size, data_path, batch_size, repeat_size, ckpt_cb, sink_mode):\n    \"\"\"定义训练的方法\"\"\"\n    # 加载训练数据集\n    ds_train = create_dataset(data_path, usage=\"train\", batch_size=batch_size, repeat_size=repeat_size)\n    model.train(epoch_size, ds_train, callbacks=[ckpt_cb, LossMonitor(125)], dataset_sink_mode=sink_mode)\n\n\ndef test_net(model, data_path):\n    \"\"\"定义验证的方法\"\"\"\n    ds_eval = create_dataset(data_path, usage=\"test\")\n    acc = model.eval(ds_eval, dataset_sink_mode=False)\n    print(\"acc: {}\".format(acc), flush=True)\n\n\ndef run(data_path, model_dir, device_target=\"CPU\", batch_size=32, train_epoch=5, dataset_size=1):\n    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)\n\n    net = LeNet5()\n    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')\n    net_opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)\n\n    # 设置模型保存参数\n    config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10, exception_save=False)\n    # 应用模型保存参数\n    ckpt_cb = ModelCheckpoint(prefix=\"lenet_ckpt\", directory=model_dir, config=config_ck)\n\n    model = Model(net, net_loss, net_opt, metrics={\"Accuracy\": Accuracy()})\n    train_net(model, train_epoch, data_path, batch_size, dataset_size, ckpt_cb, False)\n    test_net(model, data_path)\n\n\ndef main():\n    parser = argparse.ArgumentParser(description='MindSpore FashionMnist LeNet Example.')\n    parser.add_argument(\"--data_path\", type=str, required=True, help=\"fashion mnist data path.\")\n    parser.add_argument(\"--device_target\", type=str, default=\"CPU\", choices=['Ascend', 'GPU', 'CPU'],\n                        help=\"target device\")\n    parser.add_argument(\"--model_dir\", type=str, required=True, help=\"directory to save model ckpt.\")\n    parser.add_argument(\"--batch_size\", type=int, default=32, help=\"batch size.\")\n    parser.add_argument(\"--train_epoch\", type=int, default=5, help=\"train epoch.\")\n    parser.add_argument(\"--dataset_size\", type=int, default=1, help=\"dataset size.\")\n\n    args = parser.parse_args()\n\n    run(\n        data_path=args.data_path,\n        model_dir=args.model_dir,\n        device_target=args.device_target,\n        batch_size=args.batch_size,\n        train_epoch=args.train_epoch,\n        dataset_size=args.dataset_size\n    )\n\n\nif __name__ == \"__main__\":\n    main()\n",[325],{"type":18,"tag":202,"props":326,"children":327},{"__ignoreMap":7},[328],{"type":24,"value":323},{"type":18,"tag":26,"props":330,"children":331},{},[332],{"type":24,"value":333},"前台运行代码，命令如下：",{"type":18,"tag":26,"props":335,"children":336},{},[337],{"type":24,"value":338},"./data为数据目录，读者需要自行替换。",{"type":18,"tag":26,"props":340,"children":341},{},[342],{"type":24,"value":343},"./ckpt为模型保存目录，读者需要自行替换。",{"type":18,"tag":211,"props":345,"children":347},{"code":346},"python3 main.py --data_path=./data --model_dir=./ckpt\n",[348],{"type":18,"tag":202,"props":349,"children":350},{"__ignoreMap":7},[351],{"type":24,"value":346},{"type":18,"tag":26,"props":353,"children":354},{},[355,357,363],{"type":24,"value":356},"使用",{"type":18,"tag":202,"props":358,"children":360},{"className":359},[],[361],{"type":24,"value":362},"ctrl+c",{"type":24,"value":364},"命令手动终止命令运行，输出内容如下：",{"type":18,"tag":211,"props":366,"children":368},{"code":367},"epoch: 1 step: 125, loss is 2.2966978549957275\nepoch: 1 step: 250, loss is 2.2930874824523926\nepoch: 1 step: 375, loss is 2.257183074951172\nepoch: 1 step: 500, loss is 1.0803303718566895\n^CWARNING: Logging before InitGoogleLogging() is written to STDERR\n[WARNING] RUNTIME_FRAMEWORK(18086,0x10f6e9dc0,Python):2022-05-11-10:56:54.267.943 [mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc:203] IntHandler] Process 18086 receive KeyboardInterrupt signal.\nTerminated: 15\n",[369],{"type":18,"tag":202,"props":370,"children":371},{"__ignoreMap":7},[372],{"type":24,"value":367},{"type":18,"tag":26,"props":374,"children":375},{},[376,377,383],{"type":24,"value":356},{"type":18,"tag":202,"props":378,"children":380},{"className":379},[],[381],{"type":24,"value":382},"tree ckpt",{"type":24,"value":384},"命令查看模型保存目录情况，输出内容如下：",{"type":18,"tag":211,"props":386,"children":388},{"code":387},"ckpt/\n├── lenet_ckpt-1_100.ckpt\n├── lenet_ckpt-1_200.ckpt\n├── lenet_ckpt-1_300.ckpt\n├── lenet_ckpt-1_400.ckpt\n├── lenet_ckpt-1_500.ckpt\n└── lenet_ckpt-graph.meta\n\n0 directories, 6 files\n",[389],{"type":18,"tag":202,"props":390,"children":391},{"__ignoreMap":7},[392],{"type":24,"value":387},{"type":18,"tag":26,"props":394,"children":395},{},[396],{"type":18,"tag":260,"props":397,"children":398},{},[399],{"type":18,"tag":264,"props":400,"children":401},{},[402,404,410,412,418],{"type":24,"value":403},"解读：可以看到模型保存目录内容正常，并没有",{"type":18,"tag":202,"props":405,"children":407},{"className":406},[],[408],{"type":24,"value":409},"_breakpoint",{"type":24,"value":411},"相关的",{"type":18,"tag":202,"props":413,"children":415},{"className":414},[],[416],{"type":24,"value":417},"ckpt",{"type":24,"value":419},"出现。",{"type":18,"tag":144,"props":421,"children":423},{"id":422},"_22-exception_save设置为true",[424,429],{"type":18,"tag":37,"props":425,"children":428},{"href":426,"rel":427},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#2.2-exception_save%E8%AE%BE%E7%BD%AE%E4%B8%BAtrue",[41],[],{"type":24,"value":430},"2.2 exception_save设置为True",{"type":18,"tag":26,"props":432,"children":433},{},[434,436,442],{"type":24,"value":435},"将",{"type":18,"tag":202,"props":437,"children":439},{"className":438},[],[440],{"type":24,"value":441},"2.1",{"type":24,"value":443},"中测试代码",{"type":18,"tag":211,"props":445,"children":447},{"code":446},"config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10, exception_save=False)\n",[448],{"type":18,"tag":202,"props":449,"children":450},{"__ignoreMap":7},[451],{"type":24,"value":446},{"type":18,"tag":26,"props":453,"children":454},{},[455],{"type":24,"value":456},"修改为",{"type":18,"tag":211,"props":458,"children":460},{"code":459},"config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10, exception_save=True)\n",[461],{"type":18,"tag":202,"props":462,"children":463},{"__ignoreMap":7},[464],{"type":24,"value":459},{"type":18,"tag":466,"props":467,"children":469},"h4",{"id":468},"_221-前台运行代码并手动终止",[470,475],{"type":18,"tag":37,"props":471,"children":474},{"href":472,"rel":473},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#2.2.1-%E5%89%8D%E5%8F%B0%E8%BF%90%E8%A1%8C%E4%BB%A3%E7%A0%81%EF%BC%8C%E5%B9%B6%E6%89%8B%E5%8A%A8%E7%BB%88%E6%AD%A2%E3%80%82",[41],[],{"type":24,"value":476},"2.2.1 前台运行代码，并手动终止。",{"type":18,"tag":26,"props":478,"children":479},{},[480],{"type":24,"value":481},"运行测试代码，命令如下：",{"type":18,"tag":211,"props":483,"children":484},{"code":346},[485],{"type":18,"tag":202,"props":486,"children":487},{"__ignoreMap":7},[488],{"type":24,"value":346},{"type":18,"tag":26,"props":490,"children":491},{},[492,493,498],{"type":24,"value":356},{"type":18,"tag":202,"props":494,"children":496},{"className":495},[],[497],{"type":24,"value":362},{"type":24,"value":364},{"type":18,"tag":211,"props":500,"children":502},{"code":501},"epoch: 1 step: 125, loss is 2.2990877628326416\nepoch: 1 step: 250, loss is 2.3014278411865234\nepoch: 1 step: 375, loss is 2.300143003463745\nepoch: 1 step: 500, loss is 2.2685062885284424\nepoch: 1 step: 625, loss is 1.2246686220169067\n^CWARNING: Logging before InitGoogleLogging() is written to STDERR\n[WARNING] RUNTIME_FRAMEWORK(22670,0x10c621dc0,Python):2022-05-11-10:59:14.927.645 [mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc:203] IntHandler] Process 22670 receive KeyboardInterrupt signal.\nTerminated: 15\n",[503],{"type":18,"tag":202,"props":504,"children":505},{"__ignoreMap":7},[506],{"type":24,"value":501},{"type":18,"tag":26,"props":508,"children":509},{},[510,511,516],{"type":24,"value":356},{"type":18,"tag":202,"props":512,"children":514},{"className":513},[],[515],{"type":24,"value":382},{"type":24,"value":384},{"type":18,"tag":211,"props":518,"children":520},{"code":519},"ckpt/\n├── lenet_ckpt-1_100.ckpt\n├── lenet_ckpt-1_200.ckpt\n├── lenet_ckpt-1_300.ckpt\n├── lenet_ckpt-1_400.ckpt\n├── lenet_ckpt-1_500.ckpt\n├── lenet_ckpt-1_600.ckpt\n└── lenet_ckpt-graph.meta\n\n0 directories, 7 files\n",[521],{"type":18,"tag":202,"props":522,"children":523},{"__ignoreMap":7},[524],{"type":24,"value":519},{"type":18,"tag":466,"props":526,"children":528},{"id":527},"_222-后台运行代码并手动终止",[529,534],{"type":18,"tag":37,"props":530,"children":533},{"href":531,"rel":532},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445#2.2.2-%E5%90%8E%E5%8F%B0%E8%BF%90%E8%A1%8C%E4%BB%A3%E7%A0%81%EF%BC%8C%E5%B9%B6%E6%89%8B%E5%8A%A8%E7%BB%88%E6%AD%A2%E3%80%82",[41],[],{"type":24,"value":535},"2.2.2 后台运行代码，并手动终止。",{"type":18,"tag":26,"props":537,"children":538},{},[539],{"type":24,"value":481},{"type":18,"tag":211,"props":541,"children":543},{"code":542},"nohup python3 main.py --data_path=./data --model_dir=./ckpt &\n",[544],{"type":18,"tag":202,"props":545,"children":546},{"__ignoreMap":7},[547],{"type":24,"value":542},{"type":18,"tag":26,"props":549,"children":550},{},[551,552,558,560,566],{"type":24,"value":356},{"type":18,"tag":202,"props":553,"children":555},{"className":554},[],[556],{"type":24,"value":557},"ps aux|grep main",{"type":24,"value":559},"查看进程id，并使用",{"type":18,"tag":202,"props":561,"children":563},{"className":562},[],[564],{"type":24,"value":565},"kill",{"type":24,"value":567},"命令进行手动终止。",{"type":18,"tag":26,"props":569,"children":570},{},[571,572,578],{"type":24,"value":356},{"type":18,"tag":202,"props":573,"children":575},{"className":574},[],[576],{"type":24,"value":577},"cat nohup.out",{"type":24,"value":579},"查看进程运行情况，输出内容如下：",{"type":18,"tag":211,"props":581,"children":583},{"code":582},"epoch: 1 step: 125, loss is 2.308577537536621\nepoch: 1 step: 250, loss is 2.303668737411499\nepoch: 1 step: 375, loss is 2.3061931133270264\nepoch: 1 step: 500, loss is 1.572475790977478\nepoch: 1 step: 625, loss is 1.2929679155349731\nepoch: 1 step: 750, loss is 0.8329849243164062\n",[584],{"type":18,"tag":202,"props":585,"children":586},{"__ignoreMap":7},[587],{"type":24,"value":582},{"type":18,"tag":26,"props":589,"children":590},{},[591,592,597],{"type":24,"value":356},{"type":18,"tag":202,"props":593,"children":595},{"className":594},[],[596],{"type":24,"value":382},{"type":24,"value":384},{"type":18,"tag":211,"props":599,"children":601},{"code":600},"ckpt/\n├── lenet_ckpt-1_100.ckpt\n├── lenet_ckpt-1_200.ckpt\n├── lenet_ckpt-1_300.ckpt\n├── lenet_ckpt-1_400.ckpt\n├── lenet_ckpt-1_500.ckpt\n├── lenet_ckpt-1_600.ckpt\n├── lenet_ckpt-1_700.ckpt\n├── lenet_ckpt-1_800.ckpt\n└── lenet_ckpt-graph.meta\n\n0 directories, 9 files\n",[602],{"type":18,"tag":202,"props":603,"children":604},{"__ignoreMap":7},[605],{"type":24,"value":600},{"type":18,"tag":26,"props":607,"children":608},{},[609],{"type":18,"tag":260,"props":610,"children":611},{},[612],{"type":18,"tag":264,"props":613,"children":614},{},[615,617,623,625,631,633,638,640,646,648,653,655,660],{"type":24,"value":616},"解读：",{"type":18,"tag":202,"props":618,"children":620},{"className":619},[],[621],{"type":24,"value":622},"2.2.1",{"type":24,"value":624},"和",{"type":18,"tag":202,"props":626,"children":628},{"className":627},[],[629],{"type":24,"value":630},"2.2.2",{"type":24,"value":632},"测试示例中，",{"type":18,"tag":202,"props":634,"children":636},{"className":635},[],[637],{"type":24,"value":207},{"type":24,"value":639},"均设置为",{"type":18,"tag":202,"props":641,"children":643},{"className":642},[],[644],{"type":24,"value":645},"True",{"type":24,"value":647},"。一个为前台运行，手动终止训练；一个为后台运行，杀死训练进程，但是模型保存目录均没有",{"type":18,"tag":202,"props":649,"children":651},{"className":650},[],[652],{"type":24,"value":409},{"type":24,"value":654},"类型的",{"type":18,"tag":202,"props":656,"children":658},{"className":657},[],[659],{"type":24,"value":417},{"type":24,"value":661},"生成，也就是说此处测试示例的用法不对",{"title":7,"searchDepth":663,"depth":663,"links":664},4,[665,671],{"id":134,"depth":666,"text":142,"children":667},2,[668,670],{"id":146,"depth":669,"text":154},3,{"id":175,"depth":669,"text":183},{"id":234,"depth":666,"text":242,"children":672},[673,674],{"id":307,"depth":669,"text":315},{"id":422,"depth":669,"text":430,"children":675},[676,677],{"id":468,"depth":663,"text":476},{"id":527,"depth":663,"text":535},"markdown","content:technology-blogs:zh:1533.md","content","technology-blogs/zh/1533.md","technology-blogs/zh/1533","md",1776506113174]