[{"data":1,"prerenderedAt":639},["ShallowReactive",2],{"content-query-A1fb9w8pGt":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":10,"date":11,"cover":12,"type":13,"category":14,"body":15,"_type":633,"_id":634,"_source":635,"_file":636,"_stem":637,"_extension":638},"/technology-blogs/en/1845","en",false,"",[9],"MindSpore Made Easy","Experience on using resumable training at a checkpoint.","2022-06-14","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/29/c09ccd81b71c4af5b87240bc2625a9a6.png","technology-blogs","Developer Sharing",{"type":16,"children":17,"toc":630},"root",[18,32,38,43,56,61,66,71,76,81,86,91,96,101,110,118,130,138,146,151,156,201,211,216,227,232,240,245,253,258,263,268,276,281,289,297,302,312,322,327,339,344,349,354,359,364,369,374,386,394,405,413,418,423,428,433,441,446,450,459,464,469,474,479,484,488,493,497,506,514,522,526,531,549,560,565,570,575,580,585,590,599,607],{"type":19,"tag":20,"props":21,"children":23},"element","h1",{"id":22},"mindspore-made-easy-model-training-resumable-training-at-a-checkpoint-1",[24,30],{"type":19,"tag":25,"props":26,"children":27},"span",{},[28],{"type":29,"value":9},"text",{"type":29,"value":31}," Model Training - Resumable Training at a Checkpoint (1)",{"type":19,"tag":33,"props":34,"children":35},"p",{},[36],{"type":29,"value":37},"June 13, 2022",{"type":19,"tag":33,"props":39,"children":40},{},[41],{"type":29,"value":42},"Author: kaierlong",{"type":19,"tag":33,"props":44,"children":45},{},[46,48],{"type":29,"value":47},"Source: ",{"type":19,"tag":49,"props":50,"children":54},"a",{"href":51,"rel":52},"https://bbs.huaweicloud.com/forum/forum.php?mod=viewthread&tid=187445",[53],"nofollow",[55],{"type":29,"value":51},{"type":19,"tag":33,"props":57,"children":58},{},[59],{"type":29,"value":60},"Development environment",{"type":19,"tag":33,"props":62,"children":63},{},[64],{"type":29,"value":65},"MindSpore 
1.7.0",{"type":19,"tag":33,"props":67,"children":68},{},[69],{"type":29,"value":70},"Contents",{"type":19,"tag":33,"props":72,"children":73},{},[74],{"type":29,"value":75},"· Examples in Documents",{"type":19,"tag":33,"props":77,"children":78},{},[79],{"type":29,"value":80},"· Guess and Verification",{"type":19,"tag":33,"props":82,"children":83},{},[84],{"type":29,"value":85},"· Source Code Exploration",{"type":19,"tag":33,"props":87,"children":88},{},[89],{"type":29,"value":90},"· Cases",{"type":19,"tag":33,"props":92,"children":93},{},[94],{"type":29,"value":95},"· Summary",{"type":19,"tag":33,"props":97,"children":98},{},[99],{"type":29,"value":100},"· Reference",{"type":19,"tag":33,"props":102,"children":103},{},[104],{"type":19,"tag":105,"props":106,"children":107},"strong",{},[108],{"type":29,"value":109},"1. Examples in Documents",{"type":19,"tag":33,"props":111,"children":112},{},[113],{"type":19,"tag":105,"props":114,"children":115},{},[116],{"type":29,"value":117},"1.1 Official Document",{"type":19,"tag":33,"props":119,"children":120},{},[121,123,128],{"type":29,"value":122},"The ",{"type":19,"tag":105,"props":124,"children":125},{},[126],{"type":29,"value":127},"exception_save",{"type":29,"value":129}," parameter (bool type) controls the new resumable training function added in MindSpore 1.7.0, but the official document does not specify its application scenarios. 
See the following figure.",{"type":19,"tag":33,"props":131,"children":132},{},[133],{"type":19,"tag":134,"props":135,"children":137},"img",{"alt":7,"src":136},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/09/29/0fa47328f46840ac9c31baee41c49df4.png",[],{"type":19,"tag":33,"props":139,"children":140},{},[141],{"type":19,"tag":105,"props":142,"children":143},{},[144],{"type":29,"value":145},"1.2 Official Example",{"type":19,"tag":33,"props":147,"children":148},{},[149],{"type":29,"value":150},"For details about the official example, see Saving and Exporting Models.",{"type":19,"tag":33,"props":152,"children":153},{},[154],{"type":29,"value":155},"MindSpore provides the resumable training function. When it is enabled, if an exception occurs during training, MindSpore automatically saves the checkpoint file (last checkpoint) generated when the exception occurs.",{"type":19,"tag":33,"props":157,"children":158},{},[159,161,165,167,172,174,179,181,186,188,192,194,199],{"type":29,"value":160},"Resumable training is controlled by the ",{"type":19,"tag":105,"props":162,"children":163},{},[164],{"type":29,"value":127},{"type":29,"value":166}," parameter (bool type) in ",{"type":19,"tag":105,"props":168,"children":169},{},[170],{"type":29,"value":171},"CheckpointConfig",{"type":29,"value":173},". If this parameter is set to ",{"type":19,"tag":105,"props":175,"children":176},{},[177],{"type":29,"value":178},"True",{"type":29,"value":180},", resumable training is enabled. If it is set to ",{"type":19,"tag":105,"props":182,"children":183},{},[184],{"type":29,"value":185},"False",{"type":29,"value":187},", resumable training is disabled. The default value is ",{"type":19,"tag":105,"props":189,"children":190},{},[191],{"type":29,"value":185},{"type":29,"value":193},". The last checkpoint file saved in resumable training and the checkpoint files saved in the normal process do not affect each other. Their naming mechanism and save path are the same. 
The only difference is that ",{"type":19,"tag":105,"props":195,"children":196},{},[197],{"type":29,"value":198},"_breakpoint",{"type":29,"value":200}," will be added to the end of the last checkpoint file name. The parameter usage is as follows:",{"type":19,"tag":202,"props":203,"children":205},"pre",{"code":204},"from mindspore.train.callback import ModelCheckpoint, CheckpointConfig\n\n# Enable resumable training.\n\nconfig_ck = CheckpointConfig(save_checkpoint_steps=32, keep_checkpoint_max=10, exception_save=True)\n",[206],{"type":19,"tag":207,"props":208,"children":209},"code",{"__ignoreMap":7},[210],{"type":29,"value":204},{"type":19,"tag":33,"props":212,"children":213},{},[214],{"type":29,"value":215},"If an exception occurs during training, the last checkpoint is automatically saved. If the exception occurs in the tenth step of the tenth epoch during training, the saved last checkpoint file is as follows:",{"type":19,"tag":33,"props":217,"children":218},{},[219,221,225],{"type":29,"value":220},"# The name of the last checkpoint file is suffixed with ",{"type":19,"tag":105,"props":222,"children":223},{},[224],{"type":29,"value":198},{"type":29,"value":226}," to distinguish it from the checkpoint files in the normal process.",{"type":19,"tag":33,"props":228,"children":229},{},[230],{"type":29,"value":231},"resnet50-10_10_breakpoint.ckpt",{"type":19,"tag":33,"props":233,"children":234},{},[235],{"type":19,"tag":105,"props":236,"children":237},{},[238],{"type":29,"value":239},"2. 
Guess and Verification",{"type":19,"tag":33,"props":241,"children":242},{},[243],{"type":29,"value":244},"In section 1.1, I mentioned that the official website does not provide the application scenarios of this parameter, so I'll make a guess and verify it.",{"type":19,"tag":33,"props":246,"children":247},{},[248],{"type":19,"tag":105,"props":249,"children":250},{},[251],{"type":29,"value":252},"Guess: The parameter takes effect when the training is manually terminated.",{"type":19,"tag":33,"props":254,"children":255},{},[256],{"type":29,"value":257},"Next, I'll verify it using code.",{"type":19,"tag":33,"props":259,"children":260},{},[261],{"type":29,"value":262},"I selected the source code of my own open source case fashion_mnist_classification_with_cnn_by_mindspore and made some modifications.",{"type":19,"tag":33,"props":264,"children":265},{},[266],{"type":29,"value":267},"For details about the data processing and execution of this case, see its README.md.",{"type":19,"tag":33,"props":269,"children":270},{},[271],{"type":19,"tag":105,"props":272,"children":273},{},[274],{"type":29,"value":275},"2.1 Setting exception_save to False",{"type":19,"tag":33,"props":277,"children":278},{},[279],{"type":29,"value":280},"The test code is as follows:",{"type":19,"tag":202,"props":282,"children":284},{"code":283},"#!/usr/bin/env python3\n\n# -*- coding: utf-8 -*-\n\n# -------------------\n\n# @Version : 1.0\n\n# @Author : xingchaolong\n\n# @For : MindSpore FashionMnist LeNet Example.\n\n# -------------------\n\nfrom __future__ import absolute_import\n\nfrom __future__ import division\n\nfrom __future__ import print_function\n\n\n\nimport argparse\n\n\n\nimport mindspore.dataset as ds\n\nimport mindspore.nn as nn\n\nimport mindspore.dataset.transforms.c_transforms as C\n\nimport mindspore.dataset.vision.c_transforms as CV\n\n\n\nfrom mindspore import context\n\nfrom mindspore import dtype as mstype\n\nfrom mindspore import Model\n\nfrom mindspore.common.initializer import 
Normal\n\nfrom mindspore.dataset.vision import Inter\n\nfrom mindspore.nn import Accuracy\n\nfrom mindspore.train.callback import CheckpointConfig, LossMonitor, ModelCheckpoint\n\n\n\n\n\ndef create_dataset(data_path, usage=\"train\", batch_size=32, repeat_size=1, num_parallel_workers=1):\n\n    # Define the dataset.\n\n    fashion_mnist_ds = ds.FashionMnistDataset(data_path, usage=usage)\n\n    resize_height, resize_width = 28, 28\n\n    rescale = 1.0 / 255.0\n\n    shift = 0.0\n\n    rescale_nml = 1 / 0.3081\n\n    shift_nml = -1 * 0.1307 / 0.3081\n\n    # Define the mapping to be operated.\n\n    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)\n\n    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)\n\n    rescale_op = CV.Rescale(rescale, shift)\n\n    hwc2chw_op = CV.HWC2CHW()\n\n    type_cast_op = C.TypeCast(mstype.int32)\n\n\n    # Use the map function to apply data operations to the dataset.\n\n    fashion_mnist_ds = fashion_mnist_ds.map(\n\n        operations=type_cast_op, input_columns=\"label\", num_parallel_workers=num_parallel_workers)\n\n    fashion_mnist_ds = fashion_mnist_ds.map(\n\n        operations=[resize_op, rescale_op, rescale_nml_op, hwc2chw_op],\n\n        input_columns=\"image\", num_parallel_workers=num_parallel_workers)\n\n\n    # Perform shuffle, batch, and repeat operations.\n\n    buffer_size = 10000\n\n    fashion_mnist_ds = fashion_mnist_ds.shuffle(buffer_size=buffer_size)\n\n    fashion_mnist_ds = fashion_mnist_ds.batch(batch_size, drop_remainder=True)\n\n    fashion_mnist_ds = fashion_mnist_ds.repeat(count=repeat_size)\n\n\n    return fashion_mnist_ds\n",[285],{"type":19,"tag":207,"props":286,"children":287},{"__ignoreMap":7},[288],{"type":29,"value":283},{"type":19,"tag":202,"props":290,"children":292},{"code":291},"class LeNet5(nn.Cell):\n\n    \"\"\"\n\n    LeNet network structure\n\n    \"\"\"\n\n    def __init__(self, num_class=10, num_channel=1):\n\n        super(LeNet5, self).__init__()\n\n  
      # Define the required operations.\n\n        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')\n\n        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')\n\n        self.fc1 = nn.Dense(16 * 4 * 4, 256, weight_init=Normal(0.02))\n\n        self.fc2 = nn.Dense(256, 128, weight_init=Normal(0.02))\n\n        self.fc3 = nn.Dense(128, num_class, weight_init=Normal(0.02))\n\n        self.relu = nn.ReLU()\n\n        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)\n\n        self.flatten = nn.Flatten()\n\n\n\n    def construct(self, x):\n\n        # Use the defined operations to build a feedfoward network.\n\n        x = self.conv1(x)\n\n        x = self.relu(x)\n\n        x = self.max_pool2d(x)\n\n        x = self.conv2(x)\n\n        x = self.relu(x)\n\n        x = self.max_pool2d(x)\n\n        x = self.flatten(x)\n\n        x = self.fc1(x)\n\n        x = self.relu(x)\n\n        x = self.fc2(x)\n\n        x = self.relu(x)\n\n        x = self.fc3(x)\n\n        return x\n\n\n\n\n\ndef train_net(model, epoch_size, data_path, batch_size, repeat_size, ckpt_cb, sink_mode):\n\n    \"\"\"Define the training method.\"\"\"\n\n    # Load the training dataset.\n\n    ds_train = create_dataset(data_path, usage=\"train\", batch_size=batch_size, repeat_size=repeat_size)\n\n    model.train(epoch_size, ds_train, callbacks=[ckpt_cb, LossMonitor(125)], dataset_sink_mode=sink_mode)\n\n\n\ndef test_net(model, data_path):\n\n    \"\"\"Define the verification method.\"\"\"\n\n    ds_eval = create_dataset(data_path, usage=\"test\")\n\n    acc = model.eval(ds_eval, dataset_sink_mode=False)\n\n    print(\"acc: {}\".format(acc), flush=True)\n\n\n\ndef run(data_path, model_dir, device_target=\"CPU\", batch_size=32, train_epoch=5, dataset_size=1):\n\n    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)\n\n\n    net = LeNet5()\n\n    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')\n\n    net_opt = 
nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)\n\n\n\n    # Set the model saving parameter.\n\n    config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10, exception_save=False)\n\n    # Apply the model saving parameter.\n\n    ckpt_cb = ModelCheckpoint(prefix=\"lenet_ckpt\", directory=model_dir, config=config_ck)\n\n\n    model = Model(net, net_loss, net_opt, metrics={\"Accuracy\": Accuracy()})\n\n    train_net(model, train_epoch, data_path, batch_size, dataset_size, ckpt_cb, False)\n\n    test_net(model, data_path)\n\n\ndef main():\n\n    parser = argparse.ArgumentParser(description='MindSpore FashionMnist LeNet Example.')\n\n    parser.add_argument(\"--data_path\", type=str, required=True, help=\"fashion mnist data path.\")\n\n    parser.add_argument(\"--device_target\", type=str, default=\"CPU\", choices=['Ascend', 'GPU', 'CPU'],\n\n                        help=\"target device\")\n\n    parser.add_argument(\"--model_dir\", type=str, required=True, help=\"directory to save model ckpt.\")\n\n    parser.add_argument(\"--batch_size\", type=int, default=32, help=\"batch size.\")\n\n    parser.add_argument(\"--train_epoch\", type=int, default=5, help=\"train epoch.\")\n\n    parser.add_argument(\"--dataset_size\", type=int, default=1, help=\"dataset size.\")\n\n\n    args = parser.parse_args()\n\n\n    run(\n\n        data_path=args.data_path,\n\n        model_dir=args.model_dir,\n\n        device_target=args.device_target,\n\n        batch_size=args.batch_size,\n\n        train_epoch=args.train_epoch,\n\n        dataset_size=args.dataset_size\n\n    )\n\n\nif __name__ == \"__main__\":\n\n    main()\n",[293],{"type":19,"tag":207,"props":294,"children":295},{"__ignoreMap":7},[296],{"type":29,"value":291},{"type":19,"tag":33,"props":298,"children":299},{},[300],{"type":29,"value":301},"Run the following command to execute the code on the 
foreground:",{"type":19,"tag":33,"props":303,"children":304},{},[305,310],{"type":19,"tag":105,"props":306,"children":307},{},[308],{"type":29,"value":309},"./data",{"type":29,"value":311}," is the data directory. Replace it as required.",{"type":19,"tag":33,"props":313,"children":314},{},[315,320],{"type":19,"tag":105,"props":316,"children":317},{},[318],{"type":29,"value":319},"./ckpt",{"type":29,"value":321}," is the model saving directory. Replace it as required.",{"type":19,"tag":33,"props":323,"children":324},{},[325],{"type":29,"value":326},"python3 main.py --data_path=./data --model_dir=./ckpt",{"type":19,"tag":33,"props":328,"children":329},{},[330,332,337],{"type":29,"value":331},"Press ",{"type":19,"tag":105,"props":333,"children":334},{},[335],{"type":29,"value":336},"Ctrl+C",{"type":29,"value":338}," to manually stop the command execution. The output is as follows:",{"type":19,"tag":33,"props":340,"children":341},{},[342],{"type":29,"value":343},"epoch: 1 step: 125, loss is 2.2966978549957275",{"type":19,"tag":33,"props":345,"children":346},{},[347],{"type":29,"value":348},"epoch: 1 step: 250, loss is 2.2930874824523926",{"type":19,"tag":33,"props":350,"children":351},{},[352],{"type":29,"value":353},"epoch: 1 step: 375, loss is 2.257183074951172",{"type":19,"tag":33,"props":355,"children":356},{},[357],{"type":29,"value":358},"epoch: 1 step: 500, loss is 1.0803303718566895",{"type":19,"tag":33,"props":360,"children":361},{},[362],{"type":29,"value":363},"^CWARNING: Logging before InitGoogleLogging() is written to STDERR",{"type":19,"tag":33,"props":365,"children":366},{},[367],{"type":29,"value":368},"[WARNING] RUNTIME_FRAMEWORK(18086,0x10f6e9dc0,Python):2022-05-11-10:56:54.267.943 [mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc:203] IntHandler] Process 18086 receive KeyboardInterrupt signal.",{"type":19,"tag":33,"props":370,"children":371},{},[372],{"type":29,"value":373},"Terminated: 
15",{"type":19,"tag":33,"props":375,"children":376},{},[377,379,384],{"type":29,"value":378},"Run the ",{"type":19,"tag":105,"props":380,"children":381},{},[382],{"type":29,"value":383},"tree ckpt",{"type":29,"value":385}," command to check the model saving directory. The output is as follows:",{"type":19,"tag":202,"props":387,"children":389},{"code":388},"ckpt/\n\n├── lenet_ckpt-1_100.ckpt\n\n├── lenet_ckpt-1_200.ckpt\n\n├── lenet_ckpt-1_300.ckpt\n\n├── lenet_ckpt-1_400.ckpt\n\n├── lenet_ckpt-1_500.ckpt\n\n└── lenet_ckpt-graph.meta\n\n\n0 directories, 6 files\n",[390],{"type":19,"tag":207,"props":391,"children":392},{"__ignoreMap":7},[393],{"type":29,"value":388},{"type":19,"tag":33,"props":395,"children":396},{},[397,399,403],{"type":29,"value":398},"Interpretation: The content of the model saving directory is normal, and no .ckpt file related to ",{"type":19,"tag":105,"props":400,"children":401},{},[402],{"type":29,"value":198},{"type":29,"value":404}," is displayed.",{"type":19,"tag":33,"props":406,"children":407},{},[408],{"type":19,"tag":105,"props":409,"children":410},{},[411],{"type":29,"value":412},"2.2 Setting exception_save to True",{"type":19,"tag":33,"props":414,"children":415},{},[416],{"type":29,"value":417},"Change the following test code used in section 2.1",{"type":19,"tag":33,"props":419,"children":420},{},[421],{"type":29,"value":422},"config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10, exception_save=False)",{"type":19,"tag":33,"props":424,"children":425},{},[426],{"type":29,"value":427},"to",{"type":19,"tag":33,"props":429,"children":430},{},[431],{"type":29,"value":432},"config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10, exception_save=True)",{"type":19,"tag":33,"props":434,"children":435},{},[436],{"type":19,"tag":105,"props":437,"children":438},{},[439],{"type":29,"value":440},"2.2.1 Executing Code in the Foreground and Manually Terminating the 
Execution",{"type":19,"tag":33,"props":442,"children":443},{},[444],{"type":29,"value":445},"Run the following command to execute the test code:",{"type":19,"tag":33,"props":447,"children":448},{},[449],{"type":29,"value":326},{"type":19,"tag":33,"props":451,"children":452},{},[453,454,458],{"type":29,"value":331},{"type":19,"tag":105,"props":455,"children":456},{},[457],{"type":29,"value":336},{"type":29,"value":338},{"type":19,"tag":33,"props":460,"children":461},{},[462],{"type":29,"value":463},"epoch: 1 step: 125, loss is 2.2990877628326416",{"type":19,"tag":33,"props":465,"children":466},{},[467],{"type":29,"value":468},"epoch: 1 step: 250, loss is 2.3014278411865234",{"type":19,"tag":33,"props":470,"children":471},{},[472],{"type":29,"value":473},"epoch: 1 step: 375, loss is 2.300143003463745",{"type":19,"tag":33,"props":475,"children":476},{},[477],{"type":29,"value":478},"epoch: 1 step: 500, loss is 2.2685062885284424",{"type":19,"tag":33,"props":480,"children":481},{},[482],{"type":29,"value":483},"epoch: 1 step: 625, loss is 1.2246686220169067",{"type":19,"tag":33,"props":485,"children":486},{},[487],{"type":29,"value":363},{"type":19,"tag":33,"props":489,"children":490},{},[491],{"type":29,"value":492},"[WARNING] RUNTIME_FRAMEWORK(22670,0x10c621dc0,Python):2022-05-11-10:59:14.927.645 [mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc:203] IntHandler] Process 22670 receive KeyboardInterrupt signal.",{"type":19,"tag":33,"props":494,"children":495},{},[496],{"type":29,"value":373},{"type":19,"tag":33,"props":498,"children":499},{},[500,501,505],{"type":29,"value":378},{"type":19,"tag":105,"props":502,"children":503},{},[504],{"type":29,"value":383},{"type":29,"value":385},{"type":19,"tag":202,"props":507,"children":509},{"code":508},"ckpt/\n\n├── lenet_ckpt-1_100.ckpt\n\n├── lenet_ckpt-1_200.ckpt\n\n├── lenet_ckpt-1_300.ckpt\n\n├── lenet_ckpt-1_400.ckpt\n\n├── lenet_ckpt-1_500.ckpt\n\n├── lenet_ckpt-1_600.ckpt\n\n└── lenet_ckpt-graph.meta\n\n\n0 
directories, 7 files\n",[510],{"type":19,"tag":207,"props":511,"children":512},{"__ignoreMap":7},[513],{"type":29,"value":508},{"type":19,"tag":33,"props":515,"children":516},{},[517],{"type":19,"tag":105,"props":518,"children":519},{},[520],{"type":29,"value":521},"2.2.2 Executing Code in the Background and Manually Terminating the Execution",{"type":19,"tag":33,"props":523,"children":524},{},[525],{"type":29,"value":445},{"type":19,"tag":33,"props":527,"children":528},{},[529],{"type":29,"value":530},"nohup python3 main.py --data_path=./data --model_dir=./ckpt &",{"type":19,"tag":33,"props":532,"children":533},{},[534,535,540,542,547],{"type":29,"value":378},{"type":19,"tag":105,"props":536,"children":537},{},[538],{"type":29,"value":539},"ps aux|grep main",{"type":29,"value":541}," command to view the process ID and run the ",{"type":19,"tag":105,"props":543,"children":544},{},[545],{"type":29,"value":546},"kill",{"type":29,"value":548}," command to manually terminate the process.",{"type":19,"tag":33,"props":550,"children":551},{},[552,553,558],{"type":29,"value":378},{"type":19,"tag":105,"props":554,"children":555},{},[556],{"type":29,"value":557},"cat nohup.out",{"type":29,"value":559}," command to check the running status of the process. 
The output is as follows:",{"type":19,"tag":33,"props":561,"children":562},{},[563],{"type":29,"value":564},"epoch: 1 step: 125, loss is 2.308577537536621",{"type":19,"tag":33,"props":566,"children":567},{},[568],{"type":29,"value":569},"epoch: 1 step: 250, loss is 2.303668737411499",{"type":19,"tag":33,"props":571,"children":572},{},[573],{"type":29,"value":574},"epoch: 1 step: 375, loss is 2.3061931133270264",{"type":19,"tag":33,"props":576,"children":577},{},[578],{"type":29,"value":579},"epoch: 1 step: 500, loss is 1.572475790977478",{"type":19,"tag":33,"props":581,"children":582},{},[583],{"type":29,"value":584},"epoch: 1 step: 625, loss is 1.2929679155349731",{"type":19,"tag":33,"props":586,"children":587},{},[588],{"type":29,"value":589},"epoch: 1 step: 750, loss is 0.8329849243164062",{"type":19,"tag":33,"props":591,"children":592},{},[593,594,598],{"type":29,"value":378},{"type":19,"tag":105,"props":595,"children":596},{},[597],{"type":29,"value":383},{"type":29,"value":385},{"type":19,"tag":202,"props":600,"children":602},{"code":601},"ckpt/\n\n├── lenet_ckpt-1_100.ckpt\n\n├── lenet_ckpt-1_200.ckpt\n\n├── lenet_ckpt-1_300.ckpt\n\n├── lenet_ckpt-1_400.ckpt\n\n├── lenet_ckpt-1_500.ckpt\n\n├── lenet_ckpt-1_600.ckpt\n\n├── lenet_ckpt-1_700.ckpt\n\n├── lenet_ckpt-1_800.ckpt\n\n└── lenet_ckpt-graph.meta\n\n\n0 directories, 9 files\n",[603],{"type":19,"tag":207,"props":604,"children":605},{"__ignoreMap":7},[606],{"type":29,"value":601},{"type":19,"tag":33,"props":608,"children":609},{},[610,612,616,618,622,624,628],{"type":29,"value":611},"Interpretation: In the test examples in sections 2.2.1 and 2.2.2, ",{"type":19,"tag":105,"props":613,"children":614},{},[615],{"type":29,"value":127},{"type":29,"value":617}," is set to ",{"type":19,"tag":105,"props":619,"children":620},{},[621],{"type":29,"value":178},{"type":29,"value":623},". In 2.2.1, I execute the code in the foreground and manually terminate the training. 
In 2.2.2, I execute the code in the background and kill the training process. However, no .ckpt files of the ",{"type":19,"tag":105,"props":625,"children":626},{},[627],{"type":29,"value":198},{"type":29,"value":629}," type are generated in the model saving directory. That is, the guess is incorrect: the parameter does not take effect when the training is manually terminated.",{"title":7,"searchDepth":631,"depth":631,"links":632},4,[],"markdown","content:technology-blogs:en:1845.md","content","technology-blogs/en/1845.md","technology-blogs/en/1845","md",1776506105609]