[{"data":1,"prerenderedAt":426},["ShallowReactive",2],{"content-query-kqjpZOsAMx":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":420,"_id":421,"_source":422,"_file":423,"_stem":424,"_extension":425},"/technology-blogs/zh/1522","zh",false,"","【AI设计模式】05-检查点模式（CheckPoints）：如何定期存储模型？","检查点（CheckPoints）模式最大的作用在于保证了模型训练的可靠性，同时也可以让开发者更容易的做早停","2022-06-02","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/4b8dc3c387f8496c8e1c97875827ce79.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":406},"root",[17,25,50,58,63,72,77,85,110,118,123,128,136,169,174,181,186,192,200,213,222,256,264,278,286,291,299,306,318,325,333,341,364,372,383,395],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"ai设计模式05-检查点模式checkpoints如何定期存储模型",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,31,40,42,48],{"type":24,"value":30},"在前一篇文章",{"type":18,"tag":32,"props":33,"children":37},"a",{"href":34,"rel":35},"https://zhuanlan.zhihu.com/p/517450571",[36],"nofollow",[38],{"type":24,"value":39},"《数据处理-Eager模式》",{"type":24,"value":41},"中分享了数据处理-Eager模式，那么在模型训练时，有哪些设计模式可以使用呢？在数据库领域，为了防止执行时间较长的存储过程失败重新执行，会将中间的过程状态以检查点的形式持续记录下来，每次失败时不需要重头执行，而是加载最近的检查点，继续执行，避免浪费时间。和存储过程类似，模型的训练时间会更长，如果缺乏一定的可靠性机制，过程中一旦失败，就需要重头开始训练，浪费时间较多。因此，需要实现类似的机制来保证可靠性问题，这种机制被称为",{"type":18,"tag":43,"props":44,"children":45},"strong",{},[46],{"type":24,"value":47},"检查点（CheckPoints）模式",{"type":24,"value":49},"。",{"type":18,"tag":26,"props":51,"children":52},{},[53],{"type":18,"tag":54,"props":55,"children":57},"img",{"alt":7,"src":56},"https://pic3.zhimg.com/80/v2-2e65e659348eeb19377da0c0db0c8e9a_720w.jpg",[],{"type":18,"tag":26,"props":59,"children":60},{},[61],{"type":24,"value":62},"AI设计模式总览",{"type":18,"tag":64,"props":65,"children":67},"h2",{"id":66},"模式定义",[68],{"type":18,"tag":43,"props":69,"children":70},{},[71],{"type":24,"value":66
},{"type":18,"tag":26,"props":73,"children":74},{},[75],{"type":24,"value":76},"**检查点模式（CheckPoints）**是指通过周期性（迭代/时间）的保存模型的完整状态，在模型训练失败时，可以从保存的检查点模型继续训练，以避免训练失败时每次都需要从头开始带来的训练时间浪费。检查点模式适用于模型训练时间长、训练需要提前结束、fine-tune等场景，也可以拓展到异常时的断点续训场景。",{"type":18,"tag":64,"props":78,"children":80},{"id":79},"问题",[81],{"type":18,"tag":43,"props":82,"children":83},{},[84],{"type":24,"value":79},{"type":18,"tag":86,"props":87,"children":88},"ol",{},[89,100,105],{"type":18,"tag":90,"props":91,"children":92},"li",{},[93,98],{"type":18,"tag":43,"props":94,"children":95},{},[96],{"type":24,"value":97},"训练耗时的网络在训练过程中失败，从头开始训练的代价高",{"type":24,"value":99},"：对于层数比较深的神经网络，或者需要大规模训练数据的模型，训练的时间会很长。因为有更多的参数以及更多的数据样本需要处理。比如对于VGG16的网络，cifar-10的数据集，普通的NVIDIA显卡训练需要3-4小时；一旦过程中失败，需要从头开始训练，时间成本高。",{"type":18,"tag":90,"props":101,"children":102},{},[103],{"type":24,"value":104},"**训练时间越长，精度可能不发生变化，或者产生过拟合的现象。**这种场景时，提前结束（early stopping）获得中间的模型状态收益会更高。",{"type":18,"tag":90,"props":106,"children":107},{},[108],{"type":24,"value":109},"Fine-Tune时，通常需要在最终模型之前的某些模型状态的基础上进行调优，这样可以更好地针对新数据进行训练，获得更好的泛化性。",{"type":18,"tag":64,"props":111,"children":113},{"id":112},"解决方案",[114],{"type":18,"tag":43,"props":115,"children":116},{},[117],{"type":24,"value":112},{"type":18,"tag":26,"props":119,"children":120},{},[121],{"type":24,"value":122},"**在每轮训练结束时，都保存当前的模型状态作为检查点，如果下轮训练失败时，可以从这个检查点模型继续训练。**和训练完成导出的模型（以神经网络为例，最终的模型包含权重、激活函数以及隐藏层信息）相比，这个中间模型状态还需要额外保存当前的轮次（epoch）、批量计数等信息，以保证基于这个中间模型继续训练。通常这个中间模型被称为检查点（CheckPoints）。检查点的模型状态中通常不包括学习率，因为训练过程中它可能会动态调整。",{"type":18,"tag":26,"props":124,"children":125},{},[126],{"type":24,"value":127},"如果在每个批量数据训练完，权重更新后都保存检查点，中间模型的数量和占用的空间会非常大。所以实践中通常会在每轮结束后保存检查点，或者保留最近的几个检查点。",{"type":18,"tag":64,"props":129,"children":131},{"id":130},"案例",[132],{"type":18,"tag":43,"props":133,"children":134},{},[135],{"type":24,"value":130},{"type":18,"tag":26,"props":137,"children":138},{},[139,141,146,148,153,155,160,162,167],{"type":24,"value":140},"AI框架通常都提供了模型训练的检查点保存能力。在MindSpore中，通过训练API提供了",{"type":18,"ta
g":43,"props":142,"children":143},{},[144],{"type":24,"value":145},"ModelCheckpoint",{"type":24,"value":147},"和",{"type":18,"tag":43,"props":149,"children":150},{},[151],{"type":24,"value":152},"CheckpointConfig",{"type":24,"value":154},"模块来帮助开发者保存模型训练过程中的检查点。MindSpore提供了三种检查点保存策略，包括",{"type":18,"tag":43,"props":156,"children":157},{},[158],{"type":24,"value":159},"直接保存、周期保存",{"type":24,"value":161},"（迭代次数或者训练时长）、和",{"type":18,"tag":43,"props":163,"children":164},{},[165],{"type":24,"value":166},"异常保存",{"type":24,"value":168},"（在训练失败的异常情况下保存的策略）。",{"type":18,"tag":26,"props":170,"children":171},{},[172],{"type":24,"value":173},"说明：检查点文件是一个二进制文件，存储了所有训练参数的值；且检查点的实现上采用了Protocol Buffers机制，与开发语言、平台无关，具有良好的可扩展性。",{"type":18,"tag":26,"props":175,"children":176},{},[177],{"type":18,"tag":54,"props":178,"children":180},{"alt":7,"src":179},"https://pic2.zhimg.com/80/v2-00cc1a1e6b37c15f20c7e124dba1233d_720w.jpg",[],{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":24,"value":185},"在此，我们重点介绍下如何在MindSpore中周期性保存模型状态、以及在异常情况下保存故障点的模型状态。",{"type":18,"tag":187,"props":188,"children":190},"h3",{"id":189},"周期保存",[191],{"type":24,"value":189},{"type":18,"tag":26,"props":193,"children":194},{},[195],{"type":18,"tag":43,"props":196,"children":197},{},[198],{"type":24,"value":199},"1）迭代次数方式保存",{"type":18,"tag":26,"props":201,"children":202},{},[203,205,212],{"type":24,"value":204},"下面的MindSpore代码片段展示了使用迭代次数配置检查点保存策略，以及在模型训练时通过回调的方式应用保存策略。训练开始后，会每隔1875个step保存一次检查点模型，并最多保留10个中间模型，模型的名称格式为",{"type":18,"tag":206,"props":207,"children":209},"code",{"className":208},[],[210],{"type":24,"value":211},"checkpoint_lenet-1_1875.ckpt",{"type":24,"value":49},{"type":18,"tag":214,"props":215,"children":217},"pre",{"code":216}," from mindspore.train.callback import ModelCheckpoint, CheckpointConfig \n    \n # 设置模型保存参数，设置模型保存的策略，如本例中设置最多保存10个checkpoints，每隔1875个step保存一次 \n    \n config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10) \n    \n # 应用模型保存参数 \n    \n 
ckpoint_cb = ModelCheckpoint(prefix=\"checkpoint_lenet\", config=config_ck) \n    \n #通过回调的方式配置在模型训练的过程中 \n    \n model.train(epoch_size, ds_train, callbacks=[ckpoint_cb]) \n      \n",[218],{"type":18,"tag":206,"props":219,"children":220},{"__ignoreMap":7},[221],{"type":24,"value":216},{"type":18,"tag":26,"props":223,"children":224},{},[225,227,233,234,240,242,247,249,254],{"type":24,"value":226},"加载CheckPoint可以通过",{"type":18,"tag":206,"props":228,"children":230},{"className":229},[],[231],{"type":24,"value":232},"load_checkpoint",{"type":24,"value":147},{"type":18,"tag":206,"props":235,"children":237},{"className":236},[],[238],{"type":24,"value":239},"load_param_into_net",{"type":24,"value":241},"方法来完成，如下面的代码，通过",{"type":18,"tag":206,"props":243,"children":245},{"className":244},[],[246],{"type":24,"value":232},{"type":24,"value":248},"方法从保存好的checkpoint中加载网络的参数，再通过",{"type":18,"tag":206,"props":250,"children":252},{"className":251},[],[253],{"type":24,"value":239},{"type":24,"value":255},"将参数导入到具体的网络实例中，方便后面续训或者评估。",{"type":18,"tag":214,"props":257,"children":259},{"code":258},"from mindspore import load_checkpoint, load_param_into_net\n# 加载已经保存的用于测试的模型\nparam_dict = load_checkpoint(\"checkpoint_lenet-1_1875.ckpt\")\n# 加载参数到网络中\nload_param_into_net(net, 
param_dict)\n",[260],{"type":18,"tag":206,"props":261,"children":262},{"__ignoreMap":7},[263],{"type":24,"value":258},{"type":18,"tag":26,"props":265,"children":266},{},[267,269,276],{"type":24,"value":268},"完整的代码可以参考",{"type":18,"tag":32,"props":270,"children":273},{"href":271,"rel":272},"https://link.zhihu.com/?target=https%3A//mindspore.cn/tutorials/zh-CN/r1.7/beginner/quick_start.html",[36],[274],{"type":24,"value":275},"[1]",{"type":24,"value":277},"中的案例。",{"type":18,"tag":26,"props":279,"children":280},{},[281],{"type":18,"tag":43,"props":282,"children":283},{},[284],{"type":24,"value":285},"2）周期时间方式保存",{"type":18,"tag":26,"props":287,"children":288},{},[289],{"type":24,"value":290},"时间策略提供了按照秒和分钟配置参数，如下面的代码，每隔30秒保存一个CheckPoint文件，每隔3分钟保留一个CheckPoint文件。",{"type":18,"tag":214,"props":292,"children":294},{"code":293},"from mindspore import CheckpointConfig\n\n# 每隔30秒保存一个CheckPoint文件，每隔3分钟保留一个CheckPoint文件\nconfig_ck = CheckpointConfig(save_checkpoint_seconds=30, keep_checkpoint_per_n_minutes=3)\n",[295],{"type":18,"tag":206,"props":296,"children":297},{"__ignoreMap":7},[298],{"type":24,"value":293},{"type":18,"tag":187,"props":300,"children":301},{"id":166},[302],{"type":18,"tag":43,"props":303,"children":304},{},[305],{"type":24,"value":166},{"type":18,"tag":26,"props":307,"children":308},{},[309,311,316],{"type":24,"value":310},"如果模型较大，通常会减少保留的检查点模型数量，保存间隔也会拉长。如盘古大模型的检查点保存间隔在4-5小时，如果在两个检查点之间失败，那么从上个检查点重新训练的时间损失会比较大。MindSpore在1.7版本扩展了检查点功能，提供",{"type":18,"tag":43,"props":312,"children":313},{},[314],{"type":24,"value":315},"断点续训能力",{"type":24,"value":317},"，在训练异常时触发检查点保存，保证下次可以从发生故障时的模型状态继续训练，训练时间无损失。引入断点续训功能，只需在策略配置时增加“exception_save=True”的参数即可。",{"type":18,"tag":26,"props":319,"children":320},{},[321],{"type":18,"tag":54,"props":322,"children":324},{"alt":7,"src":323},"https://pic3.zhimg.com/80/v2-c35d149bb22be4009cafc86a07f03ec6_720w.jpg",[],{"type":18,"tag":214,"props":326,"children":328},{"code":327},"from mindspore import ModelCheckpoint, 
CheckpointConfig\n# 配置断点续训功能开启\nconfig_ck = CheckpointConfig(save_checkpoint_steps=32, keep_checkpoint_max=10, exception_save=True)\n",[329],{"type":18,"tag":206,"props":330,"children":331},{"__ignoreMap":7},[332],{"type":24,"value":327},{"type":18,"tag":64,"props":334,"children":336},{"id":335},"总结",[337],{"type":18,"tag":43,"props":338,"children":339},{},[340],{"type":24,"value":335},{"type":18,"tag":26,"props":342,"children":343},{},[344,348,350,355,357,362],{"type":18,"tag":43,"props":345,"children":346},{},[347],{"type":24,"value":47},{"type":24,"value":349},"最大的作用在于",{"type":18,"tag":43,"props":351,"children":352},{},[353],{"type":24,"value":354},"保证了模型训练的可靠性",{"type":24,"value":356},"，同时也可以让开发者",{"type":18,"tag":43,"props":358,"children":359},{},[360],{"type":24,"value":361},"更容易地做早停",{"type":24,"value":363},"。断点续训能力对于大模型的价值较大，异常状态下续训无时间损失；检查点模式也有利于迁移学习时做fine-tune，迁移学习也是我们下一个要介绍的模式。",{"type":18,"tag":64,"props":365,"children":367},{"id":366},"参考资料",[368],{"type":18,"tag":43,"props":369,"children":370},{},[371],{"type":24,"value":366},{"type":18,"tag":26,"props":373,"children":374},{},[375,377],{"type":24,"value":376},"[1] MindSpore完整案例：",{"type":18,"tag":32,"props":378,"children":380},{"href":271,"rel":379},[36],[381],{"type":24,"value":382},"https://mindspore.cn/tutorials/zh-CN/r1.7/beginner/quick_start.html",{"type":18,"tag":26,"props":384,"children":385},{},[386,388],{"type":24,"value":387},"[2] MindSpore模型保存：",{"type":18,"tag":32,"props":389,"children":392},{"href":390,"rel":391},"https://link.zhihu.com/?target=https%3A//gitee.com/mindspore/docs/blob/master/tutorials/source_zh_cn/advanced/train/save.ipynb",[36],[393],{"type":24,"value":394},"https://gitee.com/mindspore/docs/blob/master/tutorials/source_zh_cn/advanced/train/save.ipynb",{"type":18,"tag":26,"props":396,"children":397},{},[398,400],{"type":24,"value":399},"[3] 
机器学习设计模式：",{"type":18,"tag":32,"props":401,"children":404},{"href":402,"rel":403},"https://www.oreilly.com/library/view/machine-learning-design/9781098115777/",[36],[405],{"type":24,"value":402},{"title":7,"searchDepth":407,"depth":407,"links":408},4,[409,411,412,413,418,419],{"id":66,"depth":410,"text":66},2,{"id":79,"depth":410,"text":79},{"id":112,"depth":410,"text":112},{"id":130,"depth":410,"text":130,"children":414},[415,417],{"id":189,"depth":416,"text":189},3,{"id":166,"depth":416,"text":166},{"id":335,"depth":410,"text":335},{"id":366,"depth":410,"text":366},"markdown","content:technology-blogs:zh:1522.md","content","technology-blogs/zh/1522.md","technology-blogs/zh/1522","md",1776506113090]