[{"data":1,"prerenderedAt":891},["ShallowReactive",2],{"content-query-Vn7falznQ5":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":885,"_id":886,"_source":887,"_file":888,"_stem":889,"_extension":890},"/technology-blogs/zh/1521","zh",false,"","【MindSpore易点通机器人-04】MLOps 环境搭建过程","总结了如何在本地完成MLOps环境的搭建，基于Minikube、Argo等工具可以让我们很好的在本地展开开发验证工作，不用依赖复杂的基础设施，也不用有额外的开销。","2022-05-25","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/8f78bb97bed64f9b9773d77faaa53b1e.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":867},"root",[17,25,51,59,64,69,94,101,115,121,133,143,164,172,192,200,205,213,226,234,254,275,282,288,300,318,325,347,355,384,412,418,470,477,489,496,501,508,514,519,553,558,566,571,579,584,591,597,609,627,633,671,682,690,702,710,731,739,745,750,788,793,801,806,814,819,825,830,838,851,858,862],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通机器人-04mlops-环境搭建过程",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,31,40,42,49],{"type":24,"value":30},"在上一篇",{"type":18,"tag":32,"props":33,"children":37},"a",{"href":34,"rel":35},"https://zhuanlan.zhihu.com/p/509494866",[36],"nofollow",[38],{"type":24,"value":39},"【MindSpore易点通机器人-03】迭代0的准备工作",{"type":24,"value":41},"，我们从整体上概述了MindSpore易点通机器人项目开始前需要在迭代0的准备工作，本篇将会为大家讲述迭代0中具体的MLOps 
环境搭建过程。整体的MLOps流水线设计如下图，包含持续训练流水线和CI/CD流水线。相关代码请参考",{"type":18,"tag":32,"props":43,"children":46},{"href":44,"rel":45},"https://link.zhihu.com/?target=https%3A//gitee.com/msu-sig/robot",[36],[47],{"type":24,"value":48},"MindSpore易点通机器人代码仓",{"type":24,"value":50},"。",{"type":18,"tag":26,"props":52,"children":53},{},[54],{"type":18,"tag":55,"props":56,"children":58},"img",{"alt":7,"src":57},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/251af6583487462887c1fe6500a09866.jpg",[],{"type":18,"tag":26,"props":60,"children":61},{},[62],{"type":24,"value":63},"实际的技术选型上，我们基于Jenkins构建CI/CD流水线，基于Argo构建持续训练流水线。同时，我们把Jenkins和Argo都运行在K8S上来保证可用性和弹性。下文将为大家介绍如何在本地使用Minikube完成机器人项目的MLOps流水线的搭建，并在本地运行起来。",{"type":18,"tag":26,"props":65,"children":66},{},[67],{"type":24,"value":68},"具体的过程如下：",{"type":18,"tag":70,"props":71,"children":72},"ol",{},[73,79,84,89],{"type":18,"tag":74,"props":75,"children":76},"li",{},[77],{"type":24,"value":78},"安装配置WSL+Ubuntu+Docker；",{"type":18,"tag":74,"props":80,"children":81},{},[82],{"type":24,"value":83},"基于Minikube运行K8S；",{"type":18,"tag":74,"props":85,"children":86},{},[87],{"type":24,"value":88},"基于K8S+Jenkins构建CI/CD流水线；",{"type":18,"tag":74,"props":90,"children":91},{},[92],{"type":24,"value":93},"基于K8S+Argo持续训练流水线。",{"type":18,"tag":95,"props":96,"children":98},"h2",{"id":97},"_1-安装配置wslubuntudocker",[99],{"type":24,"value":100},"1. 安装配置WSL+Ubuntu+Docker",{"type":18,"tag":26,"props":102,"children":103},{},[104,106,113],{"type":24,"value":105},"因为团队大部分人的开发环境都是Windows，所以需要选择WSL+Linux+Docker的方式。这里我们没有采用Docker Desktop+WSL Backend，而是利用Distrod让Docker在Ubuntu上直接运行，相关的安装配置文档可以参考",{"type":18,"tag":32,"props":107,"children":110},{"href":108,"rel":109},"https://zhuanlan.zhihu.com/p/500450853",[36],[111],{"type":24,"value":112},"如何不安装Docker Desktop在WSL下运行Docker",{"type":24,"value":114},"这篇文章。",{"type":18,"tag":95,"props":116,"children":118},{"id":117},"_2-基于minikube运行k8s",[119],{"type":24,"value":120},"2. 
基于Minikube运行K8S",{"type":18,"tag":26,"props":122,"children":123},{},[124,131],{"type":18,"tag":32,"props":125,"children":128},{"href":126,"rel":127},"https://link.zhihu.com/?target=https%3A//minikube.sigs.k8s.io/docs/",[36],[129],{"type":24,"value":130},"Minikube",{"type":24,"value":132},"是一个单机安装配置K8S集群的工具，它支持多平台（Mac/Linux/Windows）。Minikube可以将K8S集群安装配置在单个Docker容器或者VM（hyper-v/VMWare等）中，过程也非常简单。首先在Ubuntu中安装Minikube：",{"type":18,"tag":134,"props":135,"children":137},"pre",{"code":136},"curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64\nsudo install minikube-linux-amd64 /usr/local/bin/minikube\n",[138],{"type":18,"tag":139,"props":140,"children":141},"code",{"__ignoreMap":7},[142],{"type":24,"value":136},{"type":18,"tag":26,"props":144,"children":145},{},[146,148,154,156,162],{"type":24,"value":147},"然后执行",{"type":18,"tag":139,"props":149,"children":151},{"className":150},[],[152],{"type":24,"value":153},"minikube start",{"type":24,"value":155}," ，因为在第一步中我们已经配置好了Docker，Minikube在启动时会默认使用Docker作为VM，然后在容器中启动K8S集群。启动完成后在Ubuntu上使用",{"type":18,"tag":139,"props":157,"children":159},{"className":158},[],[160],{"type":24,"value":161},"docker ps",{"type":24,"value":163},"只能看到一个容器：",{"type":18,"tag":134,"props":165,"children":167},{"code":166},"CONTAINER ID   IMAGE                    COMMAND                  CREATED       STATUS       PORTS                                                                                                                                  NAMES\n699a71fee349   kicbase/stable:v0.0.30   \"/usr/local/bin/entr…\"   2 weeks ago   Up 2 hours   xxx.xxx.xxx..xxx:49157->22/tcp, xxx.xxx.xxx..xxx:49156->2376/tcp, xxx.xxx.xxx..xxx:49155->5000/tcp, xxx.xxx.xxx..xxx:49154->8443/tcp, xxx.xxx.xxx..xxx:49153->32443/tcp   
minikube\n",[168],{"type":18,"tag":139,"props":169,"children":170},{"__ignoreMap":7},[171],{"type":24,"value":166},{"type":18,"tag":26,"props":173,"children":174},{},[175,177,183,185,190],{"type":24,"value":176},"如果我们执行",{"type":18,"tag":139,"props":178,"children":180},{"className":179},[],[181],{"type":24,"value":182},"docker exec -it 699a71fee349 bash",{"type":24,"value":184},"进入容器后再执行",{"type":18,"tag":139,"props":186,"children":188},{"className":187},[],[189],{"type":24,"value":161},{"type":24,"value":191},"，就能发现K8S集群的服务了：",{"type":18,"tag":134,"props":193,"children":195},{"code":194},"CONTAINER ID   IMAGE                                  COMMAND                    CREATED       STATUS       PORTS     NAMES\naf45f3caa0d9   99a3486be4f2                           \"kube-scheduler --au…\"    2 hours ago   Up 2 hours             k8s_kube-scheduler_kube-scheduler-minikube_kube-system_be132fe5c6572cb34d93f5e05ce2a540_1\ne648e7d30a7d   k8s.gcr.io/pause:3.6                   \"/pause\"                   2 hours ago   Up 2 hours             k8s_POD_kube-apiserver-minikube_kube-system_cd6e47233d36a9715b0ab9632f871843_1\ne26d9e92c4e3   k8s.gcr.io/pause:3.6                   \"/pause\"                   2 hours ago   Up 2 hours             k8s_POD_kube-scheduler-minikube_kube-system_be132fe5c6572cb34d93f5e05ce2a540_1\ne658bf17922d   k8s.gcr.io/pause:3.6                   \"/pause\"                   2 hours ago   Up 2 hours             k8s_POD_kube-controller-manager-minikube_kube-system_b965983ec05322d0973594a01d5e8245_1\n1f85a9bae877   k8s.gcr.io/pause:3.6                   \"/pause\"                   2 hours ago   Up 2 hours             
k8s_POD_etcd-minikube_kube-system_9d3d310935e5fabe942511eec3e2cd0c_1\n....\n",[196],{"type":18,"tag":139,"props":197,"children":198},{"__ignoreMap":7},[199],{"type":24,"value":194},{"type":18,"tag":26,"props":201,"children":202},{},[203],{"type":24,"value":204},"从容器中退出，安装kubectl之后就可以在Ubuntu上使用Kubectl管理集群了：",{"type":18,"tag":134,"props":206,"children":208},{"code":207},"curl -LO \"https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl\"\nsudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl\n",[209],{"type":18,"tag":139,"props":210,"children":211},{"__ignoreMap":7},[212],{"type":24,"value":207},{"type":18,"tag":26,"props":214,"children":215},{},[216,218,224],{"type":24,"value":217},"执行",{"type":18,"tag":139,"props":219,"children":221},{"className":220},[],[222],{"type":24,"value":223},"kubectl get nodes",{"type":24,"value":225},"，查询K8S管理的节点。",{"type":18,"tag":134,"props":227,"children":229},{"code":228},"~$ kubectl get nodes\nNAME       STATUS   ROLES                  AGE   VERSION\nminikube   Ready    control-plane,master   19d   v1.23.3\n",[230],{"type":18,"tag":139,"props":231,"children":232},{"__ignoreMap":7},[233],{"type":24,"value":228},{"type":18,"tag":26,"props":235,"children":236},{},[237,239,245,247,253],{"type":24,"value":238},"如果关注Kubernetes的界面，使用",{"type":18,"tag":139,"props":240,"children":242},{"className":241},[],[243],{"type":24,"value":244},"minikube dashboard",{"type":24,"value":246},"就可以启动K8S的管理界面，并在Windows上通过浏览器访问 ",{"type":18,"tag":32,"props":248,"children":251},{"href":249,"rel":250},"http://xxx.xxx.xxx..xxx:44185/api-server/v1/namespaces/kubernetes-dashboard/services/http:kubernetes-dashboard:/proxy/",[36],[252],{"type":24,"value":249},{"type":24,"value":50},{"type":18,"tag":26,"props":255,"children":256},{},[257,259,265,267,273],{"type":24,"value":258},"使用Minikube可以让我们在本地拥有一个和生产环境一样功能的K8S集群，但这种方式同样带来了网络的复杂性，如下图，Ubuntu运行在Hyper-v的虚机中，在K8S部署的服务运行在Ubuntu的Docker容器的容器中（Docker in 
Docker）。所以，如果要在Windows的浏览器上访问K8S中运行的服务，需要先通过",{"type":18,"tag":139,"props":260,"children":262},{"className":261},[],[263],{"type":24,"value":264},"kubectl port-forward",{"type":24,"value":266},"完成Ubuntu VM和Minikube容器的端口映射，然后再使用",{"type":18,"tag":139,"props":268,"children":270},{"className":269},[],[271],{"type":24,"value":272},"minikube tunnel",{"type":24,"value":274},"完成VM到Windows的端口映射。",{"type":18,"tag":26,"props":276,"children":277},{},[278],{"type":18,"tag":55,"props":279,"children":281},{"alt":7,"src":280},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/5239328abd2a4c3abcf83549ae389de0.jpg",[],{"type":18,"tag":95,"props":283,"children":285},{"id":284},"_3-基于k8sjenkins构建cicd流水线",[286],{"type":24,"value":287},"3. 基于K8S+Jenkins构建CI/CD流水线",{"type":18,"tag":26,"props":289,"children":290},{},[291,298],{"type":18,"tag":32,"props":292,"children":295},{"href":293,"rel":294},"https://link.zhihu.com/?target=https%3A//www.jenkins.io/",[36],[296],{"type":24,"value":297},"Jenkins",{"type":24,"value":299},"是一个经久不衰的持续集成工具，它的插件生态比较强大。我们构建基于Jenkins+Kubernetes的CI/CD流水线要达到的目的如下：",{"type":18,"tag":70,"props":301,"children":302},{},[303,308,313],{"type":18,"tag":74,"props":304,"children":305},{},[306],{"type":24,"value":307},"基于K8S实现Jenkins的弹性部署；",{"type":18,"tag":74,"props":309,"children":310},{},[311],{"type":24,"value":312},"基于Jenkins插件实现CI任务在K8S上的运行；",{"type":18,"tag":74,"props":314,"children":315},{},[316],{"type":24,"value":317},"基于Pipeline as 
Code实现Jenkins的流水线配置管理（持续集成任务+持续部署任务）。",{"type":18,"tag":319,"props":320,"children":322},"h3",{"id":321},"基于k8s实现jenkins的弹性部署",[323],{"type":24,"value":324},"基于K8S实现Jenkins的弹性部署",{"type":18,"tag":26,"props":326,"children":327},{},[328,330,337,339,345],{"type":24,"value":329},"MindSpore易点通机器人的",{"type":18,"tag":32,"props":331,"children":334},{"href":332,"rel":333},"https://link.zhihu.com/?target=https%3A//gitee.com/msu-sig/robot/blob/master/infra/jenkins/",[36],[335],{"type":24,"value":336},"代码仓",{"type":24,"value":338},"已经给出了在K8S上部署Jenkins的配置文件，这里要设置成",{"type":18,"tag":139,"props":340,"children":342},{"className":341},[],[343],{"type":24,"value":344},"LoadBalancer",{"type":24,"value":346},"类型。",{"type":18,"tag":134,"props":348,"children":350},{"code":349},"---\napiVersion: v1\nkind: Service\nmetadata:\n  name: jenkins\nspec:\n  type: LoadBalancer\n  selector:\n    name: jenkins\n  ports:\n    -\n      name: http\n      port: 8080\n      targetPort: 8080\n      protocol: TCP\n",[351],{"type":18,"tag":139,"props":352,"children":353},{"__ignoreMap":7},[354],{"type":24,"value":349},{"type":18,"tag":26,"props":356,"children":357},{},[358,360,366,368,374,376,382],{"type":24,"value":359},"再用",{"type":18,"tag":139,"props":361,"children":363},{"className":362},[],[364],{"type":24,"value":365},"kubectl create namespace jenkins",{"type":24,"value":367},"创建Jenkins的namespace，通过",{"type":18,"tag":139,"props":369,"children":371},{"className":370},[],[372],{"type":24,"value":373},"kubectl apply -f jenkins.yaml -n jenkins",{"type":24,"value":375},"完成部署，然后用",{"type":18,"tag":139,"props":377,"children":379},{"className":378},[],[380],{"type":24,"value":381},"kubectl apply -f service-account.yaml -n jenkins",{"type":24,"value":383},"完成API调用的授权。",{"type":18,"tag":26,"props":385,"children":386},{},[387,389,395,397,402,404,410],{"type":24,"value":388},"在配置文件中我们指定了对外暴露的端口是8080，所以可以用",{"type":18,"tag":139,"props":390,"children":392},{"className":391},[],[393],{"type":24,"value":394},"kubectl 
port-forward svc/jenkins 8080:8080 -n jenkins",{"type":24,"value":396},"完成端口映射，再执行",{"type":18,"tag":139,"props":398,"children":400},{"className":399},[],[401],{"type":24,"value":272},{"type":24,"value":403},"就可以在浏览器中使用",{"type":18,"tag":139,"props":405,"children":407},{"className":406},[],[408],{"type":24,"value":409},"xxx.xxx.xxx..xxx:8080",{"type":24,"value":411},"打开Jenkins界面了，初始密码可以在pod启动的日志中获得。",{"type":18,"tag":319,"props":413,"children":415},{"id":414},"基于jenkins插件实现ci任务在k8s上的运行",[416],{"type":24,"value":417},"基于Jenkins插件实现CI任务在K8S上的运行",{"type":18,"tag":26,"props":419,"children":420},{},[421,423,429,431,436,438,444,446,452,454,460,462,468],{"type":24,"value":422},"完成Jenkins在K8S上的部署后，需要在Jenkins中安装",{"type":18,"tag":139,"props":424,"children":426},{"className":425},[],[427],{"type":24,"value":428},"kubernetes",{"type":24,"value":430},"插件，配置节点类型为K8S集群。从插件管理中先安装",{"type":18,"tag":139,"props":432,"children":434},{"className":433},[],[435],{"type":24,"value":428},{"type":24,"value":437},"插件，然后在",{"type":18,"tag":139,"props":439,"children":441},{"className":440},[],[442],{"type":24,"value":443},"节点管理",{"type":24,"value":445},"中选择",{"type":18,"tag":139,"props":447,"children":449},{"className":448},[],[450],{"type":24,"value":451},"配置集群",{"type":24,"value":453},"。 
首先配置K8S集群的信息，因为在同一个K8S集群中，服务间都可以通过主机名的方式相互访问，所以“Kubernetes地址”配置只需要输入",{"type":18,"tag":139,"props":455,"children":457},{"className":456},[],[458],{"type":24,"value":459},"https://kubernetes.default",{"type":24,"value":461},"，同时补充配置“Kubernetes命名空间”为",{"type":18,"tag":139,"props":463,"children":465},{"className":464},[],[466],{"type":24,"value":467},"jenkins",{"type":24,"value":469},"，如下图所示。",{"type":18,"tag":26,"props":471,"children":472},{},[473],{"type":18,"tag":55,"props":474,"children":476},{"alt":7,"src":475},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/1119d701f6d5465baff45e9955667924.jpg",[],{"type":18,"tag":26,"props":478,"children":479},{},[480,482,488],{"type":24,"value":481},"同理，对于“Jenkins地址”，只需要填入",{"type":18,"tag":32,"props":483,"children":486},{"href":484,"rel":485},"http://jenkins.jenkins:8080",[36],[487],{"type":24,"value":484},{"type":24,"value":50},{"type":18,"tag":26,"props":490,"children":491},{},[492],{"type":18,"tag":55,"props":493,"children":495},{"alt":7,"src":494},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/0ec1983f81ef4cabaef847b8a1f3dd09.jpg",[],{"type":18,"tag":26,"props":497,"children":498},{},[499],{"type":24,"value":500},"另一个要配置的是pod模板，即任务运行的pod的基础镜像及相关信息配置。要同时运行Java和Python，所以在dockerhub找了一个Java和Python都有的镜像，如下图所示，保存后即可完成配置。",{"type":18,"tag":26,"props":502,"children":503},{},[504],{"type":18,"tag":55,"props":505,"children":507},{"alt":7,"src":506},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/6afd95c9d12b42e0ac14d48e5c1d0012.jpg",[],{"type":18,"tag":319,"props":509,"children":511},{"id":510},"基于pipeline-as-code实现jenkins的流水线配置管理",[512],{"type":24,"value":513},"基于Pipeline as 
Code实现Jenkins的流水线配置管理",{"type":18,"tag":26,"props":515,"children":516},{},[517],{"type":24,"value":518},"我们的最后一步是基于Jenkins的流水线即代码功能完成CI/CD流水线搭建，按照设计，它应该包含如下任务：",{"type":18,"tag":520,"props":521,"children":522},"ul",{},[523,528,533,538,543,548],{"type":18,"tag":74,"props":524,"children":525},{},[526],{"type":24,"value":527},"持续集成流水线",{"type":18,"tag":74,"props":529,"children":530},{},[531],{"type":24,"value":532},"代码检查：数据处理代码、模型代码、推理代码以及脚本代码规范检查任务",{"type":18,"tag":74,"props":534,"children":535},{},[536],{"type":24,"value":537},"单元测试：数据处理逻辑、模型代码逻辑、推理代码逻辑的单元测试任务",{"type":18,"tag":74,"props":539,"children":540},{},[541],{"type":24,"value":542},"API测试：推理接口功能测试",{"type":18,"tag":74,"props":544,"children":545},{},[546],{"type":24,"value":547},"训练触发：如果修改了训练代码，触发Argo的训练流水线",{"type":18,"tag":74,"props":549,"children":550},{},[551],{"type":24,"value":552},"部署",{"type":18,"tag":26,"props":554,"children":555},{},[556],{"type":24,"value":557},"使用Jenkins流水线即代码功能，对应的配置如下：",{"type":18,"tag":134,"props":559,"children":561},{"code":560},"pipeline {\n    agent {\n        kubernetes {\n            containerTemplate {\n                name 'python'\n                image 'bitnami/java:1.8'\n                command 'sleep'\n                args 'infinity'\n            }\n            defaultContainer 'python'\n        }\n    }\n   stages {\n        stage('Code Check ') {\n            steps(\"Code Check\") {\n                echo 'checking python code.'\n            }\n        }\n        stage('Unit Testing') {\n            steps(\"Unit Testing\") {\n                echo \"running unit tests\"\n            }\n        }\n        stage('API Testing') {\n            steps {\n                echo 'running inference API Testing'\n            }\n        }\n        stage('Training Trigger') {\n            when {\n                changeset \"src/train/*.py\"\n            }\n            steps(\"trigger training\") {\n                echo 'trigger new round of training'\n            }\n        }\n  
  }\n}\n",[562],{"type":18,"tag":139,"props":563,"children":564},{"__ignoreMap":7},[565],{"type":24,"value":560},{"type":18,"tag":26,"props":567,"children":568},{},[569],{"type":24,"value":570},"部署流水线的pipeline脚本如下：",{"type":18,"tag":134,"props":572,"children":574},{"code":573},"pipeline {\n  agent  any\n   stages {\n        stage('Packaging') {\n            steps(\"Packaging\") {\n                echo 'packaging with model and inference code'\n            }\n        }\n        stage('Continuous Deployment') {\n            steps(\"deploying\") {\n                echo 'deploy new version of model'\n            }\n        }\n    }\n}\n",[575],{"type":18,"tag":139,"props":576,"children":577},{"__ignoreMap":7},[578],{"type":24,"value":573},{"type":18,"tag":26,"props":580,"children":581},{},[582],{"type":24,"value":583},"配置文件中每个step先置空，是为了方便调试。在Jenkins基于上面的配置文件创建一个流水线后的测试结果如下：",{"type":18,"tag":26,"props":585,"children":586},{},[587],{"type":18,"tag":55,"props":588,"children":590},{"alt":7,"src":589},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/ea974ce986d9448d8856ebc71e0c8819.jpg",[],{"type":18,"tag":95,"props":592,"children":594},{"id":593},"_4-基于k8sargo持续训练流水线",[595],{"type":24,"value":596},"4. 
基于K8S+Argo持续训练流水线",{"type":18,"tag":26,"props":598,"children":599},{},[600,607],{"type":18,"tag":32,"props":601,"children":604},{"href":602,"rel":603},"https://link.zhihu.com/?target=https%3A//argoproj.github.io/",[36],[605],{"type":24,"value":606},"Argo",{"type":24,"value":608},"是一个基于K8S的开源的工作流管理工具，也支持机器学习的工作流。MindSpore DX Sig已经在先前的社区机器人项目中使用了该工具，所以这里我们复用了工具和配置。Argo Workflow的安装配置如下：",{"type":18,"tag":70,"props":610,"children":611},{},[612,617,622],{"type":18,"tag":74,"props":613,"children":614},{},[615],{"type":24,"value":616},"在K8S上安装Argo；",{"type":18,"tag":74,"props":618,"children":619},{},[620],{"type":24,"value":621},"基于Argo Workflow 配置机器学习流水线；",{"type":18,"tag":74,"props":623,"children":624},{},[625],{"type":24,"value":626},"运行机器学习流水线。",{"type":18,"tag":319,"props":628,"children":630},{"id":629},"_1-在k8s上安装argo",[631],{"type":24,"value":632},"1. 在K8S上安装Argo",{"type":18,"tag":26,"props":634,"children":635},{},[636,638,644,646,653,655,661,663,669],{"type":24,"value":637},"首先，我们执行",{"type":18,"tag":139,"props":639,"children":641},{"className":640},[],[642],{"type":24,"value":643},"kubectl create namespace argo",{"type":24,"value":645},"为Argo创建新的命名空间。然后，基于",{"type":18,"tag":32,"props":647,"children":650},{"href":648,"rel":649},"https://link.zhihu.com/?target=https%3A//gitee.com/msu-sig/robot/tree/master/infra/argo",[36],[651],{"type":24,"value":652},"配置文件",{"type":24,"value":654},"，执行",{"type":18,"tag":139,"props":656,"children":658},{"className":657},[],[659],{"type":24,"value":660},"kubectl apply -f install.yml -n argo",{"type":24,"value":662},"完成安装。最后，执行",{"type":18,"tag":139,"props":664,"children":666},{"className":665},[],[667],{"type":24,"value":668},"kubectl apply -f manifests/create_serviceaccount.yaml -n argo",{"type":24,"value":670},"完成权限配置。",{"type":18,"tag":26,"props":672,"children":673},{},[674,676,681],{"type":24,"value":675},"和前面的Jenkins配置类似，Argo 
Server需要设置为",{"type":18,"tag":139,"props":677,"children":679},{"className":678},[],[680],{"type":24,"value":344},{"type":24,"value":346},{"type":18,"tag":134,"props":683,"children":685},{"code":684},"apiVersion: v1\nkind: Service\nmetadata:\n  name: argo-server\nspec:\n  ports:\n  - name: web\n    port: 2746\n    targetPort: 2746\n  type: LoadBalancer\n  sessionAffinity: None\n  externalTrafficPolicy: Cluster\n  selector:\n    app: argo-server\n",[686],{"type":18,"tag":139,"props":687,"children":688},{"__ignoreMap":7},[689],{"type":24,"value":684},{"type":18,"tag":26,"props":691,"children":692},{},[693,694,700],{"type":24,"value":217},{"type":18,"tag":139,"props":695,"children":697},{"className":696},[],[698],{"type":24,"value":699},"kubectl get svc -n argo",{"type":24,"value":701},"可以看到：",{"type":18,"tag":134,"props":703,"children":705},{"code":704},"~$ kubectl get svc -n argo\nNAME                          TYPE           CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE\nargo-server                   LoadBalancer   xxx.xxx.xxx..xxx   xxx.xxx.xxx..xxx     2746:30001/TCP   19d\nworkflow-controller-metrics   ClusterIP      xxx.xxx.xxx..xxx            9090/TCP         19d\n",[706],{"type":18,"tag":139,"props":707,"children":708},{"__ignoreMap":7},[709],{"type":24,"value":704},{"type":18,"tag":26,"props":711,"children":712},{},[713,715,721,723,729],{"type":24,"value":714},"如果要在Windows的浏览器上访问Argo的Web界面，则还需要把argo-server的端口暴露出来，同上使用",{"type":18,"tag":139,"props":716,"children":718},{"className":717},[],[719],{"type":24,"value":720},"kubectl port-forward svc/argo-server 2746:2746 -n argo",{"type":24,"value":722},"完成端口映射。然后浏览器上访问",{"type":18,"tag":139,"props":724,"children":726},{"className":725},[],[727],{"type":24,"value":728},"https://xxx.xxx.xxx..xxx:2746",{"type":24,"value":730},"，通过下面的脚本获得密码，登录后就可以正常使用Argo的Web管理界面了。",{"type":18,"tag":134,"props":732,"children":734},{"code":733},"#!/bin/bash\nSECRET=$(kubectl get sa argo-server -n argo 
-o=jsonpath='{.secrets[0].name}')\nARGO_TOKEN=\"Bearer $(kubectl get secret $SECRET -n argo -o=jsonpath='{.data.token}' | base64 --decode)\"\necho $ARGO_TOKEN\n",[735],{"type":18,"tag":139,"props":736,"children":737},{"__ignoreMap":7},[738],{"type":24,"value":733},{"type":18,"tag":319,"props":740,"children":742},{"id":741},"_2-基于argo-workflow-配置机器学习流水线",[743],{"type":24,"value":744},"2. 基于Argo Workflow 配置机器学习流水线",{"type":18,"tag":26,"props":746,"children":747},{},[748],{"type":24,"value":749},"我们期望训练的工作流可以完成以下任务：",{"type":18,"tag":70,"props":751,"children":752},{},[753,758,763,768,773,778,783],{"type":18,"tag":74,"props":754,"children":755},{},[756],{"type":24,"value":757},"数据处理",{"type":18,"tag":74,"props":759,"children":760},{},[761],{"type":24,"value":762},"训练",{"type":18,"tag":74,"props":764,"children":765},{},[766],{"type":24,"value":767},"评估",{"type":18,"tag":74,"props":769,"children":770},{},[771],{"type":24,"value":772},"质量评估：基于测试集数据评估模型，预测性能需要高于基线值",{"type":18,"tag":74,"props":774,"children":775},{},[776],{"type":24,"value":777},"可解释性评估",{"type":18,"tag":74,"props":779,"children":780},{},[781],{"type":24,"value":782},"可靠性评估",{"type":18,"tag":74,"props":784,"children":785},{},[786],{"type":24,"value":787},"总结",{"type":18,"tag":26,"props":789,"children":790},{},[791],{"type":24,"value":792},"基于工作流的设计以及Argo工作流语法，可以得出基础的配置：",{"type":18,"tag":134,"props":794,"children":796},{"code":795},"apiVersion: argoproj.io/v1alpha1\nkind: Workflow\nmetadata:\n  generateName: robot-train-eval-\nspec:\n  serviceAccountName: robot-sa\n  entrypoint: robot-controller\n  onExit: summary\n  templates:\n  - name: robot-controller\n    steps:\n    - - name: data-process\n        template: process\n    - - name: robot-train\n        template: train\n    - - name: robot-eval\n        template: eval\n      - name: robot-interpretability\n        template: interpretability\n      - name: robot-reliability\n        template: reliability\n    - - name: summary\n        template: 
summary\n",[797],{"type":18,"tag":139,"props":798,"children":799},{"__ignoreMap":7},[800],{"type":24,"value":795},{"type":18,"tag":26,"props":802,"children":803},{},[804],{"type":24,"value":805},"而后，展开每个任务需要的配置，如训练的配置代码：",{"type":18,"tag":134,"props":807,"children":809},{"code":808}," - name: train\n    container:\n      image: ubuntu\n      imagePullPolicy: Always\n      env:\n      - name: IS_TRAIN\n        value: \"True\"\n      - name: NUM_STEPS\n        value: \"10\"\n      command: ['echo']\n      args: [\"trainning\"]\n    ...\n",[810],{"type":18,"tag":139,"props":811,"children":812},{"__ignoreMap":7},[813],{"type":24,"value":808},{"type":18,"tag":26,"props":815,"children":816},{},[817],{"type":24,"value":818},"把每个阶段的任务汇总在一起就完成了整个的工作流。接下来，我们尝试使用Argo运行下训练流水线。",{"type":18,"tag":319,"props":820,"children":822},{"id":821},"_3-运行机器学习流水线",[823],{"type":24,"value":824},"3. 运行机器学习流水线",{"type":18,"tag":26,"props":826,"children":827},{},[828],{"type":24,"value":829},"可以使用Argo CLI的客户端完成工作流任务的提交，首先安装客户端：",{"type":18,"tag":134,"props":831,"children":833},{"code":832},"#!/bin/bash\n# Download the binary\ncurl -sLO https://github.com/argoproj/argo-workflows/releases/download/v3.3.5/argo-linux-amd64.gz\n# Unzip\ngunzip argo-linux-amd64.gz\n# Make binary executable\nchmod +x argo-linux-amd64\n# Move binary to path\nmv ./argo-linux-amd64 /usr/local/bin/argo\n",[834],{"type":18,"tag":139,"props":835,"children":836},{"__ignoreMap":7},[837],{"type":24,"value":832},{"type":18,"tag":26,"props":839,"children":840},{},[841,843,849],{"type":24,"value":842},"然后通过argo cli提交工作流",{"type":18,"tag":139,"props":844,"children":846},{"className":845},[],[847],{"type":24,"value":848},"argo submit -n robot --watch 
robot-train-eval.yaml",{"type":24,"value":850},"，执行结果如下，和我们期望的流水线步骤一致。",{"type":18,"tag":26,"props":852,"children":853},{},[854],{"type":18,"tag":55,"props":855,"children":857},{"alt":7,"src":856},"https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/06/06/ac9ad6641dba469d93db62dbfe9a1ffe.jpg",[],{"type":18,"tag":95,"props":859,"children":860},{"id":787},[861],{"type":24,"value":787},{"type":18,"tag":26,"props":863,"children":864},{},[865],{"type":24,"value":866},"本篇文章总结了如何在本地完成MLOps环境的搭建，基于Minikube、Argo等工具可以让我们很好地在本地展开开发验证工作，不用依赖复杂的基础设施，也不用有额外的开销。在配置文件和脚本中，我们没有加真实的实现，是为了先打通流程，然后再将调试好的内容逐步补充进去，始终都可以从端到端的角度来完成验证。",{"title":7,"searchDepth":868,"depth":868,"links":869},4,[870,872,873,879,884],{"id":97,"depth":871,"text":100},2,{"id":117,"depth":871,"text":120},{"id":284,"depth":871,"text":287,"children":874},[875,877,878],{"id":321,"depth":876,"text":324},3,{"id":414,"depth":876,"text":417},{"id":510,"depth":876,"text":513},{"id":593,"depth":871,"text":596,"children":880},[881,882,883],{"id":629,"depth":876,"text":632},{"id":741,"depth":876,"text":744},{"id":821,"depth":876,"text":824},{"id":787,"depth":871,"text":787},"markdown","content:technology-blogs:zh:1521.md","content","technology-blogs/zh/1521.md","technology-blogs/zh/1521","md",1776506113045]