[{"data":1,"prerenderedAt":412},["ShallowReactive",2],{"content-query-rrPSFgVREA":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":406,"_id":407,"_source":408,"_file":409,"_stem":410,"_extension":411},"/technology-blogs/zh/702","zh",false,"","AlphaFold/ RoseTTAFold开源复现（2）—AlphaFold流程分析和训练构建","流程分析和训练构建","2021-09-01","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/09/01/47921be99b7e4453b668d3b494812919.png","technology-blogs","开发者分享",{"type":15,"children":16,"toc":397},"root",[17,25,31,44,55,60,65,70,75,86,91,96,101,106,111,120,128,136,141,149,158,163,171,176,181,186,191,196,201,206,211,219,224,232,237,245,250,258,266,276,281,290,295,300,308,316,324,329,334,343,350,359,364,379,384,389],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"alphafold-rosettafold开源复现2alphafold流程分析和训练构建",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"作者：金雪锋",{"type":18,"tag":26,"props":32,"children":33},{},[34,36],{"type":24,"value":35},"作者主页：",{"type":18,"tag":37,"props":38,"children":42},"a",{"href":39,"rel":40},"https://www.zhihu.com/people/jin-xue-feng",[41],"nofollow",[43],{"type":24,"value":39},{"type":18,"tag":26,"props":45,"children":46},{},[47,49],{"type":24,"value":48},"文章来源：",{"type":18,"tag":37,"props":50,"children":53},{"href":51,"rel":52},"https://zhuanlan.zhihu.com/p/405352109",[41],[54],{"type":24,"value":51},{"type":18,"tag":26,"props":56,"children":57},{},[58],{"type":24,"value":59},"AlphaFold开源后（后续若不做特殊说明，AlphaFold均指AlphaFold 
2），很多研究团队都在分析、重现和尝试进一步提升。相比于让AlphaFold的推理运行起来，AlphaFold的训练重现要复杂得多。主要挑战在于：",{"type":18,"tag":26,"props":61,"children":62},{},[63],{"type":24,"value":64},"1）AlphaFold开源的是推理代码，训练的部分没有公开，但给出了深度网络结构和主要训练超参；",{"type":18,"tag":26,"props":66,"children":67},{},[68],{"type":24,"value":69},"2）AlphaFold训练数据集的构造对训练出好效果非常重要但非常耗时，包含原始训练序列MSA和模型收敛后作为训练样本扩展序列的MSA的搜索，和Template的搜索。每条MSA和Template的搜索从数十分钟到数小时不等，计算成本非常高。",{"type":18,"tag":26,"props":71,"children":72},{},[73],{"type":24,"value":74},"我们尝试从开源的推理代码分析开始，构建典型的训练代码：",{"type":18,"tag":76,"props":77,"children":79},"h2",{"id":78},"_1整体结构",[80],{"type":18,"tag":81,"props":82,"children":83},"strong",{},[84],{"type":24,"value":85},"1、整体结构",{"type":18,"tag":26,"props":87,"children":88},{},[89],{"type":24,"value":90},"AlphaFold包含三大部分：",{"type":18,"tag":26,"props":92,"children":93},{},[94],{"type":24,"value":95},"1）Data蛋白质多序列比对和模板数据处理，",{"type":18,"tag":26,"props":97,"children":98},{},[99],{"type":24,"value":100},"2）Model深度学习网络部分，",{"type":18,"tag":26,"props":102,"children":103},{},[104],{"type":24,"value":105},"3）Relax预测结果再处理部分。",{"type":18,"tag":26,"props":107,"children":108},{},[109],{"type":24,"value":110},"AlphaFold基于Jax实现，在下表给出了在AlphaFold中用到的Jax和Jax之上NN相关的库用到的主要的API和功能。 
在Data和Relax部分，是AI无关的，下表简洁的罗列了数据集和对应的处理工具。",{"type":18,"tag":26,"props":112,"children":113},{},[114],{"type":18,"tag":115,"props":116,"children":119},"img",{"alt":117,"src":118},"1.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/151705qsuhagtctgs9oeww.jpg",[],{"type":18,"tag":26,"props":121,"children":122},{},[123],{"type":18,"tag":81,"props":124,"children":125},{},[126],{"type":24,"value":127},"如果对Jax不熟悉，下图给出了一个基于Jax构建应用算法的简单的模块关系：",{"type":18,"tag":26,"props":129,"children":130},{},[131],{"type":18,"tag":115,"props":132,"children":135},{"alt":133,"src":134},"2.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/151739o30ezsh1vvzzfpel.jpg",[],{"type":18,"tag":26,"props":137,"children":138},{},[139],{"type":24,"value":140},"在构建训练代码前，需对AlphaFold的整个流程了然于心。下面三幅图，是中间AI相关的部分最主要的三幅图。为了理解方便，在图中用多于原图的部分-追加文字，标明了缩写的含义、主要模块之间的流动的数据、和Recycling具体的实现对应的代码。",{"type":18,"tag":26,"props":142,"children":143},{},[144],{"type":18,"tag":115,"props":145,"children":148},{"alt":146,"src":147},"3.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/1517594y1ui3lk9u6dvo6u.jpg",[],{"type":18,"tag":76,"props":150,"children":152},{"id":151},"_2数据处理",[153],{"type":18,"tag":81,"props":154,"children":155},{},[156],{"type":24,"value":157},"2、数据处理",{"type":18,"tag":26,"props":159,"children":160},{},[161],{"type":24,"value":162},"训练的数据处理，可以基于推理的数据处理增补，数据集包含：",{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":18,"tag":81,"props":167,"children":168},{},[169],{"type":24,"value":170},"原始数据：",{"type":18,"tag":26,"props":172,"children":173},{},[174],{"type":24,"value":175},"genetics:",{"type":18,"tag":26,"props":177,"children":178},{},[179],{"type":24,"value":180},"UniRef90: v2020_01 #JackHMMER",{"type":18,"tag":26,"props":182,"children":183},{},[184],{"type":24,"value":185},"MGnify: v2018_12 #JackHMMER",{"type":18,"tag":26,"props":187,"children":188},{},[189],{"type":24,"value":190},"Uniclust30: v2018_08 
#HHblits",{"type":18,"tag":26,"props":192,"children":193},{},[194],{"type":24,"value":195},"BFD: only version available #HHblits",{"type":18,"tag":26,"props":197,"children":198},{},[199],{"type":24,"value":200},"templates:",{"type":18,"tag":26,"props":202,"children":203},{},[204],{"type":24,"value":205},"PDB70: (downloaded 2020-05-13) #HHsearch",{"type":18,"tag":26,"props":207,"children":208},{},[209],{"type":24,"value":210},"PDB: (downloaded 2020-05-14) #Kalign(MSA)",{"type":18,"tag":26,"props":212,"children":213},{},[214],{"type":18,"tag":81,"props":215,"children":216},{},[217],{"type":24,"value":218},"派生数据：",{"type":18,"tag":26,"props":220,"children":221},{},[222],{"type":24,"value":223},"按照论文的技巧，sequence-coordinate数据对不仅有来自于PDB原始的17万多的数据的清洗，还有在训练收敛后，挑选了置信度高的35万左右的数据。这部分的数据的产生，可以从自己的模型训练收敛后排序选择；也可以直接利用AlphaFold提供的模型参数，直接推理无结构的序列来选择；还可以从AlphaFold公开的预测数据集中下载排序选择，从而节省计算资源。",{"type":18,"tag":26,"props":225,"children":226},{},[227],{"type":18,"tag":81,"props":228,"children":229},{},[230],{"type":24,"value":231},"数据处理部分的代码结构：",{"type":18,"tag":26,"props":233,"children":234},{},[235],{"type":24,"value":236},"run_alphafold.py",{"type":18,"tag":26,"props":238,"children":239},{},[240],{"type":18,"tag":115,"props":241,"children":244},{"alt":242,"src":243},"4.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/152051gygx64e5aqdw8a10.jpg",[],{"type":18,"tag":26,"props":246,"children":247},{},[248],{"type":24,"value":249},"data_pipeline.py",{"type":18,"tag":26,"props":251,"children":252},{},[253],{"type":18,"tag":115,"props":254,"children":257},{"alt":255,"src":256},"5.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/152113zttwhn69p9iqzhql.jpg",[],{"type":18,"tag":26,"props":259,"children":260},{},[261],{"type":18,"tag":81,"props":262,"children":263},{},[264],{"type":24,"value":265},"如下是预测部分的输入数据列表（样例）：",{"type":18,"tag":267,"props":268,"children":270},"pre",{"code":269},"predict-input:\n 'aatype': (4, 779),\n 
'residue_index': (4, 779),\n 'seq_length': (4,),\n 'template_aatype': (4, 4, 779),\n 'template_all_atom_masks': (4, 4, 779, 37),\n 'template_all_atom_positions': (4, 4, 779, 37, 3),\n 'template_sum_probs': (4, 4, 1),\n 'is_distillation': (4,),\n 'seq_mask': (4, 779),\n 'msa_mask': (4, 508, 779),\n 'msa_row_mask': (4, 508),\n 'random_crop_to_size_seed': (4, 2),\n 'template_mask': (4, 4),\n 'template_pseudo_beta': (4, 4, 779, 3),\n 'template_pseudo_beta_mask': (4, 4, 779),\n 'atom14_atom_exists': (4, 779, 14),\n 'residx_atom14_to_atom37': (4, 779, 14),\n 'residx_atom37_to_atom14': (4, 779, 37),\n 'atom37_atom_exists': (4, 779, 37),\n 'extra_msa': (4, 5120, 779),\n 'extra_msa_mask': (4, 5120, 779),\n 'extra_msa_row_mask': (4, 5120),\n 'bert_mask': (4, 508, 779),\n 'true_msa': (4, 508, 779),\n 'extra_has_deletion': (4, 5120, 779),\n 'extra_deletion_value': (4, 5120, 779),\n 'msa_feat': (4, 508, 779, 49),\n 'target_feat': (4, 779, 22)\n",[271],{"type":18,"tag":272,"props":273,"children":274},"code",{"__ignoreMap":7},[275],{"type":24,"value":269},{"type":18,"tag":26,"props":277,"children":278},{},[279],{"type":24,"value":280},"如果复用AlphaFold的代码实现train逻辑，输入数据上还需要增加一些字段的处理：如pseudo_beta等target信息，当然可以另行修改自己的框架的表示。",{"type":18,"tag":76,"props":282,"children":284},{"id":283},"_3主体网络",[285],{"type":18,"tag":81,"props":286,"children":287},{},[288],{"type":24,"value":289},"3、主体网络",{"type":18,"tag":26,"props":291,"children":292},{},[293],{"type":24,"value":294},"文首附上了主体网络的结构图，AlphaFold的代码实现部分，结构如下：",{"type":18,"tag":26,"props":296,"children":297},{},[298],{"type":24,"value":299},"model.py",{"type":18,"tag":26,"props":301,"children":302},{},[303],{"type":18,"tag":115,"props":304,"children":307},{"alt":305,"src":306},"6.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/152358ovol5qs3fpyelwds.jpg",[],{"type":18,"tag":26,"props":309,"children":310},{},[311],{"type":18,"tag":81,"props":312,"children":313},{},[314],{"type":24,"value":315},"整个模型的构建和关键点，如下图：",{"ty
pe":18,"tag":26,"props":317,"children":318},{},[319],{"type":18,"tag":115,"props":320,"children":323},{"alt":321,"src":322},"7.jpg","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/152430qdq99ei6ny9ayuqp.jpg",[],{"type":18,"tag":26,"props":325,"children":326},{},[327],{"type":24,"value":328},"关于AlphaFold技术上什么点，让效果这么好，网上解读甚多。最客观的解读其实是论文对主要技术点的消融实验，很能说明问题。如果非要最简洁的总结，我们认为：让各种各层信息在整个网络中来回流动是最重要的，各种信息包含Seq + MSA + (Pair) + Template，各层信息的各种功能流动是指各种Iteration + Recycling + Multiplication + Production。",{"type":18,"tag":26,"props":330,"children":331},{},[332],{"type":24,"value":333},"在训练代码的构建部分，由于是train的逻辑，在AlphaFold的构造参数中，需要设置：is_training=True, compute_loss=True，这样才会将各层各处的复合loss返回出来，计算梯度和让优化器优化权重。",{"type":18,"tag":76,"props":335,"children":337},{"id":336},"_4结构精化",[338],{"type":18,"tag":81,"props":339,"children":340},{},[341],{"type":24,"value":342},"4、结构精化",{"type":18,"tag":26,"props":344,"children":345},{},[346],{"type":18,"tag":115,"props":347,"children":349},{"alt":321,"src":348},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/202109/01/152450zm0cmmevf0c3y6qu.jpg",[],{"type":18,"tag":76,"props":351,"children":353},{"id":352},"_5训练构建",[354],{"type":18,"tag":81,"props":355,"children":356},{},[357],{"type":24,"value":358},"5、训练构建",{"type":18,"tag":26,"props":360,"children":361},{},[362],{"type":24,"value":363},"在数据准备好后，要实现自己的训练，有两种做法：在不修改网络结构的情况下，可以从AlphaFold公开的模型参数上开始进一步训练优化；如果要修改网络，一个最基本的训练逻辑，通过实现两部分可以开始从头训练：",{"type":18,"tag":365,"props":366,"children":367},"ol",{},[368,374],{"type":18,"tag":369,"props":370,"children":371},"li",{},[372],{"type":24,"value":373},"构建自己的数据集和加载器。最简单的做法，是从pipeline.DataPipeline开始修改，增加上训练所需的target相关的信息的读取。",{"type":18,"tag":369,"props":375,"children":376},{},[377],{"type":24,"value":378},"类似于推理的RunModel，实现自己的TrainModel，其重要逻辑包含：模型的代码直接利用开源的推理代码：",{"type":18,"tag":26,"props":380,"children":381},{},[382],{"type":24,"value":383},"modules.AlphaFold(model_config.model)(batch, is_training=True, compute_loss=True, 
ensemble_representations=True, return_representations=False).",{"type":18,"tag":26,"props":385,"children":386},{},[387],{"type":24,"value":388},"优化器用optax@jax实现。",{"type":18,"tag":26,"props":390,"children":391},{},[392],{"type":18,"tag":81,"props":393,"children":394},{},[395],{"type":24,"value":396},"如上，可以基于现有推理代码，构建一个最简单的训练版本。",{"title":7,"searchDepth":398,"depth":398,"links":399},4,[400,402,403,404,405],{"id":78,"depth":401,"text":85},2,{"id":151,"depth":401,"text":157},{"id":283,"depth":401,"text":289},{"id":336,"depth":401,"text":342},{"id":352,"depth":401,"text":358},"markdown","content:technology-blogs:zh:702.md","content","technology-blogs/zh/702.md","technology-blogs/zh/702","md",1776506140031]