[{"data":1,"prerenderedAt":357},["ShallowReactive",2],{"content-query-4IJHlPx8yr":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":351,"_id":352,"_source":353,"_file":354,"_stem":355,"_extension":356},"/technology-blogs/zh/634","zh",false,"","MindSpore AI科学计算系列（2）：药物分子预训练模型分析","药物分子预训练模型分析","2021-07-02","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2021/07/02/ae17d57e7dc04f3d8f81e5303f3c33ca.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":348},"root",[17,25,28,34,47,58,67,72,80,85,93,101,109,114,119,126,134,139,149,154,161,166,174,179,186,191,196,204,209,216,221,226,233,238,246,251,259,274,281,286,291,296,301,308,313,321,326,333,338,343],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-ai科学计算系列2药物分子预训练模型分析",[23],{"type":24,"value":8},"text",{"type":18,"tag":19,"props":26,"children":27},{"id":7},[],{"type":18,"tag":29,"props":30,"children":31},"p",{},[32],{"type":24,"value":33},"作者：于璠",{"type":18,"tag":29,"props":35,"children":36},{},[37,39],{"type":24,"value":38},"作者主页：",{"type":18,"tag":40,"props":41,"children":45},"a",{"href":42,"rel":43},"https://www.zhihu.com/people/yu-fan-42-9",[44],"nofollow",[46],{"type":24,"value":42},{"type":18,"tag":29,"props":48,"children":49},{},[50,52],{"type":24,"value":51},"文章来源：",{"type":18,"tag":40,"props":53,"children":56},{"href":54,"rel":55},"https://zhuanlan.zhihu.com/p/385924365",[44],[57],{"type":24,"value":54},{"type":18,"tag":29,"props":59,"children":60},{},[61],{"type":18,"tag":62,"props":63,"children":64},"strong",{},[65],{"type":24,"value":66},"写在之前",{"type":18,"tag":29,"props":68,"children":69},{},[70],{"type":24,"value":71},"深度学习模型广泛地应用于图像、文本、音频等领域，并发挥出了巨大作用。而在生物制药领域，研究人员也正在通过深度学习模型来辅助药物分子的设计、表征和优化，以此来减少资源成本，其在虚拟筛选等环节中发挥很好的作用，缩短药物分子研发的周期，从而降低研发成本。",{"type":18,"tag":29,"props":73,"children":74},{},[75],{"type":18,"tag":62,"props":76,"children":77},{},[78],{"
type":24,"value":79},"分子表示",{"type":18,"tag":29,"props":81,"children":82},{},[83],{"type":24,"value":84},"首先来看一下如何来表示一个分子，我们最常见的一般是化学表达式，但这并不适用于深度学习模型的输入。分子表示是指用于每个分子的数字编码，表示的方法必须捕获每个分子的基本结构信息，从分子结构中创建适当的表示称为特征化。一般表示需要有两个重要属性：唯一性和可逆性。唯一性意思就是每个分子结构为单一表示，可逆性指每个表示与单个分子相关联。大部分用于分子生成的表示都是可逆的，但并不一定是唯一的，原因例如旋转、平移等。下图列举了常用的分子表示方法（包含2D和3D表示）。",{"type":18,"tag":29,"props":86,"children":87},{},[88],{"type":18,"tag":89,"props":90,"children":92},"img",{"alt":7,"src":91},"https://pic2.zhimg.com/80/v2-b38ca33e61fcc0f18ece10888226f935_720w.jpg",[],{"type":18,"tag":29,"props":94,"children":95},{},[96],{"type":18,"tag":62,"props":97,"children":98},{},[99],{"type":24,"value":100},"传统深度学习方法",{"type":18,"tag":29,"props":102,"children":103},{},[104],{"type":18,"tag":62,"props":105,"children":106},{},[107],{"type":24,"value":108},"循环神经网络（RNN）",{"type":18,"tag":29,"props":110,"children":111},{},[112],{"type":24,"value":113},"为什么首先提到RNN呢，因为对于VAE、GAN进行分子生成的方法，RNN是重要的序列生成基础，RNN类方法的训练loss往往通过在序列空间中最大化似然概率。",{"type":18,"tag":29,"props":115,"children":116},{},[117],{"type":24,"value":118},"以RNN为基础的分子生成模型结构如下图，分子表示以SMILES为例（其表示应用最为广泛）",{"type":18,"tag":29,"props":120,"children":121},{},[122],{"type":18,"tag":89,"props":123,"children":125},{"alt":7,"src":124},"https://pic4.zhimg.com/80/v2-b19e1b81c98d971c00330f0bea996e5f_720w.jpg",[],{"type":18,"tag":29,"props":127,"children":128},{},[129],{"type":18,"tag":62,"props":130,"children":131},{},[132],{"type":24,"value":133},"变分自编码器（VAE）",{"type":18,"tag":29,"props":135,"children":136},{},[137],{"type":24,"value":138},"若单纯使用RNN会存在着长期依赖的问题，生成效果并不好，因此引入了VAE的结构来优化生成效果。VAE是从变分理论推导出来的，相关原理可参考我之前写的一篇文章。",{"type":18,"tag":29,"props":140,"children":141},{},[142],{"type":18,"tag":40,"props":143,"children":146},{"href":144,"rel":145},"https://zhuanlan.zhihu.com/p/259977975",[44],[147],{"type":24,"value":148},"于璠：MindSpore深度概率推断算法与概率模型zhuanlan.zhihu.com",{"type":18,"tag":29,"props":150,"children":151},{},[152],{"type":24,"value":153},"那么基于VAE架构的分子生成模型则如下图所示：",{"ty
pe":18,"tag":29,"props":155,"children":156},{},[157],{"type":18,"tag":89,"props":158,"children":160},{"alt":7,"src":159},"https://pic2.zhimg.com/80/v2-cd8c64c251fc18a356e573eef707f415_720w.jpg",[],{"type":18,"tag":29,"props":162,"children":163},{},[164],{"type":24,"value":165},"encoder部分可用CNN或RNN来提取分子特征，之后压缩到隐层空间中，通过VAE采样隐层向量，接RNN decoder来解码生成分子SMILES表达式，采样的存在也使得生成的分子多样性得到一定的保证。",{"type":18,"tag":29,"props":167,"children":168},{},[169],{"type":18,"tag":62,"props":170,"children":171},{},[172],{"type":24,"value":173},"生成对抗网络（GAN）",{"type":18,"tag":29,"props":175,"children":176},{},[177],{"type":24,"value":178},"当然以RNN为基础的GAN网络通过生成对抗的方式也广泛应用于分子生成任务，引入了一个Discriminator，通过Discriminator来判别输入的SMILES是属于reconstruct之后的SMILES，还是属于真实的数据，这样相当于额外增加了loss。模型结构如下图：",{"type":18,"tag":29,"props":180,"children":181},{},[182],{"type":18,"tag":89,"props":183,"children":185},{"alt":7,"src":184},"https://pic3.zhimg.com/80/v2-527609d7aa6948c54b83d7d57eb8842e_720w.jpg",[],{"type":18,"tag":29,"props":187,"children":188},{},[189],{"type":24,"value":190},"--------此处为分割线-------",{"type":18,"tag":29,"props":192,"children":193},{},[194],{"type":24,"value":195},"近些年来，Transformer在NLP领域大放异彩，它们摒弃了基于RNN结构的序列依赖特性，引入了全新的attention机制，让处于序列任意位置的单元都能看到全局信息，促进了NLP预训练模型的发展。引入这个思想，分子如上文所说可以表达为图或者序列，那么分子预训练模型则应运而生，其在分子表征、生成以及各类下游任务中均取得了不错的效果。下面介绍一下基于文本表示（SMILES）的方法和基于图（2D）的方法。",{"type":18,"tag":29,"props":197,"children":198},{},[199],{"type":18,"tag":62,"props":200,"children":201},{},[202],{"type":24,"value":203},"基于SMILES的预训练模型",{"type":18,"tag":29,"props":205,"children":206},{},[207],{"type":24,"value":208},"这方面比较有代表性的是业界的X-MOL预训练模型，该模型基于文本表示为输入，学习分子的SMILES表达，先看一下总的模型结构图",{"type":18,"tag":29,"props":210,"children":211},{},[212],{"type":18,"tag":89,"props":213,"children":215},{"alt":7,"src":214},"https://pic2.zhimg.com/80/v2-b7f7da4c841a99c26bc5ed14033dd59d_720w.jpg",[],{"type":18,"tag":29,"props":217,"children":218},{},[219],{"type":24,"value":220},"a)指X-MOL的工作流，包括pre-training和fine-tuning两大模块； 
b)指基于encoder-decoder的语言模型 c)指X-MOL用于不同下游任务的微调 d)指X-MOL用于不同生成任务",{"type":18,"tag":29,"props":222,"children":223},{},[224],{"type":24,"value":225},"该工作中提到的“SMILES is all you need”这种思想也比较有意思，将SMILES表达式新采用knowledge embedding的形式，包含三种策略的embedding，分别为link embedding，ring embedding和type embedding，之后和传统的embedding（Char embedding，Pos embedding以及id embedding）进行结合。",{"type":18,"tag":29,"props":227,"children":228},{},[229],{"type":18,"tag":89,"props":230,"children":232},{"alt":7,"src":231},"https://pic4.zhimg.com/80/v2-230f6ab8d0beecece119fb3cdcadce33_720w.jpg",[],{"type":18,"tag":29,"props":234,"children":235},{},[236],{"type":24,"value":237},"link embedding主要设计了将结构信息的每个原子、键和符号的连接信息合并到SMILES中；SMILES使用一对数字来表示环形结构，两个相同的数字代表一个开环原子和闭环原子，那么在这种情况下，数字代表连接信息和环结构，因此设计了环嵌入来包含环结构信息的数量对；SMILES需要引入额外的字符来表示结构，不同类型的字符设计了类型嵌入来包含类型信息。",{"type":18,"tag":29,"props":239,"children":240},{},[241],{"type":18,"tag":62,"props":242,"children":243},{},[244],{"type":24,"value":245},"基于GNN的预训练模型",{"type":18,"tag":29,"props":247,"children":248},{},[249],{"type":24,"value":250},"这方面有代表性的为GROVER模型，其思想主要为基于graph transformer的节点、边和图任务来学习分子的丰富结构信息。该模型中，一个分子可以被抽象成一个图结构",{"type":18,"tag":29,"props":252,"children":253},{},[254],{"type":18,"tag":89,"props":255,"children":258},{"alt":256,"src":257},"[公式]","https://www.zhihu.com/equation?tex=G%3D%28%5Cnu%2C%5Cvarepsilon%29+",[],{"type":18,"tag":29,"props":260,"children":261},{},[262,266,268,272],{"type":18,"tag":89,"props":263,"children":265},{"alt":256,"src":264},"https://www.zhihu.com/equation?tex=%7C%5Cnu%7C%3Dn+",[],{"type":24,"value":267}," 代表的是n个节点，即原子的个数； ",{"type":18,"tag":89,"props":269,"children":271},{"alt":256,"src":270},"https://www.zhihu.com/equation?tex=+%7C%5Cvarepsilon%7C%3Dm+",[],{"type":24,"value":273}," 代表的是边的条数，即原子间的连接边。在图学习中，主要有两类学习任务：节点级分类/回归和图级分类/回归。GROVER包含两类子模型：Node GNN transformer和Edge GNN transformer，来看一下GNN 
transformer的总体结构",{"type":18,"tag":29,"props":275,"children":276},{},[277],{"type":18,"tag":89,"props":278,"children":280},{"alt":7,"src":279},"https://pic2.zhimg.com/80/v2-4c8394e243e6338986fdb98840ee303d_720w.jpg",[],{"type":18,"tag":29,"props":282,"children":283},{},[284],{"type":24,"value":285},"GTransformer可以进行双层信息提取：1）消息传递过程捕获局部图的结构信息，这一步采用GNN模型输出q，k，v。2）q，k，v输入到Transformer中进行全局节点间的特征提取。通过双层的信息提取来更好地表达分子结构信息。",{"type":18,"tag":29,"props":287,"children":288},{},[289],{"type":24,"value":290},"另外，远程残差连接（Long-range residual connection）将输入特征的初始节点/边特征信息直接连接到GTransformer的最后一层，而不是原始transformer架构中的多个短层残差连接。这样有什么好处呢，1）和普通残差连接一样，可以通过减轻梯度消失问题来改进训练过程；2）远程残差连接可以缓解消息传递过程中的过度平滑问题。",{"type":18,"tag":29,"props":292,"children":293},{},[294],{"type":24,"value":295},"模型中的动态消息传递网络（dyMPN）采用随机消息传递方案，被证明比没有随机化的普通消息传递网络具有更好的泛化性能。",{"type":18,"tag":29,"props":297,"children":298},{},[299],{"type":24,"value":300},"GROVER预训练过程采用了多个自监督任务，如下图所示：",{"type":18,"tag":29,"props":302,"children":303},{},[304],{"type":18,"tag":89,"props":305,"children":307},{"alt":7,"src":306},"https://pic1.zhimg.com/80/v2-66c1831490e285c11858db3c6c196370_720w.jpg",[],{"type":18,"tag":29,"props":309,"children":310},{},[311],{"type":24,"value":312},"该模型在预训练过程中不使用监督标签，而是提出了新的自监督学习任务：上下文属性预测（节点/边级）和图级基序预测。",{"type":18,"tag":29,"props":314,"children":315},{},[316],{"type":18,"tag":62,"props":317,"children":318},{},[319],{"type":24,"value":320},"下游任务",{"type":18,"tag":29,"props":322,"children":323},{},[324],{"type":24,"value":325},"上述两类预训练模型的下游任务均在MoleculeNet榜单上进行了验证，经典的下游任务如下图所示。",{"type":18,"tag":29,"props":327,"children":328},{},[329],{"type":18,"tag":89,"props":330,"children":332},{"alt":7,"src":331},"https://pic2.zhimg.com/80/v2-58212a567d9d59e1d31121a76ed10481_720w.jpg",[],{"type":18,"tag":29,"props":334,"children":335},{},[336],{"type":24,"value":337},"下游任务涵盖量子力学、物理化学、生物和生理等领域，主要为分类和回归任务，例如计算常见有机小分子的水溶性，属性预测等，可以在这些下游任务上一定程度验证模型的表征效果。",{"type":18,"tag":29,"props":339,"children":340},{},[341],{"type":24,"value":342},
"另外，分子表征还可以应用于化学反应产物预测、药物间相互作用、分子生成及分子优化等方面，对于药物研发的流程进行诸多方面的AI赋能。",{"type":18,"tag":29,"props":344,"children":345},{},[346],{"type":24,"value":347},"本篇文章就到这里啦，欢迎大家批评指正哈。目前，我们团队也在药物分子预训练模型方面进行了一些探索，另外我们也努力在药物研发的更多环节进行AI的赋能，也希望感兴趣的朋友能一起探讨下AI科学计算带来的更多应用。",{"title":7,"searchDepth":349,"depth":349,"links":350},4,[],"markdown","content:technology-blogs:zh:634.md","content","technology-blogs/zh/634.md","technology-blogs/zh/634","md",1776506139037]