# Developer Voices | Implementing the LDM Diffusion Model with MindSpore

Author: Adream

Source: MindSpore Forum

The MindSpore 2024 technical article sharing event has concluded successfully, gathering 80+ high-quality technical posts over the year. For 2025 the program has been upgraded and relaunched as the "2025 MindSpore Knowledge Shop: you submit, we accept!" campaign, which continues to collect technical articles every month. This article was written and contributed by community developer Adream. If you are interested in the campaign, you are welcome to submit a post on the MindSpore forum.

**# 01**

**Overview**

LDM (Latent Diffusion Model) is an advanced generative model built on a diffusion process. Its core idea is to generate high-quality image samples by progressively removing noise. Unlike conventional diffusion models, LDM introduces a latent space and represents each image as a vector in that space. By learning the mapping within the latent space and pairing it with a pretrained variational autoencoder (VAE), it achieves efficient image generation. Because LDM operates in the latent space, whose feature dimensionality is far smaller than that of pixel space, its inference is significantly faster than DDPM-based models that work directly on pixels.

The main features of **LDM** include:

**1. VAE latent representation learning:** a VAE learns latent representations of images, compressing the data while preserving key features and providing a low-dimensional, efficient input for the subsequent diffusion process.

**2. UNet mapping learning:** a UNet architecture learns the mapping from latent space to image space, exploiting UNet's multi-scale feature extraction and reconstruction capabilities.

**3. Conditional diffusion generation:** the diffusion process can be conditioned, generating specific kinds of images from a given condition (such as text or an image), which makes the model more controllable.

**4. Hierarchical diffusion:** a hierarchical diffusion process generates images across resolution levels, which helps produce high-resolution, detail-rich results.

**# 02**

**Main Steps**

**1. Data encoding**

A pretrained VAE encoder maps the raw high-dimensional image data into a low-dimensional latent space. This not only lowers the computational cost but also extracts the image's key features, giving the subsequent diffusion process a more compact representation to work on.

**2. Latent-space diffusion**

A DDPM-style diffusion process is carried out in the latent space. Gaussian noise is added to the latent representation step by step, turning it from the original data distribution into pure noise. In the reverse process, a UNet removes the noise step by step to produce new latent samples (a sketch of the forward noising step follows).
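The snippet below is a minimal NumPy sketch of the closed-form forward-noising step, assuming a linear β schedule (the standard DDPM formulation z_t = √ᾱ_t·z_0 + √(1−ᾱ_t)·ε). The latent vector, schedule values, and step index are illustrative stand-ins, not values from the original post.

```python
import numpy as np

# Assumed linear beta schedule over 1000 steps (illustrative values).
timesteps = 1000
betas = np.linspace(1e-4, 0.02, timesteps)
alphas_cumprod = np.cumprod(1.0 - betas)   # cumulative product: alpha-bar_t

def q_sample(z0, t, noise):
    """Closed-form sample z_t ~ q(z_t | z_0) for a latent z0 at step t."""
    sqrt_ab = np.sqrt(alphas_cumprod[t])
    sqrt_one_minus_ab = np.sqrt(1.0 - alphas_cumprod[t])
    return sqrt_ab * z0 + sqrt_one_minus_ab * noise

z0 = np.random.randn(1, 64)              # stand-in for a VAE-encoded latent vector
noise = np.random.randn(*z0.shape)
z_t = q_sample(z0, t=500, noise=noise)   # noised latent halfway through the schedule
```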
**3. Data reconstruction**

The pretrained VAE decoder maps the denoised latent representation back to the original image space, yielding the final generated image.

**# 03**

**Key Points**

**1. Perceptual compression and latent-space optimization**

LDM uses a pretrained autoencoder to perceptually compress the high-dimensional image data into a low-dimensional latent space; for example, a 256×256 image can be compressed into a 16×16 latent representation, cutting the computation dramatically. To control the distribution of the latent space, LDM adopts one of the following two regularization techniques:

- **KL regularization:** constrains the latent variables to stay close to a standard normal distribution, much like a VAE, which stabilizes the generation process.
- **VQ regularization:** discretizes the latent representation through a vector-quantization layer, much like VQ-VAE, which helps the model learn structured features.

**2. Diffusion process and denoising mechanism**

LDM runs the diffusion process in latent space, in two phases, forward diffusion and reverse denoising:

- **Forward diffusion:** Gaussian noise is gradually added to the latent variable, with the noise strength increasing with the time step (e.g., under a linear or cosine schedule), until the latent representation approaches a pure-noise distribution.
- **Reverse denoising:** a UNet predicts the noise in the current latent representation and removes it step by step. The objective is the mean-squared error between the predicted and the actual noise; optimizing it improves the model's denoising ability.

**3. Multimodal conditional generation**

LDM introduces a cross-attention mechanism that supports multimodal conditioning inputs such as text, images, and semantic maps. Concretely:

- **Condition encoder:** maps inputs of different modalities (e.g., text, images) into an intermediate representation that then interacts with the UNet's intermediate layers.
- **Attention fusion:** a query (Q), key (K), value (V) mechanism injects the conditioning information into the generation process, enabling precise control, for example producing an image that matches a text description; see the sketch right after this list.
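Here is a minimal MindSpore sketch of this Q/K/V fusion, assuming 64-dimensional image tokens and 128-dimensional condition embeddings. `MiniCrossAttention` and all shapes are illustrative; the version wired into the model appears in the code of section 05.

```python
import mindspore as ms
from mindspore import nn, ops

class MiniCrossAttention(nn.Cell):
    def __init__(self, token_dim=64, cond_dim=128):
        super().__init__()
        self.q = nn.Dense(token_dim, cond_dim)    # queries come from image tokens
        self.k = nn.Dense(cond_dim, cond_dim)     # keys/values come from the condition
        self.v = nn.Dense(cond_dim, cond_dim)
        self.out = nn.Dense(cond_dim, token_dim)  # project back to the token width
        self.scale = float(cond_dim) ** 0.5

    def construct(self, tokens, cond):
        # tokens: (B, N, token_dim); cond: (B, L, cond_dim)
        q, k, v = self.q(tokens), self.k(cond), self.v(cond)
        attn = ops.softmax(ops.matmul(q, k.transpose(0, 2, 1)) / self.scale, axis=-1)
        return self.out(ops.matmul(attn, v))      # (B, N, token_dim)

attn = MiniCrossAttention()
tokens = ops.randn(2, 16, 64)   # 16 spatial tokens per image (illustrative)
cond = ops.randn(2, 4, 128)     # e.g. 4 text-token embeddings
fused = attn(tokens, cond)      # (2, 16, 64), added back onto the UNet features
```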
**# 04**

**Model Structure**

**1. Converting between pixel space and latent space**

- **Encoder:** a convolutional network that, through downsampling and feature extraction, compresses the input image x from the high-dimensional pixel space into the low-dimensional latent space, producing the latent representation z.

![](https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2025/06/27/0f0917ac6a924c8db9a6e51533f7680e.png)

**# 05**

**MindSpore Code Implementation**

```python
import mindspore as ms
from mindspore import nn, ops

# 1. Hyperparameter configuration
config = {
    "latent_dim": 64,           # latent dimension (viewed as a 1x8x8 map in the UNet)
    "image_size": 64,           # input image size
    "batch_size": 32,           # batch size
    "timesteps": 1000,          # number of diffusion steps
    "lr": 1e-4,                 # learning rate
    "channels": [64, 128, 256], # UNet channel widths
    "condition_dim": 128        # condition embedding dimension
}

# 2. Encoder-decoder module
class Encoder(nn.Cell):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, pad_mode='same')
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, pad_mode='same')
        self.bn2 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2, 2)
        # two 2x poolings bring a 64x64 input down to a 64x16x16 feature map
        self.fc = nn.Dense(64 * 16 * 16, config["latent_dim"])

    def construct(self, x):
        x = self.pool(ops.relu(self.bn1(self.conv1(x))))   # 64x64 -> 32x32
        x = self.pool(ops.relu(self.bn2(self.conv2(x))))   # 32x32 -> 16x16
        x = x.view(x.shape[0], -1)
        return self.fc(x)

class Decoder(nn.Cell):
    def __init__(self):
        super().__init__()
        self.fc = nn.Dense(config["latent_dim"], 64 * 16 * 16)
        self.conv1 = nn.Conv2d(64, 32, 3, pad_mode='same')
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 3, 3, pad_mode='same')

    def construct(self, z):
        x = self.fc(z).view(-1, 64, 16, 16)
        x = ops.relu(self.bn1(self.conv1(x)))
        # upsample 16x16 back to the 64x64 image resolution
        x = ops.interpolate(x, size=(64, 64), mode='nearest')
        return self.conv2(x)

# 3. Time-step embedding module
class TimeEmbedding(nn.Cell):
    def __init__(self, dim):
        super().__init__()
        self.embed = nn.Embedding(config["timesteps"], dim)
        self.proj = nn.Dense(dim, dim)

    def construct(self, t):
        return self.proj(self.embed(t))

# 4. Cross-attention module
class CrossAttention(nn.Cell):
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Dense(config["latent_dim"], config["condition_dim"])
        self.k_proj = nn.Dense(config["condition_dim"], config["condition_dim"])
        self.v_proj = nn.Dense(config["condition_dim"], config["condition_dim"])
        # project the attention output back to the token width for the residual add
        self.out_proj = nn.Dense(config["condition_dim"], config["latent_dim"])
        self.scale = float(config["condition_dim"]) ** 0.5

    def construct(self, z, condition):
        # z: (B, N, latent_dim) image tokens; condition: (B, L, condition_dim)
        q = self.q_proj(z)
        k = self.k_proj(condition)
        v = self.v_proj(condition)
        attn = ops.matmul(q, k.transpose(0, 2, 1)) / self.scale
        attn = ops.softmax(attn, axis=-1)
        return self.out_proj(ops.matmul(attn, v))

# 5. UNet denoising network
class UNetBlock(nn.Cell):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, pad_mode='same')
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, pad_mode='same')
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1 convolution aligns channel counts so the residual addition is valid
        self.skip = (nn.Conv2d(in_channels, out_channels, 1)
                     if in_channels != out_channels else nn.Identity())

    def construct(self, x):
        residual = self.skip(x)
        x = ops.relu(self.bn1(self.conv1(x)))
        x = self.bn2(self.conv2(x))
        return ops.relu(x + residual)   # residual connection

class UNet(nn.Cell):
    def __init__(self):
        super().__init__()
        self.time_emb = TimeEmbedding(config["latent_dim"])
        self.down1 = UNetBlock(1, config["channels"][0])   # input is the 1x8x8 latent map
        self.down2 = UNetBlock(config["channels"][0], config["channels"][1])
        self.down3 = UNetBlock(config["channels"][1], config["channels"][2])
        self.up2 = UNetBlock(config["channels"][1] + config["channels"][2], config["channels"][1])
        self.up1 = UNetBlock(config["channels"][0] + config["channels"][1], config["channels"][0])
        self.final = nn.Conv2d(config["channels"][0], 1, 3, pad_mode='same')
        self.cross_attn = CrossAttention()

    def construct(self, x, t, condition):
        # time conditioning: reshape the step embedding into a 1x8x8 map and add it
        t_emb = self.time_emb(t)              # (B, latent_dim) with latent_dim = 64
        x = x + t_emb.view(-1, 1, 8, 8)

        # downsampling path
        x1 = self.down1(x)                            # (B, 64, 8, 8)
        x2 = self.down2(ops.max_pool2d(x1, 2))        # (B, 128, 4, 4)
        x3 = self.down3(ops.max_pool2d(x2, 2))        # (B, 256, 2, 2)

        # upsampling path with skip connections
        x = self.up2(ops.concat((ops.interpolate(x3, size=(4, 4), mode='nearest'), x2), axis=1))
        x = self.up1(ops.concat((ops.interpolate(x, size=(8, 8), mode='nearest'), x1), axis=1))

        # cross-attention: flatten spatial positions into tokens and attend to the condition
        b, c, h, w = x.shape
        tokens = x.view(b, c, h * w).transpose(0, 2, 1)   # (B, HW, C)
        attn = self.cross_attn(tokens, condition)         # (B, HW, C)
        x = x + attn.transpose(0, 2, 1).view(b, c, h, w)

        return self.final(x)

# 6. Full LDM model
class LDM(nn.Cell):
    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.unet = UNet()

    def construct(self, x, t, condition):
        # 1. encode into latent space
        z = self.encoder(x)                               # (B, latent_dim)

        # 2. add noise (simplified single-step linear schedule; the closed-form
        #    cumulative schedule is sketched in section 02)
        beta = 1e-4 + (0.02 - 1e-4) * (t.astype(ms.float32) / config["timesteps"])
        noise = ops.randn(z.shape) * ops.sqrt(beta).view(-1, 1)
        z_noisy = z + noise

        # 3. UNet denoising: view the flat 64-dim latent as a (1, 8, 8) map
        denoised = self.unet(z_noisy.view(-1, 1, 8, 8), t, condition)
        denoised = denoised.view(-1, config["latent_dim"])

        # 4. decode back into image space
        return self.decoder(denoised)
```
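As a quick sanity check, here is a hedged usage sketch (not part of the original post): it instantiates the model above, runs one forward pass on random stand-in data, and takes one optimizer step with a plain reconstruction loss. The random tensors substitute for a real dataset and a real condition encoder.

```python
# Assumed usage sketch: random tensors stand in for real data and conditions.
model = LDM()
x = ops.randn(4, 3, 64, 64)                                    # fake image batch
t = ops.randint(0, config["timesteps"], (4,), dtype=ms.int32)  # random diffusion steps
condition = ops.randn(4, 4, config["condition_dim"])           # fake condition tokens

optimizer = nn.Adam(model.trainable_params(), learning_rate=config["lr"])

def forward_fn(x, t, condition):
    recon = model(x, t, condition)
    # simple reconstruction loss, just to exercise the graph end to end
    return ops.mse_loss(recon, x)

grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)
loss, grads = grad_fn(x, t, condition)
optimizer(grads)
print("loss:", loss)
```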
### References

[1] Paper: https://arxiv.org/pdf/2112.10752