[{"data":1,"prerenderedAt":446},["ShallowReactive",2],{"content-query-Zvn3P6FdA2":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":440,"_id":441,"_source":442,"_file":443,"_stem":444,"_extension":445},"/technology-blogs/zh/2864","zh",false,"","MindSpore AI科学计算系列 | “没有最快，只有更快”，快速傅里叶变换详解","作者：于璠 来源：知乎","2023-11-06","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2023/11/10/32e0dec709de4b139c22a179f7fff58b.png","technology-blogs","大V博文",{"type":15,"children":16,"toc":428},"root",[17,25,44,52,57,66,71,76,83,88,95,100,105,115,125,133,138,143,151,156,163,168,173,178,188,193,200,205,214,219,224,235,240,247,252,257,265,270,279,284,293,298,307,312,322,327,332,340,345,358,363,368,373,378,383,388,398,408,418],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore-ai科学计算系列-没有最快只有更快快速傅里叶变换详解",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29,31,37,39],{"type":24,"value":30},"**作者：**",{"type":18,"tag":32,"props":33,"children":34},"strong",{},[35],{"type":24,"value":36},"于璠",{"type":24,"value":38}," ",{"type":18,"tag":32,"props":40,"children":41},{},[42],{"type":24,"value":43},"来源：知乎",{"type":18,"tag":26,"props":45,"children":46},{},[47],{"type":18,"tag":32,"props":48,"children":49},{},[50],{"type":24,"value":51},"背景",{"type":18,"tag":26,"props":53,"children":54},{},[55],{"type":24,"value":56},"离散傅里叶变换（Discrete Fourier Transform，缩写为DFT）将信号的时域采样变换为频域的采样，是傅里叶变换在时域和频域上的离散形式。DFT是非常重要的离散变换，在许多实际应用中用于执行傅里叶分析。例如在数字信号处理中，声压、无线电信号或每日温度读数，都可以看作随时间变化的函数在有限时间间隔（窗口）内的采样。在图像处理中，样本可以是沿光栅图像的行或列的像素值。在数学领域中，DFT还用于有效求解偏微分方程、卷积或大整数相乘等运算。在AI4Science领域，DFT也是很多算法(如Fourier Neural 
Operator，FNO)的基础。",{"type":18,"tag":26,"props":58,"children":59},{},[60],{"type":18,"tag":61,"props":62,"children":65},"img",{"alt":63,"src":64},"image.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064048.23068416756581302278166268165020:50541109073039:2400:3E4460A469C98650FA654B61151DEAA7254ABA2B3711EB4937926ABA047C45D0.png",[],{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":24,"value":70},"图1. 傅里叶变换示意图",{"type":18,"tag":26,"props":72,"children":73},{},[74],{"type":24,"value":75},"对于给定长度N的复数序列，DFT如式1所示，其中，x为N个复数的输入序列，X为对应的输出序列，指数幂项常被称为“旋转因子”(twiddle factor)。",{"type":18,"tag":26,"props":77,"children":78},{},[79],{"type":18,"tag":61,"props":80,"children":82},{"alt":63,"src":81},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064105.92429732208523998184124287990965:50541109073039:2400:9A6FEB983128A3D10A23348D57E688CC9C30863579D9DA9416108200F280A823.png",[],{"type":18,"tag":26,"props":84,"children":85},{},[86],{"type":24,"value":87},"式1",{"type":18,"tag":26,"props":89,"children":90},{},[91],{"type":18,"tag":61,"props":92,"children":94},{"alt":63,"src":93},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064125.42892868975334314776667799940832:50541109073039:2400:5F38B1DA5E978A12DD0355B623F483E6B76030CD82E40115911B9DE00C07492A.png",[],{"type":18,"tag":26,"props":96,"children":97},{},[98],{"type":24,"value":99},"式2",{"type":18,"tag":26,"props":101,"children":102},{},[103],{"type":24,"value":104},"从式2可以看出，DFT本质上是向量矩阵操作，计算N个点的DFT需要N^2次复数乘法，N(N-1)次复数加法，总的时间复杂度为O(N^2×mul+N(N-1)×add)。",{"type":18,"tag":26,"props":106,"children":107},{},[108,113],{"type":18,"tag":32,"props":109,"children":110},{},[111],{"type":24,"value":112},"快速傅里叶变换",{"type":24,"value":114},"（Fast Fourier Transform, 
缩写为FFT）是高效、快速计算离散傅里叶变换（DFT）或其逆变换的统称。FFT利用DFT的特性，例如对称性和周期性，去减少DFT中的重复计算，降低复杂度。FFT算法的提出，使DFT在工程、科学和数学等领域得到了更加广泛的应用，被IEEE科学与工程计算期刊评选为20世纪十大算法之一。",{"type":18,"tag":26,"props":116,"children":117},{},[118,120],{"type":24,"value":119},"1、",{"type":18,"tag":32,"props":121,"children":122},{},[123],{"type":24,"value":124},"方法",{"type":18,"tag":26,"props":126,"children":127},{},[128],{"type":18,"tag":32,"props":129,"children":130},{},[131],{"type":24,"value":132},"1.1 库利-图基算法",{"type":18,"tag":26,"props":134,"children":135},{},[136],{"type":24,"value":137},"库利-图基算法（Cooley-Tukey Algorithm）是目前应用最广泛、最流行的FFT算法。这一方法以分治法为策略递归地将长度为N的DFT分解为长度为N1和N2的子序列的DFT，以及与旋转因子的复数乘法。此处以基为2的Cooley-Tukey算法为例，即长度N为2的幂次(不满足时补零)，将序列长为N的DFT分割为两个长为N/2的子序列的DFT。",{"type":18,"tag":26,"props":139,"children":140},{},[141],{"type":24,"value":142},"Cooley-Tukey算法利用了DFT的对称性和周期性，减少DFT中的重复计算，降低复杂度。推导如下：",{"type":18,"tag":26,"props":144,"children":145},{},[146],{"type":18,"tag":61,"props":147,"children":150},{"alt":148,"src":149},"cke_2801.png","https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064155.76416179559566719534674303297407:50541109073039:2400:E312F0EC0ECC99174EF3E16467340D136F90E5866E73F827067511BAA1D1774B.png",[],{"type":18,"tag":26,"props":152,"children":153},{},[154],{"type":24,"value":155},"可以看出，蝶形网络中的每次蝶形运算需要一次复数乘法以及两次复数加法。对于长度N=8的采样序列x(n)，可推出以下蝶形网络：",{"type":18,"tag":26,"props":157,"children":158},{},[159],{"type":18,"tag":61,"props":160,"children":162},{"alt":63,"src":161},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064225.24356630054013335143514232528969:50541109073039:2400:AF93A996ACF42B4D45B95EC397D59A9BF28EF34493EFEF5A5C905409A2049749.png",[],{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":24,"value":167},"图3. 
8点蝶形网络",{"type":18,"tag":26,"props":169,"children":170},{},[171],{"type":24,"value":172},"按照Cooley-Tukey算法流程，在以2为基底的条件下，假设给定长度为N=2^L的复数序列，每一级需要2^(L-1)次复数乘法和2^L次复数加法，共L级依次递推后，总的计算复杂度为O(N/2*logN×mul+NlogN×add)，相较于DFT，显著地降低了计算复杂度。",{"type":18,"tag":26,"props":174,"children":175},{},[176],{"type":24,"value":177},"进一步地，Cooley-Tukey算法还可以选择不同的基底为单位对序列进行拆解，不同的拆解方法会产生不同层数的快速傅里叶变换架构，基底越大则层数越少，复数乘法器也越少，但是每级的蝶形架构则会越复杂，因此常见的架构为2基底、4基底与8基底这三种设计。",{"type":18,"tag":179,"props":180,"children":182},"h2",{"id":181},"_12-分裂基算法split-radix",[183],{"type":18,"tag":32,"props":184,"children":185},{},[186],{"type":24,"value":187},"1.2 分裂基算法(Split-radix)",{"type":18,"tag":26,"props":189,"children":190},{},[191],{"type":24,"value":192},"分裂基算法（Split-radix FFT Algorithm, SRA）由Duhamel和Hollmann于1984年提出。SRA算法是目前众多 FFT 算法中乘法和加法次数最少的算法，而且它具有良好的运算结构，以及较短的运算程序。SRA算法基本思路就是对偶数项序列使用radix-2 FFT算法，对奇数项序列使用radix-4 FFT算法。",{"type":18,"tag":26,"props":194,"children":195},{},[196],{"type":18,"tag":61,"props":197,"children":199},{"alt":63,"src":198},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064325.56973778368193190643885071007623:50541109073039:2400:27F8F8A9CBB5D899A55AECFCA1474F0379F3CEFC93C690DA3B37ADD562A6D1F4.png",[],{"type":18,"tag":26,"props":201,"children":202},{},[203],{"type":24,"value":204},"图4. 
分裂基算法示意图",{"type":18,"tag":179,"props":206,"children":208},{"id":207},"_13-互质因子算法prime-factor-algorithm-pfa",[209],{"type":18,"tag":32,"props":210,"children":211},{},[212],{"type":24,"value":213},"1.3 互质因子算法(Prime Factor Algorithm, PFA)",{"type":18,"tag":26,"props":215,"children":216},{},[217],{"type":24,"value":218},"互质因子算法（Prime-factor FFT algorithm, PFA），又称为Good-Thomas算法，也是一种快速傅里叶变换。它是把输入规模为N的DFT，分解为N_1 × N_2大小的二维DFT，其中N_1与N_2互质。变成大小为N_1和N_2的DFT之后，可以继续递归使用 PFA，或选择其他 FFT 算法(例如 Cooley-Tukey)来计算。",{"type":18,"tag":26,"props":220,"children":221},{},[222],{"type":24,"value":223},"互质因子算法将一维的 DFT 问题转变为多维问题来进行计算。总的时间复杂度：O(N(N_1+N_2)(mul+add))。",{"type":18,"tag":179,"props":225,"children":227},{"id":226},"_14-raders-fft算法",[228,230],{"type":24,"value":229},"1.4 Rader'",{"type":18,"tag":32,"props":231,"children":232},{},[233],{"type":24,"value":234},"s FFT算法",{"type":18,"tag":26,"props":236,"children":237},{},[238],{"type":24,"value":239},"Rader算法是针对输入大小为质数的DFT的快速算法，该算法将长度为N的DFT重新表示为长度为N-1的循环卷积：",{"type":18,"tag":26,"props":241,"children":242},{},[243],{"type":18,"tag":61,"props":244,"children":246},{"alt":63,"src":245},"https://fileserver.developer.huaweicloud.com/FileServer/getFile/cmtybbs/e64/154/b38/90a1d5d431e64154b387b3660e356ff5.20231110064403.07692673863947767110291324557693:50541109073039:2400:913691952C7A5E02B6C2D01C9DE3A202C1AD3107DF56AB0F989914ED5D79A6B1.png",[],{"type":18,"tag":26,"props":248,"children":249},{},[250],{"type":24,"value":251},"式5",{"type":18,"tag":26,"props":253,"children":254},{},[255],{"type":24,"value":256},"循环卷积又是可分解的，再根据卷积定理，将长度为N-1的循环卷积再分解为更小规模的DFT问题。",{"type":18,"tag":26,"props":258,"children":259},{},[260],{"type":18,"tag":32,"props":261,"children":262},{},[263],{"type":24,"value":264},"2、结论",{"type":18,"tag":26,"props":266,"children":267},{},[268],{"type":24,"value":269},"通过学习上述的几种FFT方法，可总结出以下几点：",{"type":18,"tag":179,"props":271,"children":273},{"id":272},"_21-蝶形网络的组织方式将影响整个优化",[274],{"type":18,"tag":32,"props":275,"children":276},{},[277
],{"type":24,"value":278},"2.1 蝶形网络的组织方式将影响整个优化",{"type":18,"tag":26,"props":280,"children":281},{},[282],{"type":24,"value":283},"蝶形网络决定了数据访问模式和蝶形计算执行顺序，相同的蝶形网络，不同的实现和优化可能导致不同的性能。例如相对于Cooley-Tukey算法来说，PFA算法虽然在浮点计算量上有所减少，但是其复杂的映射关系将导致更多的数据存取，因此PFA算法适用于数据存取开销小于浮点计算开销的平台架构。",{"type":18,"tag":179,"props":285,"children":287},{"id":286},"_22-蝶形计算的性能将直接影响fft算法的最终性能",[288],{"type":18,"tag":32,"props":289,"children":290},{},[291],{"type":24,"value":292},"2.2 蝶形计算的性能将直接影响FFT算法的最终性能",{"type":18,"tag":26,"props":294,"children":295},{},[296],{"type":24,"value":297},"在FFT计算过程中蝶形计算被反复调用，例如，根据DFT本身的周期性与对称性，可以挖掘蝶形计算过程中的共同项，供各级重复利用，这将减少大量的浮点计算开销。",{"type":18,"tag":179,"props":299,"children":301},{"id":300},"_23-实现高性能fft需要与硬件架构相结合",[302],{"type":18,"tag":32,"props":303,"children":304},{},[305],{"type":24,"value":306},"2.3 实现高性能FFT需要与硬件架构相结合",{"type":18,"tag":26,"props":308,"children":309},{},[310],{"type":24,"value":311},"例如，蝶形计算是互相独立的，采用SIMD可并行处理多个蝶形计算，并行的数量又依赖其余部件的情况，如浮点寄存器的个数。所以，硬件本身能否被充分利用，直接关系到FFT程序性能的提升。",{"type":18,"tag":26,"props":313,"children":314},{},[315,317],{"type":24,"value":316},"3、",{"type":18,"tag":32,"props":318,"children":319},{},[320],{"type":24,"value":321},"总结",{"type":18,"tag":26,"props":323,"children":324},{},[325],{"type":24,"value":326},"随着异构计算框架的发展，使用不同类型指令集和体系架构的计算单元组成系统的计算方式已经非常普遍。部分研究人员对于FFT算法的研究，从降低FFT本身的计算开销转移至在特定的硬件架构下实现高性能的FFT。在这方面，已经有一些成熟的高性能快速傅里叶变换算法库，例如FFTW、ARMPL、Intel MKL、鲲鹏的KML_FFT、AOCL（AMD Optimizing CPU Libraries）、CUFFT（CUDA fast Fourier transform library）等。",{"type":18,"tag":26,"props":328,"children":329},{},[330],{"type":24,"value":331},"昇腾异构计算架构也在结合自身优势，从提升计算效率、降低功耗等方面，实现高性能的FFT库，预计不久将与大家见面。",{"type":18,"tag":179,"props":333,"children":335},{"id":334},"参考文献",[336],{"type":18,"tag":32,"props":337,"children":338},{},[339],{"type":24,"value":334},{"type":18,"tag":26,"props":341,"children":342},{},[343],{"type":24,"value":344},"[1]Cooley J W, Tukey J W. An algorithm for the machine calculation of complex Fourier series[J]. 
Mathematics of computation, 1965, 19(90): 297-301.",{"type":18,"tag":26,"props":346,"children":347},{},[348,350],{"type":24,"value":349},"[2]Good I J. The interaction algorithm and practical Fourier analysis[J]. Journal of the Royal Statistical Society Series B: Statistical Methodology, 1958, 20(2): 361-372. ",{"type":18,"tag":351,"props":352,"children":356},"a",{"href":353,"rel":354},"https://doi.org/10.1111/j.2517-6161.1958.tb00300.x",[355],"nofollow",[357],{"type":24,"value":353},{"type":18,"tag":26,"props":359,"children":360},{},[361],{"type":24,"value":362},"[3]Thomas L H. Using a computer to solve problems in physics[J]. Applications of digital computers, 1963: 44-45.",{"type":18,"tag":26,"props":364,"children":365},{},[366],{"type":24,"value":367},"[4]Duhamel P, Hollmann H. ‘Split radix’FFT algorithm[J]. Electronics letters, 1984, 20(1): 14-16.",{"type":18,"tag":26,"props":369,"children":370},{},[371],{"type":24,"value":372},"[5]Rader C M. Discrete Fourier transforms when the number of data samples is prime[J]. Proceedings of the IEEE, 1968, 56(6): 1107-1108. DOI: 10.1109/PROC.1968.6477",{"type":18,"tag":26,"props":374,"children":375},{},[376],{"type":24,"value":377},"[6]陈暾, 李志豪, 贾海鹏, 等. 基于 ARMv8 平台的多维 FFT 实现与优化研究[J]. 计算机学报, 2019, 42(11): 2384-2402.",{"type":18,"tag":26,"props":379,"children":380},{},[381],{"type":24,"value":382},"[7]郭金鑫, 张广婷, 张云泉, 等. Cooley-Tukey FFT 算法高性能实现与优化研究[J]. 计算机科学与探索, 2022, 16(6): 1304. 
DOI: 10.3778/j.issn.1673-9418.2011092",{"type":18,"tag":26,"props":384,"children":385},{},[386],{"type":24,"value":387},"往期回顾",{"type":18,"tag":26,"props":389,"children":390},{},[391],{"type":18,"tag":351,"props":392,"children":395},{"href":393,"rel":394},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247610343&idx=1&sn=439ab653c8b9a1c43522cb1ff4edb324&chksm=c11e3fa8f669b6beebd68b7298d7c3d48abd229a4fb0b7b6fd628539c5d1e81cbb08578ad1d4&scene=21#wechat_redirect",[355],[396],{"type":24,"value":397},"MindSpore AI科学计算系列 | 有效改善预报模糊问题，生成式模型在短临降水预报中大放异彩",{"type":18,"tag":26,"props":399,"children":400},{},[401],{"type":18,"tag":351,"props":402,"children":405},{"href":403,"rel":404},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247610058&idx=1&sn=7358eefefaf4e48be10e16b2b24a3170&chksm=c11e3e85f669b793d55eea2a57f2caa44aad8fc88d2c8ba0ef2e1f277d2f8f60f6a9c1d58562&scene=21#wechat_redirect",[355],[406],{"type":24,"value":407},"MindSpore AI科学计算系列 | 基于深度学习模型来替代传统DFT模型以及DeephE3模型的分析综述",{"type":18,"tag":26,"props":409,"children":410},{},[411],{"type":18,"tag":351,"props":412,"children":415},{"href":413,"rel":414},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247609303&idx=1&sn=a2704ead205580312b39366deeebeba3&chksm=c11e3b98f669b28e6c6ed04acf3e3bea3303733ada959d4db83e2bf940a095071408509d6d98&scene=21#wechat_redirect",[355],[416],{"type":24,"value":417},"MindSpore AI科学计算系列 | 以MindSpore Elec为例的智能电磁计算若干进展综述",{"type":18,"tag":26,"props":419,"children":420},{},[421],{"type":18,"tag":351,"props":422,"children":425},{"href":423,"rel":424},"http://mp.weixin.qq.com/s?__biz=MzkxMTM2MjMzNg==&mid=2247606710&idx=1&sn=6375a788218ccdd3f275e305bff292be&chksm=c11e31f9f669b8ef3af56d04541d885904905734f5bbe69c8c9d57dd6c91b1e9bdd93d579b75&scene=21#wechat_redirect",[355],[426],{"type":24,"value":427},"MindSpore AI科学计算系列 | 
最新综述文章梳理量子到宏观尺度AI4S共性介绍",{"title":7,"searchDepth":429,"depth":429,"links":430},4,[431,433,434,436,437,438,439],{"id":181,"depth":432,"text":187},2,{"id":207,"depth":432,"text":213},{"id":226,"depth":432,"text":435},"1.4 Rader's FFT算法",{"id":272,"depth":432,"text":278},{"id":286,"depth":432,"text":292},{"id":300,"depth":432,"text":306},{"id":334,"depth":432,"text":334},"markdown","content:technology-blogs:zh:2864.md","content","technology-blogs/zh/2864.md","technology-blogs/zh/2864","md",1776506123421]