[{"data":1,"prerenderedAt":277},["ShallowReactive",2],{"content-query-dNwHiLjIev":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"cover":11,"type":12,"category":13,"body":14,"_type":271,"_id":272,"_source":273,"_file":274,"_stem":275,"_extension":276},"/technology-blogs/zh/1614","zh",false,"","【MindSpore易点通】Python在手，PDF不愁","Python办公系列~","2022-07-04","https://obs-mindspore-file.obs.cn-north-4.myhuaweicloud.com/file/2022/07/12/e0391f553f6c454c9be1855828b9db2e.png","technology-blogs","基础知识",{"type":15,"children":16,"toc":261},"root",[17,25,31,36,43,48,56,61,66,71,79,84,92,98,105,110,115,120,125,130,135,145,150,157,163,168,172,177,182,187,192,200,205,211,216,223,227,232,239,244,251,256],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"mindspore易点通python在手pdf不愁",[23],{"type":24,"value":8},"text",{"type":18,"tag":26,"props":27,"children":28},"p",{},[29],{"type":24,"value":30},"宝子们！时隔大半年（实在对不起大家），久违的办公系列又更新啦！之前我们已经给大家种草了利用Python批量处理word和excel文件，今天继续带领家人们学习如何利用Python处理PDF！",{"type":18,"tag":26,"props":32,"children":33},{},[34],{"type":24,"value":35},"废话不多说，我们就开干吧~~",{"type":18,"tag":37,"props":38,"children":40},"h3",{"id":39},"_1-库介绍",[41],{"type":24,"value":42},"1 库介绍",{"type":18,"tag":26,"props":44,"children":45},{},[46],{"type":24,"value":47},"通过之前的学习，我们其实也可以总结发现，常规的操作基本可以分为：创建PDF、提取PDF中的内容、操作PDF页面的内容等，那么通过这样的操作，我们找到了一堆可以使用的相关库，比如PDFrw 、Python-PDFkit、PDFplumber、PDFtabextract、PDF-redactor等等。",{"type":18,"tag":26,"props":49,"children":50},{},[51],{"type":18,"tag":52,"props":53,"children":55},"img",{"alt":7,"src":54},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656904217134104037.png",[],{"type":18,"tag":26,"props":57,"children":58},{},[59],{"type":24,"value":60},"那么既然知道有这些库了（因为它们都不是Python的标准库），因此接下来的工作就是pip install安装各种库文件啦~",{"type":18,"tag":26,"props":62,"children":63},{},[64],{"type":24,"value":65},"这里是小编安装部分库的展示：",{"type":18,"tag":26,"props":67,"children":68},{},[69],{"type":24,"value":70},"pip install PyPDF2",{"type":18,"tag":26,"props":72,"children":73},{},[74],{"type":18,"tag":52,"props":75,"children":78},{"alt":76,"src":77},"图片1.png","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656903674540946250.png",[],{"type":18,"tag":26,"props":80,"children":81},{},[82],{"type":24,"value":83},"pip insatll PDFplumber",{"type":18,"tag":26,"props":85,"children":86},{},[87],{"type":18,"tag":52,"props":88,"children":91},{"alt":89,"src":90},"图片2.png","https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656903697236567283.png",[],{"type":18,"tag":37,"props":93,"children":95},{"id":94},"_2-常规操作",[96],{"type":24,"value":97},"2 常规操作",{"type":18,"tag":99,"props":100,"children":102},"h4",{"id":101},"_21-文字信息提取",[103],{"type":24,"value":104},"2.1 文字信息提取",{"type":18,"tag":26,"props":106,"children":107},{},[108],{"type":24,"value":109},"案例：提取test2.PDF文件第2页的文字信息",{"type":18,"tag":26,"props":111,"children":112},{},[113],{"type":24,"value":114},"思路分析：",{"type":18,"tag":26,"props":116,"children":117},{},[118],{"type":24,"value":119},"1.利用PDFplumber打开一个 PDF 文件",{"type":18,"tag":26,"props":121,"children":122},{},[123],{"type":24,"value":124},"2.获取指定的页，或者遍历每一页",{"type":18,"tag":26,"props":126,"children":127},{},[128],{"type":24,"value":129},"3.利用.extract_text()方法提取当前页的文字",{"type":18,"tag":26,"props":131,"children":132},{},[133],{"type":24,"value":134},"示例代码：",{"type":18,"tag":136,"props":137,"children":139},"pre",{"code":138},"import PDFplumber\n\nfile_path = r'C:\\Users\\hangtian\\Desktop\\test2.PDF'\n\n \n\nwith PDFplumber.open(file_path) as PDF:\n\n    page = PDF.pages[1]\n\nprint(page.extract_text())\n",[140],{"type":18,"tag":141,"props":142,"children":143},"code",{"__ignoreMap":7},[144],{"type":24,"value":138},{"type":18,"tag":26,"props":146,"children":147},{},[148],{"type":24,"value":149},"运行结果：",{"type":18,"tag":26,"props":151,"children":152},{},[153],{"type":18,"tag":52,"props":154,"children":156},{"alt":7,"src":155},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656903748016231814.png",[],{"type":18,"tag":99,"props":158,"children":160},{"id":159},"_22-批量拆分",[161],{"type":24,"value":162},"2.2 批量拆分",{"type":18,"tag":26,"props":164,"children":165},{},[166],{"type":24,"value":167},"案例：将一个完整的 PDF 拆分成几个小的 PDF，由于这里的文件太大，我们只拆分成两个PDF文件。",{"type":18,"tag":26,"props":169,"children":170},{},[171],{"type":24,"value":114},{"type":18,"tag":26,"props":173,"children":174},{},[175],{"type":24,"value":176},"1.读取PDF的整体信息、总页数等",{"type":18,"tag":26,"props":178,"children":179},{},[180],{"type":24,"value":181},"2.遍历每一页内容，以每个 step 为间隔将PDF存成每一个小的文件块",{"type":18,"tag":26,"props":183,"children":184},{},[185],{"type":24,"value":186},"3.将小的文件块重新保存为新的PDF文件",{"type":18,"tag":26,"props":188,"children":189},{},[190],{"type":24,"value":191},"代码展示：",{"type":18,"tag":136,"props":193,"children":195},{"code":194},"import os\n\nfrom PyPDF2 import PDFFileWriter, PDFFileReader\n\n \n\ndef split_PDF(filename, filepath, save_dirpath, step=5):\n\n    \"\"\"\n\n    拆分PDF为多个小的PDF文件，\n\n    @param filename:文件名\n\n    @param filepath:文件路径\n\n    @param save_dirpath:保存小的PDF的文件路径\n\n    @param step: 每step间隔的页面生成一个文件，例如step=5，表示0-4页、5-9页...为一个文件\n\n    @return:\n\n    \"\"\"\n\n    if not os.path.exists(save_dirpath):\n\n        os.mkdir(save_dirpath)\n\n    PDF_reader = PDFFileReader(filepath)\n\n    # 读取每一页的数据\n\n    pages = PDF_reader.getNumPages()\n\n    for page in range(0, pages, step):\n\n        PDF_writer = PDFFileWriter()\n\n        # 拆分PDF，每 step 页的拆分为一个文件\n\n        for index in range(page, page+step):\n\n            if index \u003C pages:\n\n                PDF_writer.addPage(PDF_reader.getPage(index))\n\n        # 保存拆分后的小文件\n\n        save_path = os.path.join(save_dirpath, filename+str(int(page/step)+1)+'.PDF')\n\n        print(save_path)\n\n        with open(save_path, \"wb\") as out:\n\n            PDF_writer.write(out)\n\n \n\n    print(\"文件已成功拆分，保存路径为：\"+save_dirpath)\n\n   \n\nsplit_PDF(filename, filepath, save_dirpath, step=5)\n",[196],{"type":18,"tag":141,"props":197,"children":198},{"__ignoreMap":7},[199],{"type":24,"value":194},{"type":18,"tag":26,"props":201,"children":202},{},[203],{"type":24,"value":204},"通过这个代码逻辑我们也可以明白，写入器初始化和输出的位置一定都在读取PDF循环每一页的循环体内，而不是在循环体外。同理，批量合并的思路更加简单啦——确定要合并的文件顺序、循环追加到一个文件块中、保存成一个新的文件，那么我们批量合并的问题也轻松解决！",{"type":18,"tag":99,"props":206,"children":208},{"id":207},"_23-添加水印",[209],{"type":24,"value":210},"2.3 添加水印",{"type":18,"tag":26,"props":212,"children":213},{},[214],{"type":24,"value":215},"案例：将下图作为水印添加到test2.PDF中",{"type":18,"tag":26,"props":217,"children":218},{},[219],{"type":18,"tag":52,"props":220,"children":222},{"alt":7,"src":221},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656903897684838621.png",[],{"type":18,"tag":26,"props":224,"children":225},{},[226],{"type":24,"value":114},{"type":18,"tag":26,"props":228,"children":229},{},[230],{"type":24,"value":231},"1.将需要作为水印的图片插入word中调整合适位置后保存为PDF文件，同时需要额外用到copy模块：",{"type":18,"tag":26,"props":233,"children":234},{},[235],{"type":18,"tag":52,"props":236,"children":238},{"alt":7,"src":237},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656904016422522800.png",[],{"type":18,"tag":26,"props":240,"children":241},{},[242],{"type":24,"value":243},"2.把读取器和写入器初始化，并且把水印PDF页先读取好备用。",{"type":18,"tag":26,"props":245,"children":246},{},[247],{"type":18,"tag":52,"props":248,"children":250},{"alt":7,"src":249},"https://bbs-img.huaweicloud.com/data/forums/attachment/forum/20227/4/1656904036087870445.png",[],{"type":18,"tag":26,"props":252,"children":253},{},[254],{"type":24,"value":255},"3.加水印本质上就是把水印PDF页和需要加水印的每一页都合并一遍。由于需要加水印的PDF可能有很多页，而水印PDF只有一页，那么就会造成水印PDF和原始PDF页数不匹配的问题。因此需要把水印PDF页不断复制作为new_page备用，再通过mergePage完成每一页的合并，最后交给写入器统一输出。",{"type":18,"tag":26,"props":257,"children":258},{},[259],{"type":24,"value":260},"当然啦，关于Python处理PDF还有很多别的操作，比如加密PDF文件、html网页自动转为PDF等等，欢迎各位宝子们复现、纠错上述操作，并在小编的基础上大放异彩！本次抽5名宝子赠送MindSpore精美周边一份！",{"title":7,"searchDepth":262,"depth":262,"links":263},4,[264,266],{"id":39,"depth":265,"text":42},3,{"id":94,"depth":265,"text":97,"children":267},[268,269,270],{"id":101,"depth":262,"text":104},{"id":159,"depth":262,"text":162},{"id":207,"depth":262,"text":210},"markdown","content:technology-blogs:zh:1614.md","content","technology-blogs/zh/1614.md","technology-blogs/zh/1614","md",1776506114459]