當前位置：首頁 > 编程语言 > python >内容正文

python

python 将pdf分页后插入至word中

發布時間：2023/12/20 python 32 豆豆

生活随笔收集整理的這篇文章主要介紹了「python 将pdf分页后插入至word中」，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

所用技術

　　1. python編程基礎

　　2. 使用pyPdf

　　3. 使用python操作word

　　4. 正則表達式的使用

　　5. windows的bat編程

下面是一個pyPdf庫使用的示例：

# Example: build a new PDF out of pages taken from existing PDFs with pyPdf.
from pyPdf import PdfFileWriter, PdfFileReader

output = PdfFileWriter()

# NOTE(review): the source streams are kept open until output.write() has
# run, because pyPdf appears to read page content from them at write time.
with open("document1.pdf", "rb") as source_stream, \
        open("watermark.pdf", "rb") as watermark_stream:
    input1 = PdfFileReader(source_stream)

    # add page 1 from input1 to output document, unchanged
    output.addPage(input1.getPage(0))

    # add page 2 from input1, but rotated clockwise 90 degrees
    output.addPage(input1.getPage(1).rotateClockwise(90))

    # add page 3 from input1, rotated the other way:
    output.addPage(input1.getPage(2).rotateCounterClockwise(90))
    # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

    # add page 4 from input1, but first add a watermark from another pdf:
    page4 = input1.getPage(3)
    watermark = PdfFileReader(watermark_stream)
    page4.mergePage(watermark.getPage(0))
    # The original snippet merged the watermark but never added the page.
    output.addPage(page4)

    # add page 5 from input1, but crop it to half size:
    page5 = input1.getPage(4)
    page5.mediaBox.upperRight = (
        page5.mediaBox.getUpperRight_x() / 2,
        page5.mediaBox.getUpperRight_y() / 2,
    )
    output.addPage(page5)

    # print how many pages input1 has:
    print("document1.pdf has %s pages." % input1.getNumPages())

    # finally, write "output" to document-output.pdf
    with open("document-output.pdf", "wb") as outputStream:
        output.write(outputStream)

有了該庫，就可以很容易將現有的pdf做分割。

因為我的需求是要將pdf中的關鍵字提取出來，用它來作為文件名。pyPdf中提供了將pdf頁面中的文字全部提取出來的方法：

inputfile.getPage(0).extractText()

這里返回的是unicode，需要轉為str：

inputfile.getPage(0).extractText().encode("utf-8")

然后將每頁的關鍵字提取出來，增加函數如下：

# Captures the text between "Blattname: " and the next "project" on a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name found in a page's text.

    The parameter keeps its original name (it shadows the builtin) so that
    existing callers are unaffected.

    Args:
        str: the text extracted from one PDF page.

    Returns:
        The text captured between "Blattname: " and the first following
        "project", or None when the page text does not contain the pattern.
    """
    m = p_sheetName.search(str)
    if m:
        return m.group(1)
    return None

最終代碼如下：

from pyPdf import PdfFileWriter, PdfFileReader
import re
import os

# Captures the text between "Blattname: " and the next "project" on a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name found in a page's text, or None if absent.

    Args:
        str: the text extracted from one PDF page.
    """
    m = p_sheetName.search(str)
    if m:
        return m.group(1)
    return None


def splitpdf(srcFile):
    """Split a PDF into one file per page.

    The pages are written into a directory named after the source file
    without its extension (created if missing). Each file is named
    "<page number> <sheet name>.pdf"; a page on which no sheet name is found
    is saved as "<page number>.pdf" instead of getting "None" in its name.

    Args:
        srcFile: path of the PDF file to split.
    """
    folderName, ext_ = os.path.splitext(srcFile)
    # The context manager closes the source even if a page fails to process.
    with open(srcFile, "rb") as input1:
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print("pages: %d" % numofpages)

        # new directory
        if not os.path.isdir(folderName):
            os.makedirs(folderName)

        for page_index in range(1, numofpages + 1):
            page = inputfile.getPage(page_index - 1)
            output = PdfFileWriter()
            output.addPage(page)
            sheetName = getSheetName(page.extractText().encode("utf-8"))

            # save file
            if sheetName is None:
                baseName = "%d.pdf" % page_index
            else:
                baseName = "%d %s.pdf" % (page_index, sheetName)
            saveFileName = os.path.join(folderName, baseName)
            print(saveFileName)
            with open(saveFileName, "wb") as outputFile:
                output.write(outputFile)


if __name__ == '__main__':
    splitpdf("E:\\test.pdf")

下一步，將pdf文件名參數化（從命令行參數讀取）：

from pyPdf import PdfFileWriter, PdfFileReader
import re
import sys
import os
import string


def translator(frm='', to='', delete='', keep=None):
    """Build a function that maps and/or deletes characters of a string.

    Relies on Python 2's string.maketrans and two-argument str.translate.

    Args:
        frm: characters to be replaced.
        to: replacement characters; a single character is repeated so that
            it replaces every character of frm.
        delete: characters to remove.
        keep: when not None, every character that is not in keep is removed;
            characters listed in delete are removed even if they are in keep.

    Returns:
        A one-argument function applying the translation to a string.
    """
    if len(to) == 1:
        to = to * len(frm)
    trans = string.maketrans(frm, to)
    if keep is not None:
        allchars = string.maketrans('', '')
        # Everything except (keep minus delete) ends up in the delete set.
        delete = allchars.translate(allchars, keep.translate(allchars, delete))

    def translate(s):
        return s.translate(trans, delete)
    return translate


# Removes the characters that would make creating the page files fail.
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures the text between "Blattname: " and the next "project" on a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sanitized sheet name found in a page's text.

    Args:
        str: the text extracted from one PDF page.

    Returns:
        The captured name with the characters /:\\?*><| removed, or None
        when the page text does not contain the pattern (the original
        raised AttributeError here, which aborted the whole split).
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return delete_some_speicl(m.group(1))


def splitpdf(srcFile):
    """Split a PDF into one file per page, reporting errors on stdout.

    Pages are written to a directory named after the source file without its
    extension, as "<page number> <sheet name>.pdf", or "<page number>.pdf"
    when no sheet name is found on the page.

    Args:
        srcFile: path of the PDF file to split; the extension is checked
            case-insensitively.
    """
    try:
        folderName, ext_ = os.path.splitext(srcFile)
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")

        # The context manager closes the source even if a page fails.
        with open(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("pages: %d" % numofpages)

            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)

            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)
                sheetName = getSheetName(page.extractText().encode("utf-8"))

                # save file
                if sheetName is None:
                    baseName = "%d.pdf" % page_index
                else:
                    baseName = "%d %s.pdf" % (page_index, sheetName)
                saveFileName = os.path.join(folderName, baseName)
                print(saveFileName)
                with open(saveFileName, "wb") as outputFile:
                    output.write(outputFile)

        print("Split success!")
        print("please find them at " + folderName)
    except Exception as e:
        # Top-level boundary of the command-line tool: report and return.
        print(e)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        # A missing argument is an error, so exit with a non-zero status.
        sys.exit(1)
    splitpdf(sys.argv[1])

這里translator函數是將關鍵字中的特殊字符過濾掉，因為新建文件時可能會出錯。

其實分開pdf之後也還需要一些手動操作，不然還需用vba將各頁導入到word中；我想直接用python干完這些事，于是就用到了win32com來操作word。

下面是使用操作word的一個示例：

# Example: driving Microsoft Word through COM with win32com.
# filenamein, filenameout, OldStr and NewStr are placeholders that must be
# defined before this snippet is run.
import win32com
from win32com.client import Dispatch, constants

# NOTE(review): the wd* constants are presumably only available once the
# Word type library wrappers have been generated (e.g. with
# win32com.client.gencache.EnsureDispatch) - confirm.
w = win32com.client.Dispatch('Word.Application')
# Or start an independent process instead:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alerts
w.Visible = 0
w.DisplayAlerts = 0

# Open an existing file
doc = w.Documents.Open(FileName=filenamein)
# doc = w.Documents.Add()  # create a new document instead

# Insert text
myRange = doc.Range(0, 0)
myRange.InsertBefore('Hello from Python!')

# Apply a style. Select() does not return the selection, so the style is set
# on the application's Selection object afterwards.
myRange.Select()
w.Selection.Style = constants.wdStyleHeading1

# Replace text in the body
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Replace text in the header
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text = '123123'
# The original used an undefined name "worddoc" here.
doc.Tables[0].Rows.Add()  # append a row

# Convert to HTML
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0  # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs(FileName=filenameout, FileFormat=wc.wdFormatHTML)

# Print
doc.PrintOut()

# Close
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()

仿照上例，修改前面的代碼如下：

from pyPdf import PdfFileWriter, PdfFileReader
import re
import sys
import os
import string
import win32com
from win32com.client import Dispatch, constants

win32com.client.gencache.EnsureDispatch('Word.Application')


def translator(frm='', to='', delete='', keep=None):
    """Build a function that maps and/or deletes characters of a string.

    Relies on Python 2's string.maketrans and two-argument str.translate.

    Args:
        frm: characters to be replaced.
        to: replacement characters; a single character is repeated so that
            it replaces every character of frm.
        delete: characters to remove.
        keep: when not None, every character that is not in keep is removed;
            characters listed in delete are removed even if they are in keep.

    Returns:
        A one-argument function applying the translation to a string.
    """
    if len(to) == 1:
        to = to * len(frm)
    trans = string.maketrans(frm, to)
    if keep is not None:
        allchars = string.maketrans('', '')
        # Everything except (keep minus delete) ends up in the delete set.
        delete = allchars.translate(allchars, keep.translate(allchars, delete))

    def translate(s):
        return s.translate(trans, delete)
    return translate


# Removes the characters that would make creating the page files fail.
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures the text between "Blattname: " and the next "project" on a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the raw sheet name found in a page's text, or None if absent.

    Args:
        str: the text extracted from one PDF page.
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)


def splitPdfToWord(srcFile):
    """Split a PDF into single pages and embed each one in a Word document.

    Each page is saved as its own PDF in a directory named after the source
    file without its extension. A new Word document receives, per page, a
    "Heading 1" line "Page<n> <sheet name>", the page PDF embedded as an OLE
    object, and a page break. The document is saved next to the source as
    "<source without extension>.doc". Errors are printed, not raised.

    Args:
        srcFile: path of the PDF file to process; the extension is checked
            case-insensitively.
    """
    # Stays None until Word has started, so that the cleanup below does not
    # raise NameError and hide the real error when an earlier step fails.
    wordApp = None
    try:
        # Word resolves relative paths against its own working directory, so
        # every path handed to it is made absolute first.
        srcFile = os.path.abspath(srcFile)
        folderName, ext_ = os.path.splitext(srcFile)
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")

        # The context manager closes the source even if a page fails.
        with open(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("Total Pages: %d" % numofpages)

            wordApp = win32com.client.Dispatch('Word.Application')
            wordApp.Visible = False
            wordApp.DisplayAlerts = 0
            doc = wordApp.Documents.Add()
            sel = wordApp.Selection

            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)

            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)
                sheetName = getSheetName(page.extractText().encode("utf-8"))

                # A page without the keyword gets a heading and a file name
                # made of the page number only.
                if sheetName is None:
                    heading = "Page%d" % page_index
                    baseName = "%d.pdf" % page_index
                else:
                    heading = "Page%d %s" % (page_index, sheetName)
                    baseName = "%d %s.pdf" % (
                        page_index, delete_some_speicl(sheetName))

                sel.Style = constants.wdStyleHeading1
                sel.TypeText(heading)

                # save file
                saveFileName = os.path.join(folderName, baseName)
                print("Add Page %d" % page_index)
                with open(saveFileName, "wb") as outputFile:
                    output.write(outputFile)

                sel.TypeParagraph()
                sel.Style = constants.wdStyleBodyText
                # NOTE(review): the class name is tied to the installed
                # Acrobat version - confirm on the target machine.
                sel.InlineShapes.AddOLEObject(
                    ClassType="AcroExch.Document.11", FileName=saveFileName)
                sel.InsertBreak(Type=constants.wdPageBreak)

        doc.SaveAs(folderName + ".doc")
        print("Split success!")
        print("please find them at " + folderName)
        print("create word document success!")
        print("Location:" + folderName + ".doc")
    except Exception as e:
        # Top-level boundary of the command-line tool: report and return.
        print(e)
    finally:
        if wordApp is not None:
            wordApp.Quit()


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        sys.exit(1)
    splitPdfToWord(sys.argv[1])

轉載于:https://www.cnblogs.com/zhangyonghugo/p/3501065.html

創作挑戰賽新人創作獎勵來咯，堅持創作打卡瓜分現金大獎

總結

以上是生活随笔為你收集整理的python 将pdf分页后插入至word中的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇： HTTP首部（3）
下一篇： csv 20位数据如何打开可以预览完整