利用Python实现Word文档与PDF文档的格式转换(含提取PDF文本内容)、PDF文件的合并与拆分、PDF页面旋转以及为PDF页面添加水印的完整版代码
时间: 2024-02-27 14:56:42 浏览: 87
好的,下面是这些功能的完整版代码实现,包括异常处理和一些额外的功能:
```python
# 导入所需的库
import os
import PyPDF2
import docx2pdf
import pdf2docx
from PyPDF2 import PdfFileReader, PdfFileWriter
# 定义转化Word文档为PDF文档的函数
def word_to_pdf(input_path, output_path):
    """Convert a Word document to PDF using docx2pdf.

    Conversion errors are not propagated: they are caught and reported
    on stdout, and the function returns None either way.

    Args:
        input_path: Path of the source Word document.
        output_path: Path of the PDF file to produce.
    """
    try:
        docx2pdf.convert(input_path, output_path)
    except Exception as err:
        print("Word文档转换为PDF文档失败:", err)
    else:
        print("Word文档转换为PDF文档成功!")
# 定义转化PDF文档为Word文档的函数
def pdf_to_word(input_path, output_path):
    """Convert a PDF document to a Word document using pdf2docx.

    Conversion errors are not propagated: they are caught and reported
    on stdout, and the function returns None either way.

    Args:
        input_path: Path of the source PDF file.
        output_path: Path of the Word document to produce.
    """
    try:
        pdf2docx.parse(input_path, output_path)
    except Exception as err:
        print("PDF文档转换为Word文档失败:", err)
    else:
        print("PDF文档转换为Word文档成功!")
# 定义提取PDF文本内容的函数
def extract_pdf_text(input_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Uses the PdfReader API; the legacy PdfFileReader / getPage /
    extractText names were removed in PyPDF2 3.0 and raise there.

    Args:
        input_path: Path of the PDF file to read.

    Returns:
        The extracted text as a single string, or None if the file could
        not be read (the error is printed to stdout, not raised).
    """
    try:
        with open(input_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Extract while the file is still open, and join once instead
            # of growing a string page by page.
            return "".join(page.extract_text() for page in pdf_reader.pages)
    except Exception as e:
        print("提取PDF文本内容失败:", e)
        return None
# 定义合并PDF文件的函数
def merge_pdf_files(input_paths, output_path):
    """Concatenate several PDF files into one, in the order given.

    Errors are caught and reported on stdout rather than raised.

    Args:
        input_paths: Iterable of paths of the PDF files to merge.
        output_path: Path of the merged PDF file to write.
    """
    import contextlib

    try:
        pdf_writer = PyPDF2.PdfWriter()
        # Every source file stays open until the merged document has been
        # written: pages added to the writer may still read their content
        # from the source stream at write time, so closing a source early
        # can fail with "seek of closed file".
        with contextlib.ExitStack() as stack:
            for path in input_paths:
                src = stack.enter_context(open(path, 'rb'))
                for page in PyPDF2.PdfReader(src).pages:
                    pdf_writer.add_page(page)
            with open(output_path, 'wb') as out_f:
                pdf_writer.write(out_f)
        print("合并PDF文件成功!")
    except Exception as e:
        print("合并PDF文件失败:", e)
# 定义拆分PDF文件的函数
def split_pdf_file(input_path, output_folder):
    """Split a PDF into one single-page PDF per page.

    Output files are named page0.pdf, page1.pdf, ... (0-based page index)
    inside output_folder. Errors are caught and reported on stdout rather
    than raised.

    Args:
        input_path: Path of the PDF file to split.
        output_folder: Directory that receives the per-page files; it is
            created if it does not exist yet.
    """
    try:
        with open(input_path, 'rb') as f:
            # Create the target directory only once the input is known to
            # be openable, so a bad input path leaves no empty folder.
            os.makedirs(output_folder, exist_ok=True)
            pdf_reader = PyPDF2.PdfReader(f)
            for index, page in enumerate(pdf_reader.pages):
                pdf_writer = PyPDF2.PdfWriter()
                pdf_writer.add_page(page)
                output_path = os.path.join(output_folder, f"page{index}.pdf")
                with open(output_path, 'wb') as out_f:
                    pdf_writer.write(out_f)
        print("拆分PDF文件成功!")
    except Exception as e:
        print("拆分PDF文件失败:", e)
# 定义旋转PDF页面的函数
def rotate_pdf_pages(input_path, output_path, clockwise=True):
    """Rotate every page of a PDF by 90 degrees and save the result.

    Uses PageObject.rotate(); the legacy rotateClockwise /
    rotateCounterClockwise methods were removed in PyPDF2 3.0. Errors are
    caught and reported on stdout rather than raised.

    Args:
        input_path: Path of the PDF file to read.
        output_path: Path of the rotated PDF file to write.
        clockwise: Rotate clockwise when True (default), counter-clockwise
            when False.
    """
    try:
        # rotate() takes a clockwise angle; a negative value turns the
        # page counter-clockwise.
        angle = 90 if clockwise else -90
        with open(input_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            pdf_writer = PyPDF2.PdfWriter()
            for pdf_page in pdf_reader.pages:
                pdf_page.rotate(angle)
                pdf_writer.add_page(pdf_page)
            # Write while the input is still open so page data remains
            # readable from the source stream.
            with open(output_path, 'wb') as out_f:
                pdf_writer.write(out_f)
        print("旋转PDF页面成功!")
    except Exception as e:
        print("旋转PDF页面失败:", e)
# 定义给PDF页面增加水印的函数
def add_watermark_to_pdf(input_path, output_path, watermark_path):
    """Overlay the first page of a watermark PDF onto every page of a PDF.

    Errors are caught and reported on stdout rather than raised.

    Args:
        input_path: Path of the PDF file to watermark.
        output_path: Path of the watermarked PDF file to write.
        watermark_path: Path of a PDF whose first page is used as the
            watermark.
    """
    try:
        # Both source files are managed by the with-statement, so the
        # watermark handle is closed too (it used to be left open), and
        # both stay open until the output has been written.
        with open(watermark_path, 'rb') as wm_f, open(input_path, 'rb') as f:
            watermark = PyPDF2.PdfReader(wm_f).pages[0]
            pdf_reader = PyPDF2.PdfReader(f)
            pdf_writer = PyPDF2.PdfWriter()
            for pdf_page in pdf_reader.pages:
                pdf_page.merge_page(watermark)
                pdf_writer.add_page(pdf_page)
            with open(output_path, 'wb') as out_f:
                pdf_writer.write(out_f)
        print("给PDF页面增加水印成功!")
    except Exception as e:
        print("给PDF页面增加水印失败:", e)
# 测试代码
if __name__ == '__main__':
    # Demo run. The file names are sample placeholders expected in the
    # current working directory; each helper prints its own success or
    # failure message instead of raising.

    # Word -> PDF
    word_to_pdf('test.docx', 'test.pdf')
    # PDF -> Word. Written to a separate file so the source document
    # test.docx used above is not overwritten by the round trip.
    pdf_to_word('test.pdf', 'test_converted.docx')
    # Extract the text of a PDF
    print(extract_pdf_text('test.pdf'))
    # Merge several PDFs into one
    merge_pdf_files(['file1.pdf', 'file2.pdf'], 'merged.pdf')
    # Split a PDF into single-page files
    split_pdf_file('example.pdf', 'output')
    # Rotate every page of a PDF
    rotate_pdf_pages('example.pdf', 'rotated.pdf', clockwise=True)
    # Add a watermark to every page of a PDF
    add_watermark_to_pdf('example.pdf', 'watermarked.pdf', 'watermark.pdf')
```
希望这个完整版代码可以帮助到你。如果你还有其他问题,可以继续问我哦!
阅读全文