import os import fitz # 获取指定目录下的所有PDF文件路径 pdf_dir = r"D:\点宽学院" pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')] for pdf_file in pdf_files: # 打开PDF文件 doc = fitz.open(pdf_file) text = "" # 存储PDF内容的字符串 for page in doc: text += page.get_text() # 将文本按照中文句号分割成句子 sentences = text.split("。") # 打印每一句 for sentence in sentences: print(sentence.strip() + "。")这是我的代码，需求发生变更，合并pdf内容后要先把所有的空行和空格去掉再分割，请帮我修改

时间: 2023-06-19 10:03:38 浏览: 132

import os

import fitz  # PyMuPDF

# Directory scanned (non-recursively) for PDF files.
pdf_dir = r"D:\点宽学院"
pdf_files = [
    os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')
]

for pdf_file in pdf_files:
    # Open with a context manager so the document is closed even if
    # text extraction fails part-way through.
    with fitz.open(pdf_file) as doc:
        # Merge the text of every page into one string.
        text = "".join(page.get_text() for page in doc)

    # Drop ALL whitespace before splitting: str.split() with no argument
    # splits on any Unicode whitespace (spaces, tabs, \r, \n and the
    # full-width space \u3000), so re-joining the pieces removes blank
    # lines and spaces in a single pass.
    text = "".join(text.split())

    # Split on the Chinese full stop and print one sentence per line.
    for sentence in text.split("。"):
        # The fragment after the final "。" is empty; skip it so a lone
        # "。" is not printed.
        if sentence:
            print(sentence + "。")

import fitz # PyMuPDF库 import os # 读取PDF文件的内容 def read_pdf(file_path): doc = fitz.open(file_path) content = "" for page in doc: content += page.getText("text") doc.close() return content # 去除字符串中的空格和空行 def remove_spaces(text): return "\n".join([line.strip() for line in text.split("\n") if line.strip()]) # 将字符串按中文句号分割成多个语句 def split_sentences(text): sentences = [] for sentence in text.split("。"): sentence = sentence.strip() if sentence: sentences.append(sentence + "。") return sentences # 读取指定目录下所有PDF文件的内容并合并成一个字符串 def read_all_pdfs(dir_path): all_content = "" for file_name in os.listdir(dir_path): if file_name.endswith(".pdf"): file_path = os.path.join(dir_path, file_name) content = read_pdf(file_path) content = remove_spaces(content) all_content += content return all_content # 将字符串按中文句号分割成多个语句并打印出来 def print_sentences(text): sentences = split_sentences(text) for sentence in sentences: print(sentence) # 测试 dir_path = r"D:\点宽学院" all_content = read_all_pdfs(dir_path) print_sentences(all_content)代码运行显示AttributeError: 'Page' object has no attribute 'getText'我该如何修改

这个错误是由 PyMuPDF（fitz）的版本差异引起的：新版本已将 `Page.getText` 重命名为 `Page.get_text`，旧的驼峰式名称已被移除，因此只需把 `page.getText("text")` 改为 `page.get_text("text")` 即可。同时，建议在读取PDF文件时，使用try-except语句来捕捉异常，以便更好地处理错误；但不要使用 `except: pass` 静默忽略异常，否则原来的 AttributeError 会被吞掉，程序只会输出空内容。以下是改用 `get_text` 并添加try-except语句的修改代码：

```python
import fitz  # PyMuPDF库
import os


# 读取PDF文件的内容
def read_pdf(file_path):
    doc = fitz.open(file_path)
    content = ""
    for page in doc:
        try:
            content += page.get_text("text")
        except Exception as e:
            print(f"读取 {file_path} 时出错: {e}")
    doc.close()
    return content


# 去除字符串中的空格和空行
def remove_spaces(text):
    return "\n".join([line.strip() for line in text.split("\n") if line.strip()])


# 将字符串按中文句号分割成多个语句
def split_sentences(text):
    sentences = []
    for sentence in text.split("。"):
        sentence = sentence.strip()
        if sentence:
            sentences.append(sentence + "。")
    return sentences


# 读取指定目录下所有PDF文件的内容并合并成一个字符串
def read_all_pdfs(dir_path):
    all_content = ""
    for file_name in os.listdir(dir_path):
        if file_name.endswith(".pdf"):
            file_path = os.path.join(dir_path, file_name)
            content = read_pdf(file_path)
            content = remove_spaces(content)
            all_content += content
    return all_content


# 将字符串按中文句号分割成多个语句并打印出来
def print_sentences(text):
    sentences = split_sentences(text)
    for sentence in sentences:
        print(sentence)


# 测试
dir_path = r"D:\点宽学院"
all_content = read_all_pdfs(dir_path)
print_sentences(all_content)
```

import fitz def extract_text_from_pdf(file_path): text = '' with fitz.open(file_path) as doc: for page in doc: text += page.getText() return text pdf_file_path = 'D:\pukoutest_report.pdff' extracted_text = extract_text_from_pdf(pdf_file_path) print(extracted_text) 这个代码哪里有问题

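这段代码主要有以下几个问题：

1. `page.getText()` 在新版本的 PyMuPDF 中已被移除，应改为 `page.get_text()`，否则会报 `AttributeError: 'Page' object has no attribute 'getText'`。

2. 文件扩展名 `.pdff` 很可能是 `.pdf` 的笔误，请确认文件名与磁盘上的实际文件一致，否则会因为找不到文件而打开失败。

3. 文件路径字符串中的反斜杠`\`没有做转义处理。本例中的 `\p` 不是合法的转义序列，Python 会原样保留反斜杠，所以路径本身不会被改变，但会产生"无效转义序列"的警告；如果路径中出现 `\t`、`\n` 等组合，路径就会被错误解析。为了避免这个问题，您可以采取以下两种方式之一：

   - 使用原始字符串（raw string）：在文件路径字符串前面加上`r`，将其转换为原始字符串，这样反斜杠将被当作普通字符处理而不是转义字符。示例代码如下：

     ```python
     pdf_file_path = r'D:\pukoutest_report.pdf'
     ```

   - 使用双反斜杠：将反斜杠`\`替换为双反斜杠`\\`，这样每个反斜杠都会被当作普通字符处理。示例代码如下：

     ```python
     pdf_file_path = 'D:\\pukoutest_report.pdf'
     ```

请根据您的需求选择其中一种方式来修改代码中的文件路径字符串，并同时把 `getText` 改为 `get_text`。这样就能正确读取PDF文件并提取文本内容了。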

阅读全文

import fitz def extract_text_from_pdf(file_path): text = '' with fitz.open(file_path) as doc: for page in doc: text += page.getText() return text pdf_file_path = 'D:\pukoutest_report.pdff' extracted_text = extract_text_from_pdf(pdf_file_path) print(extracted_text) 这个代码哪里有问题

相关推荐

使用pymupdf库拆分PDF文件的源代码

pdf-viewer:纯Python PDFViewer，其功能与其他著名的PDFViewer相同

PDF转换工具-python实现

解释代码 os.makedirs(save_dir,exist_ok=True) doc = fitz.open(file_path)

基于STM32单片机的激光雕刻机控制系统设计-含详细步骤和代码

白色简洁风格的前端网站模板下载.zip

大家在看

STM8L051F3P6使用手册（中文）.zip

华为2403安装手册.

TwinCAT3.1学习笔记

新代plc资料

先栅极还是后栅极 业界争论高K技术

最新推荐

Python使用到第三方库PyMuPDF图片与pdf相互转换

基于STM32单片机的激光雕刻机控制系统设计-含详细步骤和代码

白色简洁风格的前端网站模板下载.zip

HarmonyException如何解决.md

掌握HTML/CSS/JS和Node.js的Web应用开发实践

管理建模和仿真的文件

计算机体系结构概述：基础概念与发展趋势

int a[][3]={{1,2},{4}}输出这个数组

勒玛算法研讨会项目：在线商店模拟与Qt界面实现

"互动学习：行动中的多样性与论文攻读经历"

先栅极还是后栅极业界争论高K技术