import os import fitz # 获取指定目录下的所有PDF文件路径 pdf_dir = r"D:\点宽学院" pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')] for pdf_file in pdf_files: # 打开PDF文件 doc = fitz.open(pdf_file) text = "" # 存储PDF内容的字符串 for page in doc: text += page.get_text() # 将文本按照中文句号分割成句子 sentences = text.split("。") # 打印每一句 for sentence in sentences: print(sentence.strip() + "。")这是我的代码,需求发生变更,合并pdf内容后要先把所有的空行和空格去掉再分割,请帮我修改
时间: 2023-06-19 21:03:38 浏览: 126
import os
import fitz
# 获取指定目录下的所有PDF文件路径
pdf_dir = r"D:\点宽学院"
pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
for pdf_file in pdf_files:
# 打开PDF文件
doc = fitz.open(pdf_file)
text = ""
# 存储PDF内容的字符串
for page in doc:
text += page.get_text()
# 去掉所有空格和空行
text = text.replace(" ", "").replace("\n", "")
# 将文本按照中文句号分割成句子
sentences = text.split("。")
# 打印每一句
for sentence in sentences:
print(sentence.strip() + "。")
相关问题
import fitz # PyMuPDF库 import os # 读取PDF文件的内容 def read_pdf(file_path): doc = fitz.open(file_path) content = "" for page in doc: content += page.getText("text") doc.close() return content # 去除字符串中的空格和空行 def remove_spaces(text): return "\n".join([line.strip() for line in text.split("\n") if line.strip()]) # 将字符串按中文句号分割成多个语句 def split_sentences(text): sentences = [] for sentence in text.split("。"): sentence = sentence.strip() if sentence: sentences.append(sentence + "。") return sentences # 读取指定目录下所有PDF文件的内容并合并成一个字符串 def read_all_pdfs(dir_path): all_content = "" for file_name in os.listdir(dir_path): if file_name.endswith(".pdf"): file_path = os.path.join(dir_path, file_name) content = read_pdf(file_path) content = remove_spaces(content) all_content += content return all_content # 将字符串按中文句号分割成多个语句并打印出来 def print_sentences(text): sentences = split_sentences(text) for sentence in sentences: print(sentence) # 测试 dir_path = r"D:\点宽学院" all_content = read_all_pdfs(dir_path) print_sentences(all_content)代码运行显示AttributeError: 'Page' object has no attribute 'getText'我该如何修改
这个错误可能是因为fitz的版本问题,可以尝试更新fitz库。同时,建议在读取PDF文件时,使用try-except语句来捕捉异常,以便更好地处理错误。
以下是更新fitz库和添加try-except语句的修改代码:
```python
import fitz # PyMuPDF库
import os # 读取PDF文件的内容
def read_pdf(file_path):
doc = fitz.open(file_path)
content = ""
for page in doc:
try:
content += page.getText("text")
except:
pass
doc.close()
return content
# 去除字符串中的空格和空行
def remove_spaces(text):
return "\n".join([line.strip() for line in text.split("\n") if line.strip()])
# 将字符串按中文句号分割成多个语句
def split_sentences(text):
sentences = []
for sentence in text.split("。"):
sentence = sentence.strip()
if sentence:
sentences.append(sentence + "。")
return sentences
# 读取指定目录下所有PDF文件的内容并合并成一个字符串
def read_all_pdfs(dir_path):
all_content = ""
for file_name in os.listdir(dir_path):
if file_name.endswith(".pdf"):
file_path = os.path.join(dir_path, file_name)
content = read_pdf(file_path)
content = remove_spaces(content)
all_content += content
return all_content
# 将字符串按中文句号分割成多个语句并打印出来
def print_sentences(text):
sentences = split_sentences(text)
for sentence in sentences:
print(sentence)
# 测试
dir_path = r"D:\点宽学院"
all_content = read_all_pdfs(dir_path)
print_sentences(all_content)
```
import fitz # 打开PDF文件 doc = fitz.open(r"D:\点宽学院\在线绿色互动如何影响共享型绿色消费行为_——自然联结性的调节作用_王建明.pdf") text = "" # 存储PDF内容的字符串 for page in doc: text += page.get_text() # 将文本按照中文句号分割成句子 sentences = text.split("。") # 打印每一句 for sentence in sentences: print(sentence.strip() + "。")请帮我修改这个代码,我想把处理的对象改为“D:\点宽学院”下所有的PDF文件
可以使用Python的os模块来获取指定目录下的所有PDF文件路径,然后对每个PDF文件进行处理。
以下是修改后的代码:
```python
import os
import fitz
# 获取指定目录下的所有PDF文件路径
pdf_dir = r"D:\点宽学院"
pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')]
for pdf_file in pdf_files:
# 打开PDF文件
doc = fitz.open(pdf_file)
text = "" # 存储PDF内容的字符串
for page in doc:
text += page.get_text()
# 将文本按照中文句号分割成句子
sentences = text.split("。")
# 打印每一句
for sentence in sentences:
print(sentence.strip() + "。")
```
这样就可以处理指定目录下所有的PDF文件了。
阅读全文