if p_text.strip() != ""

This code checks if the variable p_text is not empty or contains only whitespace characters. The strip() method removes any leading or trailing whitespace characters from the string. If the resulting string is not empty, the condition evaluates to True.

import os import csv from bs4 import BeautifulSoup # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path = os.path.join(root, file_name) # 打开HTML文件并读取源代码 with open(file_path, 'r', encoding='utf-8') as file: html_content = file.read() # 在这里可以对源代码进行进一步的操作和提取 soup = BeautifulSoup(html_content, 'html.parser') messages = soup.find_all('p') for message in messages: talk_id_tag = message.find_previous('a') if talk_id_tag: talk_id = talk_id_tag.text.strip()[1:] time = message.contents[0].strip().split(',')[0][1:] send_phone = message.find('span', class_='hint-success').text.strip() receive_phone = message.find_all('span', class_='hint-success')[1].text.strip() content_tag = message.find('a') if content_tag: content = content_tag['href'] content_type = '音频' else: content = message.text.strip() content_type = '文本' print("talkid:", talk_id) print("时间:", time) print("发送号码:", send_phone) print("接收号码:", receive_phone) print("发送内容:", content) print()

这段代码有一个缩进问题，最后的`print`语句没有在正确的位置，应该在`for`循环内，以便在每次迭代时打印出提取的信息。请将最后的`print`语句缩进到正确的位置。以下是修改后的代码： ```python import os import csv from bs4 import BeautifulSoup # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path = os.path.join(root, file_name) # 打开HTML文件并读取源代码 with open(file_path, 'r', encoding='utf-8') as file: html_content = file.read() # 在这里可以对源代码进行进一步的操作和提取 soup = BeautifulSoup(html_content, 'html.parser') messages = soup.find_all('p') for message in messages: talk_id_tag = message.find_previous('a') if talk_id_tag: talk_id = talk_id_tag.text.strip()[1:] time = message.contents[0].strip().split(',')[0][1:] send_phone = message.find('span', class_='hint-success').text.strip() receive_phone = message.find_all('span', class_='hint-success')[1].text.strip() content_tag = message.find('a') if content_tag: content = content_tag['href'] content_type = '音频' else: content = message.text.strip() content_type = '文本' print("talkid:", talk_id) print("时间:", time) print("发送号码:", send_phone) print("接收号码:", receive_phone) print("发送内容:", content) print() ``` 请注意，这段代码还没有将提取到的信息写入CSV文件，你可以根据需要进一步完善代码来实现写入CSV的功能。

import os import re from bs4 import BeautifulSoup # 设置html文件路径 folder_path = r'C:\Users\test\Desktop\DIDItest' # 提取html文件内所需要数据 def extract_html_info(file_path): with open(file_path, 'r', encoding='utf-8') as file: # 读取HTML源代码 html = file.read() soup = BeautifulSoup(html, 'html.parser') # 提取所有的<p>标签 p_tags = soup.find_all('p') for p_tag in p_tags: # 提取ID talk_id = p_tag.find_previous(string=lambda text: isinstance(text, str) and '[talkid:' in text) talk_id = talk_id.strip('[talkid:]') # 提取时间 timestamp = p_tag.find_previous('body').find_previous('head').find('meta', {'http-equiv': 'Content=-Type'})[ 'content'] # 提取发送号码 send_number = p_tag.find('span', {'class': 'hint-success'}).text # 提取接收号码 receive_number = p_tag.find_all('span', {'class': 'hint-success'})[1].text # 提取信息内容 message = p_tag.previous_sibling.strip() # 遍历文件夹及子文件夹，提取HTML文件信息 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) extract_html_info(file_path) # 判断是否是音频 if '音频' in message: file_url = p_tag.find('a')['href'] print( f"ID: {talk_id}, 时间: {timestamp}, 发送号码: {send_number}, 接收号码: {receive_number}, 音频文件地址: {file_url}") else: print( f"ID: {talk_id}, 时间: {timestamp}, 发送号码: {send_number}, 接收号码: {receive_number}, 信息内容: {message}")

你的代码有一些缩进错误，导致无法正确运行。以下是修正后的代码： ```python import os import re from bs4 import BeautifulSoup # 设置html文件路径 folder_path = r'C:\Users\test\Desktop\DIDItest' # 提取html文件内所需要数据 def extract_html_info(file_path): with open(file_path, 'r', encoding='utf-8') as file: # 读取HTML源代码 html = file.read() soup = BeautifulSoup(html, 'html.parser') # 提取所有的<p>标签 p_tags = soup.find_all('p') for p_tag in p_tags: # 提取ID talk_id = p_tag.find_previous(string=lambda text: isinstance(text, str) and '[talkid:' in text) talk_id = talk_id.strip('[talkid:]') # 提取时间 timestamp = p_tag.find_previous('body').find_previous('head').find('meta', {'http-equiv': 'Content=-Type'})['content'] # 提取发送号码 send_number = p_tag.find('span', {'class': 'hint-success'}).text # 提取接收号码 receive_number = p_tag.find_all('span', {'class': 'hint-success'})[1].text # 提取信息内容 message = p_tag.previous_sibling.strip() # 判断是否是音频 if '音频' in message: file_url = p_tag.find('a')['href'] print(f"ID: {talk_id}, 时间: {timestamp}, 发送号码: {send_number}, 接收号码: {receive_number}, 音频文件地址: {file_url}") else: print(f"ID: {talk_id}, 时间: {timestamp}, 发送号码: {send_number}, 接收号码: {receive_number}, 信息内容: {message}") # 遍历文件夹及子文件夹，提取HTML文件信息 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) extract_html_info(file_path) ``` 在这个修正后的代码中，我将提取HTML文件信息的部分封装到了一个函数`extract_html_info`中。然后，我使用`os.walk`函数遍历了指定文件夹及其子文件夹，找到所有的HTML文件，并调用`extract_html_info`函数来提取信息。希望这次能够顺利运行！如果你还有其他问题，请随时提问。

if p_text.strip() != ""

相关推荐

auto_restrict:一个轻松限制访问网页php脚本的脚本...仅包含auto_restrict.php！

tif_strip.rar_tif linux

ESP32_LED_STRIP:ESP32 LED灯带库

给这串代码加上省份数据city_data = hotel_data.groupby('city').agg({'hotel_name': 'count', 'room_count': 'sum'}).sort_values(by='hotel_name', ascending=False)

基于STM32控制遥控车的蓝牙应用程序

Memcached 1.2.4 版本源码包

软件项目开发全过程文档资料.zip

最新推荐

基于STM32控制遥控车的蓝牙应用程序

Memcached 1.2.4 版本源码包

软件项目开发全过程文档资料.zip

Java基础上机题-分类整理版.doc

利用迪杰斯特拉算法的全国交通咨询系统设计与实现

管理建模和仿真的文件

【实战演练】基于TensorFlow的卷积神经网络图像识别项目

CD40110工作原理

全国交通咨询系统C++实现源码解析

"互动学习：行动中的多样性与论文攻读经历"