import os from bs4 import BeautifulSoup import re 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" 正则表达式模式 pattern = r'<body>(.*?)</body>' 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).*?<span.*?>(\d+)<.*?>(.*?)<.*?''((中发言|发送)\s(.*?)\s)', body_data) # 提取唯一ID,时间,发送号码和私聊群聊关键词 matches1 = re.findall(r'<span.*?hint-success.*?>(\d+)', body_data) matches2 = re.findall(r'(?:中发言|发送)\s*(.*?)\s*(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) # 处理匹配结果 for match in matches: talk_id = match[0] time = match[1] send_id = match[2] talk_type = match[3] content = match[4] # 提取第二个号码为接收号码 if len(matches1) >= 2: receive_id = matches1[3] # 替换字符 time = time.replace('年', '-').replace('月', '-').replace('日', '') talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊') content = content.replace('音频', '').replace('图片', '').replace('发送','').replace('中发言','') content = re.sub(r'\n', '', content) print("---导入完成-----") 使用python 创建sql数据库并将数据导入到sql文件中

import os import re from bs4 import BeautifulSoup # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser')提取发送或发言的字符串

folder_path = "C:/Users/test/Desktop/DIDItest" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with ...

import os from bs4 import BeautifulSoup import re # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.*?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8") as f: html_code = f.read() # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "")

您的代码看起来很好，它使用 BeautifulSoup 库解析 HTML 并使用正则表达式模匹配 <body> 标签内的数据。然后通过剔除 标签和 () 的方式处理数据。请注意，这里的代码假设每个文件只有一个 <body> ...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 创建CSV文件并写入表头 # CSV文件路径 csv_file = 'path/to/your/csv/file.csv' csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建CSV文件并写入表头 csv_file = "output.csv" header = ['File Name'] # 表头 # 首次创建CSV文件时，写入...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 创建一个空的DataFrame用于存储提取的文件名数据 df_...

from bs4 import BeautifulSoup import csv import os import re # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹下的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) # 打开文件并解析HTML源代码 with open(file_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # 将soup对象转换为字符串 html_string = str(soup) # 提取talkid、时间、发送号码、接收号码、信息类型和消息内容的正则表达式模式 pattern = r'\[talkid:(\d+)\](.?) 向 (.?) 发送 (.?):\[(.?)\]' matches = re.findall(pattern, html_string)

4. 遍历文件夹下的所有文件：使用os.walk函数遍历指定文件夹下的所有文件。 5. 判断文件是否为HTML文件：通过判断文件名的后缀是否为.html来确定是否为HTML文件。 6. 打开文件并解析HTML源代码：使用open...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取链接地址 matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','')将上述file_name的结果添加到已有数据的csv文中，将其存放值在指定的file_name一列中

你可以使用pandas库来将file_name的结果添加到已有数据的CSV文件中，并将其存放在指定的file_name一列中。以下是一个示例代码： python # 指定CSV文件路径 csv_file = "path/to/your/csv/file.csv" # ...

import os import sqlite3 from bs4 import BeautifulSoup import re # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)</body>' # 连接数据库 conn = sqlite3.connect('chat_data.db') cursor = conn.cursor() # 添加新的字段 cursor.execute("ALTER TABLE DIDI_talk ADD COLUMN file_name TEXT") # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取链接地址 matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # 替换字符 file_name = file_name.replace('No matches found.', '') new_data = [file_name] # 更新数据库中新字段的数据 for i, data in enumerate(new_data): cursor.execute("UPDATE DIDI_talk SET file_name = ? WHERE talk_id = ?", (data, i + 1)) # # 处理匹配结果并更新数据库 # for i, match in enumerate(matches): # file_name = matches[0] # new_column_data = new_data[i] # 根据匹配的索引获取对应的新数据 # 提交事务并关闭连接 conn.commit() conn.close() print("---新列数据已添加到数据库中---")

然后，指定了要遍历的文件夹路径，并定义了一个正则表达式模式用于匹配<body>标签内的数据。接下来，连接到SQLite数据库，并添加一个新的字段到名为"DIDI_talk"的数据表中，使用了ALTER TABLE语句。之后，...

利用python爬虫，提取C:/Users/test/Desktop/DIDItest文件夹下多个文件内的html文件源代码，并提取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容为音频则提取音频所在位置，反之则保留发送内容，并将爬取的内容写入csv中网页内源代码如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031361]2014年4月20日 03:55:45 , 434343 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031362]2014年4月20日 04:45:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031363]2014年4月20日 04:55:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031364]2014年4月20日 05:55:45 , 434343 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031365]2014年4月20日 06:55:45 , 434343 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>

folder_path = "C:/Users/test/Desktop/DIDItest" # 提取数据的正则表达式模式 pattern = r'\[talkid:(\d+)\](.*?)(\d+) 向 <span class="hint-success" data-hint...

仅提取body内数据并删除<>p()

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件...

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\链接导入csv中.py", line 57, in <module> df_extracted = df_extracted.append({'File Name': file_name}, ignore_index=True) ^^^^^^^^^^^^^^^^^^^ File "C:\Users\test\PycharmProjects\pythonProject\venv\Lib\site-packages\pandas\core\generic.py", line 5989, in getattr return object.getattribute(self, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'DataFrame' object has no attribute 'append'. Did you mean: '_append'?

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建一个空的DataFrame用于存储提取的文件名数据 df_extracted = pd.DataFrame(columns=['File Name']) # 遍历...

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\1234.py", line 59, in <module> print("时间:", time[i]) ~~~~^^^ IndexError: list index out of range

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'\[talkid:(\d+)\]' time_pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]' # 遍历文件夹中的所有文件 for root, dirs, ...

仅提取包含talkid的数据

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'\[talkid:(\d+)\]' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html...

with open(csv_file, "w", newline="", encoding="utf-8-sig") as f: writer = csv.writer(f) writer.writerow(["Talk ID", "Time", "Sender ID", "Receive ID", "Talk Type", "Content", "File Name"])

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 打开CSV文件进行写入 with open(csv_file, "w", newline="",...

原有data数据添加python爬取的一列数据

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)</body>' # 连接数据库 conn = sqlite3.connect('chat_data.db') cursor = conn.cursor() # 添加新的列 cursor.execute(...

提取talk_id、时间（时间转化为‘0000-00-00’格式）、发送ID、接收ID、信息类型（如果是向设置为私聊，如果是在群设置为群聊）、发送内容（如果是语音或图片设置默认值空值，反之则提取信息内容）、文件名（如果是图片、语音提取<a href=后面的链接地址）

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 talkid_pattern = r'\[talkid:(\d+)\]' time_pattern = r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\]' sender_pattern = r'\[sender:(.*?)\]' ...

beautifulsoup-readthedocs-io-zh_CN-latest.zip

Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式.Beautiful Soup会帮你节省数小时甚至数天的工作时间.

相关推荐

import sys import os import urllib from bs4 import BeautifulSoup

import reimport requestsfrom bs4 import BeautifulSoupimport t

beautifulsoup以及正则表达式re之间的一些知识！

仅提取body内数据并删除<>p()

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\1234.py", line 59, in <module> print("时间:", time[i]) ~~~~^^^ IndexError: list index out of range

仅提取包含talkid的数据

with open(csv_file, "w", newline="", encoding="utf-8-sig") as f: writer = csv.writer(f) writer.writerow(["Talk ID", "Time", "Sender ID", "Receive ID", "Talk Type", "Content", "File Name"])

原有data数据添加python爬取的一列数据

beautifulsoup-readthedocs-io-zh_CN-latest.zip

最新推荐

Java课程设计-java web 网上商城，后台商品管理（前后端源码+数据库+文档） .zip

母线电容计算 .xmcd

2022年中国大学生计算机设计大赛国赛优秀作品点评微课与教学辅助&数媒静态设计专业组视频

输出月份英文名pta.doc

单片机C语言Proteus仿真实例万年历

zigbee-cluster-library-specification

管理建模和仿真的文件

实现实时数据湖架构：Kafka与Hive集成

用matlab绘制高斯色噪声情况下的频率估计CRLB，其中w(n)是零均值高斯色噪声，w(n)=0.8*w(n-1)+e(n)，e(n)服从零均值方差为se的高斯分布

JSBSim Reference Manual