from bs4 import BeautifulSoup import csv import os import re # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹下的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) # 打开文件并解析HTML源代码 with open(file_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # 提取talkid、时间、发送号码、接收号码、信息类型和消息内容的正则表达式模式 pattern = r'\[talkid:(\d+)\](.*?)</span> 向 (.*?) 发送 (.*?):\[(.*?)\]' matches = re.findall(pattern, soup) # 遍历匹配结果并输出 for match in matches: talkid = match[0] time = match[1].strip() sender = match[2].strip() receiver = match[3].strip() type = match[4].strip() content = re.findall(r'<a href="(.*?)">', match[5])[0] if type in ['音频', '图片'] else match[5] # 写入CSV文件 csv_writer.writerow([talkid, time, sender, receiver, content]) # 关闭CSV文件 csv_file.close() print("数据已成功写入CSV文件。")

from bs4 import BeautifulSoup import csv import os import re # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹下的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) # 打开文件并解析HTML源代码 with open(file_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # 将soup对象转换为字符串 html_string = str(soup) # 提取talkid、时间、发送号码、接收号码、信息类型和消息内容的正则表达式模式 pattern = r'\[talkid:(\d+)\](.?) 向 (.?) 发送 (.?):\[(.?)\]' matches = re.findall(pattern, html_string)

2. 设置文件夹路径：将需要解析的HTML文件所在的文件夹路径赋值给变量folder_path。 3. 创建CSV文件：使用open函数创建一个名为output.csv的CSV文件，并创建一个csv.writer对象用于写入数据。第一行写入CSV...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 创建一个空的DataFrame用于存储提取的文件名数据 df_...

import os import csv from bs4 import BeautifulSoup # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path = os.path.join(root, file_name) # 打开HTML文件并读取源代码 with open(file_path, 'r', encoding='utf-8') as file: html_content = file.read() # 在这里可以对源代码进行进一步的操作和提取 soup = BeautifulSoup(html_content, 'html.parser') messages = soup.find_all('p') for message in messages: talk_id_tag = message.find_previous('a') if talk_id_tag: talk_id = talk_id_tag.text.strip()[1:] time = message.contents[0].strip().split(',')[0][1:] send_phone = message.find('span', class_='hint-success').text.strip() receive_phone = message.find_all('span', class_='hint-success')[1].text.strip() content_tag = message.find('a') if content_tag: content = content_tag['href'] content_type = '音频' else: content = message.text.strip() content_type = '文本' print("talkid:", talk_id) print("时间:", time) print("发送号码:", send_phone) print("接收号码:", receive_phone) print("发送内容:", content) print()

folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path =...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取链接地址 matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','')将上述file_name的结果添加到已有数据的csv文中，将其存放值在指定的file_name一列中

请注意，你需要将path/to/your/csv/file.csv替换为你实际的CSV文件路径。此代码将在CSV文件中创建一个名为file_name的新列，并将file_name的结果添加到每一行中。最后，它将保存修改后的数据回到CSV文件中，...

import os import csv from bs4 import BeautifulSoup # 文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # CSV文件路径 csv_file = r'C:/Users/test/Desktop/output.csv' # 创建CSV文件并写入表头 with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹内的所有文件 for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) # 判断文件是否为HTML文件 if file_name.endswith('.html'): # 打开HTML文件并解析源代码 with open(file_path, 'r', encoding='utf-8') as file: soup = BeautifulSoup(file, 'html.parser') # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断发送内容是否为音频 if '音频' in content: audio_link = soup.find('a')['href'] content = f'音频文件位置：{audio_link}' # 将提取的信息写入CSV文件 with open(csv_file, 'a', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow([talk_id, time, send_number, receive_number, content]) print("数据已成功写入CSV文件。")

请确保将folder_path变量设置为你的文件夹路径，并将csv_file变量设置为你想要保存CSV文件的路径。代码运行完成后，会在指定路径下生成一个名为output.csv的CSV文件，并将提取的数据写入其中。如果一切顺利...

import os from bs4 import BeautifulSoup import re # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8") as f: html_code = f.read() # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).?<span.?>(\d+)<.?>(.?)<', body_data) # 提取唯一ID,时间,发送号码和私聊群聊关键词 matches1 = re.findall(r'<span.?hint-success.?>(\d+)<.?>', body_data) # match = re.search('(中发言|发送)\s(.?)\s', body_data) # if match: # content = match.group(2) matches2 = re.findall('(中发言|发送)\s(.?)\s', body_data) for match in matches2: content = match[1] soup = BeautifulSoup(content, 'html.parser') if soup.find('= 2: receive_id = matches1[3] # 处理匹配结果 for match in matches: talk_id = match[0] time = match[1] send_id = match[2] talk_type = match[3] # 进行时间格式转换，将time转换为"0000-00-00"格式 time = time.replace('年', '-').replace('月', '-').replace('日', '') talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊') # 打印结果 print("Talk ID:", talk_id) print("Time:", time) print("Sender ID:", send_id) print("Receive_id:", receive_id) print("Talk_type:", talk_type) print("Content:",content) print("---")导入至csv

folder_path = "C:/Users/test/Desktop/DIDItest" output_file = "output.csv" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建 CSV 文件并写入表头 with open(output_file, "w", newline="", encoding="utf-...

import os import csv from bs4 import BeautifulSoup # 文件夹路径 folder_path = 'C:\Users\test\Desktop\DIDItest' # CSV文件路径 csv_file = r'C:\Users\test\Desktop\output.csv' # 创建CSV文件并写入表头 with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹内的所有文件 for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) # 判断文件是否为HTML文件 if file_name.endswith('.html'): # 打开HTML文件并解析源代码 with open(file_path, 'r', encoding='utf-8') as file: soup = BeautifulSoup(file, 'html.parser') # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断发送内容是否为音频 if '音频' in content: audio_link = soup.find('a')['href'] content = f'音频文件位置：{audio_link}' # 将提取的信息写入CSV文件 with open(csv_file, 'a', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow([talk_id, time, send_number, receive_number, content]) print("数据已成功写入CSV文件。")

请确保将folder_path和csv_file中的路径分隔符改为\\，或者使用原始字符串（在字符串前面加上r）。 2. 请确保文件夹路径和CSV文件路径是正确的，并且有读取和写入的权限。如果你仍然遇到乱码问题，请提供...

import os import re import csv from bs4 import BeautifulSoup # 设置html文件路径 folder_path = r'C:\Users\test\Desktop\DIDItest' output_file = r'C:\Users\test\Desktop\output.csv' # 提取html文件内所需要数据 def extract_html_info(file_path, csv_writer): with open(file_path, 'r', encoding='utf-8') as file: # 读取HTML源代码 html = file.read() soup = BeautifulSoup(html, 'html.parser') # 提取所有的标签 p_tags = soup.find_all('p') for p_tag in p_tags: # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断是否是音频 if '音频' in message: file_url = p_tag.find('a')['href'] csv_writer.writerow([talk_id, timestamp, send_number, receive_number, file_url]) else: csv_writer.writerow([talk_id, timestamp, send_number, receive_number, message]) # 创建CSV文件并写入数据 with open(output_file, 'w', newline='', encoding='utf-8') as file: csv_writer = csv.writer(file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '内容']) # 遍历文件夹及子文件夹，提取HTML文件信息 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) extract_html_info(file_path, csv_writer) print("数据已成功写入CSV文件。")

1. 导入必要的模块：os、re、csv和BeautifulSoup。 2. 设置HTML文件夹路径和输出CSV文件路径。 3. 定义一个函数extract_html_info用于提取HTML文件中所需的数据。 4. 打开HTML文件，读取其源代码。 5. 使用...

网页内源代码模板如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () </body> </html> 利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并爬取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码'...

利用python爬虫，提取C:/Users/test/Desktop/DIDItest文件夹下每个文件夹内的html文件源代码，并提取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容为音频则提取音频所在位置，反之则保留发送内容，并将爬取的内容写入csv中网页内源代码如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031361]2014年4月20日 03:55:45 , 434343 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031362]2014年4月20日 04:45:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031363]2014年4月20日 04:55:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031364]2014年4月20日 05:55:45 , 434343 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031365]2014年4月20日 06:55:45 , 434343 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>

folder_path = 'C:/Users/test/Desktop/DIDItest' # CSV文件路径 csv_file = 'output.csv' # CSV文件列名 csv_columns = ['ID', '时间', '发送号码', '接收号码', '发送内容'] # 创建CSV文件并写入列名 with open...

利用python爬虫，提取C:/Users/test/Desktop/DIDItest文件夹下多个文件内的html文件源代码，并提取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容为音频则提取音频所在位置，反之则保留发送内容，并将爬取的内容写入csv中网页内源代码如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031361]2014年4月20日 03:55:45 , 434343 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031362]2014年4月20日 04:45:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031363]2014年4月20日 04:55:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031364]2014年4月20日 05:55:45 , 434343 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031365]2014年4月20日 06:55:45 , 434343 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>

folder_path = "C:/Users/test/Desktop/DIDItest" # 提取数据的正则表达式模式 pattern = r'\[talkid:(\d+)\](.*?)<span class="hint-success" data-hint="">(\d+)</span> 向 <span class="hint-success" data-hint...

<html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031373]2014年4月20日 05:55:45 , 111222 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031374]2014年4月20日 06:55:45 , 111222 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并爬取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码'...

网页内源代码模板如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () </body> </html> 利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并将源代码转换为字符串，爬取源代码字符串中的ID、时间、发送号码、接收号码、信息类型、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中，talkid提取[]中talkid：后的数字、时间精确至年月日时分秒、发送号码提取第一个 data-hint"">之间的数字，接收号码提取第二个data-hint"">，信息类型就提取发送与：之间的文字，如果没有：则定义为文字

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件并写入标题行 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码'...

BS4_BeautifulSoup.docx

from bs4 import BeautifulSoup soup = BeautifulSoup(open("index.html"), 'lxml') 也可以直接将HTML字符串作为参数传递给BeautifulSoup构造函数： python soup = BeautifulSoup("<html>data</html>", ...

bs4_beautifulsoup4.zip

Python中用于网络爬虫读取网页的函数库，BeautifulSoup是python解析html非常好用的第三方库！

相关推荐

import sys import os import urllib from bs4 import BeautifulSoup

import reimport requestsfrom bs4 import BeautifulSoupimport t

爬取彼岸图网的壁纸 https://pic.netbian.com/

BS4_BeautifulSoup.docx

bs4_beautifulsoup4.zip

大家在看

ORACLE_EBS用户 职责 菜单 预置文件

地图分幅制作生产方法

surfer教程

和利时macs3手册

多變異圖的概念-minitab的PPT简易教程

最新推荐

白色简洁风格的软件UI界面后台管理系统模板.zip

自动软包电芯极耳短路测试精切一体机sw17可编辑全套技术资料100%好用.zip

RuntimeException如何解决.md

云链客服需要注意的事项

白色简洁风格的室内设计案例源码下载.rar

掌握HTML/CSS/JS和Node.js的Web应用开发实践

管理建模和仿真的文件

计算机体系结构概述：基础概念与发展趋势

int a[][3]={{1,2},{4}}输出这个数组

勒玛算法研讨会项目：在线商店模拟与Qt界面实现

ORACLE_EBS用户职责菜单预置文件