import os import re import csv from bs4 import BeautifulSoup # 设置html文件路径 folder_path = r'C:\Users\test\Desktop\DIDItest' output_file = r'C:\Users\test\Desktop\output.csv' # 提取html文件内所需要数据 def extract_html_info(file_path, csv_writer): with open(file_path, 'r', encoding='utf-8') as file: # 读取HTML源代码 html = file.read() soup = BeautifulSoup(html, 'html.parser') # 提取所有的<p>标签 p_tags = soup.find_all('p') for p_tag in p_tags: # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断是否是音频 if '音频' in message: file_url = p_tag.find('a')['href'] csv_writer.writerow([talk_id, timestamp, send_number, receive_number, file_url]) else: csv_writer.writerow([talk_id, timestamp, send_number, receive_number, message]) # 创建CSV文件并写入数据 with open(output_file, 'w', newline='', encoding='utf-8') as file: csv_writer = csv.writer(file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '内容']) # 遍历文件夹及子文件夹，提取HTML文件信息 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) extract_html_info(file_path, csv_writer) print("数据已成功写入CSV文件。")

import os from bs4 import BeautifulSoup import re # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8") as f: html_code = f.read() # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).?<span.?>(\d+)<.?>(.?)<', body_data) # 提取唯一ID,时间,发送号码和私聊群聊关键词 matches1 = re.findall(r'<span.?hint-success.?>(\d+)<.?>', body_data) # match = re.search('(中发言|发送)\s(.?)\s', body_data) # if match: # content = match.group(2) matches2 = re.findall('(中发言|发送)\s(.?)\s', body_data) for match in matches2: content = match[1] soup = BeautifulSoup(content, 'html.parser') if soup.find('= 2: receive_id = matches1[3] # 处理匹配结果 for match in matches: talk_id = match[0] time = match[1] send_id = match[2] talk_type = match[3] # 进行时间格式转换，将time转换为"0000-00-00"格式 time = time.replace('年', '-').replace('月', '-').replace('日', '') talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊') # 打印结果 print("Talk ID:", talk_id) print("Time:", time) print("Sender ID:", send_id) print("Receive_id:", receive_id) print("Talk_type:", talk_type) print("Content:",content) print("---")导入至csv

folder_path = "C:/Users/test/Desktop/DIDItest" output_file = "output.csv" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建 CSV 文件并写入表头 with open(output_file, "w", newline="", encoding="utf-...

Traceback (most recent call last): File "C:\Users\test\Desktop\DIDI测试.py", line 55, in <module> extract_html_info(file_path, csv_writer) File "C:\Users\test\Desktop\DIDI测试.py", line 25, in extract_html_info timestamp = p_tag.find_previous('body').find_previous('head').find('meta', {'http-equiv': 'Content=-Type'})[ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: 'NoneType' object is not subscriptable

folder_path = r'C:\Users\test\Desktop\DIDItest' output_file = r'C:\Users\test\Desktop\output.csv' # 提取html文件内所需要数据 def extract_html_info(file_path, csv_writer): with open(file_path, 'r', ...

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\链接导入csv中.py", line 57, in <module> df_extracted = df_extracted.append({'File Name': file_name}, ignore_index=True) ^^^^^^^^^^^^^^^^^^^ File "C:\Users\test\PycharmProjects\pythonProject\venv\Lib\site-packages\pandas\core\generic.py", line 5989, in getattr return object.getattribute(self, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'DataFrame' object has no attribute 'append'. Did you mean: '_append'?

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建一个空的DataFrame用于存储提取的文件名数据 df_extracted = pd.DataFrame(columns=['File Name']) # 遍历...

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\DIDI数据写入CSV.py", line 26, in <module> talk_id = message.find_previous('a').text.strip()[1:] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'text'

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件并写入表头 csv_file = open('output.csv', 'w', newline='', encoding='utf-8') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '...

网页内源代码模板如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () </body> </html> 利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并爬取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码'...

网页内源代码模板如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () </body> </html> 利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并将源代码转换为字符串，爬取源代码字符串中的ID、时间、发送号码、接收号码、信息类型、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中，talkid提取[]中talkid：后的数字、时间精确至年月日时分秒、发送号码提取第一个 data-hint"">之间的数字，接收号码提取第二个data-hint"">，信息类型就提取发送与：之间的文字，如果没有：则定义为文字

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件并写入标题行 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码'...

<html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031373]2014年4月20日 05:55:45 , 111222 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031374]2014年4月20日 06:55:45 , 111222 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并爬取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码'...

利用python爬虫，提取C:/Users/test/Desktop/DIDItest文件夹下多个文件内的html文件源代码，并提取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容为音频则提取音频所在位置，反之则保留发送内容，并将爬取的内容写入csv中网页内源代码如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031361]2014年4月20日 03:55:45 , 434343 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031362]2014年4月20日 04:45:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031363]2014年4月20日 04:55:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031364]2014年4月20日 05:55:45 , 434343 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031365]2014年4月20日 06:55:45 , 434343 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>

folder_path = 'C:/Users/test/Desktop/DIDItest' output_file = 'output.csv' extracted_data = [] for file_name in os.listdir(folder_path): if file_name.endswith('.html'): file_path = os.path.join...

from bs4 import BeautifulSoup import csv import os import re # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹下的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) # 打开文件并解析HTML源代码 with open(file_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # 提取talkid、时间、发送号码、接收号码、信息类型和消息内容的正则表达式模式 pattern = r'\[talkid:(\d+)\](.?) 向 (.?) 发送 (.?):\[(.?)\]' matches = re.findall(pattern, soup) # 遍历匹配结果并输出 for match in matches: talkid = match[0] time = match[1].strip() sender = match[2].strip() receiver = match[3].strip() type = match[4].strip() content = re.findall(r'', match[5])[0] if type in ['音频', '图片'] else match[5] # 写入CSV文件 csv_writer.writerow([talkid, time, sender, receiver, content]) # 关闭CSV文件 csv_file.close() print("数据已成功写入CSV文件。")

2. 设置文件夹路径：将需要解析的HTML文件所在的文件夹路径赋值给变量folder_path。 3. 创建CSV文件：使用open函数创建一个名为output.csv的CSV文件，并创建一个csv.writer对象用于写入数据。第一行写入CSV...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 创建CSV文件并写入表头 # CSV文件路径 csv_file = 'path/to/your/csv/file.csv' csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建CSV文件并写入表头 csv_file = "output.csv" header = ['File Name'] # 表头 # 首次创建CSV文件时，写入...

import os import csv from bs4 import BeautifulSoup # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path = os.path.join(root, file_name) # 打开HTML文件并读取源代码 with open(file_path, 'r', encoding='utf-8') as file: html_content = file.read() # 在这里可以对源代码进行进一步的操作和提取 soup = BeautifulSoup(html_content, 'html.parser') messages = soup.find_all('p') for message in messages: talk_id_tag = message.find_previous('a') if talk_id_tag: talk_id = talk_id_tag.text.strip()[1:] time = message.contents[0].strip().split(',')[0][1:] send_phone = message.find('span', class_='hint-success').text.strip() receive_phone = message.find_all('span', class_='hint-success')[1].text.strip() content_tag = message.find('a') if content_tag: content = content_tag['href'] content_type = '音频' else: content = message.text.strip() content_type = '文本' print("talkid:", talk_id) print("时间:", time) print("发送号码:", send_phone) print("接收号码:", receive_phone) print("发送内容:", content) print()将提取到的数据写入csv中

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件并写入表头 csv_file_path = 'output.csv' with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file: writer = csv.writer(csv_...

import os import re import csv from bs4 import BeautifulSoup folder_path = 'C:/Users/test/Desktop/DIDItest' html_files = [] # 遍历文件夹及其子文件夹下所有HTML文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): html_files.append(os.path.join(root, file))打开并提取所有html文件源代码

folder_path = 'C:/Users/test/Desktop/DIDItest' html_files = [] 3. 使用os.walk()函数遍历文件夹及其子文件夹下的所有文件，并筛选出以.html结尾的文件： python for root, dirs, files in os.walk...

import os import csv from bs4 import BeautifulSoup # 文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # CSV文件路径 csv_file = r'C:/Users/test/Desktop/output.csv' # 创建CSV文件并写入表头 with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹内的所有文件 for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) # 判断文件是否为HTML文件 if file_name.endswith('.html'): # 打开HTML文件并解析源代码 with open(file_path, 'r', encoding='utf-8') as file: soup = BeautifulSoup(file, 'html.parser') # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断发送内容是否为音频 if '音频' in content: audio_link = soup.find('a')['href'] content = f'音频文件位置：{audio_link}' # 将提取的信息写入CSV文件 with open(csv_file, 'a', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow([talk_id, time, send_number, receive_number, content]) print("数据已成功写入CSV文件。")

请确保将folder_path变量设置为你的文件夹路径，并将csv_file变量设置为你想要保存CSV文件的路径。代码运行完成后，会在指定路径下生成一个名为output.csv的CSV文件，并将提取的数据写入其中。如果一切顺利...

import os import csv from bs4 import BeautifulSoup # 文件夹路径 folder_path = 'C:\Users\test\Desktop\DIDItest' # CSV文件路径 csv_file = r'C:\Users\test\Desktop\output.csv' # 创建CSV文件并写入表头 with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹内的所有文件 for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) # 判断文件是否为HTML文件 if file_name.endswith('.html'): # 打开HTML文件并解析源代码 with open(file_path, 'r', encoding='utf-8') as file: soup = BeautifulSoup(file, 'html.parser') # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断发送内容是否为音频 if '音频' in content: audio_link = soup.find('a')['href'] content = f'音频文件位置：{audio_link}' # 将提取的信息写入CSV文件 with open(csv_file, 'a', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow([talk_id, time, send_number, receive_number, content]) print("数据已成功写入CSV文件。")

请确保将folder_path和csv_file中的路径分隔符改为\\，或者使用原始字符串（在字符串前面加上r）。 2. 请确保文件夹路径和CSV文件路径是正确的，并且有读取和写入的权限。如果你仍然遇到乱码问题，请提供...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取链接地址 matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','')将上述file_name的结果添加到已有数据的csv文中，将其存放值在指定的file_name一列中

请注意，你需要将path/to/your/csv/file.csv替换为你实际的CSV文件路径。此代码将在CSV文件中创建一个名为file_name的新列，并将file_name的结果添加到每一行中。最后，它将保存修改后的数据回到CSV文件中，...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 创建一个空的DataFrame用于存储提取的文件名数据 df_...

基于springboot教育资源共享平台源码数据库文档.zip

视频笔记linux开发篇

linux开发篇，配套视频：https://www.bilibili.com/list/474327672?sid=4493702&spm_id_from=333.999.0.0&desc=1

相关推荐

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\DIDI数据写入CSV.py", line 26, in <module> talk_id = message.find_previous('a').text.strip()[1:] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'text'

基于springboot教育资源共享平台源码数据库文档.zip

视频笔记linux开发篇

最新推荐

基于springboot教育资源共享平台源码数据库文档.zip

视频笔记linux开发篇

readera-24-09-08plus2020.apk

STM32单片机控制舵机旋转

基于springboot仓库管理系统源码数据库文档.zip

全国江河水系图层shp文件包下载

管理建模和仿真的文件

Keras模型压缩与优化：减小模型尺寸与提升推理速度

MTK 6229 BB芯片在手机中有哪些核心功能，OTG支持、Wi-Fi支持和RTC晶振是如何实现的？

点云二值化测试数据集的详细解读