import os import csv from bs4 import BeautifulSoup # 文件夹路径 folder_path = 'C:\Users\test\Desktop\DIDItest' # CSV文件路径 csv_file = r'C:\Users\test\Desktop\output.csv' # 创建CSV文件并写入表头 with open(csv_file, 'w', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹内的所有文件 for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) # 判断文件是否为HTML文件 if file_name.endswith('.html'): # 打开HTML文件并解析源代码 with open(file_path, 'r', encoding='utf-8') as file: soup = BeautifulSoup(file, 'html.parser') # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断发送内容是否为音频 if '音频' in content: audio_link = soup.find('a')['href'] content = f'音频文件位置：{audio_link}' # 将提取的信息写入CSV文件 with open(csv_file, 'a', newline='', encoding='utf-8') as file: writer = csv.writer(file) writer.writerow([talk_id, time, send_number, receive_number, content]) print("数据已成功写入CSV文件。")

import os import csv from bs4 import BeautifulSoup # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path = os.path.join(root, file_name) # 打开HTML文件并读取源代码 with open(file_path, 'r', encoding='utf-8') as file: html_content = file.read() # 在这里可以对源代码进行进一步的操作和提取 soup = BeautifulSoup(html_content, 'html.parser') messages = soup.find_all('p') for message in messages: talk_id_tag = message.find_previous('a') if talk_id_tag: talk_id = talk_id_tag.text.strip()[1:] time = message.contents[0].strip().split(',')[0][1:] send_phone = message.find('span', class_='hint-success').text.strip() receive_phone = message.find_all('span', class_='hint-success')[1].text.strip() content_tag = message.find('a') if content_tag: content = content_tag['href'] content_type = '音频' else: content = message.text.strip() content_type = '文本' print("talkid:", talk_id) print("时间:", time) print("发送号码:", send_phone) print("接收号码:", receive_phone) print("发送内容:", content) print()

folder_path = 'C:/Users/test/Desktop/DIDItest' # 遍历文件夹中的所有文件和子文件夹 for root, dirs, files in os.walk(folder_path): for file_name in files: if file_name.endswith('.html'): file_path =...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 创建CSV文件并写入表头 # CSV文件路径 csv_file = 'path/to/your/csv/file.csv' csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建CSV文件并写入表头 csv_file = "output.csv" header = ['File Name'] # 表头 # 首次创建CSV文件时，写入...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','') # 将提取的数据加载到DataFrame中 df_extracted = pd.DataFrame(file_name) # 读取原有的CSV文件 df_original = pd.read_csv(csv_file) print("---导入完成-----")

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 打开原有的CSV文件路径 csv_file = "output.csv" # 创建一个空的DataFrame用于存储提取的文件名数据 df_...

from bs4 import BeautifulSoup import csv import os import re # 设置文件夹路径 folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '发送内容']) # 遍历文件夹下的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) # 打开文件并解析HTML源代码 with open(file_path, 'r', encoding='utf-8') as f: soup = BeautifulSoup(f, 'html.parser') # 将soup对象转换为字符串 html_string = str(soup) # 提取talkid、时间、发送号码、接收号码、信息类型和消息内容的正则表达式模式 pattern = r'\[talkid:(\d+)\](.?) 向 (.?) 发送 (.?):\[(.?)\]' matches = re.findall(pattern, html_string)

2. 设置文件夹路径：将需要解析的HTML文件所在的文件夹路径赋值给变量folder_path。 3. 创建CSV文件：使用open函数创建一个名为output.csv的CSV文件，并创建一个csv.writer对象用于写入数据。第一行写入CSV...

import os from bs4 import BeautifulSoup import re import csv import pandas as pd # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # # 使用正则表达式提取链接地址 matches2 = re.findall(r'(?:中发言|发送)\s(.?)\s(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) for match in matches2: # 提取链接地址 file_text = match[1] matches = re.findall(r'"([^"]*)"', file_text) if matches: file_name = matches[0] else: file_name = "No matches found." # print(file_name) # 替换字符 file_name = file_name.replace('No matches found.','')将上述file_name的结果添加到已有数据的csv文中，将其存放值在指定的file_name一列中

请注意，你需要将path/to/your/csv/file.csv替换为你实际的CSV文件路径。此代码将在CSV文件中创建一个名为file_name的新列，并将file_name的结果添加到每一行中。最后，它将保存修改后的数据回到CSV文件中，...

import os import re import csv from bs4 import BeautifulSoup # 设置html文件路径 folder_path = r'C:\Users\test\Desktop\DIDItest' output_file = r'C:\Users\test\Desktop\output.csv' # 提取html文件内所需要数据 def extract_html_info(file_path, csv_writer): with open(file_path, 'r', encoding='utf-8') as file: # 读取HTML源代码 html = file.read() soup = BeautifulSoup(html, 'html.parser') # 提取所有的标签 p_tags = soup.find_all('p') for p_tag in p_tags: # 提取ID、时间、发送号码、接收号码和发送内容 talk_id = soup.find('span', class_='hint-success').text.strip() time = soup.find('body').contents[0].strip() send_number = soup.find_all('span', class_='hint-success')[0].text.strip() receive_number = soup.find_all('span', class_='hint-success')[1].text.strip() content = soup.find('p').contents[0].strip() # 判断是否是音频 if '音频' in message: file_url = p_tag.find('a')['href'] csv_writer.writerow([talk_id, timestamp, send_number, receive_number, file_url]) else: csv_writer.writerow([talk_id, timestamp, send_number, receive_number, message]) # 创建CSV文件并写入数据 with open(output_file, 'w', newline='', encoding='utf-8') as file: csv_writer = csv.writer(file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码', '内容']) # 遍历文件夹及子文件夹，提取HTML文件信息 for root, dirs, files in os.walk(folder_path): for file in files: if file.endswith('.html'): file_path = os.path.join(root, file) extract_html_info(file_path, csv_writer) print("数据已成功写入CSV文件。")

1. 导入必要的模块：os、re、csv和BeautifulSoup。 2. 设置HTML文件夹路径和输出CSV文件路径。 3. 定义一个函数extract_html_info用于提取HTML文件中所需的数据。 4. 打开HTML文件，读取其源代码。 5. 使用...

import os from bs4 import BeautifulSoup import re # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8") as f: html_code = f.read() # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除和() body_data = body_data[0].replace("", "").replace("()", "") # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).?<span.?>(\d+)<.?>(.?)<', body_data) # 提取唯一ID,时间,发送号码和私聊群聊关键词 matches1 = re.findall(r'<span.?hint-success.?>(\d+)<.?>', body_data) # match = re.search('(中发言|发送)\s(.?)\s', body_data) # if match: # content = match.group(2) matches2 = re.findall('(中发言|发送)\s(.?)\s', body_data) for match in matches2: content = match[1] soup = BeautifulSoup(content, 'html.parser') if soup.find('= 2: receive_id = matches1[3] # 处理匹配结果 for match in matches: talk_id = match[0] time = match[1] send_id = match[2] talk_type = match[3] # 进行时间格式转换，将time转换为"0000-00-00"格式 time = time.replace('年', '-').replace('月', '-').replace('日', '') talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊') # 打印结果 print("Talk ID:", talk_id) print("Time:", time) print("Sender ID:", send_id) print("Receive_id:", receive_id) print("Talk_type:", talk_type) print("Content:",content) print("---")导入至csv

folder_path = "C:/Users/test/Desktop/DIDItest" output_file = "output.csv" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建 CSV 文件并写入表头 with open(output_file, "w", newline="", encoding="utf-...

Traceback (most recent call last): File "C:\Users\test\Desktop\DIDI测试.py", line 55, in <module> extract_html_info(file_path, csv_writer) File "C:\Users\test\Desktop\DIDI测试.py", line 25, in extract_html_info timestamp = p_tag.find_previous('body').find_previous('head').find('meta', {'http-equiv': 'Content=-Type'})[ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ TypeError: 'NoneType' object is not subscriptable

folder_path = r'C:\Users\test\Desktop\DIDItest' output_file = r'C:\Users\test\Desktop\output.csv' # 提取html文件内所需要数据 def extract_html_info(file_path, csv_writer): with open(file_path, 'r', ...

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\链接导入csv中.py", line 57, in <module> df_extracted = df_extracted.append({'File Name': file_name}, ignore_index=True) ^^^^^^^^^^^^^^^^^^^ File "C:\Users\test\PycharmProjects\pythonProject\venv\Lib\site-packages\pandas\core\generic.py", line 5989, in getattr return object.getattribute(self, name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'DataFrame' object has no attribute 'append'. Did you mean: '_append'?

folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'(.*?)<\/body>' # 创建一个空的DataFrame用于存储提取的文件名数据 df_extracted = pd.DataFrame(columns=['File Name']) # 遍历...

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\DIDI数据写入CSV.py", line 26, in <module> talk_id = message.find_previous('a').text.strip()[1:] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'text'

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件并写入表头 csv_file = open('output.csv', 'w', newline='', encoding='utf-8') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '...

网页内源代码模板如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () </body> </html> 利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并爬取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码', '接收号码'...

网页内源代码模板如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031370]2014年4月20日 03:55:45 , 111222 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031371]2014年4月20日 04:45:45 , 111222 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031372]2014年4月20日 04:55:45 , 111222 向 123456 发送图片 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () </body> </html> 利用python爬虫，打开C:/Users/test/Desktop/DIDItest文件夹下多个文件夹内的html文件源代码，并将源代码转换为字符串，爬取源代码字符串中的ID、时间、发送号码、接收号码、信息类型、发送内容，如果发送内容不为文本，则提取文件所在链接地址，并将爬取的内容写入csv中，talkid提取[]中talkid：后的数字、时间精确至年月日时分秒、发送号码提取第一个 data-hint"">之间的数字，接收号码提取第二个data-hint"">，信息类型就提取发送与：之间的文字，如果没有：则定义为文字

folder_path = 'C:/Users/test/Desktop/DIDItest' # 创建CSV文件并写入标题行 csv_file = open('output.csv', 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(['ID', '时间', '发送号码'...

利用python爬虫，提取C:/Users/test/Desktop/DIDItest文件夹下每个文件夹内的html文件源代码，并提取源代码中的ID、时间、发送号码、接收号码、发送内容，如果发送内容为音频则提取音频所在位置，反之则保留发送内容，并将爬取的内容写入csv中网页内源代码如下： <html> <meta http-equiv="Content=-Type" content="text/html; charset=utf-8"> <head> </head> <body>[talkid:138031361]2014年4月20日 03:55:45 , 434343 向 232323 发送我们已经是好友了，开始聊天吧！ () [talkid:138031362]2014年4月20日 04:45:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.m4a"]>音频 () [talkid:138031363]2014年4月20日 04:55:45 , 434343 向 123456 发送音频 :[<ahref="files/f/f123fsasfsfsjdfrhf_n.jpg"]>图片 () [talkid:138031364]2014年4月20日 05:55:45 , 434343 向 3234221 发送我们已经是好友了，开始聊天吧！ () [talkid:138031365]2014年4月20日 06:55:45 , 434343 向 1359075 发送我们已经是好友了，开始聊天吧！ () </body> </html>

folder_path = 'C:/Users/test/Desktop/DIDItest' # CSV文件路径 csv_file = 'output.csv' # CSV文件列名 csv_columns = ['ID', '时间', '发送号码', '接收号码', '发送内容'] # 创建CSV文件并写入列名 with open...

aiohttp-3.7.3-cp36-cp36m-win_amd64.whl.rar

python whl离线安装包 pip安装失败可以尝试使用whl离线安装包安装第一步下载whl文件，注意需要与python版本配套 python版本号、32位64位、arm或amd64均有区别第二步使用pip install XXXXX.whl 命令安装，如果whl路径不在cmd窗口当前目录下，需要带上路径 WHL文件是以Wheel格式保存的Python安装包， Wheel是Python发行版的标准内置包格式。在本质上是一个压缩包，WHL文件中包含了Python安装的py文件和元数据，以及经过编译的pyd文件，这样就使得它可以在不具备编译环境的条件下，安装适合自己python版本的库文件。如果要查看WHL文件的内容，可以把.whl后缀名改成.zip，使用解压软件（如WinRAR、WinZIP）解压打开即可查看。为什么会用到whl文件来安装python库文件呢？在python的使用过程中，我们免不了要经常通过pip来安装自己所需要的包，大部分的包基本都能正常安装，但是总会遇到有那么一些包因为各种各样的问题导致安装不了的。这时我们就可以通过尝试去Python安装包大全中（whl包下载）下载whl包来安装解决问题。

基于Java中的swing类的图形化飞机游戏的开发练习.zip

基于Java中的Swing类开发的图形化飞机游戏练习包，为初学者和进阶学习者提供了实践Java GUI编程的绝佳机会。通过本资源，开发者可以利用Java语言和Swing库构建一个用户交互式的2D游戏，深入理解图形用户界面（GUI）编程和事件处理机制。该游戏的核心包括玩家飞机的控制、敌机的生成与移动、子弹发射与碰撞检测以及游戏胜负判定等逻辑。玩家通过鼠标移动控制己方飞机，实现平滑的移动和连续的子弹发射；而敌方飞机则按照一定算法无规律出现，随着游戏进程难度逐渐增加。游戏中还引入了特殊NPC，增加了额外的挑战和乐趣。为了提高游戏体验，游戏还包含了开始背景、结束背景以及背景音乐等元素。当玩家击毁敌机时，会有相应的得分计算和展示；若被敌机击中，则游戏结束并显示最终得分。此外，游戏还提供了查看历史前十记录、帮助和退出等选项，方便玩家进行游戏设置和了解游戏玩法。本资源适用于计算机科学与技术、软件工程、信息管理及相关专业的课程设计、毕业设计等环节，为学生提供实践操作的机会，帮助他们巩固Java编程知识，提高动手能力和发散思维。同时，也为希望学习不同技术领域的学习者提供了一个优秀的入门项目。

相关推荐

import sys import os import urllib from bs4 import BeautifulSoup

BS4_BeautifulSoup.docx

import reimport requestsfrom bs4 import BeautifulSoupimport t

Traceback (most recent call last): File "C:\Users\test\PycharmProjects\pythonProject\DIDI数据写入CSV.py", line 26, in <module> talk_id = message.find_previous('a').text.strip()[1:] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ AttributeError: 'NoneType' object has no attribute 'text'

aiohttp-3.7.3-cp36-cp36m-win_amd64.whl.rar

基于Java中的swing类的图形化飞机游戏的开发练习.zip

最新推荐

aiohttp-3.7.3-cp36-cp36m-win_amd64.whl.rar

基于Java中的swing类的图形化飞机游戏的开发练习.zip

SQLite：SQLite数据库创建与管理.docx

【完整源码+数据库】SpringBoot 集成 Spring Security短信验证码登录

去年和朋友一起做的java小游戏.游戏具体界面在readme中,游戏设计的uml图在design.pdf中.zip

探索AVL树算法：以Faculdade Senac Porto Alegre实践为例

管理建模和仿真的文件

【ggplot2绘图技巧】：R语言中的数据可视化艺术

HAL库怎样将ADC两个通道的电压结果输出到OLED上？

小学语文教学新工具：创新黑板设计解析