从body_data中提取Talkid
时间: 2024-04-16 10:27:27 浏览: 133
要从 `body_data` 中提取 Talkid，可以使用正则表达式或 BeautifulSoup。下面提供两种方法（均假设 `body_data` 是一个字符串）：
方法一:使用正则表达式
```python
import re

# Extract the Talkid. The pattern expects the literal text "Talkid: "
# followed by one or more digits, which are captured as group 1.
talkid_pattern = r'Talkid: (\d+)'
talkid_match = re.search(talkid_pattern, body_data)
# re.search returns None when nothing matches, so talkid falls back to None.
talkid = talkid_match.group(1) if talkid_match else None
```
方法二:使用 BeautifulSoup
```python
from bs4 import BeautifulSoup

# Parse body_data with Python's built-in HTML parser backend.
soup = BeautifulSoup(body_data, 'html.parser')
# Look up the first <span class="talkid"> element; find() yields None
# when no such element exists.
talkid_element = soup.find('span', class_='talkid')
if talkid_element is not None:
    # Use the element's text with surrounding whitespace removed.
    talkid = talkid_element.text.strip()
else:
    talkid = None
```
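以上两种方法都可以从 `body_data` 中提取 Talkid。注意：方法一假设文本中包含形如 `Talkid: 123` 的内容，方法二假设 Talkid 位于 `<span class="talkid">` 元素中；如果实际数据格式不同，请相应调整正则表达式或选择器。您可以根据自己的需求选择其中一种方法。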
如果还有任何其他问题,请随时提问!
相关问题
import os from bs4 import BeautifulSoup import re # 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" # 正则表达式模式 pattern = r'<body>(.*?)<\/body>' # 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8") as f: html_code = f.read() # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除<p>和()</p> body_data = body_data[0].replace("<p>", "").replace("()</p>", "") # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).*?<span.*?>(\d+)<.*?>(.*?)<', body_data) # 提取唯一ID,时间,发送号码和私聊群聊关键词 matches1 = re.findall(r'<span.*?hint-success.*?>(\d+)<.*?>', body_data) # match = re.search('(中发言|发送)\s(.*?)\s', body_data) # if match: # content = match.group(2) matches2 = re.findall('(中发言|发送)\s(.*?)\s', body_data) for match in matches2: content = match[1] soup = BeautifulSoup(content, 'html.parser') if soup.find('<a href'): content = '' # 提取第二个号码为接收号码 if len(matches1) >= 2: receive_id = matches1[3] # 处理匹配结果 for match in matches: talk_id = match[0] time = match[1] send_id = match[2] talk_type = match[3] # 进行时间格式转换,将time转换为"0000-00-00"格式 time = time.replace('年', '-').replace('月', '-').replace('日', '') talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊') # 打印结果 print("Talk ID:", talk_id) print("Time:", time) print("Sender ID:", send_id) print("Receive_id:", receive_id) print("Talk_type:", talk_type) print("Content:",content) print("---")导入至csv
要将提取的信息导入 CSV 文件,你可以使用 Python 的 CSV 模块。下面是一个示例代码,可以帮助你实现这个需求:
```python
import os
import csv
from bs4 import BeautifulSoup
import re
# Export chat records found in the HTML files under folder_path to a CSV file.
folder_path = "C:/Users/test/Desktop/DIDItest"
output_file = "output.csv"
# Captures everything between <body> and </body>; used with re.DOTALL below
# so the match can span multiple lines.
pattern = r'<body>(.*?)<\/body>'

# Create the CSV file and write the header row. The file stays open while
# all input files are processed so rows can be appended as they are found.
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Talk ID", "Time", "Sender ID", "Receiver ID", "Talk Type", "Content"])

    # Walk every file under folder_path, including sub-folders.
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)

            # Read the HTML file.
            with open(file_path, "r", encoding="utf-8") as html_file:
                html_code = html_file.read()

            # Keep only the contents of the first <body>...</body> section;
            # files without one are skipped.
            body_data = re.findall(pattern, html_code, re.DOTALL)
            if not body_data:
                continue

            # Strip "<p>" and "()</p>" fragments before matching.
            body_data = body_data[0].replace("<p>", "").replace("()</p>", "")

            # Each tuple is (talk id, timestamp, sender id, talk-type text).
            matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).*?<span.*?>(\d+)<.*?>(.*?)<', body_data)
            # Numbers found inside "hint-success" spans.
            matches1 = re.findall(r'<span.*?hint-success.*?>(\d+)<.*?>', body_data)
            # Each tuple is (keyword, text following the keyword). Raw string so
            # that \s reaches the regex engine instead of being an invalid
            # string escape.
            matches2 = re.findall(r'(中发言|发送)\s(.*?)\s', body_data)

            # The second hint-success number is used as the receiver id. Fall
            # back to None so a file with fewer than two numbers neither raises
            # NameError nor reuses the value from a previously processed file.
            receive_id = matches1[1] if len(matches1) >= 2 else None

            # The content does not depend on the individual record, so it is
            # computed once per file: it ends up as the last matches2 text, or
            # "" as soon as any text parses to something containing an <a> tag.
            # NOTE(review): every row of a file therefore gets the same content;
            # confirm whether content should be paired with each record instead.
            content = ""
            for _keyword, text in matches2:
                content = text
                if BeautifulSoup(content, 'html.parser').find('a'):
                    content = ""
                    break

            # Write one CSV row per matched record.
            for talk_id, time_str, send_id, talk_type in matches:
                # "2024年4月16日 10:27:27" -> "2024-4-16 10:27:27"
                time_str = time_str.replace('年', '-').replace('月', '-').replace('日', '')
                # Map the keyword in the talk-type text to private/group chat labels.
                talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊')
                writer.writerow([talk_id, time_str, send_id, receive_id, talk_type, content])
```
这段代码将提取的信息写入名为 "output.csv" 的 CSV 文件中。你可以根据需要修改输出文件的路径和名称。
import os from bs4 import BeautifulSoup import re 指定文件夹路径 folder_path = "C:/Users/test/Desktop/DIDItest" 正则表达式模式 pattern = r'<body>(.*?)</body>' 遍历文件夹中的所有文件 for root, dirs, files in os.walk(folder_path): for file in files: # 读取html文件 file_path = os.path.join(root, file) with open(file_path, "r", encoding="utf-8-sig") as f: html_code = f.read() # 创建BeautifulSoup对象 soup = BeautifulSoup(html_code, 'html.parser') # 使用正则表达式匹配<body>标签内的数据 body_data = re.findall(pattern, html_code, re.DOTALL) # 剔除<p>和()</p> body_data = body_data[0].replace("<p>", "").replace("()</p>", "") # 使用正则表达式提取talk_id、时间、发送者ID和接收者ID matches = re.findall(r'\[talkid:(\d+)\](\d+年\d+月\d+日 \d+:\d+:\d+).*?<span.*?>(\d+)<.*?>(.*?)<.*?''((中发言|发送)\s(.*?)\s)', body_data) # 提取唯一ID,时间,发送号码和私聊群聊关键词 matches1 = re.findall(r'<span.*?hint-success.*?>(\d+)', body_data) matches2 = re.findall(r'(?:中发言|发送)\s*(.*?)\s*(?:音频 :|图片 :)?(?:\[([^\]]+)\])?', body_data) # 处理匹配结果 for match in matches: talk_id = match[0] time = match[1] send_id = match[2] talk_type = match[3] content = match[4] # 提取第二个号码为接收号码 if len(matches1) >= 2: receive_id = matches1[3] # 替换字符 time = time.replace('年', '-').replace('月', '-').replace('日', '') talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊') content = content.replace('音频', '').replace('图片', '').replace('发送','').replace('中发言','') content = re.sub(r'\n', '', content) print("---导入完成-----") 创建sql数据库并将数据导入到sql文件中
import sqlite3
# Store the extracted chat records (matches / matches1, produced by the
# parsing code that precedes this snippet) in a SQLite database.
create_table_sql = '''CREATE TABLE IF NOT EXISTS chat_data
(talk_id INT PRIMARY KEY,
time TEXT,
send_id INT,
receive_id INT,
talk_type TEXT,
content TEXT)'''

# The second number in matches1 is used as the receiver id. Index 1 matches
# the ">= 2" guard; the former index 3 raised IndexError for lists of
# length 2 or 3.
receive_id = matches1[1] if len(matches1) >= 2 else None

rows = []
for match in matches:
    talk_id = match[0]
    time_str = match[1]
    send_id = match[2]
    # Previously assigned to "_type", which left talk_type undefined below.
    talk_type = match[3]
    content = match[4]

    # "2024年4月16日 10:27:27" -> "2024-4-16 10:27:27"
    time_str = time_str.replace('年', '-').replace('月', '-').replace('日', '')
    # Map the keyword in the talk-type text to private/group chat labels.
    talk_type = talk_type.replace('向', '私聊').replace('在群', '群聊')
    # Drop the media/keyword markers and any newlines from the content.
    content = content.replace('音频', '').replace('图片', '').replace('发送', '').replace('中发言', '')
    content = content.replace('\n', '')

    rows.append((talk_id, time_str, send_id, receive_id, talk_type, content))

# Connect to the database; the connection is closed even if an insert fails.
conn = sqlite3.connect('chat_data.db')
try:
    cursor = conn.cursor()
    # Create the table on first use.
    cursor.execute(create_table_sql)
    # Parameterized insert of all rows.
    cursor.executemany("INSERT INTO chat_data VALUES (?, ?, ?, ?, ?, ?)", rows)
    # Commit the transaction.
    conn.commit()
finally:
    conn.close()
print("---导入完成并保存到数据库中---")
阅读全文