def data_split(data, rate):
    """Shuffle *data* and split it into a training and a test subset.

    Parameters
    ----------
    data : torch.utils.data.Dataset (or any object with ``__len__``/``__getitem__``)
        The dataset to split.
    rate : float
        Fraction in (0, 1) of the samples that goes into the training split.

    Returns
    -------
    (train_set, test_set)
        Two ``torch.utils.data.Subset`` objects drawn randomly and
        disjointly from *data*.
    """
    train_l = int(len(data) * rate)
    test_l = len(data) - train_l
    # random_split shuffles indices before splitting, so both subsets are
    # random samples of the original dataset.
    train_set, test_set = torch.utils.data.random_split(data, [train_l, test_l])
    return train_set, test_set
时间: 2024-04-27 17:24:12 浏览: 6
这是一个数据集划分函数,用于将数据集按照给定比例划分为训练集和测试集。
输入参数 data 是一个 PyTorch 数据集对象,rate 是训练集所占比例,取值范围为 (0, 1)。
在函数内部,首先计算出训练集和测试集的样本数量,然后调用 PyTorch 中的 torch.utils.data.random_split() 函数将数据集随机划分为训练集和测试集,这个函数返回的是两个新的数据集对象。最后将训练集和测试集分别返回。
相关问题
写一个python Flask销售预测系统中,有一个suanfa.py文件:先读取shuju.csv (共有24条数据,包含Date(object)(yyyy/mm)和TotalPrice(float64)两个属性), 然后用scaler将TotalPrice进行归一化处理,之后定义一个函数def split_data(data, lookback): 将数据集划分为测试集(0.2)和训练集(0.8),data_raw = data.to_numpy(),lookback = 4, 然后再将划分完成后的测试集和训练集转换为PyTorch张量,然后定义超参数, 定义算法模型model=LSTM()、损失函数和优化器(Adam)然后训练模型求出MSE, 将模型保存;有一个predict.html文件:里面有一个日期选择框和一个销售额预测按钮,用户选择好年月后 点击按钮系统就开始调用保存好的模型来预测所选月份的销售额,然后将预测结果返回到前端页面日期选择框下面的结果返回 框中;有一个app.py文件:定义路径。用flask和bootstrap、LayUI写出完整详细代码
suanfa.py:
```python
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
# Read csv file
# NOTE(review): assumes shuju.csv sits in the working directory with a
# TotalPrice column (and, per the surrounding text, a Date column) — confirm.
data = pd.read_csv('shuju.csv')
# Normalize the data
# Scale TotalPrice into [0, 1]; keep *scaler* so later predictions can be
# inverse-transformed back to the original units.
scaler = MinMaxScaler()
data['TotalPrice'] = scaler.fit_transform(data[['TotalPrice']])
# Split data into train and test sets
def split_data(data, lookback, train_ratio=0.8):
    """Window *data* into overlapping sequences and split them train/test.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; every column is carried into each window.
    lookback : int
        Number of consecutive rows per window.
    train_ratio : float, optional
        Fraction of windows used for training (default 0.8, matching the
        original hard-coded split, so existing callers are unaffected).

    Returns
    -------
    (train_data, test_data)
        Two numpy arrays of shape (n_windows, lookback, n_columns).
    """
    raw = data.to_numpy()
    # Every length-*lookback* window of consecutive rows, in order.
    windows = np.array([raw[i: i + lookback] for i in range(len(raw) - lookback)])
    train_size = int(len(windows) * train_ratio)
    return windows[:train_size, :], windows[train_size:, :]
train_data, test_data = split_data(data, lookback=4)
# Convert to PyTorch tensors
# NOTE(review): at this point *data* still contains the Date column, so
# to_numpy() presumably yields an object-dtype array and torch.from_numpy
# would fail — likely only the TotalPrice column should be windowed; confirm.
train_data = torch.from_numpy(train_data).type(torch.Tensor)
test_data = torch.from_numpy(test_data).type(torch.Tensor)
# Define hyperparameters
input_size = 1        # features per time step fed to the LSTM
hidden_size = 2       # width of the LSTM hidden state
num_layers = 1        # stacked LSTM layers
output_size = 1       # single regression output
learning_rate = 0.01  # Adam step size
num_epochs = 200      # full-batch training iterations
# Define LSTM model
class LSTM(nn.Module):
    """LSTM regressor: maps (batch, seq, input_size) to (batch, output_size)."""

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run *x* through the LSTM and project its final time step."""
        batch = x.size(0)
        # Fresh zero initial states each call; no gradient flows into them.
        h0 = torch.zeros(self.num_layers, batch, self.hidden_size)
        c0 = torch.zeros(self.num_layers, batch, self.hidden_size)
        seq_out, _ = self.lstm(x, (h0, c0))
        # Only the representation of the last time step feeds the linear head.
        return self.fc(seq_out[:, -1, :])
# Instantiate the model defined above with the hyperparameters.
model = LSTM(input_size, hidden_size, num_layers, output_size)
# Define loss function and optimizer
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
# NOTE(review): the target train_data[:, -1, :] is the last time step of the
# very window fed to the model, so the network is trained to reproduce part of
# its own input (label leakage). The target should presumably be the value
# *after* each window — confirm the intended framing.
for epoch in range(num_epochs):
outputs = model(train_data)
optimizer.zero_grad()
loss = criterion(outputs, train_data[:, -1, :])
loss.backward()
optimizer.step()
if epoch % 10 == 0:
print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))
# Save the model
# NOTE(review): torch.save(model, ...) pickles the whole module, so loading it
# later requires this LSTM class to be importable; saving model.state_dict()
# is the more robust convention.
torch.save(model, 'model.pt')
```
predict.html:
```html
<!DOCTYPE html>
<html>
<head>
<title>Predict sales</title>
<!-- LayUI assets served from Flask's static folder. -->
<link rel="stylesheet" type="text/css" href="{{ url_for('static', filename='layui/css/layui.css') }}">
<script type="text/javascript" src="{{ url_for('static', filename='layui/layui.js') }}"></script>
<!-- NOTE(review): the script below uses $.ajax and layer.msg, but neither
     jQuery nor the layui "layer" module is loaded here. Add a jQuery script
     tag (or wrap the code in layui.use(['layer', 'jquery'], ...)) or these
     calls will throw at runtime. -->
</head>
<body>
<div class="layui-container">
<div class="layui-row">
<div class="layui-col-md-offset4 layui-col-md-4">
<h2 class="layui-text-center">Predict sales</h2>
<!-- Date input (yyyy/mm) plus a predict button; the result lands in #result. -->
<form class="layui-form" action="">
<div class="layui-form-item">
<label class="layui-form-label">Date</label>
<div class="layui-input-inline">
<input type="text" name="date" id="date" class="layui-input" placeholder="yyyy/mm">
</div>
</div>
<div class="layui-form-item">
<div class="layui-input-block">
<button type="button" class="layui-btn layui-btn-normal" onclick="predict()">Predict</button>
</div>
</div>
</form>
<div class="layui-text-center">
<h3>Predicted sales:</h3>
<h4 id="result"></h4>
</div>
</div>
</div>
</div>
<script type="text/javascript">
// Validate the typed date, POST it (plus model input) to /predict, and show
// the returned prediction.
function predict() {
var date = document.getElementById("date").value;
if (date === "") {
layer.msg("Please enter a date");
return;
}
var year = parseInt(date.split("/")[0]);
var month = parseInt(date.split("/")[1]);
if (isNaN(year) || isNaN(month)) {
layer.msg("Invalid date format");
return;
}
if (month < 1 || month > 12) {
layer.msg("Invalid month");
return;
}
// NOTE(review): "test_data" is a Jinja variable, but the index() route renders
// this template without passing it, so these placeholders render empty and
// break the script. The server should assemble the model input itself rather
// than interpolating tensor values into client-side code.
var data = [[
[{{ test_data[-1, :][0] }}],
[{{ test_data[-2, :][0] }}],
[{{ test_data[-3, :][0] }}],
[{{ test_data[-4, :][0] }}]
]];
// NOTE(review): sending a model file path from the browser and loading it
// server-side is unsafe — the server should use a fixed, trusted path.
var model = "{{ url_for('static', filename='model.pt') }}";
$.ajax({
type: "POST",
url: "{{ url_for('predict') }}",
data: JSON.stringify({
"year": year,
"month": month,
"data": data,
"model": model
}),
contentType: "application/json; charset=utf-8",
dataType: "json",
success: function(result) {
document.getElementById("result").innerHTML = result.predicted_sales.toFixed(2);
},
error: function(xhr, textStatus, errorThrown) {
layer.msg("Error: " + xhr.responseText);
}
});
}
</script>
</body>
</html>
```
app.py:
```python
import os
import json

import numpy as np
import torch
from flask import Flask, render_template, request
# Bug fix: Dataset and np are referenced later in this module (SalesDataset's
# base class and the /predict handler) but were never imported, causing a
# NameError at import/request time.
from torch.utils.data import Dataset

app = Flask(__name__)
# Random per-process secret key; sessions do not survive a restart.
app.config['SECRET_KEY'] = os.urandom(24)
class SalesDataset(Dataset):
    """Dataset over a 2-D array-like: per row, all but the last column are
    the features and the last column is the target."""

    def __init__(self, data):
        # *data* must support 2-D indexing like data[i, :-1] (e.g. a tensor).
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        features = self.data[index, :-1]
        target = self.data[index, -1]
        return features, target
# Serve the prediction page.
@app.route('/')
def index():
return render_template('predict.html')
# Predict a sales figure from the JSON body posted by predict.html.
# SECURITY NOTE(review): request.json['model'] is a client-controlled path fed
# straight into torch.load, which unpickles the file — a malicious client can
# point it at an arbitrary file and achieve code execution. Load a fixed,
# server-side model path instead. Also note that *year* and *month* are read
# but never used by the computation below.
@app.route('/predict', methods=['POST'])
def predict():
year = request.json['year']
month = request.json['month']
data = torch.from_numpy(np.array(request.json['data'])).type(torch.Tensor)
model = torch.load(request.json['model'])
predicted_sales = model(data).item()
return json.dumps({'predicted_sales': predicted_sales})
if __name__ == '__main__':
app.run(debug=True)
```
实现C4.5算法,验证算法的正确性,并将算法应用于C:/Users/Administrator/Desktop/Bank-data
C4.5算法是一种常用的决策树算法,用于分类和预测。具体实现步骤如下:
1. 数据预处理:将数据集按照一定比例分为训练集和测试集,并将数据离散化。
2. 特征选择:计算每个特征的信息增益或信息增益比,选择信息增益或信息增益比最大的特征作为划分属性。
3. 决策树生成:以选定的划分属性作为根节点,将数据集分为多个子集,并递归生成子树。
4. 决策树剪枝:通过预测错误率进行剪枝,得到最终的决策树。
下面是Python实现C4.5算法的代码:
```python
import numpy as np
import pandas as pd
from math import log
# Decision-tree node: internal nodes carry the splitting feature, leaves
# carry the predicted class label.
class Node:
    def __init__(self, feature, label):
        # Index of the attribute this node splits on (None for leaf nodes).
        self.feature = feature
        # Class label for leaf nodes (None for internal nodes).
        self.label = label
        # Maps each attribute value to the corresponding child node.
        self.child = {}
# Shannon entropy (base 2) of the class labels in *data*; each example is a
# sequence whose last element is the label.
def cal_entropy(data):
    total = len(data)
    counts = {}
    for example in data:
        label = example[-1]
        counts[label] = counts.get(label, 0) + 1
    result = 0.0
    for count in counts.values():
        prob = float(count) / total
        result -= prob * log(prob, 2)
    return result
# Information gain (ID3 criterion) of splitting *data* on column *feature*.
def cal_info_gain(data, feature):
    total = len(data)
    # Group the examples by the value they take on *feature*.
    partitions = {}
    for example in data:
        partitions.setdefault(example[feature], []).append(example)
    # Entropy of the labels conditioned on the partitioning.
    conditional = 0.0
    for subset in partitions.values():
        weight = float(len(subset)) / total
        conditional += weight * cal_entropy(subset)
    return cal_entropy(data) - conditional
# Gain ratio (C4.5 criterion): information gain normalised by the split
# information of the partitioning; returns 0 when the split information is 0
# (i.e. *feature* takes a single value).
def cal_info_gain_ratio(data, feature):
    total = len(data)
    # Group the examples by the value they take on *feature*.
    partitions = {}
    for example in data:
        partitions.setdefault(example[feature], []).append(example)
    conditional = 0.0
    split_info = 0.0
    for subset in partitions.values():
        weight = float(len(subset)) / total
        conditional += weight * cal_entropy(subset)
        split_info -= weight * log(weight, 2)
    if split_info == 0:
        return 0
    return (cal_entropy(data) - conditional) / split_info
# Pick the feature in *feature_list* with the highest score under the given
# criterion ('ID3' -> information gain, 'C4.5' -> gain ratio); returns -1
# when no feature improves on a zero score.
def choose_best_feature(data, feature_list, algorithm='ID3'):
    best_feature = -1
    best_gain = 0.0
    for feature in feature_list:
        if algorithm == 'ID3':
            gain = cal_info_gain(data, feature)
        elif algorithm == 'C4.5':
            gain = cal_info_gain_ratio(data, feature)
        else:
            # Bug fix: the original left *info_gain* unbound for any other
            # value and crashed with a NameError; fail fast and clearly.
            raise ValueError("algorithm must be 'ID3' or 'C4.5', got %r" % (algorithm,))
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
    return best_feature
# Return the examples whose *feature* column equals *value*, with that
# column removed from each returned example.
#
# Bug fix: the original referenced an undefined name ``key`` and omitted the
# ``value`` parameter, even though create_tree calls this function with three
# arguments — as written it raised NameError/TypeError.
def split_data(data, feature, value):
    sub_data = []
    for example in data:
        if example[feature] == value:
            reduced_example = example[:feature]
            reduced_example.extend(example[feature + 1:])
            sub_data.append(reduced_example)
    return sub_data
# Recursively build a decision tree (ID3 or C4.5) over *data* using the
# candidate attribute indices in *feature_list*.
# NOTE(review): ``feature_list.remove(best_feature)`` mutates the caller's
# list in place — pass a copy if the list is reused. Also, this calls
# split_data(data, best_feature, value) with three arguments while the
# split_data defined above takes only two — confirm the intended signature.
def create_tree(data, feature_list, algorithm='ID3'):
label_list = [example[-1] for example in data]
# If every example carries the same label, return a single leaf node.
if label_list.count(label_list[0]) == len(label_list):
return Node(None, label_list[0])
# If no attributes remain, return a leaf labelled with the majority class.
if len(feature_list) == 0:
label_count = {}
for i in range(len(label_list)):
if label_list[i] not in label_count.keys():
label_count[label_list[i]] = 0
label_count[label_list[i]] += 1
max_count = 0
for key in label_count:
if label_count[key] > max_count:
max_count = label_count[key]
max_label = key
return Node(None, max_label)
# Choose the attribute with the best gain / gain ratio.
best_feature = choose_best_feature(data, feature_list, algorithm)
# If no attribute yields positive gain (best_feature == -1), fall back to a
# majority-class leaf.
if best_feature == -1:
label_count = {}
for i in range(len(label_list)):
if label_list[i] not in label_count.keys():
label_count[label_list[i]] = 0
label_count[label_list[i]] += 1
max_count = 0
for key in label_count:
if label_count[key] > max_count:
max_count = label_count[key]
max_label = key
return Node(None, max_label)
# Recurse: one subtree per distinct value of the chosen attribute.
feature_list.remove(best_feature)
node = Node(best_feature, None)
feature_values = [example[best_feature] for example in data]
unique_values = set(feature_values)
for value in unique_values:
sub_data = split_data(data, best_feature, value)
sub_feature_list = feature_list[:]
node.child[value] = create_tree(sub_data, sub_feature_list, algorithm)
return node
# Classify a single example *test_data* by walking the tree.
# NOTE(review): this function appears to be ported from a dict-based tree and
# does not match the Node structure built by create_tree:
#   - tree.child maps attribute values to Node objects, so second_dict is a
#     Node and ``second_dict.keys()`` raises AttributeError;
#   - tree.feature stores an integer column index, but it is looked up in
#     feature_labels (a list of column *names*), so .index() will raise
#     ValueError;
#   - class_label is unbound (UnboundLocalError) when no branch matches.
# Needs a rework in step with create_tree — flagged rather than rewritten here.
def classify(tree, feature_labels, test_data):
first_str = list(tree.child.keys())[0]
second_dict = tree.child[first_str]
feature_index = feature_labels.index(tree.feature)
for key in second_dict.keys():
if test_data[feature_index] == key:
if type(second_dict[key]).__name__ == 'Node':
class_label = classify(second_dict[key], feature_labels, test_data)
else:
class_label = second_dict[key]
return class_label
# Fraction of examples in *test_data_set* that *tree* misclassifies.
def cal_error_rate(tree, feature_labels, test_data_set):
    mistakes = 0
    for example in test_data_set:
        predicted = classify(tree, feature_labels, example)
        if predicted != example[-1]:
            mistakes += 1
    return float(mistakes) / len(test_data_set)
# Script entry point: load the bank data, build a C4.5 tree on a 70/30
# train/test split, and report the test error rate.
# NOTE(review): the CSV path is a hard-coded absolute Windows path; continuous
# attributes are never discretised despite the description above; print(tree)
# shows only the default object repr because Node defines no __repr__; and
# cal_error_rate/classify look up integer feature indices among string column
# names — confirm before relying on the reported error rate.
if __name__ == '__main__':
# Read the data.
data = pd.read_csv('C:/Users/Administrator/Desktop/Bank-data.csv')
# Preprocess: drop missing values, then shuffle the rows.
data = data.dropna() # drop rows with missing values
data = data.sample(frac=1).reset_index(drop=True) # shuffle the dataset
n = len(data)
m = len(data.columns) - 1
train_data = data[:int(0.7*n)]
test_data = data[int(0.7*n):]
train_data_set = train_data.values.tolist()
test_data_set = test_data.values.tolist()
# Candidate feature indices (all columns except the label).
feature_list = list(range(m))
# Build the decision tree with the C4.5 criterion.
algorithm = 'C4.5'
tree = create_tree(train_data_set, feature_list, algorithm)
# Print the resulting tree.
print(tree)
# Classify the test set and report the error rate.
feature_labels = list(data.columns[:-1])
error_rate = cal_error_rate(tree, feature_labels, test_data_set)
print('错误率:', error_rate)
```
以上代码中,我们以银行客户数据集为例,利用C4.5算法构建决策树,并对模型进行测试并计算错误率。