已知:

```python
# 计算信息熵
def cal_entropy(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    # 给所有可能分类创建字典
    for featVec in dataset:
        currentlabel = featVec[-1]
        if currentlabel not in labelCounts.keys():
            labelCounts[currentlabel] = 0
        labelCounts[currentlabel] += 1
    Ent = 0.0
    # *** START CODE HERE ***
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        Ent -= prob*log(prob,2)  # 计算信息熵Ent
    # *** END CODE HERE ***
    return Ent

# 划分数据集
def splitdataset(dataset, axis, value):
    retdataset = []  # 创建返回的数据集列表
    for featVec in dataset:  # 抽取符合划分特征的值
        if featVec[axis] == value:
            reducedfeatVec = featVec[:axis]  # 去掉axis特征
            reducedfeatVec.extend(featVec[axis + 1:])  # 将符合条件的特征添加到返回的数据集列表
            retdataset.append(reducedfeatVec)
    return retdataset
```

请补全:

```python
# CART算法
def CART_chooseBestFeatureToSplit(dataset):
    numFeatures = len(dataset[0]) - 1
    bestGini = 999999.0
    bestFeature = -1
    # *** START CODE HERE ***
    # 利用CART的原理以及仿照ID3_chooseBestFeatureToSplit代码进行编码
    # *** END CODE HERE ***
    return bestFeature
```
时间: 2023-07-15 07:13:53 浏览: 52
补全后的代码如下:
```python
from collections import Counter

import numpy as np
# 计算基尼不纯度
def cal_gini(dataset):
    """Return the Gini impurity of the class labels in ``dataset``.

    The last element of every row is taken as that row's class label. The
    impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of rows
    carrying label ``k``.

    Args:
        dataset: Sequence of rows; ``row[-1]`` is the class label.

    Returns:
        The impurity as a float. It is 0.0 when every row has the same label
        and also 0.0 for an empty dataset (nothing to be impure about).
    """
    numEntries = len(dataset)
    if numEntries == 0:
        # Without this guard an empty input would report 1.0, a value no
        # real label distribution can reach.
        return 0.0

    labelCounts = Counter(featVec[-1] for featVec in dataset)

    Gini = 1.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        Gini -= prob * prob
    return Gini
# 划分数据集
def splitdataset(dataset, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``, dropping that feature.

    Args:
        dataset: Sequence of rows (any sliceable sequence, e.g. list or tuple).
        axis: Index of the feature column to test and remove.
        value: Feature value a row must have at ``axis`` to be kept.

    Returns:
        A new list of lists: each kept row with the element at ``axis``
        removed. The input rows are left unmodified.
    """
    # Converting both slices to lists lets tuple (or other sequence) rows work,
    # where calling ``.extend`` on the slice would fail.
    return [
        list(featVec[:axis]) + list(featVec[axis + 1:])
        for featVec in dataset
        if featVec[axis] == value
    ]
# CART算法选择最优特征
def CART_chooseBestFeatureToSplit(dataset):
    """Pick the feature whose split yields the lowest weighted Gini impurity.

    For every feature column (all columns except the last, which holds the
    class label) the dataset is split into one subset per distinct feature
    value. The subsets' Gini impurities are averaged, weighted by subset
    size, and the feature with the smallest weighted impurity wins. Ties go
    to the lowest feature index.

    Args:
        dataset: Sequence of rows; ``row[-1]`` is the class label and the
            preceding elements are feature values.

    Returns:
        Index of the best feature, or -1 when the dataset is empty or has no
        feature columns.
    """
    if len(dataset) == 0:
        # No rows to inspect; report "no feature" instead of failing on dataset[0].
        return -1

    numFeatures = len(dataset[0]) - 1
    totalEntries = float(len(dataset))
    bestGini = float("inf")
    bestFeature = -1

    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataset}
        newGini = 0.0
        for value in uniqueVals:
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / totalEntries
            newGini += prob * cal_gini(subdataset)
        # Strict comparison keeps the earliest feature on ties.
        if newGini < bestGini:
            bestGini = newGini
            bestFeature = i
    return bestFeature
```
其中,cal_gini() 函数用来计算基尼不纯度,splitdataset() 函数按指定特征的取值划分数据集,CART_chooseBestFeatureToSplit() 函数选择加权基尼不纯度最小的特征作为最优划分特征。