已知# 计算信息熵 def cal_entropy(dataset): numEntries = len(dataset) labelCounts = {} # 给所有可能分类创建字典 for featVec in dataset: currentlabel = featVec[-1] if currentlabel not in labelCounts.keys(): labelCounts[currentlabel] = 0 labelCounts[currentlabel] += 1 Ent = 0.0 # *** START CODE HERE *** for key in labelCounts: prob = float(labelCounts[key])/numEntries Ent -= prob*log(prob,2) # 计算信息熵Ent # *** END CODE HERE *** return Ent # 划分数据集 def splitdataset(dataset, axis, value): retdataset = [] # 创建返回的数据集列表 for featVec in dataset: # 抽取符合划分特征的值 if featVec[axis] == value: reducedfeatVec = featVec[:axis] # 去掉axis特征 reducedfeatVec.extend(featVec[axis + 1:]) # 将符合条件的特征添加到返回的数据集列表 retdataset.append(reducedfeatVec) return retdataset,请补全C4.5算法def C45_chooseBestFeatureToSplit(dataset): numFeatures = len(dataset[0]) - 1 baseEnt = cal_entropy(dataset) bestInfoGain_ratio = 0.0 bestFeature = -1 # *** START CODE HERE *** # 利用C4.5的原理以及仿照ID3_chooseBestFeatureToSplit代码进行编码 # *** END CODE HERE *** return bestFeature
时间: 2023-07-15 08:14:15 浏览: 72
在C4.5算法中,需要对ID3算法进行改进,即考虑到信息增益比。C4.5算法选择最优划分特征的过程如下:
1. 计算每个特征的信息增益比。
2. 选择信息增益比最大的特征作为最优划分特征。
3. 根据最优特征的取值将数据集划分成多个子集。
下面是C4.5算法的代码实现:
import math
# Shannon entropy of the class labels
def cal_entropy(dataset):
    """Return the Shannon entropy (base 2) of the class labels in *dataset*.

    Each record is a sequence whose last element is its class label.
    An empty dataset yields 0 entropy.
    """
    total = len(dataset)
    # Tally how many records carry each label.
    counts = {}
    for record in dataset:
        label = record[-1]
        counts[label] = counts.get(label, 0) + 1
    # Ent = -sum(p * log2(p)) over the observed label frequencies.
    ent = 0.0
    for count in counts.values():
        p = count / float(total)
        ent -= p * math.log(p, 2)
    return ent
# Partition the dataset on one feature value
def splitdataset(dataset, axis, value):
    """Return the records whose feature at index *axis* equals *value*.

    The matched feature column itself is removed from every returned
    record, so the result has one fewer column than the input.
    """
    return [
        record[:axis] + record[axis + 1:]  # drop the axis column
        for record in dataset
        if record[axis] == value
    ]
# Information gain ratio of one feature (C4.5 criterion)
def cal_infoGain_ratio(dataset, baseEnt, axis):
    """Return the C4.5 gain ratio of the feature at index *axis*.

    baseEnt is the entropy of the whole dataset. The gain ratio is
    (baseEnt - conditional entropy) / split information; 0 is returned
    when the feature takes a single value (split information of 0).
    """
    total = len(dataset)
    distinct_values = {record[axis] for record in dataset}
    cond_entropy = 0.0   # H(D|A): entropy after splitting on the feature
    split_info = 0.0     # intrinsic value of the split, SplitInfo_A(D)
    for feat_value in distinct_values:
        subset = splitdataset(dataset, axis, feat_value)
        weight = len(subset) / float(total)
        cond_entropy += weight * cal_entropy(subset)
        split_info -= weight * math.log(weight, 2)
    # A constant-valued feature cannot split the data; avoid dividing by 0.
    if split_info == 0:
        return 0
    return (baseEnt - cond_entropy) / split_info
# C4.5: pick the best feature to split on
def C45_chooseBestFeatureToSplit(dataset):
    """Return the index of the feature with the largest information gain
    ratio, or -1 if no feature achieves a ratio greater than 0.

    The last column of each record is the class label and is not a
    candidate feature.
    """
    feature_count = len(dataset[0]) - 1
    base_entropy = cal_entropy(dataset)
    best_ratio, best_index = 0.0, -1
    for idx in range(feature_count):
        ratio = cal_infoGain_ratio(dataset, base_entropy, idx)
        # Strict > keeps the earliest feature on ties, matching ID3 style.
        if ratio > best_ratio:
            best_ratio, best_index = ratio, idx
    return best_index
阅读全文