'windy'])
return dataSet, labels
def createTestSet():
    """Build the fixed evaluation set for the decision tree.

    Returns:
        tuple[np.ndarray, np.ndarray]: a (7, 4) array of feature rows and
        the matching (7,) array of expected 'Y'/'N' class labels.
    """
    # Feature rows and their expected labels, kept side by side so each
    # (row, label) pair is easy to read and audit.
    samples = [
        ([0, 1, 0, 0], 'N'),
        ([0, 2, 1, 0], 'N'),
        ([2, 1, 1, 0], 'Y'),
        ([0, 1, 1, 1], 'N'),
        ([1, 1, 0, 1], 'Y'),
        ([1, 0, 1, 0], 'Y'),
        ([2, 1, 0, 1], 'N'),
    ]
    testSet = np.array([row for row, _ in samples])
    result = np.array([label for _, label in samples])
    return testSet, result
def datasetEntropy(dataSet):
    """Compute the Shannon entropy of a dataset's class labels.

    The class label is taken from the LAST column of every row, matching
    how the rest of this file lays out its datasets.

    Args:
        dataSet (np.ndarray): 2-d array of samples; column -1 holds the
            class label of each row.

    Returns:
        float: entropy in bits, ``-sum(p * log2(p))`` over the distinct
        label frequencies p.
    """
    classLabel = dataSet[:, -1]  # label column: one entry per row
    # Count each distinct label in one vectorized pass instead of a
    # Python-level dict-accumulation loop.
    _, counts = np.unique(classLabel, return_counts=True)
    probs = counts / classLabel.size
    # np.log2(probs) is safe: np.unique never reports a zero count.
    return float(-np.sum(probs * np.log2(probs)))
def splitDataSet(dataSet, featureIndex, value):
    """Select the rows where a feature equals `value`, then drop that feature.

    Args:
        dataSet (np.ndarray): 2-d array of samples (features + label column).
        featureIndex (int): column index of the feature to split on.
        value: feature value a row must match to be kept.

    Returns:
        np.ndarray: the matching rows with column `featureIndex` removed.
    """
    # Boolean-mask selection replaces the row-by-row append loop; it also
    # keeps the result 2-d when nothing matches, so np.delete(axis=1)
    # stays valid even for an empty subset (the list-based original raised).
    subDataset = dataSet[dataSet[:, featureIndex] == value]
    return np.delete(subDataset, featureIndex, axis=1)
def chooseBestFeatrue(dataSet, labels): #
计算最优属性
评论5