请用scala编写一个ID3决策树分类算法
时间: 2023-09-15 13:19:59 浏览: 155
好的,以下是一个简单的ID3决策树分类算法的Scala实现:
```scala
import scala.collection.mutable.ArrayBuffer
/**
 * A node of the decision tree.
 *
 * @param attribute column index this node splits on (leaf nodes created by `buildTree` use -1)
 * @param children  subtrees keyed by the value taken by the split attribute; empty for a leaf
 * @param result    the predicted class for a leaf, `None` for an internal node
 */
case class TreeNode(
  attribute: Int,
  children: Map[String, TreeNode],
  result: Option[String]
)
/**
 * Entry point: builds a decision tree for `data`.
 *
 * @param data   one row per sample; the class value is read from column `labels.length - 1`
 * @param labels column names; every column except the last is offered as a split candidate
 * @return the root node of the tree produced by `buildTree`
 */
def id3(data: Array[Array[String]], labels: Array[String]): TreeNode = {
  // Candidate attributes are all column indices but the last (class) column.
  val featureIndices = (0 until labels.length - 1).toArray
  buildTree(data, labels, featureIndices)
}
/**
 * Recursively grows the tree for the given samples.
 *
 * A leaf carrying the majority class is produced when all samples share one class
 * or when no candidate attribute is left; otherwise the data is partitioned on the
 * attribute returned by `chooseBestAttribute` and one subtree is built per value.
 *
 * @param data       samples reaching this node (must be non-empty: the majority
 *                   class is computed up front and `maxBy` fails on an empty tally)
 * @param labels     column names, the last one naming the class column
 * @param attributes column indices still available for splitting
 * @return a leaf (`attribute == -1`, `result` defined) or an internal node
 */
def buildTree(data: Array[Array[String]], labels: Array[String], attributes: Array[Int]): TreeNode = {
  val tally = countClasses(data, labels)
  // Computed eagerly, before the leaf test, exactly like the majority vote it feeds.
  val (majorityClass, _) = tally.maxBy { case (_, occurrences) => occurrences }

  val isLeaf = tally.size == 1 || attributes.isEmpty
  if (isLeaf) {
    TreeNode(-1, Map.empty[String, TreeNode], Some(majorityClass))
  } else {
    val splitOn = chooseBestAttribute(data, labels, attributes)
    // An attribute is used at most once along any root-to-leaf path.
    val remaining = attributes.filter(_ != splitOn)
    val branches = splitData(data, labels, splitOn).map { entry =>
      val (value, subset) = entry
      value -> buildTree(subset, labels, remaining)
    }
    TreeNode(splitOn, branches.toMap, None)
  }
}
/**
 * Tallies how many samples fall into each class.
 *
 * @param data   samples; the class value is taken from column `labels.length - 1`
 * @param labels column names, used only to locate the class column
 * @return class value -> number of rows carrying it (empty map for empty `data`)
 */
def countClasses(data: Array[Array[String]], labels: Array[String]): Map[String, Int] = {
  val classColumn = labels.length - 1
  // Fold in row order so classes enter the map in first-seen order.
  data.iterator
    .map(row => row(classColumn))
    .foldLeft(Map.empty[String, Int]) { (tally, cls) =>
      tally.updated(cls, tally.getOrElse(cls, 0) + 1)
    }
}
/**
 * Picks the attribute with the highest information gain, as ID3 requires.
 *
 * Information gain is the entropy of `data` minus the weighted average entropy of
 * the partitions obtained by grouping `data` on the attribute. The previous
 * implementation maximised the weighted entropy itself, which selects the *least*
 * informative attribute.
 *
 * @param data       samples to evaluate (rows must contain every index in `attributes`)
 * @param labels     column names, the last one naming the class column
 * @param attributes candidate column indices; must be non-empty (`maxBy` fails otherwise)
 * @return the column index with maximal information gain (first one wins on ties)
 */
def chooseBestAttribute(data: Array[Array[String]], labels: Array[String], attributes: Array[Int]): Int = {
  val baseEntropy = calculateEntropy(data, labels)
  attributes.maxBy { attribute =>
    // Weighted entropy remaining after splitting on this attribute.
    val conditionalEntropy = data
      .groupBy(row => row(attribute))
      .values
      .map(subset => calculateEntropy(subset, labels) * subset.length / data.length)
      .sum
    baseEntropy - conditionalEntropy
  }
}
/**
 * Shannon entropy (base 2) of the class distribution in `data`.
 *
 * @param data   samples whose class column is `labels.length - 1`
 * @param labels column names, used only to locate the class column
 * @return the entropy in bits; 0.0 when `data` is empty or contains a single class
 */
def calculateEntropy(data: Array[Array[String]], labels: Array[String]): Double = {
  val sampleCount = data.length
  val bitsPerClass = countClasses(data, labels).values.map { occurrences =>
    val p = occurrences.toDouble / sampleCount
    -p * math.log(p) / math.log(2)
  }
  bitsPerClass.sum
}
/**
 * Partitions the samples by the value they take in column `attribute`.
 *
 * Rows are returned unchanged. The split column is deliberately NOT removed:
 * callers keep addressing columns by their original indices (candidate attributes,
 * and the class column at `labels.length - 1`), so dropping a column would shift
 * those indices and make the class lookup run past the end of the row.
 *
 * @param data      samples to partition
 * @param labels    column names (unused here; kept for a uniform signature)
 * @param attribute index of the column to group on
 * @return attribute value -> the rows having that value, in their original order
 */
def splitData(data: Array[Array[String]], labels: Array[String], attribute: Int): Map[String, Array[Array[String]]] = {
  // Plain groupBy yields a strict immutable Map, avoiding the lazy mapValues view.
  data.groupBy(row => row(attribute))
}
```
这个实现的ID3算法接受一个数据集的二维数组和一个列名数组作为输入,并返回ID3决策树的根节点。其中,数据集的每一行表示一个样本,前面的各列是特征值,最后一列是该样本所属的类别;列名数组(labels)依次给出每一列的名称,其最后一个元素是类别列的名称。
使用该算法的示例代码:
```scala
// Column names: four attributes followed by the class column.
val labels: Array[String] = Array("Outlook", "Temperature", "Humidity", "Wind", "PlayTennis")

// One row per sample, columns in the same order as `labels`.
val data: Array[Array[String]] = Array(
  Array("Sunny",    "Hot",  "High",   "Weak",   "No"),
  Array("Sunny",    "Hot",  "High",   "Strong", "No"),
  Array("Overcast", "Hot",  "High",   "Weak",   "Yes"),
  Array("Rain",     "Mild", "High",   "Weak",   "Yes"),
  Array("Rain",     "Cool", "Normal", "Weak",   "Yes"),
  Array("Rain",     "Cool", "Normal", "Strong", "No"),
  Array("Overcast", "Cool", "Normal", "Strong", "Yes"),
  Array("Sunny",    "Mild", "High",   "Weak",   "No"),
  Array("Sunny",    "Cool", "Normal", "Weak",   "Yes"),
  Array("Rain",     "Mild", "Normal", "Weak",   "Yes"),
  Array("Sunny",    "Mild", "Normal", "Strong", "Yes"),
  Array("Overcast", "Mild", "High",   "Strong", "Yes"),
  Array("Overcast", "Hot",  "Normal", "Weak",   "Yes"),
  Array("Rain",     "Mild", "High",   "Strong", "No")
)

// Build the tree over all attribute columns.
val rootNode: TreeNode = id3(data, labels)
```
这个示例代码使用了一个简单的天气预测数据集,并构建了一个ID3决策树。你可以根据自己的需求替换数据集和标签数组来使用这个算法。
阅读全文