使用Scala编写Spark Word2Vec与TF-IDF加权,计算两个列表的文本相似度的代码
时间: 2024-03-05 18:52:22 浏览: 21
好的,下面是代码,注意需要先安装Spark和相关依赖库:
```scala
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer, Word2Vec}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Computes pairwise cosine similarity between the sentences of two text
 * files, using TF-IDF feature vectors built over the union of both files.
 *
 * A Word2Vec model is also trained and its output column is attached to the
 * data set.
 * NOTE(review): the "word2vec" column is never used when computing the
 * similarity; only the TF-IDF "features" column is. Confirm whether a
 * TF-IDF-weighted Word2Vec embedding was the actual intent.
 */
object Word2VecAndTFIDF {

  /**
   * Entry point: reads both lists, builds the feature pipeline and prints the
   * similarity of every (list1 sentence, list2 sentence) pair.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Word2VecAndTFIDF")
      .master("local[*]")
      .getOrCreate()

    try {
      // Read the data: each list lives in its own file, one sentence per line.
      val list1 = spark.read.textFile("path/to/list1.txt")
      val list2 = spark.read.textFile("path/to/list2.txt")

      // Merge the two lists, tagging every row with the list it came from.
      // Tagging replaces a later `isin(list.collect())` lookup, which pulled
      // both files onto the driver and attributed a sentence present in both
      // files to both lists.
      val data = list1.toDF("text").withColumn("source", lit(1))
        .union(list2.toDF("text").withColumn("source", lit(2)))

      // Tokenize the text.
      val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
      val tokenized = tokenizer.transform(data)

      // Train the Word2Vec model.
      val word2Vec = new Word2Vec()
        .setInputCol("words")
        .setOutputCol("word2vec")
        .setVectorSize(100)
        .setMinCount(0)
      val model = word2Vec.fit(tokenized)
      val result = model.transform(tokenized)

      // Compute TF-IDF.
      val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(10000)
      val featurizedData = hashingTF.transform(result)
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)

      // Extract the TF-IDF vectors that belong to one of the two lists.
      def vectorsOf(source: Int) =
        rescaledData
          .filter(col("source") === source)
          .select("features")
          .rdd
          .map { case Row(v: Vector) => v }

      // Compute the text similarity for every pair across the two lists.
      val sim = vectorsOf(1).cartesian(vectorsOf(2)).map { case (v1, v2) =>
        Row(cosineSimilarity(v1, v2))
      }

      // Output the result.
      val schema = StructType(Seq(StructField("similarity", DoubleType)))
      val simDF = spark.createDataFrame(sim, schema)
      simDF.show()
    } finally {
      // Release the session even when a stage above fails.
      spark.stop()
    }
  }

  /**
   * Cosine similarity of two vectors.
   *
   * @param v1 first vector
   * @param v2 second vector
   * @return `v1 . v2 / (|v1| * |v2|)`, or 0.0 when either vector has zero
   *         magnitude (the ratio is undefined there and would otherwise
   *         evaluate to NaN)
   */
  def cosineSimilarity(v1: Vector, v2: Vector): Double = {
    // `v.dot(v)` gives the squared L2 norm without densifying the vector.
    val magnitude1 = math.sqrt(v1.dot(v1))
    val magnitude2 = math.sqrt(v2.dot(v2))
    val denominator = magnitude1 * magnitude2
    // All-zero TF-IDF vectors are possible: a term that occurs in every
    // document receives an IDF weight of 0.
    if (denominator == 0.0) 0.0 else v1.dot(v2) / denominator
  }
}
```
这段代码中,我们首先读取存储在文件中的两个列表,然后对它们的每个句子进行分词,训练Word2Vec模型并计算TF-IDF。最后,我们从TF-IDF结果中提取出两个列表各自的向量,并计算两个列表之间所有句子对的余弦相似度,最终输出文本相似度。需要注意的是,相似度仅基于TF-IDF向量计算,Word2Vec向量虽然被生成,但并未参与相似度计算。