首页 > spark dataframe有三列col1，col2，col3，对col1和col2进行tfidf处理，然后分别进行minmax归一化处理，给出scala代码

spark dataframe有三列col1，col2，col3，对col1和col2进行tfidf处理，然后分别进行minmax归一化处理，给出scala代码

时间: 2024-01-27 13:03:28 浏览: 162

下面是对col1和col2进行tfidf处理并分别进行minmax归一化处理的scala代码：

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, IDF, MinMaxScaler, Tokenizer}
import org.apache.spark.sql.DataFrame

// Assumes the input DataFrame is named `df` and that col1 and col2 are
// non-null, whitespace-delimited string columns (Tokenizer splits on whitespace).

/**
 * Computes TF-IDF features for one string column and rescales every feature
 * dimension to [0, 1] with min-max normalization.
 *
 * @param data        input DataFrame containing `inputCol`
 * @param inputCol    name of the string column to vectorize
 * @param numFeatures number of hash buckets used by HashingTF
 * @return `data` plus one vector column named `features_scaled_<inputCol>`;
 *         all intermediate columns are dropped
 */
def tfidfMinMax(data: DataFrame, inputCol: String, numFeatures: Int = 10000): DataFrame = {
  val tokensCol = s"${inputCol}_tokens"
  val rawCol    = s"${inputCol}_raw_features"
  val tfidfCol  = s"${inputCol}_tfidf"
  val scaledCol = s"features_scaled_$inputCol"

  // HashingTF expects an array of terms, so the raw string must be tokenized first.
  val tokenizer = new Tokenizer().setInputCol(inputCol).setOutputCol(tokensCol)
  val hashingTF = new HashingTF()
    .setInputCol(tokensCol)
    .setOutputCol(rawCol)
    .setNumFeatures(numFeatures)
  val idf = new IDF().setInputCol(rawCol).setOutputCol(tfidfCol)
  // MinMaxScaler (not MaxAbsScaler) performs min-max normalization; defaults to [0, 1].
  // Its output is a dense vector, so memory per row grows with numFeatures.
  val scaler = new MinMaxScaler().setInputCol(tfidfCol).setOutputCol(scaledCol)

  new Pipeline()
    .setStages(Array(tokenizer, hashingTF, idf, scaler))
    .fit(data)
    .transform(data)
    .drop(tokensCol, rawCol, tfidfCol)
}

// Each column gets its own independently fitted IDF model and scaler.
val scaled = Seq("col1", "col2").foldLeft(df)((acc, c) => tfidfMinMax(acc, c))

// Add "col3" to the select if the untouched third column is needed downstream.
scaled.select("features_scaled_col1", "features_scaled_col2")
```

阅读全文