函数参数中的 RDD[Vector] 出错
Error with RDD[Vector] in function parameter
我正在尝试在 Scala 中定义一个函数以使用 Spark 对其进行迭代。
这是我的代码:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.rdd._
val assembler = new VectorAssembler()
.setInputCols(Array("feature1", "feature2", "feature3"))
.setOutputCol("features")
val assembled = assembler.transform(df)
// measures the average distance to centroid, for a model built with a given k.
def clusteringScore(data: RDD[Vector],k:Int) = {
val kmeans = new KMeans()
.setK(k)
.setFeaturesCol("features")
.setPredictionCol("prediction")
val model = kmeans.fit(data)
val WSSSE = model.computeCost(data) println(s"Within Set Sum of Squared Errors = $WSSSE")
}
(5 to 40 by 5).map(k => (k, clusteringScore(assembled, k))).
foreach(println)
使用这段代码我得到了这个错误:
type Vector takes type parameters
我不知道这个错误是什么意思...
我正在尝试在 Scala 中定义一个函数以使用 Spark 对其进行迭代。 这是我的代码:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.rdd._
val assembler = new VectorAssembler()
.setInputCols(Array("feature1", "feature2", "feature3"))
.setOutputCol("features")
val assembled = assembler.transform(df)
// measures the average distance to centroid, for a model built with a given k.
def clusteringScore(data: RDD[Vector],k:Int) = {
val kmeans = new KMeans()
.setK(k)
.setFeaturesCol("features")
.setPredictionCol("prediction")
val model = kmeans.fit(data)
val WSSSE = model.computeCost(data) println(s"Within Set Sum of Squared Errors = $WSSSE")
}
(5 to 40 by 5).map(k => (k, clusteringScore(assembled, k))).
foreach(println)
使用这段代码我得到了这个错误:
type Vector takes type parameters
我不知道这个错误是什么意思...