Spark: value reduceByKey is not a member
After clustering some sparse vectors, I need to find the intersecting vectors within each cluster. To do that, I try to reduce the MLlib vectors, as in the following example:
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
// For sparse vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}

object Recommend {

  def main(args: Array[String]) {
    // set up environment
    val conf = new SparkConf()
      .setAppName("Test")
      .set("spark.executor.memory", "2g")
    val sc = new SparkContext(conf)

    // Some vectors
    val vLen = 1800

    val sv11: Vector = Vectors.sparse(vLen, Seq((100, 1.0), (110, 1.0), (120, 1.0), (130, 1.0)))
    val sv12: Vector = Vectors.sparse(vLen, Seq((100, 1.0), (110, 1.0), (120, 1.0), (130, 1.0), (140, 1.0)))
    val sv13: Vector = Vectors.sparse(vLen, Seq((100, 1.0), (120, 1.0), (130, 1.0)))
    val sv14: Vector = Vectors.sparse(vLen, Seq((110, 1.0), (130, 1.0)))
    val sv15: Vector = Vectors.sparse(vLen, Seq((140, 1.0)))

    val sv21: Vector = Vectors.sparse(vLen, Seq((200, 1.0), (210, 1.0), (220, 1.0), (230, 1.0)))
    val sv22: Vector = Vectors.sparse(vLen, Seq((200, 1.0), (210, 1.0), (220, 1.0), (230, 1.0), (240, 1.0)))
    val sv23: Vector = Vectors.sparse(vLen, Seq((200, 1.0), (220, 1.0), (230, 1.0)))
    val sv24: Vector = Vectors.sparse(vLen, Seq((210, 1.0), (230, 1.0)))
    val sv25: Vector = Vectors.sparse(vLen, Seq((240, 1.0)))

    val sv31: Vector = Vectors.sparse(vLen, Seq((300, 1.0), (310, 1.0), (320, 1.0), (330, 1.0)))
    val sv32: Vector = Vectors.sparse(vLen, Seq((300, 1.0), (310, 1.0), (320, 1.0), (330, 1.0), (340, 1.0)))
    val sv33: Vector = Vectors.sparse(vLen, Seq((300, 1.0), (320, 1.0), (330, 1.0)))
    val sv34: Vector = Vectors.sparse(vLen, Seq((310, 1.0), (330, 1.0)))
    val sv35: Vector = Vectors.sparse(vLen, Seq((340, 1.0)))

    val sparseData = sc.parallelize(Seq(
      sv11, sv12, sv13, sv14, sv15,
      sv21, sv22, sv23, sv24, sv25,
      sv31, sv32, sv33, sv34, sv35
    ))

    // Cluster the data into three clusters using KMeans
    val numClusters = 3
    val numIterations = 20

    test(numClusters, numIterations, sparseData)
  }

  def test(numClusters: Int, numIterations: Int,
           data: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector]) = {
    val clusters = KMeans.train(data, numClusters, numIterations)
    val predictions = data.map(v => (clusters.predict(v), v))
    predictions.reduceByKey((v1, v2) => v1)
  }
}
The line

predictions.reduceByKey((v1, v2) => v1)

causes the error:

value reduceByKey is not a member of org.apache.spark.rdd.RDD[(Int, org.apache.spark.mllib.linalg.Vector)]

What is the cause of this?
As you have already guessed, your code is missing this import:

import org.apache.spark.SparkContext._

Why? Because that import brings a number of implicit conversions into scope, the most important one (for your case) being the PairRDD implicit conversion (PairRDDFunctions). Whenever you have an RDD of tuples, Spark treats the left-hand element as a key and therefore gives you access to key-based operations such as reduceByKey.
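For reference, here is a minimal sketch of the test method from the question with that import in place. Only the import and the explicit return type are new; everything else is the question's own code. This assumes a Spark 1.x / MLlib build; on Spark 1.3 and later these implicits are brought into scope automatically, so the extra import is harmless but may no longer be strictly necessary.

import org.apache.spark.SparkContext._   // brings the PairRDDFunctions implicits into scope
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

object Recommend {

  // Same logic as in the question; only the import above is new.
  def test(numClusters: Int, numIterations: Int, data: RDD[Vector]): RDD[(Int, Vector)] = {
    val clusters = KMeans.train(data, numClusters, numIterations)
    // Key each vector by the cluster it was assigned to.
    val predictions = data.map(v => (clusters.predict(v), v))
    // With the PairRDD implicits in scope, RDD[(Int, Vector)] gains reduceByKey.
    predictions.reduceByKey((v1, v2) => v1)
  }
}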