如何在 Spark 中获取 spark.ml NaiveBayes 概率向量而不是 [0-1] class?
How to get spark.ml NaiveBayes probability vector not [0-1] class in Spark?
我正在研究 NaiveBayes 分类器,我可以使用经过训练的模型预测单个数据点的值,但我想获得概率值。
数据只分为两个类,而预测函数只返回 0 或 1。
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
 * Minimal reproduction from the question: trains a spark.ml NaiveBayes model on
 * `data/labelled.csv` and then tries to obtain class probabilities for a single point.
 *
 * NOTE: this snippet intentionally does not compile. The `predictProbability` call at the
 * end is rejected with error [1] because the method is protected.
 */
object Test {
def main(args: Array[String]): Unit = {
// Turn off logging for the "org" and "akka" logger hierarchies.
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
val spark = SparkSession.builder.appName("Test").master("local[4]").getOrCreate
// Read the CSV with schema inference enabled.
val dataset = spark.read.option("inferSchema", "true").csv("data/labelled.csv").toDF()
// Presumably needed for the implicit encoder used by `dataset.map` below.
import spark.sqlContext.implicits._
// Columns 0 and 1 become the feature vector, column 2 becomes the label.
val output = dataset.map(row => {
LabeledPoint(row.getInt(2), Vectors.dense( row.getInt(0) , row.getInt(1)))
})
// 70/30 split with a fixed seed; only `training` is used afterwards.
val Array(training, test) = output.randomSplit(Array(0.7, 0.3),seed = 11L)
training.cache()
val model : NaiveBayesModel = new NaiveBayes().fit(training)
val speed = 110
val hour = 11
// Works: returns the predicted label as a Double.
val label1 : Double = model.predict(Vectors.dense(speed,hour))
// UPDATE: attempt to get the per-class probabilities instead of the label.
val label = model.predictProbability(Vectors.dense(speed,hour)) // Does not compile: raises error [1] (protected method)
}
}
[1] 使用 model.predictProbability 时出现的错误:
Error:(24, 23) method predictProbability in class
ProbabilisticClassificationModel cannot be accessed in
org.apache.spark.ml.classification.NaiveBayesModel Access to
protected method predictProbability not permitted because enclosing
object Test is not a subclass of class
ProbabilisticClassificationModel in package classification where
target is defined
val label = model.predictProbability(Vectors.dense(speed,hour))
经过多次研究,我没有在 spark.ml 库中找到此功能,但我能够使用 spark.mllib 做到这一点,代码应修改为:
import org.apache.log4j.{Level, Logger}
// Import NaiveBayes, NaiveBayesModel from mlib
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
// Import LabeledPoint, Vectors from mlib to create dataset
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
 * Trains a spark.mllib NaiveBayes model on `data/labelled.csv` and prints, for a single
 * (speed, hour) point, both the predicted label and the per-class probabilities.
 *
 * The RDD-based spark.mllib API is used because its `NaiveBayesModel` exposes
 * `predictProbabilities` as a public method.
 */
object Test {
  def main(args: Array[String]): Unit = {
    // Turn off logging for the "org" and "akka" logger hierarchies to keep the console readable.
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)

    val spark = SparkSession.builder.appName("Test").master("local[4]").getOrCreate()
    try {
      val dataset = spark.read.option("inferSchema", "true").csv("data/labelled.csv").toDF()

      // spark.mllib trains on RDD[LabeledPoint], so map over the underlying RDD[Row] directly:
      // no Dataset encoder and no detour through JavaRDD is required.
      // Columns 0 and 1 are the features, column 2 is the class label.
      val output = dataset.rdd.map { row =>
        LabeledPoint(row.getInt(2), Vectors.dense(row.getInt(0), row.getInt(1)))
      }

      // 70/30 split with a fixed seed; only the training part is used below.
      val Array(training, _) = output.randomSplit(Array(0.7, 0.3), seed = 11L)
      training.cache()

      // The spark.mllib NaiveBayes is trained with `run` rather than `fit`.
      val model: NaiveBayesModel = new NaiveBayes().run(training)

      val speed = 110
      val hour = 11
      val point = Vectors.dense(speed, hour)

      // Predicted class label for the point.
      val label1: Double = model.predict(point)
      // Probability of each class for the point.
      val testLabel = model.predictProbabilities(point)

      // Show the results; otherwise both values are computed and silently discarded.
      println(s"predicted label: $label1")
      println(s"class probabilities: $testLabel")
    } finally {
      // Release the local Spark resources even if reading or training fails.
      spark.stop()
    }
  }
}
我正在研究 NaiveBayes 分类器,我可以使用经过训练的模型预测单个数据点的值,但我想获得概率值。
数据只分为两个类,而预测函数只返回 0 或 1。
import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
 * Minimal reproduction from the question: trains a spark.ml NaiveBayes model on
 * `data/labelled.csv` and then tries to obtain class probabilities for a single point.
 *
 * NOTE: this snippet intentionally does not compile. The `predictProbability` call at the
 * end is rejected with error [1] because the method is protected.
 */
object Test {
def main(args: Array[String]): Unit = {
// Turn off logging for the "org" and "akka" logger hierarchies.
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
val spark = SparkSession.builder.appName("Test").master("local[4]").getOrCreate
// Read the CSV with schema inference enabled.
val dataset = spark.read.option("inferSchema", "true").csv("data/labelled.csv").toDF()
// Presumably needed for the implicit encoder used by `dataset.map` below.
import spark.sqlContext.implicits._
// Columns 0 and 1 become the feature vector, column 2 becomes the label.
val output = dataset.map(row => {
LabeledPoint(row.getInt(2), Vectors.dense( row.getInt(0) , row.getInt(1)))
})
// 70/30 split with a fixed seed; only `training` is used afterwards.
val Array(training, test) = output.randomSplit(Array(0.7, 0.3),seed = 11L)
training.cache()
val model : NaiveBayesModel = new NaiveBayes().fit(training)
val speed = 110
val hour = 11
// Works: returns the predicted label as a Double.
val label1 : Double = model.predict(Vectors.dense(speed,hour))
// UPDATE: attempt to get the per-class probabilities instead of the label.
val label = model.predictProbability(Vectors.dense(speed,hour)) // Does not compile: raises error [1] (protected method)
}
}
[1] 使用 model.predictProbability 时出现的错误:
Error:(24, 23) method predictProbability in class ProbabilisticClassificationModel cannot be accessed in org.apache.spark.ml.classification.NaiveBayesModel Access to protected method predictProbability not permitted because enclosing object Test is not a subclass of class ProbabilisticClassificationModel in package classification where target is defined val label = model.predictProbability(Vectors.dense(speed,hour))
经过多次研究,我没有在 spark.ml 库中找到此功能,但我能够使用 spark.mllib 做到这一点,代码应修改为:
import org.apache.log4j.{Level, Logger}
// Import NaiveBayes, NaiveBayesModel from mlib
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
// Import LabeledPoint, Vectors from mlib to create dataset
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession
/**
 * Trains a spark.mllib NaiveBayes model on `data/labelled.csv` and prints, for a single
 * (speed, hour) point, both the predicted label and the per-class probabilities.
 *
 * The RDD-based spark.mllib API is used because its `NaiveBayesModel` exposes
 * `predictProbabilities` as a public method.
 */
object Test {
  def main(args: Array[String]): Unit = {
    // Turn off logging for the "org" and "akka" logger hierarchies to keep the console readable.
    Logger.getLogger("org").setLevel(Level.OFF)
    Logger.getLogger("akka").setLevel(Level.OFF)

    val spark = SparkSession.builder.appName("Test").master("local[4]").getOrCreate()
    try {
      val dataset = spark.read.option("inferSchema", "true").csv("data/labelled.csv").toDF()

      // spark.mllib trains on RDD[LabeledPoint], so map over the underlying RDD[Row] directly:
      // no Dataset encoder and no detour through JavaRDD is required.
      // Columns 0 and 1 are the features, column 2 is the class label.
      val output = dataset.rdd.map { row =>
        LabeledPoint(row.getInt(2), Vectors.dense(row.getInt(0), row.getInt(1)))
      }

      // 70/30 split with a fixed seed; only the training part is used below.
      val Array(training, _) = output.randomSplit(Array(0.7, 0.3), seed = 11L)
      training.cache()

      // The spark.mllib NaiveBayes is trained with `run` rather than `fit`.
      val model: NaiveBayesModel = new NaiveBayes().run(training)

      val speed = 110
      val hour = 11
      val point = Vectors.dense(speed, hour)

      // Predicted class label for the point.
      val label1: Double = model.predict(point)
      // Probability of each class for the point.
      val testLabel = model.predictProbabilities(point)

      // Show the results; otherwise both values are computed and silently discarded.
      println(s"predicted label: $label1")
      println(s"class probabilities: $testLabel")
    } finally {
      // Release the local Spark resources even if reading or training fails.
      spark.stop()
    }
  }
}