如何使用 CrossValidator 在不同模型之间进行选择
How to use CrossValidator to choose between different models
我知道可以使用 CrossValidator 来调优单个模型。但是,如果要对不同的模型进行相互比较评估,推荐的做法是什么?例如,假设我想使用 CrossValidator 将 LogisticRegression 分类器与 LinearSVC 分类器进行对比评估。
在稍微熟悉了 API 之后,我通过实现一个自定义 Estimator 解决了这个问题:它包装了两个或更多可供委托的 estimator,具体选用哪一个由单个 Param[Int] 控制。以下是实际的代码:
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.Model
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType
/**
 * Parameters shared by [[DelegatingEstimator]] and [[DelegatingEstimatorModel]].
 */
trait DelegatingEstimatorModelParams extends Params {

  /**
   * Zero-based index of the wrapped estimator to delegate to.
   *
   * Negative values are rejected by the param validator as soon as they are set,
   * instead of surfacing later as an out-of-bounds access during fitting. The upper
   * bound depends on how many estimators are wrapped, so it cannot be checked here.
   */
  final val selectedEstimator: Param[Int] = new Param[Int](
    this,
    "selectedEstimator",
    "The selected estimator",
    (index: Int) => index >= 0)
}
/**
 * An [[Estimator]] that wraps two or more estimators and delegates to the one whose
 * zero-based position is given by the `selectedEstimator` param (default 0).
 *
 * Because the choice is an ordinary param, it can be placed in a param grid so that a
 * tuning tool compares the wrapped estimators against each other.
 *
 * @param uid       unique identifier of this estimator
 * @param delegates the wrapped estimators, in selection order
 */
class DelegatingEstimator private (override val uid: String, delegates: Array[Estimator[_]])
  extends Estimator[DelegatingEstimatorModel] with DelegatingEstimatorModelParams {

  private def this(estimators: Array[Estimator[_]]) =
    this(Identifiable.randomUID("delegating-estimator"), estimators)

  /**
   * Creates a delegating estimator; requiring two explicit arguments guarantees that
   * at least two candidates are supplied.
   */
  def this(estimator1: Estimator[_], estimator2: Estimator[_], estimators: Estimator[_]*) = {
    this((Seq(estimator1, estimator2) ++ estimators).toArray)
  }

  setDefault(selectedEstimator -> 0)

  /**
   * Returns the currently selected delegate.
   *
   * @throws IllegalArgumentException if `selectedEstimator` is not a valid index
   */
  private def currentDelegate: Estimator[_] = {
    val index = $(selectedEstimator)
    require(
      index >= 0 && index < delegates.length,
      s"selectedEstimator must be in [0, ${delegates.length}), but was $index")
    delegates(index)
  }

  /**
   * Fits the selected delegate on `dataset` and wraps the resulting model.
   *
   * The returned model has this estimator as its parent and carries over this
   * estimator's param values (including `selectedEstimator`).
   */
  override def fit(dataset: Dataset[_]): DelegatingEstimatorModel = {
    val fitted: Model[_] = currentDelegate.fit(dataset) match {
      case m: Model[_] => m
      case other =>
        throw new IllegalStateException(s"Delegate estimator did not produce a Model: $other")
    }
    copyValues(new DelegatingEstimatorModel(uid, fitted).setParent(this))
  }

  /**
   * Copies this estimator together with its delegates.
   *
   * `extra` is passed to every delegate's own `copy`, so entries of `extra` that
   * belong to a wrapped estimator are applied to it rather than being dropped.
   */
  override def copy(extra: ParamMap): Estimator[DelegatingEstimatorModel] = {
    val copiedDelegates: Array[Estimator[_]] =
      delegates.map { d => d.copy(extra): Estimator[_] }
    copyValues(new DelegatingEstimator(uid, copiedDelegates), extra)
  }

  /**
   * Delegates the schema transformation to the selected estimator, since the wrapped
   * estimators are not guaranteed to produce identical output schemas.
   */
  override def transformSchema(schema: StructType): StructType =
    currentDelegate.transformSchema(schema)
}
/**
 * Model produced by a delegating estimator; every transformation is forwarded to the
 * wrapped `delegate` model.
 *
 * @param uid      unique identifier of this model
 * @param delegate the fitted model that performs the actual work
 */
class DelegatingEstimatorModel(override val uid: String, val delegate: Model[_])
  extends Model[DelegatingEstimatorModel] with DelegatingEstimatorModelParams {

  /**
   * Copies this model and its delegate.
   *
   * `extra` is forwarded to the delegate's `copy`; this model's own param values and
   * its parent are carried over to the new instance.
   */
  override def copy(extra: ParamMap): DelegatingEstimatorModel = {
    val copiedDelegate: Model[_] = delegate.copy(extra) match {
      case m: Model[_] => m
      case other =>
        throw new IllegalStateException(s"Copy of delegate model is not a Model: $other")
    }
    copyValues(new DelegatingEstimatorModel(uid, copiedDelegate), extra).setParent(parent)
  }

  /** Transforms `dataset` with the wrapped model. */
  override def transform(dataset: Dataset[_]): DataFrame = delegate.transform(dataset)

  /** Returns the output schema the wrapped model produces for `schema`. */
  override def transformSchema(schema: StructType): StructType =
    delegate.transformSchema(schema)
}
上面的类可以这样使用,以对 LogisticRegression 与 LinearSVC 进行对比评估:
// Example: let cross-validation choose between LogisticRegression and LinearSVC.
//
// NOTE(review): `columnNames`, `crossValidator` and `data` are not defined in this
// snippet and must come from surrounding code. `rawPrediciton` presumably matches a
// (misspelled) member of `columnNames` - confirm before renaming it.

// First candidate; it is passed first below, so it is selected by index 0.
val logRegression = new LogisticRegression()
.setFeaturesCol(columnNames.features)
.setPredictionCol(columnNames.prediction)
.setRawPredictionCol(columnNames.rawPrediciton)
.setLabelCol(columnNames.label)
// Second candidate, configured with the same columns; selected by index 1.
val svmEstimator = new LinearSVC()
.setFeaturesCol(columnNames.features)
.setPredictionCol(columnNames.prediction)
.setRawPredictionCol(columnNames.rawPrediciton)
.setLabelCol(columnNames.label)
// Wrap both candidates so that a single param decides which one is fitted.
val delegatingEstimator = new DelegatingEstimator(logRegression, svmEstimator)
// One grid entry per candidate index.
val paramGrid = new ParamGridBuilder()
.addGrid(delegatingEstimator.selectedEstimator, Array(0, 1))
.build()
// NOTE(review): presumably `crossValidator` was configured with `delegatingEstimator`
// and `paramGrid` in code that is not shown here - confirm.
val model = crossValidator.fit(data)
// Unwrap the best model to get the underlying fitted model.
val bestModel = model.bestModel.asInstanceOf[DelegatingEstimatorModel].delegate
我知道可以使用 CrossValidator 来调优单个模型。但是,如果要对不同的模型进行相互比较评估,推荐的做法是什么?例如,假设我想使用 CrossValidator 将 LogisticRegression 分类器与 LinearSVC 分类器进行对比评估。
在稍微熟悉了 API 之后,我通过实现一个自定义 Estimator 解决了这个问题:它包装了两个或更多可供委托的 estimator,具体选用哪一个由单个 Param[Int] 控制。以下是实际的代码:
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.Model
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.Params
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType
/**
 * Parameters shared by [[DelegatingEstimator]] and [[DelegatingEstimatorModel]].
 */
trait DelegatingEstimatorModelParams extends Params {

  /**
   * Zero-based index of the wrapped estimator to delegate to.
   *
   * Negative values are rejected by the param validator as soon as they are set,
   * instead of surfacing later as an out-of-bounds access during fitting. The upper
   * bound depends on how many estimators are wrapped, so it cannot be checked here.
   */
  final val selectedEstimator: Param[Int] = new Param[Int](
    this,
    "selectedEstimator",
    "The selected estimator",
    (index: Int) => index >= 0)
}
/**
 * An [[Estimator]] that wraps two or more estimators and delegates to the one whose
 * zero-based position is given by the `selectedEstimator` param (default 0).
 *
 * Because the choice is an ordinary param, it can be placed in a param grid so that a
 * tuning tool compares the wrapped estimators against each other.
 *
 * @param uid       unique identifier of this estimator
 * @param delegates the wrapped estimators, in selection order
 */
class DelegatingEstimator private (override val uid: String, delegates: Array[Estimator[_]])
  extends Estimator[DelegatingEstimatorModel] with DelegatingEstimatorModelParams {

  private def this(estimators: Array[Estimator[_]]) =
    this(Identifiable.randomUID("delegating-estimator"), estimators)

  /**
   * Creates a delegating estimator; requiring two explicit arguments guarantees that
   * at least two candidates are supplied.
   */
  def this(estimator1: Estimator[_], estimator2: Estimator[_], estimators: Estimator[_]*) = {
    this((Seq(estimator1, estimator2) ++ estimators).toArray)
  }

  setDefault(selectedEstimator -> 0)

  /**
   * Returns the currently selected delegate.
   *
   * @throws IllegalArgumentException if `selectedEstimator` is not a valid index
   */
  private def currentDelegate: Estimator[_] = {
    val index = $(selectedEstimator)
    require(
      index >= 0 && index < delegates.length,
      s"selectedEstimator must be in [0, ${delegates.length}), but was $index")
    delegates(index)
  }

  /**
   * Fits the selected delegate on `dataset` and wraps the resulting model.
   *
   * The returned model has this estimator as its parent and carries over this
   * estimator's param values (including `selectedEstimator`).
   */
  override def fit(dataset: Dataset[_]): DelegatingEstimatorModel = {
    val fitted: Model[_] = currentDelegate.fit(dataset) match {
      case m: Model[_] => m
      case other =>
        throw new IllegalStateException(s"Delegate estimator did not produce a Model: $other")
    }
    copyValues(new DelegatingEstimatorModel(uid, fitted).setParent(this))
  }

  /**
   * Copies this estimator together with its delegates.
   *
   * `extra` is passed to every delegate's own `copy`, so entries of `extra` that
   * belong to a wrapped estimator are applied to it rather than being dropped.
   */
  override def copy(extra: ParamMap): Estimator[DelegatingEstimatorModel] = {
    val copiedDelegates: Array[Estimator[_]] =
      delegates.map { d => d.copy(extra): Estimator[_] }
    copyValues(new DelegatingEstimator(uid, copiedDelegates), extra)
  }

  /**
   * Delegates the schema transformation to the selected estimator, since the wrapped
   * estimators are not guaranteed to produce identical output schemas.
   */
  override def transformSchema(schema: StructType): StructType =
    currentDelegate.transformSchema(schema)
}
/**
 * Model produced by a delegating estimator; every transformation is forwarded to the
 * wrapped `delegate` model.
 *
 * @param uid      unique identifier of this model
 * @param delegate the fitted model that performs the actual work
 */
class DelegatingEstimatorModel(override val uid: String, val delegate: Model[_])
  extends Model[DelegatingEstimatorModel] with DelegatingEstimatorModelParams {

  /**
   * Copies this model and its delegate.
   *
   * `extra` is forwarded to the delegate's `copy`; this model's own param values and
   * its parent are carried over to the new instance.
   */
  override def copy(extra: ParamMap): DelegatingEstimatorModel = {
    val copiedDelegate: Model[_] = delegate.copy(extra) match {
      case m: Model[_] => m
      case other =>
        throw new IllegalStateException(s"Copy of delegate model is not a Model: $other")
    }
    copyValues(new DelegatingEstimatorModel(uid, copiedDelegate), extra).setParent(parent)
  }

  /** Transforms `dataset` with the wrapped model. */
  override def transform(dataset: Dataset[_]): DataFrame = delegate.transform(dataset)

  /** Returns the output schema the wrapped model produces for `schema`. */
  override def transformSchema(schema: StructType): StructType =
    delegate.transformSchema(schema)
}
上面的类可以这样使用,以对 LogisticRegression 与 LinearSVC 进行对比评估:
// Example: let cross-validation choose between LogisticRegression and LinearSVC.
//
// NOTE(review): `columnNames`, `crossValidator` and `data` are not defined in this
// snippet and must come from surrounding code. `rawPrediciton` presumably matches a
// (misspelled) member of `columnNames` - confirm before renaming it.

// First candidate; it is passed first below, so it is selected by index 0.
val logRegression = new LogisticRegression()
.setFeaturesCol(columnNames.features)
.setPredictionCol(columnNames.prediction)
.setRawPredictionCol(columnNames.rawPrediciton)
.setLabelCol(columnNames.label)
// Second candidate, configured with the same columns; selected by index 1.
val svmEstimator = new LinearSVC()
.setFeaturesCol(columnNames.features)
.setPredictionCol(columnNames.prediction)
.setRawPredictionCol(columnNames.rawPrediciton)
.setLabelCol(columnNames.label)
// Wrap both candidates so that a single param decides which one is fitted.
val delegatingEstimator = new DelegatingEstimator(logRegression, svmEstimator)
// One grid entry per candidate index.
val paramGrid = new ParamGridBuilder()
.addGrid(delegatingEstimator.selectedEstimator, Array(0, 1))
.build()
// NOTE(review): presumably `crossValidator` was configured with `delegatingEstimator`
// and `paramGrid` in code that is not shown here - confirm.
val model = crossValidator.fit(data)
// Unwrap the best model to get the underlying fitted model.
val bestModel = model.bestModel.asInstanceOf[DelegatingEstimatorModel].delegate