使用 Spark 的随机森林:获取预测值和 R²

random forest with spark: get predicted values and R²

我正在使用 Spark 的 MLlib 来执行随机森林回归(regression random forest)。

我在这里使用 python 代码: https://spark.apache.org/docs/1.2.0/mllib-ensembles.html#tab_python_1

它可以正常运行,但现在我想获得预测值(predicted values)以及预测模型的 R²。应该如何获得?

以下是如何将 csv 文件读取为 RDD(Spark 的数据格式):

# Imports
import csv
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from collections import namedtuple
from operator import add, itemgetter
from pyspark import SparkConf, SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
import shutil
import numpy

def parse(row):
    """
    Convert one split CSV row into a LabeledPoint.

    The row is modified in place: field 0 is cast to ``str`` and fields 1-4
    are cast to ``float``. Field 4 is used as the label and fields 0-3 are
    passed as the features.
    """
    row[0] = str(row[0])
    for position in range(1, 5):
        row[position] = float(row[position])

    label = row[4]
    # NOTE(review): the features still include the string in field 0 --
    # confirm that this column is meant to be part of the feature vector.
    features = row[:4]
    return LabeledPoint(label, features)


def split(line):
    """
    Split a single line of text into its ';'-separated fields.

    The line is handed to the csv module so that quoted fields are handled;
    the first parsed record is returned as a list of strings.
    """
    buffer = StringIO(line)
    records = csv.reader(buffer, delimiter=';')
    first_record = next(records)
    return first_record

# Load the csv file as an RDD: every text line is split into fields by
# split() and then turned into a LabeledPoint by parse().
# NOTE(review): `sc` is not defined in this snippet -- presumably the
# SparkContext provided by the pyspark shell; confirm.
data = sc.textFile("datafile").map(split).map(parse)

下面是如何执行随机森林算法以及如何获得预测值:

def random_forest_regression(data):
    """
    Train a random forest regressor and predict a value for every point.

    The model is trained on a random 70% of ``data`` and then used to predict
    ALL points of ``data``. The returned pairs therefore include the points
    the model was trained on, so any score computed from them is not a
    held-out (test) score.

    :param data: RDD of LabeledPoint.
    :return: tuple ``(model, real_and_predicted)`` where ``real_and_predicted``
        is a list of ``(label, prediction)`` pairs collected on the driver.
    """
    # Only the 70% training part of the split is used; the held-out 30% is
    # not evaluated separately in this function.
    trainingData, _ = data.randomSplit([0.7, 0.3])

    # Increase the number of trees to have a better prediction.
    model = RandomForest.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={},
        numTrees=100,
        featureSubsetStrategy="auto",
        impurity='variance',
        maxDepth=10,
        maxBins=32,
    )

    # Pair each real label with the model's prediction, for all the points.
    predictions = model.predict(data.map(lambda x: x.features))
    real_and_predicted = data.map(lambda lp: lp.label).zip(predictions)
    # collect() brings every pair back to the driver as a plain list.
    real_and_predicted = real_and_predicted.collect()

    print("real and predicted values")
    for value in real_and_predicted:
        print(value)

    return model, real_and_predicted

为了获得 correlation coefficient(R 值),我使用了 numpy:

def compute_correlation_coefficient(real_and_predicted):
    """
    Compute, display and return the correlation coefficient (R).

    :param real_and_predicted: iterable of pairs whose first item is the real
        value and whose second item is the predicted value.
    :return: the correlation coefficient between real and predicted values,
        as computed by ``numpy.corrcoef``. Square it to obtain R².
    """
    # Materialize once so that one-shot iterables are also supported.
    pairs = list(real_and_predicted)
    real_values = [pair[0] for pair in pairs]
    predicted_values = [pair[1] for pair in pairs]

    print("correlation coefficient")
    # corrcoef returns the 2x2 correlation matrix; [0, 1] is the
    # real-vs-predicted entry.
    coefficient = numpy.corrcoef(real_values, predicted_values)[0, 1]
    print(coefficient)
    return coefficient

要得到 R²,取 correlation coefficient 的平方值。

瞧!