Sparkling Water:无法使用其对 Spark ML 管道的支持

Sparkling water: Can't make use of the support of spark ml pipelines

根据 Sparkling Water 团队的这篇博客,现在可以在最新版本中使用 Spark ML 管道组件来构建 DL 模型。我尝试在 build.sbt 中添加最新版本:
// Spark MLlib 2.0.0, Scala 2.10 artifact, "provided" scope.
"org.apache.spark" % "spark-mllib_2.10" % "2.0.0" % "provided",
// Sparkling Water core 1.6.5, Scala 2.10 artifact, "provided" scope.
// Only the core module is listed here; no sparkling-water-ml artifact is declared.
"ai.h2o" % "sparkling-water-core_2.10" % "1.6.5" % "provided"

但运气不好,尝试导入 org.apache.spark.ml.h2o.H2OPipeline 没有成功。spark.ml 下的 h2o 包似乎并不存在于 Spark 的 jar 包中,尽管按照上述链接以及这里的说明它应该是可用的。我真的很想像博客中展示的那样,重用我的 spark-mllib 特征转换器,借助 h2o 创建 DL 模型。

感谢任何帮助!

谢谢。

1) 请不要将 Spark 2 与 Sparkling Water 1.6.5 搭配使用——两者无法一起工作。我们已针对 Scala 2.11 发布了 Sparkling Water 2.0:https://mvnrepository.com/artifact/ai.h2o/sparkling-water-core_2.11

2) 您在构建中只添加了 Sparkling Water 的 core 模块,而您要找的类位于 sparkling-water-ml 中:https://mvnrepository.com/artifact/ai.h2o/sparkling-water-ml_2.11

我使用以下版本和 Maven pom.xml 运行了 H2O 示例,可以正常工作:

  • Spark - 1.6
  • Sparkling Water - 1.6.8
  • ai h2o - 3.10.0.8

以下是 Maven pom.xml(请参考 Git 仓库 - https://github.com/seerampavan/H2oTesting/blob/master/pom.xml ):

<!--
    Build properties. Every version that is shared by more than one
    dependency is declared once here and referenced below, so an upgrade is a
    single-line change.
-->
<properties>
    <spark.version>1.6.0-cdh5.7.1</spark.version>
    <scala.version>2.10.4</scala.version>
    <!-- Scala binary suffix appended to cross-built artifactIds (e.g. spark-core_2.10). -->
    <scala.binary.version>2.10</scala.binary.version>
    <!-- Parent directory of this project's base directory. -->
    <top.dir>${project.basedir}/..</top.dir>
    <hadoop.version>2.6.0-cdh5.7.1</hadoop.version>
    <!-- Shared by all ai.h2o h2o-* artifacts. -->
    <h2o.version>3.10.0.8</h2o.version>
    <!-- Shared by all ai.h2o sparkling-water-* artifacts. -->
    <sparkling.water.version>1.6.8</sparkling.water.version>
    <!-- Shared by the com.amazonaws aws-java-sdk-* artifacts. -->
    <aws.sdk.version>1.11.45</aws.sdk.version>
</properties>

<dependencies>
    <!-- Force import of Spark's servlet API for unit tests -->
    <dependency>
        <groupId>javax.servlet</groupId>
        <artifactId>javax.servlet-api</artifactId>
        <version>3.0.1</version>
    </dependency>

    <!-- Scala runtime: library and reflect are kept on the same ${scala.version}. -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
        <!--<scope>provided</scope>-->
    </dependency>
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-reflect</artifactId>
        <!-- Was hard-coded to 2.10.5, which did not match scala-library. -->
        <version>${scala.version}</version>
    </dependency>

    <!-- Spark modules, all on ${spark.version} and the ${scala.binary.version} suffix. -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <!-- make sure wrong scala version is not pulled in -->
                <groupId>org.scala-lang</groupId>
                <artifactId>scala-library</artifactId>
            </exclusion>
            <exclusion>
                <!-- make sure wrong scala version is not pulled in -->
                <groupId>org.scala-lang</groupId>
                <artifactId>scalap</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <exclusions>
            <exclusion>
                <groupId>org.jpmml</groupId>
                <artifactId>pmml-model</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>

    <!--
        NOTE(review): the next three artifacts (spark-streaming tests jar,
        scalatest, junit) are declared without <scope>test</scope>, so they use
        the default compile scope. Confirm whether main code needs them before
        narrowing the scope.
    -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <type>test-jar</type>
        <classifier>tests</classifier>
    </dependency>
    <dependency>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest_${scala.binary.version}</artifactId>
        <version>2.2.1</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
    </dependency>

    <!-- Hadoop client with logging, servlet/JSP, JRuby and Netty transitives excluded. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
        <exclusions>
            <exclusion>
                <groupId>log4j</groupId>
                <artifactId>log4j</artifactId>
            </exclusion>
            <exclusion>
                <groupId>javax.servlet</groupId>
                <artifactId>servlet-api</artifactId>
            </exclusion>
            <exclusion>
                <groupId>javax.servlet.jsp</groupId>
                <artifactId>jsp-api</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.jruby</groupId>
                <artifactId>jruby-complete</artifactId>
            </exclusion>
            <exclusion>
                <groupId>org.jboss.netty</groupId>
                <artifactId>netty</artifactId>
            </exclusion>
            <exclusion>
                <groupId>io.netty</groupId>
                <artifactId>netty</artifactId>
            </exclusion>
        </exclusions>
    </dependency>

    <!-- H2O modules, all on ${h2o.version}. -->
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-web</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-scala_${scala.binary.version}</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-persist-s3</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-persist-hdfs</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-parquet-parser</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-genmodel</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-core</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-bindings</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-avro-parser</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-app</artifactId>
        <version>${h2o.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>h2o-algos</artifactId>
        <version>${h2o.version}</version>
    </dependency>

    <!-- Sparkling Water modules, all on ${sparkling.water.version}. -->
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>sparkling-water-repl_${scala.binary.version}</artifactId>
        <version>${sparkling.water.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>sparkling-water-ml_${scala.binary.version}</artifactId>
        <version>${sparkling.water.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>sparkling-water-examples_${scala.binary.version}</artifactId>
        <version>${sparkling.water.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>sparkling-water-core_${scala.binary.version}</artifactId>
        <version>${sparkling.water.version}</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>deepwater-backend-api</artifactId>
        <version>1.0.0</version>
    </dependency>

    <!-- Remaining third-party libraries, each pinned to an explicit version. -->
    <dependency>
        <groupId>joda-time</groupId>
        <artifactId>joda-time</artifactId>
        <version>2.9.2</version>
    </dependency>
    <dependency>
        <groupId>org.joda</groupId>
        <artifactId>joda-convert</artifactId>
        <version>1.8.1</version>
    </dependency>
    <dependency>
        <groupId>org.javassist</groupId>
        <artifactId>javassist</artifactId>
        <version>3.22.0-CR1</version>
    </dependency>
    <dependency>
        <groupId>gov.nist.math</groupId>
        <artifactId>jama</artifactId>
        <version>1.0.3</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.7</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>reflections</artifactId>
        <version>0.9.11-h2o-custom</version>
    </dependency>
    <dependency>
        <groupId>ai.h2o</groupId>
        <artifactId>google-analytics-java</artifactId>
        <version>1.1.2-H2O-CUSTOM</version>
    </dependency>
    <dependency>
        <groupId>com.github.tony19</groupId>
        <artifactId>named-regexp</artifactId>
        <version>0.2.4</version>
    </dependency>

    <!-- AWS SDK modules, all on ${aws.sdk.version}. -->
    <dependency>
        <groupId>com.amazonaws</groupId>
        <artifactId>aws-java-sdk-s3</artifactId>
        <version>${aws.sdk.version}</version>
    </dependency>
    <dependency>
        <groupId>com.amazonaws</groupId>
        <artifactId>aws-java-sdk-kms</artifactId>
        <version>${aws.sdk.version}</version>
    </dependency>
    <dependency>
        <groupId>com.amazonaws</groupId>
        <artifactId>aws-java-sdk-core</artifactId>
        <version>${aws.sdk.version}</version>
    </dependency>

    <!-- Jetty aggregate artifacts; jetty-plus is on a different version from the other two. -->
    <dependency>
        <groupId>org.eclipse.jetty.aggregate</groupId>
        <artifactId>jetty-servlet</artifactId>
        <version>8.2.0.v20160908</version>
    </dependency>
    <dependency>
        <groupId>org.eclipse.jetty.aggregate</groupId>
        <artifactId>jetty-server</artifactId>
        <version>8.2.0.v20160908</version>
    </dependency>
    <dependency>
        <groupId>org.eclipse.jetty.aggregate</groupId>
        <artifactId>jetty-plus</artifactId>
        <version>8.1.17.v20150415</version>
    </dependency>
</dependencies>