Gremlin Spark Java Maven Project - Slow query response
I have written a program to execute some queries on Gremlin with the help of Spark (I am using JanusGraph, with Cassandra as the storage backend and Solr as the index backend), but the query results come back extremely slowly.
Most likely I have not set something up correctly.
Here is the code I used.
The driver program:
import org.apache.commons.configuration.Configuration;
import org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer;
import org.apache.tinkerpop.gremlin.structure.Graph;
import org.apache.tinkerpop.gremlin.structure.util.GraphFactory;

public class SimpleApp {
    public static void main(String[] args) throws Exception {
        Configuration config = GraphTraversalProvider.makeLocal();
        Graph hadoopGraph = GraphFactory.open(config);

        Long totalVertices = hadoopGraph.traversal().withComputer(SparkGraphComputer.class).V().count().next();
        System.out.println("IT WORKED: " + totalVertices);

        hadoopGraph.close();
    }
}
The GraphTraversalProvider class:
import org.apache.commons.configuration.BaseConfiguration;
import org.apache.commons.configuration.Configuration;
import org.apache.tinkerpop.gremlin.hadoop.Constants;

public class GraphTraversalProvider {

    private static final String KEY_SPACE = "janusgraph";
    private static final String CASSANDRA_ADDRESS = "localhost";

    public static Configuration makeLocal() {
        return make(true);
    }

    public static Configuration makeRemote() {
        return make(false);
    }

    private static Configuration make(boolean local) {
        final Configuration hadoopConfig = new BaseConfiguration();

        // HadoopGraph reads vertices from Cassandra via the CQL input format
        // and discards the result graph (NullOutputFormat).
        hadoopConfig.setProperty("gremlin.graph", "org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph");
        hadoopConfig.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, "org.janusgraph.hadoop.formats.cql.CqlInputFormat");
        hadoopConfig.setProperty(Constants.GREMLIN_HADOOP_GRAPH_WRITER, "org.apache.hadoop.mapreduce.lib.output.NullOutputFormat");
        hadoopConfig.setProperty(Constants.GREMLIN_HADOOP_JARS_IN_DISTRIBUTED_CACHE, true);
        hadoopConfig.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, "none");
        hadoopConfig.setProperty(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION, "output");
        hadoopConfig.setProperty(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, true);

        // Storage settings consumed by the input format.
        hadoopConfig.setProperty("janusgraphmr.ioformat.conf.storage.backend", "cql");
        hadoopConfig.setProperty("janusgraphmr.ioformat.conf.storage.hostname", CASSANDRA_ADDRESS);
        hadoopConfig.setProperty("janusgraphmr.ioformat.conf.storage.port", "9042");
        hadoopConfig.setProperty("janusgraphmr.ioformat.conf.storage.cassandra.keyspace", KEY_SPACE);
        hadoopConfig.setProperty("cassandra.input.partitioner.class", "org.apache.cassandra.dht.Murmur3Partitioner");
        hadoopConfig.setProperty("cassandra.input.widerows", true);

        // Spark settings.
        if (local) {
            hadoopConfig.setProperty("spark.master", "local[*]"); // Run Spark locally with as many worker threads as logical cores on your machine.
        } else {
            hadoopConfig.setProperty("spark.master", "spark://ADD_YOUR_URL");
        }
        hadoopConfig.setProperty("spark.executor.memory", "2g");
        hadoopConfig.setProperty(Constants.SPARK_SERIALIZER, "org.apache.spark.serializer.KryoSerializer");
        hadoopConfig.setProperty("spark.kryo.registrator", "org.janusgraph.hadoop.serialize.JanusGraphKryoRegistrator");

        hadoopConfig.setProperty("storage.hostname", CASSANDRA_ADDRESS);
        hadoopConfig.setProperty("storage.cassandra.keyspace", KEY_SPACE);

        return hadoopConfig;
    }
}
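Since the configuration sets GREMLIN_SPARK_PERSIST_CONTEXT to true, the SparkContext should survive between jobs in the same JVM rather than being torn down after each traversal. Below is a minimal sketch (assuming the persisted context is in fact picked up, which is worth verifying in your environment) that times two consecutive counts, so the fixed Spark startup cost can be separated from the per-query cost:

import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversalSource;
import org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer;
import org.apache.tinkerpop.gremlin.structure.Graph;
import org.apache.tinkerpop.gremlin.structure.util.GraphFactory;

public class TimedCountApp {
    public static void main(String[] args) throws Exception {
        Graph hadoopGraph = GraphFactory.open(GraphTraversalProvider.makeLocal());
        GraphTraversalSource g = hadoopGraph.traversal().withComputer(SparkGraphComputer.class);

        long t0 = System.currentTimeMillis();
        System.out.println("first count:  " + g.V().count().next());
        long t1 = System.currentTimeMillis();
        System.out.println("second count: " + g.V().count().next());
        long t2 = System.currentTimeMillis();

        // If the context is reused, the second run should be noticeably faster.
        System.out.println("first: " + (t1 - t0) + " ms, second: " + (t2 - t1) + " ms");
        hadoopGraph.close();
    }
}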
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.ibc</groupId>
    <artifactId>sparkdemo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <janus.version>0.5.1</janus.version>
        <spark.version>2.4.0</spark.version>
        <gremlin.version>3.4.6</gremlin.version>
    </properties>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.2</version>
                <configuration>
                    <filters>
                        <filter>
                            <artifact>*:*</artifact>
                            <excludes>
                                <exclude>META-INF/*.SF</exclude>
                                <exclude>META-INF/*.DSA</exclude>
                                <exclude>META-INF/*.RSA</exclude>
                            </excludes>
                        </filter>
                    </filters>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>org.janusgraph</groupId>
            <artifactId>janusgraph-cassandra</artifactId>
            <version>${janus.version}</version>
        </dependency>
        <dependency>
            <groupId>org.janusgraph</groupId>
            <artifactId>janusgraph-hadoop</artifactId>
            <version>${janus.version}</version>
        </dependency>
        <dependency>
            <groupId>org.janusgraph</groupId>
            <artifactId>janusgraph-cql</artifactId>
            <version>${janus.version}</version>
        </dependency>
        <dependency>
            <groupId>org.janusgraph</groupId>
            <artifactId>janusgraph-solr</artifactId>
            <version>${janus.version}</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.2.3</version>
        </dependency>

        <!-- GREMLIN -->
        <dependency>
            <groupId>org.apache.tinkerpop</groupId>
            <artifactId>spark-gremlin</artifactId>
            <version>${gremlin.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>jackson-databind</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>com.google.guava</groupId>
                    <artifactId>guava</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.tinkerpop</groupId>
            <artifactId>hadoop-gremlin</artifactId>
            <version>${gremlin.version}</version>
        </dependency>

        <!-- SPARK -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>jackson-databind</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>19.0</version>
        </dependency>
    </dependencies>
</project>
The output is as follows:
23:36:29,708 |-INFO in ch.qos.logback.classic.joran.action.RootLoggerAction - Setting level of ROOT logger to WARN
23:36:29,708 |-INFO in ch.qos.logback.core.joran.action.AppenderRefAction - Attaching appender named [CONSOLE] to Logger[ROOT]
23:36:29,708 |-INFO in ch.qos.logback.classic.joran.action.ConfigurationAction - End of configuration.
23:36:29,710 |-INFO in ch.qos.logback.classic.joran.JoranConfigurator@704d6e83 - Registering current configuration as safe fallback point
SLF4J: Actual binding is of type [ch.qos.logback.classic.util.ContextSelectorStaticBinder]
23:36:30.225 [main] WARN o.a.t.g.s.p.c.SparkGraphComputer - class org.apache.hadoop.mapreduce.lib.output.NullOutputFormat does not implement PersistResultGraphAware and thus, persistence options are unknown -- assuming all options are possible
23:36:30.516 [SparkGraphComputer-boss] WARN org.apache.spark.util.Utils - Your hostname, nchristidis-GL502VMK resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp3s0)
23:36:30.516 [SparkGraphComputer-boss] WARN org.apache.spark.util.Utils - Set SPARK_LOCAL_IP if you need to bind to another address
23:36:32.191 [SparkGraphComputer-boss] WARN o.a.hadoop.util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23:36:33.279 [SparkGraphComputer-boss] WARN o.a.t.g.s.p.c.SparkGraphComputer - HADOOP_GREMLIN_LIBS is not set -- proceeding regardless
23:36:35.266 [SparkGraphComputer-boss] WARN com.datastax.driver.core.NettyUtil - Found Netty's native epoll transport in the classpath, but epoll is not available. Using NIO instead.
IT WORKED: 43
23:39:32.111 [Thread-3] WARN org.apache.spark.SparkContext - Ignoring Exception while stopping SparkContext from shutdown hook
java.lang.NoSuchMethodError: io.netty.bootstrap.ServerBootstrap.config()Lio/netty/bootstrap/ServerBootstrapConfig;
at org.apache.spark.network.server.TransportServer.close(TransportServer.java:154)
at org.apache.spark.network.netty.NettyBlockTransferService.close(NettyBlockTransferService.scala:180)
at org.apache.spark.storage.BlockManager.stop(BlockManager.scala:1615)
at org.apache.spark.SparkEnv.stop(SparkEnv.scala:90)
at org.apache.spark.SparkContext$$anonfun$stop.apply$mcV$sp(SparkContext.scala:1974)
at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
at org.apache.spark.SparkContext.stop(SparkContext.scala:1973)
at org.apache.spark.SparkContext$$anonfun.apply$mcV$sp(SparkContext.scala:575)
at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$$anonfun$apply$mcV$sp.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$$anonfun$apply$mcV$sp.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$$anonfun$apply$mcV$sp.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll.apply$mcV$sp(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll.apply(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll.apply(ShutdownHookManager.scala:188)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
at org.apache.spark.util.SparkShutdownHookManager$$anon.run(ShutdownHookManager.scala:178)
at org.apache.hadoop.util.ShutdownHookManager.run(ShutdownHookManager.java:54)
Process finished with exit code 123
So I do get the correct output: IT WORKED: 43
There are 43 vertices in total, but it takes far too long.
There is also this log message:
23:36:33.279 [SparkGraphComputer-boss] WARN o.a.t.g.s.p.c.SparkGraphComputer - HADOOP_GREMLIN_LIBS is not set -- proceeding regardless
which suggests that I have most likely not set something up correctly.
================================================================
Update: Tuesday, October 27
By submitting the program to a Spark cluster with one worker node, instead of running it from my local IDE, I reduced the time significantly, from 6 minutes down to 3 minutes.
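For reference, the only driver change this needs is to build the remote configuration instead of the local one. A hedged sketch (the jar path is a placeholder for the shaded jar produced by mvn package, and spark.jars is one way, among others, to make that jar visible to the executors):

import org.apache.commons.configuration.Configuration;
import org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer;
import org.apache.tinkerpop.gremlin.structure.Graph;
import org.apache.tinkerpop.gremlin.structure.util.GraphFactory;

public class RemoteCountApp {
    public static void main(String[] args) throws Exception {
        // Use the spark://... master URL instead of local[*].
        Configuration config = GraphTraversalProvider.makeRemote();
        // Hypothetical jar path: ship the shaded jar to the executors so they
        // can deserialize the traversal; adjust to your actual build output.
        config.setProperty("spark.jars", "target/sparkdemo-1.0-SNAPSHOT.jar");

        Graph hadoopGraph = GraphFactory.open(config);
        Long totalVertices = hadoopGraph.traversal().withComputer(SparkGraphComputer.class).V().count().next();
        System.out.println("IT WORKED: " + totalVertices);
        hadoopGraph.close();
    }
}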
================================================================
OLAP-based Gremlin traversals will be considerably slower than standard OLTP traversals, even for small datasets. There is a significant cost just to getting Spark ready to process your traversal. That overhead alone could easily put your OLAP query a minute or more behind its OLTP counterpart. In the comments on your question you explained that your query takes around six minutes. That seems a bit long, but could be within the normal range for OLAP, depending on your environment.
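To see how much of that time is Spark overhead, here is a minimal OLTP sketch of the same count, opening JanusGraph directly against Cassandra instead of through HadoopGraph (the properties file path is an assumption; point it at the same keyspace):

import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversalSource;
import org.janusgraph.core.JanusGraph;
import org.janusgraph.core.JanusGraphFactory;

public class OltpCountApp {
    public static void main(String[] args) throws Exception {
        // Opens JanusGraph directly against the storage backend -- no Spark involved.
        try (JanusGraph graph = JanusGraphFactory.open("conf/janusgraph.properties")) {
            GraphTraversalSource g = graph.traversal();
            System.out.println("OLTP count: " + g.V().count().next());
        }
    }
}

On a 43-vertex graph this should come back almost instantly, which makes the fixed Spark startup cost of the OLAP version visible by contrast.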
Some graphs are optimized for an OLAP count() and will give you a fairly speedy result, but you tagged this question "janusgraph", so I don't think that applies here.
You typically don't see the value of OLAP-based traversals until you start working with large-scale graphs. Compare a count() over 100+ million edges in OLAP and OLTP, and you probably won't mind waiting six minutes for the answer at all (because the OLTP traversal may never finish).
It's hard to say what you could do to make your current setup faster, because at this point you are really just proving that everything works. Now that you have a working model, I would suggest that the next step is to generate a much larger graph (perhaps 10 million vertices) and try your count again with a decently sized Spark cluster.
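A minimal sketch for generating such a test graph with batched transactions (the vertex label, property, batch size, and properties file path are all arbitrary assumptions; enabling storage.batch-loading in the JanusGraph configuration can also speed the load up considerably):

import org.apache.tinkerpop.gremlin.process.traversal.dsl.graph.GraphTraversalSource;
import org.janusgraph.core.JanusGraph;
import org.janusgraph.core.JanusGraphFactory;

public class GraphLoader {
    public static void main(String[] args) throws Exception {
        try (JanusGraph graph = JanusGraphFactory.open("conf/janusgraph.properties")) {
            GraphTraversalSource g = graph.traversal();
            final int totalVertices = 10_000_000;
            final int batchSize = 10_000; // commit in batches to keep transactions small

            for (int i = 1; i <= totalVertices; i++) {
                g.addV("demo").property("idx", i).iterate();
                if (i % batchSize == 0) {
                    graph.tx().commit();
                }
            }
            graph.tx().commit(); // commit any remainder
        }
    }
}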