ClassNotFoundException when trying to bulk load into titan using a Java MapReduce job
We are currently trying to bulk load some files from HDFS into Titan using a MapReduce job with the Titan dependencies. However, we run into a problem as soon as the map tasks start: a TinkerPop class cannot be found. This is the error:
java.lang.ClassNotFoundException: org.apache.tinkerpop.gremlin.structure.Vertex
I read somewhere that Titan 1.0.0 is only compatible with TinkerPop 3.0.1-incubating, so that is the version we depend on.
It may help to look at our pom.xml and code.
pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>replacementID</groupId>
  <artifactId>replacementID</artifactId>
  <version>0.0.1-SNAPSHOT</version>

  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.3</version>
    </dependency>
    <dependency>
      <groupId>com.thinkaurelius.titan</groupId>
      <artifactId>titan-hbase</artifactId>
      <version>1.0.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.tinkerpop</groupId>
      <artifactId>hadoop-gremlin</artifactId>
      <version>3.0.1-incubating</version>
    </dependency>
    <dependency>
      <groupId>org.apache.tinkerpop</groupId>
      <artifactId>gremlin-core</artifactId>
      <version>3.0.1-incubating</version>
    </dependency>
    <dependency>
      <groupId>org.apache.tinkerpop</groupId>
      <artifactId>gremlin-driver</artifactId>
      <version>3.0.1-incubating</version>
    </dependency>
  </dependencies>
</project>
Mapper:
package edu.rosehulman.brubakbd;

import java.io.IOException;

import org.apache.commons.configuration.BaseConfiguration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.thinkaurelius.titan.core.TitanFactory;
import com.thinkaurelius.titan.core.TitanGraph;
import com.thinkaurelius.titan.core.TitanVertex;

import org.apache.tinkerpop.gremlin.structure.Vertex;

public class TitanMRMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is a tab-separated pair of page IDs.
        String line = value.toString();
        String[] vals = line.split("\t");

        // Open a Titan connection against HBase for this record.
        BaseConfiguration conf = new BaseConfiguration();
        conf.setProperty("gremlin.graph", "com.thinkaurelius.titan.core.TitanFactory");
        conf.setProperty("storage.backend", "hbase");
        conf.setProperty("storage.hostname", "hadoop-16.csse.rose-hulman.edu");
        conf.setProperty("storage.batch-loading", true);
        conf.setProperty("storage.hbase.ext.zookeeper.znode.parent", "/hbase-unsecure");
        conf.setProperty("storage.hbase.ext.hbase.zookeeper.property.clientPort", 2181);
        conf.setProperty("cache.db-cache", true);
        conf.setProperty("cache.db-cache-clean-wait", 20);
        conf.setProperty("cache.db-cache-time", 180000);
        conf.setProperty("cache.db-cache-size", 0.5);
        TitanGraph graph = TitanFactory.open(conf);

        // Create both vertices, link them, and commit the transaction.
        TitanVertex v1 = graph.addVertex();
        v1.property("pageID", vals[0]);
        TitanVertex v2 = graph.addVertex();
        v2.property("pageID", vals[1]);
        v1.addEdge("links_To", v2);
        graph.tx().commit();
    }
}
Driver:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

public class TitanMR {

    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.err.println("Usage: TitanMR <input path>");
            System.exit(-1);
        }

        Job job = new Job();
        job.setJarByClass(TitanMR.class);
        job.setJobName("TitanMR");
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Map-only job: writes go straight to Titan, so no reducers and no HDFS output.
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setMapperClass(TitanMRMapper.class);
        job.setNumReduceTasks(0);

        System.out.println("about to submit job");
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Upgrade your Gremlin jars in pom.xml:
<dependency>
  <groupId>org.apache.tinkerpop</groupId>
  <artifactId>hadoop-gremlin</artifactId>
  <version>3.2.3</version>
</dependency>
<dependency>
  <groupId>org.apache.tinkerpop</groupId>
  <artifactId>gremlin-core</artifactId>
  <version>3.2.3</version>
</dependency>
<dependency>
  <groupId>org.apache.tinkerpop</groupId>
  <artifactId>gremlin-driver</artifactId>
  <version>3.2.3</version>
</dependency>
I suggest you consider creating an uber-jar that contains all of your project's dependencies. Since you are building with Apache Maven, you could use the Apache Maven Assembly Plugin or the Apache Maven Shade Plugin.
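As a sketch of the Shade Plugin route (the plugin version and filter entries below are illustrative assumptions, not taken from your build), adding something like this to the <build><plugins> section of your pom.xml makes mvn package produce a single shaded jar that bundles the Titan and TinkerPop classes alongside your own:

<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-shade-plugin</artifactId>
  <version>2.4.3</version>
  <executions>
    <execution>
      <!-- Bind shading to the package phase so `mvn package` builds the uber-jar -->
      <phase>package</phase>
      <goals>
        <goal>shade</goal>
      </goals>
      <configuration>
        <filters>
          <filter>
            <!-- Drop signature files from signed dependencies; stale signatures can make the shaded jar unusable -->
            <artifact>*:*</artifact>
            <excludes>
              <exclude>META-INF/*.SF</exclude>
              <exclude>META-INF/*.DSA</exclude>
              <exclude>META-INF/*.RSA</exclude>
            </excludes>
          </filter>
        </filters>
      </configuration>
    </execution>
  </executions>
</plugin>

You can then check that the missing class is actually bundled with jar tf target/replacementID-0.0.1-SNAPSHOT.jar | grep gremlin/structure/Vertex, and submit the job with hadoop jar against that shaded jar (adjust the jar name to whatever ends up in your target/ directory).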