My Jena Hadoop MapReduce example throws java.lang.NoClassDefFoundError. It's a Maven project. I read that it may be related to a missing dependency, but I can't figure out which one I'm missing. What might be the problem?
Console Log
java.lang.NoClassDefFoundError: org/apache/jena/hadoop/rdf/types/NodeWritable
    at org.apache.jena.hadoop.rdf.stats.RdfMapReduceExample.main(RdfMapReduceExample.java:29)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: java.lang.ClassNotFoundException: org.apache.jena.hadoop.rdf.types.NodeWritable
    at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
    ... 7 more
Map code part 1
package org.apache.jena.hadoop.rdf.mapreduce.count;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.jena.hadoop.rdf.types.AbstractNodeTupleWritable;
import org.apache.jena.hadoop.rdf.types.NodeWritable;
public abstract class AbstractNodeTupleNodeCountMapper<TKey, TValue, T extends AbstractNodeTupleWritable<TValue>>
        extends Mapper<TKey, T, NodeWritable, LongWritable> {

    private LongWritable initialCount = new LongWritable(1);

    @Override
    protected void map(TKey key, T value, Context context) throws IOException, InterruptedException {
        NodeWritable[] ns = this.getNodes(value);
        for (NodeWritable n : ns) {
            context.write(n, this.initialCount);
        }
    }

    protected abstract NodeWritable[] getNodes(T tuple);
}
Map code part 2
package org.apache.jena.hadoop.rdf.mapreduce.count;
import org.apache.jena.graph.Triple;
import org.apache.jena.hadoop.rdf.mapreduce.count.AbstractNodeTupleNodeCountMapper;
import org.apache.jena.hadoop.rdf.types.NodeWritable;
import org.apache.jena.hadoop.rdf.types.TripleWritable;
public class TripleNodeCountMapper<TKey> extends AbstractNodeTupleNodeCountMapper<TKey, Triple, TripleWritable> {

    @Override
    protected NodeWritable[] getNodes(TripleWritable tuple) {
        Triple t = tuple.get();
        return new NodeWritable[] { new NodeWritable(t.getSubject()), new NodeWritable(t.getPredicate()),
                new NodeWritable(t.getObject()) };
    }
}
Reduce Code
package org.apache.jena.hadoop.rdf.mapreduce.count;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.jena.hadoop.rdf.types.NodeWritable;
public class NodeCountReducer extends Reducer<NodeWritable, LongWritable, NodeWritable, LongWritable> {

    @Override
    protected void reduce(NodeWritable key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long count = 0;
        Iterator<LongWritable> iter = values.iterator();
        while (iter.hasNext()) {
            count += iter.next().get();
        }
        context.write(key, new LongWritable(count));
    }
}
Job Handler
package org.apache.jena.hadoop.rdf.stats;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
import org.apache.jena.hadoop.rdf.types.NodeWritable;
public class RdfMapReduceExample {

    public static void main(String[] args) {
        try {
            // Get Hadoop configuration
            Configuration config = new Configuration(true);

            // Create job
            Job job = Job.getInstance(config);
            job.setJarByClass(RdfMapReduceExample.class);
            job.setJobName("RDF Triples Node Usage Count");

            // Map/Reduce classes
            job.setMapperClass(TripleNodeCountMapper.class);
            job.setMapOutputKeyClass(NodeWritable.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setReducerClass(NodeCountReducer.class);

            // Input and Output
            job.setInputFormatClass(TriplesInputFormat.class);
            job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
            FileInputFormat.setInputPaths(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Launch the job and await completion
            job.submit();
            if (job.monitorAndPrintJob()) {
                // OK
                System.out.println("Completed");
            } else {
                // Failed
                System.err.println("Failed");
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}
Pom.xml dependencies
<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.jena/jena-elephas-common -->
    <dependency>
        <groupId>org.apache.jena</groupId>
        <artifactId>jena-elephas-common</artifactId>
        <version>3.1.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.jena</groupId>
        <artifactId>jena-elephas-io</artifactId>
        <version>3.1.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>2.7.1</version>
        <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.1</version>
        <scope>provided</scope>
    </dependency>
</dependencies>
Your dependency declarations are correct, as otherwise your code would not compile at all.
The problem is that your JAR likely contains only your own code and none of the necessary dependencies, so when MapReduce tries to run your code the required classes are not on the classpath.
Generally, when building for MapReduce it is best to create a fat JAR that contains your code and all of its dependencies. The Maven Assembly plugin can be used to do this (you could also use the Maven Shade plugin if preferred).
Add this to your pom.xml.
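A minimal sketch of a suitable Maven Assembly plugin configuration (the plugin version shown and the assumption that the hadoop-job.xml descriptor sits next to the pom.xml are illustrative, adjust them to your project):

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.6</version>
            <configuration>
                <!-- Point the plugin at the assembly descriptor below -->
                <descriptors>
                    <descriptor>hadoop-job.xml</descriptor>
                </descriptors>
            </configuration>
            <executions>
                <!-- Build the job JAR automatically during the package phase -->
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>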
And use this hadoop-job.xml.
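A sketch of a suitable assembly descriptor, following the common "hadoop-job" layout: your own classes are unpacked into the root of the JAR, while the runtime (i.e. non-provided) dependencies are bundled under lib/, which hadoop jar places on the classpath when the job runs.

<assembly>
    <id>hadoop-job</id>
    <formats>
        <format>jar</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <!-- Bundle all runtime (non-provided) dependencies under lib/ -->
        <dependencySet>
            <unpack>false</unpack>
            <scope>runtime</scope>
            <outputDirectory>lib</outputDirectory>
            <excludes>
                <exclude>${groupId}:${artifactId}</exclude>
            </excludes>
        </dependencySet>
        <!-- Unpack this project's own classes into the root of the JAR -->
        <dependencySet>
            <unpack>true</unpack>
            <includes>
                <include>${groupId}:${artifactId}</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>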
Essentially this asks Maven to build you a fat JAR that contains all the non-provided dependencies. This will create an additional artefact called your-artifact-VERSION-hadoop-job.jar, which you should run instead of the normal JAR.
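For example, after mvn package you would submit the job with something like hadoop jar your-artifact-VERSION-hadoop-job.jar org.apache.jena.hadoop.rdf.stats.RdfMapReduceExample <args> (note that the main method shown above reads the input path from args[1] and the output path from args[2]).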