NodeWritable java.lang.NoClassDefFoundError Hadoop Jena

376 Views Asked by At

My Jena Hadoop MapReduce example throw java.lang.NoClassDefFoundError. It's a Maven project. I read that it may be related to a missing dependency, but i can't figure out which one i'm missing! What may be the problem?

Console Log

java.lang.NoClassDefFoundError: org/apache/jena/hadoop/rdf/types/NodeWritable
        at org.apache.jena.hadoop.rdf.stats.RdfMapReduceExample.main(RdfMapReduceExample.java:29)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:497)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: java.lang.ClassNotFoundException: org.apache.jena.hadoop.rdf.types.NodeWritable
        at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        ... 7 more

Map code part 1

package org.apache.jena.hadoop.rdf.mapreduce.count;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.jena.hadoop.rdf.types.AbstractNodeTupleWritable;
import org.apache.jena.hadoop.rdf.types.NodeWritable;

public abstract class AbstractNodeTupleNodeCountMapper<TKey, TValue, T extends AbstractNodeTupleWritable<TValue>>
        extends Mapper<TKey, T, NodeWritable, LongWritable> {
    private LongWritable initialCount = new LongWritable(1);

    @Override
    protected void map(TKey key, T value, Context context) throws IOException, InterruptedException {
        NodeWritable[] ns = this.getNodes(value);
        for (NodeWritable n : ns) {
            context.write(n, this.initialCount);
        }
    }

    protected abstract NodeWritable[] getNodes(T tuple);
}

Map code part 2

package org.apache.jena.hadoop.rdf.mapreduce.count;

import org.apache.jena.graph.Triple;
import org.apache.jena.hadoop.rdf.mapreduce.count.AbstractNodeTupleNodeCountMapper;
import org.apache.jena.hadoop.rdf.types.NodeWritable;
import org.apache.jena.hadoop.rdf.types.TripleWritable;

public class TripleNodeCountMapper<TKey> extends AbstractNodeTupleNodeCountMapper<TKey, Triple, TripleWritable> {
    @Override
    protected NodeWritable[] getNodes(TripleWritable tuple) {
        Triple t = tuple.get();
        return new NodeWritable[] { new NodeWritable(t.getSubject()), new NodeWritable(t.getPredicate()),
                new NodeWritable(t.getObject()) };
    }
}

Reduce Code

package org.apache.jena.hadoop.rdf.mapreduce.count;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.jena.hadoop.rdf.types.NodeWritable;

public class NodeCountReducer extends Reducer<NodeWritable, LongWritable, NodeWritable, LongWritable> {
    @Override
    protected void reduce(NodeWritable key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long count = 0;
        Iterator<LongWritable> iter = values.iterator();
        while (iter.hasNext()) {
            count += iter.next().get();
        }
        context.write(key, new LongWritable(count));
    }
}

Job Handler

package org.apache.jena.hadoop.rdf.stats;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat;
import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat;
import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer;
import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper;
import org.apache.jena.hadoop.rdf.types.NodeWritable;

public class RdfMapReduceExample {

    public static void main(String[] args) {
        try {
            // Get Hadoop configuration
            Configuration config = new Configuration(true);

            // Create job
            Job job = Job.getInstance(config);
            job.setJarByClass(RdfMapReduceExample.class);
            job.setJobName("RDF Triples Node Usage Count");

            // Map/Reduce classes
            job.setMapperClass(TripleNodeCountMapper.class);
            job.setMapOutputKeyClass(NodeWritable.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setReducerClass(NodeCountReducer.class);

            // Input and Output
            job.setInputFormatClass(TriplesInputFormat.class);
            job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
            FileInputFormat.setInputPaths(job, new Path(args[1]));
            FileOutputFormat.setOutputPath(job, new Path(args[2]));

            // Launch the job and await completion
            job.submit();
            if (job.monitorAndPrintJob()) {
                // OK
                System.out.println("Completed");
            } else {
                // Failed
                System.err.println("Failed");
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

Pom.xml dependencies

<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.jena/jena-elephas-common -->
    <dependency>
        <groupId>org.apache.jena</groupId>
        <artifactId>jena-elephas-common</artifactId>
        <version>3.1.1</version>
    </dependency>

    <dependency>
        <groupId>org.apache.jena</groupId>
        <artifactId>jena-elephas-io</artifactId>
        <version>3.1.1</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-common</artifactId>
        <version>2.7.1</version>
        <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.1</version>
        <scope>provided</scope>
    </dependency>
</dependencies>
1

There are 1 best solutions below

1
On

Your dependency declarations are correct as otherwise your code would not compile at all.

Your problem is that your JAR likely only contains your code and doesn't contain any of the necessary dependencies. So when Map Reduce tries to run your code none of the dependencies are present.

Generally when building for Map Reduce it is best to create a fat JAR that contains your code and all your dependencies. The maven assembly plug-in can be used to do this (you could also use Maven shade if preferred).

Add this to your pom.xml:

      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <descriptors>
            <descriptor>hadoop-job.xml</descriptor>
          </descriptors>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

Add use this hadoop-job.xml:

<assembly>
  <id>hadoop-job</id>
  <formats>
    <format>jar</format>
  </formats>
  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>
    <dependencySet>
      <unpack>false</unpack>
      <scope>provided</scope>
      <outputDirectory>lib</outputDirectory>
      <excludes>
        <exclude>${groupId}:${artifactId}</exclude>
      </excludes>
    </dependencySet>
    <dependencySet>
      <unpack>true</unpack>
      <includes>
        <include>${groupId}:${artifactId}</include>
      </includes>
    </dependencySet>
  </dependencySets>
</assembly>

Essentially this asks Maven to build you a fat JAR that contains all the non-provided dependencies. This will create an additional artefact called your-artifact-VERSION-hadoop-job.jar which you should run instead of the normal JAR