GPU parallelism in JavaScript (gpu.js) running slower than a plain loop

This is a fairly specific problem. I recently tested out gpu.js, a library that is supposed to accelerate computations by using WebGL to parallelize them. I made a quick test:

var gpu = new GPU();

// Element-wise product of two vectors, computed on the GPU
function product(v, u) {
  return gpu.createKernel(function(X, Y) {
      return X[this.thread.x] * Y[this.thread.x];
  }).dimensions([v.length])(v, u);
}


var before = new Date().getTime();
console.log(product(numeric.random([100000]), numeric.random([100000])).length);
console.log('Parallel Time: ', (new Date().getTime()) - before);

before = new Date().getTime();
var v = numeric.random([100000]);
var u = numeric.random([100000]);
for (var i = 0; i < v.length; i++) {
  v[i] = v[i] * u[i];
}
console.log(v.length);
console.log('Procedural Time: ', (new Date().getTime()) - before);

And got the following output:

script.js:11 100000 
script.js:12 Parallel Time:  340 
script.js:20 100000 
script.js:21 Procedural Time:  15

The parallel version is over an order of magnitude slower. Is there any reason why this would be the case? I tried this on a few machines with different GPUs, and I have tried a few similar operations as well. Am I doing something wrong, or is it a problem with the library? Is there some way I can improve this?

There are 3 answers below.

Answer 1:

When dealing with the GPU, you have to be aware of overhead.

Calls to gpu.createKernel are likely to be very expensive, because the library has to parse your JavaScript code, generate the corresponding GLSL, and hand it to WebGL to be compiled and linked.

At the very least you'll want to call createKernel once and store the result in a variable that is reused every time you call product.

It's also worth being aware that there is a non-zero cost to moving data to and from the GPU, so you'll see bigger gains from more complicated calculations.
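
A minimal sketch of that approach, assuming the same gpu.js API used in the question (createKernel and dimensions) and a fixed, known vector length:

var gpu = new GPU();

// Create and compile the kernel once, up front. The expensive parsing,
// GLSL generation, and shader compilation happen here, not on every call.
var SIZE = 100000; // assumed fixed vector length
var multiplyKernel = gpu.createKernel(function(X, Y) {
  return X[this.thread.x] * Y[this.thread.x];
}).dimensions([SIZE]);

function product(v, u) {
  // Only the data transfer and the kernel launch happen per call.
  return multiplyKernel(v, u);
}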

Answer 2:

I combed through the source code of their benchmark and found that you only get the speedup when you run a lot of operations in a row, so I do think it is an overhead issue. I put together the following very simple benchmark comparing gpu.js to numeric.js; here it is if anyone is interested:

var gpu = new GPU();

var size = 512;
var scale = 10;
var iterations = 100;

// Scaling up the matrices reduces the effect of precision errors
var A = numeric.mul(numeric.random([size, size]), scale);
var B = numeric.mul(numeric.random([size, size]), scale);

// I know eval is dangerous, but I couldn't get the size into the kernel any other way
function multGen(size) {
  return eval("(function(A, B) { var sum = 0; for (var i=0; i<"+ size +"; i++) {sum += A[this.thread.y][i] * B[i][this.thread.x];} return sum;})")
}

var mat_mult = gpu.createKernel(multGen(size)).dimensions([size, size]);

var before = new Date().getTime();
var parallel = mat_mult(A, B);

// Need to do many computations to get the advantages of the GPU
for(var i = 0; i < iterations; i++) {
  parallel = mat_mult(A, B);
}
var parTime = (new Date().getTime()) - before;
console.log('Parallel Time: ', parTime);

before = new Date().getTime();
var procedural = numeric.dot(A, B);

// Run the same number of iterations on the CPU for a fair comparison
for(var i = 0; i < iterations; i++) {
  procedural = numeric.dot(A, B);
}
var procTime = (new Date().getTime()) - before;
console.log('Procedural Time: ', procTime);

console.log((procTime / parTime) + ' times faster');

// This is for RMSD normalization; flattening and taking min and max that way exceeded the call stack
var max = Math.max(Math.max(...A.map((function(row) {return Math.max(...row);}))), Math.max(...B.map((function(row) {return Math.max(...row);}))))

var min = Math.min(Math.min(...A.map((function(row) {return Math.min(...row);}))), Math.min(...B.map((function(row) {return Math.min(...row);}))))

// The matrices will differ due to precision issues, so the normalized RMSD gives an idea of the difference
var nrmsd = Math.sqrt(numeric.sum(numeric.pow(numeric.sub(parallel, procedural), 2)) / size) / (max - min);

console.log('Normalized RMSD: ', nrmsd);

This gave me the following output:

scriptfour.js:26 Parallel Time:  20490
scriptfour.js:36 Procedural Time:  28736
scriptfour.js:38 1.402440214738897 times faster
scriptfour.js:48 Normalized RMSD:  0.009671934749138042

These results are pretty good. The eval is ugly, but it only runs once when the kernel is created (before the timing starts), and the GPU version still comes out consistently faster. I don't think a setup like this is good for production, but it works well enough here.
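
If you want to avoid eval, the same trick can be done by baking the size into a source string and using the Function constructor instead; this is just an illustrative sketch, and it assumes gpu.js only needs the function's source text to generate its GLSL:

// Same idea without eval: the loop bound is baked into the source string.
// Purely illustrative; multGenNoEval and mat_mult_alt are names of my own.
function multGenNoEval(size) {
  return new Function('A', 'B',
    'var sum = 0;' +
    'for (var i = 0; i < ' + size + '; i++) {' +
    '  sum += A[this.thread.y][i] * B[i][this.thread.x];' +
    '}' +
    'return sum;'
  );
}

var mat_mult_alt = gpu.createKernel(multGenNoEval(size)).dimensions([size, size]);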

Answer 3:

Use:

var t0 = performance.now();
yourFunctionCall();
var t1 = performance.now();
console.log("yourFunctionCall took " + (t1 - t0) + " ms.");

I'm not sure that's the core of the issue, but I've run into problems with Date-based timing too; performance.now() has sub-millisecond resolution and is monotonic, so it's better suited to benchmarks like this.
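
For example, a small helper applying this to the question's benchmark (timeIt is just an illustrative name, not part of any library):

// Hypothetical helper: times a zero-argument function with the
// high-resolution, monotonic timer instead of Date.
function timeIt(label, fn) {
  var t0 = performance.now();
  var result = fn();
  var t1 = performance.now();
  console.log(label + " took " + (t1 - t0).toFixed(2) + " ms");
  return result;
}

// Usage with the question's product() function:
timeIt("Parallel product", function() {
  return product(numeric.random([100000]), numeric.random([100000]));
});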