Why did an OpenCL rendering optimization make my code slower?

68 Views Asked by At

I'm trying to optimize my 3D rendering OpenCL code. Currently, I render a 3D triangle by dividing its bounding box into 2x2 tiles and doing necessary computations for each pixel inside the tile. This turned out to be slightly faster than giving each work item just a single pixel, because based on the barycentric coordinates of point (x,y), it's easy to update them and get the coordinates for (x+1,y) etc. Here's the implementation of this idea:

// vec3 is a shorter alias for float3, used for the triangle vertices passed to the kernels below.
typedef float3 vec3;

// Rasterizes one flat-colored triangle; each work-item handles one 2x2 pixel tile.
//
// depthBuffer   - per-pixel depth values; a pixel is overwritten only when the
//                 interpolated value is greater than the stored one
// colorArray    - per-pixel colors; receives clr wherever the depth test passes
// v1,v2,v3      - (x,y,1/z) coordinates of the triangle's projection onto the screen
// clr           - color written for every covered pixel
// screen_width, screen_height - used to turn (x,y) into a buffer index; the formula
//                 offsets both coordinates by half the screen size
// minX,minY     - top left of the triangle's bounding box
// inv           - the precomputed denominator in the barycentric coords formula:
//                 1/(v1.x*v2.y - v1.x*v3.y - v1.y*v2.x + v1.y*v3.x + v2.x*v3.y - v2.y*v3.x)
//
// NOTE(review): no bounds checks are done on index; this presumably relies on the
// host launching only tiles that lie fully inside the screen - confirm.
__kernel void draw(__global float* depthBuffer,__global int* colorArray,
                   vec3 v1, vec3 v2, vec3 v3, int clr,
                   int screen_width, int screen_height, int minX, int minY, float inv) {

    // x,y of top-left corner of tile:
    int x = 2*(get_global_id(0))+minX; // minX,minY - top left of triangle bounding box
    int y = 2*(get_global_id(1))+minY; // 2 is the tile size

    // Barycentric coordinates of the top-left corner of the tile.
    // lambda1 is the weight of v2, lambda2 the weight of v3, and
    // (1-lambda1-lambda2) the weight of v1 (see the depth interpolation below).
    float lambda1 = (-v1.x*v3.y + v1.x*y + v1.y*v3.x - v1.y*x - v3.x*y + v3.y*x)*inv;
    float lambda2 = (v1.x*v2.y - v1.x*y - v1.y*v2.x + v1.y*x + v2.x*y - v2.y*x)*inv;
    // The barycentric coordinates change by these amounts when x (resp. y) increases by 1.
    float dx1 = (v3.y - v1.y)*inv, dx2 = (v1.y - v2.y)*inv;
    float dy1 = (v1.x - v3.x)*inv, dy2 = (v2.x - v1.x)*inv;

    // Buffer index of the tile's top-left pixel (x,y). The original code referenced
    // an undeclared `pos` here; the tile corner is held in x and y.
    int index = (y + (screen_height / 2)) * screen_width + (x + (screen_width / 2));

    // (x,y): test if top-left pixel is inside triangle's projection
    if(lambda1>=0 && lambda2>=0 && lambda1+lambda2<=1){
        float f = lambda2*v3.z + lambda1*v2.z + (1-lambda1-lambda2)*v1.z; // depth
        if (f > depthBuffer[index]) {
            depthBuffer[index] = f;
            colorArray[index] = clr;
        }
    }

    // (x+1,y): step one pixel to the right
    ++index;
    lambda1 += dx1;
    lambda2 += dx2;
    if(lambda1>=0 && lambda2>=0 && lambda1+lambda2<=1){
        float f = lambda2*v3.z + lambda1*v2.z + (1-lambda1-lambda2)*v1.z;
        if (f > depthBuffer[index]) {
            depthBuffer[index] = f;
            colorArray[index] = clr;
        }
    }

    // (x+1,y+1): step one row down
    index+=screen_width;
    lambda1 += dy1;
    lambda2 += dy2;
    if(lambda1>=0 && lambda2>=0 && lambda1+lambda2<=1){
        float f = lambda2*v3.z + lambda1*v2.z + (1-lambda1-lambda2)*v1.z;
        if (f > depthBuffer[index]) {
            depthBuffer[index] = f;
            colorArray[index] = clr;
        }
    }

    // (x,y+1): step one pixel back to the left
    --index;
    lambda1 -= dx1;
    lambda2 -= dx2;
    if(lambda1>=0 && lambda2>=0 && lambda1+lambda2<=1){
        float f = lambda2*v3.z + lambda1*v2.z + (1-lambda1-lambda2)*v1.z;
        if (f > depthBuffer[index]) {
            depthBuffer[index] = f;
            colorArray[index] = clr;
        }
    }
}

I tried to make it even faster, so I did something similar but for bigger tiles (4x4, ..., 8x8), but the results were very underwhelming. My idea was to check whether any of the tile's corners lies inside the triangle's projection. If not, the tile is skipped and no computations are done for the remaining pixels inside it. Otherwise, I do the computations for all the pixels inside the tile in the following order:

enter image description here

Here's the code:

// Rasterizes one flat-colored triangle; each work-item handles one tileW x tileH
// pixel tile of the triangle's bounding box.
//
// depthBuffer   - per-pixel depth values; a pixel is overwritten only when the
//                 interpolated value is greater than the stored one
// colorArray    - per-pixel colors; receives clr wherever the depth test passes
// v1,v2,v3      - triangle vertices; x,y are used as screen-space position and z is
//                 the quantity interpolated for the depth test
// clr           - color written for every covered pixel
// screen_width, screen_height - used to turn (x,y) into a buffer index; the formula
//                 offsets both coordinates by half the screen size
// minX,minY     - origin of the tile grid (tile (0,0) starts at this pixel)
// inv           - precomputed scale factor applied to the barycentric numerators
//
// NOTE(review): no bounds checks are done on index; this presumably relies on the
// host launching only tiles that lie fully inside the screen - confirm.
__kernel void draw(__global float * depthBuffer, __global int * colorArray,
    vec3 v1, vec3 v2, vec3 v3, int clr,
    int screen_width, int screen_height, int minX, int minY, float inv) {
    const int tileW = 4, tileH = 4;
    // top-left pixel of this work-item's tile
    int x = tileW * (get_global_id(0)) + minX;
    int y = tileH * (get_global_id(1)) + minY;

    // Barycentric coordinates of the top-left pixel: lambda1 weights v2,
    // lambda2 weights v3, and (1 - lambda1 - lambda2) weights v1.
    float lambda1 = (-v1.x * v3.y + v1.x * y + v1.y * v3.x - v1.y * x - v3.x * y + v3.y * x) * inv;
    float lambda2 = (v1.x * v2.y - v1.x * y - v1.y * v2.x + v1.y * x + v2.x * y - v2.y * x) * inv;
    float dx1 = (v3.y - v1.y) * inv, dx2 = (v1.y - v2.y) * inv; // change per +1 in x
    float dy1 = (v1.x - v3.x) * inv, dy2 = (v2.x - v1.x) * inv; // change per +1 in y

    // Conservative tile rejection.
    // Checking only whether a tile corner is inside the triangle is not enough:
    // a triangle smaller than the tile, a thin sliver, or a vertex poking into
    // the tile covers pixels without covering any corner. Each barycentric
    // coordinate is affine in (x,y), so over the tile it reaches its maximum at
    // one of the four corner pixels. If that maximum is negative for any of the
    // three coordinates, every pixel of the tile is outside and can be skipped.
    float spanX1 = dx1 * (tileW - 1), spanY1 = dy1 * (tileH - 1);
    float spanX2 = dx2 * (tileW - 1), spanY2 = dy2 * (tileH - 1);
    float lambda3 = 1 - lambda1 - lambda2;
    float max1 = lambda1 + fmax(spanX1, 0.0f) + fmax(spanY1, 0.0f);
    float max2 = lambda2 + fmax(spanX2, 0.0f) + fmax(spanY2, 0.0f);
    float max3 = lambda3 + fmax(-(spanX1 + spanX2), 0.0f) + fmax(-(spanY1 + spanY2), 0.0f);
    if (max1 < 0 || max2 < 0 || max3 < 0) {
        return; // the tile is completely outside the triangle's projection
    }

    int index = (y + (screen_height / 2)) * screen_width + (x + (screen_width / 2));
    int n = tileW * tileH;
    // Visit the tile's pixels in a serpentine order starting at the top-left pixel:
    // level == 0 rows are walked left-to-right, level == 1 rows right-to-left, so
    // every step moves to an adjacent pixel and the coordinates are updated
    // incrementally instead of being recomputed from the full formula.
    int level = 0, tx = 0;
    for (int i = 0; i < n; ++i) {
        if (lambda1 >= 0 && lambda2 >= 0 && lambda1 + lambda2 <= 1) {
            float f = lambda2 * v3.z + lambda1 * v2.z + (1 - lambda1 - lambda2) * v1.z;
            if (f > depthBuffer[index]) {
                depthBuffer[index] = f;
                colorArray[index] = clr;
            }
        }
        if ((!level && tx == tileW - 1) || (level && tx == 0)) {
            // end of the current row: drop to the next row and reverse direction
            index += screen_width;
            lambda1 += dy1;
            lambda2 += dy2;
            level ^= 1;
        } else if (!level) {
            ++tx;
            ++index;
            lambda1 += dx1;
            lambda2 += dx2;
        } else {
            --tx;
            --index;
            lambda1 -= dx1;
            lambda2 -= dx2;
        }
    }
}

At best, this approach was slightly slower than the previous one, and only when I set the tile size to 2x2 (which defeats the purpose). For bigger tile sizes, it just kept getting worse.

I don't understand these results. Fewer pixels that lie outside the triangle are tested and I use the full formula for barycentric coordinates for fewer pixels in the second part of the code (Only for the top-left pixel inside a tile).

Can someone explain why this is happening and how to further optimize my code? I'm running this on Ubuntu with the integrated GPU that comes with i5-1240p.

I based my ideas on this paper: https://www.digipen.edu/sites/default/files/public/docs/theses/salem-haykal-digipen-master-of-science-in-computer-science-thesis-an-optimized-triangle-rasterizer.pdf

0

There are 0 best solutions below