Multithreading vector multiplication

95 Views Asked by At

Suppose we have a system with radix 2^32 (x = a0 * 2^0 + a1 * 2^32 + a2 * 2^64 +...) and we store the coefficients in a vector (a0 in position [0] , a1 in position [1] ...).

This function allows you to calculate the square of x (for simplicity the non-zero coefficients of x occupy only half of the vector).

#include <iostream>
#include <vector>
#include <cstdlib>
#include <stdint.h>
#include <vector>
#include <omp.h>

const int64_t esp_base_N = 32; 
const uint64_t base_N = 1ull << esp_base_N;

void sqr_base_N(std::vector<uint64_t> &X, std::vector<uint64_t> &Sqr , const int64_t len_base_N)
{
    std::vector<uint64_t> Sqr_t(len_base_N, 0ull);
    uint64_t base_N_m1 = base_N - 1ull;
    for (int64_t i = 0; i < (len_base_N / 2); i++)
    {
        uint64_t x_t = X[i];
        for (int64_t j = i + 1; j < (len_base_N / 2); j++)
        {
            uint64_t xy_t = x_t * X[j];
            Sqr_t[i + j] += 2ull * (xy_t & base_N_m1);
            Sqr_t[i + j + 1] += 2ull * (xy_t >> esp_base_N);
        }
        x_t *= x_t;
        Sqr_t[2 * i] += x_t & base_N_m1;
        Sqr_t[2 * i + 1] += x_t >> esp_base_N;
    }

    uint64_t mul_t;
    mul_t = Sqr_t[0] >> esp_base_N;
    Sqr[0] = Sqr_t[0] & base_N_m1;
    for (int64_t j = 1; j < len_base_N  ; j++)
    {
        Sqr[j] = (Sqr_t[j] + mul_t) & base_N_m1;
        mul_t = (Sqr_t[j] + mul_t) >> esp_base_N;
    }
}

I don't know if it is possible to speed up the function using multithreading.

I tried using OpenMP but managed to get no error just by applying to the outer for loop, furthermore the function is slower than the previous one.

#pragma omp parallel for
for (int64_t i = 0; i < (len_base_N / 2); i++)
{
    uint64_t x_t = X[i];
    x_t *= x_t;
    Sqr_t[2 * i] += x_t & base_N_m1;
    Sqr_t[2 * i + 1] += x_t >> esp_base_N;
}
for (int64_t i = 0; i < (len_base_N / 2); i++)
{
    uint64_t x_t = X[i];
    for (int64_t j = i + 1; j < (len_base_N / 2); j++)
    {
        uint64_t xy_t = x_t * X[j];
        Sqr_t[i + j] += 2ull * (xy_t & base_N_m1);
        Sqr_t[i + j + 1] += 2ull * (xy_t >> esp_base_N);
    }
}

Is there an easy way to use multithreading for the function (OpenMP or other) to speed it up?

Edit (to test the function):

int main()
{ 
    int64_t len_base_N = 6;
    std::vector<uint64_t> X_t(len_base_N, 0ull);
    X_t[0] = 1286608618ull;
    X_t[1] = 2;
    std::vector<uint64_t> Sqr_X(len_base_N, 0ull);
    sqr_base_N(X_t, Sqr_X , len_base_N);
    for(int64_t i = 0; i < len_base_N - 1; i++)
        std::cout << Sqr_X[i] << ", ";
    std::cout << Sqr_X[len_base_N - 1] << std::endl;
    return 0;
}
0

There are 0 best solutions below