Suppose we have a system with radix 2^32 (x = a0 * 2^0 + a1 * 2^32 + a2 * 2^64 +...) and we store the coefficients in a vector (a0 in position [0] , a1 in position [1] ...).
This function allows you to calculate the square of x (for simplicity the non-zero coefficients of x occupy only half of the vector).
#include <iostream>
#include <vector>
#include <cstdlib>
#include <stdint.h>
#include <vector>
#include <omp.h>
const int64_t esp_base_N = 32;
const uint64_t base_N = 1ull << esp_base_N;
void sqr_base_N(std::vector<uint64_t> &X, std::vector<uint64_t> &Sqr , const int64_t len_base_N)
{
std::vector<uint64_t> Sqr_t(len_base_N, 0ull);
uint64_t base_N_m1 = base_N - 1ull;
for (int64_t i = 0; i < (len_base_N / 2); i++)
{
uint64_t x_t = X[i];
for (int64_t j = i + 1; j < (len_base_N / 2); j++)
{
uint64_t xy_t = x_t * X[j];
Sqr_t[i + j] += 2ull * (xy_t & base_N_m1);
Sqr_t[i + j + 1] += 2ull * (xy_t >> esp_base_N);
}
x_t *= x_t;
Sqr_t[2 * i] += x_t & base_N_m1;
Sqr_t[2 * i + 1] += x_t >> esp_base_N;
}
uint64_t mul_t;
mul_t = Sqr_t[0] >> esp_base_N;
Sqr[0] = Sqr_t[0] & base_N_m1;
for (int64_t j = 1; j < len_base_N ; j++)
{
Sqr[j] = (Sqr_t[j] + mul_t) & base_N_m1;
mul_t = (Sqr_t[j] + mul_t) >> esp_base_N;
}
}
I don't know if it is possible to speed up the function using multithreading.
I tried using OpenMP but managed to get no error just by applying to the outer for loop, furthermore the function is slower than the previous one.
#pragma omp parallel for
for (int64_t i = 0; i < (len_base_N / 2); i++)
{
uint64_t x_t = X[i];
x_t *= x_t;
Sqr_t[2 * i] += x_t & base_N_m1;
Sqr_t[2 * i + 1] += x_t >> esp_base_N;
}
for (int64_t i = 0; i < (len_base_N / 2); i++)
{
uint64_t x_t = X[i];
for (int64_t j = i + 1; j < (len_base_N / 2); j++)
{
uint64_t xy_t = x_t * X[j];
Sqr_t[i + j] += 2ull * (xy_t & base_N_m1);
Sqr_t[i + j + 1] += 2ull * (xy_t >> esp_base_N);
}
}
Is there an easy way to use multithreading for the function (OpenMP or other) to speed it up?
Edit (to test the function):
int main()
{
int64_t len_base_N = 6;
std::vector<uint64_t> X_t(len_base_N, 0ull);
X_t[0] = 1286608618ull;
X_t[1] = 2;
std::vector<uint64_t> Sqr_X(len_base_N, 0ull);
sqr_base_N(X_t, Sqr_X , len_base_N);
for(int64_t i = 0; i < len_base_N - 1; i++)
std::cout << Sqr_X[i] << ", ";
std::cout << Sqr_X[len_base_N - 1] << std::endl;
return 0;
}