I have this function in C++
void routine2(float alpha, float beta) {
unsigned int i, j;
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
w[i] = w[i] - beta + alpha * A[i][j] * x[j];
}
And this is the SSE(vectorized) version I wrote
void routine2_vec(float alpha, float beta) {
__m128 alpha_2 = _mm_set1_ps(alpha);
__m128 beta_2 = _mm_set1_ps(beta);
__m128 w2,num1,A2,X2;
__m128 multiple1, multiple2,sub;
unsigned int i, j;
for ( i = 0; i < N; ++i) {
w2 = _mm_setzero_ps();
for ( j = 0; j < (N/4)*4; j += 4) {
num1 = _mm_loadu_ps(&w[i]);
A2 = _mm_loadu_ps(&A[i][j]);
X2 = _mm_loadu_ps(&x[j]);
multiple1 = _mm_mul_ps(A2, X2);
multiple2 = _mm_mul_ps(alpha_2, multiple1);
sub = _mm_sub_ps(num1, beta_2);
w2 = _mm_add_ps(sub, multiple2);
_mm_store_ss(&w[i], w2);
}
}
}
The result of the both function should be the same, so I wrote another function to run and compare the result of both of them.
Test & compare functions
int routine2_test(float alpha, float beta) {
unsigned int i, j;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
test2[i] = test2[i] - beta + alpha * A[i][j] * x[j];
}
}
// compare
for (j = 0; j < N; j++) {
if (equal(w[j], test2[j]) == 1) {
printf("\n The result of w[%d] is not equal to test2[%d] \n", j, j);
return 1;
}
}
return 0;
}
unsigned short int equal(float a, float b) {
float temp = a - b;
//printf("\n %f %f", a, b);
if ((fabs(temp) / fabs(b)) < EPSILON)
return 0; //success
else
return 1; //wrong result
}
The test function runs the primary code and store the result in the test2 array, and the routine2_vec function is do the same with SSE and store the result in w array, then I compare both results in the routine2_test with the equal function.
EDITED
This is remaining part including main and init functions.
#include <stdio.h>
#include <time.h>
#include <pmmintrin.h>
#include <process.h>
#include <chrono>
#include <iostream>
#include <immintrin.h>
#include <omp.h>
#define M 256*128
#define ARITHMETIC_OPERATIONS1 3*M
#define TIMES1 1
#define N 2048
#define ARITHMETIC_OPERATIONS2 4*N*N
#define TIMES2 1
constexpr auto EPSILON =0.0001;
//function declaration
void initialize();
void routine2(float alpha, float beta);
unsigned short int equal(float a, float b);
int routine2_test(float, float);
void routine2_vec(float alpha, float beta);
__declspec(align(64)) float test2[N];
__declspec(align(64)) float A[N][N], x[N], w[N];
int main() {
float alpha = 0.023f, beta = 0.045f;
double run_time, start_time;
unsigned int t;
initialize();
printf("\nRoutine2:");
start_time = omp_get_wtime(); //start timer
for (t = 0; t < TIMES2; t++){
// routine2(alpha, beta);
routine2_vec(alpha, beta);
routine2_test(alpha, beta);
}
run_time = omp_get_wtime() - start_time; //end timer
printf("\n Time elapsed is %f secs \n %e FLOPs achieved\n", run_time, (double)(ARITHMETIC_OPERATIONS2) / ((double)run_time / TIMES2));
return 0;
}
void initialize() {
unsigned int i, j;
//initialize routine2 arrays
for (i = 0; i < N; i++)
for (j = 0; j < N; j++) {
A[i][j] = (i % 99) + (j % 14) + 0.013f;
}
//initialize routine2 arrays
for (i = 0; i < N; i++) {
x[i] = (i % 19) - 0.01f;
w[i] = (i % 5) - 0.002f;
test2[i]= (i % 5) - 0.002f;
}
}
We can start by optimizing the scalar version.
The way
betais used looks suspicious but that is for you to decide.Now we vectorize. You didn't write a proper reduction loop. Here is how it should look like:
Discussions on the best way to do the reduction step can be found here: Fastest way to do horizontal SSE vector sum (or other reduction)
Since you are doing essentially a matrix-vector product, you can instead compute four values along the outer loop. This avoids choking on the loop-carried dependency chain through the accumulator as discussed here: Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? (Unrolling FP loops with multiple accumulators)
Further improvements such as micro-optimizing the loop conditions or aligning one of the inputs can be found here. That also includes code for dealing with the tail of the array if the array size is not a multiple of 4.