Optimizing Code w/ Intel x86 SSE Intrinsics
The following code solves eigen values. It has been accelerated using Intel x86 SSE Intrinsics.
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "benchmark.h"
#include <nmmintrin.h>
#include <smmintrin.h>
#include <omp.h>
/** Computes the dot product of 2 vectors*/
float dotp(float* u, float* A, size_t n);
/** Computes the dot product of 2 vectors without using intrinsics*/
float dotp_naive(float* u, float* A, size_t n);
/** Performs in place transposition of a square matrix */
void transpose_square(float *B, size_t n);
/** Pads a square matrix (of type float) with 0s */
void eig(float *v, float *A, float *u, size_t x, unsigned iters) {
size_t n = x;
if (n > 20) {
transpose_square(A, n);
}
size_t leftover = n % 4;
for (size_t k = 0; k < iters; k += 1) {
/* v_k = Au_{k-1} */
if (n > 35) {
#pragma omp parallel for
for (size_t l = 0; l < n; l += 1) {
for (size_t i = 0; i < n; i += 1) {
v[i + l*n] = dotp(A + i*n, u + l*n, n);
}
}
}
else {
#pragma omp parallel for
for (size_t l = 0; l < n; l += 1) {
for (size_t i = 0; i < n; i += 1) {
v[i + l*n] = dotp(A + i*n, u + l*n, n);
}
}
}
/* mu_k = ||v_k|| */
float mu[n]; float global_sum;
#pragma omp parallel for
for (size_t l = 0; l < n; l += 1) {
mu[l] = sqrt(dotp(v + l*n, v + l*n, n));
}
/* u_k = v_k / mu_k */
float temp;
if (n > 35) {
#pragma omp parallel for private(temp)
for (size_t l = 0; l < n; l += 1) {
temp = mu[l];
for (size_t i = 0; i < n; i += 1) {
u[i + l*n] = v[i + l*n] / temp;
}
}
}
else {
for (size_t l = 0; l < n; l += 1) {
temp = mu[l];
for (size_t i = 0; i < n; i += 1) {
u[i + l*n] = v[i + l*n] / temp;
}
}
}
}
}
/** * Function: dotp
* --------------------
* computes the dotproduct of two vectors:
*n: size of the square matrix (n x n)
*A: a column of the transposed matrix A (column major)
*u: a column of the u matrix (column major)
* returns: a scalar float that is the dotproduct of the two vectors u and A
*/
float dotp(float* u, float* A, size_t n) {
__m128 global_sum = _mm_setzero_ps();
__m128 tempu1, tempu2, tempu3, tempu4, tempA1, tempA2, tempA3, tempA4;//, sum1, sum2, sum3;
//PACKED AND UNROLLED
for (size_t i = 0; i < n / 16 * 16; i += 16) {
tempu1 = _mm_loadu_ps(u + i + 0);
tempA1 = _mm_loadu_ps(A + i + 0);
tempu2 = _mm_loadu_ps(u + i + 4);
tempA2 = _mm_loadu_ps(A + i + 4);
tempu3 = _mm_loadu_ps(u + i + 8);
tempA3 = _mm_loadu_ps(A + i + 8);
tempu4 = _mm_loadu_ps(u + i + 12);
tempA4 = _mm_loadu_ps(A + i + 12);
//the += synatax is finicky with this intrinsic
//global_sum += _mm_dp_ps(tempu1, tempA1, 0xF1) + _mm_dp_ps(tempu2, tempA2, 0xF1) + _mm_dp_ps(tempu3, tempA3, 0xF1) + _mm_dp_ps(tempu4, tempA4, 0xF1);
global_sum = _mm_add_ps(global_sum, _mm_add_ps(_mm_add_ps(_mm_dp_ps(tempu1, tempA1, 0xF1), _mm_dp_ps(tempu2, tempA2, 0xF1)), _mm_add_ps(_mm_dp_ps(tempu3, tempA3, 0xF1), _mm_dp_ps(tempu4, tempA4, 0xF1))));
}
//PACKED (NOT UNROLLED)
for (size_t i = n / 16 * 16; i < n / 4 * 4; i += 4) {
tempu1 = _mm_loadu_ps(u + i + 0);
tempA1 = _mm_loadu_ps(A + i + 0);
//the += synatax is finicky with this intrinsic
//global_sum += _mm_dp_ps(tempu1, tempA1, 0xF1);
global_sum = _mm_add_ps(global_sum, _mm_dp_ps(tempu1, tempA1, 0xF1));
}
float result = _mm_cvtss_f32(global_sum);
//NORMAL (NOT UNPACKED, NOT UNROLLED)
for (size_t i = n / 4 * 4; i < n; i++) {
result += u[i] * A[i];
}
return result;
}
float dotp_naive(float* u, float* A, size_t n) {
float global_sum = 0.0;
#pragma omp parallel
{
#pragma omp for reduction(+:global_sum)
for (size_t i = 0; i < n; i++) {
global_sum += u[i] * A[i];
}
}
return global_sum;
}
/**
* Function: transpose_square
* --------------------
* Performs in-place transposition of the input matrix
*n: size of the square matrix (n x n)
*B: a column major matrix to be transposed
* returns: nothing, the matrix is transposed in place
*/
void transpose_square(float *B, size_t n) {
#pragma omp parallel for
for (size_t row = 0; row < n; row++) {
for (size_t col = row + 1; col < n; col++) {
//swap A(n,m) with A(m,n)
if (row != col) {
float c = B[row + col*n];
B[row + col*n] = B[col + row*n];
B[col + row*n] = c;
}
}
}
}
Comments