Optimizing Code w/ Intel x86 SSE Intrinsics

3 minute read
The following code solves eigen values. It has been accelerated using Intel x86 SSE Intrinsics.
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "benchmark.h"
#include <nmmintrin.h>
#include <smmintrin.h>
#include <omp.h>

/** Computes the dot product of 2 vectors*/
float dotp(float* u, float* A, size_t n);
/** Computes the dot product of 2 vectors without using intrinsics*/
float dotp_naive(float* u, float* A, size_t n);
/** Performs in place transposition of a square matrix */
void transpose_square(float *B, size_t n);

/** Pads a square matrix (of type float) with 0s */
void eig(float *v, float *A, float *u, size_t x, unsigned iters) {
	size_t n = x;
	if (n > 20) {
		transpose_square(A, n);
	}
	size_t leftover = n % 4;
	for (size_t k = 0; k < iters; k += 1) {
		/* v_k = Au_{k-1} */
		if (n > 35) {
#pragma omp parallel for
			for (size_t l = 0; l < n; l += 1) {
				for (size_t i = 0; i < n; i += 1) {
					v[i + l*n] = dotp(A + i*n, u + l*n, n);
				}
			}
		}
		else {
#pragma omp parallel for
			for (size_t l = 0; l < n; l += 1) {
				for (size_t i = 0; i < n; i += 1) {
					v[i + l*n] = dotp(A + i*n, u + l*n, n);
				}
			}
		}
		/* mu_k = ||v_k|| */
		float mu[n]; float global_sum;
#pragma omp parallel for
		for (size_t l = 0; l < n; l += 1) {
			mu[l] = sqrt(dotp(v + l*n, v + l*n, n));
		}
		/* u_k = v_k / mu_k */
		float temp;
		if (n > 35) {
#pragma omp parallel for private(temp)
			for (size_t l = 0; l < n; l += 1) {
				temp = mu[l];
				for (size_t i = 0; i < n; i += 1) {
					u[i + l*n] = v[i + l*n] / temp;

				}
			}
		}
		else {
			for (size_t l = 0; l < n; l += 1) {
				temp = mu[l];
				for (size_t i = 0; i < n; i += 1) {
					u[i + l*n] = v[i + l*n] / temp;
				}
			}
		}
	}
}

/** * Function: dotp
* --------------------
* computes the dotproduct of two vectors:
*n: size of the square matrix (n x n)
*A: a column of the transposed matrix A (column major)
*u: a column of the u matrix (column major)
* returns: a scalar float that is the dotproduct of the two vectors u and A
*/
float dotp(float* u, float* A, size_t n) {
	__m128 global_sum = _mm_setzero_ps();
	__m128 tempu1, tempu2, tempu3, tempu4, tempA1, tempA2, tempA3, tempA4;//, sum1, sum2, sum3;
	//PACKED AND UNROLLED
	for (size_t i = 0; i < n / 16 * 16; i += 16) {
		tempu1 = _mm_loadu_ps(u + i + 0);
		tempA1 = _mm_loadu_ps(A + i + 0);
		tempu2 = _mm_loadu_ps(u + i + 4);
		tempA2 = _mm_loadu_ps(A + i + 4);
		tempu3 = _mm_loadu_ps(u + i + 8);
		tempA3 = _mm_loadu_ps(A + i + 8);
		tempu4 = _mm_loadu_ps(u + i + 12);
		tempA4 = _mm_loadu_ps(A + i + 12);
		//the += synatax is finicky with this intrinsic
		//global_sum += _mm_dp_ps(tempu1, tempA1, 0xF1) + _mm_dp_ps(tempu2, tempA2, 0xF1) + _mm_dp_ps(tempu3, tempA3, 0xF1) + _mm_dp_ps(tempu4, tempA4, 0xF1); 
		global_sum = _mm_add_ps(global_sum, _mm_add_ps(_mm_add_ps(_mm_dp_ps(tempu1, tempA1, 0xF1), _mm_dp_ps(tempu2, tempA2, 0xF1)), _mm_add_ps(_mm_dp_ps(tempu3, tempA3, 0xF1), _mm_dp_ps(tempu4, tempA4, 0xF1))));
	}

	//PACKED (NOT UNROLLED)
	for (size_t i = n / 16 * 16; i < n / 4 * 4; i += 4) {
		tempu1 = _mm_loadu_ps(u + i + 0);
		tempA1 = _mm_loadu_ps(A + i + 0);
		//the += synatax is finicky with this intrinsic
		//global_sum += _mm_dp_ps(tempu1, tempA1, 0xF1);

		global_sum = _mm_add_ps(global_sum, _mm_dp_ps(tempu1, tempA1, 0xF1));
	}
	float result = _mm_cvtss_f32(global_sum);
	//NORMAL (NOT UNPACKED, NOT UNROLLED)
	for (size_t i = n / 4 * 4; i < n; i++) {
		result += u[i] * A[i];
	}
	return result;
}


float dotp_naive(float* u, float* A, size_t n) {
	float global_sum = 0.0;
#pragma omp parallel
	{
#pragma omp for reduction(+:global_sum)
		for (size_t i = 0; i < n; i++) {
			global_sum += u[i] * A[i];
		}
	}
	return global_sum;
}

/**
* Function: transpose_square
* --------------------
* Performs in-place transposition of the input matrix
*n: size of the square matrix (n x n)
*B: a column major matrix to be transposed
* returns: nothing, the matrix is transposed in place
*/
void transpose_square(float *B, size_t n) {
#pragma omp parallel for
	for (size_t row = 0; row < n; row++) {
		for (size_t col = row + 1; col < n; col++) {
			//swap A(n,m) with A(m,n)
			if (row != col) {
				float c = B[row + col*n];
				B[row + col*n] = B[col + row*n];
				B[col + row*n] = c;
			}
		}
	}
}
Twitter Facebook LinkedIn
David Young

Optimizing Code w/ Intel x86 SSE Intrinsics

Comments

You May Also Enjoy

Interactive Neural Network Visualizer

Practical ML: Detecting Out-of-Distribution Data

Automating Free Staging Deployments for Github Pages

Performant Clustering of Geo Coordinates w/ Custom Distance Functions