Program Listing for File VDVH_kernel.cpp¶
↰ Return to documentation for file (src_util/VDVH_kernel.cpp)
#include <VDVH_kernel.hpp>
#include <vector>
#include <cstring>
#include "types.hpp"
#ifdef HAVE_AVX2
#include <immintrin.h>
#endif
/*
Some refs:
https://en.algorithmica.org/hpc/algorithms/matmul/
https://sci-hub.st/https://dl.acm.org/doi/10.1145/1356052.1356053
https://ia601407.us.archive.org/23/items/cnx-org-col11136/high-performance-computing.pdf
https://people.freebsd.org/~lstewart/articles/cpumemory.pdf
*/
//
// NAIVE, ORIGINAL VERSION
//
//Double version
void VDVH_naive(std::vector<Complex> &G, const std::vector<double> &V, const std::vector<Complex> &D, const int L, const int M) {
for(size_t i=0; i<M; ++i){
for(size_t a=0; a<L; ++a){
Complex vai = V[a+i*L]*D[i];
for(size_t b=0; b<L; ++b){
G[b + a*L] += vai*V[b+i*L]; // original was G(a,b) but was wrong with complex operators
}
}
}
}
//Complex version
void VDVH_naive(std::vector<Complex> &G, const std::vector<Complex> &V, const std::vector<Complex> &D, const int L, const int M) {
for(size_t i=0; i<M; ++i){
for(size_t a=0; a<L; ++a){
Complex vai = V[a+i*L]*D[i];
for(size_t b=0; b<L; ++b){
G[b + a*L] += vai*conjugate(V[b+i*L]); // original was G(a,b) but was wrong with complex operators
}
}
}
}
#ifdef HAVE_AVX2
//
// KERNEL AVX2 version
//
// Double
#define KERNEL_HEIGH_D 4
#define KERNEL_WIDTH_D 2
void kernel_avx2(Complex* G, const double* V, const Complex* D, const int x, const int y, const int l, const int r, const int M, const int L)
{
__m256d res[KERNEL_HEIGH_D][KERNEL_WIDTH_D] = {0.}; //hold two complex type
__m256d reg_temp = {0.}, reg_temp2 = {0.};
Complex temp;
for(int k=l; k<r; k++) { //k inner dim to reduce (V column, square size of D)
//loops must be unrooled
for (int i = 0; i<KERNEL_HEIGH_D; i++) {
//broadcast lines of V(x+i,k) * D(k) into a register
temp = V[x+i + k*L] * D[k];
reg_temp = _mm256_set_pd(temp.imag(),temp.real(),temp.imag(),temp.real());
//now multiply the temp register by column of B
for (int j = 0; j < KERNEL_WIDTH_D; j++) {
//we should take indice V^T(k,y+j) and V^T(k,y+j+1)
//so for V: V(y+j,k) and V(y+j+1,k)
int index = y+2*j + k*L;
reg_temp2 = _mm256_set_pd(V[index+1],V[index+1],V[index],V[index]);
//res[i][j] += reg_temp2 * reg_temp; // as a vec register and FMA
res[i][j] = _mm256_fmadd_pd(reg_temp2, reg_temp, res[i][j]);
}
}
}
// write the results back to G considering symmetry
double* _G = (double*) G;
for (int j = 0; j < KERNEL_WIDTH_D; j++) {
for (int i = 0; i < KERNEL_HEIGH_D; i++) {
if (x+i > y+2*j+1) {
_G[2*(x+i + (y+2*j+1)*L)] += res[i][j][2]; //lower triangle
_G[2*(x+i + (y+2*j+1)*L)+1] += res[i][j][3];
_G[2*(y+2*j+1 + (x+i)*L)] += res[i][j][2]; //upper triangle
_G[2*(y+2*j+1 + (x+i)*L)+1] += res[i][j][3];
_G[2*(x+i + (y+2*j)*L)] += res[i][j][0]; //lower triangle
_G[2*(x+i + (y+2*j)*L)+1] += res[i][j][1];
_G[2*(y+2*j + (x+i)*L)] += res[i][j][0]; //upper triangle
_G[2*(y+2*j + (x+i)*L)+1] += res[i][j][1];
}
else if (x+i == y+2*j+1) {
_G[2*(x+i + (y+2*j+1)*L)] += res[i][j][2]; //diagonal
_G[2*(x+i + (y+2*j+1)*L)+1] += res[i][j][3];
_G[2*(x+i + (y+2*j)*L)] += res[i][j][0]; //lower triangle
_G[2*(x+i + (y+2*j)*L)+1] += res[i][j][1];
_G[2*(y+2*j + (x+i)*L)] += res[i][j][0]; //upper triangle
_G[2*(y+2*j + (x+i)*L)+1] += res[i][j][1];
}
else if (x+i == y+2*j) {
_G[2*(x+i + (y+2*j)*L)] += res[i][j][0]; //diagonal
_G[2*(x+i + (y+2*j)*L)+1] += res[i][j][1];
}
}
}
}
void kernel_avx2_hor(Complex* G, const double* V, const Complex* D, const int x, const int y, const int l, const int r, const int M, const int L)
{
__m256d res[KERNEL_WIDTH_D] = {0.}; //hold two complex type
__m256d reg_temp = {0.}, reg_temp2 = {0.};
Complex temp;
for(int k=l; k<r; k++) {
temp = V[x + k*L] * D[k];
reg_temp = _mm256_set_pd(temp.imag(),temp.real(),temp.imag(),temp.real());
for (int j = 0; j < KERNEL_WIDTH_D; j++) {
int index = y+2*j + k*L;
reg_temp2 = _mm256_set_pd(V[index+1],V[index+1],V[index],V[index]);
res[j] = _mm256_fmadd_pd(reg_temp2, reg_temp, res[j]);
}
}
// write the results back to G considering symmetry
double* _G = (double*) G;
for (int j = 0; j < KERNEL_WIDTH_D; j++) {
if (x > y+2*j+1) {
_G[2*(x + (y+2*j+1)*L)] += res[j][2]; //lower triangle
_G[2*(x + (y+2*j+1)*L)+1] += res[j][3];
_G[2*(y+2*j+1 + x*L)] += res[j][2]; //upper triangle
_G[2*(y+2*j+1 + x*L)+1] += res[j][3];
_G[2*(x + (y+2*j)*L)] += res[j][0]; //lower triangle
_G[2*(x + (y+2*j)*L)+1] += res[j][1];
_G[2*(y+2*j + x*L)] += res[j][0]; //upper triangle
_G[2*(y+2*j + x*L)+1] += res[j][1];
}
else if (x == y+2*j+1) {
_G[2*(x + (y+2*j+1)*L)] += res[j][2]; //diagonal
_G[2*(x + (y+2*j+1)*L)+1] += res[j][3];
_G[2*(x + (y+2*j)*L)] += res[j][0]; //lower triangle
_G[2*(x + (y+2*j)*L)+1] += res[j][1];
_G[2*(y+2*j + x*L)] += res[j][0]; //upper triangle
_G[2*(y+2*j + x*L)+1] += res[j][1];
}
else if (x == y+2*j) {
_G[2*(x + (y+2*j)*L)] += res[j][0]; //diagonal
_G[2*(x + (y+2*j)*L)+1] += res[j][1];
}
}
}
void VDVH_kernel_avx2(std::vector<Complex> &G, const std::vector<double> &V, const std::vector<Complex> &D, const int L, const int M) {
//G is LpadH * L
const int LpadH = (L + 2*KERNEL_WIDTH_D-1) / (2*KERNEL_WIDTH_D) * (2*KERNEL_WIDTH_D);
//padding the output matrix to fit the kernel (to remove later)
std::vector<Complex> _G; _G.resize(L * LpadH, 0.);
//using the main kernel
for (int x = 0; x <= L-KERNEL_HEIGH_D; x += KERNEL_HEIGH_D)
for (int y = 0; y < x+KERNEL_HEIGH_D; y += 2*KERNEL_WIDTH_D)
kernel_avx2(_G.data(), V.data(), D.data(), x, y, 0, M, M, L);
//using the 1xKERNEL_WIDTH_D kernel to finish
for (int x = L/KERNEL_HEIGH_D*KERNEL_HEIGH_D; x < L; x += 1)
for (int y = 0; y <= x; y += 2*KERNEL_WIDTH_D)
kernel_avx2_hor(_G.data(), V.data(), D.data(), x, y, 0, M, M, L);
for (int i = 0; i < L*L; i++) G[i] += _G[i];//std::copy(_G.begin()+ i*L, _G.begin()+i*L+L, G.begin()+i*L);
_G.resize(0);
}
// Complex
#define KERNEL_HEIGH_C 3
#define KERNEL_WIDTH_C 2
void kernel_avx2(Complex* G, const Complex* V, const Complex* D, const int x, const int y, const int l, const int r, const int M, const int L)
{
__m256d res[KERNEL_HEIGH_C][KERNEL_WIDTH_C] = {0.}; //hold two complex type
__m256d reg_temp = {0.}, reg_temp2 = {0.};
Complex temp;
double* _V = (double*) V;
for(int k=l; k<r; k++) { //k inner dim to reduce (V column, square size of D)
//loops must be unrooled
for (int i = 0; i<KERNEL_HEIGH_C; i++) {
temp = V[x+i + k*L] * D[k];
reg_temp = _mm256_set_pd(temp.imag(),temp.real(),temp.imag(),temp.real());
for (int j = 0; j < KERNEL_WIDTH_C; j++) {
int index = 2*(y+2*j + k*L);
reg_temp2 = _mm256_set_pd(_V[index+2],_V[index+2],_V[index],_V[index]); //real part
res[i][j] = _mm256_fmadd_pd(reg_temp2, reg_temp, res[i][j]);
}
reg_temp = _mm256_set_pd(-temp.real(),temp.imag(),-temp.real(),temp.imag());
for (int j = 0; j < KERNEL_WIDTH_C; j++) {
int index = 2*(y+2*j + k*L);
reg_temp2 = _mm256_set_pd(_V[index+3],_V[index+3],_V[index+1],_V[index+1]); //imag part (conjugate)
res[i][j] = _mm256_fmadd_pd(reg_temp2, reg_temp, res[i][j]);
}
}
}
// write the results back to G
for (int j = 0; j < KERNEL_WIDTH_C; j++) {
for (int i = 0; i < KERNEL_HEIGH_C; i++) {
G[(x+i) + (y+2*j)*L] = res[i][j][0] + Complex(0., res[i][j][1]);
G[(x+i) + (y+2*j+1)*L] += res[i][j][2] + Complex(0., res[i][j][3]);
}
}
}
void kernel_avx2_hor(Complex* G, const Complex* V, const Complex* D, const int x, const int y, const int l, const int r, const int M, const int L)
{
__m256d res[KERNEL_WIDTH_C] = {0.}; //hold two complex type
__m256d reg_temp = {0.}, reg_temp2 = {0.};
Complex temp;
double* _V = (double*) V;
for(int k=l; k<r; k++) {
temp = V[x + k*L] * D[k];
reg_temp = _mm256_set_pd(temp.imag(),temp.real(),temp.imag(),temp.real());
for (int j = 0; j < KERNEL_WIDTH_C; j++) {
int index = 2*(y+2*j + k*L);
reg_temp2 = _mm256_set_pd(_V[index+2],_V[index+2],_V[index],_V[index]); //real part
res[j] = _mm256_fmadd_pd(reg_temp2, reg_temp, res[j]);
}
reg_temp = _mm256_set_pd(-temp.real(),temp.imag(),-temp.real(),temp.imag());
for (int j = 0; j < KERNEL_WIDTH_C; j++) {
int index = 2*(y+2*j + k*L);
reg_temp2 = _mm256_set_pd(_V[index+3],_V[index+3],_V[index+1],_V[index+1]); //imag part (conjugate)
res[j] = _mm256_fmadd_pd(reg_temp2, reg_temp, res[j]);
}
}
// write the results back to G
for (int j = 0; j < KERNEL_WIDTH_C; j++) {
G[x + (y+2*j)*L] = res[j][0] + Complex(0.,res[j][1]);
G[x + (y+2*j+1)*L] += res[j][2] + Complex(0., res[j][3]);
}
}
void VDVH_kernel_avx2(std::vector<Complex> &G, const std::vector<Complex> &V, const std::vector<Complex> &D, const int L, const int M) {
//G is LpadH * L
const int LpadH = (L + 2*KERNEL_WIDTH_C-1) / (2*KERNEL_WIDTH_C) * (2*KERNEL_WIDTH_C);
//padding the output matrix to fit the kernel (to remove later)
std::vector<Complex> _G; _G.resize(L * LpadH, 0.);
//using the main kernel
for (int x = 0; x <= L-KERNEL_HEIGH_C; x += KERNEL_HEIGH_C)
for (int y = 0; y < x+KERNEL_HEIGH_C; y += 2*KERNEL_WIDTH_C)
kernel_avx2(_G.data(), V.data(), D.data(), x, y, 0, M, M, L);
//using the 1xKERNEL_WIDTH_C kernel to finish
for (int x = L/KERNEL_HEIGH_C*KERNEL_HEIGH_C; x < L; x += 1)
for (int y = 0; y <= LpadH; y += 2*KERNEL_WIDTH_C)
kernel_avx2_hor(_G.data(), V.data(), D.data(), x, y, 0, M, M, L);
for (int i = 0; i < L*L; i++) G[i] += _G[i];
_G.resize(0);
}
#endif