@Beliavsky, After playing around a little more here are a couple of things I found related to using Intel’s compilers. For the LANL kahan vector sum routines, compiling with icx instead of icc (with options -Ofast -xHost -mavx2) and linking to ifort generated Fortran objects on my Zen 3 processor gives the wrong answers (as in not exactly equal to the correct answers) compiling them with icc works. Using -Ofast instead of -O3 also helps with the speed. The interesting thing for fsum_v is for the test program I posted above, the times for REAL32 and REAL64 are about the same.
Here are some new timings.
REAL32
loop = 524288.0 time = 3.197800368070602E-002
Fortran serial Kahan = 1234569. time = 0.247595988214016
Fortran vector Kahan = 1234569. time = 8.467298746109009E-002
LANL vector Kahan = 1234569. time = 2.398902177810669E-002
intrinsic SUMM = 524288.0 time = 3.185501694679260E-002
Correct sum = 1234569.
REAL64
loop = 1234568.89114850 time = 3.342800587415695E-002
Fortran serial Kahan = 1234568.89123457 time =
0.247623980045319
Fortran vector Kahan = 1234568.89123457 time =
8.850398659706116E-002
LANL vector Kahan = 1234568.89123457 time =
4.786601662635803E-002
intrinsic SUMM = 1234568.89114850 time =
3.257399797439575E-002
Correct sum = 1234568.89123457
I reworked the LANL code to be truly 32 bit.
Here is my implementation:
#include <x86intrin.h>
static float sum[8] __attribute__ ((aligned (32)));
float kahan_intel_vector_32(float* restrict var, long ncells)
{
#ifndef __PGI
float const zero = 0.0;
__m256 local_sum = _mm256_broadcast_ss((float const*) &zero);
__m256 local_correction = _mm256_broadcast_ss((float const*) &zero);
__m256 var_v;
#ifdef __INTEL_COMPILER
#pragma ivdep
#else
#pragma simd
#endif
#pragma vector aligned
for (long i = 0; i < ncells; i+=8) {
var_v = _mm256_load_ps(&var[i]);
__m256 corrected_next_term = var_v + local_correction;
__m256 new_sum = local_sum + local_correction;
local_correction = corrected_next_term - (new_sum - local_sum);
local_sum = new_sum;
}
__m256 sum_v;
sum_v = local_correction;
sum_v += local_sum;
_mm256_store_ps(sum, sum_v);
struct esum_type{
float sum;
float correction;
} local;
local.sum = 0.0;
local.correction = 0.0;
for (long i = 0; i < 8; i++) {
float corrected_next_term_s = sum[i] + local.correction;
float new_sum_s = local.sum + local.correction;
local.correction = corrected_next_term_s - (new_sum_s - local.sum);
local.sum = new_sum_s;
}
float final_sum = local.sum + local.correction;
#else
float final_sum = 0.0;
#endif
return(final_sum);
}
Here is my C-interop code
Module fast_sums
USE ISO_C_BINDING
USE ISO_FORTRAN_ENV, ONLY: INT32, INT64, REAL32, REAL64
Implicit NONE
PRIVATE
Interface fsum
Module Procedure fsum_32
Module Procedure fsum_64
End Interface
Public fsum
Contains
Function fsum_32(a,n) Result(s)
Integer, Intent(IN) :: n
Real(REAL32), Intent(IN) :: a(n)
Real(REAL32) :: s
Interface
Function c_fsum_32(a, ncells) BIND(C,name="kahan_intel_vector_32")
IMPORT :: C_FLOAT, C_LONG
Real(C_FLOAT), Intent(IN) :: a(*)
Integer(C_LONG), VALUE :: ncells
Real(C_FLOAT) :: c_fsum_32
End Function c_fsum_32
End Interface
Integer(C_LONG) :: c_ncells
If (n < 8) Then
s = SUM(a(1:n))
Else
c_ncells = INT(n, C_LONG)
s = c_fsum_32(a, c_ncells)
End If
End Function fsum_32
Function fsum_64(a,n) Result(s)
Integer, Intent(IN) :: n
Real(REAL64), Intent(IN) :: a(n)
Real(REAL64) :: s
Interface
Function c_fsum_64(a, ncells) BIND(C,name="kahan_intel_vector_64")
IMPORT :: C_DOUBLE, C_LONG
Real(C_DOUBLE), Intent(IN) :: a(*)
Integer(C_LONG), VALUE :: ncells
Real(C_DOUBLE) :: c_fsum_64
End Function c_fsum_64
End Interface
Integer(C_LONG) :: c_ncells
If (n < 4) Then
s = SUM(a(1:n))
Else
c_ncells = INT(n, C_LONG)
s = c_fsum_64(a, c_ncells)
End If
End Function fsum_64
End Module fast_sums
The 64 bit version of the LANL code is the same as the version on the LANL GlobalSums github site. My only mod was to change the function name to kahan_intel_vector_64. Also, CHUNK = 8 appeared to be the sweet spot for fsum_v