Related to the thread Some Intrinsic SUMS, I am finding the pairwise calculation of the mean is both faster and more accurate than sum(x)/size(x)
For summing single precision arrays with
module mean_mod
implicit none
private
public :: wp, mean_base, mean_accurate, mean_updated, mean_pair
integer, parameter :: wp = kind(1.0), qp = selected_real_kind(2*precision(1.0_wp))
contains
!
recursive pure function mean_pair(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp) :: y
integer :: n, nhalf
real(kind=wp) :: frac
integer, parameter :: size_no_recurse = 32
n = size(x)
if (n <= size_no_recurse) then
y = sum(x)/max(1, size(x))
return
end if
nhalf = n/2
if (mod(n,2) == 0) then
y = 0.5_wp*mean_pair(x(1:nhalf)) + 0.5_wp*mean_pair(x(nhalf+1:n))
else
frac = nhalf/real(n, kind=wp)
y = frac*mean_pair(x(1:nhalf)) + (1.0_wp-frac)*mean_pair(x(nhalf+1:n))
end if
end function mean_pair
!
pure function mean_accurate(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp) :: y
y = sum(real(x, kind=qp))/max(1,size(x)) ! no check for size(x) == 0 since sum(x) = 0.0 then
end function mean_accurate
!
pure function mean_base(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp) :: y
y = sum(x)/max(1,size(x)) ! no check for size(x) == 0 since sum(x) = 0.0 then
end function mean_base
!
pure function mean_updated(x) result(mean)
! based on Alan Miller's code https://jblevins.org/mirror/amiller/update.f90
real(kind=wp), intent(in) :: x(:)
real(kind=wp) :: mean
integer :: i, n
real(kind=wp) :: dev
mean = 0.0_wp
n = 0
do i=1,size(x)
n = n + 1
dev = x(i) - mean
mean = mean + dev/n
end do
end function mean_updated
end module mean_mod
program main
use mean_mod, only: wp, mean_base, mean_pair, mean_updated, mean_accurate
use iso_fortran_env, only: compiler_version, compiler_options
implicit none
integer, parameter :: n = 10**8, ncalc = 4, npow = 4
real(kind=wp), parameter :: base = 10.0_wp**3, xshift = 0.0_wp
real(kind=wp) :: xorig(n), x(n), xmean(ncalc), xscale, t1(ncalc), t2(ncalc)
integer :: icalc, ipow
character (len=20) :: fmt_s
fmt_s = merge("(/,*(a,10x))", "(/,*(a,18x))", wp == kind(1.0))
print*,"compiler_version: ", compiler_version()
print*,"compiler_options: ", compiler_options()
print*,"wp: ", wp
print*,"huge(x): ", huge(x)
print*,"n: ",n
print*,"xshift: ", xshift
call random_seed()
call random_number(xorig)
xorig = xorig - xshift
do ipow=0, npow
xscale = base ** ipow
x = xorig * xscale
do icalc = 1, ncalc
call cpu_time(t1(icalc))
select case (icalc)
case(1)
xmean(1) = mean_base(x)
case(2)
xmean(2) = mean_updated(x)
case (3)
xmean(3) = mean_pair(x)
case(4)
xmean(4) = mean_accurate(x)
end select
call cpu_time(t2(icalc))
end do
print fmt_s, "method:", "base", "updated", "pair", "accurate"
print*,"xscale: ", xscale
print*,"means: " , xmean
print*,"errors: ", abs(xmean - xmean(ncalc))
print*,"rel errors: ", abs(xmean - xmean(ncalc))/xmean(ncalc)
print*,"times: ", t2-t1
end do
end program main
I get
compiler_version: GCC version 13.0.0 20221218 (experimental)
compiler_options: -march=tigerlake -mmmx -mpopcnt -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -mno-sse4a -mno-fma4 -mno-xop -mfma -mavx512f -mbmi -mbmi2 -maes -mpclmul -mavx512vl -mavx512bw -mavx512dq -mavx512cd -mno-avx512er -mno-avx512pf -mavx512vbmi -mavx512ifma -mno-avx5124vnniw -mno-avx5124fmaps -mavx512vpopcntdq -mavx512vbmi2 -mgfni -mvpclmulqdq -mavx512vnni -mavx512bitalg -mno-avx512bf16 -mavx512vp2intersect -mno-3dnow -madx -mabm -mno-cldemote -mclflushopt -mclwb -mno-clzero -mcx16 -mno-enqcmd -mf16c -mfsgsbase -mfxsr -mno-hle -msahf -mno-lwp -mlzcnt -mmovbe -mmovdir64b -mmovdiri -mno-mwaitx -mno-pconfig -mno-pku -mno-prefetchwt1 -mprfchw -mno-ptwrite -mrdpid -mrdrnd -mrdseed -mno-rtm -mno-serialize -mno-sgx -msha -mshstk -mno-tbm -mno-tsxldtrk -mvaes -mno-waitpkg -mno-wbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint --param=l1-cache-size=48 --param=l1-cache-line-size=64 --param=l2-cache-size=12288 -mtune=tigerlake -O3 -flto
wp: 4
huge(x): 3.40282347E+38
n: 100000000
xshift: 0.00000000
method: base updated pair accurate
xscale: 1.00000000
means: 0.167772159 0.500012517 0.500038147 0.500038147
errors: 0.332265973 2.56299973E-05 0.00000000 0.00000000
rel errors: 0.664481223 5.12560837E-05 0.00000000 0.00000000
times: 9.37500000E-02 0.421875000 4.68750000E-02 9.37500000E-02
method: base updated pair accurate
xscale: 1000.00000
means: 171.798691 500.004608 500.038177 500.038147
errors: 328.239441 3.35388184E-02 3.05175781E-05 0.00000000
rel errors: 0.656428814 6.70725203E-05 6.10304980E-08 0.00000000
times: 9.37500000E-02 0.406250000 3.12500000E-02 0.109375000
method: base updated pair accurate
xscale: 1000000.00
means: 175921.859 499999.219 500038.156 500038.125
errors: 324116.250 38.9062500 3.12500000E-02 0.00000000
rel errors: 0.648183048 7.78065660E-05 6.24952321E-08 0.00000000
times: 9.37500000E-02 0.453125000 3.12500000E-02 9.37500000E-02
method: base updated pair accurate
xscale: 1.00000000E+09
means: 180143984. 499981376. 500038144. 500038144.
errors: 319894144. 56768.0000 0.00000000 0.00000000
rel errors: 0.639739454 1.13527341E-04 0.00000000 0.00000000
times: 9.37500000E-02 0.437500000 4.68750000E-02 9.37500000E-02
method: base updated pair accurate
xscale: 9.99999996E+11
means: 1.84467440E+11 4.99978568E+11 5.00038173E+11 5.00038140E+11
errors: 3.15570717E+11 59572224.0 32768.0000 0.00000000
rel errors: 0.631093323 1.19135359E-04 6.55310046E-08 0.00000000
times: 7.81250000E-02 0.421875000 6.25000000E-02 9.37500000E-02
and setting wp = kind(1.0d0)
in mean_mod
to sum double precision arrays, I get
compiler_version: GCC version 13.0.0 20221218 (experimental)
compiler_options: -march=tigerlake -mmmx -mpopcnt -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -mno-sse4a -mno-fma4 -mno-xop -mfma -mavx512f -mbmi -mbmi2 -maes -mpclmul -mavx512vl -mavx512bw -mavx512dq -mavx512cd -mno-avx512er -mno-avx512pf -mavx512vbmi -mavx512ifma -mno-avx5124vnniw -mno-avx5124fmaps -mavx512vpopcntdq -mavx512vbmi2 -mgfni -mvpclmulqdq -mavx512vnni -mavx512bitalg -mno-avx512bf16 -mavx512vp2intersect -mno-3dnow -madx -mabm -mno-cldemote -mclflushopt -mclwb -mno-clzero -mcx16 -mno-enqcmd -mf16c -mfsgsbase -mfxsr -mno-hle -msahf -mno-lwp -mlzcnt -mmovbe -mmovdir64b -mmovdiri -mno-mwaitx -mno-pconfig -mno-pku -mno-prefetchwt1 -mprfchw -mno-ptwrite -mrdpid -mrdrnd -mrdseed -mno-rtm -mno-serialize -mno-sgx -msha -mshstk -mno-tbm -mno-tsxldtrk -mvaes -mno-waitpkg -mno-wbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint --param=l1-cache-size=48 --param=l1-cache-line-size=64 --param=l2-cache-size=12288 -mtune=tigerlake -O3 -flto
wp: 8
huge(x): 1.7976931348623157E+308
n: 100000000
xshift: 0.0000000000000000
method: base updated pair accurate
xscale: 1.0000000000000000
means: 0.50001808444994267 0.50001808444984286 0.50001808444992779 0.50001808444992779
errors: 1.4876988529977098E-014 8.4932061383824475E-014 0.0000000000000000 0.0000000000000000
rel errors: 2.9752900930260033E-014 1.6985797919140990E-013 0.0000000000000000 0.0000000000000000
times: 9.3750000000000000E-002 0.48437500000000000 7.8125000000000000E-002 2.1406250000000000
method: base updated pair accurate
xscale: 1000.0000000000000
means: 500.01808445017298 500.01808444995379 500.01808444992781 500.01808444992781
errors: 2.4516566554666497E-010 2.5977442419389263E-011 0.0000000000000000 0.0000000000000000
rel errors: 4.9031359698994258E-013 5.1953005755716148E-014 0.0000000000000000 0.0000000000000000
times: 9.3750000000000000E-002 0.46875000000000000 6.2500000000000000E-002 2.1562500000000000
method: base updated pair accurate
xscale: 1000000.0000000000
means: 500018.08444989804 500018.08444995317 500018.08444992773 500018.08444992779
errors: 2.9744114726781845E-008 2.5378540158271790E-008 5.8207660913467407E-011 0.0000000000000000
rel errors: 5.9486077907569048E-014 5.0755244555186119E-014 1.1641111136510578E-016 0.0000000000000000
times: 7.8125000000000000E-002 0.48437500000000000 6.2500000000000000E-002 2.1718750000000000
method: base updated pair accurate
xscale: 1000000000.0000000
means: 500018084.45003015 500018084.45005620 500018084.44992781 500018084.44992781
errors: 1.0234117507934570E-004 1.2838840484619141E-004 0.0000000000000000 0.0000000000000000
rel errors: 2.0467494729101988E-013 2.5676752269356833E-013 0.0000000000000000 0.0000000000000000
times: 7.8125000000000000E-002 0.48437500000000000 6.2500000000000000E-002 2.2812500000000000
method: base updated pair accurate
xscale: 1000000000000.0000
means: 500018084449.78223 500018084449.74268 500018084449.92773 500018084449.92780
errors: 0.14556884765625000 0.18511962890625000 6.1035156250000000E-005 0.0000000000000000
rel errors: 2.9112716556320351E-013 3.7022586715018707E-013 1.2206589751077715E-016 0.0000000000000000
times: 9.3750000000000000E-002 0.48437500000000000 6.2500000000000000E-002 2.1718750000000000