Why do you insist on the compiler providing parallel code in some hidden way? This simply does not happen for gfortran without something like OpenMP !$omp parallel do
I observe the same thing here, and you can see the exact same timing results between cpu_time
and system_clock
:
module mymod
use, intrinsic :: iso_fortran_env, only: dp => real64, qp => real128
implicit none
private
public :: do_sum, kahan_sum, kbn_sum, qp_sum, pair_sum
contains
pure function do_sum(x) result(xout)
real(dp), intent(in) :: x(:)
real(dp) :: xout
integer :: n, i
n = size(x)
xout = x(1)
do i=2,n
xout = xout + x(i)
end do
end function do_sum
pure function kahan_sum(x) result(xout)
real(dp), intent(in) :: x(:)
real(dp) :: xout, c, y, t
integer :: n, i
n = size(x)
xout = 0.0
c = 0.0
do i=1,n
y = x(i) - c
t = xout + y
c = (t - xout) - y
xout = t
end do
end function kahan_sum
pure function kbn_sum(x) result(xout)
real(dp), intent(in) :: x(:)
real(dp) :: xout, c, t
integer :: n, i
n = size(x)
xout = 0.0
c = 0.0
do i=1,n
t = xout + x(i)
if (abs(xout).ge.abs(x(i))) then
c = c + ((xout - t) + x(i))
else
c = c + ((x(i) - t) + xout)
end if
xout = t
end do
xout = xout + c
end function kbn_sum
pure function qp_sum(x) result(xout)
real(dp), intent(in) :: x(:)
real(dp) :: xout
real(qp) :: temp
integer :: n, i
n = size(x)
temp = real(x(1), qp)
do i=2,n
temp = temp + real(x(i), qp)
end do
xout = temp
end function qp_sum
pure recursive function pair_sum(x) result(xout)
real(dp), intent(in) :: x(:)
real(dp) :: xout
integer :: n
n = size(x)
if (n.le.64) then
xout = sum(x)
else
xout = pair_sum(x(1:n/2)) + pair_sum(x(n/2+1:n))
end if
end function pair_sum
end module mymod
program main
use, intrinsic :: iso_fortran_env, only: dp => real64, i64 => int64, compiler_version, compiler_options
use, non_intrinsic :: mymod
implicit none
integer, parameter :: n = 2**20, trials = 100
real(dp) :: xdp(n), sum_dp, ref_sum, t1, t2
integer(i64) :: c1(trials), c2(trials), cr(trials)
integer :: i
write(*,'(4a)') 'compiled with: ',compiler_version(),' ',compiler_options()
call random_seed(put=[1,9,9,3,0,8,3,1])
call random_number(xdp)
xdp = (xdp - 0.5_dp)*1.0D+0
ref_sum = qp_sum(xdp)
write(*,'(a32,z16)') ' REF_SUM: ',ref_sum
write(*,'(a)') ''
call cpu_time(t1)
sum_dp = do_sum(xdp)
call cpu_time(t2)
write(*,'(a32,z16,a,e22.15)') ' DO_SUM: ',sum_dp,', elapsed (sec): ',t2-t1
call cpu_time(t1)
sum_dp = kahan_sum(xdp)
call cpu_time(t2)
write(*,'(a32,z16,a,e22.15)') 'KAHAN_SUM: ',sum_dp,', elapsed (sec): ',t2-t1
call cpu_time(t1)
sum_dp = kbn_sum(xdp)
call cpu_time(t2)
write(*,'(a32,z16,a,e22.15)') ' KBN_SUM: ',sum_dp,', elapsed (sec): ',t2-t1
call cpu_time(t1)
sum_dp = pair_sum(xdp)
call cpu_time(t2)
write(*,'(a32,z16,a,e22.15)') ' PAIR_SUM: ',sum_dp,', elapsed (sec): ',t2-t1
write(*,'(a)') ''
call system_clock(c1(1), cr(1))
sum_dp = do_sum(xdp)
call system_clock(c2(1))
write(*,'(a32,z16,a,e22.15)') ' DO_SUM: ',sum_dp,', elapsed (sec): ',real(max(c2(1)-c1(1),1_i64),dp)/real(cr(1),dp)
call system_clock(c1(1), cr(1))
sum_dp = kahan_sum(xdp)
call system_clock(c2(1))
write(*,'(a32,z16,a,e22.15)') 'KAHAN_SUM: ',sum_dp,', elapsed (sec): ',real(max(c2(1)-c1(1),1_i64),dp)/real(cr(1),dp)
call system_clock(c1(1), cr(1))
sum_dp = kbn_sum(xdp)
call system_clock(c2(1))
write(*,'(a32,z16,a,e22.15)') ' KBN_SUM: ',sum_dp,', elapsed (sec): ',real(max(c2(1)-c1(1),1_i64),dp)/real(cr(1),dp)
call system_clock(c1(1), cr(1))
sum_dp = pair_sum(xdp)
call system_clock(c2(1))
write(*,'(a32,z16,a,e22.15)') ' PAIR_SUM: ',sum_dp,', elapsed (sec): ',real(max(c2(1)-c1(1),1_i64),dp)/real(cr(1),dp)
write(*,'(a)') ''
end program main
Output on my machine (AMD 5800X), first with -O3 -march=native -flto
, followed by -Ofast -march=native -flto
:
compiled with: GCC version 13.1.1 20230429 -march=znver3 -mmmx -mpopcnt -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -msse4a -mno-fma4 -mno-xop -mfma -mno-avx512f -mbmi -mbmi2 -maes -mpclmul -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512vbmi -mno-avx512ifma -mno-avx5124vnniw -mno-avx5124fmaps -mno-avx512vpopcntdq -mno-avx512vbmi2 -mno-gfni -mvpclmulqdq -mno-avx512vnni -mno-avx512bitalg -mno-avx512bf16 -mno-avx512vp2intersect -mno-3dnow -madx -mabm -mno-cldemote -mclflushopt -mclwb -mclzero -mcx16 -mno-enqcmd -mf16c -mfsgsbase -mfxsr -mno-hle -msahf -mno-lwp -mlzcnt -mmovbe -mno-movdir64b -mno-movdiri -mmwaitx -mno-pconfig -mpku -mno-prefetchwt1 -mprfchw -mno-ptwrite -mrdpid -mrdrnd -mrdseed -mno-rtm -mno-serialize -mno-sgx -msha -mshstk -mno-tbm -mno-tsxldtrk -mvaes -mno-waitpkg -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint -mno-amx-complex --param=l1-cache-size=32 --param=l1-cache-line-size=64 --param=l2-cache-size=512 -mtune=znver3 -O3 -flto -fpre-include=/usr/include/finclude/math-vector-fortran.h
REF_SUM: 40428948990EE6CB
DO_SUM: 40428948990EE85C, elapsed (sec): 0.680000000000000E-03
KAHAN_SUM: 40428948990EE6CB, elapsed (sec): 0.269500000000000E-02
KBN_SUM: 40428948990EE6CB, elapsed (sec): 0.893000000000001E-03
PAIR_SUM: 40428948990EE6C8, elapsed (sec): 0.282000000000001E-03
DO_SUM: 40428948990EE85C, elapsed (sec): 0.678854000000000E-03
KAHAN_SUM: 40428948990EE6CB, elapsed (sec): 0.267238500000000E-02
KBN_SUM: 40428948990EE6CB, elapsed (sec): 0.894569000000000E-03
PAIR_SUM: 40428948990EE6C8, elapsed (sec): 0.277756000000000E-03
compiled with: GCC version 13.1.1 20230429 -march=znver3 -mmmx -mpopcnt -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -msse4a -mno-fma4 -mno-xop -mfma -mno-avx512f -mbmi -mbmi2 -maes -mpclmul -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512vbmi -mno-avx512ifma -mno-avx5124vnniw -mno-avx5124fmaps -mno-avx512vpopcntdq -mno-avx512vbmi2 -mno-gfni -mvpclmulqdq -mno-avx512vnni -mno-avx512bitalg -mno-avx512bf16 -mno-avx512vp2intersect -mno-3dnow -madx -mabm -mno-cldemote -mclflushopt -mclwb -mclzero -mcx16 -mno-enqcmd -mf16c -mfsgsbase -mfxsr -mno-hle -msahf -mno-lwp -mlzcnt -mmovbe -mno-movdir64b -mno-movdiri -mmwaitx -mno-pconfig -mpku -mno-prefetchwt1 -mprfchw -mno-ptwrite -mrdpid -mrdrnd -mrdseed -mno-rtm -mno-serialize -mno-sgx -msha -mshstk -mno-tbm -mno-tsxldtrk -mvaes -mno-waitpkg -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint -mno-amx-complex --param=l1-cache-size=32 --param=l1-cache-line-size=64 --param=l2-cache-size=512 -mtune=znver3 -Ofast -flto -fpre-include=/usr/include/finclude/math-vector-fortran.h
REF_SUM: 40428948990EE6CB
DO_SUM: 40428948990EE9F4, elapsed (sec): 0.180000000000000E-03
KAHAN_SUM: 40428948990EE9F9, elapsed (sec): 0.180000000000000E-03
KBN_SUM: 40428948990EE9F9, elapsed (sec): 0.177999999999998E-03
PAIR_SUM: 40428948990EE6DE, elapsed (sec): 0.122000000000001E-03
DO_SUM: 40428948990EE9F4, elapsed (sec): 0.178633000000000E-03
KAHAN_SUM: 40428948990EE9F9, elapsed (sec): 0.177474000000000E-03
KBN_SUM: 40428948990EE9F9, elapsed (sec): 0.178564000000000E-03
PAIR_SUM: 40428948990EE6DE, elapsed (sec): 0.110982000000000E-03