@RonShepard
I have now included a modified j_sum as je_sum, that uses e_sum for sums < 65.
I also introduced a modified test order, as this can influence some large memory tests, but not so much help here for e_sum (Array x is 400MBytes ?).
We possibly need a random vector of type 3, with greater exponent range for x, say x = 10**(10*x), to better demonstrate the error response or overflow in the accumulator. ( edit: now included )
The results of your e_sum test vs je_sum > e_sum are surprisingly different for elapsed time. This is the type of results I attribute to better L1 cache usage/variability, but without any definate knowledge.
Would be pleased to get any comments / suggestions
Anyway, I have done these tests on Ryzen 5900X, Win 10 and Gfortran 11.1.0, and included alternative compile options in the lists.
program sums
! use, intrinsic :: iso_fortran_env, only: dp=>real64
use iso_fortran_env, only: dp=>real64
implicit none
integer, parameter :: n=10**8
integer :: itype, k, order(6) = [ 1, 5, 6, 2, 3, 4 ]
real :: x(n), s1, s2, s3, s4, s5, s6
real(dp) :: cpu0, cpu1
character(*), parameter :: sfmt = '(a,es15.7,1x,z8,f11.6," seconds")'
write(*,'("Vector Size n = ",i0)') n
do itype = 1, 3
write(*,'("Random vector of type ",i0)') itype
select case (itype)
case (1)
call random_number(x)
x = x * 0.5 + 0.5 ! values between .5 and 1.0 have the same exponent.
case (2)
call random_number(x)
x = 256 * (x - 0.5) ! mixed signs and exponents.
case (3)
call random_number(x)
x = 10 ** (10*x) ! greater exponent range
end select
do k = 1, 6
select case ( order(k) )
case (1)
cpu0 = delta_sec ()
s1 = sum(x)
cpu0 = delta_sec ()
write(*,sfmt) 'intrinsic:', s1, transfer(s1,1), cpu0
case (2)
cpu0 = delta_sec ()
s2 = e_sum(x)
cpu0 = delta_sec ()
write(*,sfmt) 'extended :', s2, transfer(s2,1), cpu0
case (3)
cpu0 = delta_sec ()
s3 = r_sum(x)
cpu0 = delta_sec ()
write(*,sfmt) 'recursive:', s3, transfer(s3,1), cpu0
case (4)
cpu0 = delta_sec ()
s4 = p_sum(x)
cpu0 = delta_sec ()
write(*,sfmt) 'p_sum :', s4, transfer(s4,1), cpu0
case (5)
cpu0 = delta_sec ()
s5 = j_sum(x)
cpu0 = delta_sec ()
write(*,sfmt) 'j_sum :', s5, transfer(s5,1), cpu0
case (6)
cpu0 = delta_sec ()
s6 = je_sum(x)
cpu0 = delta_sec ()
write(*,sfmt) 'je_sum :', s6, transfer(s6,1), cpu0
end select
end do
end do
contains
pure function e_sum(x)
! extended precision summation.
implicit none
real :: e_sum
real, intent(in) :: x(:)
integer :: i
real(dp) :: temp
temp = 0.0_dp
do i = 1, size(x)
temp = temp + real( x(i), dp )
enddo
e_sum = (temp)
return
end function e_sum
function r_sum(x)
! recursive summation with manual stack.
implicit none
real :: r_sum
real, intent(in) :: x(:)
integer :: i, j, p
real :: psum( bit_size(1) - leadz(size(x)) ) ! max reccursion depth.
p = 0
do i = 1, size(x)
p = p + 1
psum(p) = x(i)
do j = 1, trailz(i)
psum(p-1) = psum(p-1) + psum(p)
p = p - 1
enddo
enddo
do while (p > 1) ! cleanup loop. accumulate in reverse order.
psum(p-1) = psum(p-1) + psum(p)
p = p - 1
enddo
r_sum = psum(1)
return
end function r_sum
pure recursive function p_sum(x) result(r)
! recursive summation with function calls.
implicit none
real :: r
real, intent(in) :: x(:)
integer :: n
n = size(x)
select case (n)
case (:0)
r = 0.0
case (1)
r = x(1)
case (2)
r = x(1) + x(2)
case (3:)
r = p_sum( x(1:n/2) ) + p_sum( x(n/2+1:n) )
end select
return
end function p_sum
pure recursive function j_sum(x) result(r)
! recursive summation with function calls.
implicit none
real :: r
real, intent(in) :: x(:)
integer :: n
integer, parameter :: min_n = 65 ! 64 ! 16
n = size(x)
if ( n < min_n ) then
r = sum (x)
else
r = j_sum ( x(1:n/2) ) + j_sum ( x(n/2+1:n) )
end if
return
end function j_sum
pure recursive function je_sum(x) result(r)
! recursive summation with extended precision function calls.
implicit none
real :: r
real, intent(in) :: x(:)
integer :: n
integer, parameter :: min_n = 65 ! 128 ! 64 ! 16
n = size(x)
if ( n < min_n ) then
! r = sum (x)
r = e_sum (x)
else
r = je_sum ( x(1:n/2) ) + je_sum ( x(n/2+1:n) )
end if
return
end function je_sum
function delta_sec ()
! high precision timer.
implicit none
real(dp) :: delta_sec
integer*8 :: tick, last=-1, rate=-1
if ( rate < 0 ) call system_clock ( last, rate )
call system_clock (tick)
delta_sec = dble (tick-last) / dble (rate)
last = tick
end function delta_sec
end program sums
set basic=-v -fimplicit-none -fallow-argument-mismatch -O2 -march=native -ffast-math
set omp=-fopenmp -fstack-arrays
set basic=-v -fimplicit-none -fallow-argument-mismatch -O -march=native
set basic=-v -fimplicit-none -fallow-argument-mismatch -O2 -march=native -ffast-math
set basic=-v -fimplicit-none -fallow-argument-mismatch -O1
gfortran %1.f90 %basic% -o %1.exe >> %1.tce 2>&1
%1 >> %1.tce
type %1.tce
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O' '-march=native' '-o' 'ron_sum2.exe' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061096 seconds
extended : 7.4997416E+07 4C8F0BD5 0.061107 seconds
recursive: 7.4997416E+07 4C8F0BD5 0.120826 seconds
p_sum : 7.4997416E+07 4C8F0BD5 0.327250 seconds
j_sum : 7.4997416E+07 4C8F0BD5 0.050302 seconds
je_sum : 7.4997416E+07 4C8F0BD5 0.052829 seconds
Random vector of type 2
intrinsic: -1.4523291E+05 C80DD43A 0.061420 seconds
extended : -1.4519922E+05 C80DCBCE 0.061530 seconds
recursive: -1.4519938E+05 C80DCBD8 0.118981 seconds
p_sum : -1.4519919E+05 C80DCBCC 0.326990 seconds
j_sum : -1.4519934E+05 C80DCBD6 0.050335 seconds
je_sum : -1.4519922E+05 C80DCBCE 0.050940 seconds
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O' '-march=native' '-ffast-math' '-o' 'ron_sum2.exe' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061198 seconds
extended : 7.4998288E+07 4C8F0C42 0.061058 seconds
recursive: 7.4998288E+07 4C8F0C42 0.118410 seconds
p_sum : 7.4998288E+07 4C8F0C42 0.327628 seconds
j_sum : 7.4998288E+07 4C8F0C42 0.050230 seconds
je_sum : 7.4998288E+07 4C8F0C42 0.051019 seconds
Random vector of type 2
intrinsic: 9.2840462E+05 4962A94A 0.061880 seconds
extended : 9.2844181E+05 4962AB9D 0.061782 seconds
recursive: 9.2844188E+05 4962AB9E 0.119847 seconds
p_sum : 9.2844188E+05 4962AB9E 0.327099 seconds
j_sum : 9.2844188E+05 4962AB9E 0.050386 seconds
je_sum : 9.2844188E+05 4962AB9E 0.050888 seconds
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O2' '-march=native' '-ffast-math' '-o' 'ron_sum2.exe' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061357 seconds
extended : 7.4998368E+07 4C8F0C4C 0.061024 seconds
recursive: 7.4998368E+07 4C8F0C4C 0.092400 seconds
p_sum : 7.4998368E+07 4C8F0C4C 0.232685 seconds
j_sum : 7.4998368E+07 4C8F0C4C 0.042841 seconds
je_sum : 7.4998368E+07 4C8F0C4C 0.042998 seconds
Random vector of type 2
intrinsic: -2.9935404E+04 C6E9DECF 0.063704 seconds
extended : -2.9868031E+04 C6E95810 0.063095 seconds
recursive: -2.9868047E+04 C6E95818 0.092844 seconds
p_sum : -2.9868062E+04 C6E95820 0.233533 seconds
j_sum : -2.9868000E+04 C6E95800 0.042905 seconds
je_sum : -2.9868203E+04 C6E95868 0.043013 seconds
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O1' '-o' 'ron_sum2.exe' '-mtune=generic' '-march=x86-64' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061273 seconds
extended : 7.5000264E+07 4C8F0D39 0.061080 seconds
recursive: 7.5000264E+07 4C8F0D39 0.125566 seconds
p_sum : 7.5000264E+07 4C8F0D39 0.327155 seconds
j_sum : 7.5000264E+07 4C8F0D39 0.049916 seconds
je_sum : 7.5000264E+07 4C8F0D39 0.051608 seconds
Random vector of type 2
intrinsic: 9.8548633E+04 47C07A51 0.061245 seconds
extended : 9.8605258E+04 47C096A1 0.061045 seconds
recursive: 9.8605281E+04 47C096A4 0.125928 seconds
p_sum : 9.8605109E+04 47C0968E 0.328060 seconds
j_sum : 9.8605062E+04 47C09688 0.049556 seconds
je_sum : 9.8605203E+04 47C0969A 0.051334 seconds
{{ Revised order }}
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O1' '-o' 'ron_sum2.exe' '-mtune=generic' '-march=x86-64' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061068 seconds
j_sum : 7.4997688E+07 4C8F0BF7 0.049448 seconds
je_sum : 7.4997688E+07 4C8F0BF7 0.051313 seconds
extended : 7.4997688E+07 4C8F0BF7 0.061019 seconds
recursive: 7.4997688E+07 4C8F0BF7 0.126306 seconds
p_sum : 7.4997688E+07 4C8F0BF7 0.330779 seconds
Random vector of type 2
intrinsic: -6.9337450E+05 C92947E8 0.061718 seconds
j_sum : -6.9337188E+05 C92947BE 0.049567 seconds
je_sum : -6.9337169E+05 C92947BB 0.051128 seconds
extended : -6.9337188E+05 C92947BE 0.061709 seconds
recursive: -6.9337188E+05 C92947BE 0.127979 seconds
p_sum : -6.9337188E+05 C92947BE 0.325464 seconds
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O1' '-o' 'ron_sum2.exe' '-mtune=generic' '-march=x86-64' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061153 seconds
j_sum : 7.4999704E+07 4C8F0CF3 0.049519 seconds
je_sum : 7.4999704E+07 4C8F0CF3 0.050971 seconds
extended : 7.4999704E+07 4C8F0CF3 0.061078 seconds
recursive: 7.4999704E+07 4C8F0CF3 0.126844 seconds
p_sum : 7.4999704E+07 4C8F0CF3 0.331213 seconds
Random vector of type 2
intrinsic: -3.0828716E+05 C89687E5 0.062152 seconds
j_sum : -3.0824225E+05 C8968248 0.049818 seconds
je_sum : -3.0824238E+05 C896824C 0.051187 seconds
extended : -3.0824219E+05 C8968246 0.061058 seconds
recursive: -3.0824234E+05 C896824B 0.126645 seconds
p_sum : -3.0824225E+05 C8968248 0.329529 seconds
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O1' '-o' 'ron_sum2.exe' '-mtune=generic' '-march=x86-64' '-dumpdir' 'ron_sum2.'
Vector Size n = 100000000
Random vector of type 1
intrinsic: 1.6777216E+07 4B800000 0.061181 seconds
j_sum : 7.4999504E+07 4C8F0CDA 0.049612 seconds
je_sum : 7.4999504E+07 4C8F0CDA 0.051121 seconds
extended : 7.4999504E+07 4C8F0CDA 0.061106 seconds
recursive: 7.4999504E+07 4C8F0CDA 0.124811 seconds
p_sum : 7.4999504E+07 4C8F0CDA 0.329647 seconds
Random vector of type 2
intrinsic: -6.9732494E+05 C92A3ECF 0.061310 seconds
j_sum : -6.9740488E+05 C92A43CE 0.049676 seconds
je_sum : -6.9740469E+05 C92A43CB 0.051098 seconds
extended : -6.9740481E+05 C92A43CD 0.061636 seconds
recursive: -6.9740494E+05 C92A43CF 0.125510 seconds
p_sum : -6.9740488E+05 C92A43CE 0.327516 seconds
Random vector of type 3
intrinsic: 4.0709641E+16 5B10A134 0.062140 seconds
j_sum : 4.3465584E+16 5B1A6BB8 0.049463 seconds
je_sum : 4.3465584E+16 5B1A6BB8 0.050820 seconds
extended : 4.3465589E+16 5B1A6BB9 0.061240 seconds
recursive: 4.3465589E+16 5B1A6BB9 0.125593 seconds
p_sum : 4.3465584E+16 5B1A6BB8 0.329287 seconds