Compiler optimization seems to have a huge impact on both performance and accuracy for the same algorithm
I have been playing with the following code to test various methods of computing the sum of a 1D array:
module sums
use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64, i32=>int32, i64=>int64
implicit none
private
interface wrap_sum
module procedure wrap_sum_sp
end interface wrap_sum
interface hpa_sum
module procedure hpa_sum_sp
end interface hpa_sum
interface rp_sum
module procedure rp_sum_sp
module procedure rp_sum_chunks_sp
end interface rp_sum
interface p_sum
module procedure p_sum_sp
module procedure p_sum_chunks_sp
end interface p_sum
integer, parameter :: default_chunk = 2
public :: wrap_sum, hpa_sum, rp_sum, p_sum
contains
pure function wrap_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y
y = sum(x)
end function wrap_sum_sp
pure function hpa_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y
real(dp) :: temp
temp = sum(real(x, dp))
y = temp
end function hpa_sum_sp
pure recursive function rp_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y
integer :: n
n = size(x)
if (n.gt.default_chunk) then
y = rp_sum(x(1:n/2)) + rp_sum(x(n/2+1:n))
else
y = sum(x)
end if
end function rp_sum_sp
pure recursive function rp_sum_chunks_sp(x, chunk_size) result(y)
real(sp), intent(in) :: x(:)
integer, intent(in) :: chunk_size
real(sp) :: y
integer :: n
n = size(x)
if (n.gt.chunk_size) then
y = rp_sum(x(1:n/2), chunk_size) + rp_sum(x(n/2+1:n), chunk_size)
else
y = sum(x)
end if
end function rp_sum_chunks_sp
pure function p_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y, work(int((int(size(x), i64)+1_i64)/int(default_chunk, i64), i32))
integer :: n, i, j, jmax, levels
n = size(x)
if (n.gt.default_chunk) then
do i=1,size(work)
work(i) = sum(x((i1)*default_chunk+1:min(i*default_chunk,n)))
end do
jmax = size(work)
levels = ceiling(log(real(size(work)))/log(2.0))
do i=1,levels
do j=1,jmax
work(j) = sum(work((j1)*2+1:min(j*2,jmax)))
end do
jmax = ceiling(real(jmax)/2.0)
end do
y = work(1)
else
y = sum(x)
end if
end function p_sum_sp
pure function p_sum_chunks_sp(x, chunk_size) result(y)
real(sp), intent(in) :: x(:)
integer, intent(in) :: chunk_size
real(sp) :: y, work(int((int(size(x), i64)+1_i64)/int(chunk_size, i64), i32))
integer :: n, i, j, jmax, levels
n = size(x)
if (n.gt.chunk_size) then
do i=1,size(work)
work(i) = sum(x((i1)*chunk_size+1:min(i*chunk_size,n)))
end do
jmax = size(work)
levels = ceiling(log(real(size(work)))/log(2.0))
do i=1,levels
do j=1,jmax
work(j) = sum(work((j1)*2+1:min(j*2,jmax)))
end do
jmax = ceiling(real(jmax)/2.0)
end do
y = work(1)
else
y = sum(x)
end if
end function p_sum_chunks_sp
end module sums
program main
use, intrinsic :: iso_fortran_env, sp=>real32, dp=>real64, qp=>real128, i64=>int64
use, non_intrinsic :: sums
implicit none
integer, parameter :: n = 123456, trials = 10000, jmin = 1, jmax = 19, jinc = 1
real(sp) :: xsp(n), ysp, yref_sp, elapsed
integer(i64) :: c1, c2, cr
integer :: i, j, chunk_size
!! ECHO COMPILER VERSION AND OPTIONS
write(*,'(a,a)') 'COMPILER: ',compiler_version()
write(*,'(a,a)') 'OPTIONS: ',compiler_options()
write(*,'(a)') ''
!! ECHO EXACT SOLUTION AND SET UP XSP ARRAY
xsp = 0.1_sp**6
write(*,'(2(a,i0),a,e13.6)') 'n: ',n,', trials: ',trials,', xsp(1:n)=',xsp(n)
yref_sp = n*(0.1_sp**6)
write(*,'(a50,e13.6)') '32Bit EXACT SUM: ',yref_sp
!! 32BIT INTRINSIC SUM
call system_clock(c1, cr)
do i=1,trials
ysp = sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32Bit INTRINSIC  result,time: ',ysp,elapsed/real(trials)
!! 32BIT WRAPPED (PURE FUNCTION) INTRINSIC SUM
call system_clock(c1, cr)
do i=1,trials
ysp = wrap_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32Bit WRAPPEDINTRINSIC  result,time: ',ysp,elapsed/real(trials)
!! 32BIT HIGHPRECISION (REAL64) ACCUMULATOR
call system_clock(c1, cr)
do i=1,trials
ysp = hpa_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32Bit HIGHPRECISION ACCUMULATOR  result,time: ',ysp,elapsed/real(trials)
!! 32BIT RECURSIVEPAIRWISE (DEFAULT CHUNK = 2)
call system_clock(c1, cr)
do i=1,trials
ysp = rp_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32Bit RECURSIVEPAIRWISE  result,time: ',ysp,elapsed/real(trials)
!! 32BIT PAIRWISE (DEFAULT CHUNK = 2)
call system_clock(c1, cr)
do i=1,trials
ysp = p_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32Bit PAIRWISE  result,time: ',ysp,elapsed/real(trials)
do j=jmin,jmax,jinc
chunk_size = 2**j
!! 32BIT RECURSIVEPAIRWISE WITH USER INPUT CHUNK
call system_clock(c1, cr)
do i=1,trials
ysp = rp_sum(xsp, chunk_size)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6),i8)') '32Bit RECURSIVEPAIRWISE  result,time: ',ysp,elapsed/real(trials),chunk_size
!! 32BIT PAIRWISE WITH USER INPUT CHUNK
call system_clock(c1, cr)
do i=1,trials
ysp = p_sum(xsp, chunk_size)
end do
call system_clock(c2)
elapsed = real(max(c2  c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6),i8)') '32Bit PAIRWISE  result,time: ',ysp,elapsed/real(trials),chunk_size
end do
write(*,'(a)') ''
end program main
The various algorithms being tested are:

INTRINSIC
 the language intrinsic function sum
seems to often be implemented as a simple loop. Performance is very good, but accuracy suffers as one would expect.

wrap_sum
 this is a pure function
that calls the intrinsic sum
, and really only serves to measure the overhead of replacing the inline, intrinsic sum
with a function call.

hpa_sum
 the “HighPrecision Accumulator” summing function uses sum(real(INPUT_ARRAY, real64))
internally. This is the benchmark for accuracy, as well as performance. At least on my (relatively modern) hardware, the conversion from real32
to real64
seems to be nearly free.

rp_sum
 the “RecursivePairwise” summing function is as the name implies, a recursive implementation of pairwise summation. It is also available with a usersupplied CHUNK_SIZE
that will determine when to start accumulating (rather than the default 2 elements).

p_sum
 this is an iterative implementation of the above algorithm, and is also available with the userselected CHUNK_SIZE
. Performance of the iterative vs recursive version seems highly dependent on compiler + options.
Various compiler options tested are:

gfortran O3
 This seems to be a common optimization level people post here. O3
should remain “safe” while doing all of the loop things the compiler can. On my machine, p_sum
with chunk_size=32
seems to yield the best performance while maintaining the correct answer, matching hpa_sum
.
Performance: gfortran O3
COMPILER: GCC version 13.1.1 20230429
OPTIONS: mtune=generic march=x8664 O3 fpreinclude=/usr/include/finclude/mathvectorfortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E05
32Bit EXACT SUM: 0.123456E+00
32Bit INTRINSIC  result,time: 0.123311E+00 0.779044E04
32Bit WRAPPEDINTRINSIC  result,time: 0.123311E+00 0.775578E04
32Bit HIGHPRECISION ACCUMULATOR  result,time: 0.123456E+00 0.781084E04
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.325740E03
32Bit PAIRWISE  result,time: 0.123456E+00 0.203253E03
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.351616E03 2
32Bit PAIRWISE  result,time: 0.123456E+00 0.214255E03 2
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.192748E03 4
32Bit PAIRWISE  result,time: 0.123456E+00 0.112471E03 4
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.106952E03 8
32Bit PAIRWISE  result,time: 0.123456E+00 0.606991E04 8
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.588933E04 16
32Bit PAIRWISE  result,time: 0.123456E+00 0.388042E04 16
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.449248E04 32
32Bit PAIRWISE  result,time: 0.123456E+00 0.329614E04 32
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.422338E04 64
32Bit PAIRWISE  result,time: 0.123456E+00 0.340916E04 64
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.471615E04 128
32Bit PAIRWISE  result,time: 0.123392E+00 0.438817E04 128
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.604450E04 256
32Bit PAIRWISE  result,time: 0.123392E+00 0.609737E04 256
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.688568E04 512
32Bit PAIRWISE  result,time: 0.123392E+00 0.691470E04 512
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.731949E04 1024
32Bit PAIRWISE  result,time: 0.122881E+00 0.729839E04 1024
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.754217E04 2048
32Bit PAIRWISE  result,time: 0.122881E+00 0.754260E04 2048
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.764888E04 4096
32Bit PAIRWISE  result,time: 0.122880E+00 0.764913E04 4096
32Bit RECURSIVEPAIRWISE  result,time: 0.123443E+00 0.772849E04 8192
32Bit PAIRWISE  result,time: 0.122869E+00 0.766319E04 8192
32Bit RECURSIVEPAIRWISE  result,time: 0.123464E+00 0.771745E04 16384
32Bit PAIRWISE  result,time: 0.114696E+00 0.720122E04 16384
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.775248E04 32768
32Bit PAIRWISE  result,time: 0.983109E01 0.619083E04 32768
32Bit RECURSIVEPAIRWISE  result,time: 0.123367E+00 0.778058E04 65536
32Bit PAIRWISE  result,time: 0.654852E01 0.416639E04 65536
32Bit RECURSIVEPAIRWISE  result,time: 0.123311E+00 0.782052E04 131072
32Bit PAIRWISE  result,time: 0.123311E+00 0.781921E04 131072
32Bit RECURSIVEPAIRWISE  result,time: 0.123311E+00 0.781681E04 262144
32Bit PAIRWISE  result,time: 0.123311E+00 0.782069E04 262144
32Bit RECURSIVEPAIRWISE  result,time: 0.123311E+00 0.781451E04 524288
32Bit PAIRWISE  result,time: 0.123311E+00 0.782358E04 524288

gfortran O3 flto
 Allowing the compiler to further optimize at linktime via flto
changes things up a bit. Now, the recursive rp_sum
with chunk_size=32
measures fastest on my machine. You can actually see that the recursive rp_sum
is significantly faster than iterative p_sum
as you decrease chunk_size
down to the default 2 (where the recursive method is twice as fast).
Performance: gfortran O3 flto
COMPILER: GCC version 13.1.1 20230429
OPTIONS: mtune=generic march=x8664 O3 flto fpreinclude=/usr/include/finclude/mathvectorfortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E05
32Bit EXACT SUM: 0.123456E+00
32Bit INTRINSIC  result,time: 0.123311E+00 0.781882E04
32Bit WRAPPEDINTRINSIC  result,time: 0.123311E+00 0.782007E04
32Bit HIGHPRECISION ACCUMULATOR  result,time: 0.123456E+00 0.799755E04
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.102222E03
32Bit PAIRWISE  result,time: 0.123456E+00 0.228471E03
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.149437E03 2
32Bit PAIRWISE  result,time: 0.123456E+00 0.202666E03 2
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.776630E04 4
32Bit PAIRWISE  result,time: 0.123456E+00 0.113211E03 4
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.454511E04 8
32Bit PAIRWISE  result,time: 0.123456E+00 0.609175E04 8
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.318196E04 16
32Bit PAIRWISE  result,time: 0.123456E+00 0.395074E04 16
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.300746E04 32
32Bit PAIRWISE  result,time: 0.123456E+00 0.331130E04 32
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.326088E04 64
32Bit PAIRWISE  result,time: 0.123456E+00 0.373874E04 64
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.429449E04 128
32Bit PAIRWISE  result,time: 0.123392E+00 0.466186E04 128
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.597046E04 256
32Bit PAIRWISE  result,time: 0.123392E+00 0.618534E04 256
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.694541E04 512
32Bit PAIRWISE  result,time: 0.123392E+00 0.717122E04 512
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.758849E04 1024
32Bit PAIRWISE  result,time: 0.122881E+00 0.760289E04 1024
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.781288E04 2048
32Bit PAIRWISE  result,time: 0.122881E+00 0.781563E04 2048
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.791697E04 4096
32Bit PAIRWISE  result,time: 0.122880E+00 0.790914E04 4096
32Bit RECURSIVEPAIRWISE  result,time: 0.123443E+00 0.798613E04 8192
32Bit PAIRWISE  result,time: 0.122869E+00 0.798569E04 8192
32Bit RECURSIVEPAIRWISE  result,time: 0.123464E+00 0.803855E04 16384
32Bit PAIRWISE  result,time: 0.114696E+00 0.746117E04 16384
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.802226E04 32768
32Bit PAIRWISE  result,time: 0.983109E01 0.639774E04 32768
32Bit RECURSIVEPAIRWISE  result,time: 0.123367E+00 0.803992E04 65536
32Bit PAIRWISE  result,time: 0.654852E01 0.427503E04 65536
32Bit RECURSIVEPAIRWISE  result,time: 0.123311E+00 0.806152E04 131072
32Bit PAIRWISE  result,time: 0.123311E+00 0.805235E04 131072
32Bit RECURSIVEPAIRWISE  result,time: 0.123311E+00 0.805765E04 262144
32Bit PAIRWISE  result,time: 0.123311E+00 0.804456E04 262144
32Bit RECURSIVEPAIRWISE  result,time: 0.123311E+00 0.806504E04 524288
32Bit PAIRWISE  result,time: 0.123311E+00 0.804857E04 524288

gfortran Ofast
 Compiling with Ofast
would be nice, as it should allow further optimization at the expense of accuracy. Interestingly, we see a massive performance regression with Ofast
for the simple hpa_sum
. This is the first time I started seeing the expected behavior of “larger chunk_size
→ better performance but lower accuracy.” In the end, p_sum
with userselected chunk_size=64
is the fastest while maintaining accuracy (for this specific test array) using these flags.
Performance: gfortran Ofast
COMPILER: GCC version 13.1.1 20230429
OPTIONS: mtune=generic march=x8664 Ofast fpreinclude=/usr/include/finclude/mathvectorfortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E05
32Bit EXACT SUM: 0.123456E+00
32Bit INTRINSIC  result,time: 0.123475E+00 0.201231E04
32Bit WRAPPEDINTRINSIC  result,time: 0.123475E+00 0.201817E04
32Bit HIGHPRECISION ACCUMULATOR  result,time: 0.123456E+00 0.185975E03
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.332306E03
32Bit PAIRWISE  result,time: 0.123456E+00 0.207084E03
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.363203E03 2
32Bit PAIRWISE  result,time: 0.123456E+00 0.207210E03 2
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.199363E03 4
32Bit PAIRWISE  result,time: 0.123456E+00 0.109001E03 4
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.109159E03 8
32Bit PAIRWISE  result,time: 0.123456E+00 0.574292E04 8
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.558018E04 16
32Bit PAIRWISE  result,time: 0.123456E+00 0.326604E04 16
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.322609E04 32
32Bit PAIRWISE  result,time: 0.123456E+00 0.197867E04 32
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.201350E04 64
32Bit PAIRWISE  result,time: 0.123456E+00 0.146219E04 64
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.163618E04 128
32Bit PAIRWISE  result,time: 0.123392E+00 0.135129E04 128
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.160555E04 256
32Bit PAIRWISE  result,time: 0.123392E+00 0.147956E04 256
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.164985E04 512
32Bit PAIRWISE  result,time: 0.123392E+00 0.161504E04 512
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.189415E04 1024
32Bit PAIRWISE  result,time: 0.122880E+00 0.182092E04 1024
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.195153E04 2048
32Bit PAIRWISE  result,time: 0.122880E+00 0.192573E04 2048
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.200322E04 4096
32Bit PAIRWISE  result,time: 0.122881E+00 0.197797E04 4096
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.203376E04 8192
32Bit PAIRWISE  result,time: 0.122881E+00 0.200474E04 8192
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.202800E04 16384
32Bit PAIRWISE  result,time: 0.114688E+00 0.188345E04 16384
32Bit RECURSIVEPAIRWISE  result,time: 0.123443E+00 0.203066E04 32768
32Bit PAIRWISE  result,time: 0.982949E01 0.160496E04 32768
32Bit RECURSIVEPAIRWISE  result,time: 0.123464E+00 0.202097E04 65536
32Bit PAIRWISE  result,time: 0.655408E01 0.106927E04 65536
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.200050E04 131072
32Bit PAIRWISE  result,time: 0.123475E+00 0.200048E04 131072
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.200048E04 262144
32Bit PAIRWISE  result,time: 0.123475E+00 0.200079E04 262144
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.200056E04 524288
32Bit PAIRWISE  result,time: 0.123475E+00 0.200073E04 524288

gfortran Ofast flto
 The trends from above continue. With flto
, now the recursive rp_sum
is measuring faster, and accuracy is maintained to user selected chunk_size=128
.
Performance: gfortran Ofast flto
COMPILER: GCC version 13.1.1 20230429
OPTIONS: mtune=generic march=x8664 Ofast flto fpreinclude=/usr/include/finclude/mathvectorfortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E05
32Bit EXACT SUM: 0.123456E+00
32Bit INTRINSIC  result,time: 0.123475E+00 0.198425E04
32Bit WRAPPEDINTRINSIC  result,time: 0.123475E+00 0.199907E04
32Bit HIGHPRECISION ACCUMULATOR  result,time: 0.123456E+00 0.184698E03
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.110326E03
32Bit PAIRWISE  result,time: 0.123456E+00 0.179957E03
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.158522E03 2
32Bit PAIRWISE  result,time: 0.123456E+00 0.221054E03 2
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.837176E04 4
32Bit PAIRWISE  result,time: 0.123456E+00 0.125978E03 4
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.467039E04 8
32Bit PAIRWISE  result,time: 0.123456E+00 0.662764E04 8
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.273335E04 16
32Bit PAIRWISE  result,time: 0.123456E+00 0.363717E04 16
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.177611E04 32
32Bit PAIRWISE  result,time: 0.123456E+00 0.220076E04 32
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.132439E04 64
32Bit PAIRWISE  result,time: 0.123456E+00 0.159370E04 64
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.128255E04 128
32Bit PAIRWISE  result,time: 0.123392E+00 0.139189E04 128
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.142094E04 256
32Bit PAIRWISE  result,time: 0.123392E+00 0.146052E04 256
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.150170E04 512
32Bit PAIRWISE  result,time: 0.123392E+00 0.158891E04 512
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.177992E04 1024
32Bit PAIRWISE  result,time: 0.122880E+00 0.179189E04 1024
32Bit RECURSIVEPAIRWISE  result,time: 0.123456E+00 0.189005E04 2048
32Bit PAIRWISE  result,time: 0.122880E+00 0.190103E04 2048
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.195353E04 4096
32Bit PAIRWISE  result,time: 0.122881E+00 0.195569E04 4096
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.198326E04 8192
32Bit PAIRWISE  result,time: 0.122881E+00 0.198258E04 8192
32Bit RECURSIVEPAIRWISE  result,time: 0.123457E+00 0.200243E04 16384
32Bit PAIRWISE  result,time: 0.114688E+00 0.186374E04 16384
32Bit RECURSIVEPAIRWISE  result,time: 0.123443E+00 0.200717E04 32768
32Bit PAIRWISE  result,time: 0.982949E01 0.160225E04 32768
32Bit RECURSIVEPAIRWISE  result,time: 0.123464E+00 0.200451E04 65536
32Bit PAIRWISE  result,time: 0.655408E01 0.106974E04 65536
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.200135E04 131072
32Bit PAIRWISE  result,time: 0.123475E+00 0.200113E04 131072
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.200121E04 262144
32Bit PAIRWISE  result,time: 0.123475E+00 0.200112E04 262144
32Bit RECURSIVEPAIRWISE  result,time: 0.123475E+00 0.200140E04 524288
32Bit PAIRWISE  result,time: 0.123475E+00 0.200149E04 524288