Compiler optimization seems to have a huge impact on both performance and accuracy for the same algorithm
I have been playing with the following code to test various methods of computing the sum of a 1D array:
module sums
use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64, i32=>int32, i64=>int64
implicit none
private
interface wrap_sum
module procedure wrap_sum_sp
end interface wrap_sum
interface hpa_sum
module procedure hpa_sum_sp
end interface hpa_sum
interface rp_sum
module procedure rp_sum_sp
module procedure rp_sum_chunks_sp
end interface rp_sum
interface p_sum
module procedure p_sum_sp
module procedure p_sum_chunks_sp
end interface p_sum
integer, parameter :: default_chunk = 2
public :: wrap_sum, hpa_sum, rp_sum, p_sum
contains
pure function wrap_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y
y = sum(x)
end function wrap_sum_sp
pure function hpa_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y
real(dp) :: temp
temp = sum(real(x, dp))
y = temp
end function hpa_sum_sp
pure recursive function rp_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y
integer :: n
n = size(x)
if (n.gt.default_chunk) then
y = rp_sum(x(1:n/2)) + rp_sum(x(n/2+1:n))
else
y = sum(x)
end if
end function rp_sum_sp
pure recursive function rp_sum_chunks_sp(x, chunk_size) result(y)
real(sp), intent(in) :: x(:)
integer, intent(in) :: chunk_size
real(sp) :: y
integer :: n
n = size(x)
if (n.gt.chunk_size) then
y = rp_sum(x(1:n/2), chunk_size) + rp_sum(x(n/2+1:n), chunk_size)
else
y = sum(x)
end if
end function rp_sum_chunks_sp
pure function p_sum_sp(x) result(y)
real(sp), intent(in) :: x(:)
real(sp) :: y, work(int((int(size(x), i64)+1_i64)/int(default_chunk, i64), i32))
integer :: n, i, j, jmax, levels
n = size(x)
if (n.gt.default_chunk) then
do i=1,size(work)
work(i) = sum(x((i-1)*default_chunk+1:min(i*default_chunk,n)))
end do
jmax = size(work)
levels = ceiling(log(real(size(work)))/log(2.0))
do i=1,levels
do j=1,jmax
work(j) = sum(work((j-1)*2+1:min(j*2,jmax)))
end do
jmax = ceiling(real(jmax)/2.0)
end do
y = work(1)
else
y = sum(x)
end if
end function p_sum_sp
pure function p_sum_chunks_sp(x, chunk_size) result(y)
real(sp), intent(in) :: x(:)
integer, intent(in) :: chunk_size
real(sp) :: y, work(int((int(size(x), i64)+1_i64)/int(chunk_size, i64), i32))
integer :: n, i, j, jmax, levels
n = size(x)
if (n.gt.chunk_size) then
do i=1,size(work)
work(i) = sum(x((i-1)*chunk_size+1:min(i*chunk_size,n)))
end do
jmax = size(work)
levels = ceiling(log(real(size(work)))/log(2.0))
do i=1,levels
do j=1,jmax
work(j) = sum(work((j-1)*2+1:min(j*2,jmax)))
end do
jmax = ceiling(real(jmax)/2.0)
end do
y = work(1)
else
y = sum(x)
end if
end function p_sum_chunks_sp
end module sums
program main
use, intrinsic :: iso_fortran_env, sp=>real32, dp=>real64, qp=>real128, i64=>int64
use, non_intrinsic :: sums
implicit none
integer, parameter :: n = 123456, trials = 10000, jmin = 1, jmax = 19, jinc = 1
real(sp) :: xsp(n), ysp, yref_sp, elapsed
integer(i64) :: c1, c2, cr
integer :: i, j, chunk_size
!! ECHO COMPILER VERSION AND OPTIONS
write(*,'(a,a)') 'COMPILER: ',compiler_version()
write(*,'(a,a)') 'OPTIONS: ',compiler_options()
write(*,'(a)') ''
!! ECHO EXACT SOLUTION AND SET UP XSP ARRAY
xsp = 0.1_sp**6
write(*,'(2(a,i0),a,e13.6)') 'n: ',n,', trials: ',trials,', xsp(1:n)=',xsp(n)
yref_sp = n*(0.1_sp**6)
write(*,'(a50,e13.6)') '32-Bit EXACT SUM: ',yref_sp
!! 32-BIT INTRINSIC SUM
call system_clock(c1, cr)
do i=1,trials
ysp = sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32-Bit INTRINSIC -- result,time: ',ysp,elapsed/real(trials)
!! 32-BIT WRAPPED (PURE FUNCTION) INTRINSIC SUM
call system_clock(c1, cr)
do i=1,trials
ysp = wrap_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32-Bit WRAPPED-INTRINSIC -- result,time: ',ysp,elapsed/real(trials)
!! 32-BIT HIGH-PRECISION (REAL64) ACCUMULATOR
call system_clock(c1, cr)
do i=1,trials
ysp = hpa_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32-Bit HIGH-PRECISION ACCUMULATOR -- result,time: ',ysp,elapsed/real(trials)
!! 32-BIT RECURSIVE-PAIRWISE (DEFAULT CHUNK = 2)
call system_clock(c1, cr)
do i=1,trials
ysp = rp_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32-Bit RECURSIVE-PAIRWISE -- result,time: ',ysp,elapsed/real(trials)
!! 32-BIT PAIRWISE (DEFAULT CHUNK = 2)
call system_clock(c1, cr)
do i=1,trials
ysp = p_sum(xsp)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6))') '32-Bit PAIRWISE -- result,time: ',ysp,elapsed/real(trials)
do j=jmin,jmax,jinc
chunk_size = 2**j
!! 32-BIT RECURSIVE-PAIRWISE WITH USER INPUT CHUNK
call system_clock(c1, cr)
do i=1,trials
ysp = rp_sum(xsp, chunk_size)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6),i8)') '32-Bit RECURSIVE-PAIRWISE -- result,time: ',ysp,elapsed/real(trials),chunk_size
!! 32-BIT PAIRWISE WITH USER INPUT CHUNK
call system_clock(c1, cr)
do i=1,trials
ysp = p_sum(xsp, chunk_size)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_i64))/real(cr)
write(*,'(a50,2(e13.6),i8)') '32-Bit PAIRWISE -- result,time: ',ysp,elapsed/real(trials),chunk_size
end do
write(*,'(a)') ''
end program main
The various algorithms being tested are:
-
INTRINSIC
- the language intrinsic function sum
seems to often be implemented as a simple loop. Performance is very good, but accuracy suffers as one would expect.
-
wrap_sum
- this is a pure function
that calls the intrinsic sum
, and really only serves to measure the overhead of replacing the inline, intrinsic sum
with a function call.
-
hpa_sum
- the “High-Precision Accumulator” summing function uses sum(real(INPUT_ARRAY, real64))
internally. This is the benchmark for accuracy, as well as performance. At least on my (relatively modern) hardware, the conversion from real32
to real64
seems to be nearly free.
-
rp_sum
- the “Recursive-Pairwise” summing function is as the name implies, a recursive implementation of pairwise summation. It is also available with a user-supplied CHUNK_SIZE
that will determine when to start accumulating (rather than the default 2 elements).
-
p_sum
- this is an iterative implementation of the above algorithm, and is also available with the user-selected CHUNK_SIZE
. Performance of the iterative vs recursive version seems highly dependent on compiler + options.
Various compiler options tested are:
-
gfortran -O3
- This seems to be a common optimization level people post here. -O3
should remain “safe” while doing all of the loop things the compiler can. On my machine, p_sum
with chunk_size=32
seems to yield the best performance while maintaining the correct answer, matching hpa_sum
.
Performance: gfortran -O3
COMPILER: GCC version 13.1.1 20230429
OPTIONS: -mtune=generic -march=x86-64 -O3 -fpre-include=/usr/include/finclude/math-vector-fortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E-05
32-Bit EXACT SUM: 0.123456E+00
32-Bit INTRINSIC -- result,time: 0.123311E+00 0.779044E-04
32-Bit WRAPPED-INTRINSIC -- result,time: 0.123311E+00 0.775578E-04
32-Bit HIGH-PRECISION ACCUMULATOR -- result,time: 0.123456E+00 0.781084E-04
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.325740E-03
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.203253E-03
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.351616E-03 2
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.214255E-03 2
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.192748E-03 4
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.112471E-03 4
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.106952E-03 8
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.606991E-04 8
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.588933E-04 16
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.388042E-04 16
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.449248E-04 32
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.329614E-04 32
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.422338E-04 64
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.340916E-04 64
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.471615E-04 128
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.438817E-04 128
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.604450E-04 256
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.609737E-04 256
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.688568E-04 512
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.691470E-04 512
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.731949E-04 1024
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.729839E-04 1024
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.754217E-04 2048
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.754260E-04 2048
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.764888E-04 4096
32-Bit PAIRWISE -- result,time: 0.122880E+00 0.764913E-04 4096
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123443E+00 0.772849E-04 8192
32-Bit PAIRWISE -- result,time: 0.122869E+00 0.766319E-04 8192
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123464E+00 0.771745E-04 16384
32-Bit PAIRWISE -- result,time: 0.114696E+00 0.720122E-04 16384
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.775248E-04 32768
32-Bit PAIRWISE -- result,time: 0.983109E-01 0.619083E-04 32768
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123367E+00 0.778058E-04 65536
32-Bit PAIRWISE -- result,time: 0.654852E-01 0.416639E-04 65536
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123311E+00 0.782052E-04 131072
32-Bit PAIRWISE -- result,time: 0.123311E+00 0.781921E-04 131072
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123311E+00 0.781681E-04 262144
32-Bit PAIRWISE -- result,time: 0.123311E+00 0.782069E-04 262144
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123311E+00 0.781451E-04 524288
32-Bit PAIRWISE -- result,time: 0.123311E+00 0.782358E-04 524288
-
gfortran -O3 -flto
- Allowing the compiler to further optimize at link-time via -flto
changes things up a bit. Now, the recursive rp_sum
with chunk_size=32
measures fastest on my machine. You can actually see that the recursive rp_sum
is significantly faster than iterative p_sum
as you decrease chunk_size
down to the default 2 (where the recursive method is twice as fast).
Performance: gfortran -O3 -flto
COMPILER: GCC version 13.1.1 20230429
OPTIONS: -mtune=generic -march=x86-64 -O3 -flto -fpre-include=/usr/include/finclude/math-vector-fortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E-05
32-Bit EXACT SUM: 0.123456E+00
32-Bit INTRINSIC -- result,time: 0.123311E+00 0.781882E-04
32-Bit WRAPPED-INTRINSIC -- result,time: 0.123311E+00 0.782007E-04
32-Bit HIGH-PRECISION ACCUMULATOR -- result,time: 0.123456E+00 0.799755E-04
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.102222E-03
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.228471E-03
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.149437E-03 2
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.202666E-03 2
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.776630E-04 4
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.113211E-03 4
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.454511E-04 8
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.609175E-04 8
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.318196E-04 16
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.395074E-04 16
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.300746E-04 32
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.331130E-04 32
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.326088E-04 64
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.373874E-04 64
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.429449E-04 128
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.466186E-04 128
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.597046E-04 256
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.618534E-04 256
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.694541E-04 512
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.717122E-04 512
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.758849E-04 1024
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.760289E-04 1024
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.781288E-04 2048
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.781563E-04 2048
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.791697E-04 4096
32-Bit PAIRWISE -- result,time: 0.122880E+00 0.790914E-04 4096
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123443E+00 0.798613E-04 8192
32-Bit PAIRWISE -- result,time: 0.122869E+00 0.798569E-04 8192
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123464E+00 0.803855E-04 16384
32-Bit PAIRWISE -- result,time: 0.114696E+00 0.746117E-04 16384
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.802226E-04 32768
32-Bit PAIRWISE -- result,time: 0.983109E-01 0.639774E-04 32768
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123367E+00 0.803992E-04 65536
32-Bit PAIRWISE -- result,time: 0.654852E-01 0.427503E-04 65536
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123311E+00 0.806152E-04 131072
32-Bit PAIRWISE -- result,time: 0.123311E+00 0.805235E-04 131072
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123311E+00 0.805765E-04 262144
32-Bit PAIRWISE -- result,time: 0.123311E+00 0.804456E-04 262144
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123311E+00 0.806504E-04 524288
32-Bit PAIRWISE -- result,time: 0.123311E+00 0.804857E-04 524288
-
gfortran -Ofast
- Compiling with -Ofast
would be nice, as it should allow further optimization at the expense of accuracy. Interestingly, we see a massive performance regression with -Ofast
for the simple hpa_sum
. This is the first time I started seeing the expected behavior of “larger chunk_size
→ better performance but lower accuracy.” In the end, p_sum
with user-selected chunk_size=64
is the fastest while maintaining accuracy (for this specific test array) using these flags.
Performance: gfortran -Ofast
COMPILER: GCC version 13.1.1 20230429
OPTIONS: -mtune=generic -march=x86-64 -Ofast -fpre-include=/usr/include/finclude/math-vector-fortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E-05
32-Bit EXACT SUM: 0.123456E+00
32-Bit INTRINSIC -- result,time: 0.123475E+00 0.201231E-04
32-Bit WRAPPED-INTRINSIC -- result,time: 0.123475E+00 0.201817E-04
32-Bit HIGH-PRECISION ACCUMULATOR -- result,time: 0.123456E+00 0.185975E-03
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.332306E-03
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.207084E-03
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.363203E-03 2
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.207210E-03 2
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.199363E-03 4
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.109001E-03 4
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.109159E-03 8
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.574292E-04 8
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.558018E-04 16
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.326604E-04 16
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.322609E-04 32
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.197867E-04 32
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.201350E-04 64
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.146219E-04 64
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.163618E-04 128
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.135129E-04 128
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.160555E-04 256
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.147956E-04 256
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.164985E-04 512
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.161504E-04 512
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.189415E-04 1024
32-Bit PAIRWISE -- result,time: 0.122880E+00 0.182092E-04 1024
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.195153E-04 2048
32-Bit PAIRWISE -- result,time: 0.122880E+00 0.192573E-04 2048
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.200322E-04 4096
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.197797E-04 4096
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.203376E-04 8192
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.200474E-04 8192
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.202800E-04 16384
32-Bit PAIRWISE -- result,time: 0.114688E+00 0.188345E-04 16384
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123443E+00 0.203066E-04 32768
32-Bit PAIRWISE -- result,time: 0.982949E-01 0.160496E-04 32768
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123464E+00 0.202097E-04 65536
32-Bit PAIRWISE -- result,time: 0.655408E-01 0.106927E-04 65536
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.200050E-04 131072
32-Bit PAIRWISE -- result,time: 0.123475E+00 0.200048E-04 131072
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.200048E-04 262144
32-Bit PAIRWISE -- result,time: 0.123475E+00 0.200079E-04 262144
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.200056E-04 524288
32-Bit PAIRWISE -- result,time: 0.123475E+00 0.200073E-04 524288
-
gfortran -Ofast -flto
- The trends from above continue. With -flto
, now the recursive rp_sum
is measuring faster, and accuracy is maintained to user selected chunk_size=128
.
Performance: gfortran -Ofast -flto
COMPILER: GCC version 13.1.1 20230429
OPTIONS: -mtune=generic -march=x86-64 -Ofast -flto -fpre-include=/usr/include/finclude/math-vector-fortran.h
n: 123456, trials: 10000, xsp(1:n)= 0.100000E-05
32-Bit EXACT SUM: 0.123456E+00
32-Bit INTRINSIC -- result,time: 0.123475E+00 0.198425E-04
32-Bit WRAPPED-INTRINSIC -- result,time: 0.123475E+00 0.199907E-04
32-Bit HIGH-PRECISION ACCUMULATOR -- result,time: 0.123456E+00 0.184698E-03
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.110326E-03
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.179957E-03
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.158522E-03 2
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.221054E-03 2
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.837176E-04 4
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.125978E-03 4
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.467039E-04 8
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.662764E-04 8
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.273335E-04 16
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.363717E-04 16
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.177611E-04 32
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.220076E-04 32
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.132439E-04 64
32-Bit PAIRWISE -- result,time: 0.123456E+00 0.159370E-04 64
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.128255E-04 128
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.139189E-04 128
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.142094E-04 256
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.146052E-04 256
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.150170E-04 512
32-Bit PAIRWISE -- result,time: 0.123392E+00 0.158891E-04 512
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.177992E-04 1024
32-Bit PAIRWISE -- result,time: 0.122880E+00 0.179189E-04 1024
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123456E+00 0.189005E-04 2048
32-Bit PAIRWISE -- result,time: 0.122880E+00 0.190103E-04 2048
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.195353E-04 4096
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.195569E-04 4096
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.198326E-04 8192
32-Bit PAIRWISE -- result,time: 0.122881E+00 0.198258E-04 8192
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123457E+00 0.200243E-04 16384
32-Bit PAIRWISE -- result,time: 0.114688E+00 0.186374E-04 16384
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123443E+00 0.200717E-04 32768
32-Bit PAIRWISE -- result,time: 0.982949E-01 0.160225E-04 32768
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123464E+00 0.200451E-04 65536
32-Bit PAIRWISE -- result,time: 0.655408E-01 0.106974E-04 65536
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.200135E-04 131072
32-Bit PAIRWISE -- result,time: 0.123475E+00 0.200113E-04 131072
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.200121E-04 262144
32-Bit PAIRWISE -- result,time: 0.123475E+00 0.200112E-04 262144
32-Bit RECURSIVE-PAIRWISE -- result,time: 0.123475E+00 0.200140E-04 524288
32-Bit PAIRWISE -- result,time: 0.123475E+00 0.200149E-04 524288