Speed of array intrinsics

The array intrinsic functions of Fortran are convenient, but it appears that if you are doing multiple calculations on the same array, it may be faster to write your own function. For the program

module min_max_mod
implicit none
integer, parameter :: wp = kind(1.0d0)
contains
pure function min_max(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp)             :: y(2)
integer                   :: i,n
n = size(x)
if (n < 1) return
y(1) = x(1)
y(2) = y(1)
do i=2,n
   if (x(i) < y(1)) y(1) = x(i)
   if (x(i) > y(2)) y(2) = x(i)
end do
end function min_max
!
pure function min_max_mean(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp)             :: y(3)
integer                   :: i,n
n = size(x)
if (n < 1) return
y(1)   = x(1)
y(2:3) = y(1)
do i=2,n
   if (x(i) < y(1)) y(1) = x(i)
   if (x(i) > y(2)) y(2) = x(i)
   y(3) = y(3) + x(i)
end do
if (n > 0) y(3) = y(3)/n
end function min_max_mean
end module min_max_mod
!
program xmin_max
use min_max_mod
implicit none
integer :: i,n
integer, parameter :: nt = 5, nlen = 30
real(kind=wp), allocatable :: x(:)
real(kind=wp)              :: extremes(2,2),t(nt),xmin_max_mean(3,2)
character (len=nlen) :: labels(nt-1) = [character(len=nlen) :: &
   "minval(x), maxval(x)","min_max(x)", &
   "minval(x),maxval(x),sum(x)/n","min_max_mean(x)"]
character (len=*), parameter :: fmt_cr = "(a20,*(1x,f16.12))"
n = 10**8
allocate (x(n))
call random_number(x)
call cpu_time(t(1))
extremes(:,1) = [minval(x),maxval(x)]
call cpu_time(t(2))
extremes(:,2) = min_max(x)
call cpu_time(t(3))
xmin_max_mean(:,2) = [minval(x),maxval(x),sum(x)/n]
call cpu_time(t(4))
xmin_max_mean(:,1) = min_max_mean(x)
call cpu_time(t(5))
print fmt_cr,"min, max:",extremes(:,1)
print fmt_cr,"min, max:",extremes(:,2)
print fmt_cr,"min, max, mean:",xmin_max_mean(:,1)
print fmt_cr,"min, max, mean:",xmin_max_mean(:,2)
print "(/,a8,5x,a30)","time","task"
print "(f8.4,5x,a30)",(t(i+1)-t(i),trim(labels(i)),i=1,nt-1)
end program xmin_max

the output for gfortran -O3 on WSL is

           min, max:   0.000000009614   0.999999996113
           min, max:   0.000000009614   0.999999996113
     min, max, mean:   0.000000009614   0.999999996113   0.499997321868
     min, max, mean:   0.000000009614   0.999999996113   0.499997321868

    time                               task
  0.2214               minval(x), maxval(x)
  0.1194                         min_max(x)
  0.3329       minval(x),maxval(x),sum(x)/n
  0.2380                    min_max_mean(x)

and for ifort -O3 on WSL it is

           min, max:   0.000000003725   0.999999968801
           min, max:   0.000000003725   0.999999968801
     min, max, mean:   0.000000003725   0.999999968801   0.499970722298
     min, max, mean:   0.000000003725   0.999999968801   0.499970722298

    time                               task
  0.1138               minval(x), maxval(x)
  0.0624                         min_max(x)
  0.1568       minval(x),maxval(x),sum(x)/n
  0.0657                    min_max_mean(x)
1 Like

If one wants the largest and smallest elements of an array of intrinsic type, the compiler’s optimization quality is what counts. In your function min_max, 2(n-1) comparisons are required. If each element of the array is of a user-defined type composite variable, it may be worthwhile to reduce the number of comparisons. For example, the following algorithm gets by with 3.n/2 - 2 comparisons, when n is even.

  1. Take the first two elements, compare their keys and use the results to initialize key_min and key_max.
  2. Take the next pair, identify which of the two is larger. Compare the smaller to the previous min. Compare the larger to the previous max.
  3. Repeat Step 2 until all pairs are processed.

If the size of the array, n, is odd, modify the algorithm to suit.

2 Likes

Thanks for the suggestion, but I am finding that algorithm to be slower. For the code

module min_max_mod
implicit none
integer, parameter :: wp = kind(1.0d0)
contains
pure function min_max(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp)             :: y(2)
integer                   :: i,n
n = size(x)
if (n < 1) return
y(1) = x(1)
y(2) = y(1)
do i=2,n
   if (x(i) < y(1)) y(1) = x(i)
   if (x(i) > y(2)) y(2) = x(i)
end do
end function min_max
!
pure function min_max_pairs(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp)             :: y(2)
integer                   :: i,n
logical                   :: odd
n = size(x)
if (n < 1) return
if (n == 1) then
   y(1) = x(1)
   y(2) = x(2)
   return
end if
odd = mod(n,2) == 1
if (x(2) > x(1)) then
   y(1) = x(1)
   y(2) = x(2)
else
   y(1) = x(2)
   y(2) = x(1)
end if
do i=3,n-1,2
   if (x(i+1) > x(i)) then
      if (y(1) > x(i))   y(1) = x(i)
      if (y(2) < x(i+1)) y(2) = x(i+1)
   else
      if (y(1) > x(i+1)) y(1) = x(i+1)
      if (y(2) < x(i))   y(2) = x(i)
   end if
end do
if (odd) then
   y(1) = min(y(1),x(n))
   y(2) = max(y(1),x(n))
end if
end function min_max_pairs
!
pure function min_max_local(x) result(y)
! same as min_max_pairs but sets 
! xi to x(i) and xi1 to x(i+1)
real(kind=wp), intent(in) :: x(:)
real(kind=wp)             :: y(2)
real(kind=wp)             :: xi,xi1
integer                   :: i,n
logical                   :: odd
n = size(x)
if (n < 1) return
if (n == 1) then
   y(1) = x(1)
   y(2) = x(2)
   return
end if
odd = mod(n,2) == 1
if (x(2) > x(1)) then
   y(1) = x(1)
   y(2) = x(2)
else
   y(1) = x(2)
   y(2) = x(1)
end if
do i=3,n-1,2
   xi1 = x(i+1)
   xi  = x(i)
   if (xi1 > xi) then
      if (y(1) > xi)   y(1) = xi
      if (y(2) < xi1) y(2) = xi1
   else
      if (y(1) > xi1) y(1) = xi1
      if (y(2) < xi)   y(2) = xi
   end if
end do
if (odd) then
   y(1) = min(y(1),x(n))
   y(2) = max(y(1),x(n))
end if
end function min_max_local
!
pure function min_max_mean(x) result(y)
real(kind=wp), intent(in) :: x(:)
real(kind=wp)             :: y(3)
integer                   :: i,n
n = size(x)
if (n < 1) return
y(1)   = x(1)
y(2:3) = y(1)
do i=2,n
   if (x(i) < y(1)) y(1) = x(i)
   if (x(i) > y(2)) y(2) = x(i)
   y(3) = y(3) + x(i)
end do
if (n > 0) y(3) = y(3)/n
end function min_max_mean
end module min_max_mod
!
program xmin_max
use min_max_mod
implicit none
integer :: i,n
integer, parameter :: nt = 7, nlen = 30, ncol_extremes = 4
integer                    :: icol
real(kind=wp), allocatable :: x(:)
real(kind=wp)              :: extremes(2,ncol_extremes),t(nt),xmin_max_mean(3,2)
character (len=nlen) :: labels(nt-1) = [character(len=nlen) :: &
   "minval(x), maxval(x)","min_max(x)","min_max_pairs(x)", &
   "min_max_local(x)","minval(x),maxval(x),sum(x)/n","min_max_mean(x)"]
character (len=*), parameter :: fmt_cr = "(a20,*(1x,f16.12))"
n = 10**8
allocate (x(n))
call random_number(x)
call cpu_time(t(1))
extremes(:,1) = [minval(x),maxval(x)]
call cpu_time(t(2))
extremes(:,2) = min_max(x)
call cpu_time(t(3))
extremes(:,3) = min_max_pairs(x)
call cpu_time(t(4))
extremes(:,4) = min_max_local(x)
call cpu_time(t(5))
xmin_max_mean(:,2) = [minval(x),maxval(x),sum(x)/n]
call cpu_time(t(6))
xmin_max_mean(:,1) = min_max_mean(x)
call cpu_time(t(7))
do icol=1,ncol_extremes
   print fmt_cr,"min, max:",extremes(:,icol)
end do
print fmt_cr,"min, max, mean:",xmin_max_mean(:,1)
print fmt_cr,"min, max, mean:",xmin_max_mean(:,2)
print "(/,a8,5x,a30)","time","task"
print "(f8.4,5x,a30)",(t(i+1)-t(i),trim(labels(i)),i=1,nt-1)
end program xmin_max

with gfortran -O3 on WSL2 I get

           min, max:   0.000000002249   0.999999986684
           min, max:   0.000000002249   0.999999986684
           min, max:   0.000000002249   0.999999986684
           min, max:   0.000000002249   0.999999986684
     min, max, mean:   0.000000002249   0.999999986684   0.499991319230
     min, max, mean:   0.000000002249   0.999999986684   0.499991319230

    time                               task
  0.2200               minval(x), maxval(x)
  0.1264                         min_max(x)
  0.2363                   min_max_pairs(x)
  0.2429                   min_max_local(x)
  0.3305       minval(x),maxval(x),sum(x)/n
  0.2371                    min_max_mean(x)

where min_max_pairs and min_max_local are the functions that try to reduce the number of comparisons, and with ifort -O3 on WSL2 I get

           min, max:   0.000000003725   0.999999968801
           min, max:   0.000000003725   0.999999968801
           min, max:   0.000000003725   0.999999968801
           min, max:   0.000000003725   0.999999968801
     min, max, mean:   0.000000003725   0.999999968801   0.499970722298
     min, max, mean:   0.000000003725   0.999999968801   0.499970722298

    time                               task
  0.1142               minval(x), maxval(x)
  0.0630                         min_max(x)
  0.2162                   min_max_pairs(x)
  0.2153                   min_max_local(x)
  0.1548       minval(x),maxval(x),sum(x)/n
  0.0674                    min_max_mean(x)

Interesting cases.
I found for my Win7-gfortran 11.1 on an i5-2300:
using the pairs approach was the slowest approach,
using more optimisation helped, but
using -O3 -ffast-math with intrisnics was the fastest.

Why don’t pairs do better :frowning:

Of interest, my minor changes to min_max, min_max_pair and timer, which achieved very little were:

  pure function min_max_new(x) result(y)
  ! same as min_max but different starting case
  real(kind=wp), intent(in) :: x(:)
  real(kind=wp)             :: y(2)
  integer                   :: i,n

  n = size(x)
  if (n < 1) return

   y = x(n)

   do i = 1,n-1
     if ( x(i) < y(1) ) y(1) = x(i)
     if ( x(i) > y(2) ) y(2) = x(i)
   end do

  end function min_max_new

  pure function min_max_dev(x) result(y)
  ! same as min_max_pairs, but different start case eliminates ODD
  real(kind=wp), intent(in) :: x(:)
  real(kind=wp)             :: y(2)
  integer                   :: i,n

  n = size(x)
  if (n < 1) return

   y = x(n)

   do i = 1,n-1,2
     if ( x(i) > x(i+1) ) then
       if ( x(i+1) < y(1) ) y(1) = x(i+1)
       if ( x(i)   > y(2) ) y(2) = x(i)  
     else
       if ( x(i)   < y(1) ) y(1) = x(i)  
       if ( x(i+1) > y(2) ) y(2) = x(i+1)
     end if
   end do

  end function min_max_dev

  subroutine elapse_time (sec)
  real(kind=wp)  :: sec
  integer*8 :: tick, rate
!z  call cpu_time (sec)
  call system_clock ( tick, rate )
  sec = dble(tick) / dble(rate)
  end subroutine elapse_time

It’s not the fastest for me, but it’s indeed 2x faster with -ffast-math or -Ofast

I think because this involves too many if/then/else that break some optimizations, while the intrinsics min/max are quite fast. As a matter of fact, with gfortran v1 = min(v1,a(i)) is much faster than if (a(i) < v1) v1 = a(i), at least with safe optimizations.

I’ve compared minval/maxval, 2 separate hand-made loops with min/max intrinsics, 1 single hand-made loop with min/max intrinsics, and 1 single hand-made loop with 2 flavoes of if tests:

$ gfortran -O3 minmax.f90 && ./a.out 
minval/maxval      0.180815E-07   0.100000E+01     time=   0.218761E+00
2 loops            0.180815E-07   0.100000E+01     time=   0.968360E-01
1 loop             0.180815E-07   0.100000E+01     time=   0.469800E-01
1 loop with if1    0.180815E-07   0.100000E+01     time=   0.163348E+00
1 loop with if2    0.180815E-07   0.100000E+01     time=   0.127367E+00

$ gfortran -O3 -Ofast minmax.f90 && ./a.out 
minval/maxval      0.183796E-07   0.100000E+01     time=   0.958210E-01
2 loops            0.183796E-07   0.100000E+01     time=   0.874210E-01
1 loop             0.183796E-07   0.100000E+01     time=   0.453200E-01
1 loop with if1    0.183796E-07   0.100000E+01     time=   0.766570E-01
1 loop with if2    0.183796E-07   0.100000E+01     time=   0.834430E-01

In all cases the single loop with min/max intrinsics is the fastest. What is weird is that without -Ofast, the code with 2 loops is much faster than minval/maxval.

program minmaxs
use iso_fortran_env
implicit none

integer, parameter :: wp=real64
real(wp), allocatable :: a(:)
real(wp) :: v1, v2
integer :: N = 2**26
integer(real64) :: tic, toc, rate

allocate( a(N) )
call random_number(a)

v1 = minval(a)
v2 = maxval(a)

call system_clock(tic,rate)
v1 = minval(a)
v2 = maxval(a)
call system_clock(toc)
write(*,"(A,2E15.6,A,E15.6)") &
   "minval/maxval   ", v1, v2, "     time=", (real(toc-tic,kind=real64))/rate

call system_clock(tic)
call min_max(a,v1,v2)
call system_clock(toc)
write(*,"(A,2E15.6,A,E15.6)") &
   "2 loops         ", v1, v2, "     time=", (real(toc-tic,kind=real64))/rate

call system_clock(tic)
call minmax(a,v1,v2)
call system_clock(toc)
write(*,"(A,2E15.6,A,E15.6)") &
   "1 loop          ", v1, v2, "     time=", (real(toc-tic,kind=real64))/rate

call system_clock(tic)
call minmax_if(a,v1,v2)
call system_clock(toc)
write(*,"(A,2E15.6,A,E15.6)") &
   "1 loop with if1 ", v1, v2, "     time=", (real(toc-tic,kind=real64))/rate

call system_clock(tic)
call minmax_if2(a,v1,v2)
call system_clock(toc)
write(*,"(A,2E15.6,A,E15.6)") &
   "1 loop with if2 ", v1, v2, "     time=", (real(toc-tic,kind=real64))/rate


contains


   subroutine min_max(a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      mi = min(mi,a(i))
   end do
   do i = 2, n
      ma = max(ma,a(i))
   end do
   end subroutine


   subroutine minmax(a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      mi = min(mi,a(i))
      ma = max(ma,a(i))
   end do
   end subroutine
   
   
   subroutine minmax_if(a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      if (a(i) < mi) mi = a(i)
      if (a(i) > ma) ma = a(i)
   end do
   end subroutine
  
  
   subroutine minmax_if2(a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      if (a(i) < mi) then
         mi = a(i)
      else if (a(i) > ma) then
         ma = a(i)
      end if
   end do
   end subroutine
   
end program

If any of you are curious about a C++ solution:

// cpp_minmax.cpp
#include <algorithm> // C++11
#include <span> // C++20


extern "C"
void cpp_minmax(int n, double *af, double *mi, double *ma)
{
  const std::span<const double> a{af,(size_t) n};
  auto [min,max] = std::minmax_element(a.begin(),a.end());

  // unpack iterators
  *mi = *min;
  *ma = *max;
}

For compilation you need to set the standard explicitly: g++ -std=c++20 -Wall -O3 -c cpp_minmax.cpp.

The corresponding Fortran interface is:

interface
   subroutine cpp_minmax(n,a,mi,ma) bind(c,name="cpp_minmax")
      use, intrinsic :: iso_c_binding, only: c_int, c_double
      integer(c_int), intent(in), value :: n
      real(c_double), intent(in) :: a(n)
      real(c_double), intent(out) :: mi, ma
   end subroutine
end interface

On my old Thinkpad with an Intel Ivy Bridge (2012), and compiled with GCC 10, it is slower than the pair of Fortran intrinsics.

2 Likes

@PierU
A couple of ideas about the testing:

  • the order of the test can be an issue, so I repeated the tests npass times
  • final average is reported
  • I don’t like “integer(real64)”

After all that, the results were consistent in my testing, but different to yours.
Making the test order random can help, but probably not justified in this case.
Other compile options might change a bit ?

program minmaxs
  use iso_fortran_env
  implicit none
  
  integer, parameter :: wp=real64
  integer, parameter :: tp=int64
  real(wp), allocatable :: a(:)
  real(wp) :: v1, v2, sec, secs(5)
  integer  :: N = 2**26, ncase=5, npass=6, pass, k
  integer(tp) :: tic, toc, rate
  character :: case_name(0:5) *16 = [ "initialise      ",  & 
                                      "minval/maxval   ",  &
                                      "2 loops         ",  &
                                      "1 loop          ",  &
                                      "1 loop with if1 ",  &
                                      "1 loop with if2 " ]

  call system_clock (tic,rate)
  allocate( a(N) )
  call random_number(a)
  v1 = minval(a)
  v2 = maxval(a)
  sec = delta_sec ()
  write (*,1001)   case_name(0), v1, v2, "     time=", sec

  do pass = 1,npass
    write (*,1002) ' '
    do k = 1,ncase
      call system_clock (tic)
      select case (k)
           
        case (1)    ! "minval/maxval"
          v1 = minval (a)
          v2 = maxval (a)
          
        case (2)     ! "2 loops"
          call min_max (a,v1,v2)
          
        case (3)     ! "1 loop"
          call minmax (a,v1,v2)
          
        case (4)     ! "1 loop with if1"
          call minmax_if (a,v1,v2)
          
        case (5)     ! "1 loop with if2"
          call minmax_if2 (a,v1,v2)
      end select
      sec = delta_sec ()
      write (*,1001)   case_name(k), v1, v2, "     time=", sec
      secs(k) = secs(k) + sec
    end do ! k
  end do  ! pass

  write (*,1002) ' '
  do k = 1,5
    write (*,1002) case_name(k), secs(k)/(npass)
  end do

 1001 format ( A, 2ES15.6, A, f15.8 )
 1002 format ( A, f15.8 )


contains


   subroutine min_max (a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      mi = min(mi,a(i))
   end do
   do i = 2, n
      ma = max(ma,a(i))
   end do
   end subroutine


   subroutine minmax (a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      mi = min(mi,a(i))
      ma = max(ma,a(i))
   end do
   end subroutine
   
   
   subroutine minmax_if (a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      if (a(i) < mi) mi = a(i)
      if (a(i) > ma) ma = a(i)
   end do
   end subroutine
  
  
   subroutine minmax_if2 (a,mi,ma)
   real(wp), intent(in) :: a(:)
   real(wp), intent(out) :: mi, ma
   integer :: n, i
   
   n = size(a)
   mi = a(1) ; ma = a(1)
   do i = 2, n
      if (a(i) < mi) then
         mi = a(i)
      else if (a(i) > ma) then
         ma = a(i)
      end if
   end do
   end subroutine
   
   real(wp) function delta_sec ()
   call system_clock (toc)
   delta_sec = dble(toc-tic)/dble(rate)
   end function delta_sec
   
end program
..
minval/maxval      2.021972E-08   1.000000E+00     time=     0.09090000
2 loops            2.021972E-08   1.000000E+00     time=     0.29000000
1 loop             2.021972E-08   1.000000E+00     time=     0.13980000
1 loop with if1    2.021972E-08   1.000000E+00     time=     0.08430000
1 loop with if2    2.021972E-08   1.000000E+00     time=     0.08440000

minval/maxval        0.09086667
2 loops              0.29300000
1 loop               0.14276667
1 loop with if1      0.08636667
1 loop with if2      0.08710000

(hopefully no errors from the changes)

1 Like