Hello,
I have never used the contiguous attribute for dummy array arguments so far, but in the following test code, gfortran-12 seems to give 2x speed-up as compared to the code with no such attribute. Is this kind of speed-up typical of other (more complicated) codes, or am I doing something wrong in the test code…? (FYI, I tried attaching contiguous to my own codes much ago, but I observed no speed-up at that time.)
[ test code ]
sumexp.f90:
module test_m
    use iso_fortran_env, only: dp => real64
    implicit none
contains
subroutine sumexp_1d( vec, val )
    real(dp), intent(in) _CONT_ :: vec(:)
    real(dp), intent(out) :: val
    val = sum( exp(vec) )
end
subroutine sumexp_2d( vecs, ans )
    real(dp), intent(in) _CONT_ :: vecs(:,:)
    real(dp), intent(out) :: ans
    ans = sum( exp( vecs ) )
end
subroutine sumexp_2d_loop( vecs, ans )
    real(dp), intent(in) _CONT_ :: vecs(:,:)
    real(dp), intent(out) :: ans
    integer :: i1, i2
    ans = 0
    do i2 = 1, size( vecs, 2 )
    do i1 = 1, size( vecs, 1 )
        ans = ans + exp( vecs( i1, i2 ) )
    enddo
    enddo
end
end module
main.f90:
program main
    use test_m
    implicit none
    integer, parameter :: ndat = 10 ** 6
    real(dp), allocatable :: arr(:,:)
    real(dp) :: ans, val, c_start, c_end
    integer :: idat
    allocate( arr( 4, ndat ) )
    call random_number( arr )
    print *, "---- 1d ----"
    call cpu_time(c_start)
    ans = 0
    do idat = 1, ndat
        call sumexp_1d( arr( :, idat ), val )
        ans = ans + val
    enddo
    call cpu_time(c_end)
    print *, "ans  = ", ans
    print *, "time = ", c_end - c_start
    print *, "---- 2d ----"
    call cpu_time(c_start)
    call sumexp_2d( arr, ans )
    call cpu_time(c_end)
    print *, "ans  = ", ans
    print *, "time = ", c_end - c_start
    print *, "---- 2d ----"
    call cpu_time(c_start)
    call sumexp_2d( arr, ans )
    call cpu_time(c_end)
    print *, "ans  = ", ans
    print *, "time = ", c_end - c_start
    print *, "--- 2d_loop ---"
    call cpu_time(c_start)
    call sumexp_2d_loop( arr, ans )
    call cpu_time(c_end)
    print *, "ans  = ", ans
    print *, "time = ", c_end - c_start
end
[ Result (gfortran-12) ]
Note1: machine = Ryzen 7 5700X + Ubuntu22
Note2: Timing fluctuates to some extent, but the fluctuation seems within 20 % or so.
$ gfortran-12 -O3 -march=native -cpp -D_CONT_= sumexp.f90 main.f90 && ./a.out
 ---- 1d ----
 ans  =    6873536.5875237575
 time =    1.5854000000000000E-002
 ---- 2d ----
 ans  =    6873536.5875230227
 time =    1.0854000000000003E-002
 --- 2d_loop ---
 ans  =    6873536.5875230227
 time =    1.1753000000000000E-002
$ gfortran-12 -O3 -march=native -cpp -D_CONT_= -flto sumexp.f90 main.f90 && ./a.out
 ---- 1d ----
 ans  =    6871727.5196960503
 time =    6.3569999999999946E-003   <-- becomes faster with -flto (due to inlining?)
 ---- 2d ----
 ans  =    6871727.5196956908
 time =    1.1256000000000002E-002
 --- 2d_loop ---
 ans  =    6871727.5196956908
 time =    1.1327999999999998E-002
$ gfortran-12 -O3 -march=native -cpp -D_CONT_=",contiguous" -flto sumexp.f90 main.f\
90 && ./a.out
 ---- 1d ----
 ans  =    6872925.9581768718
 time =    7.0910000000000001E-003
 ---- 2d ----
 ans  =    6872925.9581758250
 time =    6.7070000000000046E-003  <-- becomes faster with "contiguous"
 --- 2d_loop ---
 ans  =    6872925.9581758250
 time =    6.6529999999999992E-003 <-- becomes faster with "contiguous"
(Result of CompilerExplorer for comparison with ifort/ifx)
gfortran-14.2 : options =  -O3 -march=native -cpp -D_CONT_=
 ---- 1d ----
 ans  =    6873122.9830409009     
 time =    6.3490000000000005E-003
 ---- 2d ----
 ans  =    6873122.9830414364     
 time =    1.0876000000000004E-002
 --- 2d_loop ---
 ans  =    6873122.9830414364     
 time =    1.1318000000000002E-002
gfortran-14.2 : options =  -O3 -march=native -cpp -D_CONT_=",contiguous"
---- 1d ----
 ans  =    6873095.4191845488     
 time =    7.2559999999999986E-003
 ---- 2d ----
 ans  =    6873095.4191840515     
 time =    7.7909999999999993E-003  <-- now speed-up is not so much?
 --- 2d_loop ---
 ans  =    6873095.4191840515     
 time =    9.9569999999999936E-003
[ Result (ifort2021.11.0 with CompilerExplorer) ]
ifort options = -O3 -march=native -cpp -D_CONT_=
 ---- 1d ----
 ans  =    6873735.16051230     
 time =   8.883999999999996E-003
 ---- 2d ----
 ans  =    6873735.16051221     
 time =   7.436999999999999E-003
 --- 2d_loop ---
 ans  =    6873735.16051221     
 time =   7.478000000000005E-003
ifort options = -O3 -march=native -cpp -D_CONT_=",contiguous"
---- 1d ----
 ans  =    6873735.16051230     
 time =   6.016000000000001E-003  <-- somewhat faster with `contiguous`
 ---- 2d ----
 ans  =    6873735.16051221     
 time =   5.009000000000000E-003   <-- somewhat faster with `contiguous`
 --- 2d_loop ---
 ans  =    6873735.16051221     
 time =   5.124999999999998E-003  <-- somewhat faster with `contiguous`
[ Result (ifx2024.0.0) with CompilerExplorer
ifx options = -O3 -march=native -cpp -D_CONT_=
---- 1d ----
 ans  =    6873735.16051203     
 time =   6.585000000000008E-003
 ---- 2d ----
 ans  =    6873735.16051203     
 time =   1.010300000000000E-002
 --- 2d_loop ---
 ans  =    6873735.16051203     
 time =   9.993000000000002E-003
ifx options = -O3 -march=native -cpp -D_CONT_=",contiguous"
---- 1d ----
 ans  =    6873735.16051203     
 time =   9.002000000000010E-003  <-- slower with `contiguous` (?)
 ---- 2d ----
 ans  =    6873735.16051203     
 time =   9.013000000000007E-003  <-- not much different with `contiguous`
 --- 2d_loop ---
 ans  =    6873735.16051203     
 time =   8.839000000000000E-003  <-- not much different with `contiguous`
- As compared to gfortran, the speed-up seems moderate for ifort and very little for ifx. So the result seems to vary depending on compilers.
- Another surprise to me is that sumexp_1d()is rather fast as compared to the other 2D routines. I expected that the 1D routine would be very slow because a short vector is passed to a routine many times, so with a lot of overhead. But in this simple case, inlining actually eliminates such an overhead…?
