Why a function returns an array is much slower than a subroutine returns an array? (real MWE included)

When told to optimize, compilers sometimes optimize away unneeded calculations. When I add print*,y statements (code below), I get the same times for the two methods with ifort -O3 -heap-arrays on Windows.

module fg
implicit none
integer, private, parameter :: i4=selected_int_kind(9)
integer, private, parameter :: i8=selected_int_kind(15)
integer, private, parameter :: r8=selected_real_kind(15,9) 
integer(kind=i8), private, save :: n 
contains
subroutine fg_init(nin)
integer ( kind = i8 ) :: nin
n = nin
return
end subroutine  
function fi_vec ( x ) 
real ( kind = r8 ) :: fi_vec(n)
real ( kind = r8 ), intent(in) :: x(n)
    fi_vec(1) = -x(2)*x(1) 
    fi_vec(2) = -x(2) + 1.0_r8
return
end function fi_vec
subroutine fi_vec_sub ( x, f ) 
implicit none
real ( kind = r8 ) :: f(n)
real ( kind = r8 ) :: x(n) 
    f(1) = -x(2)*x(1) 
    f(2) = -x(2) + 1.0_r8
return
end 
end module fg
  
program main  
use fg
implicit none
integer, parameter :: i4=selected_int_kind(9)
integer, parameter :: i8=selected_int_kind(15)
integer, parameter :: r8=selected_real_kind(15,9)
integer(kind=i8) :: np,i,n
real(kind=r8) :: t0, t1, x(2),y(2)  
n=2
call fg_init(n)
np = 10**9
call CPU_TIME(t0)
do i=1,np
    x = x + 1.0_r8
    call fi_vec_sub(x, y)
enddo
call CPU_TIME(t1)
print*,"y=",y ! added to ensure y is computed
write(6,*) 'time for subroutine which return 2D array = ', t1 - t0
call CPU_TIME(t0)
do i=1,np
    x = x + 1.0_r8
    y = fi_vec(x)
enddo
call CPU_TIME(t1)
write(6,*) 'time for function whose value is a 2D array = ', t1 - t0
print*,"y=",y ! added to ensure y is computed
end program main
2 Likes