CONTIGUOUS certainly causes a problem in Gfortran
I have included a modified code that demonstrates some call alternatives.
!! probcons.f90:
module probcons
use iso_fortran_env, only: dp=>real64
real(dp), allocatable, dimension(:,:,:) :: hist
integer, parameter :: nhistpoints=50000
! contains
end module probcons
!! blahroutines.f90:
module blahroutines
use iso_fortran_env, only: dp=>real64
implicit none
contains
subroutine blah_orig (hist,res,i,j)
real(dp), contiguous, dimension(:,0:) :: hist
! real(8), dimension(:,0:) :: hist
real(dp) :: res
integer :: i,j,k,n
res=1.d0
do k=1,10
n=min(j,k)
res=res*hist(i,n)
enddo
if(res.eq.-12345.d0)write(6,*)'res=',res
return
end subroutine blah_orig
subroutine blah (hist,res,i,j)
! real(8), contiguous, dimension(:,0:) :: hist
real(dp), dimension(:,0:) :: hist
real(dp) :: res
integer :: i,j,k,n
res=1.d0
do k=1,10
n=min(j,k)
res=res*hist(i,n)
enddo
if(res.eq.-12345.d0)write(6,*)'res=',res
return
end subroutine blah
subroutine blah_star (hist,res,i,j)
! real(8), contiguous, dimension(:,0:) :: hist
real(dp), dimension(6,0:*) :: hist
real(dp) :: res
integer :: i,j,k,n
res=1.d0
do k=1,10
n=min(j,k)
res=res*hist(i,n)
enddo
if(res.eq.-12345.d0)write(6,*)'res=',res
return
end subroutine blah_star
function delta_sec ()
! high precision timer.
implicit none
real(dp) :: delta_sec
integer*8 :: tick, last=-1, rate=-1
if ( rate < 0 ) call system_clock ( last, rate )
call system_clock (tick)
delta_sec = dble (tick-last) / dble (rate)
last = tick
end function delta_sec
end module blahroutines
! aamain.f90:
program arraytemptest
use probcons ! , only : hist,nhistpoints
use blahroutines ! , only : blah, blah_star, blah_orig
implicit none
integer :: npart_lcl,i,j
real(dp) :: rslt, sec
sec = delta_sec ()
npart_lcl=256
write(6,*)'Running with nhistpoints,npart_lcl=',nhistpoints,npart_lcl
!allocate arrays:
allocate(hist(6,0:nhistpoints,npart_lcl))
hist(:,0:,:)=1.d0
! do the calculation: with contiguous
sec = delta_sec ()
do i=1,npart_lcl
do j=1,npart_lcl
call blah_orig (hist(1:6,0:nhistpoints,i),rslt,i,j)
! call blah(hist(:,0:,i),rslt,i,j)
if (rslt.eq.-123456.d0) write(6,*)rslt
end do
end do
sec = delta_sec ()
write(6,*)'BLAH_orig Done. Elapsed time=',sec,' sec', rslt
! do the calculation: with contiguous
sec = delta_sec ()
do i=1,npart_lcl
do j=1,npart_lcl
call blah_orig (hist(:,:,i),rslt,i,j)
! call blah(hist(:,0:,i),rslt,i,j)
if (rslt.eq.-123456.d0) write(6,*)rslt
end do
end do
sec = delta_sec ()
write(6,*)'BLAH_o:,: Done. Elapsed time=',sec,' sec', rslt
! do the calculation: no contiguous
sec = delta_sec ()
do i=1,npart_lcl
do j=1,npart_lcl
call blah (hist(1:6,0:nhistpoints,i),rslt,i,j)
! call blah(hist(:,0:,i),rslt,i,j)
if (rslt.eq.-123456.d0) write(6,*)rslt
end do
end do
sec = delta_sec ()
write(6,*)'BLAH(1:6 Done. Elapsed time=',sec,' sec', rslt
! do the calculation: no array section definition
sec = delta_sec ()
do i=1,npart_lcl
do j=1,npart_lcl
! call blah (hist(1:6,0:nhistpoints,i),rslt,i,j)
call blah (hist(:,0:,i),rslt,i,j)
if (rslt.eq.-123456.d0) write(6,*)rslt
end do
end do
sec = delta_sec ()
write(6,*)'BLAH(:,:) Done. Elapsed time=',sec,' sec', rslt
! do the calculation: F77 wrapper approach
sec = delta_sec ()
do i=1,npart_lcl
do j=1,npart_lcl
call blah_star (hist(1,0,i),rslt,i,j)
if (rslt.eq.-123456.d0) write(6,*)rslt
end do
end do
sec = delta_sec ()
write(6,*)'BLAH_star Done. Elapsed time=',sec,' sec', rslt
end program arraytemptest
Target: x86_64-w64-mingw32
Thread model: win32
gcc version 11.1.0 (GCC)
COLLECT_GCC_OPTIONS='-v' '-fimplicit-none' '-fallow-argument-mismatch' '-O2' '-march=native' '-ffast-math' '-o' 'blah.exe'
Running with nhistpoints,npart_lcl= 50000 256
BLAH_orig Done. Elapsed time= 31.638044000000001 sec 1.0000000000000000
BLAH_o:,: Done. Elapsed time= 4.9350000000000002E-004 sec 1.0000000000000000
BLAH(1:6 Done. Elapsed time= 4.6920000000000002E-004 sec 1.0000000000000000
BLAH(:,:) Done. Elapsed time= 4.7029999999999999E-004 sec 1.0000000000000000
BLAH_star Done. Elapsed time= 3.4120000000000000E-004 sec 1.0000000000000000
Use of contiguous combined with array section call certainly fails in this case
F77 wrapper works best, as it enforces no temporary arrays, but this is not always the best case.