Nvfortran comparison of do concurrent vs OpenMP code

I used the following two versions of the code. It is the same OpenMP code implementation in gfortran and intel - Help - Fortran Discourse (fortran-lang.discourse.group)

1) OpenMP

!  OpenMP version of the code

program openmp_test
    use iso_fortran_env
    implicit none

    
    !=============
    
    integer ( kind = 4 ), parameter :: Nx = 2000
    integer ( kind = 4 ), parameter :: Ny = 2000
    real ( kind = 8 )               :: dx = 0.03d0
    real ( kind = 8 )               :: dy = 0.03d0
    real ( kind = 8 ),    parameter :: one  = 1.0
    real ( kind = 8 ),    parameter :: two  = 2.0
    real ( kind = 8 ),    parameter :: four = 4.0
    real ( kind = 8 ),    parameter :: half = 0.5
    
    !=============
    
    integer (kind = 4 ) :: nsteps = 2000
    integer (kind = 4 ) :: nprint = 100
    integer (kind = 4 ) :: tsteps 
    real ( kind = 8 )   :: dtime  = 1.0d-4
    
    !===============
    
    real ( kind = 8 )   :: tau   = 0.0003d0
    real ( kind = 8 )   :: epsilonb = 0.01d0
    real ( kind = 8 )   :: kappa = 1.8d0
    real ( kind = 8 )   :: delta = 0.02d0
    real ( kind = 8 )   :: aniso = 6.0d0
    real ( kind = 8 )   :: alpha = 0.9d0
    real ( kind = 8 )   :: gama  = 10.0d0
    real ( kind = 8 )   :: teq   = 1.0d0
    real ( kind = 8 )   :: theta0= 0.2d0
    real ( kind = 8 )   :: seed  = 5.0d0
    
    real ( kind = 8 )   :: pix   = four*atan(one)
    
    !===============
    
    real ( kind = 8 ) , dimension( Nx, Ny ) :: phi, tempr
    real ( kind = 8 ) , dimension( Nx, Ny ) :: lap_phi, lap_tempr
    real ( kind = 8 ) , dimension( Nx, Ny ) :: phidx, phidy
    real ( kind = 8 ) , dimension( Nx, Ny ) :: epsil, epsilon_deriv
    real ( kind = 8 )                       :: phi_old, term1, term2
    real ( kind = 8 )                       :: theta, m
    integer ( kind = 4 )                    :: i, j, ip, im, jp, jm
    
    
    
    !============================================================
    
    write (*,*) 'Vern : ',compiler_version ()
    write (*,*) 'Opts : ',compiler_options ()

    
    call timer ( 'start' )
    
    phi = 0.0
    tempr = 0.0
    
    do i = 1, Nx
        do j = 1, Ny
            if ( (i - Nx/two)*(i - Nx/two) + (j - Ny/two)*(j - Ny/two) < seed ) then
                phi(i,j) = one
            end if
        end do
    end do
    
    call timer ( 'initialise phi' )
    
    
    
    !============================================================
    
    
    
    time_loop: do tsteps = 1, nsteps
        
        
        !$omp  parallel do default ( none )     &
        !$omp&             private ( i,j, ip,im, jp,jm, theta )   &
        !$omp&             shared  ( dx, dy, lap_phi, lap_tempr, phi, tempr, phidx, phidy, epsil, epsilon_deriv,  &
        !$omp&                       theta0, aniso, delta, epsilonb )
        
        do j = 1, Ny
            do i =1, Nx                
                
                jp = j + 1
                jm = j - 1
                
                ip = i + 1
                im = i - 1
                
                if ( im == 0 ) im = Nx
                if ( ip == ( Nx + 1) ) ip = 1
                if ( jm == 0 ) jm = Ny
                if ( jp == ( Ny + 1) ) jp = 1
                
                !=====
                
                lap_phi(i,j)   = ( phi(ip,j) + phi(im,j) + phi(i,jm) + phi(i,jp) - four*phi(i,j)) / ( dx*dy )
                lap_tempr(i,j) = ( tempr(ip,j) + tempr(im,j) + tempr(i,jm) + tempr(i,jp) - four*tempr(i,j)) / ( dx*dy )
                
                !======
                
                phidx(i,j) = ( phi(ip,j) - phi(im,j) ) / dx
                phidy(i,j) = ( phi(i,jp) - phi(i,jm) ) / dy
                
                !======
                
                theta  = atan2( phidy(i,j),phidx(i,j) )
                
                !======
                
                epsil(i,j)         =  epsilonb *( one +  delta * cos ( aniso*( theta - theta0 ) ) )
                epsilon_deriv(i,j) = -epsilonb * aniso * delta * sin ( aniso*( theta - theta0 ) )
                
            end do
        end do
        
        !$omp end parallel do
        
        
        !$omp  parallel do default ( none )     &
        !$omp&             private ( i,j,ip,im,jp,jm, phi_old, term1, term2, m )  &
        !$omp&             shared  ( dx, dy, phi, tempr, epsil, epsilon_deriv, phidx, phidy, lap_phi, lap_tempr,  &
        !$omp&                       alpha, pix, teq, gama, dtime, tau, kappa )
        
        do j = 1, Ny
            do i =1, Nx 
                
                jp = j + 1
                jm = j - 1
                
                ip = i + 1
                im = i - 1
                
                if ( im == 0 ) im = Nx
                if ( ip == ( Nx + 1) ) ip = 1
                if ( jm == 0 ) jm = Ny
                if ( jp == ( Ny + 1) ) jp = 1
                
                phi_old = phi(i,j)
                
                !========
                
                term1 =  ( epsil(i,jp)*epsilon_deriv(i,jp)*phidx(i,jp) - epsil(i,jm)*epsilon_deriv(i,jm)*phidx(i,jm) ) / dy
                term2 = -( epsil(ip,j)*epsilon_deriv(ip,j)*phidy(ip,j) - epsil(im,j)*epsilon_deriv(im,j)*phidy(im,j) ) / dx
                
                !========
                
                m = alpha/pix * atan ( gama*( teq - tempr(i,j) ) )
                
                !========
                
                phi(i,j) = phi(i,j) + ( dtime/tau ) * ( term1 + term2 + epsil(i,j)**2 * lap_phi(i,j) ) &
                                    +  phi_old * ( one - phi_old )*( phi_old - half + m )
                tempr(i,j) = tempr(i,j) + dtime*lap_tempr(i,j) + kappa * ( phi(i,j) - phi_old )
                
            end do
        end do
        !$omp end parallel do
        
        
        ! print steps
        
        if ( mod ( tsteps, nprint ) .eq. 0 ) print *, 'Done steps  =  ', tsteps
        
        
        
    end do time_loop
    
    call timer ( 'time_loop' )
       
    !============================================================================
    ! output file
    
    open ( 11, file = "phi.dat" )
    do i = 1, Nx
        write( 11, 10 ) ( phi(i,j), j = 1, Ny )
    end do
    10 FORMAT (1000000F10.6)
    close( 11 )
    
    write (*,*) 'max phi =', maxval (phi)
    call timer ( 'report phi' )
    

end program openmp_test

  subroutine timer ( desc )
   character desc*(*)
   integer*8 :: clock, rate, last_clock = -1
   real*8    :: sec

   if ( last_clock < 0 ) then
     call system_clock ( last_clock, rate )
     sec = 0
     write (*,11) sec, desc
   else
     call system_clock ( clock, rate )
     sec = dble ( clock - last_clock ) / dble (rate)
     write (*,11) sec, desc
  11 format ( f10.3,2x,a )
     last_clock = clock
   end if
  end subroutine timer

2) Do concurrent

!  dc version of the main code

program dc_test
    use iso_fortran_env
    implicit none

    
    !=============
    
    integer ( kind = 4 ), parameter :: Nx = 2000
    integer ( kind = 4 ), parameter :: Ny = 2000
    real ( kind = 8 )               :: dx = 0.03d0
    real ( kind = 8 )               :: dy = 0.03d0
    real ( kind = 8 ),    parameter :: one  = 1.0
    real ( kind = 8 ),    parameter :: two  = 2.0
    real ( kind = 8 ),    parameter :: four = 4.0
    real ( kind = 8 ),    parameter :: half = 0.5
    
    !=============
    
    integer (kind = 4 ) :: nsteps = 2000
    integer (kind = 4 ) :: nprint = 100
    integer (kind = 4 ) :: tsteps 
    real ( kind = 8 )   :: dtime  = 1.0d-4
    
    !===============
    
    real ( kind = 8 )   :: tau   = 0.0003d0
    real ( kind = 8 )   :: epsilonb = 0.01d0
    real ( kind = 8 )   :: kappa = 1.8d0
    real ( kind = 8 )   :: delta = 0.02d0
    real ( kind = 8 )   :: aniso = 6.0d0
    real ( kind = 8 )   :: alpha = 0.9d0
    real ( kind = 8 )   :: gama  = 10.0d0
    real ( kind = 8 )   :: teq   = 1.0d0
    real ( kind = 8 )   :: theta0= 0.2d0
    real ( kind = 8 )   :: seed  = 5.0d0
    
    real ( kind = 8 )   :: pix   = four*atan(one)
    
    !===============
    
    real ( kind = 8 ) , dimension( Nx, Ny ) :: phi, tempr
    real ( kind = 8 ) , dimension( Nx, Ny ) :: lap_phi, lap_tempr
    real ( kind = 8 ) , dimension( Nx, Ny ) :: phidx, phidy
    real ( kind = 8 ) , dimension( Nx, Ny ) :: epsil, epsilon_deriv
    real ( kind = 8 )                       :: phi_old, term1, term2
    real ( kind = 8 )                       :: theta, m
    integer ( kind = 4 )                    :: i, j, ip, im, jp, jm
    
    
    
    !============================================================
    
    write (*,*) 'Vern : ',compiler_version ()
    write (*,*) 'Opts : ',compiler_options ()

    
    call timer ( 'start' )
    
    phi = 0.0
    tempr = 0.0
    
    do i = 1, Nx
        do j = 1, Ny
            if ( (i - Nx/two)*(i - Nx/two) + (j - Ny/two)*(j - Ny/two) < seed ) then
                phi(i,j) = one
            end if
        end do
    end do
    
    call timer ( 'initialise phi' )
    
    
    
    !============================================================
    
    
    
    time_loop: do tsteps = 1, nsteps
        
        do concurrent (j=1:Ny,i=1:Nx) default (none)    &
           local ( ip,im, jp,jm, theta )   &
           shared  ( dx, dy, lap_phi, lap_tempr, phi, tempr, phidx, phidy, epsil, epsilon_deriv,  &
                          theta0, aniso, delta, epsilonb )             
                
                jp = j + 1
                jm = j - 1
                
                ip = i + 1
                im = i - 1
                
                if ( im == 0 ) im = Nx
                if ( ip == ( Nx + 1) ) ip = 1
                if ( jm == 0 ) jm = Ny
                if ( jp == ( Ny + 1) ) jp = 1
                
                !=====
                
                lap_phi(i,j)   = ( phi(ip,j) + phi(im,j) + phi(i,jm) + phi(i,jp) - four*phi(i,j)) / ( dx*dy )
                lap_tempr(i,j) = ( tempr(ip,j) + tempr(im,j) + tempr(i,jm) + tempr(i,jp) - four*tempr(i,j)) / ( dx*dy )
                
                !======
                
                phidx(i,j) = ( phi(ip,j) - phi(im,j) ) / dx
                phidy(i,j) = ( phi(i,jp) - phi(i,jm) ) / dy
                
                !======
                
                theta  = atan2( phidy(i,j),phidx(i,j) )
                
                !======
                
                epsil(i,j)         =  epsilonb *( one +  delta * cos ( aniso*( theta - theta0 ) ) )
                epsilon_deriv(i,j) = -epsilonb * aniso * delta * sin ( aniso*( theta - theta0 ) )
                
	end do
        
        do concurrent (j=1:Ny,i=1:Nx) default (none)    &
           	local (ip,im,jp,jm, phi_old, term1, term2, m )  &
      		shared  ( dx, dy, phi, tempr, epsil, epsilon_deriv, phidx, phidy, lap_phi, lap_tempr,  &
        	alpha, pix, teq, gama, dtime, tau, kappa )
        	        
                jp = j + 1
                jm = j - 1
                
                ip = i + 1
                im = i - 1
                
                if ( im == 0 ) im = Nx
                if ( ip == ( Nx + 1) ) ip = 1
                if ( jm == 0 ) jm = Ny
                if ( jp == ( Ny + 1) ) jp = 1
                
                phi_old = phi(i,j)
                
                !========
                
                term1 =  ( epsil(i,jp)*epsilon_deriv(i,jp)*phidx(i,jp) - epsil(i,jm)*epsilon_deriv(i,jm)*phidx(i,jm) ) / dy
                term2 = -( epsil(ip,j)*epsilon_deriv(ip,j)*phidy(ip,j) - epsil(im,j)*epsilon_deriv(im,j)*phidy(im,j) ) / dx
                
                !========
                
                m = alpha/pix * atan ( gama*( teq - tempr(i,j) ) )
                
                !========
                
                phi(i,j) = phi(i,j) + ( dtime/tau ) * ( term1 + term2 + epsil(i,j)**2 * lap_phi(i,j) ) &
                                    +  phi_old * ( one - phi_old )*( phi_old - half + m )
                tempr(i,j) = tempr(i,j) + dtime*lap_tempr(i,j) + kappa * ( phi(i,j) - phi_old )
                
        end do
        
        
        ! print steps
        
        if ( mod ( tsteps, nprint ) .eq. 0 ) print *, 'Done steps  =  ', tsteps
        
        
        
    end do time_loop
    
    call timer ( 'time_loop' )
       
    !============================================================================
    ! output file
    
    open ( 11, file = "phi.dat" )
    do i = 1, Nx
        write( 11, 10 ) ( phi(i,j), j = 1, Ny )
    end do
    10 FORMAT (1000000F10.6)
    close( 11 )
    
    write (*,*) 'max phi =', maxval (phi)
    call timer ( 'report phi' )
    

end program dc_test

  subroutine timer ( desc )
   character desc*(*)
   integer*8 :: clock, rate, last_clock = -1
   real*8    :: sec

   if ( last_clock < 0 ) then
     call system_clock ( last_clock, rate )
     sec = 0
     write (*,11) sec, desc
   else
     call system_clock ( clock, rate )
     sec = dble ( clock - last_clock ) / dble (rate)
     write (*,11) sec, desc
  11 format ( f10.3,2x,a )
     last_clock = clock
   end if
  end subroutine timer

I run on Ubuntu 20.04 with following commands

1) OpenMP test

shahid@shahid-G3-3579:~$ nvfortran omp.f90 -o omp -mp -Minfo
openmp_test:
     66, FMA (fused multiply-add) instruction(s) generated
     83, !$omp parallel
    104, FMA (fused multiply-add) instruction(s) generated
    105, FMA (fused multiply-add) instruction(s) generated
    118, FMA (fused multiply-add) instruction(s) generated
    127, !$omp parallel
    188, maxval reduction inlined
shahid@shahid-G3-3579:~$ ./omp
 Vern : nvfortran 24.7-0
 Opts : omp.f90 -o omp -mp -Minfo
     0.000  start
     0.038  initialise phi
 Done steps  =            100
 Done steps  =            200
 Done steps  =            300
 Done steps  =            400
 Done steps  =            500
 Done steps  =            600
 Done steps  =            700
 Done steps  =            800
 Done steps  =            900
 Done steps  =           1000
 Done steps  =           1100
 Done steps  =           1200
 Done steps  =           1300
 Done steps  =           1400
 Done steps  =           1500
 Done steps  =           1600
 Done steps  =           1700
 Done steps  =           1800
 Done steps  =           1900
 Done steps  =           2000
    87.198  time_loop
 max phi =    1.002807743736036     
     0.266  report phi
  1. Do concurrent test
shahid@shahid-G3-3579:~$ nvfortran dc.f90 -o dc -stdpar=multicore -Minfo
dc_test:
     62, Memory zero idiom, array assignment replaced by call to pgf90_mzero8
     63, Memory zero idiom, array assignment replaced by call to pgf90_mzero8
     67, FMA (fused multiply-add) instruction(s) generated
     83, Generating Multicore code
         83, Loop parallelized across CPU threads
    101, FMA (fused multiply-add) instruction(s) generated
    102, FMA (fused multiply-add) instruction(s) generated
    115, FMA (fused multiply-add) instruction(s) generated
    120, Generating Multicore code
        120, Loop parallelized across CPU threads
    140, FMA (fused multiply-add) instruction(s) generated
    141, FMA (fused multiply-add) instruction(s) generated
    149, FMA (fused multiply-add) instruction(s) generated
    151, FMA (fused multiply-add) instruction(s) generated
    176, maxval reduction inlined
shahid@shahid-G3-3579:~$ ./dc
 Vern : nvfortran 24.7-0
 Opts : dc.f90 -o dc -stdpar=multicore -Minfo
     0.000  start
     0.030  initialise phi
 Done steps  =            100
 Done steps  =            200
 Done steps  =            300
 Done steps  =            400
 Done steps  =            500
 Done steps  =            600
 Done steps  =            700
 Done steps  =            800
 Done steps  =            900
 Done steps  =           1000
 Done steps  =           1100
 Done steps  =           1200
 Done steps  =           1300
 Done steps  =           1400
 Done steps  =           1500
 Done steps  =           1600
 Done steps  =           1700
 Done steps  =           1800
 Done steps  =           1900
 Done steps  =           2000
    71.349  time_loop
 max phi =    1.002807743736036     
     0.257  report phi

It seems like do concurrent is faster but with higher domain size i.e.Nx=2000,Ny=2000. I test with Nx=100 and Ny=100 and the computed time was the same.

FMA (fused multiply-add) instruction(s) are generated by the nvidia compiler here. Are they same as AVX instructions?