I used the following two versions of the code. It is the same OpenMP code implementation in gfortran and intel - Help - Fortran Discourse (fortran-lang.discourse.group)
1) OpenMP
! OpenMP version of the code
program openmp_test
use iso_fortran_env
implicit none
!=============
integer ( kind = 4 ), parameter :: Nx = 2000
integer ( kind = 4 ), parameter :: Ny = 2000
real ( kind = 8 ) :: dx = 0.03d0
real ( kind = 8 ) :: dy = 0.03d0
real ( kind = 8 ), parameter :: one = 1.0
real ( kind = 8 ), parameter :: two = 2.0
real ( kind = 8 ), parameter :: four = 4.0
real ( kind = 8 ), parameter :: half = 0.5
!=============
integer (kind = 4 ) :: nsteps = 2000
integer (kind = 4 ) :: nprint = 100
integer (kind = 4 ) :: tsteps
real ( kind = 8 ) :: dtime = 1.0d-4
!===============
real ( kind = 8 ) :: tau = 0.0003d0
real ( kind = 8 ) :: epsilonb = 0.01d0
real ( kind = 8 ) :: kappa = 1.8d0
real ( kind = 8 ) :: delta = 0.02d0
real ( kind = 8 ) :: aniso = 6.0d0
real ( kind = 8 ) :: alpha = 0.9d0
real ( kind = 8 ) :: gama = 10.0d0
real ( kind = 8 ) :: teq = 1.0d0
real ( kind = 8 ) :: theta0= 0.2d0
real ( kind = 8 ) :: seed = 5.0d0
real ( kind = 8 ) :: pix = four*atan(one)
!===============
real ( kind = 8 ) , dimension( Nx, Ny ) :: phi, tempr
real ( kind = 8 ) , dimension( Nx, Ny ) :: lap_phi, lap_tempr
real ( kind = 8 ) , dimension( Nx, Ny ) :: phidx, phidy
real ( kind = 8 ) , dimension( Nx, Ny ) :: epsil, epsilon_deriv
real ( kind = 8 ) :: phi_old, term1, term2
real ( kind = 8 ) :: theta, m
integer ( kind = 4 ) :: i, j, ip, im, jp, jm
!============================================================
write (*,*) 'Vern : ',compiler_version ()
write (*,*) 'Opts : ',compiler_options ()
call timer ( 'start' )
phi = 0.0
tempr = 0.0
do i = 1, Nx
do j = 1, Ny
if ( (i - Nx/two)*(i - Nx/two) + (j - Ny/two)*(j - Ny/two) < seed ) then
phi(i,j) = one
end if
end do
end do
call timer ( 'initialise phi' )
!============================================================
time_loop: do tsteps = 1, nsteps
!$omp parallel do default ( none ) &
!$omp& private ( i,j, ip,im, jp,jm, theta ) &
!$omp& shared ( dx, dy, lap_phi, lap_tempr, phi, tempr, phidx, phidy, epsil, epsilon_deriv, &
!$omp& theta0, aniso, delta, epsilonb )
do j = 1, Ny
do i =1, Nx
jp = j + 1
jm = j - 1
ip = i + 1
im = i - 1
if ( im == 0 ) im = Nx
if ( ip == ( Nx + 1) ) ip = 1
if ( jm == 0 ) jm = Ny
if ( jp == ( Ny + 1) ) jp = 1
!=====
lap_phi(i,j) = ( phi(ip,j) + phi(im,j) + phi(i,jm) + phi(i,jp) - four*phi(i,j)) / ( dx*dy )
lap_tempr(i,j) = ( tempr(ip,j) + tempr(im,j) + tempr(i,jm) + tempr(i,jp) - four*tempr(i,j)) / ( dx*dy )
!======
phidx(i,j) = ( phi(ip,j) - phi(im,j) ) / dx
phidy(i,j) = ( phi(i,jp) - phi(i,jm) ) / dy
!======
theta = atan2( phidy(i,j),phidx(i,j) )
!======
epsil(i,j) = epsilonb *( one + delta * cos ( aniso*( theta - theta0 ) ) )
epsilon_deriv(i,j) = -epsilonb * aniso * delta * sin ( aniso*( theta - theta0 ) )
end do
end do
!$omp end parallel do
!$omp parallel do default ( none ) &
!$omp& private ( i,j,ip,im,jp,jm, phi_old, term1, term2, m ) &
!$omp& shared ( dx, dy, phi, tempr, epsil, epsilon_deriv, phidx, phidy, lap_phi, lap_tempr, &
!$omp& alpha, pix, teq, gama, dtime, tau, kappa )
do j = 1, Ny
do i =1, Nx
jp = j + 1
jm = j - 1
ip = i + 1
im = i - 1
if ( im == 0 ) im = Nx
if ( ip == ( Nx + 1) ) ip = 1
if ( jm == 0 ) jm = Ny
if ( jp == ( Ny + 1) ) jp = 1
phi_old = phi(i,j)
!========
term1 = ( epsil(i,jp)*epsilon_deriv(i,jp)*phidx(i,jp) - epsil(i,jm)*epsilon_deriv(i,jm)*phidx(i,jm) ) / dy
term2 = -( epsil(ip,j)*epsilon_deriv(ip,j)*phidy(ip,j) - epsil(im,j)*epsilon_deriv(im,j)*phidy(im,j) ) / dx
!========
m = alpha/pix * atan ( gama*( teq - tempr(i,j) ) )
!========
phi(i,j) = phi(i,j) + ( dtime/tau ) * ( term1 + term2 + epsil(i,j)**2 * lap_phi(i,j) ) &
+ phi_old * ( one - phi_old )*( phi_old - half + m )
tempr(i,j) = tempr(i,j) + dtime*lap_tempr(i,j) + kappa * ( phi(i,j) - phi_old )
end do
end do
!$omp end parallel do
! print steps
if ( mod ( tsteps, nprint ) .eq. 0 ) print *, 'Done steps = ', tsteps
end do time_loop
call timer ( 'time_loop' )
!============================================================================
! output file
open ( 11, file = "phi.dat" )
do i = 1, Nx
write( 11, 10 ) ( phi(i,j), j = 1, Ny )
end do
10 FORMAT (1000000F10.6)
close( 11 )
write (*,*) 'max phi =', maxval (phi)
call timer ( 'report phi' )
end program openmp_test
subroutine timer ( desc )
character desc*(*)
integer*8 :: clock, rate, last_clock = -1
real*8 :: sec
if ( last_clock < 0 ) then
call system_clock ( last_clock, rate )
sec = 0
write (*,11) sec, desc
else
call system_clock ( clock, rate )
sec = dble ( clock - last_clock ) / dble (rate)
write (*,11) sec, desc
11 format ( f10.3,2x,a )
last_clock = clock
end if
end subroutine timer
2) Do concurrent
! dc version of the main code
program dc_test
use iso_fortran_env
implicit none
!=============
integer ( kind = 4 ), parameter :: Nx = 2000
integer ( kind = 4 ), parameter :: Ny = 2000
real ( kind = 8 ) :: dx = 0.03d0
real ( kind = 8 ) :: dy = 0.03d0
real ( kind = 8 ), parameter :: one = 1.0
real ( kind = 8 ), parameter :: two = 2.0
real ( kind = 8 ), parameter :: four = 4.0
real ( kind = 8 ), parameter :: half = 0.5
!=============
integer (kind = 4 ) :: nsteps = 2000
integer (kind = 4 ) :: nprint = 100
integer (kind = 4 ) :: tsteps
real ( kind = 8 ) :: dtime = 1.0d-4
!===============
real ( kind = 8 ) :: tau = 0.0003d0
real ( kind = 8 ) :: epsilonb = 0.01d0
real ( kind = 8 ) :: kappa = 1.8d0
real ( kind = 8 ) :: delta = 0.02d0
real ( kind = 8 ) :: aniso = 6.0d0
real ( kind = 8 ) :: alpha = 0.9d0
real ( kind = 8 ) :: gama = 10.0d0
real ( kind = 8 ) :: teq = 1.0d0
real ( kind = 8 ) :: theta0= 0.2d0
real ( kind = 8 ) :: seed = 5.0d0
real ( kind = 8 ) :: pix = four*atan(one)
!===============
real ( kind = 8 ) , dimension( Nx, Ny ) :: phi, tempr
real ( kind = 8 ) , dimension( Nx, Ny ) :: lap_phi, lap_tempr
real ( kind = 8 ) , dimension( Nx, Ny ) :: phidx, phidy
real ( kind = 8 ) , dimension( Nx, Ny ) :: epsil, epsilon_deriv
real ( kind = 8 ) :: phi_old, term1, term2
real ( kind = 8 ) :: theta, m
integer ( kind = 4 ) :: i, j, ip, im, jp, jm
!============================================================
write (*,*) 'Vern : ',compiler_version ()
write (*,*) 'Opts : ',compiler_options ()
call timer ( 'start' )
phi = 0.0
tempr = 0.0
do i = 1, Nx
do j = 1, Ny
if ( (i - Nx/two)*(i - Nx/two) + (j - Ny/two)*(j - Ny/two) < seed ) then
phi(i,j) = one
end if
end do
end do
call timer ( 'initialise phi' )
!============================================================
time_loop: do tsteps = 1, nsteps
do concurrent (j=1:Ny,i=1:Nx) default (none) &
local ( ip,im, jp,jm, theta ) &
shared ( dx, dy, lap_phi, lap_tempr, phi, tempr, phidx, phidy, epsil, epsilon_deriv, &
theta0, aniso, delta, epsilonb )
jp = j + 1
jm = j - 1
ip = i + 1
im = i - 1
if ( im == 0 ) im = Nx
if ( ip == ( Nx + 1) ) ip = 1
if ( jm == 0 ) jm = Ny
if ( jp == ( Ny + 1) ) jp = 1
!=====
lap_phi(i,j) = ( phi(ip,j) + phi(im,j) + phi(i,jm) + phi(i,jp) - four*phi(i,j)) / ( dx*dy )
lap_tempr(i,j) = ( tempr(ip,j) + tempr(im,j) + tempr(i,jm) + tempr(i,jp) - four*tempr(i,j)) / ( dx*dy )
!======
phidx(i,j) = ( phi(ip,j) - phi(im,j) ) / dx
phidy(i,j) = ( phi(i,jp) - phi(i,jm) ) / dy
!======
theta = atan2( phidy(i,j),phidx(i,j) )
!======
epsil(i,j) = epsilonb *( one + delta * cos ( aniso*( theta - theta0 ) ) )
epsilon_deriv(i,j) = -epsilonb * aniso * delta * sin ( aniso*( theta - theta0 ) )
end do
do concurrent (j=1:Ny,i=1:Nx) default (none) &
local (ip,im,jp,jm, phi_old, term1, term2, m ) &
shared ( dx, dy, phi, tempr, epsil, epsilon_deriv, phidx, phidy, lap_phi, lap_tempr, &
alpha, pix, teq, gama, dtime, tau, kappa )
jp = j + 1
jm = j - 1
ip = i + 1
im = i - 1
if ( im == 0 ) im = Nx
if ( ip == ( Nx + 1) ) ip = 1
if ( jm == 0 ) jm = Ny
if ( jp == ( Ny + 1) ) jp = 1
phi_old = phi(i,j)
!========
term1 = ( epsil(i,jp)*epsilon_deriv(i,jp)*phidx(i,jp) - epsil(i,jm)*epsilon_deriv(i,jm)*phidx(i,jm) ) / dy
term2 = -( epsil(ip,j)*epsilon_deriv(ip,j)*phidy(ip,j) - epsil(im,j)*epsilon_deriv(im,j)*phidy(im,j) ) / dx
!========
m = alpha/pix * atan ( gama*( teq - tempr(i,j) ) )
!========
phi(i,j) = phi(i,j) + ( dtime/tau ) * ( term1 + term2 + epsil(i,j)**2 * lap_phi(i,j) ) &
+ phi_old * ( one - phi_old )*( phi_old - half + m )
tempr(i,j) = tempr(i,j) + dtime*lap_tempr(i,j) + kappa * ( phi(i,j) - phi_old )
end do
! print steps
if ( mod ( tsteps, nprint ) .eq. 0 ) print *, 'Done steps = ', tsteps
end do time_loop
call timer ( 'time_loop' )
!============================================================================
! output file
open ( 11, file = "phi.dat" )
do i = 1, Nx
write( 11, 10 ) ( phi(i,j), j = 1, Ny )
end do
10 FORMAT (1000000F10.6)
close( 11 )
write (*,*) 'max phi =', maxval (phi)
call timer ( 'report phi' )
end program dc_test
subroutine timer ( desc )
character desc*(*)
integer*8 :: clock, rate, last_clock = -1
real*8 :: sec
if ( last_clock < 0 ) then
call system_clock ( last_clock, rate )
sec = 0
write (*,11) sec, desc
else
call system_clock ( clock, rate )
sec = dble ( clock - last_clock ) / dble (rate)
write (*,11) sec, desc
11 format ( f10.3,2x,a )
last_clock = clock
end if
end subroutine timer
I run on Ubuntu 20.04 with following commands
1) OpenMP test
shahid@shahid-G3-3579:~$ nvfortran omp.f90 -o omp -mp -Minfo
openmp_test:
66, FMA (fused multiply-add) instruction(s) generated
83, !$omp parallel
104, FMA (fused multiply-add) instruction(s) generated
105, FMA (fused multiply-add) instruction(s) generated
118, FMA (fused multiply-add) instruction(s) generated
127, !$omp parallel
188, maxval reduction inlined
shahid@shahid-G3-3579:~$ ./omp
Vern : nvfortran 24.7-0
Opts : omp.f90 -o omp -mp -Minfo
0.000 start
0.038 initialise phi
Done steps = 100
Done steps = 200
Done steps = 300
Done steps = 400
Done steps = 500
Done steps = 600
Done steps = 700
Done steps = 800
Done steps = 900
Done steps = 1000
Done steps = 1100
Done steps = 1200
Done steps = 1300
Done steps = 1400
Done steps = 1500
Done steps = 1600
Done steps = 1700
Done steps = 1800
Done steps = 1900
Done steps = 2000
87.198 time_loop
max phi = 1.002807743736036
0.266 report phi
- Do concurrent test
shahid@shahid-G3-3579:~$ nvfortran dc.f90 -o dc -stdpar=multicore -Minfo
dc_test:
62, Memory zero idiom, array assignment replaced by call to pgf90_mzero8
63, Memory zero idiom, array assignment replaced by call to pgf90_mzero8
67, FMA (fused multiply-add) instruction(s) generated
83, Generating Multicore code
83, Loop parallelized across CPU threads
101, FMA (fused multiply-add) instruction(s) generated
102, FMA (fused multiply-add) instruction(s) generated
115, FMA (fused multiply-add) instruction(s) generated
120, Generating Multicore code
120, Loop parallelized across CPU threads
140, FMA (fused multiply-add) instruction(s) generated
141, FMA (fused multiply-add) instruction(s) generated
149, FMA (fused multiply-add) instruction(s) generated
151, FMA (fused multiply-add) instruction(s) generated
176, maxval reduction inlined
shahid@shahid-G3-3579:~$ ./dc
Vern : nvfortran 24.7-0
Opts : dc.f90 -o dc -stdpar=multicore -Minfo
0.000 start
0.030 initialise phi
Done steps = 100
Done steps = 200
Done steps = 300
Done steps = 400
Done steps = 500
Done steps = 600
Done steps = 700
Done steps = 800
Done steps = 900
Done steps = 1000
Done steps = 1100
Done steps = 1200
Done steps = 1300
Done steps = 1400
Done steps = 1500
Done steps = 1600
Done steps = 1700
Done steps = 1800
Done steps = 1900
Done steps = 2000
71.349 time_loop
max phi = 1.002807743736036
0.257 report phi
It seems like do concurrent is faster but with higher domain size i.e.Nx=2000,Ny=2000. I test with Nx=100 and Ny=100 and the computed time was the same.
FMA (fused multiply-add) instruction(s) are generated by the nvidia compiler here. Are they same as AVX instructions?