Here are the compiler switches.
gfortran -O2 -fopenmp ch3305.f90 -o ch3305_gfortran.exe
ifort /nologo /Qdiag-disable:10448 -O2 -heap-arrays -openmp ch3305.f90 -o ch3305_ifort.exe
ifx /nologo -O2 -heap-arrays -openmp ch3305.f90 -o ch3305_ifx.exe
nagfor -O4 -fopenmp ch3305.f90 -o ch3305_nag.exe
nvfortran -O2 -fopenmp ch3305.f90 -o ch3305_nvidia.out
I've left out the Cray ones.
Here is the source file
include 'integer_kind_module.f90'
include 'precision_module.f90'
include 'timing_module.f90'

!> Benchmark four ways of computing the element-wise sum z = x + y on
!> large rank-1 real(dp) arrays:
!>
!>   1. whole-array syntax
!>   2. a traditional do loop
!>   3. a do concurrent loop
!>   4. an OpenMP parallel do loop
!>
!> The outer loop grows the problem size (k * start_size elements per
!> array), allocating and releasing the three arrays on every pass.
!> For each size the four variants are timed loop_count times, and the
!> per-variant sum and average are printed.  Sample values x(1), y(1),
!> z(1) are written to ch3305.dat, which also keeps z live so that the
!> computations are less likely to be optimised away.
!>
!> Timing relies on start_timing, time_difference and end_timing, which
!> are presumably supplied by timing_module (not visible here);
!> time_difference is assumed to return the time elapsed since the
!> previous timing call -- TODO confirm against timing_module.
program ch3305

  use timing_module
  use precision_module
  use omp_lib
  use iso_fortran_env

  implicit none

  ! Original/Book 10,000,000
  !
  ! Latest version has a variable memory footprint
  !
  ! This version checks the memory allocation
  ! for each loop

  integer :: n                    ! Elements per array on this pass
  integer :: allocation_status    ! stat= result from allocate

  ! 128 Mi elements.  This matches the ' * 1024 * 1024' text printed
  ! with the problem size below, and k*start_size stays inside a
  ! default (32-bit) integer for k <= 15.
  integer, parameter :: start_size = 128*1024*1024
  integer, parameter :: loop_count = 10   ! Timed repetitions per size
  integer, parameter :: n_types = 4       ! Number of variants timed

  integer :: i
  integer :: j
  integer :: k
  integer :: l                    ! Number of problem sizes to run
  integer :: nthreads

  real (dp), allocatable, dimension (:) :: x
  real (dp), allocatable, dimension (:) :: y
  real (dp), allocatable, dimension (:) :: z

  real, dimension (n_types, loop_count) :: timing_details = 0.0
  real, dimension (n_types) :: t_sum = 0.0
  real, dimension (n_types) :: t_average = 0.0
  real :: reset = 0.0             ! Discarded; only used to restart the clock

  ! The type-spec pads every heading to 15 characters, so the literals
  ! do not have to be padded by hand to a common length.
  character (15), dimension (n_types) :: heading_1 = &
    [ character (15) :: ' Whole array', &
                        ' Do loop', &
                        ' Do concurrent', &
                        ' openmp' ]

  print *, ' '
  print *, compiler_version()
  print *, compiler_options()
  print *, ' '

  nthreads = omp_get_max_threads()

  open (unit=20, file='ch3305.dat')

  print 100, nthreads
  100 format (' Nthreads = ', i3)

  !
  ! Dynamic allocation
  !
  ! The loop count l depends on the amount
  ! of memory the system has.
  !
  ! I use native Windows and Linux installs
  ! and for these systems the memory is the
  ! actual physical memory.
  !
  ! I also use
  !
  !   Linux under wsl
  !   Linux under hyper-v
  !
  ! Both of these have less than the physical memory available.
  !
  ! Adjust l accordingly
  !
  ! I use the following values with the systems I have
  !
  !   128 GB system - l=8
  !    32 GB system - l=6
  !    16 GB system - l=3

  l = 8

  do k = 1, l

    print *, ''
    call start_timing()
    print *, ''

    n = k * start_size

    print *, ''
    print *, ' Problem size = ', k*128, ' * 1024 * 1024'
    print *, ''

    ! Each allocation is checked separately so that the stop code
    ! identifies which array could not be allocated.
    allocate (x(n), stat=allocation_status)
    if (allocation_status /= 0) then
      print *, ' Allocation error'
      print *, ' Program terminates'
      stop 10
    end if

    allocate (y(n), stat=allocation_status)
    if (allocation_status /= 0) then
      print *, ' Allocation error'
      print *, ' Program terminates'
      stop 20
    end if

    allocate (z(n), stat=allocation_status)
    if (allocation_status /= 0) then
      print *, ' Allocation error'
      print *, ' Program terminates'
      stop 30
    end if

    !
    ! Initialisation
    !
    call random_number(x)
    call random_number(y)
    z = 0.0_dp

    print 110, time_difference()
    110 format (' Initialise time = ', f12.6)

    write (20, 120) x(1), y(1), z(1)
    120 format (3(2x,f6.3))

    print *, ' '

    do j = 1, loop_count

      print 130, j
      130 format (' Iteration = ', i3)

      !
      ! Whole array syntax
      !
      z = x + y
      timing_details(1, j) = time_difference()
      write (20, 120) x(1), y(1), z(1)
      ! Clear the result and restart the clock so that the file write
      ! and the reset are not charged to the next variant.
      z = 0.0_dp
      reset = time_difference()

      !
      ! Simple traditional do loop
      !
      do i = 1, n
        z(i) = x(i) + y(i)
      end do
      timing_details(2, j) = time_difference()
      z = 0.0_dp
      reset = time_difference()

      !
      ! do concurrent loop
      !
      do concurrent (i=1:n)
        z(i) = x(i) + y(i)
      end do
      timing_details(3, j) = time_difference()
      write (20, 120) x(1), y(1), z(1)
      z = 0.0_dp
      reset = time_difference()

      !
      ! OpenMP parallel loop
      !
      ! The loop index i is private by rule; everything else is listed.
      !$omp parallel do default(none) shared(n, x, y, z)
      do i = 1, n
        z(i) = x(i) + y(i)
      end do
      !$omp end parallel do
      timing_details(4, j) = time_difference()
      write (20, 120) x(1), y(1), z(1)
      z = 0.0_dp
      reset = time_difference()

    end do

    print 140
    140 format (15x, ' Sum Average')

    do i = 1, n_types
      t_sum(i) = sum(timing_details(i,1:loop_count))
      t_average(i) = t_sum(i)/loop_count
      print 150, heading_1(i), t_sum(i), t_average(i)
      150 format (a, 2(3x,f12.6))
    end do

    ! Release the arrays so that the next, larger, pass starts clean.
    deallocate (x)
    deallocate (y)
    deallocate (z)

    print *, ' '
    call end_timing()
    print *, ' '

  end do

  close (20)

end program ch3305