(not sure if this is the best place for this comment, but could be?)
I have been looking at DO CONCURRENT and read 22-169.pdf from j3-fortran.org .
Unfortunately:
- I am wanting to understand what multi-threading support is available for DO CONCURRENT, but,
- I don’t know what support there is for the new features of DO CONCURRENT in available compilers and,
- The only Fortran compilers I am using are GFortran Ver 12 and Silverfrost FTN95
However I have enhanced a test example of DO CONCURRENT use from 22-169.pdf.
This now provides a comparison of 4 types of approaches:
- using pure functions (as in 22-169.pdf)
- using a conventional DO loop
- using do concurrent ( but not with recent thread based changes )
- using !$OMP parallel DO
I have created a repeat test summary approach (add_ticks) for 7 different array sizes wth 5 repeats of each test, which produces a report of average ticks per test, using SYSTEM_CLOCK ( which is best for Gfortran on windows)
I am hoping othres might be able to :
a) enhance these tests with an additional DO CONCURRENT test that includes the recent thread based changes in the Fortran standard,
b) provide some documentation of which Fortran compilers support multi-threaded do concurrent,
c) provide some results to assess the effectiveness of standard conforming multi-threaded DO CONCURRENT usage vs OpenMP usage.
It would be good to understand what contribution DO CO… may be providing for standard conforming calculation.
I would be interested in what others think of the effectiveness of DO CONCURRENT, how it could provide some standard conformance for multi-threading, or if other directions are required.
My testing has been limited to multi-threading, rather than distributed processing / COARRAYS, which is another significant direction for Fortran.
One of the problems that is emerging with using multi-threaing is the movement to off-loading to GPU’s, where these are requiring more non-standard and diverse approaches for each type of GPU hardware, such as for NVIDIA or Intel hardware with hardware specific compilers.
The following is my revised code which I built using Gfortran
! Gfortran build : gfortran test1.f90 -O3 -fopenmp -o test1.exe
module numerot
integer, parameter :: num = 1000
integer :: n
real, allocatable :: A(:), B(:), C(:)
real :: dsec
integer*8 :: last_tick=0, dtick
integer, parameter :: mtype = 5
integer, parameter :: mtest = 100
integer :: ntype
integer :: test_count(mtype), test_n(mtest,mtype), test_t(mtest,mtype), sum_t(mtype)
contains
pure real function yksi(X)
implicit none
real, intent(in) :: X(:)
!real, intent(out) :: R
yksi = norm2(X)
end function yksi
pure real function kaksi(X)
implicit none
real, intent(in) :: X(:)
kaksi = 2*norm2(X)
end function kaksi
pure real function kolme(X)
implicit none
real, intent(in) :: X(:)
kolme = 3*norm2(X)
end function kolme
real function delta_sec ()
integer*8 :: tick, rate
call SYSTEM_CLOCK ( tick, rate )
dtick = tick-last_tick
last_tick = tick
dsec = real(dtick) / real(rate)
delta_sec = dsec
end function delta_sec
subroutine add_ticks ( test, n )
! routine to accumulate and report multiple tests
integer :: test, n, k,i, m, nt
real :: x
x = delta_sec ()
if ( test == 0 ) then ! initialise
ntype = n
test_count = 0
test_n = 0
else if ( test == -1 ) then ! report averages
write (*,10) 'pure', 'DO loop', 'DO Con', 'OpenMP'
k = test_count(1)+1
m = 0
nt = 0
sum_t = 0
do i = 1,k
if ( m /= test_n(i,1) ) then
if ( nt > 0 ) write ( *,12 ) sum_t(1:ntype) / nt
write (*,*) ' '
nt = 0
sum_t = 0
end if
if ( test_n(i,1) > 0 ) then
nt = nt+1
write ( *,11) nt, test_n(i,1), test_t(i,1:ntype)
sum_t = sum_t + test_t(i,:)
m = test_n(i,1)
end if
end do
10 format ( 13x, 5A8 )
11 format ( i3, 2i9, 5i8 )
12 format ( 13x, 5i8 )
else ! accumulate
if ( test <= mtype ) then
k = test_count(test) + 1
if ( k <= mtest ) then
test_count(test) = k
test_n(k,test) = n
test_t(k,test) = dtick
end if
end if
end if
end subroutine add_ticks
end module numerot
program main
use numerot
use iso_fortran_env
implicit none
integer i,j
write (*,*) 'Vern : ',compiler_version ()
write (*,*) 'Opts : ',compiler_options ()
call add_ticks (0,4)
do i = 1,7
n = num * 4**i
allocate ( a(n), b(n), c(n) )
A = 1
B = 1
C = 1
write ( *,*) 'Test n=',n
do j = 1,5
dsec = delta_sec ()
call main_test
call do_con_test
call openmp_test
call do_test
end do
deallocate ( a, b, c )
end do
call add_ticks (-1,0)
end program main
subroutine main_test
use numerot
implicit none
real :: RA, RB, RC
RA = yksi(A)
RB = kaksi(B)
RC = kolme(C)
call add_ticks (1,n)
print*,RA+RB+RC, dsec, dtick,' pure'
end subroutine main_test
subroutine do_con_test
use numerot
implicit none
real :: RA, RB, RC
integer i
ra = 0
rb = 0
rc = 0
do concurrent ( i = 1:size(A) )
RA = RA + A(i)**2
RB = RB + B(i)**2
RC = RC + C(i)**2
end do
RA = sqrt (RA)
RB = sqrt (RB) * 2
RC = sqrt (RC) * 3
call add_ticks (3,n)
print*,RA+RB+RC, dsec, dtick,' do concurrent'
end subroutine do_con_test
subroutine openmp_test
use numerot
implicit none
real :: RA, RB, RC
integer i
ra = 0
rb = 0
rc = 0
!$OMP PARALLEL DO private (i) shared (A,B,C), REDUCTION (+: RA,RB,RC)
do i = 1, size(A)
RA = RA + A(i)**2
RB = RB + B(i)**2
RC = RC + C(i)**2
end do
!$OMP END PARALLEL DO
RA = sqrt (RA)
RB = sqrt (RB) * 2
RC = sqrt (RC) * 3
call add_ticks (4,n)
print*,RA+RB+RC, dsec, dtick,' OpenMP'
end subroutine openmp_test
subroutine do_test
use numerot
implicit none
real :: RA, RB, RC
integer i
ra = 0
rb = 0
rc = 0
do i = 1, size(A)
RA = RA + A(i)**2
RB = RB + B(i)**2
RC = RC + C(i)**2
end do
RA = sqrt (RA)
RB = sqrt (RB) * 2
RC = sqrt (RC) * 3
call add_ticks (2,n)
print*,RA+RB+RC, dsec, dtick,' DO test'
end subroutine do_test