Here’s a second example (contrived, admittedly) where associate behaves poorly:
subroutine add11(a,b,c,d)
real, intent(in) :: a(:), b(:)
real, intent(out) :: c(:), d(:)
associate(rc => a + b, rd => a - b)
c = rc
d = rd
end associate
end subroutine
gfortran appears to allocate two temporary arrays, do the addition and subtraction separately, and then copy the results into the output arrays. Similar with flang.
Ideally the compiler would recognize it doesn’t need a temporary and fuse the two arithmetic operations into the same loop to minimize memory accesses.
Link: Compiler Explorer
I expanded this example in a few different ways,
Loop Kernels (click here)
! loop_kernels.f90
module loop_kernels
implicit none
private
public :: add1, add2, add3, add4, add5, add6, add7, add8, add9, add10
public :: add11, add12, add13
type :: pair
real :: c, d
end type
contains
! Fortran array syntax
subroutine add1(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
c = a + b
d = a - b
end subroutine
! Manually-fused loops
subroutine add2(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
integer :: i
do i = 1, n
c(i) = a(i) + b(i)
d(i) = a(i) - b(i)
end do
end subroutine
! Non-fused loop variation
subroutine add3(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
integer :: i
do i = 1, n
c(i) = a(i) + b(i)
end do
do i = 1, n
d(i) = a(i) - b(i)
end do
end subroutine
subroutine add4(a,b,c,d)
real, intent(in) :: a(:), b(:)
real, intent(out) :: c(:), d(:)
c = a + b
d = a - b
end subroutine
subroutine add5(a,b,c,d)
real, intent(in) :: a(:), b(:)
real, intent(out) :: c(:), d(:)
c(:) = a(:) + b(:)
d(:) = a(:) - b(:)
end subroutine
subroutine add6(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
call op(a,b,c,d)
! FIXME: add loop variation
contains
elemental subroutine op(a,b,c,d)
real, intent(in) :: a, b
real, intent(out) :: c, d
c = a + b
d = a - b
end subroutine
end subroutine
subroutine add7(a,b,c,d)
real, intent(in) :: a(:), b(:)
real, intent(out) :: c(:), d(:)
call op(a,b,c,d)
contains
elemental subroutine op(a,b,c,d)
real, intent(in) :: a, b
real, intent(out) :: c, d
c = a + b
d = a - b
end subroutine
end subroutine
subroutine add8(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
integer :: i
do i = 1, n
associate(res=>make_sum(a(i),b(i)))
c(i) = res%c
d(i) = res%d
end associate
end do
contains
function make_sum(a,b) result(p)
real, intent(in) :: a, b
type(pair) :: p
p = pair(a + b, a - b)
end function
end subroutine
subroutine add9(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
associate(res=>make_sum_e(a, b))
c = res%c
d = res%d
end associate
contains
elemental function make_sum_e(a,b) result(p)
real, intent(in) :: a, b
type(pair) :: p
p = pair(a + b, a - b)
end function
end subroutine
subroutine add10(n,a,b,c,d)
integer, intent(in) :: n
real, intent(in) :: a(n), b(n)
real, intent(out) :: c(n), d(n)
integer :: i
do concurrent (i = 1:n)
associate(res => make_sum_e(a(i),b(i)))
c(i) = res%c
d(i) = res%d
end associate
end do
contains
elemental function make_sum_e(a,b) result(p)
real, intent(in) :: a, b
type(pair) :: p
p = pair(a + b, a - b)
end function
end subroutine
subroutine add11(a,b,c,d)
real, intent(in) :: a(:), b(:)
real, intent(out) :: c(:), d(:)
associate(rc => a + b, rd => a - b)
c = rc
d = rd
end associate
end subroutine
subroutine add12(a,b,c,d)
real, intent(in), pointer :: a(:), b(:), c(:), d(:)
c = a + b
d = a - b
end subroutine
subroutine add13(a,b,c,d)
real, intent(in), pointer, contiguous :: a(:), b(:), c(:), d(:)
c = a + b
d = a - b
end subroutine
end module
Driver (click here)
! driver.f90
program driver
use loop_kernels
implicit none
integer, parameter :: N = 10000000 ! 10M elements
integer, parameter :: TRIALS = 100
integer, parameter :: NUM_KERNELS = 13
real, allocatable, target :: a(:), b(:), c(:), d(:)
real(8) :: t1, t2
real(8) :: times(NUM_KERNELS)
real(8) :: relative_speeds(NUM_KERNELS)
integer :: i, j
character(len=20) :: names(NUM_KERNELS)
names = ["Array Syntax ", "Fused Loop (Ref) ", "Non-Fused Loops ", &
"Assumed Shape ", "Colon Slice ", "Elemental (Exp) ", &
"Elemental (Assum.)", "Associate Func ", "Elemental Func ", &
"Do Concurrent ", "Associate Expr ", "Pointer ", &
"Pointer (contig.) "]
allocate(a(N), b(N), c(N), d(N))
call random_number(a)
call random_number(b)
print *, "Starting Benchmark with N =", N, ", TRIALS = ", TRIALS
print "(A4, A20, A15, A15)", "#", "Kernel Name", "Time (s)", "Rel. Speed"
print *, "------------------------------------------------------------"
do i = 1, NUM_KERNELS
t1 = get_time()
do j = 1, TRIALS
select case(i)
case(1); call add1(N,a,b,c,d)
case(2); call add2(N,a,b,c,d)
case(3); call add3(N,a,b,c,d)
case(4); call add4(a,b,c,d)
case(5); call add5(a,b,c,d)
case(6); call add6(N,a,b,c,d)
case(7); call add7(a,b,c,d)
case(8); call add8(N,a,b,c,d)
case(9); call add9(N,a,b,c,d)
case(10); call add10(N,a,b,c,d)
case(11); call add11(a,b,c,d)
case(12); call add12(a,b,c,d)
case(13); call add13(a,b,c,d)
! Add cases for the new preposterous ones if you implement them
case default; c = a + b; d = a - b
end select
end do
t2 = get_time()
times(i) = (t2 - t1) / TRIALS
end do
! Reference is add2 (Fused Loop)
do i = 1, NUM_KERNELS
relative_speeds(i) = times(i) / times(2)
print "(I2, 2X, A20, F15.6, F15.3)", i, names(i), times(i), relative_speeds(i)
end do
print *, "------------------------------------------------------------"
print "(A, F10.3)", "Geometric Mean Abstraction Penalty: ", geo_mean(relative_speeds)
contains
function get_time()
real(8) :: get_time
integer :: c, r
call system_clock(c, r)
get_time = real(c, 8) / real(r, 8)
end function
function geo_mean(arr)
real(8), intent(in) :: arr(:)
real(8) :: geo_mean
geo_mean = exp(sum(log(arr)) / size(arr))
end function
end program driver
$ gfortran -O3 -mcpu=native -fcheck=array-temps loop_kernels.f90 driver.f90 -o driver
$ ./driver
Starting Benchmark with N = 10000000 , TRIALS = 100
# Kernel Name Time (s) Rel. Speed
------------------------------------------------------------
1 Array Syntax 0.003010 1.610
2 Fused Loop (Ref) 0.001870 1.000
3 Non-Fused Loops 0.002670 1.428
4 Assumed Shape 0.002670 1.428
5 Colon Slice 0.002670 1.428
6 Elemental (Exp) 0.001920 1.027
7 Elemental (Assum.) 0.001880 1.005
8 Associate Func 0.001950 1.043
9 Elemental Func 0.005260 2.813
10 Do Concurrent 0.001840 0.984
11 Associate Expr 0.004330 2.316
12 Pointer 0.013060 6.984
13 Pointer (contig.) 0.012950 6.925
------------------------------------------------------------
Geometric Mean Abstraction Penalty: 1.761
Compared to a plain old loop (#2), using associate (#11) gives a slowdown by a factor of two.
(Note, no temporary arrays are created at the call site.)