Fortran skill markdown for codex

Here’s a second example (contrived, admittedly) where associate behaves poorly:

    subroutine add11(a,b,c,d)
        real, intent(in) :: a(:), b(:)
        real, intent(out) :: c(:), d(:)
        associate(rc => a + b, rd => a - b)
            c = rc
            d = rd
        end associate
    end subroutine

gfortran appears to allocate two temporary arrays, do the addition and subtraction separately, and then copy the results into the output arrays. Similar with flang.

Ideally the compiler would recognize it doesn’t need a temporary and fuse the two arithmetic operations into the same loop to minimize memory accesses.

Link: Compiler Explorer


I expanded this example in a few different ways,

Loop Kernels (click here)
! loop_kernels.f90
module loop_kernels
implicit none
private

public :: add1, add2, add3, add4, add5, add6, add7, add8, add9, add10
public :: add11, add12, add13

type :: pair
    real :: c, d
end type

contains

    ! Fortran array syntax
    subroutine add1(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)
    c = a + b
    d = a - b
    end subroutine

    ! Manually-fused loops
    subroutine add2(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do i = 1, n
        c(i) = a(i) + b(i)
        d(i) = a(i) - b(i)
    end do

    end subroutine

    ! Non-fused loop variation
    subroutine add3(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do i = 1, n
        c(i) = a(i) + b(i)
    end do

    do i = 1, n
        d(i) = a(i) - b(i)
    end do

    end subroutine


    subroutine add4(a,b,c,d)
    real, intent(in) :: a(:), b(:)
    real, intent(out) :: c(:), d(:)

    c = a + b
    d = a - b

    end subroutine

    subroutine add5(a,b,c,d)
    real, intent(in) :: a(:), b(:)
    real, intent(out) :: c(:), d(:)

    c(:) = a(:) + b(:)
    d(:) = a(:) - b(:)

    end subroutine


    subroutine add6(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    call op(a,b,c,d)

    ! FIXME: add loop variation

    contains

        elemental subroutine op(a,b,c,d)
        real, intent(in) :: a, b
        real, intent(out) :: c, d
        c = a + b
        d = a - b
        end subroutine

    end subroutine

    subroutine add7(a,b,c,d)
    real, intent(in) :: a(:), b(:)
    real, intent(out) :: c(:), d(:)

    call op(a,b,c,d)

    contains

        elemental subroutine op(a,b,c,d)
        real, intent(in) :: a, b
        real, intent(out) :: c, d
        c = a + b
        d = a - b
        end subroutine

    end subroutine


    subroutine add8(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do i = 1, n
        associate(res=>make_sum(a(i),b(i)))
            c(i) = res%c
            d(i) = res%d
        end associate
    end do

    contains

        function make_sum(a,b) result(p)
            real, intent(in) :: a, b
            type(pair) :: p
            p = pair(a + b, a - b)
        end function

    end subroutine

    subroutine add9(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    associate(res=>make_sum_e(a, b))
        c = res%c
        d = res%d
    end associate

    contains

        elemental function make_sum_e(a,b) result(p)
            real, intent(in) :: a, b
            type(pair) :: p
            p = pair(a + b, a - b)
        end function

    end subroutine

    subroutine add10(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do concurrent (i = 1:n)
        associate(res => make_sum_e(a(i),b(i)))
            c(i) = res%c
            d(i) = res%d
        end associate
    end do

    contains

        elemental function make_sum_e(a,b) result(p)
            real, intent(in) :: a, b
            type(pair) :: p
            p = pair(a + b, a - b)
        end function

    end subroutine

    subroutine add11(a,b,c,d)
        real, intent(in) :: a(:), b(:)
        real, intent(out) :: c(:), d(:)
        associate(rc => a + b, rd => a - b)
            c = rc
            d = rd
        end associate
    end subroutine

    subroutine add12(a,b,c,d)
        real, intent(in), pointer :: a(:), b(:), c(:), d(:)
        c = a + b
        d = a - b
    end subroutine

    subroutine add13(a,b,c,d)
        real, intent(in), pointer, contiguous :: a(:), b(:), c(:), d(:)
        c = a + b
        d = a - b
    end subroutine

end module
Driver (click here)
! driver.f90
program driver
    use loop_kernels
    implicit none

    integer, parameter :: N = 10000000  ! 10M elements
    integer, parameter :: TRIALS = 100
    integer, parameter :: NUM_KERNELS = 13
    
    real, allocatable, target :: a(:), b(:), c(:), d(:)
    real(8) :: t1, t2
    real(8) :: times(NUM_KERNELS)
    real(8) :: relative_speeds(NUM_KERNELS)
    integer :: i, j
    character(len=20) :: names(NUM_KERNELS)

    names = ["Array Syntax      ", "Fused Loop (Ref)  ", "Non-Fused Loops   ", &
             "Assumed Shape     ", "Colon Slice       ", "Elemental (Exp)   ", &
             "Elemental (Assum.)", "Associate Func    ", "Elemental Func    ", &
             "Do Concurrent     ", "Associate Expr    ", "Pointer           ", &
             "Pointer (contig.) "]

    allocate(a(N), b(N), c(N), d(N))
    call random_number(a)
    call random_number(b)

    print *, "Starting Benchmark with N =", N, ", TRIALS = ", TRIALS
    print "(A4, A20, A15, A15)", "#", "Kernel Name", "Time (s)", "Rel. Speed"
    print *, "------------------------------------------------------------"

    do i = 1, NUM_KERNELS
        t1 = get_time()
        
        do j = 1, TRIALS
            select case(i)
            case(1);  call add1(N,a,b,c,d)
            case(2);  call add2(N,a,b,c,d)
            case(3);  call add3(N,a,b,c,d)
            case(4);  call add4(a,b,c,d)
            case(5);  call add5(a,b,c,d)
            case(6);  call add6(N,a,b,c,d)
            case(7);  call add7(a,b,c,d)
            case(8);  call add8(N,a,b,c,d)
            case(9);  call add9(N,a,b,c,d)
            case(10); call add10(N,a,b,c,d)
            case(11); call add11(a,b,c,d)
            case(12); call add12(a,b,c,d)
            case(13); call add13(a,b,c,d)
            ! Add cases for the new preposterous ones if you implement them
            case default; c = a + b; d = a - b 
            end select
        end do
        
        t2 = get_time()
        times(i) = (t2 - t1) / TRIALS
    end do

    ! Reference is add2 (Fused Loop)
    do i = 1, NUM_KERNELS
        relative_speeds(i) = times(i) / times(2)
        print "(I2, 2X, A20, F15.6, F15.3)", i, names(i), times(i), relative_speeds(i)
    end do

    print *, "------------------------------------------------------------"
    print "(A, F10.3)", "Geometric Mean Abstraction Penalty: ", geo_mean(relative_speeds)

contains

    function get_time()
        real(8) :: get_time
        integer :: c, r
        call system_clock(c, r)
        get_time = real(c, 8) / real(r, 8)
    end function

    function geo_mean(arr)
        real(8), intent(in) :: arr(:)
        real(8) :: geo_mean
        geo_mean = exp(sum(log(arr)) / size(arr))
    end function

end program driver
$ gfortran -O3 -mcpu=native -fcheck=array-temps loop_kernels.f90 driver.f90 -o driver
$ ./driver
 Starting Benchmark with N =    10000000 , TRIALS =          100
   #         Kernel Name       Time (s)     Rel. Speed
 ------------------------------------------------------------
 1  Array Syntax               0.003010          1.610
 2  Fused Loop (Ref)           0.001870          1.000
 3  Non-Fused Loops            0.002670          1.428
 4  Assumed Shape              0.002670          1.428
 5  Colon Slice                0.002670          1.428
 6  Elemental (Exp)            0.001920          1.027
 7  Elemental (Assum.)         0.001880          1.005
 8  Associate Func             0.001950          1.043
 9  Elemental Func             0.005260          2.813
10  Do Concurrent              0.001840          0.984
11  Associate Expr             0.004330          2.316
12  Pointer                    0.013060          6.984
13  Pointer (contig.)          0.012950          6.925
 ------------------------------------------------------------
Geometric Mean Abstraction Penalty:      1.761

Compared to a plain old loop (#2), using associate (#11) gives a slowdown by a factor of two.
(Note, no temporary arrays are created at the call site.)

4 Likes