Fortran skill markdown for codex

Here’s an example that points to a “quality of implemenation” issue (Compiler Explorer).

module stencil_mod
implicit none
public
integer, parameter :: dp = kind(1.0d0)
contains

  subroutine stencil_manual(uold, unew, nx, ny, coeffx, coeffy)
    integer, intent(in) :: nx, ny
    real(dp), intent(in)  :: uold(0:nx+1,0:ny+1)
    real(dp), intent(out) :: unew(0:nx+1,0:ny+1)
    real(dp), intent(in) :: coeffx, coeffy
    integer :: i,j
    do j = 1, ny
      do i = 1, nx
        unew(i,j) = uold(i,j) + &
             coeffx*(uold(i+1,j) - 2.0_dp*uold(i,j) + uold(i-1,j)) + &
             coeffy*(uold(i,j+1) - 2.0_dp*uold(i,j) + uold(i,j-1))
      end do
    end do
  end subroutine stencil_manual

  subroutine stencil_slicing(uold, unew, nx, ny, coeffx, coeffy)
    integer, intent(in) :: nx, ny
    real(dp), intent(in)  :: uold(0:nx+1,0:ny+1)
    real(dp), intent(out) :: unew(0:nx+1,0:ny+1)
    real(dp), intent(in) :: coeffx, coeffy
    unew(1:nx,1:ny) = uold(1:nx,1:ny) + &
         coeffx*(uold(2:nx+1,1:ny) - 2.0_dp*uold(1:nx,1:ny) + uold(0:nx-1,1:ny)) + &
         coeffy*(uold(1:nx,2:ny+1) - 2.0_dp*uold(1:nx,1:ny) + uold(1:nx,0:ny-1))
  end subroutine stencil_slicing

  subroutine stencil_associate(uold, unew, nx, ny, coeffx, coeffy)
    integer, intent(in) :: nx, ny
    real(dp), intent(in)  :: uold(0:nx+1,0:ny+1)
    real(dp), intent(out) :: unew(0:nx+1,0:ny+1)
    real(dp), intent(in) :: coeffx, coeffy
    associate(centre => uold(1:nx,1:ny), &
              west => uold(0:nx-1,1:ny), &
              east => uold(2:nx+1,1:ny), &
              south => uold(1:nx,0:ny-1), &
              north => uold(1:nx,2:ny+1))
      unew(1:nx,1:ny) = centre + &
           coeffx*(east - 2.0_dp*centre + west) + &
           coeffy*(north - 2.0_dp*centre + south)
    end associate
  end subroutine stencil_associate

end module

When compiled with ifort -O3 -xSKYLAKE-AVX512 -qopt-zmm-usage=high, the first two variants use AVX512 instructions while the associate version leads to a scalar loop.

When the same code is compiled with flang -O3 -march=skylake-avx512 -mprefer-vector-width=512, the first two variants use unrolling by 8 elements, whereas the associate version uses unrolling by 4 elements.

With ifx 2025.3.2 all three variants use AVX512 vectorization (8 way unrolling); same with gfortran 15.2.

5 Likes

Here’s a second example (contrived, admittedly) where associate behaves poorly:

    subroutine add11(a,b,c,d)
        real, intent(in) :: a(:), b(:)
        real, intent(out) :: c(:), d(:)
        associate(rc => a + b, rd => a - b)
            c = rc
            d = rd
        end associate
    end subroutine

gfortran appears to allocate two temporary arrays, do the addition and subtraction separately, and then copy the results into the output arrays. Similar with flang.

Ideally the compiler would recognize it doesn’t need a temporary and fuse the two arithmetic operations into the same loop to minimize memory accesses.

Link: Compiler Explorer


I expanded this example in a few different ways,

Loop Kernels (click here)
! loop_kernels.f90
module loop_kernels
implicit none
private

public :: add1, add2, add3, add4, add5, add6, add7, add8, add9, add10
public :: add11, add12, add13

type :: pair
    real :: c, d
end type

contains

    ! Fortran array syntax
    subroutine add1(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)
    c = a + b
    d = a - b
    end subroutine

    ! Manually-fused loops
    subroutine add2(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do i = 1, n
        c(i) = a(i) + b(i)
        d(i) = a(i) - b(i)
    end do

    end subroutine

    ! Non-fused loop variation
    subroutine add3(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do i = 1, n
        c(i) = a(i) + b(i)
    end do

    do i = 1, n
        d(i) = a(i) - b(i)
    end do

    end subroutine


    subroutine add4(a,b,c,d)
    real, intent(in) :: a(:), b(:)
    real, intent(out) :: c(:), d(:)

    c = a + b
    d = a - b

    end subroutine

    subroutine add5(a,b,c,d)
    real, intent(in) :: a(:), b(:)
    real, intent(out) :: c(:), d(:)

    c(:) = a(:) + b(:)
    d(:) = a(:) - b(:)

    end subroutine


    subroutine add6(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    call op(a,b,c,d)

    ! FIXME: add loop variation

    contains

        elemental subroutine op(a,b,c,d)
        real, intent(in) :: a, b
        real, intent(out) :: c, d
        c = a + b
        d = a - b
        end subroutine

    end subroutine

    subroutine add7(a,b,c,d)
    real, intent(in) :: a(:), b(:)
    real, intent(out) :: c(:), d(:)

    call op(a,b,c,d)

    contains

        elemental subroutine op(a,b,c,d)
        real, intent(in) :: a, b
        real, intent(out) :: c, d
        c = a + b
        d = a - b
        end subroutine

    end subroutine


    subroutine add8(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do i = 1, n
        associate(res=>make_sum(a(i),b(i)))
            c(i) = res%c
            d(i) = res%d
        end associate
    end do

    contains

        function make_sum(a,b) result(p)
            real, intent(in) :: a, b
            type(pair) :: p
            p = pair(a + b, a - b)
        end function

    end subroutine

    subroutine add9(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    associate(res=>make_sum_e(a, b))
        c = res%c
        d = res%d
    end associate

    contains

        elemental function make_sum_e(a,b) result(p)
            real, intent(in) :: a, b
            type(pair) :: p
            p = pair(a + b, a - b)
        end function

    end subroutine

    subroutine add10(n,a,b,c,d)
    integer, intent(in) :: n
    real, intent(in) :: a(n), b(n)
    real, intent(out) :: c(n), d(n)

    integer :: i

    do concurrent (i = 1:n)
        associate(res => make_sum_e(a(i),b(i)))
            c(i) = res%c
            d(i) = res%d
        end associate
    end do

    contains

        elemental function make_sum_e(a,b) result(p)
            real, intent(in) :: a, b
            type(pair) :: p
            p = pair(a + b, a - b)
        end function

    end subroutine

    subroutine add11(a,b,c,d)
        real, intent(in) :: a(:), b(:)
        real, intent(out) :: c(:), d(:)
        associate(rc => a + b, rd => a - b)
            c = rc
            d = rd
        end associate
    end subroutine

    subroutine add12(a,b,c,d)
        real, intent(in), pointer :: a(:), b(:), c(:), d(:)
        c = a + b
        d = a - b
    end subroutine

    subroutine add13(a,b,c,d)
        real, intent(in), pointer, contiguous :: a(:), b(:), c(:), d(:)
        c = a + b
        d = a - b
    end subroutine

end module
Driver (click here)
! driver.f90
program driver
    use loop_kernels
    implicit none

    integer, parameter :: N = 10000000  ! 10M elements
    integer, parameter :: TRIALS = 100
    integer, parameter :: NUM_KERNELS = 13
    
    real, allocatable, target :: a(:), b(:), c(:), d(:)
    real(8) :: t1, t2
    real(8) :: times(NUM_KERNELS)
    real(8) :: relative_speeds(NUM_KERNELS)
    integer :: i, j
    character(len=20) :: names(NUM_KERNELS)

    names = ["Array Syntax      ", "Fused Loop (Ref)  ", "Non-Fused Loops   ", &
             "Assumed Shape     ", "Colon Slice       ", "Elemental (Exp)   ", &
             "Elemental (Assum.)", "Associate Func    ", "Elemental Func    ", &
             "Do Concurrent     ", "Associate Expr    ", "Pointer           ", &
             "Pointer (contig.) "]

    allocate(a(N), b(N), c(N), d(N))
    call random_number(a)
    call random_number(b)

    print *, "Starting Benchmark with N =", N, ", TRIALS = ", TRIALS
    print "(A4, A20, A15, A15)", "#", "Kernel Name", "Time (s)", "Rel. Speed"
    print *, "------------------------------------------------------------"

    do i = 1, NUM_KERNELS
        t1 = get_time()
        
        do j = 1, TRIALS
            select case(i)
            case(1);  call add1(N,a,b,c,d)
            case(2);  call add2(N,a,b,c,d)
            case(3);  call add3(N,a,b,c,d)
            case(4);  call add4(a,b,c,d)
            case(5);  call add5(a,b,c,d)
            case(6);  call add6(N,a,b,c,d)
            case(7);  call add7(a,b,c,d)
            case(8);  call add8(N,a,b,c,d)
            case(9);  call add9(N,a,b,c,d)
            case(10); call add10(N,a,b,c,d)
            case(11); call add11(a,b,c,d)
            case(12); call add12(a,b,c,d)
            case(13); call add13(a,b,c,d)
            ! Add cases for the new preposterous ones if you implement them
            case default; c = a + b; d = a - b 
            end select
        end do
        
        t2 = get_time()
        times(i) = (t2 - t1) / TRIALS
    end do

    ! Reference is add2 (Fused Loop)
    do i = 1, NUM_KERNELS
        relative_speeds(i) = times(i) / times(2)
        print "(I2, 2X, A20, F15.6, F15.3)", i, names(i), times(i), relative_speeds(i)
    end do

    print *, "------------------------------------------------------------"
    print "(A, F10.3)", "Geometric Mean Abstraction Penalty: ", geo_mean(relative_speeds)

contains

    function get_time()
        real(8) :: get_time
        integer :: c, r
        call system_clock(c, r)
        get_time = real(c, 8) / real(r, 8)
    end function

    function geo_mean(arr)
        real(8), intent(in) :: arr(:)
        real(8) :: geo_mean
        geo_mean = exp(sum(log(arr)) / size(arr))
    end function

end program driver
$ gfortran -O3 -mcpu=native -fcheck=array-temps loop_kernels.f90 driver.f90 -o driver
$ ./driver
 Starting Benchmark with N =    10000000 , TRIALS =          100
   #         Kernel Name       Time (s)     Rel. Speed
 ------------------------------------------------------------
 1  Array Syntax               0.003010          1.610
 2  Fused Loop (Ref)           0.001870          1.000
 3  Non-Fused Loops            0.002670          1.428
 4  Assumed Shape              0.002670          1.428
 5  Colon Slice                0.002670          1.428
 6  Elemental (Exp)            0.001920          1.027
 7  Elemental (Assum.)         0.001880          1.005
 8  Associate Func             0.001950          1.043
 9  Elemental Func             0.005260          2.813
10  Do Concurrent              0.001840          0.984
11  Associate Expr             0.004330          2.316
12  Pointer                    0.013060          6.984
13  Pointer (contig.)          0.012950          6.925
 ------------------------------------------------------------
Geometric Mean Abstraction Penalty:      1.761

Compared to a plain old loop (#2), using associate (#11) gives a slowdown by a factor of two.
(Note, no temporary arrays are created at the call site.)

4 Likes

Hi @ivanpribec, thank you very much for benchmark!

And I have another question related to ASSOCIATE. My use case of ASSOCIATE (almost 100%) is to make a local short symbol that refers to some component of derived types (usually nested / composite one, so having multiple %), like:

associate( pos => replicas(i)% sys% conf% pos, &
           !! extract more components (with no colon/slice on the RHS)
          ) 
  !! computation using pos etc
end associate

where pos is an allocatable array contained in a derived-type. In this kind of usage, is it reasonable to expect that there would be no performance hit, i.e., the speed will be the same as using the corresponding “raw” code (i.e., with ... conf% pos written explicitly in the computation part)?

(Also, isn’t it useful to split this thread into separate “codex + markdown” and the “associate + performance” threads, so that the titles show their contents specifically…?)

4 Likes

From my limited experience, this sped up the LLM ‘cycle’ quite nicely. Some of this is because fortitude is very fast compared to compilers, but also the messages it gives are generous.

My list of instructions is not as long, i mostly tell it to make sure it uses the tools and to write Modern Forttan, so i’ll look at some of the suggestions here keenly

1 Like

(I agree about splitting the associate-related replies; sorry for hijacking the thread.)

It is hard to say as different compilers, and the many optimization passes they perform, can sometimes yield unexpected results. I agree that associating with an “entire” object (like in your case), and not an array slice or an expression, sounds reasonable and I’d bet on this not having a major impact.

In either case I would recommend measuring the routine associating to member deep in the type hierarchy is on the critical execution path. ( see Rules 1 and 2 in <h1>Rob Pike's 5 Rules of Programming</h1>).

If the routine is indeed performance-critical, you can always test try with a temporary preprocessor hack:

#define pos replicas(i)%sys%conf%pos
associate( ... )
   ! computation using pos etc
end associate
#undef pos

(You do need to make sure that the pos variable is consistently lowercase (and not POS, Pos, …))

4 Likes

@ivanpribec good analysis and examples. I think compilers should also be able to warn if they have to make a temporary. I think the key is to guarantee to the user that there is no “obvious” slowdown. So failure to vectorize “obvious” loops, extra temporaries etc. I would like to be warnings.

3 Likes

Thanks very much for your comment! I think I didn’t use associate for the most time-consuming parts (which I usually write as simple as possible like F77 + to avoid possible compiler bugs with OpenMP :sweat_droplets:), but for other places I will try measuring the effect of associate with the following macros (which I think is very convenient because I do not need to modify the computation part).

#define pos replicas(i)%sys%conf%pos
associate( ... )
   ! computation using pos etc
end associate
#undef pos
1 Like

This is the Modern Fortran skill I have created. Comments and suggestions are welcome!

3 Likes

I think one of the most important things to do is create a project synopsis text file, and keep it up to date. If agentic coding, also always save a session_handoff.txt file which must be updated every session with instructions at bottom for files to read. With opencode I recently installed the plugin opencode-mem for controllable persistent memory, and now at each session end I can just state, “remember to read session_handoff next session when I say the magic word which is mekalekahimekahiny-ho” And then I say the pass phrase and we get rocking and rolling. (a little humor) .

PROJECT SYNOPSIS

Version 0.1.0 - DATE

Author:

  1. PROJECT OVERVIEW

================================================================================

  1. WHAT IS A …utility… AND WHAT DOES IT DO?

================================================================================

  1. INPUT: explain what the input is

================================================================================

  1. OUTPUT: explain output format, expectation, etc.

================================================================================

  1. PROGRAM FLOW - DETAILED FLOW CHART DIAGRAM (include modules with routines contained)

================================================================================

  1. OPTION MODULE ARCHITECTURE (description of specialty modules and how to use)

================================================================================

  1. KEY DATA STRUCTURES

================================================================================

  1. CURRENT CAPABILITIES (what is currently coded and working)

================================================================================

  1. FUTURE ROADMAP

High Priority:

Medium Priority:

Lower Priority:

================================================================================

  1. LIMITATIONS AND TECHNICAL DEBT

Known Limitations:

Areas Needing Refactoring:

================================================================================

  1. BUILD SYSTEM - FPM (Fortran Package Manager)

Project Configuration (fpm.toml):

Dependencies:

Build Commands:

Profiles:

Linting:

================================================================================

  1. SWITCHING OPTION FILES (locations of custom modules for testing, for compiling)

Current Setup:

To Switch Options:
1.
2.
3. Rebuild:
fpm clean
fpm build
4. Test:
fpm run – input_file.ext

================================================================================

  1. FILE STRUCTURE (just use linux tree command to create this)

================================================================================

  1. DEVELOPMENT FLAGS

Debug output is controlled in …f90:
dbug = .true. ! General debug output to list file
dbug_params = .true. ! Parameter debug output
dbug_vocab = .false. ! Vocabulary table dump
dbug_tldata = .false. ! Tool data dump
Set these in the …() subroutine before calling …().

================================================================================

  1. EXTENSION (where to add new features, vocab, handling, helpers, etc…)

Adding New FEATURES:
1.
2.
3.
Adding New INPUT Formats:
1.
2.
3.
Adding New Machine Configurations:
1.
2.
3.

================================================================================

END OF SYNOPSIS

1 Like