Fortran skill markdown for codex

Here’s an example that points to a “quality of implementation” issue (Compiler Explorer).

!> Three formulations of the same explicit 5-point stencil update on a 2-D
!> grid with a one-cell halo (array bounds 0:nx+1 by 0:ny+1, interior 1:nx by
!> 1:ny): explicit loops, array sections, and `associate` aliases of those
!> sections. All three compute the same interior values; they exist so the
!> code a compiler generates for each form can be compared.
module stencil_mod
  implicit none
  private
  public :: dp, stencil_manual, stencil_slicing, stencil_associate

  !> Real kind of a double precision literal.
  integer, parameter :: dp = kind(1.0d0)

contains

  !> Stencil update written as explicit nested loops.
  !>
  !> For every interior point, `unew` receives `uold` plus `coeffx` times the
  !> second difference of `uold` along the first index plus `coeffy` times the
  !> second difference along the second index.
  !>
  !> uold    input field, including halo
  !> unew    output field; only the interior 1:nx, 1:ny is written
  !> nx, ny  number of interior points along each dimension
  !> coeffx  weight of the second difference along the first index
  !> coeffy  weight of the second difference along the second index
  pure subroutine stencil_manual(uold, unew, nx, ny, coeffx, coeffy)
    integer, intent(in) :: nx, ny
    real(dp), intent(in) :: uold(0:nx+1,0:ny+1)
    ! intent(inout), not intent(out): the halo of unew is never assigned here,
    ! and intent(out) would leave those elements formally undefined on return.
    real(dp), intent(inout) :: unew(0:nx+1,0:ny+1)
    real(dp), intent(in) :: coeffx, coeffy
    integer :: i, j

    ! The inner loop runs over the first (fastest-varying) index.
    do j = 1, ny
      do i = 1, nx
        unew(i,j) = uold(i,j) + &
             coeffx*(uold(i+1,j) - 2.0_dp*uold(i,j) + uold(i-1,j)) + &
             coeffy*(uold(i,j+1) - 2.0_dp*uold(i,j) + uold(i,j-1))
      end do
    end do
  end subroutine stencil_manual

  !> Stencil update written as a single array-section assignment.
  !>
  !> Same arguments and same result as `stencil_manual`; the neighbours are
  !> expressed as sections of `uold` shifted by one cell in each direction.
  pure subroutine stencil_slicing(uold, unew, nx, ny, coeffx, coeffy)
    integer, intent(in) :: nx, ny
    real(dp), intent(in) :: uold(0:nx+1,0:ny+1)
    ! intent(inout), not intent(out): the halo of unew is never assigned here,
    ! and intent(out) would leave those elements formally undefined on return.
    real(dp), intent(inout) :: unew(0:nx+1,0:ny+1)
    real(dp), intent(in) :: coeffx, coeffy

    unew(1:nx,1:ny) = uold(1:nx,1:ny) + &
         coeffx*(uold(2:nx+1,1:ny) - 2.0_dp*uold(1:nx,1:ny) + uold(0:nx-1,1:ny)) + &
         coeffy*(uold(1:nx,2:ny+1) - 2.0_dp*uold(1:nx,1:ny) + uold(1:nx,0:ny-1))
  end subroutine stencil_slicing

  !> Stencil update written with `associate` names for the five sections.
  !>
  !> Same arguments and same result as `stencil_manual`; `centre`, `west`,
  !> `east`, `south` and `north` name the interior section of `uold` and its
  !> four one-cell shifts.
  pure subroutine stencil_associate(uold, unew, nx, ny, coeffx, coeffy)
    integer, intent(in) :: nx, ny
    real(dp), intent(in) :: uold(0:nx+1,0:ny+1)
    ! intent(inout), not intent(out): the halo of unew is never assigned here,
    ! and intent(out) would leave those elements formally undefined on return.
    real(dp), intent(inout) :: unew(0:nx+1,0:ny+1)
    real(dp), intent(in) :: coeffx, coeffy

    associate(centre => uold(1:nx,1:ny), &
              west => uold(0:nx-1,1:ny), &
              east => uold(2:nx+1,1:ny), &
              south => uold(1:nx,0:ny-1), &
              north => uold(1:nx,2:ny+1))
      unew(1:nx,1:ny) = centre + &
           coeffx*(east - 2.0_dp*centre + west) + &
           coeffy*(north - 2.0_dp*centre + south)
    end associate
  end subroutine stencil_associate

end module stencil_mod

When compiled with ifort -O3 -xSKYLAKE-AVX512 -qopt-zmm-usage=high, the first two variants use AVX512 instructions while the associate version leads to a scalar loop.

When the same code is compiled with flang -O3 -march=skylake-avx512 -mprefer-vector-width=512, the first two variants use unrolling by 8 elements, whereas the associate version uses unrolling by 4 elements.

With ifx 2025.3.2, all three variants use AVX512 vectorization (8-way unrolling); the same holds for gfortran 15.2.

5 Likes