Here’s an example that points to a “quality of implementation” issue (Compiler Explorer).
!> Three source-level formulations of the same 5-point stencil update.
!>
!> Every routine computes, for each interior point (i,j) with 1 <= i <= nx
!> and 1 <= j <= ny,
!>
!>   unew(i,j) = uold(i,j)
!>             + coeffx*(uold(i+1,j) - 2*uold(i,j) + uold(i-1,j))
!>             + coeffy*(uold(i,j+1) - 2*uold(i,j) + uold(i,j-1))
!>
!> The arrays are declared with bounds (0:nx+1, 0:ny+1), i.e. one extra layer
!> of cells around the interior, which the stencil reads from uold.
!> The routines differ only in how the update is written (explicit loops,
!> array sections, or associate names bound to array sections).
module stencil_mod
implicit none
public
! Kind parameter of a double precision real; used for all reals and literals below.
integer, parameter :: dp = kind(1.0d0)
contains
!> Stencil update written as an explicit doubly nested loop.
!>
!> uold   : input array, read on the interior and on its four neighbours
!> unew   : output array; only unew(1:nx,1:ny) is assigned here
!> nx, ny : number of interior points in the first and second dimension
!> coeffx : weight of the second difference along the first dimension
!> coeffy : weight of the second difference along the second dimension
!>
!> NOTE(review): unew is intent(out) but its outer layer (index 0 and nx+1 /
!> ny+1) is never written in this routine - confirm that callers define those
!> elements after the call rather than relying on their previous contents.
subroutine stencil_manual(uold, unew, nx, ny, coeffx, coeffy)
integer, intent(in) :: nx, ny
real(dp), intent(in) :: uold(0:nx+1,0:ny+1)
real(dp), intent(out) :: unew(0:nx+1,0:ny+1)
real(dp), intent(in) :: coeffx, coeffy
integer :: i,j
! j (second index) is the outer loop and i (first index) the inner loop, so
! the inner loop walks the leftmost, fastest-varying array index.
do j = 1, ny
do i = 1, nx
unew(i,j) = uold(i,j) + &
coeffx*(uold(i+1,j) - 2.0_dp*uold(i,j) + uold(i-1,j)) + &
coeffy*(uold(i,j+1) - 2.0_dp*uold(i,j) + uold(i,j-1))
end do
end do
end subroutine stencil_manual
!> Same update as stencil_manual, written as a single whole-array assignment
!> on array sections.
!>
!> Arguments are identical to stencil_manual. Each neighbour is expressed as
!> the interior section shifted by one in one dimension, e.g.
!> uold(2:nx+1,1:ny) supplies uold(i+1,j) for every interior (i,j).
!>
!> NOTE(review): as in stencil_manual, only unew(1:nx,1:ny) is assigned even
!> though unew is intent(out) - confirm callers do not depend on the rest.
subroutine stencil_slicing(uold, unew, nx, ny, coeffx, coeffy)
integer, intent(in) :: nx, ny
real(dp), intent(in) :: uold(0:nx+1,0:ny+1)
real(dp), intent(out) :: unew(0:nx+1,0:ny+1)
real(dp), intent(in) :: coeffx, coeffy
unew(1:nx,1:ny) = uold(1:nx,1:ny) + &
coeffx*(uold(2:nx+1,1:ny) - 2.0_dp*uold(1:nx,1:ny) + uold(0:nx-1,1:ny)) + &
coeffy*(uold(1:nx,2:ny+1) - 2.0_dp*uold(1:nx,1:ny) + uold(1:nx,0:ny-1))
end subroutine stencil_slicing
!> Same update as stencil_slicing, with each array section given a name via
!> an associate construct.
!>
!> Arguments are identical to stencil_manual. The associate names map to:
!>   centre -> uold(i  ,j  )
!>   west   -> uold(i-1,j  )      east  -> uold(i+1,j  )
!>   south  -> uold(i  ,j-1)      north -> uold(i  ,j+1)
!> for every interior point (i,j).
!>
!> NOTE(review): as in stencil_manual, only unew(1:nx,1:ny) is assigned even
!> though unew is intent(out) - confirm callers do not depend on the rest.
subroutine stencil_associate(uold, unew, nx, ny, coeffx, coeffy)
integer, intent(in) :: nx, ny
real(dp), intent(in) :: uold(0:nx+1,0:ny+1)
real(dp), intent(out) :: unew(0:nx+1,0:ny+1)
real(dp), intent(in) :: coeffx, coeffy
associate(centre => uold(1:nx,1:ny), &
west => uold(0:nx-1,1:ny), &
east => uold(2:nx+1,1:ny), &
south => uold(1:nx,0:ny-1), &
north => uold(1:nx,2:ny+1))
! Identical arithmetic to stencil_slicing, expressed through the aliases.
unew(1:nx,1:ny) = centre + &
coeffx*(east - 2.0_dp*centre + west) + &
coeffy*(north - 2.0_dp*centre + south)
end associate
end subroutine stencil_associate
end module
When compiled with ifort -O3 -xSKYLAKE-AVX512 -qopt-zmm-usage=high, the first two variants use AVX512 instructions while the associate version leads to a scalar loop.
When the same code is compiled with flang -O3 -march=skylake-avx512 -mprefer-vector-width=512, the first two variants use unrolling by 8 elements, whereas the associate version uses unrolling by 4 elements.
With ifx 2025.3.2, all three variants use AVX512 vectorization (8-way unrolling); the same holds for gfortran 15.2.