I found some interesting experiments with stencil and halo syntax for Fortran in the following two works:
- ForOpenCL: Transformations Exploiting Array Syntax in Fortran for Accelerator Programming: https://arxiv.org/pdf/1107.2157.pdf
- Locally-Oriented Programming: A Simple Programming Model for Stencil-Based Computations on Multi-Level Distributed Memory Architectures: https://arxiv.org/pdf/1502.03504.pdf
The first work would be relevant to the discussion "Fortran Programmers: How do you want to offload to GPU accelerators in the next 5 years?". The snippets below illustrate the envisioned stencil and halo syntax:
! Envisioned stencil syntax
!> Five-point Laplacian stencil written in the envisioned (non-standard) syntax.
!! The element U(0,0) is assigned the sum of U(0,+1), U(-1,0), U(+1,0) and
!! U(0,-1) minus 4*U(0,0).
!! NOTE(review): CONCURRENT and HALO are proposed language extensions, not
!! standard Fortran. The indices are presumably offsets relative to the element
!! the kernel is applied to -- confirm against the cited papers.
pure CONCURRENT subroutine Laplacian(U)
   real, HALO(:,:) :: U
   ! The expression is laid out to mirror the shape of the stencil.
   U(0,0) =               U(0,+1)            &
            + U(-1,0) - 4*U(0, 0) + U(+1,0)  &
            +             U(0,-1)
end subroutine Laplacian
! Declaration syntax including halo
! U is declared as an allocatable rank-2 array with two codimensions plus a
! HALO attribute.
! NOTE(review): HALO(...) is proposed syntax, not standard Fortran; 1:*:1
! presumably requests a halo of width 1 on either side of each dimension --
! confirm against the cited papers.
real, allocatable, dimension(:,:), codimension[:,:], &
      HALO(1:*:1,1:*:1) :: U
! Launching the kernel
! Repeat the stencil sweep until `converged` becomes true.
! NOTE(review): `converged` is not updated anywhere in this fragment; presumably
! a convergence check is elided -- as written the loop condition never changes.
do while (.not. converged)
   ! Apply the Laplacian kernel once for every (i,j) with i=1:M, j=1:N.
   ! NOTE(review): `[[device]]` and the `[device]` selector on U(i,j) are
   ! non-standard; presumably they direct execution/data to an accelerator
   ! -- confirm against the cited papers.
   do concurrent (i=1:M, j=1:N) [[device]]
      call Laplacian( U(i,j)[device] )
   end do
   ! Called once per outer iteration, after the whole sweep.
   ! NOTE(review): presumably exchanges halo cells and applies cyclic boundary
   ! conditions (BC=CYCLIC) -- semantics not shown here.
   call HALO_TRANSFER(U, BC=CYCLIC)
end do