I’m working on OpenMP parallelization of a lattice Boltzmann kernel for a 2D periodical domain.
The full streaming kernel (lacking a few module constants) is in the details box:
Full kernel
   subroutine kernel_stream(nx,ny,ld,ft,fp,dt,omega)
      integer, intent(in) :: nx, ny, ld
      real(wp), intent(in) :: ft(ld,nx,0:8)     ! \bar{f}^{+,n}
      real(wp), intent(inout) :: fp(ld,nx,0:8)  ! \tilde{f}^{+,n}
      real(wp), intent(in) :: dt, omega
      real(wp) :: cfw(ny,0:8), cfn(ny,0:8)
      real(wp) :: cfrho(ny), cfux(ny), cfuy(ny), cfindp(ny)
      real(wp) :: fc, fe, fn, fw, fs, fne, fnw, fsw
      real(wp), parameter :: p2 = 0.5_wp, &
                             p8 = 0.125_wp
      integer :: x, y, q
      integer :: xp1, xm1, yp1, ym1
      real(wp) :: cxq, cyq
      real(wp) :: fluxW, fluxN
      !$omp parallel default(private) shared(nx,ny,ld,ft,fp,dt,omega)
      !$omp do schedule(static)
      do x = 1, nx
         
         ! locate neighbor nodes
         xp1 = mod(x, nx) + 1
         xm1 = mod(nx + x - 2, nx) + 1
         ! Interpolate values at the WEST and NORTH cell faces
         ! 
         ! All directions are needed, so we can calculate the 
         ! macroscopic values
         !
         do q = 0, 8
         
         cxq = dt*cx(q)
         cyq = dt*cy(q)
         do y = 1, ny
            yp1 = mod(y, ny) + 1
            ym1 = mod(ny + y - 2, ny) + 1
            ! read pdf values
            fc  = ft(y  , x  , q)
            fe  = ft(y  , xp1, q)
            fn  = ft(yp1, x  , q)
            fw  = ft(y  , xm1, q)
            fs  = ft(ym1, x  , q)
            fne = ft(yp1, xp1, q)
            fnw = ft(yp1, xm1, q)
            fsw = ft(ym1, xm1, q)
            ! fse = ft(ym1, xp1, q)  ! not needed
            ! West cell face
            cfw(y,q) = p2*(fc + fw) - p2*cxq*(fc - fw) - &
                       p8*cyq*(fnw + fn - fsw - fs)
            ! North cell face
            cfn(y,q) = p2*(fc + fn) - p2*cyq*(fn - fc) - &
                       p8*cxq*(fne + fe - fnw - fw)
         end do
         end do
         ! Update values at cell faces
         !
         ! cf = (1 - omega)*cf + omega * cfeq
         !
#if DUGKS
         ! WEST face
         call update_ew(ny,cfw,cfrho,cfux,cfuy,cfindp,omega)
         
         ! NORTH face
         call update_ns(ny,cfn,cfrho,cfux,cfuy,cfindp,omega)
#endif
         ! TODO:
         !   At the WEST face we only need to update 
         !   velocites 1,3,5,7,6,8
         !   At the NORTH face, we only need to update
         !   velocities 2,4,5,7,6,8
         !
         do q = 1, 8
         
         cxq = dt*cx(q)
         cyq = dt*cy(q)
         do y = 1, ny
            yp1 = mod(y, ny) + 1
            ym1 = mod(ny + y - 2, ny) + 1
            fluxW = cxq*cfw(y,q)
            fluxN = cyq*cfn(y,q)
            !
            ! WARNING: This section could potentially show a race condition
            !
            fp(y,x,q) = fp(y,x,q) + fluxW - fluxN
            fp(y,xm1,q) = fp(y,xm1,q) - fluxW
            fp(yp1,x,q) = fp(yp1,x,q) + fluxN
         end do
         end do
      end do
      !$omp end do
      !$omp end parallel
   contains
      subroutine update_ew(ny,f,rho,ux,uy,indp,omega)
         integer, intent(in) :: ny
         real(wp), intent(inout) :: f(ny,0:8)
         real(wp), intent(inout) :: rho(ny), ux(ny), uy(ny), indp(ny)
         real(wp), intent(in) :: omega
         integer :: y
         real(wp) :: invrho
         real(wp) :: omegabar, omega_w0, omega_ws, omega_wd
         real(wp) :: vel_trm_13, vel_trm_57, vel_trm_68
         real(wp) :: velxpy, velxmy
         omegabar = 1.0_wp - omega
         omega_w0 = 3.0_wp * omega * w0
         omega_ws = 3.0_wp * omega * ws
         omega_wd = 3.0_wp * omega * wd
         do y = 1, ny
            rho(y) = (((f(y,5) + f(y,7)) + (f(y,6) + f(y,8))) + &
                      ((f(y,1) + f(y,3)) + (f(y,2) + f(y,4)))) + f(y,0)
            invrho = 1.0_wp/rho(y)
            ux(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,8) - f(y,6))) + (f(y,1) - f(y,3)))
            uy(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,6) - f(y,8))) + (f(y,2) - f(y,4)))
            indp(y) = one_third - 0.5_wp * (ux(y)**2 + uy(y)**2)
         end do
         do y = 1, ny
         
            vel_trm_13 = indp(y) + 1.5_wp * ux(y) * ux(y)
            f(y,1) = omegabar*f(y,1) + omega_ws * rho(y) * (vel_trm_13 + ux(y))
            f(y,3) = omegabar*f(y,3) + omega_ws * rho(y) * (vel_trm_13 - ux(y))
         
         end do
         do y = 1, ny
            velxpy = ux(y) + uy(y)
            vel_trm_57 = indp(y) + 1.5_wp * velxpy * velxpy
            f(y,5) = omegabar*f(y,5) + omega_wd * rho(y) * (vel_trm_57 + velxpy)
            f(y,7) = omegabar*f(y,7) + omega_wd * rho(y) * (vel_trm_57 - velxpy)
         
         end do
         do y = 1, ny
            
            velxmy = ux(y) - uy(y)
            vel_trm_68 = indp(y) + 1.5_wp * velxmy * velxmy
            
            f(y,6) = omegabar*f(y,6) + omega_wd * rho(y) * (vel_trm_68 - velxmy)
            f(y,8) = omegabar*f(y,8) + omega_wd * rho(y) * (vel_trm_68 + velxmy)
         
         end do
      end subroutine
      subroutine update_ns(ny,f,rho,ux,uy,indp,omega)
         integer, intent(in) :: ny
         real(wp), intent(inout) :: f(ny,0:8)
         real(wp), intent(inout) :: rho(ny), ux(ny), uy(ny), indp(ny)
         real(wp), intent(in) :: omega
         integer :: y
         real(wp) :: invrho
         real(wp) :: omegabar, omega_w0, omega_ws, omega_wd
         real(wp) :: vel_trm_24, vel_trm_57, vel_trm_68
         real(wp) :: velxpy, velxmy
         omegabar = 1.0_wp - omega
         omega_w0 = 3.0_wp * omega * w0
         omega_ws = 3.0_wp * omega * ws
         omega_wd = 3.0_wp * omega * wd
         do y = 1, ny
            rho(y) = (((f(y,5) + f(y,7)) + (f(y,6) + f(y,8))) + &
                      ((f(y,1) + f(y,3)) + (f(y,2) + f(y,4)))) + f(y,0)
            
            invrho = 1.0_wp/rho(y)
            ux(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,8) - f(y,6))) + (f(y,1) - f(y,3)))
            uy(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,6) - f(y,8))) + (f(y,2) - f(y,4)))
            indp(y) = one_third - 0.5_wp * (ux(y)**2 + uy(y)**2)
         
         end do
         do y = 1, ny
            
            vel_trm_24 = indp(y) + 1.5_wp * uy(y) * uy(y)
            f(y,2) = omegabar*f(y,2) + omega_ws * rho(y) * (vel_trm_24 + uy(y))
            f(y,4) = omegabar*f(y,4) + omega_ws * rho(y) * (vel_trm_24 - uy(y))
         
         end do
         do y = 1, ny
            velxpy = ux(y) + uy(y)
            vel_trm_57 = indp(y) + 1.5_wp * velxpy * velxpy
            f(y,5) = omegabar*f(y,5) + omega_wd * rho(y) * (vel_trm_57 + velxpy)
            f(y,7) = omegabar*f(y,7) + omega_wd * rho(y) * (vel_trm_57 - velxpy)
         
         end do
         do y = 1, ny
            
            velxmy = ux(y) - uy(y)
            vel_trm_68 = indp(y) + 1.5_wp * velxmy * velxmy
            
            f(y,6) = omegabar*f(y,6) + omega_wd * rho(y) * (vel_trm_68 - velxmy)
            f(y,8) = omegabar*f(y,8) + omega_wd * rho(y) * (vel_trm_68 + velxmy)
         
         end do
      end subroutine
   end subroutine
From the numerical point of view, the kernel is a solver for a set of advection equations (q = 0,1,\dots,8) using a finite volume approach on a regular square grid with unit cell length (\Delta x = 1) (the full method is described in this paper). Below is a sketch of a 4×4 grid with numbered cells and edges. The little boxes mark the periodic edge.
To save on evaluations I’m only calculating the fluxes for the west and north faces of each cell. Note that due to the regular structure, I’m not storing an edge list. I’m worried that the marked section where I add/subtract the fluxes from the cell and it’s neighbors presents a potential race condition:
      !$omp parallel default(private) shared(nx,ny,ld,ft,fp,dt,omega)
      !$omp do schedule(static)
      do x = 1, nx
         
         ! locate neighbor nodes
         xp1 = mod(x, nx) + 1
         xm1 = mod(nx + x - 2, nx) + 1
        !
        ! ... interpolation at cell faces ...
        !
         do q = 1, 8
         
         cxq = dt*cx(q)
         cyq = dt*cy(q)
         do y = 1, ny
            yp1 = mod(y, ny) + 1
            ym1 = mod(ny + y - 2, ny) + 1
            fluxW = cxq*cfw(y,q)
            fluxN = cyq*cfn(y,q)
            fp(y,x,q) = fp(y,x,q) + fluxW - fluxN  !>
            fp(y,xm1,q) = fp(y,xm1,q) - fluxW      !> race condition ?
            fp(yp1,x,q) = fp(yp1,x,q) + fluxN      !>
         end do
         end do
      end do
      !$omp end do
      !$omp end parallel
To my surprise, for the limited tests that I ran on a 64×64 grid, the results were the same with 1, 4, or 8 threads. I’m guessing I would bump into problems if  Edit: I’ve confirmed the results are unreliable, see my second post.nx = 4, and each thread only calculated one column of cells. Then the updates for (y, x) and (y, xm1) (cell to the left) might overlap? For nx = 64 however, I’m guessing the cells are already updated by the time the other threads reach the end of their x section.
Is my understanding correct? Does the cell-face-based loop present a potential race condition? How would a safe cell-face-based update loop look like?

