# Fixing an OpenMP race condition

I’m working on OpenMP parallelization of a lattice Boltzmann kernel for a 2D periodical domain.
The full streaming kernel (lacking a few module constants) is in the details box:

Full kernel

subroutine kernel_stream(nx,ny,ld,ft,fp,dt,omega)
integer, intent(in) :: nx, ny, ld
real(wp), intent(in) :: ft(ld,nx,0:8)     ! \bar{f}^{+,n}
real(wp), intent(inout) :: fp(ld,nx,0:8)  ! \tilde{f}^{+,n}

real(wp), intent(in) :: dt, omega

real(wp) :: cfw(ny,0:8), cfn(ny,0:8)
real(wp) :: cfrho(ny), cfux(ny), cfuy(ny), cfindp(ny)

real(wp) :: fc, fe, fn, fw, fs, fne, fnw, fsw

real(wp), parameter :: p2 = 0.5_wp, &
p8 = 0.125_wp

integer :: x, y, q
integer :: xp1, xm1, yp1, ym1

real(wp) :: cxq, cyq

real(wp) :: fluxW, fluxN

!$omp parallel default(private) shared(nx,ny,ld,ft,fp,dt,omega) !$omp do schedule(static)
do x = 1, nx

! locate neighbor nodes
xp1 = mod(x, nx) + 1
xm1 = mod(nx + x - 2, nx) + 1

! Interpolate values at the WEST and NORTH cell faces
!
! All directions are needed, so we can calculate the
! macroscopic values
!
do q = 0, 8

cxq = dt*cx(q)
cyq = dt*cy(q)

do y = 1, ny

yp1 = mod(y, ny) + 1
ym1 = mod(ny + y - 2, ny) + 1

fc  = ft(y  , x  , q)
fe  = ft(y  , xp1, q)
fn  = ft(yp1, x  , q)
fw  = ft(y  , xm1, q)
fs  = ft(ym1, x  , q)
fne = ft(yp1, xp1, q)
fnw = ft(yp1, xm1, q)
fsw = ft(ym1, xm1, q)
! fse = ft(ym1, xp1, q)  ! not needed

! West cell face
cfw(y,q) = p2*(fc + fw) - p2*cxq*(fc - fw) - &
p8*cyq*(fnw + fn - fsw - fs)

! North cell face
cfn(y,q) = p2*(fc + fn) - p2*cyq*(fn - fc) - &
p8*cxq*(fne + fe - fnw - fw)

end do
end do

! Update values at cell faces
!
! cf = (1 - omega)*cf + omega * cfeq
!

#if DUGKS
! WEST face
call update_ew(ny,cfw,cfrho,cfux,cfuy,cfindp,omega)

! NORTH face
call update_ns(ny,cfn,cfrho,cfux,cfuy,cfindp,omega)
#endif

! TODO:
!   At the WEST face we only need to update
!   velocites 1,3,5,7,6,8
!   At the NORTH face, we only need to update
!   velocities 2,4,5,7,6,8
!
do q = 1, 8

cxq = dt*cx(q)
cyq = dt*cy(q)

do y = 1, ny

yp1 = mod(y, ny) + 1
ym1 = mod(ny + y - 2, ny) + 1

fluxW = cxq*cfw(y,q)
fluxN = cyq*cfn(y,q)

!
! WARNING: This section could potentially show a race condition
!
fp(y,x,q) = fp(y,x,q) + fluxW - fluxN
fp(y,xm1,q) = fp(y,xm1,q) - fluxW
fp(yp1,x,q) = fp(yp1,x,q) + fluxN

end do
end do

end do
!$omp end do !$omp end parallel

contains

subroutine update_ew(ny,f,rho,ux,uy,indp,omega)
integer, intent(in) :: ny
real(wp), intent(inout) :: f(ny,0:8)
real(wp), intent(inout) :: rho(ny), ux(ny), uy(ny), indp(ny)
real(wp), intent(in) :: omega

integer :: y
real(wp) :: invrho
real(wp) :: omegabar, omega_w0, omega_ws, omega_wd

real(wp) :: vel_trm_13, vel_trm_57, vel_trm_68
real(wp) :: velxpy, velxmy

omegabar = 1.0_wp - omega

omega_w0 = 3.0_wp * omega * w0
omega_ws = 3.0_wp * omega * ws
omega_wd = 3.0_wp * omega * wd

do y = 1, ny

rho(y) = (((f(y,5) + f(y,7)) + (f(y,6) + f(y,8))) + &
((f(y,1) + f(y,3)) + (f(y,2) + f(y,4)))) + f(y,0)
invrho = 1.0_wp/rho(y)

ux(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,8) - f(y,6))) + (f(y,1) - f(y,3)))
uy(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,6) - f(y,8))) + (f(y,2) - f(y,4)))

indp(y) = one_third - 0.5_wp * (ux(y)**2 + uy(y)**2)
end do

do y = 1, ny

vel_trm_13 = indp(y) + 1.5_wp * ux(y) * ux(y)

f(y,1) = omegabar*f(y,1) + omega_ws * rho(y) * (vel_trm_13 + ux(y))
f(y,3) = omegabar*f(y,3) + omega_ws * rho(y) * (vel_trm_13 - ux(y))

end do

do y = 1, ny

velxpy = ux(y) + uy(y)
vel_trm_57 = indp(y) + 1.5_wp * velxpy * velxpy

f(y,5) = omegabar*f(y,5) + omega_wd * rho(y) * (vel_trm_57 + velxpy)
f(y,7) = omegabar*f(y,7) + omega_wd * rho(y) * (vel_trm_57 - velxpy)

end do

do y = 1, ny

velxmy = ux(y) - uy(y)
vel_trm_68 = indp(y) + 1.5_wp * velxmy * velxmy

f(y,6) = omegabar*f(y,6) + omega_wd * rho(y) * (vel_trm_68 - velxmy)
f(y,8) = omegabar*f(y,8) + omega_wd * rho(y) * (vel_trm_68 + velxmy)

end do

end subroutine

subroutine update_ns(ny,f,rho,ux,uy,indp,omega)
integer, intent(in) :: ny
real(wp), intent(inout) :: f(ny,0:8)
real(wp), intent(inout) :: rho(ny), ux(ny), uy(ny), indp(ny)
real(wp), intent(in) :: omega

integer :: y
real(wp) :: invrho
real(wp) :: omegabar, omega_w0, omega_ws, omega_wd

real(wp) :: vel_trm_24, vel_trm_57, vel_trm_68
real(wp) :: velxpy, velxmy

omegabar = 1.0_wp - omega

omega_w0 = 3.0_wp * omega * w0
omega_ws = 3.0_wp * omega * ws
omega_wd = 3.0_wp * omega * wd

do y = 1, ny

rho(y) = (((f(y,5) + f(y,7)) + (f(y,6) + f(y,8))) + &
((f(y,1) + f(y,3)) + (f(y,2) + f(y,4)))) + f(y,0)

invrho = 1.0_wp/rho(y)

ux(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,8) - f(y,6))) + (f(y,1) - f(y,3)))
uy(y) = invrho * (((f(y,5) - f(y,7)) + (f(y,6) - f(y,8))) + (f(y,2) - f(y,4)))

indp(y) = one_third - 0.5_wp * (ux(y)**2 + uy(y)**2)

end do

do y = 1, ny

vel_trm_24 = indp(y) + 1.5_wp * uy(y) * uy(y)

f(y,2) = omegabar*f(y,2) + omega_ws * rho(y) * (vel_trm_24 + uy(y))
f(y,4) = omegabar*f(y,4) + omega_ws * rho(y) * (vel_trm_24 - uy(y))

end do

do y = 1, ny

velxpy = ux(y) + uy(y)
vel_trm_57 = indp(y) + 1.5_wp * velxpy * velxpy

f(y,5) = omegabar*f(y,5) + omega_wd * rho(y) * (vel_trm_57 + velxpy)
f(y,7) = omegabar*f(y,7) + omega_wd * rho(y) * (vel_trm_57 - velxpy)

end do

do y = 1, ny

velxmy = ux(y) - uy(y)
vel_trm_68 = indp(y) + 1.5_wp * velxmy * velxmy

f(y,6) = omegabar*f(y,6) + omega_wd * rho(y) * (vel_trm_68 - velxmy)
f(y,8) = omegabar*f(y,8) + omega_wd * rho(y) * (vel_trm_68 + velxmy)

end do

end subroutine

end subroutine


From the numerical point of view, the kernel is a solver for a set of advection equations (q = 0,1,\dots,8) using a finite volume approach on a regular square grid with unit cell length (\Delta x = 1) (the full method is described in this paper). Below is a sketch of a 4×4 grid with numbered cells and edges. The little boxes mark the periodic edge.

To save on evaluations I’m only calculating the fluxes for the west and north faces of each cell. Note that due to the regular structure, I’m not storing an edge list. I’m worried that the marked section where I add/subtract the fluxes from the cell and it’s neighbors presents a potential race condition:

      !$omp parallel default(private) shared(nx,ny,ld,ft,fp,dt,omega) !$omp do schedule(static)
do x = 1, nx

! locate neighbor nodes
xp1 = mod(x, nx) + 1
xm1 = mod(nx + x - 2, nx) + 1

!
! ... interpolation at cell faces ...
!

do q = 1, 8

cxq = dt*cx(q)
cyq = dt*cy(q)

do y = 1, ny

yp1 = mod(y, ny) + 1
ym1 = mod(ny + y - 2, ny) + 1

fluxW = cxq*cfw(y,q)
fluxN = cyq*cfn(y,q)

fp(y,x,q) = fp(y,x,q) + fluxW - fluxN  !>
fp(y,xm1,q) = fp(y,xm1,q) - fluxW      !> race condition ?
fp(yp1,x,q) = fp(yp1,x,q) + fluxN      !>

end do
end do

end do
!$omp end do !$omp end parallel


To my surprise, for the limited tests that I ran on a 64×64 grid, the results were the same with 1, 4, or 8 threads. I’m guessing I would bump into problems if nx = 4, and each thread only calculated one column of cells. Then the updates for (y, x) and (y, xm1) (cell to the left) might overlap? For nx = 64 however, I’m guessing the cells are already updated by the time the other threads reach the end of their x section. Edit: I’ve confirmed the results are unreliable, see my second post.

Is my understanding correct? Does the cell-face-based loop present a potential race condition? How would a safe cell-face-based update loop look like?

2 Likes

Upon doing some more testing, there is indeed a race condition. The L2-norm of my solution varies by 1- or 2- orders of magnitude from run to run.

1 Like

As far as I see, you do an in-place update. That would mean that even the sequential algorithm depends on the order of execution.
To avoid this, you would need to store the data twice, once at t=t_n and once at t=t_(n+1).

The parallel version of the in-place variant will additionally suffer from non-conserved fields due to race conditions. I think a good test is to check that the overall amount of the species remains constant in the absences of sources/sinks.

2 Likes

Perhaps somewhere may need something like the barrier in MPI_barrier. Such that all the threads are synchronized before taking the next actions.

Indeed, I could use a synchronization construct such as !$omp critical so that only a single thread will execute the in-place update at a time. I’ve tried it already but it totally chokes the performance. It would be better to find a solution where I store the solution twice and hence eliminate the data-dependency. A different execution order however won’t really help since the edges/cells of a periodic domain form a cyclic graph. Adding a reduction to the shared do loop seems to do the trick:  !$omp parallel default(private) shared(nx,ny,ld,ft,fp,dt,omega)
!$omp do schedule(static) reduction(+: fp) do x = 1, nx ! ...  I didn’t know the reduction clause applies to arrays too. All the training examples I’ve seen show it for scalars, e.g. a cumulative sum of values. Still the speed-ups I get are quite poor. Perhaps I should just forget the idea of an edge-based cell update, and recalculate the fluxes for each cell. This way I would have local updates, instead of having to write to the west and north neighbors. 1 Like One option could be to use graph coloring of your cells where cells of the same color do not share faces in common. You can then loop over colors and process cells of the same color in parallel. For a regular lattice, coloring the cells might be quite straightforward with only two colors. I think this is also reasonable and my experience has found that recomputation is generally preferable for FV methods in cases where it reduces memory bandwidth requirements. In this case that nx and ny are both even it’s just a checker-board coloring, right? Then I’d need to apply the streaming kernel twice, once for the odd (x,y) pairs, and once for the even ones, effectively doubling the memory bandwidth. The Wikipedia page on edge coloring says: By Vizing’s theorem, the number of colors needed to edge color a simple graph is either its maximum degree Δ or Δ+1. Since each edge connects two cell, my maximum degree is two. For nx = ny = 2*n + 1 (odd array extents) I think three colors are needed. For the mixed cases where n_x \neq n_y and at least one is odd however, things get more complicated. Probably it would be better to rewrite the loop in an edge-centric way then, relaxing the restriction to only use the north- and west-oriented formulas. I think I’ll go for the recomputation instead and mark @lkedward’s answer as the solution. Thanks to everyone for your input! 1 Like I am trying the test program (in the first post) and wondering where cx and cy (maybe arrays) are defined. Maybe in some other module for constants? Then I wonder if default(private) may lead to undefined values of cx and cy. module test_m integer, parameter :: n = 12 integer :: k integer :: cx( n ) = [( k * 10, k=1,n )] end program main use omp_lib use test_m implicit none integer i !$omp parallel do default(private) schedule(static)
!-- !$omp parallel do default(none) private(i) shared(cx) schedule(static) do i = 1, n print *, "i= ", i, " thre= ", omp_get_thread_num(), " cx(i) = ", cx(i) enddo !$omp end parallel do
end


$gfortran-10 -fopenmp test.f90; OMP_NUM_THREADS=4 ./a.out  i= 1 thre= 0 cx(i) = 832 i= 4 thre= 1 cx(i) = 1 i= 10 thre= 3 cx(i) = 1 i= 7 thre= 2 cx(i) = 0 i= 2 thre= 0 cx(i) = 0 i= 5 thre= 1 cx(i) = 2100864 i= 11 thre= 3 cx(i) = -1320156304 i= 8 thre= 2 cx(i) = 0 i= 3 thre= 0 cx(i) = 832 i= 6 thre= 1 cx(i) = 28672 i= 12 thre= 3 cx(i) = 32746 i= 9 thre= 2 cx(i) = -1321186432  Here are the module constants:  integer, parameter :: wp = kind(1.0d0) real(wp), parameter :: cx(0:8) = [real(wp) :: 0, 1, 0, -1, 0, 1, -1, -1, 1] real(wp), parameter :: cy(0:8) = [real(wp) :: 0, 0, 1, 0, -1, 1, 1, -1, -1] real(wp), parameter :: one_third = 1.0_wp / 3.0_wp real(wp), parameter :: w0 = 4._wp / 9._wp, & ws = 1._wp / 9._wp, & wd = 1._wp / 36._wp  Unfortunately, I don’t think they are of much use without the main driver that I haven’t yet published. You could try to fill the data fields with 1’s or random values, and apply the kernel in a time-stepping loop but I don’t think it will remain stable without proper initialization. I’ve now switched to the recomputation approach, and it works at double the speed of the edge-based approach. Edit: I’ve put the loop of the recomputation-based kernel in a hide-details box below. The differences are I have to compute values at all four cell faces cfe, cfw, cfs, cfn; and the update section doesn’t reference any neighbor cells. This makes the iteration spaces of each thread completely separate and the parallel code worked straight away. Summary  !$omp parallel default(private) shared(nx,ny,ld,ft,fp,dt,omega)
!$omp do schedule(static) do x = 1, nx ! locate neighbor nodes xp1 = mod(x, nx) + 1 xm1 = mod(nx + x - 2, nx) + 1 ! Interpolate values at the WEST and NORTH cell faces ! ! All directions are needed, so we can calculate the ! macroscopic values ! do q = 0, 8 cxq = dt*cx(q) cyq = dt*cy(q) do y = 1, ny yp1 = mod(y, ny) + 1 ym1 = mod(ny + y - 2, ny) + 1 ! read pdf values fc = ft(y , x , q) fe = ft(y , xp1, q) fn = ft(yp1, x , q) fw = ft(y , xm1, q) fs = ft(ym1, x , q) fne = ft(yp1, xp1, q) fnw = ft(yp1, xm1, q) fsw = ft(ym1, xm1, q) fse = ft(ym1, xp1, q) ! West cell face cfw(y,q) = p2*(fc + fw) - p2*cxq*(fc - fw) - & p8*cyq*(fnw + fn - fsw - fs) ! North cell face cfn(y,q) = p2*(fc + fn) - p2*cyq*(fn - fc) - & p8*cxq*(fne + fe - fnw - fw) cfe(y,q) = p2*(fc + fe) - p2*cxq*(fe - fc) - & p8*cyq*(fne + fn - fse - fs) cfs(y,q) = p2*(fc + fs) - p2*cyq*(fc - fs) - & p8*cxq*(fse + fe - fsw - fw) end do end do #if DUGKS ! WEST face call update_ew(ny,cfw,cfrho,cfux,cfuy,cfindp,omega) ! EAST face call update_ew(ny,cfe,cfrho,cfux,cfuy,cfindp,omega) ! NORTH face call update_ns(ny,cfn,cfrho,cfux,cfuy,cfindp,omega) ! SOUTH face call update_ns(ny,cfs,cfrho,cfux,cfuy,cfindp,omega) #endif ! TODO: ! At the WEST face we only need to update ! velocites 1,3,5,7,6,8 ! At the NORTH face, we only need to update ! velocities 2,4,5,7,6,8 ! do q = 1, 8 cxq = dt*cx(q) cyq = dt*cy(q) do y = 1, ny yp1 = mod(y, ny) + 1 ym1 = mod(ny + y - 2, ny) + 1 fluxE = cfe(y,q) fluxW = cfw(y,q) fluxN = cfn(y,q) fluxS = cfs(y,q) fp(y,x,q) = fp(y,x,q) - cxq*(fluxE - fluxW) - cyq*(fluxN - fluxS) end do end do end do !$omp end do
!$omp end parallel  1 Like Thanks, and I guess it would be OK if all the values declared as private will be recomputed / redefined in the parallel construct. (Because I always use default(none) in my codes, I was afraid that some scalars or arrays might be inadvertently copied to all the threads with undefined values given…) 1 Like Here is a small example that I wrote for testing. Maybe it is too much simplified. But I want to show it because I realized that even the sequential algorithm gives artificial sinks/sources if the access is not continuous. program flux implicit none integer, parameter :: length = 5 real, dimension(0:length-1) :: c integer :: i,me,next,o integer, parameter, dimension(2,0:length-1) :: order = reshape([0,1,2,3,4,& 1,3,2,0,4],[2,length],order=[2,1]) do o = 1,size(order,1) c = 0.0 c(length/2:) = 1.0 print*, 'c initial' print*, c print*, 'total',sum(c) do i=0,length-1 me = order(o,i) next = mod(order(o,i)+1,length) f = 0.5 * (c(me) + c(next)) c(me) = f end do print*, 'c final' print*, c print*, 'total',sum(c) print*, '' end do end program  I see your point, but I believe it doesn’t apply to my case because the fluxes are calculated based upon array ft which is intent(in), and not fp which gets updated. In other words my reads and writes are decoupled. I don’t fully understand the method you are using. Still, I have implemented the Lattice Boltzmann Method and know that parallelization of the stream step can lead to race conditions. When I implemented the LBM with the D2Q9 model in CUDA Fortran before (7 years ago…), I used a temporary array as follows: f_new(i, j, Center) = f(i, j, Center) if(1<=i .and. i<=nx-1)then f_new(i+1, j, Right) = f(i, j, Right) end if  After imposing the boundary conditions to f_new, f is updated by copying f_new. Pointer swapping can avoid memory copying from f_new to f. Using a temporary array may be the simplest solution to parallelize the stream step. Indeed, the classic streaming version of LBM is implemented as you’ve shown. Here’s an excerpt from a separate subroutine of mine which does the same operation as yours:  !$omp workshare
fdst(:,:,0) = fsrc(:,:,0)
!$omp end workshare !$omp do schedule(static)
do x = 1, nx
xp1 = mod(x, nx) + 1
xm1 = mod(nx + x - 2, nx) + 1
do y = 1, ny
fdst(y,x,1) = fsrc(y,xm1,1)
end do
end do
!$omp end do  The only difference is I’m using a non-local read, while yours has a non-local write. I wonder if this makes any speed difference. I have also unrolled the lattice directions, so I’m working on one “spatial” slice of PDF’s at a time. After streaming I swap the arrays using integer pointers. The subroutine in the original post is for a variation of LBM called DUGKS (Discrete Unified Gas Kinetic Scheme). The main idea is that since the collision in LBM is discretized by a Crank-Nicholson scheme, the advective part should be evaluated at the temporal midpoint t + \Delta t/2 too. This results however in a method with quite different numerical properties. The biggest change is that the Courant number can be varied, which gives a bit more freedom in the choice of discretization parameters. However the method also requires more storage & more FLOPS, hence has lower bandwidth (more work per time step). I’m interested to evaluate whether it pays off somehow. 2 Likes Thanks for the additional information, and I’m sorry for the inaccurate reply. I used to study LBM, but I couldn’t keep up with it because there were so many varieties, like MRT, cumulant, lattice kinetic, immersed boundary extensions, etc. I learned that the Courant number is fixed at 1 in LBM, so I am attracted to the Crank-Nicholson discretization approach. If you can do with $omp atomic it is much better than critical! Also, your reduction with an array is extremely memory consuming, basically allocating fp * (omp_num_threads + 1), I wouldn’t go there.
Instead you could use the sink method, so something like this:

!$omp parallel do ordered(2) private(l,j,i) reduction(+:norm2) schedule(static,1) do l = 1,nx do j = 1,ny !$omp ordered depend(sink:l-1,j) depend(sink:l,j-1)


this is an easy way to use openmp threading for gauss-seidel methods, as you have here.

2 Likes