Trying to make a program using OpenMP give same output as non parallel counterpart

Hi,
I am trying to parallelize this simple program using OpenMP

The main problem with the program is that parsing through an array to find the best needed configuration, yields unexpected results.
Non parallel code:

program question_1_b
  implicit none

  type points
     real :: x,y,f,cv
  end type points

  type(points),allocatable :: arr(:)
  type(points) :: check
  real,allocatable :: val(:)
  integer :: i,j,k,n,loc
  integer :: beginning,rate,end

  call system_clock(beginning,rate)

  n=10000
  k=1

  allocate(val(n))
  allocate(arr(n*n))

  do i=1,n
     val(i)=real(i)
  end do

  do concurrent(i=1:n,j=1:n)
     arr(k)%x=val(i)
     arr(k)%y=val(j)
     k=k+1
  end do

  call data_points(arr%x,arr%y,arr%f,arr%cv)
  call check_value()

  print *, 'Number of combinations checked: ',k-1
  print *, 'The max values are: '
  print *, 'x : ±',check%x
  print *, 'y : ±',check%y
  print *, 'f: ',check%f
  print *, 'cv:',check%cv

  deallocate(val)
  deallocate(arr)

  call system_clock(end)
  print *, "Elapsed time: ",real(end-beginning)/real(rate)," Seconds"
  
contains
  subroutine check_value()
    check%f=0.

    do concurrent(i=1:n*n)
       if (arr(i)%cv<=9. .and. arr(i)%f>check%f) then
          check=arr(i)
       end if
    end do

  end subroutine check_value
    
   elemental subroutine data_points(x,y,f,cv)
    real,intent(in) :: x,y
    real,intent(out) :: f,cv

    f=(x*y)+sqrt(9.-(x**2)-(y**2))
    cv=(x**2)+(y**2)

  end subroutine data_points
end program question_1_b

Non parallel output:

Number of combinations checked: 100000000
The max values are:
x : ± 2.000000
y : ± 2.000000
f: 5.000000
cv: 8.000000
Elapsed time: 0.8485000 Seconds

Parallel code

program omp_exp1
  use OMP_LIB
  use iso_fortran_env
  implicit none

  type points
     real :: x,y,f,cv
  end type points

  type(points),allocatable :: arr(:)
  type(points) :: check
  real,allocatable :: val(:)
  integer :: i,j,k,n,loc
  integer :: beginning,rate,end

  call system_clock(beginning,rate)

  n=10000
  k=1

  allocate(val(n))
  allocate(arr(n*n))

  !$OMP PARALLEL
  !$OMP DO
  do i=1,n
     val(i)=real(i)
  end do
  !$OMP END DO
  !$OMP END PARALLEL

  !$OMP PARALLEL SHARED(k,arr)
  !$OMP DO
  do i=1,n
     do j=1,n
        arr(k)%x=val(i)
        arr(k)%y=val(j)
        k=k+1
     end do
  end do
  !$OMP END DO
  !$OMP END PARALLEL

  call data_points(arr%x,arr%y,arr%f,arr%cv)
  call check_value()

  print *, 'Number of combinations checked: ',k-1
  print *, 'The max values are: '
  print *, 'x : ±',check%x
  print *, 'y : ±',check%y
  print *, 'f: ',check%f
  print *, 'cv:',check%cv

  deallocate(val)
  deallocate(arr)

  call system_clock(end)
  print *, "Elapsed time: ",real(end-beginning)/real(rate)," Seconds"
  
contains
  subroutine check_value()
    use OMP_LIB
    check%f=0.

    !$OMP PARALLEL PRIVATE(i)
    !$OMP DO
    do i=1,n*n
       !$OMP CRITICAL
       if (arr(i)%cv<=9. .and. arr(i)%f>check%f) then
          check=arr(i)
       end if
       !$OMP END CRITICAL
    end do
    !$OMP END DO
    !$OMP END PARALLEL

  end subroutine check_value
    
   elemental subroutine data_points(x,y,f,cv)
    real,intent(in) :: x,y
    real,intent(out) :: f,cv

    f=(x*y)+sqrt(9.-(x**2)-(y**2))
    cv=(x**2)+(y**2)

  end subroutine data_points
end program omp_exp1

Output Obtained:

Number of combinations checked: 10000000
The max values are:
x : ± 0.0000000E+00
y : ± 0.0000000E+00
f: 3.000000
cv: 0.0000000E+00
Elapsed time: 16.75450 Seconds

Don’t focus too hard on the optimizing the calculations, I just want to be able to parallelize the original program. :slight_smile:

Thanks.

1 Like

The use of k = k+1 will not work as required in this code, as the initial value of k is not adjusted for each thread (based on the value “i”). “k” can not be shared and must be private.
You could try either of the following, which should be identical with most optimising compilers

!  explicit definition of k
 !$OMP PARALLEL DO SHARED(arr,val,n) PRIVATE(i,j,k) SCHEDULE(STATIC)
  do i=1,n
     do j=1,n
        k = (i-1)*n+j
        arr(k)%x=val(i)
        arr(k)%y=val(j)
     end do
  end do
  !$OMP END PARALLEL DO

!  incremental version of k
 !$OMP PARALLEL DO SHARED(arr,val,n) PRIVATE(i,j,k) SCHEDULE(STATIC)
  do i=1,n
     k = (i-1)*n + 1
     do j=1,n
        arr(k)%x=val(i)
        arr(k)%y=val(j)
        k = k+1
     end do
  end do
  !$OMP END PARALLEL DO

This approach applies PARALLEL DO to the DO i loop.
It is unlikely this will improve performance as there is “low arithmetic intensity” https://fortran-lang.discourse.group/t/compiling-only-part-of-code-with-ffast-math/3196/28

1 Like

Thank you ! :slight_smile: