I’ve been seeing some strange OpenMP scaling behavior that I’m not sure how to explain.
I have a simple test program that scales nicely when compiled with gfortran but poorly when compiled with ifort.
My test program is the following:
!> Module that owns the function-parser instance used by the test.
!! The OpenMP threadprivate directive below gives every thread its own
!! copy of `parser`.
module parserMod
  use function_parser, only : fparser_array
  implicit none

  type(fparser_array) :: parser
  ! Explicit save: the variable named in the threadprivate directive below.
  save :: parser
  !$omp threadprivate(parser)
end module parserMod
!-------------------------------------------------------------------------------
!> Set up the per-thread parser, then run doWork over every marble in parallel.
!!
!! marbles    : 6 x 200000 work array. Element 1 of each column is reset to 1
!!              and the column is then passed to doWork.
!! numThreads : thread count requested for both parallel regions.
!!
!! Terminates the program with stop code 99 if the parser reports an error.
subroutine parallelMarbles(marbles, numThreads)
  use parserMod, only : parser
  use iso_fortran_env, only : wp => real64, output_unit
  implicit none
  real(wp), dimension(6,200000), intent(inout) :: marbles
  integer, intent(in) :: numThreads
  integer :: indx
  character(len=1), dimension(3), parameter :: parserVars = ['x', 'y', 'z']

  ! All threads initialize their own (threadprivate) parser.
  ! NOTE(review): the second region relies on these threadprivate copies
  ! persisting between parallel regions. OpenMP only guarantees that when
  ! both regions use the same number of threads and dynamic thread
  ! adjustment is disabled -- confirm for the runtime in use.
  !$omp parallel num_threads(numThreads)
  call parser%parse(parserVars, parserVars)
  if (parser%error()) then
    call parser%print_errors(output_unit)
    stop 99
  end if
  !$omp end parallel

  ! Each iteration touches only its own column, so iterations are independent.
  !$omp parallel do default(none) &
  !$omp private(indx) &
  !$omp shared(marbles) &
  !$omp num_threads(numThreads)
  do indx = 1, size(marbles, dim=2)
    marbles(1, indx) = 1.0_wp
    call doWork(marbles(:, indx))
  end do
  !$omp end parallel do
end subroutine parallelMarbles
!-------------------------------------------------------------------------------
!> Repeatedly update one marble using the calling thread's parser.
!!
!! marble : 6-element work vector, updated in place. On each pass elements
!!          2-4 are recomputed from element 1 and the thread number, the
!!          parser is evaluated with marble(1:3) as input and marble(4:6)
!!          as output, and element 1 becomes the sum of elements 2-6.
subroutine doWork(marble)
  use omp_lib, only : omp_get_thread_num
  use parserMod, only : parser
  use iso_fortran_env, only : wp => real64
  implicit none
  real(wp), dimension(6), intent(inout) :: marble
  integer, parameter :: numPasses = 200
  integer :: indx, threadId

  ! The thread number is the same for every pass of this call, so query the
  ! OpenMP runtime once instead of three times per pass.
  threadId = omp_get_thread_num()

  do indx = 1, numPasses
    marble(2) = real(mod(indx, 6 + threadId), wp)*marble(1)
    marble(3) = real(mod(indx, 5 + threadId), wp)*marble(1)
    ! NOTE(review): marble(4) is presumably overwritten by evaluate() below,
    ! since marble(4:6) is passed as its result argument -- confirm against
    ! the parser library before removing this assignment.
    marble(4) = real(mod(indx, 4 + threadId), wp)*marble(1)
    call parser%evaluate(marble(1:3), marble(4:6))
    marble(1) = sum(marble(2:))
  end do
end subroutine doWork
!-------------------------------------------------------------------------------
!> Driver: times parallelMarbles with 1 and then 4 threads and prints the
!! wall-clock time of each run plus the speedup relative to the 1-thread run.
program testOMP
  use iso_fortran_env, only : wp => real64, int64
  implicit none
  ! real(wp), allocatable, dimension(:,:) :: marbles
  real(wp), dimension(6,200000) :: marbles
  integer :: numThreads
  ! Timings are kept in working precision so the elapsed-time arithmetic is
  ! not truncated to default real.
  real(wp) :: singleTime, threadTime
  ! 64-bit counters: avoids the coarse tick and early wrap-around that a
  ! default-integer system_clock can have.
  integer(int64) :: startTime, endTime, countRate
  character(len=25) :: varString
  ! allocate(marbles(6,200000))

  ! Stride of 3 visits numThreads = 1 and numThreads = 4 only.
  do numThreads = 1, 4, 3
    write(*,*) 'Calling parallel marbles with ', numThreads, ' threads.'
    call system_clock(startTime, countRate)
    call parallelMarbles(marbles, numThreads)
    call system_clock(endTime)
    threadTime = real(endTime - startTime, wp)/real(countRate, wp)
    write (varString, '(F25.6)') threadTime
    write (*, '(A)') ' Loop time = ' // trim(adjustl(varString)) // ' seconds.'
    ! The first pass (one thread) provides the baseline for the speedup.
    if (numThreads == 1) then
      singleTime = threadTime
    end if
    write (varString, '(F25.6)') singleTime / threadTime
    write (*, '(A)') ' Speedup = ' // trim(adjustl(varString)) // 'x.'
    write(*,*) '------------------------------------------------------'
  end do
end program testOMP
The test uses the fortran_function_parser module:
GitHub - jacobwilliams/fortran_function_parser: Modern Fortran Function Parser.
Any insight into what might be going wrong in ifort and how I might improve the performance with ifort would be greatly appreciated.