Hello,
I am considering revising some of the CPP macros in a code. However, in some cases, I have loops with this kind of logic:
do k = 1, nz
  do j = 1, ny
    do i = 1, nx
      ! Two intermediate values derived from the current cell.
      diff1 = field(i, j, k) + 1.0
      diff2 = field(i, j, k) - 1.0
      ! The preprocessor keeps exactly one of the three assignments below.
      ! Directives stay in column 1 so that traditional-mode cpp sees them.
#if defined(_OPTION_1)
#if defined(_OPTION_1_PLUS)
      res = diff1 + diff2
#else
      res = diff1 * diff2
#endif
#else
      res = diff1 - diff2
#endif
      field(i, j, k) = res
    end do
  end do
end do
I would think that these CPP macros can be replaced with plain `if` conditions.
Since the option values do not change inside the loop nest (the conditions are loop-invariant), the compiler could simply generate an optimized loop for each of the options.
In other words, take these two examples (I added OpenACC directives for optional testing on a GPU, e.g., with nvfortran).
Example 1:
Can be compiled and run with:
gfortran -cpp -D_OPTION_1 -D_OPTION_1_PLUS test_cpp.f90 -o test_cpp; ./test_cpp
program p
  implicit none
  integer, parameter :: nx = 5, ny = 5, nz = 5
  integer :: i, j, k
  real :: field(nx, ny, nz)
  real :: diff1, diff2, res
  integer :: iunit

  ! Fill the field on the host, then create a copy of it on the device.
  do k = 1, nz
    do j = 1, ny
      do i = 1, nx
        field(i, j, k) = i + 10.0*j + 100.0*k
      end do
    end do
  end do
  !$acc enter data copyin(field)

  ! Kernel: the update formula is chosen by the preprocessor, so exactly one
  ! of the three assignments to res is left in the compiled loop body.
  ! The directives below stay in column 1 for traditional-mode cpp.
  !$acc parallel loop collapse(3) default(present) private(diff1,diff2,res)
  do k = 1, nz
    do j = 1, ny
      do i = 1, nx
        diff1 = field(i, j, k) + 1.0
        diff2 = field(i, j, k) - 1.0
#if defined(_OPTION_1)
#if defined(_OPTION_1_PLUS)
        res = diff1 + diff2
#else
        res = diff1 * diff2
#endif
#else
        res = diff1 - diff2
#endif
        field(i, j, k) = res
      end do
    end do
  end do

  ! Copy the result back to the host, dump it as a raw stream file and
  ! print one sample value.
  !$acc update self(field)
  open(newunit=iunit, file="output.bin", form="unformatted", &
       access="stream", status="replace")
  write(iunit) field
  close(iunit)
  print *, 'field(3,3,3) = ', field(3,3,3)
end program p
Example 2:
Can be compiled and run with:
gfortran test_nocpp.f90 -o test_nocpp; ./test_nocpp 1 1
program p
  implicit none
  integer, parameter :: nx = 5, ny = 5, nz = 5
  integer :: i, j, k
  real :: field(nx, ny, nz)
  real :: diff1, diff2, res
  integer :: iunit
  integer :: option1, option1_plus
  integer :: ios
  character(len=32) :: arg
  !
  ! get options from the command line
  !
  ! Both options default to 0 when the corresponding argument is absent.
  ! A value that cannot be read as an integer stops the program with an
  ! explicit message instead of an unhandled runtime I/O error.
  !
  option1 = 0
  if (command_argument_count() >= 1) then
    call get_command_argument(1, arg)
    read(arg, *, iostat=ios) option1
    if (ios /= 0) error stop "argument 1 (option1) must be an integer"
  end if
  option1_plus = 0
  if (command_argument_count() >= 2) then
    call get_command_argument(2, arg)
    read(arg, *, iostat=ios) option1_plus
    if (ios /= 0) error stop "argument 2 (option1_plus) must be an integer"
  end if
  !
  ! initialize and move data to GPU
  !
  do k = 1, nz
    do j = 1, ny
      do i = 1, nx
        field(i, j, k) = i + j*10. + k*100.
      end do
    end do
  end do
  !$acc enter data copyin(field, option1, option1_plus)
  !
  ! run kernel
  !
  ! The branches on option1 / option1_plus are deliberately kept inside the
  ! loop body: the conditions do not change between iterations, and this
  ! program is the runtime-option counterpart of the preprocessor version.
  !
  !$acc parallel loop collapse(3) default(present) private(diff1,diff2,res)
  do k = 1, nz
    do j = 1, ny
      do i = 1, nx
        diff1 = field(i, j, k) + 1.
        diff2 = field(i, j, k) - 1.
        if (option1 == 1) then
          if (option1_plus == 1) then
            res = diff1 + diff2
          else
            res = diff1 * diff2
          end if
        else
          res = diff1 - diff2
        end if
        field(i, j, k) = res
      end do
    end do
  end do
  !
  ! move data to host and save
  !
  !$acc update self(field)
  open(newunit=iunit, file="output.bin", form="unformatted", access="stream", status="replace")
  write(iunit) field
  close(iunit)
  print*, 'field(3,3,3) = ', field(3,3,3)
end program p
Questions:
- Would any of the current compilers fail to optimize the two programs to make them equivalent in performance?
- How would you check this - would one need to read and understand the Intermediate Representation/Assembly?
Thanks!
EDIT: Related: Loop-invariant code motion - Wikipedia