I managed to fix the problem and I have run successfully a slightly more complicated example on the GPU. The relevant part of the code is here:
!$omp target data map(to: a_grid,z_grid,ap_grid) map(tofrom: payoff_gpu)
!$omp target teams distribute parallel do collapse(2) private(z_c,a_c,ap_c,cons,util)
do z_c=1,n_z
do a_c=1,n_a
do ap_c=1,n_a
cons = R*a_grid(a_c) + z_grid(z_c) - ap_grid(ap_c)
if (cons>0.0) then
util = -1.0/cons
else
util = penalty
endif
payoff_gpu(ap_c,a_c,z_c) = util
enddo
enddo
enddo
!$omp end target data
This runs on my GPU but it gives different results compared to the serial version. I am trying to understand if it is because
- I made a mistake in the OpenMP directives (declaring variables in
to',
tofrom’, from' and
private’, etc.)
- Computations on the GPU are “less precise” compared to the CPU (?)
I report here below the complete code to run this example:
! Author: Alessandro Di Nola
! Date: 2024-04-09
! Test openMP parallelization CPU vs GPU
!===============================================================================!
include "mkl_omp_offload.f90"
module mod_numerical
implicit none
private
public :: linspace, isclose
contains
function linspace(my_start, my_stop, n)
! Purpose: replicate Matlab function <linspace>
implicit none
! Inputs:
integer :: n
real :: my_start, my_stop
! Function result:
real :: linspace(n)
! Locals:
integer :: i
real :: step, grid(n)
if (n == 1) then
grid(n) = (my_start+my_stop)/2.d0
elseif (n>=2) then
grid(1) = my_start
if (n>2) then
step = (my_stop-my_start)/(real(n-1,8))
do i = 2, n-1
grid(i) = grid(i-1) + step
end do
end if
grid(n) = my_stop
endif
linspace = grid
end function linspace
! See https://numpy.org/doc/stable/reference/generated/numpy.isclose.html
elemental function isclose(a,b,atol,rtol)
real, intent(in) :: a, b
real, intent(in), optional :: atol, rtol
logical :: isclose
real :: atol_, rtol_
atol_ = 1.0e-5
rtol_ = 1.0e-9
if (present(atol)) atol_ = atol
if (present(rtol)) rtol_ = rtol
isclose = abs(a - b) <= (atol_ + rtol_*abs(b))
end function
end module mod_numerical
!===============================================================================!
program main
use omp_lib
use mod_numerical, only: linspace, isclose
implicit none
! Declare variables here
character(len=*), parameter :: savedir = "output"
logical :: good
real :: t1, t2, R, beta, cons, util,penalty
real :: t_serial,t_par_cpu,t_par_gpu
integer :: n_z,n_a,istat,myid,a_c,ap_c,z_c,ind(3)
real, allocatable :: payoff_serial(:,:,:), payoff_cpu(:,:,:), payoff_gpu(:,:,:)
real, allocatable :: a_grid(:),ap_grid(:),z_grid(:),z_tran(:,:)
write(*,*), '*********************************************** '
! ---------------------- Initialize variables -------------------------------!
!write(*,*) "Enter the size of the grid, n_a ="
!read(*,*) n_a
n_a = 500 ! Number of grid points for asset, a
n_z = 5 ! Number of grid points for income shock, z
R = 1.03d0 ! Gross interest rate, R = 1+r
beta = 0.96d0 ! Discount factor
penalty = -huge(0.0) ! Penalty for negative consumption
! ---------------------- Define grids ---------------------------------------!
allocate(a_grid(n_a),ap_grid(n_a),z_grid(n_z),z_tran(n_z,n_z),stat=istat)
if (istat /= 0) then
error stop "Error allocating memory for arrays z_grid and z_tran"
endif
a_grid = linspace(0.01, 4.0, n_a)
ap_grid = a_grid
z_grid = [real :: 0.9792, 0.9896, 1.0000, 1.0106, 1.0212]
z_tran = reshape( [real :: 0.9727, 0.0273, 0., 0., 0., &
0.0041, 0.9806, 0.0153, 0.0, 0.0, &
0.0, 0.0082, 0.9837, 0.0082, 0.0, &
0.0, 0.0, 0.0153, 0.9806, 0.0041, &
0.0, 0.0, 0.0, 0.0273, 0.9727 ], [5,5])
!if (do_disp) call disp("Matrix z_tran", z_tran)
! Main program statements here
write(*,*) "Testing parallelization with OpenMP"
!$omp parallel
myid = OMP_GET_THREAD_NUM()
if (myid == 0) then
print *, 'Grid size ', n_a
print *, 'Number of procs is ', OMP_GET_NUM_THREADS()
endif
!$omp end parallel
! Initialize big array
allocate(payoff_cpu(n_a,n_a,n_z),payoff_serial(n_a,n_a,n_z),payoff_gpu(n_a,n_a,n_z), stat=istat)
if (istat /= 0) then
error stop "Error allocating memory for payoff_cpu,payoff_gpu,payoff_serial"
end if
write(*,*) "1. Serial"
t1 = omp_get_wtime()
do z_c=1,n_z
do a_c=1,n_a
do ap_c=1,n_a
cons = R*a_grid(a_c) + z_grid(z_c) - ap_grid(ap_c)
if (cons>0.0) then
util = -1.0/cons
else
util = penalty
endif
payoff_serial(ap_c,a_c,z_c) = util
enddo !end ap_c
enddo !end a_c
enddo !end z_c
t2 = omp_get_wtime()
t_serial = t2-t1
write(*,*) "2. Parallel with OpenMP on CPU"
t1 = omp_get_wtime()
!$omp parallel default(shared) &
!$ private(z_c,a_c,ap_c,cons,util)
!$omp do collapse(2)
do z_c=1,n_z
do a_c=1,n_a
do ap_c=1,n_a
cons = R*a_grid(a_c) + z_grid(z_c) - ap_grid(ap_c)
if (cons>0.0) then
util = -1.0/cons
else
util = penalty
endif
payoff_cpu(ap_c,a_c,z_c) = util
enddo
enddo
enddo
!$omp enddo
!$omp end parallel
t2 = omp_get_wtime()
t_par_cpu = t2-t1
!======================================= GPU OPENMP ======================================!
myid = OMP_GET_THREAD_NUM()
if (myid .eq. 0) then
print *, 'Grid size ', n_a
print *, 'Number of CPU procs is ', OMP_GET_NUM_THREADS()
print *, 'Number of OpenMP Device Available:', omp_get_num_devices()
!$omp target
if (OMP_IS_INITIAL_DEVICE()) then
print *, ' Running on CPU'
else
print *, ' Running on GPU'
endif
!$omp end target
endif
write(*,*) "3. Parallel with GPU"
t1 = omp_get_wtime()
!!!!$ private(z_c,a_c,ap_c,cons,util)
!$omp target data map(to: a_grid,z_grid,ap_grid) map(tofrom: payoff_gpu)
!$omp target teams distribute parallel do collapse(2) private(z_c,a_c,ap_c,cons,util)
do z_c=1,n_z
do a_c=1,n_a
do ap_c=1,n_a
cons = R*a_grid(a_c) + z_grid(z_c) - ap_grid(ap_c)
if (cons>0.0) then
util = -1.0/cons
else
util = penalty
endif
payoff_gpu(ap_c,a_c,z_c) = util
enddo
enddo
enddo
!$omp end target data
t2 = omp_get_wtime()
t_par_gpu = t2-t1
!====================================================================================================!
! verify result
write(*,*) "Verify results, CPU parallel"
good = .true.
outer: do z_c=1,n_z
do a_c=1,n_a
do ap_c=1,n_a
if ( .not. isclose(payoff_serial(ap_c,a_c,z_c),payoff_cpu(ap_c,a_c,z_c),1.0e-3) ) then
write(*,*) 'ap_c = ', ap_c, ' a_c = ', a_c, ' z_c = ', z_c
write(*,*) 'payoff_serial = ', payoff_serial(ap_c,a_c,z_c),' payoff_cpu = ', payoff_cpu(ap_c,a_c,z_c)
print *,'FAILED'
good = .false.
exit outer
endif
enddo
enddo
enddo outer
if (good) print *,'CPU PASSED'
write(*,*) "Verify results, GPU parallel"
good = .true.
outer2: do z_c=1,n_z
do a_c=1,n_a
do ap_c=1,n_a
if ( .not. isclose(payoff_serial(ap_c,a_c,z_c),payoff_gpu(ap_c,a_c,z_c),1.0e-3) ) then
write(*,*) 'ap_c = ', ap_c, ' a_c = ', a_c, ' z_c = ', z_c
write(*,*) 'payoff_serial = ', payoff_serial(ap_c,a_c,z_c),' payoff_cpu = ', payoff_gpu(ap_c,a_c,z_c)
print *,'FAILED'
good = .false.
exit outer2
endif
enddo
enddo
enddo outer2
if (good) print *,'GPU PASSED'
write(*,'(A,F5.3,A)'),"Time for serial = ", t_serial, ' seconds'
write(*,'(A,F5.3,A)'),"Time for parallel CPU = ", t_par_cpu, ' seconds'
write(*,'(A,F5.3,A)'),"Time for parallel GPU = ", t_par_gpu, ' seconds'
write(*,'(A,F5.3)') "Parallel_cpu/serial = ", t_par_cpu/t_serial
write(*,'(A,F5.3)') "Parallel_gpu/serial = ", t_par_gpu/t_serial
write(*,'(A,F15.6)') "||payoff_cpu-payoff_serial|| = ", maxval(abs(payoff_cpu-payoff_serial))
write(*,'(A,F15.6)') "||payoff_gpu-payoff_serial|| = ", maxval(abs(payoff_gpu-payoff_serial))
if (maxval(abs(payoff_gpu-payoff_serial))>1e-2) then
write(*,*) "Results on GPU seem wrong..."
ind = maxloc(abs(payoff_gpu-payoff_serial))
write(*,'(A,F15.6)') "payoff_serial(ind) = ", payoff_serial(ind(1),ind(2),ind(3))
write(*,'(A,F15.6)') "payoff_gpu(ind) = ", payoff_gpu(ind(1),ind(2),ind(3))
endif
!call sub_write()
contains
subroutine sub_write()
! Comment out this part if you don't want to write to txt files
call writescalar_i(n_a,trim(savedir)//'n_a.txt')
call writescalar_i(n_z,trim(savedir)//'n_z.txt')
call write_arr3(payoff_serial,trim(savedir)//'payoff_serial.txt')
call write_arr3(payoff_cpu,trim(savedir)//'payoff_cpu.txt')
call write_arr3(payoff_gpu,trim(savedir)//'payoff_gpu.txt')
end subroutine sub_write
subroutine writescalar_i(x,file_name)
implicit none
!Declare inputs:
integer, intent(in) :: x
character(len=*), intent(in) :: file_name
integer :: unitno,ierr
!Write scalar x into a txt file
open(newunit=unitno, file=file_name, status='replace', iostat=ierr)
if (ierr/=0) then
error stop "Error: writescalar: cannot open file"
endif
write(unitno,*) x
close(unitno)
end subroutine writescalar_i
subroutine write_arr1(x,file_name)
real, intent(in) :: x(:)
character(len=*), intent(in) :: file_name
integer :: unitno, ierr, i1
!Write 1 dim array x into a txt file
open(newunit=unitno, file=file_name, status='replace', iostat=ierr)
if (ierr/=0) then
error stop "Error: write_arr1: cannot open file"
endif
do i1 = 1,size(x,dim=1)
write(unitno,*) x(i1)
enddo
close(unitno)
end subroutine write_arr1
subroutine write_arr2(x,file_name)
real, intent(in) :: x(:,:)
character(len=*), intent(in) :: file_name
integer :: unitno, ierr,i1,i2
!Write 1 dim array x into a txt file
open(newunit=unitno, file=file_name, status='replace', iostat=ierr)
if (ierr/=0) then
error stop "Error: write_arr2: cannot open file"
endif
do i2 = 1,size(x,dim=2)
do i1 = 1,size(x,dim=1)
write(unitno,*) x(i1,i2)
enddo
enddo
close(unitno)
end subroutine write_arr2
subroutine write_arr3(x,file_name)
real, intent(in) :: x(:,:,:)
character(len=*), intent(in) :: file_name
integer :: unitno, ierr,i1,i2,i3
!Write 1 dim array x into a txt file
open(newunit=unitno, file=file_name, status='replace', iostat=ierr)
if (ierr/=0) then
error stop "Error: write_arr3: cannot open file"
endif
do i3 = 1,size(x,dim=3)
do i2 = 1,size(x,dim=2)
do i1 = 1,size(x,dim=1)
write(unitno,*) x(i1,i2,i3)
enddo
enddo
enddo
close(unitno)
end subroutine write_arr3
end program main
I compiled the above code with
ifx /Qopenmp /Qopenmp-targets:spir64 src\main1.f90 -o exe\run_win.exe
and obtained the following results:
***********************************************
Testing parallelization with OpenMP
Grid size 500
Number of procs is 20
1. Serial
2. Parallel with OpenMP on CPU
Grid size 500
Number of CPU procs is 1
Number of OpenMP Device Available: 1
Running on GPU
3. Parallel with GPU
Verify results, CPU parallel
CPU PASSED
Verify results, GPU parallel
ap_c = 181 a_c = 58 z_c = 1
payoff_serial = -103.4175 payoff_cpu = -103.4188
FAILED
Time for serial = 0.000 seconds
Time for parallel CPU = 0.000 seconds
Time for parallel GPU = 0.000 seconds
Parallel_cpu/serial = NaN
Parallel_gpu/serial = NaN
||payoff_cpu-payoff_serial|| = 0.000000
||payoff_gpu-payoff_serial|| = 17476.250000
Results on GPU seem wrong...
payoff_serial(ind) = -279620.250000
payoff_gpu(ind) = -262144.000000