I’m trying to parallelize a panel simulation using OpenMP but the serial code is much faster than the parallel code. Not sure what I’m doing wrong.
The key part of the code is the following:
! Outer loop over individuals
!$omp parallel if (par_fortran==1) default(shared) private(hh_c,t)
!$omp do
do hh_c=1,N_sim
write(*,*) "Doing hh_c = ", hh_c
! Simulate a sequence of shocks (length Tsim) for individual hh_c
call markov_simul(sim_ind_z(:,hh_c),Pi,T_sim,init_z)
! Inner loop over time periods
do t=2,T_sim+1
! Update assets
sim_ind_a(t,hh_c) = pol_ap_ind(sim_ind_a(t-1,hh_c),sim_ind_z(t-1,hh_c))
sim_a(t,hh_c) = a_grid(sim_ind_a(t,hh_c))
enddo
enddo
!$omp enddo
!$omp end parallel
There are two nested loops: the outer loop is over individuals and can be parallelized, the inner loop is over time periods. For each individual, I simulate a long history of assets and shocks. The simulation of the shocks is done by the subroutine markov_simul
contained in module mymod
.
It turns out that the parallel code (which is activated when I set par_fortran=1
) is much slower than the serial code.
Output of serial code:
Starting program...
max # of threads = 20
hello from thread 0
=============================================================
Starting SIMULATION...
=============================================================
Doing hh_c = 1
Doing hh_c = 2
Doing hh_c = 3
Doing hh_c = 4
Doing hh_c = 5
Doing hh_c = 6
Doing hh_c = 7
Doing hh_c = 8
Doing hh_c = 9
Doing hh_c = 10
Doing hh_c = 11
Doing hh_c = 12
Doing hh_c = 13
Doing hh_c = 14
Simulation assets done, now consumption...
=============================================================
SIMULATION runs for 5.00110049999967 seconds.
=============================================================
=============================================================
MODEL MOMENTS
Average assets, simulation: 1.8624
Average consumption, simulation: 1.0189
EXAMPLE SIMULATED SERIES FOR 10 INDIVIDUALS at t=1000
hh_c, sim_a, sim_c
1 1.9960 1.0798
2 1.9960 1.7822
3 1.9960 1.0798
4 1.9960 1.0798
5 1.9960 1.0798
6 0.7936 -0.1707
7 1.9960 1.0798
8 1.9960 1.0798
9 1.9960 1.0798
10 0.7936 0.5317
Program terminated OK!
Output of parallel code:
Starting program...
max # of threads = 20
hello from thread 0
hello from thread 1
hello from thread 4
hello from thread 6
hello from thread 7
hello from thread 8
hello from thread 3
hello from thread 5
hello from thread 9
hello from thread 10
hello from thread 11
hello from thread 12
hello from thread 13
hello from thread 2
=============================================================
Starting SIMULATION...
=============================================================
Doing hh_c = 1
Doing hh_c = 9
Doing hh_c = 4
Doing hh_c = 3
Doing hh_c = 8
Doing hh_c = 13
Doing hh_c = 11
Doing hh_c = 12
Doing hh_c = 14
Doing hh_c = 5
Doing hh_c = 7
Doing hh_c = 10
Doing hh_c = 6
Doing hh_c = 2
Simulation assets done, now consumption...
=============================================================
SIMULATION runs for 134.736892299999 seconds.
=============================================================
=============================================================
MODEL MOMENTS
Average assets, simulation: 1.8623
Average consumption, simulation: 1.0189
EXAMPLE SIMULATED SERIES FOR 10 INDIVIDUALS at t=1000
hh_c, sim_a, sim_c
1 1.9960 1.0798
2 1.9960 1.0798
3 1.9960 1.0798
4 0.7936 -0.1707
5 0.7936 -0.1707
6 1.9960 1.0798
7 1.9960 1.0798
8 1.9960 1.0798
9 0.7936 -0.1707
10 0.7936 0.5317
Program terminated OK!
For those interested, I post below the entire code, compiled with ifort /O3 /Qipo /Qopenmp
module mymod
implicit none
private
public :: linspace, markov_simul, myerror
contains
subroutine myerror(string)
implicit none
character(len=*), intent(in) :: string
write(*,*) "ERROR: ", string
write(*,*) "Program will terminate.."
pause
stop
end subroutine myerror
function linspace(a, b, num) result(array)
! Input parameters
real(8), intent(in) :: a ! The starting value
real(8), intent(in) :: b ! The stopping value
integer, intent(in) :: num ! The number of points
! Output array
real(8) :: array(num)
! Local variables
integer :: i
real(8) :: step
! Error handling for num <= 1
if (num == 1) then
array(1) = a
return
endif
! Calculate the step size
step = (b - a) / real(num - 1, 8)
! Populate the array
do i = 1, num
array(i) = a + (i - 1) * step
enddo
end function linspace
function simul_iid(z_prob,dbg) result(ind_sim)
implicit none
!##### DESCRIPTION #############
!Simulate a random draw from a discretized distribution
! Note: this is fast if the size of z_prob is small
! When z_prob is large (say, more than 100 elements), sample_pmf is faster
!##### INPUTS/OUTPUTS #############
real(8), intent(in) :: z_prob(:) !discretized prob vector, must be >=0 and sum to 1
integer, intent(in) :: dbg !0-1 flag
integer :: ind_sim !index for the draw
!##### LOCALS #############
integer :: n, i
real(8) :: u ! random number (a scalar)
real(8) :: check, prob_sum
!##### ROUTINE CODE #############
n = size(z_prob)
if (dbg==1) then
if ( any(z_prob<0d0) .or. any(z_prob>1d0) ) then
error stop "simul_iid: 0<=prob<=1 violated"
endif
check = sum(z_prob)
if (abs(check-1d0)>1d-6) then
error stop "simul_iid: prob does not sum to one"
endif
endif
call RANDOM_NUMBER(u)
do i = 1,n-1
!Cumulative sum p1+p2+..+pi
prob_sum = sum(z_prob(1:i))
if (u <= prob_sum ) then
ind_sim = i
return
endif !END IF
enddo !end for
! Else, choose the last value
ind_sim = n
end function simul_iid
subroutine markov_simul(ysim,Ptran,Tsim,y1,dbg_in)
implicit none
!##### DESCRIPTION #############
! Routine for simulating a Markov chain with time-independent
! transition matrix (standard case). To increase speed, set
! the debug flag to zero.
! This subroutine calls the function simulate_iid.
!##### INPUTS/OUTPUTS #############
real(8), intent(in) :: Ptran(:,:) ! Transition matrix, dim: (ns,ns)
integer, intent(in) :: Tsim ! Lenght of the time series to simulate
integer, intent(in) :: y1 ! Initial condition
integer, intent(out) :: ysim(:) ! Simulated time series s.t. y_sim(1)=y1, dim: (Tsim)
integer, intent(in),optional :: dbg_in ! 0-1 debug flag
!##### LOCALS #############
integer :: n, i, ns, t, dbg
real(8) :: check, prob_sum
real(8), allocatable :: row_sum(:)
!##### ROUTINE CODE #############
if (present(dbg_in)) then
dbg = dbg_in
else
dbg = 0
endif
ns = size(Ptran,dim=1)
! Check if inputs are correct (only if debug flag = 1)
if (dbg==1) then
if (size(Ptran,1)/=size(Ptran,2)) then
call myerror("markov_simul: Ptran is not square")
endif
if ( any(Ptran<0d0) .or. any(Ptran>1d0) ) then
call myerror("markov_simul: 0<=prob<=1 violated")
endif
row_sum = sum(Ptran,dim=2)
if (maxval(abs(row_sum-1d0)) > 1d-6) then
call myerror("markov_simul: Rows of Ptran do not sum to one")
endif
if (Tsim < 2) then
call myerror("markov_simul: Tsim must be at least 2")
endif
if ( y1<1 .or. y1>ns) then
call myerror("markov_simul: initial condition is out of bounds")
endif
if ( size(ysim)/=Tsim ) then
call myerror("markov_simul: ysim has wrong size")
endif
endif
! Simulate series of length Tsim
ysim(1) = y1
do t = 1,Tsim-1
ysim(t+1) = simul_iid(Ptran(ysim(t),:),dbg)
enddo
end subroutine markov_simul
end module mymod
!================================================================================================
program main
! THIS CODE USES OPENMP
use mymod, only: linspace, markov_simul
use omp_lib
implicit none
! ------------------ Declarations ------------------
integer, parameter :: par_fortran = 1
integer, parameter :: nthreads = 14
integer, parameter :: N_sim = 14
integer, parameter :: T_sim = 30000000
integer, parameter :: n_a=500, n_z=2
real(8) :: t1,t2
real(8) :: Pi(2,2), z_grid(2), b, grid_max, r, w
integer, allocatable :: pol_ap_ind(:,:), sim_ind_a(:,:), sim_ind_z(:,:)
real(8), allocatable :: a_grid(:), sim_a(:,:), sim_c(:,:)
! Variables for the simulation
integer :: hh_c, t, init_a, init_z
! ------------------ End declarations ------------------
write(*,*) "Starting program..."
write(*,*) "max # of threads = ",omp_get_max_threads()
CALL OMP_SET_NUM_THREADS(nthreads)
!$omp parallel if (par_fortran==1)
write(*,*) "hello from thread ",omp_get_thread_num() !," of ",nthreads
!$omp end parallel
! ---------------------- Set economic parameter values ----------------------!
Pi(1,:) = [0.6d0, 0.4d0]
Pi(2,:) = [.05d0, 0.95d0]
z_grid = [0.5d0, 1.0d0]
r = 0.04d0
w = 1.0d0
b = 0.0d0
grid_max = 4.0d0
! Set grid for assets
allocate(a_grid(n_a))
a_grid = linspace(b,grid_max,n_a)
! Set policy function for assets (index)
allocate(pol_ap_ind(n_a,n_z))
pol_ap_ind(:,1) = n_a/5 ! Low shock
pol_ap_ind(:,2) = n_a/2 ! High shock
write(*,*) "============================================================="
write(*,*) "Starting SIMULATION..."
write(*,*) "============================================================="
t1 = omp_get_wtime()
allocate(sim_a(T_sim+1,N_sim),sim_c(T_sim,N_sim),sim_ind_a(T_sim+1,N_sim), sim_ind_z(T_sim,N_sim))
! Initial conditions for state variables "a" and "z"
init_a = 1
init_z = n_z/2
! Recall: first index is time, second index is individual
sim_ind_a(1,:) = init_a
! Loop over time periods to simulate assets
!t1 = omp_get_wtime()
! Outer loop over individuals
!$omp parallel if (par_fortran==1) default(shared) private(hh_c,t)
!$omp do
do hh_c=1,N_sim
write(*,*) "Doing hh_c = ", hh_c
! Simulate a sequence of shocks (length Tsim) for individual hh_c
call markov_simul(sim_ind_z(:,hh_c),Pi,T_sim,init_z)
! Inner loop over time periods
do t=2,T_sim+1
! Update assets
sim_ind_a(t,hh_c) = pol_ap_ind(sim_ind_a(t-1,hh_c),sim_ind_z(t-1,hh_c))
sim_a(t,hh_c) = a_grid(sim_ind_a(t,hh_c))
enddo
enddo
!$omp enddo
!$omp end parallel
!t2 = omp_get_wtime()
!write(*,*) 'Time for simulating asset: ', t2-t1
write(*,*) 'Simulation assets done, now consumption...'
!pause
! Update consumption
do hh_c=1,N_sim
do t=1,T_sim
sim_c(t,hh_c) = (1d0+r)*sim_a(t,hh_c)+w*z_grid(sim_ind_z(t,hh_c))-sim_a(t+1,hh_c)
enddo
enddo
t2 = omp_get_wtime()
write(*,*) "============================================================="
write(*,*) 'SIMULATION runs for ',t2-t1,' seconds.'
write(*,*) "============================================================="
write(*,*) "============================================================="
write(*,*) 'MODEL MOMENTS'
write(*,'(X,A,F10.4)') "Average assets, simulation: ", sum(sim_a)/size(sim_a)
write(*,'(X,A,F10.4)') "Average consumption, simulation: ", sum(sim_c)/size(sim_c)
write(*,*) 'EXAMPLE SIMULATED SERIES FOR 10 INDIVIDUALS at t=1000'
write(*,'(X,A)') "hh_c, sim_a, sim_c"
do hh_c=1,10
write(*,'(I5,10F10.4)') hh_c, sim_a(1000,hh_c), sim_c(1000,hh_c)
enddo
deallocate(a_grid,pol_ap_ind,sim_a,sim_c,sim_ind_z)
write(*,*) "Program terminated OK!"
end program main
Any help is greatly appreciated!