Dear all, I’m working on a code that makes use of openMP. Unfortunately the code is still too slow for my purposes. I would greatly appreciate any suggestion on how to speed it up.
I tried to explain where the bottleneck is in the comments in the code: in step 4 there are three nested loops over x_c,b_c,k_c and inside there is another loop for a maximization. I parallelized with openMP only the outermost loop, the one over x_c. I tried to collapse all thee loops and it works but is slower. Just to give you a better idea about the size of the problem:
nx = 60
nb = 80
nk = 100
and I have a PC with 8 physical cores.
PLEASE SEE comment below for a minimum working example
!=============================================================!
module mymod
implicit none
contains
PURE FUNCTION locate(xx,x)
IMPLICIT NONE
REAL(8), DIMENSION(:), INTENT(IN) :: xx
REAL(8), INTENT(IN) :: x
INTEGER :: locate
INTEGER :: n,il,im,iu
n=size(xx)
il=0
iu=n+1
do
if (iu-il <= 1) exit
im=(iu+il)/2
if (x >= xx(im)) then
il=im
else
iu=im
end if
end do
if (x == xx(1)) then
locate=1
else if (x == xx(n)) then
locate=n-1
else
locate=il
end if
END FUNCTION locate
!===============================================================================!
PURE FUNCTION locate_equi(x_grid,xi) result(jl)
IMPLICIT NONE
! Declare inputs
REAL(8), INTENT(IN) :: x_grid(:)
REAL(8), INTENT(IN) :: xi
! Declare function result
INTEGER :: jl
INTEGER :: n
REAL(8) :: step, xi_min
n = size(x_grid)
step = x_grid(2)-x_grid(1)
xi_min = xi-x_grid(1)
jl = min(n-1,max(1,floor(xi_min/step)+1))
END FUNCTION locate_equi
!===============================================================================!
pure subroutine find_loc(jstar,omega, a_grid,aopt)
implicit none
! Declare Inputs:
real(8), intent(in) :: a_grid(:)
real(8), intent(in) :: aopt
! Declare outputs:
integer, intent(out) :: jstar
real(8), intent(out) :: omega
! Declare local variables:
integer :: na
!Find j s.t. a_grid(j)<=aopt<a_grid(j+1)
!for j=1,..,N-1
! omega is the weight on a_grid(j) and lies in [0,1]
na = size(a_grid)
jstar = max(min(locate_equi(a_grid,aopt),na-1),1)
!Weight on a_grid(j)
omega = (a_grid(jstar+1)-aopt)/(a_grid(jstar+1)-a_grid(jstar))
!omega = max(min(omega,1.0d0),0.0d0)
end subroutine find_loc
FUNCTION linint(x,y,xi)
! Purpose: linear interpolation of function y on grid x at interpolation point xi
! To make it pure, cannot use PRINT or STOP statements
IMPLICIT NONE
REAL(8), DIMENSION(:), INTENT(IN) :: x,y
REAL(8), INTENT(IN) :: xi
REAL(8) :: linint
REAL(8) :: a,b,d
INTEGER :: n,i
n=size(x)
IF (size(y)/=n) THEN
PRINT *, 'linint: x and y must be of the same size'
PAUSE
STOP 'program terminated by linint'
END IF
! If the grid x is NOT equally spaced, use "locate"
! otherwise use "locate_equi", faster.
!i=max(min(locate(x,xi),n-1),1)
i = locate_equi(x,xi)
d = x(i+1)-x(i)
!IF (d == 0.0) STOP 'bad x input in splint'
a = (x(i+1)-xi)/d
b = (xi-x(i))/d
linint = a*y(i)+b*y(i+1)
END FUNCTION linint
function fun_adjcost(kprime,k,theta,delta) result(adj)
implicit none
! Variables
! INPUTS
! kprime: Next-period capital, scalar
! k: Current period capital, scalar
! theta: resale value, in (0,1)
real(8), intent(in) :: kprime, k
real(8), intent(in) :: theta, delta
real(8) :: adj
! Body of fun_adjcost
if (kprime>=(1.0d0-delta)*k) then
adj = kprime-(1.0d0-delta)*k
else
adj = theta*(kprime-(1.0d0-delta)*k)
endif
end function fun_adjcost
end module mymod
!==============================================================!
subroutine sub_vfi_onestep(val_c_new,pol_kp,&
val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,do_howard,n_howard,nk,nb,nx)
use omp_lib
use mymod, only: linint, fun_adjcost
implicit none
! Declare input variables
integer(4), intent(in) :: nk,nb,nx
integer(4), intent(in) :: do_howard,n_howard
real(8), intent(in) :: val_c(nk,nb,nx)
real(8), intent(in) :: val0_u(nk,nb,nx), kp_bar(nk,nb,nx)
real(8), intent(in) :: B_hat(nk,nx), profit_mat(nk,nx)
real(8), intent(in) :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx)
real(8), intent(in) :: theta, q, delta, psi
! Declare output variables
real(8), intent(out) :: val_c_new(nk,nb,nx)
integer(4), intent(out) :: pol_kp(nk,nb,nx)
! Declare local variables
integer(4) :: k_c, b_c, x_c, xp_c, kp_c, max_ind,h_c
logical :: liq1, liq2
real(8) :: k_val, b_val, profit_val, kp_ub, kp_val, bprime, v0_int
real(8), allocatable :: EVx(:,:), b_grid_kp(:), rhs_vec(:)
real(8), allocatable :: val0_c(:,:,:), val0(:,:,:)
integer, allocatable :: is_c(:,:,:)
! Start execution
call omp_set_num_threads(8)
! Initialize outputs
val_c_new = 0.0d0 !(nk,nb,nx)
pol_kp = 1 !(nk,nb,nx)
! STEP 2 - Impose liquidation, eq. (24)
! Value of constrained firm before exit
allocate(val0_c(nk,nb,nx))
val0_c = 0.0d0
do x_c = 1,nx
do b_c = 1,nb
do k_c = 1,nk
k_val = k_grid(k_c)
b_val = b_grid(k_c,b_c)
profit_val = profit_mat(k_c,x_c)
liq1 = profit_val-b_val+theta*(1.0d0-delta)*k_val<0.0d0
liq2 = val_c(k_c,b_c,x_c)<theta*(1.0d0-delta)*k_val-b_val
if (liq1 .or. liq2) then
val0_c(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
else
val0_c(k_c,b_c,x_c) = val_c(k_c,b_c,x_c)
endif
enddo
enddo
enddo
! STEP 3 - Do equation (23)
allocate(val0(nk,nb,nx),is_c(nk,nb,nx))
val0 = 0.0d0
is_c = 1
do x_c = 1,nx
do b_c = 1,nb
do k_c = 1,nk
b_val = b_grid(k_c,b_c)
if (b_val<=B_hat(k_c,x_c)) then
val0(k_c,b_c,x_c) = val0_u(k_c,b_c,x_c)
is_c(k_c,b_c,x_c) = 0
else
val0(k_c,b_c,x_c) = val0_c(k_c,b_c,x_c)
endif
enddo
enddo
enddo
! STEP 4 - Solve eq. (25)
allocate(EVx(nk,nb),rhs_vec(nk))
! The bottleneck in the code is this part: three nested loops over (x,b,k) with a maximization over k'
! Maximize over k' on the grid
!$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,&
!$ kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind)
!$omp do collapse(1)
do x_c = 1,nx
! Compute expected value in eq. (25)
EVx = 0.0d0 !(k',b')
do xp_c = 1,nx
EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c)
enddo
do b_c = 1,nb
do k_c = 1,nk
k_val = k_grid(k_c)
b_val = b_grid(k_c,b_c)
profit_val = profit_mat(k_c,x_c)
kp_ub = kp_bar(k_c,b_c,x_c)
if (is_c(k_c,b_c,x_c)==1 .and. profit_val-b_val+theta*(1.0d0-delta)*k_val>=0.0d0) then
! If no forced liquidation
! Set lower and upper bound for maximization over k' in (25)
! b'>=b_grid(1)
!kp_lb = q*b_grid_k(1)+profit_val-b_val+k_val
! Create the RHS of Bellman eq. 25 for each k' \in k_grid
rhs_vec = -100000.0d0
do kp_c = 1,nk
kp_val = k_grid(kp_c)
b_grid_kp = b_grid(kp_c,:)
bprime = max(b_grid_kp(1), (1.0d0/q)*(b_val-profit_val+fun_adjcost(kp_val,k_val,theta,delta)))
if (kp_val<=kp_ub) then
v0_int = linint(b_grid_kp,EVx(kp_c,:),bprime)
rhs_vec(kp_c) = q*(psi*(theta*(1.0d0-delta)*kp_val-bprime)+(1.0d0-psi)*v0_int)
else
exit
endif
enddo !kp_c
max_ind = maxloc(rhs_vec,dim=1)
pol_kp(k_c,b_c,x_c) = max_ind
val_c_new(k_c,b_c,x_c) = rhs_vec(max_ind)
else
! forced liquidation: val_c is irrelevant
val_c_new(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
endif
enddo !k
enddo !b
enddo !x
!$omp enddo
!$omp end parallel
! Now do Howard - I do NOT include the subroutine sub_howard since it is not important for the performance of the code.
! Howard acceleration
if (do_howard==1) then
call sub_howard(val_c_new,pol_kp,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,n_howard,nk,nb,nx)
endif
end subroutine sub_vfi_onestep
!===========================================================!