# Fortran openMP - make code faster

Dear all, I’m working on a code that makes use of openMP. Unfortunately the code is still too slow for my purposes. I would greatly appreciate any suggestion on how to speed it up.

I tried to explain where the bottleneck is in the comments in the code: in step 4 there are three nested loops over x_c,b_c,k_c and inside there is another loop for a maximization. I parallelized with openMP only the outermost loop, the one over x_c. I tried to collapse all thee loops and it works but is slower. Just to give you a better idea about the size of the problem:
nx = 60
nb = 80
nk = 100
and I have a PC with 8 physical cores.
PLEASE SEE comment below for a minimum working example

!=============================================================!

module mymod
implicit none
contains

PURE FUNCTION locate(xx,x)
IMPLICIT NONE
REAL(8), DIMENSION(:), INTENT(IN) :: xx
REAL(8), INTENT(IN) :: x
INTEGER :: locate
INTEGER :: n,il,im,iu
n=size(xx)
il=0
iu=n+1
do
if (iu-il <= 1) exit
im=(iu+il)/2
if (x >= xx(im)) then
il=im
else
iu=im
end if
end do
if (x == xx(1)) then
locate=1
else if (x == xx(n)) then
locate=n-1
else
locate=il
end if
END FUNCTION locate
!===============================================================================!

PURE FUNCTION locate_equi(x_grid,xi) result(jl)
IMPLICIT NONE
! Declare inputs
REAL(8), INTENT(IN) :: x_grid(:)
REAL(8), INTENT(IN) :: xi
! Declare function result
INTEGER :: jl
INTEGER :: n
REAL(8) :: step, xi_min

n      = size(x_grid)
step   = x_grid(2)-x_grid(1)
xi_min = xi-x_grid(1)

jl     = min(n-1,max(1,floor(xi_min/step)+1))

END FUNCTION locate_equi
!===============================================================================!

pure subroutine find_loc(jstar,omega, a_grid,aopt)
implicit none
! Declare Inputs:
real(8), intent(in) :: a_grid(:)
real(8), intent(in) :: aopt
! Declare outputs:
integer, intent(out) :: jstar
real(8), intent(out) :: omega
! Declare local variables:
integer :: na

!Find j s.t. a_grid(j)<=aopt<a_grid(j+1)
!for j=1,..,N-1
! omega is the weight on a_grid(j) and lies in [0,1]

na = size(a_grid)

jstar = max(min(locate_equi(a_grid,aopt),na-1),1)
!Weight on a_grid(j)
omega = (a_grid(jstar+1)-aopt)/(a_grid(jstar+1)-a_grid(jstar))
!omega = max(min(omega,1.0d0),0.0d0)

end subroutine find_loc

FUNCTION linint(x,y,xi)
! Purpose: linear interpolation of function y on grid x at interpolation point xi
!          To make it pure, cannot use PRINT or STOP statements
IMPLICIT NONE
REAL(8), DIMENSION(:), INTENT(IN) :: x,y
REAL(8), INTENT(IN) :: xi
REAL(8) :: linint
REAL(8) :: a,b,d
INTEGER :: n,i
n=size(x)
IF (size(y)/=n) THEN
PRINT *, 'linint: x and y must be of the same size'
PAUSE
STOP 'program terminated by linint'
END IF
! If the grid x is NOT equally spaced, use "locate"
! otherwise use "locate_equi", faster.
!i=max(min(locate(x,xi),n-1),1)
i = locate_equi(x,xi)
d = x(i+1)-x(i)
!IF (d == 0.0) STOP 'bad x input in splint'
a      = (x(i+1)-xi)/d
b      = (xi-x(i))/d
linint = a*y(i)+b*y(i+1)
END FUNCTION linint

implicit none
! Variables
! INPUTS
! kprime: Next-period capital, scalar
! k: Current period capital, scalar
! theta: resale value, in (0,1)
real(8), intent(in) :: kprime, k
real(8), intent(in) :: theta, delta

if (kprime>=(1.0d0-delta)*k) then
else
endif

end module mymod
!==============================================================!

subroutine sub_vfi_onestep(val_c_new,pol_kp,&
val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,do_howard,n_howard,nk,nb,nx)

use omp_lib
implicit none
! Declare input variables
integer(4), intent(in) :: nk,nb,nx
integer(4), intent(in) :: do_howard,n_howard
real(8), intent(in)    :: val_c(nk,nb,nx)
real(8), intent(in)    :: val0_u(nk,nb,nx), kp_bar(nk,nb,nx)
real(8), intent(in)    :: B_hat(nk,nx), profit_mat(nk,nx)
real(8), intent(in)    :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx)
real(8), intent(in)    :: theta, q, delta, psi
! Declare output variables
real(8), intent(out)    :: val_c_new(nk,nb,nx)
integer(4), intent(out) :: pol_kp(nk,nb,nx)
! Declare local variables
integer(4) :: k_c, b_c, x_c, xp_c, kp_c, max_ind,h_c
logical :: liq1, liq2
real(8) :: k_val, b_val, profit_val, kp_ub, kp_val, bprime, v0_int
real(8), allocatable :: EVx(:,:), b_grid_kp(:), rhs_vec(:)
real(8), allocatable :: val0_c(:,:,:), val0(:,:,:)
integer, allocatable :: is_c(:,:,:)

! Start execution

! Initialize outputs
val_c_new = 0.0d0 !(nk,nb,nx)
pol_kp    = 1     !(nk,nb,nx)

! STEP 2 - Impose liquidation, eq. (24)

! Value of constrained firm before exit
allocate(val0_c(nk,nb,nx))
val0_c = 0.0d0
do x_c = 1,nx
do b_c = 1,nb
do k_c = 1,nk
k_val = k_grid(k_c)
b_val = b_grid(k_c,b_c)
profit_val = profit_mat(k_c,x_c)
liq1 = profit_val-b_val+theta*(1.0d0-delta)*k_val<0.0d0
liq2 = val_c(k_c,b_c,x_c)<theta*(1.0d0-delta)*k_val-b_val
if (liq1 .or. liq2) then
val0_c(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
else
val0_c(k_c,b_c,x_c) = val_c(k_c,b_c,x_c)
endif
enddo
enddo
enddo

! STEP 3 - Do equation (23)
allocate(val0(nk,nb,nx),is_c(nk,nb,nx))
val0 = 0.0d0
is_c = 1
do x_c = 1,nx
do b_c = 1,nb
do k_c = 1,nk
b_val = b_grid(k_c,b_c)
if (b_val<=B_hat(k_c,x_c)) then
val0(k_c,b_c,x_c) = val0_u(k_c,b_c,x_c)
is_c(k_c,b_c,x_c) = 0
else
val0(k_c,b_c,x_c) = val0_c(k_c,b_c,x_c)
endif
enddo
enddo
enddo

! STEP 4 - Solve eq. (25)

allocate(EVx(nk,nb),rhs_vec(nk))
! The bottleneck in the code is this part: three nested loops over (x,b,k) with a maximization over k'
! Maximize over k' on the grid
!$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,& !$ kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind)
!$omp do collapse(1) do x_c = 1,nx ! Compute expected value in eq. (25) EVx = 0.0d0 !(k',b') do xp_c = 1,nx EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c) enddo do b_c = 1,nb do k_c = 1,nk k_val = k_grid(k_c) b_val = b_grid(k_c,b_c) profit_val = profit_mat(k_c,x_c) kp_ub = kp_bar(k_c,b_c,x_c) if (is_c(k_c,b_c,x_c)==1 .and. profit_val-b_val+theta*(1.0d0-delta)*k_val>=0.0d0) then ! If no forced liquidation ! Set lower and upper bound for maximization over k' in (25) ! b'>=b_grid(1) !kp_lb = q*b_grid_k(1)+profit_val-b_val+k_val ! Create the RHS of Bellman eq. 25 for each k' \in k_grid rhs_vec = -100000.0d0 do kp_c = 1,nk kp_val = k_grid(kp_c) b_grid_kp = b_grid(kp_c,:) bprime = max(b_grid_kp(1), (1.0d0/q)*(b_val-profit_val+fun_adjcost(kp_val,k_val,theta,delta))) if (kp_val<=kp_ub) then v0_int = linint(b_grid_kp,EVx(kp_c,:),bprime) rhs_vec(kp_c) = q*(psi*(theta*(1.0d0-delta)*kp_val-bprime)+(1.0d0-psi)*v0_int) else exit endif enddo !kp_c max_ind = maxloc(rhs_vec,dim=1) pol_kp(k_c,b_c,x_c) = max_ind val_c_new(k_c,b_c,x_c) = rhs_vec(max_ind) else ! forced liquidation: val_c is irrelevant val_c_new(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val endif enddo !k enddo !b enddo !x !$omp enddo
!$omp end parallel ! Now do Howard - I do NOT include the subroutine sub_howard since it is not important for the performance of the code. ! Howard acceleration if (do_howard==1) then call sub_howard(val_c_new,pol_kp,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,n_howard,nk,nb,nx) endif end subroutine sub_vfi_onestep !===========================================================! 1 Like It might help to post a complete program - now we see the relevant subroutines and functions, but nobody can experiment with it, as there is no main program. While I am no expert in OpenMP, I do see a bunch of shared arrays. I do not know if they are read-only, but a first glance at the code suggests quite a few are. In that the copyin clause might help to have “unshared”, so that getting different parts of the arrays do not cause obstruction. Just a thought, mind you. 2 Likes Scanning through this quickly: Right now you’re only parallelizing over the outer nx = 60 cycles, you’d be better off parallelizing over the inner nb * nk = 8000 cycles as I don’t see any obvious signs of a carried dependency. (!$omp do collapse(1) is a useless directive btw.)

Do you really need to have EVx completely precomputed before the inner two cycles?

I’d also look into if you can restructure/sort the data to remove if statements. This is critical for GPUs, but the benefits will filter back into CPUs (modern CPUs are good at branch prediction, but no need for branch prediction in the first place is even better).

1 Like

@wphuhn-intel Thanks for your tips! It is true that there no carried dependencies in the three loops (x,b,k). I tried parallelizing over all three with do collapse(3) but the run time is significantly slower. Indeed, when collapsing all three loops, I had to move EVx inside and I guess this is one reason why the code is slower.
Regarding your question “Do you really need to have EVx completely precomputed before the inner two cycles?”: I don’t have to but since EVx depends only on x and not on (b,k), I think it is faster to compute it before opening the inner loops on (b,k), right?

I will try to generate a MWE. Here is the change that I’ve done which however resulted in slower performance:

!$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,& !$ kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind)
!$omp do collapse(3) do x_c = 1,nx do b_c = 1,nb do k_c = 1,nk ! Compute expected value in eq. (25) EVx = 0.0d0 !(k',b') do xp_c = 1,nx EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c) enddo k_val = k_grid(k_c) ! rest of the code omitted for brevity enddo !k enddo !b enddo !x !$omp enddo
!$omp end parallel @Arjen You’re absolutely right, here is a minimum working example: !=============================================================! module mymod implicit none contains function linspace(my_start, my_stop, n) ! Purpose: replicate Matlab function <linspace> ! Originally written by Arnau Valladares-Esteban implicit none ! Inputs: integer :: n real(8) :: my_start, my_stop ! Function result: real(8) :: linspace(n) ! Locals: integer :: i real(8) :: step, grid(n) if (n == 1) then grid(n) = (my_start+my_stop)/2.0d0 elseif (n>=2) then grid(1) = my_start if (n>2) then step = (my_stop-my_start)/(real(n-1,8)) do i = 2, n-1 grid(i) = grid(i-1) + step end do end if grid(n) = my_stop endif linspace = grid end function linspace !===============================================================================! PURE FUNCTION locate(xx,x) IMPLICIT NONE REAL(8), DIMENSION(:), INTENT(IN) :: xx REAL(8), INTENT(IN) :: x INTEGER :: locate INTEGER :: n,il,im,iu n=size(xx) il=0 iu=n+1 do if (iu-il <= 1) exit im=(iu+il)/2 if (x >= xx(im)) then il=im else iu=im end if end do if (x == xx(1)) then locate=1 else if (x == xx(n)) then locate=n-1 else locate=il end if END FUNCTION locate !===============================================================================! PURE FUNCTION locate_equi(x_grid,xi) result(jl) IMPLICIT NONE ! Declare inputs REAL(8), INTENT(IN) :: x_grid(:) REAL(8), INTENT(IN) :: xi ! Declare function result INTEGER :: jl INTEGER :: n REAL(8) :: step, xi_min n = size(x_grid) step = x_grid(2)-x_grid(1) xi_min = xi-x_grid(1) jl = min(n-1,max(1,floor(xi_min/step)+1)) END FUNCTION locate_equi !===============================================================================! pure subroutine find_loc(jstar,omega, a_grid,aopt) implicit none ! Declare Inputs: real(8), intent(in) :: a_grid(:) real(8), intent(in) :: aopt ! Declare outputs: integer, intent(out) :: jstar real(8), intent(out) :: omega ! Declare local variables: integer :: na !Find j s.t. a_grid(j)<=aopt<a_grid(j+1) !for j=1,..,N-1 ! omega is the weight on a_grid(j) and lies in [0,1] na = size(a_grid) jstar = locate_equi(a_grid,aopt) !Weight on a_grid(j) omega = (a_grid(jstar+1)-aopt)/(a_grid(jstar+1)-a_grid(jstar)) !omega = max(min(omega,1.0d0),0.0d0) end subroutine find_loc FUNCTION linint(x,y,xi) ! Purpose: linear interpolation of function y on grid x at interpolation point xi ! To make it pure, cannot use PRINT or STOP statements IMPLICIT NONE REAL(8), DIMENSION(:), INTENT(IN) :: x,y REAL(8), INTENT(IN) :: xi REAL(8) :: linint REAL(8) :: a,b,d INTEGER :: n,i n=size(x) IF (size(y)/=n) THEN PRINT *, 'linint: x and y must be of the same size' PAUSE STOP 'program terminated by linint' END IF ! If the grid x is NOT equally spaced, use "locate" ! otherwise use "locate_equi", faster. !i=max(min(locate(x,xi),n-1),1) i = locate_equi(x,xi) d = x(i+1)-x(i) !IF (d == 0.0) STOP 'bad x input in splint' a = (x(i+1)-xi)/d b = (xi-x(i))/d linint = a*y(i)+b*y(i+1) END FUNCTION linint function fun_adjcost(kprime,k,theta,delta) result(adj) implicit none ! Variables ! INPUTS ! kprime: Next-period capital, scalar ! k: Current period capital, scalar ! theta: Resale value, in (0,1) real(8), intent(in) :: kprime, k real(8), intent(in) :: theta, delta real(8) :: adj ! Body of fun_adjcost if (kprime>=(1.0d0-delta)*k) then adj = kprime-(1.0d0-delta)*k else adj = theta*(kprime-(1.0d0-delta)*k) endif end function fun_adjcost end module mymod !===============================================================================! subroutine sub_vfi_onestep(val_c_new,pol_kp,& val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,nk,nb,nx) use omp_lib use mymod, only: linint, fun_adjcost implicit none ! Declare input variables integer(4), intent(in) :: nk,nb,nx real(8), intent(in) :: val_c(nk,nb,nx) real(8), intent(in) :: val0_u(nk,nb,nx), kp_bar(nk,nb,nx) real(8), intent(in) :: B_hat(nk,nx), profit_mat(nk,nx) real(8), intent(in) :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx) real(8), intent(in) :: theta, q, delta, psi ! Declare output variables real(8), intent(out) :: val_c_new(nk,nb,nx) integer(4), intent(out) :: pol_kp(nk,nb,nx) ! Declare local variables integer(4) :: k_c, b_c, x_c, xp_c, kp_c, max_ind,h_c logical :: liq1, liq2 real(8) :: k_val, b_val, profit_val, kp_ub, kp_val, bprime, v0_int real(8), allocatable :: EVx(:,:), b_grid_kp(:), rhs_vec(:) real(8), allocatable :: val0_c(:,:,:), val0(:,:,:) integer, allocatable :: is_c(:,:,:) ! The number of threads cannot be greater than the number of cores on the computer. ! For haomin's imac, this is 10. call omp_set_num_threads(8) ! Initialize outputs val_c_new = 0.0d0 !(nk,nb,nx) pol_kp = 1 !(nk,nb,nx) ! STEP 2 - Impose liquidation, eq. (24) ! Value of constrained firm before exit allocate(val0_c(nk,nb,nx)) val0_c = 0.0d0 do x_c = 1,nx do b_c = 1,nb do k_c = 1,nk k_val = k_grid(k_c) b_val = b_grid(k_c,b_c) profit_val = profit_mat(k_c,x_c) liq1 = profit_val-b_val+theta*(1.0d0-delta)*k_val<0.0d0 liq2 = val_c(k_c,b_c,x_c)<theta*(1.0d0-delta)*k_val-b_val if (liq1 .or. liq2) then val0_c(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val else val0_c(k_c,b_c,x_c) = val_c(k_c,b_c,x_c) endif enddo enddo enddo ! STEP 3 - Do equation (23) allocate(val0(nk,nb,nx),is_c(nk,nb,nx)) val0 = 0.0d0 is_c = 1 do x_c = 1,nx do b_c = 1,nb do k_c = 1,nk b_val = b_grid(k_c,b_c) if (b_val<=B_hat(k_c,x_c)) then val0(k_c,b_c,x_c) = val0_u(k_c,b_c,x_c) is_c(k_c,b_c,x_c) = 0 else val0(k_c,b_c,x_c) = val0_c(k_c,b_c,x_c) endif enddo enddo enddo ! STEP 4 - Solve eq. (25) allocate(EVx(nk,nb),rhs_vec(nk)) ! Maximize over k' on the grid !$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,&
!$kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind) !$omp do collapse(1)
do x_c = 1,nx
! Compute expected value in eq. (25)
EVx = 0.0d0 !(k',b')
do xp_c = 1,nx
EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c)
enddo
do b_c = 1,nb
do k_c = 1,nk
k_val = k_grid(k_c)
b_val = b_grid(k_c,b_c)
profit_val = profit_mat(k_c,x_c)
kp_ub = kp_bar(k_c,b_c,x_c)

if (is_c(k_c,b_c,x_c)==1 .and. profit_val-b_val+theta*(1.0d0-delta)*k_val>=0.0d0) then
! If no forced liquidation
! Set lower and upper bound for maximization over k' in (25)
! b'>=b_grid(1)
!kp_lb = q*b_grid_k(1)+profit_val-b_val+k_val

! Create the RHS of Bellman eq. 25 for each k' \in k_grid
rhs_vec = -100000.0d0
do kp_c = 1,nk
kp_val    = k_grid(kp_c)
b_grid_kp = b_grid(kp_c,:)
if (kp_val<=kp_ub) then
v0_int = linint(b_grid_kp,EVx(kp_c,:),bprime)
rhs_vec(kp_c) = q*(psi*(theta*(1.0d0-delta)*kp_val-bprime)+(1.0d0-psi)*v0_int)
else
exit
endif
enddo !kp_c
max_ind = maxloc(rhs_vec,dim=1)
pol_kp(k_c,b_c,x_c)    = max_ind
val_c_new(k_c,b_c,x_c) = rhs_vec(max_ind)
else
! forced liquidation: val_c is irrelevant
val_c_new(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
endif
enddo !k
enddo !b
enddo !x
!$omp enddo !$omp end parallel

end subroutine sub_vfi_onestep
!============================================================================!

program main
use mymod, only: linspace
use omp_lib
implicit none

external :: sub_vfi_onestep ! better to put this into a module, but I can't
integer, parameter :: nx = 60  !should be 60  !!!!!!!!!!!
integer, parameter :: nb = 80  !should be 80  !!!!!!!!!!!
integer, parameter :: nk = 100 !should be 100 !!!!!!!!!!!
real    :: t_end, t_start
integer :: x_c,k_c,istat
real(8) :: theta,q,delta,psi
real(8) :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx)
real(8) :: B_hat(nk,nx), profit_mat(nk,nx)
real(8), allocatable :: val_c(:,:,:), val0_u(:,:,:), kp_bar(:,:,:), val_c_new(:,:,:)
integer(4), allocatable :: pol_kp(:,:,:)

write(*,*) "Hello, sub_vfi_onestep!"

! Create fake data

theta  = 0.9107133678d0
q      = 0.989d0
delta  = 0.015d0
psi    = 0.0029334045302853d0

! Equally spaced grids
k_grid = linspace(0.01d0,200.0d0,nk)

do k_c = 1,nk
b_grid(k_c,:) = linspace(-100d0,50d0,nb)
enddo

! Right stochastic matrix pi_x, make sure all rows sum to one
CALL RANDOM_NUMBER(pi_x)
do x_c = 1,nx
pi_x(x_c,:) = pi_x(x_c,:)/sum(pi_x(x_c,:))
enddo

write(*,*) "sum(pi_x) rows = ", sum(pi_x,dim=2)

allocate(val_c(nk,nb,nx), val0_u(nk,nb,nx), kp_bar(nk,nb,nx), val_c_new(nk,nb,nx),pol_kp(nk,nb,nx),STAT=istat)
if (istat/=0) then
write(*,*) "Allocation failed!"
endif

CALL RANDOM_NUMBER(B_hat)
CALL RANDOM_NUMBER(profit_mat)
CALL RANDOM_NUMBER(val0_u)
CALL RANDOM_NUMBER(kp_bar)
CALL RANDOM_NUMBER(kp_bar)
CALL RANDOM_NUMBER(val_c)

! Call the computational subroutine "sub_vfi_onestep"
! NOTE: would be better to use assumed shape dummy arguments but I can't change this
write(*,*) "Calling sub_vfi_onestep.."
t_start = omp_get_wtime()

call sub_vfi_onestep(val_c_new,pol_kp,&
val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,nk,nb,nx)

t_end = omp_get_wtime()
write(*,*) "..done!"

write(*,*) 'FORTRAN executable runs for',real(t_end-t_start),'seconds.'

pause

end program main

!===========================================================!

To compile the code:
ifort /Qopenmp /O3 /Qipo /Qprec-div- sub_vfi_onestep_mex.f90 -o run.exe