Fortran openMP - make code faster

Dear all, I’m working on a code that makes use of openMP. Unfortunately the code is still too slow for my purposes. I would greatly appreciate any suggestion on how to speed it up.

I tried to explain where the bottleneck is in the comments in the code: in step 4 there are three nested loops over x_c,b_c,k_c and inside there is another loop for a maximization. I parallelized with openMP only the outermost loop, the one over x_c. I tried to collapse all thee loops and it works but is slower. Just to give you a better idea about the size of the problem:
nx = 60
nb = 80
nk = 100
and I have a PC with 8 physical cores.
PLEASE SEE comment below for a minimum working example

!=============================================================!

module mymod
implicit none
    contains

    
PURE FUNCTION locate(xx,x)
    IMPLICIT NONE
    REAL(8), DIMENSION(:), INTENT(IN) :: xx
    REAL(8), INTENT(IN) :: x
    INTEGER :: locate
    INTEGER :: n,il,im,iu
    n=size(xx)
    il=0
    iu=n+1
    do
	    if (iu-il <= 1) exit
	    im=(iu+il)/2
	    if (x >= xx(im)) then
		    il=im
	    else
		    iu=im
	    end if
    end do
    if (x == xx(1)) then
	    locate=1
    else if (x == xx(n)) then
	    locate=n-1
    else
	    locate=il
    end if
END FUNCTION locate
!===============================================================================!

PURE FUNCTION locate_equi(x_grid,xi) result(jl)
    IMPLICIT NONE
	! Declare inputs
    REAL(8), INTENT(IN) :: x_grid(:)
    REAL(8), INTENT(IN) :: xi
	! Declare function result 
    INTEGER :: jl
    INTEGER :: n
	REAL(8) :: step, xi_min 
    
	n      = size(x_grid)
    step   = x_grid(2)-x_grid(1)
	xi_min = xi-x_grid(1)
	
	jl     = min(n-1,max(1,floor(xi_min/step)+1))
	
END FUNCTION locate_equi
!===============================================================================!

pure subroutine find_loc(jstar,omega, a_grid,aopt) 
     implicit none
     ! Declare Inputs:
     real(8), intent(in) :: a_grid(:)
     real(8), intent(in) :: aopt
     ! Declare outputs:
     integer, intent(out) :: jstar
     real(8), intent(out) :: omega
     ! Declare local variables:
     integer :: na
     
     !Find j s.t. a_grid(j)<=aopt<a_grid(j+1)
     !for j=1,..,N-1
     ! omega is the weight on a_grid(j) and lies in [0,1]
 
     na = size(a_grid)
 
     jstar = max(min(locate_equi(a_grid,aopt),na-1),1)
     !Weight on a_grid(j)
     omega = (a_grid(jstar+1)-aopt)/(a_grid(jstar+1)-a_grid(jstar))
     !omega = max(min(omega,1.0d0),0.0d0)
     
end subroutine find_loc

FUNCTION linint(x,y,xi)
    ! Purpose: linear interpolation of function y on grid x at interpolation point xi
    !          To make it pure, cannot use PRINT or STOP statements
    IMPLICIT NONE
    REAL(8), DIMENSION(:), INTENT(IN) :: x,y
    REAL(8), INTENT(IN) :: xi
    REAL(8) :: linint
    REAL(8) :: a,b,d
    INTEGER :: n,i
    n=size(x)
    IF (size(y)/=n) THEN
	    PRINT *, 'linint: x and y must be of the same size'
        PAUSE
	    STOP 'program terminated by linint'
    END IF
	! If the grid x is NOT equally spaced, use "locate"
	! otherwise use "locate_equi", faster.
    !i=max(min(locate(x,xi),n-1),1)
    i = locate_equi(x,xi)
	d = x(i+1)-x(i)
    !IF (d == 0.0) STOP 'bad x input in splint'
    a      = (x(i+1)-xi)/d
    b      = (xi-x(i))/d
    linint = a*y(i)+b*y(i+1)
END FUNCTION linint

function fun_adjcost(kprime,k,theta,delta) result(adj)
    implicit none
    ! Variables
    ! INPUTS
    ! kprime: Next-period capital, scalar
    ! k: Current period capital, scalar
    ! theta: resale value, in (0,1)
    real(8), intent(in) :: kprime, k 
    real(8), intent(in) :: theta, delta 
    real(8) :: adj
    
    ! Body of fun_adjcost
    if (kprime>=(1.0d0-delta)*k) then
        adj = kprime-(1.0d0-delta)*k
    else
        adj = theta*(kprime-(1.0d0-delta)*k)
    endif
    
end function fun_adjcost
    
end module mymod
!==============================================================!

subroutine sub_vfi_onestep(val_c_new,pol_kp,&
    val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,do_howard,n_howard,nk,nb,nx)
    
    use omp_lib 
    use mymod, only: linint, fun_adjcost
    implicit none
    ! Declare input variables
    integer(4), intent(in) :: nk,nb,nx
    integer(4), intent(in) :: do_howard,n_howard
    real(8), intent(in)    :: val_c(nk,nb,nx)
    real(8), intent(in)    :: val0_u(nk,nb,nx), kp_bar(nk,nb,nx)
    real(8), intent(in)    :: B_hat(nk,nx), profit_mat(nk,nx)
    real(8), intent(in)    :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx)
    real(8), intent(in)    :: theta, q, delta, psi
    ! Declare output variables
    real(8), intent(out)    :: val_c_new(nk,nb,nx)
    integer(4), intent(out) :: pol_kp(nk,nb,nx)
    ! Declare local variables
    integer(4) :: k_c, b_c, x_c, xp_c, kp_c, max_ind,h_c
    logical :: liq1, liq2
    real(8) :: k_val, b_val, profit_val, kp_ub, kp_val, bprime, v0_int
    real(8), allocatable :: EVx(:,:), b_grid_kp(:), rhs_vec(:)
    real(8), allocatable :: val0_c(:,:,:), val0(:,:,:)
    integer, allocatable :: is_c(:,:,:)
    
     ! Start execution  
    call omp_set_num_threads(8)
    
    ! Initialize outputs
    val_c_new = 0.0d0 !(nk,nb,nx)
    pol_kp    = 1     !(nk,nb,nx)

    ! STEP 2 - Impose liquidation, eq. (24)

    ! Value of constrained firm before exit
    allocate(val0_c(nk,nb,nx))
    val0_c = 0.0d0
    do x_c = 1,nx
        do b_c = 1,nb
            do k_c = 1,nk
                k_val = k_grid(k_c)
                b_val = b_grid(k_c,b_c)
                profit_val = profit_mat(k_c,x_c)
                liq1 = profit_val-b_val+theta*(1.0d0-delta)*k_val<0.0d0
                liq2 = val_c(k_c,b_c,x_c)<theta*(1.0d0-delta)*k_val-b_val
                if (liq1 .or. liq2) then
                    val0_c(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
                else
                    val0_c(k_c,b_c,x_c) = val_c(k_c,b_c,x_c)
                endif
            enddo
        enddo
    enddo
    
    ! STEP 3 - Do equation (23)
    allocate(val0(nk,nb,nx),is_c(nk,nb,nx))
    val0 = 0.0d0
    is_c = 1
    do x_c = 1,nx
        do b_c = 1,nb
            do k_c = 1,nk
                b_val = b_grid(k_c,b_c)
                if (b_val<=B_hat(k_c,x_c)) then
                    val0(k_c,b_c,x_c) = val0_u(k_c,b_c,x_c)
                    is_c(k_c,b_c,x_c) = 0
                else
                    val0(k_c,b_c,x_c) = val0_c(k_c,b_c,x_c)
                endif
            enddo
        enddo
    enddo
    
    ! STEP 4 - Solve eq. (25)
    
    allocate(EVx(nk,nb),rhs_vec(nk))
    ! The bottleneck in the code is this part: three nested loops over (x,b,k) with a maximization over k'
    ! Maximize over k' on the grid
	!$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,&
	!$ kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind) 
    !$omp do collapse(1)
    do x_c = 1,nx
	    ! Compute expected value in eq. (25)
        EVx = 0.0d0 !(k',b')
        do xp_c = 1,nx
            EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c)
        enddo
        do b_c = 1,nb
            do k_c = 1,nk                
                k_val = k_grid(k_c)
                b_val = b_grid(k_c,b_c)
                profit_val = profit_mat(k_c,x_c)
                kp_ub = kp_bar(k_c,b_c,x_c)

                if (is_c(k_c,b_c,x_c)==1 .and. profit_val-b_val+theta*(1.0d0-delta)*k_val>=0.0d0) then
                    ! If no forced liquidation
                    ! Set lower and upper bound for maximization over k' in (25)
                    ! b'>=b_grid(1)
                    !kp_lb = q*b_grid_k(1)+profit_val-b_val+k_val
                
                    ! Create the RHS of Bellman eq. 25 for each k' \in k_grid
                    rhs_vec = -100000.0d0
                    do kp_c = 1,nk
                        kp_val    = k_grid(kp_c)
                        b_grid_kp = b_grid(kp_c,:)
                        bprime = max(b_grid_kp(1), (1.0d0/q)*(b_val-profit_val+fun_adjcost(kp_val,k_val,theta,delta)))
                        if (kp_val<=kp_ub) then 
                            v0_int = linint(b_grid_kp,EVx(kp_c,:),bprime)
                            rhs_vec(kp_c) = q*(psi*(theta*(1.0d0-delta)*kp_val-bprime)+(1.0d0-psi)*v0_int)
                        else
                            exit    
                        endif
                    enddo !kp_c
                    max_ind = maxloc(rhs_vec,dim=1)
                    pol_kp(k_c,b_c,x_c)    = max_ind
                    val_c_new(k_c,b_c,x_c) = rhs_vec(max_ind)
                else
                    ! forced liquidation: val_c is irrelevant
                    val_c_new(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
                endif
            enddo !k
        enddo !b
    enddo !x
	!$omp enddo
  	!$omp end parallel
  	
	! Now do Howard - I do NOT include the subroutine sub_howard since it is not important for the performance of the code.

	! Howard acceleration
	if (do_howard==1) then
		call sub_howard(val_c_new,pol_kp,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,n_howard,nk,nb,nx)
	endif

end subroutine sub_vfi_onestep   
!===========================================================! 
1 Like

It might help to post a complete program - now we see the relevant subroutines and functions, but nobody can experiment with it, as there is no main program.

While I am no expert in OpenMP, I do see a bunch of shared arrays. I do not know if they are read-only, but a first glance at the code suggests quite a few are. In that the copyin clause might help to have “unshared”, so that getting different parts of the arrays do not cause obstruction. Just a thought, mind you.

2 Likes

Scanning through this quickly:

Right now you’re only parallelizing over the outer nx = 60 cycles, you’d be better off parallelizing over the inner nb * nk = 8000 cycles as I don’t see any obvious signs of a carried dependency. (!$omp do collapse(1) is a useless directive btw.)

Do you really need to have EVx completely precomputed before the inner two cycles?

I’d also look into if you can restructure/sort the data to remove if statements. This is critical for GPUs, but the benefits will filter back into CPUs (modern CPUs are good at branch prediction, but no need for branch prediction in the first place is even better).

1 Like

@wphuhn-intel Thanks for your tips! It is true that there no carried dependencies in the three loops (x,b,k). I tried parallelizing over all three with do collapse(3) but the run time is significantly slower. Indeed, when collapsing all three loops, I had to move EVx inside and I guess this is one reason why the code is slower.
Regarding your question “Do you really need to have EVx completely precomputed before the inner two cycles?”: I don’t have to but since EVx depends only on x and not on (b,k), I think it is faster to compute it before opening the inner loops on (b,k), right?

I will try to generate a MWE. Here is the change that I’ve done which however resulted in slower performance:

!$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,&
	!$ kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind) 
    !$omp do collapse(3)
    do x_c = 1,nx
        do b_c = 1,nb
            do k_c = 1,nk 
                ! Compute expected value in eq. (25)
                EVx = 0.0d0 !(k',b')
                do xp_c = 1,nx
                    EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c)
                enddo
                k_val = k_grid(k_c)

                ! rest of the code omitted for brevity
enddo !k
enddo !b
enddo !x
!$omp enddo
!$omp end parallel

@Arjen You’re absolutely right, here is a minimum working example:

!=============================================================!
module mymod
implicit none
    contains
    
    function linspace(my_start, my_stop, n)
    ! Purpose: replicate Matlab function <linspace>
    ! Originally written by Arnau Valladares-Esteban
    implicit none
    ! Inputs:
    integer :: n
    real(8) :: my_start, my_stop
    ! Function result:
    real(8) :: linspace(n)
    ! Locals:
    integer :: i
    real(8) :: step, grid(n)

    if (n == 1) then
        grid(n) = (my_start+my_stop)/2.0d0
    elseif (n>=2) then
        grid(1) = my_start
        if (n>2) then
            step = (my_stop-my_start)/(real(n-1,8))
            do i = 2, n-1
                grid(i) = grid(i-1) + step
            end do
        end if
        grid(n) = my_stop
    endif
    linspace = grid
    
    end function linspace
    !===============================================================================!
    
PURE FUNCTION locate(xx,x)
    IMPLICIT NONE
    REAL(8), DIMENSION(:), INTENT(IN) :: xx
    REAL(8), INTENT(IN) :: x
    INTEGER :: locate
    INTEGER :: n,il,im,iu
    n=size(xx)
    il=0
    iu=n+1
    do
	    if (iu-il <= 1) exit
	    im=(iu+il)/2
	    if (x >= xx(im)) then
		    il=im
	    else
		    iu=im
	    end if
    end do
    if (x == xx(1)) then
	    locate=1
    else if (x == xx(n)) then
	    locate=n-1
    else
	    locate=il
    end if
END FUNCTION locate
!===============================================================================!

PURE FUNCTION locate_equi(x_grid,xi) result(jl)
    IMPLICIT NONE
	! Declare inputs
    REAL(8), INTENT(IN) :: x_grid(:)
    REAL(8), INTENT(IN) :: xi
	! Declare function result 
    INTEGER :: jl
    INTEGER :: n
	REAL(8) :: step, xi_min 
    
	n      = size(x_grid)
    step   = x_grid(2)-x_grid(1)
	xi_min = xi-x_grid(1)
	
	jl     = min(n-1,max(1,floor(xi_min/step)+1))
	
END FUNCTION locate_equi
!===============================================================================!

pure subroutine find_loc(jstar,omega, a_grid,aopt) 
     implicit none
     ! Declare Inputs:
     real(8), intent(in) :: a_grid(:)
     real(8), intent(in) :: aopt
     ! Declare outputs:
     integer, intent(out) :: jstar
     real(8), intent(out) :: omega
     ! Declare local variables:
     integer :: na
     
     !Find j s.t. a_grid(j)<=aopt<a_grid(j+1)
     !for j=1,..,N-1
     ! omega is the weight on a_grid(j) and lies in [0,1]
 
     na = size(a_grid)
 
     jstar = locate_equi(a_grid,aopt)
     !Weight on a_grid(j)
     omega = (a_grid(jstar+1)-aopt)/(a_grid(jstar+1)-a_grid(jstar))
     !omega = max(min(omega,1.0d0),0.0d0)
     
end subroutine find_loc

FUNCTION linint(x,y,xi)
    ! Purpose: linear interpolation of function y on grid x at interpolation point xi
    !          To make it pure, cannot use PRINT or STOP statements
    IMPLICIT NONE
    REAL(8), DIMENSION(:), INTENT(IN) :: x,y
    REAL(8), INTENT(IN) :: xi
    REAL(8) :: linint
    REAL(8) :: a,b,d
    INTEGER :: n,i
    n=size(x)
    IF (size(y)/=n) THEN
	    PRINT *, 'linint: x and y must be of the same size'
        PAUSE
	    STOP 'program terminated by linint'
    END IF
	! If the grid x is NOT equally spaced, use "locate"
	! otherwise use "locate_equi", faster.
    !i=max(min(locate(x,xi),n-1),1)
    i = locate_equi(x,xi)
	d = x(i+1)-x(i)
    !IF (d == 0.0) STOP 'bad x input in splint'
    a      = (x(i+1)-xi)/d
    b      = (xi-x(i))/d
    linint = a*y(i)+b*y(i+1)
END FUNCTION linint

function fun_adjcost(kprime,k,theta,delta) result(adj)
    implicit none
    ! Variables
    ! INPUTS
    ! kprime: Next-period capital, scalar
    ! k:      Current period capital, scalar
    ! theta:  Resale value, in (0,1)
    real(8), intent(in) :: kprime, k 
    real(8), intent(in) :: theta, delta 
    real(8) :: adj
    
    ! Body of fun_adjcost
    if (kprime>=(1.0d0-delta)*k) then
        adj = kprime-(1.0d0-delta)*k
    else
        adj = theta*(kprime-(1.0d0-delta)*k)
    endif
    
end function fun_adjcost
    
end module mymod
!===============================================================================!

subroutine sub_vfi_onestep(val_c_new,pol_kp,&
    val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,nk,nb,nx)
    
    use omp_lib 
    use mymod, only: linint, fun_adjcost
    implicit none
    ! Declare input variables
    integer(4), intent(in) :: nk,nb,nx
    real(8), intent(in)    :: val_c(nk,nb,nx)
    real(8), intent(in)    :: val0_u(nk,nb,nx), kp_bar(nk,nb,nx)
    real(8), intent(in)    :: B_hat(nk,nx), profit_mat(nk,nx)
    real(8), intent(in)    :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx)
    real(8), intent(in)    :: theta, q, delta, psi
    ! Declare output variables
    real(8), intent(out)    :: val_c_new(nk,nb,nx)
    integer(4), intent(out) :: pol_kp(nk,nb,nx)
    ! Declare local variables
    integer(4) :: k_c, b_c, x_c, xp_c, kp_c, max_ind,h_c
    logical :: liq1, liq2
    real(8) :: k_val, b_val, profit_val, kp_ub, kp_val, bprime, v0_int
    real(8), allocatable :: EVx(:,:), b_grid_kp(:), rhs_vec(:)
    real(8), allocatable :: val0_c(:,:,:), val0(:,:,:)
    integer, allocatable :: is_c(:,:,:)
    
    ! The number of threads cannot be greater than the number of cores on the computer.
    ! For haomin's imac, this is 10. 
    call omp_set_num_threads(8)
    
    ! Initialize outputs
    val_c_new = 0.0d0 !(nk,nb,nx)
    pol_kp    = 1     !(nk,nb,nx)

    ! STEP 2 - Impose liquidation, eq. (24)

    ! Value of constrained firm before exit
    allocate(val0_c(nk,nb,nx))
    val0_c = 0.0d0
    do x_c = 1,nx
        do b_c = 1,nb
            do k_c = 1,nk
                k_val = k_grid(k_c)
                b_val = b_grid(k_c,b_c)
                profit_val = profit_mat(k_c,x_c)
                liq1 = profit_val-b_val+theta*(1.0d0-delta)*k_val<0.0d0
                liq2 = val_c(k_c,b_c,x_c)<theta*(1.0d0-delta)*k_val-b_val
                if (liq1 .or. liq2) then
                    val0_c(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
                else
                    val0_c(k_c,b_c,x_c) = val_c(k_c,b_c,x_c)
                endif
            enddo
        enddo
    enddo
    
    ! STEP 3 - Do equation (23)
    allocate(val0(nk,nb,nx),is_c(nk,nb,nx))
    val0 = 0.0d0
    is_c = 1
    do x_c = 1,nx
        do b_c = 1,nb
            do k_c = 1,nk
                b_val = b_grid(k_c,b_c)
                if (b_val<=B_hat(k_c,x_c)) then
                    val0(k_c,b_c,x_c) = val0_u(k_c,b_c,x_c)
                    is_c(k_c,b_c,x_c) = 0
                else
                    val0(k_c,b_c,x_c) = val0_c(k_c,b_c,x_c)
                endif
            enddo
        enddo
    enddo
    
    ! STEP 4 - Solve eq. (25)
    
    allocate(EVx(nk,nb),rhs_vec(nk))
    
    ! Maximize over k' on the grid
	!$omp parallel default(shared) private(x_c,xp_c,b_c,k_c,EVx,k_val,b_val,profit_val,&
	!$ kp_ub,rhs_vec,kp_val,b_grid_kp,bprime,v0_int,max_ind) 
    !$omp do collapse(1)
    do x_c = 1,nx
	    ! Compute expected value in eq. (25)
        EVx = 0.0d0 !(k',b')
        do xp_c = 1,nx
            EVx = EVx+val0(:,:,xp_c)*pi_x(x_c,xp_c)
        enddo
        do b_c = 1,nb
            do k_c = 1,nk                
                k_val = k_grid(k_c)
                b_val = b_grid(k_c,b_c)
                profit_val = profit_mat(k_c,x_c)
                kp_ub = kp_bar(k_c,b_c,x_c)

                if (is_c(k_c,b_c,x_c)==1 .and. profit_val-b_val+theta*(1.0d0-delta)*k_val>=0.0d0) then
                    ! If no forced liquidation
                    ! Set lower and upper bound for maximization over k' in (25)
                    ! b'>=b_grid(1)
                    !kp_lb = q*b_grid_k(1)+profit_val-b_val+k_val
                
                    ! Create the RHS of Bellman eq. 25 for each k' \in k_grid
                    rhs_vec = -100000.0d0
                    do kp_c = 1,nk
                        kp_val    = k_grid(kp_c)
                        b_grid_kp = b_grid(kp_c,:)
                        bprime = max(b_grid_kp(1), (1.0d0/q)*(b_val-profit_val+fun_adjcost(kp_val,k_val,theta,delta)))
                        if (kp_val<=kp_ub) then 
                            v0_int = linint(b_grid_kp,EVx(kp_c,:),bprime)
                            rhs_vec(kp_c) = q*(psi*(theta*(1.0d0-delta)*kp_val-bprime)+(1.0d0-psi)*v0_int)
                        else
                            exit    
                        endif
                    enddo !kp_c
                    max_ind = maxloc(rhs_vec,dim=1)
                    pol_kp(k_c,b_c,x_c)    = max_ind
                    val_c_new(k_c,b_c,x_c) = rhs_vec(max_ind)
                else
                    ! forced liquidation: val_c is irrelevant
                    val_c_new(k_c,b_c,x_c) = theta*(1.0d0-delta)*k_val-b_val
                endif
            enddo !k
        enddo !b
    enddo !x
	!$omp enddo
  	!$omp end parallel

end subroutine sub_vfi_onestep   
!============================================================================! 

program main
use mymod, only: linspace
use omp_lib
implicit none

external :: sub_vfi_onestep ! better to put this into a module, but I can't
integer, parameter :: nx = 60  !should be 60  !!!!!!!!!!!
integer, parameter :: nb = 80  !should be 80  !!!!!!!!!!!
integer, parameter :: nk = 100 !should be 100 !!!!!!!!!!!
real    :: t_end, t_start
integer :: x_c,k_c,istat
real(8) :: theta,q,delta,psi
real(8) :: k_grid(nk), b_grid(nk,nb), pi_x(nx,nx)
real(8) :: B_hat(nk,nx), profit_mat(nk,nx) 
real(8), allocatable :: val_c(:,:,:), val0_u(:,:,:), kp_bar(:,:,:), val_c_new(:,:,:)
integer(4), allocatable :: pol_kp(:,:,:)

write(*,*) "Hello, sub_vfi_onestep!"

! Create fake data

theta  = 0.9107133678d0
q      = 0.989d0
delta  = 0.015d0
psi    = 0.0029334045302853d0

! Equally spaced grids
k_grid = linspace(0.01d0,200.0d0,nk)

do k_c = 1,nk
    b_grid(k_c,:) = linspace(-100d0,50d0,nb)
enddo

! Right stochastic matrix pi_x, make sure all rows sum to one
CALL RANDOM_NUMBER(pi_x)
do x_c = 1,nx
    pi_x(x_c,:) = pi_x(x_c,:)/sum(pi_x(x_c,:))
enddo

write(*,*) "sum(pi_x) rows = ", sum(pi_x,dim=2)

allocate(val_c(nk,nb,nx), val0_u(nk,nb,nx), kp_bar(nk,nb,nx), val_c_new(nk,nb,nx),pol_kp(nk,nb,nx),STAT=istat)
if (istat/=0) then
write(*,*) "Allocation failed!"
endif

CALL RANDOM_NUMBER(B_hat)
CALL RANDOM_NUMBER(profit_mat)
CALL RANDOM_NUMBER(val0_u)
CALL RANDOM_NUMBER(kp_bar)
CALL RANDOM_NUMBER(kp_bar)
CALL RANDOM_NUMBER(val_c)

! Call the computational subroutine "sub_vfi_onestep"
! NOTE: would be better to use assumed shape dummy arguments but I can't change this
write(*,*) "Calling sub_vfi_onestep.."
t_start = omp_get_wtime()

call sub_vfi_onestep(val_c_new,pol_kp,&
    val_c,val0_u,kp_bar,B_hat,profit_mat,k_grid,b_grid,pi_x,theta,q,delta,psi,nk,nb,nx)

t_end = omp_get_wtime()
write(*,*) "..done!"

write(*,*) 'FORTRAN executable runs for',real(t_end-t_start),'seconds.'

pause

end program main


!===========================================================!

To compile the code:
ifort /Qopenmp /O3 /Qipo /Qprec-div- sub_vfi_onestep_mex.f90 -o run.exe