I want to initialize a sparse matrix in compressed row format (CSR). In order to do this, I need to initialize three arrays:
- values
- row_ind
- col_ind
These three arrays will be later passed to the function mkl_sparse_d_create_csr
. Everything works ok but the code is extremely slow. The slow part is not the call to mkl_sparse_d_create_csr
and the creation of the sparse matrix, but the creation of the arrays values
, row_ind
and col_ind
. I suspect this has to do with accessing arrays allocated on the heap, but I’m not expert on this.
At the end of this post I provide a minimum working example to show the problem. The size of the state space is defined in the module mod_globals
, see parameter n_w
. I set n_w=20
and on my laptop the code takes around 12 seconds. Here I want to highlight the bottleneck in the computation:
row_ind = 0
row_ind(1) = 1 ! first index is 1
counter = 1 ! initialize the counter of the non-zero elements to 1
row_c = 1 ! initialize the row counter of the flow matrix to 1
! First loop over the current state
do xm_c = 1,n_x
do xf_c = 1,n_x
do sm_c = 1,n_s
do sf_c = 1,n_s
do k_c = 1,n_k
row_ind(row_c+1) = row_ind(row_c)
!write(*,*) "xm_c,xf_c",xm_c,xf_c
! Reset the column counter
col_c = 1
! Then, loop over future states
do xpm_c = 1,n_x
do xpf_c = 1,n_x
do spm_c = 1,n_s
do spf_c = 1,n_s
do kp_c = 1,n_k
flow_temp = flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c)
if (abs(flow_temp)>1.0d-10) then
! non-zero value
!if (counter>maxnz) then
! write(*,*) "increase maxnz by 50%!"
! ! increase the size of values and col_ind arrays by 100%
! values_temp = values
! col_ind_temp = col_ind
! deallocate(values,col_ind)
! maxnz = maxnz*3/2
! allocate(values(maxnz),col_ind(maxnz),stat=istat)
! values(1:counter-1) = values_temp
! col_ind(1:counter-1) = col_ind_temp
!endif
values(counter) = flow_temp ! note: this is the bottleneck step!
col_ind(counter) = col_c
row_ind(row_c+1) = counter+1
! increment the non-zero element counter
counter = counter + 1 !merge(1,0,abs(flow_temp)>1.0d-9)
endif ! non-zero element
! Increment the column counter
col_c = col_c + 1
enddo ! closing loops over the next state
enddo
enddo
enddo
enddo
! Increment the row counter
row_c = row_c + 1
enddo ! closing loops over the current state
enddo
enddo
enddo
enddo
Any suggestion on how to make this code faster is greatly appreciated!
Minimum working example here:
! MINIMUM WORKING EXAMPLE: How to define a sparse matrix in Fortran
! Compilation:
! ifx /O3 /Qopenmp /Qipo /Qprec-div- /QxHost main.f90 -o run_win.exe
! ifort /O3 /Qopenmp /Qipo /Qprec-div- /QxHost main.f90 -o run_win.exe
! See fortran forum:
!===================================================================!
module mod_globals
use iso_fortran_env, only: int8, int16, int32, int64
implicit none
! Flags for debugging
integer, parameter :: verbose_dist = 2 ! Verbosity level for the distribution module
! Parameters for state space dimensions
integer, parameter :: n_s = 2
integer, parameter :: n_h = 3
integer, parameter :: n_w = 20
integer, parameter :: n_x = (n_h-1)*n_w+1
integer, parameter :: n_k = 6
contains
! Source:
! https://github.com/gabrielgggg/defaultModel/blob/master/src/sim.f90
SUBROUTINE fixSeed(mynum_in,n_warmup)
!
IMPLICIT NONE
! - Declare inputs
integer, intent(in), optional :: mynum_in
integer, intent(in), optional :: n_warmup
! - Declare local variables
INTEGER, ALLOCATABLE, DIMENSION(:) :: seed
INTEGER :: n, mynum ,i
real(8) :: temp
if (present(mynum_in)) then
mynum = mynum_in
else
! GM used 1989 as seed
mynum = 1989
endif
CALL RANDOM_SEED(size = n)
ALLOCATE(seed(n))
seed = mynum
CALL RANDOM_SEED(put = seed)
! Warm up random number generator
if (present(n_warmup)) then
do i=1,n_warmup
call RANDOM_NUMBER(temp)
enddo
endif
END SUBROUTINE fixSeed
end module mod_globals
!===================================================================!
module mod_distribution
use mod_globals
implicit none
contains
function flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c) result(F)
integer, intent(in) :: xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c
real(8) :: F
real(8) :: unif_rand
! The real code is of course very different, but it is not the bottleneck. The call and execution of this function is fast
call RANDOM_NUMBER(unif_rand)
if (unif_rand<0.01d0) then
! Some silly operation
F = 3.14d0+real(xm_c,8)
else
F = 0.0d0
endif
end function flow_couple
subroutine get_distribution_sparse()
! This subroutine computes the stationary distribution of couples
! using a sparse matrix representation of the flow matrix.
implicit none
! - Declare locals:
integer :: n_state,istat,xm_c,xf_c,xpm_c,xpf_c,sm_c,sf_c,spm_c,spf_c
integer :: row_c,col_c,k_c,kp_c
real(8) :: flow_temp,maxnz_real
! - Declare locals for sparse matrix
integer(int64) :: maxnz,counter
integer, allocatable, dimension(:) :: row_ind,col_ind,col_ind_temp,temp_index
real(8), allocatable :: values(:),values_temp(:),temp_vec(:)
! - Start execution
! n_state is the total number of states of households
n_state = n_x*n_x*n_s*n_s*n_k
write(*,*) "computing flow matrix..."
! In all elements are different than zero, then maxnz= n_state*n_state
! However, in practice, the number of non-zero elements is much smaller
maxnz = int(n_state,int64)*int(n_state,int64)/int(95,int64) ! guess of the number of non-zero elements
if (verbose_dist>=2) then
write(*,*) "maxnz = ",maxnz
maxnz_real = real(n_state,8)*real(n_state,8)/95.0d0
write(*,*) "maxnz_real = ",maxnz_real
write(*,*) "int(maxnz_real) = ",int(maxnz_real,int64)
!pause
endif
allocate(values(maxnz),col_ind(maxnz),row_ind(n_state+1),stat=istat)
if (istat/=0) then
write(*,*) "Error allocation of values,col_ind,row_ind"
pause
stop
endif
allocate(temp_vec(n_state),stat=istat)
row_ind = 0
row_ind(1) = 1 ! first index is 1
counter = 1 ! initialize the counter of the non-zero elements to 1
row_c = 1 ! initialize the row counter of the flow matrix to 1
! First loop over the current state
do xm_c = 1,n_x
do xf_c = 1,n_x
do sm_c = 1,n_s
do sf_c = 1,n_s
do k_c = 1,n_k
row_ind(row_c+1) = row_ind(row_c)
!write(*,*) "xm_c,xf_c",xm_c,xf_c
! Reset the column counter
col_c = 1
! Then, loop over future states
do xpm_c = 1,n_x
do xpf_c = 1,n_x
do spm_c = 1,n_s
do spf_c = 1,n_s
do kp_c = 1,n_k
flow_temp = flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c)
if (abs(flow_temp)>1.0d-10) then
! non-zero value
!if (counter>maxnz) then
! write(*,*) "increase maxnz by 50%!"
! ! increase the size of values and col_ind arrays by 100%
! values_temp = values
! col_ind_temp = col_ind
! deallocate(values,col_ind)
! maxnz = maxnz*3/2
! allocate(values(maxnz),col_ind(maxnz),stat=istat)
! values(1:counter-1) = values_temp
! col_ind(1:counter-1) = col_ind_temp
!endif
values(counter) = flow_temp ! note: this is the bottleneck step!
col_ind(counter) = col_c
row_ind(row_c+1) = counter+1
! increment the non-zero element counter
counter = counter + 1 !merge(1,0,abs(flow_temp)>1.0d-9)
endif ! non-zero element
! Increment the column counter
col_c = col_c + 1
enddo ! closing loops over the next state
enddo
enddo
enddo
enddo
! Increment the row counter
row_c = row_c + 1
enddo ! closing loops over the current state
enddo
enddo
enddo
enddo
! THE FOLLOWING PART RUNS VERY FAST, NOT A PROBLEM
! Resize values and col_ind to keep only non-zero elements
maxnz = counter-1
row_ind(n_state+1) = maxnz+1
! This is more concise but gives a segmentation fault
!values = values(1:maxnz)
!col_ind = col_ind(1:maxnz)
values_temp = values(1:maxnz)
col_ind_temp = col_ind(1:maxnz)
deallocate(values,col_ind)
values = values_temp
col_ind = col_ind_temp
if (verbose_dist>=2) then
write(*,*) "maxnz = ",maxnz
write(*,*) "n_state*n_state = ",real(n_state,8)*real(n_state,8)
write(*,'(A,f10.6)') "sparsity = ",real(maxnz,8)/(real(n_state,8)*real(n_state,8))
!pause
endif
! Check consistency of row_ind and col_ind
if (verbose_dist>=2) then
write(*,*) "Checking CSR..."
endif
call csr_check(n_state,maxnz,row_ind,col_ind)
if (verbose_dist>=2) then
write(*,*) "CSR check passed..."
!pause
endif
! Create sparse matrix sflow
!istat = mkl_sparse_d_create_csr(sflow,SPARSE_INDEX_BASE_ONE, n_state,n_state,row_ind(1:n_state),row_ind(2:n_state+1),col_ind,values)
!descr%type = SPARSE_MATRIX_TYPE_GENERAL
!if (istat/=0) then
! write(*,*) "Error in mkl_sparse_d_create_csr: ",istat
! pause
!endif
end subroutine get_distribution_sparse
!======================================================================!
subroutine csr_check(n,nnz,ia,ja)
! Thanks to Ivan Pribec
! Assumes square matrix of size n-by-n, and
! one based indexing of the sparsity pattern
integer, intent(in) :: n
integer(8), intent(in) :: nnz
integer, intent(in) :: ia(n+1), ja(nnz)
logical :: consistent
integer :: i, nz, iaa, iab
! Check consistency of row indexes
consistent = .true.
do i = 2, n+1
if (ia(i-1) >= ia(i)) then
consistent = .false.
return
end if
end do
if (.not. consistent) error stop 1
! Check number of nonzeros
nz = 0
do i = 1, n
nz = nz + (ia(i+1) - ia(i))
end do
if (nnz /= nz) error stop 2
if (nnz /= ia(n+1)-1) error stop 3
! Check column indexes are within the matrix
do i = 1, n
iaa = ia(i)
iab = ia(i+1)-1
if (any(ja(iaa:iab) > n)) then
write(*,*) "Column indexes exceed n"
error stop 4
end if
end do
end subroutine csr_check
!======================================================================!
end module mod_distribution
!===================================================================!
program main
use mod_globals, only: fixSeed,n_w
use mod_distribution, only: get_distribution_sparse
use omp_lib ! OpenMP library
implicit none
! - Declare variables
real(8) :: t1,t2
integer, parameter :: randseed = 3085113
! Execution starts here
write(*,*) "Hello, World!"
write(*,*) "n_w = ", n_w
! set seed for random number generator and warm up with 5000 draws
call fixSeed(randseed,5000)
t1 = omp_get_wtime()
call get_distribution_sparse()
t2 = omp_get_wtime()
write(*,'(A,G0)') "Execution time: ",t2-t1
pause
end program main