Initialize a sparse matrix - code is very slow

aledinola · May 5, 2024, 1:27pm

I want to initialize a sparse matrix in compressed row format (CSR). In order to do this, I need to initialize three arrays:

values
row_ind
col_ind

These three arrays will be later passed to the function mkl_sparse_d_create_csr. Everything works ok but the code is extremely slow. The slow part is not the call to mkl_sparse_d_create_csr and the creation of the sparse matrix, but the creation of the arrays values, row_ind and col_ind. I suspect this has to do with accessing arrays allocated on the heap, but I’m not expert on this.

At the end of this post I provide a minimum working example to show the problem. The size of the state space is defined in the module mod_globals, see parameter n_w. I set n_w=20 and on my laptop the code takes around 12 seconds. Here I want to highlight the bottleneck in the computation:

row_ind = 0
row_ind(1) = 1 ! first index is 1
counter = 1 ! initialize the counter of the non-zero elements to 1
row_c = 1 ! initialize the row counter of the flow matrix to 1
! First loop over the current state
do xm_c = 1,n_x
do xf_c = 1,n_x
do sm_c = 1,n_s
do sf_c = 1,n_s
do k_c = 1,n_k
    row_ind(row_c+1) = row_ind(row_c)
    !write(*,*) "xm_c,xf_c",xm_c,xf_c
    ! Reset the column counter
    col_c = 1
    ! Then, loop over future states
    do xpm_c = 1,n_x
    do xpf_c = 1,n_x   
    do spm_c = 1,n_s
    do spf_c = 1,n_s  
    do kp_c = 1,n_k
        flow_temp = flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c)
        if (abs(flow_temp)>1.0d-10) then  
            ! non-zero value
            !if (counter>maxnz) then
            !    write(*,*) "increase maxnz by 50%!"
            !    ! increase the size of values and col_ind arrays by 100%
            !    values_temp = values
            !    col_ind_temp = col_ind
            !    deallocate(values,col_ind)
            !    maxnz = maxnz*3/2
            !    allocate(values(maxnz),col_ind(maxnz),stat=istat)
            !    values(1:counter-1) = values_temp
            !    col_ind(1:counter-1) = col_ind_temp
            !endif
            values(counter)  = flow_temp  ! note: this is the bottleneck step!
            col_ind(counter) = col_c
            row_ind(row_c+1) = counter+1
            ! increment the non-zero element counter
            counter = counter + 1 !merge(1,0,abs(flow_temp)>1.0d-9)
        endif ! non-zero element
        ! Increment the column counter
        col_c = col_c + 1
    enddo ! closing loops over the next state
    enddo 
    enddo
    enddo
    enddo
    ! Increment the row counter
    row_c = row_c + 1
enddo ! closing loops over the current state
enddo 
enddo
enddo
enddo

Any suggestion on how to make this code faster is greatly appreciated!

Minimum working example here:

! MINIMUM WORKING EXAMPLE: How to define a sparse matrix in Fortran
! Compilation:
! ifx /O3 /Qopenmp /Qipo /Qprec-div- /QxHost main.f90 -o run_win.exe
! ifort /O3 /Qopenmp /Qipo /Qprec-div- /QxHost main.f90 -o run_win.exe
! See fortran forum:
!===================================================================!

module mod_globals
    use iso_fortran_env, only: int8, int16, int32, int64
    implicit none
    
    ! Flags for debugging
    integer, parameter :: verbose_dist = 2 ! Verbosity level for the distribution module
    
    ! Parameters for state space dimensions
    integer, parameter :: n_s = 2     
    integer, parameter :: n_h = 3     
    integer, parameter :: n_w = 20 
    integer, parameter :: n_x = (n_h-1)*n_w+1 
    integer, parameter :: n_k = 6
    
    contains
    
    ! Source:
    ! https://github.com/gabrielgggg/defaultModel/blob/master/src/sim.f90
    SUBROUTINE fixSeed(mynum_in,n_warmup)
    ! 
    IMPLICIT NONE
    ! - Declare inputs 
    integer, intent(in), optional :: mynum_in
    integer, intent(in), optional :: n_warmup
    ! - Declare local variables
    INTEGER, ALLOCATABLE, DIMENSION(:) :: seed
    INTEGER :: n, mynum ,i 
    real(8) :: temp

    if (present(mynum_in)) then
        mynum = mynum_in
    else
        ! GM used 1989 as seed
        mynum = 1989
    endif
    
    CALL RANDOM_SEED(size = n)
    ALLOCATE(seed(n))
    seed = mynum
    CALL RANDOM_SEED(put = seed)

    ! Warm up random number generator
    if (present(n_warmup)) then
        do i=1,n_warmup
            call RANDOM_NUMBER(temp)
        enddo
    endif
    
    END SUBROUTINE fixSeed
end module mod_globals
!===================================================================!
    
module mod_distribution
    use mod_globals
    implicit none
    
    contains
    
function flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c) result(F)
    integer, intent(in) :: xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c
    real(8) :: F
    real(8) :: unif_rand

    ! The real code is of course very different, but it is not the bottleneck. The call and execution of this function is fast
    
    call RANDOM_NUMBER(unif_rand)
    if (unif_rand<0.01d0) then
        ! Some silly operation
        F = 3.14d0+real(xm_c,8)
    else
        F = 0.0d0
    endif
    
end function flow_couple    
    
subroutine get_distribution_sparse()
    ! This subroutine computes the stationary distribution of couples
    ! using a sparse matrix representation of the flow matrix.
    implicit none
    ! - Declare locals:
    integer :: n_state,istat,xm_c,xf_c,xpm_c,xpf_c,sm_c,sf_c,spm_c,spf_c
    integer :: row_c,col_c,k_c,kp_c
    real(8) :: flow_temp,maxnz_real
    ! - Declare locals for sparse matrix
    integer(int64) :: maxnz,counter
    integer, allocatable, dimension(:) :: row_ind,col_ind,col_ind_temp,temp_index
    real(8), allocatable :: values(:),values_temp(:),temp_vec(:)
    
    ! - Start execution 
    
    ! n_state is the total number of states of households
    n_state = n_x*n_x*n_s*n_s*n_k
    
    write(*,*) "computing flow matrix..."
    ! In all elements are different than zero, then maxnz= n_state*n_state
    ! However, in practice, the number of non-zero elements is much smaller
    maxnz = int(n_state,int64)*int(n_state,int64)/int(95,int64) ! guess of the number of non-zero elements 
    if (verbose_dist>=2) then
        write(*,*) "maxnz = ",maxnz
        maxnz_real = real(n_state,8)*real(n_state,8)/95.0d0
        write(*,*) "maxnz_real = ",maxnz_real
        write(*,*) "int(maxnz_real) = ",int(maxnz_real,int64)
        !pause
    endif

    allocate(values(maxnz),col_ind(maxnz),row_ind(n_state+1),stat=istat)
    if (istat/=0) then
        write(*,*) "Error allocation of values,col_ind,row_ind"
        pause 
        stop
    endif 
    allocate(temp_vec(n_state),stat=istat)

    row_ind = 0
    row_ind(1) = 1 ! first index is 1
    counter = 1 ! initialize the counter of the non-zero elements to 1
    row_c = 1 ! initialize the row counter of the flow matrix to 1
    ! First loop over the current state
    do xm_c = 1,n_x
    do xf_c = 1,n_x
    do sm_c = 1,n_s
    do sf_c = 1,n_s
    do k_c = 1,n_k
        row_ind(row_c+1) = row_ind(row_c)
        !write(*,*) "xm_c,xf_c",xm_c,xf_c
        ! Reset the column counter
        col_c = 1
        ! Then, loop over future states
        do xpm_c = 1,n_x
        do xpf_c = 1,n_x   
        do spm_c = 1,n_s
        do spf_c = 1,n_s  
        do kp_c = 1,n_k
            flow_temp = flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c)
            if (abs(flow_temp)>1.0d-10) then  
                ! non-zero value
                !if (counter>maxnz) then
                !    write(*,*) "increase maxnz by 50%!"
                !    ! increase the size of values and col_ind arrays by 100%
                !    values_temp = values
                !    col_ind_temp = col_ind
                !    deallocate(values,col_ind)
                !    maxnz = maxnz*3/2
                !    allocate(values(maxnz),col_ind(maxnz),stat=istat)
                !    values(1:counter-1) = values_temp
                !    col_ind(1:counter-1) = col_ind_temp
                !endif
                values(counter)  = flow_temp  ! note: this is the bottleneck step!
                col_ind(counter) = col_c
                row_ind(row_c+1) = counter+1
                ! increment the non-zero element counter
                counter = counter + 1 !merge(1,0,abs(flow_temp)>1.0d-9)
            endif ! non-zero element
            ! Increment the column counter
            col_c = col_c + 1
        enddo ! closing loops over the next state
        enddo 
        enddo
        enddo
        enddo
        ! Increment the row counter
        row_c = row_c + 1
    enddo ! closing loops over the current state
    enddo 
    enddo
    enddo
    enddo
    
    ! THE FOLLOWING PART RUNS VERY FAST, NOT A PROBLEM
    ! Resize values and col_ind to keep only non-zero elements
    maxnz = counter-1
    row_ind(n_state+1) = maxnz+1
    ! This is more concise but gives a segmentation fault
    !values = values(1:maxnz) 
    !col_ind = col_ind(1:maxnz)
    values_temp = values(1:maxnz)
    col_ind_temp = col_ind(1:maxnz)
    deallocate(values,col_ind)
    values = values_temp
    col_ind = col_ind_temp
    if (verbose_dist>=2) then   
        write(*,*) "maxnz = ",maxnz
        write(*,*) "n_state*n_state = ",real(n_state,8)*real(n_state,8)
        write(*,'(A,f10.6)') "sparsity = ",real(maxnz,8)/(real(n_state,8)*real(n_state,8))
        !pause
    endif 
    
    ! Check consistency of row_ind and col_ind
    if (verbose_dist>=2) then
        write(*,*) "Checking CSR..."
    endif
    call csr_check(n_state,maxnz,row_ind,col_ind)
    if (verbose_dist>=2) then
        write(*,*) "CSR check passed..."
        !pause
    endif

! Create sparse matrix sflow
!istat = mkl_sparse_d_create_csr(sflow,SPARSE_INDEX_BASE_ONE, n_state,n_state,row_ind(1:n_state),row_ind(2:n_state+1),col_ind,values)
!descr%type = SPARSE_MATRIX_TYPE_GENERAL
!if (istat/=0) then
!    write(*,*) "Error in mkl_sparse_d_create_csr: ",istat
!    pause
!endif

end subroutine get_distribution_sparse

!======================================================================!    
subroutine csr_check(n,nnz,ia,ja)
! Thanks to Ivan Pribec
     ! Assumes square matrix of size n-by-n, and
    ! one based indexing of the sparsity pattern
        integer, intent(in) :: n
        integer(8), intent(in) :: nnz
        integer, intent(in) :: ia(n+1), ja(nnz)

        logical :: consistent
        integer :: i, nz, iaa, iab

        ! Check consistency of row indexes
        consistent = .true.
        do i = 2, n+1
            if (ia(i-1) >= ia(i)) then
                consistent = .false.
                return
            end if
        end do
        if (.not. consistent) error stop 1

        ! Check number of nonzeros
        nz = 0
        do i = 1, n
            nz = nz + (ia(i+1) - ia(i))
        end do
        if (nnz /= nz) error stop 2
        if (nnz /= ia(n+1)-1) error stop 3

        ! Check column indexes are within the matrix
        do i = 1, n

            iaa = ia(i)
            iab = ia(i+1)-1

            if (any(ja(iaa:iab) > n)) then
                write(*,*) "Column indexes exceed n"
                error stop 4
            end if

        end do

end subroutine csr_check
!======================================================================!  

end module mod_distribution
!===================================================================!
    
program main
    use mod_globals, only: fixSeed,n_w
    use mod_distribution, only: get_distribution_sparse
    use omp_lib ! OpenMP library
    implicit none
    ! - Declare variables
    real(8) :: t1,t2
    integer, parameter :: randseed = 3085113
    
    ! Execution starts here
    write(*,*) "Hello, World!"
    write(*,*) "n_w = ", n_w
    
    ! set seed for random number generator and warm up with 5000 draws
    call fixSeed(randseed,5000) 
    
    t1 = omp_get_wtime()
    call get_distribution_sparse()
    t2 = omp_get_wtime()
    
    write(*,'(A,G0)') "Execution time: ",t2-t1
    
    pause
end program main

ivanpribec · May 5, 2024, 4:06pm

A few ideas which are maybe applicable

use a two-stage process: step 1) determine the sparsity pattern, ideally this would be a relatively cheap operation; step 2) once the sparsity pattern is known, compute the values
construct the matrix using triplet/COO format (mkl_sparse_?_create_coo) and convert it to CSR before it is used (mkl_sparse_convert_csr)
use a jagged array as temporary storage so that the outer loop can be parallelized; afterward copy the values into contiguous storage

rwmsu · May 5, 2024, 4:29pm

Yousef Saad’s SPARSKIT is still available (its Fortran 77 although I think John Burkhardt has a F90 version). You might try using it as a reference/benchmark to see what kind of performance you should expect.

https://www-users.cse.umn.edu/~saad/software/SPARSKIT/

ivanpribec · May 5, 2024, 4:57pm

Could you show us the real flow_couple procedure? The fact the sparse matrix appears to be filled randomly example is not really helpful. If it’s really this way, one could just select a random subset of column indexes.

I imagine the actual flow_couple is some kind of discontinuous function f(\mathbf{s},\mathbf{s'}). (Luckily, I’ve never had to deal with a 10-d state space, even the Boltzmann equation is only 7-d.)

RonShepard · May 5, 2024, 5:15pm

There is a 10-deep set of nested do loops, and a function call for each combination, so to improve efficiency, you probably need to eliminate that effort. If you know which interactions are active given one set of 5 indices, then here is one approach. You do the loop over the 5 indices, and then fill in the sparse array structure based on which interactions are nonzero or numerically significant. You would need to fill in that array without looping over all possibilities, just from the 5 indices that are given. That is very problem-specific, of course, and it requires knowledge of the sparsity structure for each problem.

ivanpribec · May 5, 2024, 5:33pm

Here’s one possible variation of using the COO format, that makes the loop nest slightly easier to parallelize:

    ! Allocate COO storage
    allocate(values(maxnz),col_ind(maxnz),row_ind(maxnz))

    counter = 1 ! initialize the counter of the non-zero elements to 1

    !$omp parallel do collapse(10) default(private) shared(values,col_ind,row_ind,counter,n_x,n_s,n_k)
    do xm_c = 1,n_x ! Loop over current state s
    do xf_c = 1,n_x
    do sm_c = 1,n_s
    do sf_c = 1,n_s
    do k_c  = 1,n_k
        do xpm_c = 1,n_x ! Loop over future state s'
        do xpf_c = 1,n_x
        do spm_c = 1,n_s
        do spf_c = 1,n_s
        do kp_c  = 1,n_k

            ! Calculate linear indexes (map 5-d to 1-d index)
            ridx = linear_index(xm_c,xf_c,sm_c,sf_c,k_c)
            cidx = linear_index(xpm_c,xpf_c,spm_c,spf_c,kp_c)
        
            flow_temp = flow_couple(&
                s=state(xm_c,xf_c,sm_c,sf_c,k_c), &
                sp=state(xpm_c,xpf_c,spm_c,spf_c,kp_c))

            if (abs(flow_temp) > 1.0d-10) then
                !$omp critical
                values(counter) = flow_temp
                row_ind(counter) = ridx
                col_ind(counter) = cidx
                counter = counter + 1
                !$omp end critical
            end if

        enddo ! closing loops over the future state s'
        enddo 
        enddo
        enddo
        enddo
    enddo ! closing loops over the state s
    enddo 
    enddo
    enddo
    enddo

There will be a lot of pressure on the critical section, so I’m not too optimistic about this approach.

aledinola · May 5, 2024, 6:53pm

Yes, sure. I came up with a fake example because the real flow_couple function is very long. As you can see, there are tons of if conditions and additions/multiplications, but it is not very heavy computationally.

function flow_couple(xm_c,xf_c,sm_c,sf_c,k_c,xpm_c,xpf_c,spm_c,spf_c,kp_c,omega) result(F)
        implicit none
        ! Declare inputs
        integer, intent(in) :: xm_c,xf_c,xpm_c,xpf_c ! current and next state of employment
        integer, intent(in) :: sm_c,sf_c,spm_c,spf_c ! current and next state of skills
        integer, intent(in) :: k_c,kp_c ! current and next state of demographics
        type(omegafun), intent(in) :: omega
        ! Declare function result
        real(8) :: F ! Flow from state x to state x'
        ! Declare locals
        real(8) :: factor

        F = 0d0 
    !  Return zero for all impossible transitions or non-transitions
    ! - Same demographics, skills and employment status
    if (spm_c==sm_c .and. spf_c==sf_c .and. xpm_c == xm_c .and. xpf_c == xf_c .and. k_c==kp_c) then
        F = 0.0d0
        return
    endif
    ! ! Changes in emp of both members to employment
    ! if (xpm_c/=xm_c .and. xpf_c/=xf_c .and. xpm_c>1 .and. xpf_c>1) then
    !     F = 0.0d0
    !     return
    ! endif
    ! ! Both members lose their jobs (and not newborn)
    ! if (xpm_c==1 .and. xpf_c==1 .and. xm_c>1 .and. xf_c>1  .and. (kp_c>1 .or. spm_c>1 .or. spf_c>1)) then
    !     F = 0.0d0
    !     return
    ! endif

! If-Loop structure:
! - Layer 1 (outermost): demographic change (incl. pl). Cases: (a) no demographic change, (b) transitions to non-young-child state, (c) transitions from non-young child state to young child state, (d) transitions within young child states (parental leave expiration, or job loss while on parental leave).
! - Layer 2: skill change. Cases: (a) no skill change, (b) skill change for m, (c) skill change for f.
! - Layer 3: employment changes. This depend on the skill and demographic changes.


! BEGIN IF-LOOPS (no retirement shock)
! - Layer 1 (a) No demographic change
if (k_c == kp_c) then
    ! Note: job acceptance and (exogeneous or endogeneous) job separation from k_c = 3,4 (parental leave, young child) will result in transition to kp_c = 2. Thus, these events don't belong to the current case (no demographic change), but to the layer 1 (d): demographic transitions within young child states.

    ! - Layer 2 (a) Both members keep same skills
    if (spm_c==sm_c .and. spf_c==sf_c) then
        ! - Layer 3
        ! - Both members are unemployed in x
        if (xm_c==1 .and. xf_c==1) then
            ! m receives a job offer and accepts (m cannot be on parental leave)
            if (xpm_c>1 .and. xpf_c==1 .and. k_c/=3) then
                factor = omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)+omega%couple_plus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
                F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*factor
                !if (debug_dist) write(*,*) "Case 3: F = ",F
            ! f receives a job offer and accepts (f cannot be on parental leave)
            elseif (xpm_c==1 .and. xpf_c>1 .and. k_c/=4) then
                factor = omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)+omega%couple_plus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
                F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*factor
                !if (debug_dist) write(*,*) "Case 5: F = ",F
            else
                F = 0d0
            endif
        ! - m is employed and f is unemployed in x
        elseif (xm_c>1 .and. xf_c==1) then
            if (xpm_c==1 .and. xpf_c==1 .and. k_c /= 3) then
            ! m loses the job (m cannot be in parental leave)
                F = delta_vec(xm_c,k_c,1)
                !if (debug_dist) write(*,*) "Case 6: F = ",F
            elseif (xpm_c>1 .and. xpm_c/=xm_c .and. xpf_c==1 .and. k_c/=3) then
            ! employed male accepts a new job (m cannot be on parental leave)
                factor = omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)+omega%couple_plus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
                F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*factor
                !if (debug_dist) write(*,*) "Case 7: F = ",F
            elseif (xpm_c==1 .and. xpf_c>1 .and. k_c/=3 .and. k_c/=4) then
            ! female accepts a job while male quits (m and f cannot be in parental leave)
                F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
                !if (debug_dist) write(*,*) "Case 8: F = ",F
            elseif (xpm_c==xm_c .and. xpf_c>1 .and. k_c/=4) then
            ! female accepts a job while male stays (f cannot be on parental leave)
                F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_plus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
                !if (debug_dist) write(*,*) "Case 9: F = ",F
            else 
                F = 0d0
            endif

        ! - f is employed and m is unemployed in x
        elseif (xf_c>1 .and. xm_c==1) then
            if (xpf_c==1 .and. xpm_c==1 .and. k_c /= 4) then
            ! f loses the job (female cannot be in parental leave)
                F = delta_vec(xf_c,k_c,2)
                !if (debug_dist) write(*,*) "Case 10: F = ",F
            elseif (xpf_c>1 .and. xpf_c/=xf_c .and. xpm_c==1 .and. k_c/=4) then
            ! employed female accepts a new job (f cannot be on parental leave)
                factor = omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)+omega%couple_plus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
                F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*factor
                !if (debug_dist) write(*,*) "Case 11: F = ",F
            elseif (xpf_c==1 .and. xpm_c>1  .and. k_c/=3 .and. k_c /= 4) then
            ! male accepts a job while female quits (m and f cannot be in parental leave)
                F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
                !if (debug_dist) write(*,*) "Case 12: F = ",F
            elseif (xpf_c==xf_c .and. xpm_c>1 .and. k_c/=3) then
            ! male accepts a job while female stays (m cannot be on parental leave)
                F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_plus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
                !if (debug_dist) write(*,*) "Case 13: F = ",F
            else 
                F = 0d0
            endif
    ! - Both m and f are employed
        elseif (xm_c>1 .and. xf_c>1) then
            if (xpm_c==1 .and. xpf_c==xf_c .and. k_c /= 3) then
            ! m loses the job (male cannot be in parental leave)
                F = delta_vec(xm_c,k_c,1)
                !if (debug_dist) write(*,*) "Case 14: F = ",F
            elseif (xpm_c==xm_c .and. xpf_c==1 .and. k_c /=4) then
            ! f loses the job (female cannot be in parental leave)
                F = delta_vec(xf_c,k_c,2)
                !if (debug_dist) write(*,*) "Case 15: F = ",F
            elseif (xpm_c>1 .and. xpm_c/=xm_c .and. xpf_c==1 .and. k_c/=3 .and. k_c /= 4) then
            ! m accepts a new job and f quits (m and f cannot be in parental leave)
                F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
                !if (debug_dist) write(*,*) "Case 16: F = ",F
            elseif (xpm_c>1 .and. xpm_c/=xm_c .and. xpf_c==xf_c .and. k_c/=3) then
            ! m accepts a new job and f stays  (m cannot be on parental leave)
                F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_plus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
               ! if (debug_dist) write(*,*) "Case 17: F = ",F
            elseif (xpf_c>1 .and. xpf_c/=xf_c .and. xpm_c==1 .and. k_c /= 3 .and. k_c/=4) then
            ! f accepts a new job and m quits (m and f cannot be in parental leave)
                F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
                !if (debug_dist) write(*,*) "Case 18: F = ",F
            elseif (xpf_c>1 .and. xpf_c/=xf_c .and. xpm_c==xm_c .and. k_c/=4) then
            ! f accepts a new job and m stays (f cannot be on parental leave)
                F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_plus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
                !if (debug_dist) write(*,*) "Case 19: F = ",F
            else 
                F = 0d0
            endif
        endif ! close loop over current empl. state of the couple
    ! - Layer 2 (b) Male gets skill shock, f keeps same skill
    elseif (spm_c/=sm_c .and. spf_c==sf_c) then
        if (xpm_c==xm_c .and. xpf_c==xf_c .and. spm_c==sm_c+1 .and. xm_c>1) then
            ! Empl states are the same, m is empl and his skill goes up by 1
            F = s_trans_up(xm_c,sm_c,k_c,1)
        elseif (xpm_c==xm_c .and. xpf_c==xf_c .and. spm_c==sm_c-1 .and. xm_c==1) then
            ! Empl states are the same, m is non-empl and his skill goes down by 1
            F = s_trans_down(xm_c,sm_c,k_c,1)
        else
            F = 0d0
        endif ! loop over empl of couple
    ! - Layer 2 (c) Male keeps same skill, f gets skill shock
    elseif (spm_c==sm_c .and. spf_c/=sf_c) then
        if (xpm_c==xm_c .and. xpf_c==xf_c .and. spf_c==sf_c+1 .and. xf_c>1) then
            ! Empl states are the same, f is empl and her skill goes up by 1
            F = s_trans_up(xf_c,sf_c,k_c,2)
        elseif (xpm_c==xm_c .and. xpf_c==xf_c .and. spf_c==sf_c-1 .and. xf_c==1) then
            ! Empl states are the same, f is non-empl and her skill goes down by 1
            F = s_trans_down(xf_c,sf_c,k_c,2)
        else
            F = 0d0
        endif ! loop over empl of couple
    else 
        F = 0.0d0
    endif ! close loop over skills of the couple

! - Layer 1 (b) Demographic change: transitions to a non-young-kid state (kp = 1,5,6)
elseif (kp_c /= k_c .and. (kp_c == 1 .or. kp_c == 5 .or. kp_c == 6)) then
    ! Notes: only exogeneous demographic change in this case. Quitting can happen. Skill change not possible.

    ! - Both members keep same skills
    if (sm_c == spm_c .and. sf_c == spf_c) then
        ! - Both members are unemployed in x and xp
        if (xm_c == 1 .and. xpm_c == 1 .and. xf_c == 1 .and. xpf_c==1) then
            F = Phik_couple(k_c,kp_c)
        ! - m emp, f unemp in x; emp states remain unchanged
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c == 1 .and. xpf_c == 1) then
            F = Phik_couple(k_c,kp_c)*(1.0d0-omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,1))
        ! - m emp, f unemp in x; both unemp in xp
        elseif (xm_c > 1 .and. xpm_c == 1 .and. xf_c == 1 .and. xpf_c == 1) then
            F = Phik_couple(k_c,kp_c)*omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,1)
        ! - m unemp, f emp in x; emp states remain unchanged
        elseif (xf_c > 1 .and. xpf_c == xf_c .and. xm_c == 1 .and. xpm_c == 1) then
            F = Phik_couple(k_c,kp_c)*(1.0d0-omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,2))
        ! - m unemp, f emp in x; both unemp in xp
        elseif (xf_c > 1 .and. xpf_c == 1 .and. xm_c == 1 .and. xpm_c == 1) then
            F = Phik_couple(k_c,kp_c)*omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,2)
        ! - both employed in x; emp states remain unchanged
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == xf_c) then
            F = Phik_couple(k_c,kp_c)*(1.0d0-omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,1)-omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,2))
        ! - both employed in x; f unemp in xp
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == 1) then
            F = Phik_couple(k_c,kp_c)*omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,2)
        ! - both employed in x; m unemp in xp
        elseif (xm_c > 1 .and. xpm_c == 1 .and. xf_c > 1 .and. xpf_c == xf_c) then
            F = Phik_couple(k_c,kp_c)*omega%couple_quit(xm_c,xf_c,sm_c,sf_c,kp_c,1)
        else 
            F = 0.0d0
        endif

    ! - At least one member changes skills (impossible transition)
    else ! if (sm_c /= spm_c .or. sf_c /= spf_c) then
        F = 0.0d0
    endif ! close loop over skills of the couple (with demographic change)

! -  Layer 1 (c) Demographic change: transitions from a non-young-kid state (k = 1,5,6) to a young-kid state (kp = 2,3,4)
elseif ((k_c /= 2 .and. k_c /= 3 .and. k_c /= 4) .and. (kp_c==2 .or. kp_c==3 .or. kp_c==4)) then
    ! Notes: demographic transitions are both exogeneous (to a state with young child) and endogeneous (decisions regarding parental leave). Quitting is not possible because parental leave is superior to quitting. Skill change not possible.

    ! - Both members keep same skills
    if (sm_c == spm_c .and. sf_c == spf_c) then
        ! - Both members are unemployed in x and xp: no one is eligible for parental leave (kp_c = 2)
        if (xm_c == 1 .and. xpm_c == 1 .and. xf_c == 1 .and. xpf_c == 1 .and. kp_c == 2) then
            F = Phik_couple(k_c,kp_c)
        ! - m is employed and f is unemployed in x; m does not go on parental leave
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c == 1 .and. xpf_c == 1 .and. kp_c == 2) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
        ! - m is employed and f is unemployed in x; m goes on parental leave
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c == 1 .and. xpf_c == 1 .and. kp_c == 3) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
         ! - f is employed and m is unemployed in x; f does not go on parental leave
        elseif (xf_c > 1 .and. xpf_c == xf_c .and. xm_c == 1 .and. xpm_c == 1 .and. kp_c == 2) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
        ! - f is employed and m is unemployed in x; f goes on parental leave
        elseif (xf_c > 1 .and. xpf_c == xf_c .and. xm_c == 1 .and. xpm_c == 1 .and. kp_c == 4) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
        ! - both employed in x; no one goes on parental leave
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == xf_c .and. kp_c == 2) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
        ! - both employed in x; m goes on parental leave f remains in job
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == xf_c .and. kp_c == 3) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
        ! - both employed in x; f goes on parental leave m remains in job
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == xf_c .and. kp_c == 4) then
            F = Phik_couple(k_c,kp_c)*omega%couple_pl(kp_c,xm_c,xf_c,sm_c,sf_c)
        else 
            F = 0.0d0
        endif


    else ! if (sm_c /= spm_c .or. sf_c /= spf_c) then
        F = 0.0d0
    endif ! close loop over skills of the couple (with demographic change)

! -  Layer 1 (d) Demographic change: transitions within young-kid states (k = 2,3,4)
!       Note: if k_c = 2, the only possible demographic transition is to kp = 1,5,6 (non-young-child state)
elseif (k_c /=kp_c .and. (k_c == 3 .or. k_c == 4) .and. (kp_c==2 .or. kp_c == 3 .or. kp_c==4)) then
    ! Notes: these demog transitions are purely endogeneous and must be triggered by either parental leave expiration, own job acceptance, spousal job acceptance (which induces quitting), or exogeneous job separation. Skill change not possible because it doesn't trigger any change in parental leave. 

    ! - Layer 2 (a) Both members keep same skills
    if (sm_c == spm_c .and. sf_c == spf_c) then
        ! - m is employed and on parental leave, f is unemployed. m's parental leave expires. m returns to job, no one takes parental leave. 
        if (xm_c > 1 .and. xpm_c == xm_c .and. xf_c == 1 .and. xpf_c == 1 .and. k_c == 3 .and. kp_c ==2 ) then
            F = p_tau*omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c)
        ! - m is employed and on parental leave, f is unemployed. m and f unemployed in xp, no one takes parental leave. Two scenarios that give rise to the same outcome:
        !       i. m loses job. ii. m's parental leave expires and m quits.
        elseif (xm_c > 1 .and. xpm_c == 1 .and. xf_c == 1 .and. xpf_c == 1 .and. k_c == 3 .and. kp_c ==2 ) then
            F = delta_vec(xm_c,k_c,1)+p_tau*(1.0d0-omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c))
        ! - m is employed and on parental leave, f is unemployed. f accepts job, m quits. No one takes parental leave.
        elseif(xm_c > 1 .and. xpm_c == 1 .and. xf_c == 1 .and. xpf_c > 1 .and. k_c==3 .and. kp_c==2) then
            F=lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
        ! - m is employed and on parental leave, f is unemployed. m accepts a new job. No one takes parental leave.
        elseif(xm_c > 1 .and. xpm_c > 1 .and. xm_c/=xpm_c .and. xf_c == 1 .and. xpf_c == 1 .and. k_c==3 .and. kp_c==2) then
            F=lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*(omega%couple_plus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1) + omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1))
        ! - f is employed and on parental leave, m is unemployed. f's parental leave expires. f returns to job, no one takes parental leave. 
        elseif (xf_c > 1 .and. xpf_c == xf_c .and. xm_c == 1 .and. xpm_c == 1 .and. k_c == 4 .and. kp_c ==2 ) then
            F = p_tau*omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c)
        ! - f is employed and on parental leave, m is unemployed. m and f unemployed in xp, no one takes parental leave. Two scenarios that give rise to the same outcome:
        !       i. f loses job. ii. f's parental leave expires and f quits.
        elseif (xf_c > 1 .and. xpf_c == 1 .and. xm_c == 1 .and. xpm_c == 1 .and. k_c == 4 .and. kp_c ==2 ) then
            F = delta_vec(xf_c,k_c,2)+p_tau*(1.0d0-omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c))
        ! - f is employed and on parental leave, m is unemployed. m accepts job, f quits. No one takes parental leave.
        elseif(xf_c > 1 .and. xpf_c == 1 .and. xm_c == 1 .and. xpm_c > 1 .and. k_c==4 .and. kp_c==2) then
            F=lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
        ! - f is employed and on parental leave, m is unemployed. f accepts a new job. No one takes parental leave.
        elseif(xf_c > 1 .and. xpf_c > 1 .and. xf_c/=xpf_c .and. xm_c == 1 .and. xpm_c == 1 .and. k_c==4 .and. kp_c==2) then
            F=lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*(omega%couple_plus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2) + omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2))
        ! - both employed, m is on parental leave. m's parental leave expires, m returns to job, f remains in job, no one takes parental leave.
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == xf_c .and. k_c == 3 .and. kp_c ==2 ) then
            F = p_tau*omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c)
        ! - both employed, m is on parental leave. m's parental leave expires, m returns to job, f takes parental leave. 
        elseif (xm_c > 1 .and. xpm_c == xm_c .and. xf_c > 1 .and. xpf_c == xf_c .and. k_c == 3 .and. kp_c ==4 ) then
            F = p_tau*omega%couple_ple2(xm_c,xf_c,sm_c,sf_c,k_c)
        ! - both employed, m is on parental leave. m becomes unemp and f remains in job. no one takes parental leave. Two scenarios lead to this case:
        ! i. m's parental leave expires and m quits. ii. m loses job.
        elseif (xm_c > 1 .and. xpm_c == 1 .and. xf_c > 1 .and. xpf_c == xf_c .and. k_c == 3 .and. kp_c ==2 ) then
            F = delta_vec(xm_c,k_c,1) + p_tau*(1.0d0-omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c)-omega%couple_ple2(xm_c,xf_c,sm_c,sf_c,k_c))
        ! - both employed, m is on parental leave. f accepts a new job and m quits. No one takes parental leave.
        elseif (xm_c > 1 .and. xpm_c == 1 .and. xf_c > 1 .and. xpf_c>1 .and. xpf_c /= xf_c .and. k_c == 3 .and. kp_c ==2 ) then
            F = lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
        ! - both employed, m is on parental leave. m accepts a new job, f stays. no one takes parental leave.
        elseif(xm_c > 1 .and. xpm_c > 1 .and. xm_c/=xpm_c .and. xf_c > 1 .and. xpf_c == xf_c .and. k_c==3 .and. kp_c==2) then
            F=lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_plus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
        ! - both employed, m is on parental leave. m accepts a new job, f quits. no one takes parental leave.
        elseif(xm_c > 1 .and. xpm_c > 1 .and. xm_c/=xpm_c .and. xf_c > 1 .and. xpf_c == 1 .and. k_c==3 .and. kp_c==2) then
            F=lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
        ! - both employed, f is on parental leave. f's parental leave expires, f returns to job, m remains in job, no one takes parental leave.
        elseif (xf_c > 1 .and. xpf_c == xf_c .and. xm_c > 1 .and. xpm_c == xm_c .and. k_c == 4 .and. kp_c ==2 ) then
            F = p_tau*omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c)
        ! - both employed, f is on parental leave. f's parental leave expires, f returns to job, m takes parental leave. 
        elseif (xf_c > 1 .and. xpf_c == xf_c .and. xm_c > 1 .and. xpm_c == xm_c .and. k_c == 4 .and. kp_c ==3 ) then
            F = p_tau*omega%couple_ple2(xm_c,xf_c,sm_c,sf_c,k_c)
        ! - both employed, f is on parental leave. f becomes unemp and m remains in job. no one takes parental leave. Two scenarios lead to this case:
        ! i. f's parental leave expires and f quits. ii. f loses job.
        elseif (xf_c > 1 .and. xpf_c == 1 .and. xm_c > 1 .and. xpm_c == xm_c .and. k_c == 4 .and. kp_c ==2 ) then
            F = delta_vec(xf_c,k_c,2) + p_tau*(1.0d0-omega%couple_ple1(xm_c,xf_c,sm_c,sf_c,k_c)-omega%couple_ple2(xm_c,xf_c,sm_c,sf_c,k_c))
        ! - both employed, f is on parental leave. m accepts a new job and f quits. No one takes parental leave.
        elseif (xf_c > 1 .and. xpf_c == 1 .and. xm_c > 1 .and. xpm_c>1 .and. xpm_c /= xm_c .and. k_c == 4 .and. kp_c ==2 ) then
            F = lambda_vec(xm_c,k_c,1)*F_hw(xpm_c-1,1)*omega%couple_minus(xpm_c,xm_c,xf_c,sm_c,sf_c,k_c,1)
         ! - both employed, f is on parental leave. f accepts a new job, m stays. no one takes parental leave.
        elseif(xf_c > 1 .and. xpf_c > 1 .and. xf_c/=xpf_c .and. xm_c > 1 .and. xpm_c == xm_c .and. k_c==4 .and. kp_c==2) then
            F=lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_plus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
        ! - both employed, f is on parental leave. f accepts a new job, m quits. no one takes parental leave.
        elseif(xf_c > 1 .and. xpf_c > 1 .and. xf_c/=xpf_c .and. xm_c > 1 .and. xpm_c == 1 .and. k_c==4 .and. kp_c==2) then
            F=lambda_vec(xf_c,k_c,2)*F_hw(xpf_c-1,2)*omega%couple_minus(xpf_c,xm_c,xf_c,sm_c,sf_c,k_c,2)
        
        else
            F = 0.0d0
        endif
    else ! if (sm_c /= spm_c .or. sf_c /= spf_c) then
        F = 0.0d0
    endif ! close loop over skills of the couple (with demographic change)

! - No other types of demographic change possible
else
    F = 0.0d0
endif ! close loop over demographic change

! Retirement shock (you are reborn as unemployed with lowest skill)
if (xpm_c ==1 .and. xpf_c == 1 .and. kp_c == 1) then
    F = F + Retk_couple(k_c)*p_zeta_couple(spm_c,spf_c)
endif

! if (debug_dist) write(*,*) "Case undefined: F = ",F
end function flow_couple

Then in separate module I have defined

! - Decision rules
    type omegafun
        ! - Acceptance probability for couples (new job offer x' for gender j)
        !   (x',xm,xf,s_m,s_f,k,j)
        real(8), allocatable :: couple_plus(:,:,:,:,:,:,:)  ! accept while spousal emp state unchanged
        real(8), allocatable :: couple_minus(:,:,:,:,:,:,:) ! accept while spouse quits
         ! - Quitting decisions upon transitioning into a non-young child state (not necessary but useful for debugging)
        real(8), allocatable :: couple_quit(:,:,:,:,:,:) ! (xm,xf,sm,sf,k,j) probability that member j quits given xm,xf,sm,sf and a transition into k (not a young child state)
        ! - Parental leave decision upon transitioning into the state of young child
        real(8), allocatable :: couple_pl(:,:,:,:,:)      !(k',xm,xf,sm,sf) , where k' can only be 2,3,4 (young child states)
        ! - Decision upon parental leave expiration (xm,xf,sm,sf,k)
        real(8), allocatable :: couple_ple1(:,:,:,:,:) ! both emp states unchanged, no one takes parental leave
        real(8), allocatable :: couple_ple2(:,:,:,:,:) ! parental leave transferred to spouse, both emp states remain unchanged
    end type omegafun

ivanpribec · May 5, 2024, 8:46pm

Wow! That slightly changes things, because it doesn’t look very feasible to vectorize. I counted 59 different end states (including zero).

I wonder if you would benefit from using a finite state machine or a state transition table. Here’s a rough sketch of the idea (no clue if it would be actually faster than the existing serial approach):

type :: idx_pair
   integer :: row, col
end type

type(idx_pair), allocatable :: idx_list(:)
real(8), allocatable :: values(:)

integer(int8) :: table(nstate,nstate)
integer(int8), allocatable :: enums(:)

! Loop to determine end state
do concurrent(...) ! 10-d loop
    ridx = ... ! Current state to linear index
    cidx =  ... ! Future state to linear index
    table(ridx,cidx) = state_to_enum(...)  ! A value from 0-58
end do

! Get non-zero states
enums = pack(table, table > 0)

! Get sparsity pattern
idx_list = pack([((idx_pair(i,j),i=1,nstate),j=1,nstate)],table > 0)

! Allocate space for matrix values
allocate(values,mold=idx_list)

! Now that we know the sparsity pattern and end states
! we can evaluate the values of the matrix in parallel

do concurrent (k = 1:size(idx_list))  ! Could also be OpenMP parallel do
      ! 1-d to 5-d index
      xm_c = ...
      xf_c = ...
      ...
      ! `flow_couple` evaluates a select case construct
      ! with 58 different end expressions
      values(k) = flow_couple(enum=enums(k), current=..., future=...) 
end do

! Now we have the values and indexes, so we can
! copy them into contiguous COO or CSR storage

Changing the model, would mean replacing the pair of functions state_to_enum and flow_couple (these could be callbacks or procedure pointers). But still I have the feeling like this is not the right abstraction and some ingredient to make the problem simpler is missing. My gut feeling is that some kind of domain-specific language approach might be it.

aledinola · May 5, 2024, 9:17pm

Thanks for your suggestion. I implemented it, but, as you expected, the performance is even worse than the serial code: on my machine, the run time increase from 12 seconds to 255 seconds! Results are correct with critical directive (without the critical directive, results are wrong). So definitely not worth it to use OpenMP here.

One question: do you have a sample code for linear_index? I wrote my own clumsy function, is this what you had in mind? Maybe I’m going off topic, but I was wondering if there is a more general way of mapping multiple indeces to linear indeces

function linear_index(xm_c,xf_c,sm_c,sf_c,k_c) result(ind)
		implicit none
		!Purpose: map multiple subscripts for the states into a single index.
		!Declare inputs:
		integer,intent(in) :: xm_c,xf_c,sm_c,sf_c,k_c
        !Declare function result
        integer :: ind

        ! ind = sub2ind([n_x,n_x,n_s,n_s,n_k], [xm_c,xf_c,sm_c,sf_c,k_c])
        ind = xm_c + (xf_c-1)*n_x + (sm_c-1)*n_x*n_x + (sf_c-1)*n_x*n_x*n_s + (k_c-1)*n_x*n_x*n_s*n_s
        
    end function linear_index

ivanpribec · May 5, 2024, 9:29pm

Thats exactly what I had in mind. I didn’t attempt to run it. That shows the synchronization overhead at matrix-entry level is too big.

aledinola · June 26, 2024, 4:46pm

Hi @ivanpribec, I came back to this suggestion after a while. I am implementing it, but this line does not compile

idx_list = pack([((idx_pair(i,j),i=1,nstate),j=1,nstate)],table > 0)

It says that the shape do not conform
As far as I understand, the purpose is to create the indexes for the non-zero elements of table. I have this example in mind:

% table = [1,0,0,0;
% 0,1,1,0
% 0,0,1,0
% 0,0,1,0];

Then we want to get

% idx_list(1) → (1,1)

% idx_list(2) → (2,2)

% idx_list(3) → (2,3)

% idx_list(4) → (3,3)

% idx_list(5) → (4,3)

aledinola · June 26, 2024, 4:51pm

To make it clearer, I replaced your line with the following

idx_list = [((idx_pair(i,j),i=1,n),j=1,n)]
idx_list = pack(idx_list,table > 0)

The first line is OK and it generates a one-dimensional array of type idx_pair, which looks like this:

idx_list(1)
{…}
idx_list(1)%ROW: 1
idx_list(1)%COL: 1
idx_list(2)
{…}
idx_list(2)%ROW: 2
idx_list(2)%COL: 1

The second line does not work: perhaps because idx_list is one dimensional while table is two-dimensional?

Thanks!

ivanpribec · June 26, 2024, 7:05pm

Maybe add a reshape to get matching ranks in the pack function?

idx_list = pack( &
    reshape( &
        [((idx_pair(i,j),i=1,nstate),j=1,nstate)],
        [nstate,nstate]), &
    table > 0)

I will note that I’m pessimistic about obtaining performance improvements. I was really just putting options on the table in line with the philosophy “Design it Twice”. Even if the second option fails, at least one comes away with a better understanding of the problem space.

aledinola · June 26, 2024, 8:25pm

Thank you, that worked! The new code is a bit slower (not bad) than the original code but it was nonetheless useful to understand better the problem.

One issue is that I get a stak-overflow (see code below), so I have to set heap arrays 0, which may slow down the code.

Another issue is that this part seems to be a bottleneck (besides generating a stack-overflow)

idx_list = [((idx_pair(i,j),i=1,n),j=1,n)] !STACK-OVERFLOW HERE
idx_list = pack(reshape(idx_list,[n,n]),table > 0)

! Declarations
integer, allocatable :: row_ind(:),col_ind(:)
real(8), allocatable :: values(:)
type :: idx_pair
    integer :: row, col
end type idx_pair
type(idx_pair), allocatable :: idx_list(:)
! Execution
*omitted*

idx_list = [((idx_pair(i,j),i=1,n),j=1,n)] !STACK-OVERFLOW HERE
idx_list = pack(reshape(idx_list,[n,n]),table > 0)
nnz      = size(idx_list)
allocate(row_ind(nnz),col_ind(nnz),values(nnz))

do k = 1,nnz  
    i = idx_list(k)%row
    j = idx_list(k)%col
    row_ind(k) = i
    col_ind(k) = j
    values(k)  = flow_mat(i,j)
enddo

ivanpribec · June 26, 2024, 9:17pm

Seems to be usual issue (unfortunately) with the Intel Fortran compiler where it makes a temporary for the left-hand side on the stack (despite being too large).

Does writing that line as,

allocate(idx_list,source=[((idx_pair(i,j),i=1,n),j=1,n)])

help?

Edit: I wrote a test program:

program stack_test
implicit none

integer :: i, j, n

type :: idx_pair
    integer :: row, col
end type

type(idx_pair), allocatable :: idx_list(:)

n = 1
do
    print *, "n = ", n
    idx_list = [((idx_pair(i,j),i=1,n),j=1,n)]
    n = 2*n
end do

end program

The output I get on my Mac with an older ifort version:

$ ifort -v
ifort version 2021.9.0
$ ifort stack_test.f90 
$ ./a.out 
 n =            1
 n =            2
 n =            4
 n =            8
 n =           16
 n =           32
 n =           64
 n =          128
 n =          256
 n =          512
 n =         1024
forrtl: severe (174): SIGSEGV, segmentation fault occurred
Image              PC                Routine            Line        Source             
a.out              0000000108F77484  for__signal_handl     Unknown  Unknown
libsystem_platfor  00007FF816C78DFD  _sigtramp             Unknown  Unknown
a.out              0000000108F4FDD1  MAIN__                Unknown  Unknown
a.out              0000000108F4FB2E  main                  Unknown  Unknown

Changing the loop to,

n = 1
do
    print *, "n = ", n
    if (allocated(idx_list)) deallocate(idx_list)
    allocate(idx_list,source=[((idx_pair(i,j),i=1,n),j=1,n)])
    n = 2*n
end do

does not help and it fails just like previously.

ivanpribec · June 26, 2024, 10:41pm

Here’s one way to encapsulate the index list creation:

( this will start to clog your computer’s resources by swapping memory at some point, so make sure to terminate the program manually )

program stack_test
implicit none

integer :: n

type :: coo_t
    integer, allocatable :: row(:), col(:)
end type
type(coo_t) :: list
real, allocatable :: r(:,:)
logical, allocatable :: mask(:,:)
real(kind(1.0d0)) :: t1, t2

n = 1
do
    if (allocated(r)) deallocate(r)
    allocate(r(n,n))
    call random_number(r)
    mask = r > 0.95         ! table > 0
    call cpu_time(t1)
    list = idx_list(n,mask)
    call cpu_time(t2)
    print *, "n = ", n, ", time (s) = ", t2 - t1
    n = 2*n
end do

contains

    function idx_list(n,mask) result(list)
        implicit none
        integer, intent(in) :: n
        logical, intent(in) :: mask(n,n)
        type(coo_t) :: list

        integer :: nnz, i, j, k

        nnz = count(mask)
        allocate(list%row(nnz), list%col(nnz))
        
        k = 0
        do j = 1, n
            do i = 1, n
                if (mask(i,j)) then
                    k = k + 1
                    list%row(k) = i
                    list%row(k) = j
                end if
            end do
        end do

    end function

end program

The quadratic loop quickly becomes a major bottleneck (not that you can do anything about it?):

$ ifort -warn all -O3 stack_test.f90 
$ ./a.out
           1           0  3.100000000000021E-005
           2           0  3.999999999999664E-006
           4           0  2.999999999999531E-006
           8           3  3.999999999999664E-006
          16           8  6.000000000000796E-006
          32          33  5.999999999999929E-006
          64         121  9.000000000000327E-006
         128         520  2.299999999999958E-005
         256        1985  7.900000000000008E-005
         512        7922  3.599999999999992E-004
        1024       31763  1.513000000000000E-003
        2048      125599  5.727999999999997E-003
        4096      504114  2.296399999999998E-002
        8192     2013605  9.584400000000004E-002
       16384     8056859  0.371974000000000     
       32768    32221725   1.50515000000000     
       65536   128861460   29.6458400000000

Topic		Replies	Views
Writing wrappers for LAPACK and BLAS routines	54	2014	December 14, 2023
FSPARSE: A modern fortran library for sparse matrices - Call for contributors Announcements	14	2509	June 26, 2024
Replacing legacy code with Julia	15	1016	July 8, 2022
Sparse matrix library	3	774	March 20, 2024
OpenMP Parallel Loop Help	27	1139	August 4, 2023

Initialize a sparse matrix - code is very slow

Related topics