My code in particular is a specific case that probably cannot be generalized. But as I mentioned in a previous post, the problem is similar to a Kronecker product of matrices.

So for that case, I have created a few different subroutines with all the ideas presented in this thread: In all cases, the matrices are `a(n_a x m_a)`

and `b(n_b x m_b)`

. Also to refer to matrix elements the indices `(i,j)`

are used, and the loops variables are `i_a, j_a, i_b, j_b, i_c, j_c`

(hopefully that helps to see in which direction -row/column- of what matrix -a/b/c- the loop operates on).

How the code works: the matrix `c`

is filled in by columns, in chucks of `n_b`

elements.

- First column:
`c(1:n_b,1) = a(1,1)*b(:,1)`

, then `c(n_b+1:2*n_b,1) = a(2,1)*b(:,1)`

, etc, all the way to `c((n_a-1)*n_b+1:n_a*n_b) =:a(n_a,1)*b(:,1)`

.
- Second column: same elements
`a(1,1), a(2,1) ... a(n_a,1)`

multiplying the second column of `b`

: `b(:,2)`

.
- This process is repeated
`m_b`

times, with the same column of `a,`

until matrix `c`

has been filled to column `n_b`

.
- Repeat this process
`m_a`

times, successively using `a(:,2), a(:,3), ..., a(:,m_a)`

.

**With a slice, allocatable, implied-do loop**

```
subroutine kronecker_product_slice_v1(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(in) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, j_c
integer :: t
integer(ik), allocatable :: slice(:)
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(slice(n_b))
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
slice = [(t, t = 1, n_b)]
do i_a = 1, n_a
tmp = a(i_a, j_a)
c(slice, j_c) = tmp * b(:, j_b)
slice = slice + n_b
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_slice_v1
```

**With slice, passed as an argument**

```
subroutine kronecker_product_slice_v2(c, a, b, array)
integer, intent(out), allocatable :: c(:,:)
integer, intent(in) :: a(:,:), b(:,:)
integer, intent(in) :: array(:)
integer :: n_a, n_b, m_a, m_b
integer :: i_a, j_a, j_b. j_c
integer :: t
integer, allocatable :: slice(:)
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
slice = array
do i_a = 1, n_a
c(slice, j_c) = a(i_a, j_a) * b(:, j_b)
slice = slice + n_b
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_slice_v2
```

**Referencing, without slice array, v1** (as suggested by @ivanpribec if I understood correctly)

```
subroutine kronecker_product_ref_v1(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b ! rows/cols of matrices a, b
integer :: i_a, j_a, j_b, i_b, j_c
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
do i_b = 1, n_b
c((i_a - 1) * n_b + i_b, j_c) = tmp * b(i_b, j_b)
end do
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_ref_v1
```

**Referencing, without slice array, v2** (similar to v1, but using variables instead of computing slice)

```
subroutine kronecker_product_ref_v2(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, j_c
integer :: low, upp
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
low = 1
upp = n_b
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
c(low : upp, j_c) = tmp * b(:, j_b)
low = low + n_b
upp = upp + n_b
end do
j_c = j_c + 1
low = 1
upp = n_b
end do
end do
end subroutine kronecker_product_ref_v2
```

**An extra DO loop, v1** (as suggested by @urbanjost, computing the slice)

```
subroutine kronecker_product_do_v1(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, i_b, j_c
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
do i_b = 1, n_b
c((i_a - 1) * n_b + i_b, j_c) = tmp * b(i_b, j_b)
end do
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_do_v1
```

**An extra DO loop, v2** (with variables instead of computing the slice)

```
subroutine kronecker_product_do_v2(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, i_b, j_c
integer :: skip
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
skip = 0
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
do i_b = 1, n_b
c(skip + i_b, j_c) = tmp * b(i_b, j_b)
end do
skip = skip + n_b
end do
j_c = j_c + 1
skip = 0
end do
end do
end subroutine kronecker_product_do_v2
```

There are more options to test (e.g. passing the sizes `n_a, n_b`

, etc) and possibly others may be faster, but thatâ€™s all I have tested so far. In most of this subroutines the variable `tmp`

(that stores the value `a(i_a, j_a)`

) is not strictly needed, but it seemed to help (i.e. be faster). I could be wrong.

From my tests, the **do** and **ref** versions are faster than the ones using the slice.

Between those two, itâ€™s hard to say (same with v1 or v2). **ref** seems to be faster, but I cannot confirm, and it also depends on the sizes of `a`

and `b`

.

Something else to test could be the use of `integer`

, `real`

and `complex`

. Maybe the fastest version differs between types.

A test problem for any of these subroutines is as follows:

```
program test
implicit none
integer, parameter :: ik = selected_int_kind(r = 35) ! for timing
integer, parameter :: rk = selected_real_kind(r = 3000) ! quad precision
integer(ik) :: ts, tf, tr ! timing
integer :: i
integer, parameter :: array(*) = [(i, i = 1, XX)] ! slice, substitute XX with size(b,1)
integer, allocatable :: a(:,:), b(:,:), c(:,:)
a = reshape([1, 1, 1, 3, 0, 2, 2, 0, 2], [3, 3])
b = reshape([0, 5, 1, 5, 0, 1], [3, 2])
call system_clock(ts, tr)
call kronecker_product_XYZ(c, a, b) ! substitute XYZ
call system_clock(tf)
print *, "time: ", real(tf - ts, rk) / tr
do i = 1, size(a, 1) * size(b, 1)
print *, c(i,:)
end do
contains
! ...
end program test
```

PS: There may be bugs in the subroutine codes