My code in particular is a specific case that probably cannot be generalized. But as I mentioned in a previous post, the problem is similar to a Kronecker product of matrices.
So for that case, I have created a few different subroutines with all the ideas presented in this thread: In all cases, the matrices are a(n_a x m_a)
and b(n_b x m_b)
. Also to refer to matrix elements the indices (i,j)
are used, and the loops variables are i_a, j_a, i_b, j_b, i_c, j_c
(hopefully that helps to see in which direction -row/column- of what matrix -a/b/c- the loop operates on).
How the code works: the matrix c
is filled in by columns, in chucks of n_b
elements.
- First column:
c(1:n_b,1) = a(1,1)*b(:,1)
, then c(n_b+1:2*n_b,1) = a(2,1)*b(:,1)
, etc, all the way to c((n_a-1)*n_b+1:n_a*n_b) =:a(n_a,1)*b(:,1)
.
- Second column: same elements
a(1,1), a(2,1) ... a(n_a,1)
multiplying the second column of b
: b(:,2)
.
- This process is repeated
m_b
times, with the same column of a,
until matrix c
has been filled to column n_b
.
- Repeat this process
m_a
times, successively using a(:,2), a(:,3), ..., a(:,m_a)
.
With a slice, allocatable, implied-do loop
subroutine kronecker_product_slice_v1(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(in) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, j_c
integer :: t
integer(ik), allocatable :: slice(:)
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(slice(n_b))
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
slice = [(t, t = 1, n_b)]
do i_a = 1, n_a
tmp = a(i_a, j_a)
c(slice, j_c) = tmp * b(:, j_b)
slice = slice + n_b
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_slice_v1
With slice, passed as an argument
subroutine kronecker_product_slice_v2(c, a, b, array)
integer, intent(out), allocatable :: c(:,:)
integer, intent(in) :: a(:,:), b(:,:)
integer, intent(in) :: array(:)
integer :: n_a, n_b, m_a, m_b
integer :: i_a, j_a, j_b. j_c
integer :: t
integer, allocatable :: slice(:)
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
slice = array
do i_a = 1, n_a
c(slice, j_c) = a(i_a, j_a) * b(:, j_b)
slice = slice + n_b
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_slice_v2
Referencing, without slice array, v1 (as suggested by @ivanpribec if I understood correctly)
subroutine kronecker_product_ref_v1(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b ! rows/cols of matrices a, b
integer :: i_a, j_a, j_b, i_b, j_c
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
do i_b = 1, n_b
c((i_a - 1) * n_b + i_b, j_c) = tmp * b(i_b, j_b)
end do
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_ref_v1
Referencing, without slice array, v2 (similar to v1, but using variables instead of computing slice)
subroutine kronecker_product_ref_v2(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, j_c
integer :: low, upp
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
low = 1
upp = n_b
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
c(low : upp, j_c) = tmp * b(:, j_b)
low = low + n_b
upp = upp + n_b
end do
j_c = j_c + 1
low = 1
upp = n_b
end do
end do
end subroutine kronecker_product_ref_v2
An extra DO loop, v1 (as suggested by @urbanjost, computing the slice)
subroutine kronecker_product_do_v1(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, i_b, j_c
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
do i_b = 1, n_b
c((i_a - 1) * n_b + i_b, j_c) = tmp * b(i_b, j_b)
end do
end do
j_c = j_c + 1
end do
end do
end subroutine kronecker_product_do_v1
An extra DO loop, v2 (with variables instead of computing the slice)
subroutine kronecker_product_do_v2(c, a, b)
integer, intent(out), allocatable :: c(:,:)
integer, intent(out) :: a(:,:), b(:,:)
integer :: tmp
integer :: n_a, m_a, n_b, m_b
integer :: i_a, j_a, j_b, i_b, j_c
integer :: skip
n_a = size(a, dim = 1)
m_a = size(a, dim = 2)
n_b = size(b, dim = 1)
m_b = size(b, dim = 2)
allocate(c(n_a * n_b, m_a * m_b))
skip = 0
j_c = 1
do j_a = 1, m_a
do j_b = 1, m_b
do i_a = 1, n_a
tmp = a(i_a, j_a)
do i_b = 1, n_b
c(skip + i_b, j_c) = tmp * b(i_b, j_b)
end do
skip = skip + n_b
end do
j_c = j_c + 1
skip = 0
end do
end do
end subroutine kronecker_product_do_v2
There are more options to test (e.g. passing the sizes n_a, n_b
, etc) and possibly others may be faster, but that’s all I have tested so far. In most of this subroutines the variable tmp
(that stores the value a(i_a, j_a)
) is not strictly needed, but it seemed to help (i.e. be faster). I could be wrong.
From my tests, the do and ref versions are faster than the ones using the slice.
Between those two, it’s hard to say (same with v1 or v2). ref seems to be faster, but I cannot confirm, and it also depends on the sizes of a
and b
.
Something else to test could be the use of integer
, real
and complex
. Maybe the fastest version differs between types.
A test problem for any of these subroutines is as follows:
program test
implicit none
integer, parameter :: ik = selected_int_kind(r = 35) ! for timing
integer, parameter :: rk = selected_real_kind(r = 3000) ! quad precision
integer(ik) :: ts, tf, tr ! timing
integer :: i
integer, parameter :: array(*) = [(i, i = 1, XX)] ! slice, substitute XX with size(b,1)
integer, allocatable :: a(:,:), b(:,:), c(:,:)
a = reshape([1, 1, 1, 3, 0, 2, 2, 0, 2], [3, 3])
b = reshape([0, 5, 1, 5, 0, 1], [3, 2])
call system_clock(ts, tr)
call kronecker_product_XYZ(c, a, b) ! substitute XYZ
call system_clock(tf)
print *, "time: ", real(tf - ts, rk) / tr
do i = 1, size(a, 1) * size(b, 1)
print *, c(i,:)
end do
contains
! ...
end program test
PS: There may be bugs in the subroutine codes