I noticed this recently for the Level 2 BLAS SGER
/DGER
operation which computes the rank-1 update:
A = A + \alpha x y^T
The matrix A is m \times n, the vector x is m \times 1 and the vector y is n \times 1. In Fortran there are multiple ways to perform this (assuming you don’t just use BLAS):
program rank1_update
implicit none
real :: A(3,4)
real :: x(3), y(4)
real :: alpha
integer :: i
alpha = 1.0
x = [1,2,3]
y = [1,2,3,4]
A = 0
call sger_dc(alpha,A,x,y)
print '(4(F8.3))', (A(i,:), i=1,3)
print '()'
A = 0
call sger_arr(alpha,A,x,y)
print '(4(F8.3))', (A(i,:), i=1,3)
print '()'
A = 0
call sger_mm(alpha,A,x,y)
print '(4(F8.3))', (A(i,:), i=1,3)
print '()'
A = 0
call sger_reshape(alpha,A,x,y)
print '(4(F8.3))', (A(i,:), i=1,3)
contains
! loops
subroutine sger_dc(alpha,a,x,y)
real, intent(in) :: alpha, x(:), y(:)
real, intent(inout) :: a(:,:)
integer :: i, j
do concurrent(i=1:size(a,1),j=1:size(a,2))
a(i,j) = a(i,j) + alpha * x(i) * y(j)
end do
end subroutine
! array syntax
subroutine sger_arr(alpha,a,x,y)
real, intent(in) :: alpha, x(:), y(:)
real, intent(inout) :: a(:,:)
a = a + alpha * spread(x,2,size(a,2)) &
* spread(y,1,size(a,1))
end subroutine
! matmul
subroutine sger_mm(alpha,a,x,y)
real, intent(in) :: alpha
real, intent(in), target :: x(:), y(:)
real, intent(inout) :: a(:,:)
real, pointer :: x2(:,:), y2T(:,:)
x2(1:size(x),1:1) => x
y2T(1:1,1:size(y)) => y
a = a + alpha * matmul(x2,y2T)
end subroutine
! reshape
subroutine sger_reshape(alpha,a,x,y)
real, intent(in) :: alpha
real, intent(in) :: x(:), y(:)
real, intent(inout) :: a(:,:)
associate(x2 => reshape(x,shape=[size(x),1]), &
y2 => reshape(y,shape=[size(y),1]))
a = a + alpha * matmul(x2,transpose(y2))
end associate
end subroutine
end program
Guess which one is the most efficient in practice? The loop one; for other variants the compilers create temporary arrays
Edit: I was curious how the generated procedures differ in size (or how bloated are they)
$ gfortran -Wall -O3 -march=skylake -fno-inline -c rank1_update.f90
$ nm --print-size --size-sort --radix=d rank1_update.o | grep "sger" | awk '{print $2, $4}'
0000000000000193 sger_dc.3.constprop.0.isra.0
0000000000000197 sger_mm.1.constprop.0.isra.0
0000000000000698 sger_arr.2.constprop.0.isra.0
0000000000003307 sger_reshape.0.constprop.0.isra.0
(note that gfortran performs constant propagation)
$ ifort -warn all -O3 -xSKYLAKE -inline-level=0 -c rank1_update.f90
$ nm --print-size --size-sort --radix=d rank1_update.o | grep "IP_sger" | awk '{print $2, $4}'
0000000000000256 rank1_update_IP_sger_dc_
0000000000000272 rank1_update_IP_sger_arr_
0000000000000368 rank1_update_IP_sger_mm_
0000000000001232 rank1_update_IP_sger_reshape_