I have timed various ways to compute the Gram matrix matmul(A, transpose(A))
aka A*A'
using gfortran on Windows and found that
- It is much faster to compute
xtr = transpose(x)
g = matmul(x, xtr)
than to pass tranpose(x)
as an argument with g = matmul(x, transpose(x))
-
It is almost twice as fast to compute
A*A'
with loops. -
Trying to exploit the symmetry of the result by computing only some of the elements of the Gram matrix is much slower, to my surprise.
The code is
module kind_mod
implicit none
private
public :: dp
integer, parameter :: dp = kind(1.0d0)
end module kind_mod
module util_mod
use kind_mod, only: dp
implicit none
private
public :: gram, gram_sub, gram_symm
contains
function gram(A) result(G)
! computes matmul(A, transpose(A))
real(kind=dp), intent(in) :: A(:,:)
real(kind=dp) :: G(size(A,1), size(A,1))
integer :: i, j, k
G = 0.0_dp
do j = 1, size(A,1)
do k = 1, size(A,2)
do i = 1, size(A,1)
G(i,j) = G(i,j) + A(i,k)*A(j,k)
end do
end do
end do
end function gram
!
subroutine gram_sub(A, G)
! computes matmul(A, transpose(A))
real(kind=dp), intent(in) :: A(:,:)
real(kind=dp), intent(out) :: G(:,:)
integer :: i, j, k
G = 0.0_dp
do j = 1, size(A,1)
do k = 1, size(A,2)
do i = 1, size(A,1)
G(i,j) = G(i,j) + A(i,k)*A(j,k)
end do
end do
end do
end subroutine gram_sub
!
function gram_symm(A) result(G)
! computes matmul(A, transpose(A)) exploiting symmetry of the result
real(kind=dp), intent(in) :: A(:,:)
real(kind=dp) :: G(size(A,1), size(A,1))
integer :: i, j, k
G = 0.0_dp
do j = 1, size(A,1)
do k = 1, size(A,2)
do i = j, size(A,1)
G(i,j) = G(i,j) + A(i,k)*A(j,k)
end do
end do
end do
! Copy the upper triangle to the lower triangle
do j = 1, size(A,1)
do i = j+1, size(A,1)
G(j,i) = G(i,j)
end do
end do
end function gram_symm
end module util_mod
program xgram
! 06/01/2023 04:33 PM compare speed of gram(x) and matmul(x, transpose(x))
use kind_mod, only: dp
use util_mod, only: gram, gram_symm, gram_sub
use iso_fortran_env, only: compiler_version, compiler_options
implicit none
integer, parameter :: n1 = 10**2, n2=10**6, ntimes = 7
real(kind=dp) :: x(n1,n2), xtr(n2,n1), g(n1, n1), gsub(n1, n1), gsymm(n1, n1), gchk(n1, n1), times(ntimes)
logical, parameter :: debug = .false.
print*,"compiler_version: " // compiler_version()
print*,"compiler_options: " // compiler_options()
call random_seed()
call random_number(x)
call cpu_time(times(1)) ; xtr = transpose(x)
call cpu_time(times(2)) ; g = gram(x)
call cpu_time(times(3)) ; call gram_sub(x, gsub)
call cpu_time(times(4)) ; gsymm = gram_symm(x)
call cpu_time(times(5)) ; gchk = matmul(x, xtr) ! use precomputed transpose(x)
call cpu_time(times(6)) ; gchk = matmul(x, transpose(x))
call cpu_time(times(7))
print "(*(g0, 1x))", "n1, n2, kind(x), huge(x) =",n1, n2, kind(x), huge(x)
if (debug) then
print*,maxval(g), maxval(gchk), minval(g), minval(gchk)
print*,g(1, 1), gchk(1, 1), g(n1, n1), gchk(n1, n1)
print*,"g(1, 2), g(2, 1) =", g(1, 2), g(2, 1)
end if
print*,"max diff =",maxval(abs(g-gchk)), maxval(abs(g-gsymm))
print "(/,*(a15))", "operation", "tranpose", "gram", "gram_sub", "gram_symm", "matmul", "matmul_temp"
print "(a15,*(f15.3))", "time", times(2:) - times(:ntimes-1)
end program xgram
and sample results compiling with gfortran -march=native -O3 -flto
are
compiler_version: GCC version 13.0.0 20221218 (experimental)
compiler_options: -march=tigerlake -mmmx -mpopcnt -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -mno-sse4a -mno-fma4 -mno-xop -mfma -mavx512f -mbmi -mbmi2 -maes -mpclmul -mavx512vl -mavx512bw -mavx512dq -mavx512cd -mno-avx512er -mno-avx512pf -mavx512vbmi -mavx512ifma -mno-avx5124vnniw -mno-avx5124fmaps -mavx512vpopcntdq -mavx512vbmi2 -mgfni -mvpclmulqdq -mavx512vnni -mavx512bitalg -mno-avx512bf16 -mavx512vp2intersect -mno-3dnow -madx -mabm -mno-cldemote -mclflushopt -mclwb -mno-clzero -mcx16 -mno-enqcmd -mf16c -mfsgsbase -mfxsr -mno-hle -msahf -mno-lwp -mlzcnt -mmovbe -mmovdir64b -mmovdiri -mno-mwaitx -mno-pconfig -mno-pku -mno-prefetchwt1 -mprfchw -mno-ptwrite -mrdpid -mrdrnd -mrdseed -mno-rtm -mno-serialize -mno-sgx -msha -mshstk -mno-tbm -mno-tsxldtrk -mvaes -mno-waitpkg -mno-wbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint --param=l1-cache-size=48 --param=l1-cache-line-size=64 --param=l2-cache-size=12288 -mtune=tigerlake -g -O3 -Wall -Werror=unused-parameter -Werror=unused-variable -Werror=unused-function -Wno-maybe-uninitialized -Wno-surprising -flto -fbounds-check -fmodule-private
n1, n2, kind(x), huge(x) = 100 1000000 8 0.17976931348623157E+309
max diff = 1.1641532182693481E-010 0.0000000000000000
operation tranpose gram gram_sub gram_symm matmul matmul_temp
time 1.000 0.859 0.797 4.734 0.516 5.781
One should compare the time for gram
or gram_sub
with the sum of times for transpose
and matmul
. Suggestions for speeding up the calculation are welcome.