Looking at some linear algebra project I realized that many of them are written in C and in an parts in assembly (blis, openblas, gotoblas).
To keep linear algebra in fortran one would have to able to write code with a similar performance in fortran!?
Here is the task: the code below implements a home brew gem routine with cache-blocking. According to Intel Advisor the loops reach 84 to 89 % efficiency. But compared to MKL it needs about 1.8x as much time (single core and threaded).
The compiler flags are “-O3 -march=coffeelake”, the compiler is ifort 19.1.3, the mkl is from parallel studio 2020.4.304, using the 64 bit integer interface. The os is linux, kernel version 5. The current benchmark for single threaded is 3.3 seconds for mkl and 5.6 seconds for homebrew. Anybody out there who can close the gap and therefore prove that it is worth staying with fortran for linear algebra?
program test
use blas95
use, intrinsic :: iso_fortran_env, only: int64, real64
real(real64), allocatable :: r21(:,:),r22(:,:),r24(:,:),r23(:,:)
real(real64) :: t0,t1
integer(int64) :: nr=7000,nc=4000
allocate(r21(nr,nc),r22(nc,nc),r23(nr,nc),r24(nr,nc))
r21(:,:)=1._real64;r22(:,:)=2._real64;r23(:,:)=0._real64;r24(:,:)=0._real64
call cpu_time(t0)
call gemm(a=r21,b=r22,c=r23)
call cpu_time(t1)
write(*,*) t1-t0
call cpu_time(t0)
call homebrew(a=r21,b=r22,c=r24)
call cpu_time(t1)
write(*,*) t1-t0
if(maxval(abs(r23-r24))>10e-10) then
write(*,*) maxval(abs(r23-r24))
write(*,*) "error";call exit(1)
end if
contains
subroutine homebrew(a,b,c)
real(real64), intent(in) :: a(:,:),b(:,:)
real(real64), intent(inout) :: c(:,:)
integer(int64) :: a1,a2,a3,a4,a5,a6,n=64
do a1=1,size(b,2),n
do a2=1,size(b,1),n
do a3=1,size(a,1),n
do a4=a1,min(a1+n-1,size(b,2)),1
do a5=a2,min(a2+n-1,size(b,1)),1
do a6=a3,min(a3+n-1,size(a,1)),1
c(a6,a4)=c(a6,a4)+a(a6,a5)*b(a5,a4)
end do
end do
end do
end do
end do
end do
end subroutine homebrew
end program test