Looking at some linear algebra project I realized that many of them are written in C and in an parts in assembly (blis, openblas, gotoblas).
To keep linear algebra in fortran one would have to able to write code with a similar performance in fortran!?
Here is the task: the code below implements a home brew gem routine with cache-blocking. According to Intel Advisor the loops reach 84 to 89 % efficiency. But compared to MKL it needs about 1.8x as much time (single core and threaded).
The compiler flags are “-O3 -march=coffeelake”, the compiler is ifort 19.1.3, the mkl is from parallel studio 2020.4.304, using the 64 bit integer interface. The os is linux, kernel version 5. The current benchmark for single threaded is 3.3 seconds for mkl and 5.6 seconds for homebrew. Anybody out there who can close the gap and therefore prove that it is worth staying with fortran for linear algebra?
program test
 use blas95
 use, intrinsic :: iso_fortran_env, only: int64, real64
 real(real64), allocatable :: r21(:,:),r22(:,:),r24(:,:),r23(:,:)
 real(real64) :: t0,t1
 integer(int64) :: nr=7000,nc=4000
 allocate(r21(nr,nc),r22(nc,nc),r23(nr,nc),r24(nr,nc))
 r21(:,:)=1._real64;r22(:,:)=2._real64;r23(:,:)=0._real64;r24(:,:)=0._real64
 call cpu_time(t0)
 call gemm(a=r21,b=r22,c=r23)
 call cpu_time(t1)
 write(*,*) t1-t0
 call cpu_time(t0)
 call homebrew(a=r21,b=r22,c=r24)
 call cpu_time(t1)
 write(*,*) t1-t0
 if(maxval(abs(r23-r24))>10e-10) then
   write(*,*) maxval(abs(r23-r24))
   write(*,*) "error";call exit(1)
 end if
contains
  subroutine homebrew(a,b,c)
    real(real64), intent(in) :: a(:,:),b(:,:)
    real(real64), intent(inout) :: c(:,:)
    integer(int64) :: a1,a2,a3,a4,a5,a6,n=64
    do a1=1,size(b,2),n
      do a2=1,size(b,1),n
        do a3=1,size(a,1),n
          do a4=a1,min(a1+n-1,size(b,2)),1
            do a5=a2,min(a2+n-1,size(b,1)),1
              do a6=a3,min(a3+n-1,size(a,1)),1
                c(a6,a4)=c(a6,a4)+a(a6,a5)*b(a5,a4)
              end do
            end do
          end do
        end do
      end do
    end do
  end subroutine homebrew
end program test
