Would using DO CONCURRENT with the -qopenmp option for Intel Fortran help? Here is the homebrew subroutine with DO CONCURRENT.
!> Cache-blocked matrix multiply-accumulate: c = c + matmul(a, b).
!!
!! a : left operand; read at a(1:size(a,1), 1:size(b,1)).
!! b : right operand; its extents set the inner and column loop ranges.
!! c : accumulator, updated in place at c(1:size(a,1), 1:size(b,2)).
!!     The caller must initialise it (e.g. to zero for a plain product).
!!
!! No shape checks are performed: a needs at least size(b,1) columns, and
!! c needs at least size(a,1) rows and size(b,2) columns.
subroutine homebrew(a,b,c)
use, intrinsic :: iso_fortran_env, only: int64, real64
implicit none
real(real64), intent(in) :: a(:,:),b(:,:)
real(real64), intent(inout) :: c(:,:)
! Tile edge length. A named constant rather than an initialised local,
! because initialisation at declaration would implicitly give it SAVE.
integer(int64), parameter :: n = 64_int64
integer(int64) :: a1, a3, a6
integer(int64) :: nrow, ninner, ncol

! Request int64 extents so every min() below has arguments of a single kind.
nrow = size(a, 1, kind=int64)
ninner = size(b, 1, kind=int64)
ncol = size(b, 2, kind=int64)

! Each (a1, a3) pair owns one disjoint tile of c, so the tiles are
! independent of each other and may legitimately run concurrently.
do concurrent (a1 = 1:ncol:n, a3 = 1:nrow:n)
  block
    ! Declared inside the block so each tile iteration has its own copies.
    integer(int64) :: a2, a4, a5, row_hi, col_hi
    row_hi = min(a3 + n - 1, nrow)
    col_hi = min(a1 + n - 1, ncol)
    ! The inner-dimension loops (a2, a5) all accumulate into the same
    ! c(a6, a4); that is a cross-iteration dependency, so they are
    ! ordinary ordered loops and not do concurrent.
    do a2 = 1, ninner, n
      do a4 = a1, col_hi
        do a5 = a2, min(a2 + n - 1, ninner)
          ! Rows of one column are independent and contiguous in memory.
          do concurrent (a6 = a3:row_hi)
            c(a6, a4) = c(a6, a4) + a(a6, a5)*b(a5, a4)
          end do
        end do
      end do
    end do
  end block
end do
end subroutine homebrew