My try based on the original code (with some modifications) without omp:
gfortran --version && gcc -O3 -march=native -mtune=native -mfma -c dgemm_kernel.c -o dgemm_kernel.o && gfortran -O3 -march=native -mtune=native compare_dgemm.f90 -o compare_dgemm dgemm_kernel.o -lopenblas && export OMP_NUM_THREADS=1 && ./compare_dgemm
GNU Fortran (GCC) 15.2.0
Copyright (C) 2025 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
==================================================
Matrix size: 1000
Fortran(s): 0.073000
C-Intrinsics(s): 0.066000
OpenBLAS(s): 0.057000
Speedup (OpenBLAS): 1.280702
Speedup (C-Intrinsics): 1.106061
Error (Fortran): 0.000000
Error (C): 0.000000
==================================================
Matrix size: 1500
Fortran(s): 0.239000
C-Intrinsics(s): 0.222000
OpenBLAS(s): 0.193000
Speedup (OpenBLAS): 1.238342
Speedup (C-Intrinsics): 1.076577
Error (Fortran): 0.000000
Error (C): 0.000000
==================================================
Matrix size: 2000
Fortran(s): 0.549000
C-Intrinsics(s): 0.530000
OpenBLAS(s): 0.455000
Speedup (OpenBLAS): 1.206593
Speedup (C-Intrinsics): 1.035849
Error (Fortran): 0.000000
Error (C): 0.000000
module mod_dgemm
use iso_fortran_env, only: dp => real64
implicit none
private
public :: dp, dgemm_pure_fortran, dgemm_openblas, dgemm_c_intrinsics
integer, parameter :: MR = 6, NR = 8, MC = 96, KC = 224
interface
pure subroutine dgemm_kernel_6x8(n_block, A, m_block, B, k_block, C, ldc) bind(C)
use iso_c_binding, only: c_double, c_int
implicit none
integer(c_int), intent(in) :: ldc, m_block, n_block, k_block
real(c_double), intent(in) :: A(*), B(*)
real(c_double), intent(inout) :: C(*)
end subroutine
function aligned_alloc_wrapper(alignment, size) bind(C)
use iso_c_binding, only: c_ptr, c_long
implicit none
integer(c_long), value :: alignment, size
type(c_ptr) :: aligned_alloc_wrapper
end function
end interface
contains
!===========================
! BLAS wrapper
!===========================
subroutine dgemm_openblas(A, B, C, m, n, k)
integer, intent(in) :: m, n, k
real(dp), intent(in) :: A(m,k), B(k,n)
real(dp), intent(inout) :: C(m,n)
call dgemm('N','N', m,n,k, 1.0_dp, A,m, B,k, 0.0_dp, C,m)
end subroutine dgemm_openblas
!===========================
! Aligned allocation helper
!===========================
subroutine allocate_aligned_real64_1D(x, n, alignment, raw_ptr)
use iso_c_binding, only: c_double, c_int, c_ptr, c_long, c_sizeof, c_f_pointer, c_associated
real(c_double), pointer, intent(inout) :: x(:)
integer, intent(in) :: n
integer(c_int), intent(in), optional :: alignment
type(c_ptr), intent(out) :: raw_ptr
integer(c_long) :: align, total_size
align = merge(alignment, 64, present(alignment))
total_size = n * c_sizeof(0.0_c_double)
raw_ptr = aligned_alloc_wrapper(align, total_size)
if (.not. c_associated(raw_ptr)) stop "allocation failed"
call c_f_pointer(raw_ptr, x, [n])
end subroutine allocate_aligned_real64_1D
!===========================
! Intrinsics DGEMM
!===========================
subroutine dgemm_c_intrinsics(A, B, C, m, n, k)
use iso_c_binding, only: c_ptr
integer, intent(in) :: m, n, k
real(dp), intent(in) :: A(m,k), B(k,n)
real(dp), intent(inout) :: C(m,n)
integer :: i, j, ib, jb, pb, rows, cols
integer :: m_block, n_block, k_block, NC, paneli, panelj, col
real(dp), pointer :: A_block(:), B_block(:), A_panel(:,:), B_panel(:,:)
type(c_ptr) :: ptrA, ptrB
NC = n
call allocate_aligned_real64_1D(A_block, MC*KC, 64, ptrA)
call allocate_aligned_real64_1D(B_block, NC*KC, 64, ptrB)
! Loop over C blocks
do jb = 1, n, NC
n_block = min(NC, n-jb+1)
do pb = 1, k, KC
k_block = min(KC, k-pb+1)
! Pack B
panelj = 0
do j = 1, n_block, NR
cols = min(NR, n_block-j+1)
B_panel(1:cols,1:k_block) => B_block(panelj*NR*k_block+1 : panelj*NR*k_block+cols*k_block)
do col = 1, cols
B_panel(col,:) = B(pb:pb+k_block-1, jb+j+col-2)
end do
panelj = panelj+1
end do
! Loop over A blocks
do ib = 1, m, MC
m_block = min(MC, m-ib+1)
! Pack A
paneli = 0
do i = 1, m_block, MR
rows = min(MR, m_block-i+1)
A_panel(1:rows,1:k_block) => A_block(paneli*MR*k_block+1 : paneli*MR*k_block+rows*k_block)
A_panel(1:rows,:) = A(ib+i-1:ib+i+rows-2, pb:pb+k_block-1)
paneli = paneli+1
end do
call dgemm_kernel_6x8(n_block, A_block, m_block, B_block, k_block, C(ib,jb), size(C,1))
end do
end do
end do
deallocate(A_block, B_block)
end subroutine dgemm_c_intrinsics
!===========================
! Pure Fortran DGEMM
!===========================
pure subroutine pack_A(mb, kb, A, lda, A_pack)
integer, intent(in) :: mb, kb, lda
real(dp), intent(in) :: A(lda,*)
real(dp), intent(out) :: A_pack(MC*KC)
integer :: i,j,rows,row, abase
do i = 1, mb, MR
rows = min(MR, mb-i+1)
abase = (i/MR)*MR*kb
! do concurrent (j =1:kb, row=1:rows)
! A_pack(abase + (j-1)*rows + row) = A(i+row-1, j)
! end do
do j = 1, kb
do row = 1, rows
A_pack(abase + (j-1)*rows + row) = A(i+row-1, j)
end do
end do
end do
end subroutine
pure subroutine pack_B(kb, nb, B, ldb, B_pack, NC)
integer, intent(in) :: kb, nb, ldb, NC
real(dp), intent(in) :: B(ldb,*)
real(dp), intent(out) :: B_pack(KC*NC)
integer :: i,j,cols,col, bbase
do j = 1, nb, NR
cols = min(NR, nb-j+1)
bbase = (j/NR)*NR*kb
! do concurrent (i=1:kb, col=1:cols)
! B_pack(bbase + (i-1)*cols + col) = B(i, j+col-1)
! end do
do i = 1, kb
do col = 1, cols
B_pack(bbase + (i-1)*cols + col) = B(i, j+col-1)
end do
end do
end do
end subroutine
pure subroutine kernel_scalar(k, m_block, n_block, A, B, C, ldc)
integer, intent(in) :: k, m_block, n_block, ldc
real(dp), intent(in) :: A(*)
real(dp), intent(in) :: B(*)
real(dp), intent(inout) :: C(*)
integer :: i, j, p
do j = 1, n_block
do i = 1, m_block
do p = 1, k
C(i+(j-1)*ldc) = C(i+(j-1)*ldc) + A(i+(p-1)*m_block)*B(j+(p-1)*n_block)
end do
end do
end do
end subroutine
pure subroutine dgemm_pure_fortran(A, B, C, m, n, k)
integer, intent(in) :: m, n, k
real(dp), intent(in) :: A(m,k), B(k,n)
real(dp), intent(inout) :: C(m,n)
integer :: i, j, p, kk, ll, ib, jb, pb, jbound, n_full, m_full, m_block, n_block, k_block, r, bbase, abase, aptr, bptr
real(dp), allocatable :: A_block(:), B_block(:)
real(dp) :: cr(4,2,MR), arow
integer :: NC
NC = n
allocate(A_block(MC*KC))
allocate(B_block(KC*NC))
do jb = 1, n, NC
n_block = min(NC, n-jb+1)
do pb = 1, k, KC
k_block = min(KC, k-pb+1)
call pack_B(k_block, n_block, B(pb, jb), k, B_block, NC)
do ib = 1, m, MC
m_block = min(MC, m-ib+1)
call pack_A(m_block, k_block, A(ib, pb), m, A_block)
m_full = m_block-mod(m_block, MR)
n_full = n_block-mod(n_block, NR)
do j = 1, n_full, NR
bbase = ((j-1)/NR)*NR*k_block
do i = 1, m_full, MR
abase = ((i-1)/MR)*MR*k_block
cr = 0.0_dp
do p = 0, k_block-1
aptr = abase+p*MR+1
bptr = bbase+p*NR+1
do r = 1, MR
arow = A_block(aptr)
cr(1,1,r) = cr(1,1,r) + arow*B_block(bptr )
cr(2,1,r) = cr(2,1,r) + arow*B_block(bptr+1)
cr(3,1,r) = cr(3,1,r) + arow*B_block(bptr+2)
cr(4,1,r) = cr(4,1,r) + arow*B_block(bptr+3)
cr(1,2,r) = cr(1,2,r) + arow*B_block(bptr+4)
cr(2,2,r) = cr(2,2,r) + arow*B_block(bptr+5)
cr(3,2,r) = cr(3,2,r) + arow*B_block(bptr+6)
cr(4,2,r) = cr(4,2,r) + arow*B_block(bptr+7)
aptr = aptr + 1
end do
end do
do kk = 0, 3
C(ib+i+3, jb+j+kk-1) = C(ib+i+3, jb+j+kk-1) + cr(kk+1, 1, 5)
C(ib+i+4, jb+j+kk-1) = C(ib+i+4, jb+j+kk-1) + cr(kk+1, 1, 6)
C(ib+i+3, jb+j+kk+3) = C(ib+i+3, jb+j+kk+3) + cr(kk+1, 2, 5)
C(ib+i+4, jb+j+kk+3) = C(ib+i+4, jb+j+kk+3) + cr(kk+1, 2, 6)
do ll = 0, 3
C(ib+i+ll-1, jb+j+kk-1) = C(ib+i+ll-1, jb+j+kk-1) + cr(kk+1, 1, ll+1)
C(ib+i+ll-1, jb+j+kk+3) = C(ib+i+ll-1, jb+j+kk+3) + cr(kk+1, 2, ll+1)
end do
end do
end do
end do
! do concurrent (j = n_full+1: n_block: NR) local(jbound, bbase)
do j = n_full+1, n_block, NR
jbound = min(NR, n_block-j+1)
bbase = ((j-1)/NR)*NR*k_block+1
! do concurrent (i = 1: m_full: MR)
do i = 1, m_full, MR
call kernel_scalar(k_block, MR, jbound, A_block(((i-1)/MR)*MR*k_block+1), B_block(bbase), C(ib+i-1:, jb+j-1), m)
end do
end do
! do concurrent (j = 1: n_full: NR) local(bbase)
do j = 1, n_full, NR
bbase = ((j-1)/NR)*NR*k_block+1
! do concurrent (i = m_full+1: m_block: MR)
do i = m_full+1, m_block, MR
call kernel_scalar(k_block, min(MR, m_block-i+1), NR, A_block(((i-1)/MR)*MR*k_block+1), B_block(bbase), C(ib+i-1:, jb+j-1), m)
end do
end do
end do
end do
end do
end subroutine
end module mod_dgemm
program compare_dgemm
use mod_dgemm, only: dp, dgemm_pure_fortran, dgemm_openblas, dgemm_c_intrinsics
implicit none
integer, parameter :: nreps = 20
integer, parameter :: sizes(*) = [1000, 1500, 2000]
real(dp), allocatable :: A(:,:), B(:,:), C1(:,:), C2(:,:), C3(:,:)
real(dp) :: elapsed, tmin_for, tmin_c, tmin_blas, error_for, error_c
integer :: start, finish, rate, r, m, n, k, sz
call system_clock(count_rate=rate)
do sz = 1, size(sizes)
m = sizes(sz)
n = m
k = m
allocate(A(m,k), B(k,n), C1(m,n), C2(m,n), C3(m,n))
call random_number(A)
call random_number(B)
! Warm-up
C1 = 0.0_dp
call dgemm_pure_fortran(A,B,C1,m,n,k)
C2 = 0.0_dp
call dgemm_c_intrinsics(A,B,C2,m,n,k)
C3 = 0.0_dp
call dgemm_openblas(A,B,C3,m,n,k)
! --- Intrinsics timing (min over nreps) ---
tmin_for = huge(1.0_dp)
do r = 1, nreps
C1 = 0.0_dp
call system_clock(start)
call dgemm_pure_fortran(A,B,C1,m,n,k)
call system_clock(finish)
elapsed = real(finish-start,dp)/real(rate,dp)
tmin_for = min(tmin_for,elapsed)
end do
! --- Intrinsics timing (min over nreps) ---
tmin_c = huge(1.0_dp)
do r = 1, nreps
C2 = 0.0_dp
call system_clock(start)
call dgemm_c_intrinsics(A,B,C2,m,n,k)
call system_clock(finish)
elapsed = real(finish-start,dp)/real(rate,dp)
tmin_c = min(tmin_c,elapsed)
end do
! --- BLAS timing (min over nreps) ---
tmin_blas = huge(1.0_dp)
do r = 1, nreps
C3 = 0.0_dp
call system_clock(start)
call dgemm_openblas(A,B,C3,m,n,k)
call system_clock(finish)
elapsed = real(finish-start,dp)/real(rate,dp)
tmin_blas = min(tmin_blas,elapsed)
end do
error_for = maxval(abs(C1-C3))
error_c = maxval(abs(C2-C3))
print '(a)', repeat('=',50)
print '(a,i8)' , 'Matrix size: ', m
print '(a,f12.6)', 'Fortran(s): ', tmin_for
print '(a,f12.6)', 'C-Intrinsics(s): ', tmin_c
print '(a,f12.6)', 'OpenBLAS(s): ', tmin_blas
print '(a,f12.6)', 'Speedup (OpenBLAS): ', tmin_for/tmin_blas
print '(a,f12.6)', 'Speedup (C-Intrinsics): ', tmin_for/tmin_c
print '(a,f12.6)', 'Error (Fortran): ', error_for
print '(a,f12.6)', 'Error (C): ', error_c
deallocate(A,B,C1,C2,C3)
end do
end program compare_dgemm