Achieving OpenBLAS DGEMM performance with Fortran vs C intrinsics: why is Fortran slower?

My attempt, based on the original code (with some modifications), without OpenMP:

gfortran --version && gcc -O3 -march=native -mtune=native -mfma -c dgemm_kernel.c -o dgemm_kernel.o && gfortran -O3 -march=native -mtune=native compare_dgemm.f90 -o compare_dgemm dgemm_kernel.o -lopenblas && export OMP_NUM_THREADS=1 && ./compare_dgemm

GNU Fortran (GCC) 15.2.0
Copyright (C) 2025 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
==================================================
Matrix size:                1000
Fortran(s):                 0.073000
C-Intrinsics(s):            0.066000
OpenBLAS(s):                0.057000
Speedup (OpenBLAS):         1.280702
Speedup (C-Intrinsics):     1.106061
Error (Fortran):            0.000000
Error (C):                  0.000000
==================================================
Matrix size:                1500
Fortran(s):                 0.239000
C-Intrinsics(s):            0.222000
OpenBLAS(s):                0.193000
Speedup (OpenBLAS):         1.238342
Speedup (C-Intrinsics):     1.076577
Error (Fortran):            0.000000
Error (C):                  0.000000
==================================================
Matrix size:                2000
Fortran(s):                 0.549000
C-Intrinsics(s):            0.530000
OpenBLAS(s):                0.455000
Speedup (OpenBLAS):         1.206593
Speedup (C-Intrinsics):     1.035849
Error (Fortran):            0.000000
Error (C):                  0.000000

The code:
module mod_dgemm
   use iso_fortran_env, only: dp => real64
   implicit none

   private
   public :: dp, dgemm_pure_fortran, dgemm_openblas, dgemm_c_intrinsics

   integer, parameter :: MR = 6, NR = 8, MC = 96, KC = 224

   interface
      pure subroutine dgemm_kernel_6x8(n_block, A, m_block, B, k_block, C, ldc) bind(C)
         use iso_c_binding, only: c_double, c_int
         implicit none
         integer(c_int), intent(in) :: ldc, m_block, n_block, k_block
         real(c_double), intent(in) :: A(*), B(*)
         real(c_double), intent(inout) :: C(*)
      end subroutine
      function aligned_alloc_wrapper(alignment, size) bind(C)
         use iso_c_binding, only: c_ptr, c_long
         implicit none
         integer(c_long), value :: alignment, size
         type(c_ptr) :: aligned_alloc_wrapper
      end function
      subroutine free_wrapper(ptr) bind(C, name="free")
         use iso_c_binding, only: c_ptr
         implicit none
         type(c_ptr), value :: ptr
      end subroutine
   end interface

contains

   !===========================
   ! BLAS wrapper
   !===========================
   subroutine dgemm_openblas(A, B, C, m, n, k)
      integer,  intent(in)    :: m, n, k
      real(dp), intent(in)    :: A(m,k), B(k,n)
      real(dp), intent(inout) :: C(m,n)
      call dgemm('N','N', m,n,k, 1.0_dp, A,m, B,k, 0.0_dp, C,m)
   end subroutine dgemm_openblas

   !===========================
   ! Aligned allocation helper
   !===========================
   subroutine allocate_aligned_real64_1D(x, n, alignment, raw_ptr)
      use iso_c_binding, only: c_double, c_int, c_ptr, c_long, c_sizeof, c_f_pointer, c_associated
      real(c_double), pointer, intent(inout) :: x(:)
      integer, intent(in) :: n
      integer(c_int), intent(in), optional :: alignment
      type(c_ptr), intent(out) :: raw_ptr
      integer(c_long) :: align, total_size

      ! default to 64-byte alignment when no alignment is supplied
      if (present(alignment)) then
         align = int(alignment, c_long)
      else
         align = 64_c_long
      end if
      total_size = n * c_sizeof(0.0_c_double)
      raw_ptr = aligned_alloc_wrapper(align, total_size)
      if (.not. c_associated(raw_ptr)) stop "allocation failed"
      call c_f_pointer(raw_ptr, x, [n])
   end subroutine allocate_aligned_real64_1D

   !===========================
   ! Intrinsics DGEMM
   !===========================
   subroutine dgemm_c_intrinsics(A, B, C, m, n, k)
      use iso_c_binding, only: c_ptr
      integer, intent(in) :: m, n, k
      real(dp), intent(in) :: A(m,k), B(k,n)
      real(dp), intent(inout) :: C(m,n)

      integer :: i, j, ib, jb, pb, rows, cols
      integer :: m_block, n_block, k_block, NC, paneli, panelj, col
      real(dp), pointer :: A_block(:), B_block(:), A_panel(:,:), B_panel(:,:)
      type(c_ptr) :: ptrA, ptrB

      NC = n
      call allocate_aligned_real64_1D(A_block, MC*KC, 64, ptrA)
      call allocate_aligned_real64_1D(B_block, NC*KC, 64, ptrB)

      ! Loop over C blocks
      do jb = 1, n, NC
         n_block = min(NC, n-jb+1)
         do pb = 1, k, KC
            k_block = min(KC, k-pb+1)

            ! Pack B
            panelj = 0
            do j = 1, n_block, NR
               cols = min(NR, n_block-j+1)
               B_panel(1:cols,1:k_block) => B_block(panelj*NR*k_block+1 : panelj*NR*k_block+cols*k_block)
               do col = 1, cols
                  B_panel(col,:) = B(pb:pb+k_block-1, jb+j+col-2)
               end do
               panelj = panelj+1
            end do

            ! Loop over A blocks
            do ib = 1, m, MC
               m_block = min(MC, m-ib+1)

               ! Pack A
               paneli = 0
               do i = 1, m_block, MR
                  rows = min(MR, m_block-i+1)
                  A_panel(1:rows,1:k_block) => A_block(paneli*MR*k_block+1 : paneli*MR*k_block+rows*k_block)
                  A_panel(1:rows,:) = A(ib+i-1:ib+i+rows-2, pb:pb+k_block-1)
                  paneli = paneli+1
               end do

               call dgemm_kernel_6x8(n_block, A_block, m_block, B_block, k_block, C(ib,jb), size(C,1))
            end do
         end do
      end do

      ! release the C-allocated buffers
      nullify(A_block, B_block)
      call free_wrapper(ptrA)
      call free_wrapper(ptrB)
   end subroutine dgemm_c_intrinsics

   !===========================
   ! Pure Fortran DGEMM
   !===========================
   pure subroutine pack_A(mb, kb, A, lda, A_pack)
      integer, intent(in)   :: mb, kb, lda
      real(dp), intent(in)  :: A(lda,*)
      real(dp), intent(out) :: A_pack(MC*KC)
      integer               :: i,j,rows,row, abase
      do i = 1, mb, MR
         rows = min(MR, mb-i+1)
         abase = (i/MR)*MR*kb
         ! do concurrent (j =1:kb, row=1:rows)
         !    A_pack(abase + (j-1)*rows + row) = A(i+row-1, j)
         ! end do
         do j = 1, kb
            do row = 1, rows
               A_pack(abase + (j-1)*rows + row) = A(i+row-1, j)
            end do
         end do
      end do
   end subroutine

   pure subroutine pack_B(kb, nb, B, ldb, B_pack, NC)
      integer, intent(in)   :: kb, nb, ldb, NC
      real(dp), intent(in)  :: B(ldb,*)
      real(dp), intent(out) :: B_pack(KC*NC)
      integer               :: i,j,cols,col, bbase
      do j = 1, nb, NR
         cols = min(NR, nb-j+1)
         bbase = (j/NR)*NR*kb
         ! do concurrent (i=1:kb, col=1:cols)
         !    B_pack(bbase + (i-1)*cols + col) = B(i, j+col-1)
         ! end do
         do i = 1, kb
            do col = 1, cols
               B_pack(bbase + (i-1)*cols + col) = B(i, j+col-1)
            end do
         end do
      end do
   end subroutine

   pure subroutine kernel_scalar(k, m_block, n_block, A, B, C, ldc)
      integer,  intent(in)    :: k, m_block, n_block, ldc
      real(dp), intent(in)    :: A(*)
      real(dp), intent(in)    :: B(*)
      real(dp), intent(inout) :: C(*)
      integer                 :: i, j, p
      do j = 1, n_block
         do i = 1, m_block
            do p = 1, k
               C(i+(j-1)*ldc) = C(i+(j-1)*ldc) + A(i+(p-1)*m_block)*B(j+(p-1)*n_block)
            end do
         end do
      end do
   end subroutine

   pure subroutine dgemm_pure_fortran(A, B, C, m, n, k)
      integer, intent(in)     :: m, n, k
      real(dp), intent(in)    :: A(m,k), B(k,n)
      real(dp), intent(inout) :: C(m,n)
      integer :: i, j, p, kk, ll, ib, jb, pb, jbound, n_full, m_full, m_block, n_block, k_block, r, bbase, abase, aptr, bptr
      real(dp), allocatable :: A_block(:), B_block(:)
      real(dp) :: cr(4,2,MR), arow
      integer :: NC

      NC = n

      allocate(A_block(MC*KC))
      allocate(B_block(KC*NC))

      do jb = 1, n, NC
         n_block = min(NC, n-jb+1)
         do pb = 1, k, KC
            k_block = min(KC, k-pb+1)

            call pack_B(k_block, n_block, B(pb, jb), k, B_block, NC)

            do ib = 1, m, MC
               m_block = min(MC, m-ib+1)

               call pack_A(m_block, k_block, A(ib, pb), m, A_block)

               m_full = m_block-mod(m_block, MR)
               n_full = n_block-mod(n_block, NR)
               do j = 1, n_full, NR
                  bbase = ((j-1)/NR)*NR*k_block
                  do i = 1, m_full, MR
                     abase = ((i-1)/MR)*MR*k_block

                     cr = 0.0_dp
                     do p = 0, k_block-1
                        aptr = abase+p*MR+1
                        bptr = bbase+p*NR+1
                        do r = 1, MR
                           arow = A_block(aptr)
                           cr(1,1,r) = cr(1,1,r) + arow*B_block(bptr  )
                           cr(2,1,r) = cr(2,1,r) + arow*B_block(bptr+1)
                           cr(3,1,r) = cr(3,1,r) + arow*B_block(bptr+2)
                           cr(4,1,r) = cr(4,1,r) + arow*B_block(bptr+3)

                           cr(1,2,r) = cr(1,2,r) + arow*B_block(bptr+4)
                           cr(2,2,r) = cr(2,2,r) + arow*B_block(bptr+5)
                           cr(3,2,r) = cr(3,2,r) + arow*B_block(bptr+6)
                           cr(4,2,r) = cr(4,2,r) + arow*B_block(bptr+7)
                           aptr = aptr + 1
                        end do
                     end do

                     do kk = 0, 3
                        C(ib+i+3, jb+j+kk-1) = C(ib+i+3, jb+j+kk-1) + cr(kk+1, 1, 5)
                        C(ib+i+4, jb+j+kk-1) = C(ib+i+4, jb+j+kk-1) + cr(kk+1, 1, 6)
                        C(ib+i+3, jb+j+kk+3) = C(ib+i+3, jb+j+kk+3) + cr(kk+1, 2, 5)
                        C(ib+i+4, jb+j+kk+3) = C(ib+i+4, jb+j+kk+3) + cr(kk+1, 2, 6)
                        do ll = 0, 3
                           C(ib+i+ll-1, jb+j+kk-1) = C(ib+i+ll-1, jb+j+kk-1) + cr(kk+1, 1, ll+1)
                           C(ib+i+ll-1, jb+j+kk+3) = C(ib+i+ll-1, jb+j+kk+3) + cr(kk+1, 2, ll+1)
                        end do
                     end do

                  end do
               end do

               ! do concurrent (j = n_full+1: n_block: NR) local(jbound, bbase)
               do j = n_full+1, n_block, NR
                  jbound = min(NR, n_block-j+1)
                  bbase = ((j-1)/NR)*NR*k_block+1
                  ! do concurrent (i = 1: m_full: MR)
                  do i = 1, m_full, MR
                     call kernel_scalar(k_block, MR, jbound, A_block(((i-1)/MR)*MR*k_block+1), B_block(bbase), C(ib+i-1:, jb+j-1), m)
                  end do
               end do

               ! do concurrent (j = 1: n_full: NR) local(bbase)
               do j = 1, n_full, NR
                  bbase = ((j-1)/NR)*NR*k_block+1
                  ! do concurrent (i = m_full+1: m_block: MR)
                  do i = m_full+1, m_block, MR
                     call kernel_scalar(k_block, min(MR, m_block-i+1), NR, A_block(((i-1)/MR)*MR*k_block+1), B_block(bbase), C(ib+i-1:, jb+j-1), m)
                  end do
               end do

            end do
         end do
      end do

   end subroutine
end module mod_dgemm


program compare_dgemm
   use mod_dgemm, only: dp, dgemm_pure_fortran, dgemm_openblas, dgemm_c_intrinsics
   implicit none
   integer, parameter :: nreps = 20
   integer, parameter :: sizes(*) = [1000, 1500, 2000]
   real(dp), allocatable :: A(:,:), B(:,:), C1(:,:), C2(:,:), C3(:,:)
   real(dp) :: elapsed, tmin_for, tmin_c, tmin_blas, error_for, error_c
   integer :: start, finish, rate, r, m, n, k, sz

   call system_clock(count_rate=rate)

   do sz = 1, size(sizes)
      m = sizes(sz)
      n = m
      k = m
      allocate(A(m,k), B(k,n), C1(m,n), C2(m,n), C3(m,n))
      call random_number(A)
      call random_number(B)

      ! Warm-up
      C1 = 0.0_dp
      call dgemm_pure_fortran(A,B,C1,m,n,k)
      C2 = 0.0_dp
      call dgemm_c_intrinsics(A,B,C2,m,n,k)
      C3 = 0.0_dp
      call dgemm_openblas(A,B,C3,m,n,k)

      ! --- Pure Fortran timing (min over nreps) ---
      tmin_for = huge(1.0_dp)
      do r = 1, nreps
         C1 = 0.0_dp
         call system_clock(start)
         call dgemm_pure_fortran(A,B,C1,m,n,k)
         call system_clock(finish)
         elapsed = real(finish-start,dp)/real(rate,dp)
         tmin_for = min(tmin_for,elapsed)
      end do

      ! --- Intrinsics timing (min over nreps) ---
      tmin_c = huge(1.0_dp)
      do r = 1, nreps
         C2 = 0.0_dp
         call system_clock(start)
         call dgemm_c_intrinsics(A,B,C2,m,n,k)
         call system_clock(finish)
         elapsed = real(finish-start,dp)/real(rate,dp)
         tmin_c = min(tmin_c,elapsed)
      end do

      ! --- BLAS timing (min over nreps) ---
      tmin_blas = huge(1.0_dp)
      do r = 1, nreps
         C3 = 0.0_dp
         call system_clock(start)
         call dgemm_openblas(A,B,C3,m,n,k)
         call system_clock(finish)
         elapsed = real(finish-start,dp)/real(rate,dp)
         tmin_blas = min(tmin_blas,elapsed)
      end do

      error_for = maxval(abs(C1-C3))
      error_c   = maxval(abs(C2-C3))

      print '(a)', repeat('=',50)
      print '(a,i8)'   , 'Matrix size:            ', m
      print '(a,f12.6)', 'Fortran(s):             ', tmin_for
      print '(a,f12.6)', 'C-Intrinsics(s):        ', tmin_c
      print '(a,f12.6)', 'OpenBLAS(s):            ', tmin_blas
      print '(a,f12.6)', 'Speedup (OpenBLAS):     ', tmin_for/tmin_blas
      print '(a,f12.6)', 'Speedup (C-Intrinsics): ', tmin_for/tmin_c
      print '(a,f12.6)', 'Error (Fortran):        ', error_for
      print '(a,f12.6)', 'Error (C):              ', error_c

      deallocate(A,B,C1,C2,C3)
   end do
end program compare_dgemm

Interestingly, on my computer with an older version of gfortran, your Fortran code runs ten times slower than the C version. However, with the newer gfortran version, it produces the same results as yours.

Your main modification is that you have manually unrolled the inner loops, right?

I was polishing the code yesterday, moving it into a module and converting dgemm_pure_fortran to a pure subroutine, …

You’re right: the most significant change is unrolling the inner loop and eliminating the temporary arrays b0 and b1. (I haven’t read the full thread, so I hope this isn’t a duplicate.)

Original code:

! zero accumulators
cr = 0.0d0
! ---- main accumulation over K ----
do p = 1, k_block
   ! load B row p (row-major): first half j=1..4, second half j=5..8
   b0(1:4) = B_block(panelj*NR*k_block + (p-1)*NR + 1:panelj*NR*k_block + (p-1)*NR + 4)
   b1(1:4) = B_block(panelj*NR*k_block + (p-1)*NR + 5:panelj*NR*k_block + (p-1)*NR + 8)

   ! for each row of A, broadcast and FMA into both halves
   do r = 1, MR
      arow = A_block(paneli*MR*k_block+ r + (p-1)*MR)   ! A(r,p) with lda=MR
      ! accumulate into cols 0..3
      cr(1:4,1,r) = cr(1:4,1,r) + arow*b0(1:4)
      ! accumulate into cols 4..7
      cr(1:4,2,r) = cr(1:4,2,r) + arow*b1(1:4)
   end do
end do

Modified code:

cr = 0.0_dp
do p = 0, k_block-1
   aptr = abase+p*MR+1
   bptr = bbase+p*NR+1
   do r = 1, MR
      arow = A_block(aptr)
      cr(1,1,r) = cr(1,1,r) + arow*B_block(bptr  )
      cr(2,1,r) = cr(2,1,r) + arow*B_block(bptr+1)
      cr(3,1,r) = cr(3,1,r) + arow*B_block(bptr+2)
      cr(4,1,r) = cr(4,1,r) + arow*B_block(bptr+3)

      cr(1,2,r) = cr(1,2,r) + arow*B_block(bptr+4)
      cr(2,2,r) = cr(2,2,r) + arow*B_block(bptr+5)
      cr(3,2,r) = cr(3,2,r) + arow*B_block(bptr+6)
      cr(4,2,r) = cr(4,2,r) + arow*B_block(bptr+7)
      aptr = aptr + 1
   end do
end do

[Edited]: This also gives the same results:

cr = 0.0_dp
do p = 0, k_block-1
   aptr = abase+p*MR+1
   bptr = bbase+p*NR+1
   do r = 1, MR
      arow = A_block(aptr)
      cr(1:4,1,r) = cr(1:4,1,r) + arow*B_block(bptr+0:3  )
      cr(1:4,2,r) = cr(1:4,2,r) + arow*B_block(bptr+4:7  )
      aptr = aptr + 1
   end do
end do

OK… So it’s not the manual unrolling that does the job (which would have been surprising), but avoiding the b temporary arrays. Difficult to explain…

I don’t think it will change much, but wouldn’t the following small change help, if ever so slightly, by avoiding the sequential dependency of incrementing aptr? There also seems to be a typo: shouldn’t B_block(bptr+0:3) be B_block(bptr+0:bptr+3)?

cr = 0.0_dp
do p = 0, k_block-1
   aptr = abase+p*MR
   bptr = bbase+p*NR+1
   do r = 1, MR
      arow = A_block(aptr+r)
      cr(1:4,1,r) = cr(1:4,1,r) + arow*B_block(bptr:bptr+3  )
      cr(1:4,2,r) = cr(1:4,2,r) + arow*B_block(bptr+4:bptr+7  )
   end do
end do

The performance depends heavily on the compiler version, making it difficult to know exactly what to write.

Using OpenMP SIMD

! accumulate into cols 0..3
!$OMP SIMD SIMDLEN(4)
do q=1,4
 cr(q,1,r)=cr(q,1,r)+arow*b0(q)
end do

! accumulate into cols 4..7
!$OMP SIMD SIMDLEN(4)
do q=1,4
 cr(q,2,r)=cr(q,2,r)+arow*b1(q)
end do
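For context, here is a sketch of how those annotated loops would sit inside the accumulation over p, keeping the b0/b1 temporaries of the original code (q is assumed to be a local integer declared with the other loop counters; the directives only take effect when compiling with -fopenmp or -fopenmp-simd):

cr = 0.0_dp
do p = 1, k_block
   ! load B row p of the packed panel: first half cols 1..4, second half cols 5..8
   b0(1:4) = B_block(panelj*NR*k_block + (p-1)*NR + 1 : panelj*NR*k_block + (p-1)*NR + 4)
   b1(1:4) = B_block(panelj*NR*k_block + (p-1)*NR + 5 : panelj*NR*k_block + (p-1)*NR + 8)
   do r = 1, MR
      arow = A_block(paneli*MR*k_block + r + (p-1)*MR)   ! A(r,p) with lda=MR
      ! accumulate into cols 0..3
      !$OMP SIMD SIMDLEN(4)
      do q = 1, 4
         cr(q,1,r) = cr(q,1,r) + arow*b0(q)
      end do
      ! accumulate into cols 4..7
      !$OMP SIMD SIMDLEN(4)
      do q = 1, 4
         cr(q,2,r) = cr(q,2,r) + arow*b1(q)
      end do
   end do
end do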

GFortran 11/14/15 give us what we want:

============================
 Results for GCC/GFortran 11
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.048000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.159000
C-Intrinsics(s):            0.150000

============================
 Results for GCC/GFortran 12
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.099000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.328000
C-Intrinsics(s):            0.149000

============================
 Results for GCC/GFortran 13
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.103000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.340000
C-Intrinsics(s):            0.149000

============================
 Results for GCC/GFortran 14
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.051000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.165000
C-Intrinsics(s):            0.148000

============================
 Results for GCC/GFortran 15
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.050000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.164000
C-Intrinsics(s):            0.150000

Using what was suggested by @Ali, we get:

============================
 Results for GCC/GFortran 11
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.820000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 2.766000
C-Intrinsics(s):            0.149000

============================
 Results for GCC/GFortran 12
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.819000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 2.764000
C-Intrinsics(s):            0.150000

============================
 Results for GCC/GFortran 13
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.097000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.324000
C-Intrinsics(s):            0.149000

============================
 Results for GCC/GFortran 14
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.098000
C-Intrinsics(s):            0.043000
==================================================
Matrix size:                1500
Fortran(s):                 0.325000
C-Intrinsics(s):            0.149000

============================
 Results for GCC/GFortran 15
============================
==================================================
Matrix size:                1000
Fortran(s):                 0.051000
C-Intrinsics(s):            0.044000
==================================================
Matrix size:                1500
Fortran(s):                 0.165000
C-Intrinsics(s):            0.150000

yes! :wink:

I tried that before; I don’t know exactly why, but it makes it much slower:

==================================================
Matrix size:                1000
Fortran(s):                 0.459000
C-Intrinsics(s):            0.069000
OpenBLAS(s):                0.058000
Speedup (OpenBLAS):         7.913793
Speedup (C-Intrinsics):     6.652174
Error (Fortran):            0.000000
Error (C):                  0.000000

I got the latest improvement with loop tiling in kernel_scalar:

   pure subroutine kernel_scalar(k, m_block, n_block, A, B, C, ldc)
      integer,  intent(in)    :: k, m_block, n_block, ldc
      real(dp), intent(in)    :: A(*)
      real(dp), intent(in)    :: B(*)
      real(dp), intent(inout) :: C(*)
      integer                 :: i, j, p, ii, jj
      integer, parameter :: block_size = 8
      do jj = 1, n_block, block_size
         do ii = 1, m_block, block_size
            do p = 1, k
               do j = jj, min(jj+block_size-1, n_block)
                  do i = ii, min(ii+block_size-1, m_block)
                     C(i+(j-1)*ldc) = C(i+(j-1)*ldc) + A(i+(p-1)*m_block)*B(j+(p-1)*n_block)
                  end do
               end do
            end do
         end do
      end do
   end subroutine

This yields:

==================================================
Matrix size:                1000
Fortran(s):                 0.068000
C-Intrinsics(s):            0.068000
OpenBLAS(s):                0.058000
Speedup (OpenBLAS):         1.172414
Speedup (C-Intrinsics):     1.000000
Error (Fortran):            0.000000
Error (C):                  0.000000
==================================================
Matrix size:                1500
Fortran(s):                 0.229000
C-Intrinsics(s):            0.229000
OpenBLAS(s):                0.193000
Speedup (OpenBLAS):         1.186528
Speedup (C-Intrinsics):     1.000000
Error (Fortran):            0.000000
Error (C):                  0.000000
==================================================
Matrix size:                2000
Fortran(s):                 0.553000
C-Intrinsics(s):            0.546000
OpenBLAS(s):                0.458000
Speedup (OpenBLAS):         1.207424
Speedup (C-Intrinsics):     1.012821
Error (Fortran):            0.000000
Error (C):                  0.000000

Would including daxpy in this latest code provide any benefit? I expect that daxpy would use AVX instructions, although I am not sure about using it from a pure procedure.

   pure subroutine kernel_scalar (k, m_block, n_block, A, B, C, ldc)
      integer, parameter :: dp = kind (1.d00)
      integer,  intent(in)    :: k, m_block, n_block, ldc
      real(dp), intent(in)    :: A(*)
      real(dp), intent(in)    :: B(*)
      real(dp), intent(inout) :: C(*)
      integer                 :: i, j, p, ii, jj, mj,ni,ip,ia,ic
      integer, parameter :: block_size = 8

      do jj = 1, n_block, block_size
         mj = min (jj+block_size-1, n_block)
         do ii = 1, m_block, block_size
            ni = min (ii+block_size-1, m_block) - ii+1
            do p = 1, k
               ip = (p-1)*n_block
               ia = ii+ip
               ic = ii-idc
               do j = jj, mj
                   call daxpy ( ni, B(j+ip), A(ia), C(j*ldc+ic) )  ! ( n, a, x, y )
!                  do i = ii, min(ii+block_size-1, m_block)
!                     C(i+(j-1)*ldc) = C(i+(j-1)*ldc) + A(i+(p-1)*m_block)*B(j+(p-1)*n_block)
!                  end do
               end do
            end do
         end do
      end do

   contains

      pure subroutine daxpy ( n, a, X, Y )
   !
   !   Performs the vector operation  [Y] = [Y] + a * [X]
   !
         integer,  intent(in)    :: n
         real(dp), intent(in)    :: A, X(n)
         real(dp), intent(inout) :: Y(n)
   
          Y = Y + a * X
   
      end subroutine daxpy

   end subroutine kernel_scalar

Most modern compilers would automatically provide mj, ni, ip, ia and ic.

Shouldn’t this be ia = ii+(p-1)*m_block?

I also think there might be a small typo here: idc should be ldc?

Yes to both and thanks.
Using daxpy is a good option, as I find it does use AVX, although I have only used it with contiguous arrays.


The BLAS daxpy() is written to take noncontiguous array arguments, but in most implementations there is probably an internal branch to a special section of code for contiguous arrays. However, to achieve optimal efficiency, that routine would probably need to be inlined by the compiler in order to eliminate the subroutine-call overhead and the branching overhead. Then the question would be: what is the difference between that inlined subroutine and compiling the original innermost do loop?
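For illustration, a minimal sketch of what a reference-style daxpy with increment arguments and an internal fast path for contiguous (unit-stride) arrays could look like; this is not the actual OpenBLAS kernel, just the structure being discussed (the name daxpy_ref is made up):

   pure subroutine daxpy_ref(n, da, dx, incx, dy, incy)
      ! dy = dy + da*dx, with arbitrary strides incx/incy
      integer, parameter :: dp = kind(1.0d0)
      integer,  intent(in)    :: n, incx, incy
      real(dp), intent(in)    :: da, dx(*)
      real(dp), intent(inout) :: dy(*)
      integer :: i, ix, iy
      if (n <= 0 .or. da == 0.0_dp) return
      if (incx == 1 .and. incy == 1) then
         ! contiguous fast path: a single vectorizable loop
         dy(1:n) = dy(1:n) + da*dx(1:n)
      else
         ! general strided path (negative increments start from the far end)
         ix = 1; iy = 1
         if (incx < 0) ix = (-n+1)*incx + 1
         if (incy < 0) iy = (-n+1)*incy + 1
         do i = 1, n
            dy(iy) = dy(iy) + da*dx(ix)
            ix = ix + incx
            iy = iy + incy
         end do
      end if
   end subroutine daxpy_ref

When such a routine is inlined with the strides known to be 1, the branch disappears and what remains is essentially the original innermost do loop.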


@RonShepard
My daxpy example contained the computation in a single line to achieve optimal efficiency.
The wrapper was used to clearly define the array attributes.
My use of this routine does appear to be effective in my OpenMP codes, although less effective with many threads and large vectors.

In a separate test program, I tested kernel_scalar1 and kernel_scalar2 (suggested by @JohnCampbell) with different block_size values using gfortran 15.2.0. Below are the results from my system.

============================================
block_size = 8
elapsed time (kernel_scalar1):    0.098000 s
elapsed time (kernel_scalar2):    0.092000 s
Speedup (k1/k2):                  1.065217
Max |C1 - C2|:                    0.000000
============================================
block_size = 32
elapsed time (kernel_scalar1):    0.052000 s
elapsed time (kernel_scalar2):    0.049000 s
Speedup (k1/k2):                  1.061224
Max |C1 - C2|:                    0.000000
============================================
block_size = 64
elapsed time (kernel_scalar1):    0.048000 s
elapsed time (kernel_scalar2):    0.046000 s
Speedup (k1/k2):                  1.043478
Max |C1 - C2|:                    0.000000
============================================
block_size = 128
elapsed time (kernel_scalar1):    0.055000 s
elapsed time (kernel_scalar2):    0.051000 s
Speedup (k1/k2):                  1.078431
Max |C1 - C2|:                    0.000000
============================================
block_size = 256
elapsed time (kernel_scalar1):    0.051000 s
elapsed time (kernel_scalar2):    0.048000 s
Speedup (k1/k2):                  1.062500
Max |C1 - C2|:                    0.000000
============================================
block_size = 512
elapsed time (kernel_scalar1):    0.048000 s
elapsed time (kernel_scalar2):    0.046000 s
Speedup (k1/k2):                  1.043478
Max |C1 - C2|:                    0.000000

The test code:
module mod_test

   implicit none
   integer, parameter :: dp = kind(1.0d0)

   private
   public :: dp, kernel_scalar1, kernel_scalar2

contains

   pure subroutine kernel_scalar1(k, m_block, n_block, A, B, C, ldc, block_size)
      integer,  intent(in)    :: k, m_block, n_block, ldc, block_size
      real(dp), intent(in)    :: A(*)
      real(dp), intent(in)    :: B(*)
      real(dp), intent(inout) :: C(*)
      integer                 :: i, j, p, ii, jj

      !   if (.not.is_contiguous(A)) error stop 'A is not contiguous'
      !   if (.not.is_contiguous(B)) error stop 'B is not contiguous'
      !   if (.not.is_contiguous(C)) error stop 'C is not contiguous'

      do jj = 1, n_block, block_size
         do ii = 1, m_block, block_size
            do p = 1, k
               do j = jj, min(jj+block_size-1, n_block)
                  do i = ii, min(ii+block_size-1, m_block)
                     C(i+(j-1)*ldc) = C(i+(j-1)*ldc) + A(i+(p-1)*m_block)*B(j+(p-1)*n_block)
                  end do
               end do
            end do
         end do
      end do
   end subroutine

   pure subroutine kernel_scalar2(k, m_block, n_block, A, B, C, ldc, block_size)
      integer,  intent(in)    :: k, m_block, n_block, ldc, block_size
      real(dp), intent(in)    :: A(*)
      real(dp), intent(in)    :: B(*)
      real(dp), intent(inout) :: C(*)
      integer                 :: i, j, p, ii, jj, mj,ni,ip,ia,ic

      !   if (.not.is_contiguous(A)) error stop 'A is not contiguous'
      !   if (.not.is_contiguous(B)) error stop 'B is not contiguous'
      !   if (.not.is_contiguous(C)) error stop 'C is not contiguous'

      do jj = 1, n_block, block_size
         mj = min (jj+block_size-1, n_block)
         do ii = 1, m_block, block_size
            ni = min (ii+block_size-1, m_block) - ii+1
            do p = 1, k
               ip = (p-1)*n_block
               ia = ii+(p-1)*m_block
               ic = ii-ldc
               do j = jj, mj
                  call daxpy ( ni, B(j+ip), A(ia), C(j*ldc+ic) )  ! ( n, a, x, y )
!                  do i = ii, min(ii+block_size-1, m_block)
!                     C(i+(j-1)*ldc) = C(i+(j-1)*ldc) + A(i+(p-1)*m_block)*B(j+(p-1)*n_block)
!                  end do
               end do
            end do
         end do
      end do

   contains

      pure subroutine daxpy ( n, a, X, Y )
         !
         !   Performs the vector operation  [Y] = [Y] + a * [X]
         !
         integer,  intent(in)    :: n
         real(dp), intent(in)    :: A, X(n)
         real(dp), intent(inout) :: Y(n)
         Y = Y + a * X
      end subroutine daxpy
   end subroutine
end module


program test

   use mod_test, only : dp, kernel_scalar1, kernel_scalar2

   implicit none
   integer, parameter :: m = 512, n = 512, k = 512
   integer, parameter :: block_sizes(*) = [8,32,64,128,256,512]
   integer, parameter :: nreps = 20

   real(dp) :: A(m*k), B(n*k), C1(m*n), C2(m*n)
   real(dp) :: elapsed1, elapsed2
   integer :: rate, start, finish, r, bs, block_size

   do bs = 1, size(block_sizes)

      block_size = block_sizes(bs)

      call random_number(A)
      call random_number(B)

      call system_clock(count_rate=rate)

      elapsed1 = huge(1.0_dp)
      do r = 1, nreps
         C1 = 0.0_dp
         call system_clock(start)
         call kernel_scalar1(k, m, n, A, B, C1, m, block_size)
         call system_clock(finish)
         elapsed1 = min(elapsed1,real(finish-start,dp)/real(rate,dp))
      end do

      elapsed2 = huge(1.0_dp)
      do r = 1, nreps
         C2 = 0.0_dp
         call system_clock(start)
         call kernel_scalar2(k, m, n, A, B, C2, m, block_size)
         call system_clock(finish)
         elapsed2 = min(elapsed2,real(finish-start,dp)/real(rate,dp))
      end do

      print '(a)', repeat('=',44)
      print '(a,g0)', 'block_size = ', block_size
      print '(a,f12.6," s")', "elapsed time (kernel_scalar1):", elapsed1
      print '(a,f12.6," s")', "elapsed time (kernel_scalar2):", elapsed2
      print '(a,f12.6)',      "Speedup (k1/k2):              ", elapsed1/elapsed2
      print '(a,f12.6)',      "Max |C1 - C2|:                ", maxval(abs(C1 - C2))
   end do
end program test

@Ali
Thanks for this test example. It is interesting to compare kernel_scalar1 and kernel_scalar2 at various compiler optimisation levels.
I think they show that AVX is used in daxpy even at lower optimisation levels.
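One way to check is to ask the compiler for its vectorization report, e.g. with GCC/GFortran something like

gfortran -O2 -march=native -fopt-info-vec-optimized -c mod_test.f90

(file name assumed), which lists the loops that were actually vectorized at that optimisation level.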