Hi everyone,
I have been experimenting with implementing a DGEMM kernel and benchmarking it against OpenBLAS.
Using C with AVX intrinsics, I was able to reproduce OpenBLAS-level performance. The implementation consists of Fortran code handling the outer loops and submatrix packing, while the compute-intensive microkernel is implemented in C with intrinsics. For the tested matrix sizes, the performance is very close to that of OpenBLAS.
However, when I try to implement the same kernel purely in Fortran, the runtime is about 2Ă— slower than OpenBLAS.
My question: Is it actually possible to achieve the same DGEMM performance using only pure Fortran (without falling back to C), or are there inherent limitations in current Fortran regarding SIMD intrinsics and low-level register control?
Below are:
1 - Performance results comparing my implementation with OpenBLAS.
2 - The C+intrinsics kernel I wrote (called from Fortran).
3 - The Fortran driver that contains the outer loops, packs panels, and benchmarks against OpenBLAS.
- Performance Results
==================================================
Matrix size: 1000
Fortran(s): 0.098000
C-Intrinsics(s): 0.044000
OpenBLAS(s): 0.044000
Speedup (OpenBLAS): 2.227273
Speedup (C-Intrinsics): 2.227273
Error (Fortran): 0.000000
Error (C): 0.000000
==================================================
Matrix size: 1500
Fortran(s): 0.327000
C-Intrinsics(s): 0.149000
OpenBLAS(s): 0.149000
Speedup (OpenBLAS): 2.194631
Speedup (C-Intrinsics): 2.194631
Error (Fortran): 0.000000
Error (C): 0.000000
==================================================
Matrix size: 2000
Fortran(s): 0.764000
C-Intrinsics(s): 0.352000
OpenBLAS(s): 0.351000
Speedup (OpenBLAS): 2.176638
Speedup (C-Intrinsics): 2.170455
Error (Fortran): 0.000000
Error (C): 0.000000
- C File
/*
* dgemm_kernel.c
* compile:
* gcc -O3 -march=native -mavx2 -mfma -c dgemm_kernel_cleaned.c -o dgemm_kernel.o
*/
#include <immintrin.h>
// Blocking sizes (micro-kernel dimensions)
#define MR 6
#define NR 8
void* aligned_alloc_wrapper(long alignment, long size) {
return aligned_alloc(alignment, (size_t)size);
}
// ------------------------------
// AVX2 micro-kernel: computes C_sub += A_sub * B_sub for a 6x8 tile
// A_sub: MR x K
// B_sub: K x NR
// C_sub: MR x NR (column-major with leading dimension ldc)
// ------------------------------
static inline void micro_kernel(
const double* __restrict__ A_sub, // size: MR x K
const double* __restrict__ B_sub, const int* k_block, // size: K x NR
double* __restrict__ C_sub, const int* ldc)
{
const int LDC = *ldc; // stride between rows in C (elements)
const int K = *k_block;
// Each row has two 4-wide vectors: cols 0..3 and 4..7
__m256d c_vec_rows[MR][2];
// Temporary column vectors for rows 0..3 after transpose
__m256d c_vec_cols[NR];
// Zero-initialize accumulators
for (int row = 0; row < MR; ++row) {
c_vec_rows[row][0] = _mm256_setzero_pd(); // cols 0..3
c_vec_rows[row][1] = _mm256_setzero_pd(); // cols 4..7
}
// Main accumulation loop over K
for (int p = 0; p < K; ++p) {
// Load B row p for columns j..j+7
__m256d b_vec0 = _mm256_load_pd(&B_sub[p * NR + 0]); // cols 0..3
__m256d b_vec1 = _mm256_load_pd(&B_sub[p * NR + 4]); // cols 4..7
for (int row = 0; row < MR; ++row) {
// Broadcast A_sub[row + p * MR]
__m256d a = _mm256_broadcast_sd(&A_sub[row + p * MR]);
c_vec_rows[row][0] = _mm256_fmadd_pd(b_vec0, a, c_vec_rows[row][0]);
c_vec_rows[row][1] = _mm256_fmadd_pd(b_vec1, a, c_vec_rows[row][1]);
}
}
// --------------------------
// Transpose rows 0..3 (fast vector path)
// --------------------------
// First half: columns 0..3 (from c_vec_rows[*][0])
__m256d t0 = _mm256_unpacklo_pd(c_vec_rows[0][0], c_vec_rows[1][0]); // [ c00 c10 c01 c11 ]
__m256d t1 = _mm256_unpackhi_pd(c_vec_rows[0][0], c_vec_rows[1][0]); // [ c02 c12 c03 c13 ]
__m256d t2 = _mm256_unpacklo_pd(c_vec_rows[2][0], c_vec_rows[3][0]); // [ c20 c30 c21 c31 ]
__m256d t3 = _mm256_unpackhi_pd(c_vec_rows[2][0], c_vec_rows[3][0]); // [ c22 c32 c23 c33 ]
// Combine lows -> cols 0 and 2; highs -> cols 1 and 3 (permute control 0x20/0x31)
c_vec_cols[0] = _mm256_permute2f128_pd(t0, t2, 0x20); // [ c00 c10 c20 c30 ] col 0
c_vec_cols[1] = _mm256_permute2f128_pd(t1, t3, 0x20); // [ c02 c12 c22 c32 ] col 2
c_vec_cols[2] = _mm256_permute2f128_pd(t0, t2, 0x31); // [ c01 c11 c21 c31 ] col 1
c_vec_cols[3] = _mm256_permute2f128_pd(t1, t3, 0x31); // [ c03 c13 c23 c33 ] col 3
// Second half: columns 4..7 (from c_vec_rows[*][1])
t0 = _mm256_unpacklo_pd(c_vec_rows[0][1], c_vec_rows[1][1]);
t1 = _mm256_unpackhi_pd(c_vec_rows[0][1], c_vec_rows[1][1]);
t2 = _mm256_unpacklo_pd(c_vec_rows[2][1], c_vec_rows[3][1]);
t3 = _mm256_unpackhi_pd(c_vec_rows[2][1], c_vec_rows[3][1]);
c_vec_cols[4] = _mm256_permute2f128_pd(t0, t2, 0x20); // [ c04 c14 c24 c34 ] col 4
c_vec_cols[5] = _mm256_permute2f128_pd(t1, t3, 0x20); // [ c06 c16 c26 c36 ] col 6
c_vec_cols[6] = _mm256_permute2f128_pd(t0, t2, 0x31); // [ c05 c15 c25 c35 ] col 5
c_vec_cols[7] = _mm256_permute2f128_pd(t1, t3, 0x31); // [ c07 c17 c27 c37 ] col 7
// --------------------------
// Accumulate the first 4 rows into C (vectorized)
// --------------------------
for (int col = 0; col < NR; ++col) {
_mm_prefetch((const char*)&C_sub[col * LDC], _MM_HINT_T0);
// Load existing C column rows 0..3
__m256d c_orig_low = _mm256_load_pd(&C_sub[col * LDC]);
// Add accumulator for rows 0..3
__m256d c_new_low = _mm256_add_pd(c_orig_low, c_vec_cols[col]);
// Store back rows 0..3
_mm256_storeu_pd(&C_sub[col * LDC], c_new_low);
}
// --------------------------
// Handle tail rows 4..5 (MR == 6)
// --------------------------
double tmp4[4] __attribute__((aligned(32)));
double tmp5[4] __attribute__((aligned(32)));
double tail_pair[2] __attribute__((aligned(32)));
// First half (cols 0..3)
_mm256_store_pd(tmp4, c_vec_rows[4][0]); // row 4, cols 0..3
_mm256_store_pd(tmp5, c_vec_rows[5][0]); // row 5, cols 0..3
for (int k = 0; k < 4; ++k) {
int col = k; // 0..3
tail_pair[0] = tmp4[k];
tail_pair[1] = tmp5[k];
// Accumulate into C rows 4..5
double* c_tail_ptr = &C_sub[col * LDC + 4];
__m128d c_orig_tail = _mm_load_pd(c_tail_ptr);
__m128d tail_vec = _mm_load_pd(tail_pair);
__m128d c_new_tail = _mm_add_pd(c_orig_tail, tail_vec);
_mm_store_pd(c_tail_ptr, c_new_tail);
}
// Second half (cols 4..7)
_mm256_store_pd(tmp4, c_vec_rows[4][1]); // row 4, cols 4..7
_mm256_store_pd(tmp5, c_vec_rows[5][1]); // row 5, cols 4..7
for (int k = 0; k < 4; ++k) {
int col = 4 + k; // 4..7
tail_pair[0] = tmp4[k];
tail_pair[1] = tmp5[k];
double* c_tail_ptr = &C_sub[col * LDC + 4];
__m128d c_orig_tail = _mm_load_pd(c_tail_ptr);
__m128d tail_vec = _mm_load_pd(tail_pair);
__m128d c_new_tail = _mm_add_pd(c_orig_tail, tail_vec);
_mm_store_pd(c_tail_ptr, c_new_tail);
}
}
// ------------------------------
// Cleanup DGEMM kernel for tails, AVX2-accelerated where possible
// ------------------------------
void scalar_kernel(
const int* k, const int* m_block, const int* n_block,
const double* __restrict__ A, const int* lda,
const double* __restrict__ B, const int* ldb,
double* __restrict__ C, const int* ldc)
{
const int K = *k;
const int M = *m_block; // <= MR
const int N = *n_block; // <= NR
const int LDA = *lda;
const int LDB = *ldb;
const int LDC = *ldc;
int j = 0;
for (; j + 1 < N; j += 2) {
double* Cj0 = C + (size_t)j * LDC;
double* Cj1 = C + (size_t)(j + 1) * LDC;
int i = 0;
// Process 4 rows at a time
for (; i + 3 < M; i += 4) {
__m256d c0 = _mm256_setzero_pd();
__m256d c1 = _mm256_setzero_pd();
for (int p = 0; p < K; ++p) {
__m256d a = _mm256_loadu_pd(&A[(size_t)i + (size_t)p * LDA]);
__m256d b0 = _mm256_broadcast_sd(&B[(size_t)p * LDB + j]);
__m256d b1 = _mm256_broadcast_sd(&B[(size_t)p * LDB + (j + 1)]);
c0 = _mm256_fmadd_pd(a, b0, c0);
c1 = _mm256_fmadd_pd(a, b1, c1);
}
// Accumulate into C
__m256d c0_orig = _mm256_loadu_pd(&Cj0[i]);
__m256d c1_orig = _mm256_loadu_pd(&Cj1[i]);
__m256d c0_new = _mm256_add_pd(c0_orig, c0);
__m256d c1_new = _mm256_add_pd(c1_orig, c1);
_mm256_storeu_pd(&Cj0[i], c0_new);
_mm256_storeu_pd(&Cj1[i], c1_new);
}
// Remaining rows (scalar)
for (; i < M; ++i) {
double cij0 = Cj0[i];
double cij1 = Cj1[i];
for (int p = 0; p < K; ++p) {
double a = A[(size_t)i + (size_t)p * LDA];
cij0 += a * B[(size_t)p * LDB + j];
cij1 += a * B[(size_t)p * LDB + (j + 1)];
}
Cj0[i] = cij0;
Cj1[i] = cij1;
}
}
// Handle leftover last column if N is odd
if (j < N) {
double* Cj = C + (size_t)j * LDC;
int i = 0;
for (; i + 3 < M; i += 4) {
__m256d c0 = _mm256_setzero_pd();
for (int p = 0; p < K; ++p) {
__m256d a = _mm256_load_pd(&A[(size_t)i + (size_t)p * LDA]);
__m256d b = _mm256_broadcast_sd(&B[(size_t)p * LDB + j]);
c0 = _mm256_fmadd_pd(a, b, c0);
}
__m256d c0_orig = _mm256_loadu_pd(&Cj[i]);
__m256d c0_new = _mm256_add_pd(c0_orig, c0);
_mm256_storeu_pd(&Cj[i], c0_new);
}
for (; i < M; ++i) {
double cij = Cj[i];
for (int p = 0; p < K; ++p) {
cij += A[(size_t)i + (size_t)p * LDA] * B[(size_t)p * LDB + j];
}
Cj[i] = cij;
}
}
}
// ------------------------------
// Top-level kernel driving 6x8 blocks and tails
// ------------------------------
void dgemm_kernel_6x8(
const int* n_block,
const double* __restrict__ A_block, const int* m_block,
double* __restrict__ B_block, const int* k_block,
double* __restrict__ C, const int* ldc)
{
const int LDC = *ldc; // column-major stride for C
const int M_block = *m_block;
const int N_block = *n_block;
const int K = *k_block;
const int m_full = M_block - (M_block % MR);
const int n_full = N_block - (N_block % NR);
(void)K; // used in address arithmetic below
int MRc = MR;
int NRc = NR;
// === Case 1: Full MRxNR tiles ===
for (int j = 0; j < n_full; j += NR) {
int panelj = j / NR;
for (int i = 0; i < m_full; i += MR) {
int paneli = i / MR;
const double* __restrict__ A_sub = &A_block[paneli * MR * K]; // packed A
const double* __restrict__ B_sub = &B_block[panelj * NR * K]; // packed B
double* __restrict__ C_sub = &C[i + (size_t)j * LDC];
micro_kernel(A_sub, B_sub, k_block, C_sub, ldc);
}
}
// === Case 2: Partial NR columns ===
for (int j = n_full; j < N_block; j += NR) {
int jbound = (N_block - j > NR) ? NR : (N_block - j);
int panelj = j / NR;
for (int i = 0; i < m_full; i += MR) {
int paneli = i / MR;
scalar_kernel(
k_block, &MRc, &jbound,
&A_block[paneli * MR * K], &MRc,
&B_block[panelj * NR * K], &jbound,
&C[i + (size_t)j * LDC], ldc);
}
}
// === Case 3: Partial MR rows ===
for (int j = 0; j < n_full; j += NR) {
int panelj = j / NR;
for (int i = m_full; i < M_block; i += MR) {
int ibound = (M_block - i > MR) ? MR : (M_block - i);
int paneli = i / MR;
scalar_kernel(
k_block, &ibound, &NRc,
&A_block[paneli * MR * K], &ibound,
&B_block[panelj * NR * K], &NRc,
&C[i + (size_t)j * LDC], ldc);
}
}
// === Case 4: Bottom-right corner (partial rows and columns) ===
for (int j = n_full; j < N_block; j += NR) {
int jbound = (N_block - j > NR) ? NR : (N_block - j);
int panelj = j / NR;
for (int i = m_full; i < M_block; i += MR) {
int ibound = (M_block - i > MR) ? MR : (M_block - i);
int paneli = i / MR;
scalar_kernel(
k_block, &ibound, &jbound,
&A_block[paneli * MR * K], &ibound,
&B_block[panelj * NR * K], &jbound,
&C[i + (size_t)j * LDC], ldc);
}
}
}
- Fortran File
! Example: compare custom DGEMM (with C-intrinsics) vs. OpenBlas
! Compile:
! gfortran -O3 -march=native -funroll-loops -ffast-math -o compare_dgemm \
! compare_dgemm.f90 dgemm_kernel.o -lopenblas
! OMP_NUM_THREADS=1 ./compare_dgemm
!
program compare_dgemm
use iso_fortran_env, only: dp => real64
implicit none
integer, parameter :: nreps = 20
integer, parameter :: sizes(*) = [1000, 1500, 2000]
real(dp), allocatable :: A(:,:), B(:,:), C1(:,:), C2(:,:), C3(:,:)
integer :: start, finish, rate, r
integer :: m, n, k, sz
real(dp) :: elapsed, tmin_for, tmin_c, tmin_blas, error_for, error_c
call system_clock(count_rate=rate)
do sz = 1, size(sizes)
m = sizes(sz); n = m; k = m
allocate(A(m,k), B(k,n), C1(m,n), C2(m,n), C3(m,n))
call random_number(A); call random_number(B)
! Warm-up
C1 = 0; call dgemm_pure_fortran(A,B,C1,m,n,k)
C2 = 0; call dgemm_c_intrinsics(A,B,C2,m,n,k)
C3 = 0; call dgemm_openblas(A,B,C3,m,n,k)
! --- Intrinsics timing (min over nreps) ---
tmin_for = huge(1.0_dp)
do r = 1, nreps
C1 = 0
call system_clock(start)
call dgemm_pure_fortran(A,B,C1,m,n,k)
call system_clock(finish)
elapsed = real(finish-start,dp)/real(rate,dp)
tmin_for = min(tmin_for,elapsed)
end do
! --- Intrinsics timing (min over nreps) ---
tmin_c = huge(1.0_dp)
do r = 1, nreps
C2 = 0
call system_clock(start)
call dgemm_c_intrinsics(A,B,C2,m,n,k)
call system_clock(finish)
elapsed = real(finish-start,dp)/real(rate,dp)
tmin_c = min(tmin_c,elapsed)
end do
! --- BLAS timing (min over nreps) ---
tmin_blas = huge(1.0_dp)
do r = 1, nreps
C3 = 0
call system_clock(start)
call dgemm_openblas(A,B,C3,m,n,k)
call system_clock(finish)
elapsed = real(finish-start,dp)/real(rate,dp)
tmin_blas = min(tmin_blas,elapsed)
end do
error_for = maxval(abs(C1-C3))
error_c = maxval(abs(C2-C3))
print '(a)', repeat('=',50)
print '(a,i8)', 'Matrix size: ', m
print '(a,f12.6)', 'Fortran(s): ', tmin_for
print '(a,f12.6)', 'C-Intrinsics(s): ', tmin_c
print '(a,f12.6)', 'OpenBLAS(s): ', tmin_blas
print '(a,f12.6)', 'Speedup (OpenBLAS): ', tmin_for/tmin_blas
print '(a,f12.6)', 'Speedup (C-Intrinsics): ', tmin_for/tmin_c
print '(a,f12.6)', 'Error (Fortran): ', error_for
print '(a,f12.6)', 'Error (C): ', error_c
deallocate(A,B,C1,C2,C3)
end do
end program compare_dgemm
!===========================
! Aligned allocation helper
!===========================
subroutine allocate_aligned_real64_1D(x, n, alignment, raw_ptr)
use iso_c_binding
use iso_fortran_env, only: dp => real64
implicit none
real(c_double), pointer, intent(inout) :: x(:)
integer, intent(in) :: n
integer(c_int), intent(in), optional :: alignment
type(c_ptr), intent(out) :: raw_ptr
integer(c_long) :: align, total_size
interface
function aligned_alloc_wrapper(alignment, size) bind(C)
use iso_c_binding
integer(c_long), value :: alignment, size
type(c_ptr) :: aligned_alloc_wrapper
end function
end interface
align = merge(alignment, 64, present(alignment))
total_size = n * c_sizeof(0.0_c_double)
raw_ptr = aligned_alloc_wrapper(align, total_size)
if (.not. c_associated(raw_ptr)) stop "allocation failed"
call c_f_pointer(raw_ptr, x, [n])
end subroutine allocate_aligned_real64_1D
!===========================
! BLAS wrapper
!===========================
subroutine dgemm_openblas(A, B, C, m, n, k)
use iso_fortran_env, only: dp => real64
implicit none
integer, intent(in) :: m, n, k
real(dp), intent(in) :: A(m,k), B(k,n)
real(dp), intent(inout) :: C(m,n)
call dgemm('N','N', m,n,k, 1.0_dp, A,m, B,k, 0.0_dp, C,m)
end subroutine dgemm_openblas
!===========================
! Intrinsics DGEMM
!===========================
subroutine dgemm_c_intrinsics(A, B, C, m, n, k)
use iso_fortran_env, only: dp => real64
use iso_c_binding
implicit none
integer, intent(in) :: m, n, k
real(dp), intent(in) :: A(m,k), B(k,n)
real(dp), intent(inout) :: C(m,n)
integer, parameter :: MR=6, NR=8, MC=96, KC=224
integer :: i, j, ib, jb, pb, rows, cols
integer :: m_block, n_block, k_block, NC, paneli, panelj, col
real(dp), pointer :: A_block(:), B_block(:), A_panel(:,:), B_panel(:,:)
type(c_ptr) :: ptrA, ptrB
interface
pure subroutine dgemm_kernel_6x8(n_block, A, m_block, B, k_block, C, ldc) bind(C)
use iso_c_binding
integer(C_INT), intent(in) :: ldc, m_block, n_block, k_block
real(c_double), intent(in) :: A(*), B(*)
real(c_double), intent(inout) :: C(*)
end subroutine
subroutine allocate_aligned_real64_1D(x, n, alignment, raw_ptr)
use iso_c_binding
use iso_fortran_env, only: dp => real64
real(c_double), pointer, intent(inout) :: x(:)
integer, intent(in) :: n
integer(c_int), intent(in), optional :: alignment
type(c_ptr), intent(out) :: raw_ptr
end subroutine
end interface
NC = n
call allocate_aligned_real64_1D(A_block, MC*KC, 64, ptrA)
call allocate_aligned_real64_1D(B_block, NC*KC, 64, ptrB)
! Loop over C blocks
do jb = 1, n, NC
n_block = min(NC, n-jb+1)
do pb = 1, k, KC
k_block = min(KC, k-pb+1)
! Pack B
panelj = 0
do j = 1, n_block, NR
cols = min(NR, n_block-j+1)
B_panel(1:cols,1:k_block) => B_block(panelj*NR*k_block+1 : panelj*NR*k_block+cols*k_block)
do col = 1, cols
B_panel(col,:) = B(pb:pb+k_block-1, jb+j+col-2)
end do
panelj = panelj+1
end do
! Loop over A blocks
do ib = 1, m, MC
m_block = min(MC, m-ib+1)
! Pack A
paneli = 0
do i = 1, m_block, MR
rows = min(MR, m_block-i+1)
A_panel(1:rows,1:k_block) => A_block(paneli*MR*k_block+1 : paneli*MR*k_block+rows*k_block)
A_panel(1:rows,:) = A(ib+i-1:ib+i+rows-2, pb:pb+k_block-1)
paneli = paneli+1
end do
call dgemm_kernel_6x8(n_block, A_block, m_block, B_block, k_block, C(ib,jb), size(C,1))
end do
end do
end do
deallocate(A_block, B_block)
end subroutine dgemm_c_intrinsics
pure subroutine pack_A(mb, kb, A, lda, A_pack, MC, KC)
use iso_fortran_env, only: dp => real64
implicit none
integer, intent(in) :: mb, kb, lda, MC, KC
real(dp), intent(in) :: A(lda,*)
real(dp), intent(out) :: A_pack(MC*KC)
integer, parameter :: MR = 6, NR = 8
integer :: i,j,panel,rows,row
do i = 1, mb, MR
rows = min(MR, mb - i + 1)
panel = i/MR
do j = 1,kb
do row=1,rows
A_pack(panel*MR*kb + (j-1)*rows + row) = A(i+row-1, j)
end do
end do
panel = panel + 1
end do
end subroutine pack_A
pure subroutine pack_B(kb, nb, B, ldb, B_pack, KC, NC)
use iso_fortran_env, only: dp => real64
implicit none
integer, intent(in) :: kb, nb, ldb, KC, NC
real(dp), intent(in) :: B(ldb,*)
real(dp), intent(out) :: B_pack(KC*NC)
integer, parameter :: MR = 6, NR = 8
integer :: i,j,cols,col,panel
do j = 1, nb, NR
panel = j/NR
cols = min(NR, nb - j + 1)
do i = 1, kb
do col = 1, cols
B_pack(panel*NR*kb + (i-1)*cols + col) = B(i, j+col-1)
end do
end do
panel = panel + 1
end do
end subroutine pack_B
subroutine dgemm_pure_fortran(A, B, C, m, n, k)
use iso_fortran_env, only: dp => real64
implicit none
integer, intent(in) :: m, n, k
real(dp), intent(in) :: A(m,k), B(k,n)
real(dp), intent(inout) :: C(m,n)
integer, parameter :: MR = 6, NR = 8
integer, parameter :: MC = 96, KC = 224
integer :: i, j, p, ii, jj, NC
integer :: ib, jb, pb
integer :: ibound, jbound, pbound, paneli, panelj, n_full, m_full
integer :: m_block, n_block, k_block, r
! Per-row accumulators: cr(r, half, lane), half=1 -> cols 0..3, half=2 -> cols 4..7
real(8) :: cr(4,2,MR)
! Scratch for B lanes and A broadcast
real(8) :: b0(4), b1(4), arow
! Transpose scratch for rows 0..3, both halves
real(8) :: col5(4), col6(4), col7(4), col8(4)
real(8) :: col1(4), col2(4), col3(4), col4(4)
! rows 4..5 tails (handled as scalars per column)
real(8) :: r4h0(4), r5h0(4), r4h1(4), r5h1(4)
! Buffers for packed blocks
real(dp), allocatable :: A_block(:), B_block(:)
real(dp), pointer :: CC(:)
NC = n
! Allocate max sized packing buffers
allocate(A_block(MC*KC))
allocate(B_block(KC*NC))
! Loop over blocks of C (m,n), and inner dimension k
do jb = 1, n, NC
n_block = min(NC, n - jb + 1)
do pb = 1, k, KC
k_block = min(KC, k - pb + 1)
! Pack B block (k_block x n_block) into contiguous B_block
call pack_B(k_block, n_block, B(pb, jb), size(B,1), B_block, KC, NC)
do ib = 1, m, MC
m_block = min(MC, m - ib + 1)
! Pack A block (m_block x k_block) into contiguous A_block
call pack_A(m_block, k_block, A(ib, pb), size(A,1), A_block, MC, KC)
m_full = m_block - mod(m_block, MR)
n_full = n_block - mod(n_block,NR)
! Now compute C-block using microkernel calls
! Loop over microtiles inside the block
do j = 1, n_full, NR
panelj = j/NR
do i = 1, m_full, MR
paneli = i/MR
! zero accumulators
cr = 0.0d0
! ---- main accumulation over K ----
do p = 1, k_block
! load B row p (row-major): first half j=1..4, second half j=5..8
b0(1:4) = B_block(panelj*NR*k_block + (p-1)*NR + 1:panelj*NR*k_block + (p-1)*NR + 4)
b1(1:4) = B_block(panelj*NR*k_block + (p-1)*NR + 5:panelj*NR*k_block + (p-1)*NR + 8)
! for each row of A, broadcast and FMA into both halves
do r = 1, MR
arow = A_block(paneli*MR*k_block+ r + (p-1)*MR) ! A(r,p) with lda=MR
! accumulate into cols 0..3
cr(1:4,1,r) = cr(1:4,1,r) + arow*b0(1:4)
! accumulate into cols 4..7
cr(1:4,2,r) = cr(1:4,2,r) + arow*b1(1:4)
end do
end do
! ---- store back to C (column-major) ----
! First handle rows 0..3 (i=1..4): transpose 4x4 for each half then add to C
! transpose rows->columns
col1 = [ cr(1,1,1), cr(1,1,2), cr(1,1,3), cr(1,1,4) ] ! column 1, rows 1..4
col2 = [ cr(2,1,1), cr(2,1,2), cr(2,1,3), cr(2,1,4) ] ! column 2
col3 = [ cr(3,1,1), cr(3,1,2), cr(3,1,3), cr(3,1,4) ] ! column 3
col4 = [ cr(4,1,1), cr(4,1,2), cr(4,1,3), cr(4,1,4) ] ! column 4
! Half 1: cols 4..7
col5 = [ cr(1,2,1), cr(1,2,2), cr(1,2,3), cr(1,2,4) ] ! column 5, rows 1..4
col6 = [ cr(2,2,1), cr(2,2,2), cr(2,2,3), cr(2,2,4) ] ! column 6
col7 = [ cr(3,2,1), cr(3,2,2), cr(3,2,3), cr(3,2,4) ] ! column 7
col8 = [ cr(4,2,1), cr(4,2,2), cr(4,2,3), cr(4,2,4) ] ! column 8
! accumulate into C(:,1..4)
! write cleanly:
ii = ib + i - 2
jj = jb + j - 1
C(ii + 1:ii + 4 , jj + 0) = C(ii + 1:ii + 4 , jj + 0) + col1(1:4)
C(ii + 1:ii + 4 , jj + 1) = C(ii + 1:ii + 4 , jj + 1) + col2(1:4)
C(ii + 1:ii + 4 , jj + 2) = C(ii + 1:ii + 4 , jj + 2) + col3(1:4)
C(ii + 1:ii + 4 , jj + 3) = C(ii + 1:ii + 4 , jj + 3) + col4(1:4)
C(ii + 1:ii + 4 , jj + 4) = C(ii + 1:ii + 4 , jj + 4) + col5(1:4)
C(ii + 1:ii + 4 , jj + 5) = C(ii + 1:ii + 4 , jj + 5) + col6(1:4)
C(ii + 1:ii + 4 , jj + 6) = C(ii + 1:ii + 4 , jj + 6) + col7(1:4)
C(ii + 1:ii + 4 , jj + 7) = C(ii + 1:ii + 4 , jj + 7) + col8(1:4)
! ---- rows 4..5 tails (i=5,6) per column (no transpose needed) ----
r4h0(:) = cr(:,1,5); r5h0(:) = cr(:,1,6)
r4h1(:) = cr(:,2,5); r5h1(:) = cr(:,2,6)
! cols 0..3
C(ii + 5 , jj + 0) = C(ii + 5 , jj + 0) + r4h0(1)
C(ii + 6 , jj + 0) = C(ii + 6 , jj + 0) + r5h0(1)
C(ii + 5 , jj + 1) = C(ii + 5 , jj + 1) + r4h0(2)
C(ii + 6 , jj + 1) = C(ii + 6 , jj + 1) + r5h0(2)
C(ii + 5 , jj + 2) = C(ii + 5 , jj + 2) + r4h0(3)
C(ii + 6 , jj + 2) = C(ii + 6 , jj + 2) + r5h0(3)
C(ii + 5 , jj + 3) = C(ii + 5 , jj + 3) + r4h0(4)
C(ii + 6 , jj + 3) = C(ii + 6 , jj + 3) + r5h0(4)
! cols 4..7
C(ii + 5 , jj + 4) = C(ii + 5 , jj + 4) + r4h1(1)
C(ii + 6 , jj + 4) = C(ii + 6 , jj + 4) + r5h1(1)
C(ii + 5 , jj + 5) = C(ii + 5 , jj + 5) + r4h1(2)
C(ii + 6 , jj + 5) = C(ii + 6 , jj + 5) + r5h1(2)
C(ii + 5 , jj + 6) = C(ii + 5 , jj + 6) + r4h1(3)
C(ii + 6 , jj + 6) = C(ii + 6 , jj + 6) + r5h1(3)
C(ii + 5 , jj + 7) = C(ii + 5 , jj + 7) + r4h1(4)
C(ii + 6 , jj + 7) = C(ii + 6 , jj + 7) + r5h1(4)
end do
end do
do j = n_full+1, n_block, NR
jbound = min(NR, n_block - j + 1)
panelj = j/NR
do i = 1, m_full, MR
paneli = i/MR
! Edge cases fall back to scalar kernel
call kernel_scalar(k_block, MR, jbound, &
A_block(paneli*MR*k_block + 1), MR, &
B_block(panelj*NR*k_block + 1), jbound, &
C(ib + i - 1:, jb + j - 1), size(C,1))
end do
end do
do j = 1, n_full, NR
panelj = j/NR
do i = m_full+1,m_block, MR
ibound = min(MR, m_block - i + 1)
paneli = i/MR
! Edge cases fall back to scalar kernel
call kernel_scalar(k_block, ibound, NR, &
A_block(paneli*MR*k_block + 1), ibound, &
B_block(panelj*NR*k_block + 1), NR, &
C(ib + i - 1:, jb + j - 1), size(C,1))
end do
end do
do j = n_full+1, n_block, NR
jbound = min(NR, n_block - j + 1)
panelj = j/NR
do i = m_full+1,m_block, MR
ibound = min(MR, m_block - i + 1)
paneli = i/MR
! Edge cases fall back to scalar kernel
call kernel_scalar(k_block, ibound, jbound, &
A_block(paneli*MR*k_block + 1), ibound, &
B_block(panelj*NR*k_block + 1), jbound, &
C(ib + i - 1:, jb + j - 1), size(C,1))
end do
end do
end do
end do
end do
deallocate(A_block)
deallocate(B_block)
contains
pure subroutine kernel_scalar(k, m_block, n_block, A, lda, B, ldb, C, ldc)
! General scalar cleanup for an m_block x n_block tile (m_block <= MR, n_block <= NR)
use iso_c_binding, only: c_int, c_double
implicit none
integer, intent(in) :: k, m_block, n_block
integer, intent(in) :: lda, ldb, ldc
real(c_double), intent(in) :: A(*) ! packed A (lda stride)
real(c_double), intent(in) :: B(*) ! packed B (ldb stride)
real(c_double), intent(inout) :: C(*)! C tile (ldc stride)
integer :: p, i, j
real(c_double) :: cij, aval, bval
do j = 1, n_block
do i = 1, m_block
! load existing C(i,j)
cij = C(i + (j-1)*ldc)
do p = 1, k
! A(i,p) at A( i + (p-1)*lda )
aval = A(i + (p-1)*lda)
! B(p,j) at B( p + (j-1)*ldb )
bval = B(j + (p-1)*ldb)
cij = cij + aval * bval
end do
C(i + (j-1)*ldc) = cij
end do
end do
end subroutine kernel_scalar
end subroutine dgemm_pure_fortran