64-bit.
I think it’s more likely that better language integration will go the other way: Julia’s compiler team wants to “rip open” the compiler, better exposing it to library authors to build their own.
LoopVectorization.jl is experimental, and will go through at least one major overhaul before I am comfortable tagging a “1.0” version.
It does. Others have confirmed it works on Power8 and Nvidea Jetson (ARM).
I bought an M1 Mac specifically so I could test and make sure performance isn’t accidentally bad, and can confirm it works there as well (you just need Julia master, and – until we get a Hwloc_jll version compatible with Apple ARM – you’d need to ] dev Hwloc
to replace Hwloc_jll
with a path to the library installed some other way; I used Homebrew).
Given that LoopVectorization is still at version “0”, now (or, “sooner rather than later”) would be the time to deprecate @avx
, and choose a new name for the macro.
@avx
seems to consistently cause confusion over whether platforms other than x86 are supported. The reaction to the name seems generally pretty negative.
I’m open to name suggestions here, but I’d probably ask on Julia’s Slack/Zulip.
Depends on the loop.
If autovectorization requires violating IEEE, then it will require annotation of some kind, e.g. @fastmath
or @simd
from Base Julia.
LoopVectorization.@avx
is a macro implemented by a Julia package that implements its own (experimental) autovectorizer.
It does more than just use special functions.
LoopVectorization.jl’s approach does not have that limitation, although I should cleanup erf
support, which currently requires specifying verf
.
LoopVectorization is also pretty good at nested loops. This is most relevant whenever anyone’s doing things not already easily/well expressed by BLAS, e.g. convolutions and tensor contractions (where expressing them as BLAS has some overhead).
gfortran code for the benchmark
module matrixmul
use ISO_C_BINDING
contains
! gfortran -Ofast -march=native -mprefer-vector-width=512 -shared -fPIC matmul.f90 -o libmatmul.so
subroutine AmulB(C, A, B, M, K, N) BIND(C, name="AmulB")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(M, K), intent(in) :: A
real(C_double), dimension(K, N), intent(in) :: B
C = matmul(A, B)
end subroutine AmulB
subroutine AmulB7x7x7(C, A, B) BIND(C, name="AmulB7x7x7")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: A
real(C_double), dimension(7, 7), intent(in) :: B
C = matmul(A, B)
end subroutine AmulB7x7x7
subroutine AmulB8x8x8(C, A, B) BIND(C, name="AmulB8x8x8")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: A
real(C_double), dimension(8, 8), intent(in) :: B
C = matmul(A, B)
end subroutine AmulB8x8x8
subroutine AmulBt(C, A, Bt, M, K, N) BIND(C, name="AmulBt")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(M, K), intent(in) :: A
real(C_double), dimension(N, K), intent(in) :: Bt
C = matmul(A, transpose(Bt))
end subroutine AmulBt
subroutine AmulBt7x7x7(C, A, Bt) BIND(C, name="AmulBt7x7x7")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: A
real(C_double), dimension(7, 7), intent(in) :: Bt
C = matmul(A, transpose(Bt))
end subroutine AmulBt7x7x7
subroutine AmulBt8x8x8(C, A, Bt) BIND(C, name="AmulBt8x8x8")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: A
real(C_double), dimension(8, 8), intent(in) :: Bt
C = matmul(A, transpose(Bt))
end subroutine AmulBt8x8x8
subroutine AtmulB(C, At, B, M, K, N) BIND(C, name="AtmulB")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(K, M), intent(in) :: At
real(C_double), dimension(K, N), intent(in) :: B
C = matmul(transpose(At), B)
end subroutine AtmulB
subroutine AtmulB7x7x7(C, At, B) BIND(C, name="AtmulB7x7x7")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: At
real(C_double), dimension(7, 7), intent(in) :: B
C = matmul(transpose(At), B)
end subroutine AtmulB7x7x7
subroutine AtmulB8x8x8(C, At, B) BIND(C, name="AtmulB8x8x8")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: At
real(C_double), dimension(8, 8), intent(in) :: B
C = matmul(transpose(At), B)
end subroutine AtmulB8x8x8
subroutine AtmulBt(C, At, Bt, M, K, N) BIND(C, name="AtmulBt")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(K, M), intent(in) :: At
real(C_double), dimension(N, K), intent(in) :: Bt
C = transpose(matmul(Bt, At))
end subroutine AtmulBt
subroutine AtmulBt7x7x7(C, At, Bt) BIND(C, name="AtmulBt7x7x7")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: At
real(C_double), dimension(7, 7), intent(in) :: Bt
C = transpose(matmul(Bt, At))
end subroutine AtmulBt7x7x7
subroutine AtmulBt8x8x8(C, At, Bt) BIND(C, name="AtmulBt8x8x8")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: At
real(C_double), dimension(8, 8), intent(in) :: Bt
C = transpose(matmul(Bt, At))
end subroutine AtmulBt8x8x8
subroutine AmulB_loop(C, A, B, M, K, N) BIND(C, name="AmulB_loop")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(M, K), intent(in) :: A
real(C_double), dimension(K, N), intent(in) :: B
real(C_double) :: Cmn
do concurrent(nn = 1:N, mm = 1:M)
Cmn = 0.0
do kk = 1,K
Cmn = Cmn + A(mm,kk) * B(kk,nn)
end do
C(mm,nn) = Cmn
end do
end subroutine AmulB_loop
subroutine AmulB7x7x7_loop(C, A, B) BIND(C, name="AmulB7x7x7_loop")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: A
real(C_double), dimension(7, 7), intent(in) :: B
real(C_double) :: Cmn
do concurrent(nn = 1:7, mm = 1:7)
Cmn = 0.0
do kk = 1,7
Cmn = Cmn + A(mm,kk) * B(kk,nn)
end do
C(mm,nn) = Cmn
end do
end subroutine AmulB7x7x7_loop
subroutine AmulB8x8x8_loop(C, A, B) BIND(C, name="AmulB8x8x8_loop")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: A
real(C_double), dimension(8, 8), intent(in) :: B
real(C_double) :: Cmn
do concurrent(nn = 1:8, mm = 1:8)
Cmn = 0.0
do kk = 1,8
Cmn = Cmn + A(mm,kk) * B(kk,nn)
end do
C(mm,nn) = Cmn
end do
end subroutine AmulB8x8x8_loop
subroutine AmulBt_loop(C, A, Bt, M, K, N) BIND(C, name="AmulBt_loop")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(M, K), intent(in) :: A
real(C_double), dimension(N, K), intent(in) :: Bt
real(C_double) :: Cmn
do concurrent(nn = 1:N, mm = 1:M)
Cmn = 0.0
do kk = 1,K
Cmn = Cmn + A(mm,kk) * Bt(nn,kk)
end do
C(mm,nn) = Cmn
end do
end subroutine AmulBt_loop
subroutine AmulBt7x7x7_loop(C, A, Bt) BIND(C, name="AmulBt7x7x7_loop")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: A
real(C_double), dimension(7, 7), intent(in) :: Bt
real(C_double) :: Cmn
do concurrent(nn = 1:7, mm = 1:7)
Cmn = 0.0
do kk = 1,7
Cmn = Cmn + A(mm,kk) * Bt(nn,kk)
end do
C(mm,nn) = Cmn
end do
end subroutine AmulBt7x7x7_loop
subroutine AmulBt8x8x8_loop(C, A, Bt) BIND(C, name="AmulBt8x8x8_loop")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: A
real(C_double), dimension(8, 8), intent(in) :: Bt
real(C_double) :: Cmn
do concurrent(nn = 1:8, mm = 1:8)
Cmn = 0.0
do kk = 1,8
Cmn = Cmn + A(mm,kk) * Bt(nn,kk)
end do
C(mm,nn) = Cmn
end do
end subroutine AmulBt8x8x8_loop
subroutine AtmulB_loop(C, At, B, M, K, N) BIND(C, name="AtmulB_loop")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(K, M), intent(in) :: At
real(C_double), dimension(K, N), intent(in) :: B
real(C_double) :: Cmn
do concurrent(nn = 1:N, mm = 1:M)
Cmn = 0.0
do kk = 1,K
Cmn = Cmn + At(kk,mm) * B(kk,nn)
end do
C(mm,nn) = Cmn
end do
end subroutine AtmulB_loop
subroutine AtmulB7x7x7_loop(C, At, B) BIND(C, name="AtmulB7x7x7_loop")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: At
real(C_double), dimension(7, 7), intent(in) :: B
real(C_double) :: Cmn
do concurrent(nn = 1:7, mm = 1:7)
Cmn = 0.0
do kk = 1,7
Cmn = Cmn + At(kk,mm) * B(kk,nn)
end do
C(mm,nn) = Cmn
end do
end subroutine AtmulB7x7x7_loop
subroutine AtmulB8x8x8_loop(C, At, B) BIND(C, name="AtmulB8x8x8_loop")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: At
real(C_double), dimension(8, 8), intent(in) :: B
real(C_double) :: Cmn
do concurrent(nn = 1:8, mm = 1:8)
Cmn = 0.0
do kk = 1,8
Cmn = Cmn + At(kk,mm) * B(kk,nn)
end do
C(mm,nn) = Cmn
end do
end subroutine AtmulB8x8x8_loop
subroutine AtmulBt_loop(C, At, Bt, M, K, N) BIND(C, name="AtmulBt_loop")
integer(C_long), intent(in) :: M, K, N
real(C_double), dimension(M, N), intent(out) :: C
real(C_double), dimension(K, M), intent(in) :: At
real(C_double), dimension(N, K), intent(in) :: Bt
real(C_double) :: Cmn
do concurrent(nn = 1:N, mm = 1:M)
Cmn = 0.0
do kk = 1,K
Cmn = Cmn + At(kk,mm) * Bt(nn,kk)
end do
C(mm,nn) = Cmn
end do
end subroutine AtmulBt_loop
subroutine AtmulBt7x7x7_loop(C, At, Bt) BIND(C, name="AtmulBt7x7x7_loop")
real(C_double), dimension(7, 7), intent(out) :: C
real(C_double), dimension(7, 7), intent(in) :: At
real(C_double), dimension(7, 7), intent(in) :: Bt
real(C_double) :: Cmn
do concurrent(nn = 1:7, mm = 1:7)
Cmn = 0.0
do kk = 1,7
Cmn = Cmn + At(kk,mm) * Bt(nn,kk)
end do
C(mm,nn) = Cmn
end do
end subroutine AtmulBt7x7x7_loop
subroutine AtmulBt8x8x8_loop(C, At, Bt) BIND(C, name="AtmulBt8x8x8_loop")
real(C_double), dimension(8, 8), intent(out) :: C
real(C_double), dimension(8, 8), intent(in) :: At
real(C_double), dimension(8, 8), intent(in) :: Bt
real(C_double) :: Cmn
do concurrent(nn = 1:8, mm = 1:8)
Cmn = 0.0
do kk = 1,8
Cmn = Cmn + At(kk,mm) * Bt(nn,kk)
end do
C(mm,nn) = Cmn
end do
end subroutine AtmulBt8x8x8_loop
real(C_double) function dot3v2(x, A, y, M, N) BIND(C, name="dot3v2")
integer(C_long), intent(in) :: M, N
real(C_double), intent(in) :: x(M), A(M,N), y(N)
real(C_double) :: t
integer(C_long) :: mm, nn
dot3v2 = 0.0d0
do concurrent(nn = 1:N, mm = 1:M)
dot3v2 = dot3v2 + x(mm) * A(mm, nn) * y(nn)
end do
end function dot3v2
real(C_double) function dot3(x, A, y, M, N) BIND(C, name="dot3")
integer(C_long), intent(in) :: M, N
real(C_double), intent(in) :: x(M), A(M,N), y(N)
real(C_double) :: t
integer(C_long) :: mm, nn
dot3 = 0.0d0
do concurrent(nn = 1:N)
t = 0.0d0
do concurrent(mm = 1:M)
t = t + x(mm) * A(mm, nn)
end do
dot3 = dot3 + t * y(nn)
end do
end function dot3
real(C_double) function dot3builtin(x, A, y, M, N) BIND(C, name="dot3builtin")
integer(C_long), intent(in) :: M, N
real(C_double), intent(in) :: x(M), A(M,N), y(N)
dot3builtin = dot_product(x, matmul(A, y))
end function dot3builtin
end module matrixmul
Julia code and script for running benchmarks
using LoopVectorization, ArrayInterface, Static, LinearAlgebra
function mul_serial!(C, A, B)
@avx for n ∈ indices((C,B), 2), m ∈ indices((C,A), 1)
Cmn = zero(eltype(C))
for k ∈ indices((A,B), (2,1))
Cmn += A[m,k] * B[k,n]
end
C[m,n] = Cmn
end
end
function mul_threads!(C, A, B)
@avxt for n ∈ indices((C,B), 2), m ∈ indices((C,A), 1)
Cmn = zero(eltype(C))
for k ∈ indices((A,B), (2,1))
Cmn += A[m,k] * B[k,n]
end
C[m,n] = Cmn
end
end
const LIBFORTMUL = joinpath(pwd(), "libmatmul.so")
if !isfile(LIBFORTMUL)
# run(`gfortran -Ofast -march=native -fdisable-tree-cunrolli -floop-nest-optimize -funroll-loops -mprefer-vector-width=$(LoopVectorization.pick_vector_width(Float64)*64) -shared -fPIC matmul.f90 -o $LIBFORTMUL`)
run(`gfortran -Ofast -march=native -mprefer-vector-width=$(LoopVectorization.pick_vector_width(Float64)*64) -shared -fPIC matmul.f90 -o $LIBFORTMUL`)
end
struct SizedMatrix{M,N,T} <: DenseMatrix{T}
data::Matrix{T}
end
@inline SizedMatrix{M,N}(data::Matrix{T}) where {M,N,T} = SizedMatrix{M,N,T}(data)
@inline Base.unsafe_convert(::Type{Ptr{T}}, A::SizedMatrix{M,N,T}) where {M,N,T} = Base.unsafe_convert(Ptr{T}, A.data)
@inline Base.size(A::SizedMatrix{M,N}) where {M,N} = (M,N)
@inline Base.elsize(A::SizedMatrix{M,N,T}) where {M,N,T} = sizeof(T)
@inline Base.length(A::SizedMatrix{M,N}) where {M,N} = M*N
@inline ArrayInterface.size(A::SizedMatrix{M,N}) where {M,N} = (static(M),static(N))
@inline ArrayInterface.axes(A::SizedMatrix{M,N}, ::StaticInt{1}) where {M,N} = static(1):static(M)
@inline ArrayInterface.axes(A::SizedMatrix{M,N}, ::StaticInt{2}) where {M,N} = static(1):static(N)
@inline ArrayInterface.axes(A::SizedMatrix{M,N}) where {M,N} = (static(1):static(M), static(1):static(N))
@inline Base.parent(A::SizedMatrix) = A.data
@inline ArrayInterface.parent_type(::Type{<:SizedMatrix{M,N,T}}) where {M,N,T} = Matrix{T}
@inline function Base.getindex(A::SizedMatrix, i...)
@boundscheck checkbounds(A, i...)
@inbounds A.data[i...]
end
@inline function Base.setindex!(A::SizedMatrix, v, i...)
@boundscheck checkbounds(A, i...)
@inbounds A.data[i...] = v
end
for At ∈ (false,true), Bt ∈ (false,true)
fname = Symbol(At ? "At" : "A", "mul", Bt ? "Bt" : "B")
for static_size ∈ (0,7,8)
if static_size == 7
Ctype = :(SizedMatrix{7,7,Float64})
fname_sized = Symbol(fname, "7x7x7")
elseif static_size == 8
Ctype = :(SizedMatrix{8,8,Float64})
fname_sized = Symbol(fname, "8x8x8")
else
Ctype = :(Matrix{Float64})
fname_sized = fname
end
Atype = At ? :(Adjoint{Float64,$Ctype}) : Ctype
Btype = Bt ? :(Adjoint{Float64,$Ctype}) : Ctype
A = At ? :(parent(A)) : :A
B = Bt ? :(parent(B)) : :B
for loop ∈ (false, true)
funcname! = loop ? :fortmul! : :fortmul_loop!
cfname = QuoteNode(loop ? Symbol(fname_sized, "_loop") : fname_sized)
matmul_quote = if static_size == 0
quote
function $funcname!(C::$Ctype, A::$Atype, B::$Btype)
M, N = size(C); K = size(B,1)
ccall(($cfname, LIBFORTMUL), Cvoid, (Ref{Float64},Ref{Float64},Ref{Float64},Ref{Clong},Ref{Clong},Ref{Clong}), C, $A, $B, M, K, N)
end
end
else
quote
function $funcname!(C::$Ctype, A::$Atype, B::$Btype)
ccall(($cfname, LIBFORTMUL), Cvoid, (Ref{Float64},Ref{Float64},Ref{Float64}), C, $A, $B)
end
end
end
@eval $matmul_quote
end
end
end
const Matrix7x7 = SizedMatrix{7,7}
const Matrix8x8 = SizedMatrix{8,8}
A7x7 = rand(7,7); B7x7 = rand(7,7); C7x7 = rand(7,7);
A8x8 = rand(8,8); B8x8 = rand(8,8); C8x8 = rand(8,8);
function print_matmul_summary(C, A, B)
M, K = size(A); N = size(B,2);
A_str = A isa Adjoint ? "A'" : "A"
B_str = B isa Adjoint ? "B'" : "B"
static_str = C isa SizedMatrix ? "compile" : "runtime"
println("Running $(static_str)-sized $(M)×$(N) = $(M)×$(K) * $(K)×$(N) benchmarks for C = $A_str * $B_str")
end
function run_benchmark(C,A,B)
print_matmul_summary(C,A,B)
M, K = size(A); N = size(B,2)
println("gfortran Builtin:")
t = @belapsed fortmul!($C, $A, $B)
println("GFLOPS: $(2e-9M*K*N/t)")
Ccopy = copy(C)
println("gfortran Loops:")
t = @belapsed fortmul_loop!($C, $A, $B)
println("GFLOPS: $(2e-9M*K*N/t)")
@assert C ≈ Ccopy
println("LoopVectorization.jl Single-Thread:")
t = @belapsed mul_serial!($C, $A, $B)
println("GFLOPS: $(2e-9M*K*N/t)")
@assert C ≈ Ccopy
if length(C) > 32^2
println("LoopVectorization.jl Multiple-Threads:")
t = @belapsed mul_threads!($C, $A, $B)
println("GFLOPS: $(2e-9M*K*N/t)")
@assert C ≈ Ccopy
end
println()
end
for (C,A,B) ∈ ((C7x7,A7x7,B7x7),(C8x8,A8x8,B8x8)), At ∈ (false,true), Bt ∈ (false,true), static_size ∈ (false,true)
M,K = size(A); N = size(B,2)
T = static_size ? (M == 7 ? Matrix7x7 : Matrix8x8) : identity
Atemp = T(A); Btemp = T(B)
Atemp = At ? Atemp' : Atemp
Btemp = Bt ? Btemp' : Btemp
run_benchmark(T(C), Atemp, Btemp)
end
for M ∈ (71, 72, 144, 216)
K = N = M
A = rand(M,K); B = rand(K,N);
C = Matrix{Float64}(undef, M, N);
run_benchmark(C, A, B)
run_benchmark(C, A, B')
run_benchmark(C, A', B)
run_benchmark(C, A', B')
end
Also note that there’s only two versions of the matrix multiply function in Julia: 1 single threaded, another multithreaded. The same function is used for all transposed permutations, without us having to write and optimize a different version for each.