Yeah, if what you're trying to show is that Julia's GC is slower than manual memory management, you're right.
using LoopVectorization

function eval_exp_turbo!(A)
    N = size(A, 1)
    a = range(0, stop=2*pi, length=N)
    # View the complex matrix as a 2×N×N Float64 array so @tturbo can vectorize it.
    _A = reinterpret(reshape, Float64, A)
    @tturbo for j in 1:N, i in 1:N
        x = sqrt(a[i]^2 + a[j]^2)
        prefac = exp(-x)
        s, c = sincos(100 * x)
        _A[1, i, j] = prefac * c  # real part
        _A[2, i, j] = prefac * s  # imaginary part
    end
    return A
end
# Warm up: compile specializations for both Matrix and SubArray arguments.
eval_exp_turbo!(Matrix{ComplexF64}(undef, 5, 5));
eval_exp_turbo!(view(Matrix{ComplexF64}(undef, 5, 5), 1:5, 1:5));
Threads.nthreads()
Four variants follow: (1) the idiomatic approach, allocating a fresh GC-managed matrix each iteration; (2) preallocating one large matrix and reusing views into it; (3) malloc/free on every iteration, with unsafe_wrap producing an Array the GC doesn't own; and (4) a single malloc reused across iterations and freed at the end.

@time begin
    for N in 1000:1000:10000
        A = Matrix{ComplexF64}(undef, N, N)
        eval_exp_turbo!(A)
    end
end
@time begin
    Abig = Matrix{ComplexF64}(undef, 10000, 10000)
    for N in 1000:1000:10000
        A = eval_exp_turbo!(view(Abig, 1:N, 1:N))
    end
end
@time begin
    for N in 1000:1000:10000
        p = Libc.malloc(sizeof(ComplexF64) * N * N)
        A = unsafe_wrap(Array, Ptr{ComplexF64}(p), (N, N))
        eval_exp_turbo!(A)
        Libc.free(p)
    end
end
@time begin
    p = Libc.malloc(sizeof(ComplexF64) * 10_000^2)
    for N in 1000:1000:10000
        A = unsafe_wrap(Array, Ptr{ComplexF64}(p), (N, N))
        eval_exp_turbo!(A)
    end
    Libc.free(p)
end
Yields:
julia> Threads.nthreads()
8

julia> @time begin
           for N in 1000:1000:10000
               A = Matrix{ComplexF64}(undef, N, N)
               eval_exp_turbo!(A)
           end
       end
  0.437918 seconds (48 allocations: 5.737 GiB, 41.34% gc time)

julia> @time begin
           Abig = Matrix{ComplexF64}(undef, 10000, 10000)
           for N in 1000:1000:10000
               A = eval_exp_turbo!(view(Abig, 1:N, 1:N))
           end
       end
  0.264932 seconds (66 allocations: 1.490 GiB, 2.48% gc time)

julia> @time begin
           for N in 1000:1000:10000
               p = Libc.malloc(sizeof(ComplexF64) * N * N)
               A = unsafe_wrap(Array, Ptr{ComplexF64}(p), (N, N))
               eval_exp_turbo!(A)
               Libc.free(p)
           end
       end
  0.216784 seconds (50 allocations: 1.719 KiB)

julia> @time begin
           p = Libc.malloc(sizeof(ComplexF64) * 10_000^2)
           for N in 1000:1000:10000
               A = unsafe_wrap(Array, Ptr{ComplexF64}(p), (N, N))
               eval_exp_turbo!(A)
           end
           Libc.free(p)
       end
  0.232580 seconds (60 allocations: 2.172 KiB)
Out of these four, I do think the first is the most idiomatic, and it will generally be slower than manual memory management in Fortran.
I do think that is a valid point when comparing the languages. It's also a valid point that Fortran makes large, dynamically-sized, stack-allocated arrays easy.
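As a rough counterpart on the Julia side, a bump allocator can give dynamically-sized scratch arrays that never touch the GC. A minimal sketch, assuming the Bumper.jl package (which is not used in the benchmarks above):

using Bumper

function scratch_demo(N)
    # @no_escape opens a scope backed by a task-local bump allocator;
    # @alloc carves an N×N scratch array out of it. The memory is reclaimed
    # when the scope exits, with no GC involvement, so the array must not
    # escape the block.
    @no_escape begin
        buf = @alloc(Float64, N, N)
        fill!(buf, 0.0)
        sum(buf)
    end
end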
I do think it is also worth noting that this can be worked around in Julia when you care. In practice, many libraries don't bother, which makes their development easier at the cost of extra GC pressure that can badly hurt Julia's multithreaded performance in allocation-heavy real-world workloads.
But from my perspective, I can't fix how people manage memory in Julia programs, or Julia's GC itself, from within LoopVectorization.jl. That's why those benchmarks are less interesting to me personally: while relevant and worth addressing, they aren't something that can be addressed from within the libraries I like to work on.
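To make "when you care" concrete, here's a minimal sketch of packaging the second variant's preallocate-and-view pattern behind a small helper (the ExpWorkspace name and helper are mine, purely illustrative):

# Hypothetical helper: preallocate once at the largest size you'll need,
# then hand out views so the hot loop never allocates a matrix.
struct ExpWorkspace
    buf::Matrix{ComplexF64}
end
ExpWorkspace(Nmax::Integer) = ExpWorkspace(Matrix{ComplexF64}(undef, Nmax, Nmax))

function eval_exp!(ws::ExpWorkspace, N::Integer)
    A = view(ws.buf, 1:N, 1:N)  # cheap view wrapper, no large allocation
    return eval_exp_turbo!(A)
end

ws = ExpWorkspace(10_000)   # pay the big allocation once
for N in 1000:1000:10_000
    eval_exp!(ws, N)        # reuses ws.buf on every iteration
end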
EDIT:
> tail -n12 fortcomplexexp2.f90
  t1 = omp_get_wtime()
  allocate(m(10000,10000))
  do i = 1, 10
     n = 1000*i
     a = [(2*j*pi/(n - 1), j=0, n - 1)]
     k = cmplx(100.0, 1.0)
     call eikr(a, k, n, m)
  enddo
  deallocate(m)
  t2 = omp_get_wtime()
  print *, n, t2 - t1, 'seconds'
end program
> ifort -qopenmp -mkl -fast fortcomplexexp2.f90 -o fortcomplex_v2
> OMP_NUM_THREADS=8 ./fortcomplex_v2
10000 0.230892181396484 seconds
> OMP_NUM_THREADS=18 ./fortcomplex_v2
10000 0.212796926498413 seconds
Performance is comparable now when using 8 threads, but while Fortran got slightly faster at 18 threads, Julia slowed down from 0.23 s to 0.35 s. Hmm.
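One way to sanity-check how much of that 18-thread slowdown is GC would be to rerun the idiomatic variant under @timed and look at gctime. A diagnostic sketch, not something I've measured here:

# Diagnostic sketch: compare stats.gctime across thread counts to see
# whether GC is what's eating the extra wall time.
GC.gc()  # start from a collected heap
stats = @timed for N in 1000:1000:10000
    A = Matrix{ComplexF64}(undef, N, N)
    eval_exp_turbo!(A)
end
println("elapsed = ", stats.time, " s, gc = ", stats.gctime, " s")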