Hello,
I have a process where I need to do repeated 3D convolutions and correlations between “large” blocks of data (say 500x50x50) and small operators (typically 51x5x5, centered along all dimensions).
I have some routines that I wrote for that, just a “naive” implementation with 5 nested loops, plus a call to BLAS sapxy
or sdot
in the inner loop (that is, 6 nested loops, as expected).
Recently I have tested the Intel MKL routines that perform multidimensional convolutions/correlations, to see if I could get some speed-up compared to my naive implementation. And the results somehow puzzle me…
For simplicity I took blocks and operators of equal sizes in all dimensions: data blocks are 100x100x100, and I am testing various operator sizes lop * lop * lop
, with lop
varying from 1 to 25. The MKL routines have 3 different modes: “DIRECT”, “FFT”, “AUTO”, and I’m testing all of them. I am using the ifort 18 version, with -O3 -mkl
options, and I set MKL_NUM_THREADS=1
to prevent MKL using multithreading.
The timings are shown below:
As expected, the FFT mode performance does not depend that much on the operator size (as all FFTs are probably set according to the largest size between the data block and the operator), and consequently this mode is faster for large enough operator sizes.
But besides there are some surprises:
- The DIRECT mode is always much slower than my naive loops. I was expecting this mode to implement something very similar to what I’m doing, but possibly better optimized (with blocking or whatever). At worst I was expecting similar runtimes, not larger ones!
- In the correlations, the DIRECT mode is faster than the FFT mode for small operator sizes, but still the AUTO mode internally selects the FFT mode! It seems the AUTO mode always reverts to FFT, which looks like a bug to me.
- In the MKL FFT mode, correlations are about 8x slower than the convolutions (and I can’t get why).
I would be interested in your comments and possible explanations.
The test code is below… BTW how Intel distributing the headers if weird: instead of a .mod file, this is an source include file that contains Fortran modules. Not convenient when you have constraints on the module names in a project (unless copying the file in the project and modifying the names).
module convcorr_m
implicit none
public
contains
!*******************************************************************************
! 3D convolution y = y + f*x *
! or *
! 3D correlation f = f + x'*y *
!*******************************************************************************
subroutine cc_sconv3d(x,y,n1,n2,n3,f,l1n,l1p,l2n,l2p,l3n,l3p,cc,reset)
implicit none
! Arguments
integer , intent(in ) :: n1, n2, n3, l1n, l1p, l2n, l2p, l3n, l3p
real , intent(in ) :: x(n1,n2,n3)
real , intent(inout) :: y(n1,n2,n3)
real , intent(inout) :: f(l1n:l1p,l2n:l2p,l3n:l3p)
character(len=*), intent(in ), optional :: cc
logical , intent(in ), optional :: reset
! Local variables
integer :: mode, i2, i3, j1, j2, j3, i1i, i1f, i2i, i2f, i3i, i3f
real :: sdot
!*******************************************************************************
mode = 0
if (present(cc)) then
if (cc == 'corr' .or. cc == 'CORR') mode = 1
end if
if (mode == 0) then
if (present(reset)) then
if (reset) y(:,:,:) = 0.0
end if
do j3 = l3n, l3p
i3i = max(1,1+j3) ; i3f = min(n3,n3+j3)
do i3 = i3i, i3f
do j2 = l2n, l2p
i2i = max(1,1+j2) ; i2f = min(n2,n2+j2)
do i2 = i2i, i2f
do j1 = l1n, l1p
i1i = max(1,1+j1) ; i1f = min(n1,n1+j1)
call saxpy(i1f-i1i+1,f(j1,j2,j3),x(i1i-j1,i2-j2,i3-j3),1,y(i1i,i2,i3),1)
end do
end do
end do
end do
end do
else
if (present(reset)) then
if (reset) f(:,:,:) = 0.0
end if
do j3 = l3n, l3p
i3i = max(1,1+j3) ; i3f = min(n3,n3+j3)
do j2 = l2n, l2p
i2i = max(1,1+j2) ; i2f = min(n2,n2+j2)
do i3 = i3i, i3f
do i2 = i2i, i2f
do j1 = l1n, l1p
i1i = max(1,1+j1) ; i1f = min(n1,n1+j1)
f(j1,j2,j3) = f(j1,j2,j3) + sdot(i1f-i1i+1,y(i1i,i2,i3),1,x(i1i-j1,i2-j2,i3-j3),1)
end do
end do
end do
end do
end do
end if
end subroutine cc_sconv3d
end module convcorr_m
include "mkl_vsl.fi"
program convcorr
use iso_fortran_env
use convcorr_m
use mkl_vsl
implicit none
integer, parameter :: n = 100
real, allocatable :: x(:,:,:), y(:,:,:), f(:,:,:)
integer :: lop, convmode, corrmode, stat, imode, npass
type(vsl_conv_task) :: taskonv
type(vsl_corr_task) :: taskorr
integer(int64) :: tic, toc
real(real64) :: rate
allocate( x(n,n,n), y(n,n,n) )
write(*,"(4X,4A24)") "---- LOOPS -----", "-- VSL_DIRECT --", "--- VSL_FFT ----", "--- VSL_AUTO ---"
write(*,"(A4,8A12)") "LOP", "CONV", "CORR", "CONV", "CORR", "CONV", "CORR", "CONV", "CORR"
do lop = 1, 25, 1
write(*,"(4I4)",advance='no') lop
allocate( f(lop,lop,lop) )
call random_number(x); x = x - 0.5
call random_number(f); f = f - 0.5
call system_clock(tic,rate); npass = 0
do
npass = npass+1
call cc_sconv3d(x,y,n,n,n,f,-lop/2,(lop-1)/2,-lop/2,(lop-1)/2,-lop/2,(lop-1)/2,reset=.true.)
call system_clock(toc,rate)
if ((toc-tic)/rate >= 1.0) exit
end do
write(*,"(F12.6)",advance='no') ((toc-tic)/rate)/npass
call system_clock(tic,rate); npass = 0
do
npass = npass+1
call cc_sconv3d(x,y,n,n,n,f,-lop/2,(lop-1)/2,-lop/2,(lop-1)/2,-lop/2,(lop-1)/2,cc='CORR',reset=.true.)
call system_clock(toc,rate)
if ((toc-tic)/rate >= 1.0) exit
end do
write(*,"(F12.6)",advance='no') ((toc-tic)/rate)/npass
VSLMODE: do imode = 1, 3
if (imode == 1) then
convmode = VSL_CONV_MODE_DIRECT
corrmode = VSL_CORR_MODE_DIRECT
else if (imode == 2) then
convmode = VSL_CONV_MODE_FFT
corrmode = VSL_CORR_MODE_FFT
else if (imode == 3) then
convmode = VSL_CONV_MODE_AUTO
corrmode = VSL_CORR_MODE_AUTO
end if
call random_number(x); x = x - 0.5
call random_number(f); f = f - 0.5
stat = vslsconvnewtask(taskonv, convmode, 3, [n,n,n], [lop,lop,lop], [n,n,n])
stat = vslconvsetstart(taskonv,[lop/2,lop/2,lop/2])
stat = vslconvsetinternalprecision(taskonv, VSL_CONV_PRECISION_SINGLE)
stat = vslscorrnewtask(taskorr, corrmode, 3, [n,n,n], [n,n,n], [lop,lop,lop])
stat = vslcorrsetstart(taskorr,[-lop/2,-lop/2,-lop/2])
stat = vslcorrsetinternalprecision(taskorr, VSL_CORR_PRECISION_SINGLE)
call system_clock(tic,rate); npass = 0
do
npass = npass+1
stat = vslsconvexec(taskonv,x,[1,n,n*n],f,[1,lop,lop*lop],y,[1,n,n*n])
call system_clock(toc,rate)
if ((toc-tic)/rate >= 1.0) exit
end do
write(*,"(F12.6)",advance='no') ((toc-tic)/rate)/npass
call system_clock(tic,rate); npass = 0
do
npass = npass+1
stat = vslscorrexec(taskorr,x,[1,n,n*n],y,[1,n,n*n],f,[1,lop,lop*lop])
call system_clock(toc,rate)
if ((toc-tic)/rate >= 1.0) exit
end do
write(*,"(F12.6)",advance='no') ((toc-tic)/rate)/npass
stat = vslconvdeletetask(taskonv)
stat = vslcorrdeletetask(taskorr)
end do VSLMODE
deallocate( f )
write(*,*)
end do
end program