I made a small test case based on the my basic 1d convolution routine with explicit shapes. I made a version with assumed shapes, and then I built an OOP version (similar to @gronki) on top of that. So I’m testing 3 versions:
- explicit shape interface
- assumed shape interface
- OOP interface (with explicit shapes, so one can separate the overheads due to the assumed shapes from the ones due to OOP).
In the assumed version, I declared the array arguments as contiguous
, as not all compilers do test the contiguity at runtime to take advantage of it (it seems that gfortran doesn’t, since I have much better performances when contiguous
is specified).
The test consists in repeatedly performing a 1d convolution of a vector of size N with a kernel of size L. The vectors are the columns of a 2D array. The number of columns m is determined such that the total amount of floating point operations (O(N\space L\space m)) is constant.
With gfortran (14) -Ofast -march=native -fopenmp
(I am linking OpenMP just to get the omp_get_wtime()
function):
- there is a slight overhead of the assumed shape interface, which is observable only with very short vectors and kernels.
- the overhead is even pushed down by using the link time optimization
-flto
- the is a more important overhead of the OOP layer, even if it also gets non observable with longer vectors/kernels; the link-time optimization also helps, but not that much
With ifort (21) -Ofast -march=native -fopenmp
:
- there is no overhead of the assumed shape interface, even without using the the inter procedural optimization
-ipo
- the overhead of the OOP layer is similar to the one with gfortran
So… On the one hand, if one deals only with long enough vectors/kernels, none of the overheads really matters. On the other hand, the old style explicit shapes routine always ensures optimum performances whatever the vector/kernel lengths and whatever the compiler (OK, I’ve tested only 2 of them…)
The test outputs (the ifort test is running on a more powerful machine than the gfortran test, hence the overall better performances):
$ gfortran -Ofast -march=native -fopenmp conv.f90 testconv.f90 && ./a.out
N= 50 L= 3 m=66666 explicit shape: 5.04247 sec. 429.331238
N= 50 L= 3 m=66666 assumed shape: 5.67779 sec. 429.331238
N= 50 L= 3 m=66666 oop (explicit): 11.94931 sec. 429.331238
N= 100 L= 11 m= 9090 explicit shape: 3.50213 sec. 3415.54297
N= 100 L= 11 m= 9090 assumed shape: 3.26650 sec. 3415.54297
N= 100 L= 11 m= 9090 oop (explicit): 4.10882 sec. 3415.54297
N=1000 L= 11 m= 909 explicit shape: 2.50628 sec. 2853.36157
N=1000 L= 11 m= 909 assumed shape: 2.50305 sec. 2853.36157
N=1000 L= 11 m= 909 oop (explicit): 2.58195 sec. 2853.36157
N=1000 L=101 m= 99 explicit shape: 2.72735 sec. 28535.1992
N=1000 L=101 m= 99 assumed shape: 2.80414 sec. 28535.1992
N=1000 L=101 m= 99 oop (explicit): 2.72536 sec. 28535.1992
$
$ gfortran -Ofast -march=native -flto -fopenmp conv.f90 testconv.f90 && ./a.out
N= 50 L= 3 m=66666 explicit shape: 4.39111 sec. 1358.89661
N= 50 L= 3 m=66666 assumed shape: 4.77631 sec. 1358.89661
N= 50 L= 3 m=66666 oop (explicit): 10.60820 sec. 1358.89661
N= 100 L= 11 m= 9090 explicit shape: 2.96375 sec. 3414.63672
N= 100 L= 11 m= 9090 assumed shape: 2.98961 sec. 3414.63672
N= 100 L= 11 m= 9090 oop (explicit): 3.89261 sec. 3414.63672
N=1000 L= 11 m= 909 explicit shape: 2.49971 sec. 1322.84998
N=1000 L= 11 m= 909 assumed shape: 2.41724 sec. 1322.84998
N=1000 L= 11 m= 909 oop (explicit): 2.97342 sec. 1322.84998
N=1000 L=101 m= 99 explicit shape: 2.89645 sec. 22934.1426
N=1000 L=101 m= 99 assumed shape: 2.73086 sec. 22934.1426
N=1000 L=101 m= 99 oop (explicit): 2.80207 sec. 22934.1426
$ ifort -Ofast -march=native -fopenmp conv.f90 testconv.f90 && ./a.out
N= 50 L= 3 m=66666 explicit shape: 3.22394 sec. 543.7036
N= 50 L= 3 m=66666 assumed shape: 3.20795 sec. 543.7036
N= 50 L= 3 m=66666 oop (explicit): 7.10191 sec. 543.7036
N= 100 L= 11 m= 9090 explicit shape: 2.05390 sec. 1746.136
N= 100 L= 11 m= 9090 assumed shape: 2.32972 sec. 1746.136
N= 100 L= 11 m= 9090 oop (explicit): 2.76197 sec. 1746.136
N=1000 L= 11 m= 909 explicit shape: 2.11248 sec. 2002.490
N=1000 L= 11 m= 909 assumed shape: 2.15468 sec. 2002.490
N=1000 L= 11 m= 909 oop (explicit): 2.18171 sec. 2002.490
N=1000 L=101 m= 99 explicit shape: 1.32205 sec. 31236.11
N=1000 L=101 m= 99 assumed shape: 1.58577 sec. 31236.11
N=1000 L=101 m= 99 oop (explicit): 1.33595 sec. 31236.11
$
$ ifort -Ofast -march=native -ipo -fopenmp conv.f90 testconv.f90 && ./a.out
N= 50 L= 3 m=66666 explicit shape: 3.22681 sec. 543.7036
N= 50 L= 3 m=66666 assumed shape: 3.20301 sec. 543.7036
N= 50 L= 3 m=66666 oop (explicit): 6.16719 sec. 543.7036
N= 100 L= 11 m= 9090 explicit shape: 2.33493 sec. 1746.136
N= 100 L= 11 m= 9090 assumed shape: 2.11606 sec. 1746.136
N= 100 L= 11 m= 9090 oop (explicit): 2.71851 sec. 1746.136
N=1000 L= 11 m= 909 explicit shape: 2.16270 sec. 2002.490
N=1000 L= 11 m= 909 assumed shape: 2.15879 sec. 2002.490
N=1000 L= 11 m= 909 oop (explicit): 2.18736 sec. 2002.490
N=1000 L=101 m= 99 explicit shape: 1.57657 sec. 31236.11
N=1000 L=101 m= 99 assumed shape: 1.59752 sec. 31236.11
N=1000 L=101 m= 99 oop (explicit): 1.60242 sec. 31236.11
conv.f90
:
module phconv
implicit none
type conv1D_t
real, allocatable :: k(:)
contains
procedure :: set_kernel
procedure :: apply
procedure :: clear_kernel
end type
contains
subroutine set_kernel(this,k,kfirst,klast)
class(conv1D_t), intent(inout) :: this
integer, intent(in) :: kfirst, klast
real, intent(in) :: k(kfirst:klast)
allocate( this%k, source=k )
end subroutine
subroutine clear_kernel(this)
class(conv1D_t), intent(inout) :: this
deallocate( this%k )
end subroutine
subroutine apply(this,x,xfirst,xlast,y,yfirst,ylast)
class(conv1D_t), intent(inout) :: this
integer, intent(in) :: xfirst, xlast, yfirst, ylast
real, intent(in) :: x(xfirst:xlast)
real, intent(inout) :: y(yfirst:ylast)
call sconv1D_es( this%k, lbound(this%k,1), ubound(this%k,1) &
, x, xfirst, xlast &
, y, yfirst, ylast )
end subroutine
!*******************************************************************************
subroutine sconv1D_es &
(d,dfirst,dlast, &
e,efirst,elast, &
x,xfirst,xlast)
!*******************************************************************************
! x = x + d * e
!*******************************************************************************
implicit none
integer, intent(in) :: dfirst, dlast
integer, intent(in) :: efirst, elast
integer, intent(in) :: xfirst, xlast
real, intent(in) :: d(dfirst:dlast), e(efirst:elast)
real, intent(inout) :: x(xfirst:xlast)
integer :: id, ixmin, ixmax
do id = dfirst, dlast
ixmin = max( xfirst, efirst+id)
ixmax = min( xlast, elast +id)
x(ixmin:ixmax) = x(ixmin:ixmax) + d(id)*e(ixmin-id:ixmax-id)
end do
end subroutine sconv1D_es
!*******************************************************************************
subroutine sconv1D_as(d,dfirst,e,efirst,x,xfirst)
!*******************************************************************************
! x = x + d * e
!*******************************************************************************
implicit none
integer, intent(in) :: dfirst
integer, intent(in) :: efirst
integer, intent(in) :: xfirst
real, intent(in), contiguous :: d(dfirst:), e(efirst:)
real, intent(inout), contiguous :: x(xfirst:)
integer :: id, ixmin, ixmax
do id = dfirst, ubound(d,1)
ixmin = max( xfirst , efirst +id)
ixmax = min( ubound(x,1), ubound(e,1) +id)
x(ixmin:ixmax) = x(ixmin:ixmax) + d(id)*e(ixmin-id:ixmax-id)
end do
end subroutine sconv1D_as
end module
testconv.f90
:
program convtest
use omp_lib
use phconv
implicit none
real, allocatable :: f(:,:), x(:,:), y(:,:)
integer, parameter :: N(*) = [50, 100, 1000, 1000]
integer, parameter :: L(*) = [ 3, 11, 11, 101]
integer :: i, j, k, m, L2
double precision :: tic, toc
type(conv1D_t) :: ooconv
100 format (A,I4,X,A,I3,X,A,I5,3X,A,F10.5,X,A,3X,G0)
do i = 1, size(N)
ASSOCIATE( N => N(i), L=>L(i) )
L2 = L/2
m = 10**7 / (N*L)
allocate( x(N,m), y(N,m), f(0:L-1,m) )
y(:,:) = 0.0
call random_number(x)
call random_number(f)
tic = omp_get_wtime()
do k = 1, 1000
do j = 1, m
call sconv1D_es( f(:,j) , -L2, L2 &
, x(:,j) , 1, N &
, y(L2:N-L2,j), L2, N-L2 )
end do
end do
toc = omp_get_wtime()
print 100, "N=", N, "L=", L, "m=", m, "explicit shape:", toc-tic, "sec.", y(N/2,1)
y(:,:) = 0.0
tic = omp_get_wtime()
do k = 1, 1000
do j = 1, m
call sconv1D_as( f(:,j) , -L2 &
, x(:,j) , 1 &
, y(L2:N-L2,j), L2 )
end do
end do
toc = omp_get_wtime()
print 100, "N=", N, "L=", L, "m=", m, "assumed shape:", toc-tic, "sec.", y(N/2,1)
y(:,:) = 0.0
tic = omp_get_wtime()
do k = 1, 1000
do j = 1, m
call ooconv%set_kernel( f(:,j), -L2, L2 )
call ooconv%apply( x(:,j), 1, N, y(L2:N-L2,j), L2, N-L2 )
call ooconv%clear_kernel()
end do
end do
toc = omp_get_wtime()
print 100, "N=", N, "L=", L, "m=", m, "oop (explicit):", toc-tic, "sec.", y(N/2,1)
deallocate( x, y, f )
END ASSOCIATE
end do
end