That’s really strange, I find this really hard to believe and it’s contrary to our experience on Windows OS with IFORT.
On Windows using IFORT and its O2/O3 optimization, the teams I’ve worked with have noticed considerable improvement on the order of 20% or more with refactoring of FORTRAN 77 style code to just MODULEs hosting all the subroutines. With LAPACK style, I mean codebases that are FORTRAN 77 style one-.f file-per-external subroutine approach with no use of global or static data, there can be further performance gains. Then moving such code to MODULEs with hosted subroutines and perhaps due to the interprocedural optimization facility (Linux: -ipo) with `IFORT, performance sees a good improvement.
And I just tried a simple 1000x1000 Ax=B DGESV case and “ceteris paribus” on Windows: using IFORT with /O3 and /Qipo optimization, it’s showing >40% performance improvement.
Click to see the driver code
module runs_m
use iso_fortran_env, only : I8 => int64
use ieee_arithmetic, only : ieee_selected_real_kind
use lapack_m, only : DGESV !<-- This module hosts all LAPACK routines having `PURE` attribute
integer, parameter :: WP = ieee_selected_real_kind( p=15, radix=2 )
integer, parameter :: N = 1000
integer, parameter :: NRHS = 1000
real(WP), allocatable :: A_save(:,:)
real(WP), allocatable :: X(:,:)
real(WP), allocatable :: A(:,:)
real(WP), allocatable :: BX(:,:)
integer, allocatable :: ipvt(:)
contains
subroutine setup( astat )
integer, intent(out) :: astat
allocate( A_save(N,N), stat=astat )
if ( astat /= 0 ) return
allocate( A(N,N), stat=astat )
if ( astat /= 0 ) return
allocate( X(N,NRHS), stat=astat )
if ( astat /= 0 ) return
allocate( BX(N,NRHS), stat=astat )
if ( astat /= 0 ) return
allocate( ipvt(N), stat=astat )
if ( astat /= 0 ) return
call random_number( A_save )
call random_number( X )
end subroutine
impure elemental subroutine run( t )
! Argument list
real(WP), intent(inout) :: t
! Local variables
real(WP) :: t1, t2
integer :: info
integer :: lda, ldb
A = A_save
BX = matmul( A, X )
lda = N ; ldb = N
call my_cpu_time( t1 )
call DGESV( n , nrhs, a , lda , ipvt, bx, ldb , info )
call my_cpu_time( t2 )
t = t2 - t1
! assert the result
if ( (abs(BX(1,1)-X(1,1))>0.01_wp).or.(abs(BX(N,NRHS)-X(N,NRHS))>0.01_wp) ) then
t = huge(t)
end if
return
end subroutine
impure elemental subroutine my_cpu_time( time )
!.. Argument list
real(WP), intent(inout) :: time
!.. Local variables
integer(I8) :: tick
integer(I8) :: rate
call system_clock (tick, rate)
time = real(tick, kind=kind(time) ) / real(rate, kind=kind(time) )
return
end subroutine my_cpu_time
end module
! calling executive
use runs_m
integer, parameter :: ntrial = 12
real(WP) :: times(ntrial)
integer :: astat
call setup( astat )
if ( astat /= 0 ) stop "Setup failed."
call run( times )
print *, "CPU time (sec): ", (sum(times) - maxval(times) - minval(times))/(ntrial-2)
end
! LAPACK module: only a snippet is shown below
module lapack_m
contains
..
PURE SUBROUTINE dgesv( N, NRHS, A, LDA, IPIV, B, LDB, INFO )
*
* -- LAPACK driver routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
*
* Argument list
INTEGER, intent(in) :: N
INTEGER, intent(in) :: NRHS
INTEGER, intent(in) :: LDA
DOUBLE PRECISION, intent(inout) :: A( LDA, * )
INTEGER, intent(out) :: IPIV(*)
INTEGER, intent(in) :: LDB
DOUBLE PRECISION, intent(inout) :: B( LDB, * )
INTEGER, intent(out) :: INFO
...
Say with above, the CPU time is set as 1 sec, then the time with the case where DGESV is marked EXTERNAL and each of LAPACK subroutines are external procedures the time is ~1.41 sec. For reference, on one Windows 10 personal laptop with `Intel(R) Core™ i7-6820HQ @ 2.70GHz’ CPU, the measured CPU time of above shown code was about 0.6 seconds whereas it was 0.851 sec with the case where DGESV and its callees are external procedures.
I would thus encourage those working on stdlib to attempt stdlib_lapack, the biggest hurdle here might be what to with BSD license with LAPACK compared to MIT license with stdlib.