Fast_math: A collection of functions for fast number crunching using Fortran

Nice work @nshaffer !! So you motivated me to push a bit more and reworked the test and found interesting results:

code + testing
module fast_rsqrt
    use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
    implicit none
    private
    
    public :: frsqrt

    interface frsqrt
        !! Retranscript of the original Quake III Arena, see https://en.wikipedia.org/wiki/Fast_inverse_square_root
        !! for pure reference
        module procedure frsqrt_dp
        module procedure frsqrt_sp
    end interface
    
contains

    elemental function frsqrt_sp(x) result(y)
        integer, parameter :: wp = sp
        real(wp), intent(in) :: x
        real(wp) :: y
        !-- Internal Variables
        real(wp)    :: x2,y2
        integer(wp) :: i
        integer(wp), parameter :: magic = int(Z'5f3759df',kind=wp)
        !-------------------------------------------------
        x2 = 0.5_wp*x
        i  = transfer(x,i)
        i  = magic - shiftr(i,1)
        y2 = transfer(i,y)

        ! Perform one Newton-Raphson step
        y  = y2*(1.5_wp-x2*y2*y2)

    end function frsqrt_sp

    elemental function frsqrt_dp(x) result(y)
        integer, parameter :: wp = dp
        real(wp), intent(in) :: x
        real(wp) :: y
        !-- Internal Variables
        real(wp)    :: x2,y2
        integer(wp) :: i
        integer(wp), parameter :: magic = 6910469410427058089_wp
        !-------------------------------------------------
        x2 = 0.5_wp*x
        i  = transfer(x,i)
        i  = magic - shiftr(i,1)
        y2 = transfer(i,y)

        ! Perform one Newton-Raphson step
        y  = y2*(1.5_wp-x2*y2*y2)

    end function frsqrt_dp
    
end module fast_rsqrt

program main
    use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
    use fast_rsqrt
    !> Internal parameters and variables
    integer, parameter :: n = 1e5, ncalc = 2, niter = 10000
    integer :: iter, i
    real(dp) :: time(0:ncalc), times_tot(ncalc), err
    1 format(a10,': <time> = ',f9.4,' ns/eval, speed-up=',f5.2,'X, rel. error=',es16.4)
    !====================================================================================
    print *,""
    print *,"================================================================"
    print *," Fast rsqrt = 1/sqrt(x)"
    block
        integer, parameter :: wp=sp
        real(wp), allocatable :: x(:) , y(:), yref(:)
        real(kind=wp) :: tolerance = 1e-2_wp
        !> define a log space
        allocate( x(n) , y(n), yref(n) )
        call random_number(x)
        x = 10._wp**(-10*(1._wp-x) + 10*x)
        times_tot(:) = 0
        do iter=1,niter
            call cpu_time(time(0))
            yref = 1._wp/sqrt(x); call cpu_time(time(1))
            y = frsqrt(x)  ; call cpu_time(time(2))
            times_tot(:) = times_tot(:) + time(1:ncalc) - time(0:ncalc-1)
        end do
        err = sqrt( sum(( y - yref )**2) ) / sqrt( sum(( yref )**2) )
        print *, "time default: ",times_tot(1),"time frsqrt: ",times_tot(2)
        print 1, "frsqrt r32" , 1e9*times_tot(2)/(niter*n), times_tot(1)/times_tot(2), err
    end block
    block
        integer, parameter :: wp=dp
        real(wp), allocatable :: x(:) , y(:), yref(:)
        real(kind=wp) :: tolerance = 1e-2_wp
        !> define a log space
        allocate( x(n) , y(n), yref(n) )
        call random_number(x)
        x = 10._wp**(-200*(1._wp-x) + 200*x)
        times_tot(:) = 0
        do iter=1,niter
            call cpu_time(time(0))
            yref = 1._wp/sqrt(x); call cpu_time(time(1))
            y = frsqrt(x)  ; call cpu_time(time(2))
            times_tot(:) = times_tot(:) + time(1:ncalc) - time(0:ncalc-1)
        end do
        err = sqrt( sum(( y - yref )**2) ) / sqrt( sum(( yref )**2) )
        print *, "time default: ",times_tot(1),"time frsqrt: ",times_tot(2)
        print 1, "frsqrt r64" , 1e9*times_tot(2)/(niter*n), times_tot(1)/times_tot(2), err
    end block

end program

Results from compiler explorer

ifort 19 -O3 -xHost

 time default:   0.112649999999986      time frsqrt:   0.111316999999994     
frsqrt r32: <time> =    0.1113 ns/eval, speed-up= 1.01X, rel. error=      1.1156E-03
 time default:    1.49099300000007      time frsqrt:   0.470805999999973     
frsqrt r64: <time> =    0.4708 ns/eval, speed-up= 3.17X, rel. error=      1.0753E-03

ifx2023 -O3 -xHost

 time default:   0.322242999999995      time frsqrt:   0.317305999999985     
frsqrt r32: <time> =    0.3173 ns/eval, speed-up= 1.02X, rel. error=      1.1156E-03
 time default:    2.53335500000001      time frsqrt:   0.726347999999955     
frsqrt r64: <time> =    0.7263 ns/eval, speed-up= 3.49X, rel. error=      1.0753E-03

gfortran 13.2 -Ofast -march=native

 time default:   0.26977699999999727      time frsqrt:   0.28963799999999806     
frsqrt r32: <time> =    0.2896 ns/eval, speed-up= 0.93X, rel. error=      1.1054E-03
 time default:    1.8799389999999838      time frsqrt:   0.69013799999999925     
frsqrt r64: <time> =    0.6901 ns/eval, speed-up= 2.72X, rel. error=      1.0654E-03

gfortran 13.2 -O3 -march=native

time default:   0.30656400000004769      time frsqrt:   0.13649199999999095     
frsqrt r32: <time> =    0.1365 ns/eval, speed-up= 2.25X, rel. error=      1.1029E-03
 time default:   0.94306099999994464      time frsqrt:   0.27456500000000195     
frsqrt r64: <time> =    0.2746 ns/eval, speed-up= 3.43X, rel. error=      1.0285E-03

same tendency with my local machine

So basically:

The double precision frsqrt version is consistently >2X faster

For the single precision the story is different, with ifort or ifx, they already provide the SSE instructions with -O3

ifort

        vrsqrtps  ymm5, ymm2 

ifx:

        vrsqrt14ps      ymm1, ymm0

for gfortran one has to compile with -Ofast to also get the fastest intrinsic, otherwise with -O3, frsqrt is still faster

gfortran with -Ofast

        vrsqrtps        ymm1, YMMWORD PTR [rdi+rax]

I guess it is instructive to have it there if only just as fun experiment and for learning :slight_smile: