Nice work @nshaffer !! So you motivated me to push a bit more and reworked the test and found interesting results:

##
code + testing

```
module fast_rsqrt
use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
implicit none
private
public :: frsqrt
interface frsqrt
!! Retranscript of the original Quake III Arena, see https://en.wikipedia.org/wiki/Fast_inverse_square_root
!! for pure reference
module procedure frsqrt_dp
module procedure frsqrt_sp
end interface
contains
elemental function frsqrt_sp(x) result(y)
integer, parameter :: wp = sp
real(wp), intent(in) :: x
real(wp) :: y
!-- Internal Variables
real(wp) :: x2,y2
integer(wp) :: i
integer(wp), parameter :: magic = int(Z'5f3759df',kind=wp)
!-------------------------------------------------
x2 = 0.5_wp*x
i = transfer(x,i)
i = magic - shiftr(i,1)
y2 = transfer(i,y)
! Perform one Newton-Raphson step
y = y2*(1.5_wp-x2*y2*y2)
end function frsqrt_sp
elemental function frsqrt_dp(x) result(y)
integer, parameter :: wp = dp
real(wp), intent(in) :: x
real(wp) :: y
!-- Internal Variables
real(wp) :: x2,y2
integer(wp) :: i
integer(wp), parameter :: magic = 6910469410427058089_wp
!-------------------------------------------------
x2 = 0.5_wp*x
i = transfer(x,i)
i = magic - shiftr(i,1)
y2 = transfer(i,y)
! Perform one Newton-Raphson step
y = y2*(1.5_wp-x2*y2*y2)
end function frsqrt_dp
end module fast_rsqrt
program main
use, intrinsic :: iso_fortran_env, only: sp=>real32, dp=>real64
use fast_rsqrt
!> Internal parameters and variables
integer, parameter :: n = 1e5, ncalc = 2, niter = 10000
integer :: iter, i
real(dp) :: time(0:ncalc), times_tot(ncalc), err
1 format(a10,': <time> = ',f9.4,' ns/eval, speed-up=',f5.2,'X, rel. error=',es16.4)
!====================================================================================
print *,""
print *,"================================================================"
print *," Fast rsqrt = 1/sqrt(x)"
block
integer, parameter :: wp=sp
real(wp), allocatable :: x(:) , y(:), yref(:)
real(kind=wp) :: tolerance = 1e-2_wp
!> define a log space
allocate( x(n) , y(n), yref(n) )
call random_number(x)
x = 10._wp**(-10*(1._wp-x) + 10*x)
times_tot(:) = 0
do iter=1,niter
call cpu_time(time(0))
yref = 1._wp/sqrt(x); call cpu_time(time(1))
y = frsqrt(x) ; call cpu_time(time(2))
times_tot(:) = times_tot(:) + time(1:ncalc) - time(0:ncalc-1)
end do
err = sqrt( sum(( y - yref )**2) ) / sqrt( sum(( yref )**2) )
print *, "time default: ",times_tot(1),"time frsqrt: ",times_tot(2)
print 1, "frsqrt r32" , 1e9*times_tot(2)/(niter*n), times_tot(1)/times_tot(2), err
end block
block
integer, parameter :: wp=dp
real(wp), allocatable :: x(:) , y(:), yref(:)
real(kind=wp) :: tolerance = 1e-2_wp
!> define a log space
allocate( x(n) , y(n), yref(n) )
call random_number(x)
x = 10._wp**(-200*(1._wp-x) + 200*x)
times_tot(:) = 0
do iter=1,niter
call cpu_time(time(0))
yref = 1._wp/sqrt(x); call cpu_time(time(1))
y = frsqrt(x) ; call cpu_time(time(2))
times_tot(:) = times_tot(:) + time(1:ncalc) - time(0:ncalc-1)
end do
err = sqrt( sum(( y - yref )**2) ) / sqrt( sum(( yref )**2) )
print *, "time default: ",times_tot(1),"time frsqrt: ",times_tot(2)
print 1, "frsqrt r64" , 1e9*times_tot(2)/(niter*n), times_tot(1)/times_tot(2), err
end block
end program
```

Results from compiler explorer

ifort 19 -O3 -xHost

```
time default: 0.112649999999986 time frsqrt: 0.111316999999994
frsqrt r32: <time> = 0.1113 ns/eval, speed-up= 1.01X, rel. error= 1.1156E-03
time default: 1.49099300000007 time frsqrt: 0.470805999999973
frsqrt r64: <time> = 0.4708 ns/eval, speed-up= 3.17X, rel. error= 1.0753E-03
```

ifx2023 -O3 -xHost

```
time default: 0.322242999999995 time frsqrt: 0.317305999999985
frsqrt r32: <time> = 0.3173 ns/eval, speed-up= 1.02X, rel. error= 1.1156E-03
time default: 2.53335500000001 time frsqrt: 0.726347999999955
frsqrt r64: <time> = 0.7263 ns/eval, speed-up= 3.49X, rel. error= 1.0753E-03
```

gfortran 13.2 -Ofast -march=native

```
time default: 0.26977699999999727 time frsqrt: 0.28963799999999806
frsqrt r32: <time> = 0.2896 ns/eval, speed-up= 0.93X, rel. error= 1.1054E-03
time default: 1.8799389999999838 time frsqrt: 0.69013799999999925
frsqrt r64: <time> = 0.6901 ns/eval, speed-up= 2.72X, rel. error= 1.0654E-03
```

gfortran 13.2 -O3 -march=native

```
time default: 0.30656400000004769 time frsqrt: 0.13649199999999095
frsqrt r32: <time> = 0.1365 ns/eval, speed-up= 2.25X, rel. error= 1.1029E-03
time default: 0.94306099999994464 time frsqrt: 0.27456500000000195
frsqrt r64: <time> = 0.2746 ns/eval, speed-up= 3.43X, rel. error= 1.0285E-03
```

same tendency with my local machine

So basically:

The double precision frsqrt version is consistently >2X faster

For the single precision the story is different, with ifort or ifx, they already provide the SSE instructions with -O3

ifort

```
vrsqrtps ymm5, ymm2
```

ifx:

```
vrsqrt14ps ymm1, ymm0
```

for gfortran one has to compile with -Ofast to also get the fastest intrinsic, otherwise with -O3, frsqrt is still faster

gfortran with -Ofast

```
vrsqrtps ymm1, YMMWORD PTR [rdi+rax]
```

I guess it is instructive to have it there if only just as fun experiment and for learning