Method 5 is same as method 4 improved by factorizing the sigma division and 0.5d0 multiplication (and removing zscalar):
$ gfortran speed.f90 -mtune=generic -march=x86-64 -O2 && ./a.out
imethod, nll_sum = 1 176513032.418208
imethod, nll_sum = 2 176504717.977701
imethod, nll_sum = 3 176486226.004816
imethod, nll_sum = 4 176480747.481051
imethod, nll_sum = 5 176491733.413728
times (s) for methods: 12.0716 10.4207 9.1838 8.7447 8.1351
compiler_version: GCC version 10.2.0
compiler_options: -mtune=generic -march=x86-64 -O2 -fpre-include=/usr/include/finclude/math-vector-fortran.h
module negloglik_mod
implicit none
private
public :: negloglik
contains
subroutine negloglik(x, n, mu, sigma, nll, imethod)
integer , intent(in) :: n,imethod
double precision, intent(in) :: x(n), mu, sigma
double precision, intent(out) :: nll
double precision, parameter :: pi = 4*atan(1.0d0), sqrt_2_pi = sqrt(2.0d0*pi)
double precision :: z(n),ll,zscalar,log_sigma_sqrt_2_pi
integer :: i
if (imethod < 3) z = (x - mu)/sigma
if (imethod == 1) then
ll = 0.0d0
do i=1,n
ll = ll - 0.5*z(i)**2 - log(sigma*sqrt_2_pi)
end do
else if (imethod == 2) then ! avoid computing log(sigma*sqrt(2*pi)) in loop
ll = -n * log(sigma*sqrt_2_pi) - 0.5d0*sum(z**2)
else if (imethod == 3) then ! loop fusion
ll = 0.0d0
do i=1,n
zscalar = (x(i) - mu)/sigma
ll = ll - 0.5d0*zscalar**2 - log(sigma*sqrt_2_pi)
end do
else if (imethod == 4) then ! better loop fusion with log outside loop
ll = -n * log(sigma*sqrt_2_pi)
do i=1,n
zscalar = (x(i) - mu)/sigma
ll = ll - 0.5d0*zscalar**2
end do
else if (imethod == 5) then ! better with 0.5d0/sigma**2 outside loop
ll = 0.0d0
do i=1,n
ll = ll - ((x(i) - mu))**2
end do
ll = ll * 0.5d0 / (sigma**2)
ll = ll -n * log(sigma*sqrt_2_pi)
end if
nll = -ll
end subroutine negloglik
end module negloglik_mod
program xnegloglik
use negloglik_mod , only: negloglik
use iso_fortran_env, only: compiler_version,compiler_options
implicit none
integer, parameter :: dp = kind(1.0d0)
integer, parameter :: n = 1000000, niter = 1000, nmethods = 5
real(kind=dp) :: x(n),mu,sigma,nll,nll_sum,tt(0:nmethods)
integer :: iter,imethod
call cpu_time(tt(0))
do imethod=1,nmethods
nll_sum = 0.0_dp
do iter=1,niter
call random_number(x)
mu = sum(x)/n
sigma = sqrt(sum((x-mu)**2)/n)
call negloglik(x,n,mu,sigma,nll,imethod)
nll_sum = nll_sum + nll
end do
write (*,"('imethod, nll_sum = ',i0,1x,f0.6)") imethod,nll_sum
call cpu_time(tt(imethod))
end do
write (*,"('times (s) for methods:',100(1x,f0.4))") tt(1:) - tt(0:nmethods-1)
write (*,"('compiler_version: ',a)") trim(compiler_version())
write (*,"('compiler_options: ',a)") trim(compiler_options())
end program xnegloglik