The standard deviation of x(:)
is commonly computed as sqrt(sum((x-mean(x))**2)/(n-1))
where n = size(x)
, but Wikipedia says that instead of (n - 1)
, (n - 1.5)
or (n - 1.5 + 1/(8*(n-1.0)))
are more accurate. I have simulated this with random normal deviates for n = 100
but found that the usual (n - 1)
formula (method 2 below) works best. For the function
pure function sd(x,imethod) result(xsd)
! formulas to compute standard deviation from Wikipedia https://en.wikipedia.org/wiki/Standard_deviation
real(kind=dp), intent(in) :: x(:)
integer , intent(in) :: imethod
real(kind=dp) :: xsd
real(kind=dp) :: xmean,ssd
integer :: n
n = size(x)
if (n < 2) then
xsd = -1.0_dp
return
end if
xmean = mean(x)
ssd = sum((x-xmean)**2)
select case (imethod)
case (1) ; xsd = sqrt(ssd/n)
case (2) ; xsd = sqrt(ssd/(n - 1))
case (3) ; xsd = sqrt(ssd/(n - 1.5_dp))
case (4) ; xsd = sqrt(ssd/(n - 1.5_dp + 1.0_dp/(8*n-8)))
case default; xsd = -2.0_dp
end select
end function sd
with
main program
program xstdev
use kind_mod , only: dp
use ziggurat_mod, only: zigset, rnor
use stdev_mod , only: mean,sd
implicit none
integer , parameter :: ntotal = 10**7, ngroups = ntotal/100, n = ntotal/ngroups, &
nmethods = 4, niter = 10
real(kind=dp), parameter :: true_sd = 1.0_dp
logical , parameter :: print_each = .false.
real(kind=dp) :: x(ntotal),xsd(ngroups,nmethods),avg_err(niter,nmethods)
integer :: i,i1,i2,imethod,iter
print*,"#total, #groups, group_size =",ntotal,ngroups,n
call zigset(123456)
do iter=1,niter
x = rnor(ntotal)
x = x - mean(x) ! set mean to zero
x = x/sqrt(mean(x**2)) ! set variance to one
i1 = 1
do i=1,ngroups
i2 = i1 + n - 1
forall (imethod=1:nmethods) xsd(i,imethod) = sd(x(i1:i2),imethod) ! compute sd for subset using various methods
i1 = i1 + n
end do
if (print_each) write (*,"(/,2a10)") "method","avg_error"
do imethod=1,nmethods
avg_err(iter,imethod) = mean(abs(xsd(:,imethod)-true_sd)) ! average absolute error of sd estimates for subsets
if (print_each) write (*,"(i10,f10.6)") imethod,avg_err(iter,imethod)
end do
end do
write (*,"(/,'OVERALL',/,2a10)") "method","avg_error"
do imethod=1,nmethods
write (*,"(i10,f10.6)") imethod,mean(avg_err(:,imethod))
end do
end program xstdev
the average absolute error in the estimate of standard deviation is
OVERALL
method avg_error
1 0.056745
2 0.056695
3 0.056778
4 0.056778
The full program is at GitHub. Have I coded the estimators of standard deviation incorrectly, or is the design of the simulation incorrect?
Edit: With
revised main program
program xstdev
use kind_mod , only: dp
use ziggurat_mod, only: zigset, rnor
use stdev_mod , only: mean, rms, sd
implicit none
integer , parameter :: ntotal = 10**7, ngroups = ntotal/100, n = ntotal/ngroups, &
nmethods = 4, niter = 10
real(kind=dp), parameter :: true_sd = 1.0_dp
logical , parameter :: print_each = .false., standardize = .true.
real(kind=dp) :: x(ntotal),xsd(ngroups,nmethods),avg_sd(niter,nmethods),avg_err(niter,nmethods),rmse(niter,nmethods)
integer :: i,i1,i2,imethod,iter
print*,"#total, #groups, group_size =",ntotal,ngroups,n
call zigset(123456)
do iter=1,niter
x = rnor(ntotal)
if (standardize) then
x = x - mean(x) ! set mean to zero
x = x/sqrt(mean(x**2)) ! set variance to one
end if
i1 = 1
do i=1,ngroups
i2 = i1 + n - 1
forall (imethod=1:nmethods) xsd(i,imethod) = sd(x(i1:i2),imethod) ! compute sd for subset using various methods
i1 = i1 + n
end do
if (print_each) write (*,"(/,*(a14))") "method","avg_sd","rmse","avg_error"
do imethod=1,nmethods
avg_sd(iter,imethod) = mean(xsd(:,imethod))
avg_err(iter,imethod) = mean(abs(xsd(:,imethod)-true_sd)) ! average absolute error of sd estimates for subsets
rmse(iter,imethod) = rms(xsd(:,imethod)-true_sd)
if (print_each) write (*,"(i14,*(f14.10))") imethod,avg_sd(iter,imethod),rmse(iter,imethod),avg_err(iter,imethod)
end do
end do
write (*,"(/,'OVERALL',/,4a14)") "method","avg_sd","rmse","avg_error"
do imethod=1,nmethods
write (*,"(i14,*(f14.10))") imethod,mean(avg_sd(:,imethod)),mean(rmse(:,imethod)),mean(avg_err(:,imethod))
end do
end program xstdev
I do find that method 3 gives the least biased estimate of the standard deviation, although estimators in 1 and 2 still have a lower RMSE:
#total, #groups, group_size = 10000000 100000 100
OVERALL
method avg_sd rmse avg_error
1 0.9924683782 0.0002245609 0.0567451915
2 0.9974682505 0.0002245620 0.0566951266
3 0.9999966912 0.0002249881 0.0567783850
4 0.9999902820 0.0002249867 0.0567780829
Since method 2 with the (n-1)
denominator is common, I think I will continue to use it.