Normal random number generator

I slightly modified your rnorm subroutine and also wrote a alternative version using the associate statement. On my machine, the version with the associate statement takes about 5.7s and is about 1 second faster than the version without associate ( edited to add: for N=10^8).

random_normal.f90
program main
    use iso_fortran_env, only: rwp=>real64
    implicit none
    integer N
    real(rwp), allocatable :: z(:)
    real(rwp):: t1, t2, mean, std_dev

    N=1+10**8
    allocate(z(N))

    call cpu_time(t1)
    call standard_normal(Z,0d0,1d0)
    call cpu_time(t2)

    mean=sum(Z)/N
    std_dev=sqrt(sum((Z-mean)**2)/N)

    write(*,*)"    M2:"
    write(*,'(A,x,I12)')   "size    : ", size(z)
    write(*,'(A,x,F16.8)') "Time(s) : ", t2-t1
    write(*,'(A,x,F16.8)') "mean    : ", mean
    write(*,'(A,x,F16.8)') "std-dev : ", std_dev

    call cpu_time(t1)
    call rnorm(Z,0d0,1d0)
    call cpu_time(t2)
    
    mean=sum(Z)/N
    std_dev=sqrt(sum((Z-mean)**2)/N)

    write(*,*)"    M2:"
    write(*,'(A,x,I12)')   "size    : ", size(z)
    write(*,'(A,x,F16.8)') "Time(s) : ", t2-t1
    write(*,'(A,x,F16.8)') "mean    : ", mean
    write(*,'(A,x,F16.8)') "std-dev : ", std_dev

    contains
    
    subroutine standard_normal(z, mu, sigma)
        use iso_fortran_env, only: rwp=>real64, iwp=>int64
        implicit none
        real(rwp), optional :: mu, sigma
        real(rwp) :: z(:)
        real(rwp), parameter   :: PI=3.14159265358979323846
        real(rwp), allocatable :: u(:)
        !get size of Z
        integer(iwp) N, n1

        N=size(z,kind=iwp)
        n1=N + modulo(N,2)
        
        allocate(u(n1))
        call random_number(u)
        u = 1d0 - u
        associate (u1=>sqrt(-2d0*log(u(1:n1/2))),u2=>2d0*PI*u(n1/2+1:n1))
            z=[u1*cos(u2), u1(1:N/2)*sin(u2(1:N/2))]
        endassociate

        if(present(sigma)) z=z*sigma
        if(present(mu)) z=z+mu

    endsubroutine standard_normal
        
    subroutine rnorm(x, mu, sigma)
        implicit none
        integer(8) :: n,n1
        real(8), parameter   :: PI=3.14159265358979323846
        real(8) :: x(:), mu, sigma
        real(8), allocatable :: u1(:), u2(:)

        n = size(x,kind=8)
        n1 = n + modulo(n, 2)

        allocate(u1(n1/2), u2(n1/2))
        call random_number(u1)
        call random_number(u2)
        u1 = 1d0 - u1
        u2 = 1d0 - u2
        x = mu + sigma*[sqrt(-2d0*log(u1))*cos(2d0*pi*u2), sqrt(-2d0*log(u1(1:n/2)))*sin(2d0*pi*u2(1:n/2))]
    end subroutine rnorm

end program main

Output
     M1:
size    :  100000001
Time(s) :        5.71875000
mean    :       -0.00002553
std-dev :        0.99999518
     M2:
size    :  100000001
Time(s) :        6.71875000
mean    :        0.00003767
std-dev :        1.00007775

(another edit to fix std_dev calc)

Another method:
From root_cern

"[…] Uses the Acceptance-complement ratio from W. Hoermann and G. Derflinger This is one of the fastest existing method for generating normal random variables. It is a factor 2/3 faster than the polar (Box-Muller) method used in the previous version of TRandom::Gaus. The speed is comparable to the Ziggurat method (from Marsaglia) implemented for example in GSL and available in the MathMore library.[…]
"
REFERENCE: - W. Hoermann and G. Derflinger (1990): The ACR Method for generating normal random variables, OR Spektrum 12 (1990), 181-185.