Why solving two differential equations is 8 times slower than solving one?

compiler flag:

/nologo /debug:full /O3 /QxHost /assume:buffered_io /heap-arrays0 /Qipo /debug-parameters:all /module:“x64\Release\” /object:“x64\Release\” /Fd"x64\Release\vc150.pdb" /traceback /libs:static /threads /Qmkl:cluster /c

linker flag:

/OUT:“x64\Release\stochastic_RK.exe” /INCREMENTAL:NO /NOLOGO /MANIFEST /MANIFESTFILE:“x64\Release\stochastic_RK.exe.intermediate.manifest” /MANIFESTUAC:“level=‘asInvoker’ uiAccess=‘false’” /DEBUG /PDB:“D:\Works CHLA\stochastic_RK\stochastic_RK\stochastic_RK\x64\Release\stochastic_RK.pdb” /SUBSYSTEM:CONSOLE /LARGEADDRESSAWARE /IMPLIB:“D:\Works CHLA\stochastic_RK\stochastic_RK\stochastic_RK\x64\Release\stochastic_RK.lib”

Basically it is just -O3 -QxHost -heaparray
Other flags are not important.

I solve two equations, and solve one equations, I am using the same flags.

I simplified my cases to the following,

I solve one equation dy/dt = 1

I solve 100 the same equations, i from 1 to 100, dy_i/dt =1

Then I should expect it took 100 times longer than solving just one equation, right?

In my code I did things like that and I expect solving 2 the same equations should cost just 2 times than solving one such equation. But I got a factor of 7 - 8 or so.

So I think perhaps there are some unnecessary memory allocation or deallocation, or some temp arrays are created. But I have not identify where.

Do you think the following code may generate temp arrays for x(:,i-1) and x(:,i)?

do i = 1, n
t = t0 + (real(i)*(tn-t0))/real(n)
call rk4_ti_step_vec_mod1 ( x(:,i-1), t, h, q, fi_vec2, gi_vec2, nd, x(:,i) )
end do 

it is in the subroutine,

subroutine test02 ( xout )
use constants
use stochastic_rk
implicit none
interface				   !! Declare function        
function fi_vec2 ( x ) result (f)
use constants
real ( kind = r8 ) :: f(2)
real ( kind = r8 ) :: x(2)
end function
function gi_vec2 ( x ) result (g)
use constants
real ( kind = r8 ) :: g(2)
real ( kind = r8 ) :: x(2)
end function      
end interface 
real ( kind = r8 ), external :: fi_vec21, fi_vec22, gi_vec21, gi_vec22
integer ( kind = i8 ), parameter :: n = 1000
real ( kind = r8 ) h
integer ( kind = i8 ) :: i, itot, istart, istep
real ( kind = r8 ) q
integer ( kind = i8 ) :: seed
real ( kind = r8 ) t
real ( kind = r8 ), parameter :: t0 = 0.0
real ( kind = r8 ), parameter :: tn = 1.0
real ( kind = r8 ) :: x(2,0:n)
real (kind = r8) :: xout(5)
integer ( kind = i8 ) :: nd ! dimension, =2

h = ( tn - t0 ) / real(n)
q = 1.0
seed = 123456789

i = 0
t = t0
x(1,i) = 20.0_r8
x(2,i) = one
nd = 2

!write ( *, '(a)' ) ' '
!write ( *, '(a)' ) '         I           T             X'
!write ( *, '(a)' ) ' '
!write ( *, '(2x,i8,2x,f14.6,2x,g14.6)' ) i, t, x(i)

do i = 1, n
t = t0 + (real(i)*(tn-t0))/real(n)
call rk4_ti_step_vec_mod1 ( x(:,i-1), t, h, q, fi_vec2, gi_vec2, nd, x(:,i) )
end do

itot = i-1
istart = itot/5
istep = istart
xout(1:5:1) = x(1,istart:itot:istep)
!write ( *, '(2x,i8,2x,f14.6,2x,g14.6)' ) i-1, t, xout
return
end subroutine test02   

with functions,

function fi_vec2 ( x ) result (f)
use constants
implicit none
real ( kind = r8 ) :: f(2)
real ( kind = r8 ), intent(in) :: x(2)
  f(1) = -x(2)*x(1) 
  f(2) = -x(2) + one
return
end function fi_vec2
function gi_vec2 ( x ) result(g)
use constants
implicit none
real ( kind = r8 ), parameter :: gp(2) = [one,zero] 
real ( kind = r8 ), intent(in) :: x(2)
real ( kind = r8 ) :: g(2)
g=gp
return
end function gi_vec2