I think the main reason for the poor performance is what @jbdv said. Changing the memory access to be column major in the Fortran implementation yields a notable difference
mat size: 500 timigs (col-major, row-major) 6.0472999999999999E-002 8.3461999999999995E-002
mat size: 1000 timigs (col-major, row-major) 0.17684999999999998 2.1223450000000001
mat size: 1500 timigs (col-major, row-major) 0.77685899999999997 11.889548000000000
mat size: 2000 timigs (col-major, row-major) 2.0247399999999995 31.665683999999999
mat size: 2500 timigs (col-major, row-major) 4.1311760000000035 70.841338999999991
mat size: 3000 timigs (col-major, row-major) 7.3490720000000067 121.72819000000001
mat size: 3500 timigs (col-major, row-major) 12.796781000000010 210.18912300000000
mat size: 4000 timigs (col-major, row-major) 17.554618000000005 337.45606700000002

Source code
module gauss_m
private
public :: gauss, gauss_rm
integer, parameter :: dp = kind(1.0d0)
contains
subroutine gauss(a, b, x)
real(dp), intent(inout) :: a(:, :), b(:), x(:)
integer :: i, k, n
real(dp), allocatable :: tmp_a(:)
real(dp) :: tmp_b
n = size(a, 1)
do i=1, n
! Check if has to swap
if (a(i, i) == 0.0) then
do k=i+1, n
! Swap values and exit loop
if (a(i, k) /= 0.0) then
tmp_a = a(:, k); tmp_b = b(k)
a(:, k) = a(:, i); b(k) = b(i)
a(:, i) = tmp_a; b(i) = tmp_b
exit
end if
end do
end if
! Eliminate items in row below
do k=i+1, n
b(k) = b(k) - (b(i) / a(i, i) * a(k, i))
a(:, k) = a(:, k) - a(:, i) / a(i, i) * a(i, k)
end do
end do
! Solve
x = 0.0
do i=n, 1, -1
b(i) = b(i) - dot_product(a(:, i), x)
x(i) = b(i) / a(i, i)
end do
end subroutine gauss
subroutine gauss_rm(a, b, x)
!! row major implementation, do not use for actual solves
real(dp), intent(inout) :: a(:, :), b(:), x(:)
integer :: i, k, n
real(dp), allocatable :: tmp_a(:)
real(dp) :: tmp_b
n = size(a, 1)
do i=1, n
! Check if has to swap
if (a(i, i) == 0.0) then
do k=i+1, n
! Swap values and exit loop
if (a(k, i) /= 0.0) then
tmp_a = a(k, :); tmp_b = b(k)
a(k, :) = a(i, :); b(k) = b(i)
a(i, :) = tmp_a; b(i) = tmp_b
exit
end if
end do
end if
! Eliminate items in row below
do k=i+1, n
b(k) = b(k) - (b(i) / a(i, i) * a(k, i))
a(k, :) = a(k, :) - a(i, :) / a(i, i) * a(k, i)
end do
end do
! Solve
x = 0.0
do i=n, 1, -1
b(i) = b(i) - dot_product(a(i, :), x)
x(i) = b(i) / a(i, i)
end do
end subroutine gauss_rm
end module gauss_m
program main
use gauss_m
implicit none
integer, parameter :: dp = kind(1.0d0)
real(dp), allocatable :: a(:,:), b(:), x(:), a_rm(:,:), b_rm(:), x_rm(:)
real(dp), allocatable :: timings(:,:)
integer :: n, n_min, n_max, n_int
n_min = 500; n_max = 5000; n_int = 500;
allocate(timings(((n_max - n_min)/ n_int) + 1, 2))
do n = n_min, n_max, n_int
allocate(a(n, n), b(n), x(n))
call fill(a, b)
allocate(a_rm, source=a); allocate(b_rm, source=b); allocate(x_rm, source=x)
timings(n/n_int, 1) = timeit(a, b, x, gauss)
timings(n/n_int, 2) = timeit(a_rm, b_rm, x_rm, gauss_rm)
print*, "mat size:", n, "timigs (col-major, row-major):", timings(n/n_int, :), "rel L2 norm: ", norm2(x_rm- x)/ norm2(x)
deallocate(a, b, x, a_rm, b_rm, x_rm)
end do
print*, " TIMINGS "
print*, repeat("-", 80)
print*, "column major: ", timings(:, 1)
print*, "row major: ", timings(:, 2)
print*, repeat("-", 80)
contains
subroutine fill(a, b)
real(dp), intent(out) :: a(:,:), b(:)
integer :: i, j
do j = 1, size(a,2)
do i = 1, size(a,1)
a(i,j) = 1.0_dp/real(i + j - 1,dp)
end do
end do
do i = 1, size(b)
b(i) = 1.0_dp/real(i + 1,dp)
end do
end subroutine
function timeit(a, b, x, sub) result(time)
real(dp), intent(inout) :: a(:,:), b(:), x(:)
interface
subroutine sub(a, b, x)
integer, parameter :: dp = kind(1.0d0)
real(dp), intent(inout) :: a(:,:), b(:), x(:)
end subroutine sub
end interface
real(dp) :: time
real(dp) :: starttime
call cpu_time(starttime)
call sub(a, b, x)
call cpu_time(time)
time = time - starttime
end function timeit
end program main
PS thanks @ivanpribec for the fill routine I was too lazy to write one