Gauss Algorithm written in Fortran slower than written in Python for large matrices?

I think the main reason for the poor performance is what @jbdv said. Changing the memory access to be column major in the Fortran implementation yields a notable difference

 mat size:          500 timigs (col-major, row-major)   6.0472999999999999E-002   8.3461999999999995E-002
 mat size:         1000 timigs (col-major, row-major)  0.17684999999999998        2.1223450000000001     
 mat size:         1500 timigs (col-major, row-major)  0.77685899999999997        11.889548000000000     
 mat size:         2000 timigs (col-major, row-major)   2.0247399999999995        31.665683999999999     
 mat size:         2500 timigs (col-major, row-major)   4.1311760000000035        70.841338999999991     
 mat size:         3000 timigs (col-major, row-major)   7.3490720000000067        121.72819000000001     
 mat size:         3500 timigs (col-major, row-major)   12.796781000000010        210.18912300000000     
 mat size:         4000 timigs (col-major, row-major)   17.554618000000005        337.45606700000002  

python_vs_fortran_gauss

Source code
module gauss_m

   private
   public :: gauss, gauss_rm
   integer, parameter :: dp = kind(1.0d0)


contains

   subroutine gauss(a, b, x)
      real(dp), intent(inout)  :: a(:, :), b(:), x(:)

      integer :: i, k, n
      real(dp), allocatable :: tmp_a(:)
      real(dp)              :: tmp_b

      n = size(a, 1)

      do i=1, n
         ! Check if has to swap
         if (a(i, i) == 0.0) then
            do k=i+1, n
               ! Swap values and exit loop
               if (a(i, k) /= 0.0) then
                  tmp_a = a(:, k);   tmp_b = b(k)
                  a(:, k) = a(:, i); b(k) = b(i)
                  a(:, i) = tmp_a;   b(i) = tmp_b
                  exit
               end if
            end do
         end if
         ! Eliminate items in row below
         do k=i+1, n
            b(k) = b(k) - (b(i) / a(i, i) * a(k, i))
            a(:, k) = a(:, k) - a(:, i) / a(i, i) * a(i, k)
         end do
      end do
      ! Solve
      x = 0.0
      do i=n, 1, -1
         b(i) = b(i) - dot_product(a(:, i), x)
         x(i) = b(i) / a(i, i)
      end do

   end subroutine gauss

   subroutine gauss_rm(a, b, x)
      !! row major implementation, do not use for actual solves
      real(dp), intent(inout)  :: a(:, :), b(:), x(:)

      integer :: i, k, n
      real(dp), allocatable :: tmp_a(:)
      real(dp)              :: tmp_b

      n = size(a, 1)

      do i=1, n
         ! Check if has to swap
         if (a(i, i) == 0.0) then
            do k=i+1, n
               ! Swap values and exit loop
               if (a(k, i) /= 0.0) then
                  tmp_a = a(k, :);   tmp_b = b(k)
                  a(k, :) = a(i, :); b(k) = b(i)
                  a(i, :) = tmp_a;   b(i) = tmp_b
                  exit
               end if
            end do
         end if
         ! Eliminate items in row below
         do k=i+1, n
            b(k) = b(k) - (b(i) / a(i, i) * a(k, i))
            a(k, :) = a(k, :) - a(i, :) / a(i, i) * a(k, i)
         end do
      end do
      ! Solve
      x = 0.0
      do i=n, 1, -1
         b(i) = b(i) - dot_product(a(i, :), x)
         x(i) = b(i) / a(i, i)
      end do

   end subroutine gauss_rm

end module gauss_m


program main

   use gauss_m

   implicit none

   integer, parameter :: dp = kind(1.0d0)
   real(dp), allocatable :: a(:,:), b(:), x(:), a_rm(:,:), b_rm(:), x_rm(:)
   real(dp), allocatable :: timings(:,:)
   integer :: n, n_min, n_max, n_int

   n_min = 500; n_max = 5000; n_int = 500;
   
   allocate(timings(((n_max - n_min)/ n_int) + 1, 2))

   do n = n_min, n_max, n_int

      allocate(a(n, n), b(n), x(n))
      call fill(a, b)
      allocate(a_rm, source=a); allocate(b_rm, source=b); allocate(x_rm, source=x)

      timings(n/n_int, 1) = timeit(a, b, x, gauss)
      timings(n/n_int, 2) = timeit(a_rm, b_rm, x_rm, gauss_rm)
      print*, "mat size:", n, "timigs (col-major, row-major):", timings(n/n_int, :), "rel L2 norm: ", norm2(x_rm- x)/ norm2(x)

      deallocate(a, b, x, a_rm, b_rm, x_rm)
   end do

   print*, "          TIMINGS         "
   print*, repeat("-", 80)
   print*, "column major: ", timings(:, 1)
   print*, "row    major: ", timings(:, 2)
   print*, repeat("-", 80)


contains

   subroutine fill(a, b)
      real(dp), intent(out) :: a(:,:), b(:)
      integer :: i, j

      do j = 1, size(a,2)
         do i = 1, size(a,1)
            a(i,j) = 1.0_dp/real(i + j - 1,dp)
         end do
      end do

      do i = 1, size(b)
         b(i) = 1.0_dp/real(i + 1,dp)
      end do

   end subroutine

   function timeit(a, b, x, sub) result(time)
      real(dp), intent(inout) :: a(:,:), b(:), x(:)
      interface
         subroutine sub(a, b, x)
            integer, parameter :: dp = kind(1.0d0)
            real(dp), intent(inout) :: a(:,:), b(:), x(:)
         end subroutine sub
      end interface

      real(dp) :: time
      real(dp) :: starttime

      call cpu_time(starttime)
      call sub(a, b, x)
      call cpu_time(time)
      time = time - starttime
   end function timeit

end program main

PS thanks @ivanpribec for the fill routine I was too lazy to write one

4 Likes