Generic or Kind-Agnostic linear system solvers

@ivanpribec I’m moving this part of the conversation here because it’s more relevant here. If I understood correctly from 30 minutes of research and this page reverse communication wouldn’t be a big deal to implement on the library side. An first test is the following:

module stdlib_krylov_PCG
  use stdlib_kinds
  
  implicit none
  private
  !! Public methods
  public :: stdlib_linalg_cg
  !!
  interface stdlib_linalg_cg
    !!
    module procedure stdlib_krylov_CG
    module procedure stdlib_krylov_CG_reverse
    !!
  end interface
  !!
contains  !!
  subroutine stdlib_krylov_CG_reverse( n, wrk, x, info, itnum )
    !!
    integer,  intent(in   ) :: n
    real(dp), intent(inout) :: wrk(n,3) ! or 4 for preconditioning
    real(dp), intent(inout) :: x(n)
    integer,  intent(  out) :: info
    integer,  intent(inout) :: itnum
    !!
    real(dp)       :: alpha, beta
    real(dp)       :: p_dot_Ap
    real(dp), save :: r_dot_r
    real(dp), save :: r_dot_r_old
    !!
    !   Mapping (maybe associate?)
    !  p = wrk(:,1)
    ! Ap = wrk(:,2)
    !  r = wrk(:,3)
    ! Kr = wrk(:,4)

    !cg_iterations: do itnum = 1, options%itmax
      !
      if (info == 22) then
        info = 22 !request Matrix-Vector multiplication
        return
      end if 
      ! Ap = matmul( A, p ) ! Performed outside
      !
      r_dot_r = dot_product( wrk(:,3), wrk(:,3) )  
      
      if (info == 11) then ! the matvec product has done
        p_dot_Ap = dot_product( wrk(:,1), wrk(:,2) )
        alpha = r_dot_r / p_dot_Ap
        x = x + (alpha) * wrk(:,1) 
        wrk(:,3) = wrk(:,3) + (-alpha) * wrk(:,2) 
        r_dot_r_old = r_dot_r
        ! Time for test
        info = 21
        return
      end if 

      !if ( sqrt(r_dot_r) < options%tol_dp ) then ! Performed outside
      !  info = 0
      !  exit cg_iterations
      !end if

      if (info == 12) then
        r_dot_r = dot_product( wrk(:,3), wrk(:,3) )
        beta = r_dot_r/r_dot_r_old
        wrk(:,1) = beta * wrk(:,1) + 1._dp * wrk(:,3)
        itnum = itnum + 1
        ! Time to go back to Ap
        info = 22
      end if

    !end do cg_iterations

    return

  end subroutine stdlib_krylov_CG_reverse
  !!
  subroutine stdlib_krylov_CG( A, rhs, x, xzero, info )
    !!
    real(dp),         intent(in   )           :: A(:,:) 
    real(dp),         intent(in   )           :: rhs(:)
    real(dp),         intent(inout)           :: x(:)
    logical,          intent(in   ), optional :: xzero
    integer,          intent(  out)           :: info
    !!
    !! Default options
    real(dp), allocatable :: r(:)
    real(dp), allocatable :: p(:)
    real(dp), allocatable :: Ap(:)
    !! Residual, search direction and other vectors.
    real(dp) :: alpha, beta
    real(dp) :: p_dot_Ap
    real(dp) :: r_dot_r
    real(dp) :: r_dot_r_old
    integer :: itnum
    !!
    allocate (Ap, source = rhs)
    Ap = 0._dp
    allocate (r, source = rhs)

    if ( present(xzero) .and. xzero ) then 
      Ap = matmul( A, x )
      r = 1._dp * r + (-1._dp) * Ap 
    else
      x = 0._dp
    end if
 
    allocate (p, source = r)
    
    r_dot_r = dot_product( r, r )  
    info = 1
    
    cg_iterations: do itnum = 1, 10
      
      Ap = matmul( A, p )
      p_dot_Ap = dot_product( p, Ap )
      alpha = r_dot_r / p_dot_Ap
      x = x + (alpha) * p 
      r = r + (-alpha) * Ap 
      r_dot_r_old = r_dot_r
      r_dot_r = dot_product( r, r )

      if ( sqrt(r_dot_r) < 1.e-10 ) then
        info = 0
        exit cg_iterations
      end if

      beta = r_dot_r/r_dot_r_old
      p = beta * p + 1._dp * r 

    end do cg_iterations

    deallocate (r, p, Ap)

    return

  end subroutine stdlib_krylov_CG
  !!
end module stdlib_krylov_PCG

program main
  use stdlib_kinds
  use stdlib_krylov_PCG
  implicit none

  integer :: i,j, n
  real(dp), allocatable :: A(:,:)
  real(dp), allocatable :: L(:,:)
  real(dp), allocatable :: U(:,:)
  real(dp), allocatable :: b(:), x(:)
  real(dp), allocatable :: work(:,:), ones(:)
  integer :: info
  integer :: itnum

  n = 10
  allocate( A(n,n), L(n,n), U(n,n), b(n), ones(n), x(n), work(n,3) )
  L = 0._dp
  do j = 1, n
    do i = j, n
      call random_number(L(i,j))
    end do
    L(j,j) = 3._dp + 3._dp*L(j,j)! sqrt(real(n))
  end do
  U = transpose(L)
  A = matmul( L, transpose(L) )
  ones = 1._dp
  b = matmul( A, ones)

  call stdlib_linalg_cg(A, b, x, .false., info )
  print *, x
  
  x = 0._dp
  ! This in a subroutine init
  work(:,1) = b
  work(:,2) = 0._dp
  work(:,3) = b
  info = 22

  cg_iteration: do i = 1, 3*10
  
    call stdlib_linalg_cg( n, work, x, info, itnum )
    if (info == 22) then 
       work(:,2) = matmul( A, work(:,1) )
       info = 11
    end if

    if (info == 21) then 
       if ( sqrt(dot_product(work(:,3),work(:,3))) < 1.e-10 ) then
         info = 0
         exit cg_iteration
       else 
         info = 12
       end if
    end if

  end do cg_iteration
 
  print *, x

end program

I see the value in this kind of implementation (there is the standard cg there for reference), but I don’t see any benefit in terms of code simplicity to be honest. I will test performance though.