I was testing this idea of running an application where the main is in C and it calls some code in Fortran, and to mimic the expected complexity, say that the fortran code uses MPI and tries to incorporate coarray:
src\coarray_mpi.f90
module coarray_mpi
use, intrinsic :: iso_c_binding, only: c_int
implicit none
private
public :: say_hello
contains
subroutine coarray_mpi_initialize() bind(C, name="coarray_mpi_initialize")
integer(c_int) :: image_count
image_count = num_images()
sync all
end subroutine coarray_mpi_initialize
subroutine coarray_mpi_hello() bind(C, name="coarray_mpi_hello")
call say_hello()
call example_mcurcic_mpi()
call example_mcurcic_caf()
call example_jacobi_mpi()
call example_jacobi_caf()
end subroutine coarray_mpi_hello
subroutine say_hello()
integer(c_int) :: image_index
integer(c_int) :: image_count
integer(c_int) :: turn
real, allocatable :: A(:)
image_index = this_image()
image_count = num_images()
do turn = 1, image_count
sync all
if (image_index == turn) then
write (*, '(a,i0,a,i0)') 'Hello from coarray image ', image_index, ' of ', image_count
end if
end do
sync all
end subroutine say_hello
subroutine example_mcurcic_mpi()
use mpi
integer :: ierr, rank, nproc, request
integer :: stat(mpi_status_size)
integer :: array(5) = 0
integer, parameter :: sender = 0, receiver = 1
call mpi_comm_rank(mpi_comm_world, rank, ierr)
call mpi_comm_size(mpi_comm_world, nproc, ierr)
if (nproc /= 2) then
if (rank == 0) then
write(*,*) 'skip example_mcurcic_mpi : This code must be run on 2 parallel processes'
end if
return
end if
if (rank == sender) array = [1, 2, 3, 4, 5]
print '(a,i1,a,5(4x,i2))', 'array on proc ', rank, &
' before copy:', array
call mpi_barrier(mpi_comm_world, ierr)
if (rank == sender) then
call mpi_isend(array, size(array), mpi_int, &
receiver, 1, mpi_comm_world, request, ierr)
else if (rank == receiver) then
call mpi_irecv(array, size(array), mpi_int, &
sender, 1, mpi_comm_world, request, ierr)
call mpi_wait(request, stat, ierr)
end if
print '(a,i1,a,5(4x,i2))', 'array on proc ', rank, &
' after copy: ', array
end subroutine
subroutine example_mcurcic_caf()
integer :: array(5)[*] = 0
integer, parameter :: sender = 1, receiver = 2
if (num_images() /= 2) then
if( this_image() == 1 ) then
write(*,*) 'skip example_mcurcic_caf : This code must be run on 2 parallel processes'
end if
return
end if
if (this_image() == sender) array = [1, 2, 3, 4, 5]
print '(a,i2,a,5(4x,i2))', 'array on proc ', this_image(), &
' before copy:', array
sync all
if (this_image() == receiver) array(:) = array(:)[sender]
print '(a,i1,a,5(4x,i2))', 'array on proc ', this_image(), &
' after copy: ', array
end subroutine
subroutine example_jacobi_mpi()
use mpi
integer :: ierr, rank, size
integer :: nx, ny, local_nx
integer :: i, j, iter, max_iter
integer :: up, down
integer :: status(MPI_STATUS_SIZE)
real, allocatable :: A(:,:), Anew(:,:)
real :: local_sum, global_sum
call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, size, ierr)
nx = 100
ny = 100
max_iter = 100
local_nx = nx / size
allocate(A(0:local_nx+1, 0:ny+1), source = 1.0 )
allocate(Anew(0:local_nx+1, 0:ny+1), source = 0.0 )
up = rank - 1
down = rank + 1
if (rank == 0) up = MPI_PROC_NULL
if (rank == size-1) down = MPI_PROC_NULL
do iter = 1, max_iter
! Exchange halos
call MPI_Sendrecv(A(1,0), ny+2, MPI_REAL, up, 0, &
A(local_nx+1,0), ny+2, MPI_REAL, down, 0, &
MPI_COMM_WORLD, status, ierr)
call MPI_Sendrecv(A(local_nx,0), ny+2, MPI_REAL, down, 1, &
A(0,0), ny+2, MPI_REAL, up, 1, &
MPI_COMM_WORLD, status, ierr)
! Jacobi update
do concurrent( i = 1:local_nx, j = 1:ny )
Anew(i,j) = 0.25 * (A(i-1,j) + A(i+1,j) + A(i,j-1) + A(i,j+1))
end do
! Swap arrays
A(1:local_nx,1:ny) = Anew(1:local_nx,1:ny)
end do
local_sum = norm2( A(1:local_nx,1:ny) )
! Global reduction
call MPI_Allreduce(local_sum, global_sum, 1, MPI_REAL, MPI_SUM, MPI_COMM_WORLD, ierr)
global_sum = sqrt(global_sum)
if (rank == 0) print *, "MPI ", "Iter:", iter, " Norm:", global_sum
end subroutine
subroutine example_jacobi_caf()
integer :: nx, ny, local_nx
integer :: i, j, iter, max_iter
integer :: me, nimgs
real, allocatable :: A(:,:)[:], Anew(:,:)[:]
real :: local_sum, global_sum
me = this_image()
nimgs = num_images()
nx = 100
ny = 100
max_iter = 100
local_nx = nx / nimgs
allocate(A(0:local_nx+1, 0:ny+1)[*], source = 1.0)
allocate(Anew(0:local_nx+1, 0:ny+1)[*], source = 0.0)
do iter = 1, max_iter
sync all
! Exchange halos using coarrays
if (me > 1) A(0,:) = A(local_nx,:)[me-1]
if (me < nimgs) A(local_nx+1,:) = A(1,:)[me+1]
sync all
! Jacobi update
do concurrent( i = 1:local_nx, j = 1:ny )
Anew(i,j) = 0.25 * (A(i-1,j) + A(i+1,j) + A(i,j-1) + A(i,j+1))
end do
A(1:local_nx,1:ny) = Anew(1:local_nx,1:ny)
end do
local_sum = norm2( A(1:local_nx,1:ny) )
! Global reduction
global_sum = local_sum
call co_sum(global_sum)
global_sum = sqrt(global_sum)
if (this_image() == 1) print *, "CAF ", "Iter:", iter, " Norm:", global_sum
end subroutine
end module coarray_mpi
This worked with intel compilers. Don’t know if it could work with gnu+opencoarrays.
What seems encouraging about this is that it keeps the door open for progressive integration/migration within Fortran codes while also allowing on using external libraries in mixed language environement where those might be written in C/C++ using MPI in their own way.