Coarrays and multithreading (with Intel Fortran)?

I could finally run a coarray+OpenMP program without any crash on a single node (with ifort 2021)… But:

  1. I’m struggling a lot with the configuration, the compilation parameters, etc. (I still haven’t found a way to run fewer images than the number of cores on the machine).
  2. Although the print statements in the code report that each process uses several OpenMP threads, the threads seem to run sequentially rather than simultaneously (!)
!> Hybrid coarray + OpenMP demonstration.
!!
!! Every image fills its own slice of a distributed integer sequence with an
!! OpenMP-parallel loop. After a global synchronisation, image 1 fetches each
!! slice in turn and compares the grand total with the closed-form sum.
program coarray_example
    use, intrinsic :: iso_fortran_env, only: int64
    use omp_lib
    implicit none

    !> Requested global number of elements, split evenly across the images.
    integer(int64), parameter :: Ntot = 2_int64**22

    integer(int64) :: me         ! index of this image
    integer(int64) :: ni         ! total number of images
    integer(int64) :: n          ! number of elements owned by each image
    integer(int64) :: n_covered  ! elements that actually exist: n * ni
    integer(int64) :: k, s
    integer(int64), allocatable :: data(:)[:]  ! Coarray declaration
    integer(int64), allocatable :: local_data(:)

    call omp_set_num_threads(2)

    me = this_image()
    ni = num_images()

    if (me == 1) print *, 'Number of images:', ni

    ! Integer division: when Ntot is not a multiple of ni, the trailing
    ! mod(Ntot, ni) elements are simply not represented on any image.
    n = Ntot / ni
    n_covered = n * ni
    allocate( data(n)[*], local_data(n), source=0_int64 )

    print "(A,I3,A,I9,A)", 'image', me, ' has allocated a coarray of size (', n, ')[*]'

    ! Each image sets its own data.  The local array is passed rather than the
    ! coindexed designator data(:)[me]: a coindexed actual argument typically
    ! forces a temporary copy routed through the coarray runtime, even when it
    ! designates the executing image itself.
    call setdata( data, me )

    print *, 'Image', me, ' has set its own values'

    ! Every image must have finished writing before image 1 starts reading.
    sync all

    ! Image 1 collects and sums the data from all images
    if (me == 1) then
        s = 0
        do k = 1, ni
            print *, 'Image 1 collecting and summing data from image', k
            local_data(:) = data(:)[k]   ! one bulk remote get per image
            s = s + sum( local_data )
        end do
        ! The reference value uses the number of elements that really exist,
        ! so the check stays valid when Ntot is not divisible by ni.
        print *, 'collected sum =', s, '(should be =', n_covered*(n_covered+1)/2, ' )'
    end if

contains

    !> Fill `data` with the consecutive global indices owned by image `me`.
    !!
    !! Element i receives (me-1)*size(data) + i.  The loop is shared among the
    !! OpenMP threads in static chunks; on image 1 the last iteration of every
    !! chunk prints the id of the thread that executed it.
    !!
    !! @param[out] data  local slice to fill
    !! @param[in]  me    1-based index of the owning image
    subroutine setdata(data, me)
        integer(int64), intent(out) :: data(:)
        integer(int64), intent(in)  :: me
        ! Chunk size of the static schedule and stride of the progress prints.
        integer(int64), parameter :: chunk = 10000_int64
        integer(int64) :: i, nloc, offset

        nloc   = size(data, kind=int64)
        offset = (me - 1) * nloc   ! loop-invariant, evaluated once in int64

        !$omp parallel do default(none) &
        !$omp& shared(data, me, nloc, offset) private(i) &
        !$omp& schedule(static, chunk)
        do i = 1, nloc
            if (me == 1 .and. mod(i, chunk) == 0) print *, omp_get_thread_num(), i
            data(i) = offset + i
        end do
        !$omp end parallel do
    end subroutine setdata

end program coarray_example

Compilation: ifort -O3 -coarray -coarray-config-file=caf.cfg -fopenmp /opt/intel/21/mpi/2021.4.0/lib/release/libmpi.so.12 coarrays.f90 -o coarrays

caf.cfg:
-genvall -genv I_MPI_FABRICS=shm:ofi -machinefile=./hostsfile -n 8 ./coarrays

hostsfile contains a single line with the name of the machine on which the program is launched.