FizzBuzz in Fortran much slower than in C

A request: would it be possible for someone to take the defined I/O case with a derived type shown below and try it with stream output on a Linux system, with the “FizzBuzz” values set up using coarrays? Intel oneAPI can be an option for this. And then compare it with the Fortran sequential I/O with an intrinsic type shown by @Beliavsky in the original post?

   use num_string_m, only : num_string_t
   use, intrinsic :: iso_fortran_env, only : error_unit
   implicit none
   ! Coarray FizzBuzz: every image fills its own slice of the sequence 1..N,
   ! then image 1 gathers the slices in image order and writes them to a
   ! formatted stream scratch file using the defined output of num_string_t.
   character(len=*), parameter :: FIZZBUZZ = "FizzBuzz"
   character(len=*), parameter :: FIZZ = "Fizz"
   character(len=*), parameter :: BUZZ = "Buzz"
   integer, parameter :: N = 1000000000
   type(num_string_t), allocatable :: s(:)[:]
   character(len=256) :: msg
   integer :: i, x, img, my_count, cnt, lun, istat

   ! Chunk size per image, rounded up so that no trailing values are lost
   ! when N is not an exact multiple of num_images(). A coarray must have the
   ! same bounds on every image, so images near the end may use fewer than x
   ! elements.
   x = N / num_images()
   if ( mod( N, num_images() ) /= 0 ) x = x + 1
   allocate( s(x)[*] )
   sync all

   ! Number of elements of the local slice that actually fall within 1..N.
   my_count = max( 0, min( x, N - (this_image()-1)*x ) )

   do concurrent ( i = 1:my_count )
      ! The scratch scalars are declared in a block so that they are local
      ! to each iteration rather than program-scope variables.
      block
         integer :: j
         logical :: div_3, div_5
         j = (this_image()-1)*x + i
         div_3 = mod( j, 3 ) == 0
         div_5 = mod( j, 5 ) == 0
         if ( div_3 .and. div_5 ) then
            s(i) = FIZZBUZZ
         else if ( div_3 ) then
            s(i) = FIZZ
         else if ( div_5 ) then
            s(i) = BUZZ
         else
            s(i) = j
         end if
      end block
   end do

   ! All slices must be complete before image 1 starts reading them.
   sync all

   if ( this_image() == 1 ) then
      open( newunit=lun, access="stream", form="formatted", status="scratch", &
            iostat=istat, iomsg=msg )
      if ( istat /= 0 ) then
         write( error_unit, "(a)" ) "open failed: " // trim(msg)
         error stop 1
      end if
      do img = 1, num_images()
         ! Same slice-length formula as above, evaluated for image img.
         cnt = max( 0, min( x, N - (img-1)*x ) )
         write( lun, fmt="(*(DT))", advance="no", iostat=istat, iomsg=msg ) s(:cnt)[img]
         if ( istat /= 0 ) then
            write( error_unit, "(a)" ) "write failed: " // trim(msg)
            error stop 1
         end if
      end do
      close( lun )
   end if
end

The code for the derived type:

Click for code

Module for num_string_t

module num_string_m
   ! Provides num_string_t, a small fixed-capacity string that can be assigned
   ! either a character value or a default integer (stored as its decimal
   ! text), and that supports defined formatted output.
   implicit none

   type :: num_string_t
      private
      ! Text buffer. Its length is digits(0), the number of binary digits of
      ! a default integer.
      character(len=digits(0)) :: s
      ! Number of significant leading characters of s.
      integer :: lens = 0
   contains
      private
      procedure, pass(this) :: assign_s
      procedure, pass(this) :: assign_n
      procedure, pass(dtv) :: write_s
      generic, public :: assignment(=) => assign_s, assign_n
      generic, public :: write(formatted) => write_s
   end type num_string_t

contains

   ! Assign a character value. The value is truncated to the buffer length if
   ! it is longer; trailing blanks are not counted in the stored length.
   elemental subroutine assign_s( this, s )
      class(num_string_t), intent(inout) :: this
      character(len=*), intent(in)       :: s
      this%s = s
      this%lens = len_trim(this%s)
   end subroutine assign_s

   ! Assign the decimal representation of a default integer, with a leading
   ! minus sign for negative values.
   elemental subroutine assign_n( this, num )
      class(num_string_t), intent(inout) :: this
      integer, intent(in)                :: num
      integer :: i, n, first
      n = num
      this%s = ""
      this%lens = 0
      first = len(this%s) + 1
      ! Digits are produced least-significant first and stored right to left.
      to_s: do i = len(this%s), 1, -1
         ! abs() is applied to the remainder, not to n, so that the most
         ! negative integer is handled without overflow.
         this%s(i:i) = achar( iachar("0") + abs( mod( n, 10 ) ) )
         first = i
         this%lens = this%lens + 1
         n = n/10
         if ( n == 0 ) exit to_s
      end do to_s
      if ( num < 0 .and. first > 1 ) then
         this%s(first-1:first-1) = "-"
         this%lens = this%lens + 1
      end if
      this%s = adjustl(this%s)
   end subroutine assign_n

   ! Defined formatted output for num_string_t. Writes the stored text
   ! followed by a newline character.
   !   dtv    - value to write
   !   lun    - unit supplied by the parent data transfer statement
   !   iotype - "LISTDIRECTED", "NAMELIST", or "DT" optionally followed by the
   !            character literal given in the DT edit descriptor
   !   vlist  - values from the DT edit descriptor; vlist(1), when present, is
   !            used as the field width
   !   istat  - 0 on success, nonzero on failure
   !   imsg   - explanatory message when istat is nonzero
   subroutine write_s( dtv, lun, iotype, vlist, istat, imsg )
      ! Argument list
      class(num_string_t), intent(in)  :: dtv
      integer, intent(in)              :: lun
      character(len=*), intent(in)     :: iotype
      integer, intent(in)              :: vlist(:)
      integer, intent(out)             :: istat
      character (len=*), intent(inout) :: imsg
      ! local variable
      character(len=20) :: pfmt
      istat = 0
      if ( iotype == "LISTDIRECTED" ) then
         ! No special consideration
         write(lun, fmt=*, iostat=istat, iomsg=imsg) dtv%s(:dtv%lens) // new_line("")
      else if ( index( iotype, "DT" ) == 1 ) then
         ! Matches a bare DT as well as DT'...' with a character literal,
         ! which would otherwise fall through and write nothing.
         ! vlist(1) is to be used as the field width of the component of the
         ! derived type variable. First set up the format to be used for
         ! output.
         if ( size(vlist) > 0 ) then
            write(pfmt,"(*(g0))" ) "(1x,g", vlist(1), ")"
         else
            pfmt = "(1x,g0)"
         end if
         write(lun, fmt=pfmt, advance="no", iostat=istat, iomsg=imsg) dtv%s(:dtv%lens) // new_line("")
      else if ( iotype == "NAMELIST" ) then
         ! Not supported
         istat = 1
         imsg = "Namelist option is not yet supported."
      else
         ! Report anything unrecognized instead of silently succeeding.
         istat = 1
         imsg = "Unsupported iotype: " // iotype
      end if
   end subroutine write_s

end module num_string_m

It will be interesting to see how adversely performance might be affected when complicated options using modern Fortran are in play.

Also, the determination of the “FizzBuzz” values themselves has got to be an embarrassingly parallel problem for a processor, so the question is whether a Fortran processor fed with coarrays and aided by DO CONCURRENT gets close to achieving it, after which it is just a matter of I/O of large data.