I was curious lately about the speed of writing with and without a format string write(lun,'(a)')
vs write(lun,*)
for example. Previously I had identified that writing with *
was always faster, which I suppose makes sense because it seems that something about writing (besides the actual action of putting data on the disk) is happening/being computed at run-time. For instance, a program will compile without errors, then fail at runtime if a write statement has mismatched format + output variables.
Anyways, I then wondered what happens if I just send my output to /dev/null
, and I was very surprised to find that for some compilers this was faster, and others significantly worse.
I found the following:
-
writing to files,
gfortran
was about an order of magnitude faster than Intel OneAPIifort
orifx
when writingHello World!
via*
or'(a)'
, but only about twice as fast as AMDAOCC Flang
with*
and the same speed with'(a)'
-
writing to
/dev/null
,gfortran
slowed down considerably, which was completely unexpected; AMDAOCC Flang
was relatively the same timing, and only Intel OneAPIifort
andifx
took about 1/3 the time they spent writing to files
Any insights to this behavior? I am surprised how inconsistent the writing performance was between different compilers, and even more surprised that writing to /dev/null
was not substantially faster for all of them.
code (click to see)
module mymod
implicit none
private
public :: hw1, hw2
contains
impure subroutine hw1(lun)
integer, intent(in), optional :: lun
if (present(lun)) then
write(lun,*) 'Hello World!'
else
write(*,*) 'Hello World!'
end if
end subroutine hw1
impure subroutine hw2(lun)
integer, intent(in), optional :: lun
if (present(lun)) then
write(lun,'(a)') 'Hello World!'
else
write(*,'(a)') 'Hello World!'
end if
end subroutine hw2
end module mymod
program main
use, intrinsic :: iso_fortran_env
use, non_intrinsic :: mymod
implicit none
integer(int64), parameter :: i_max = 100000000_int64
integer(int64) :: c1, c2, cr, i, j
real(real64) :: elapsed
integer :: lun, lun1, lun2
write(*,'(a)') 'COMPILER: '//compiler_version()//', OPTIONS: '//compiler_options()
do j=1_int64,2_int64
if (j == 2_int64) then
write(*,*) 'dumping output to /dev/null'
open(newunit=lun, file='/dev/null', action='write')
lun1 = lun
lun2 = lun
else
write(*,*) 'hw1 writes to 831, hw2 writes to 530'
lun1 = 831
lun2 = 530
end if
call system_clock(c1, cr)
do i=1_int64,i_max
call hw1(lun1)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_int64), real64)/real(cr, real64)
write(*,*) 'write(831,*): ',elapsed,' seconds'
call system_clock(c1, cr)
do i=1_int64,i_max
call hw2(lun2)
end do
call system_clock(c2)
elapsed = real(max(c2 - c1, 1_int64), real64)/real(cr, real64)
write(*,*) 'write(530,''(a)''): ',elapsed,' seconds'
end do
write(*,*)''
end program main
terminal output (click to see)
COMPILER: GCC version 13.1.1 20230429, OPTIONS: -march=znver3 -mmmx -mpopcnt -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -mavx -mavx2 -msse4a -mno-fma4 -mno-xop -mfma -mno-avx512f -mbmi -mbmi2 -maes -mpclmul -mno-avx512vl -mno-avx512bw -mno-avx512dq -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512vbmi -mno-avx512ifma -mno-avx5124vnniw -mno-avx5124fmaps -mno-avx512vpopcntdq -mno-avx512vbmi2 -mno-gfni -mvpclmulqdq -mno-avx512vnni -mno-avx512bitalg -mno-avx512bf16 -mno-avx512vp2intersect -mno-3dnow -madx -mabm -mno-cldemote -mclflushopt -mclwb -mclzero -mcx16 -mno-enqcmd -mf16c -mfsgsbase -mfxsr -mno-hle -msahf -mno-lwp -mlzcnt -mmovbe -mno-movdir64b -mno-movdiri -mmwaitx -mno-pconfig -mpku -mno-prefetchwt1 -mprfchw -mno-ptwrite -mrdpid -mrdrnd -mrdseed -mno-rtm -mno-serialize -mno-sgx -msha -mshstk -mno-tbm -mno-tsxldtrk -mvaes -mno-waitpkg -mwbnoinvd -mxsave -mxsavec -mxsaveopt -mxsaves -mno-amx-tile -mno-amx-int8 -mno-amx-bf16 -mno-uintr -mno-hreset -mno-kl -mno-widekl -mno-avxvnni -mno-avx512fp16 -mno-avxifma -mno-avxvnniint8 -mno-avxneconvert -mno-cmpccxadd -mno-amx-fp16 -mno-prefetchi -mno-raoint -mno-amx-complex --param=l1-cache-size=32 --param=l1-cache-line-size=64 --param=l2-cache-size=512 -mtune=znver3 -Ofast -flto -fwhole-program -fpre-include=/usr/include/finclude/math-vector-fortran.h
hw1 writes to 831, hw2 writes to 530
write(831,*): 7.8473035189999996 seconds
write(530,'(a)'): 10.895682978000000 seconds
dumping output to /dev/null
write(831,*): 17.326002824000000 seconds
write(530,'(a)'): 21.694812569000000 seconds
COMPILER: flang Flang - 1.5 2017-05-01, OPTIONS: -Ofast -march=native -flto -o proj_flang
hw1 writes to 831, hw2 writes to 530
write(831,*): 12.12563490000000 seconds
write(530,'(a)'): 10.88859700000000 seconds
dumping output to /dev/null
write(831,*): 11.41602090000000 seconds
write(530,'(a)'): 10.42051290000000 seconds
COMPILER: Intel(R) Fortran Intel(R) 64 Compiler Classic for applications running on Intel(R) 64, Version 2021.9.0 Build 20230302_000000, OPTIONS: -Ofast -mavx2 -ipo -fp-model=fast=2 -fp-speculation=fast -o proj_ifort
hw1 writes to 831, hw2 writes to 530
write(831,*): 74.2634630000000 seconds
write(530,'(a)'): 75.9310700000000 seconds
dumping output to /dev/null
write(831,*): 24.6305700000000 seconds
write(530,'(a)'): 24.2716460000000 seconds
COMPILER: Intel(R) Fortran Compiler for applications running on Intel(R) 64, Version 2023.1.0 Build 20230320, OPTIONS: -Ofast -mavx2 -ipo -fp-model=fast=2 -fp-speculation=fast -o proj_ifx
hw1 writes to 831, hw2 writes to 530
write(831,*): 73.6425570000000 seconds
write(530,'(a)'): 74.0088560000000 seconds
dumping output to /dev/null
write(831,*): 22.1595820000000 seconds
write(530,'(a)'): 23.5241890000000 seconds
compile + run script (click to see)
#!/bin/bash
clear
rm -v a.out fort.*
gfortran -Ofast -march=native -flto -fwhole-program -o proj_gfortran main.f90
./proj_gfortran
flang -Ofast -march=native -flto -o proj_flang main.f90
./proj_flang
ifort -Ofast -mavx2 -ipo -fp-model=fast=2 -fp-speculation=fast -o proj_ifort main.f90
./proj_ifort
ifx -Ofast -mavx2 -ipo -fp-model=fast=2 -fp-speculation=fast -o proj_ifx main.f90
./proj_ifx