Inspired by https://fortran-lang.discourse.group/t/how-to-use-utf-8-in-gfortran, I looked at handling of strings that contain extended ASCII letters.
The Fortran standard mentions the 7 bit standard ASCII (ISO/IEC 646 - Wikipedia) and that is also the default in Gfortran (SELECTED_CHAR_KIND (The GNU Fortran Compiler)). For that reason, symbols of the extended ASCII code (in the range 128 to 255) are stored differently and their len is 2 or 3. This is a little bit annoying, especially since the default ASCII character type has 8 bit = 1 byte (at least on Linux), i.e. it could store the 256 symbols (https://theasciicode.com.ar) easily if extended ASCII would be the default.
Does anyone has an idea how to implement a len_fixed that gives the number of characters on screen instead of the size in bytes (or “storage units” in case that a single character uses more bytes, i.e. in the case of UCS-4)? Additionally, it would be nice to have a sensible approach for iterating over a string that contains extended ASCII characters.
Below is the code I’ve used for testing. Gfortran and ifx show the same behavior for ASCII but ifx rejects the code related to UCS-4.
program test_ASCII
use, intrinsic :: ISO_C_binding
implicit none
integer, parameter :: ucs4 = selected_char_kind ('ISO_10646')
integer, parameter :: ascii = selected_char_kind ('ASCII')
character(kind=ascii) :: c
integer(C_SIGNED_CHAR), dimension(storage_size('~')/8) :: bits_126
integer(C_SIGNED_CHAR), dimension(storage_size('Ç')/8) :: bits_128
integer(C_SIGNED_CHAR), dimension(storage_size('â– ')/8) :: bits_254
print('(a,i0,a,i0)'), ' "~": ASCII=126, size/bit=', storage_size('~'), ', len=', len('~')!, C_sizeof(bits_126)
bits_126 = transfer('~',1_C_SIGNED_CHAR)
print*, bits_126
call print_bits(bits_126)
print('(a,i0,a,i0)'), ' "Ç": ASCII=128, size/bit=', storage_size('Ç'), ', len=', len('Ç')!, C_sizeof(bits_128)
bits_128 = transfer('Ç',1_C_SIGNED_CHAR,size=2)
print*, bits_128
call print_bits(bits_128)
print('(a,i0,a,i0)'), ' "â– ": ASCII=254, size/bit=', storage_size('â– '), ', len=', len('â– ')!, C_sizeof(bits_254)
bits_254 = transfer('â– ',1_C_SIGNED_CHAR,size=3)
print*, bits_254
call print_bits(bits_254)
c = transfer(126_C_SIGNED_CHAR,'a') ! values > 127 or < -127 require "-fno-range-check" on Gfortran and lead to strange results
print*, c
c = transfer(-127_C_SIGNED_CHAR,'a')
print*, c
#ifdef __GFORTRAN__
print*, ''
print*, 'UCS4'
print*, '~', storage_size(ucs4_'~'), len(ucs4_'~')
print*, 'Ç', storage_size(ucs4_'Ç'), len(ucs4_'Ç')
print*, 'â– ', storage_size(ucs4_'â– '), len(ucs4_'â– ')
#endif
contains
subroutine print_bits(i)
integer(C_SIGNED_CHAR), dimension(:), intent(in) :: i
integer :: s,l
do l = 1, size(i)
do s = bit_size(i)-1, 0, -1
write(*,'(i1)',advance='no') ibits(i(l:l),s,1)
end do
write(*,'(a)',advance='no') ' '
end do
print*
print*
end subroutine print_bits
end program test_ASCII
edit: added output below and fixed mistake (was “byte” instead of “bit”)
"~": ASCII=126, size/bit=8, len=1
126
01111110
"Ç": ASCII=128, size/bit=16, len=2
-61 -121
11000011 10000111
"â– ": ASCII=254, size/bit=24, len=3
-30 -106 -96
11100010 10010110 10100000
~
ďż˝
UCS4
~ 32 1
Ç 64 2
â– 96 3
edit 2: fixed bit-wise representation