How to use utf-8 in gfortran?

A complete module to manage unicode, isolatin and utf8


module odessa_unicode

   ! Conversions between three text representations:
   !   * "isolatin" : one byte per character, ISO-8859-15 (Latin-9)
   !   * "utf8"     : a UTF-8 byte stream stored in default-kind characters
   !   * "utf32"    : characters of kind u32 (ISO 10646 / UCS-4)
   ! An integer array of Unicode code points ("unicode") is the common
   ! intermediate form used by every conversion.
   !
   ! All public routines take an array of single characters as input,
   ! allocate their output argument, and return in nerr the number of input
   ! items that could not be converted faithfully (0 means a clean conversion).

   implicit none

   private
   public :: u32 ! character type for UTF-32 strings
   public :: utf8_to_unicode
   public :: isolatin_to_unicode
   public :: unicode_to_utf8
   public :: unicode_to_isolatin
   public :: utf8_to_isolatin
   public :: isolatin_to_utf8
   public :: utf8_to_utf32
   public :: utf32_to_utf8
   public :: isolatin_to_utf32
   public :: utf32_to_isolatin

   integer,parameter :: u32 = selected_char_kind ('ISO_10646')

   ! Limits of the Unicode code space
   integer,parameter :: max_codepoint   = 1114111 ! U+10FFFF
   integer,parameter :: surrogate_first = 55296   ! U+D800
   integer,parameter :: surrogate_last  = 57343   ! U+DFFF

contains

elemental function byte_value(c) result(b)
   ! Value of a single character as an unsigned byte (0..255).
   ! iachar may return a negative value for bytes above 127 on some
   ! processors, so the result is normalised here once for all callers.
   character,intent(in) :: c
   integer              :: b

   b = iachar(c)
   if (b < 0) b = b + 256

end function byte_value

elemental function latin9_to_codepoint(byte) result(cp)
   ! Unicode code point of an ISO-8859-15 byte.
   ! Only 8 byte values differ from the identity mapping.
   integer,intent(in) :: byte
   integer            :: cp

   select case (byte)
   case (164) ! Euro sign
      cp = 8364
   case (166) ! S caron
      cp = 352
   case (168) ! s caron
      cp = 353
   case (180) ! Z caron
      cp = 381
   case (184) ! z caron
      cp = 382
   case (188) ! OE ligature, upper case
      cp = 338
   case (189) ! oe ligature, lower case
      cp = 339
   case (190) ! Y diaeresis
      cp = 376
   case default
      cp = byte
   end select

end function latin9_to_codepoint

subroutine codepoints_to_utf32(unicode,utf32)
   ! Store each code point as one character of kind u32.
   integer                        ,intent(in)  :: unicode(:)
   character(kind=u32),allocatable,intent(out) :: utf32(:)

   integer :: i

   allocate(utf32(size(unicode)))
   do i = 1, size(unicode)
      utf32(i) = char(unicode(i),kind=u32)
   end do

end subroutine codepoints_to_utf32

subroutine utf32_to_codepoints(utf32,unicode)
   ! Extract the integer code point of each character of kind u32.
   character(kind=u32),intent(in)  :: utf32(:)
   integer,allocatable,intent(out) :: unicode(:)

   integer :: i

   allocate(unicode(size(utf32)))
   do i = 1, size(utf32)
      unicode(i) = ichar(utf32(i))
   end do

end subroutine utf32_to_codepoints

subroutine isolatin_to_unicode(isolatin,unicode,nerr)
   ! Convert ISO-8859-15 bytes to Unicode code points.
   ! Every byte has a code point, so nerr is always returned as 0.

   character           ,intent(in)  :: isolatin(:)
   integer, allocatable,intent(out) :: unicode(:)
   integer             ,intent(out) :: nerr

   nerr = 0

   allocate(unicode(size(isolatin)))
   unicode = latin9_to_codepoint(byte_value(isolatin))

end subroutine isolatin_to_unicode

subroutine unicode_to_isolatin(unicode,isolatin,nerr)
   ! Convert Unicode code points to ISO-8859-15 bytes.
   ! A code point with no ISO-8859-15 equivalent is replaced by '?' and
   ! counted in nerr.

   integer              ,intent(in)  :: unicode(:)
   character,allocatable,intent(out) :: isolatin(:)
   integer              ,intent(out) :: nerr

   integer :: i, n, cp

   nerr=0

   n = size(unicode)

   allocate(isolatin(n))

   do i = 1, n
      cp = unicode(i)
      select case (cp)
      ! 8 special characters
      case (8364) ! Euro
         isolatin(i) = achar(164)
      case (352)  ! S caron
         isolatin(i) = achar(166)
      case (353)  ! s caron
         isolatin(i) = achar(168)
      case (381)  ! Z caron
         isolatin(i) = achar(180)
      case (382)  ! z caron
         isolatin(i) = achar(184)
      case (338)  ! OE ligature, upper case
         isolatin(i) = achar(188)
      case (339)  ! oe ligature, lower case
         isolatin(i) = achar(189)
      case (376)  ! Y diaeresis
         isolatin(i) = achar(190)
      ! identity mapping, minus the 8 byte values reassigned above
      case (0:163, 165, 167, 169:179, 181:183, 185:187, 191:255)
         isolatin(i) = achar(cp)
      case default
         nerr=nerr+1
         isolatin(i) = '?' ! replacement character
      end select
   end do

end subroutine unicode_to_isolatin

subroutine isolatin_to_utf8(isolatin,utf8,nerr)
   ! Convert ISO-8859-15 bytes to a UTF-8 byte stream.
   ! nerr is the sum of the errors of both conversion stages.
   character            ,intent(in)  :: isolatin(:)
   character,allocatable,intent(out) :: utf8(:)
   integer              ,intent(out) :: nerr
   integer,allocatable :: unicode(:)
   integer             :: nerr_decode
   call isolatin_to_unicode(isolatin,unicode,nerr_decode)
   call unicode_to_utf8(unicode,utf8,nerr)
   ! the second call resets nerr (intent(out)), so add the first stage back
   nerr = nerr + nerr_decode
end subroutine isolatin_to_utf8

subroutine utf8_to_isolatin(utf8,isolatin,nerr)
   ! Convert a UTF-8 byte stream to ISO-8859-15 bytes.
   ! nerr counts malformed UTF-8 input bytes plus code points that have no
   ! ISO-8859-15 equivalent.
   character            ,intent(in)  :: utf8(:)
   character,allocatable,intent(out) :: isolatin(:)
   integer              ,intent(out) :: nerr
   integer,allocatable :: unicode(:)
   integer             :: nerr_decode
   call utf8_to_unicode(utf8,unicode,nerr_decode)
   call unicode_to_isolatin(unicode,isolatin,nerr)
   ! the second call resets nerr (intent(out)), so add the decode errors back
   nerr = nerr + nerr_decode
end subroutine utf8_to_isolatin

subroutine unicode_to_utf8(unicode,utf8,nerr)
   ! Encode Unicode code points as a UTF-8 byte stream.
   ! Surrogates (U+D800..U+DFFF), negative values and values above U+10FFFF
   ! cannot be encoded: each is replaced by '?' and counted in nerr.

   integer  ,intent(in)              :: unicode(:)
   character,allocatable,intent(out) :: utf8(:)
   integer              ,intent(out) :: nerr

   integer :: i, n_utf8, cp
   character, allocatable :: buffer(:)

   nerr=0

   ! a code point never needs more than 4 bytes
   allocate(buffer(4*size(unicode)))
   n_utf8 = 0

   do i = 1, size(unicode)
      cp = unicode(i)

      select case (cp)
      case (0:127) ! 1 byte : 0xxxxxxx
         buffer(n_utf8+1) = achar(cp)
         n_utf8 = n_utf8 + 1

      case (128:2047) ! 2 bytes : 110xxxxx 10xxxxxx
         buffer(n_utf8+1) = achar(ior(192, ishft(cp, -6)))
         buffer(n_utf8+2) = achar(ior(128, iand(cp, 63)))
         n_utf8 = n_utf8 + 2

      case (2048:surrogate_first-1, surrogate_last+1:65535)
         ! 3 bytes : 1110xxxx 10xxxxxx 10xxxxxx
         buffer(n_utf8+1) = achar(ior(224, ishft(cp, -12)))
         buffer(n_utf8+2) = achar(ior(128, iand(ishft(cp, -6), 63)))
         buffer(n_utf8+3) = achar(ior(128, iand(cp, 63)))
         n_utf8 = n_utf8 + 3

      case (65536:max_codepoint) ! 4 bytes : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         buffer(n_utf8+1) = achar(ior(240, ishft(cp, -18)))
         buffer(n_utf8+2) = achar(ior(128, iand(ishft(cp, -12), 63)))
         buffer(n_utf8+3) = achar(ior(128, iand(ishft(cp, -6), 63)))
         buffer(n_utf8+4) = achar(ior(128, iand(cp, 63)))
         n_utf8 = n_utf8 + 4

      case default ! surrogate or outside the Unicode code space
         nerr=nerr+1
         buffer(n_utf8+1) = '?'
         n_utf8 = n_utf8 + 1
      end select
   end do

   allocate(utf8(n_utf8))
   utf8 = buffer(1:n_utf8)

end subroutine unicode_to_utf8

subroutine utf8_to_unicode(utf8,unicode,nerr)
   ! Decode a UTF-8 byte stream into Unicode code points.
   !
   ! A byte that does not start a well-formed UTF-8 sequence is counted in
   ! nerr and interpreted on its own as an ISO-8859-15 character; decoding
   ! then resumes at the next byte.  Because of this fallback the routine
   ! is also able to decode an ISO-8859-15 string.
   !
   ! A sequence is rejected when it is truncated, when a continuation byte
   ! is not of the form 10xxxxxx, when it is an overlong encoding, or when
   ! it decodes to a surrogate or to a value above U+10FFFF.

   character            ,intent(in)  :: utf8(:)
   integer  ,allocatable,intent(out) :: unicode(:)
   integer              ,intent(out) :: nerr

   integer             :: i, k, len8, n_out
   integer             :: lead, trail, n_trail, cp, cp_min
   logical             :: well_formed
   integer,allocatable :: temp(:)

   nerr = 0

   len8 = size(utf8)
   allocate(temp(len8)) ! big enough: at most one code point per byte
   n_out = 0
   i = 1

   do while (i <= len8)

      lead = byte_value(utf8(i))

      ! The lead byte gives the number of continuation bytes, the payload
      ! bits it carries, and the smallest code point that legitimately
      ! needs a sequence of that length (anything smaller is overlong).
      select case (lead)
      case (0:127)   ! 0xxxxxxx
         n_trail = 0
         cp      = lead
         cp_min  = 0
      case (192:223) ! 110xxxxx
         n_trail = 1
         cp      = iand(lead, 31)
         cp_min  = 128
      case (224:239) ! 1110xxxx
         n_trail = 2
         cp      = iand(lead, 15)
         cp_min  = 2048
      case (240:247) ! 11110xxx
         n_trail = 3
         cp      = iand(lead, 7)
         cp_min  = 65536
      case default   ! stray continuation byte or invalid lead byte
         n_trail = -1
         cp      = 0
         cp_min  = 0
      end select

      well_formed = (n_trail >= 0)
      ! the whole sequence must fit in the input
      if (well_formed) well_formed = (i + n_trail <= len8)

      if (well_formed) then
         do k = 1, n_trail
            trail = byte_value(utf8(i+k))
            if (iand(trail, 192) /= 128) then
               well_formed = .false.
               exit
            end if
            cp = ishft(cp, 6) + iand(trail, 63)
         end do
      end if

      if (well_formed) then
         well_formed = cp >= cp_min .and. cp <= max_codepoint .and. &
                     & (cp < surrogate_first .or. cp > surrogate_last)
      end if

      if (well_formed) then
         i = i + n_trail + 1
      else
         ! heuristic: treat the offending byte as an ISO-8859-15 character
         nerr = nerr + 1
         cp = latin9_to_codepoint(lead)
         i = i + 1
      end if

      n_out = n_out + 1
      temp(n_out) = cp

   end do

   allocate(unicode(n_out))
   unicode = temp(1:n_out)

end subroutine utf8_to_unicode

subroutine utf8_to_utf32(utf8,utf32,nerr)
   ! Convert a UTF-8 byte stream to characters of kind u32.
   ! nerr is the error count reported by utf8_to_unicode.
   character                      ,intent(in)  :: utf8(:)
   character(kind=u32),allocatable,intent(out) :: utf32(:)
   integer                        ,intent(out) :: nerr
   integer            ,allocatable             :: unicode(:)
   call utf8_to_unicode(utf8,unicode,nerr)
   call codepoints_to_utf32(unicode,utf32)
end subroutine utf8_to_utf32

subroutine isolatin_to_utf32(isolatin,utf32,nerr)
   ! Convert ISO-8859-15 bytes to characters of kind u32.
   ! nerr is the error count reported by isolatin_to_unicode.
   character                      ,intent(in)  :: isolatin(:)
   character(kind=u32),allocatable,intent(out) :: utf32(:)
   integer                        ,intent(out) :: nerr
   integer            ,allocatable             :: unicode(:)
   call isolatin_to_unicode(isolatin,unicode,nerr)
   call codepoints_to_utf32(unicode,utf32)
end subroutine isolatin_to_utf32

subroutine utf32_to_isolatin(utf32,isolatin,nerr)
   ! Convert characters of kind u32 to ISO-8859-15 bytes.
   ! nerr is the error count reported by unicode_to_isolatin.
   character(kind=u32)            ,intent(in)  :: utf32(:)
   character          ,allocatable,intent(out) :: isolatin(:)
   integer                        ,intent(out) :: nerr
   integer            ,allocatable             :: unicode(:)
   call utf32_to_codepoints(utf32,unicode)
   call unicode_to_isolatin(unicode,isolatin,nerr)
end subroutine utf32_to_isolatin

subroutine utf32_to_utf8(utf32,utf8,nerr)
   ! Convert characters of kind u32 to a UTF-8 byte stream.
   ! nerr is the error count reported by unicode_to_utf8.
   character(kind=u32)            ,intent(in)  :: utf32(:)
   character          ,allocatable,intent(out) :: utf8(:)
   integer                        ,intent(out) :: nerr
   integer            ,allocatable             :: unicode(:)
   call utf32_to_codepoints(utf32,unicode)
   call unicode_to_utf8(unicode,utf8,nerr)
end subroutine utf32_to_utf8

end module odessa_unicode
2 Likes

I am wondering if there are topics other than those listed below in the syllabi regarding character encoding that are commonly encountered.

Although the results so far might be better called “using Unicode with gfortran on Linux and Cygwin”, I started some material to go with the github repository. Ultimately it will be more general if I get setup on MSWindows and additional compilers. But the proposed topics currently under
consideration are

Introduction to Fortran Unicode support

 + Lesson I: reading and writing UTF-8 Unicode files
 + Lesson II: creating Unicode strings in ASCII Fortran source
   files
 + Lesson III: mixing ASCII and UCS4 kinds as regards assignments,
   concatenation, passing arguments to external ASCII libraries, and
   I/O argument lists
 + Lesson IV: what is and is not supported with internal READ and
   WRITE statements
 + Lesson V: processing Unicode file names on OPEN() statements
 + Lesson VI: reading UTF-8 strings from command lines
 + Lesson VII: passing Unicode strings to and from C
 + Lesson VIII: related utility programs

off the beaten path:

 + Lesson I: UTF-8 source files -- just in comments and constants
 + Lesson II: the backslash escape code extension
 + Lesson III: converting between UCS-4 and UTF-8 with procedures
 + Lesson IV: embedding BOM characters at the beginning of files

Processing Unicode when ISO-10646 is not supported by a compiler

 + Lesson I: converting UTF-8 codes to and from INTEGER values
 + Lesson II: byte-oriented printing of 4-byte integers
 + Lesson III: issues with terminal emulators, system locale settings,
   and other Unicode-related issues
 + Lesson IV: working with ASCII extended encodings; particularly
   those commonly referred to as Extended, Latin, Latin1 and Latin2.

Perhaps subjective and overly ambitious given the time constraints I have, but I hope to make the documentation supplement the code being gathered
and provide a complete coverage of using Fortran in an increasingly Unicode world!

With emphasis on working code accessible as an fpm package that helps resolve some of the current difficulties I have encountered or seen described.

Any additional topics that stand out as requiring coverage?

So far …

2 Likes

Great job, thank you for the effort. Just one quick comment after looking at the first example, count_glyphs.f90. I guess it was already mentioned upthread that len= specifier gives the number of characters (regardless of their kind), not bytes. So in

! NOTE: this character variable is the Unicode kind, not ASCII
character(len=4096,kind=ucs4) :: uline ! specifies maximum line length of 4096 bytes,
                                       ! which might be as few as 1024 (ie. 4096/4) glyphs

the comments are misleading. To be convinced of this, it is enough to write out the uline variable to an unformatted file and check its size (16k+8 for two record lengths):

character(len=4096,kind=ucs4) :: uline ! specifies maximum line length of 4096 bytes,
                                       ! which might be as few as 1024 (ie. 4096/4) glyphs
!------
open(11,file='out',form='unformatted')
write(11) uline
end program count_glyphs

Edit: also, the phrase “which might be as few as …” seems to suggest that the ucs4 kind is variable-length like UTF-8, which AFAIU is not the case (4 bytes per character, fixed, though maybe it could depend on the implementation??? I am not quite sure)

1 Like

Yes. The comment was entirely wrong: a little too much cut and paste from another example I was working on, showing the use of multi-byte strings directly in utf-8 source files, but I want to get the by-the-book section done first. Still working on this, so I really appreciate the feedback. The day is coming when ASCII will be superseded by UTF-8 in almost all text files, so I need to sort this out myself; hopefully that will be of benefit to others as well.

I think Fortran needs to standardize the use of UTF-8 source files, perhaps by identifying them with the BOM string as appears to be the approach the NAG compiler has taken according to its documentation, particularly clarifying if the code itself will be normalized to ASCII to eliminate the confusion that Unicode can introduce when editing Fortran source, where an editor might replace ASCII 7-bit quotes with other multi-byte characters and so on.

If your environment supports utf-8 files it looks like the majority of modern Fortran compilers, with or without support for ISO-10646, can use the M_unicode module to process UTF-8 encoded data. Not seeing much of an indication of interest though. I was going to add support for the remaining intrinsic procedure names and an OOP version of type(unicode_type), and add user documentation, as I was getting enthusiastic about how portable the extension it depends on seems to be (all utf-8 encoded characters in strings and comments, ideally – it can be used without that but you lose the what-you-see-is-what-you-get convenience). Is this because of a lack of need, because compilers that do support ISO-10646 are providing adequate support, or distrust of using UTF-8 text files portably, or reservations about the code or ? ? The following
example code works with the library with the Intel compiler (which does not support Unicode Fortran extensions) quite nicely in my Linux/Unix environments and CygWin. Not sure yet about MSWindows or other environments

program assign_exe
! Exercises the M_unicode user-defined string type: assignment from a
! UTF-8 encoded default CHARACTER string and from an array of code points,
! and the overloaded LEN, LEN_TRIM, TRIM, REPEAT, RANGE and CHARACTER
! procedures.
use M_unicode, only : len, len_trim, repeat, trim
use M_unicode, only : character, range
use M_unicode, only : assignment(=), unicode_type
implicit none
character(len=*),parameter   :: g='(*(g0))'
character(len=:),allocatable :: aline
type(unicode_type)           :: uline, substring
character(len=*),parameter   :: smiley='😃'

   aline="Доки не впріти, доти не вміти."

   ! LEN of the default CHARACTER string counts bytes of the UTF-8 data;
   ! LEN of the unicode_type counts glyphs
   write(*,g)'123456789012345678901234567890'
   write(*,g)aline
   write(*,g)'length in bytes is: ',len(aline)
   uline=aline
   write(*,g)'length in glyphs is: ',len(uline)

   ! convert back to CHARACTER, whole string and a substring by glyph position
   write(*,g)'string is: ',character(uline)
   write(*,g)'third word is: ',character(uline,9,14) ! substring

   substring=range(uline,17,29)
   write(*,g)'string is: ',character(substring)

   uline=repeat(smiley,30)
   write(*,g) character(uline)

   write(*,g) len_trim(uline)
   uline=aline//'      '
   write(*,g) len_trim(uline)

   ! assignment from an array of integer code points
   uline=[32,160,8192,8193,8194,8195,8196,8197,8198,8199,8200,8201,8202,8239,8287,12288]
   write(*,*)'spaces:',character(uline),len(uline),len_trim(uline)

   uline=[32,160,8192,8193,8194,8195,8196,8197,8198,8199,8200,8201,8202,8239,8287,12288]
   uline=trim(uline)
   write(*,*)'trim:','[',character(uline),']'


   !write(*,g)uline%codes
   !write(*,g)uline
end program assign_exe

I still plan on finishing the project even if just for my own uses, but am wondering how far back on the burner to push it :red_question_mark: I thought Intel was missing the mark to not support Unicode Fortran features, but maybe not.

sorry if I miss something obvious but what module are you referencing to?
Google search for “M_unicode” fortran module returns no results. W/o the quotes it shows results ignoring the “M_” part.

The M_unicode module in the USE statement is part of the WIP mentioned above in M_unicode.f90

The other module (M_utf8.f90) assumes the compiler supports the optional Fortran ISO-10646 support, although it uses M_unicode.f90

After some experimenting, it became clear a stand-alone module not needing the optional Unicode support looked like a good solution for all compilers, and a nice supplement for those that support the optional Unicode features as well, as they provide no easy transfer of UCS4 to ASCII that retains the multi-byte characters some systems can use for Fortran options that require default kinds, which is typically ASCII. That is, if you convert a UCS4 string to ASCII using an assign the non-ASCII characters are replaced with a system-dependent character; but a lot of systems will take a stream of bytes holding UTF8 characters as things like file-names. The module supplies such procedures.

Being only a few days old on a lightly traversed site probably does not put it at the top of the Google data tree, but everything I mentioned is in the same github repository. I plan on putting it in the GPF tree ultimately (General Purpose Fortran) and as a fpm package in the fpm repository if there is interest, but the M_utf8.f90 module will probably stay separate, as it requires the optional Fortran Unicode support.

PS: it is done as a user-defined type in a way that it lets you have “string” arrays with the elements having different lengths as well, which is why LEN() is elemental where it does not have to be with regular CHARACTER arrays where all elements are the same length. Since ASCII is a subset of Unicode that gives you a nice generic type for doing ASCII and Unicode strings, or at least that is the idea.

1 Like

Apparently this link points to Not found. The right one is, if I read it right, this one.

If the community supplies a full replacement, with all intrinsics and operators overloaded, they will certainly not go for it anytime soon :smiley:

That could be solved by allowing encoding='utf-8' to be specified for internal units, as in the following nonconforming code:

program test
  ! Deliberately nonconforming sketch of a proposed feature: specifying an
  ! ENCODING= for an internal unit (a character variable) so that internal
  ! I/O would convert between UCS4 and UTF-8. It is not expected to compile.
  implicit none
  character(len=120) :: str
  ! the following line causes Error: UNIT tag at (1) must be of type INTEGER
  ! because str is a character variable, not an integer unit number
  open(unit=str,encoding='utf-8')
  ! do I/O converting UCS4-UTF8 and v.v.
end program test

That would probably require assigning attributes to strings that you would then have to track, but if it could be used on the WRITE and READ that would seem appropriate, or if you could assign a KIND to the string like “utf-8”. So maybe along those lines, but

    write(line,'(a)',encoding='utf-8') variable

more like “advance=‘no’”.

Well, not sure it works on anything other than GNU/Linux and Cygwin so far; but getting there as long as the compiler allows UTF-8 in comments and quoted strings. Corrected the link you mentioned.

Of the three proposed projects (Using Unicode in Fortran with ISO-10646; Unicode in Fortran without ISO-10646 support; A user guide to using ISO-10646 support in Fortran)

  • M_unicode Using Unicode in Fortran when utf-8 source files are supported

Is complete enough for an alpha release. SCAN(), VERIFY(), TOKENIZE() and SPLIT() support is not included yet, but the majority of intrinsics and operators are overloaded, as well as an OOP interface. My testing has only used ifx and gfortran on Linux and Cygwin but results are encouraging.

1 Like

The following is a bit misleading.

if(ucs4 /= -1 )then
      open (output_unit, encoding='UTF-8')
      write (*,*) trim (hello_world)
   else
      write (*,*) 'cannot use utf-8'
   endif

UTF-8 encoding can be used, selected_char_kind ('utf-8') does not return
the kind=4 parameter, but UTF-8 can be used without an issue. Perhaps,
in the case of gfortran, I should update the intrinsic:

selected_char_kind ('utf-8'). ;)

UTF-8 is an encoding scheme, it is agnostic regarding any particular
character set which depends on the users terminal and locale settings.

The standard indicates automatic conversion from internal encodings like UCS4 to UTF-8 byte streams when ENCODING=‘UTF-8’ is used. It is free to have other effects.

In some environments output to stdout defaults to writing byte streams as extended ASCII or LatinN on the display but switches to displaying using Unicode glyphs when ENCODING=‘UTF-8’ is used.
ifx does not support UCS4 internal encoding (ie. KIND=‘ISO_10646’) but does allow ENCODING=‘UTF-8’ and, although it is not documented anywhere that I can see, it begins interpreting byte streams as UTF-8 and displaying them with a Unicode font.

Depending on how you read it the standard might start printing non-ASCII characters stored in default CHARACTER variables or string constants with a compiler-specific character representing characters as non-ASCII7 but on anything I tried it on it displays byte streams as-is, interpreting them as UTF-8 data.

So in practice most if not all Fortran compilers seem to be able to display bytes assuming they represent UTF-8 data in a WYSIWYG manner. The difference is whether you can use intrinsics just as with default and ASCII CHARACTER variables; whether the byte streams can be converted to KIND=‘ISO_10646’ type variables from command line arguments or be used on OPEN() and INQUIRE() statements and so on.

In the environments I use the M_unicode module does that and more when using ifx, even though ifx does not have the Unicode extension.

If the extension is supported as with gfortran and NAG and flang there are dusty corners like converting command line arguments and constant strings that are ASCII or UTF-8 encoded into UCS4 variables that the M_utf8 module helps with.

I plan on making a compatible user-defined type for the M_utf8 module so the modules can be interchanged to take advantage of the UCS4 support where available, but also work when not. I might regroup a little as I was just discussing with someone that stdlib also has a user-defined type for processing ragged arrays of strings and
that maybe the M_utf8/M_unicode interfaces could be combined with that to provide a possibly transparent ability to have stdlib support various encodings. Not sure yet at all if that is feasible, perhaps by extending the type but it sounds appealing on the surface.

The M_unicode module is working well for what I need it for now; and did not garner the interest I hoped so proceeding on a as-time-permits basis but it is interesting working with Unicode. Things I take for granted, such as the direction of text, the meaning of upper-case and lower-case, particularly for the letter “i”, parsing lines mixing ideographs and Latin glyphs, … all are a little more complicated than they first appear :grinning_face_with_smiling_eyes:

The following demonstration code works.

ian@debian:~/test3$ gfortran test1.f -ffree-form
ian@debian:~/test3$ ./a.out
Fortran says: ¡Hola café — こんにちは — π≈3.14159
ian@debian:~/test3$ cat test1.f
program hello_f
! Prints a string constant containing UTF-8 encoded characters.
implicit none
! UTF-8 in Fortran source is fine; no special input-charset flag needed.
! The literal must be delimited by plain ASCII apostrophes (or double
! quotes); typographic quotes are not string delimiters.
character(len=*), parameter :: msg = 'Fortran says: ¡Hola café — こんにちは — π≈3.14159'
print *, trim(msg)
end program hello_f

Until very recent releases of gfortran a line limit of 132 characters
was the default, which was interpreted as 132 ASCII characters or bytes.
With these older versions I would recommend turning off that limit as
many Unicode characters are multi-byte. The default makes it hard to adhere to
the 132-byte limit because you will generally see fewer glyphs than there
are bytes. For gfortran the switch is

-ffree-line-length-none     Allow arbitrary character line width in free mode.

Most Fortran compilers do not yet explicitly support UTF-8-encoded data
in source files, they just do not disallow it.

So far I do not know of any that do not treat the UTF-8 encoded strings as
default CHARACTER-type bytes. So I would recommend not using list-directed
I/O containing UTF-8 encoded strings, as the lines might be broken in the
middle of a glyph definition.

So the example becomes

ian@debian:~/test3$ gfortran test1.f -ffree-form --free-line-length-none
ian@debian:~/test3$ ./a.out
Fortran says: ¡Hola café — こんにちは — π≈3.14159
ian@debian:~/test3$ cat test1.f
program hello_f
! Prints a string constant containing UTF-8 encoded characters using an
! explicit '(a)' format instead of list-directed output.
implicit none
! UTF-8 in Fortran source is fine; no special input-charset flag needed.
! The literal must be delimited by plain ASCII apostrophes (or double
! quotes); typographic quotes are not string delimiters.
character(len=*), parameter :: msg = 'Fortran says: ¡Hola café — こんにちは — π≈3.14159'
print '(a)', trim(msg)
end program hello_f

Otherwise that is indeed sufficient for just echoing the UTF-8-encoded
strings on systems that default to UTF-8 encoding.

But if you actually want to manipulate the data (until a compiler supports
UTF-8-encoded kinds directly) you have to convert the strings to ISO-10646
character kind if the compiler supports it or use procedures that handle
the UTF-8-encoded multi-byte data properly.

The M_unicode module mentioned
above supports all the intrinsics with a user-defined type that does
not require ISO-10646 support with an OOP interface to the intrinsics
and other procedures such as UPPER(), LOWER(), SORT(), and REPLACE()
that will work with UTF-8-encoded data that has been tested in at least
the following programming environments:

  ✓ macos-latest_gfortran
  ✓ macos-latest_gfortran_cmake
  ✓ macos-latest_gfortran_fpm

  ✓ ubuntu-latest_flang-new
  ✓ ubuntu-latest_flang-new_cmake
  ✓ ubuntu-latest_flang-new_fpm

  ✓ ubuntu-latest_gfortran
  ✓ ubuntu-latest_gfortran_cmake
  ✓ ubuntu-latest_gfortran_fpm

  ✓ ubuntu-latest_ifx_cmake
  ✓ ubuntu-latest_ifx_fpm

  ✓ windows-latest_flang-new_cmake
  ✓ windows-latest_flang-new_fpm

  ✓ windows-latest_gfortran_cmake
  ✓ windows-latest_gfortran_fpm

  ✓ windows-latest_ifx_cmake
  ✓ windows-latest_ifx_fpm

Sample program using M_unicode

An example program shows some basic usage of the M_unicode module

      program demo_M_unicode
      ! Basic usage of the M_unicode module: build a unicode_type string
      ! from UTF-8 encoded constants, then change case, tokenize and
      ! replace text, using both procedural and OOP syntax.
      use,intrinsic :: iso_fortran_env, only : stdout=>output_unit
      use M_unicode,only : TOKENIZE, REPLACE, CHARACTER, UPPER, LOWER, LEN
      use M_unicode,only : unicode_type, assignment(=), operator(//)
      use M_unicode,only : ut => unicode_type, ch => character
      use M_unicode,only : read(formatted), write(formatted)
      implicit none
      type(unicode_type)             :: string, numeric, uppercase, lowercase
      type(unicode_type),allocatable :: array(:)
      ! '(g0)' prints one item per record for intrinsic types;
      ! '(DT)' selects the defined formatted output of unicode_type
      character(len=*),parameter     :: all='(g0)'
      character(len=*),parameter     :: uni='(DT)'

         uppercase='АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ'
         lowercase='абвгґдеєжзиіїйклмнопрстуфхцчшщьюя'
         numeric='0123456789'

         string=uppercase//' '//numeric//' '//lowercase

         print all, 'Original string:'
         print all, ch(string)
         print all, 'length in bytes :',len(string%character())
         print all, 'length in glyphs:',len(string)
         print all

         print all, 'convert to all uppercase:'
         print uni, UPPER(string)
         print all

         print all, 'convert to all lowercase:'
         print uni, LOWER(string)
         print uni, string%lower() ! using OOP instead of procedural syntax
         print all

         print all, 'tokenize on spaces ... '
         call TOKENIZE(string,ut(' '),array)
         print all, '... writing with A or G format:',character(array)
         print uni, ut('... writing with DT format'),array
         print all

         print all, 'case-insensitive replace:'
         print uni,  REPLACE(string, &
         & ut('клмнопрс'), &
         & ut('--------'), &
         & ignorecase=.true.)
         print all

      end program demo_M_unicode

Expected output

Original string:
АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ 0123456789 абвгґдеєжзиіїйклмнопрстуфхцчшщьюя
length in bytes :
144
length in glyphs:
78

convert to all uppercase:
АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ 0123456789 АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ

convert to all lowercase:
абвгґдеєжзиіїйклмнопрстуфхцчшщьюя 0123456789 абвгґдеєжзиіїйклмнопрстуфхцчшщьюя
абвгґдеєжзиіїйклмнопрстуфхцчшщьюя 0123456789 абвгґдеєжзиіїйклмнопрстуфхцчшщьюя

tokenize on spaces ...
... writing with A or G format:
АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ
0123456789
абвгґдеєжзиіїйклмнопрстуфхцчшщьюя
... writing with DT format
АБВГҐДЕЄЖЗИІЇЙКЛМНОПРСТУФХЦЧШЩЬЮЯ
0123456789
абвгґдеєжзиіїйклмнопрстуфхцчшщьюя

case-insensitive replace:
АБВГҐДЕЄЖЗИІЇЙ--------ТУФХЦЧШЩЬЮЯ 0123456789 абвгґдеєжзиіїй--------туфхцчшщьюя