How to use utf-8 in gfortran?

A complete module to manage unicode, isolatin and utf8


! Conversions between ISO-8859-15 ("isolatin"), UTF-8, UTF-32 and plain
! integer Unicode code points.
!
! Strings are passed as rank-1 arrays of single characters: one element per
! byte for isolatin and UTF-8, one element per code point for UTF-32.
! Every public routine returns nerr, the number of input items that were
! invalid or not representable and therefore had to be replaced or
! reinterpreted during the conversion.
module odessa_unicode

   implicit none

   private
   public :: u32 ! character type for UTF-32 strings
   public :: utf8_to_unicode
   public :: isolatin_to_unicode
   public :: unicode_to_utf8
   public :: unicode_to_isolatin
   public :: utf8_to_isolatin
   public :: isolatin_to_utf8
   public :: utf8_to_utf32
   public :: utf32_to_utf8
   public :: isolatin_to_utf32
   public :: utf32_to_isolatin

   integer,parameter :: u32 = selected_char_kind ('ISO_10646')

   ! Character written in place of anything the target encoding cannot hold.
   character, parameter :: replacement_char = '?'

   ! Upper end of the Unicode code space (U+10FFFF) and the surrogate range
   ! (U+D800..U+DFFF), which must never appear as a code point in UTF-8.
   integer, parameter :: max_code_point  = 1114111
   integer, parameter :: first_surrogate = 55296
   integer, parameter :: last_surrogate  = 57343

contains

! Value in 0..255 of a default-kind character taken as a raw byte.
! Some processors report bytes above 127 as negative numbers; fold them back.
elemental integer function byte_value(c) result(b)

   character, intent(in) :: c

   b = iachar(c)
   if (b < 0) b = b + 256

end function byte_value

! Unicode code point of the ISO-8859-15 byte value b.
! Only eight positions differ from Unicode; every other byte maps to itself.
elemental integer function latin9_to_code_point(b) result(cp)

   integer, intent(in) :: b

   select case (b)
   case (164) ! Euro sign
      cp = 8364
   case (166) ! S caron
      cp = 352
   case (168) ! s caron
      cp = 353
   case (180) ! Z caron
      cp = 381
   case (184) ! z caron
      cp = 382
   case (188) ! capital OE ligature
      cp = 338
   case (189) ! small oe ligature
      cp = 339
   case (190) ! Y diaeresis
      cp = 376
   case default
      cp = b
   end select

end function latin9_to_code_point

! Decode an ISO-8859-15 byte string into Unicode code points.
!   isolatin : input bytes
!   unicode  : allocated on return, one code point per input byte
!   nerr     : always 0 (every byte has a code point)
subroutine isolatin_to_unicode(isolatin,unicode,nerr)

   character           ,intent(in)  :: isolatin(:)
   integer, allocatable,intent(out) :: unicode(:)
   integer             ,intent(out) :: nerr

   nerr = 0

   allocate(unicode(size(isolatin)))
   unicode(:) = latin9_to_code_point(byte_value(isolatin))

end subroutine isolatin_to_unicode

! Encode Unicode code points as ISO-8859-15 bytes.
!   unicode  : input code points
!   isolatin : allocated on return, one byte per code point
!   nerr     : number of code points that have no ISO-8859-15 equivalent;
!              each of them is written as '?'
subroutine unicode_to_isolatin(unicode,isolatin,nerr)

   integer              ,intent(in)  :: unicode(:)
   character,allocatable,intent(out) :: isolatin(:)
   integer              ,intent(out) :: nerr

   integer :: i, cp

   nerr = 0

   allocate(isolatin(size(unicode)))

   do i = 1, size(unicode)
      cp = unicode(i)
      select case (cp)
      ! the 8 characters whose byte value differs from the code point
      case (8364) ! Euro sign
         isolatin(i) = achar(164)
      case (352)  ! S caron
         isolatin(i) = achar(166)
      case (353)  ! s caron
         isolatin(i) = achar(168)
      case (381)  ! Z caron
         isolatin(i) = achar(180)
      case (382)  ! z caron
         isolatin(i) = achar(184)
      case (338)  ! capital OE ligature
         isolatin(i) = achar(188)
      case (339)  ! small oe ligature
         isolatin(i) = achar(189)
      case (376)  ! Y diaeresis
         isolatin(i) = achar(190)
      ! identity mapping; the 8 byte positions reused above are excluded
      ! because their Unicode meaning does not exist in ISO-8859-15
      case (0:163, 165, 167, 169:179, 181:183, 185:187, 191:255)
         isolatin(i) = achar(cp)
      case default
         nerr = nerr + 1
         isolatin(i) = replacement_char
      end select
   end do

end subroutine unicode_to_isolatin

! Convert an ISO-8859-15 byte string to UTF-8.
! nerr is the sum of the errors reported by both conversion stages.
subroutine isolatin_to_utf8(isolatin,utf8,nerr)

   character            ,intent(in)  :: isolatin(:)
   character,allocatable,intent(out) :: utf8(:)
   integer              ,intent(out) :: nerr

   integer,allocatable :: unicode(:)
   integer             :: nerr_decode, nerr_encode

   call isolatin_to_unicode(isolatin,unicode,nerr_decode)
   call unicode_to_utf8(unicode,utf8,nerr_encode)
   nerr = nerr_decode + nerr_encode

end subroutine isolatin_to_utf8

! Convert a UTF-8 byte string to ISO-8859-15.
! nerr is the sum of the decoding errors (bytes that were not valid UTF-8)
! and the encoding errors (code points without an ISO-8859-15 equivalent).
! Separate counters are required: both stages declare nerr intent(out), so
! passing the same variable twice would discard the decoding count.
subroutine utf8_to_isolatin(utf8,isolatin,nerr)

   character            ,intent(in)  :: utf8(:)
   character,allocatable,intent(out) :: isolatin(:)
   integer              ,intent(out) :: nerr

   integer,allocatable :: unicode(:)
   integer             :: nerr_decode, nerr_encode

   call utf8_to_unicode(utf8,unicode,nerr_decode)
   call unicode_to_isolatin(unicode,isolatin,nerr_encode)
   nerr = nerr_decode + nerr_encode

end subroutine utf8_to_isolatin

! Encode Unicode code points as UTF-8 bytes.
!   unicode : input code points
!   utf8    : allocated on return to the exact encoded length
!   nerr    : number of code points that are not encodable (negative,
!             surrogate, or above U+10FFFF); each is written as '?'
subroutine unicode_to_utf8(unicode,utf8,nerr)

   integer  ,intent(in)              :: unicode(:)
   character,allocatable,intent(out) :: utf8(:)
   integer              ,intent(out) :: nerr

   integer :: i, n_utf8, cp
   character, allocatable :: buffer(:)

   nerr = 0

   ! a code point needs at most 4 bytes
   allocate(buffer(4*size(unicode)))
   n_utf8 = 0

   do i = 1, size(unicode)
      cp = unicode(i)

      select case (cp)
      case (0:127) ! 1 byte : 0xxxxxxx
         buffer(n_utf8+1) = achar(cp)
         n_utf8 = n_utf8 + 1

      case (128:2047) ! 2 bytes : 110xxxxx 10xxxxxx
         buffer(n_utf8+1) = achar(ior(192, ishft(cp, -6)))
         buffer(n_utf8+2) = achar(ior(128, iand(cp, 63)))
         n_utf8 = n_utf8 + 2

      ! 3 bytes : 1110xxxx 10xxxxxx 10xxxxxx (surrogates fall to the default)
      case (2048:first_surrogate-1, last_surrogate+1:65535)
         buffer(n_utf8+1) = achar(ior(224, ishft(cp, -12)))
         buffer(n_utf8+2) = achar(ior(128, iand(ishft(cp, -6), 63)))
         buffer(n_utf8+3) = achar(ior(128, iand(cp, 63)))
         n_utf8 = n_utf8 + 3

      case (65536:max_code_point) ! 4 bytes : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         buffer(n_utf8+1) = achar(ior(240, ishft(cp, -18)))
         buffer(n_utf8+2) = achar(ior(128, iand(ishft(cp, -12), 63)))
         buffer(n_utf8+3) = achar(ior(128, iand(ishft(cp, -6), 63)))
         buffer(n_utf8+4) = achar(ior(128, iand(cp, 63)))
         n_utf8 = n_utf8 + 4

      case default
         nerr = nerr + 1
         buffer(n_utf8+1) = replacement_char
         n_utf8 = n_utf8 + 1
      end select
   end do

   allocate(utf8(n_utf8))
   utf8(:) = buffer(1:n_utf8)

end subroutine unicode_to_utf8

! Decode a UTF-8 byte string into Unicode code points.
!   utf8    : input bytes
!   unicode : allocated on return, one element per decoded character
!   nerr    : number of bytes that did not start a well-formed sequence
!
! A sequence is rejected when its lead byte is invalid, it is truncated, a
! continuation byte is not of the form 10xxxxxx, it is an overlong encoding,
! or it decodes to a surrogate or to a value above U+10FFFF. In each of these
! cases the lead byte alone is interpreted as an ISO-8859-15 character and
! decoding resumes at the next byte, so this routine is also able to decode
! an ISO-8859-15 string.
subroutine utf8_to_unicode(utf8,unicode,nerr)

   character            ,intent(in)  :: utf8(:)
   integer  ,allocatable,intent(out) :: unicode(:)
   integer              ,intent(out) :: nerr

   integer             :: i, k, len8, n_out
   integer             :: b1, bk, cp, nbytes, min_cp
   logical             :: valid
   integer,allocatable :: temp(:)

   nerr = 0

   len8 = size(utf8)
   n_out = 0
   allocate(temp(len8)) ! a character uses at least one byte, so this is enough

   i = 1
   do while (i <= len8)

      b1 = byte_value(utf8(i))

      ! The lead byte fixes the sequence length, the payload bits it carries
      ! and the smallest code point that legitimately needs that length.
      select case (b1)
      case (0:127)
         nbytes = 1
         cp = b1
         min_cp = 0
      case (194:223)   ! 192 and 193 could only start overlong sequences
         nbytes = 2
         cp = iand(b1, 31)
         min_cp = 128
      case (224:239)
         nbytes = 3
         cp = iand(b1, 15)
         min_cp = 2048
      case (240:244)   ! 245..247 could only yield values above U+10FFFF
         nbytes = 4
         cp = iand(b1, 7)
         min_cp = 65536
      case default
         nbytes = 0
         cp = 0
         min_cp = 0
      end select

      valid = nbytes > 0 .and. i + nbytes - 1 <= len8

      if (valid) then
         do k = 1, nbytes - 1
            bk = byte_value(utf8(i+k))
            if (iand(bk, 192) /= 128) then
               valid = .false.
               exit
            end if
            cp = ishft(cp, 6) + iand(bk, 63)
         end do
      end if

      if (valid) then
         valid = cp >= min_cp .and. cp <= max_code_point .and. &
                 .not. (cp >= first_surrogate .and. cp <= last_surrogate)
      end if

      if (.not. valid) then
         ! heuristic: treat the offending byte as an ISO-8859-15 character
         nerr = nerr + 1
         cp = latin9_to_code_point(b1)
         nbytes = 1
      end if

      n_out = n_out + 1
      temp(n_out) = cp
      i = i + nbytes

   end do

   allocate(unicode(n_out))
   unicode(:) = temp(1:n_out)

end subroutine utf8_to_unicode

! Convert a UTF-8 byte string to an array of UTF-32 characters.
! nerr is the error count reported by utf8_to_unicode.
subroutine utf8_to_utf32(utf8,utf32,nerr)

   character                      ,intent(in)  :: utf8(:)
   character(kind=u32),allocatable,intent(out) :: utf32(:)
   integer                        ,intent(out) :: nerr

   integer,allocatable :: unicode(:)

   call utf8_to_unicode(utf8,unicode,nerr)
   allocate(utf32(size(unicode)))
   utf32(:) = char(unicode, kind=u32)

end subroutine utf8_to_utf32

! Convert an ISO-8859-15 byte string to an array of UTF-32 characters.
! nerr is the error count reported by isolatin_to_unicode.
subroutine isolatin_to_utf32(isolatin,utf32,nerr)

   character                      ,intent(in)  :: isolatin(:)
   character(kind=u32),allocatable,intent(out) :: utf32(:)
   integer                        ,intent(out) :: nerr

   integer,allocatable :: unicode(:)

   call isolatin_to_unicode(isolatin,unicode,nerr)
   allocate(utf32(size(unicode)))
   utf32(:) = char(unicode, kind=u32)

end subroutine isolatin_to_utf32

! Convert an array of UTF-32 characters to ISO-8859-15 bytes.
! nerr is the error count reported by unicode_to_isolatin.
subroutine utf32_to_isolatin(utf32,isolatin,nerr)

   character(kind=u32)            ,intent(in)  :: utf32(:)
   character          ,allocatable,intent(out) :: isolatin(:)
   integer                        ,intent(out) :: nerr

   call unicode_to_isolatin(ichar(utf32),isolatin,nerr)

end subroutine utf32_to_isolatin

! Convert an array of UTF-32 characters to UTF-8 bytes.
! nerr is the error count reported by unicode_to_utf8.
subroutine utf32_to_utf8(utf32,utf8,nerr)

   character(kind=u32)            ,intent(in)  :: utf32(:)
   character          ,allocatable,intent(out) :: utf8(:)
   integer                        ,intent(out) :: nerr

   call unicode_to_utf8(ichar(utf32),utf8,nerr)

end subroutine utf32_to_utf8

end module odessa_unicode
2 Likes

I am wondering if there are topics other than those listed below in the syllabi regarding character encoding that are commonly encountered.

Although the results so far might be better called “using Unicode with gfortran on Linux and Cygwin”, I started some material to go with the github repository. Ultimately it will be more general if I get setup on MSWindows and additional compilers. But the proposed topics currently under
consideration are

Introduction to Fortran Unicode support

 + Lesson I: reading and writing UTF-8 Unicode files
 + Lesson II: creating Unicode strings in ASCII Fortran source
   files
 + Lesson III: mixing ASCII and UCS4 kinds as regards assignments,
   concatenation, passing arguments to external ASCII libraries, and
   I/O argument lists
 + Lesson IV: what is and is not supported with internal READ and
   WRITE statements
 + Lesson V: processing Unicode file names on OPEN() statements
 + Lesson VI: reading UTF-8 strings from command lines
 + Lesson VII: passing Unicode strings to and from C
 + Lesson VIII: related utility programs

off the beaten path:

 + Lesson I: UTF-8 source files -- just in comments and constants
 + Lesson II: the backslash escape code extension
 + Lesson III: converting between UCS-4 and UTF-8 with procedures
 + Lesson IV: embedding BOM characters at the beginning of files

Processing Unicode when ISO-10646 is not supported by a compiler

 + Lesson I: converting UTF-8 codes to and from INTEGER values
 + Lesson II: byte-oriented printing of 4-byte integers
 + Lesson III: issues with terminal emulators, system locale settings,
   and other Unicode-related issues
 + Lesson IV: working with ASCII extended encodings; particularly
   those commonly referred to as Extended, Latin, Latin1 and Latin2.

Perhaps subjective and overly ambitious given the time constraints I have, but I hope to make the documentation supplement the code being gathered
and provide a complete coverage of using Fortran in an increasingly Unicode world!

The emphasis is on working code, accessible as an fpm package, that helps resolve some of the current difficulties I have encountered or seen described.

Any additional topics that stand out as requiring coverage?

So far …

2 Likes

Great job, thank you for the effort. Just one quick comment after looking at the first example, count_glyphs.f90. I guess it was already mentioned upthread that len= specifier gives the number of characters (regardless of their kind), not bytes. So in

! NOTE: this character variable is the Unicode kind, not ASCII
character(len=4096,kind=ucs4) :: uline ! specifies maximum line length of 4096 bytes,
                                       ! which might be as few as 1024 (ie. 4096/4) glyphs

the comments are misleading. To be convinced of this, it is enough to write out the uline variable to an unformatted file and check its size (16k+8 for two record lengths):

character(len=4096,kind=ucs4) :: uline ! specifies maximum line length of 4096 bytes,
                                       ! which might be as few as 1024 (ie. 4096/4) glyphs
!------
open(11,file='out',form='unformatted')
write(11) uline
end program count_glyphs

Edit: also, the "which might be as few as …" wording seems to suggest that the ucs4 kind is variable-length like UTF-8, which AFAIU is not the case (4 bytes per character, fixed, though maybe it could depend on the implementation??? I am not quite sure)

1 Like

Yes. The comment was entirely wrong. A little too much cut and paste from another example I was working on, showing the use of multi-byte strings directly in utf-8 source files, but I want to get the by-the-book section done first. Still working on this, so I really appreciate the feedback. The day is coming when ASCII will be superseded by UTF-8 in almost all text files, so I need to sort this out myself; hopefully that will be of benefit to others as well.

I think Fortran needs to standardize the use of UTF-8 source files, perhaps by identifying them with the BOM string as appears to be the approach the NAG compiler has taken according to its documentation, particularly clarifying if the code itself will be normalized to ASCII to eliminate the confusion that Unicode can introduce when editing Fortran source, where an editor might replace ASCII 7-bit quotes with other multi-byte characters and so on.

If your environment supports utf-8 files, it looks like the majority of modern Fortran compilers, with or without support for ISO-10646, can use the M_unicode module to process UTF-8 encoded data. Not seeing much of an indication of interest though. I was going to add support for the remaining intrinsic procedure names and an OOP version of type(unicode_type), and add user documentation, as I was getting enthusiastic about how portable the extension it depends on seems to be (all utf-8 encoded characters in strings and comments, ideally – it can be used without that, but you lose the what-you-see-is-what-you-get convenience). Is this because of a lack of need, because compilers that do support ISO-10646 are providing adequate support, distrust of using UTF-8 text files portably, reservations about the code, or something else? The following
example code works with the library with the Intel compiler (which does not support Unicode Fortran extensions) quite nicely in my Linux/Unix environments and CygWin. Not sure yet about MSWindows or other environments.

program assign_exe
! Demonstration of the M_unicode module: a UTF-8 encoded default-kind string
! is assigned to a type(unicode_type) variable and then queried and sliced
! per glyph rather than per byte, using the module's overloads of the usual
! intrinsic names.
! NOTE(review): the exact semantics of the M_unicode procedures are not
! visible here; the comments below describe the apparent intent only.
use M_unicode, only : len, len_trim, repeat, trim, character, range
use M_unicode, only : assignment(=), unicode_type
implicit none
character(len=*),parameter   :: g='(*(g0))'
character(len=:),allocatable :: aline
type(unicode_type)           :: uline, substring
character(len=*),parameter   :: smiley='😃'

   aline="Доки не впріти, доти не вміти."

   ! ruler line so the glyph positions of the text below can be read off
   write(*,g)'123456789012345678901234567890'
   write(*,g)aline
   write(*,g)'length in bytes is: ',len(aline)
   uline=aline
   write(*,g)'length in glyphs is: ',len(uline)

   write(*,g)'string is: ',character(uline)
   write(*,g)'third word is: ',character(uline,9,14) ! substring

   substring=range(uline,17,29)
   write(*,g)'string is: ',character(substring)

   uline=repeat(smiley,30)
   write(*,g) character(uline)

   write(*,g) len_trim(uline)
   uline=aline//'      '
   write(*,g) len_trim(uline)

   ! code points of Unicode space characters: U+0020, U+00A0, U+2000..U+200A,
   ! U+202F, U+205F and U+3000
   uline=[32,160,8192,8193,8194,8195,8196,8197,8198,8199,8200,8201,8202,8239,8287,12288]
   write(*,*)'spaces:',character(uline),len(uline),len_trim(uline)

   uline=[32,160,8192,8193,8194,8195,8196,8197,8198,8199,8200,8201,8202,8239,8287,12288]
   uline=trim(uline)
   write(*,*)'trim:','[',character(uline),']'


   !write(*,g)uline%codes
   !write(*,g)uline
end program assign_exe

I still plan on finishing the project even if just for my own uses, but am wondering how far back on the burner to push it :red_question_mark: I thought Intel was missing the mark to not support Unicode Fortran features, but maybe not.

Sorry if I am missing something obvious, but which module are you referring to?
Google search for “M_unicode” fortran module returns no results. W/o the quotes it shows results ignoring the “M_” part.

The M_unicode module in the USE statement is part of the WIP mentioned above in M_unicode.f90

The other module (M_utf8.f90) assumes the compiler supports the optional Fortran ISO-10646 support, although it uses M_unicode.f90

After some experimenting, it became clear a stand-alone module not needing the optional Unicode support looked like a good solution for all compilers, and a nice supplement for those that support the optional Unicode features as well, as they provide no easy transfer of UCS4 to ASCII that retains the multi-byte characters some systems can use for Fortran options that require default kinds, which is typically ASCII. That is, if you convert a UCS4 string to ASCII using an assign the non-ASCII characters are replaced with a system-dependent character; but a lot of systems will take a stream of bytes holding UTF8 characters as things like file-names. The module supplies such procedures.

Being only a few days old on a lightly traversed site probably does not put it at the top of the Google data tree, but everything I mentioned is in the same github repository. I plan on putting it in the GPF tree ultimately (General Purpose Fortran) and as a fpm package in the fpm repository if there is interest, but the M_utf8.f90 module will probably stay separate, as it requires the optional Fortran Unicode support.

PS: it is done as a user-defined type in a way that it lets you have “string” arrays with the elements having different lengths as well, which is why LEN() is elemental where it does not have to be with regular CHARACTER arrays where all elements are the same length. Since ASCII is a subset of Unicode that gives you a nice generic type for doing ASCII and Unicode strings, or at least that is the idea.

1 Like

Apparently this link points to Not found. The right one is, if I read it right, this one.

If the community supplies a full replacement, with all intrinsics and operators overloaded, they will certainly not go for it anytime soon :smiley:

That could be solved by allowing encoding='utf-8' to be specified for internal units, like in the following nonconforming code:

program test
  ! Sketch of a proposed extension: opening a character variable (an internal
  ! unit) with an encoding. This is intentionally non-conforming and is
  ! rejected by the compiler, as noted on the OPEN statement below.
  implicit none
  character(len=120) :: str
  ! the following line causes Error: UNIT tag at (1) must be of type INTEGER
  open(unit=str,encoding='utf-8')
  ! do I/O converting UCS4-UTF8 and v.v.
end program test

That would probably require assigning attributes to strings that you would then have to track, but if it could be used on the WRITE and READ that would seem appropriate, or if you could assign a KIND to the string like “utf-8”. So maybe along those lines, but

    write(line,'(a)',encoding='utf-8') variable

more like “advance=‘no’”.

Well, not sure it works on anything other than GNU/Linux and Cygwin so far; but getting there as long as the compiler allows UTF-8 in comments and quoted strings. Corrected the link you mentioned.

Of the three proposed projects (Using Unicode in Fortran with ISO-10646; Unicode in Fortran without ISO-10646 support; A user guide to using ISO-10646 support in Fortran)

  • M_unicode Using Unicode in Fortran when utf-8 source files are supported

Is complete enough for an alpha release. SCAN(), VERIFY(), TOKENIZE() and SPLIT() support is not included yet, but the majority of intrinsics and operators are overloaded, as well as an OOP interface. My testing has only used ifx and gfortran on Linux and Cygwin but results are encouraging.

1 Like