fortplot_unicode.f90 Source File

Unicode text processing utilities for raster rendering

This module provides Unicode-to-ASCII conversion functionality for raster rendering backends that need text fallback.



Source Code

!! Unicode text processing utilities for raster rendering
!!
!! This module provides Unicode-to-ASCII conversion functionality
!! for raster rendering backends that need text fallback.
module fortplot_unicode
    implicit none
    private
    
    public :: escape_unicode_for_raster
    public :: unicode_codepoint_to_ascii
    public :: codepoint_to_lowercase_greek
    public :: codepoint_to_uppercase_greek
    public :: codepoint_to_default_placeholder
    public :: utf8_to_codepoint
    public :: utf8_char_length
    public :: contains_unicode, is_unicode_char, check_utf8_sequence, is_greek_letter_codepoint

contains

    subroutine escape_unicode_for_raster(input_text, escaped_text)
        !! Pass through Unicode for raster rendering (STB TrueType supports Unicode)
        character(len=*), intent(in) :: input_text
        character(len=*), intent(out) :: escaped_text
        
        ! STB TrueType can handle Unicode directly, so just pass through
        escaped_text = input_text
    end subroutine escape_unicode_for_raster

    subroutine unicode_codepoint_to_ascii(codepoint, ascii_equiv)
        !! Convert Unicode codepoint to ASCII equivalent
        integer, intent(in) :: codepoint
        character(len=*), intent(out) :: ascii_equiv
        
        ! Try lowercase Greek first, then uppercase, then default
        if (codepoint_to_lowercase_greek(codepoint, ascii_equiv)) return
        if (codepoint_to_uppercase_greek(codepoint, ascii_equiv)) return
        call codepoint_to_default_placeholder(codepoint, ascii_equiv)
    end subroutine unicode_codepoint_to_ascii
    
    logical function codepoint_to_lowercase_greek(codepoint, ascii_equiv)
        !! Convert lowercase Greek codepoint to ASCII name
        integer, intent(in) :: codepoint
        character(len=*), intent(out) :: ascii_equiv
        
        codepoint_to_lowercase_greek = .true.
        select case (codepoint)
        case (945) ! α
            ascii_equiv = "alpha"
        case (946) ! β
            ascii_equiv = "beta"
        case (947) ! γ
            ascii_equiv = "gamma"
        case (948) ! δ
            ascii_equiv = "delta"
        case (949) ! ε
            ascii_equiv = "epsilon"
        case (950) ! ζ
            ascii_equiv = "zeta"
        case (951) ! η
            ascii_equiv = "eta"
        case (952) ! θ
            ascii_equiv = "theta"
        case (953) ! ι
            ascii_equiv = "iota"
        case (954) ! κ
            ascii_equiv = "kappa"
        case (955) ! λ
            ascii_equiv = "lambda"
        case (956) ! μ
            ascii_equiv = "mu"
        case (957) ! ν
            ascii_equiv = "nu"
        case (958) ! ξ
            ascii_equiv = "xi"
        case (959) ! ο
            ascii_equiv = "omicron"
        case (960) ! π
            ascii_equiv = "pi"
        case (961) ! ρ
            ascii_equiv = "rho"
        case (963) ! σ
            ascii_equiv = "sigma"
        case (964) ! τ
            ascii_equiv = "tau"
        case (965) ! υ
            ascii_equiv = "upsilon"
        case (966) ! φ
            ascii_equiv = "phi"
        case (967) ! χ
            ascii_equiv = "chi"
        case (968) ! ψ
            ascii_equiv = "psi"
        case (969) ! ω
            ascii_equiv = "omega"
        case default
            codepoint_to_lowercase_greek = .false.
        end select
    end function codepoint_to_lowercase_greek
    
    logical function codepoint_to_uppercase_greek(codepoint, ascii_equiv)
        !! Convert uppercase Greek codepoint to ASCII name
        integer, intent(in) :: codepoint
        character(len=*), intent(out) :: ascii_equiv
        
        codepoint_to_uppercase_greek = .true.
        select case (codepoint)
        case (913) ! Α
            ascii_equiv = "Alpha"
        case (914) ! Β
            ascii_equiv = "Beta"
        case (915) ! Γ
            ascii_equiv = "Gamma"
        case (916) ! Δ
            ascii_equiv = "Delta"
        case (917) ! Ε
            ascii_equiv = "Epsilon"
        case (918) ! Ζ
            ascii_equiv = "Zeta"
        case (919) ! Η
            ascii_equiv = "Eta"
        case (920) ! Θ
            ascii_equiv = "Theta"
        case (921) ! Ι
            ascii_equiv = "Iota"
        case (922) ! Κ
            ascii_equiv = "Kappa"
        case (923) ! Λ
            ascii_equiv = "Lambda"
        case (924) ! Μ
            ascii_equiv = "Mu"
        case (925) ! Ν
            ascii_equiv = "Nu"
        case (926) ! Ξ
            ascii_equiv = "Xi"
        case (927) ! Ο
            ascii_equiv = "Omicron"
        case (928) ! Π
            ascii_equiv = "Pi"
        case (929) ! Ρ
            ascii_equiv = "Rho"
        case (931) ! Σ
            ascii_equiv = "Sigma"
        case (932) ! Τ
            ascii_equiv = "Tau"
        case (933) ! Υ
            ascii_equiv = "Upsilon"
        case (934) ! Φ
            ascii_equiv = "Phi"
        case (935) ! Χ
            ascii_equiv = "Chi"
        case (936) ! Ψ
            ascii_equiv = "Psi"
        case (937) ! Ω
            ascii_equiv = "Omega"
        case default
            codepoint_to_uppercase_greek = .false.
        end select
    end function codepoint_to_uppercase_greek
    
    subroutine codepoint_to_default_placeholder(codepoint, ascii_equiv)
        !! Convert unknown codepoint to default placeholder format
        integer, intent(in) :: codepoint
        character(len=*), intent(out) :: ascii_equiv
        
        write(ascii_equiv, '("U+", Z4.4)') codepoint
    end subroutine codepoint_to_default_placeholder

    integer function utf8_char_length(char)
        !! Determine the number of bytes in a UTF-8 character
        character(len=1), intent(in) :: char
        integer :: byte_val
        
        byte_val = ichar(char)
        
        if (byte_val < 128) then
            ! ASCII (0xxxxxxx)
            utf8_char_length = 1
        else if (byte_val < 224) then
            ! 2-byte sequence (110xxxxx)
            utf8_char_length = 2
        else if (byte_val < 240) then
            ! 3-byte sequence (1110xxxx)
            utf8_char_length = 3
        else if (byte_val < 248) then
            ! 4-byte sequence (11110xxx)
            utf8_char_length = 4
        else
            ! Invalid UTF-8 start byte
            utf8_char_length = 0
        end if
    end function utf8_char_length

    integer function utf8_to_codepoint(text, start_pos)
        !! Convert UTF-8 sequence to Unicode codepoint
        character(len=*), intent(in) :: text
        integer, intent(in) :: start_pos
        integer :: char_len, byte_val, codepoint
        
        char_len = utf8_char_length(text(start_pos:start_pos))
        
        if (char_len == 0 .or. start_pos + char_len - 1 > len(text)) then
            ! Invalid sequence or out of bounds
            utf8_to_codepoint = 0
            return
        end if
        
        if (char_len == 1) then
            ! ASCII
            utf8_to_codepoint = ichar(text(start_pos:start_pos))
        else if (char_len == 2) then
            ! 2-byte sequence
            byte_val = ichar(text(start_pos:start_pos))
            codepoint = iand(byte_val, int(z'1F')) * 64
            byte_val = ichar(text(start_pos+1:start_pos+1))
            codepoint = codepoint + iand(byte_val, int(z'3F'))
            utf8_to_codepoint = codepoint
        else if (char_len == 3) then
            ! 3-byte sequence
            byte_val = ichar(text(start_pos:start_pos))
            codepoint = iand(byte_val, int(z'0F')) * 4096
            byte_val = ichar(text(start_pos+1:start_pos+1))
            codepoint = codepoint + iand(byte_val, int(z'3F')) * 64
            byte_val = ichar(text(start_pos+2:start_pos+2))
            codepoint = codepoint + iand(byte_val, int(z'3F'))
            utf8_to_codepoint = codepoint
        else if (char_len == 4) then
            ! 4-byte sequence
            byte_val = ichar(text(start_pos:start_pos))
            codepoint = iand(byte_val, int(z'07')) * 262144
            byte_val = ichar(text(start_pos+1:start_pos+1))
            codepoint = codepoint + iand(byte_val, int(z'3F')) * 4096
            byte_val = ichar(text(start_pos+2:start_pos+2))
            codepoint = codepoint + iand(byte_val, int(z'3F')) * 64
            byte_val = ichar(text(start_pos+3:start_pos+3))
            codepoint = codepoint + iand(byte_val, int(z'3F'))
            utf8_to_codepoint = codepoint
        else
            utf8_to_codepoint = 0
        end if
    end function utf8_to_codepoint

    logical function contains_unicode(text)
        !! Check if text contains Unicode characters
        character(len=*), intent(in) :: text
        integer :: i, char_len
        logical :: is_valid
        
        contains_unicode = .false.
        i = 1
        
        do while (i <= len_trim(text))
            char_len = utf8_char_length(text(i:i))
            if (char_len > 1) then
                ! Check if it's a valid UTF-8 sequence
                call check_utf8_sequence(text, i, is_valid, char_len)
                if (is_valid) then
                    contains_unicode = .true.
                    return
                end if
            end if
            i = i + char_len
        end do
    end function contains_unicode

    logical function is_unicode_char(char)
        !! Check if a character is Unicode (multi-byte)
        character(len=*), intent(in) :: char
        
        is_unicode_char = utf8_char_length(char(1:1)) > 1
    end function is_unicode_char

    subroutine check_utf8_sequence(text, start_pos, is_valid, seq_len)
        !! Check if UTF-8 sequence is valid and return its length
        character(len=*), intent(in) :: text
        integer, intent(in) :: start_pos
        logical, intent(out) :: is_valid
        integer, intent(out) :: seq_len
        integer :: i, byte_val
        
        is_valid = .false.
        seq_len = utf8_char_length(text(start_pos:start_pos))
        
        if (seq_len == 1) then
            is_valid = .true.
            return
        end if
        
        if (start_pos + seq_len - 1 > len(text)) then
            return  ! Not enough bytes
        end if
        
        ! Check continuation bytes
        do i = 1, seq_len - 1
            byte_val = ichar(text(start_pos + i:start_pos + i))
            if (iand(byte_val, int(z'C0')) /= int(z'80')) then
                return  ! Invalid continuation byte
            end if
        end do
        
        is_valid = .true.
    end subroutine check_utf8_sequence

    logical function is_greek_letter_codepoint(codepoint)
        !! Check if codepoint is a Greek letter
        integer, intent(in) :: codepoint
        
        ! Greek letters are in ranges U+0391-U+03A9 (uppercase) and U+03B1-U+03C9 (lowercase)
        is_greek_letter_codepoint = (codepoint >= 913 .and. codepoint <= 937) .or. &
                                   (codepoint >= 945 .and. codepoint <= 969)
    end function is_greek_letter_codepoint

end module fortplot_unicode