fortplot_latex_parser.f90 Source File


Source Code

module fortplot_latex_parser
    !! LaTeX command parser for Greek letters and mathematical symbols
    use fortplot_unicode, only: utf8_char_length
    implicit none
    
    private
    public :: find_latex_command, extract_latex_command, find_all_latex_commands
    public :: is_valid_greek_command, latex_to_unicode, process_latex_in_text
    
    ! Greek letter command mappings
    character(len=*), parameter :: LOWERCASE_GREEK(24) = [ &
        "alpha   ", "beta    ", "gamma   ", "delta   ", &
        "epsilon ", "zeta    ", "eta     ", "theta   ", &
        "iota    ", "kappa   ", "lambda  ", "mu      ", &
        "nu      ", "xi      ", "omicron ", "pi      ", &
        "rho     ", "sigma   ", "tau     ", "upsilon ", &
        "phi     ", "chi     ", "psi     ", "omega   " ]
    
    character(len=*), parameter :: UPPERCASE_GREEK(24) = [ &
        "Alpha   ", "Beta    ", "Gamma   ", "Delta   ", &
        "Epsilon ", "Zeta    ", "Eta     ", "Theta   ", &
        "Iota    ", "Kappa   ", "Lambda  ", "Mu      ", &
        "Nu      ", "Xi      ", "Omicron ", "Pi      ", &
        "Rho     ", "Sigma   ", "Tau     ", "Upsilon ", &
        "Phi     ", "Chi     ", "Psi     ", "Omega   " ]
    
    ! Unicode codepoints for lowercase Greek letters
    integer, parameter :: LOWERCASE_CODEPOINTS(24) = [ &
        945, 946, 947, 948, 949, 950, 951, 952, &
        953, 954, 955, 956, 957, 958, 959, 960, &
        961, 963, 964, 965, 966, 967, 968, 969 ]
    
    ! Unicode codepoints for uppercase Greek letters  
    integer, parameter :: UPPERCASE_CODEPOINTS(24) = [ &
        913, 914, 915, 916, 917, 918, 919, 920, &
        921, 922, 923, 924, 925, 926, 927, 928, &
        929, 931, 932, 933, 934, 935, 936, 937 ]

contains

    subroutine find_latex_command(text, start, start_pos, end_pos, found)
        character(len=*), intent(in) :: text
        integer, intent(in) :: start
        integer, intent(out) :: start_pos, end_pos
        logical, intent(out) :: found
        integer :: i, text_len, cmd_start, cmd_end
        
        found = .false.
        start_pos = 0
        end_pos = 0
        text_len = len_trim(text)
        
        ! Look for backslash
        do i = start, text_len
            if (text(i:i) == '\') then
                cmd_start = i
                
                ! Find end of command (next non-alphabetic character)
                cmd_end = cmd_start + 1  ! Start after backslash
                do while (cmd_end <= text_len)
                    if (.not. is_alpha(text(cmd_end:cmd_end))) then
                        cmd_end = cmd_end - 1
                        exit
                    end if
                    cmd_end = cmd_end + 1
                end do
                if (cmd_end > text_len) cmd_end = text_len
                
                ! Validate command
                if (cmd_end > cmd_start) then
                    if (is_valid_greek_command(text(cmd_start:cmd_end))) then
                        start_pos = cmd_start
                        end_pos = cmd_end
                        found = .true.
                        return
                    end if
                end if
            end if
        end do
    end subroutine find_latex_command
    
    subroutine extract_latex_command(text, start_pos, end_pos, command)
        character(len=*), intent(in) :: text
        integer, intent(in) :: start_pos, end_pos
        character(len=*), intent(out) :: command
        
        command = ""
        if (start_pos >= 1 .and. end_pos <= len(text) .and. start_pos <= end_pos) then
            if (text(start_pos:start_pos) == '\') then
                ! Extract command without the backslash
                command = text(start_pos+1:end_pos)
            end if
        end if
    end subroutine extract_latex_command
    
    subroutine find_all_latex_commands(text, commands, num_found)
        character(len=*), intent(in) :: text
        integer, intent(out) :: commands(:,:)
        integer, intent(out) :: num_found
        integer :: start, start_pos, end_pos
        logical :: found
        
        num_found = 0
        start = 1
        
        do while (start <= len_trim(text) .and. num_found < size(commands, 1))
            call find_latex_command(text, start, start_pos, end_pos, found)
            if (.not. found) exit
            
            num_found = num_found + 1
            commands(num_found, 1) = start_pos
            commands(num_found, 2) = end_pos
            
            start = end_pos + 1
        end do
    end subroutine find_all_latex_commands
    
    logical function is_valid_greek_command(command_text)
        character(len=*), intent(in) :: command_text
        character(len=20) :: command_name
        integer :: i
        
        is_valid_greek_command = .false.
        
        ! Extract command name (without backslash)
        if (len(command_text) < 2 .or. command_text(1:1) /= '\') return
        
        command_name = command_text(2:)
        
        ! Check lowercase Greek letters
        do i = 1, 24
            if (trim(command_name) == trim(LOWERCASE_GREEK(i))) then
                is_valid_greek_command = .true.
                return
            end if
        end do
        
        ! Check uppercase Greek letters
        do i = 1, 24
            if (trim(command_name) == trim(UPPERCASE_GREEK(i))) then
                is_valid_greek_command = .true.
                return
            end if
        end do
    end function is_valid_greek_command
    
    logical function is_alpha(ch)
        character(len=1), intent(in) :: ch
        integer :: ascii_val
        
        ascii_val = iachar(ch)
        is_alpha = (ascii_val >= iachar('A') .and. ascii_val <= iachar('Z')) .or. &
                   (ascii_val >= iachar('a') .and. ascii_val <= iachar('z'))
    end function is_alpha
    
    subroutine latex_to_unicode(latex_command, unicode_char, success)
        character(len=*), intent(in) :: latex_command
        character(len=*), intent(out) :: unicode_char
        logical, intent(out) :: success
        integer :: i, codepoint
        
        success = .false.
        unicode_char = ""
        
        if (len_trim(latex_command) == 0) return
        
        ! Check lowercase Greek letters
        do i = 1, 24
            if (trim(latex_command) == trim(LOWERCASE_GREEK(i))) then
                codepoint = LOWERCASE_CODEPOINTS(i)
                call codepoint_to_utf8(codepoint, unicode_char)
                success = .true.
                return
            end if
        end do
        
        ! Check uppercase Greek letters
        do i = 1, 24
            if (trim(latex_command) == trim(UPPERCASE_GREEK(i))) then
                codepoint = UPPERCASE_CODEPOINTS(i)
                call codepoint_to_utf8(codepoint, unicode_char)
                success = .true.
                return
            end if
        end do
    end subroutine latex_to_unicode
    
    subroutine codepoint_to_utf8(codepoint, utf8_char)
        integer, intent(in) :: codepoint
        character(len=*), intent(out) :: utf8_char
        
        utf8_char = ""
        
        if (codepoint <= 127) then
            ! 1-byte sequence
            utf8_char(1:1) = achar(codepoint)
        else if (codepoint <= 2047) then
            ! 2-byte sequence
            utf8_char(1:1) = achar(192 + (codepoint / 64))
            utf8_char(2:2) = achar(128 + mod(codepoint, 64))
        else if (codepoint <= 65535) then
            ! 3-byte sequence
            utf8_char(1:1) = achar(224 + (codepoint / 4096))
            utf8_char(2:2) = achar(128 + mod(codepoint / 64, 64))
            utf8_char(3:3) = achar(128 + mod(codepoint, 64))
        else
            ! 4-byte sequence (not needed for Greek letters)
            utf8_char(1:1) = achar(240 + (codepoint / 262144))
            utf8_char(2:2) = achar(128 + mod(codepoint / 4096, 64))
            utf8_char(3:3) = achar(128 + mod(codepoint / 64, 64))
            utf8_char(4:4) = achar(128 + mod(codepoint, 64))
        end if
    end subroutine codepoint_to_utf8
    
    subroutine process_latex_in_text(input_text, result_text, result_len)
        !! Convert LaTeX-style commands to Unicode while preserving math scopes
        !! UTF-8 aware: multi-byte characters are copied as intact sequences
        character(len=*), intent(in) :: input_text
        character(len=*), intent(out) :: result_text
        integer, intent(out) :: result_len
        integer :: i, pos, n
        integer :: cmd_end, char_len
        character(len=20) :: command, unicode_char
        logical :: success

        result_text = ""
        result_len = 0
        pos = 1
        n = len_trim(input_text)
        i = 1

        do while (i <= n)
            char_len = utf8_char_length(input_text(i:i))
            if (char_len <= 0) char_len = 1

            if (char_len == 1) then
                if (input_text(i:i) == '$') then
                    result_text(pos:pos) = '$'
                    pos = pos + 1
                    i = i + 1
                    cycle
                end if

                if (input_text(i:i) == '\') then
                    cmd_end = i + 1
                    do while (cmd_end <= n)
                        if (.not. is_alpha(input_text(cmd_end:cmd_end))) exit
                        cmd_end = cmd_end + 1
                    end do
                    if (cmd_end - 1 > i) then
                        call extract_latex_command(input_text, i, cmd_end - 1, command)
                        call latex_to_unicode(trim(command), unicode_char, success)
                        if (success) then
                            result_text(pos:pos + len_trim(unicode_char) - 1) = trim(unicode_char)
                            pos = pos + len_trim(unicode_char)
                            i = cmd_end
                            cycle
                        end if

                        ! Handle \sqrt{...} or \sqrt x -> Unicode radical + content
                        if (trim(command) == 'sqrt') then
                            call process_sqrt_block(input_text, cmd_end, n, pos, &
                                                    result_text, i)
                            cycle
                        end if
                    end if
                    ! If not a recognized command, fall through and copy the backslash
                end if

                result_text(pos:pos) = input_text(i:i)
                pos = pos + 1
                i = i + 1
            else
                ! Multi-byte UTF-8 character: copy entire sequence intact
                if (pos + char_len - 1 <= len(result_text)) then
                    result_text(pos:pos + char_len - 1) = &
                        input_text(i:i + char_len - 1)
                    pos = pos + char_len
                end if
                i = i + char_len
            end if
        end do

        result_len = pos - 1
    end subroutine process_latex_in_text

    recursive subroutine process_sqrt_block(input_text, after_cmd, n, out_pos, &
                                            result_text, next_i)
        !! Process \sqrt{...} or \sqrt x: output Unicode radical (U+221A) followed by
        !! the content with LaTeX commands resolved. For braced form, strips outer braces.
        !! For unbraced form, takes the next single character.
        character(len=*), intent(in) :: input_text
        integer, intent(in) :: after_cmd, n
        integer, intent(inout) :: out_pos
        character(len=*), intent(inout) :: result_text
        integer, intent(out) :: next_i

        character(len=4096) :: inner_raw, inner_processed
        integer :: brace_depth, j, inner_end, inner_len, processed_len

        ! Output Unicode radical symbol (U+221A = 8730, 3-byte UTF-8: E2 88 9A)
        result_text(out_pos:out_pos) = achar(226)
        result_text(out_pos+1:out_pos+1) = achar(136)
        result_text(out_pos+2:out_pos+2) = achar(154)
        out_pos = out_pos + 3

        if (after_cmd > n) then
            ! Nothing after \sqrt
            next_i = after_cmd
            return
        end if

        if (input_text(after_cmd:after_cmd) == '{') then
            ! Braced form: \sqrt{...}
            brace_depth = 1
            j = after_cmd + 1
            do while (j <= n .and. brace_depth > 0)
                if (input_text(j:j) == '{') then
                    brace_depth = brace_depth + 1
                else if (input_text(j:j) == '}') then
                    brace_depth = brace_depth - 1
                    if (brace_depth == 0) then
                        inner_end = j - 1
                        exit
                    end if
                end if
                j = j + 1
            end do

            if (brace_depth /= 0) then
                ! No matching brace; treat as unbraced
                inner_len = 1
                if (after_cmd <= n) then
                    inner_raw(1:1) = input_text(after_cmd:after_cmd)
                else
                    inner_len = 0
                end if
                call process_latex_in_text(inner_raw(1:inner_len), inner_processed, processed_len)
                if (processed_len > 0) then
                    result_text(out_pos:out_pos + processed_len - 1) = &
                        inner_processed(1:processed_len)
                    out_pos = out_pos + processed_len
                end if
                next_i = after_cmd + 1
                return
            end if

            inner_len = inner_end - (after_cmd + 1) + 1
            if (inner_len > 0 .and. inner_len <= len(inner_raw)) then
                inner_raw(1:inner_len) = input_text(after_cmd + 1:inner_end)
            else
                inner_len = 0
            end if

            call process_latex_in_text(inner_raw(1:inner_len), inner_processed, processed_len)
            if (processed_len > 0) then
                result_text(out_pos:out_pos + processed_len - 1) = &
                    inner_processed(1:processed_len)
                out_pos = out_pos + processed_len
            end if

            next_i = j + 1
        else
            ! Unbraced form: \sqrt x -> radical + next char
            inner_len = 1
            inner_raw(1:1) = input_text(after_cmd:after_cmd)
            call process_latex_in_text(inner_raw(1:inner_len), inner_processed, processed_len)
            if (processed_len > 0) then
                result_text(out_pos:out_pos + processed_len - 1) = &
                    inner_processed(1:processed_len)
                out_pos = out_pos + processed_len
            end if
            next_i = after_cmd + 1
        end if
    end subroutine process_sqrt_block

end module fortplot_latex_parser