test_pdf_utils.f90 Source File


Source Code

module test_pdf_utils
    use, intrinsic :: iso_fortran_env, only: dp => real64, int8, int64
    use fortplot_zlib_core, only: zlib_decompress
    implicit none
contains
    subroutine extract_pdf_stream_text(filename, stream_text, status)
        character(len=*), intent(in) :: filename
        character(len=:), allocatable, intent(out) :: stream_text
        integer, intent(out) :: status

        character(len=1), allocatable :: data(:)
        integer(int64) :: fsize
        integer :: content_begin, content_len
        integer :: ios, pos
        integer :: stream_start, stream_end
        integer :: unit
        integer :: i

        status = 0
        allocate (character(len=0) :: stream_text)

        open (newunit=unit, file=filename, access='stream', form='unformatted', &
              status='old', iostat=ios)
        if (ios /= 0) then
            status = -1
            return
        end if

        inquire (unit=unit, size=fsize)
        if (fsize <= 0_int64) then
            close (unit)
            return
        end if

        allocate (character(len=1) :: data(int(fsize)))
        read (unit, iostat=ios) data
        close (unit)
        if (ios /= 0) then
            status = -2
            return
        end if

        pos = 1
        do
            stream_start = find_subsequence(data, fsize, 'stream', pos)
            if (stream_start < 0) exit
            stream_end = find_subsequence(data, fsize, 'endstream', &
                                          stream_start + len('stream'))
            if (stream_end < 0) exit

            content_begin = stream_start + len('stream')
            if (content_begin <= int(fsize)) then
                if (data(content_begin) == achar(13)) then
                    content_begin = content_begin + 1
                    if (content_begin <= int(fsize)) then
                        if (data(content_begin) == achar(10)) then
                            content_begin = content_begin + 1
                        end if
                    end if
                else if (data(content_begin) == achar(10)) then
                    content_begin = content_begin + 1
                end if
            end if

            content_len = stream_end - content_begin
            if (content_len > 0) then
                block
                    integer(int8), allocatable :: compressed(:), decompressed_raw(:)
                    character(len=:), allocatable :: chunk_text
                    integer :: j
                    integer :: status_decomp

                    allocate (compressed(content_len))
                    do j = 1, content_len
                        compressed(j) = int(iachar(data(content_begin + j - 1)), int8)
                    end do
                    decompressed_raw = zlib_decompress(compressed, content_len, &
                                                       status_decomp, .false.)

                    if (status_decomp == 0 .and. size(decompressed_raw) > 0) then
                        allocate (character(len=size(decompressed_raw)) :: chunk_text)
                        call bytes_to_string(decompressed_raw, chunk_text)
                    else
                        allocate (character(len=content_len) :: chunk_text)
                        do j = 1, content_len
                            chunk_text(j:j) = data(content_begin + j - 1)
                        end do
                    end if

                    call append_string(stream_text, chunk_text)
                end block
            end if

            pos = stream_end + len('endstream')
        end do

    end subroutine extract_pdf_stream_text

    subroutine append_string(target, chunk)
        character(len=:), allocatable, intent(inout) :: target
        character(len=*), intent(in) :: chunk
        character(len=:), allocatable :: combined
        integer :: new_len
        integer :: old_len

        old_len = len(target)
        new_len = old_len + len(chunk)

        allocate (character(len=new_len) :: combined)
        if (old_len > 0) combined(1:old_len) = target
        if (len(chunk) > 0) combined(old_len + 1:new_len) = chunk
        call move_alloc(combined, target)
    end subroutine append_string

    subroutine bytes_to_string(bytes, text)
        integer(int8), intent(in) :: bytes(:)
        character(len=*), intent(out) :: text
        integer :: i

        do i = 1, size(bytes)
            text(i:i) = achar(iand(int(bytes(i)), 255))
        end do
    end subroutine bytes_to_string

    integer function find_subsequence(arr, n, pat, start_idx) result(pos)
        integer(int64), intent(in) :: n
        character(len=1), intent(in) :: arr(n)
        character(len=*), intent(in) :: pat
        integer, intent(in) :: start_idx
        integer :: i, j, pat_len

        pat_len = len_trim(pat)
        pos = -1
        if (pat_len <= 0) return
        do i = max(1, start_idx), int(n) - pat_len + 1
            do j = 1, pat_len
                if (arr(i + j - 1) /= pat(j:j)) exit
                if (j == pat_len) then
                    pos = i
                    return
                end if
            end do
        end do
    end function find_subsequence

    logical function pdf_stream_has_stroke_rgb(stream_text, rgb, tol) result(found)
        character(len=*), intent(in) :: stream_text
        real(dp), intent(in) :: rgb(3)
        real(dp), intent(in) :: tol

        character(len=128) :: token
        integer :: token_len
        logical :: has_token
        integer :: pos

        character(len=64) :: t1, t2, t3
        integer :: t1_len, t2_len, t3_len
        real(dp) :: r_val, g_val, b_val
        logical :: ok_r, ok_g, ok_b

        found = .false.
        pos = 1

        t1 = ''
        t2 = ''
        t3 = ''
        t1_len = 0
        t2_len = 0
        t3_len = 0

        do
            call pdf_next_token(stream_text, pos, token, token_len, has_token)
            if (.not. has_token) exit

            if (token_len == 2) then
                if (token(1:2) == 'RG') then
                    ok_r = parse_real_token(t3, t3_len, r_val)
                    ok_g = parse_real_token(t2, t2_len, g_val)
                    ok_b = parse_real_token(t1, t1_len, b_val)

                    if (ok_r .and. ok_g .and. ok_b) then
                        if (abs(r_val - rgb(1)) <= tol .and. &
                            abs(g_val - rgb(2)) <= tol .and. &
                            abs(b_val - rgb(3)) <= tol) then
                            found = .true.
                            return
                        end if
                    end if
                end if
            end if

            call shift_recent_tokens(token, token_len, t1, t1_len, t2, t2_len, &
                                     t3, t3_len)
        end do
    end function pdf_stream_has_stroke_rgb

    integer function pdf_stream_count_operator(stream_text, op) result(count_op)
        character(len=*), intent(in) :: stream_text
        character(len=*), intent(in) :: op

        character(len=128) :: token
        integer :: token_len
        logical :: has_token
        integer :: pos
        integer :: op_len

        count_op = 0
        pos = 1
        op_len = len_trim(op)
        if (op_len <= 0) return

        do
            call pdf_next_token(stream_text, pos, token, token_len, has_token)
            if (.not. has_token) exit

            if (token_len == op_len) then
                if (token_len <= len(token)) then
                    if (token(1:token_len) == op(1:op_len)) count_op = count_op + 1
                end if
            end if
        end do
    end function pdf_stream_count_operator

    subroutine pdf_next_token(text, pos, token, token_len, has_token)
        character(len=*), intent(in) :: text
        integer, intent(inout) :: pos
        character(len=*), intent(out) :: token
        integer, intent(out) :: token_len
        logical, intent(out) :: has_token

        integer :: n
        integer :: start_pos
        integer :: end_pos
        integer :: i
        integer :: copy_len
        character(len=1) :: ch

        token = ''
        token_len = 0
        has_token = .false.

        n = len(text)
        if (pos < 1) pos = 1

        do
            do while (pos <= n)
                if (.not. is_pdf_whitespace(text(pos:pos))) exit
                pos = pos + 1
            end do
            if (pos > n) return

            if (text(pos:pos) /= '%') exit
            call pdf_skip_comment(text, pos)
        end do

        start_pos = pos
        ch = text(pos:pos)

        select case (ch)
        case ('(')
            call pdf_scan_literal_string(text, pos, end_pos)
        case ('<')
            if (pos < n) then
                if (text(pos + 1:pos + 1) == '<') then
                    end_pos = pos + 1
                    pos = pos + 2
                else
                    call pdf_scan_hex_string(text, pos, end_pos)
                    pos = end_pos + 1
                end if
            else
                call pdf_scan_hex_string(text, pos, end_pos)
                pos = end_pos + 1
            end if
        case ('>')
            if (pos < n) then
                if (text(pos + 1:pos + 1) == '>') then
                    end_pos = pos + 1
                    pos = pos + 2
                else
                    end_pos = pos
                    pos = pos + 1
                end if
            else
                end_pos = pos
                pos = pos + 1
            end if
        case ('[', ']', '{', '}', ')')
            end_pos = pos
            pos = pos + 1
        case ('/')
            pos = pos + 1
            do while (pos <= n)
                if (is_pdf_whitespace(text(pos:pos))) exit
                if (is_pdf_delimiter(text(pos:pos))) exit
                pos = pos + 1
            end do
            end_pos = pos - 1
        case default
            do while (pos <= n)
                if (is_pdf_whitespace(text(pos:pos))) exit
                if (is_pdf_delimiter(text(pos:pos))) exit
                if (text(pos:pos) == '%') exit
                pos = pos + 1
            end do
            end_pos = pos - 1
        end select

        token_len = end_pos - start_pos + 1
        if (token_len <= 0) return

        copy_len = min(token_len, len(token))
        do i = 1, copy_len
            token(i:i) = text(start_pos + i - 1:start_pos + i - 1)
        end do
        has_token = .true.
    end subroutine pdf_next_token

    logical function is_pdf_whitespace(ch) result(is_ws)
        character(len=1), intent(in) :: ch
        integer :: code

        code = iachar(ch)
        is_ws = (code == 0) .or. (code == 9) .or. (code == 10) .or. &
                (code == 12) .or. (code == 13) .or. (code == 32)
    end function is_pdf_whitespace

    logical function is_pdf_delimiter(ch) result(is_delim)
        character(len=1), intent(in) :: ch

        is_delim = (ch == '(') .or. (ch == ')') .or. (ch == '<') .or. &
                   (ch == '>') .or. (ch == '[') .or. (ch == ']') .or. &
                   (ch == '{') .or. (ch == '}') .or. (ch == '/') .or. &
                   (ch == '%')
    end function is_pdf_delimiter

    subroutine pdf_skip_comment(text, pos)
        character(len=*), intent(in) :: text
        integer, intent(inout) :: pos

        integer :: n

        n = len(text)
        if (pos < 1) pos = 1
        if (pos > n) return
        if (text(pos:pos) /= '%') return

        do while (pos <= n)
            if (text(pos:pos) == achar(10) .or. text(pos:pos) == achar(13)) exit
            pos = pos + 1
        end do
        do while (pos <= n)
            if (text(pos:pos) /= achar(10) .and. text(pos:pos) /= achar(13)) exit
            pos = pos + 1
        end do
    end subroutine pdf_skip_comment

    subroutine pdf_scan_literal_string(text, pos, end_pos)
        character(len=*), intent(in) :: text
        integer, intent(inout) :: pos
        integer, intent(out) :: end_pos

        integer :: n
        integer :: depth
        integer :: i
        integer :: j
        integer :: octal_digits

        n = len(text)
        if (pos < 1) pos = 1
        if (pos > n) then
            end_pos = n
            return
        end if
        if (text(pos:pos) /= '(') then
            end_pos = pos
            return
        end if

        depth = 1
        i = pos + 1
        do while (i <= n)
            if (text(i:i) == achar(92)) then
                if (i == n) then
                    i = i + 1
                    exit
                end if

                if (text(i + 1:i + 1) == achar(13)) then
                    i = i + 2
                    if (i <= n) then
                        if (text(i:i) == achar(10)) i = i + 1
                    end if
                    cycle
                end if

                if (text(i + 1:i + 1) == achar(10)) then
                    i = i + 2
                    cycle
                end if

                if (is_octal_digit(text(i + 1:i + 1))) then
                    octal_digits = 1
                    j = i + 2
                    do while (octal_digits < 3 .and. j <= n)
                        if (.not. is_octal_digit(text(j:j))) exit
                        octal_digits = octal_digits + 1
                        j = j + 1
                    end do
                    i = j
                    cycle
                end if

                i = i + 2
                cycle
            end if

            if (text(i:i) == '(') then
                depth = depth + 1
            else if (text(i:i) == ')') then
                depth = depth - 1
                if (depth == 0) exit
            end if

            i = i + 1
        end do

        if (i > n) then
            end_pos = n
            pos = n + 1
        else
            end_pos = i
            pos = end_pos + 1
        end if
    end subroutine pdf_scan_literal_string

    subroutine pdf_scan_hex_string(text, pos, end_pos)
        character(len=*), intent(in) :: text
        integer, intent(inout) :: pos
        integer, intent(out) :: end_pos

        integer :: n
        integer :: i

        n = len(text)
        if (pos < 1) pos = 1
        if (pos > n) then
            end_pos = n
            return
        end if
        if (text(pos:pos) /= '<') then
            end_pos = pos
            return
        end if

        i = pos + 1
        do while (i <= n)
            if (text(i:i) == '>') exit
            i = i + 1
        end do

        if (i > n) then
            end_pos = n
        else
            end_pos = i
        end if
    end subroutine pdf_scan_hex_string

    logical function is_octal_digit(ch) result(is_digit)
        character(len=1), intent(in) :: ch
        integer :: code

        code = iachar(ch)
        is_digit = (code >= iachar('0') .and. code <= iachar('7'))
    end function is_octal_digit

    subroutine shift_recent_tokens(token, token_len, t1, t1_len, t2, t2_len, &
                                   t3, t3_len)
        character(len=*), intent(in) :: token
        integer, intent(in) :: token_len
        character(len=*), intent(inout) :: t1, t2, t3
        integer, intent(inout) :: t1_len, t2_len, t3_len

        integer :: copy_len

        t3 = t2
        t3_len = t2_len
        t2 = t1
        t2_len = t1_len

        t1 = ''
        t1_len = min(token_len, len(t1))
        copy_len = t1_len
        if (copy_len > 0) t1(1:copy_len) = token(1:copy_len)
    end subroutine shift_recent_tokens

    logical function parse_real_token(token, token_len, value) result(ok)
        character(len=*), intent(in) :: token
        integer, intent(in) :: token_len
        real(dp), intent(out) :: value

        integer :: ios

        value = 0.0_dp
        ok = .false.
        if (token_len <= 0) return
        if (token_len > len(token)) return

        read (token(1:token_len), *, iostat=ios) value
        ok = (ios == 0)
    end function parse_real_token

end module test_pdf_utils