fortplot_zlib_huffman.f90 Source File


Source Code

module fortplot_zlib_huffman
    !! Huffman encoding and deflate compression functionality
    !! Split from fortplot_zlib.f90 for file size compliance (Issue #884)
    use, intrinsic :: iso_fortran_env, only: int8, int32
    use fortplot_logging, only: log_error
    implicit none
    
    private
    public :: compress_with_fixed_huffman, analyze_compressibility
    
    ! Deflate compression constants
    integer, parameter :: MAX_MATCH = 258
    integer, parameter :: MIN_MATCH = 3
    integer, parameter :: MAX_DISTANCE = 32768
    integer, parameter :: HASH_BITS = 15
    integer, parameter :: HASH_SIZE = 2**HASH_BITS
    integer, parameter :: WINDOW_SIZE = 32768
    
contains

    subroutine init_fixed_huffman_tables(literal_codes, literal_lengths, distance_codes, distance_lengths)
        !! Initialize fixed Huffman tables as per RFC 1951
        integer, intent(out) :: literal_codes(0:285)
        integer, intent(out) :: literal_lengths(0:285)
        integer, intent(out) :: distance_codes(0:29)
        integer, intent(out) :: distance_lengths(0:29)
        integer :: i, code
        
        ! CRITICAL FIX: RFC 1951 Fixed Huffman table with correct bit-reversed codes
        ! Fixed literal/length codes with proper bit ordering for deflate
        code = 0
        ! 0-143: 8 bits, reversed to MSB first for deflate format
        do i = 0, 143
            literal_codes(i) = bit_reverse(code + 48, 8)
            literal_lengths(i) = 8
            code = code + 1
        end do
        
        code = 0
        ! 144-255: 9 bits, reversed to MSB first for deflate format
        do i = 144, 255
            literal_codes(i) = bit_reverse(code + 400, 9)
            literal_lengths(i) = 9
            code = code + 1
        end do
        
        code = 0
        ! 256-279: 7 bits, reversed to MSB first for deflate format
        do i = 256, 279
            literal_codes(i) = bit_reverse(code, 7)
            literal_lengths(i) = 7
            code = code + 1
        end do
        
        code = 0
        ! 280-285: 8 bits, reversed to MSB first for deflate format
        do i = 280, 285  ! Note: only 280-285 are valid
            literal_codes(i) = bit_reverse(code + 192, 8)
            literal_lengths(i) = 8
            code = code + 1
        end do
        
        ! Fixed distance codes (all 5 bits, properly bit-reversed)  
        do i = 0, 29
            distance_codes(i) = bit_reverse(i, 5)
            distance_lengths(i) = 5
        end do
    end subroutine init_fixed_huffman_tables
    
    function calculate_hash(data, pos, data_len) result(hash_val)
        !! Calculate hash for LZ77 matching (3-byte hash)
        integer(int8), intent(in) :: data(*)
        integer, intent(in) :: pos, data_len
        integer :: hash_val
        
        if (pos + 2 <= data_len) then
            hash_val = iand(ior(ior(ishft(iand(int(data(pos)), 255), 16), &
                                   ishft(iand(int(data(pos+1)), 255), 8)), &
                               iand(int(data(pos+2)), 255)), HASH_SIZE - 1)
        else
            hash_val = 0
        end if
    end function calculate_hash
    
    subroutine update_hash_table(hash_table, hash_chain, hash_val, pos)
        !! Update hash table for LZ77 matching
        integer, intent(inout) :: hash_table(0:)
        integer, intent(inout) :: hash_chain(:)
        integer, intent(in) :: hash_val, pos
        
        integer :: chain_pos
        
        chain_pos = iand(pos, WINDOW_SIZE - 1) + 1
        hash_chain(chain_pos) = hash_table(hash_val)
        hash_table(hash_val) = pos
    end subroutine update_hash_table
    
    subroutine find_longest_match(data, pos, data_len, hash_table, hash_chain, match_len, match_dist)
        !! Find longest match using LZ77 algorithm
        integer(int8), intent(in) :: data(*)
        integer, intent(in) :: pos, data_len
        integer, intent(in) :: hash_table(0:)
        integer, intent(in) :: hash_chain(:)
        integer, intent(out) :: match_len, match_dist
        
        integer :: hash_val, chain_pos, candidate_pos, len, max_len
        integer :: i, chain_length
        
        match_len = 0
        match_dist = 0
        max_len = min(MAX_MATCH, data_len - pos + 1)
        
        if (max_len < MIN_MATCH) return
        
        hash_val = calculate_hash(data, pos, data_len)
        candidate_pos = hash_table(hash_val)
        chain_length = 0
        
        do while (candidate_pos > 0 .and. candidate_pos < pos .and. chain_length < 128)
            if (pos - candidate_pos > MAX_DISTANCE) exit
            
            ! Check match length
            len = 0
            do i = 0, max_len - 1
                if (data(pos + i) == data(candidate_pos + i)) then
                    len = len + 1
                else
                    exit
                end if
            end do
            
            if (len >= MIN_MATCH .and. len > match_len) then
                match_len = len
                match_dist = pos - candidate_pos
                if (len >= max_len) exit  ! Found maximum possible match
            end if
            
            ! Follow hash chain
            chain_pos = iand(candidate_pos, WINDOW_SIZE - 1) + 1
            candidate_pos = hash_chain(chain_pos)
            chain_length = chain_length + 1
        end do
    end subroutine find_longest_match
    
    subroutine write_bits(buffer, bit_pos, byte_pos, value, num_bits)
        !! Write bits to output buffer (LSB first) with bounds checking
        !! Fixed for proper deflate format - corrected bit ordering
        integer(int8), intent(inout) :: buffer(:)
        integer, intent(inout) :: bit_pos, byte_pos
        integer, intent(in) :: value, num_bits
        
        integer :: i, bit, buffer_size
        
        buffer_size = size(buffer)
        
        do i = 0, num_bits - 1
            ! Bounds check to prevent buffer overrun
            if (byte_pos > buffer_size) then
                call log_error('Compression error: buffer overrun in write_bits')
                return
            end if
            
            ! CRITICAL FIX: Extract bits from LSB to MSB (correct for deflate format)
            bit = iand(ishft(value, -i), 1)
            
            if (bit_pos == 0) then
                buffer(byte_pos) = 0
            end if
            
            ! Write bit into current position within byte
            buffer(byte_pos) = ior(buffer(byte_pos), int(ishft(bit, bit_pos), int8))
            bit_pos = bit_pos + 1
            
            if (bit_pos == 8) then
                bit_pos = 0
                byte_pos = byte_pos + 1
            end if
        end do
    end subroutine write_bits
    
    subroutine encode_literal(buffer, bit_pos, byte_pos, literal, codes, lengths)
        !! Encode a literal using Huffman coding
        integer(int8), intent(inout) :: buffer(:)
        integer, intent(inout) :: bit_pos, byte_pos
        integer(int8), intent(in) :: literal
        integer, intent(in) :: codes(0:), lengths(0:)
        
        integer :: lit_val
        
        lit_val = iand(int(literal), 255)  ! Ensure 0-255 range  
        call write_bits(buffer, bit_pos, byte_pos, codes(lit_val), lengths(lit_val))
    end subroutine encode_literal
    
    subroutine encode_length_distance(buffer, bit_pos, byte_pos, length, distance, &
                                      literal_codes, literal_lengths, distance_codes, distance_lengths)
        !! Encode length-distance pair using Huffman coding
        integer(int8), intent(inout) :: buffer(:)
        integer, intent(inout) :: bit_pos, byte_pos
        integer, intent(in) :: length, distance
        integer, intent(in) :: literal_codes(0:), literal_lengths(0:)
        integer, intent(in) :: distance_codes(0:), distance_lengths(0:)
        
        integer :: length_code, length_extra_bits, length_extra
        integer :: distance_code, distance_extra_bits, distance_extra
        
        ! Encode length
        call get_length_code(length, length_code, length_extra_bits, length_extra)
        call write_bits(buffer, bit_pos, byte_pos, &
                       literal_codes(length_code), literal_lengths(length_code))
        if (length_extra_bits > 0) then
            call write_bits(buffer, bit_pos, byte_pos, length_extra, length_extra_bits)
        end if
        
        ! Encode distance  
        call get_distance_code(distance, distance_code, distance_extra_bits, distance_extra)
        call write_bits(buffer, bit_pos, byte_pos, &
                       distance_codes(distance_code), distance_lengths(distance_code))
        if (distance_extra_bits > 0) then
            call write_bits(buffer, bit_pos, byte_pos, distance_extra, distance_extra_bits)
        end if
    end subroutine encode_length_distance
    
    subroutine get_length_code(length, code, extra_bits, extra)
        !! Get length code and extra bits according to RFC 1951
        integer, intent(in) :: length
        integer, intent(out) :: code, extra_bits, extra
        
        if (length < 3 .or. length > 258) then
            code = 256  ! Invalid, use end-of-block
            extra_bits = 0
            extra = 0
            return
        end if
        
        if (length <= 10) then
            code = 257 + (length - 3)
            extra_bits = 0
            extra = 0
        else if (length <= 18) then
            code = 265 + (length - 11) / 2
            extra_bits = 1
            extra = mod(length - 11, 2)
        else if (length <= 34) then
            code = 269 + (length - 19) / 4
            extra_bits = 2
            extra = mod(length - 19, 4)
        else if (length <= 66) then
            code = 273 + (length - 35) / 8
            extra_bits = 3
            extra = mod(length - 35, 8)
        else if (length <= 130) then
            code = 277 + (length - 67) / 16
            extra_bits = 4
            extra = mod(length - 67, 16)
        else if (length <= 257) then
            code = 281 + (length - 131) / 32
            extra_bits = 5
            extra = mod(length - 131, 32)
        else  ! length = 258
            code = 285
            extra_bits = 0
            extra = 0
        end if
    end subroutine get_length_code
    
    subroutine get_distance_code(distance, code, extra_bits, extra)
        !! Get distance code and extra bits according to RFC 1951
        integer, intent(in) :: distance
        integer, intent(out) :: code, extra_bits, extra
        
        if (distance < 1 .or. distance > 32768) then
            code = 0
            extra_bits = 0
            extra = 0
            return
        end if
        
        if (distance <= 4) then
            code = distance - 1
            extra_bits = 0
            extra = 0
        else if (distance <= 8) then
            code = 4 + (distance - 5) / 2
            extra_bits = 1
            extra = mod(distance - 5, 2)
        else if (distance <= 16) then
            code = 6 + (distance - 9) / 4
            extra_bits = 2
            extra = mod(distance - 9, 4)
        else if (distance <= 32) then
            code = 8 + (distance - 17) / 8
            extra_bits = 3
            extra = mod(distance - 17, 8)
        else if (distance <= 64) then
            code = 10 + (distance - 33) / 16
            extra_bits = 4
            extra = mod(distance - 33, 16)
        else if (distance <= 128) then
            code = 12 + (distance - 65) / 32
            extra_bits = 5
            extra = mod(distance - 65, 32)
        else if (distance <= 256) then
            code = 14 + (distance - 129) / 64
            extra_bits = 6
            extra = mod(distance - 129, 64)
        else if (distance <= 512) then
            code = 16 + (distance - 257) / 128
            extra_bits = 7
            extra = mod(distance - 257, 128)
        else if (distance <= 1024) then
            code = 18 + (distance - 513) / 256
            extra_bits = 8
            extra = mod(distance - 513, 256)
        else if (distance <= 2048) then
            code = 20 + (distance - 1025) / 512
            extra_bits = 9
            extra = mod(distance - 1025, 512)
        else if (distance <= 4096) then
            code = 22 + (distance - 2049) / 1024
            extra_bits = 10
            extra = mod(distance - 2049, 1024)
        else if (distance <= 8192) then
            code = 24 + (distance - 4097) / 2048
            extra_bits = 11
            extra = mod(distance - 4097, 2048)
        else if (distance <= 16384) then
            code = 26 + (distance - 8193) / 4096
            extra_bits = 12
            extra = mod(distance - 8193, 4096)
        else  ! distance <= 32768
            code = 28 + (distance - 16385) / 8192
            extra_bits = 13
            extra = mod(distance - 16385, 8192)
        end if
    end subroutine get_distance_code

    function bit_reverse(value, num_bits) result(reversed_value)
        !! Reverses the bits of a given value up to num_bits.
        integer, intent(in) :: value, num_bits
        integer :: reversed_value
        integer :: i
        
        reversed_value = 0
        do i = 0, num_bits - 1
            if (iand(ishft(value, -i), 1) == 1) then
                reversed_value = ior(reversed_value, ishft(1, num_bits - 1 - i))
            end if
        end do
    end function bit_reverse
    
    function analyze_compressibility(data, data_len) result(ratio)
        !! Analyze data to estimate compression ratio
        !! Returns value between 0 (incompressible) and 1 (highly compressible)
        integer(int8), intent(in) :: data(*)
        integer, intent(in) :: data_len
        real :: ratio
        
        integer :: i, run_length, total_runs
        integer(int8) :: current_byte, prev_byte
        integer :: byte_counts(0:255)
        real :: entropy
        
        if (data_len < 10) then
            ratio = 0.1
            return
        end if
        
        ! Count byte frequencies and runs
        byte_counts = 0
        total_runs = 0
        prev_byte = data(1)
        run_length = 1
        
        do i = 1, min(data_len, 8192)  ! Sample first 8KB
            current_byte = data(i)
            byte_counts(iand(int(current_byte), 255)) = &
                byte_counts(iand(int(current_byte), 255)) + 1
            
            if (i > 1) then
                if (current_byte == prev_byte) then
                    run_length = run_length + 1
                else
                    if (run_length >= 3) total_runs = total_runs + 1
                    run_length = 1
                    prev_byte = current_byte
                end if
            end if
        end do
        
        ! Calculate simple entropy estimate
        entropy = 0.0
        do i = 0, 255
            if (byte_counts(i) > 0) then
                entropy = entropy + 1.0
            end if
        end do
        entropy = entropy / 256.0  ! Normalize
        
        ! Estimate compression ratio based on entropy and runs
        ratio = (1.0 - entropy) * 0.5 + real(total_runs) / real(data_len/10) * 0.5
        ratio = max(0.0, min(1.0, ratio))
        
    end function analyze_compressibility
    
    subroutine compress_with_fixed_huffman(input_data, input_len, output_buffer, output_pos)
        !! Compress data using fixed Huffman codes with LZ77
        integer(int8), intent(in) :: input_data(*)
        integer, intent(in) :: input_len
        integer(int8), intent(inout) :: output_buffer(:)
        integer, intent(inout) :: output_pos
        
        integer :: literal_codes(0:285), literal_lengths(0:285)
        integer :: distance_codes(0:29), distance_lengths(0:29)
        integer :: hash_table(0:HASH_SIZE-1)
        integer :: hash_chain(WINDOW_SIZE)
        integer :: pos, match_len, match_dist
        integer :: bit_pos, byte_pos
        integer :: block_size, block_start
        
        ! Initialize fixed Huffman tables
        call init_fixed_huffman_tables(literal_codes, literal_lengths, &
                                      distance_codes, distance_lengths)
        
        ! Initialize hash structures
        hash_table = 0
        hash_chain = 0
        
        pos = 1
        byte_pos = output_pos
        bit_pos = 0
        
        ! Process in blocks to avoid overflow
        do while (pos <= input_len)
            block_start = pos
            block_size = min(input_len - pos + 1, 32768)
            
            ! Write block header: BFINAL, BTYPE=01 (fixed Huffman)
            ! CRITICAL FIX: RFC 1951 specifies BFINAL first, then BTYPE (2 bits)
            if (pos + block_size >= input_len) then
                ! BFINAL=1, BTYPE=01: bits are 1,0,1 = decimal 5 (LSB first)
                call write_bits(output_buffer, bit_pos, byte_pos, 5, 3)  ! BFINAL=1, BTYPE=01
            else
                ! BFINAL=0, BTYPE=01: bits are 0,1,0 = decimal 2 (LSB first)
                call write_bits(output_buffer, bit_pos, byte_pos, 2, 3)  ! BFINAL=0, BTYPE=01
            end if
            
            ! Compress block data
            do while (pos <= block_start + block_size - 1 .and. pos <= input_len)
                ! Try to find a match
                call find_longest_match(input_data, pos, input_len, &
                                       hash_table, hash_chain, match_len, match_dist)
                
                if (match_len >= MIN_MATCH) then
                    ! Encode length-distance pair
                    call encode_length_distance(output_buffer, bit_pos, byte_pos, &
                                              match_len, match_dist, &
                                              literal_codes, literal_lengths, &
                                              distance_codes, distance_lengths)
                    
                    ! Update hash table for all matched positions
                    do while (match_len > 0)
                        if (pos <= input_len - 2) then
                            call update_hash_table(hash_table, hash_chain, &
                                                 calculate_hash(input_data, pos, input_len), pos)
                        end if
                        pos = pos + 1
                        match_len = match_len - 1
                    end do
                else
                    ! Encode literal
                    call encode_literal(output_buffer, bit_pos, byte_pos, &
                                      input_data(pos), literal_codes, literal_lengths)
                    
                    ! Update hash table
                    if (pos <= input_len - 2) then
                        call update_hash_table(hash_table, hash_chain, &
                                             calculate_hash(input_data, pos, input_len), pos)
                    end if
                    pos = pos + 1
                end if
            end do
            
            ! Write end-of-block code (256) - CRITICAL for proper stream termination  
            call write_bits(output_buffer, bit_pos, byte_pos, &
                           literal_codes(256), literal_lengths(256))
        end do
        
        ! Ensure final bits are written
        if (bit_pos > 0) then
            byte_pos = byte_pos + 1
        end if
        
        output_pos = byte_pos
        
    end subroutine compress_with_fixed_huffman

end module fortplot_zlib_huffman