python - bz2 not recompressing properly

Question

python - bz2 not recompressing properly

24 Views Asked by Miss_Orchid At 20 December 2023 at 19:08

The following code is able to read in a bzipped file:

# Byte position in the file at which the first length-prefixed block starts
offset = 24

# Read the whole file; the context manager closes the handle afterwards
with open(filey, 'rb') as fobj:
    buffer = fobj.read()

# Decompress every bz2 block found from `offset` onwards
buffer_unbzip,places_to_bzip = bzip_blocks_decompress_all(buffer,offset)

where the bzip_blocks_decompress_all function is defined as below:

def bzip_blocks_decompress_all(data, offset):
    """Decompress consecutive length-prefixed bz2 blocks found in *data*.

    Starting at *offset*, each block is a 4-byte big-endian signed size
    field (its absolute value is used as the byte count) followed by that
    many bytes of bz2-compressed data.  Blocks are read until *offset*
    reaches the end of *data*.

    Returns a 2-tuple:
      * a bytearray with the decompressed contents of all blocks, joined
        in order;
      * a list of ``[start, end]`` pairs giving where each compressed
        block sits inside *data* (the size field itself is excluded).
    """
    import bz2

    decompressed = bytearray()
    places_to_bzip = []
    total = len(data)
    while offset < total:
        size_field = int.from_bytes(data[offset:offset + 4], 'big', signed=True)
        # The compressed payload begins right after the 4-byte size field.
        block_start = offset + 4
        block_end = block_start + abs(size_field)
        decompressed.extend(bz2.decompress(data[block_start:block_end]))
        places_to_bzip.append([block_start, block_end])
        offset = block_end

    return decompressed, places_to_bzip

So I have the locations of where objects are bzipped (places_to_bzip). So my thinking is that we should be able to do something like the following:

# Take the first recorded [start, end] pair and try to reproduce the
# compressed bytes of that block from the decompressed data.
# a1: the first compressed block, sliced out of the raw file contents.
a1 = buffer[places_to_bzip[0][0]:places_to_bzip[0][1]]
# a2: the same index range applied to the decompressed data.
# NOTE(review): places_to_bzip holds offsets into `buffer` (the compressed
# data), so this slice is not guaranteed to line up with the bytes that a1
# decompresses to.
a2 = buffer_unbzip[places_to_bzip[0][0]:places_to_bzip[0][1]]

# Re-compress a2 and compare the sizes with the original compressed block.
# NOTE(review): bz2.compress() defaults to compresslevel=9; if the original
# blocks were written with a different level, byte-identical output should
# not be expected even from the right input - confirm against the file.
a3 = bz2.compress(a2)
print(len(a1))
print(len(a2))
print(len(a3))

104
104
70

Why is this not recompressing properly? Below is the output from a1 and a2 for testing:

print(a1)
b'BZh51AY&SY\xe6\xb1\xacS\x00\x00\x02_\xab\xfe(@\x00\x10\x00@\x04\x00@\x00@\x800\x02\x00\x00\x01\x00@\x08\x00\x00\x18 \x00T4\x8d\x004\x01\xa0\x91(\x01\x90\xd3\xd2\x14\xac\xd6v\x85\xf0\x0fD\x85\xc3A}\xe09\xbc\xe1\x8b\x04Y\xbfb$"\xcc\x13\xc0B\r\x99\xf1Qa%S\x00|]\xc9\x14\xe1BC\x9a\xc6\xb1L'

print(a2)
bytearray(b'\x00\x0b\x00\x02\x05z\x00\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00X\x00\x00\x00\x00\x002\x04@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01h\x00\x00\x00\x00\x002\x04@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')

Original Q&A

There is 1 best solution below

**Mark Tolonen** · Answer 1 · 2023-12-20T20:41:09.213000

Per my comments, buffer_unbzip contains the decompressed data only, and offsets in places_to_bzip are the start/end offsets of slices in the original compressed data. The offset of the unbzipped frames is not known.

Below I've reverse-engineered the input file format and generated a sample file, then used the OP's code to extract the data. The code is modified to also return the start/end of each unbzipped frame, and then walks the offsets, re-compressing each frame and comparing the result with the original compressed data:

import bz2
import struct

### Reproducible input file example ###
def write_frame(f, data):
    """Compress *data* with bz2 and append it to *f* as one frame.

    A frame is the length of the compressed payload, packed as a 4-byte
    big-endian unsigned integer, immediately followed by the payload.
    Both parts are emitted with a single ``f.write`` call.
    """
    payload = bz2.compress(data)
    length_prefix = struct.pack('>L', len(payload))
    f.write(length_prefix + payload)

# Build the sample file: 24 filler bytes (standing in for whatever precedes
# the first frame in the real data - presumably a header), then four
# compressed frames of differing lengths.
with open('file.bin', 'wb') as f:
    f.write(b'A' * 24)
    for letter, count in ((b'B', 50), (b'C', 25), (b'D', 30), (b'E', 12)):
        write_frame(f, letter * count)
### END ###

# Load the whole sample file in one read; `with` closes the handle on exit.
with open('file.bin', 'rb') as fobj:
    buffer = fobj.read()

# Position in `buffer` at which frame parsing starts.
offset = 24

def bzip_blocks_decompress_all(data, offset):
    """Decompress consecutive length-prefixed bz2 blocks found in *data*.

    Starting at *offset*, each block is a 4-byte big-endian unsigned length
    followed by that many bytes of bz2-compressed data.  Blocks are read
    until *offset* reaches the end of *data*.

    Returns a 3-tuple:
      * a bytearray with the decompressed contents of all blocks, in order;
      * ``places_to_bzip``: ``[start, end]`` of each frame inside that
        decompressed bytearray;
      * ``places_to_unbzip``: ``[start, end]`` of each compressed block
        inside *data* (the length prefix itself is excluded).
    """
    import bz2

    frames = bytearray()
    places_to_bzip = []
    places_to_unbzip = []
    while offset < len(data):
        # The length is read as unsigned ('>L'): a byte count is never
        # negative.  unpack_from returns a 1-tuple, unpacked here directly.
        (packed_len,) = struct.unpack_from('>L', data, offset)
        packed_start = offset + 4
        packed_end = packed_start + packed_len

        # Remember where this frame lands in the decompressed output.
        plain_start = len(frames)
        frames.extend(bz2.decompress(data[packed_start:packed_end]))
        places_to_bzip.append([plain_start, len(frames)])
        places_to_unbzip.append([packed_start, packed_end])

        offset = packed_end

    return frames, places_to_bzip, places_to_unbzip

# Decompress every frame, recording where each one sits in both buffers.
buffer_unbzip, places_to_bzip, places_to_unbzip = bzip_blocks_decompress_all(buffer, offset)
print(f'{buffer=}')
print(f'{buffer_unbzip=}')

# Round-trip check: re-compress each decompressed frame and compare it with
# the compressed bytes it originally came from.
for plain_span, packed_span in zip(places_to_bzip, places_to_unbzip):
    a1 = buffer[slice(*packed_span)]        # original compressed block
    a2 = buffer_unbzip[slice(*plain_span)]  # its decompressed contents

    # Re-compress a2; it should reproduce a1 byte for byte.
    a3 = bz2.compress(a2)
    print(a1 == a3, a2)

Output:

buffer=b"AAAAAAAAAAAAAAAAAAAAAAAA\x00\x00\x00'BZh91AY&SY?\xbf\xc2\x8b\x00\x00\x02\x14\x00\x00\x01\x10\x00 \x00!\x00\x82\x0b\x17rE8P\x90?\xbf\xc2\x8b\x00\x00\x00'BZh91AY&SY\x0b\xc7\x94'\x00\x00\x02$\x00\x02\x00\x08\x00 \x00!\x00\x82\x0b\x17rE8P\x90\x0b\xc7\x94'\x00\x00\x00'BZh91AY&SYX\xf3\xe3\x91\x00\x00\x02$\x00\x00\x10\x04\x00 \x00!\x00\x82\x0b\x17rE8P\x90X\xf3\xe3\x91\x00\x00\x00'BZh91AY&SY\xb6\xa1w{\x00\x00\x02D\x00\x00@\x02\x00 \x00!\x00\x82\x0b\x17rE8P\x90\xb6\xa1w{"
buffer_unbzip=bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDEEEEEEEEEEEE')
True bytearray(b'BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB')
True bytearray(b'CCCCCCCCCCCCCCCCCCCCCCCCC')
True bytearray(b'DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD')
True bytearray(b'EEEEEEEEEEEE')

python - bz2 not recompressing properly

There is 1 best solution below

Related Questions in PYTHON

Related Questions in HEX

Related Questions in BYTE

Related Questions in BZ2

Trending Questions

Popular #Hashtags

Popular Questions