Merge pull request #4688 from tdub0/compression

Update _compression, gzip, and test_gzip for CPython v3.11.2
2023-03-16 00:52:21 +09:00
parent 8ff947e83a 7e0863ef81
commit 134d9f1e98
5 changed files with 258 additions and 131 deletions
--- a/Lib/_compression.py
+++ b/Lib/_compression.py
@@ -1,7 +1,7 @@
 """Internal classes used by the gzip, lzma and bz2 modules"""

 import io
-
+import sys

 BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE  # Compressed data read chunk size

@@ -110,6 +110,16 @@ class DecompressReader(io.RawIOBase):
        self._pos += len(data)
        return data

+    def readall(self):
+        chunks = []
+        # sys.maxsize means the max length of output buffer is unlimited,
+        # so that the whole input buffer can be decompressed within one
+        # .decompress() call.
+        while data := self.read(sys.maxsize):
+            chunks.append(data)
+
+        return b"".join(chunks)
+
    # Rewind the file to the beginning of the data stream.
    def _rewind(self):
        self._fp.seek(0)
--- a/Lib/gzip.py
+++ b/Lib/gzip.py
@@ -399,6 +399,59 @@ class GzipFile(_compression.BaseStream):
        return self._buffer.readline(size)


+def _read_exact(fp, n):
+    '''Read exactly *n* bytes from `fp`
+
+    This method is required because fp may be unbuffered,
+    i.e. return short reads.
+    '''
+    data = fp.read(n)
+    while len(data) < n:
+        b = fp.read(n - len(data))
+        if not b:
+            raise EOFError("Compressed file ended before the "
+                           "end-of-stream marker was reached")
+        data += b
+    return data
+
+
+def _read_gzip_header(fp):
+    '''Read a gzip header from `fp` and progress to the end of the header.
+
+    Returns last mtime if header was present or None otherwise.
+    '''
+    magic = fp.read(2)
+    if magic == b'':
+        return None
+
+    if magic != b'\037\213':
+        raise BadGzipFile('Not a gzipped file (%r)' % magic)
+
+    (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
+    if method != 8:
+        raise BadGzipFile('Unknown compression method')
+
+    if flag & FEXTRA:
+        # Read & discard the extra field, if present
+        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
+        _read_exact(fp, extra_len)
+    if flag & FNAME:
+        # Read and discard a null-terminated string containing the filename
+        while True:
+            s = fp.read(1)
+            if not s or s==b'\000':
+                break
+    if flag & FCOMMENT:
+        # Read and discard a null-terminated string containing a comment
+        while True:
+            s = fp.read(1)
+            if not s or s==b'\000':
+                break
+    if flag & FHCRC:
+        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
+    return last_mtime
+
+
 class _GzipReader(_compression.DecompressReader):
    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
@@ -411,53 +464,11 @@ class _GzipReader(_compression.DecompressReader):
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

-    def _read_exact(self, n):
-        '''Read exactly *n* bytes from `self._fp`
-
-        This method is required because self._fp may be unbuffered,
-        i.e. return short reads.
-        '''
-
-        data = self._fp.read(n)
-        while len(data) < n:
-            b = self._fp.read(n - len(data))
-            if not b:
-                raise EOFError("Compressed file ended before the "
-                               "end-of-stream marker was reached")
-            data += b
-        return data
-
    def _read_gzip_header(self):
-        magic = self._fp.read(2)
-        if magic == b'':
+        last_mtime = _read_gzip_header(self._fp)
+        if last_mtime is None:
            return False
-
-        if magic != b'\037\213':
-            raise BadGzipFile('Not a gzipped file (%r)' % magic)
-
-        (method, flag,
-         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
-        if method != 8:
-            raise BadGzipFile('Unknown compression method')
-
-        if flag & FEXTRA:
-            # Read & discard the extra field, if present
-            extra_len, = struct.unpack("<H", self._read_exact(2))
-            self._read_exact(extra_len)
-        if flag & FNAME:
-            # Read and discard a null-terminated string containing the filename
-            while True:
-                s = self._fp.read(1)
-                if not s or s==b'\000':
-                    break
-        if flag & FCOMMENT:
-            # Read and discard a null-terminated string containing a comment
-            while True:
-                s = self._fp.read(1)
-                if not s or s==b'\000':
-                    break
-        if flag & FHCRC:
-            self._read_exact(2)     # Read & discard the 16-bit header CRC
+        self._last_mtime = last_mtime
        return True

    def read(self, size=-1):
@@ -520,7 +531,7 @@ class _GzipReader(_compression.DecompressReader):
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
-        crc32, isize = struct.unpack("<II", self._read_exact(8))
+        crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
@@ -540,21 +551,69 @@ class _GzipReader(_compression.DecompressReader):
        super()._rewind()
        self._new_member = True

+
+def _create_simple_gzip_header(compresslevel: int,
+                               mtime = None) -> bytes:
+    """
+    Create a simple gzip header with no extra fields.
+    :param compresslevel: Compresslevel used to determine the xfl bytes.
+    :param mtime: The mtime (must support conversion to a 32-bit integer).
+    :return: A bytes object representing the gzip header.
+    """
+    if mtime is None:
+        mtime = time.time()
+    if compresslevel == _COMPRESS_LEVEL_BEST:
+        xfl = 2
+    elif compresslevel == _COMPRESS_LEVEL_FAST:
+        xfl = 4
+    else:
+        xfl = 0
+    # Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
+    # fields added to header), mtime, xfl and os (255 for unknown OS).
+    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)
+
+
 def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.
-    Optional argument is the compression level, in range of 0-9.
+
+    compresslevel sets the compression level in range of 0-9.
+    mtime can be used to set the modification time. The modification time is
+    set to the current time by default.
    """
-    buf = io.BytesIO()
-    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
-        f.write(data)
-    return buf.getvalue()
+    if mtime == 0:
+        # Use zlib as it creates the header with 0 mtime by default.
+        # This is faster and with less overhead.
+        return zlib.compress(data, level=compresslevel, wbits=31)
+    header = _create_simple_gzip_header(compresslevel, mtime)
+    trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
+    # wbits=-15 produces a raw deflate stream (no header or trailer).
+    return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
+            trailer)
+

 def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
-    with GzipFile(fileobj=io.BytesIO(data)) as f:
-        return f.read()
+    decompressed_members = []
+    while True:
+        fp = io.BytesIO(data)
+        if _read_gzip_header(fp) is None:
+            return b"".join(decompressed_members)
+        # Use a zlib raw deflate decompressor
+        do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
+        # Read all the data except the header
+        decompressed = do.decompress(data[fp.tell():])
+        if not do.eof or len(do.unused_data) < 8:
+            raise EOFError("Compressed file ended before the end-of-stream "
+                           "marker was reached")
+        crc, length = struct.unpack("<II", do.unused_data[:8])
+        if crc != zlib.crc32(decompressed):
+            raise BadGzipFile("CRC check failed")
+        if length != (len(decompressed) & 0xffffffff):
+            raise BadGzipFile("Incorrect length of data produced")
+        decompressed_members.append(decompressed)
+        data = do.unused_data[8:].lstrip(b"\x00")


 def main():
--- a/Lib/test/test_gzip.py
+++ b/Lib/test/test_gzip.py
@@ -12,7 +12,7 @@ import unittest
 from subprocess import PIPE, Popen
 from test.support import import_helper
 from test.support import os_helper
-from test.support import _4G, bigmemtest
+from test.support import _4G, bigmemtest, requires_subprocess
 from test.support.script_helper import assert_python_ok, assert_python_failure

 gzip = import_helper.import_module('gzip')
@@ -552,6 +552,15 @@ class TestGzip(BaseTest):
                        f.read(1) # to set mtime attribute
                        self.assertEqual(f.mtime, mtime)

+    def test_compress_correct_level(self):
+        # gzip.compress calls with mtime == 0 take a different code path.
+        for mtime in (0, 42):
+            with self.subTest(mtime=mtime):
+                nocompress = gzip.compress(data1, compresslevel=0, mtime=mtime)
+                yescompress = gzip.compress(data1, compresslevel=1, mtime=mtime)
+                self.assertIn(data1, nocompress)
+                self.assertNotIn(data1, yescompress)
+
    def test_decompress(self):
        for data in (data1, data2):
            buf = io.BytesIO()
@@ -562,6 +571,14 @@ class TestGzip(BaseTest):
            datac = gzip.compress(data)
            self.assertEqual(gzip.decompress(datac), data)

+    def test_decompress_truncated_trailer(self):
+        compressed_data = gzip.compress(data1)
+        self.assertRaises(EOFError, gzip.decompress, compressed_data[:-4])
+
+    def test_decompress_missing_trailer(self):
+        compressed_data = gzip.compress(data1)
+        self.assertRaises(EOFError, gzip.decompress, compressed_data[:-8])
+
    def test_read_truncated(self):
        data = data1*50
        # Drop the CRC (4 bytes) and file size (4 bytes).
@@ -756,6 +773,7 @@ def create_and_remove_directory(directory):
 class TestCommandLine(unittest.TestCase):
    data = b'This is a simple test with gzip'

+    @requires_subprocess()
    def test_decompress_stdin_stdout(self):
        with io.BytesIO() as bytes_io:
            with gzip.GzipFile(fileobj=bytes_io, mode='wb') as gzip_file:
@@ -791,6 +809,7 @@ class TestCommandLine(unittest.TestCase):
        self.assertEqual(rc, 1)
        self.assertEqual(out, b'')

+    @requires_subprocess()
    @create_and_remove_directory(TEMPDIR)
    def test_compress_stdin_outfile(self):
        args = sys.executable, '-m', 'gzip'
--- a/extra_tests/snippets/stdlib_zlib.py
+++ b/extra_tests/snippets/stdlib_zlib.py
@@ -48,8 +48,9 @@ compressed_lorem_list = [
    b"x\xda\xf3\xc9/J\xcdU\xc8,(.\xcdUH\xc9\xcf\xc9/R(\xce,QH\xccM-\x01\x00\x83\xd5\t\xc5",
 ]

-for level, text in enumerate(compressed_lorem_list):
-    assert zlib.compress(lorem, level) == text
+for level, expected in enumerate(compressed_lorem_list):
+    actual = zlib.compress(lorem, level)
+    assert actual == expected

 # default level
 assert zlib.compress(lorem) == zlib.compress(lorem, -1) == zlib.compress(lorem, 6)
--- a/stdlib/src/zlib.rs
+++ b/stdlib/src/zlib.rs
@@ -5,7 +5,7 @@ mod zlib {
    use crate::vm::{
        builtins::{PyBaseExceptionRef, PyBytes, PyBytesRef, PyIntRef, PyTypeRef},
        common::lock::PyMutex,
-        function::{ArgBytesLike, ArgPrimitiveIndex, ArgSize, OptionalArg, OptionalOption},
+        function::{ArgBytesLike, ArgPrimitiveIndex, ArgSize, OptionalArg},
        PyPayload, PyResult, VirtualMachine,
    };
    use adler32::RollingAdler32 as Adler32;
@@ -47,7 +47,7 @@ mod zlib {

    // copied from zlibmodule.c (commit 530f506ac91338)
    #[pyattr]
-    const MAX_WBITS: u8 = 15;
+    const MAX_WBITS: i8 = 15;
    #[pyattr]
    const DEF_BUF_SIZE: usize = 16 * 1024;
    #[pyattr]
@@ -78,8 +78,9 @@ mod zlib {
        crate::binascii::crc32(data, begin_state)
    }

-    fn compression_from_int(level: Option<i32>) -> Option<Compression> {
-        match level.unwrap_or(Z_DEFAULT_COMPRESSION) {
+    // TODO: rewrite with TryFromBorrowedObject
+    fn compression_from_int(level: i32) -> Option<Compression> {
+        match level {
            Z_DEFAULT_COMPRESSION => Some(Compression::default()),
            valid_level @ Z_NO_COMPRESSION..=Z_BEST_COMPRESSION => {
                Some(Compression::new(valid_level as u32))
@@ -92,23 +93,33 @@ mod zlib {
    struct PyFuncCompressArgs {
        #[pyarg(positional)]
        data: ArgBytesLike,
-        #[pyarg(any, optional)]
-        level: OptionalOption<i32>,
+        #[pyarg(any, default = "Z_DEFAULT_COMPRESSION")]
+        level: i32,
+        #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
+        wbits: ArgPrimitiveIndex<i8>,
    }

    /// Returns a bytes object containing compressed data.
    #[pyfunction]
    fn compress(args: PyFuncCompressArgs, vm: &VirtualMachine) -> PyResult<PyBytesRef> {
-        let data = args.data;
-        let level = args.level;
+        let PyFuncCompressArgs {
+            data,
+            level,
+            ref wbits,
+        } = args;

-        let compression = compression_from_int(level.flatten())
+        let level = compression_from_int(level)
            .ok_or_else(|| new_zlib_error("Bad compression level", vm))?;

-        let mut encoder = ZlibEncoder::new(Vec::new(), compression);
-        data.with_ref(|input_bytes| encoder.write_all(input_bytes).unwrap());
-        let encoded_bytes = encoder.finish().unwrap();
-
+        let encoded_bytes = if args.wbits.value == MAX_WBITS {
+            let mut encoder = ZlibEncoder::new(Vec::new(), level);
+            data.with_ref(|input_bytes| encoder.write_all(input_bytes).unwrap());
+            encoder.finish().unwrap()
+        } else {
+            let mut inner = CompressInner::new(InitOptions::new(wbits.value, vm)?.compress(level));
+            data.with_ref(|input_bytes| inner.compress(input_bytes, vm))?;
+            inner.flush(vm)?
+        };
        Ok(vm.ctx.new_bytes(encoded_bytes))
    }

@@ -125,6 +136,21 @@ mod zlib {
    }

    impl InitOptions {
+        fn new(wbits: i8, vm: &VirtualMachine) -> PyResult<InitOptions> {
+            let header = wbits > 0;
+            let wbits = wbits.unsigned_abs();
+            match wbits {
+                9..=15 => Ok(InitOptions::Standard {
+                    header,
+                    #[cfg(feature = "zlib")]
+                    wbits,
+                }),
+                #[cfg(feature = "zlib")]
+                25..=31 => Ok(InitOptions::Gzip { wbits: wbits - 16 }),
+                _ => Err(vm.new_value_error("Invalid initialization option".to_owned())),
+            }
+        }
+
        fn decompress(self) -> Decompress {
            match self {
                #[cfg(not(feature = "zlib"))]
@@ -149,22 +175,6 @@ mod zlib {
        }
    }

-    fn header_from_wbits(wbits: OptionalArg<i8>, vm: &VirtualMachine) -> PyResult<InitOptions> {
-        let wbits = wbits.unwrap_or(MAX_WBITS as i8);
-        let header = wbits > 0;
-        let wbits = wbits.unsigned_abs();
-        match wbits {
-            9..=15 => Ok(InitOptions::Standard {
-                header,
-                #[cfg(feature = "zlib")]
-                wbits,
-            }),
-            #[cfg(feature = "zlib")]
-            25..=31 => Ok(InitOptions::Gzip { wbits: wbits - 16 }),
-            _ => Err(vm.new_value_error("Invalid initialization option".to_owned())),
-        }
-    }
-
    fn _decompress(
        mut data: &[u8],
        d: &mut Decompress,
@@ -232,43 +242,55 @@ mod zlib {
    struct PyFuncDecompressArgs {
        #[pyarg(positional)]
        data: ArgBytesLike,
-        #[pyarg(any, optional)]
-        wbits: OptionalArg<ArgPrimitiveIndex<i8>>,
-        #[pyarg(any, optional)]
-        bufsize: OptionalArg<ArgPrimitiveIndex<usize>>,
+        #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
+        wbits: ArgPrimitiveIndex<i8>,
+        #[pyarg(any, default = "ArgPrimitiveIndex { value: DEF_BUF_SIZE }")]
+        bufsize: ArgPrimitiveIndex<usize>,
    }

    /// Returns a bytes object containing the uncompressed data.
    #[pyfunction]
-    fn decompress(arg: PyFuncDecompressArgs, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
-        let data = arg.data;
-        let wbits = arg.wbits;
-        let bufsize = arg.bufsize;
+    fn decompress(args: PyFuncDecompressArgs, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
+        let PyFuncDecompressArgs {
+            data,
+            wbits,
+            bufsize,
+        } = args;
        data.with_ref(|data| {
-            let bufsize = bufsize.into_primitive().unwrap_or(DEF_BUF_SIZE);
+            let mut d = InitOptions::new(wbits.value, vm)?.decompress();

-            let mut d = header_from_wbits(wbits.into_primitive(), vm)?.decompress();
-
-            _decompress(data, &mut d, bufsize, None, false, vm).and_then(|(buf, stream_end)| {
-                if stream_end {
-                    Ok(buf)
-                } else {
-                    Err(new_zlib_error(
-                        "Error -5 while decompressing data: incomplete or truncated stream",
-                        vm,
-                    ))
-                }
-            })
+            _decompress(data, &mut d, bufsize.value, None, false, vm).and_then(
+                |(buf, stream_end)| {
+                    if stream_end {
+                        Ok(buf)
+                    } else {
+                        Err(new_zlib_error(
+                            "Error -5 while decompressing data: incomplete or truncated stream",
+                            vm,
+                        ))
+                    }
+                },
+            )
        })
    }

+    #[derive(FromArgs)]
+    struct DecompressobjArgs {
+        #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
+        wbits: ArgPrimitiveIndex<i8>,
+        #[cfg(feature = "zlib")]
+        #[pyarg(any, optional)]
+        _zdict: OptionalArg<ArgBytesLike>,
+    }
+
    #[pyfunction]
    fn decompressobj(args: DecompressobjArgs, vm: &VirtualMachine) -> PyResult<PyDecompress> {
        #[allow(unused_mut)]
-        let mut decompress = header_from_wbits(args.wbits.into_primitive(), vm)?.decompress();
+        let mut decompress = InitOptions::new(args.wbits.value, vm)?.decompress();
        #[cfg(feature = "zlib")]
-        if let OptionalArg::Present(dict) = args.zdict {
-            dict.with_ref(|d| decompress.set_dictionary(d).unwrap());
+        if let OptionalArg::Present(_dict) = args._zdict {
+            // FIXME: always fails
+            // dict.with_ref(|d| decompress.set_dictionary(d));
        }
        Ok(PyDecompress {
            decompress: PyMutex::new(decompress),
@@ -407,34 +429,44 @@ mod zlib {
    }

    #[derive(FromArgs)]
-    struct DecompressobjArgs {
-        #[pyarg(any, optional)]
-        wbits: OptionalArg<ArgPrimitiveIndex<i8>>,
+    #[allow(dead_code)] // FIXME: use args
+    struct CompressobjArgs {
+        #[pyarg(any, default = "Z_DEFAULT_COMPRESSION")]
+        level: i32,
+        // only DEFLATED is valid right now, so this value is ignored
+        #[pyarg(any, default = "DEFLATED")]
+        _method: i32,
+        #[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
+        wbits: ArgPrimitiveIndex<i8>,
+        #[pyarg(any, name = "_memLevel", default = "DEF_MEM_LEVEL")]
+        _mem_level: u8,
+        #[cfg(feature = "zlib")]
+        #[pyarg(any, default = "Z_DEFAULT_STRATEGY")]
+        _strategy: i32,
        #[cfg(feature = "zlib")]
        #[pyarg(any, optional)]
-        zdict: OptionalArg<ArgBytesLike>,
+        zdict: Option<ArgBytesLike>,
    }

    #[pyfunction]
-    fn compressobj(
-        level: OptionalArg<i32>,
-        // only DEFLATED is valid right now, it's w/e
-        _method: OptionalArg<i32>,
-        wbits: OptionalArg<ArgPrimitiveIndex<i8>>,
-        // these aren't used.
-        _mem_level: OptionalArg<i32>, // this is memLevel in CPython
-        _strategy: OptionalArg<i32>,
-        _zdict: OptionalArg<ArgBytesLike>,
-        vm: &VirtualMachine,
-    ) -> PyResult<PyCompress> {
-        let level = compression_from_int(level.into_option())
+    fn compressobj(args: CompressobjArgs, vm: &VirtualMachine) -> PyResult<PyCompress> {
+        let CompressobjArgs {
+            level,
+            wbits,
+            #[cfg(feature = "zlib")]
+            zdict,
+            ..
+        } = args;
+        let level = compression_from_int(level)
            .ok_or_else(|| vm.new_value_error("invalid initialization option".to_owned()))?;
-        let compress = header_from_wbits(wbits.into_primitive(), vm)?.compress(level);
+        #[allow(unused_mut)]
+        let mut compress = InitOptions::new(wbits.value, vm)?.compress(level);
+        #[cfg(feature = "zlib")]
+        if let Some(zdict) = zdict {
+            zdict.with_ref(|zdict| compress.set_dictionary(zdict).unwrap());
+        }
        Ok(PyCompress {
-            inner: PyMutex::new(CompressInner {
-                compress,
-                unconsumed: Vec::new(),
-            }),
+            inner: PyMutex::new(CompressInner::new(compress)),
        })
    }

@@ -477,6 +509,12 @@ mod zlib {
    const CHUNKSIZE: usize = u32::MAX as usize;

    impl CompressInner {
+        fn new(compress: Compress) -> Self {
+            Self {
+                compress,
+                unconsumed: Vec::new(),
+            }
+        }
        fn compress(&mut self, data: &[u8], vm: &VirtualMachine) -> PyResult<Vec<u8>> {
            let orig_in = self.compress.total_in() as usize;
            let mut cur_in = 0;