Merge pull request #4688 from tdub0/compression

Update _compression, gzip, and test_gzip for CPython v3.11.2
This commit is contained in:
Jeong YunWon
2023-03-16 00:52:21 +09:00
committed by GitHub
5 changed files with 258 additions and 131 deletions

12
Lib/_compression.py vendored
View File

@@ -1,7 +1,7 @@
"""Internal classes used by the gzip, lzma and bz2 modules"""
import io
import sys
BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size
@@ -110,6 +110,16 @@ class DecompressReader(io.RawIOBase):
self._pos += len(data)
return data
def readall(self):
    """Read until read() returns no more data and return it as one bytes object."""
    pieces = []
    # Requesting sys.maxsize bytes puts no practical limit on the output
    # buffer, which lets the whole input buffer be decompressed within a
    # single .decompress() call.
    while True:
        piece = self.read(sys.maxsize)
        if not piece:
            break
        pieces.append(piece)
    return b"".join(pieces)
# Rewind the file to the beginning of the data stream.
def _rewind(self):
self._fp.seek(0)

165
Lib/gzip.py vendored
View File

@@ -399,6 +399,59 @@ class GzipFile(_compression.BaseStream):
return self._buffer.readline(size)
def _read_exact(fp, n):
'''Read exactly *n* bytes from `fp`
This method is required because fp may be unbuffered,
i.e. return short reads.
'''
data = fp.read(n)
while len(data) < n:
b = fp.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data
def _read_gzip_header(fp):
'''Read a gzip header from `fp` and progress to the end of the header.
Returns last mtime if header was present or None otherwise.
'''
magic = fp.read(2)
if magic == b'':
return None
if magic != b'\037\213':
raise BadGzipFile('Not a gzipped file (%r)' % magic)
(method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
if method != 8:
raise BadGzipFile('Unknown compression method')
if flag & FEXTRA:
# Read & discard the extra field, if present
extra_len, = struct.unpack("<H", _read_exact(fp, 2))
_read_exact(fp, extra_len)
if flag & FNAME:
# Read and discard a null-terminated string containing the filename
while True:
s = fp.read(1)
if not s or s==b'\000':
break
if flag & FCOMMENT:
# Read and discard a null-terminated string containing a comment
while True:
s = fp.read(1)
if not s or s==b'\000':
break
if flag & FHCRC:
_read_exact(fp, 2) # Read & discard the 16-bit header CRC
return last_mtime
class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib.decompressobj,
@@ -411,53 +464,11 @@ class _GzipReader(_compression.DecompressReader):
self._crc = zlib.crc32(b"")
self._stream_size = 0 # Decompressed size of unconcatenated stream
def _read_exact(self, n):
'''Read exactly *n* bytes from `self._fp`
This method is required because self._fp may be unbuffered,
i.e. return short reads.
'''
data = self._fp.read(n)
while len(data) < n:
b = self._fp.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data
def _read_gzip_header(self):
magic = self._fp.read(2)
if magic == b'':
last_mtime = _read_gzip_header(self._fp)
if last_mtime is None:
return False
if magic != b'\037\213':
raise BadGzipFile('Not a gzipped file (%r)' % magic)
(method, flag,
self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
if method != 8:
raise BadGzipFile('Unknown compression method')
if flag & FEXTRA:
# Read & discard the extra field, if present
extra_len, = struct.unpack("<H", self._read_exact(2))
self._read_exact(extra_len)
if flag & FNAME:
# Read and discard a null-terminated string containing the filename
while True:
s = self._fp.read(1)
if not s or s==b'\000':
break
if flag & FCOMMENT:
# Read and discard a null-terminated string containing a comment
while True:
s = self._fp.read(1)
if not s or s==b'\000':
break
if flag & FHCRC:
self._read_exact(2) # Read & discard the 16-bit header CRC
self._last_mtime = last_mtime
return True
def read(self, size=-1):
@@ -520,7 +531,7 @@ class _GzipReader(_compression.DecompressReader):
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32, isize = struct.unpack("<II", self._read_exact(8))
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
if crc32 != self._crc:
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
hex(self._crc)))
@@ -540,21 +551,69 @@ class _GzipReader(_compression.DecompressReader):
super()._rewind()
self._new_member = True
def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """
    Build a minimal gzip header that carries no optional fields.

    :param compresslevel: Compresslevel used to determine the xfl byte.
    :param mtime: The mtime (must support conversion to a 32-bit integer);
                  the current time is used when it is None.
    :return: A bytes object representing the gzip header.
    """
    timestamp = time.time() if mtime is None else mtime
    xfl = (2 if compresslevel == _COMPRESS_LEVEL_BEST
           else 4 if compresslevel == _COMPRESS_LEVEL_FAST
           else 0)
    # Layout: ID1, ID2 magic bytes, method (8=deflate), header flags (0, no
    # extra fields), mtime, xfl and os (255 for unknown OS).
    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(timestamp), xfl, 255)
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
"""Compress data in one shot and return the compressed string.
Optional argument is the compression level, in range of 0-9.
compresslevel sets the compression level in range of 0-9.
mtime can be used to set the modification time. The modification time is
set to the current time by default.
"""
buf = io.BytesIO()
with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
f.write(data)
return buf.getvalue()
if mtime == 0:
# Use zlib as it creates the header with 0 mtime by default.
# This is faster and with less overhead.
return zlib.compress(data, level=compresslevel, wbits=31)
header = _create_simple_gzip_header(compresslevel, mtime)
trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
# Wbits=-15 creates a raw deflate block.
return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
trailer)
def decompress(data):
"""Decompress a gzip compressed string in one shot.
Return the decompressed string.
"""
with GzipFile(fileobj=io.BytesIO(data)) as f:
return f.read()
decompressed_members = []
while True:
fp = io.BytesIO(data)
if _read_gzip_header(fp) is None:
return b"".join(decompressed_members)
# Use a zlib raw deflate compressor
do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
# Read all the data except the header
decompressed = do.decompress(data[fp.tell():])
if not do.eof or len(do.unused_data) < 8:
raise EOFError("Compressed file ended before the end-of-stream "
"marker was reached")
crc, length = struct.unpack("<II", do.unused_data[:8])
if crc != zlib.crc32(decompressed):
raise BadGzipFile("CRC check failed")
if length != (len(decompressed) & 0xffffffff):
raise BadGzipFile("Incorrect length of data produced")
decompressed_members.append(decompressed)
data = do.unused_data[8:].lstrip(b"\x00")
def main():

21
Lib/test/test_gzip.py vendored
View File

@@ -12,7 +12,7 @@ import unittest
from subprocess import PIPE, Popen
from test.support import import_helper
from test.support import os_helper
from test.support import _4G, bigmemtest
from test.support import _4G, bigmemtest, requires_subprocess
from test.support.script_helper import assert_python_ok, assert_python_failure
gzip = import_helper.import_module('gzip')
@@ -552,6 +552,15 @@ class TestGzip(BaseTest):
f.read(1) # to set mtime attribute
self.assertEqual(f.mtime, mtime)
def test_compress_correct_level(self):
    # mtime == 0 sends gzip.compress down a different code path, so the
    # compresslevel handling is checked once with and once without it.
    for mtime in (0, 42):
        with self.subTest(mtime=mtime):
            level0 = gzip.compress(data1, compresslevel=0, mtime=mtime)
            level1 = gzip.compress(data1, compresslevel=1, mtime=mtime)
            # Level 0 must leave the payload verbatim in the output,
            # level 1 must not.
            self.assertIn(data1, level0)
            self.assertNotIn(data1, level1)
def test_decompress(self):
for data in (data1, data2):
buf = io.BytesIO()
@@ -562,6 +571,14 @@ class TestGzip(BaseTest):
datac = gzip.compress(data)
self.assertEqual(gzip.decompress(datac), data)
def test_decompress_truncated_trailer(self):
    # Drop the last 4 bytes of the compressed output.
    truncated = gzip.compress(data1)[:-4]
    with self.assertRaises(EOFError):
        gzip.decompress(truncated)
def test_decompress_missing_trailer(self):
    # Drop the last 8 bytes of the compressed output.
    truncated = gzip.compress(data1)[:-8]
    with self.assertRaises(EOFError):
        gzip.decompress(truncated)
def test_read_truncated(self):
data = data1*50
# Drop the CRC (4 bytes) and file size (4 bytes).
@@ -756,6 +773,7 @@ def create_and_remove_directory(directory):
class TestCommandLine(unittest.TestCase):
data = b'This is a simple test with gzip'
@requires_subprocess()
def test_decompress_stdin_stdout(self):
with io.BytesIO() as bytes_io:
with gzip.GzipFile(fileobj=bytes_io, mode='wb') as gzip_file:
@@ -791,6 +809,7 @@ class TestCommandLine(unittest.TestCase):
self.assertEqual(rc, 1)
self.assertEqual(out, b'')
@requires_subprocess()
@create_and_remove_directory(TEMPDIR)
def test_compress_stdin_outfile(self):
args = sys.executable, '-m', 'gzip'

View File

@@ -48,8 +48,9 @@ compressed_lorem_list = [
b"x\xda\xf3\xc9/J\xcdU\xc8,(.\xcdUH\xc9\xcf\xc9/R(\xce,QH\xccM-\x01\x00\x83\xd5\t\xc5",
]
for level, text in enumerate(compressed_lorem_list):
assert zlib.compress(lorem, level) == text
for level, expected in enumerate(compressed_lorem_list):
actual = zlib.compress(lorem, level)
assert actual == expected
# default level
assert zlib.compress(lorem) == zlib.compress(lorem, -1) == zlib.compress(lorem, 6)

View File

@@ -5,7 +5,7 @@ mod zlib {
use crate::vm::{
builtins::{PyBaseExceptionRef, PyBytes, PyBytesRef, PyIntRef, PyTypeRef},
common::lock::PyMutex,
function::{ArgBytesLike, ArgPrimitiveIndex, ArgSize, OptionalArg, OptionalOption},
function::{ArgBytesLike, ArgPrimitiveIndex, ArgSize, OptionalArg},
PyPayload, PyResult, VirtualMachine,
};
use adler32::RollingAdler32 as Adler32;
@@ -47,7 +47,7 @@ mod zlib {
// copied from zlibmodule.c (commit 530f506ac91338)
#[pyattr]
const MAX_WBITS: u8 = 15;
const MAX_WBITS: i8 = 15;
#[pyattr]
const DEF_BUF_SIZE: usize = 16 * 1024;
#[pyattr]
@@ -78,8 +78,9 @@ mod zlib {
crate::binascii::crc32(data, begin_state)
}
fn compression_from_int(level: Option<i32>) -> Option<Compression> {
match level.unwrap_or(Z_DEFAULT_COMPRESSION) {
// TODO: rewrite with TryFromBorrowedObject
fn compression_from_int(level: i32) -> Option<Compression> {
match level {
Z_DEFAULT_COMPRESSION => Some(Compression::default()),
valid_level @ Z_NO_COMPRESSION..=Z_BEST_COMPRESSION => {
Some(Compression::new(valid_level as u32))
@@ -92,23 +93,33 @@ mod zlib {
struct PyFuncCompressArgs {
#[pyarg(positional)]
data: ArgBytesLike,
#[pyarg(any, optional)]
level: OptionalOption<i32>,
#[pyarg(any, default = "Z_DEFAULT_COMPRESSION")]
level: i32,
#[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
wbits: ArgPrimitiveIndex<i8>,
}
/// Returns a bytes object containing compressed data.
#[pyfunction]
fn compress(args: PyFuncCompressArgs, vm: &VirtualMachine) -> PyResult<PyBytesRef> {
let data = args.data;
let level = args.level;
let PyFuncCompressArgs {
data,
level,
ref wbits,
} = args;
let compression = compression_from_int(level.flatten())
let level = compression_from_int(level)
.ok_or_else(|| new_zlib_error("Bad compression level", vm))?;
let mut encoder = ZlibEncoder::new(Vec::new(), compression);
data.with_ref(|input_bytes| encoder.write_all(input_bytes).unwrap());
let encoded_bytes = encoder.finish().unwrap();
let encoded_bytes = if args.wbits.value == MAX_WBITS {
let mut encoder = ZlibEncoder::new(Vec::new(), level);
data.with_ref(|input_bytes| encoder.write_all(input_bytes).unwrap());
encoder.finish().unwrap()
} else {
let mut inner = CompressInner::new(InitOptions::new(wbits.value, vm)?.compress(level));
data.with_ref(|input_bytes| inner.compress(input_bytes, vm))?;
inner.flush(vm)?
};
Ok(vm.ctx.new_bytes(encoded_bytes))
}
@@ -125,6 +136,21 @@ mod zlib {
}
impl InitOptions {
/// Translate a zlib-style `wbits` argument into [`InitOptions`].
///
/// * `9..=15` yields `Standard` with `header` set.
/// * `-15..=-9` yields `Standard` with `header` cleared.
/// * `25..=31` (only with the `zlib` feature) yields `Gzip` with a window
///   size of `wbits - 16`.
///
/// # Errors
///
/// Any other value produces a `ValueError` with the message
/// "Invalid initialization option".
fn new(wbits: i8, vm: &VirtualMachine) -> PyResult<InitOptions> {
    let header = wbits > 0;
    let wbits = wbits.unsigned_abs();
    match wbits {
        9..=15 => Ok(InitOptions::Standard {
            header,
            #[cfg(feature = "zlib")]
            wbits,
        }),
        // The sign has already been folded away by `unsigned_abs`, so the
        // gzip range must be restricted to positive input explicitly:
        // negative values are only meaningful for raw deflate (-15..=-9),
        // and -31..=-25 has to be rejected rather than treated as gzip.
        #[cfg(feature = "zlib")]
        25..=31 if header => Ok(InitOptions::Gzip { wbits: wbits - 16 }),
        _ => Err(vm.new_value_error("Invalid initialization option".to_owned())),
    }
}
fn decompress(self) -> Decompress {
match self {
#[cfg(not(feature = "zlib"))]
@@ -149,22 +175,6 @@ mod zlib {
}
}
/// Translate an optional zlib-style `wbits` argument into [`InitOptions`].
///
/// A missing argument falls back to `MAX_WBITS`.
///
/// * `9..=15` yields `Standard` with `header` set.
/// * `-15..=-9` yields `Standard` with `header` cleared.
/// * `25..=31` (only with the `zlib` feature) yields `Gzip` with a window
///   size of `wbits - 16`.
///
/// # Errors
///
/// Any other value produces a `ValueError` with the message
/// "Invalid initialization option".
fn header_from_wbits(wbits: OptionalArg<i8>, vm: &VirtualMachine) -> PyResult<InitOptions> {
    let wbits = wbits.unwrap_or(MAX_WBITS as i8);
    let header = wbits > 0;
    let wbits = wbits.unsigned_abs();
    match wbits {
        9..=15 => Ok(InitOptions::Standard {
            header,
            #[cfg(feature = "zlib")]
            wbits,
        }),
        // The sign has already been folded away by `unsigned_abs`, so the
        // gzip range must be restricted to positive input explicitly:
        // negative values are only meaningful for raw deflate (-15..=-9),
        // and -31..=-25 has to be rejected rather than treated as gzip.
        #[cfg(feature = "zlib")]
        25..=31 if header => Ok(InitOptions::Gzip { wbits: wbits - 16 }),
        _ => Err(vm.new_value_error("Invalid initialization option".to_owned())),
    }
}
fn _decompress(
mut data: &[u8],
d: &mut Decompress,
@@ -232,43 +242,55 @@ mod zlib {
struct PyFuncDecompressArgs {
#[pyarg(positional)]
data: ArgBytesLike,
#[pyarg(any, optional)]
wbits: OptionalArg<ArgPrimitiveIndex<i8>>,
#[pyarg(any, optional)]
bufsize: OptionalArg<ArgPrimitiveIndex<usize>>,
#[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
wbits: ArgPrimitiveIndex<i8>,
#[pyarg(any, default = "ArgPrimitiveIndex { value: DEF_BUF_SIZE }")]
bufsize: ArgPrimitiveIndex<usize>,
}
/// Returns a bytes object containing the uncompressed data.
#[pyfunction]
fn decompress(arg: PyFuncDecompressArgs, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
let data = arg.data;
let wbits = arg.wbits;
let bufsize = arg.bufsize;
fn decompress(args: PyFuncDecompressArgs, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
let PyFuncDecompressArgs {
data,
wbits,
bufsize,
} = args;
data.with_ref(|data| {
let bufsize = bufsize.into_primitive().unwrap_or(DEF_BUF_SIZE);
let mut d = InitOptions::new(wbits.value, vm)?.decompress();
let mut d = header_from_wbits(wbits.into_primitive(), vm)?.decompress();
_decompress(data, &mut d, bufsize, None, false, vm).and_then(|(buf, stream_end)| {
if stream_end {
Ok(buf)
} else {
Err(new_zlib_error(
"Error -5 while decompressing data: incomplete or truncated stream",
vm,
))
}
})
_decompress(data, &mut d, bufsize.value, None, false, vm).and_then(
|(buf, stream_end)| {
if stream_end {
Ok(buf)
} else {
Err(new_zlib_error(
"Error -5 while decompressing data: incomplete or truncated stream",
vm,
))
}
},
)
})
}
// Arguments accepted by `decompressobj`.
#[derive(FromArgs)]
struct DecompressobjArgs {
// Window-size / stream-format selector; `MAX_WBITS` is used when the caller
// does not pass it.
#[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
wbits: ArgPrimitiveIndex<i8>,
// Optional dictionary argument; the field only exists when the `zlib`
// feature is enabled.
#[cfg(feature = "zlib")]
#[pyarg(any, optional)]
_zdict: OptionalArg<ArgBytesLike>,
}
#[pyfunction]
fn decompressobj(args: DecompressobjArgs, vm: &VirtualMachine) -> PyResult<PyDecompress> {
#[allow(unused_mut)]
let mut decompress = header_from_wbits(args.wbits.into_primitive(), vm)?.decompress();
let mut decompress = InitOptions::new(args.wbits.value, vm)?.decompress();
#[cfg(feature = "zlib")]
if let OptionalArg::Present(dict) = args.zdict {
dict.with_ref(|d| decompress.set_dictionary(d).unwrap());
if let OptionalArg::Present(_dict) = args._zdict {
// FIXME: always fails
// dict.with_ref(|d| decompress.set_dictionary(d));
}
Ok(PyDecompress {
decompress: PyMutex::new(decompress),
@@ -407,34 +429,44 @@ mod zlib {
}
#[derive(FromArgs)]
struct DecompressobjArgs {
#[pyarg(any, optional)]
wbits: OptionalArg<ArgPrimitiveIndex<i8>>,
#[allow(dead_code)] // FIXME: use args
struct CompressobjArgs {
#[pyarg(any, default = "Z_DEFAULT_COMPRESSION")]
level: i32,
// only DEFLATED is valid right now, it's w/e
#[pyarg(any, default = "DEFLATED")]
_method: i32,
#[pyarg(any, default = "ArgPrimitiveIndex { value: MAX_WBITS }")]
wbits: ArgPrimitiveIndex<i8>,
#[pyarg(any, name = "_memLevel", default = "DEF_MEM_LEVEL")]
_mem_level: u8,
#[cfg(feature = "zlib")]
#[pyarg(any, default = "Z_DEFAULT_STRATEGY")]
_strategy: i32,
#[cfg(feature = "zlib")]
#[pyarg(any, optional)]
zdict: OptionalArg<ArgBytesLike>,
zdict: Option<ArgBytesLike>,
}
#[pyfunction]
fn compressobj(
level: OptionalArg<i32>,
// only DEFLATED is valid right now, it's w/e
_method: OptionalArg<i32>,
wbits: OptionalArg<ArgPrimitiveIndex<i8>>,
// these aren't used.
_mem_level: OptionalArg<i32>, // this is memLevel in CPython
_strategy: OptionalArg<i32>,
_zdict: OptionalArg<ArgBytesLike>,
vm: &VirtualMachine,
) -> PyResult<PyCompress> {
let level = compression_from_int(level.into_option())
fn compressobj(args: CompressobjArgs, vm: &VirtualMachine) -> PyResult<PyCompress> {
let CompressobjArgs {
level,
wbits,
#[cfg(feature = "zlib")]
zdict,
..
} = args;
let level = compression_from_int(level)
.ok_or_else(|| vm.new_value_error("invalid initialization option".to_owned()))?;
let compress = header_from_wbits(wbits.into_primitive(), vm)?.compress(level);
#[allow(unused_mut)]
let mut compress = InitOptions::new(wbits.value, vm)?.compress(level);
#[cfg(feature = "zlib")]
if let Some(zdict) = zdict {
zdict.with_ref(|zdict| compress.set_dictionary(zdict).unwrap());
}
Ok(PyCompress {
inner: PyMutex::new(CompressInner {
compress,
unconsumed: Vec::new(),
}),
inner: PyMutex::new(CompressInner::new(compress)),
})
}
@@ -477,6 +509,12 @@ mod zlib {
const CHUNKSIZE: usize = u32::MAX as usize;
impl CompressInner {
/// Wrap `compress` in a fresh state whose `unconsumed` buffer starts empty.
fn new(compress: Compress) -> Self {
    let unconsumed = Vec::new();
    Self {
        compress,
        unconsumed,
    }
}
fn compress(&mut self, data: &[u8], vm: &VirtualMachine) -> PyResult<Vec<u8>> {
let orig_in = self.compress.total_in() as usize;
let mut cur_in = 0;