From 7204ab7ea82cc92d66ada18434442f6c25926a77 Mon Sep 17 00:00:00 2001 From: boris Date: Fri, 12 Feb 2021 02:40:56 -0500 Subject: [PATCH 1/3] implement _bz2 Co-Authored-By: Jeong YunWon --- Cargo.lock | 22 +++++ stdlib/Cargo.toml | 1 + stdlib/src/bz2.rs | 246 ++++++++++++++++++++++++++++++++++++++++++++++ stdlib/src/lib.rs | 3 + 4 files changed, 272 insertions(+) create mode 100644 stdlib/src/bz2.rs diff --git a/Cargo.lock b/Cargo.lock index 1137c15b9..c5fe5809c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -150,6 +150,27 @@ version = "3.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f1e260c3a9040a7c19a12468758f4c16f31a81a1fe087482be9570ec864bb6c" +[[package]] +name = "bzip2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6afcd980b5f3a45017c57e57a2fcccbb351cc43a356ce117ef760ef8052b89b0" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "caseless" version = "0.2.1" @@ -1722,6 +1743,7 @@ dependencies = [ "ascii", "base64", "blake2", + "bzip2", "cfg-if", "crc32fast", "crossbeam-utils", diff --git a/stdlib/Cargo.toml b/stdlib/Cargo.toml index 90bab2a8c..91c3cd764 100644 --- a/stdlib/Cargo.toml +++ b/stdlib/Cargo.toml @@ -41,6 +41,7 @@ unic-ucd-ident = "0.9.0" adler32 = "1.2.0" crc32fast = "1.3.2" flate2 = "1.0.23" +bzip2 = "0.4" num-complex = "0.4.0" num-bigint = "0.4.3" diff --git a/stdlib/src/bz2.rs b/stdlib/src/bz2.rs new file mode 100644 index 000000000..977dea918 --- /dev/null +++ b/stdlib/src/bz2.rs @@ -0,0 +1,246 @@ +pub(crate) use _bz2::make_module; + +#[pymodule] +mod _bz2 { + use crate::common::lock::PyMutex; + use crate::vm::{ + builtins::{PyBytesRef, PyTypeRef}, + function::{ArgBytesLike, OptionalArg}, + object::{PyPayload, PyResult}, + types::Constructor, + VirtualMachine, + }; + use bzip2::{write::BzEncoder, Decompress, Status}; + use std::{fmt, io::Write}; + + // const BUFSIZ: i32 = 8192; + + struct DecompressorState { + decoder: Decompress, + eof: bool, + needs_input: bool, + // input_buffer: Vec, + // output_buffer: Vec, + } + + #[pyattr] + #[pyclass(name = "BZ2Decompressor")] + #[derive(PyPayload)] + struct BZ2Decompressor { + state: PyMutex, + } + + impl fmt::Debug for BZ2Decompressor { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "_bz2.BZ2Decompressor") + } + } + + impl Constructor for BZ2Decompressor { + type Args = (); + + fn py_new(cls: PyTypeRef, _: Self::Args, vm: &VirtualMachine) -> PyResult { + Self { + state: PyMutex::new(DecompressorState { + decoder: Decompress::new(false), + eof: false, + needs_input: true, + // input_buffer: Vec::new(), + // output_buffer: Vec::new(), + }), + } + .into_ref_with_type(vm, cls) + .map(Into::into) + } + } + + #[pyclass(with(Constructor))] + impl BZ2Decompressor { + #[pymethod] + fn decompress( + &self, + data: ArgBytesLike, + // TODO: PyIntRef + max_length: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + let max_length = max_length.unwrap_or(-1); + if max_length >= 0 { + return Err(vm.new_not_implemented_error( + "the max_value argument is not implemented yet".to_owned(), + )); + } + // let max_length = if max_length < 0 || max_length >= BUFSIZ { + // BUFSIZ + // } else { + // max_length + // }; + + let mut state = self.state.lock(); + let DecompressorState { + decoder, + eof, + .. + // needs_input, + // input_buffer, + // output_buffer, + } = &mut *state; + + if *eof { + return Err(vm.new_exception_msg( + vm.ctx.exceptions.eof_error.to_owned(), + "End of stream already reached".to_owned(), + )); + } + + // data.with_ref(|data| input_buffer.extend(data)); + + // If max_length is negative: + // read the input X bytes at a time, compress it and append it to output. + // Once you're out of input, setting needs_input to true and return the + // output as bytes. + // + // TODO: + // If max_length is non-negative: + // Read the input X bytes at a time, compress it and append it to + // the output. If output reaches `max_length` in size, return + // it (up to max_length), and store the rest of the output + // for later. + + // TODO: arbitrary choice, not the right way to do it. + let mut buf = Vec::with_capacity(data.len() * 32); + + let before = decoder.total_in(); + let res = data.with_ref(|data| decoder.decompress_vec(data, &mut buf)); + let _written = (decoder.total_in() - before) as usize; + + let res = match res { + Ok(x) => x, + // TODO: error message + _ => return Err(vm.new_os_error("Invalid data stream".to_owned())), + }; + + if res == Status::StreamEnd { + *eof = true; + } + Ok(vm.ctx.new_bytes(buf.to_vec())) + } + + #[pyproperty] + fn eof(&self) -> bool { + let state = self.state.lock(); + state.eof + } + + #[pyproperty] + fn unused_data(&self, vm: &VirtualMachine) -> PyBytesRef { + // Data found after the end of the compressed stream. + // If this attribute is accessed before the end of the stream + // has been reached, its value will be b''. + vm.ctx.new_bytes(b"".to_vec()) + // alternatively, be more honest: + // Err(vm.new_not_implemented_error( + // "unused_data isn't implemented yet".to_owned(), + // )) + // + // TODO + // let state = self.state.lock(); + // if state.eof { + // vm.ctx.new_bytes(state.input_buffer.to_vec()) + // else { + // vm.ctx.new_bytes(b"".to_vec()) + // } + } + + #[pyproperty] + fn needs_input(&self) -> bool { + // False if the decompress() method can provide more + // decompressed data before requiring new uncompressed input. + let state = self.state.lock(); + state.needs_input + } + + // TODO: mro()? + } + + struct CompressorState { + flushed: bool, + encoder: Option>>, + } + + #[pyattr] + #[pyclass(name = "BZ2Compressor")] + #[derive(PyPayload)] + struct BZ2Compressor { + state: PyMutex, + } + + impl fmt::Debug for BZ2Compressor { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "_bz2.BZ2Compressor") + } + } + + impl Constructor for BZ2Compressor { + type Args = (OptionalArg,); + + fn py_new(cls: PyTypeRef, args: Self::Args, vm: &VirtualMachine) -> PyResult { + let (compresslevel,) = args; + // TODO: seriously? + // compresslevel.unwrap_or(bzip2::Compression::best().level().try_into().unwrap()); + let compresslevel = compresslevel.unwrap_or(9); + let level = match compresslevel { + valid_level @ 1..=9 => bzip2::Compression::new(valid_level as u32), + _ => { + return Err( + vm.new_value_error("compresslevel must be between 1 and 9".to_owned()) + ) + } + }; + + Self { + state: PyMutex::new(CompressorState { + flushed: false, + encoder: Some(BzEncoder::new(Vec::new(), level)), + }), + } + .into_ref_with_type(vm, cls) + .map(Into::into) + } + } + + // TODO: return partial results from compress() instead of returning everything in flush() + #[pyclass(with(Constructor))] + impl BZ2Compressor { + #[pymethod] + fn compress(&self, data: ArgBytesLike, vm: &VirtualMachine) -> PyResult { + let mut state = self.state.lock(); + if state.flushed { + return Err(vm.new_value_error("Compressor has been flushed".to_owned())); + } + + // let CompressorState { flushed, encoder } = &mut *state; + let CompressorState { encoder, .. } = &mut *state; + + // TODO: handle Err + data.with_ref(|input_bytes| encoder.as_mut().unwrap().write_all(input_bytes).unwrap()); + Ok(vm.ctx.new_bytes(Vec::new())) + } + + #[pymethod] + fn flush(&self, vm: &VirtualMachine) -> PyResult { + let mut state = self.state.lock(); + if state.flushed { + return Err(vm.new_value_error("Repeated call to flush()".to_owned())); + } + + // let CompressorState { flushed, encoder } = &mut *state; + let CompressorState { encoder, .. } = &mut *state; + + // TODO: handle Err + let out = encoder.take().unwrap().finish().unwrap(); + state.flushed = true; + Ok(vm.ctx.new_bytes(out.to_vec())) + } + } +} diff --git a/stdlib/src/lib.rs b/stdlib/src/lib.rs index 98cffde6f..e610e0059 100644 --- a/stdlib/src/lib.rs +++ b/stdlib/src/lib.rs @@ -25,6 +25,8 @@ mod random; mod statistics; // TODO: maybe make this an extension module, if we ever get those // mod re; +#[cfg(not(any(target_arch = "wasm32")))] +mod bz2; #[cfg(not(target_arch = "wasm32"))] pub mod socket; #[cfg(unix)] @@ -115,6 +117,7 @@ pub fn get_module_inits() -> impl Iterator, StdlibInit "select" => select::make_module, "_socket" => socket::make_module, "faulthandler" => faulthandler::make_module, + "_bz2" => bz2::make_module, } #[cfg(feature = "ssl")] { From b17c891115baa9f24540ee236e5b5b0b0bdeb23f Mon Sep 17 00:00:00 2001 From: CPython developers <> Date: Thu, 11 Feb 2021 04:18:09 -0500 Subject: [PATCH 2/3] Add bz2 from CPython 3.10.5 --- Lib/bz2.py | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 Lib/bz2.py diff --git a/Lib/bz2.py b/Lib/bz2.py new file mode 100644 index 000000000..fabe4f73c --- /dev/null +++ b/Lib/bz2.py @@ -0,0 +1,344 @@ +"""Interface to the libbzip2 compression library. + +This module provides a file interface, classes for incremental +(de)compression, and functions for one-shot (de)compression. +""" + +__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", + "open", "compress", "decompress"] + +__author__ = "Nadeem Vawda " + +from builtins import open as _builtin_open +import io +import os +import _compression + +from _bz2 import BZ2Compressor, BZ2Decompressor + + +_MODE_CLOSED = 0 +_MODE_READ = 1 +# Value 2 no longer used +_MODE_WRITE = 3 + + +class BZ2File(_compression.BaseStream): + + """A file object providing transparent bzip2 (de)compression. + + A BZ2File can act as a wrapper for an existing file object, or refer + directly to a named file on disk. + + Note that BZ2File provides a *binary* file interface - data read is + returned as bytes, and data to be written should be given as bytes. + """ + + def __init__(self, filename, mode="r", *, compresslevel=9): + """Open a bzip2-compressed file. + + If filename is a str, bytes, or PathLike object, it gives the + name of the file to be opened. Otherwise, it should be a file + object, which will be used to read or write the compressed data. + + mode can be 'r' for reading (default), 'w' for (over)writing, + 'x' for creating exclusively, or 'a' for appending. These can + equivalently be given as 'rb', 'wb', 'xb', and 'ab'. + + If mode is 'w', 'x' or 'a', compresslevel can be a number between 1 + and 9 specifying the level of compression: 1 produces the least + compression, and 9 (default) produces the most compression. + + If mode is 'r', the input file may be the concatenation of + multiple compressed streams. + """ + self._fp = None + self._closefp = False + self._mode = _MODE_CLOSED + + if not (1 <= compresslevel <= 9): + raise ValueError("compresslevel must be between 1 and 9") + + if mode in ("", "r", "rb"): + mode = "rb" + mode_code = _MODE_READ + elif mode in ("w", "wb"): + mode = "wb" + mode_code = _MODE_WRITE + self._compressor = BZ2Compressor(compresslevel) + elif mode in ("x", "xb"): + mode = "xb" + mode_code = _MODE_WRITE + self._compressor = BZ2Compressor(compresslevel) + elif mode in ("a", "ab"): + mode = "ab" + mode_code = _MODE_WRITE + self._compressor = BZ2Compressor(compresslevel) + else: + raise ValueError("Invalid mode: %r" % (mode,)) + + if isinstance(filename, (str, bytes, os.PathLike)): + self._fp = _builtin_open(filename, mode) + self._closefp = True + self._mode = mode_code + elif hasattr(filename, "read") or hasattr(filename, "write"): + self._fp = filename + self._mode = mode_code + else: + raise TypeError("filename must be a str, bytes, file or PathLike object") + + if self._mode == _MODE_READ: + raw = _compression.DecompressReader(self._fp, + BZ2Decompressor, trailing_error=OSError) + self._buffer = io.BufferedReader(raw) + else: + self._pos = 0 + + def close(self): + """Flush and close the file. + + May be called more than once without error. Once the file is + closed, any other operation on it will raise a ValueError. + """ + if self._mode == _MODE_CLOSED: + return + try: + if self._mode == _MODE_READ: + self._buffer.close() + elif self._mode == _MODE_WRITE: + self._fp.write(self._compressor.flush()) + self._compressor = None + finally: + try: + if self._closefp: + self._fp.close() + finally: + self._fp = None + self._closefp = False + self._mode = _MODE_CLOSED + self._buffer = None + + @property + def closed(self): + """True if this file is closed.""" + return self._mode == _MODE_CLOSED + + def fileno(self): + """Return the file descriptor for the underlying file.""" + self._check_not_closed() + return self._fp.fileno() + + def seekable(self): + """Return whether the file supports seeking.""" + return self.readable() and self._buffer.seekable() + + def readable(self): + """Return whether the file was opened for reading.""" + self._check_not_closed() + return self._mode == _MODE_READ + + def writable(self): + """Return whether the file was opened for writing.""" + self._check_not_closed() + return self._mode == _MODE_WRITE + + def peek(self, n=0): + """Return buffered data without advancing the file position. + + Always returns at least one byte of data, unless at EOF. + The exact number of bytes returned is unspecified. + """ + self._check_can_read() + # Relies on the undocumented fact that BufferedReader.peek() + # always returns at least one byte (except at EOF), independent + # of the value of n + return self._buffer.peek(n) + + def read(self, size=-1): + """Read up to size uncompressed bytes from the file. + + If size is negative or omitted, read until EOF is reached. + Returns b'' if the file is already at EOF. + """ + self._check_can_read() + return self._buffer.read(size) + + def read1(self, size=-1): + """Read up to size uncompressed bytes, while trying to avoid + making multiple reads from the underlying stream. Reads up to a + buffer's worth of data if size is negative. + + Returns b'' if the file is at EOF. + """ + self._check_can_read() + if size < 0: + size = io.DEFAULT_BUFFER_SIZE + return self._buffer.read1(size) + + def readinto(self, b): + """Read bytes into b. + + Returns the number of bytes read (0 for EOF). + """ + self._check_can_read() + return self._buffer.readinto(b) + + def readline(self, size=-1): + """Read a line of uncompressed bytes from the file. + + The terminating newline (if present) is retained. If size is + non-negative, no more than size bytes will be read (in which + case the line may be incomplete). Returns b'' if already at EOF. + """ + if not isinstance(size, int): + if not hasattr(size, "__index__"): + raise TypeError("Integer argument expected") + size = size.__index__() + self._check_can_read() + return self._buffer.readline(size) + + def readlines(self, size=-1): + """Read a list of lines of uncompressed bytes from the file. + + size can be specified to control the number of lines read: no + further lines will be read once the total size of the lines read + so far equals or exceeds size. + """ + if not isinstance(size, int): + if not hasattr(size, "__index__"): + raise TypeError("Integer argument expected") + size = size.__index__() + self._check_can_read() + return self._buffer.readlines(size) + + def write(self, data): + """Write a byte string to the file. + + Returns the number of uncompressed bytes written, which is + always the length of data in bytes. Note that due to buffering, + the file on disk may not reflect the data written until close() + is called. + """ + self._check_can_write() + if isinstance(data, (bytes, bytearray)): + length = len(data) + else: + # accept any data that supports the buffer protocol + data = memoryview(data) + length = data.nbytes + + compressed = self._compressor.compress(data) + self._fp.write(compressed) + self._pos += length + return length + + def writelines(self, seq): + """Write a sequence of byte strings to the file. + + Returns the number of uncompressed bytes written. + seq can be any iterable yielding byte strings. + + Line separators are not added between the written byte strings. + """ + return _compression.BaseStream.writelines(self, seq) + + def seek(self, offset, whence=io.SEEK_SET): + """Change the file position. + + The new position is specified by offset, relative to the + position indicated by whence. Values for whence are: + + 0: start of stream (default); offset must not be negative + 1: current stream position + 2: end of stream; offset must not be positive + + Returns the new file position. + + Note that seeking is emulated, so depending on the parameters, + this operation may be extremely slow. + """ + self._check_can_seek() + return self._buffer.seek(offset, whence) + + def tell(self): + """Return the current file position.""" + self._check_not_closed() + if self._mode == _MODE_READ: + return self._buffer.tell() + return self._pos + + +def open(filename, mode="rb", compresslevel=9, + encoding=None, errors=None, newline=None): + """Open a bzip2-compressed file in binary or text mode. + + The filename argument can be an actual filename (a str, bytes, or + PathLike object), or an existing file object to read from or write + to. + + The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or + "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. + The default mode is "rb", and the default compresslevel is 9. + + For binary mode, this function is equivalent to the BZ2File + constructor: BZ2File(filename, mode, compresslevel). In this case, + the encoding, errors and newline arguments must not be provided. + + For text mode, a BZ2File object is created, and wrapped in an + io.TextIOWrapper instance with the specified encoding, error + handling behavior, and line ending(s). + + """ + if "t" in mode: + if "b" in mode: + raise ValueError("Invalid mode: %r" % (mode,)) + else: + if encoding is not None: + raise ValueError("Argument 'encoding' not supported in binary mode") + if errors is not None: + raise ValueError("Argument 'errors' not supported in binary mode") + if newline is not None: + raise ValueError("Argument 'newline' not supported in binary mode") + + bz_mode = mode.replace("t", "") + binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel) + + if "t" in mode: + encoding = io.text_encoding(encoding) + return io.TextIOWrapper(binary_file, encoding, errors, newline) + else: + return binary_file + + +def compress(data, compresslevel=9): + """Compress a block of data. + + compresslevel, if given, must be a number between 1 and 9. + + For incremental compression, use a BZ2Compressor object instead. + """ + comp = BZ2Compressor(compresslevel) + return comp.compress(data) + comp.flush() + + +def decompress(data): + """Decompress a block of data. + + For incremental decompression, use a BZ2Decompressor object instead. + """ + results = [] + while data: + decomp = BZ2Decompressor() + try: + res = decomp.decompress(data) + except OSError: + if results: + break # Leftover data is not a valid bzip2 stream; ignore it. + else: + raise # Error on the first iteration; bail out. + results.append(res) + if not decomp.eof: + raise ValueError("Compressed data ended before the " + "end-of-stream marker was reached") + data = decomp.unused_data + return b"".join(results) From da9c98675bca5a24d160ca3cc456ee7ebbd5c204 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Fri, 5 Aug 2022 04:00:51 +0900 Subject: [PATCH 3/3] optional bz2 --- Cargo.toml | 1 + stdlib/Cargo.toml | 3 ++- stdlib/src/lib.rs | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d584879f0..238cbe32c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ jit = ["rustpython-vm/jit"] threading = ["rustpython-vm/threading", "rustpython-stdlib/threading"] pylib = ["rustpython-vm/pylib"] zlib = ["stdlib", "rustpython-stdlib/zlib"] +bz2 = ["stdlib", "rustpython-stdlib/bz2"] ssl = ["rustpython-stdlib/ssl"] ssl-vendor = ["rustpython-stdlib/ssl-vendor"] diff --git a/stdlib/Cargo.toml b/stdlib/Cargo.toml index 91c3cd764..1a5e3555f 100644 --- a/stdlib/Cargo.toml +++ b/stdlib/Cargo.toml @@ -41,7 +41,7 @@ unic-ucd-ident = "0.9.0" adler32 = "1.2.0" crc32fast = "1.3.2" flate2 = "1.0.23" -bzip2 = "0.4" +bzip2 = { version = "0.4", optional = true } num-complex = "0.4.0" num-bigint = "0.4.3" @@ -86,6 +86,7 @@ compiler = ["rustpython-vm/compiler"] # parser = ["rustpython-parser", "ast"] zlib = ["libz-sys", "flate2/zlib"] +bz2 = ["bzip2"] ssl = ["openssl", "openssl-sys", "foreign-types-shared"] ssl-vendor = ["ssl", "openssl/vendored", "openssl-probe"] diff --git a/stdlib/src/lib.rs b/stdlib/src/lib.rs index e610e0059..7ba884549 100644 --- a/stdlib/src/lib.rs +++ b/stdlib/src/lib.rs @@ -25,7 +25,7 @@ mod random; mod statistics; // TODO: maybe make this an extension module, if we ever get those // mod re; -#[cfg(not(any(target_arch = "wasm32")))] +#[cfg(feature = "bz2")] mod bz2; #[cfg(not(target_arch = "wasm32"))] pub mod socket; @@ -117,12 +117,15 @@ pub fn get_module_inits() -> impl Iterator, StdlibInit "select" => select::make_module, "_socket" => socket::make_module, "faulthandler" => faulthandler::make_module, - "_bz2" => bz2::make_module, } #[cfg(feature = "ssl")] { "_ssl" => ssl::make_module, } + #[cfg(feature = "bz2")] + { + "_bz2" => bz2::make_module, + } // Unix-only #[cfg(unix)] {