Fix float parsing (#5643)

* Fix float parsing

* Add rustpython_literal::complex

* Don't call .to_string() on a constant
This commit is contained in:
Noa
2025-03-31 00:37:47 -05:00
committed by GitHub
parent 0b35946972
commit 160363fa46
10 changed files with 155 additions and 133 deletions

12
Cargo.lock generated
View File

@@ -1270,9 +1270,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "lexical-parse-float"
version = "0.8.5"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f"
checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2"
dependencies = [
"lexical-parse-integer",
"lexical-util",
@@ -1281,9 +1281,9 @@ dependencies = [
[[package]]
name = "lexical-parse-integer"
version = "0.8.6"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9"
checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e"
dependencies = [
"lexical-util",
"static_assertions",
@@ -1291,9 +1291,9 @@ dependencies = [
[[package]]
name = "lexical-util"
version = "0.8.5"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc"
checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3"
dependencies = [
"static_assertions",
]

View File

@@ -35,8 +35,6 @@ class OtherFloatSubclass(float):
class GeneralFloatCases(unittest.TestCase):
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_float(self):
self.assertEqual(float(3.14), 3.14)
self.assertEqual(float(314), 314.0)

View File

@@ -609,6 +609,49 @@ macro_rules! ascii {
}
pub use ascii;
// TODO: this should probably live in a crate like unic or unicode-properties
/// Code point of the digit zero of every run of ten consecutive decimal-digit
/// characters handled by [`char_to_decimal`], in strictly ascending order.
///
/// Each run is contiguous (`zero..=zero + 9`), so storing only the zero is
/// enough to recover the digit value. The entries are numeric on purpose:
/// literal glyphs for several of these digits are easily lost or mangled by
/// tooling, which would silently corrupt a table of `char` literals.
const UNICODE_DECIMAL_ZEROS: &[u32] = &[
    0x0030,  // ASCII
    0x0660,  // Arabic-Indic
    0x06F0,  // Extended Arabic-Indic
    0x07C0,  // NKo
    0x0966,  // Devanagari
    0x09E6,  // Bengali
    0x0A66,  // Gurmukhi
    0x0AE6,  // Gujarati
    0x0B66,  // Oriya
    0x0BE6,  // Tamil
    0x0C66,  // Telugu
    0x0CE6,  // Kannada
    0x0D66,  // Malayalam
    0x0DE6,  // Sinhala Lith
    0x0E50,  // Thai
    0x0ED0,  // Lao
    0x0F20,  // Tibetan
    0x1040,  // Myanmar
    0x1090,  // Myanmar Shan
    0x17E0,  // Khmer
    0x1810,  // Mongolian
    0x1946,  // Limbu
    0x19D0,  // New Tai Lue
    0x1A80,  // Tai Tham Hora
    0x1A90,  // Tai Tham Tham
    0x1B50,  // Balinese
    0x1BB0,  // Sundanese
    0x1C40,  // Lepcha
    0x1C50,  // Ol Chiki
    0xA620,  // Vai
    0xA8D0,  // Saurashtra
    0xA900,  // Kayah Li
    0xA9D0,  // Javanese
    0xA9F0,  // Myanmar Tai Laing
    0xAA50,  // Cham
    0xABF0,  // Meetei Mayek
    0xFF10,  // Fullwidth
    0x104A0, // Osmanya
    0x11066, // Brahmi
    0x110F0, // Sora Sompeng
    0x11136, // Chakma
    0x111D0, // Sharada
    0x112F0, // Khudawadi
    0x11450, // Newa
    0x114D0, // Tirhuta
    0x11650, // Modi
    0x116C0, // Takri
    0x11730, // Ahom
    0x118E0, // Warang Citi
    0x11C50, // Bhaiksuki
    0x11D50, // Masaram Gondi
    0x16A60, // Mro
    0x16B50, // Pahawh Hmong
    0x1D7CE, // Mathematical bold
    0x1D7D8, // Mathematical double-struck
    0x1D7E2, // Mathematical sans-serif
    0x1D7EC, // Mathematical sans-serif bold
    0x1D7F6, // Mathematical monospace
    0x1E950, // Adlam
];

/// Return the numeric value (`0..=9`) of a decimal-digit character, or `None`
/// if `ch` is not one of the digits covered by [`UNICODE_DECIMAL_ZEROS`].
pub fn char_to_decimal(ch: char) -> Option<u8> {
    let cp = u32::from(ch);
    // Number of runs whose zero is <= cp; the last of those is the only run
    // that can contain `ch`. `checked_sub` handles code points below '0'.
    let runs_at_or_below = UNICODE_DECIMAL_ZEROS.partition_point(|&zero| zero <= cp);
    let zero = *UNICODE_DECIMAL_ZEROS.get(runs_at_or_below.checked_sub(1)?)?;
    let offset = cp - zero;
    // `offset < 10` guarantees the cast below is lossless.
    (offset < 10).then_some(offset as u8)
}
#[cfg(test)]
mod tests {
use super::*;

View File

@@ -366,7 +366,7 @@ impl<'a, 'b, 'c> Unparser<'a, 'b, 'c> {
}
}
&ruff::Number::Complex { real, imag } => self
.p(&rustpython_literal::float::complex_to_string(real, imag)
.p(&rustpython_literal::complex::to_string(real, imag)
.replace("inf", inf_str))?,
}
}

View File

@@ -13,7 +13,7 @@ rustpython-wtf8 = { workspace = true }
hexf-parse = "0.2.1"
is-macro.workspace = true
lexical-parse-float = { version = "0.8.0", features = ["format"] }
lexical-parse-float = { version = "1.0.4", features = ["format"] }
num-traits = { workspace = true }
unic-ucd-category = { workspace = true }

View File

@@ -0,0 +1,73 @@
use crate::float;
/// Convert a complex number to a string.
pub fn to_string(re: f64, im: f64) -> String {
// integer => drop ., fractional => float_ops
let mut im_part = if im.fract() == 0.0 {
im.to_string()
} else {
float::to_string(im)
};
im_part.push('j');
// positive empty => return im_part, integer => drop ., fractional => float_ops
let re_part = if re == 0.0 {
if re.is_sign_positive() {
return im_part;
} else {
"-0".to_owned()
}
} else if re.fract() == 0.0 {
re.to_string()
} else {
float::to_string(re)
};
let mut result =
String::with_capacity(re_part.len() + im_part.len() + 2 + im.is_sign_positive() as usize);
result.push('(');
result.push_str(&re_part);
if im.is_sign_positive() || im.is_nan() {
result.push('+');
}
result.push_str(&im_part);
result.push(')');
result
}
/// Parse a complex number from a string.
///
/// Surrounding whitespace and one optional pair of enclosing parentheses
/// (with optional whitespace just inside them) are ignored. The remaining
/// text must be one of:
///
/// * `<real>`: a float with no imaginary part,
/// * `<imag>j`: imaginary part only; the coefficient may be empty, `+` or `-`,
/// * `<real><+|-><imag>j`: both parts, split at the central sign.
///
/// The suffix may be `j` or `J`. Whitespace anywhere inside the number
/// (for example `"1 +2j"` or `"1+2 j"`) makes the string malformed.
///
/// Returns `Some((re, im))` on success and `None` for malformed input.
pub fn parse_str(s: &str) -> Option<(f64, f64)> {
    let s = s.trim();
    // Handle parentheses
    let s = match s.strip_prefix('(') {
        None => s,
        Some(inner) => inner.strip_suffix(')')?.trim(),
    };
    // `float::parse_str` trims its argument, so without this check whitespace
    // next to the central operator or before the `j` would be silently
    // accepted, which CPython's `complex()` rejects.
    if s.contains(char::is_whitespace) {
        return None;
    }

    let body = match s.strip_suffix(|c| c == 'j' || c == 'J') {
        Some(body) => body,
        // No imaginary suffix: the whole string is the real part.
        None => return Some((float::parse_str(s)?, 0.0)),
    };

    // Find the central +/- operator: the first sign that is neither the
    // leading character nor part of an exponent ("1e-5"). The sign is an ASCII
    // byte, so slicing at its index always lands on a char boundary.
    let operator = body.as_bytes().windows(2).position(|pair| {
        matches!(pair[1], b'+' | b'-') && !matches!(pair[0], b'e' | b'E')
    });
    let (real, imag_text) = match operator {
        Some(i) => (float::parse_str(&body[..=i])?, &body[i + 1..]),
        None => (0.0, body),
    };

    let imag = match imag_text {
        // "j", "+j"
        "" | "+" => 1.0,
        // "-j"
        "-" => -1.0,
        coefficient => float::parse_str(coefficient)?,
    };
    Some((real, imag))
}

View File

@@ -6,49 +6,8 @@ pub fn parse_str(literal: &str) -> Option<f64> {
parse_inner(literal.trim().as_bytes())
}
/// Remove digit-grouping underscores from a numeric literal.
///
/// An underscore is only valid directly between two ASCII digits. Returns the
/// literal with all underscores removed, or `None` if any underscore is
/// leading, trailing, doubled, or adjacent to a non-digit byte.
fn strip_underlines(literal: &[u8]) -> Option<Vec<u8>> {
    let mut cleaned = Vec::<u8>::new();
    // Sentinel "previous byte" that is neither a digit nor an underscore.
    let mut last = b'\0';
    for &byte in literal {
        match (last, byte) {
            // An underscore must follow a digit.
            (before, b'_') if !before.is_ascii_digit() => return None,
            // Valid underscore: drop it from the output.
            (_, b'_') => {}
            // An underscore must be followed by a digit.
            (b'_', after) if !after.is_ascii_digit() => return None,
            (_, other) => cleaned.push(other),
        }
        last = byte;
    }
    // A trailing underscore has no digit after it.
    (last != b'_').then_some(cleaned)
}
/// Parse a float from raw bytes, ignoring leading and trailing ASCII
/// whitespace.
pub fn parse_bytes(literal: &[u8]) -> Option<f64> {
    let trimmed = trim_slice(literal, u8::is_ascii_whitespace);
    parse_inner(trimmed)
}
fn trim_slice<T>(v: &[T], mut trim: impl FnMut(&T) -> bool) -> &[T] {
let mut it = v.iter();
// it.take_while_ref(&mut trim).for_each(drop);
// hmm.. `&mut slice::Iter<_>` is not `Clone`
// it.by_ref().rev().take_while_ref(&mut trim).for_each(drop);
while it.clone().next().is_some_and(&mut trim) {
it.next();
}
while it.clone().next_back().is_some_and(&mut trim) {
it.next_back();
}
it.as_slice()
parse_inner(literal.trim_ascii())
}
fn parse_inner(literal: &[u8]) -> Option<f64> {
@@ -56,15 +15,11 @@ fn parse_inner(literal: &[u8]) -> Option<f64> {
FromLexicalWithOptions, NumberFormatBuilder, Options, format::PYTHON3_LITERAL,
};
// Use custom function for underline handling for now.
// For further information see https://github.com/Alexhuszagh/rust-lexical/issues/96.
let stripped = strip_underlines(literal)?;
// lexical-core's format::PYTHON_STRING is inaccurate
const PYTHON_STRING: u128 = NumberFormatBuilder::rebuild(PYTHON3_LITERAL)
.no_special(false)
.build();
f64::from_lexical_with_options::<PYTHON_STRING>(&stripped, &Options::new()).ok()
f64::from_lexical_with_options::<PYTHON_STRING>(literal, &Options::new()).ok()
}
pub fn is_integer(v: f64) -> bool {
@@ -223,39 +178,6 @@ pub fn to_string(value: f64) -> String {
}
}
pub fn complex_to_string(re: f64, im: f64) -> String {
// integer => drop ., fractional => float_ops
let mut im_part = if im.fract() == 0.0 {
im.to_string()
} else {
to_string(im)
};
im_part.push('j');
// positive empty => return im_part, integer => drop ., fractional => float_ops
let re_part = if re == 0.0 {
if re.is_sign_positive() {
return im_part;
} else {
re.to_string()
}
} else if re.fract() == 0.0 {
re.to_string()
} else {
to_string(re)
};
let mut result =
String::with_capacity(re_part.len() + im_part.len() + 2 + im.is_sign_positive() as usize);
result.push('(');
result.push_str(&re_part);
if im.is_sign_positive() || im.is_nan() {
result.push('+');
}
result.push_str(&im_part);
result.push(')');
result
}
pub fn from_hex(s: &str) -> Option<f64> {
if let Ok(f) = hexf_parse::parse_hexf64(s, false) {
return Some(f);

View File

@@ -1,4 +1,5 @@
pub mod char;
pub mod complex;
pub mod escape;
pub mod float;
pub mod format;

View File

@@ -179,13 +179,13 @@ impl Constructor for PyComplex {
"complex() can't take second arg if first is a string".to_owned(),
));
}
let value = s
let (re, im) = s
.to_str()
.and_then(|s| parse_str(s.trim()))
.and_then(rustpython_literal::complex::parse_str)
.ok_or_else(|| {
vm.new_value_error("complex() arg is a malformed string".to_owned())
})?;
return Self::from(value)
return Self::from(Complex64 { re, im })
.into_ref_with_type(vm, cls)
.map(Into::into);
} else {
@@ -494,7 +494,7 @@ impl Representable for PyComplex {
// TODO: when you fix this, move it to rustpython_common::complex::repr and update
// ast/src/unparse.rs + impl Display for Constant in ast/src/constant.rs
let Complex64 { re, im } = zelf.value;
Ok(rustpython_literal::float::complex_to_string(re, im))
Ok(rustpython_literal::complex::to_string(re, im))
}
}
@@ -519,40 +519,3 @@ pub struct ComplexArgs {
#[pyarg(any, optional)]
imag: OptionalArg<PyObjectRef>,
}
/// Parse the textual form of a complex number.
///
/// Accepts an optional pair of enclosing parentheses, then either a plain
/// float (real part only) or text ending in `j`/`J` with an optional real
/// part in front of the central sign. Returns `None` for malformed input.
fn parse_str(s: &str) -> Option<Complex64> {
    // Peel off one optional pair of enclosing parentheses.
    let s = if let Some(inner) = s.strip_prefix('(') {
        inner.strip_suffix(')')?.trim()
    } else {
        s
    };

    let body = match s.strip_suffix(|c| c == 'j' || c == 'J') {
        Some(body) => body,
        // No imaginary suffix: the whole string is the real part.
        None => return Some(Complex64::new(crate::literal::float::parse_str(s)?, 0.0)),
    };

    // Locate the central +/- operator: the first sign that is not the leading
    // character and does not belong to an exponent such as "1e-5".
    let operator = body
        .as_bytes()
        .windows(2)
        .position(|pair| (pair[1] == b'+' || pair[1] == b'-') && pair[0] != b'e' && pair[0] != b'E');
    let (real, imag_text) = match operator {
        Some(i) => (
            crate::literal::float::parse_str(&body[..=i])?,
            &body[i + 1..],
        ),
        None => (0.0, body),
    };

    let imag = match imag_text {
        // "j", "+j"
        "" | "+" => 1.0,
        // "-j"
        "-" => -1.0,
        coefficient => crate::literal::float::parse_str(coefficient)?,
    };
    Some(Complex64::new(real, imag))
}

View File

@@ -159,9 +159,31 @@ impl Constructor for PyFloat {
}
fn float_from_string(val: PyObjectRef, vm: &VirtualMachine) -> PyResult<f64> {
let (bytearray, buffer, buffer_lock);
let (bytearray, buffer, buffer_lock, mapped_string);
let b = if let Some(s) = val.payload_if_subclass::<PyStr>(vm) {
s.as_wtf8().trim().as_bytes()
use crate::common::str::PyKindStr;
match s.as_str_kind() {
PyKindStr::Ascii(s) => s.trim().as_bytes(),
PyKindStr::Utf8(s) => {
mapped_string = s
.trim()
.chars()
.map(|c| {
if let Some(n) = rustpython_common::str::char_to_decimal(c) {
char::from_digit(n.into(), 10).unwrap()
} else if c.is_whitespace() {
' '
} else {
c
}
})
.collect::<String>();
mapped_string.as_bytes()
}
// if there are surrogates, it's not gonna parse anyway,
// so we can just choose a known bad value
PyKindStr::Wtf8(_) => b"",
}
} else if let Some(bytes) = val.payload_if_subclass::<PyBytes>(vm) {
bytes.as_bytes()
} else if let Some(buf) = val.payload_if_subclass::<PyByteArray>(vm) {