-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: parse_float_as_decimal
supports scientific notation and Decimal256
#13806
Changes from 6 commits
e5c0576
bdf43b8
d36b73b
d33a6ff
5b94a38
4822444
731b0aa
49a4e0a
ec6f573
2077749
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,10 +19,13 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; | |
use arrow::compute::kernels::cast_utils::{ | ||
parse_interval_month_day_nano_config, IntervalParseConfig, IntervalUnit, | ||
}; | ||
use arrow::datatypes::DECIMAL128_MAX_PRECISION; | ||
use arrow_schema::DataType; | ||
use arrow::datatypes::{i256, DECIMAL128_MAX_PRECISION}; | ||
use arrow_schema::{DataType, DECIMAL256_MAX_PRECISION}; | ||
use bigdecimal::num_bigint::BigInt; | ||
use bigdecimal::{BigDecimal, Signed, ToPrimitive}; | ||
use datafusion_common::{ | ||
internal_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue, | ||
internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, | ||
DataFusionError, Result, ScalarValue, | ||
}; | ||
use datafusion_expr::expr::{BinaryExpr, Placeholder}; | ||
use datafusion_expr::planner::PlannerResult; | ||
|
@@ -31,6 +34,9 @@ use log::debug; | |
use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator, Value}; | ||
use sqlparser::parser::ParserError::ParserError; | ||
use std::borrow::Cow; | ||
use std::cmp::Ordering; | ||
use std::ops::Neg; | ||
use std::str::FromStr; | ||
|
||
impl<S: ContextProvider> SqlToRel<'_, S> { | ||
pub(crate) fn parse_value( | ||
|
@@ -84,7 +90,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> { | |
} | ||
|
||
if self.options.parse_float_as_decimal { | ||
parse_decimal_128(unsigned_number, negative) | ||
parse_decimal(unsigned_number, negative) | ||
} else { | ||
signed_number.parse::<f64>().map(lit).map_err(|_| { | ||
DataFusionError::from(ParserError(format!( | ||
|
@@ -315,45 +321,80 @@ const fn try_decode_hex_char(c: u8) -> Option<u8> { | |
} | ||
} | ||
|
||
/// Parse Decimal128 from a string | ||
/// | ||
/// TODO: support parsing from scientific notation | ||
fn parse_decimal_128(unsigned_number: &str, negative: bool) -> Result<Expr> { | ||
// remove leading zeroes | ||
let trimmed = unsigned_number.trim_start_matches('0'); | ||
// Parse precision and scale, remove decimal point if exists | ||
let (precision, scale, replaced_str) = if trimmed == "." { | ||
// Special cases for numbers such as “0.”, “000.”, and so on. | ||
(1, 0, Cow::Borrowed("0")) | ||
} else if let Some(i) = trimmed.find('.') { | ||
( | ||
trimmed.len() - 1, | ||
trimmed.len() - i - 1, | ||
Cow::Owned(trimmed.replace('.', "")), | ||
) | ||
} else { | ||
// No decimal point, keep as is | ||
(trimmed.len(), 0, Cow::Borrowed(trimmed)) | ||
}; | ||
/// Callers ensure the value is within i256 range | ||
/// Modified from <https://github.com/apache/arrow-rs/blob/c4dbf0d8af6ca5a19b8b2ea777da3c276807fc5e/arrow-buffer/src/bigint/mod.rs#L303> | ||
fn bigint_to_i256(v: BigInt) -> Result<i256> { | ||
let v_bytes = v.to_signed_bytes_le(); | ||
match v_bytes.len().cmp(&32) { | ||
Ordering::Less => { | ||
let mut bytes = if v.is_negative() { | ||
[255_u8; 32] | ||
} else { | ||
[0; 32] | ||
}; | ||
bytes[0..v_bytes.len()].copy_from_slice(&v_bytes[..v_bytes.len()]); | ||
Ok(i256::from_le_bytes(bytes)) | ||
} | ||
Ordering::Equal => Ok(i256::from_le_bytes(v_bytes.try_into().unwrap())), | ||
Ordering::Greater => { | ||
internal_err!("Unexpected overflow when converting {} to i256", v) | ||
} | ||
} | ||
} | ||
|
||
let number = replaced_str.parse::<i128>().map_err(|e| { | ||
fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add unit tests for this function, to exercise bound checks. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the new SLT tests have covered them, but please remind me if I've overlooked anything There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. they likely do, but i find it valuable to have unit tests, especially for a code which is well isolated (easy to unit tests) and has so many IFs (valuable to unit test). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Make sense. Added in 2077749 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thanks! |
||
let mut dec = BigDecimal::from_str(unsigned_number).map_err(|e| { | ||
DataFusionError::from(ParserError(format!( | ||
"Cannot parse {replaced_str} as i128 when building decimal: {e}" | ||
"Cannot parse {unsigned_number} as BigDecimal: {e}" | ||
))) | ||
})?; | ||
|
||
// Check precision overflow | ||
if precision as u8 > DECIMAL128_MAX_PRECISION { | ||
return Err(DataFusionError::from(ParserError(format!( | ||
"Cannot parse {replaced_str} as i128 when building decimal: precision overflow" | ||
)))); | ||
if negative { | ||
dec = dec.neg(); | ||
} | ||
|
||
Ok(Expr::Literal(ScalarValue::Decimal128( | ||
Some(if negative { -number } else { number }), | ||
precision as u8, | ||
scale as i8, | ||
))) | ||
let digits = dec.digits(); | ||
let (int_val, scale) = dec.into_bigint_and_exponent(); | ||
if scale < i8::MIN as i64 { | ||
findepi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return not_impl_err!( | ||
"Decimal scale {} exceeds the minimum supported scale: {}", | ||
scale, | ||
i8::MIN | ||
); | ||
} | ||
let precision = if scale > 0 { | ||
// arrow-rs requires the precision to include the positive scale. | ||
// See <https://github.com/apache/arrow-rs/blob/123045cc766d42d1eb06ee8bb3f09e39ea995ddc/arrow-array/src/types.rs#L1230> | ||
std::cmp::max(digits, scale.unsigned_abs()) | ||
} else { | ||
digits | ||
}; | ||
if precision <= DECIMAL128_MAX_PRECISION as u64 { | ||
let val = int_val.to_i128().ok_or_else(|| { | ||
// Failures are unexpected here as we have already checked the precision | ||
internal_datafusion_err!( | ||
"Unexpected overflow when converting {} to i128", | ||
int_val | ||
) | ||
})?; | ||
Ok(Expr::Literal(ScalarValue::Decimal128( | ||
Some(val), | ||
precision as u8, | ||
scale as i8, | ||
))) | ||
} else if precision <= DECIMAL256_MAX_PRECISION as u64 { | ||
let val = bigint_to_i256(int_val)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. either add comment like above ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in 49a4e0a |
||
Ok(Expr::Literal(ScalarValue::Decimal256( | ||
Some(val), | ||
precision as u8, | ||
scale as i8, | ||
))) | ||
} else { | ||
not_impl_err!( | ||
findepi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"Decimal precision {} exceeds the maximum supported precision: {}", | ||
precision, | ||
DECIMAL256_MAX_PRECISION | ||
) | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -192,19 +192,95 @@ select arrow_typeof(00009999999999999999999999999999999999.9999), | |
---- | ||
Decimal128(38, 4) Decimal128(38, 4) Decimal128(20, 0) | ||
|
||
# precision overflow | ||
statement error DataFusion error: SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\) | ||
select 123456789.012345678901234567890123456789 | ||
# scientific notation | ||
query RTRTRT | ||
select 1.23e3, arrow_typeof(1.23e3), | ||
+1.23e1, arrow_typeof(+1.23e1), | ||
-1234.56e-3, arrow_typeof(-1234.56e-3) | ||
---- | ||
1230 Decimal128(3, -1) 12.3 Decimal128(3, 1) -1.23456 Decimal128(6, 5) | ||
|
||
query RTRTRT | ||
select 1.23e-2, arrow_typeof(1.23e-2), | ||
1.23456e0, arrow_typeof(1.23456e0), | ||
-.0123e2, arrow_typeof(-.0123e2) | ||
---- | ||
0.0123 Decimal128(4, 4) 1.23456 Decimal128(6, 5) -1.23 Decimal128(3, 2) | ||
|
||
# Decimal256 cases | ||
query RT | ||
select 123456789.0123456789012345678901234567890, | ||
arrow_typeof(123456789.0123456789012345678901234567890) | ||
---- | ||
123456789.012345678901 Decimal256(40, 31) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Display is incomplete due to rounding in sqllogictest |
||
|
||
query RT | ||
select -123456789.0123456789012345678901234567890, | ||
arrow_typeof(-123456789.0123456789012345678901234567890) | ||
---- | ||
-123456789.012345678901 Decimal256(40, 31) | ||
|
||
# max precision and scale of Decimal256 | ||
query RTRT | ||
select -1e-76, arrow_typeof(-1e-76), | ||
-1.234567e-70, arrow_typeof(-1.234567e-70) | ||
---- | ||
0 Decimal256(76, 76) 0 Decimal256(76, 76) | ||
|
||
statement error SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\) | ||
select -123456789.012345678901234567890123456789 | ||
# Decimal256::MAX | ||
query RT | ||
select 9999999999999999999999999999999999999999999999999999999999999999999999999999, | ||
arrow_typeof(9999999999999999999999999999999999999999999999999999999999999999999999999999); | ||
---- | ||
9999999999999999999999999999999999999999999999999999999999999999999999999999 Decimal256(76, 0) | ||
|
||
# can not fit in i128 | ||
statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\) | ||
select 123456789.0123456789012345678901234567890 | ||
# Decimal256::MIN | ||
query RT | ||
select -9999999999999999999999999999999999999999999999999999999999999999999999999999, | ||
arrow_typeof(-9999999999999999999999999999999999999999999999999999999999999999999999999999); | ||
---- | ||
-9999999999999999999999999999999999999999999999999999999999999999999999999999 Decimal256(76, 0) | ||
|
||
statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\) | ||
select -123456789.0123456789012345678901234567890 | ||
# boundaries between decimal128 and decimal256 | ||
query RTRT | ||
select 1e-38, arrow_typeof(1e-38), | ||
1e-39, arrow_typeof(1e-39); | ||
---- | ||
0 Decimal128(38, 38) 0 Decimal256(39, 39) | ||
|
||
query RTRT | ||
select -1e-38, arrow_typeof(-1e-38), | ||
-1e-39, arrow_typeof(-1e-39); | ||
---- | ||
0 Decimal128(38, 38) 0 Decimal256(39, 39) | ||
|
||
# unsupported precision | ||
query error Decimal precision 77 exceeds the maximum supported precision: 76 | ||
select -1e-77; | ||
|
||
query error Decimal precision 79 exceeds the maximum supported precision: 76 | ||
select 1.000000000000000000000000000000000000000000000000000000000000000000000000000001; | ||
|
||
# negative scales | ||
query TR | ||
select arrow_typeof(1e77), 1e77 | ||
---- | ||
Decimal128(1, -77) 100000000000000000000000000000000000000000000000000000000000000000000000000000 | ||
|
||
query T | ||
select arrow_typeof(1e128) | ||
---- | ||
Decimal128(1, -128) | ||
|
||
query error Decimal scale \-129 exceeds the minimum supported scale: \-128 | ||
select 1e129 | ||
|
||
# simple arithmetic | ||
query RTRT | ||
select 1e40 + 1e40, arrow_typeof(1e40 + 1e40), | ||
1e-40 + -1e-40, arrow_typeof(1e-40 + -1e-40) | ||
---- | ||
20000000000000000000000000000000000000000 Decimal128(2, -40) 0 Decimal256(41, 40) | ||
|
||
# Restore option to default value | ||
statement ok | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't have to, the function is fallible
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I reorganized it and tried to clarify it in 49a4e0a