From 1f35b03426b00e176518b6791cd73a181473dc1a Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 8 Aug 2024 17:23:48 +0200 Subject: [PATCH 01/12] Sync rust docs params for CI and dev (#11890) Since a4ac0829ecf63b3640315835b1374211dfadd953 commit there was a discrepancy between rust.yml GitHub workflow and the `dev/rust_lint.sh` script behavior. Sync the behaviors. Reuse common script to prevent future discrepancies. --- .github/workflows/rust.yml | 6 +----- ci/scripts/rust_docs.sh | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 00d31765e77d..809f3acd8374 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -233,11 +233,7 @@ jobs: with: rust-version: stable - name: Run cargo doc - run: | - export RUSTDOCFLAGS="-D warnings" - cargo doc --document-private-items --no-deps --workspace - cd datafusion-cli - cargo doc --document-private-items --no-deps + run: ci/scripts/rust_docs.sh linux-wasm-pack: name: build with wasm-pack diff --git a/ci/scripts/rust_docs.sh b/ci/scripts/rust_docs.sh index cf83b80b5132..5c93711b6fb6 100755 --- a/ci/scripts/rust_docs.sh +++ b/ci/scripts/rust_docs.sh @@ -18,7 +18,7 @@ # under the License. set -ex -export RUSTDOCFLAGS="-D warnings -A rustdoc::private-intra-doc-links" +export RUSTDOCFLAGS="-D warnings" cargo doc --document-private-items --no-deps --workspace cd datafusion-cli cargo doc --document-private-items --no-deps From 0ce6d1687e3fccc4dd50c5759919f2a9baecb549 Mon Sep 17 00:00:00 2001 From: Dmitry Bugakov Date: Thu, 8 Aug 2024 18:10:30 +0200 Subject: [PATCH 02/12] Update ASCII scalar function to support Utf8View #11834 (#11884) --- datafusion/functions/src/string/ascii.rs | 118 +++++++++++++----- .../sqllogictest/test_files/string_view.slt | 99 +++++++++++++++ 2 files changed, 189 insertions(+), 28 deletions(-) diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index 9e1e6b81b61d..68ba3f5ff15f 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -16,33 +16,15 @@ // under the License. use crate::utils::make_scalar_function; -use arrow::array::Int32Array; -use arrow::array::{ArrayRef, OffsetSizeTrait}; +use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array}; use arrow::datatypes::DataType; -use datafusion_common::{cast::as_generic_string_array, internal_err, Result}; +use arrow::error::ArrowError; +use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; -/// Returns the numeric code of the first character of the argument. -/// ascii('x') = 120 -pub fn ascii(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - - let result = string_array - .iter() - .map(|string| { - string.map(|string: &str| { - let mut chars = string.chars(); - chars.next().map_or(0, |v| v as i32) - }) - }) - .collect::(); - - Ok(Arc::new(result) as ArrayRef) -} - #[derive(Debug)] pub struct AsciiFunc { signature: Signature, @@ -60,7 +42,7 @@ impl AsciiFunc { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), } @@ -87,12 +69,92 @@ impl ScalarUDFImpl for AsciiFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result { - match args[0].data_type() { - DataType::Utf8 => make_scalar_function(ascii::, vec![])(args), - DataType::LargeUtf8 => { - return make_scalar_function(ascii::, vec![])(args); - } - _ => internal_err!("Unsupported data type"), + make_scalar_function(ascii, vec![])(args) + } +} + +fn calculate_ascii<'a, V>(array: V) -> Result +where + V: ArrayAccessor, +{ + let iter = ArrayIter::new(array); + let result = iter + .map(|string| { + string.map(|s| { + let mut chars = s.chars(); + chars.next().map_or(0, |v| v as i32) + }) + }) + .collect::(); + + Ok(Arc::new(result) as ArrayRef) +} + +/// Returns the numeric code of the first character of the argument. +pub fn ascii(args: &[ArrayRef]) -> Result { + match args[0].data_type() { + DataType::Utf8 => { + let string_array = args[0].as_string::(); + Ok(calculate_ascii(string_array)?) + } + DataType::LargeUtf8 => { + let string_array = args[0].as_string::(); + Ok(calculate_ascii(string_array)?) + } + DataType::Utf8View => { + let string_array = args[0].as_string_view(); + Ok(calculate_ascii(string_array)?) } + _ => internal_err!("Unsupported data type"), + } +} + +#[cfg(test)] +mod tests { + use crate::string::ascii::AsciiFunc; + use crate::utils::test::test_function; + use arrow::array::{Array, Int32Array}; + use arrow::datatypes::DataType::Int32; + use datafusion_common::{Result, ScalarValue}; + use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; + + macro_rules! test_ascii { + ($INPUT:expr, $EXPECTED:expr) => { + test_function!( + AsciiFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))], + $EXPECTED, + i32, + Int32, + Int32Array + ); + + test_function!( + AsciiFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))], + $EXPECTED, + i32, + Int32, + Int32Array + ); + + test_function!( + AsciiFunc::new(), + &[ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))], + $EXPECTED, + i32, + Int32, + Int32Array + ); + }; + } + + #[test] + fn test_functions() -> Result<()> { + test_ascii!(Some(String::from("x")), Ok(Some(120))); + test_ascii!(Some(String::from("a")), Ok(Some(97))); + test_ascii!(Some(String::from("")), Ok(Some(0))); + test_ascii!(None, Ok(None)); + Ok(()) } } diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 4d3f72b1e8d4..fc10a34256c5 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -500,3 +500,102 @@ select column2|| ' ' ||column3 from temp; ---- rust fast datafusion cool + +### ASCII +# Setup the initial test data +statement ok +create table test_source as values + ('Andrew', 'X'), + ('Xiangpeng', 'Xiangpeng'), + ('Raphael', 'R'), + (NULL, 'R'); + +# Table with the different combination of column types +statement ok +create table test as +SELECT + arrow_cast(column1, 'Utf8') as column1_utf8, + arrow_cast(column2, 'Utf8') as column2_utf8, + arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, + arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, + arrow_cast(column1, 'Utf8View') as column1_utf8view, + arrow_cast(column2, 'Utf8View') as column2_utf8view +FROM test_source; + +# Test ASCII with utf8view against utf8view, utf8, and largeutf8 +# (should be no casts) +query TT +EXPLAIN SELECT + ASCII(column1_utf8view) as c1, + ASCII(column2_utf8) as c2, + ASCII(column2_large_utf8) as c3 +FROM test; +---- +logical_plan +01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS c2, ascii(test.column2_large_utf8) AS c3 +02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view] + +query III +SELECT + ASCII(column1_utf8view) as c1, + ASCII(column2_utf8) as c2, + ASCII(column2_large_utf8) as c3 +FROM test; +---- +65 88 88 +88 88 88 +82 82 82 +NULL 82 82 + +query TT +EXPLAIN SELECT + ASCII(column1_utf8) as c1, + ASCII(column1_large_utf8) as c2, + ASCII(column2_utf8view) as c3, + ASCII('hello') as c4, + ASCII(arrow_cast('world', 'Utf8View')) as c5 +FROM test; +---- +logical_plan +01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5 +02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column2_utf8view] + +query IIIII +SELECT + ASCII(column1_utf8) as c1, + ASCII(column1_large_utf8) as c2, + ASCII(column2_utf8view) as c3, + ASCII('hello') as c4, + ASCII(arrow_cast('world', 'Utf8View')) as c5 +FROM test; +---- +65 65 88 104 119 +88 88 88 104 119 +82 82 82 104 119 +NULL NULL 82 104 119 + +# Test ASCII with literals cast to Utf8View +query TT +EXPLAIN SELECT + ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, + ASCII(arrow_cast('', 'Utf8View')) as c2, + ASCII(arrow_cast(NULL, 'Utf8View')) as c3 +FROM test; +---- +logical_plan +01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3 +02)--TableScan: test projection=[] + +query III +SELECT + ASCII(arrow_cast('äöüß', 'Utf8View')) as c1, + ASCII(arrow_cast('', 'Utf8View')) as c2, + ASCII(arrow_cast(NULL, 'Utf8View')) as c3 +---- +228 0 NULL + +statement ok +drop table test; + +statement ok +drop table test_source; From b9bf6c9ec4698aa7a383a936328ef7df22e8fbb3 Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Thu, 8 Aug 2024 17:51:37 +0100 Subject: [PATCH 03/12] Fix `Duration` vs `Interval` comparisons and `Interval` as LHS (#11876) * fix duration vs. interval and interval as LHS * add more operators to "interval vs. duration comparison" slt tests --- datafusion/expr/src/type_coercion/binary.rs | 4 +- datafusion/sql/src/expr/value.rs | 6 ++ datafusion/sql/tests/cases/plan_to_sql.rs | 31 ++++++++++ .../sqllogictest/test_files/timestamps.slt | 62 +++++++++++++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 8da33081d652..6de0118f6bae 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -1130,7 +1130,9 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Some(Interval(MonthDayNano)), + (Interval(_) | Duration(_), Interval(_) | Duration(_)) => { + Some(Interval(MonthDayNano)) + } (Date64, Date32) | (Date32, Date64) => Some(Date64), (Timestamp(_, None), Date64) | (Date64, Timestamp(_, None)) => { Some(Timestamp(Nanosecond, None)) diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index 1564f06fe4b9..afcd182fa343 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -227,6 +227,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let df_op = match op { BinaryOperator::Plus => Operator::Plus, BinaryOperator::Minus => Operator::Minus, + BinaryOperator::Eq => Operator::Eq, + BinaryOperator::NotEq => Operator::NotEq, + BinaryOperator::Gt => Operator::Gt, + BinaryOperator::GtEq => Operator::GtEq, + BinaryOperator::Lt => Operator::Lt, + BinaryOperator::LtEq => Operator::LtEq, _ => { return not_impl_err!("Unsupported interval operator: {op:?}"); } diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 8f9f1dd78f93..2ac303487336 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -549,3 +549,34 @@ fn test_pretty_roundtrip() -> Result<()> { Ok(()) } + +fn sql_round_trip(query: &str, expect: &str) { + let statement = Parser::new(&GenericDialect {}) + .try_with_sql(query) + .unwrap() + .parse_statement() + .unwrap(); + + let context = MockContextProvider::default(); + let sql_to_rel = SqlToRel::new(&context); + let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); + + let roundtrip_statement = plan_to_sql(&plan).unwrap(); + assert_eq!(roundtrip_statement.to_string(), expect); +} + +#[test] +fn test_interval_lhs_eq() { + sql_round_trip( + "select interval '2 seconds' = interval '2 seconds'", + "SELECT (INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS' = INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS')", + ); +} + +#[test] +fn test_interval_lhs_lt() { + sql_round_trip( + "select interval '2 seconds' < interval '2 seconds'", + "SELECT (INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS' < INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS')", + ); +} diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index b63aad49d152..fb0fd8397f2d 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -3109,6 +3109,68 @@ SELECT * FROM VALUES 2024-02-01T08:00:00Z 2023-12-31T23:00:00Z +# interval vs. duration comparison +query B +select (now() - now()) < interval '1 seconds'; +---- +true + +query B +select (now() - now()) <= interval '1 seconds'; +---- +true + +query B +select (now() - now()) = interval '0 seconds'; +---- +true + +query B +select (now() - now()) != interval '1 seconds'; +---- +true + +query B +select (now() - now()) > interval '-1 seconds'; +---- +true + +query B +select (now() - now()) >= interval '-1 seconds'; +---- +true + +query B +select arrow_cast(123, 'Duration(Nanosecond)') < interval '200 nanoseconds'; +---- +true + +query B +select arrow_cast(123, 'Duration(Nanosecond)') < interval '100 nanoseconds'; +---- +false + +query B +select arrow_cast(123, 'Duration(Nanosecond)') < interval '1 seconds'; +---- +true + +query B +select interval '1 seconds' < arrow_cast(123, 'Duration(Nanosecond)') +---- +false + +# interval as LHS +query B +select interval '2 seconds' = interval '2 seconds'; +---- +true + +query B +select interval '1 seconds' < interval '2 seconds'; +---- +true + statement ok drop table t; From 786f353ca1859d980f2e79a9c13bd1dc1a850111 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Aug 2024 12:13:33 -0600 Subject: [PATCH 04/12] chore: Prepare 41.0.0-rc1 (#11889) * Prepare 41.0.0-rc1 * changelog * changelog * regenerate example parquet file --- Cargo.toml | 40 ++-- datafusion-cli/Cargo.lock | 62 ++--- datafusion-cli/Cargo.toml | 4 +- datafusion/core/example.parquet | Bin 976 -> 976 bytes dev/changelog/41.0.0.md | 363 ++++++++++++++++++++++++++++++ dev/release/README.md | 23 +- docs/source/user-guide/configs.md | 2 +- 7 files changed, 434 insertions(+), 60 deletions(-) create mode 100644 dev/changelog/41.0.0.md diff --git a/Cargo.toml b/Cargo.toml index e1bd0d7aa72b..3431c4673e0c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,7 +55,7 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/datafusion" rust-version = "1.76" -version = "40.0.0" +version = "41.0.0" [workspace.dependencies] # We turn off default-features for some dependencies here so the workspaces which inherit them can @@ -88,25 +88,25 @@ bytes = "1.4" chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "6.0.1" -datafusion = { path = "datafusion/core", version = "40.0.0", default-features = false } -datafusion-catalog = { path = "datafusion/catalog", version = "40.0.0" } -datafusion-common = { path = "datafusion/common", version = "40.0.0", default-features = false } -datafusion-common-runtime = { path = "datafusion/common-runtime", version = "40.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "40.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "40.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "40.0.0" } -datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "40.0.0" } -datafusion-functions-nested = { path = "datafusion/functions-nested", version = "40.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "40.0.0", default-features = false } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "40.0.0", default-features = false } -datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "40.0.0", default-features = false } -datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "40.0.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "40.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "40.0.0" } -datafusion-proto-common = { path = "datafusion/proto-common", version = "40.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "40.0.0" } -datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "40.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "40.0.0" } +datafusion = { path = "datafusion/core", version = "41.0.0", default-features = false } +datafusion-catalog = { path = "datafusion/catalog", version = "41.0.0" } +datafusion-common = { path = "datafusion/common", version = "41.0.0", default-features = false } +datafusion-common-runtime = { path = "datafusion/common-runtime", version = "41.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "41.0.0" } +datafusion-expr = { path = "datafusion/expr", version = "41.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "41.0.0" } +datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "41.0.0" } +datafusion-functions-nested = { path = "datafusion/functions-nested", version = "41.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "41.0.0", default-features = false } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "41.0.0", default-features = false } +datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "41.0.0", default-features = false } +datafusion-physical-optimizer = { path = "datafusion/physical-optimizer", version = "41.0.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "41.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "41.0.0" } +datafusion-proto-common = { path = "datafusion/proto-common", version = "41.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "41.0.0" } +datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "41.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "41.0.0" } doc-comment = "0.3" env_logger = "0.11" futures = "0.3" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9d20c242bbef..2eb93da7c020 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -874,9 +874,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.7" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" +checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549" dependencies = [ "jobserver", "libc", @@ -1128,7 +1128,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "apache-avro", @@ -1184,7 +1184,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow-schema", "async-trait", @@ -1196,7 +1196,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "assert_cmd", @@ -1223,7 +1223,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "apache-avro", @@ -1244,14 +1244,14 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "40.0.0" +version = "41.0.0" dependencies = [ "tokio", ] [[package]] name = "datafusion-execution" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "chrono", @@ -1270,7 +1270,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "arrow", @@ -1287,7 +1287,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "arrow-buffer", @@ -1312,7 +1312,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "arrow", @@ -1328,7 +1328,7 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "arrow-array", @@ -1348,7 +1348,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "async-trait", @@ -1366,7 +1366,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "arrow", @@ -1394,7 +1394,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "arrow", @@ -1406,7 +1406,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "40.0.0" +version = "41.0.0" dependencies = [ "datafusion-common", "datafusion-execution", @@ -1416,7 +1416,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "40.0.0" +version = "41.0.0" dependencies = [ "ahash", "arrow", @@ -1448,7 +1448,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "40.0.0" +version = "41.0.0" dependencies = [ "arrow", "arrow-array", @@ -2049,9 +2049,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", @@ -2524,9 +2524,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.2" +version = "0.36.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f203fa8daa7bb185f760ae12bd8e097f63d17041dcdcaf675ac54cdf863170e" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" dependencies = [ "memchr", ] @@ -3220,9 +3220,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" @@ -3341,18 +3341,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.204" +version = "1.0.205" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.205" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1" dependencies = [ "proc-macro2", "quote", @@ -3590,15 +3590,15 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "tempfile" -version = "3.11.0" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fcd239983515c23a32fb82099f97d0b11b8c72f654ed659363a95c3dad7a53" +checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", "fastrand 2.1.0", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index ff6673dc9727..cbd9ffd0feba 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." -version = "40.0.0" +version = "41.0.0" authors = ["Apache DataFusion "] edition = "2021" keywords = ["arrow", "datafusion", "query", "sql"] @@ -35,7 +35,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "40.0.0", features = [ +datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ "avro", "crypto_expressions", "datetime_expressions", diff --git a/datafusion/core/example.parquet b/datafusion/core/example.parquet index 94de10394b33d26a23a9888e88faa1fa90f14043..17f7473cd221426b545a5f437c42efdc6b1702b3 100644 GIT binary patch delta 33 kcmcb>et~_%c4kRKJp(-hNgEjk1_)xvVq{et~_%c4kQfJs^~{kzrtfAcibP28IB~AVUC$Sq4S` diff --git a/dev/changelog/41.0.0.md b/dev/changelog/41.0.0.md new file mode 100644 index 000000000000..3e289112c7bb --- /dev/null +++ b/dev/changelog/41.0.0.md @@ -0,0 +1,363 @@ + + +# Apache DataFusion 41.0.0 Changelog + +This release consists of 245 commits from 69 contributors. See credits at the end of this changelog for more information. + +**Breaking changes:** + +- make unparser `Dialect` trait `Send` + `Sync` [#11504](https://github.com/apache/datafusion/pull/11504) (y-f-u) +- Implement physical plan serialization for csv COPY plans , add `as_any`, `Debug` to `FileFormatFactory` [#11588](https://github.com/apache/datafusion/pull/11588) (Lordworms) +- Consistent API to set parameters of aggregate and window functions (`AggregateExt` --> `ExprFunctionExt`) [#11550](https://github.com/apache/datafusion/pull/11550) (timsaucer) +- Rename `ColumnOptions` to `ParquetColumnOptions` [#11512](https://github.com/apache/datafusion/pull/11512) (alamb) +- Rename `input_type` --> `input_types` on AggregateFunctionExpr / AccumulatorArgs / StateFieldsArgs [#11666](https://github.com/apache/datafusion/pull/11666) (lewiszlw) +- Rename RepartitionExec metric `repart_time` to `repartition_time` [#11703](https://github.com/apache/datafusion/pull/11703) (alamb) +- Remove `AggregateFunctionDefinition` [#11803](https://github.com/apache/datafusion/pull/11803) (lewiszlw) +- Skipping partial aggregation when it is not helping for high cardinality aggregates [#11627](https://github.com/apache/datafusion/pull/11627) (korowa) +- Optionally create name of aggregate expression from expressions [#11776](https://github.com/apache/datafusion/pull/11776) (lewiszlw) + +**Performance related:** + +- feat: Optimize CASE expression for "column or null" use case [#11534](https://github.com/apache/datafusion/pull/11534) (andygrove) +- feat: Optimize CASE expression for usage where then and else values are literals [#11553](https://github.com/apache/datafusion/pull/11553) (andygrove) +- perf: Optimize IsNotNullExpr [#11586](https://github.com/apache/datafusion/pull/11586) (andygrove) + +**Implemented enhancements:** + +- feat: Add `fail_on_overflow` option to `BinaryExpr` [#11400](https://github.com/apache/datafusion/pull/11400) (andygrove) +- feat: add UDF to_local_time() [#11347](https://github.com/apache/datafusion/pull/11347) (appletreeisyellow) +- feat: switch to using proper Substrait types for IntervalYearMonth and IntervalDayTime [#11471](https://github.com/apache/datafusion/pull/11471) (Blizzara) +- feat: support UDWFs in Substrait [#11489](https://github.com/apache/datafusion/pull/11489) (Blizzara) +- feat: support `unnest` in GROUP BY clause [#11469](https://github.com/apache/datafusion/pull/11469) (JasonLi-cn) +- feat: support `COUNT()` [#11229](https://github.com/apache/datafusion/pull/11229) (tshauck) +- feat: consume and produce Substrait type extensions [#11510](https://github.com/apache/datafusion/pull/11510) (Blizzara) +- feat: Error when a SHOW command is passed in with an accompanying non-existant variable [#11540](https://github.com/apache/datafusion/pull/11540) (itsjunetime) +- feat: support Map literals in Substrait consumer and producer [#11547](https://github.com/apache/datafusion/pull/11547) (Blizzara) +- feat: add bounds for unary math scalar functions [#11584](https://github.com/apache/datafusion/pull/11584) (tshauck) +- feat: Add support for cardinality function on maps [#11801](https://github.com/apache/datafusion/pull/11801) (Weijun-H) +- feat: support `Utf8View` type in `starts_with` function [#11787](https://github.com/apache/datafusion/pull/11787) (tshauck) +- feat: Expose public method for optimizing physical plans [#11879](https://github.com/apache/datafusion/pull/11879) (andygrove) + +**Fixed bugs:** + +- fix: Fix eq properties regression from #10434 [#11363](https://github.com/apache/datafusion/pull/11363) (suremarc) +- fix: make sure JOIN ON expression is boolean type [#11423](https://github.com/apache/datafusion/pull/11423) (jonahgao) +- fix: `regexp_replace` fails when pattern or replacement is a scalar `NULL` [#11459](https://github.com/apache/datafusion/pull/11459) (Weijun-H) +- fix: unparser generates wrong sql for derived table with columns [#11505](https://github.com/apache/datafusion/pull/11505) (y-f-u) +- fix: make `UnKnownColumn`s not equal to others physical exprs [#11536](https://github.com/apache/datafusion/pull/11536) (jonahgao) +- fix: fixes trig function order by [#11559](https://github.com/apache/datafusion/pull/11559) (tshauck) +- fix: CASE with NULL [#11542](https://github.com/apache/datafusion/pull/11542) (Weijun-H) +- fix: panic and incorrect results in `LogFunc::output_ordering()` [#11571](https://github.com/apache/datafusion/pull/11571) (jonahgao) +- fix: expose the fluent API fn for approx_distinct instead of the module [#11644](https://github.com/apache/datafusion/pull/11644) (Michael-J-Ward) +- fix: dont try to coerce list for regex match [#11646](https://github.com/apache/datafusion/pull/11646) (tshauck) +- fix: regr_count now returns Uint64 [#11731](https://github.com/apache/datafusion/pull/11731) (Michael-J-Ward) +- fix: set `null_equals_null` to false when `convert_cross_join_to_inner_join` [#11738](https://github.com/apache/datafusion/pull/11738) (jonahgao) +- fix: Add additional required expression for natural join [#11713](https://github.com/apache/datafusion/pull/11713) (Lordworms) +- fix: hash join tests with forced collisions [#11806](https://github.com/apache/datafusion/pull/11806) (korowa) +- fix: `collect_columns` quadratic complexity [#11843](https://github.com/apache/datafusion/pull/11843) (crepererum) + +**Documentation updates:** + +- Minor: Add link to blog to main DataFusion website [#11356](https://github.com/apache/datafusion/pull/11356) (alamb) +- Add `to_local_time()` in function reference docs [#11401](https://github.com/apache/datafusion/pull/11401) (appletreeisyellow) +- Minor: Consolidate specification doc sections [#11427](https://github.com/apache/datafusion/pull/11427) (alamb) +- Combine the Roadmap / Quarterly Roadmap sections [#11426](https://github.com/apache/datafusion/pull/11426) (alamb) +- Minor: Add an example for backtrace pretty print [#11450](https://github.com/apache/datafusion/pull/11450) (goldmedal) +- Docs: Document creating new extension APIs [#11425](https://github.com/apache/datafusion/pull/11425) (alamb) +- Minor: Clarify which parquet options are used for reading/writing [#11511](https://github.com/apache/datafusion/pull/11511) (alamb) +- Support `newlines_in_values` CSV option [#11533](https://github.com/apache/datafusion/pull/11533) (connec) +- chore: Minor cleanup `simplify_demo()` example [#11576](https://github.com/apache/datafusion/pull/11576) (kavirajk) +- Move Datafusion Query Optimizer to library user guide [#11563](https://github.com/apache/datafusion/pull/11563) (devesh-2002) +- Fix typo in doc of Partitioning [#11612](https://github.com/apache/datafusion/pull/11612) (waruto210) +- Doc: A tiny typo in scalar function's doc [#11620](https://github.com/apache/datafusion/pull/11620) (2010YOUY01) +- Change default Parquet writer settings to match arrow-rs (except for compression & statistics) [#11558](https://github.com/apache/datafusion/pull/11558) (wiedld) +- Rename `functions-array` to `functions-nested` [#11602](https://github.com/apache/datafusion/pull/11602) (goldmedal) +- Add parser option enable_options_value_normalization [#11330](https://github.com/apache/datafusion/pull/11330) (xinlifoobar) +- Add reference to #comet channel in Arrow Rust Discord server [#11637](https://github.com/apache/datafusion/pull/11637) (ajmarcus) +- Extract catalog API to separate crate, change `TableProvider::scan` to take a trait rather than `SessionState` [#11516](https://github.com/apache/datafusion/pull/11516) (findepi) +- doc: why nullable of list item is set to true [#11626](https://github.com/apache/datafusion/pull/11626) (jcsherin) +- Docs: adding explicit mention of test_utils to docs [#11670](https://github.com/apache/datafusion/pull/11670) (edmondop) +- Ensure statistic defaults in parquet writers are in sync [#11656](https://github.com/apache/datafusion/pull/11656) (wiedld) +- Merge `string-view2` branch: reading from parquet up to 2x faster for some ClickBench queries (not on by default) [#11667](https://github.com/apache/datafusion/pull/11667) (alamb) +- Doc: Add Sail to known users list [#11791](https://github.com/apache/datafusion/pull/11791) (shehabgamin) +- Move min and max to user defined aggregate function, remove `AggregateFunction` / `AggregateFunctionDefinition::BuiltIn` [#11013](https://github.com/apache/datafusion/pull/11013) (edmondop) +- Change name of MAX/MIN udaf to lowercase max/min [#11795](https://github.com/apache/datafusion/pull/11795) (edmondop) +- doc: Add support for `map` and `make_map` functions [#11799](https://github.com/apache/datafusion/pull/11799) (Weijun-H) +- Improve readme page in crates.io [#11809](https://github.com/apache/datafusion/pull/11809) (lewiszlw) +- refactor: remove unneed mut for session context [#11864](https://github.com/apache/datafusion/pull/11864) (sunng87) + +**Other:** + +- Prepare 40.0.0 Release [#11343](https://github.com/apache/datafusion/pull/11343) (andygrove) +- Support `NULL` literals in where clause [#11266](https://github.com/apache/datafusion/pull/11266) (xinlifoobar) +- Implement TPCH substrait integration test, support tpch_6, tpch_10, t… [#11349](https://github.com/apache/datafusion/pull/11349) (Lordworms) +- Fix bug when pushing projection under joins [#11333](https://github.com/apache/datafusion/pull/11333) (jonahgao) +- Minor: some cosmetics in `filter.rs`, fix clippy due to logical conflict [#11368](https://github.com/apache/datafusion/pull/11368) (comphead) +- Update prost-derive requirement from 0.12 to 0.13 [#11355](https://github.com/apache/datafusion/pull/11355) (dependabot[bot]) +- Minor: update dashmap `6.0.1` [#11335](https://github.com/apache/datafusion/pull/11335) (alamb) +- Improve and test dataframe API examples in docs [#11290](https://github.com/apache/datafusion/pull/11290) (alamb) +- Remove redundant `unalias_nested` calls for creating Filter's [#11340](https://github.com/apache/datafusion/pull/11340) (alamb) +- Enable `clone_on_ref_ptr` clippy lint on optimizer [#11346](https://github.com/apache/datafusion/pull/11346) (lewiszlw) +- Update termtree requirement from 0.4.1 to 0.5.0 [#11383](https://github.com/apache/datafusion/pull/11383) (dependabot[bot]) +- Introduce `resources_err!` error macro [#11374](https://github.com/apache/datafusion/pull/11374) (comphead) +- Enable `clone_on_ref_ptr` clippy lint on common [#11384](https://github.com/apache/datafusion/pull/11384) (lewiszlw) +- Track parquet writer encoding memory usage on MemoryPool [#11345](https://github.com/apache/datafusion/pull/11345) (wiedld) +- Minor: remove clones and unnecessary Arcs in `from_substrait_rex` [#11337](https://github.com/apache/datafusion/pull/11337) (alamb) +- Minor: Change no-statement error message to be clearer [#11394](https://github.com/apache/datafusion/pull/11394) (itsjunetime) +- Change `array_agg` to return `null` on no input rather than empty list [#11299](https://github.com/apache/datafusion/pull/11299) (jayzhan211) +- Minor: return "not supported" for `COUNT DISTINCT` with multiple arguments [#11391](https://github.com/apache/datafusion/pull/11391) (jonahgao) +- Enable `clone_on_ref_ptr` clippy lint on sql [#11380](https://github.com/apache/datafusion/pull/11380) (lewiszlw) +- Move configuration information out of example usage page [#11300](https://github.com/apache/datafusion/pull/11300) (alamb) +- chore: reuse a single function to create the Substrait TPCH consumer test contexts [#11396](https://github.com/apache/datafusion/pull/11396) (Blizzara) +- refactor: change error type for "no statement" [#11411](https://github.com/apache/datafusion/pull/11411) (crepererum) +- Implement prettier SQL unparsing (more human readable) [#11186](https://github.com/apache/datafusion/pull/11186) (MohamedAbdeen21) +- Move `overlay` planning to`ExprPlanner` [#11398](https://github.com/apache/datafusion/pull/11398) (dharanad) +- Coerce types for all union children plans when eliminating nesting [#11386](https://github.com/apache/datafusion/pull/11386) (gruuya) +- Add customizable equality and hash functions to UDFs [#11392](https://github.com/apache/datafusion/pull/11392) (joroKr21) +- Implement ScalarFunction `MAKE_MAP` and `MAP` [#11361](https://github.com/apache/datafusion/pull/11361) (goldmedal) +- Improve `CommonSubexprEliminate` rule with surely and conditionally evaluated stats [#11357](https://github.com/apache/datafusion/pull/11357) (peter-toth) +- fix(11397): surface proper errors in ParquetSink [#11399](https://github.com/apache/datafusion/pull/11399) (wiedld) +- Minor: Add note about SQLLancer fuzz testing to docs [#11430](https://github.com/apache/datafusion/pull/11430) (alamb) +- Trivial: use arrow csv writer's timestamp_tz_format [#11407](https://github.com/apache/datafusion/pull/11407) (tmi) +- Improved unparser documentation [#11395](https://github.com/apache/datafusion/pull/11395) (alamb) +- Avoid calling shutdown after failed write of AsyncWrite [#11415](https://github.com/apache/datafusion/pull/11415) (joroKr21) +- Short term way to make `AggregateStatistics` still work when min/max is converted to udaf [#11261](https://github.com/apache/datafusion/pull/11261) (Rachelint) +- Implement TPCH substrait integration test, support tpch_13, tpch_14,16 [#11405](https://github.com/apache/datafusion/pull/11405) (Lordworms) +- Minor: fix giuthub action labeler rules [#11428](https://github.com/apache/datafusion/pull/11428) (alamb) +- Minor: change internal error to not supported error for nested field … [#11446](https://github.com/apache/datafusion/pull/11446) (alamb) +- Minor: change Datafusion --> DataFusion in docs [#11439](https://github.com/apache/datafusion/pull/11439) (alamb) +- Support serialization/deserialization for custom physical exprs in proto [#11387](https://github.com/apache/datafusion/pull/11387) (lewiszlw) +- remove termtree dependency [#11416](https://github.com/apache/datafusion/pull/11416) (Kev1n8) +- Add SessionStateBuilder and extract out the registration of defaults [#11403](https://github.com/apache/datafusion/pull/11403) (Omega359) +- integrate consumer tests, implement tpch query 18 to 22 [#11462](https://github.com/apache/datafusion/pull/11462) (Lordworms) +- Docs: Explain the usage of logical expressions for `create_aggregate_expr` [#11458](https://github.com/apache/datafusion/pull/11458) (jayzhan211) +- Return scalar result when all inputs are constants in `map` and `make_map` [#11461](https://github.com/apache/datafusion/pull/11461) (Rachelint) +- Enable `clone_on_ref_ptr` clippy lint on functions\* [#11468](https://github.com/apache/datafusion/pull/11468) (lewiszlw) +- minor: non-overlapping `repart_time` and `send_time` metrics [#11440](https://github.com/apache/datafusion/pull/11440) (korowa) +- Minor: rename `row_groups.rs` to `row_group_filter.rs` [#11481](https://github.com/apache/datafusion/pull/11481) (alamb) +- Support alternate formats for unparsing `datetime` to `timestamp` and `interval` [#11466](https://github.com/apache/datafusion/pull/11466) (y-f-u) +- chore: Add criterion benchmark for CaseExpr [#11482](https://github.com/apache/datafusion/pull/11482) (andygrove) +- Initial support for `StringView`, merge changes from `string-view` development branch [#11402](https://github.com/apache/datafusion/pull/11402) (alamb) +- Replace to_lowercase with to_string in sql example [#11486](https://github.com/apache/datafusion/pull/11486) (lewiszlw) +- Minor: Make execute_input_stream Accessible for Any Sinking Operators [#11449](https://github.com/apache/datafusion/pull/11449) (berkaysynnada) +- Enable `clone_on_ref_ptr` clippy lints on proto [#11465](https://github.com/apache/datafusion/pull/11465) (lewiszlw) +- upgrade sqlparser 0.47 -> 0.48 [#11453](https://github.com/apache/datafusion/pull/11453) (MohamedAbdeen21) +- Add extension hooks for encoding and decoding UDAFs and UDWFs [#11417](https://github.com/apache/datafusion/pull/11417) (joroKr21) +- Remove element's nullability of array_agg function [#11447](https://github.com/apache/datafusion/pull/11447) (jayzhan211) +- Get expr planners when creating new planner [#11485](https://github.com/apache/datafusion/pull/11485) (jayzhan211) +- Support alternate format for Utf8 unparsing (CHAR) [#11494](https://github.com/apache/datafusion/pull/11494) (sgrebnov) +- implement retract_batch for xor accumulator [#11500](https://github.com/apache/datafusion/pull/11500) (drewhayward) +- Refactor: more clearly delineate between `TableParquetOptions` and `ParquetWriterOptions` [#11444](https://github.com/apache/datafusion/pull/11444) (wiedld) +- chore: fix typos of common and core packages [#11520](https://github.com/apache/datafusion/pull/11520) (JasonLi-cn) +- Move spill related functions to spill.rs [#11509](https://github.com/apache/datafusion/pull/11509) (findepi) +- Add tests that show the different defaults for `ArrowWriter` and `TableParquetOptions` [#11524](https://github.com/apache/datafusion/pull/11524) (wiedld) +- Create `datafusion-physical-optimizer` crate [#11507](https://github.com/apache/datafusion/pull/11507) (lewiszlw) +- Minor: Assert `test_enabled_backtrace` requirements to run [#11525](https://github.com/apache/datafusion/pull/11525) (comphead) +- Move handlign of NULL literals in where clause to type coercion pass [#11491](https://github.com/apache/datafusion/pull/11491) (xinlifoobar) +- Update parquet page pruning code to use the `StatisticsExtractor` [#11483](https://github.com/apache/datafusion/pull/11483) (alamb) +- Enable SortMergeJoin LeftAnti filtered fuzz tests [#11535](https://github.com/apache/datafusion/pull/11535) (comphead) +- chore: fix typos of expr, functions, optimizer, physical-expr-common,… [#11538](https://github.com/apache/datafusion/pull/11538) (JasonLi-cn) +- Minor: Remove clone in `PushDownFilter` [#11532](https://github.com/apache/datafusion/pull/11532) (jayzhan211) +- Minor: avoid a clone in type coercion [#11530](https://github.com/apache/datafusion/pull/11530) (alamb) +- Move array `ArrayAgg` to a `UserDefinedAggregate` [#11448](https://github.com/apache/datafusion/pull/11448) (jayzhan211) +- Move `MAKE_MAP` to ExprPlanner [#11452](https://github.com/apache/datafusion/pull/11452) (goldmedal) +- chore: fix typos of sql, sqllogictest and substrait packages [#11548](https://github.com/apache/datafusion/pull/11548) (JasonLi-cn) +- Prevent bigger files from being checked in [#11508](https://github.com/apache/datafusion/pull/11508) (findepi) +- Add dialect param to use double precision for float64 in Postgres [#11495](https://github.com/apache/datafusion/pull/11495) (Sevenannn) +- Minor: move `SessionStateDefaults` into its own module [#11566](https://github.com/apache/datafusion/pull/11566) (alamb) +- refactor: rewrite mega type to an enum containing both cases [#11539](https://github.com/apache/datafusion/pull/11539) (LorrensP-2158466) +- Move `sql_compound_identifier_to_expr ` to `ExprPlanner` [#11487](https://github.com/apache/datafusion/pull/11487) (dharanad) +- Support SortMergeJoin spilling [#11218](https://github.com/apache/datafusion/pull/11218) (comphead) +- Fix unparser invalid sql for query with order [#11527](https://github.com/apache/datafusion/pull/11527) (y-f-u) +- Provide DataFrame API for `map` and move `map` to `functions-array` [#11560](https://github.com/apache/datafusion/pull/11560) (goldmedal) +- Move OutputRequirements to datafusion-physical-optimizer crate [#11579](https://github.com/apache/datafusion/pull/11579) (xinlifoobar) +- Minor: move `Column` related tests and rename `column.rs` [#11573](https://github.com/apache/datafusion/pull/11573) (jonahgao) +- Fix SortMergeJoin antijoin flaky condition [#11604](https://github.com/apache/datafusion/pull/11604) (comphead) +- Improve Union Equivalence Propagation [#11506](https://github.com/apache/datafusion/pull/11506) (mustafasrepo) +- Migrate `OrderSensitiveArrayAgg` to be a user defined aggregate [#11564](https://github.com/apache/datafusion/pull/11564) (jayzhan211) +- Minor:Disable flaky SMJ antijoin filtered test until the fix [#11608](https://github.com/apache/datafusion/pull/11608) (comphead) +- support Decimal256 type in datafusion-proto [#11606](https://github.com/apache/datafusion/pull/11606) (leoyvens) +- Chore/fifo tests cleanup [#11616](https://github.com/apache/datafusion/pull/11616) (ozankabak) +- Fix Internal Error for an INNER JOIN query [#11578](https://github.com/apache/datafusion/pull/11578) (xinlifoobar) +- test: get file size by func metadata [#11575](https://github.com/apache/datafusion/pull/11575) (zhuliquan) +- Improve unparser MySQL compatibility [#11589](https://github.com/apache/datafusion/pull/11589) (sgrebnov) +- Push scalar functions into cross join [#11528](https://github.com/apache/datafusion/pull/11528) (lewiszlw) +- Remove ArrayAgg Builtin in favor of UDF [#11611](https://github.com/apache/datafusion/pull/11611) (jayzhan211) +- refactor: simplify `DFSchema::field_with_unqualified_name` [#11619](https://github.com/apache/datafusion/pull/11619) (jonahgao) +- Minor: Use upstream `concat_batches` from arrow-rs [#11615](https://github.com/apache/datafusion/pull/11615) (alamb) +- Fix : `signum` function bug when `0.0` input [#11580](https://github.com/apache/datafusion/pull/11580) (getChan) +- Enforce uniqueness of `named_struct` field names [#11614](https://github.com/apache/datafusion/pull/11614) (dharanad) +- Minor: unecessary row_count calculation in `CrossJoinExec` and `NestedLoopsJoinExec` [#11632](https://github.com/apache/datafusion/pull/11632) (alamb) +- ExprBuilder for Physical Aggregate Expr [#11617](https://github.com/apache/datafusion/pull/11617) (jayzhan211) +- Minor: avoid copying order by exprs in planner [#11634](https://github.com/apache/datafusion/pull/11634) (alamb) +- Unify CI and pre-commit hook settings for clippy [#11640](https://github.com/apache/datafusion/pull/11640) (findepi) +- Parsing SQL strings to Exprs with the qualified schema [#11562](https://github.com/apache/datafusion/pull/11562) (Lordworms) +- Add some zero column tests covering LIMIT, GROUP BY, WHERE, JOIN, and WINDOW [#11624](https://github.com/apache/datafusion/pull/11624) (Kev1n8) +- Refactor/simplify window frame utils [#11648](https://github.com/apache/datafusion/pull/11648) (ozankabak) +- Minor: use `ready!` macro to simplify `FilterExec` [#11649](https://github.com/apache/datafusion/pull/11649) (alamb) +- Temporarily pin toolchain version to avoid clippy errors [#11655](https://github.com/apache/datafusion/pull/11655) (findepi) +- Fix clippy errors for Rust 1.80 [#11654](https://github.com/apache/datafusion/pull/11654) (findepi) +- Add `CsvExecBuilder` for creating `CsvExec` [#11633](https://github.com/apache/datafusion/pull/11633) (connec) +- chore(deps): update sqlparser requirement from 0.48 to 0.49 [#11630](https://github.com/apache/datafusion/pull/11630) (dependabot[bot]) +- Add support for USING to SQL unparser [#11636](https://github.com/apache/datafusion/pull/11636) (wackywendell) +- Run CI with latest (Rust 1.80), add ticket references to commented out tests [#11661](https://github.com/apache/datafusion/pull/11661) (alamb) +- Use `AccumulatorArgs::is_reversed` in `NthValueAgg` [#11669](https://github.com/apache/datafusion/pull/11669) (jcsherin) +- Implement physical plan serialization for json Copy plans [#11645](https://github.com/apache/datafusion/pull/11645) (Lordworms) +- Minor: improve documentation on `SessionState` [#11642](https://github.com/apache/datafusion/pull/11642) (alamb) +- Add LimitPushdown optimization rule and CoalesceBatchesExec fetch [#11652](https://github.com/apache/datafusion/pull/11652) (alihandroid) +- Update to arrow/parquet `52.2.0` [#11691](https://github.com/apache/datafusion/pull/11691) (alamb) +- Minor: Rename `RepartitionMetrics::repartition_time` to `RepartitionMetrics::repart_time` to match metric [#11478](https://github.com/apache/datafusion/pull/11478) (alamb) +- Update cache key used in rust CI script [#11641](https://github.com/apache/datafusion/pull/11641) (findepi) +- Fix bug in `remove_join_expressions` [#11693](https://github.com/apache/datafusion/pull/11693) (jonahgao) +- Initial changes to support using udaf min/max for statistics and opti… [#11696](https://github.com/apache/datafusion/pull/11696) (edmondop) +- Handle nulls in approx_percentile_cont [#11721](https://github.com/apache/datafusion/pull/11721) (Dandandan) +- Reduce repetition in try_process_group_by_unnest and try_process_unnest [#11714](https://github.com/apache/datafusion/pull/11714) (JasonLi-cn) +- Minor: Add example for `ScalarUDF::call` [#11727](https://github.com/apache/datafusion/pull/11727) (alamb) +- Use `cargo release` in `bench.sh` [#11722](https://github.com/apache/datafusion/pull/11722) (alamb) +- expose some fields on session state [#11716](https://github.com/apache/datafusion/pull/11716) (waynexia) +- Make DefaultSchemaAdapterFactory public [#11709](https://github.com/apache/datafusion/pull/11709) (adriangb) +- Check hashes first during probing the aggr hash table [#11718](https://github.com/apache/datafusion/pull/11718) (Rachelint) +- Implement physical plan serialization for parquet Copy plans [#11735](https://github.com/apache/datafusion/pull/11735) (Lordworms) +- Support cross-timezone `timestamp` comparison via coercsion [#11711](https://github.com/apache/datafusion/pull/11711) (jeffreyssmith2nd) +- Minor: Improve documentation for AggregateUDFImpl::state_fields [#11740](https://github.com/apache/datafusion/pull/11740) (lewiszlw) +- Do not push down Sorts if it violates the sort requirements [#11678](https://github.com/apache/datafusion/pull/11678) (alamb) +- Use upstream `StatisticsConverter` from arrow-rs in DataFusion [#11479](https://github.com/apache/datafusion/pull/11479) (alamb) +- Fix `plan_to_sql`: Add wildcard projection to SELECT statement if no projection was set [#11744](https://github.com/apache/datafusion/pull/11744) (LatrecheYasser) +- Use upstream `DataType::from_str` in arrow-cast [#11254](https://github.com/apache/datafusion/pull/11254) (alamb) +- Fix documentation warnings, make CsvExecBuilder and Unparsed pub [#11729](https://github.com/apache/datafusion/pull/11729) (alamb) +- [Minor] Add test for only nulls (empty) as input in APPROX_PERCENTILE_CONT [#11760](https://github.com/apache/datafusion/pull/11760) (Dandandan) +- Add `TrackedMemoryPool` with better error messages on exhaustion [#11665](https://github.com/apache/datafusion/pull/11665) (wiedld) +- Derive `Debug` for logical plan nodes [#11757](https://github.com/apache/datafusion/pull/11757) (lewiszlw) +- Minor: add "clickbench extended" queries to slt tests [#11763](https://github.com/apache/datafusion/pull/11763) (alamb) +- Minor: Add comment explaining rationale for hash check [#11750](https://github.com/apache/datafusion/pull/11750) (alamb) +- Fix bug that `COUNT(DISTINCT)` on StringView panics [#11768](https://github.com/apache/datafusion/pull/11768) (XiangpengHao) +- [Minor] Refactor approx_percentile [#11769](https://github.com/apache/datafusion/pull/11769) (Dandandan) +- minor: always time batch_filter even when the result is an empty batch [#11775](https://github.com/apache/datafusion/pull/11775) (andygrove) +- Improve OOM message when a single reservation request fails to get more bytes. [#11771](https://github.com/apache/datafusion/pull/11771) (wiedld) +- [Minor] Short circuit `ApplyFunctionRewrites` if there are no function rewrites [#11765](https://github.com/apache/datafusion/pull/11765) (gruuya) +- Fix #11692: Improve doc comments within macros [#11694](https://github.com/apache/datafusion/pull/11694) (Rafferty97) +- Extract `CoalesceBatchesStream` to a struct [#11610](https://github.com/apache/datafusion/pull/11610) (alamb) +- refactor: move ExecutionPlan and related structs into dedicated mod [#11759](https://github.com/apache/datafusion/pull/11759) (waynexia) +- Minor: Add references to github issue in comments [#11784](https://github.com/apache/datafusion/pull/11784) (findepi) +- Add docs and rename param for `Signature::numeric` [#11778](https://github.com/apache/datafusion/pull/11778) (matthewmturner) +- Support planning `Map` literal [#11780](https://github.com/apache/datafusion/pull/11780) (goldmedal) +- Support `LogicalPlan` `Debug` differently than `Display` [#11774](https://github.com/apache/datafusion/pull/11774) (lewiszlw) +- Remove redundant Aggregate when `DISTINCT` & `GROUP BY` are in the same query [#11781](https://github.com/apache/datafusion/pull/11781) (mertak-synnada) +- Minor: add ticket reference and fmt [#11805](https://github.com/apache/datafusion/pull/11805) (alamb) +- Improve MSRV CI check to print out problems to log [#11789](https://github.com/apache/datafusion/pull/11789) (alamb) +- Improve log func tests stability [#11808](https://github.com/apache/datafusion/pull/11808) (lewiszlw) +- Add valid Distinct case for aggregation [#11814](https://github.com/apache/datafusion/pull/11814) (mertak-synnada) +- Don't implement `create_sliding_accumulator` repeatedly [#11813](https://github.com/apache/datafusion/pull/11813) (lewiszlw) +- chore(deps): update rstest requirement from 0.21.0 to 0.22.0 [#11811](https://github.com/apache/datafusion/pull/11811) (dependabot[bot]) +- Minor: Update exected output due to logical conflict [#11824](https://github.com/apache/datafusion/pull/11824) (alamb) +- Pass scalar to `eq` inside `nullif` [#11697](https://github.com/apache/datafusion/pull/11697) (simonvandel) +- refactor: move `aggregate_statistics` to `datafusion-physical-optimizer` [#11798](https://github.com/apache/datafusion/pull/11798) (Weijun-H) +- Minor: refactor probe check into function `should_skip_aggregation` [#11821](https://github.com/apache/datafusion/pull/11821) (alamb) +- Minor: consolidate `path_partition` test into `core_integration` [#11831](https://github.com/apache/datafusion/pull/11831) (alamb) +- Move optimizer integration tests to `core_integration` [#11830](https://github.com/apache/datafusion/pull/11830) (alamb) +- Bump deprecated version of SessionState::new_with_config_rt to 41.0.0 [#11839](https://github.com/apache/datafusion/pull/11839) (kezhuw) +- Fix partial aggregation skipping with Decimal aggregators [#11833](https://github.com/apache/datafusion/pull/11833) (alamb) +- Fix bug with zero-sized buffer for StringViewArray [#11841](https://github.com/apache/datafusion/pull/11841) (XiangpengHao) +- Reduce clone of `Statistics` in `ListingTable` and `PartitionedFile` [#11802](https://github.com/apache/datafusion/pull/11802) (Rachelint) +- Add `LogicalPlan::CreateIndex` [#11817](https://github.com/apache/datafusion/pull/11817) (lewiszlw) +- Update `object_store` to 0.10.2 [#11860](https://github.com/apache/datafusion/pull/11860) (danlgrca) +- Add `skipped_aggregation_rows` metric to aggregate operator [#11706](https://github.com/apache/datafusion/pull/11706) (alamb) +- Cast `Utf8View` to `Utf8` to support `||` from `StringViewArray` [#11796](https://github.com/apache/datafusion/pull/11796) (dharanad) +- Improve nested loop join code [#11863](https://github.com/apache/datafusion/pull/11863) (lewiszlw) +- [Minor]: Refactor to use Result.transpose() [#11882](https://github.com/apache/datafusion/pull/11882) (djanderson) +- support `ANY()` op [#11849](https://github.com/apache/datafusion/pull/11849) (samuelcolvin) + +## Credits + +Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor. + +``` + 48 Andrew Lamb + 20 张林伟 + 9 Jay Zhan + 9 Jonah Gao + 8 Andy Grove + 8 Lordworms + 8 Piotr Findeisen + 8 wiedld + 7 Oleks V + 6 Jax Liu + 5 Alex Huang + 5 Arttu + 5 JasonLi + 5 Trent Hauck + 5 Xin Li + 4 Dharan Aditya + 4 Edmondo Porcu + 4 dependabot[bot] + 4 kamille + 4 yfu + 3 Daniël Heres + 3 Eduard Karacharov + 3 Georgi Krastev + 2 Chris Connelly + 2 Chunchun Ye + 2 June + 2 Marco Neumann + 2 Marko Grujic + 2 Mehmet Ozan Kabak + 2 Michael J Ward + 2 Mohamed Abdeen + 2 Ruihang Xia + 2 Sergei Grebnov + 2 Xiangpeng Hao + 2 jcsherin + 2 kf zheng + 2 mertak-synnada + 1 Adrian Garcia Badaracco + 1 Alexander Rafferty + 1 Alihan Çelikcan + 1 Ariel Marcus + 1 Berkay Şahin + 1 Bruce Ritchie + 1 Devesh Rahatekar + 1 Douglas Anderson + 1 Drew Hayward + 1 Jeffrey Smith II + 1 Kaviraj Kanagaraj + 1 Kezhu Wang + 1 Leonardo Yvens + 1 Lorrens Pantelis + 1 Matthew Cramerus + 1 Matthew Turner + 1 Mustafa Akur + 1 Namgung Chan + 1 Ning Sun + 1 Peter Toth + 1 Qianqian + 1 Samuel Colvin + 1 Shehab Amin + 1 Simon Vandel Sillesen + 1 Tim Saucer + 1 Wendell Smith + 1 Yasser Latreche + 1 Yongting You + 1 danlgrca + 1 tmi + 1 waruto + 1 zhuliquan +``` + +Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release. diff --git a/dev/release/README.md b/dev/release/README.md index c6bc9be2b0db..1817b3002578 100644 --- a/dev/release/README.md +++ b/dev/release/README.md @@ -124,20 +124,31 @@ Here are the commands that could be used to prepare the `38.0.0` release: Checkout the main commit to be released -``` +```shell git fetch apache git checkout apache/main ``` -Update datafusion version in `datafusion/Cargo.toml` to `38.0.0`: +Manually update the datafusion version in the root `Cargo.toml` to `38.0.0`. +Run `cargo update` in the root directory and also in `datafusion-cli`: + +```shell +cargo update +cd datafustion-cli +cargo update +cd .. ``` -./dev/update_datafusion_versions.py 38.0.0 + +Run `cargo test` to re-generate some example files: + +```shell +cargo test ``` Lastly commit the version change: -``` +```shell git commit -a -m 'Update version' ``` @@ -193,7 +204,7 @@ For the release to become "official" it needs at least three PMC members to vote The `dev/release/verify-release-candidate.sh` is a script in this repository that can assist in the verification process. Run it like: -``` +```shell ./dev/release/verify-release-candidate.sh 38.0.0 0 ``` @@ -222,7 +233,7 @@ Congratulations! The release is now official! Tag the same release candidate commit with the final release tag -``` +```shell git co apache/38.0.0-rc0 git tag 38.0.0 git push apache 38.0.0 diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index badd07822ac2..e0c8391a259a 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -65,7 +65,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | -| datafusion.execution.parquet.created_by | datafusion version 40.0.0 | (writing) Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 41.0.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | From 193955e0ec4057ab9fc1747ed78b21ddc71a7e25 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 8 Aug 2024 22:38:05 +0200 Subject: [PATCH 05/12] Produce clear error message when build runs with conflicting features (#11895) Throw explicit error message when `cargo build` is invoked with conflicting features, e.g. `cargo test --lib --tests --bins --all-features`. --- benchmarks/src/bin/dfbench.rs | 5 +++++ benchmarks/src/bin/tpch.rs | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 441b6cdc0293..9ce6848a063a 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -20,6 +20,11 @@ use datafusion::error::Result; use structopt::StructOpt; +#[cfg(all(feature = "snmalloc", feature = "mimalloc"))] +compile_error!( + "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time" +); + #[cfg(feature = "snmalloc")] #[global_allocator] static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; diff --git a/benchmarks/src/bin/tpch.rs b/benchmarks/src/bin/tpch.rs index fc0f4ca0613c..3270b082cfb4 100644 --- a/benchmarks/src/bin/tpch.rs +++ b/benchmarks/src/bin/tpch.rs @@ -21,6 +21,11 @@ use datafusion::error::Result; use datafusion_benchmarks::tpch; use structopt::StructOpt; +#[cfg(all(feature = "snmalloc", feature = "mimalloc"))] +compile_error!( + "feature \"snmalloc\" and feature \"mimalloc\" cannot be enabled at the same time" +); + #[cfg(feature = "snmalloc")] #[global_allocator] static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; From 368df80b7fa511d59e7612e35f49839eb0249882 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 8 Aug 2024 16:39:05 -0400 Subject: [PATCH 06/12] Add tests for StringView / character functions, fix `regexp_like` and `regexp_match` to work with StringView (#11753) * Minor: Add tests for StringView / character functions * Fix regexp_like and regexp_match to work with StringVeiw * Update for ASCII and BTRIM * Add comment about why it is ok to return boolean with catchall match * Fix character_length * Add ticket references --- datafusion/functions/src/regex/regexplike.rs | 9 +- datafusion/functions/src/regex/regexpmatch.rs | 12 +- .../functions/src/unicode/character_length.rs | 2 +- .../sqllogictest/test_files/string_view.slt | 509 +++++++++++++++--- 4 files changed, 427 insertions(+), 105 deletions(-) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 09b96a28c107..20029ba005c4 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -75,13 +75,10 @@ impl ScalarUDFImpl for RegexpLikeFunc { use DataType::*; Ok(match &arg_types[0] { - LargeUtf8 | Utf8 => Boolean, Null => Null, - other => { - return plan_err!( - "The regexp_like function can only accept strings. Got {other}" - ); - } + // Type coercion is done by DataFusion based on signature, so if we + // get here, the first argument is always a string + _ => Boolean, }) } fn invoke(&self, args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index f57d3c17bd72..764acd7de757 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -74,17 +74,9 @@ impl ScalarUDFImpl for RegexpMatchFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - use DataType::*; - Ok(match &arg_types[0] { - LargeUtf8 => List(Arc::new(Field::new("item", LargeUtf8, true))), - Utf8 => List(Arc::new(Field::new("item", Utf8, true))), - Null => Null, - other => { - return plan_err!( - "The regexp_match function can only accept strings. Got {other}" - ); - } + DataType::Null => DataType::Null, + other => DataType::List(Arc::new(Field::new("item", other.clone(), true))), }) } fn invoke(&self, args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index cee1a57bc6d9..e46ee162ff12 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -44,7 +44,7 @@ impl CharacterLengthFunc { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), aliases: vec![String::from("length"), String::from("char_length")], diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index fc10a34256c5..e7166690580f 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -425,102 +425,26 @@ logical_plan 01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] -statement ok -drop table test; - -# coercion from stringview to integer, as input to make_date -query D -select make_date(arrow_cast('2024', 'Utf8View'), arrow_cast('01', 'Utf8View'), arrow_cast('23', 'Utf8View')) ----- -2024-01-23 - -# coercions between stringview and date types -statement ok -create table dates (dt date) as values - (date '2024-01-23'), - (date '2023-11-30'); - -query D -select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; ----- -2024-01-23 - -statement ok -drop table dates; - -statement ok -create table temp as values -('value1', arrow_cast('rust', 'Utf8View'), arrow_cast('fast', 'Utf8View')), -('value2', arrow_cast('datafusion', 'Utf8View'), arrow_cast('cool', 'Utf8View')); - -query T -select column2||' is fast' from temp; ----- -rust is fast -datafusion is fast - - -query T -select column2 || ' is ' || column3 from temp; ----- -rust is fast -datafusion is cool +# Ensure string functions use native StringView implementation +# and do not fall back to Utf8 or LargeUtf8 +# Should see no casts to Utf8 in the plans below +## Ensure no casts for LIKE/ILIKE query TT -explain select column2 || 'is' || column3 from temp; ----- -logical_plan -01)Projection: CAST(temp.column2 AS Utf8) || Utf8("is") || CAST(temp.column3 AS Utf8) -02)--TableScan: temp projection=[column2, column3] - - -query TT -explain select column2||' is fast' from temp; +EXPLAIN SELECT + column1_utf8view like 'foo' as "like", + column1_utf8view ilike 'foo' as "ilike" +FROM test; ---- logical_plan -01)Projection: CAST(temp.column2 AS Utf8) || Utf8(" is fast") -02)--TableScan: temp projection=[column2] - +01)Projection: test.column1_utf8view LIKE Utf8View("foo") AS like, test.column1_utf8view ILIKE Utf8View("foo") AS ilike +02)--TableScan: test projection=[column1_utf8view] -query T -select column2||column3 from temp; ----- -rustfast -datafusioncool -query TT -explain select column2||column3 from temp; ----- -logical_plan -01)Projection: CAST(temp.column2 AS Utf8) || CAST(temp.column3 AS Utf8) -02)--TableScan: temp projection=[column2, column3] -query T -select column2|| ' ' ||column3 from temp; ----- -rust fast -datafusion cool +## Ensure no casts for ASCII ### ASCII -# Setup the initial test data -statement ok -create table test_source as values - ('Andrew', 'X'), - ('Xiangpeng', 'Xiangpeng'), - ('Raphael', 'R'), - (NULL, 'R'); - -# Table with the different combination of column types -statement ok -create table test as -SELECT - arrow_cast(column1, 'Utf8') as column1_utf8, - arrow_cast(column2, 'Utf8') as column2_utf8, - arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, - arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, - arrow_cast(column1, 'Utf8View') as column1_utf8view, - arrow_cast(column2, 'Utf8View') as column2_utf8view -FROM test_source; # Test ASCII with utf8view against utf8view, utf8, and largeutf8 # (should be no casts) @@ -594,8 +518,417 @@ SELECT ---- 228 0 NULL +## Ensure no casts for BTRIM +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for CHARACTER_LENGTH +query TT +EXPLAIN SELECT + CHARACTER_LENGTH(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: character_length(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for CONCAT +## TODO https://github.com/apache/datafusion/issues/11836 +query TT +EXPLAIN SELECT + concat(column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat(CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for CONCAT_WS +## TODO https://github.com/apache/datafusion/issues/11837 +query TT +EXPLAIN SELECT + concat_ws(', ', column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat_ws(Utf8(", "), CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for CONTAINS +## TODO https://github.com/apache/datafusion/issues/11838 +query TT +EXPLAIN SELECT + CONTAINS(column1_utf8view, 'foo') as c1, + CONTAINS(column2_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: contains(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, contains(__common_expr_1, __common_expr_1) AS c2 +02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for ENDS_WITH +## TODO https://github.com/apache/datafusion/issues/11852 +query TT +EXPLAIN SELECT + ENDS_WITH(column1_utf8view, 'foo') as c1, + ENDS_WITH(column2_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: ends_with(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, ends_with(__common_expr_1, __common_expr_1) AS c2 +02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + + +## Ensure no casts for INITCAP +## TODO https://github.com/apache/datafusion/issues/11853 +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: initcap(CAST(test.column1_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LEVENSHTEIN +## TODO https://github.com/apache/datafusion/issues/11854 +query TT +EXPLAIN SELECT + levenshtein(column1_utf8view, 'foo') as c1, + levenshtein(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: levenshtein(__common_expr_1, Utf8("foo")) AS c1, levenshtein(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for LOWER +## TODO https://github.com/apache/datafusion/issues/11855 +query TT +EXPLAIN SELECT + LOWER(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LTRIM +## TODO https://github.com/apache/datafusion/issues/11856 +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LPAD +## TODO https://github.com/apache/datafusion/issues/11857 +query TT +EXPLAIN SELECT + LPAD(column1_utf8view, 12, ' ') as c1 +FROM test; +---- +logical_plan +01)Projection: lpad(CAST(test.column1_utf8view AS Utf8), Int64(12), Utf8(" ")) AS c1 +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for OCTET_LENGTH +## TODO https://github.com/apache/datafusion/issues/11858 +query TT +EXPLAIN SELECT + OCTET_LENGTH(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: octet_length(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for OVERLAY +## TODO file ticket +query TT +EXPLAIN SELECT + OVERLAY(column1_utf8view PLACING 'foo' FROM 2 ) as c1 +FROM test; +---- +logical_plan +01)Projection: overlay(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Int64(2)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REGEXP_LIKE +query TT +EXPLAIN SELECT + REGEXP_LIKE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_like(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REGEXP_MATCH +query TT +EXPLAIN SELECT + REGEXP_MATCH(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_match(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REGEXP_REPLACE +query TT +EXPLAIN SELECT + REGEXP_REPLACE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for REPEAT +## TODO file ticket +query TT +EXPLAIN SELECT + REPEAT(column1_utf8view, 2) as c1 +FROM test; +---- +logical_plan +01)Projection: repeat(CAST(test.column1_utf8view AS Utf8), Int64(2)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REPLACE +## TODO file ticket +query TT +EXPLAIN SELECT + REPLACE(column1_utf8view, 'foo', 'bar') as c1, + REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2 +FROM test; +---- +logical_plan +01)Projection: replace(__common_expr_1, Utf8("foo"), Utf8("bar")) AS c1, replace(__common_expr_1, CAST(test.column2_utf8view AS Utf8), Utf8("bar")) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for REVERSE +## TODO file ticket +query TT +EXPLAIN SELECT + REVERSE(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) as c1, + RTRIM(column1_utf8view, 'foo') as c2 +FROM test; +---- +logical_plan +01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 +03)----TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RIGHT +## TODO file ticket +query TT +EXPLAIN SELECT + RIGHT(column1_utf8view, 3) as c2 +FROM test; +---- +logical_plan +01)Projection: right(CAST(test.column1_utf8view AS Utf8), Int64(3)) AS c2 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RPAD +## TODO file ticket +query TT +EXPLAIN SELECT + RPAD(column1_utf8view, 1) as c1, + RPAD(column1_utf8view, 2, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: rpad(__common_expr_1, Int64(1)) AS c1, rpad(__common_expr_1, Int64(2), CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + + +## Ensure no casts for RTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) as c, + RTRIM(column1_utf8view, column2_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for SPLIT_PART +## TODO file ticket +query TT +EXPLAIN SELECT + SPLIT_PART(column1_utf8view, 'f', 1) as c +FROM test; +---- +logical_plan +01)Projection: split_part(CAST(test.column1_utf8view AS Utf8), Utf8("f"), Int64(1)) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for STRPOS +## TODO file ticket +query TT +EXPLAIN SELECT + STRPOS(column1_utf8view, 'f') as c, + STRPOS(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: strpos(__common_expr_1, Utf8("f")) AS c, strpos(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for SUBSTR +## TODO file ticket +query TT +EXPLAIN SELECT + SUBSTR(column1_utf8view, 1) as c, + SUBSTR(column1_utf8view, 1 ,2) as c2 +FROM test; +---- +logical_plan +01)Projection: substr(__common_expr_1, Int64(1)) AS c, substr(__common_expr_1, Int64(1), Int64(2)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 +03)----TableScan: test projection=[column1_utf8view] + +## Ensure no casts on columns for STARTS_WITH +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, 'foo') as c, + STARTS_WITH(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for TRANSLATE +## TODO file ticket +query TT +EXPLAIN SELECT + TRANSLATE(column1_utf8view, 'foo', 'bar') as c +FROM test; +---- +logical_plan +01)Projection: translate(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Utf8("bar")) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for FIND_IN_SET +## TODO file ticket +query TT +EXPLAIN SELECT + FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c +FROM test; +---- +logical_plan +01)Projection: find_in_set(CAST(test.column1_utf8view AS Utf8), Utf8("a,b,c,d")) AS c +02)--TableScan: test projection=[column1_utf8view] + + + + statement ok drop table test; +# coercion from stringview to integer, as input to make_date +query D +select make_date(arrow_cast('2024', 'Utf8View'), arrow_cast('01', 'Utf8View'), arrow_cast('23', 'Utf8View')) +---- +2024-01-23 + +# coercions between stringview and date types +statement ok +create table dates (dt date) as values + (date '2024-01-23'), + (date '2023-11-30'); + +query D +select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; +---- +2024-01-23 + +statement ok +drop table dates; + statement ok -drop table test_source; +create table temp as values +('value1', arrow_cast('rust', 'Utf8View'), arrow_cast('fast', 'Utf8View')), +('value2', arrow_cast('datafusion', 'Utf8View'), arrow_cast('cool', 'Utf8View')); + +query T +select column2||' is fast' from temp; +---- +rust is fast +datafusion is fast + + +query T +select column2 || ' is ' || column3 from temp; +---- +rust is fast +datafusion is cool + +query TT +explain select column2 || 'is' || column3 from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || Utf8("is") || CAST(temp.column3 AS Utf8) +02)--TableScan: temp projection=[column2, column3] + + +query TT +explain select column2||' is fast' from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || Utf8(" is fast") +02)--TableScan: temp projection=[column2] + + +query T +select column2||column3 from temp; +---- +rustfast +datafusioncool + +query TT +explain select column2||column3 from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || CAST(temp.column3 AS Utf8) +02)--TableScan: temp projection=[column2, column3] + +query T +select column2|| ' ' ||column3 from temp; +---- +rust fast +datafusion cool From 86030a1ff713cc9709a81a1e9df82d4d13b8818d Mon Sep 17 00:00:00 2001 From: yfu Date: Fri, 9 Aug 2024 06:39:19 +1000 Subject: [PATCH 07/12] fix: invalid sqls when unparsing derived table with columns contains calculations, limit/order/distinct (#11756) * Fix unparser derived table with columns include calculations, limit/order/distinct (#24) * compare format output to make sure the two level of projects match * add method to find inner projection that could be nested under limit/order/distinct * use format! for matching in unparser sort optimization too * refactor * use to_string and also put comments in * clippy * fix unparser derived table contains cast (#25) * fix unparser derived table contains cast * remove dbg --- datafusion/sql/src/unparser/plan.rs | 67 +------------ datafusion/sql/src/unparser/rewrite.rs | 109 ++++++++++++++++++++-- datafusion/sql/tests/cases/plan_to_sql.rs | 32 +++++++ 3 files changed, 139 insertions(+), 69 deletions(-) diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index e08f25d3c27c..277efd5fe700 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -30,8 +30,10 @@ use super::{ BuilderError, DerivedRelationBuilder, QueryBuilder, RelationBuilder, SelectBuilder, TableRelationBuilder, TableWithJoinsBuilder, }, - rewrite::normalize_union_schema, - rewrite::rewrite_plan_for_sort_on_non_projected_fields, + rewrite::{ + normalize_union_schema, rewrite_plan_for_sort_on_non_projected_fields, + subquery_alias_inner_query_and_columns, + }, utils::{find_agg_node_within_select, unproject_window_exprs, AggVariant}, Unparser, }; @@ -687,67 +689,6 @@ impl Unparser<'_> { } } -// This logic is to work out the columns and inner query for SubqueryAlias plan for both types of -// subquery -// - `(SELECT column_a as a from table) AS A` -// - `(SELECT column_a from table) AS A (a)` -// -// A roundtrip example for table alias with columns -// -// query: SELECT id FROM (SELECT j1_id from j1) AS c (id) -// -// LogicPlan: -// Projection: c.id -// SubqueryAlias: c -// Projection: j1.j1_id AS id -// Projection: j1.j1_id -// TableScan: j1 -// -// Before introducing this logic, the unparsed query would be `SELECT c.id FROM (SELECT j1.j1_id AS -// id FROM (SELECT j1.j1_id FROM j1)) AS c`. -// The query is invalid as `j1.j1_id` is not a valid identifier in the derived table -// `(SELECT j1.j1_id FROM j1)` -// -// With this logic, the unparsed query will be: -// `SELECT c.id FROM (SELECT j1.j1_id FROM j1) AS c (id)` -// -// Caveat: this won't handle the case like `select * from (select 1, 2) AS a (b, c)` -// as the parser gives a wrong plan which has mismatch `Int(1)` types: Literal and -// Column in the Projections. Once the parser side is fixed, this logic should work -fn subquery_alias_inner_query_and_columns( - subquery_alias: &datafusion_expr::SubqueryAlias, -) -> (&LogicalPlan, Vec) { - let plan: &LogicalPlan = subquery_alias.input.as_ref(); - - let LogicalPlan::Projection(outer_projections) = plan else { - return (plan, vec![]); - }; - - // check if it's projection inside projection - let LogicalPlan::Projection(inner_projection) = outer_projections.input.as_ref() - else { - return (plan, vec![]); - }; - - let mut columns: Vec = vec![]; - // check if the inner projection and outer projection have a matching pattern like - // Projection: j1.j1_id AS id - // Projection: j1.j1_id - for (i, inner_expr) in inner_projection.expr.iter().enumerate() { - let Expr::Alias(ref outer_alias) = &outer_projections.expr[i] else { - return (plan, vec![]); - }; - - if outer_alias.expr.as_ref() != inner_expr { - return (plan, vec![]); - }; - - columns.push(outer_alias.name.as_str().into()); - } - - (outer_projections.input.as_ref(), columns) -} - impl From for DataFusionError { fn from(e: BuilderError) -> Self { DataFusionError::External(Box::new(e)) diff --git a/datafusion/sql/src/unparser/rewrite.rs b/datafusion/sql/src/unparser/rewrite.rs index fba95ad48f32..f6725485f920 100644 --- a/datafusion/sql/src/unparser/rewrite.rs +++ b/datafusion/sql/src/unparser/rewrite.rs @@ -25,6 +25,7 @@ use datafusion_common::{ Result, }; use datafusion_expr::{Expr, LogicalPlan, Projection, Sort}; +use sqlparser::ast::Ident; /// Normalize the schema of a union plan to remove qualifiers from the schema fields and sort expressions. /// @@ -137,14 +138,25 @@ pub(super) fn rewrite_plan_for_sort_on_non_projected_fields( let inner_exprs = inner_p .expr .iter() - .map(|f| { - if let Expr::Alias(alias) = f { + .enumerate() + .map(|(i, f)| match f { + Expr::Alias(alias) => { let a = Expr::Column(alias.name.clone().into()); map.insert(a.clone(), f.clone()); a - } else { + } + Expr::Column(_) => { + map.insert( + Expr::Column(inner_p.schema.field(i).name().into()), + f.clone(), + ); f.clone() } + _ => { + let a = Expr::Column(inner_p.schema.field(i).name().into()); + map.insert(a.clone(), f.clone()); + a + } }) .collect::>(); @@ -155,9 +167,17 @@ pub(super) fn rewrite_plan_for_sort_on_non_projected_fields( } } - if collects.iter().collect::>() - == inner_exprs.iter().collect::>() - { + // Compare outer collects Expr::to_string with inner collected transformed values + // alias -> alias column + // column -> remain + // others, extract schema field name + let outer_collects = collects.iter().map(Expr::to_string).collect::>(); + let inner_collects = inner_exprs + .iter() + .map(Expr::to_string) + .collect::>(); + + if outer_collects == inner_collects { let mut sort = sort.clone(); let mut inner_p = inner_p.clone(); @@ -175,3 +195,80 @@ pub(super) fn rewrite_plan_for_sort_on_non_projected_fields( None } } + +// This logic is to work out the columns and inner query for SubqueryAlias plan for both types of +// subquery +// - `(SELECT column_a as a from table) AS A` +// - `(SELECT column_a from table) AS A (a)` +// +// A roundtrip example for table alias with columns +// +// query: SELECT id FROM (SELECT j1_id from j1) AS c (id) +// +// LogicPlan: +// Projection: c.id +// SubqueryAlias: c +// Projection: j1.j1_id AS id +// Projection: j1.j1_id +// TableScan: j1 +// +// Before introducing this logic, the unparsed query would be `SELECT c.id FROM (SELECT j1.j1_id AS +// id FROM (SELECT j1.j1_id FROM j1)) AS c`. +// The query is invalid as `j1.j1_id` is not a valid identifier in the derived table +// `(SELECT j1.j1_id FROM j1)` +// +// With this logic, the unparsed query will be: +// `SELECT c.id FROM (SELECT j1.j1_id FROM j1) AS c (id)` +// +// Caveat: this won't handle the case like `select * from (select 1, 2) AS a (b, c)` +// as the parser gives a wrong plan which has mismatch `Int(1)` types: Literal and +// Column in the Projections. Once the parser side is fixed, this logic should work +pub(super) fn subquery_alias_inner_query_and_columns( + subquery_alias: &datafusion_expr::SubqueryAlias, +) -> (&LogicalPlan, Vec) { + let plan: &LogicalPlan = subquery_alias.input.as_ref(); + + let LogicalPlan::Projection(outer_projections) = plan else { + return (plan, vec![]); + }; + + // check if it's projection inside projection + let Some(inner_projection) = find_projection(outer_projections.input.as_ref()) else { + return (plan, vec![]); + }; + + let mut columns: Vec = vec![]; + // check if the inner projection and outer projection have a matching pattern like + // Projection: j1.j1_id AS id + // Projection: j1.j1_id + for (i, inner_expr) in inner_projection.expr.iter().enumerate() { + let Expr::Alias(ref outer_alias) = &outer_projections.expr[i] else { + return (plan, vec![]); + }; + + // inner projection schema fields store the projection name which is used in outer + // projection expr + let inner_expr_string = match inner_expr { + Expr::Column(_) => inner_expr.to_string(), + _ => inner_projection.schema.field(i).name().clone(), + }; + + if outer_alias.expr.to_string() != inner_expr_string { + return (plan, vec![]); + }; + + columns.push(outer_alias.name.as_str().into()); + } + + (outer_projections.input.as_ref(), columns) +} + +fn find_projection(logical_plan: &LogicalPlan) -> Option<&Projection> { + match logical_plan { + LogicalPlan::Projection(p) => Some(p), + LogicalPlan::Limit(p) => find_projection(p.input.as_ref()), + LogicalPlan::Distinct(p) => find_projection(p.input().as_ref()), + LogicalPlan::Sort(p) => find_projection(p.input.as_ref()), + _ => None, + } +} diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 2ac303487336..9bbdbe8dbfc9 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -373,6 +373,38 @@ fn roundtrip_statement_with_dialect() -> Result<()> { parser_dialect: Box::new(GenericDialect {}), unparser_dialect: Box::new(UnparserDefaultDialect {}), }, + // Test query that has calculation in derived table with columns + TestStatementWithDialect { + sql: "SELECT id FROM (SELECT j1_id + 1 * 3 from j1) AS c (id)", + expected: r#"SELECT c.id FROM (SELECT (j1.j1_id + (1 * 3)) FROM j1) AS c (id)"#, + parser_dialect: Box::new(GenericDialect {}), + unparser_dialect: Box::new(UnparserDefaultDialect {}), + }, + // Test query that has limit/distinct/order in derived table with columns + TestStatementWithDialect { + sql: "SELECT id FROM (SELECT distinct (j1_id + 1 * 3) FROM j1 LIMIT 1) AS c (id)", + expected: r#"SELECT c.id FROM (SELECT DISTINCT (j1.j1_id + (1 * 3)) FROM j1 LIMIT 1) AS c (id)"#, + parser_dialect: Box::new(GenericDialect {}), + unparser_dialect: Box::new(UnparserDefaultDialect {}), + }, + TestStatementWithDialect { + sql: "SELECT id FROM (SELECT j1_id + 1 FROM j1 ORDER BY j1_id DESC LIMIT 1) AS c (id)", + expected: r#"SELECT c.id FROM (SELECT (j1.j1_id + 1) FROM j1 ORDER BY j1.j1_id DESC NULLS FIRST LIMIT 1) AS c (id)"#, + parser_dialect: Box::new(GenericDialect {}), + unparser_dialect: Box::new(UnparserDefaultDialect {}), + }, + TestStatementWithDialect { + sql: "SELECT id FROM (SELECT CAST((CAST(j1_id as BIGINT) + 1) as int) * 10 FROM j1 LIMIT 1) AS c (id)", + expected: r#"SELECT c.id FROM (SELECT (CAST((CAST(j1.j1_id AS BIGINT) + 1) AS INTEGER) * 10) FROM j1 LIMIT 1) AS c (id)"#, + parser_dialect: Box::new(GenericDialect {}), + unparser_dialect: Box::new(UnparserDefaultDialect {}), + }, + TestStatementWithDialect { + sql: "SELECT id FROM (SELECT CAST(j1_id as BIGINT) + 1 FROM j1 ORDER BY j1_id LIMIT 1) AS c (id)", + expected: r#"SELECT c.id FROM (SELECT (CAST(j1.j1_id AS BIGINT) + 1) FROM j1 ORDER BY j1.j1_id ASC NULLS LAST LIMIT 1) AS c (id)"#, + parser_dialect: Box::new(GenericDialect {}), + unparser_dialect: Box::new(UnparserDefaultDialect {}), + } ]; for query in tests { From 1c9583ab95310fb1afa93fec88432ed6536da749 Mon Sep 17 00:00:00 2001 From: Xiangpeng Hao Date: Fri, 9 Aug 2024 04:39:44 +0800 Subject: [PATCH 08/12] Avoid unecessary copy when reading arrow files (#11840) * avoid copy * fmt --------- Co-authored-by: Andrew Lamb --- .../core/src/datasource/physical_plan/arrow_file.rs | 11 +++++++++-- datafusion/proto-common/src/from_proto/mod.rs | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index a1ee6fbe1341..b4edc221c1f8 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -31,6 +31,7 @@ use crate::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, }; +use arrow::buffer::Buffer; use arrow_ipc::reader::FileDecoder; use arrow_schema::SchemaRef; use datafusion_common::config::ConfigOptions; @@ -296,7 +297,10 @@ impl FileOpener for ArrowOpener { for (dict_block, dict_result) in footer.dictionaries().iter().flatten().zip(dict_results) { - decoder.read_dictionary(dict_block, &dict_result.into())?; + decoder.read_dictionary( + dict_block, + &Buffer::from_bytes(dict_result.into()), + )?; } // filter recordbatches according to range @@ -332,7 +336,10 @@ impl FileOpener for ArrowOpener { .zip(recordbatch_results) .filter_map(move |(block, data)| { decoder - .read_record_batch(&block, &data.into()) + .read_record_batch( + &block, + &Buffer::from_bytes(data.into()), + ) .transpose() }), ) diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 3487f43ae24e..feb4c11aa809 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -408,7 +408,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { "Error IPC message while deserializing ScalarValue::List: {e}" )) })?; - let buffer = Buffer::from(arrow_data); + let buffer = Buffer::from(arrow_data.as_slice()); let ipc_batch = message.header_as_record_batch().ok_or_else(|| { Error::General( @@ -423,7 +423,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { "Error IPC message while deserializing ScalarValue::List dictionary message: {e}" )) })?; - let buffer = Buffer::from(arrow_data); + let buffer = Buffer::from(arrow_data.as_slice()); let dict_batch = message.header_as_dictionary_batch().ok_or_else(|| { Error::General( From 56be714ecaf0b0271d1c8a64cbc41d1dbc972228 Mon Sep 17 00:00:00 2001 From: Xin Li <33629085+xinlifoobar@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:40:06 -0700 Subject: [PATCH 09/12] Support NULL literal in Min/Max (#11812) * Support NULL literal in Min/Max * Fix ut * fix fmt --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 5 ++++- datafusion/core/tests/dataframe/describe.rs | 4 ++-- datafusion/functions-aggregate/src/min_max.rs | 2 ++ datafusion/sqllogictest/test_files/aggregate.slt | 6 ++++++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index cc1a63cc05f7..5fa65cb0da42 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -717,7 +717,10 @@ impl DataFrame { { let column = batchs[0].column_by_name(field.name()).unwrap(); - if field.data_type().is_numeric() { + + if column.data_type().is_null() { + Arc::new(StringArray::from(vec!["null"])) + } else if field.data_type().is_numeric() { cast(column, &DataType::Float64)? } else { cast(column, &DataType::Utf8)? diff --git a/datafusion/core/tests/dataframe/describe.rs b/datafusion/core/tests/dataframe/describe.rs index e446d71473be..9321481efbd2 100644 --- a/datafusion/core/tests/dataframe/describe.rs +++ b/datafusion/core/tests/dataframe/describe.rs @@ -102,8 +102,8 @@ async fn describe_null() -> Result<()> { "| null_count | 0 | 1 |", "| mean | null | null |", "| std | null | null |", - "| min | null | null |", - "| max | null | null |", + "| min | a | null |", + "| max | a | null |", "| median | null | null |", "+------------+------+------+" ]; diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 18028e358b21..f19d6d767ba1 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -304,6 +304,7 @@ macro_rules! typed_min_max_batch { macro_rules! min_max_batch { ($VALUES:expr, $OP:ident) => {{ match $VALUES.data_type() { + DataType::Null => ScalarValue::Null, DataType::Decimal128(precision, scale) => { typed_min_max_batch!( $VALUES, @@ -579,6 +580,7 @@ macro_rules! interval_min_max { macro_rules! min_max { ($VALUE:expr, $DELTA:expr, $OP:ident) => {{ Ok(match ($VALUE, $DELTA) { + (ScalarValue::Null, ScalarValue::Null) => ScalarValue::Null, ( lhs @ ScalarValue::Decimal128(lhsv, lhsp, lhss), rhs @ ScalarValue::Decimal128(rhsv, rhsp, rhss) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 8a5222143356..c68a6c345caa 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -5548,3 +5548,9 @@ set datafusion.explain.logical_plan_only = false; statement ok drop table employee_csv; + +# test null literal handling in supported aggregate functions +query I??III?T +select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ','); +---- +0 NULL NULL NULL NULL NULL NULL NULL \ No newline at end of file From 20fbd88521a9687e0bf2db6c68d83955fb46e723 Mon Sep 17 00:00:00 2001 From: Piotr Findeisen Date: Thu, 8 Aug 2024 22:40:30 +0200 Subject: [PATCH 10/12] Remove many `crate::` imports in listing table provider module (#11887) This is part of isolating this module in order to be able to move it out of core. This commit attempts to replace all `crate::` imports that are possible to avoid today (i.e. which would be replaced when listing table provider was moved to separate crate), leaving those that cannot be replaced. This makes it easy to notice the remaining coupling between the listing table provider module and the core. --- .../core/src/datasource/listing/helpers.rs | 8 +++--- datafusion/core/src/datasource/listing/mod.rs | 4 +-- .../core/src/datasource/listing/table.rs | 25 ++++++++----------- datafusion/core/src/datasource/listing/url.rs | 2 +- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index 67af8ef12c8b..b5dd2dd12e10 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -21,11 +21,11 @@ use std::collections::HashMap; use std::mem; use std::sync::Arc; +use super::ListingTableUrl; use super::PartitionedFile; -use crate::datasource::listing::ListingTableUrl; use crate::execution::context::SessionState; -use crate::logical_expr::{BinaryExpr, Operator}; -use crate::{error::Result, scalar::ScalarValue}; +use datafusion_common::{Result, ScalarValue}; +use datafusion_expr::{BinaryExpr, Operator}; use arrow::{ array::{Array, ArrayRef, AsArray, StringBuilder}, @@ -518,8 +518,8 @@ mod tests { use futures::StreamExt; - use crate::logical_expr::{case, col, lit, Expr}; use crate::test::object_store::make_test_store_and_state; + use datafusion_expr::{case, col, lit, Expr}; use super::*; diff --git a/datafusion/core/src/datasource/listing/mod.rs b/datafusion/core/src/datasource/listing/mod.rs index 21a60614cff2..c5a441aacf1d 100644 --- a/datafusion/core/src/datasource/listing/mod.rs +++ b/datafusion/core/src/datasource/listing/mod.rs @@ -22,8 +22,8 @@ mod helpers; mod table; mod url; -use crate::error::Result; use chrono::TimeZone; +use datafusion_common::Result; use datafusion_common::{ScalarValue, Statistics}; use futures::Stream; use object_store::{path::Path, ObjectMeta}; @@ -162,7 +162,7 @@ impl From for PartitionedFile { #[cfg(test)] mod tests { - use crate::datasource::listing::ListingTableUrl; + use super::ListingTableUrl; use datafusion_execution::object_store::{ DefaultObjectStoreRegistry, ObjectStoreRegistry, }; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index bb86ac3ae416..89066d8234ac 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -24,19 +24,18 @@ use std::{any::Any, sync::Arc}; use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_files}; use super::PartitionedFile; -use crate::catalog::TableProvider; -use crate::datasource::{create_ordering, get_statistics_with_limit, TableType}; +use super::ListingTableUrl; +use crate::datasource::{create_ordering, get_statistics_with_limit}; use crate::datasource::{ file_format::{file_compression_type::FileCompressionType, FileFormat}, - listing::ListingTableUrl, physical_plan::{FileScanConfig, FileSinkConfig}, }; -use crate::{ - error::{DataFusionError, Result}, - execution::context::SessionState, - logical_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}, - physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}, -}; +use crate::execution::context::SessionState; +use datafusion_catalog::TableProvider; +use datafusion_common::{DataFusionError, Result}; +use datafusion_expr::TableType; +use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; +use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics}; use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef}; use arrow_schema::Schema; @@ -1051,12 +1050,12 @@ mod tests { use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::{provider_as_source, MemTable}; use crate::execution::options::ArrowReadOptions; - use crate::physical_plan::collect; use crate::prelude::*; use crate::{ assert_batches_eq, test::{columns, object_store::register_test_store}, }; + use datafusion_physical_plan::collect; use arrow::record_batch::RecordBatch; use arrow_schema::SortOptions; @@ -1154,10 +1153,8 @@ mod tests { let options = ListingOptions::new(Arc::new(ParquetFormat::default())); let schema = options.infer_schema(&state, &table_path).await.unwrap(); - use crate::{ - datasource::file_format::parquet::ParquetFormat, - physical_plan::expressions::col as physical_col, - }; + use crate::datasource::file_format::parquet::ParquetFormat; + use datafusion_physical_plan::expressions::col as physical_col; use std::ops::Add; // (file_sort_order, expected_result) diff --git a/datafusion/core/src/datasource/listing/url.rs b/datafusion/core/src/datasource/listing/url.rs index 7566df628ed7..1701707fdb72 100644 --- a/datafusion/core/src/datasource/listing/url.rs +++ b/datafusion/core/src/datasource/listing/url.rs @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::datasource::object_store::ObjectStoreUrl; use crate::execution::context::SessionState; use datafusion_common::{DataFusionError, Result}; +use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_optimizer::OptimizerConfig; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; From 56f8e3527b78282511e51701223971f04244c8cf Mon Sep 17 00:00:00 2001 From: Michael J Ward Date: Thu, 8 Aug 2024 15:41:27 -0500 Subject: [PATCH 11/12] feat: expose centroids in approx_percentile_cont fluent api (#11878) * feat: expose centroids in approx_percentile_count fluent api Closes https://github.com/apache/datafusion/issues/11877 * avoid repeated import prefix in function signature * update test_fn_approx_percentile_cont so that adjusting centroids changes the result --- .../tests/dataframe/dataframe_functions.rs | 19 ++++++++++++++-- .../src/approx_percentile_cont.rs | 22 +++++++++++++------ .../tests/cases/roundtrip_logical_plan.rs | 3 ++- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 7a0e9888a61c..1bd90fce839d 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -360,7 +360,7 @@ async fn test_fn_approx_median() -> Result<()> { #[tokio::test] async fn test_fn_approx_percentile_cont() -> Result<()> { - let expr = approx_percentile_cont(col("b"), lit(0.5)); + let expr = approx_percentile_cont(col("b"), lit(0.5), None); let expected = [ "+---------------------------------------------+", @@ -381,7 +381,7 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { None::<&str>, "arg_2".to_string(), )); - let expr = approx_percentile_cont(col("b"), alias_expr); + let expr = approx_percentile_cont(col("b"), alias_expr, None); let df = create_test_table().await?; let expected = [ "+--------------------------------------+", @@ -394,6 +394,21 @@ async fn test_fn_approx_percentile_cont() -> Result<()> { assert_batches_eq!(expected, &batches); + // with number of centroids set + let expr = approx_percentile_cont(col("b"), lit(0.5), Some(lit(2))); + let expected = [ + "+------------------------------------------------------+", + "| approx_percentile_cont(test.b,Float64(0.5),Int32(2)) |", + "+------------------------------------------------------+", + "| 30 |", + "+------------------------------------------------------+", + ]; + + let df = create_test_table().await?; + let batches = df.aggregate(vec![], vec![expr]).unwrap().collect().await?; + + assert_batches_eq!(expected, &batches); + Ok(()) } diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index af2a26fd05ec..ffa623c13b0b 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -46,13 +46,21 @@ use datafusion_physical_expr_common::aggregate::tdigest::{ }; use datafusion_physical_expr_common::utils::limited_convert_logical_expr_to_physical_expr_with_dfschema; -make_udaf_expr_and_func!( - ApproxPercentileCont, - approx_percentile_cont, - expression percentile, - "Computes the approximate percentile continuous of a set of numbers", - approx_percentile_cont_udaf -); +create_func!(ApproxPercentileCont, approx_percentile_cont_udaf); + +/// Computes the approximate percentile continuous of a set of numbers +pub fn approx_percentile_cont( + expression: Expr, + percentile: Expr, + centroids: Option, +) -> Expr { + let args = if let Some(centroids) = centroids { + vec![expression, percentile, centroids] + } else { + vec![expression, percentile] + }; + approx_percentile_cont_udaf().call(args) +} pub struct ApproxPercentileCont { signature: Signature, diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index e5c226418441..a18fa03b2d15 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -887,7 +887,8 @@ async fn roundtrip_expr_api() -> Result<()> { stddev_pop(lit(2.2)), approx_distinct(lit(2)), approx_median(lit(2)), - approx_percentile_cont(lit(2), lit(0.5)), + approx_percentile_cont(lit(2), lit(0.5), None), + approx_percentile_cont(lit(2), lit(0.5), Some(lit(50))), approx_percentile_cont_with_weight(lit(2), lit(1), lit(0.5)), grouping(lit(1)), bit_and(lit(2)), From 7c41323fa46963a6889adafd5d4a8e43c69291b2 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Fri, 9 Aug 2024 09:00:02 +0800 Subject: [PATCH 12/12] Rename `Expr::display_name` to `Expr::schema_name`, make `UNNEST` naming conform to convention (#11797) * introduce schema_name Signed-off-by: jayzhan211 * fix Signed-off-by: jayzhan211 * cleanup Signed-off-by: jayzhan211 * cleanup Signed-off-by: jayzhan211 * fix doc Signed-off-by: jayzhan211 * reuse for simple case Signed-off-by: jayzhan211 * unnest + udf Signed-off-by: jayzhan211 * fmt Signed-off-by: jayzhan211 * add display name for udf Signed-off-by: jayzhan211 * fix name in udf Signed-off-by: jayzhan211 * rename unnest Signed-off-by: jayzhan211 * rm column Signed-off-by: jayzhan211 * inlis Signed-off-by: jayzhan211 * fmt Signed-off-by: jayzhan211 * udaf Signed-off-by: jayzhan211 * case Signed-off-by: jayzhan211 * use write Signed-off-by: jayzhan211 * fix test Signed-off-by: jayzhan211 * fix window Signed-off-by: jayzhan211 * window Signed-off-by: jayzhan211 * like and similar to Signed-off-by: jayzhan211 * rm display name Signed-off-by: jayzhan211 * comment Signed-off-by: jayzhan211 * cliip Signed-off-by: jayzhan211 * fix doc Signed-off-by: jayzhan211 * display Signed-off-by: jayzhan211 * fix conflict Signed-off-by: jayzhan211 * fix merge conflict Signed-off-by: jayzhan211 * with display Signed-off-by: jayzhan211 * function for exprs Signed-off-by: jayzhan211 * clippy Signed-off-by: jayzhan211 * fix doc Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 --- datafusion/core/src/physical_planner.rs | 4 +- datafusion/expr/src/expr.rs | 617 +++++++++--------- datafusion/expr/src/expr_rewriter/mod.rs | 16 +- datafusion/expr/src/expr_rewriter/order_by.rs | 4 +- datafusion/expr/src/expr_schema.rs | 2 +- datafusion/expr/src/logical_plan/builder.rs | 10 +- datafusion/expr/src/logical_plan/plan.rs | 4 +- datafusion/expr/src/udf.rs | 24 +- datafusion/expr/src/utils.rs | 4 +- datafusion/functions-nested/src/expr_ext.rs | 4 +- datafusion/functions-nested/src/extract.rs | 51 +- datafusion/functions-nested/src/utils.rs | 7 +- datafusion/functions/src/core/expr_ext.rs | 2 +- datafusion/functions/src/core/getfield.rs | 22 +- .../optimizer/src/analyzer/type_coercion.rs | 14 +- .../optimizer/src/common_subexpr_eliminate.rs | 2 +- datafusion/optimizer/src/decorrelate.rs | 9 +- .../optimizer/src/optimize_projections/mod.rs | 8 +- datafusion/optimizer/src/push_down_filter.rs | 4 +- .../optimizer/src/scalar_subquery_to_join.rs | 4 +- .../src/single_distinct_to_groupby.rs | 2 +- datafusion/proto/tests/cases/serialize.rs | 2 +- datafusion/sql/src/unparser/utils.rs | 2 +- datafusion/sql/src/utils.rs | 26 +- datafusion/sql/tests/cases/plan_to_sql.rs | 8 +- datafusion/sqllogictest/test_files/array.slt | 4 +- .../test_files/push_down_filter.slt | 40 +- datafusion/sqllogictest/test_files/unnest.slt | 18 +- .../substrait/src/logical_plan/consumer.rs | 2 +- .../tests/cases/consumer_integration.rs | 4 +- 30 files changed, 494 insertions(+), 426 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 58b02c08e34c..ab0765ac0deb 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1529,7 +1529,7 @@ pub fn create_window_expr( // unpack aliased logical expressions, e.g. "sum(col) over () as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (name.clone(), expr.as_ref()), - _ => (e.display_name()?, e), + _ => (e.schema_name().to_string(), e), }; create_window_expr_with_name(e, name, logical_schema, execution_props) } @@ -1620,7 +1620,7 @@ pub fn create_aggregate_expr_and_maybe_filter( // unpack (nested) aliased logical expressions, e.g. "sum(col) as total" let (name, e) = match e { Expr::Alias(Alias { expr, name, .. }) => (Some(name.clone()), expr.as_ref()), - Expr::AggregateFunction(_) => (e.display_name().ok(), e), + Expr::AggregateFunction(_) => (Some(e.schema_name().to_string()), e), _ => (None, e), }; diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index edf45a244e1f..5030a95d3c8a 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -983,10 +983,35 @@ impl PartialOrd for Expr { } impl Expr { - /// Returns the name of this expression as it should appear in a schema. This name - /// will not include any CAST expressions. + #[deprecated(since = "40.0.0", note = "use schema_name instead")] pub fn display_name(&self) -> Result { - create_name(self) + Ok(self.schema_name().to_string()) + } + + /// The name of the column (field) that this `Expr` will produce. + /// + /// For example, for a projection (e.g. `SELECT `) the resulting arrow + /// [`Schema`] will have a field with this name. + /// + /// Note that the resulting string is subtlety different than the `Display` + /// representation for certain `Expr`. Some differences: + /// + /// 1. [`Expr::Alias`], which shows only the alias itself + /// 2. [`Expr::Cast`] / [`Expr::TryCast`], which only displays the expression + /// + /// # Example + /// ``` + /// # use datafusion_expr::{col, lit}; + /// let expr = col("foo").eq(lit(42)); + /// assert_eq!("foo = Int32(42)", expr.schema_name().to_string()); + /// + /// let expr = col("foo").alias("bar").eq(lit(11)); + /// assert_eq!("bar = Int32(11)", expr.schema_name().to_string()); + /// ``` + /// + /// [`Schema`]: arrow::datatypes::Schema + pub fn schema_name(&self) -> impl Display + '_ { + SchemaDisplay(self) } /// Returns a full and complete string representation of this expression. @@ -1119,7 +1144,7 @@ impl Expr { match self { // call Expr::display_name() on a Expr::Sort will throw an error Expr::Sort(Sort { expr, .. }) => expr.name_for_alias(), - expr => expr.display_name(), + expr => Ok(expr.schema_name().to_string()), } } @@ -1127,7 +1152,6 @@ impl Expr { /// alias if necessary. pub fn alias_if_changed(self, original_name: String) -> Result { let new_name = self.name_for_alias()?; - if new_name == original_name { return Ok(self); } @@ -1749,6 +1773,287 @@ macro_rules! expr_vec_fmt { }}; } +struct SchemaDisplay<'a>(&'a Expr); +impl<'a> Display for SchemaDisplay<'a> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self.0 { + // The same as Display + Expr::Column(_) + | Expr::Literal(_) + | Expr::ScalarVariable(..) + | Expr::Sort(_) + | Expr::OuterReferenceColumn(..) + | Expr::Placeholder(_) + | Expr::Wildcard { .. } => write!(f, "{}", self.0), + + Expr::AggregateFunction(AggregateFunction { + func, + args, + distinct, + filter, + order_by, + null_treatment, + }) => { + write!( + f, + "{}({}{})", + func.name(), + if *distinct { "DISTINCT " } else { "" }, + schema_name_from_exprs_comma_seperated_without_space(args)? + )?; + + if let Some(null_treatment) = null_treatment { + write!(f, " {}", null_treatment)?; + } + + if let Some(filter) = filter { + write!(f, " FILTER (WHERE {filter})")?; + }; + + if let Some(order_by) = order_by { + write!(f, " ORDER BY [{}]", schema_name_from_exprs(order_by)?)?; + }; + + Ok(()) + } + // expr is not shown since it is aliased + Expr::Alias(Alias { name, .. }) => write!(f, "{name}"), + Expr::Between(Between { + expr, + negated, + low, + high, + }) => { + if *negated { + write!( + f, + "{} NOT BETWEEN {} AND {}", + SchemaDisplay(expr), + SchemaDisplay(low), + SchemaDisplay(high), + ) + } else { + write!( + f, + "{} BETWEEN {} AND {}", + SchemaDisplay(expr), + SchemaDisplay(low), + SchemaDisplay(high), + ) + } + } + Expr::BinaryExpr(BinaryExpr { left, op, right }) => { + write!(f, "{} {op} {}", SchemaDisplay(left), SchemaDisplay(right),) + } + Expr::Case(Case { + expr, + when_then_expr, + else_expr, + }) => { + write!(f, "CASE ")?; + + if let Some(e) = expr { + write!(f, "{} ", SchemaDisplay(e))?; + } + + for (when, then) in when_then_expr { + write!( + f, + "WHEN {} THEN {} ", + SchemaDisplay(when), + SchemaDisplay(then), + )?; + } + + if let Some(e) = else_expr { + write!(f, "ELSE {} ", SchemaDisplay(e))?; + } + + write!(f, "END") + } + // cast expr is not shown to be consistant with Postgres and Spark + Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) => { + write!(f, "{}", SchemaDisplay(expr)) + } + Expr::InList(InList { + expr, + list, + negated, + }) => { + let inlist_name = schema_name_from_exprs(list)?; + + if *negated { + write!(f, "{} NOT IN {}", SchemaDisplay(expr), inlist_name) + } else { + write!(f, "{} IN {}", SchemaDisplay(expr), inlist_name) + } + } + Expr::Exists(Exists { negated: true, .. }) => write!(f, "NOT EXISTS"), + Expr::Exists(Exists { negated: false, .. }) => write!(f, "EXISTS"), + Expr::GroupingSet(GroupingSet::Cube(exprs)) => { + write!(f, "ROLLUP ({})", schema_name_from_exprs(exprs)?) + } + Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { + write!(f, "GROUPING SETS (")?; + for exprs in lists_of_exprs.iter() { + write!(f, "({})", schema_name_from_exprs(exprs)?)?; + } + write!(f, ")") + } + Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { + write!(f, "ROLLUP ({})", schema_name_from_exprs(exprs)?) + } + Expr::IsNull(expr) => write!(f, "{} IS NULL", SchemaDisplay(expr)), + Expr::IsNotNull(expr) => { + write!(f, "{} IS NOT NULL", SchemaDisplay(expr)) + } + Expr::IsUnknown(expr) => { + write!(f, "{} IS UNKNOWN", SchemaDisplay(expr)) + } + Expr::IsNotUnknown(expr) => { + write!(f, "{} IS NOT UNKNOWN", SchemaDisplay(expr)) + } + Expr::InSubquery(InSubquery { negated: true, .. }) => { + write!(f, "NOT IN") + } + Expr::InSubquery(InSubquery { negated: false, .. }) => write!(f, "IN"), + Expr::IsTrue(expr) => write!(f, "{} IS TRUE", SchemaDisplay(expr)), + Expr::IsFalse(expr) => write!(f, "{} IS FALSE", SchemaDisplay(expr)), + Expr::IsNotTrue(expr) => { + write!(f, "{} IS NOT TRUE", SchemaDisplay(expr)) + } + Expr::IsNotFalse(expr) => { + write!(f, "{} IS NOT FALSE", SchemaDisplay(expr)) + } + Expr::Like(Like { + negated, + expr, + pattern, + escape_char, + case_insensitive, + }) => { + write!( + f, + "{} {}{} {}", + SchemaDisplay(expr), + if *negated { "NOT " } else { "" }, + if *case_insensitive { "ILIKE" } else { "LIKE" }, + SchemaDisplay(pattern), + )?; + + if let Some(char) = escape_char { + write!(f, " CHAR '{char}'")?; + } + + Ok(()) + } + Expr::Negative(expr) => write!(f, "(- {})", SchemaDisplay(expr)), + Expr::Not(expr) => write!(f, "NOT {}", SchemaDisplay(expr)), + Expr::Unnest(Unnest { expr }) => { + write!(f, "UNNEST({})", SchemaDisplay(expr)) + } + Expr::ScalarFunction(ScalarFunction { func, args }) => { + match func.schema_name(args) { + Ok(name) => { + write!(f, "{name}") + } + Err(e) => { + write!(f, "got error from schema_name {}", e) + } + } + } + Expr::ScalarSubquery(Subquery { subquery, .. }) => { + write!(f, "{}", subquery.schema().field(0).name()) + } + Expr::SimilarTo(Like { + negated, + expr, + pattern, + escape_char, + .. + }) => { + write!( + f, + "{} {} {}", + SchemaDisplay(expr), + if *negated { + "NOT SIMILAR TO" + } else { + "SIMILAR TO" + }, + SchemaDisplay(pattern), + )?; + if let Some(char) = escape_char { + write!(f, " CHAR '{char}'")?; + } + + Ok(()) + } + Expr::WindowFunction(WindowFunction { + fun, + args, + partition_by, + order_by, + window_frame, + null_treatment, + }) => { + write!( + f, + "{}({})", + fun, + schema_name_from_exprs_comma_seperated_without_space(args)? + )?; + + if let Some(null_treatment) = null_treatment { + write!(f, " {}", null_treatment)?; + } + + if !partition_by.is_empty() { + write!( + f, + " PARTITION BY [{}]", + schema_name_from_exprs(partition_by)? + )?; + } + + if !order_by.is_empty() { + write!(f, " ORDER BY [{}]", schema_name_from_exprs(order_by)?)?; + }; + + write!(f, " {window_frame}") + } + } + } +} + +/// Get schema_name for Vector of expressions +/// +/// Internal usage. Please call `schema_name_from_exprs` instead +// TODO: Use ", " to standardize the formatting of Vec, +// +pub(crate) fn schema_name_from_exprs_comma_seperated_without_space( + exprs: &[Expr], +) -> Result { + schema_name_from_exprs_inner(exprs, ",") +} + +/// Get schema_name for Vector of expressions +pub fn schema_name_from_exprs(exprs: &[Expr]) -> Result { + schema_name_from_exprs_inner(exprs, ", ") +} + +fn schema_name_from_exprs_inner(exprs: &[Expr], sep: &str) -> Result { + let mut s = String::new(); + for (i, e) in exprs.iter().enumerate() { + if i > 0 { + write!(&mut s, "{sep}")?; + } + write!(&mut s, "{}", SchemaDisplay(e))?; + } + + Ok(s) +} + /// Format expressions for display as part of a logical plan. In many cases, this will produce /// similar output to `Expr.name()` except that column names will be prefixed with '#'. impl fmt::Display for Expr { @@ -1827,6 +2132,10 @@ impl fmt::Display for Expr { Expr::ScalarFunction(fun) => { fmt_function(f, fun.name(), false, &fun.args, true) } + // TODO: use udf's display_name, need to fix the seperator issue, + // Expr::ScalarFunction(ScalarFunction { func, args }) => { + // write!(f, "{}", func.display_name(args).unwrap()) + // } Expr::WindowFunction(WindowFunction { fun, args, @@ -1961,6 +2270,7 @@ impl fmt::Display for Expr { }, Expr::Placeholder(Placeholder { id, .. }) => write!(f, "{id}"), Expr::Unnest(Unnest { expr }) => { + // TODO: use Display instead of Debug, there is non-unique expression name in projection issue. write!(f, "UNNEST({expr:?})") } } @@ -1979,7 +2289,6 @@ fn fmt_function( false => args.iter().map(|arg| format!("{arg:?}")).collect(), }; - // let args: Vec = args.iter().map(|arg| format!("{:?}", arg)).collect(); let distinct_str = match distinct { true => "DISTINCT ", false => "", @@ -1987,297 +2296,6 @@ fn fmt_function( write!(f, "{}({}{})", fun, distinct_str, args.join(", ")) } -fn write_function_name( - w: &mut W, - fun: &str, - distinct: bool, - args: &[Expr], -) -> Result<()> { - write!(w, "{}(", fun)?; - if distinct { - w.write_str("DISTINCT ")?; - } - write_names_join(w, args, ",")?; - w.write_str(")")?; - Ok(()) -} - -/// Returns a readable name of an expression based on the input schema. -/// This function recursively transverses the expression for names such as "CAST(a > 2)". -pub(crate) fn create_name(e: &Expr) -> Result { - let mut s = String::new(); - write_name(&mut s, e)?; - Ok(s) -} - -fn write_name(w: &mut W, e: &Expr) -> Result<()> { - match e { - Expr::Alias(Alias { name, .. }) => write!(w, "{}", name)?, - Expr::Column(c) => write!(w, "{}", c.flat_name())?, - Expr::OuterReferenceColumn(_, c) => write!(w, "outer_ref({})", c.flat_name())?, - Expr::ScalarVariable(_, variable_names) => { - write!(w, "{}", variable_names.join("."))? - } - Expr::Literal(value) => write!(w, "{value:?}")?, - Expr::BinaryExpr(binary_expr) => { - write_name(w, binary_expr.left.as_ref())?; - write!(w, " {} ", binary_expr.op)?; - write_name(w, binary_expr.right.as_ref())?; - } - Expr::Like(Like { - negated, - expr, - pattern, - escape_char, - case_insensitive, - }) => { - write!( - w, - "{} {}{} {}", - expr, - if *negated { "NOT " } else { "" }, - if *case_insensitive { "ILIKE" } else { "LIKE" }, - pattern, - )?; - if let Some(char) = escape_char { - write!(w, " CHAR '{char}'")?; - } - } - Expr::SimilarTo(Like { - negated, - expr, - pattern, - escape_char, - case_insensitive: _, - }) => { - write!( - w, - "{} {} {}", - expr, - if *negated { - "NOT SIMILAR TO" - } else { - "SIMILAR TO" - }, - pattern, - )?; - if let Some(char) = escape_char { - write!(w, " CHAR '{char}'")?; - } - } - Expr::Case(case) => { - write!(w, "CASE ")?; - if let Some(e) = &case.expr { - write_name(w, e)?; - w.write_str(" ")?; - } - for (when, then) in &case.when_then_expr { - w.write_str("WHEN ")?; - write_name(w, when)?; - w.write_str(" THEN ")?; - write_name(w, then)?; - w.write_str(" ")?; - } - if let Some(e) = &case.else_expr { - w.write_str("ELSE ")?; - write_name(w, e)?; - w.write_str(" ")?; - } - w.write_str("END")?; - } - Expr::Cast(Cast { expr, .. }) => { - // CAST does not change the expression name - write_name(w, expr)?; - } - Expr::TryCast(TryCast { expr, .. }) => { - // CAST does not change the expression name - write_name(w, expr)?; - } - Expr::Not(expr) => { - w.write_str("NOT ")?; - write_name(w, expr)?; - } - Expr::Negative(expr) => { - w.write_str("(- ")?; - write_name(w, expr)?; - w.write_str(")")?; - } - Expr::IsNull(expr) => { - write_name(w, expr)?; - w.write_str(" IS NULL")?; - } - Expr::IsNotNull(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT NULL")?; - } - Expr::IsTrue(expr) => { - write_name(w, expr)?; - w.write_str(" IS TRUE")?; - } - Expr::IsFalse(expr) => { - write_name(w, expr)?; - w.write_str(" IS FALSE")?; - } - Expr::IsUnknown(expr) => { - write_name(w, expr)?; - w.write_str(" IS UNKNOWN")?; - } - Expr::IsNotTrue(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT TRUE")?; - } - Expr::IsNotFalse(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT FALSE")?; - } - Expr::IsNotUnknown(expr) => { - write_name(w, expr)?; - w.write_str(" IS NOT UNKNOWN")?; - } - Expr::Exists(Exists { negated: true, .. }) => w.write_str("NOT EXISTS")?, - Expr::Exists(Exists { negated: false, .. }) => w.write_str("EXISTS")?, - Expr::InSubquery(InSubquery { negated: true, .. }) => w.write_str("NOT IN")?, - Expr::InSubquery(InSubquery { negated: false, .. }) => w.write_str("IN")?, - Expr::ScalarSubquery(subquery) => { - w.write_str(subquery.subquery.schema().field(0).name().as_str())?; - } - Expr::Unnest(Unnest { expr }) => { - w.write_str("unnest(")?; - write_name(w, expr)?; - w.write_str(")")?; - } - Expr::ScalarFunction(fun) => { - w.write_str(fun.func.display_name(&fun.args)?.as_str())?; - } - Expr::WindowFunction(WindowFunction { - fun, - args, - window_frame, - partition_by, - order_by, - null_treatment, - }) => { - write_function_name(w, &fun.to_string(), false, args)?; - - if let Some(nt) = null_treatment { - w.write_str(" ")?; - write!(w, "{}", nt)?; - } - if !partition_by.is_empty() { - w.write_str(" ")?; - write!(w, "PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; - } - if !order_by.is_empty() { - w.write_str(" ")?; - write!(w, "ORDER BY [{}]", expr_vec_fmt!(order_by))?; - } - w.write_str(" ")?; - write!(w, "{window_frame}")?; - } - Expr::AggregateFunction(AggregateFunction { - func, - distinct, - args, - filter, - order_by, - null_treatment, - }) => { - write_function_name(w, func.name(), *distinct, args)?; - if let Some(fe) = filter { - write!(w, " FILTER (WHERE {fe})")?; - }; - if let Some(order_by) = order_by { - write!(w, " ORDER BY [{}]", expr_vec_fmt!(order_by))?; - }; - if let Some(nt) = null_treatment { - write!(w, " {}", nt)?; - } - } - Expr::GroupingSet(grouping_set) => match grouping_set { - GroupingSet::Rollup(exprs) => { - write!(w, "ROLLUP (")?; - write_names(w, exprs.as_slice())?; - write!(w, ")")?; - } - GroupingSet::Cube(exprs) => { - write!(w, "CUBE (")?; - write_names(w, exprs.as_slice())?; - write!(w, ")")?; - } - GroupingSet::GroupingSets(lists_of_exprs) => { - write!(w, "GROUPING SETS (")?; - for (i, exprs) in lists_of_exprs.iter().enumerate() { - if i != 0 { - write!(w, ", ")?; - } - write!(w, "(")?; - write_names(w, exprs.as_slice())?; - write!(w, ")")?; - } - write!(w, ")")?; - } - }, - Expr::InList(InList { - expr, - list, - negated, - }) => { - write_name(w, expr)?; - let list = list.iter().map(create_name); - if *negated { - write!(w, " NOT IN ({list:?})")?; - } else { - write!(w, " IN ({list:?})")?; - } - } - Expr::Between(Between { - expr, - negated, - low, - high, - }) => { - write_name(w, expr)?; - if *negated { - write!(w, " NOT BETWEEN ")?; - } else { - write!(w, " BETWEEN ")?; - } - write_name(w, low)?; - write!(w, " AND ")?; - write_name(w, high)?; - } - Expr::Sort { .. } => { - return internal_err!("Create name does not support sort expression") - } - Expr::Wildcard { qualifier } => match qualifier { - Some(qualifier) => { - return internal_err!( - "Create name does not support qualified wildcard, got {qualifier}" - ) - } - None => write!(w, "*")?, - }, - Expr::Placeholder(Placeholder { id, .. }) => write!(w, "{}", id)?, - }; - Ok(()) -} - -fn write_names(w: &mut W, exprs: &[Expr]) -> Result<()> { - exprs.iter().try_for_each(|e| write_name(w, e)) -} - -fn write_names_join(w: &mut W, exprs: &[Expr], sep: &str) -> Result<()> { - let mut iter = exprs.iter(); - if let Some(first_arg) = iter.next() { - write_name(w, first_arg)?; - } - for a in iter { - w.write_str(sep)?; - write_name(w, a)?; - } - Ok(()) -} - pub fn create_function_physical_name( fun: &str, distinct: bool, @@ -2394,7 +2412,7 @@ fn create_physical_name(e: &Expr, is_first_expr: bool) -> Result { let expr = create_physical_name(expr, false)?; Ok(format!("{expr} IS NOT UNKNOWN")) } - Expr::ScalarFunction(fun) => fun.func.display_name(&fun.args), + Expr::ScalarFunction(fun) => fun.func.schema_name(&fun.args), Expr::WindowFunction(WindowFunction { fun, args, @@ -2552,7 +2570,6 @@ mod test { let expected = "CASE a WHEN Int32(1) THEN Boolean(true) WHEN Int32(0) THEN Boolean(false) ELSE NULL END"; assert_eq!(expected, expr.canonical_name()); assert_eq!(expected, format!("{expr}")); - assert_eq!(expected, expr.display_name()?); Ok(()) } @@ -2567,7 +2584,7 @@ mod test { assert_eq!(expected_canonical, format!("{expr}")); // note that CAST intentionally has a name that is different from its `Display` // representation. CAST does not change the name of expressions. - assert_eq!("Float32(1.23)", expr.display_name()?); + assert_eq!("Float32(1.23)", expr.schema_name().to_string()); Ok(()) } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index bf2bfe2c3932..0dc41d4a9ac1 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -173,7 +173,7 @@ pub fn create_col_from_scalar_expr( name, )), _ => { - let scalar_column = scalar_expr.display_name()?; + let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( Some::(subqry_alias.into()), scalar_column, @@ -475,16 +475,14 @@ mod test { let expr = rewrite_preserving_name(expr_from.clone(), &mut rewriter).unwrap(); let original_name = match &expr_from { - Expr::Sort(Sort { expr, .. }) => expr.display_name(), - expr => expr.display_name(), - } - .unwrap(); + Expr::Sort(Sort { expr, .. }) => expr.schema_name().to_string(), + expr => expr.schema_name().to_string(), + }; let new_name = match &expr { - Expr::Sort(Sort { expr, .. }) => expr.display_name(), - expr => expr.display_name(), - } - .unwrap(); + Expr::Sort(Sort { expr, .. }) => expr.schema_name().to_string(), + expr => expr.schema_name().to_string(), + }; assert_eq!( original_name, new_name, diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 3d79caa21fde..bbb855801c3e 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -108,8 +108,8 @@ fn rewrite_in_terms_of_projection( }; // expr is an actual expr like min(t.c2), but we are looking - // for a column with the same "min(C2)", so translate there - let name = normalized_expr.display_name()?; + // for a column with the same "MIN(C2)", so translate there + let name = normalized_expr.schema_name().to_string(); let search_col = Expr::Column(Column { relation: None, diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 9faeb8aed506..73123819ba99 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -474,7 +474,7 @@ impl ExprSchemable for Expr { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( None, - Field::new(self.display_name()?, data_type, nullable) + Field::new(self.schema_name().to_string(), data_type, nullable) .with_metadata(self.metadata(input_schema)?) .into(), )) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index aa2ea4ae1c26..4ef346656ff4 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1298,15 +1298,15 @@ fn add_group_by_exprs_from_dependencies( // c1 + 1` produces an output field named `"c1 + 1"` let mut group_by_field_names = group_expr .iter() - .map(|e| e.display_name()) - .collect::>>()?; + .map(|e| e.schema_name().to_string()) + .collect::>(); if let Some(target_indices) = get_target_functional_dependencies(schema, &group_by_field_names) { for idx in target_indices { let expr = Expr::Column(Column::from(schema.qualified_field(idx))); - let expr_name = expr.display_name()?; + let expr_name = expr.schema_name().to_string(); if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); group_expr.push(expr); @@ -1323,7 +1323,7 @@ pub(crate) fn validate_unique_names<'a>( let mut unique_names = HashMap::new(); expressions.into_iter().enumerate().try_for_each(|(position, expr)| { - let name = expr.display_name()?; + let name = expr.schema_name().to_string(); match unique_names.get(&name) { None => { unique_names.insert(name, (position, expr)); @@ -1557,7 +1557,7 @@ pub fn wrap_projection_for_join_if_necessary( if let Some(col) = key.try_as_col() { Ok(col.clone()) } else { - let name = key.display_name()?; + let name = key.schema_name().to_string(); Ok(Column::from_name(name)) } }) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 02176a506a25..c5538d8880a7 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2740,8 +2740,8 @@ fn calc_func_dependencies_for_aggregate( if !contains_grouping_set(group_expr) { let group_by_expr_names = group_expr .iter() - .map(|item| item.display_name()) - .collect::>>()?; + .map(|item| item.schema_name().to_string()) + .collect::>(); let aggregate_func_dependencies = aggregate_functional_dependencies( input.schema(), &group_by_expr_names, diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 5ba6e3007ead..34b5909f0a5a 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -26,7 +26,7 @@ use arrow::datatypes::DataType; use datafusion_common::{not_impl_err, ExprSchema, Result}; -use crate::expr::create_name; +use crate::expr::schema_name_from_exprs_comma_seperated_without_space; use crate::interval_arithmetic::Interval; use crate::simplify::{ExprSimplifyResult, SimplifyInfo}; use crate::sort_properties::{ExprProperties, SortProperties}; @@ -154,6 +154,13 @@ impl ScalarUDF { self.inner.display_name(args) } + /// Returns this function's schema_name. + /// + /// See [`ScalarUDFImpl::schema_name`] for more details + pub fn schema_name(&self, args: &[Expr]) -> Result { + self.inner.schema_name(args) + } + /// Returns the aliases for this function. /// /// See [`ScalarUDF::with_aliases`] for more details @@ -345,12 +352,23 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { fn name(&self) -> &str; /// Returns the user-defined display name of the UDF given the arguments - /// fn display_name(&self, args: &[Expr]) -> Result { - let names: Vec = args.iter().map(create_name).collect::>()?; + let names: Vec = args.iter().map(ToString::to_string).collect(); + // TODO: join with ", " to standardize the formatting of Vec, Ok(format!("{}({})", self.name(), names.join(","))) } + /// Returns the name of the column this expression would create + /// + /// See [`Expr::schema_name`] for details + fn schema_name(&self, args: &[Expr]) -> Result { + Ok(format!( + "{}({})", + self.name(), + schema_name_from_exprs_comma_seperated_without_space(args)? + )) + } + /// Returns the function's [`Signature`] for information about what input /// types are accepted and the function's Volatility. fn signature(&self) -> &Signature; diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 65a70b673266..c3e4505ed19c 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -798,7 +798,9 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; Ok(Expr::from(Column::from((qualifier, field)))) } - _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), + _ => Ok(Expr::Column(Column::from_name( + expr.schema_name().to_string(), + ))), } } diff --git a/datafusion/functions-nested/src/expr_ext.rs b/datafusion/functions-nested/src/expr_ext.rs index 3524d62d0bc4..4da4a3f583b7 100644 --- a/datafusion/functions-nested/src/expr_ext.rs +++ b/datafusion/functions-nested/src/expr_ext.rs @@ -38,7 +38,7 @@ use crate::extract::{array_element, array_slice}; /// # use datafusion_functions_nested::expr_ext::IndexAccessor; /// let expr = col("c1") /// .index(lit(3)); -/// assert_eq!(expr.display_name().unwrap(), "c1[Int32(3)]"); +/// assert_eq!(expr.schema_name().to_string(), "c1[Int32(3)]"); /// ``` pub trait IndexAccessor { fn index(self, key: Expr) -> Expr; @@ -68,7 +68,7 @@ impl IndexAccessor for Expr { /// # use datafusion_functions_nested::expr_ext::SliceAccessor; /// let expr = col("c1") /// .range(lit(2), lit(4)); -/// assert_eq!(expr.display_name().unwrap(), "c1[Int32(2):Int32(4)]"); +/// assert_eq!(expr.schema_name().to_string(), "c1[Int32(2):Int32(4)]"); /// ``` pub trait SliceAccessor { fn range(self, start: Expr, stop: Expr) -> Expr; diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index af4e36926b68..b9e82f371369 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -40,7 +40,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; -use crate::utils::{get_arg_name, make_scalar_function}; +use crate::utils::make_scalar_function; // Create static instances of ScalarUDFs for each function make_udf_expr_and_func!( @@ -97,11 +97,24 @@ impl ScalarUDFImpl for ArrayElement { } fn display_name(&self, args: &[Expr]) -> Result { - Ok(format!( - "{}[{}]", - get_arg_name(args, 0), - get_arg_name(args, 1) - )) + let args_name = args.iter().map(ToString::to_string).collect::>(); + if args_name.len() != 2 { + return exec_err!("expect 2 args, got {}", args_name.len()); + } + + Ok(format!("{}[{}]", args_name[0], args_name[1])) + } + + fn schema_name(&self, args: &[Expr]) -> Result { + let args_name = args + .iter() + .map(|e| e.schema_name().to_string()) + .collect::>(); + if args_name.len() != 2 { + return exec_err!("expect 2 args, got {}", args_name.len()); + } + + Ok(format!("{}[{}]", args_name[0], args_name[1])) } fn signature(&self) -> &Signature { @@ -254,14 +267,24 @@ impl ScalarUDFImpl for ArraySlice { } fn display_name(&self, args: &[Expr]) -> Result { - Ok(format!( - "{}[{}]", - get_arg_name(args, 0), - (1..args.len()) - .map(|i| get_arg_name(args, i)) - .collect::>() - .join(":") - )) + let args_name = args.iter().map(ToString::to_string).collect::>(); + if let Some((arr, indexes)) = args_name.split_first() { + Ok(format!("{arr}[{}]", indexes.join(":"))) + } else { + exec_err!("no argument") + } + } + + fn schema_name(&self, args: &[Expr]) -> Result { + let args_name = args + .iter() + .map(|e| e.schema_name().to_string()) + .collect::>(); + if let Some((arr, indexes)) = args_name.split_first() { + Ok(format!("{arr}[{}]", indexes.join(":"))) + } else { + exec_err!("no argument") + } } fn name(&self) -> &str { diff --git a/datafusion/functions-nested/src/utils.rs b/datafusion/functions-nested/src/utils.rs index f396c3b22581..688e1633e5cf 100644 --- a/datafusion/functions-nested/src/utils.rs +++ b/datafusion/functions-nested/src/utils.rs @@ -32,7 +32,7 @@ use datafusion_common::{exec_err, plan_err, Result, ScalarValue}; use core::any::type_name; use datafusion_common::DataFusionError; -use datafusion_expr::{ColumnarValue, Expr, ScalarFunctionImplementation}; +use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; macro_rules! downcast_arg { ($ARG:expr, $ARRAY_TYPE:ident) => {{ @@ -253,11 +253,6 @@ pub(crate) fn compute_array_dims( } } -/// Returns the name of the argument at index `i`, or an empty string if the index is out of bounds. -pub(super) fn get_arg_name(args: &[Expr], i: usize) -> String { - args.get(i).map(ToString::to_string).unwrap_or_default() -} - #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions/src/core/expr_ext.rs b/datafusion/functions/src/core/expr_ext.rs index d80df0f334ab..af05f447f1c1 100644 --- a/datafusion/functions/src/core/expr_ext.rs +++ b/datafusion/functions/src/core/expr_ext.rs @@ -41,7 +41,7 @@ use super::expr_fn::get_field; /// # use datafusion_functions::core::expr_ext::FieldAccessor; /// let expr = col("c1") /// .field("my_field"); -/// assert_eq!(expr.display_name().unwrap(), "c1[my_field]"); +/// assert_eq!(expr.schema_name().to_string(), "c1[my_field]"); /// ``` pub trait FieldAccessor { fn field(self, name: impl Literal) -> Expr; diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index 2c2e36b91b13..a51f895c5084 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -74,7 +74,27 @@ impl ScalarUDFImpl for GetFieldFunc { } }; - Ok(format!("{}[{}]", args[0].display_name()?, name)) + Ok(format!("{}[{}]", args[0], name)) + } + + fn schema_name(&self, args: &[Expr]) -> Result { + if args.len() != 2 { + return exec_err!( + "get_field function requires 2 arguments, got {}", + args.len() + ); + } + + let name = match &args[1] { + Expr::Literal(name) => name, + _ => { + return exec_err!( + "get_field function requires the argument field_name to be a string" + ); + } + }; + + Ok(format!("{}[{}]", args[0].schema_name(), name)) } fn signature(&self) -> &Signature { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 2823b0fca2d1..2bb859d84ad7 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1040,9 +1040,7 @@ mod test { let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); let empty = empty_with_type(DataType::Int64); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); - let expected = - "Projection: a IN ([CAST(Int32(1) AS Int64), CAST(Int8(4) AS Int64), Int64(8)]) AS a IN (Map { iter: Iter([Literal(Int32(1)), Literal(Int8(4)), Literal(Int64(8))]) })\ - \n EmptyRelation"; + let expected = "Projection: a IN ([CAST(Int32(1) AS Int64), CAST(Int8(4) AS Int64), Int64(8)])\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; // a in (1,4,8), a is decimal @@ -1055,9 +1053,7 @@ mod test { )?), })); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); - let expected = - "Projection: CAST(a AS Decimal128(24, 4)) IN ([CAST(Int32(1) AS Decimal128(24, 4)), CAST(Int8(4) AS Decimal128(24, 4)), CAST(Int64(8) AS Decimal128(24, 4))]) AS a IN (Map { iter: Iter([Literal(Int32(1)), Literal(Int8(4)), Literal(Int64(8))]) })\ - \n EmptyRelation"; + let expected = "Projection: CAST(a AS Decimal128(24, 4)) IN ([CAST(Int32(1) AS Decimal128(24, 4)), CAST(Int8(4) AS Decimal128(24, 4)), CAST(Int64(8) AS Decimal128(24, 4))])\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected) } @@ -1150,8 +1146,7 @@ mod test { let like_expr = Expr::Like(Like::new(false, expr, pattern, None, false)); let empty = empty_with_type(DataType::Utf8); let plan = LogicalPlan::Projection(Projection::try_new(vec![like_expr], empty)?); - let expected = "Projection: a LIKE CAST(NULL AS Utf8) AS a LIKE NULL\ - \n EmptyRelation"; + let expected = "Projection: a LIKE CAST(NULL AS Utf8)\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; let expr = Box::new(col("a")); @@ -1179,8 +1174,7 @@ mod test { let ilike_expr = Expr::Like(Like::new(false, expr, pattern, None, true)); let empty = empty_with_type(DataType::Utf8); let plan = LogicalPlan::Projection(Projection::try_new(vec![ilike_expr], empty)?); - let expected = "Projection: a ILIKE CAST(NULL AS Utf8) AS a ILIKE NULL\ - \n EmptyRelation"; + let expected = "Projection: a ILIKE CAST(NULL AS Utf8)\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected)?; let expr = Box::new(col("a")); diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 9cd9e4dece26..45e5409ae9ac 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1108,7 +1108,7 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_, '_> { self.down_index += 1; } - let expr_name = expr.display_name()?; + let expr_name = expr.schema_name().to_string(); let (_, expr_alias) = self.common_exprs.entry(expr_id).or_insert_with(|| { let expr_alias = self.alias_generator.next(CSE_PREFIX); diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index fdd9ef8a8b0b..16b4e43abcd5 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -452,7 +452,8 @@ fn agg_exprs_evaluation_result_on_empty_batch( let simplifier = ExprSimplifier::new(info); let result_expr = simplifier.simplify(result_expr)?; if matches!(result_expr, Expr::Literal(ScalarValue::Int64(_))) { - expr_result_map_for_count_bug.insert(e.display_name()?, result_expr); + expr_result_map_for_count_bug + .insert(e.schema_name().to_string(), result_expr); } } Ok(()) @@ -490,7 +491,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( let expr_name = match expr { Expr::Alias(Alias { name, .. }) => name.to_string(), Expr::Column(Column { relation: _, name }) => name.to_string(), - _ => expr.display_name()?, + _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); } @@ -546,8 +547,8 @@ fn filter_exprs_evaluation_result_on_empty_batch( )], else_expr: Some(Box::new(Expr::Literal(ScalarValue::Null))), }); - expr_result_map_for_count_bug - .insert(new_expr.display_name()?, new_expr); + let expr_key = new_expr.schema_name().to_string(); + expr_result_map_for_count_bug.insert(expr_key, new_expr); } None } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index d014b9149aab..ac4ed87a4a1a 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -135,8 +135,8 @@ fn optimize_projections( let group_by_expr_existing = aggregate .group_expr .iter() - .map(|group_by_expr| group_by_expr.display_name()) - .collect::>>()?; + .map(|group_by_expr| group_by_expr.schema_name().to_string()) + .collect::>(); let new_group_bys = if let Some(simplest_groupby_indices) = get_required_group_by_exprs_indices( @@ -1928,8 +1928,8 @@ mod tests { WindowFunctionDefinition::AggregateUDF(max_udaf()), vec![col("test.b")], )); - let col1 = col(max1.display_name()?); - let col2 = col(max2.display_name()?); + let col1 = col(max1.schema_name().to_string()); + let col2 = col(max2.schema_name().to_string()); let plan = LogicalPlanBuilder::from(table_scan) .window(vec![max1])? diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 4254d3464662..8455919c35a8 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -817,7 +817,7 @@ impl OptimizerRule for PushDownFilter { let group_expr_columns = agg .group_expr .iter() - .map(|e| Ok(Column::from_qualified_name(e.display_name()?))) + .map(|e| Ok(Column::from_qualified_name(e.schema_name().to_string()))) .collect::>>()?; let predicates = split_conjunction_owned(filter.predicate.clone()); @@ -838,7 +838,7 @@ impl OptimizerRule for PushDownFilter { // So we need create a replace_map, add {`a+b` --> Expr(Column(a)+Column(b))} let mut replace_map = HashMap::new(); for expr in &agg.group_expr { - replace_map.insert(expr.display_name()?, expr.clone()); + replace_map.insert(expr.schema_name().to_string(), expr.clone()); } let replaced_push_predicates = push_predicates .into_iter() diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 3c66da21aff6..c79180b79256 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -188,9 +188,9 @@ impl OptimizerRule for ScalarSubqueryToJoin { let mut proj_exprs = vec![]; for expr in projection.expr.iter() { - let old_expr_name = expr.display_name()?; + let old_expr_name = expr.schema_name().to_string(); let new_expr = expr_to_rewrite_expr_map.get(expr).unwrap(); - let new_expr_name = new_expr.display_name()?; + let new_expr_name = new_expr.schema_name().to_string(); if new_expr_name != old_expr_name { proj_exprs.push(new_expr.clone().alias(old_expr_name)) } else { diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 5b4395792447..30cae17eaf9f 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -194,7 +194,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { } let arg = args.swap_remove(0); - if group_fields_set.insert(arg.display_name()?) { + if group_fields_set.insert(arg.schema_name().to_string()) { inner_group_exprs .push(arg.alias(SINGLE_DISTINCT_ALIAS)); } diff --git a/datafusion/proto/tests/cases/serialize.rs b/datafusion/proto/tests/cases/serialize.rs index cc683e778ebc..f28098d83b97 100644 --- a/datafusion/proto/tests/cases/serialize.rs +++ b/datafusion/proto/tests/cases/serialize.rs @@ -276,7 +276,7 @@ fn test_expression_serialization_roundtrip() { /// Extracts the first part of a function name /// 'foo(bar)' -> 'foo' fn extract_function_name(expr: &Expr) -> String { - let name = expr.display_name().unwrap(); + let name = expr.schema_name().to_string(); name.split('(').next().unwrap().to_string() } } diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index 71f64f1cf459..c1b3fe18f7e7 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -115,7 +115,7 @@ pub(crate) fn unproject_window_exprs(expr: &Expr, windows: &[&Window]) -> Result if let Some(unproj) = windows .iter() .flat_map(|w| w.window_expr.iter()) - .find(|window_expr| window_expr.display_name().unwrap() == c.name) + .find(|window_expr| window_expr.schema_name().to_string() == c.name) { Ok(Transformed::yes(unproj.clone())) } else { diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 3b044646e6cb..5cdc546e0267 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -329,7 +329,7 @@ pub(crate) fn transform_bottom_unnest( // Full context, we are trying to plan the execution as InnerProjection->Unnest->OuterProjection // inside unnest execution, each column inside the inner projection // will be transformed into new columns. Thus we need to keep track of these placeholding column names - let placeholder_name = unnest_expr.display_name()?; + let placeholder_name = unnest_expr.schema_name().to_string(); unnest_placeholder_columns.push(placeholder_name.clone()); // Add alias for the argument expression, to avoid naming conflicts @@ -402,7 +402,7 @@ pub(crate) fn transform_bottom_unnest( } else { // We need to evaluate the expr in the inner projection, // outer projection just select its name - let column_name = transformed_expr.display_name()?; + let column_name = transformed_expr.schema_name().to_string(); inner_projection_exprs.push(transformed_expr); Ok(vec![Expr::Column(Column::from_name(column_name))]) } @@ -469,16 +469,16 @@ mod tests { assert_eq!( transformed_exprs, vec![ - col("unnest(struct_col).field1"), - col("unnest(struct_col).field2"), + col("UNNEST(struct_col).field1"), + col("UNNEST(struct_col).field2"), ] ); - assert_eq!(unnest_placeholder_columns, vec!["unnest(struct_col)"]); + assert_eq!(unnest_placeholder_columns, vec!["UNNEST(struct_col)"]); // still reference struct_col in original schema but with alias, // to avoid colliding with the projection on the column itself if any assert_eq!( inner_projection_exprs, - vec![col("struct_col").alias("unnest(struct_col)"),] + vec![col("struct_col").alias("UNNEST(struct_col)"),] ); // unnest(array_col) + 1 @@ -491,12 +491,12 @@ mod tests { )?; assert_eq!( unnest_placeholder_columns, - vec!["unnest(struct_col)", "unnest(array_col)"] + vec!["UNNEST(struct_col)", "UNNEST(array_col)"] ); // only transform the unnest children assert_eq!( transformed_exprs, - vec![col("unnest(array_col)").add(lit(1i64))] + vec![col("UNNEST(array_col)").add(lit(1i64))] ); // keep appending to the current vector @@ -505,8 +505,8 @@ mod tests { assert_eq!( inner_projection_exprs, vec![ - col("struct_col").alias("unnest(struct_col)"), - col("array_col").alias("unnest(array_col)") + col("struct_col").alias("UNNEST(struct_col)"), + col("array_col").alias("UNNEST(array_col)") ] ); @@ -553,17 +553,17 @@ mod tests { // Only the inner most/ bottom most unnest is transformed assert_eq!( transformed_exprs, - vec![unnest(col("unnest(struct_col[matrix])"))] + vec![unnest(col("UNNEST(struct_col[matrix])"))] ); assert_eq!( unnest_placeholder_columns, - vec!["unnest(struct_col[matrix])"] + vec!["UNNEST(struct_col[matrix])"] ); assert_eq!( inner_projection_exprs, vec![col("struct_col") .field("matrix") - .alias("unnest(struct_col[matrix])"),] + .alias("UNNEST(struct_col[matrix])"),] ); Ok(()) diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index 9bbdbe8dbfc9..179fc108e6d2 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -447,10 +447,10 @@ fn test_unnest_logical_plan() -> Result<()> { let sql_to_rel = SqlToRel::new(&context); let plan = sql_to_rel.sql_statement_to_plan(statement).unwrap(); - let expected = "Projection: unnest(unnest_table.struct_col).field1, unnest(unnest_table.struct_col).field2, unnest(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ - \n Unnest: lists[unnest(unnest_table.array_col)] structs[unnest(unnest_table.struct_col)]\ - \n Projection: unnest_table.struct_col AS unnest(unnest_table.struct_col), unnest_table.array_col AS unnest(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ - \n TableScan: unnest_table"; + let expected = "Projection: UNNEST(unnest_table.struct_col).field1, UNNEST(unnest_table.struct_col).field2, UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ + \n Unnest: lists[UNNEST(unnest_table.array_col)] structs[UNNEST(unnest_table.struct_col)]\ + \n Projection: unnest_table.struct_col AS UNNEST(unnest_table.struct_col), unnest_table.array_col AS UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col\ + \n TableScan: unnest_table"; assert_eq!(format!("{plan}"), expected); diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index b71bc765ba37..2b8b9429cce2 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -1137,7 +1137,7 @@ from arrays_values_without_nulls; ## array_element (aliases: array_extract, list_extract, list_element) # Testing with empty arguments should result in an error -query error DataFusion error: Error during planning: Error during planning: array_element does not support zero arguments. +query error DataFusion error: Error during planning: Error during planning: array_element does not support zero arguments select array_element(); # array_element error @@ -1979,7 +1979,7 @@ select array_slice(a, -1, 2, 1), array_slice(a, -1, 2), [6.0] [6.0] [] [] # Testing with empty arguments should result in an error -query error DataFusion error: Error during planning: Error during planning: array_slice does not support zero arguments. +query error DataFusion error: Error during planning: Error during planning: array_slice does not support zero arguments select array_slice(); diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index 3ca187ddee84..2d74c1fc6994 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -36,9 +36,9 @@ query TT explain select uc2 from (select unnest(column2) as uc2, column1 from v) where column1 = 2; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2 -02)--Unnest: lists[unnest(v.column2)] structs[] -03)----Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2 +02)--Unnest: lists[UNNEST(v.column2)] structs[] +03)----Projection: v.column2 AS UNNEST(v.column2), v.column1 04)------Filter: v.column1 = Int64(2) 05)--------TableScan: v projection=[column1, column2] @@ -53,11 +53,11 @@ query TT explain select uc2 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2 -02)--Filter: unnest(v.column2) > Int64(3) -03)----Projection: unnest(v.column2) -04)------Unnest: lists[unnest(v.column2)] structs[] -05)--------Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2 +02)--Filter: UNNEST(v.column2) > Int64(3) +03)----Projection: UNNEST(v.column2) +04)------Unnest: lists[UNNEST(v.column2)] structs[] +05)--------Projection: v.column2 AS UNNEST(v.column2), v.column1 06)----------TableScan: v projection=[column1, column2] query II @@ -71,10 +71,10 @@ query TT explain select uc2, column1 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3 AND column1 = 2; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2, v.column1 -02)--Filter: unnest(v.column2) > Int64(3) -03)----Unnest: lists[unnest(v.column2)] structs[] -04)------Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2, v.column1 +02)--Filter: UNNEST(v.column2) > Int64(3) +03)----Unnest: lists[UNNEST(v.column2)] structs[] +04)------Projection: v.column2 AS UNNEST(v.column2), v.column1 05)--------Filter: v.column1 = Int64(2) 06)----------TableScan: v projection=[column1, column2] @@ -90,10 +90,10 @@ query TT explain select uc2, column1 from (select unnest(column2) as uc2, column1 from v) where uc2 > 3 OR column1 = 2; ---- logical_plan -01)Projection: unnest(v.column2) AS uc2, v.column1 -02)--Filter: unnest(v.column2) > Int64(3) OR v.column1 = Int64(2) -03)----Unnest: lists[unnest(v.column2)] structs[] -04)------Projection: v.column2 AS unnest(v.column2), v.column1 +01)Projection: UNNEST(v.column2) AS uc2, v.column1 +02)--Filter: UNNEST(v.column2) > Int64(3) OR v.column1 = Int64(2) +03)----Unnest: lists[UNNEST(v.column2)] structs[] +04)------Projection: v.column2 AS UNNEST(v.column2), v.column1 05)--------TableScan: v projection=[column1, column2] statement ok @@ -112,10 +112,10 @@ query TT explain select * from (select column1, unnest(column2) as o from d) where o['a'] = 1; ---- logical_plan -01)Projection: d.column1, unnest(d.column2) AS o -02)--Filter: get_field(unnest(d.column2), Utf8("a")) = Int64(1) -03)----Unnest: lists[unnest(d.column2)] structs[] -04)------Projection: d.column1, d.column2 AS unnest(d.column2) +01)Projection: d.column1, UNNEST(d.column2) AS o +02)--Filter: get_field(UNNEST(d.column2), Utf8("a")) = Int64(1) +03)----Unnest: lists[UNNEST(d.column2)] structs[] +04)------Projection: d.column1, d.column2 AS UNNEST(d.column2) 05)--------TableScan: d projection=[column1, column2] diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index d818c0e92795..4957011b8ba2 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -539,21 +539,21 @@ query TT explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unnest_table; ---- logical_plan -01)Unnest: lists[unnest(unnest(unnest(recursive_unnest_table.column3)[c1]))] structs[] -02)--Projection: unnest(unnest(recursive_unnest_table.column3)[c1]) AS unnest(unnest(unnest(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 -03)----Unnest: lists[unnest(unnest(recursive_unnest_table.column3)[c1])] structs[] -04)------Projection: get_field(unnest(recursive_unnest_table.column3), Utf8("c1")) AS unnest(unnest(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 -05)--------Unnest: lists[unnest(recursive_unnest_table.column3)] structs[] -06)----------Projection: recursive_unnest_table.column3 AS unnest(recursive_unnest_table.column3), recursive_unnest_table.column3 +01)Unnest: lists[UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1]))] structs[] +02)--Projection: UNNEST(UNNEST(recursive_unnest_table.column3)[c1]) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 +03)----Unnest: lists[UNNEST(UNNEST(recursive_unnest_table.column3)[c1])] structs[] +04)------Projection: get_field(UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS UNNEST(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 +05)--------Unnest: lists[UNNEST(recursive_unnest_table.column3)] structs[] +06)----------Projection: recursive_unnest_table.column3 AS UNNEST(recursive_unnest_table.column3), recursive_unnest_table.column3 07)------------TableScan: recursive_unnest_table projection=[column3] physical_plan 01)UnnestExec -02)--ProjectionExec: expr=[unnest(unnest(recursive_unnest_table.column3)[c1])@0 as unnest(unnest(unnest(recursive_unnest_table.column3)[c1])), column3@1 as column3] +02)--ProjectionExec: expr=[UNNEST(UNNEST(recursive_unnest_table.column3)[c1])@0 as UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), column3@1 as column3] 03)----UnnestExec -04)------ProjectionExec: expr=[get_field(unnest(recursive_unnest_table.column3)@0, c1) as unnest(unnest(recursive_unnest_table.column3)[c1]), column3@1 as column3] +04)------ProjectionExec: expr=[get_field(UNNEST(recursive_unnest_table.column3)@0, c1) as UNNEST(UNNEST(recursive_unnest_table.column3)[c1]), column3@1 as column3] 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 06)----------UnnestExec -07)------------ProjectionExec: expr=[column3@0 as unnest(recursive_unnest_table.column3), column3@0 as column3] +07)------------ProjectionExec: expr=[column3@0 as UNNEST(recursive_unnest_table.column3), column3@0 as column3] 08)--------------MemoryExec: partitions=1, partition_sizes=[1] ## group by unnest diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 89f2efec66aa..f2756bb06d1e 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -417,7 +417,7 @@ pub async fn from_substrait_rel( } // Ensure the expression has a unique display name, so that project's // validate_unique_names doesn't fail - let name = x.display_name()?; + let name = x.schema_name().to_string(); let mut new_name = name.clone(); let mut i = 0; while names.contains(&new_name) { diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 360377c231a3..0a86d27e013c 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -358,8 +358,8 @@ mod tests { let plan = from_substrait_plan(&ctx, &proto).await?; let plan_str = format!("{}", plan); - assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ - \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ + assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE Utf8(\"PROMO%\") THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ + \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE Utf8(\"PROMO%\") THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ \n Projection: CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount) ELSE Decimal128(Some(0),19,0) END, FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount)\ \n Filter: FILENAME_PLACEHOLDER_0.l_partkey = FILENAME_PLACEHOLDER_1.p_partkey AND FILENAME_PLACEHOLDER_0.l_shipdate >= Date32(\"1995-09-01\") AND FILENAME_PLACEHOLDER_0.l_shipdate < CAST(Utf8(\"1995-10-01\") AS Date32)\ \n Inner Join: Filter: Boolean(true)\