From d19e9d684bbe1fd820674d48a96795bfbea9db7d Mon Sep 17 00:00:00 2001 From: wiedld Date: Sat, 23 Sep 2023 03:42:48 -0700 Subject: [PATCH] fix(5975/5976): timezone handling for timestamps and `date_trunc`, `date_part` and `date_bin` (#7614) * test: enforce timestamptz contract * test(5975/5976): demonstrate what logical plan casting must occur with datetime scalar functions. * These test cases also document how our scalar functions are currently not correct. * Extra comments documenting the logical plan will be removed on test cleanup (after code fixes). * fix(5975/5976): enable type coercion to include specific timezones * Prior to this change, the outcome was always coerced to Timestamp(Nanoseconds, None) and the tz was dropped. * fix(5975/5976): have date_trunc use DateTime, instead of NaiveDateTime * chore(5975/5976): test cleanup -- consolidate into the single timestamps test file * fix(5975/5976): enable all valid timezones to be supported in type coercion * chore: update cargo.lock in datafusion-cli * test(5975/5976): tests to document the bounds of timezone acceptance * test(5975/5976): document irregular offsets and daylight savings time * refactor(5975/5976): do not parse timezone string during type coersion, should have already failed in parser if invalid * chore: properly abbreviate abbreviations * fix(5975/5976): apply tz string parsing per batch. * move parsing up to date_trunc() to apply per batch, not per value. * do not infer a default UTC timezone for missing tz. Instead use the appropriate type for with, or without, tz. --- datafusion-cli/Cargo.lock | 36 +- datafusion/core/tests/sql/expr.rs | 8 + datafusion/expr/Cargo.toml | 1 + datafusion/expr/src/built_in_function.rs | 32 +- .../expr/src/type_coercion/functions.rs | 135 ++++--- .../physical-expr/src/datetime_expressions.rs | 166 ++++++-- .../sqllogictest/test_files/timestamps.slt | 371 ++++++++++++++++++ 7 files changed, 648 insertions(+), 101 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 0cf511e566d9..ba733726c379 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -32,9 +32,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2135563fb5c609d2b2b87c1e8ce7bc41b0b45430fa9661f457981503dd5bf0" +checksum = "ea5d730647d4fadd988536d06fecce94b7b4f2a7efdae548f1cf4b63205518ab" dependencies = [ "memchr", ] @@ -711,16 +711,15 @@ dependencies = [ [[package]] name = "blake3" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "199c42ab6972d92c9f8995f086273d25c42fc0f7b2a1fcefba465c1352d25ba5" +checksum = "0231f06152bf547e9c2b5194f247cd97aacf6dcd8b15d8e5ec0663f64580da87" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", - "digest", ] [[package]] @@ -1156,6 +1155,7 @@ version = "31.0.0" dependencies = [ "ahash", "arrow", + "arrow-array", "datafusion-common", "sqlparser", "strum 0.25.0", @@ -1659,9 +1659,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" [[package]] name = "hex" @@ -2191,7 +2191,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.2", + "hermit-abi 0.3.3", "libc", ] @@ -2723,9 +2723,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.13" +version = "0.38.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7db8590df6dfcd144d22afd1b83b36c21a18d7cbc1dc4bb5295a8712e9eb662" +checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f" dependencies = [ "bitflags 2.4.0", "errno", @@ -2964,9 +2964,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" [[package]] name = "snafu" @@ -3309,9 +3309,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.8" +version = "0.7.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "806fe8c2c87eccc8b3267cbae29ed3ab2d0bd37fca70ab622e46aaa9375ddb7d" +checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" dependencies = [ "bytes", "futures-core", @@ -3433,9 +3433,9 @@ checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" [[package]] name = "unicode-width" -version = "0.1.10" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" [[package]] name = "untrusted" @@ -3644,9 +3644,9 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" dependencies = [ "winapi", ] diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index 2cc61b1fdd09..044b3b57ea90 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -128,6 +128,14 @@ async fn test_encoding_expressions() -> Result<()> { Ok(()) } +#[should_panic(expected = "Invalid timezone \\\"Foo\\\": 'Foo' is not a valid timezone")] +#[tokio::test] +async fn test_array_cast_invalid_timezone_will_panic() { + let ctx = SessionContext::new(); + let sql = "SELECT arrow_cast('2021-01-02T03:04:00', 'Timestamp(Nanosecond, Some(\"Foo\"))')"; + execute(&ctx, sql).await; +} + #[tokio::test] #[cfg_attr(not(feature = "crypto_expressions"), ignore)] async fn test_crypto_expressions() -> Result<()> { diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index f6d8e15fdd9d..4d69ce747518 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -37,6 +37,7 @@ path = "src/lib.rs" [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } arrow = { workspace = true } +arrow-array = { workspace = true } datafusion-common = { path = "../common", version = "31.0.0", default-features = false } sqlparser = { workspace = true } strum = { version = "0.25.0", features = ["derive"] } diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 3f1eb581aa98..a42963495617 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -1020,9 +1020,13 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::DateTrunc => Signature::one_of( vec![ Exact(vec![Utf8, Timestamp(Nanosecond, None)]), + Exact(vec![Utf8, Timestamp(Nanosecond, Some("+TZ".into()))]), Exact(vec![Utf8, Timestamp(Microsecond, None)]), + Exact(vec![Utf8, Timestamp(Microsecond, Some("+TZ".into()))]), Exact(vec![Utf8, Timestamp(Millisecond, None)]), + Exact(vec![Utf8, Timestamp(Millisecond, Some("+TZ".into()))]), Exact(vec![Utf8, Timestamp(Second, None)]), + Exact(vec![Utf8, Timestamp(Second, Some("+TZ".into()))]), ], self.volatility(), ), @@ -1034,16 +1038,37 @@ impl BuiltinScalarFunction { Timestamp(array_type.clone(), None), Timestamp(Nanosecond, None), ]), + Exact(vec![ + Interval(MonthDayNano), + Timestamp(array_type.clone(), Some("+TZ".into())), + Timestamp(Nanosecond, Some("+TZ".into())), + ]), Exact(vec![ Interval(DayTime), Timestamp(array_type.clone(), None), Timestamp(Nanosecond, None), ]), + Exact(vec![ + Interval(DayTime), + Timestamp(array_type.clone(), Some("+TZ".into())), + Timestamp(Nanosecond, Some("+TZ".into())), + ]), + Exact(vec![ + Interval(MonthDayNano), + Timestamp(array_type.clone(), None), + ]), Exact(vec![ Interval(MonthDayNano), + Timestamp(array_type.clone(), Some("+TZ".into())), + ]), + Exact(vec![ + Interval(DayTime), Timestamp(array_type.clone(), None), ]), - Exact(vec![Interval(DayTime), Timestamp(array_type, None)]), + Exact(vec![ + Interval(DayTime), + Timestamp(array_type, Some("+TZ".into())), + ]), ] }; @@ -1060,10 +1085,13 @@ impl BuiltinScalarFunction { Exact(vec![Utf8, Date32]), Exact(vec![Utf8, Date64]), Exact(vec![Utf8, Timestamp(Second, None)]), + Exact(vec![Utf8, Timestamp(Second, Some("+TZ".into()))]), Exact(vec![Utf8, Timestamp(Microsecond, None)]), + Exact(vec![Utf8, Timestamp(Microsecond, Some("+TZ".into()))]), Exact(vec![Utf8, Timestamp(Millisecond, None)]), + Exact(vec![Utf8, Timestamp(Millisecond, Some("+TZ".into()))]), Exact(vec![Utf8, Timestamp(Nanosecond, None)]), - Exact(vec![Utf8, Timestamp(Nanosecond, Some("+00:00".into()))]), + Exact(vec![Utf8, Timestamp(Nanosecond, Some("+TZ".into()))]), ], self.volatility(), ), diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 371a3950ac6e..883ca2b39362 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -120,7 +120,7 @@ fn maybe_data_types( new_type.push(current_type.clone()) } else { // attempt to coerce - if can_coerce_from(valid_type, current_type) { + if let Some(valid_type) = coerced_from(valid_type, current_type) { new_type.push(valid_type.clone()) } else { // not possible @@ -136,62 +136,97 @@ fn maybe_data_types( /// /// See the module level documentation for more detail on coercion. pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool { - use self::DataType::*; - if type_into == type_from { return true; } - // Null can convert to most of types + if let Some(coerced) = coerced_from(type_into, type_from) { + return coerced == type_into; + } + false +} + +fn coerced_from<'a>( + type_into: &'a DataType, + type_from: &'a DataType, +) -> Option<&'a DataType> { + use self::DataType::*; + match type_into { - Int8 => matches!(type_from, Null | Int8), - Int16 => matches!(type_from, Null | Int8 | Int16 | UInt8), - Int32 => matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 | UInt16), - Int64 => matches!( - type_from, - Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 - ), - UInt8 => matches!(type_from, Null | UInt8), - UInt16 => matches!(type_from, Null | UInt8 | UInt16), - UInt32 => matches!(type_from, Null | UInt8 | UInt16 | UInt32), - UInt64 => matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64), - Float32 => matches!( - type_from, - Null | Int8 - | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float32 - ), - Float64 => matches!( - type_from, - Null | Int8 - | Int16 - | Int32 - | Int64 - | UInt8 - | UInt16 - | UInt32 - | UInt64 - | Float32 - | Float64 - | Decimal128(_, _) - ), - Timestamp(TimeUnit::Nanosecond, _) => { - matches!( + // coerced into type_into + Int8 if matches!(type_from, Null | Int8) => Some(type_into), + Int16 if matches!(type_from, Null | Int8 | Int16 | UInt8) => Some(type_into), + Int32 if matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 | UInt16) => { + Some(type_into) + } + Int64 + if matches!( type_from, - Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8 - ) + Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 + ) => + { + Some(type_into) } - Interval(_) => { - matches!(type_from, Utf8 | LargeUtf8) + UInt8 if matches!(type_from, Null | UInt8) => Some(type_into), + UInt16 if matches!(type_from, Null | UInt8 | UInt16) => Some(type_into), + UInt32 if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => Some(type_into), + UInt64 if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64) => { + Some(type_into) } - Utf8 | LargeUtf8 => true, - Null => can_cast_types(type_from, type_into), - _ => false, + Float32 + if matches!( + type_from, + Null | Int8 + | Int16 + | Int32 + | Int64 + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Float32 + ) => + { + Some(type_into) + } + Float64 + if matches!( + type_from, + Null | Int8 + | Int16 + | Int32 + | Int64 + | UInt8 + | UInt16 + | UInt32 + | UInt64 + | Float32 + | Float64 + | Decimal128(_, _) + ) => + { + Some(type_into) + } + Timestamp(TimeUnit::Nanosecond, None) + if matches!( + type_from, + Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8 + ) => + { + Some(type_into) + } + Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into), + Utf8 | LargeUtf8 => Some(type_into), + Null if can_cast_types(type_from, type_into) => Some(type_into), + + // Coerce to consistent timezones, if the `type_from` timezone exists. + Timestamp(TimeUnit::Nanosecond, Some(_)) + if matches!(type_from, Timestamp(TimeUnit::Nanosecond, Some(_))) => + { + Some(type_from) + } + + // cannot coerce + _ => None, } } diff --git a/datafusion/physical-expr/src/datetime_expressions.rs b/datafusion/physical-expr/src/datetime_expressions.rs index 63f7645fea5d..263b46192e96 100644 --- a/datafusion/physical-expr/src/datetime_expressions.rs +++ b/datafusion/physical-expr/src/datetime_expressions.rs @@ -20,8 +20,10 @@ use arrow::array::Float64Builder; use arrow::compute::cast; use arrow::{ - array::TimestampNanosecondArray, compute::kernels::temporal, datatypes::TimeUnit, - temporal_conversions::timestamp_ns_to_datetime, + array::TimestampNanosecondArray, + compute::kernels::temporal, + datatypes::TimeUnit, + temporal_conversions::{as_datetime_with_timezone, timestamp_ns_to_datetime}, }; use arrow::{ array::{Array, ArrayRef, Float64Array, OffsetSizeTrait, PrimitiveArray}, @@ -33,7 +35,8 @@ use arrow::{ }, }; use arrow_array::{ - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampSecondArray, + timezone::Tz, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampSecondArray, }; use chrono::prelude::*; use chrono::{Duration, Months, NaiveDate}; @@ -47,6 +50,7 @@ use datafusion_common::{ ScalarValue, }; use datafusion_expr::ColumnarValue; +use std::str::FromStr; use std::sync::Arc; /// given a function `op` that maps a `&str` to a Result of an arrow native type, @@ -208,23 +212,20 @@ pub fn make_current_time( move |_arg| Ok(ColumnarValue::Scalar(ScalarValue::Time64Nanosecond(nano))) } -fn quarter_month(date: &NaiveDateTime) -> u32 { +fn quarter_month(date: &T) -> u32 +where + T: chrono::Datelike, +{ 1 + 3 * ((date.month() - 1) / 3) } -/// Tuncates the single `value`, expressed in nanoseconds since the -/// epoch, for granularities greater than 1 second, in taking into -/// account that some granularities are not uniform durations of time -/// (e.g. months are not always the same lengths, leap seconds, etc) -fn date_trunc_coarse(granularity: &str, value: i64) -> Result { - // Use chrono NaiveDateTime to clear the various fields - // correctly accounting for non uniform granularities - let value = timestamp_ns_to_datetime(value).ok_or_else(|| { - DataFusionError::Execution(format!("Timestamp {value} out of range")) - })?; - - let value = Some(value); - +fn _date_trunc_coarse(granularity: &str, value: Option) -> Result> +where + T: chrono::Datelike + + chrono::Timelike + + std::ops::Sub + + std::marker::Copy, +{ let value = match granularity { "millisecond" => value, "microsecond" => value, @@ -271,7 +272,49 @@ fn date_trunc_coarse(granularity: &str, value: i64) -> Result { return exec_err!("Unsupported date_trunc granularity: {unsupported}"); } }; - let value = value.and_then(|value| value.timestamp_nanos_opt()); + Ok(value) +} + +fn _date_trunc_coarse_with_tz( + granularity: &str, + value: Option>, +) -> Result> { + let value = _date_trunc_coarse::>(granularity, value)?; + Ok(value.and_then(|value| value.timestamp_nanos_opt())) +} + +fn _date_trunc_coarse_without_tz( + granularity: &str, + value: Option, +) -> Result> { + let value = _date_trunc_coarse::(granularity, value)?; + Ok(value.and_then(|value| value.timestamp_nanos_opt())) +} + +/// Tuncates the single `value`, expressed in nanoseconds since the +/// epoch, for granularities greater than 1 second, in taking into +/// account that some granularities are not uniform durations of time +/// (e.g. months are not always the same lengths, leap seconds, etc) +fn date_trunc_coarse(granularity: &str, value: i64, tz: Arc>) -> Result { + let value = match tz.as_ref() { + Some(tz) => { + // Use chrono DateTime to clear the various fields because need to clear per timezone, + // and NaiveDateTime (ISO 8601) has no concept of timezones + let value = as_datetime_with_timezone::(value, *tz) + .ok_or(DataFusionError::Execution(format!( + "Timestamp {value} out of range" + )))?; + _date_trunc_coarse_with_tz(granularity, Some(value)) + } + None => { + // Use chrono NaiveDateTime to clear the various fields, if we don't have a timezone. + let value = timestamp_ns_to_datetime(value).ok_or_else(|| { + DataFusionError::Execution(format!("Timestamp {value} out of range")) + })?; + _date_trunc_coarse_without_tz(granularity, Some(value)) + } + }?; + // `with_x(0)` are infallible because `0` are always a valid Ok(value.unwrap()) } @@ -280,6 +323,7 @@ fn date_trunc_coarse(granularity: &str, value: i64) -> Result { fn _date_trunc( tu: TimeUnit, value: &Option, + tz: Arc>, granularity: &str, ) -> Result, DataFusionError> { let scale = match tu { @@ -294,7 +338,7 @@ fn _date_trunc( }; // convert to nanoseconds - let nano = date_trunc_coarse(granularity, scale * value)?; + let nano = date_trunc_coarse(granularity, scale * value, tz)?; let result = match tu { TimeUnit::Second => match granularity { @@ -323,6 +367,16 @@ fn _date_trunc( Ok(result) } +fn parse_tz(tz: &Option>) -> Result> { + tz.as_ref() + .map(|tz| { + Tz::from_str(tz).map_err(|op| { + DataFusionError::Execution(format!("failed on timezone {tz}: {:?}", op)) + }) + }) + .transpose() +} + /// date_trunc SQL function pub fn date_trunc(args: &[ColumnarValue]) -> Result { let (granularity, array) = (&args[0], &args[1]); @@ -336,62 +390,112 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result { Ok(match array { ColumnarValue::Scalar(ScalarValue::TimestampNanosecond(v, tz_opt)) => { - let value = _date_trunc(TimeUnit::Nanosecond, v, granularity.as_str())?; + let parsed_tz = parse_tz(tz_opt)?; + let value = _date_trunc( + TimeUnit::Nanosecond, + v, + Arc::new(parsed_tz), + granularity.as_str(), + )?; let value = ScalarValue::TimestampNanosecond(value, tz_opt.clone()); ColumnarValue::Scalar(value) } ColumnarValue::Scalar(ScalarValue::TimestampMicrosecond(v, tz_opt)) => { - let value = _date_trunc(TimeUnit::Microsecond, v, granularity.as_str())?; + let parsed_tz = parse_tz(tz_opt)?; + let value = _date_trunc( + TimeUnit::Microsecond, + v, + Arc::new(parsed_tz), + granularity.as_str(), + )?; let value = ScalarValue::TimestampMicrosecond(value, tz_opt.clone()); ColumnarValue::Scalar(value) } ColumnarValue::Scalar(ScalarValue::TimestampMillisecond(v, tz_opt)) => { - let value = _date_trunc(TimeUnit::Millisecond, v, granularity.as_str())?; + let parsed_tz = parse_tz(tz_opt)?; + let value = _date_trunc( + TimeUnit::Millisecond, + v, + Arc::new(parsed_tz), + granularity.as_str(), + )?; let value = ScalarValue::TimestampMillisecond(value, tz_opt.clone()); ColumnarValue::Scalar(value) } ColumnarValue::Scalar(ScalarValue::TimestampSecond(v, tz_opt)) => { - let value = _date_trunc(TimeUnit::Second, v, granularity.as_str())?; + let parsed_tz = parse_tz(tz_opt)?; + let value = _date_trunc( + TimeUnit::Second, + v, + Arc::new(parsed_tz), + granularity.as_str(), + )?; let value = ScalarValue::TimestampSecond(value, tz_opt.clone()); ColumnarValue::Scalar(value) } ColumnarValue::Array(array) => { let array_type = array.data_type(); match array_type { - DataType::Timestamp(TimeUnit::Second, _) => { + DataType::Timestamp(TimeUnit::Second, tz_opt) => { + let parsed_tz = Arc::new(parse_tz(tz_opt)?); let array = as_timestamp_second_array(array)?; let array = array .iter() - .map(|x| _date_trunc(TimeUnit::Second, &x, granularity.as_str())) + .map(|x| { + _date_trunc( + TimeUnit::Second, + &x, + parsed_tz.clone(), + granularity.as_str(), + ) + }) .collect::>()?; ColumnarValue::Array(Arc::new(array)) } - DataType::Timestamp(TimeUnit::Millisecond, _) => { + DataType::Timestamp(TimeUnit::Millisecond, tz_opt) => { + let parsed_tz = Arc::new(parse_tz(tz_opt)?); let array = as_timestamp_millisecond_array(array)?; let array = array .iter() .map(|x| { - _date_trunc(TimeUnit::Millisecond, &x, granularity.as_str()) + _date_trunc( + TimeUnit::Millisecond, + &x, + parsed_tz.clone(), + granularity.as_str(), + ) }) .collect::>()?; ColumnarValue::Array(Arc::new(array)) } - DataType::Timestamp(TimeUnit::Microsecond, _) => { + DataType::Timestamp(TimeUnit::Microsecond, tz_opt) => { + let parsed_tz = Arc::new(parse_tz(tz_opt)?); let array = as_timestamp_microsecond_array(array)?; let array = array .iter() .map(|x| { - _date_trunc(TimeUnit::Microsecond, &x, granularity.as_str()) + _date_trunc( + TimeUnit::Microsecond, + &x, + parsed_tz.clone(), + granularity.as_str(), + ) }) .collect::>()?; ColumnarValue::Array(Arc::new(array)) } _ => { + let parsed_tz = Arc::new(None); let array = as_timestamp_nanosecond_array(array)?; let array = array .iter() .map(|x| { - _date_trunc(TimeUnit::Nanosecond, &x, granularity.as_str()) + _date_trunc( + TimeUnit::Nanosecond, + &x, + parsed_tz.clone(), + granularity.as_str(), + ) }) .collect::>()?; @@ -958,7 +1062,7 @@ mod tests { cases.iter().for_each(|(original, granularity, expected)| { let left = string_to_timestamp_nanos(original).unwrap(); let right = string_to_timestamp_nanos(expected).unwrap(); - let result = date_trunc_coarse(granularity, left).unwrap(); + let result = date_trunc_coarse(granularity, left, Arc::new(None)).unwrap(); assert_eq!(result, right, "{original} = {expected}"); }); } diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index abb74b468e8f..88a024e0f9da 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -100,6 +100,40 @@ select * from foo where ts != '2000-02-01T00:00:00'; statement ok drop table foo; + +########## +## Timezone Handling Tests +########## + +statement ok +SET TIME ZONE = '+08' + +# should use execution timezone +query P +SELECT TIMESTAMPTZ '2000-01-01T01:01:01' +---- +2000-01-01T01:01:01+08:00 + +# casts return timezone to use execution timezone (same as postgresql) +query P +SELECT TIMESTAMPTZ '2000-01-01T01:01:01+07:00' +---- +2000-01-01T02:01:01+08:00 + +query P +SELECT TIMESTAMPTZ '2000-01-01T01:01:01Z' +---- +2000-01-01T09:01:01+08:00 + +statement ok +SET TIME ZONE = '+00' + +query P +SELECT TIMESTAMPTZ '2000-01-01T01:01:01' +---- +2000-01-01T01:01:01Z + + ########## ## to_timestamp tests ########## @@ -1307,3 +1341,340 @@ drop table ts_data_millis statement ok drop table ts_data_secs + + + +########## +## Timezone impact on builtin scalar functions +# +# server time = +07 +########## + +statement ok +set timezone to '+07'; + +# postgresql: 2000-01-01 01:00:00+07 +query P +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as ts +---- +2000-01-01T01:00:00+07:00 + +# postgresql: 2000-01-01 00:00:00+07 +query P +SELECT date_trunc('day', TIMESTAMPTZ '2000-01-01T01:01:01') as ts +---- +2000-01-01T00:00:00+07:00 + +# postgresql: 2000-01-01 08:00:00+07 +query P +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T01:01:01Z') as ts +---- +2000-01-01T08:00:00+07:00 + +# postgresql: 2000-01-01 00:00:00+07 +query P +SELECT date_trunc('day', TIMESTAMPTZ '2000-01-01T01:01:01Z') as ts +---- +2000-01-01T00:00:00+07:00 + +# postgresql: 2022-01-01 00:00:00+07 +query P +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 20:10:00', TIMESTAMPTZ '2020-01-01') +---- +2022-01-01T00:00:00+07:00 + +# postgresql: 2022-01-02 00:00:00+07 +query P +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 20:10:00Z', TIMESTAMPTZ '2020-01-01') +---- +2022-01-02T00:00:00+07:00 + +# postgresql: 1 +query R +SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as part +---- +1 + +# postgresql: 8 +query R +SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01Z') as part +---- +8 + + + +########## +## Timezone impact on builtin scalar functions +# +# server time = UTC +########## + +statement ok +set timezone to '+00'; + +# postgresql: 2000-01-01T01:00:00+00 +query P +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as ts +---- +2000-01-01T01:00:00Z + +# postgresql: 2000-01-01T00:00:00+00 +query P +SELECT date_trunc('day', TIMESTAMPTZ '2000-01-01T01:01:01') as ts +---- +2000-01-01T00:00:00Z + +# postgresql: 1999-12-31T18:00:00+00 +query P +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T01:01:01+07') as ts +---- +1999-12-31T18:00:00Z + +# postgresql: 1999-12-31T00:00:00+00 +query P +SELECT date_trunc('day', TIMESTAMPTZ '2000-01-01T01:01:01+07') as ts +---- +1999-12-31T00:00:00Z + +# postgresql: 2022-01-01 00:00:00+00 +query P +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 20:10:00', TIMESTAMPTZ '2020-01-01') +---- +2022-01-01T00:00:00Z + +# postgresql: 2021-12-31 00:00:00+00 +query P +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 01:10:00+07', TIMESTAMPTZ '2020-01-01') +---- +2021-12-31T00:00:00Z + +# postgresql: 1 +query R +SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as part +---- +1 + +# postgresql: 18 +query R +SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01+07') as part +---- +18 + + + +########## +## Timezone impact on builtin scalar functions +# +# irregular offsets +########## + +query P rowsort +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T00:00:00+00:45') as ts_irregular_offset + UNION ALL +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T00:00:00+00:30') as ts_irregular_offset + UNION ALL +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T00:00:00+00:15') as ts_irregular_offset + UNION ALL +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T00:00:00-00:15') as ts_irregular_offset + UNION ALL +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T00:00:00-00:30') as ts_irregular_offset + UNION ALL +SELECT date_trunc('hour', TIMESTAMPTZ '2000-01-01T00:00:00-00:45') as ts_irregular_offset +---- +1999-12-31T23:00:00Z +1999-12-31T23:00:00Z +1999-12-31T23:00:00Z +2000-01-01T00:00:00Z +2000-01-01T00:00:00Z +2000-01-01T00:00:00Z + +query P rowsort +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 00:00:00+00:30', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 00:00:00+00:15', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 00:00:00-00:15', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 00:00:00-00:30', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset +---- +2021-12-31T00:00:00Z +2021-12-31T00:00:00Z +2022-01-01T00:00:00Z +2022-01-01T00:00:00Z + +query P rowsort +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00+01:15', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00+00:45', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00+00:30', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00+00:15', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00-00:15', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00-00:30', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00-00:45', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset + UNION ALL +SELECT date_bin('1 hour', TIMESTAMPTZ '2022-01-01 00:00:00-01:15', TIMESTAMPTZ '2020-01-01') as ts_irregular_offset +---- +2021-12-31T22:00:00Z +2021-12-31T23:00:00Z +2021-12-31T23:00:00Z +2021-12-31T23:00:00Z +2022-01-01T00:00:00Z +2022-01-01T00:00:00Z +2022-01-01T00:00:00Z +2022-01-01T01:00:00Z + + + +########## +## Timezone acceptance bounds +# +# standard formats +########## + +query P +SELECT TIMESTAMPTZ '2022-01-01 01:10:00' as rfc3339_no_tz +---- +2022-01-01T01:10:00Z + +# +00, +00:00, +0000 +# +01, +01:00, +0100 +# -01, -01:00, -0100 +query P rowsort +SELECT TIMESTAMPTZ '2022-01-01 01:10:00+00' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00+00:00' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00+0000' as rfc3339_offset_tz + UNION ALL + SELECT TIMESTAMPTZ '2022-01-01 01:10:00+01' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00+01:00' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00+0100' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00-01' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00-01:00' as rfc3339_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00-0100' as rfc3339_offset_tz +---- +2022-01-01T00:10:00Z +2022-01-01T00:10:00Z +2022-01-01T00:10:00Z +2022-01-01T01:10:00Z +2022-01-01T01:10:00Z +2022-01-01T01:10:00Z +2022-01-01T02:10:00Z +2022-01-01T02:10:00Z +2022-01-01T02:10:00Z + +query P +SELECT TIMESTAMPTZ '2022-01-01T01:10:00' as iso8601_no_tz +---- +2022-01-01T01:10:00Z + +# +00, +00:00, +0000 +# +01, +01:00, +0100 +# -01, -01:00, -0100 +query P rowsort +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+00' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+00:00' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+0000' as iso8601_offset_tz + UNION ALL + SELECT TIMESTAMPTZ '2022-01-01T01:10:00+01' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+01:00' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+0100' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00-01' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00-01:00' as iso8601_offset_tz + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01T01:10:00-0100' as iso8601_offset_tz +---- +2022-01-01T00:10:00Z +2022-01-01T00:10:00Z +2022-01-01T00:10:00Z +2022-01-01T01:10:00Z +2022-01-01T01:10:00Z +2022-01-01T01:10:00Z +2022-01-01T02:10:00Z +2022-01-01T02:10:00Z +2022-01-01T02:10:00Z + +statement error +SELECT TIMESTAMPTZ '2023‐W38‐5' as iso8601_week_designation + +statement error +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+Foo' as bad_tz + +statement error +SELECT TIMESTAMPTZ '2022-01-01T01:10:00+42:00' as bad_tz + +query P rowsort +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 GMT' as ts_gmt +---- +2022-01-01T01:10:00Z + +statement error +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 GMT-1' as ts_gmt_offset + +# will not accept non-GMT geo abbr +# postgresql: accepts +statement error +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 AEST' + +# ok to use geo longform +query P rowsort +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 Australia/Sydney' as ts_geo + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 Antarctica/Vostok' as ts_geo + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 Africa/Johannesburg' as ts_geo + UNION ALL +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 America/Los_Angeles' as ts_geo +---- +2021-12-31T14:10:00Z +2021-12-31T19:10:00Z +2021-12-31T23:10:00Z +2022-01-01T09:10:00Z + +# geo longform timezones need whitespace converted to underscore +statement error +SELECT TIMESTAMPTZ '2022-01-01 01:10:00 America/Los Angeles' as ts_geo + +statement error +SELECT TIMESTAMPTZ 'Sat, 1 Jan 2022 01:10:00 GMT' as rfc1123 + + + +########## +## Timezone acceptance bounds +# +# daylight savings +########## + +# will not accept daylight savings designations as geo abbr (because not accepting geo abbr) +# postgresql: accepts +statement error +SELECT TIMESTAMPTZ '2023-03-12 02:00:00 EDT' + +# ok to use geo longform +query P +SELECT TIMESTAMPTZ '2023-03-11 02:00:00 America/Los_Angeles' as ts_geo +---- +2023-03-11T10:00:00Z + +# will error if provide geo longform with time not possible due to daylight savings +# Arrow error: Parser error: Error parsing timestamp from '2023-03-12 02:00:00 America/Los_Angeles': error computing timezone offset +# postgresql: accepts +statement error +SELECT TIMESTAMPTZ '2023-03-12 02:00:00 America/Los_Angeles' as ts_geo