Skip to content

Commit

Permalink
fix(5975/5976): timezone handling for timestamps and date_trunc, `d…
Browse files Browse the repository at this point in the history
…ate_part` and `date_bin` (#7614)

* test: enforce timestamptz contract

* test(5975/5976): demonstrate what logical plan casting must occur with datetime scalar functions.

  * These test cases also document how our scalar functions are currently not correct.

  * Extra comments documenting the logical plan will be removed on test cleanup (after code fixes).

* fix(5975/5976): enable type coercion to include specific timezones

  * Prior to this change, the outcome was always coerced to Timestamp(Nanoseconds, None)
and the tz was dropped.

* fix(5975/5976): have date_trunc use DateTime<Tz>, instead of NaiveDateTime

* chore(5975/5976): test cleanup -- consolidate into the single timestamps test file

* fix(5975/5976): enable all valid timezones to be supported in type coercion

* chore: update cargo.lock in datafusion-cli

* test(5975/5976): tests to document the bounds of timezone acceptance

* test(5975/5976): document irregular offsets and daylight savings time

* refactor(5975/5976): do not parse timezone string during type coersion, should have already failed in parser if invalid

* chore: properly abbreviate abbreviations

* fix(5975/5976): apply tz string parsing per batch.

  * move parsing up to date_trunc() to apply per batch, not per value.
  * do not infer a default UTC timezone for missing tz. Instead use the appropriate type for with, or without, tz.
  • Loading branch information
wiedld authored Sep 23, 2023
1 parent 9b45ed9 commit d19e9d6
Show file tree
Hide file tree
Showing 7 changed files with 648 additions and 101 deletions.
36 changes: 18 additions & 18 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions datafusion/core/tests/sql/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,14 @@ async fn test_encoding_expressions() -> Result<()> {
Ok(())
}

#[should_panic(expected = "Invalid timezone \\\"Foo\\\": 'Foo' is not a valid timezone")]
#[tokio::test]
async fn test_array_cast_invalid_timezone_will_panic() {
let ctx = SessionContext::new();
let sql = "SELECT arrow_cast('2021-01-02T03:04:00', 'Timestamp(Nanosecond, Some(\"Foo\"))')";
execute(&ctx, sql).await;
}

#[tokio::test]
#[cfg_attr(not(feature = "crypto_expressions"), ignore)]
async fn test_crypto_expressions() -> Result<()> {
Expand Down
1 change: 1 addition & 0 deletions datafusion/expr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ path = "src/lib.rs"
[dependencies]
ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] }
arrow = { workspace = true }
arrow-array = { workspace = true }
datafusion-common = { path = "../common", version = "31.0.0", default-features = false }
sqlparser = { workspace = true }
strum = { version = "0.25.0", features = ["derive"] }
Expand Down
32 changes: 30 additions & 2 deletions datafusion/expr/src/built_in_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1020,9 +1020,13 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::DateTrunc => Signature::one_of(
vec![
Exact(vec![Utf8, Timestamp(Nanosecond, None)]),
Exact(vec![Utf8, Timestamp(Nanosecond, Some("+TZ".into()))]),
Exact(vec![Utf8, Timestamp(Microsecond, None)]),
Exact(vec![Utf8, Timestamp(Microsecond, Some("+TZ".into()))]),
Exact(vec![Utf8, Timestamp(Millisecond, None)]),
Exact(vec![Utf8, Timestamp(Millisecond, Some("+TZ".into()))]),
Exact(vec![Utf8, Timestamp(Second, None)]),
Exact(vec![Utf8, Timestamp(Second, Some("+TZ".into()))]),
],
self.volatility(),
),
Expand All @@ -1034,16 +1038,37 @@ impl BuiltinScalarFunction {
Timestamp(array_type.clone(), None),
Timestamp(Nanosecond, None),
]),
Exact(vec![
Interval(MonthDayNano),
Timestamp(array_type.clone(), Some("+TZ".into())),
Timestamp(Nanosecond, Some("+TZ".into())),
]),
Exact(vec![
Interval(DayTime),
Timestamp(array_type.clone(), None),
Timestamp(Nanosecond, None),
]),
Exact(vec![
Interval(DayTime),
Timestamp(array_type.clone(), Some("+TZ".into())),
Timestamp(Nanosecond, Some("+TZ".into())),
]),
Exact(vec![
Interval(MonthDayNano),
Timestamp(array_type.clone(), None),
]),
Exact(vec![
Interval(MonthDayNano),
Timestamp(array_type.clone(), Some("+TZ".into())),
]),
Exact(vec![
Interval(DayTime),
Timestamp(array_type.clone(), None),
]),
Exact(vec![Interval(DayTime), Timestamp(array_type, None)]),
Exact(vec![
Interval(DayTime),
Timestamp(array_type, Some("+TZ".into())),
]),
]
};

Expand All @@ -1060,10 +1085,13 @@ impl BuiltinScalarFunction {
Exact(vec![Utf8, Date32]),
Exact(vec![Utf8, Date64]),
Exact(vec![Utf8, Timestamp(Second, None)]),
Exact(vec![Utf8, Timestamp(Second, Some("+TZ".into()))]),
Exact(vec![Utf8, Timestamp(Microsecond, None)]),
Exact(vec![Utf8, Timestamp(Microsecond, Some("+TZ".into()))]),
Exact(vec![Utf8, Timestamp(Millisecond, None)]),
Exact(vec![Utf8, Timestamp(Millisecond, Some("+TZ".into()))]),
Exact(vec![Utf8, Timestamp(Nanosecond, None)]),
Exact(vec![Utf8, Timestamp(Nanosecond, Some("+00:00".into()))]),
Exact(vec![Utf8, Timestamp(Nanosecond, Some("+TZ".into()))]),
],
self.volatility(),
),
Expand Down
135 changes: 85 additions & 50 deletions datafusion/expr/src/type_coercion/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ fn maybe_data_types(
new_type.push(current_type.clone())
} else {
// attempt to coerce
if can_coerce_from(valid_type, current_type) {
if let Some(valid_type) = coerced_from(valid_type, current_type) {
new_type.push(valid_type.clone())
} else {
// not possible
Expand All @@ -136,62 +136,97 @@ fn maybe_data_types(
///
/// See the module level documentation for more detail on coercion.
pub fn can_coerce_from(type_into: &DataType, type_from: &DataType) -> bool {
use self::DataType::*;

if type_into == type_from {
return true;
}
// Null can convert to most of types
if let Some(coerced) = coerced_from(type_into, type_from) {
return coerced == type_into;
}
false
}

fn coerced_from<'a>(
type_into: &'a DataType,
type_from: &'a DataType,
) -> Option<&'a DataType> {
use self::DataType::*;

match type_into {
Int8 => matches!(type_from, Null | Int8),
Int16 => matches!(type_from, Null | Int8 | Int16 | UInt8),
Int32 => matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 | UInt16),
Int64 => matches!(
type_from,
Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32
),
UInt8 => matches!(type_from, Null | UInt8),
UInt16 => matches!(type_from, Null | UInt8 | UInt16),
UInt32 => matches!(type_from, Null | UInt8 | UInt16 | UInt32),
UInt64 => matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64),
Float32 => matches!(
type_from,
Null | Int8
| Int16
| Int32
| Int64
| UInt8
| UInt16
| UInt32
| UInt64
| Float32
),
Float64 => matches!(
type_from,
Null | Int8
| Int16
| Int32
| Int64
| UInt8
| UInt16
| UInt32
| UInt64
| Float32
| Float64
| Decimal128(_, _)
),
Timestamp(TimeUnit::Nanosecond, _) => {
matches!(
// coerced into type_into
Int8 if matches!(type_from, Null | Int8) => Some(type_into),
Int16 if matches!(type_from, Null | Int8 | Int16 | UInt8) => Some(type_into),
Int32 if matches!(type_from, Null | Int8 | Int16 | Int32 | UInt8 | UInt16) => {
Some(type_into)
}
Int64
if matches!(
type_from,
Null | Timestamp(_, _) | Date32 | Utf8 | LargeUtf8
)
Null | Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32
) =>
{
Some(type_into)
}
Interval(_) => {
matches!(type_from, Utf8 | LargeUtf8)
UInt8 if matches!(type_from, Null | UInt8) => Some(type_into),
UInt16 if matches!(type_from, Null | UInt8 | UInt16) => Some(type_into),
UInt32 if matches!(type_from, Null | UInt8 | UInt16 | UInt32) => Some(type_into),
UInt64 if matches!(type_from, Null | UInt8 | UInt16 | UInt32 | UInt64) => {
Some(type_into)
}
Utf8 | LargeUtf8 => true,
Null => can_cast_types(type_from, type_into),
_ => false,
Float32
if matches!(
type_from,
Null | Int8
| Int16
| Int32
| Int64
| UInt8
| UInt16
| UInt32
| UInt64
| Float32
) =>
{
Some(type_into)
}
Float64
if matches!(
type_from,
Null | Int8
| Int16
| Int32
| Int64
| UInt8
| UInt16
| UInt32
| UInt64
| Float32
| Float64
| Decimal128(_, _)
) =>
{
Some(type_into)
}
Timestamp(TimeUnit::Nanosecond, None)
if matches!(
type_from,
Null | Timestamp(_, None) | Date32 | Utf8 | LargeUtf8
) =>
{
Some(type_into)
}
Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => Some(type_into),
Utf8 | LargeUtf8 => Some(type_into),
Null if can_cast_types(type_from, type_into) => Some(type_into),

// Coerce to consistent timezones, if the `type_from` timezone exists.
Timestamp(TimeUnit::Nanosecond, Some(_))
if matches!(type_from, Timestamp(TimeUnit::Nanosecond, Some(_))) =>
{
Some(type_from)
}

// cannot coerce
_ => None,
}
}

Expand Down
Loading

0 comments on commit d19e9d6

Please sign in to comment.