From e19cbb297bc425441c04247a98dc890b22d1adef Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Tue, 14 Mar 2023 13:32:40 +0800 Subject: [PATCH 01/11] use same PartialEq + Eq + Hash + PartialOrd + Ord logic as PostgreSQL to avoid overflow --- src/common/src/types/interval.rs | 78 +++++++++++++++++--------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 153eadc9076a..8730b95d3df0 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -485,6 +485,47 @@ impl std::fmt::Debug for IntervalUnitDisplay<'_> { } } +/// +/// +/// Do NOT make this `pub` as the assumption of 1 month = 30 days and 1 day = 24 hours does not +/// always hold in other places. +#[derive(PartialEq, Eq, Hash, PartialOrd, Ord)] +struct IntervalCmpValue(i128); + +impl From for IntervalCmpValue { + fn from(value: IntervalUnit) -> Self { + let days = (value.days as i64) + 30i64 * (value.months as i64); + let usecs = (value.usecs as i128) + (USECS_PER_DAY as i128) * (days as i128); + Self(usecs) + } +} + +impl Ord for IntervalUnit { + fn cmp(&self, other: &Self) -> Ordering { + IntervalCmpValue::from(*self).cmp(&(*other).into()) + } +} + +impl PartialOrd for IntervalUnit { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for IntervalUnit { + fn eq(&self, other: &Self) -> bool { + self.cmp(other).is_eq() + } +} + +impl Eq for IntervalUnit {} + +impl Hash for IntervalUnit { + fn hash(&self, state: &mut H) { + IntervalCmpValue::from(*self).hash(state); + } +} + /// Loss of information during the process due to `justify`. Only intended for memcomparable /// encoding. impl Serialize for IntervalUnit { @@ -565,43 +606,6 @@ impl Add for IntervalUnit { } } -impl PartialOrd for IntervalUnit { - fn partial_cmp(&self, other: &Self) -> Option { - if self.eq(other) { - Some(Ordering::Equal) - } else { - let diff = *self - *other; - let days = diff.months as i64 * 30 + diff.days as i64; - Some((days * USECS_PER_DAY + diff.usecs).cmp(&0)) - } - } -} - -impl Hash for IntervalUnit { - fn hash(&self, state: &mut H) { - let interval = self.justified(); - interval.months.hash(state); - interval.usecs.hash(state); - interval.days.hash(state); - } -} - -impl PartialEq for IntervalUnit { - fn eq(&self, other: &Self) -> bool { - let interval = self.justified(); - let other = other.justified(); - interval.days == other.days && interval.usecs == other.usecs - } -} - -impl Eq for IntervalUnit {} - -impl Ord for IntervalUnit { - fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other).unwrap() - } -} - impl CheckedNeg for IntervalUnit { fn checked_neg(&self) -> Option { let months = self.months.checked_neg()?; From d79e4a78047c9b5bf8a6a208712f232ed40779e8 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Tue, 14 Mar 2023 14:24:18 +0800 Subject: [PATCH 02/11] serde --- src/common/src/hash/key.rs | 27 ++-------------- src/common/src/types/interval.rs | 53 ++++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/src/common/src/hash/key.rs b/src/common/src/hash/key.rs index 95a6d6c58189..2762fcbbe512 100644 --- a/src/common/src/hash/key.rs +++ b/src/common/src/hash/key.rs @@ -39,8 +39,8 @@ use crate::collection::estimate_size::EstimateSize; use crate::hash::VirtualNode; use crate::row::{OwnedRow, RowDeserializer}; use crate::types::{ - DataType, Decimal, IntervalUnit, NaiveDateTimeWrapper, NaiveDateWrapper, NaiveTimeWrapper, - OrderedF32, OrderedF64, ScalarRef, + DataType, Decimal, NaiveDateTimeWrapper, NaiveDateWrapper, NaiveTimeWrapper, OrderedF32, + OrderedF64, ScalarRef, }; use crate::util::hash_util::Crc32FastBuilder; use crate::util::iter_util::ZipEqFast; @@ -357,29 +357,6 @@ impl HashKeySerDe<'_> for Decimal { } } -impl HashKeySerDe<'_> for IntervalUnit { - type S = [u8; 16]; - - fn serialize(mut self) -> Self::S { - self.justify_interval(); - let mut ret = [0; 16]; - ret[0..4].copy_from_slice(&self.get_months().to_ne_bytes()); - ret[4..8].copy_from_slice(&self.get_days().to_ne_bytes()); - ret[8..16].copy_from_slice(&self.get_usecs().to_ne_bytes()); - - ret - } - - fn deserialize(source: &mut R) -> Self { - let value = Self::read_fixed_size_bytes::(source); - IntervalUnit::from_month_day_usec( - i32::from_ne_bytes(value[0..4].try_into().unwrap()), - i32::from_ne_bytes(value[4..8].try_into().unwrap()), - i64::from_ne_bytes(value[8..16].try_into().unwrap()), - ) - } -} - impl<'a> HashKeySerDe<'a> for &'a str { type S = Vec; diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 8730b95d3df0..17999964a04e 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -533,13 +533,23 @@ impl Serialize for IntervalUnit { where S: serde::Serializer, { - let IntervalUnit { - months, - days, - usecs, - } = self.justified(); - // serialize the `IntervalUnit` as a tuple - (months, days, usecs).serialize(serializer) + let cmp_value = IntervalCmpValue::from(*self); + // split i128 as (i64, u64), which is equivalent + ( + (cmp_value.0 >> 64) as i64, + cmp_value.0 as u64, // truncate to get the lower part + ) + .serialize(serializer) + } +} + +impl IntervalCmpValue { + fn as_justified(&self) -> Option { + todo!() + } + + fn as_alternate(&self) -> IntervalUnit { + todo!() } } @@ -548,12 +558,29 @@ impl<'de> Deserialize<'de> for IntervalUnit { where D: serde::Deserializer<'de>, { - let (months, days, usecs) = <(i32, i32, i64)>::deserialize(deserializer)?; - Ok(Self { - months, - days, - usecs, - }) + let (hi, lo) = <(i64, u64)>::deserialize(deserializer)?; + let cmp_value = IntervalCmpValue(((hi as i128) << 64) | (lo as i128)); + let interval = cmp_value + .as_justified() + .unwrap_or_else(|| cmp_value.as_alternate()); + Ok(interval) + } +} + +impl crate::hash::HashKeySerDe<'_> for IntervalUnit { + type S = [u8; 16]; + + fn serialize(self) -> Self::S { + let cmp_value = IntervalCmpValue::from(self); + cmp_value.0.to_ne_bytes() + } + + fn deserialize(source: &mut R) -> Self { + let value = Self::read_fixed_size_bytes::(source); + let cmp_value = IntervalCmpValue(i128::from_ne_bytes(value)); + cmp_value + .as_justified() + .unwrap_or_else(|| cmp_value.as_alternate()) } } From a3fdb7068ecc0031adf3e8e637ba244756ca5329 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Tue, 14 Mar 2023 15:26:01 +0800 Subject: [PATCH 03/11] as_justified and as_alternate --- src/common/src/types/interval.rs | 64 ++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 8 deletions(-) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 17999964a04e..db2626ee9d15 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -545,11 +545,55 @@ impl Serialize for IntervalUnit { impl IntervalCmpValue { fn as_justified(&self) -> Option { - todo!() - } - - fn as_alternate(&self) -> IntervalUnit { - todo!() + let usecs = (self.0 % (USECS_PER_DAY as i128)) as i64; + let remaining_days = self.0 / (USECS_PER_DAY as i128); + let days = (remaining_days % 30) as i32; + let months = (remaining_days / 30).try_into().ok()?; + Some(IntervalUnit::from_month_day_usec(months, days, usecs)) + } + + fn as_alternate(&self) -> Option { + match self.0.cmp(&0) { + Ordering::Equal => Some(IntervalUnit::from_month_day_usec(0, 0, 0)), + Ordering::Greater => { + let remaining_usecs = self.0; + let mut usecs = (remaining_usecs % (USECS_PER_DAY as i128)) as i64; + let mut remaining_days = remaining_usecs / (USECS_PER_DAY as i128); + let extra_days = ((i64::MAX - usecs) / USECS_PER_DAY) + .min(remaining_days.try_into().unwrap_or(i64::MAX)); + usecs += extra_days * USECS_PER_DAY; + remaining_days -= extra_days as i128; + + let mut days = (remaining_days % 30) as i32; + let mut remaining_months = remaining_days / 30; + let extra_months = + ((i32::MAX - days) / 30).min(remaining_months.try_into().unwrap_or(i32::MAX)); + days += extra_months * 30; + remaining_months -= extra_months as i128; + + let months = remaining_months.try_into().ok()?; + Some(IntervalUnit::from_month_day_usec(months, days, usecs)) + } + Ordering::Less => { + let remaining_usecs = self.0; + let mut usecs = (remaining_usecs % (USECS_PER_DAY as i128)) as i64; + let mut remaining_days = remaining_usecs / (USECS_PER_DAY as i128); + let extra_days = ((i64::MIN - usecs) / USECS_PER_DAY) + .max(remaining_days.try_into().unwrap_or(i64::MIN)); + usecs += extra_days * USECS_PER_DAY; + remaining_days -= extra_days as i128; + + let mut days = (remaining_days % 30) as i32; + let mut remaining_months = remaining_days / 30; + let extra_months = + ((i32::MIN - days) / 30).max(remaining_months.try_into().unwrap_or(i32::MIN)); + days += extra_months * 30; + remaining_months -= extra_months as i128; + + let months = remaining_months.try_into().ok()?; + Some(IntervalUnit::from_month_day_usec(months, days, usecs)) + } + } } } @@ -562,8 +606,11 @@ impl<'de> Deserialize<'de> for IntervalUnit { let cmp_value = IntervalCmpValue(((hi as i128) << 64) | (lo as i128)); let interval = cmp_value .as_justified() - .unwrap_or_else(|| cmp_value.as_alternate()); - Ok(interval) + .or_else(|| cmp_value.as_alternate()); + interval.ok_or_else(|| { + use serde::de::Error as _; + D::Error::custom("memcomparable deserialize interval overflow") + }) } } @@ -580,7 +627,8 @@ impl crate::hash::HashKeySerDe<'_> for IntervalUnit { let cmp_value = IntervalCmpValue(i128::from_ne_bytes(value)); cmp_value .as_justified() - .unwrap_or_else(|| cmp_value.as_alternate()) + .or_else(|| cmp_value.as_alternate()) + .expect("HashKey deserialize interval overflow") } } From a92ab588f31c63e37e04efc2334037e76be9753e Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Tue, 14 Mar 2023 15:52:38 +0800 Subject: [PATCH 04/11] justify_interval --- src/common/src/types/interval.rs | 52 ++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index db2626ee9d15..de4e238a3f8d 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -81,23 +81,40 @@ impl IntervalUnit { self.usecs.rem_euclid(USECS_PER_DAY) as u64 } - /// Justify interval, convert 1 month to 30 days and 86400 s to 1 day. - /// If day is positive, complement the seconds negative value. - /// These rules only use in interval comparison. - pub fn justify_interval(&mut self) { - #[expect(deprecated)] - let total_usecs = self.as_usecs_i64(); - *self = Self { - months: 0, - days: (total_usecs / USECS_PER_DAY) as i32, - usecs: total_usecs % USECS_PER_DAY, + /// Justify interval, convert 24 hours to 1 day and 30 days to 1 month. + /// Also makes signs of all fields to be the same. + /// + /// + pub fn justify_interval(&self) -> Option { + let mut v = *self; + if v.days > 0 && v.usecs > 0 || v.days < 0 && v.usecs < 0 { + v.months = v.months.checked_add(v.days / 30)?; + v.days %= 30; + } + + v.days += (v.usecs / USECS_PER_DAY) as i32; + v.usecs %= USECS_PER_DAY; + + v.months = v.months.checked_add(v.days / 30)?; + v.days %= 30; + + if v.months > 0 && (v.days < 0 || v.days == 0 && v.usecs < 0) { + v.days += 30; + v.months -= 1; + } else if v.months < 0 && (v.days > 0 || v.days == 0 && v.usecs > 0) { + v.days -= 30; + v.months += 1; + } + + if v.days > 0 && v.usecs < 0 { + v.usecs += USECS_PER_DAY; + v.days -= 1; + } else if v.days < 0 && v.usecs > 0 { + v.usecs -= USECS_PER_DAY; + v.days += 1; } - } - pub fn justified(&self) -> Self { - let mut interval = *self; - interval.justify_interval(); - interval + Some(v) } #[deprecated] @@ -747,10 +764,7 @@ impl Zero for IntervalUnit { impl IsNegative for IntervalUnit { fn is_negative(&self) -> bool { - let i = self.justified(); - i.months < 0 - || (i.months == 0 && i.days < 0) - || (i.months == 0 && i.days == 0 && i.usecs < 0) + self < &Self::from_month_day_usec(0, 0, 0) } } From a66e9cf1864f1bd0477cadf686a1ac856e1de574 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Tue, 14 Mar 2023 17:58:08 +0800 Subject: [PATCH 05/11] unit tests --- src/common/src/types/interval.rs | 88 ++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index de4e238a3f8d..baac9572bd8f 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -1392,4 +1392,92 @@ mod tests { assert_eq!(lhs.cmp(&rhs), order) } } + + #[test] + fn test_justify() { + let cases = [ + ( + (0, 0, USECS_PER_MONTH * 2 + USECS_PER_DAY * 3 + 4), + Some((2, 3, 4i64, "2 mons 3 days 00:00:00.000004")), + ), + ((i32::MIN, i32::MIN, i64::MIN), None), + ((i32::MAX, i32::MAX, i64::MAX), None), + ( + (0, i32::MIN, i64::MIN), + Some(( + -75141187, + -29, + -14454775808, + "-6261765 years -7 mons -29 days -04:00:54.775808", + )), + ), + ( + (i32::MIN, -60, i64::MAX), + Some(( + -2143925250, + -8, + -71945224193, + "-178660437 years -6 mons -8 days -19:59:05.224193", + )), + ), + ]; + for ((lhs_months, lhs_days, lhs_usecs), rhs) in cases { + let input = IntervalUnit::from_month_day_usec(lhs_months, lhs_days, lhs_usecs); + let actual = input.justify_interval(); + let actual_deserialize = IntervalCmpValue::from(input).as_justified(); + + match rhs { + None => { + assert_eq!(actual, None); + assert_eq!(actual_deserialize, None); + } + Some((rhs_months, rhs_days, rhs_usecs, rhs_str)) => { + // We should test individual fields rather than using custom `Eq` + assert_eq!(actual.unwrap().get_months(), rhs_months); + assert_eq!(actual.unwrap().get_days(), rhs_days); + assert_eq!(actual.unwrap().get_usecs(), rhs_usecs); + assert_eq!(actual_deserialize.unwrap().get_months(), rhs_months); + assert_eq!(actual_deserialize.unwrap().get_days(), rhs_days); + assert_eq!(actual_deserialize.unwrap().get_usecs(), rhs_usecs); + assert_eq!(actual.unwrap().to_string(), rhs_str); + } + } + } + } + + #[test] + fn test_deserialize_alternate() { + let cases = [ + (0, 0, USECS_PER_MONTH * 2 + USECS_PER_DAY * 3 + 4), + (i32::MIN, i32::MIN, i64::MIN), + (i32::MAX, i32::MAX, i64::MAX), + (0, i32::MIN, i64::MIN), + (i32::MIN, -60, i64::MAX), + ]; + for (months, days, usecs) in cases { + let input = IntervalUnit::from_month_day_usec(months, days, usecs); + + let mut serializer = memcomparable::Serializer::new(vec![]); + input.serialize(&mut serializer).unwrap(); + let buf = serializer.into_inner(); + let mut deserializer = memcomparable::Deserializer::new(&buf[..]); + let actual = IntervalUnit::deserialize(&mut deserializer).unwrap(); + + // The IntervalUnit we get back can be a different one, but they should be equal. + assert_eq!(actual, input); + } + + // Decoding invalid value + let mut serializer = memcomparable::Serializer::new(vec![]); + (i64::MAX, u64::MAX).serialize(&mut serializer).unwrap(); + let buf = serializer.into_inner(); + let mut deserializer = memcomparable::Deserializer::new(&buf[..]); + assert!(IntervalUnit::deserialize(&mut deserializer).is_err()); + + let buf = i128::MIN.to_ne_bytes(); + std::panic::catch_unwind(|| { + ::deserialize(&mut &buf[..]) + }) + .unwrap_err(); + } } From b95a9069cba65fac2b26c227602d2de253ce3096 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Wed, 15 Mar 2023 12:39:43 +0800 Subject: [PATCH 06/11] regress test as e2e slt --- e2e_test/batch/types/interval.slt.part | 51 +++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/e2e_test/batch/types/interval.slt.part b/e2e_test/batch/types/interval.slt.part index 16c688707485..5e9f7287e969 100644 --- a/e2e_test/batch/types/interval.slt.part +++ b/e2e_test/batch/types/interval.slt.part @@ -74,20 +74,20 @@ SELECT INTERVAL '3 mins' * 1.5; 00:04:30 # https://github.com/risingwavelabs/risingwave/issues/3873 -query TTTTT +query T select distinct * from (values (interval '1' month), (interval '30' day)) as t; ---- -30 days +1 mon -query TTTTT +query T select distinct * from (values (interval '30' day), (interval '1' month)) as t; ---- -30 days +1 mon -query TTTTT +query T select distinct * from (values (interval '720' hour), (interval '1' month)) as t; ---- -30 days +1 mon query TTTTTT select interval '1 year 1 month 1 day 1'; @@ -119,3 +119,42 @@ query T select '1 mons 1 days 00:00:00.0000001'::INTERVAL; ---- 1 mon 1 day + +# Tests moved from regress tests due to not matching exactly. + +# PostgreSQL outputs `-1 mons +1 day` while we output `-1 mons 1 day`. +# But the main purpose of this test case is ordering of large values. + +statement ok +CREATE TABLE INTERVAL_TBL_OF (f1 interval); + +statement ok +INSERT INTO INTERVAL_TBL_OF (f1) VALUES + ('2147483647 days 2147483647 months'), + ('2147483647 days -2147483648 months'), + ('1 year'), + ('-2147483648 days 2147483647 months'), + ('-2147483648 days -2147483648 months'); + +statement ok +FLUSH; + +query TT +SELECT r1.*, r2.* + FROM INTERVAL_TBL_OF r1, INTERVAL_TBL_OF r2 + WHERE r1.f1 > r2.f1 + ORDER BY r1.f1, r2.f1; +---- +-178956970 years -8 mons 2147483647 days -178956970 years -8 mons -2147483648 days +1 year -178956970 years -8 mons -2147483648 days +1 year -178956970 years -8 mons 2147483647 days +178956970 years 7 mons -2147483648 days -178956970 years -8 mons -2147483648 days +178956970 years 7 mons -2147483648 days -178956970 years -8 mons 2147483647 days +178956970 years 7 mons -2147483648 days 1 year +178956970 years 7 mons 2147483647 days -178956970 years -8 mons -2147483648 days +178956970 years 7 mons 2147483647 days -178956970 years -8 mons 2147483647 days +178956970 years 7 mons 2147483647 days 1 year +178956970 years 7 mons 2147483647 days 178956970 years 7 mons -2147483648 days + +statement ok +DROP TABLE INTERVAL_TBL_OF; From 75fc2012b2d41555a1d32614ad5b11d4869420d5 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Wed, 15 Mar 2023 14:31:04 +0800 Subject: [PATCH 07/11] comments --- src/common/src/types/interval.rs | 43 ++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index baac9572bd8f..0d6225950f28 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -506,6 +506,42 @@ impl std::fmt::Debug for IntervalUnitDisplay<'_> { /// /// Do NOT make this `pub` as the assumption of 1 month = 30 days and 1 day = 24 hours does not /// always hold in other places. +/// +/// Given this equality definition in PostgreSQL, different interval values can be considered equal, +/// forming equivalence classes. For example: +/// * '-45 days' == '-1 months -15 days' == '1 months -75 days' +/// * '-2147483646 months -210 days' == '-2147483648 months -150 days' == '-2075900865 months +/// -2147483640 days' +/// +/// To hash and memcompare them, we need to pick a representative for each equivalence class, and +/// then map all values from the same equivalence class to the same representative. There are 3 +/// choices (may be more): +/// (a) an `i128` of total `usecs`, with `months` and `days` transformed into `usecs`; +/// (b) the justified interval, as defined by PostgreSQL `justify_interval`; +/// (c) the alternate representative interval that maximizes `abs` of smaller units; +/// +/// For simplicity we will assume there are only `months` and `days` and ignore `usecs` below. +/// +/// The justified interval is more human friendly. However, it may overflow. In the 2 examples +/// above, '-1 months -15 days' is the justified interval of the first equivalence class, but there +/// is no justified interval in the second one. It would be '-2147483653 months' but this overflows +/// `i32`. A lot of bits are wasted in a justified interval because `months` is using `i32` for +/// `-29..=29` only. +/// +/// The alternate representative interval aims to avoid this overflow. It still requires all units +/// to have the same sign, but maximizes `abs` of smaller unit rather than limit it to `29`. The +/// alternate representative of the 2 examples above are '-45 days' and '-2075900865 months +/// -2147483640 days'. The alternate representative interval always exists. +/// +/// For serialize, we could use any of 3. But justified interval requires (i33, i6, i38), and +/// alternate representative interval is not intuitive. +/// +/// For deserialize, we attempt justified interval first and fallback to alternate. This could give +/// human friendly results in common cases and still guarantee no overflow, as long as the bytes +/// were serialized properly. +/// +/// Note the alternate representative interval does not exist in PostgreSQL as they do not +/// deserialize from `IntervalCmpValue`. #[derive(PartialEq, Eq, Hash, PartialOrd, Ord)] struct IntervalCmpValue(i128); @@ -543,8 +579,8 @@ impl Hash for IntervalUnit { } } -/// Loss of information during the process due to `justify`. Only intended for memcomparable -/// encoding. +/// Loss of information during the process due to `IntervalCmpValue`. Only intended for +/// memcomparable encoding. impl Serialize for IntervalUnit { fn serialize(&self, serializer: S) -> std::result::Result where @@ -561,6 +597,7 @@ impl Serialize for IntervalUnit { } impl IntervalCmpValue { + /// Recover the justified interval from this equivalence class, if it exists. fn as_justified(&self) -> Option { let usecs = (self.0 % (USECS_PER_DAY as i128)) as i64; let remaining_days = self.0 / (USECS_PER_DAY as i128); @@ -569,6 +606,8 @@ impl IntervalCmpValue { Some(IntervalUnit::from_month_day_usec(months, days, usecs)) } + /// Recover the alternate representative interval from this equivalence class. + /// It always exists unless the encoding is invalid. See [`IntervalCmpValue`] for details. fn as_alternate(&self) -> Option { match self.0.cmp(&0) { Ordering::Equal => Some(IntervalUnit::from_month_day_usec(0, 0, 0)), From e937d294f4105534fa6c024635dccb4abfd5544d Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Wed, 15 Mar 2023 18:03:41 +0800 Subject: [PATCH 08/11] more comments and tests about `justify_interval` from PostgreSQL --- src/common/src/types/interval.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 0d6225950f28..3fc4e2a05cf8 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -87,14 +87,25 @@ impl IntervalUnit { /// pub fn justify_interval(&self) -> Option { let mut v = *self; + // When `days` and `usecs` have the same sign, `usecs` could potentially push `days` over + // `i32::MAX` or `i32::MIN`, for example in (0, i32::MIN, i64::MIN). So we attempt to + // restrict `abs(days) < 30` first. + // When they have different signs, restricting `days` first could lead to false positive of + // overflow, for example in (i32::MIN, -60, i64::MAX). So this step should be skipped. if v.days > 0 && v.usecs > 0 || v.days < 0 && v.usecs < 0 { v.months = v.months.checked_add(v.days / 30)?; v.days %= 30; } + // Either `days` and `usecs` are of different signs, or we have already limited + // `abs(days) < 30` above. It is impossible to overflow here because + // `-29 + i64::MIN / USECS_PER_DAY > i32::MIN` v.days += (v.usecs / USECS_PER_DAY) as i32; v.usecs %= USECS_PER_DAY; + // Unfortunately this could still generate false positive of overflow due to lack of sign + // adjustment below, for example in (i32::MIN, -30, 1). This is the same behavior as + // PostgreSQL, but I believe we (and they) would like to have it fixed. v.months = v.months.checked_add(v.days / 30)?; v.days %= 30; @@ -1482,6 +1493,17 @@ mod tests { } } } + + // A false positive overflow that is buggy in PostgreSQL 15.2 as well. + let input = IntervalUnit::from_month_day_usec(i32::MIN, -30, 1); + let actual = input.justify_interval(); + let actual_deserialize = IntervalCmpValue::from(input).as_justified(); + // It has a justified interval within range, and can be obtained by our deserialization. + assert_eq!(actual_deserialize.unwrap().get_months(), i32::MIN); + assert_eq!(actual_deserialize.unwrap().get_days(), -29); + assert_eq!(actual_deserialize.unwrap().get_usecs(), -USECS_PER_DAY + 1); + // But the PostgreSQL `justify_interval` function reports overflow. + assert_eq!(actual, None); } #[test] From aa925dfbcbcf55422e155965b55a20572d761b01 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Wed, 15 Mar 2023 18:46:17 +0800 Subject: [PATCH 09/11] comments --- src/common/src/types/interval.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 3fc4e2a05cf8..4440e00232bd 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -85,6 +85,9 @@ impl IntervalUnit { /// Also makes signs of all fields to be the same. /// /// + /// + /// Be careful when using this as the normalized form, as it is not a total function and may not + /// exist for certain values (e.g. [`Self::MIN`]). See [`IntervalCmpValue`] for details. pub fn justify_interval(&self) -> Option { let mut v = *self; // When `days` and `usecs` have the same sign, `usecs` could potentially push `days` over From a535823a349a0e071ac5053bfc73d8644ca6a633 Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Thu, 16 Mar 2023 13:15:08 +0800 Subject: [PATCH 10/11] improve messaging --- e2e_test/batch/types/interval.slt.part | 4 +++- src/common/src/types/interval.rs | 17 +++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/e2e_test/batch/types/interval.slt.part b/e2e_test/batch/types/interval.slt.part index 5e9f7287e969..73bd7189941b 100644 --- a/e2e_test/batch/types/interval.slt.part +++ b/e2e_test/batch/types/interval.slt.part @@ -122,7 +122,9 @@ select '1 mons 1 days 00:00:00.0000001'::INTERVAL; # Tests moved from regress tests due to not matching exactly. -# PostgreSQL outputs `-1 mons +1 day` while we output `-1 mons 1 day`. +# In mixed sign intervals, PostgreSQL displays positive sign after negative +# (e.g. `-1 mons +1 day`) while we display it without sign +# (e.g. `-1 mons 1 day`). # But the main purpose of this test case is ordering of large values. statement ok diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 4440e00232bd..624f94591e82 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -24,6 +24,7 @@ use bytes::BytesMut; use num_traits::{CheckedAdd, CheckedNeg, CheckedSub, Zero}; use postgres_types::{to_sql_checked, FromSql}; use risingwave_pb::data::IntervalUnit as IntervalUnitProto; +use static_assertions::const_assert; use super::ops::IsNegative; use super::to_binary::ToBinary; @@ -103,6 +104,8 @@ impl IntervalUnit { // Either `days` and `usecs` are of different signs, or we have already limited // `abs(days) < 30` above. It is impossible to overflow here because // `-29 + i64::MIN / USECS_PER_DAY > i32::MIN` + const_assert!(29 + i64::MAX / USECS_PER_DAY <= (i32::MAX as i64)); + const_assert!(-29 + i64::MIN / USECS_PER_DAY >= (i32::MIN as i64)); v.days += (v.usecs / USECS_PER_DAY) as i32; v.usecs %= USECS_PER_DAY; @@ -547,8 +550,8 @@ impl std::fmt::Debug for IntervalUnitDisplay<'_> { /// alternate representative of the 2 examples above are '-45 days' and '-2075900865 months /// -2147483640 days'. The alternate representative interval always exists. /// -/// For serialize, we could use any of 3. But justified interval requires (i33, i6, i38), and -/// alternate representative interval is not intuitive. +/// For serialize, we could use any of 3, with a workaround of using (i33, i6, i38) rather than +/// (i32, i32, i64) to avoid overflow of the justified interval. We chose the `usecs: i128` option. /// /// For deserialize, we attempt justified interval first and fallback to alternate. This could give /// human friendly results in common cases and still guarantee no overflow, as long as the bytes @@ -629,11 +632,19 @@ impl IntervalCmpValue { let remaining_usecs = self.0; let mut usecs = (remaining_usecs % (USECS_PER_DAY as i128)) as i64; let mut remaining_days = remaining_usecs / (USECS_PER_DAY as i128); + // `usecs` is now smaller than `USECS_PER_DAY` but has 64 bits. + // How much more days (multiples of `USECS_PER_DAY`) can it hold before overflowing + // i64::MAX? + // It should also not exceed `remaining_days` to bring it from positive to negative. + // When `remaining_days` is larger than `i64::MAX`, just limit by `i64::MAX` (no-op) let extra_days = ((i64::MAX - usecs) / USECS_PER_DAY) .min(remaining_days.try_into().unwrap_or(i64::MAX)); + // The lhs of `min` ensures `extra_days * USECS_PER_DAY <= i64::MAX - usecs` usecs += extra_days * USECS_PER_DAY; + // The rhs of `min` ensures `extra_days <= remaining_days` remaining_days -= extra_days as i128; + // Similar above let mut days = (remaining_days % 30) as i32; let mut remaining_months = remaining_days / 30; let extra_months = @@ -648,6 +659,8 @@ impl IntervalCmpValue { let remaining_usecs = self.0; let mut usecs = (remaining_usecs % (USECS_PER_DAY as i128)) as i64; let mut remaining_days = remaining_usecs / (USECS_PER_DAY as i128); + // The negative case. Borrow negative `extra_days` to make `usecs` as close to + // `i64::MIN` as possible. let extra_days = ((i64::MIN - usecs) / USECS_PER_DAY) .max(remaining_days.try_into().unwrap_or(i64::MIN)); usecs += extra_days * USECS_PER_DAY; From e00c1b04e8f42799866f3e86427e34166f566c5c Mon Sep 17 00:00:00 2001 From: Xiangjin Date: Thu, 16 Mar 2023 13:24:20 +0800 Subject: [PATCH 11/11] remove `justify_interval` before it is exposed to user --- src/common/src/types/interval.rs | 78 ++++---------------------------- 1 file changed, 9 insertions(+), 69 deletions(-) diff --git a/src/common/src/types/interval.rs b/src/common/src/types/interval.rs index 624f94591e82..0081dadb173c 100644 --- a/src/common/src/types/interval.rs +++ b/src/common/src/types/interval.rs @@ -24,7 +24,6 @@ use bytes::BytesMut; use num_traits::{CheckedAdd, CheckedNeg, CheckedSub, Zero}; use postgres_types::{to_sql_checked, FromSql}; use risingwave_pb::data::IntervalUnit as IntervalUnitProto; -use static_assertions::const_assert; use super::ops::IsNegative; use super::to_binary::ToBinary; @@ -82,58 +81,6 @@ impl IntervalUnit { self.usecs.rem_euclid(USECS_PER_DAY) as u64 } - /// Justify interval, convert 24 hours to 1 day and 30 days to 1 month. - /// Also makes signs of all fields to be the same. - /// - /// - /// - /// Be careful when using this as the normalized form, as it is not a total function and may not - /// exist for certain values (e.g. [`Self::MIN`]). See [`IntervalCmpValue`] for details. - pub fn justify_interval(&self) -> Option { - let mut v = *self; - // When `days` and `usecs` have the same sign, `usecs` could potentially push `days` over - // `i32::MAX` or `i32::MIN`, for example in (0, i32::MIN, i64::MIN). So we attempt to - // restrict `abs(days) < 30` first. - // When they have different signs, restricting `days` first could lead to false positive of - // overflow, for example in (i32::MIN, -60, i64::MAX). So this step should be skipped. - if v.days > 0 && v.usecs > 0 || v.days < 0 && v.usecs < 0 { - v.months = v.months.checked_add(v.days / 30)?; - v.days %= 30; - } - - // Either `days` and `usecs` are of different signs, or we have already limited - // `abs(days) < 30` above. It is impossible to overflow here because - // `-29 + i64::MIN / USECS_PER_DAY > i32::MIN` - const_assert!(29 + i64::MAX / USECS_PER_DAY <= (i32::MAX as i64)); - const_assert!(-29 + i64::MIN / USECS_PER_DAY >= (i32::MIN as i64)); - v.days += (v.usecs / USECS_PER_DAY) as i32; - v.usecs %= USECS_PER_DAY; - - // Unfortunately this could still generate false positive of overflow due to lack of sign - // adjustment below, for example in (i32::MIN, -30, 1). This is the same behavior as - // PostgreSQL, but I believe we (and they) would like to have it fixed. - v.months = v.months.checked_add(v.days / 30)?; - v.days %= 30; - - if v.months > 0 && (v.days < 0 || v.days == 0 && v.usecs < 0) { - v.days += 30; - v.months -= 1; - } else if v.months < 0 && (v.days > 0 || v.days == 0 && v.usecs > 0) { - v.days -= 30; - v.months += 1; - } - - if v.days > 0 && v.usecs < 0 { - v.usecs += USECS_PER_DAY; - v.days -= 1; - } else if v.days < 0 && v.usecs > 0 { - v.usecs -= USECS_PER_DAY; - v.days += 1; - } - - Some(v) - } - #[deprecated] fn from_total_usecs(usecs: i64) -> Self { let mut remaining_usecs = usecs; @@ -539,11 +486,12 @@ impl std::fmt::Debug for IntervalUnitDisplay<'_> { /// /// For simplicity we will assume there are only `months` and `days` and ignore `usecs` below. /// -/// The justified interval is more human friendly. However, it may overflow. In the 2 examples -/// above, '-1 months -15 days' is the justified interval of the first equivalence class, but there -/// is no justified interval in the second one. It would be '-2147483653 months' but this overflows -/// `i32`. A lot of bits are wasted in a justified interval because `months` is using `i32` for -/// `-29..=29` only. +/// The justified interval is more human friendly. It requires all units to have the same sign, and +/// that `0 <= abs(usecs) < USECS_PER_DAY && 0 <= abs(days) < 30`. However, it may overflow. In the +/// 2 examples above, '-1 months -15 days' is the justified interval of the first equivalence class, +/// but there is no justified interval in the second one. It would be '-2147483653 months' but this +/// overflows `i32`. A lot of bits are wasted in a justified interval because `days` is using +/// `i32` for `-29..=29` only. /// /// The alternate representative interval aims to avoid this overflow. It still requires all units /// to have the same sign, but maximizes `abs` of smaller unit rather than limit it to `29`. The @@ -1460,7 +1408,7 @@ mod tests { } #[test] - fn test_justify() { + fn test_deserialize_justify() { let cases = [ ( (0, 0, USECS_PER_MONTH * 2 + USECS_PER_DAY * 3 + 4), @@ -1489,37 +1437,29 @@ mod tests { ]; for ((lhs_months, lhs_days, lhs_usecs), rhs) in cases { let input = IntervalUnit::from_month_day_usec(lhs_months, lhs_days, lhs_usecs); - let actual = input.justify_interval(); let actual_deserialize = IntervalCmpValue::from(input).as_justified(); match rhs { None => { - assert_eq!(actual, None); assert_eq!(actual_deserialize, None); } Some((rhs_months, rhs_days, rhs_usecs, rhs_str)) => { // We should test individual fields rather than using custom `Eq` - assert_eq!(actual.unwrap().get_months(), rhs_months); - assert_eq!(actual.unwrap().get_days(), rhs_days); - assert_eq!(actual.unwrap().get_usecs(), rhs_usecs); assert_eq!(actual_deserialize.unwrap().get_months(), rhs_months); assert_eq!(actual_deserialize.unwrap().get_days(), rhs_days); assert_eq!(actual_deserialize.unwrap().get_usecs(), rhs_usecs); - assert_eq!(actual.unwrap().to_string(), rhs_str); + assert_eq!(actual_deserialize.unwrap().to_string(), rhs_str); } } } - // A false positive overflow that is buggy in PostgreSQL 15.2 as well. + // A false positive overflow that is buggy in PostgreSQL 15.2. let input = IntervalUnit::from_month_day_usec(i32::MIN, -30, 1); - let actual = input.justify_interval(); let actual_deserialize = IntervalCmpValue::from(input).as_justified(); // It has a justified interval within range, and can be obtained by our deserialization. assert_eq!(actual_deserialize.unwrap().get_months(), i32::MIN); assert_eq!(actual_deserialize.unwrap().get_days(), -29); assert_eq!(actual_deserialize.unwrap().get_usecs(), -USECS_PER_DAY + 1); - // But the PostgreSQL `justify_interval` function reports overflow. - assert_eq!(actual, None); } #[test]