From 44894e10955851d9bed8ce33efff925addb7567a Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 17 Aug 2024 21:55:12 +0800 Subject: [PATCH 01/23] operate stringview instead of generating string in SUBSTR --- datafusion/functions/src/unicode/substr.rs | 157 +++++++++++++++++++-- 1 file changed, 147 insertions(+), 10 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 9fd8c75eab23..708b85939e94 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -20,11 +20,12 @@ use std::cmp::max; use std::sync::Arc; use arrow::array::{ - ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, + Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, + OffsetSizeTrait, StringViewArray, StringViewBuilder, }; use arrow::datatypes::DataType; -use datafusion_common::cast::as_int64_array; +use datafusion_common::cast::{as_int64_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; @@ -77,7 +78,11 @@ impl ScalarUDFImpl for SubstrFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "substr") + if arg_types[0] == DataType::Utf8View { + Ok(DataType::Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "substr") + } } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -107,11 +112,119 @@ pub fn substr(args: &[ArrayRef]) -> Result { } } -/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) -/// substr('alphabet', 3) = 'phabet' -/// substr('alphabet', 3, 2) = 'ph' -/// The implementation uses UTF-8 code points as characters -fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result +// Decoding ref the trait at: arrow/arrow-data/src/byte_view.rs:44 +// From for ByteView +fn calculate_string_view( + string_array: &StringViewArray, + args: &[ArrayRef], +) -> Result { + let mut builder = StringViewBuilder::new(); + for block in string_array.data_buffers() { + builder.append_block(block.clone()); + } + + let start_array = as_int64_array(&args[0])?; + + match args.len() { + 1 => { + for (raw, start) in string_array.views().iter().zip(start_array.iter()) { + if let Some(start) = start { + let length = *raw as u32; + let start = (start - 1).max(0); + if length == 0 { + builder.append_null(); + } else if length > 12 { + let buffer_index = (*raw >> 64) as u32; + let offset = (*raw >> 96) as u32; + // Safety: builder is guaranteed to have corresponding blocks + unsafe { + builder.append_view_unchecked( + buffer_index, + offset + start as u32, + length - start as u32, // guarantee that length >= start + ); + } + } else { + let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); + let str = match std::str::from_utf8(&bytes[..14]) { + Ok(str) => &str[start as usize..], + _ => { + return exec_err!( + "Failed to convert inline bytes to &str." + ) + } + }; + builder.append_value(str); + } + } else { + builder.append_null(); + } + } + } + 2 => { + let count_array = as_int64_array(&args[1])?; + for ((raw, start), count) in string_array + .views() + .iter() + .zip(start_array.iter()) + .zip(count_array.iter()) + { + if let (Some(start), Some(count)) = (start, count) { + let length = *raw as u32; + let start = (start - 1).max(0) as usize; + if count < 0 { + return exec_err!( + "negative substring length not allowed: substr(, {start}, {count})" + ); + } else { + let count = (count as u32).min(length); + if length == 0 { + builder.append_null(); + } else if length > 12 { + let buffer_index = (*raw >> 64) as u32; + let offset = (*raw >> 96) as u32; + // Safety: builder is guaranteed to have corresponding blocks + unsafe { + builder.append_view_unchecked( + buffer_index, + offset + start as u32, + count, // guarantee that count >= start and count <= length + ); + } + } else { + let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); + let str = match std::str::from_utf8(&bytes[..14]) { + Ok(str) => { + let end = + (start + count as usize).min(length as usize); + &str[start..end] + } + _ => { + return exec_err!( + "Failed to convert inline bytes to &str." + ) + } + }; + builder.append_value(str); + } + } + } else { + builder.append_null(); + } + } + } + other => { + return exec_err!( + "substr was called with {other} arguments. It requires 2 or 3." + ) + } + } + + let result = builder.finish(); + Ok(Arc::new(result) as ArrayRef) +} + +fn calculate_string<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result where V: ArrayAccessor, T: OffsetSizeTrait, @@ -168,10 +281,34 @@ where } } +/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) +/// substr('alphabet', 3) = 'phabet' +/// substr('alphabet', 3, 2) = 'ph' +/// The implementation uses UTF-8 code points as characters +fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result +where + V: ArrayAccessor, + T: OffsetSizeTrait, +{ + match string_array.data_type() { + DataType::Utf8View => { + calculate_string_view(as_string_view_array(&string_array)?, args) + } + DataType::Utf8 | DataType::LargeUtf8 => { + calculate_string::(string_array, args) + } + other => { + exec_err!( + "unexpected datatype {other}, expected Utf8View, Utf8 or LargeUtf8." + ) + } + } +} + #[cfg(test)] mod tests { use arrow::array::{Array, StringArray}; - use arrow::datatypes::DataType::Utf8; + use arrow::datatypes::DataType::{Utf8, Utf8View}; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -189,7 +326,7 @@ mod tests { ], Ok(None), &str, - Utf8, + Utf8View, StringArray ); test_function!( From 4810be93f60ff812c5f46fc7ee84c2b146c131a9 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 17 Aug 2024 21:58:44 +0800 Subject: [PATCH 02/23] treat Utf8View as Text in sqllogictests output --- .../sqllogictest/src/engines/datafusion_engine/normalize.rs | 4 +++- datafusion/sqllogictest/test_files/arrow_typeof.slt | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs index 66ffeadf8cec..b6b583b9fbdb 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs @@ -267,7 +267,9 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec { | DataType::Float64 | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => DFColumnType::Float, - DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { + DFColumnType::Text + } DataType::Date32 | DataType::Date64 | DataType::Time32(_) diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index 448706744305..930bd488d7eb 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -424,7 +424,7 @@ select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)'); [1, 2, 3] # Tests for Utf8View -query ?T +query TT select arrow_cast('MyAwesomeString', 'Utf8View'), arrow_typeof(arrow_cast('MyAwesomeString', 'Utf8View')) ---- MyAwesomeString Utf8View From e8854b8bf951c9a5886e3f8689a6cd96c29ad546 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:11:39 +0800 Subject: [PATCH 03/23] add bench to see enhancement of utf8view against utf8 and large_utf8 --- datafusion/functions/Cargo.toml | 5 + datafusion/functions/benches/substr.rs | 206 +++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 datafusion/functions/benches/substr.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 2b3f80fc930b..4a88cb217d28 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -156,3 +156,8 @@ required-features = ["unicode_expressions"] harness = false name = "repeat" required-features = ["string_expressions"] + +[[bench]] +harness = false +name = "substr" +required-features = ["string_expressions"] diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs new file mode 100644 index 000000000000..4ac80b01296b --- /dev/null +++ b/datafusion/functions/benches/substr.rs @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::unicode; +use std::sync::Arc; +use std::time::Duration; + +fn create_args_without_count( + size: usize, + str_len: usize, + start_half_way: bool, + use_string_view: bool, +) -> Vec { + let start_array = Arc::new(Int64Array::from( + (0..size) + .map(|_| { + if start_half_way { + (str_len / 2) as i64 + } else { + 1i64 + } + }) + .collect::>(), + )); + + if use_string_view { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ] + } +} + +fn create_args_with_count( + size: usize, + str_len: usize, + count_max: usize, + use_string_view: bool, +) -> Vec { + let start_array = + Arc::new(Int64Array::from((0..size).map(|_| 1).collect::>())); + let count = count_max.min(str_len) as i64; + let count_array = Arc::new(Int64Array::from( + (0..size).map(|_| count).collect::>(), + )); + + if use_string_view { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(start_array), + ColumnarValue::Array(count_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), + ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef), + ] + } +} + +fn criterion_benchmark(c: &mut Criterion) { + let substr = unicode::substr(); + for size in [1024, 4096] { + // string_len = 12, substring_len=6 (see `create_args_without_count`) + let len = 12; + let mut group = c.benchmark_group("shorter than 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + let args = create_args_without_count::(size, len, true, true); + group.bench_function( + &format!("substr_string_view [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_without_count::(size, len, false, false); + group.bench_function( + &format!("substr_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_without_count::(size, len, true, false); + group.bench_function( + &format!("substr_large_string [size={}, strlen={}]", size, len), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + group.finish(); + + // string_len = 128, start=1, count=64, substring_len=64 + let len = 128; + let count = 64; + let mut group = c.benchmark_group("longer than 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + &format!( + "substr_string_view [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_large_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + group.finish(); + + // string_len = 128, start=1, count=6, substring_len=6 + let len = 128; + let count = 6; + let mut group = c.benchmark_group("src_len > 12 but sub_len < 12"); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + let args = create_args_with_count::(size, len, count, true); + group.bench_function( + &format!( + "substr_string_view [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + let args = create_args_with_count::(size, len, count, false); + group.bench_function( + &format!( + "substr_large_string [size={}, count={}, strlen={}]", + size, count, len, + ), + |b| b.iter(|| black_box(substr.invoke(&args))), + ); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 35e4658f864730190224099859749272ad8bf0c4 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:11:50 +0800 Subject: [PATCH 04/23] fix a tiny bug --- datafusion/functions/src/unicode/substr.rs | 46 +++++++++++++++------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 708b85939e94..a54f30b869ee 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -112,6 +112,22 @@ pub fn substr(args: &[ArrayRef]) -> Result { } } +fn get_str_by_range(input: &str, start: usize, end: usize) -> &str { + let mut char_cnt = 0; + let (mut st, mut ct) = (input.len(), input.len()); + for (byte_cnt, _) in input.char_indices() { + if char_cnt == start { + st = byte_cnt; + } + if char_cnt == end { + ct = byte_cnt; + break; + } + char_cnt += 1; + } + &input[st..ct] +} + // Decoding ref the trait at: arrow/arrow-data/src/byte_view.rs:44 // From for ByteView fn calculate_string_view( @@ -146,8 +162,10 @@ fn calculate_string_view( } } else { let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); - let str = match std::str::from_utf8(&bytes[..14]) { - Ok(str) => &str[start as usize..], + let str = match std::str::from_utf8(&bytes[..length as usize]) { + Ok(str) => { + get_str_by_range(str, start as usize, length as usize) + }, _ => { return exec_err!( "Failed to convert inline bytes to &str." @@ -193,11 +211,11 @@ fn calculate_string_view( } } else { let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); - let str = match std::str::from_utf8(&bytes[..14]) { + let str = match std::str::from_utf8(&bytes[..length as usize]) { Ok(str) => { let end = (start + count as usize).min(length as usize); - &str[start..end] + get_str_by_range(str, start, end) } _ => { return exec_err!( @@ -307,7 +325,7 @@ where #[cfg(test)] mod tests { - use arrow::array::{Array, StringArray}; + use arrow::array::{Array, StringArray, StringViewArray}; use arrow::datatypes::DataType::{Utf8, Utf8View}; use datafusion_common::{exec_err, Result, ScalarValue}; @@ -327,7 +345,7 @@ mod tests { Ok(None), &str, Utf8View, - StringArray + StringViewArray ); test_function!( SubstrFunc::new(), @@ -339,8 +357,8 @@ mod tests { ], Ok(Some("alphabet")), &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); test_function!( SubstrFunc::new(), @@ -352,8 +370,8 @@ mod tests { ], Ok(Some("ésoj")), &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); test_function!( SubstrFunc::new(), @@ -366,8 +384,8 @@ mod tests { ], Ok(Some("ph")), &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); test_function!( SubstrFunc::new(), @@ -380,8 +398,8 @@ mod tests { ], Ok(Some("phabet")), &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); test_function!( SubstrFunc::new(), From bf6c37e956d07d1347e6e228e66eb26bb8a909db Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sat, 17 Aug 2024 23:14:22 +0800 Subject: [PATCH 05/23] make clippy happy --- datafusion/functions/src/unicode/substr.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index a54f30b869ee..b8db2b892de1 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -113,9 +113,8 @@ pub fn substr(args: &[ArrayRef]) -> Result { } fn get_str_by_range(input: &str, start: usize, end: usize) -> &str { - let mut char_cnt = 0; let (mut st, mut ct) = (input.len(), input.len()); - for (byte_cnt, _) in input.char_indices() { + for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() { if char_cnt == start { st = byte_cnt; } @@ -123,7 +122,6 @@ fn get_str_by_range(input: &str, start: usize, end: usize) -> &str { ct = byte_cnt; break; } - char_cnt += 1; } &input[st..ct] } @@ -165,7 +163,7 @@ fn calculate_string_view( let str = match std::str::from_utf8(&bytes[..length as usize]) { Ok(str) => { get_str_by_range(str, start as usize, length as usize) - }, + } _ => { return exec_err!( "Failed to convert inline bytes to &str." @@ -211,7 +209,8 @@ fn calculate_string_view( } } else { let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); - let str = match std::str::from_utf8(&bytes[..length as usize]) { + let str = match std::str::from_utf8(&bytes[..length as usize]) + { Ok(str) => { let end = (start + count as usize).min(length as usize); From 74acc7cd6dd14c00d4c1cb764be5e1550d4a88b9 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sun, 18 Aug 2024 00:19:09 +0800 Subject: [PATCH 06/23] add tests to cover stringview larger than 12B and correct the code --- datafusion/functions/src/unicode/substr.rs | 93 ++++++++++++++++++---- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index b8db2b892de1..626b6ec84210 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -112,27 +112,39 @@ pub fn substr(args: &[ArrayRef]) -> Result { } } -fn get_str_by_range(input: &str, start: usize, end: usize) -> &str { - let (mut st, mut ct) = (input.len(), input.len()); +// Return the exact byte index for [start, end), set count to -1 to ignore count +fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, usize) { + let (mut st, mut ed) = (input.len(), input.len()); + let mut start_counting = false; + let mut cnt = 0; for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() { if char_cnt == start { st = byte_cnt; + if count != -1 { + start_counting = true; + } else { + break; + } } - if char_cnt == end { - ct = byte_cnt; - break; + if start_counting { + if cnt == count { + ed = byte_cnt; + break; + } + cnt += 1; } } - &input[st..ct] + (st, ed) } -// Decoding ref the trait at: arrow/arrow-data/src/byte_view.rs:44 +// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 // From for ByteView fn calculate_string_view( string_array: &StringViewArray, args: &[ArrayRef], ) -> Result { let mut builder = StringViewBuilder::new(); + // Copy all blocks from input for block in string_array.data_buffers() { builder.append_block(block.clone()); } @@ -141,32 +153,46 @@ fn calculate_string_view( match args.len() { 1 => { - for (raw, start) in string_array.views().iter().zip(start_array.iter()) { + for (idx, (raw, start)) in string_array + .views() + .iter() + .zip(start_array.iter()) + .enumerate() + { if let Some(start) = start { let length = *raw as u32; let start = (start - 1).max(0); + + // Operate according to the length of bytes if length == 0 { builder.append_null(); } else if length > 12 { let buffer_index = (*raw >> 64) as u32; let offset = (*raw >> 96) as u32; + let str = string_array.value(idx); + let (start, end) = + get_true_start_count(str, start as usize, -1); // Safety: builder is guaranteed to have corresponding blocks unsafe { builder.append_view_unchecked( buffer_index, offset + start as u32, - length - start as u32, // guarantee that length >= start + // guarantee that end-offset >= 0 for end <= str.len() + (end - start) as u32, ); } } else { let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); let str = match std::str::from_utf8(&bytes[..length as usize]) { Ok(str) => { - get_str_by_range(str, start as usize, length as usize) + // Extract str[start, end) by char + let (start, end) = + get_true_start_count(str, start as usize, length as i64); + &str[start..end] } _ => { return exec_err!( - "Failed to convert inline bytes to &str." + "failed to convert inline bytes to &str." ) } }; @@ -179,11 +205,12 @@ fn calculate_string_view( } 2 => { let count_array = as_int64_array(&args[1])?; - for ((raw, start), count) in string_array + for (idx, ((raw, start), count)) in string_array .views() .iter() .zip(start_array.iter()) .zip(count_array.iter()) + .enumerate() { if let (Some(start), Some(count)) = (start, count) { let length = *raw as u32; @@ -199,12 +226,16 @@ fn calculate_string_view( } else if length > 12 { let buffer_index = (*raw >> 64) as u32; let offset = (*raw >> 96) as u32; + let str = string_array.value(idx); + let (start, end) = + get_true_start_count(str, start, count as i64); // Safety: builder is guaranteed to have corresponding blocks unsafe { builder.append_view_unchecked( buffer_index, offset + start as u32, - count, // guarantee that count >= start and count <= length + // guarantee that end-offset >= 0 for end <= str.len() + (end - start) as u32, ); } } else { @@ -212,13 +243,14 @@ fn calculate_string_view( let str = match std::str::from_utf8(&bytes[..length as usize]) { Ok(str) => { - let end = - (start + count as usize).min(length as usize); - get_str_by_range(str, start, end) + // Extract str[start, end) by char + let (start, end) = + get_true_start_count(str, start, count as i64); + &str[start..end] } _ => { return exec_err!( - "Failed to convert inline bytes to &str." + "failed to convert inline bytes to &str." ) } }; @@ -359,6 +391,33 @@ mod tests { Utf8View, StringViewArray ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "this és longer than 12B" + )))), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some(" é")), + &str, + Utf8View, + StringViewArray + ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from( + "this is longer than 12B" + )))), + ColumnarValue::Scalar(ScalarValue::from(5i64)), + ], + Ok(Some(" is longer than 12B")), + &str, + Utf8View, + StringViewArray + ); test_function!( SubstrFunc::new(), &[ From af795a4af274192b9d13d8182f47f9e7553be763 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sun, 18 Aug 2024 00:24:45 +0800 Subject: [PATCH 07/23] better comments --- datafusion/functions/benches/substr.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs index 4ac80b01296b..3f7f77b00a4f 100644 --- a/datafusion/functions/benches/substr.rs +++ b/datafusion/functions/benches/substr.rs @@ -101,7 +101,7 @@ fn criterion_benchmark(c: &mut Criterion) { for size in [1024, 4096] { // string_len = 12, substring_len=6 (see `create_args_without_count`) let len = 12; - let mut group = c.benchmark_group("shorter than 12"); + let mut group = c.benchmark_group("SHORTER THAN 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); group.measurement_time(Duration::from_secs(10)); @@ -129,7 +129,7 @@ fn criterion_benchmark(c: &mut Criterion) { // string_len = 128, start=1, count=64, substring_len=64 let len = 128; let count = 64; - let mut group = c.benchmark_group("longer than 12"); + let mut group = c.benchmark_group("LONGER THAN 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); group.measurement_time(Duration::from_secs(10)); @@ -166,7 +166,7 @@ fn criterion_benchmark(c: &mut Criterion) { // string_len = 128, start=1, count=6, substring_len=6 let len = 128; let count = 6; - let mut group = c.benchmark_group("src_len > 12 but sub_len < 12"); + let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); group.measurement_time(Duration::from_secs(10)); From f8989d2024f76d172a6fd79b81b905746133631c Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sun, 18 Aug 2024 00:25:01 +0800 Subject: [PATCH 08/23] fix lint --- datafusion/functions/src/unicode/substr.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 626b6ec84210..120adf8e0fd2 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -170,8 +170,7 @@ fn calculate_string_view( let buffer_index = (*raw >> 64) as u32; let offset = (*raw >> 96) as u32; let str = string_array.value(idx); - let (start, end) = - get_true_start_count(str, start as usize, -1); + let (start, end) = get_true_start_count(str, start as usize, -1); // Safety: builder is guaranteed to have corresponding blocks unsafe { builder.append_view_unchecked( @@ -186,8 +185,11 @@ fn calculate_string_view( let str = match std::str::from_utf8(&bytes[..length as usize]) { Ok(str) => { // Extract str[start, end) by char - let (start, end) = - get_true_start_count(str, start as usize, length as i64); + let (start, end) = get_true_start_count( + str, + start as usize, + length as i64, + ); &str[start..end] } _ => { From 26a878aa5cf768861b334cd78f16385551a7609f Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sun, 18 Aug 2024 01:18:03 +0800 Subject: [PATCH 09/23] correct feature setting --- datafusion/functions/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 4a88cb217d28..9a8b1d1a6fd0 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -160,4 +160,4 @@ required-features = ["string_expressions"] [[bench]] harness = false name = "substr" -required-features = ["string_expressions"] +required-features = ["unicode_expressions"] From 2d8458e3261c8f925b55c377e6aef19b51fe90ca Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sun, 18 Aug 2024 12:52:32 +0800 Subject: [PATCH 10/23] avoid expensive utf8 and some other checks --- datafusion/functions/src/unicode/substr.rs | 83 +++++++++++----------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 120adf8e0fd2..244227411a5d 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -169,10 +169,12 @@ fn calculate_string_view( } else if length > 12 { let buffer_index = (*raw >> 64) as u32; let offset = (*raw >> 96) as u32; - let str = string_array.value(idx); - let (start, end) = get_true_start_count(str, start as usize, -1); - // Safety: builder is guaranteed to have corresponding blocks + // Safety: + // 1. idx < string_array.views.size() + // 2. builder is guaranteed to have corresponding blocks unsafe { + let str = string_array.value_unchecked(idx); + let (start, end) = get_true_start_count(str, start as usize, -1); builder.append_view_unchecked( buffer_index, offset + start as u32, @@ -181,24 +183,22 @@ fn calculate_string_view( ); } } else { - let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); - let str = match std::str::from_utf8(&bytes[..length as usize]) { - Ok(str) => { - // Extract str[start, end) by char - let (start, end) = get_true_start_count( - str, - start as usize, - length as i64, - ); - &str[start..end] - } - _ => { - return exec_err!( - "failed to convert inline bytes to &str." - ) - } - }; - builder.append_value(str); + // Safety: + // (1) original bytes are valid utf-8, + // (2) we do not slice on utf-8 codepoint + unsafe { + let bytes = + StringViewArray::inline_value(raw, length as usize); + let str = + std::str::from_utf8_unchecked(&bytes[..length as usize]); + // Extract str[start, end) by char + let (start, end) = get_true_start_count( + str, + start as usize, + length as i64, + ); + builder.append_value(&str[start..end]); + } } } else { builder.append_null(); @@ -216,7 +216,7 @@ fn calculate_string_view( { if let (Some(start), Some(count)) = (start, count) { let length = *raw as u32; - let start = (start - 1).max(0) as usize; + let start = start.saturating_sub(1) as usize; if count < 0 { return exec_err!( "negative substring length not allowed: substr(, {start}, {count})" @@ -228,11 +228,13 @@ fn calculate_string_view( } else if length > 12 { let buffer_index = (*raw >> 64) as u32; let offset = (*raw >> 96) as u32; - let str = string_array.value(idx); - let (start, end) = - get_true_start_count(str, start, count as i64); - // Safety: builder is guaranteed to have corresponding blocks + // Safety: + // 1. idx < string_array.views.size() + // 2. builder is guaranteed to have corresponding blocks unsafe { + let str = string_array.value_unchecked(idx); + let (start, end) = + get_true_start_count(str, start, count as i64); builder.append_view_unchecked( buffer_index, offset + start as u32, @@ -241,22 +243,19 @@ fn calculate_string_view( ); } } else { - let bytes = ((*raw >> 32) & u128::MAX).to_le_bytes(); - let str = match std::str::from_utf8(&bytes[..length as usize]) - { - Ok(str) => { - // Extract str[start, end) by char - let (start, end) = - get_true_start_count(str, start, count as i64); - &str[start..end] - } - _ => { - return exec_err!( - "failed to convert inline bytes to &str." - ) - } - }; - builder.append_value(str); + // Safety: + // (1) original bytes are valid utf-8, + // (2) we do not slice on utf-8 codepoint + unsafe { + let bytes = + StringViewArray::inline_value(raw, length as usize); + let str = + std::str::from_utf8_unchecked(&bytes[..length as usize]); + // Extract str[start, end) by char + let (start, end) = + get_true_start_count(str, start, count as i64); + builder.append_value(&str[start..end]); + } } } } else { From 934516ac4feff5e3a415d9afab87534d8a3be73b Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Sun, 18 Aug 2024 12:54:48 +0800 Subject: [PATCH 11/23] fix lint --- datafusion/functions/src/unicode/substr.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 244227411a5d..f5306eac878e 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -174,7 +174,8 @@ fn calculate_string_view( // 2. builder is guaranteed to have corresponding blocks unsafe { let str = string_array.value_unchecked(idx); - let (start, end) = get_true_start_count(str, start as usize, -1); + let (start, end) = + get_true_start_count(str, start as usize, -1); builder.append_view_unchecked( buffer_index, offset + start as u32, @@ -192,11 +193,8 @@ fn calculate_string_view( let str = std::str::from_utf8_unchecked(&bytes[..length as usize]); // Extract str[start, end) by char - let (start, end) = get_true_start_count( - str, - start as usize, - length as i64, - ); + let (start, end) = + get_true_start_count(str, start as usize, length as i64); builder.append_value(&str[start..end]); } } @@ -249,8 +247,9 @@ fn calculate_string_view( unsafe { let bytes = StringViewArray::inline_value(raw, length as usize); - let str = - std::str::from_utf8_unchecked(&bytes[..length as usize]); + let str = std::str::from_utf8_unchecked( + &bytes[..length as usize], + ); // Extract str[start, end) by char let (start, end) = get_true_start_count(str, start, count as i64); From 48e1643b5a992aee59ae8725cad4fb1eb6871830 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Mon, 19 Aug 2024 01:40:04 +0800 Subject: [PATCH 12/23] remove unnecessary indirection --- datafusion/functions/src/unicode/substr.rs | 49 +++++++--------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index f5306eac878e..229c92df0e34 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -25,12 +25,12 @@ use arrow::array::{ }; use arrow::datatypes::DataType; -use datafusion_common::cast::{as_int64_array, as_string_view_array}; +use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use crate::utils::{make_scalar_function, optimized_utf8_to_str_type, utf8_to_str_type}; #[derive(Debug)] pub struct SubstrFunc { @@ -79,7 +79,7 @@ impl ScalarUDFImpl for SubstrFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { if arg_types[0] == DataType::Utf8View { - Ok(DataType::Utf8View) + optimized_utf8_to_str_type(&arg_types[0], "substr") } else { utf8_to_str_type(&arg_types[0], "substr") } @@ -94,21 +94,28 @@ impl ScalarUDFImpl for SubstrFunc { } } +/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) +/// substr('alphabet', 3) = 'phabet' +/// substr('alphabet', 3, 2) = 'ph' +/// The implementation uses UTF-8 code points as characters pub fn substr(args: &[ArrayRef]) -> Result { match args[0].data_type() { DataType::Utf8 => { let string_array = args[0].as_string::(); - calculate_substr::<_, i32>(string_array, &args[1..]) + string_substr::<_, i32>(string_array, &args[1..]) } DataType::LargeUtf8 => { let string_array = args[0].as_string::(); - calculate_substr::<_, i64>(string_array, &args[1..]) + string_substr::<_, i64>(string_array, &args[1..]) } DataType::Utf8View => { let string_array = args[0].as_string_view(); - calculate_substr::<_, i32>(string_array, &args[1..]) + string_view_substr(string_array, &args[1..]) } - other => exec_err!("Unsupported data type {other:?} for function substr"), + other => exec_err!( + "Unsupported data type {other:?} for function substr,\ + expected Utf8View, Utf8 or LargeUtf8." + ), } } @@ -139,7 +146,7 @@ fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, usize) // The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 // From for ByteView -fn calculate_string_view( +fn string_view_substr( string_array: &StringViewArray, args: &[ArrayRef], ) -> Result { @@ -273,7 +280,7 @@ fn calculate_string_view( Ok(Arc::new(result) as ArrayRef) } -fn calculate_string<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result +fn string_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result where V: ArrayAccessor, T: OffsetSizeTrait, @@ -330,30 +337,6 @@ where } } -/// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) -/// substr('alphabet', 3) = 'phabet' -/// substr('alphabet', 3, 2) = 'ph' -/// The implementation uses UTF-8 code points as characters -fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result -where - V: ArrayAccessor, - T: OffsetSizeTrait, -{ - match string_array.data_type() { - DataType::Utf8View => { - calculate_string_view(as_string_view_array(&string_array)?, args) - } - DataType::Utf8 | DataType::LargeUtf8 => { - calculate_string::(string_array, args) - } - other => { - exec_err!( - "unexpected datatype {other}, expected Utf8View, Utf8 or LargeUtf8." - ) - } - } -} - #[cfg(test)] mod tests { use arrow::array::{Array, StringArray, StringViewArray}; From 6ff32fc9a5c8e30f9e6d377c61e7ded9b39ae512 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Mon, 19 Aug 2024 01:41:01 +0800 Subject: [PATCH 13/23] add optimized_utf8_to_str_type --- datafusion/functions/src/utils.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index 7b367174006d..ceecbd9a2b61 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -71,6 +71,20 @@ macro_rules! get_optimal_return_type { // `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); +// `optimized_utf8_to_str_type`: returns Utf8View when the string function has a specialised +// implementation for StringView which returns Utf8View. +pub(crate) fn optimized_utf8_to_str_type( + arg_type: &DataType, + name: &str, +) -> Result { + let support_list = ["substr"]; + if support_list.contains(&name) { + Ok(DataType::Utf8View) + } else { + utf8_to_str_type(arg_type, name) + } +} + // `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size. get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); From 9537c47f212302cb3144609664fae8ad776a5849 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Aug 2024 17:47:30 -0400 Subject: [PATCH 14/23] Simplify type check --- datafusion/functions/src/unicode/substr.rs | 4 ++-- datafusion/functions/src/utils.rs | 14 -------------- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 229c92df0e34..5fd6a8bd8ac0 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -30,7 +30,7 @@ use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use crate::utils::{make_scalar_function, optimized_utf8_to_str_type, utf8_to_str_type}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] pub struct SubstrFunc { @@ -79,7 +79,7 @@ impl ScalarUDFImpl for SubstrFunc { fn return_type(&self, arg_types: &[DataType]) -> Result { if arg_types[0] == DataType::Utf8View { - optimized_utf8_to_str_type(&arg_types[0], "substr") + Ok(DataType::Utf8View) } else { utf8_to_str_type(&arg_types[0], "substr") } diff --git a/datafusion/functions/src/utils.rs b/datafusion/functions/src/utils.rs index ceecbd9a2b61..7b367174006d 100644 --- a/datafusion/functions/src/utils.rs +++ b/datafusion/functions/src/utils.rs @@ -71,20 +71,6 @@ macro_rules! get_optimal_return_type { // `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size. get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8); -// `optimized_utf8_to_str_type`: returns Utf8View when the string function has a specialised -// implementation for StringView which returns Utf8View. -pub(crate) fn optimized_utf8_to_str_type( - arg_type: &DataType, - name: &str, -) -> Result { - let support_list = ["substr"]; - if support_list.contains(&name) { - Ok(DataType::Utf8View) - } else { - utf8_to_str_type(arg_type, name) - } -} - // `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size. get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32); From 5356b2715c189b57f39d4605804d6692c706d686 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 19 Aug 2024 17:57:24 -0400 Subject: [PATCH 15/23] Use ByteView --- Cargo.toml | 1 + datafusion/functions/Cargo.toml | 1 + datafusion/functions/src/unicode/substr.rs | 30 +++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 02b1f1ccd92a..815ae2230f94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,6 +75,7 @@ arrow = { version = "52.2.0", features = [ arrow-array = { version = "52.2.0", default-features = false, features = [ "chrono-tz", ] } +arrow-data = { version = "52.2.0", default-features = false } arrow-buffer = { version = "52.2.0", default-features = false } arrow-flight = { version = "52.2.0", features = [ "flight-sql-experimental", diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 9a8b1d1a6fd0..5fa86e82146a 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -66,6 +66,7 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-data = { workspace = true } arrow-buffer = { workspace = true } base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 5fd6a8bd8ac0..5c6fc1d3a45f 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -24,7 +24,7 @@ use arrow::array::{ OffsetSizeTrait, StringViewArray, StringViewBuilder, }; use arrow::datatypes::DataType; - +use arrow_data::ByteView; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; @@ -147,12 +147,12 @@ fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, usize) // The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 // From for ByteView fn string_view_substr( - string_array: &StringViewArray, + string_view_array: &StringViewArray, args: &[ArrayRef], ) -> Result { let mut builder = StringViewBuilder::new(); // Copy all blocks from input - for block in string_array.data_buffers() { + for block in string_view_array.data_buffers() { builder.append_block(block.clone()); } @@ -160,7 +160,7 @@ fn string_view_substr( match args.len() { 1 => { - for (idx, (raw, start)) in string_array + for (idx, (raw, start)) in string_view_array .views() .iter() .zip(start_array.iter()) @@ -174,18 +174,18 @@ fn string_view_substr( if length == 0 { builder.append_null(); } else if length > 12 { - let buffer_index = (*raw >> 64) as u32; - let offset = (*raw >> 96) as u32; + let view = ByteView::from(*raw); + // Safety: // 1. idx < string_array.views.size() // 2. builder is guaranteed to have corresponding blocks unsafe { - let str = string_array.value_unchecked(idx); + let str = string_view_array.value_unchecked(idx); let (start, end) = get_true_start_count(str, start as usize, -1); builder.append_view_unchecked( - buffer_index, - offset + start as u32, + view.buffer_index, + view.offset + start as u32, // guarantee that end-offset >= 0 for end <= str.len() (end - start) as u32, ); @@ -212,7 +212,7 @@ fn string_view_substr( } 2 => { let count_array = as_int64_array(&args[1])?; - for (idx, ((raw, start), count)) in string_array + for (idx, ((raw, start), count)) in string_view_array .views() .iter() .zip(start_array.iter()) @@ -231,18 +231,18 @@ fn string_view_substr( if length == 0 { builder.append_null(); } else if length > 12 { - let buffer_index = (*raw >> 64) as u32; - let offset = (*raw >> 96) as u32; + let view = ByteView::from(*raw); + // Safety: // 1. idx < string_array.views.size() // 2. builder is guaranteed to have corresponding blocks unsafe { - let str = string_array.value_unchecked(idx); + let str = string_view_array.value_unchecked(idx); let (start, end) = get_true_start_count(str, start, count as i64); builder.append_view_unchecked( - buffer_index, - offset + start as u32, + view.buffer_index, + view.offset + start as u32, // guarantee that end-offset >= 0 for end <= str.len() (end - start) as u32, ); From b4e1ac7dd4586780839911aa80cb3f34a8be8f02 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 Aug 2024 15:07:51 -0400 Subject: [PATCH 16/23] update datafusion-cli.lock --- datafusion-cli/Cargo.lock | 145 ++++++++++++++++++++++++-------------- 1 file changed, 92 insertions(+), 53 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index a164b74c55a5..5389745c06f7 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "adler32" version = "1.2.0" @@ -167,9 +173,9 @@ checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" @@ -430,7 +436,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -765,7 +771,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -815,9 +821,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" dependencies = [ "arrayref", "arrayvec", @@ -999,7 +1005,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1155,7 +1161,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1358,6 +1364,7 @@ version = "41.0.0" dependencies = [ "arrow", "arrow-buffer", + "arrow-data", "base64 0.22.1", "blake2", "blake3", @@ -1740,12 +1747,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -1828,7 +1835,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1921,9 +1928,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -2108,7 +2115,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "httparse", @@ -2145,7 +2152,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2359,9 +2366,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.156" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5f43f184355eefb8d17fc948dbecf6c13be3c141f20d834ae842193a448c72a" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libflate" @@ -2495,6 +2502,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" @@ -2835,7 +2851,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3034,9 +3050,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", @@ -3080,15 +3096,15 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "base64 0.22.1", "bytes", "futures-core", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "http-body-util", @@ -3104,7 +3120,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3120,7 +3136,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "windows-registry", ] [[package]] @@ -3259,9 +3275,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -3427,7 +3443,7 @@ checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3569,7 +3585,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3615,7 +3631,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3628,7 +3644,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3650,9 +3666,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.74" +version = "2.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" dependencies = [ "proc-macro2", "quote", @@ -3664,6 +3680,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "tempfile" @@ -3710,7 +3729,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3780,9 +3799,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", @@ -3804,7 +3823,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3901,7 +3920,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3946,7 +3965,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -4101,7 +4120,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-shared", ] @@ -4135,7 +4154,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4219,6 +4238,36 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -4367,16 +4416,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -4410,7 +4449,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] From 7106ef22c0375c9fe18082d05c80ecbfc7779562 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 Aug 2024 15:19:03 -0400 Subject: [PATCH 17/23] Remove duration override --- datafusion/functions/benches/substr.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs index 3f7f77b00a4f..14a3389da380 100644 --- a/datafusion/functions/benches/substr.rs +++ b/datafusion/functions/benches/substr.rs @@ -25,7 +25,6 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingM use datafusion_expr::ColumnarValue; use datafusion_functions::unicode; use std::sync::Arc; -use std::time::Duration; fn create_args_without_count( size: usize, @@ -104,7 +103,6 @@ fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("SHORTER THAN 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - group.measurement_time(Duration::from_secs(10)); let args = create_args_without_count::(size, len, true, true); group.bench_function( @@ -132,7 +130,6 @@ fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("LONGER THAN 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - group.measurement_time(Duration::from_secs(10)); let args = create_args_with_count::(size, len, count, true); group.bench_function( @@ -169,7 +166,6 @@ fn criterion_benchmark(c: &mut Criterion) { let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); group.sampling_mode(SamplingMode::Flat); group.sample_size(10); - group.measurement_time(Duration::from_secs(10)); let args = create_args_with_count::(size, len, count, true); group.bench_function( From 28d6aca5377e21fbed97d0c54843038ea305e4f4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 21 Aug 2024 15:46:28 -0400 Subject: [PATCH 18/23] format toml --- Cargo.toml | 2 +- datafusion/functions/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 105802566b23..3d8f050c7dea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -75,8 +75,8 @@ arrow = { version = "52.2.0", features = [ arrow-array = { version = "52.2.0", default-features = false, features = [ "chrono-tz", ] } -arrow-data = { version = "52.2.0", default-features = false } arrow-buffer = { version = "52.2.0", default-features = false } +arrow-data = { version = "52.2.0", default-features = false } arrow-flight = { version = "52.2.0", features = [ "flight-sql-experimental", ] } diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 4920bcdb1b68..7443b215f397 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -66,8 +66,8 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } -arrow-data = { workspace = true } arrow-buffer = { workspace = true } +arrow-data = { workspace = true } base64 = { version = "0.22", optional = true } blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } From 3be3553a36b012dab6df8e1a963d8869a1f94493 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:53:19 +0800 Subject: [PATCH 19/23] refactor the code, using append_view_u128 from arrow --- datafusion/functions/src/unicode/substr.rs | 143 ++++++++------------- 1 file changed, 57 insertions(+), 86 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 5c6fc1d3a45f..534419eb7d5b 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -19,8 +19,9 @@ use std::any::Any; use std::cmp::max; use std::sync::Arc; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, + make_view, Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, StringViewArray, StringViewBuilder, }; use arrow::datatypes::DataType; @@ -30,8 +31,6 @@ use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct SubstrFunc { signature: Signature, @@ -120,7 +119,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { } // Return the exact byte index for [start, end), set count to -1 to ignore count -fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, usize) { +fn get_true_start_end(input: &str, start: usize, count: i64) -> (usize, usize) { let (mut st, mut ed) = (input.len(), input.len()); let mut start_counting = false; let mut cnt = 0; @@ -144,6 +143,34 @@ fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize, usize) (st, ed) } +/// Make a `u128` based on the given substr, start(offset to view.offset), and +/// append to the given builder +/// +/// # Safety: +/// (1) The block of the given view must be added to the builder +/// (2) The range `view.offset+start..end` must be within the bounds of the block, where +/// end = view.offset+start+substr_len +unsafe fn make_and_append_view( + builder: &mut StringViewBuilder, + raw: &u128, + substr: &str, + start: u32, +) { + let substr_len = substr.len(); + if substr_len == 0 { + builder.append_null(); + } else { + let sub_view = if substr_len > 12 { + let view = ByteView::from(*raw); + make_view(substr.as_bytes(), view.buffer_index, view.offset + start) + } else { + // inline value does not need block id or offset + make_view(substr.as_bytes(), 0, 0) + }; + builder.append_view_u128_unchecked(sub_view); + } +} + // The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44 // From for ByteView fn string_view_substr( @@ -167,43 +194,17 @@ fn string_view_substr( .enumerate() { if let Some(start) = start { - let length = *raw as u32; - let start = (start - 1).max(0); + let start = (start - 1).max(0) as usize; - // Operate according to the length of bytes - if length == 0 { - builder.append_null(); - } else if length > 12 { - let view = ByteView::from(*raw); + // Safety: + // 1. idx < string_array.views.size() + // 2. builder is guaranteed to have corresponding blocks + unsafe { + let str = string_view_array.value_unchecked(idx); + let (start, end) = get_true_start_end(str, start, -1); + let substr = &str[start..end]; - // Safety: - // 1. idx < string_array.views.size() - // 2. builder is guaranteed to have corresponding blocks - unsafe { - let str = string_view_array.value_unchecked(idx); - let (start, end) = - get_true_start_count(str, start as usize, -1); - builder.append_view_unchecked( - view.buffer_index, - view.offset + start as u32, - // guarantee that end-offset >= 0 for end <= str.len() - (end - start) as u32, - ); - } - } else { - // Safety: - // (1) original bytes are valid utf-8, - // (2) we do not slice on utf-8 codepoint - unsafe { - let bytes = - StringViewArray::inline_value(raw, length as usize); - let str = - std::str::from_utf8_unchecked(&bytes[..length as usize]); - // Extract str[start, end) by char - let (start, end) = - get_true_start_count(str, start as usize, length as i64); - builder.append_value(&str[start..end]); - } + make_and_append_view(&mut builder, raw, substr, start as u32); } } else { builder.append_null(); @@ -220,48 +221,18 @@ fn string_view_substr( .enumerate() { if let (Some(start), Some(count)) = (start, count) { - let length = *raw as u32; - let start = start.saturating_sub(1) as usize; + let start = (start - 1).max(0) as usize; if count < 0 { return exec_err!( "negative substring length not allowed: substr(, {start}, {count})" ); } else { - let count = (count as u32).min(length); - if length == 0 { - builder.append_null(); - } else if length > 12 { - let view = ByteView::from(*raw); + unsafe { + let str = string_view_array.value_unchecked(idx); + let (start, end) = get_true_start_end(str, start, count); + let substr = &str[start..end]; - // Safety: - // 1. idx < string_array.views.size() - // 2. builder is guaranteed to have corresponding blocks - unsafe { - let str = string_view_array.value_unchecked(idx); - let (start, end) = - get_true_start_count(str, start, count as i64); - builder.append_view_unchecked( - view.buffer_index, - view.offset + start as u32, - // guarantee that end-offset >= 0 for end <= str.len() - (end - start) as u32, - ); - } - } else { - // Safety: - // (1) original bytes are valid utf-8, - // (2) we do not slice on utf-8 codepoint - unsafe { - let bytes = - StringViewArray::inline_value(raw, length as usize); - let str = std::str::from_utf8_unchecked( - &bytes[..length as usize], - ); - // Extract str[start, end) by char - let (start, end) = - get_true_start_count(str, start, count as i64); - builder.append_value(&str[start..end]); - } + make_and_append_view(&mut builder, raw, substr, start as u32); } } } else { @@ -350,17 +321,17 @@ mod tests { #[test] fn test_functions() -> Result<()> { - test_function!( - SubstrFunc::new(), - &[ - ColumnarValue::Scalar(ScalarValue::Utf8View(None)), - ColumnarValue::Scalar(ScalarValue::from(1i64)), - ], - Ok(None), - &str, - Utf8View, - StringViewArray - ); + // test_function!( + // SubstrFunc::new(), + // &[ + // ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + // ColumnarValue::Scalar(ScalarValue::from(1i64)), + // ], + // Ok(None), + // &str, + // Utf8View, + // StringViewArray + // ); test_function!( SubstrFunc::new(), &[ From b28860528d13dcd4f71c9212b0dca7d78204cbd9 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Fri, 23 Aug 2024 18:59:51 +0800 Subject: [PATCH 20/23] manually collect the views and nulls --- datafusion/functions/src/unicode/substr.rs | 70 ++++++++++++++-------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 534419eb7d5b..280913388f14 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -22,9 +22,10 @@ use std::sync::Arc; use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ make_view, Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, - OffsetSizeTrait, StringViewArray, StringViewBuilder, + OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; +use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; use arrow_data::ByteView; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; @@ -144,21 +145,18 @@ fn get_true_start_end(input: &str, start: usize, count: i64) -> (usize, usize) { } /// Make a `u128` based on the given substr, start(offset to view.offset), and -/// append to the given builder -/// -/// # Safety: -/// (1) The block of the given view must be added to the builder -/// (2) The range `view.offset+start..end` must be within the bounds of the block, where -/// end = view.offset+start+substr_len -unsafe fn make_and_append_view( - builder: &mut StringViewBuilder, +/// push into to the given buffers +fn make_and_append_view( + views_buffer: &mut Vec, + null_builder: &mut NullBufferBuilder, raw: &u128, substr: &str, start: u32, ) { let substr_len = substr.len(); if substr_len == 0 { - builder.append_null(); + null_builder.append_null(); + views_buffer.push(0); } else { let sub_view = if substr_len > 12 { let view = ByteView::from(*raw); @@ -167,7 +165,8 @@ unsafe fn make_and_append_view( // inline value does not need block id or offset make_view(substr.as_bytes(), 0, 0) }; - builder.append_view_u128_unchecked(sub_view); + views_buffer.push(sub_view); + null_builder.append_non_null(); } } @@ -177,11 +176,8 @@ fn string_view_substr( string_view_array: &StringViewArray, args: &[ArrayRef], ) -> Result { - let mut builder = StringViewBuilder::new(); - // Copy all blocks from input - for block in string_view_array.data_buffers() { - builder.append_block(block.clone()); - } + let mut views_buf = Vec::with_capacity(string_view_array.len()); + let mut null_builder = NullBufferBuilder::new(string_view_array.len()); let start_array = as_int64_array(&args[0])?; @@ -197,17 +193,22 @@ fn string_view_substr( let start = (start - 1).max(0) as usize; // Safety: - // 1. idx < string_array.views.size() - // 2. builder is guaranteed to have corresponding blocks + // idx is always smaller or equal to string_view_array.views.len() unsafe { let str = string_view_array.value_unchecked(idx); let (start, end) = get_true_start_end(str, start, -1); let substr = &str[start..end]; - make_and_append_view(&mut builder, raw, substr, start as u32); + make_and_append_view( + &mut views_buf, + &mut null_builder, + raw, + substr, + start as u32, + ); } } else { - builder.append_null(); + null_builder.append_null(); } } } @@ -227,16 +228,24 @@ fn string_view_substr( "negative substring length not allowed: substr(, {start}, {count})" ); } else { + // Safety: + // idx is always smaller or equal to string_view_array.views.len() unsafe { let str = string_view_array.value_unchecked(idx); let (start, end) = get_true_start_end(str, start, count); let substr = &str[start..end]; - make_and_append_view(&mut builder, raw, substr, start as u32); + make_and_append_view( + &mut views_buf, + &mut null_builder, + raw, + substr, + start as u32, + ); } } } else { - builder.append_null(); + null_builder.append_null(); } } } @@ -247,8 +256,21 @@ fn string_view_substr( } } - let result = builder.finish(); - Ok(Arc::new(result) as ArrayRef) + let views_buf = ScalarBuffer::from(views_buf); + let nulls_buf = null_builder.finish(); + + // Safety: + // (1) The blocks of the given views are all provided + // (2) Each of the range `view.offset+start..end` of view in views_buf is within + // the bounds of each of the blocks + unsafe { + let array = StringViewArray::new_unchecked( + views_buf, + string_view_array.data_buffers().to_vec(), + nulls_buf, + ); + Ok(Arc::new(array) as ArrayRef) + } } fn string_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) -> Result From e2643f999a6be98ff2f8a1ae1a5481c424bc6770 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Fri, 23 Aug 2024 19:18:53 +0800 Subject: [PATCH 21/23] remove bench file and fix some comments --- datafusion/functions/Cargo.toml | 5 - datafusion/functions/benches/substr.rs | 202 --------------------- datafusion/functions/src/unicode/substr.rs | 22 +-- 3 files changed, 11 insertions(+), 218 deletions(-) delete mode 100644 datafusion/functions/benches/substr.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 7443b215f397..16305c0e72f2 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -158,11 +158,6 @@ harness = false name = "repeat" required-features = ["string_expressions"] -[[bench]] -harness = false -name = "substr" -required-features = ["unicode_expressions"] - [[bench]] harness = false name = "random" diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs deleted file mode 100644 index 14a3389da380..000000000000 --- a/datafusion/functions/benches/substr.rs +++ /dev/null @@ -1,202 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -extern crate criterion; - -use arrow::array::{ArrayRef, Int64Array, OffsetSizeTrait}; -use arrow::util::bench_util::{ - create_string_array_with_len, create_string_view_array_with_len, -}; -use criterion::{black_box, criterion_group, criterion_main, Criterion, SamplingMode}; -use datafusion_expr::ColumnarValue; -use datafusion_functions::unicode; -use std::sync::Arc; - -fn create_args_without_count( - size: usize, - str_len: usize, - start_half_way: bool, - use_string_view: bool, -) -> Vec { - let start_array = Arc::new(Int64Array::from( - (0..size) - .map(|_| { - if start_half_way { - (str_len / 2) as i64 - } else { - 1i64 - } - }) - .collect::>(), - )); - - if use_string_view { - let string_array = - Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(start_array), - ] - } else { - let string_array = - Arc::new(create_string_array_with_len::(size, 0.1, str_len)); - - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), - ] - } -} - -fn create_args_with_count( - size: usize, - str_len: usize, - count_max: usize, - use_string_view: bool, -) -> Vec { - let start_array = - Arc::new(Int64Array::from((0..size).map(|_| 1).collect::>())); - let count = count_max.min(str_len) as i64; - let count_array = Arc::new(Int64Array::from( - (0..size).map(|_| count).collect::>(), - )); - - if use_string_view { - let string_array = - Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(start_array), - ColumnarValue::Array(count_array), - ] - } else { - let string_array = - Arc::new(create_string_array_with_len::(size, 0.1, str_len)); - - vec![ - ColumnarValue::Array(string_array), - ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef), - ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef), - ] - } -} - -fn criterion_benchmark(c: &mut Criterion) { - let substr = unicode::substr(); - for size in [1024, 4096] { - // string_len = 12, substring_len=6 (see `create_args_without_count`) - let len = 12; - let mut group = c.benchmark_group("SHORTER THAN 12"); - group.sampling_mode(SamplingMode::Flat); - group.sample_size(10); - - let args = create_args_without_count::(size, len, true, true); - group.bench_function( - &format!("substr_string_view [size={}, strlen={}]", size, len), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - let args = create_args_without_count::(size, len, false, false); - group.bench_function( - &format!("substr_string [size={}, strlen={}]", size, len), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - let args = create_args_without_count::(size, len, true, false); - group.bench_function( - &format!("substr_large_string [size={}, strlen={}]", size, len), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - group.finish(); - - // string_len = 128, start=1, count=64, substring_len=64 - let len = 128; - let count = 64; - let mut group = c.benchmark_group("LONGER THAN 12"); - group.sampling_mode(SamplingMode::Flat); - group.sample_size(10); - - let args = create_args_with_count::(size, len, count, true); - group.bench_function( - &format!( - "substr_string_view [size={}, count={}, strlen={}]", - size, count, len, - ), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - let args = create_args_with_count::(size, len, count, false); - group.bench_function( - &format!( - "substr_string [size={}, count={}, strlen={}]", - size, count, len, - ), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - let args = create_args_with_count::(size, len, count, false); - group.bench_function( - &format!( - "substr_large_string [size={}, count={}, strlen={}]", - size, count, len, - ), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - group.finish(); - - // string_len = 128, start=1, count=6, substring_len=6 - let len = 128; - let count = 6; - let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12"); - group.sampling_mode(SamplingMode::Flat); - group.sample_size(10); - - let args = create_args_with_count::(size, len, count, true); - group.bench_function( - &format!( - "substr_string_view [size={}, count={}, strlen={}]", - size, count, len, - ), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - let args = create_args_with_count::(size, len, count, false); - group.bench_function( - &format!( - "substr_string [size={}, count={}, strlen={}]", - size, count, len, - ), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - let args = create_args_with_count::(size, len, count, false); - group.bench_function( - &format!( - "substr_large_string [size={}, count={}, strlen={}]", - size, count, len, - ), - |b| b.iter(|| black_box(substr.invoke(&args))), - ); - - group.finish(); - } -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 280913388f14..2e8f718fb9cb 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -343,17 +343,17 @@ mod tests { #[test] fn test_functions() -> Result<()> { - // test_function!( - // SubstrFunc::new(), - // &[ - // ColumnarValue::Scalar(ScalarValue::Utf8View(None)), - // ColumnarValue::Scalar(ScalarValue::from(1i64)), - // ], - // Ok(None), - // &str, - // Utf8View, - // StringViewArray - // ); + test_function!( + SubstrFunc::new(), + &[ + ColumnarValue::Scalar(ScalarValue::Utf8View(None)), + ColumnarValue::Scalar(ScalarValue::from(1i64)), + ], + Ok(None), + &str, + Utf8View, + StringViewArray + ); test_function!( SubstrFunc::new(), &[ From f462bbb0f203a551d11158b1bdeea92ab78e3f26 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Fri, 23 Aug 2024 20:27:04 +0800 Subject: [PATCH 22/23] fix tiny mistake --- datafusion/functions/src/unicode/substr.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 2e8f718fb9cb..9a18dbe5fba2 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -209,6 +209,7 @@ fn string_view_substr( } } else { null_builder.append_null(); + views_buf.push(0); } } } @@ -246,6 +247,7 @@ fn string_view_substr( } } else { null_builder.append_null(); + views_buf.push(0); } } } From 2c88a196e0488e700e363ecbb78e0fdd72008086 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 5 Sep 2024 08:41:39 -0400 Subject: [PATCH 23/23] Update Cargo.lock --- datafusion-cli/Cargo.lock | 613 ++++++++++++++++++-------------------- 1 file changed, 286 insertions(+), 327 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 2eae9a0606d8..f28c2705816b 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" +checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba" dependencies = [ "arrow-arith", "arrow-array", @@ -200,9 +200,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" +checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" +checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" +checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" +checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" +checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,9 +283,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" +checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65" dependencies = [ "arrow-buffer", "arrow-schema", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" +checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a" dependencies = [ "arrow-array", "arrow-buffer", @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" +checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6" dependencies = [ "arrow-array", "arrow-buffer", @@ -330,9 +330,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" +checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b" dependencies = [ "arrow-array", "arrow-buffer", @@ -345,9 +345,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" +checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb" dependencies = [ "ahash", "arrow-array", @@ -359,15 +359,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858" [[package]] name = "arrow-select" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" +checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba" dependencies = [ "ahash", "arrow-array", @@ -379,9 +379,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" +checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230" dependencies = [ "arrow-array", "arrow-buffer", @@ -430,13 +430,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.81" +version = "0.1.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -473,160 +473,156 @@ checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "aws-config" -version = "0.55.3" +version = "1.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcdcf0d683fe9c23d32cf5b53c9918ea0a500375a9fb20109802552658e576c9" +checksum = "4e95816a168520d72c0e7680c405a5a8c1fb6a035b4bc4b9d7b0de8e1a941697" dependencies = [ "aws-credential-types", - "aws-http", + "aws-runtime", "aws-sdk-sso", + "aws-sdk-ssooidc", "aws-sdk-sts", "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", - "aws-smithy-http-tower", "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand 1.9.0", + "fastrand", "hex", "http 0.2.12", - "hyper 0.14.30", - "ring 0.16.20", + "ring", "time", "tokio", - "tower", "tracing", + "url", "zeroize", ] [[package]] name = "aws-credential-types" -version = "0.55.3" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fcdb2f7acbc076ff5ad05e7864bdb191ca70a6fd07668dc3a1a8bcd051de5ae" +checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" dependencies = [ "aws-smithy-async", + "aws-smithy-runtime-api", "aws-smithy-types", - "fastrand 1.9.0", - "tokio", - "tracing", "zeroize", ] [[package]] -name = "aws-endpoint" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cce1c41a6cfaa726adee9ebb9a56fcd2bbfd8be49fd8a04c5e20fd968330b04" -dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "aws-types", - "http 0.2.12", - "regex", - "tracing", -] - -[[package]] -name = "aws-http" -version = "0.55.3" +name = "aws-runtime" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aadbc44e7a8f3e71c8b374e03ecd972869eb91dd2bc89ed018954a52ba84bc44" +checksum = "2424565416eef55906f9f8cece2072b6b6a76075e3ff81483ebe938a89a4c05f" dependencies = [ "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", + "fastrand", "http 0.2.12", "http-body 0.4.6", - "lazy_static", + "once_cell", "percent-encoding", "pin-project-lite", "tracing", + "uuid", ] [[package]] name = "aws-sdk-sso" -version = "0.28.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8b812340d86d4a766b2ca73f740dfd47a97c2dff0c06c8517a16d88241957e4" +checksum = "11822090cf501c316c6f75711d77b96fba30658e3867a7762e5e2f5d32d31e81" dependencies = [ "aws-credential-types", - "aws-endpoint", - "aws-http", - "aws-sig-auth", + "aws-runtime", "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", - "aws-smithy-http-tower", "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", "http 0.2.12", - "regex", - "tokio-stream", - "tower", + "once_cell", + "regex-lite", "tracing", ] [[package]] -name = "aws-sdk-sts" -version = "0.28.0" +name = "aws-sdk-ssooidc" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "265fac131fbfc188e5c3d96652ea90ecc676a934e3174eaaee523c6cec040b3b" +checksum = "78a2a06ff89176123945d1bbe865603c4d7101bea216a550bb4d2e4e9ba74d74" dependencies = [ "aws-credential-types", - "aws-endpoint", - "aws-http", - "aws-sig-auth", + "aws-runtime", "aws-smithy-async", - "aws-smithy-client", "aws-smithy-http", - "aws-smithy-http-tower", "aws-smithy-json", - "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", "aws-smithy-types", - "aws-smithy-xml", "aws-types", "bytes", "http 0.2.12", - "regex", - "tower", + "once_cell", + "regex-lite", "tracing", ] [[package]] -name = "aws-sig-auth" -version = "0.55.3" +name = "aws-sdk-sts" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b94acb10af0c879ecd5c7bdf51cda6679a0a4f4643ce630905a77673bfa3c61" +checksum = "a20a91795850826a6f456f4a48eff1dfa59a0e69bdbf5b8c50518fd372106574" dependencies = [ "aws-credential-types", - "aws-sigv4", + "aws-runtime", + "aws-smithy-async", "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", "aws-types", "http 0.2.12", + "once_cell", + "regex-lite", "tracing", ] [[package]] name = "aws-sigv4" -version = "0.55.3" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d2ce6f507be68e968a33485ced670111d1cbad161ddbbab1e313c03d37d8f4c" +checksum = "5df1b0fa6be58efe9d4ccc257df0a53b89cd8909e86591a13ca54817c87517be" dependencies = [ + "aws-credential-types", "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", "form_urlencoded", "hex", "hmac", "http 0.2.12", + "http 1.1.0", "once_cell", "percent-encoding", - "regex", "sha2", "time", "tracing", @@ -634,53 +630,28 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "0.55.3" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bda3996044c202d75b91afeb11a9afae9db9a721c6a7a427410018e286b880" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" dependencies = [ "futures-util", "pin-project-lite", "tokio", - "tokio-stream", -] - -[[package]] -name = "aws-smithy-client" -version = "0.55.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a86aa6e21e86c4252ad6a0e3e74da9617295d8d6e374d552be7d3059c41cedd" -dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-http-tower", - "aws-smithy-types", - "bytes", - "fastrand 1.9.0", - "http 0.2.12", - "http-body 0.4.6", - "hyper 0.14.30", - "hyper-rustls 0.23.2", - "lazy_static", - "pin-project-lite", - "rustls 0.20.9", - "tokio", - "tower", - "tracing", ] [[package]] name = "aws-smithy-http" -version = "0.55.3" +version = "0.60.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b3b693869133551f135e1f2c77cb0b8277d9e3e17feaf2213f735857c4f0d28" +checksum = "01dbcb6e2588fd64cfb6d7529661b06466419e4c54ed1c62d6510d2d0350a728" dependencies = [ + "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "bytes-utils", "futures-core", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.30", "once_cell", "percent-encoding", "pin-project-lite", @@ -689,74 +660,113 @@ dependencies = [ ] [[package]] -name = "aws-smithy-http-tower" -version = "0.55.3" +name = "aws-smithy-json" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae4f6c5798a247fac98a867698197d9ac22643596dc3777f0c76b91917616b9" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" dependencies = [ - "aws-smithy-http", "aws-smithy-types", - "bytes", - "http 0.2.12", - "http-body 0.4.6", - "pin-project-lite", - "tower", - "tracing", ] [[package]] -name = "aws-smithy-json" -version = "0.55.3" +name = "aws-smithy-query" +version = "0.60.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23f9f42fbfa96d095194a632fbac19f60077748eba536eb0b9fecc28659807f8" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" dependencies = [ "aws-smithy-types", + "urlencoding", ] [[package]] -name = "aws-smithy-query" -version = "0.55.3" +name = "aws-smithy-runtime" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", + "http-body 1.0.1", + "httparse", + "hyper 0.14.30", + "hyper-rustls 0.24.2", + "once_cell", + "pin-project-lite", + "pin-utils", + "rustls 0.21.12", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98819eb0b04020a1c791903533b638534ae6c12e2aceda3e6e6fba015608d51d" +checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" dependencies = [ + "aws-smithy-async", "aws-smithy-types", - "urlencoding", + "bytes", + "http 0.2.12", + "http 1.1.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", ] [[package]] name = "aws-smithy-types" -version = "0.55.3" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16a3d0bf4f324f4ef9793b86a1701d9700fbcdbd12a846da45eed104c634c6e8" +checksum = "273dcdfd762fae3e1650b8024624e7cd50e484e37abdab73a7a706188ad34543" dependencies = [ "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.1.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", "itoa", "num-integer", + "pin-project-lite", + "pin-utils", "ryu", + "serde", "time", + "tokio", + "tokio-util", ] [[package]] name = "aws-smithy-xml" -version = "0.55.3" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1b9d12875731bd07e767be7baad95700c3137b56730ec9ddeedb52a5e5ca63b" +checksum = "d123fbc2a4adc3c301652ba8e149bf4bc1d1725affb9784eb20c953ace06bf55" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "0.55.3" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6dd209616cc8d7bfb82f87811a5c655dc97537f592689b18743bddf5dc5c4829" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" dependencies = [ "aws-credential-types", "aws-smithy-async", - "aws-smithy-client", - "aws-smithy-http", + "aws-smithy-runtime-api", "aws-smithy-types", - "http 0.2.12", "rustc_version", "tracing", ] @@ -924,9 +934,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.13" +version = "1.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" +checksum = "e9d013ecb737093c0e86b151a7b837993cf9ec6c502946cfb44bedc392421e0b" dependencies = [ "jobserver", "libc", @@ -976,9 +986,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.16" +version = "4.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" +checksum = "3e5a21b8495e732f1b3c364c9949b201ca7bae518c502c80256c96ad79eaf6ac" dependencies = [ "clap_builder", "clap_derive", @@ -986,9 +996,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.15" +version = "4.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" +checksum = "8cf2dd12af7a047ad9d6da2b6b249759a22a7abc0f474c1dae1777afa4b21a73" dependencies = [ "anstream", "anstyle", @@ -1005,7 +1015,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1064,9 +1074,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "core-foundation" @@ -1161,7 +1171,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1172,9 +1182,9 @@ checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" [[package]] name = "dashmap" -version = "6.0.1" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1212,7 +1222,6 @@ dependencies = [ "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", - "datafusion-physical-expr-functions-aggregate", "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-sql", @@ -1263,6 +1272,9 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", "clap", "ctor", "datafusion", @@ -1366,7 +1378,6 @@ version = "41.0.0" dependencies = [ "arrow", "arrow-buffer", - "arrow-data", "base64 0.22.1", "blake2", "blake3", @@ -1508,20 +1519,6 @@ dependencies = [ "rand", ] -[[package]] -name = "datafusion-physical-expr-functions-aggregate" -version = "41.0.0" -dependencies = [ - "ahash", - "arrow", - "datafusion-common", - "datafusion-expr", - "datafusion-expr-common", - "datafusion-functions-aggregate-common", - "datafusion-physical-expr-common", - "rand", -] - [[package]] name = "datafusion-physical-optimizer" version = "41.0.0" @@ -1553,7 +1550,6 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr", "datafusion-physical-expr-common", - "datafusion-physical-expr-functions-aggregate", "futures", "half", "hashbrown", @@ -1708,18 +1704,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" -dependencies = [ - "instant", -] - -[[package]] -name = "fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fd-lock" @@ -1750,9 +1737,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.32" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", "miniz_oxide 0.8.0", @@ -1838,7 +1825,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -2131,31 +2118,32 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.2" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ + "futures-util", "http 0.2.12", "hyper 0.14.30", "log", - "rustls 0.20.9", + "rustls 0.21.12", "rustls-native-certs 0.6.3", "tokio", - "tokio-rustls 0.23.4", + "tokio-rustls 0.24.1", ] [[package]] name = "hyper-rustls" -version = "0.27.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http 1.1.0", "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.7.2", + "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2217,9 +2205,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -2635,18 +2623,18 @@ dependencies = [ [[package]] name = "object" -version = "0.36.3" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.10.2" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" +checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" dependencies = [ "async-trait", "base64 0.22.1", @@ -2662,7 +2650,7 @@ dependencies = [ "quick-xml", "rand", "reqwest", - "ring 0.17.8", + "ring", "rustls-pemfile 2.1.3", "serde", "serde_json", @@ -2725,9 +2713,9 @@ dependencies = [ [[package]] name = "parquet" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "arrow-array", @@ -2845,7 +2833,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -2938,9 +2926,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.3" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" +checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" dependencies = [ "bytes", "pin-project-lite", @@ -2956,13 +2944,13 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" +checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" dependencies = [ "bytes", "rand", - "ring 0.17.8", + "ring", "rustc-hash", "rustls 0.23.12", "slab", @@ -2973,22 +2961,22 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" +checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b" dependencies = [ "libc", "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -3103,7 +3091,7 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.4.1", - "hyper-rustls 0.27.2", + "hyper-rustls 0.27.3", "hyper-util", "ipnet", "js-sys", @@ -3114,7 +3102,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.2", + "rustls-native-certs 0.7.3", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3133,21 +3121,6 @@ dependencies = [ "windows-registry", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.8" @@ -3158,8 +3131,8 @@ dependencies = [ "cfg-if", "getrandom", "libc", - "spin 0.9.8", - "untrusted 0.9.0", + "spin", + "untrusted", "windows-sys 0.52.0", ] @@ -3209,18 +3182,18 @@ checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f" dependencies = [ "bitflags 2.6.0", "errno", @@ -3231,14 +3204,14 @@ dependencies = [ [[package]] name = "rustls" -version = "0.20.9" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", - "ring 0.16.20", + "ring", + "rustls-webpki 0.101.7", "sct", - "webpki", ] [[package]] @@ -3248,9 +3221,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" dependencies = [ "once_cell", - "ring 0.17.8", + "ring", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.102.7", "subtle", "zeroize", ] @@ -3269,9 +3242,22 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" +checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.3", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -3307,13 +3293,23 @@ checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" -version = "0.102.6" +version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.8", + "ring", + "untrusted", +] + +[[package]] +name = "rustls-webpki" +version = "0.102.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56" +dependencies = [ + "ring", "rustls-pki-types", - "untrusted 0.9.0", + "untrusted", ] [[package]] @@ -3381,8 +3377,8 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -3422,29 +3418,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.208" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.208" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "serde_json" -version = "1.0.125" +version = "1.0.128" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" +checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" dependencies = [ "itoa", "memchr", @@ -3513,24 +3509,23 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.7.5" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +checksum = "2b835cb902660db3415a672d862905e791e54d306c6e8189168c7f3d9ae1c79d" dependencies = [ - "doc-comment", "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.7.5" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +checksum = "38d1e02fca405f6280643174a50c942219f0bbf4dbf7d480f1dd864d6f211ae5" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.77", ] [[package]] @@ -3549,12 +3544,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - [[package]] name = "spin" version = "0.9.8" @@ -3579,7 +3568,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3625,7 +3614,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3638,7 +3627,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3660,9 +3649,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.75" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -3685,7 +3674,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" dependencies = [ "cfg-if", - "fastrand 2.1.0", + "fastrand", "once_cell", "rustix", "windows-sys 0.59.0", @@ -3723,7 +3712,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3793,9 +3782,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.3" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" +checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" dependencies = [ "backtrace", "bytes", @@ -3817,18 +3806,17 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "tokio-rustls" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ - "rustls 0.20.9", + "rustls 0.21.12", "tokio", - "webpki", ] [[package]] @@ -3842,22 +3830,11 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-stream" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "267ac89e0bec6e691e5813911606935d77c476ff49024f98abcea3e7b15e37af" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-util" -version = "0.7.11" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +checksum = "61e7c3654c13bcd040d4a03abee2c75b1d14a37b423cf5a813ceae1cc903ec6a" dependencies = [ "bytes", "futures-core", @@ -3879,7 +3856,6 @@ dependencies = [ "tokio", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -3900,7 +3876,6 @@ version = "0.1.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ - "log", "pin-project-lite", "tracing-attributes", "tracing-core", @@ -3914,7 +3889,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3959,7 +3934,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -4001,12 +3976,6 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - [[package]] name = "untrusted" version = "0.9.0" @@ -4114,7 +4083,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "wasm-bindgen-shared", ] @@ -4148,7 +4117,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4182,16 +4151,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.22.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" -dependencies = [ - "ring 0.17.8", - "untrusted 0.9.0", -] - [[package]] name = "winapi" version = "0.3.9" @@ -4443,7 +4402,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -4491,9 +4450,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config",