From 18e3113443bf4b35e220d4cd0e1ea6813719c01e Mon Sep 17 00:00:00 2001
From: ShiKaiWi
Date: Fri, 20 Sep 2024 23:48:13 +0800
Subject: [PATCH 1/2] Use `Arc<[Buffer]>` instead of raw `Vec<Buffer>` in `GenericByteViewArray` for faster `slice`

---
 arrow-array/src/array/byte_view_array.rs | 28 +++++++++++++++---------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index a155b6ab22e2..f9ce4d0d8772 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -114,7 +114,7 @@ use super::ByteArrayType;
 pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
     data_type: DataType,
     views: ScalarBuffer<u128>,
-    buffers: Vec<Buffer>,
+    buffers: Arc<[Buffer]>,
     phantom: PhantomData<T>,
     nulls: Option<NullBuffer>,
 }
@@ -178,7 +178,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         Ok(Self {
             data_type: T::DATA_TYPE,
             views,
-            buffers,
+            buffers: buffers.into(),
             nulls,
             phantom: Default::default(),
         })
@@ -191,14 +191,14 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     /// Safe if [`Self::try_new`] would not error
     pub unsafe fn new_unchecked(
         views: ScalarBuffer<u128>,
-        buffers: Vec<Buffer>,
+        buffers: impl Into<Arc<[Buffer]>>,
         nulls: Option<NullBuffer>,
     ) -> Self {
         Self {
             data_type: T::DATA_TYPE,
             phantom: Default::default(),
             views,
-            buffers,
+            buffers: buffers.into(),
             nulls,
         }
     }
@@ -208,7 +208,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         Self {
             data_type: T::DATA_TYPE,
             views: vec![0; len].into(),
-            buffers: vec![],
+            buffers: vec![].into(),
             nulls: Some(NullBuffer::new_null(len)),
             phantom: Default::default(),
         }
@@ -234,7 +234,7 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     }
 
     /// Deconstruct this array into its constituent parts
-    pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
+    pub fn into_parts(self) -> (ScalarBuffer<u128>, Arc<[Buffer]>, Option<NullBuffer>) {
         (self.views, self.buffers, self.nulls)
     }
 
@@ -516,7 +516,7 @@ impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
         Self {
             data_type: T::DATA_TYPE,
             views,
-            buffers,
+            buffers: buffers.into(),
             nulls: value.nulls().cloned(),
             phantom: Default::default(),
         }
@@ -569,12 +569,20 @@ where
 }
 
 impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
-    fn from(mut array: GenericByteViewArray<T>) -> Self {
+    fn from(array: GenericByteViewArray<T>) -> Self {
         let len = array.len();
-        array.buffers.insert(0, array.views.into_inner());
+        let new_buffers = {
+            let mut buffers = Vec::with_capacity(array.buffers.len() + 1);
+            buffers.push(array.views.into_inner());
+            for buffer in array.buffers.iter() {
+                buffers.push(buffer.clone());
+            }
+            buffers
+        };
+
         let builder = ArrayDataBuilder::new(T::DATA_TYPE)
             .len(len)
-            .buffers(array.buffers)
+            .buffers(new_buffers)
             .nulls(array.nulls);
 
         unsafe { builder.build_unchecked() }

From 33eae4bb0b4ee83530eeec5a5e14adf376fc936f Mon Sep 17 00:00:00 2001
From: ShiKaiWi
Date: Sat, 21 Sep 2024 00:30:04 +0800
Subject: [PATCH 2/2] add benchmark case about view array slice

---
 arrow-array/Cargo.toml | 2 +-
 arrow-array/benches/{gc_view_types.rs => view_types.rs} | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 rename arrow-array/benches/{gc_view_types.rs => view_types.rs} (91%)

diff --git a/arrow-array/Cargo.toml b/arrow-array/Cargo.toml
index 6170db85dc66..c6c9dfb967cd 100644
--- a/arrow-array/Cargo.toml
+++ b/arrow-array/Cargo.toml
@@ -65,7 +65,7 @@ name = "occupancy"
 harness = false
 
 [[bench]]
-name = "gc_view_types"
+name = "view_types"
 harness = false
 
 [[bench]]
diff --git a/arrow-array/benches/gc_view_types.rs b/arrow-array/benches/view_types.rs
similarity index 91%
rename from arrow-array/benches/gc_view_types.rs
rename to arrow-array/benches/view_types.rs
index 4b74a8f60b06..929a97551632 100644
--- a/arrow-array/benches/gc_view_types.rs
+++ b/arrow-array/benches/view_types.rs
@@ -42,6 +42,12 @@ fn criterion_benchmark(c: &mut Criterion) {
             black_box(sliced.gc());
         });
     });
+
+    c.bench_function("view types slice", |b| {
+        b.iter(|| {
+            black_box(array.slice(0, 100_000 / 2));
+        });
+    });
 }
 
 criterion_group!(benches, criterion_benchmark);