Skip to content

Commit

Permalink
Fix ChunkedArray find_chunk_idx for empty chunks (#802)
Browse files Browse the repository at this point in the history
  • Loading branch information
robert3005 authored Sep 13, 2024
1 parent 1a384b3 commit 750b9c8
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 83 deletions.
30 changes: 3 additions & 27 deletions vortex-array/src/array/chunked/compute/mod.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
use vortex_dtype::{DType, Nullability};
use vortex_error::{vortex_err, vortex_panic, VortexResult};
use vortex_scalar::Scalar;
use vortex_error::VortexResult;

use crate::array::chunked::ChunkedArray;
use crate::compute::unary::{
scalar_at, scalar_at_unchecked, try_cast, CastFn, ScalarAtFn, SubtractScalarFn,
};
use crate::compute::unary::{try_cast, CastFn, ScalarAtFn, SubtractScalarFn};
use crate::compute::{compare, slice, ArrayCompute, CompareFn, Operator, SliceFn, TakeFn};
use crate::{Array, IntoArray};

mod scalar_at;
mod slice;
mod take;

Expand Down Expand Up @@ -38,28 +36,6 @@ impl ArrayCompute for ChunkedArray {
}
}

impl ScalarAtFn for ChunkedArray {
fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
let (chunk_index, chunk_offset) = self.find_chunk_idx(index);
scalar_at(
&self
.chunk(chunk_index)
.ok_or_else(|| vortex_err!(OutOfBounds: chunk_index, 0, self.nchunks()))?,
chunk_offset,
)
}

fn scalar_at_unchecked(&self, index: usize) -> Scalar {
let (chunk_index, chunk_offset) = self.find_chunk_idx(index);
scalar_at_unchecked(
&self
.chunk(chunk_index)
.unwrap_or_else(|| vortex_panic!(OutOfBounds: chunk_index, 0, self.nchunks())),
chunk_offset,
)
}
}

impl CastFn for ChunkedArray {
fn cast(&self, dtype: &DType) -> VortexResult<Array> {
let mut cast_chunks = Vec::new();
Expand Down
89 changes: 89 additions & 0 deletions vortex-array/src/array/chunked/compute/scalar_at.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
use vortex_error::{vortex_err, vortex_panic, VortexResult};
use vortex_scalar::Scalar;

use crate::array::ChunkedArray;
use crate::compute::unary::{scalar_at, scalar_at_unchecked, ScalarAtFn};

impl ScalarAtFn for ChunkedArray {
fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
let (chunk_index, chunk_offset) = self.find_chunk_idx(index);
scalar_at(
&self
.chunk(chunk_index)
.ok_or_else(|| vortex_err!(OutOfBounds: chunk_index, 0, self.nchunks()))?,
chunk_offset,
)
}

fn scalar_at_unchecked(&self, index: usize) -> Scalar {
let (chunk_index, chunk_offset) = self.find_chunk_idx(index);
scalar_at_unchecked(
&self
.chunk(chunk_index)
.unwrap_or_else(|| vortex_panic!(OutOfBounds: chunk_index, 0, self.nchunks())),
chunk_offset,
)
}
}

#[cfg(test)]
mod tests {
use vortex_dtype::{DType, Nullability, PType};

use crate::array::{ChunkedArray, PrimitiveArray};
use crate::compute::unary::scalar_at;
use crate::IntoArray;

#[test]
fn empty_children_both_sides() {
let array = ChunkedArray::try_new(
vec![
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(vec![1u64, 2]).into_array(),
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
],
DType::Primitive(PType::U64, Nullability::NonNullable),
)
.unwrap();
assert_eq!(scalar_at(array.array(), 0).unwrap(), 1u64.into());
assert_eq!(scalar_at(array.array(), 1).unwrap(), 2u64.into());
}

#[test]
fn empty_children_trailing() {
let array = ChunkedArray::try_new(
vec![
PrimitiveArray::from(vec![1u64, 2]).into_array(),
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(vec![3u64, 4]).into_array(),
],
DType::Primitive(PType::U64, Nullability::NonNullable),
)
.unwrap();
assert_eq!(scalar_at(array.array(), 0).unwrap(), 1u64.into());
assert_eq!(scalar_at(array.array(), 1).unwrap(), 2u64.into());
assert_eq!(scalar_at(array.array(), 2).unwrap(), 3u64.into());
assert_eq!(scalar_at(array.array(), 3).unwrap(), 4u64.into());
}

#[test]
fn empty_children_leading() {
let array = ChunkedArray::try_new(
vec![
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(Vec::<u64>::new()).into_array(),
PrimitiveArray::from(vec![1u64, 2]).into_array(),
PrimitiveArray::from(vec![3u64, 4]).into_array(),
],
DType::Primitive(PType::U64, Nullability::NonNullable),
)
.unwrap();
assert_eq!(scalar_at(array.array(), 0).unwrap(), 1u64.into());
assert_eq!(scalar_at(array.array(), 1).unwrap(), 2u64.into());
assert_eq!(scalar_at(array.array(), 2).unwrap(), 3u64.into());
assert_eq!(scalar_at(array.array(), 3).unwrap(), 4u64.into());
}
}
64 changes: 64 additions & 0 deletions vortex-array/src/array/chunked/compute/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,67 @@ impl SliceFn for ChunkedArray {
Self::try_new(chunks, self.dtype().clone()).map(|a| a.into_array())
}
}

#[cfg(test)]
mod tests {
use vortex_dtype::{DType, NativePType, Nullability, PType};

use crate::array::ChunkedArray;
use crate::compute::slice;
use crate::{Array, IntoArray, IntoArrayVariant};

fn chunked_array() -> ChunkedArray {
ChunkedArray::try_new(
vec![
vec![1u64, 2, 3].into_array(),
vec![4u64, 5, 6].into_array(),
vec![7u64, 8, 9].into_array(),
],
DType::Primitive(PType::U64, Nullability::NonNullable),
)
.unwrap()
}

fn assert_equal_slices<T: NativePType>(arr: Array, slice: &[T]) {
let mut values = Vec::with_capacity(arr.len());
ChunkedArray::try_from(arr)
.unwrap()
.chunks()
.map(|a| a.into_primitive().unwrap())
.for_each(|a| values.extend_from_slice(a.maybe_null_slice::<T>()));
assert_eq!(values, slice);
}

#[test]
pub fn slice_middle() {
assert_equal_slices(slice(chunked_array().array(), 2, 5).unwrap(), &[3u64, 4, 5])
}

#[test]
pub fn slice_begin() {
assert_equal_slices(slice(chunked_array().array(), 1, 3).unwrap(), &[2u64, 3]);
}

#[test]
pub fn slice_aligned() {
assert_equal_slices(slice(chunked_array().array(), 3, 6).unwrap(), &[4u64, 5, 6]);
}

#[test]
pub fn slice_many_aligned() {
assert_equal_slices(
slice(chunked_array().array(), 0, 6).unwrap(),
&[1u64, 2, 3, 4, 5, 6],
);
}

#[test]
pub fn slice_end() {
assert_equal_slices(slice(chunked_array().array(), 7, 8).unwrap(), &[8u64]);
}

#[test]
pub fn slice_exactly_end() {
assert_equal_slices(slice(chunked_array().array(), 6, 9).unwrap(), &[7u64, 8, 9]);
}
}
4 changes: 2 additions & 2 deletions vortex-array/src/array/chunked/compute/take.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ fn take_strict_sorted(chunked: &ChunkedArray, indices: &Array) -> VortexResult<A
mod test {
use crate::array::chunked::ChunkedArray;
use crate::compute::take;
use crate::{ArrayDType, AsArray, IntoArray, IntoArrayVariant};
use crate::{ArrayDType, IntoArray, IntoArrayVariant};

#[test]
fn test_take() {
Expand All @@ -137,7 +137,7 @@ mod test {
assert_eq!(arr.len(), 9);
let indices = vec![0u64, 0, 6, 4].into_array();

let result = &ChunkedArray::try_from(take(arr.as_array_ref(), &indices).unwrap())
let result = &ChunkedArray::try_from(take(arr.array(), &indices).unwrap())
.unwrap()
.into_array()
.into_primitive()
Expand Down
53 changes: 6 additions & 47 deletions vortex-array/src/array/chunked/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,12 @@ impl ChunkedArray {
fn find_chunk_idx(&self, index: usize) -> (usize, usize) {
assert!(index <= self.len(), "Index out of bounds of the array");

let index_chunk = search_sorted(&self.chunk_offsets(), index, SearchSortedSide::Left)
// Since there might be duplicate values in offsets because of empty chunks we want to search from right
// and take the last chunk (we subtract 1 since there's a leading 0)
let index_chunk = search_sorted(&self.chunk_offsets(), index, SearchSortedSide::Right)
.vortex_expect("Search sorted failed in find_chunk_idx")
.to_offset_ends_index(self.nchunks());
.to_ends_index(self.nchunks() + 1)
.saturating_sub(1);
let chunk_start = scalar_at(&self.chunk_offsets(), index_chunk)
.and_then(|s| usize::try_from(&s))
.vortex_expect("Failed to find chunk start in find_chunk_idx");
Expand Down Expand Up @@ -221,11 +224,10 @@ impl SubtractScalarFn for ChunkedArray {

#[cfg(test)]
mod test {
use vortex_dtype::{DType, NativePType, Nullability, PType};
use vortex_dtype::{DType, Nullability, PType};
use vortex_error::VortexResult;

use crate::array::chunked::ChunkedArray;
use crate::compute::slice;
use crate::compute::unary::{scalar_at, subtract_scalar};
use crate::{assert_arrays_eq, Array, ArrayDType, IntoArray, IntoArrayVariant, ToArray};

Expand All @@ -241,49 +243,6 @@ mod test {
.unwrap()
}

fn assert_equal_slices<T: NativePType>(arr: Array, slice: &[T]) {
let mut values = Vec::with_capacity(arr.len());
ChunkedArray::try_from(arr)
.unwrap()
.chunks()
.map(|a| a.into_primitive().unwrap())
.for_each(|a| values.extend_from_slice(a.maybe_null_slice::<T>()));
assert_eq!(values, slice);
}

#[test]
pub fn slice_middle() {
assert_equal_slices(slice(chunked_array().array(), 2, 5).unwrap(), &[3u64, 4, 5])
}

#[test]
pub fn slice_begin() {
assert_equal_slices(slice(chunked_array().array(), 1, 3).unwrap(), &[2u64, 3]);
}

#[test]
pub fn slice_aligned() {
assert_equal_slices(slice(chunked_array().array(), 3, 6).unwrap(), &[4u64, 5, 6]);
}

#[test]
pub fn slice_many_aligned() {
assert_equal_slices(
slice(chunked_array().array(), 0, 6).unwrap(),
&[1u64, 2, 3, 4, 5, 6],
);
}

#[test]
pub fn slice_end() {
assert_equal_slices(slice(chunked_array().array(), 7, 8).unwrap(), &[8u64]);
}

#[test]
pub fn slice_exactly_end() {
assert_equal_slices(slice(chunked_array().array(), 6, 9).unwrap(), &[7u64, 8, 9]);
}

#[test]
fn test_scalar_subtract() {
let chunked = chunked_array();
Expand Down
2 changes: 1 addition & 1 deletion vortex-array/src/array/sparse/compute/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ impl ScalarAtFn for SparseArray {
impl SearchSortedFn for SparseArray {
fn search_sorted(&self, value: &Scalar, side: SearchSortedSide) -> VortexResult<SearchResult> {
search_sorted(&self.values(), value.clone(), side).and_then(|sr| {
let sidx = sr.to_offset_ends_index(self.metadata().indices_len);
let sidx = sr.to_offsets_index(self.metadata().indices_len);
let index: usize = scalar_at(&self.indices(), sidx)?.as_ref().try_into()?;
Ok(match sr {
SearchResult::Found(i) => SearchResult::Found(
Expand Down
7 changes: 3 additions & 4 deletions vortex-array/src/compute/search_sorted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,11 @@ impl SearchResult {
}
}

/// Convert search result into an index suitable for searching array of end indices with explicit 0 offset entry,
/// i.e. first element starts at 0.
/// Convert search result into an index suitable for searching array of offset indices, i.e. first element starts at 0.
///
/// For example for a ChunkedArray with chunk ends array [0, 3, 8, 10] you can use this method to
/// For example for a ChunkedArray with chunk offsets array [0, 3, 8, 10] you can use this method to
/// obtain index suitable for indexing into it after performing a search
pub fn to_offset_ends_index(self, len: usize) -> usize {
pub fn to_offsets_index(self, len: usize) -> usize {
match self {
SearchResult::Found(i) => {
if i == len {
Expand Down
5 changes: 3 additions & 2 deletions vortex-serde/src/chunked_reader/take_rows.rs
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,9 @@ fn find_chunks(row_offsets: &Array, indices: &Array) -> VortexResult<Vec<ChunkIn
let mut chunks = HashMap::new();

for (pos, idx) in indices.maybe_null_slice::<u64>().iter().enumerate() {
let chunk_idx = search_sorted(row_offsets.array(), *idx, SearchSortedSide::Left)?
.to_offset_ends_index(row_offsets.len());
let chunk_idx = search_sorted(row_offsets.array(), *idx, SearchSortedSide::Right)?
.to_ends_index(row_offsets.len())
.saturating_sub(1);
chunks
.entry(chunk_idx as u32)
.and_modify(|chunk_indices: &mut ChunkIndices| {
Expand Down

0 comments on commit 750b9c8

Please sign in to comment.