Skip to content

Commit

Permalink
Minor API adjustments for StringViewBuilder (#6047)
Browse files Browse the repository at this point in the history
* minor update

* add memory accounting

* Update arrow-buffer/src/builder/null.rs

Co-authored-by: Andrew Lamb <[email protected]>

* Update arrow-array/src/builder/generic_bytes_view_builder.rs

Co-authored-by: Andrew Lamb <[email protected]>

* update comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
XiangpengHao and alamb authored Jul 15, 2024
1 parent 199ce91 commit 9acc9fa
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 3 deletions.
6 changes: 4 additions & 2 deletions arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -324,9 +324,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
/// Note that it will copy the array regardless of whether the original array is compact.
/// Use with caution, as this can be an expensive operation; only use it when you are sure that the view
/// array is significantly smaller than when it was originally created, e.g., after filtering or slicing.
///
/// Note: this function does not attempt to canonicalize / deduplicate values. For this
/// feature see [`GenericByteViewBuilder::with_deduplicate_strings`].
pub fn gc(&self) -> Self {
let mut builder =
GenericByteViewBuilder::<T>::with_capacity(self.len()).with_deduplicate_strings();
let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());

for v in self.iter() {
builder.append_option(v);
Expand Down
16 changes: 15 additions & 1 deletion arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {

/// Returns the value at the given index
/// Useful if we want to know what value has been inserted into the builder
fn get_value(&self, index: usize) -> &[u8] {
/// The index has to be smaller than `self.len()`, otherwise it will panic
pub fn get_value(&self, index: usize) -> &[u8] {
let view = self.views_builder.as_slice().get(index).unwrap();
let len = *view as u32;
if len <= 12 {
Expand Down Expand Up @@ -337,6 +338,19 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
pub fn validity_slice(&self) -> Option<&[u8]> {
    // Pure delegation: whatever the null buffer builder's `as_slice()` yields
    // (including `None`) is handed back unchanged.
    let nulls = &self.null_buffer_builder;
    nulls.as_slice()
}

/// Reports the number of bytes currently allocated by this builder, for memory accounting.
///
/// The total is the sum of:
/// - the views buffer capacity, counted as one `u128` per slot,
/// - whatever the null buffer builder reports via its own `allocated_size()`,
/// - the capacity of every buffer in `completed`,
/// - the capacity of the `in_progress` buffer,
/// - the capacity of the optional `string_tracker` table, counted as one `usize` per slot.
pub fn allocated_size(&self) -> usize {
    let view_bytes = std::mem::size_of::<u128>() * self.views_builder.capacity();
    let validity_bytes = self.null_buffer_builder.allocated_size();
    let completed_bytes: usize = self.completed.iter().map(|buf| buf.capacity()).sum();
    let in_progress_bytes = self.in_progress.capacity();
    // The tracker only exists when one has been configured; otherwise it costs nothing.
    let tracker_bytes = self
        .string_tracker
        .as_ref()
        .map_or(0, |(table, _)| std::mem::size_of::<usize>() * table.capacity());
    completed_bytes + in_progress_bytes + tracker_bytes + view_bytes + validity_bytes
}
}

impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
Expand Down
8 changes: 8 additions & 0 deletions arrow-buffer/src/builder/null.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,14 @@ impl NullBufferBuilder {
pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> {
    // Without a bitmap builder there are no bytes to expose, so bail out with `None`.
    let bitmap = self.bitmap_builder.as_mut()?;
    Some(bitmap.as_slice_mut())
}

/// Return the allocated size of this builder, in bytes, useful for memory accounting.
///
/// Returns `0` when no bitmap builder is present.
pub fn allocated_size(&self) -> usize {
    // NOTE(review): assumes `bitmap_builder` is a `BooleanBufferBuilder`, whose
    // `capacity()` is expressed in bits (byte capacity * 8) — confirm against the
    // field's declaration. Dividing by 8 converts that back to the bytes this
    // method documents.
    self.bitmap_builder
        .as_ref()
        .map_or(0, |b| b.capacity() / 8)
}
}

impl NullBufferBuilder {
Expand Down

0 comments on commit 9acc9fa

Please sign in to comment.