From 82a40f3edf264834c28f3a1bf2a3143951802634 Mon Sep 17 00:00:00 2001
From: wiedld <wiedld@users.noreply.github.com>
Date: Wed, 18 Dec 2024 03:11:40 -0800
Subject: [PATCH] Handle possible overflows in StringArrayBuilder /
 LargeStringArrayBuilder (#13802)

* test(13796): reproducer of overflow on capacity

* fix(13796): handle overflows with proper max capacity number which is valid for MutableBuffer

* refactor: use simple solution and provide panic
---
 datafusion/functions/src/strings.rs | 37 ++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs
index caafbae6ba5f..a6587a91a9fe 100644
--- a/datafusion/functions/src/strings.rs
+++ b/datafusion/functions/src/strings.rs
@@ -124,8 +124,12 @@ pub struct StringArrayBuilder {
 
 impl StringArrayBuilder {
     pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        let mut offsets_buffer =
-            MutableBuffer::with_capacity((item_capacity + 1) * size_of::<i32>());
+        let capacity = item_capacity
+            .checked_add(1)
+            .map(|i| i.saturating_mul(size_of::<i32>()))
+            .expect("capacity integer overflow");
+
+        let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
         // SAFETY: the first offset value is definitely not going to exceed the bounds.
         unsafe { offsets_buffer.push_unchecked(0_i32) };
         Self {
@@ -182,7 +186,7 @@ impl StringArrayBuilder {
             .len()
             .try_into()
             .expect("byte array offset overflow");
-        unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+        self.offsets_buffer.push(next_offset);
     }
 
     /// Finalise the builder into a concrete [`StringArray`].
@@ -289,8 +293,12 @@ pub struct LargeStringArrayBuilder {
 
 impl LargeStringArrayBuilder {
     pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
-        let mut offsets_buffer =
-            MutableBuffer::with_capacity((item_capacity + 1) * size_of::<i64>());
+        let capacity = item_capacity
+            .checked_add(1)
+            .map(|i| i.saturating_mul(size_of::<i64>()))
+            .expect("capacity integer overflow");
+
+        let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
         // SAFETY: the first offset value is definitely not going to exceed the bounds.
         unsafe { offsets_buffer.push_unchecked(0_i64) };
         Self {
@@ -347,7 +355,7 @@ impl LargeStringArrayBuilder {
             .len()
             .try_into()
             .expect("byte array offset overflow");
-        unsafe { self.offsets_buffer.push_unchecked(next_offset) };
+        self.offsets_buffer.push(next_offset);
     }
 
     /// Finalise the builder into a concrete [`LargeStringArray`].
@@ -452,3 +460,20 @@ impl ColumnarValueRef<'_> {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    #[should_panic(expected = "capacity integer overflow")]
+    fn test_overflow_string_array_builder() {
+        let _builder = StringArrayBuilder::with_capacity(usize::MAX, usize::MAX);
+    }
+
+    #[test]
+    #[should_panic(expected = "capacity integer overflow")]
+    fn test_overflow_large_string_array_builder() {
+        let _builder = LargeStringArrayBuilder::with_capacity(usize::MAX, usize::MAX);
+    }
+}