GH-35627: [Format][Integration] Add string-view to arrow format (#37526)

String view (and equivalent non-utf8 binary view) is an alternative representation for variable length strings which offers greater efficiency for several common operations. This representation is in use by UmbraDB, DuckDB, and Velox. Where those databases use a raw pointer to out-of-line strings this PR uses a pair of 32 bit integers as a buffer index and offset, which - makes explicit the guarantee that lifetime of all character data is equal to that of the array which views it, which is critical for confident consumption across an interface boundary - makes the arrays meaningfully serializable and venue agnostic; directly usable in shared memory without modification - allows easy validation This PR is extracted from #35628 to unblock independent PRs now that the vote has passed, including: - New types added to Schema.fbs - Message.fbs amended to support variable buffer counts between string view chunks - datagen.py extended to produce integration JSON for string view arrays - Columnar.rst amended with a description of the string view format * Closes: #35627 Authored-by: Benjamin Kietzman <[email protected]> Signed-off-by: Benjamin Kietzman <[email protected]>
apache · Sep 21, 2023 · 9d6d501 · 9d6d501
1 parent 79e49db
commit 9d6d501
Show file tree

Hide file tree

Showing 4 changed files with 243 additions and 14 deletions.
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
@@ -665,6 +665,26 @@ def _get_type(self):
         return OrderedDict([('name', 'largeutf8')])
 
 
+class BinaryViewField(BinaryField):
+
+    @property
+    def column_class(self):
+        return BinaryViewColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'binaryview')])
+
+
+class StringViewField(StringField):
+
+    @property
+    def column_class(self):
+        return StringViewColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'utf8view')])
+
+
 class Schema(object):
 
     def __init__(self, fields, metadata=None):
@@ -744,6 +764,74 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
     pass
 
 
+class BinaryViewColumn(PrimitiveColumn):
+
+    def _encode_value(self, x):
+        return frombytes(binascii.hexlify(x).upper())
+
+    def _get_buffers(self):
+        views = []
+        data_buffers = []
+        # a small default data buffer size is used so we can exercise
+        # arrays with multiple data buffers with small data sets
+        DEFAULT_BUFFER_SIZE = 32
+        INLINE_SIZE = 12
+
+        for i, v in enumerate(self.values):
+            if not self.is_valid[i]:
+                v = b''
+            assert isinstance(v, bytes)
+
+            if len(v) <= INLINE_SIZE:
+                # Append an inline view, skip data buffer management.
+                views.append(OrderedDict([
+                    ('SIZE', len(v)),
+                    ('INLINED', self._encode_value(v)),
+                ]))
+                continue
+
+            if len(data_buffers) == 0:
+                # No data buffers have been added yet;
+                # add this string whole (we may append to it later).
+                offset = 0
+                data_buffers.append(v)
+            elif len(data_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE:
+                # Appending this string to the current active data buffer
+                # would overflow the default buffer size; add it whole.
+                offset = 0
+                data_buffers.append(v)
+            else:
+                # Append this string to the current active data buffer.
+                offset = len(data_buffers[-1])
+                data_buffers[-1] += v
+
+            # the prefix is always 4 bytes so it may not be utf-8
+            # even if the whole string view is
+            prefix = frombytes(binascii.hexlify(v[:4]).upper())
+
+            views.append(OrderedDict([
+                ('SIZE', len(v)),
+                ('PREFIX_HEX', prefix),
+                ('BUFFER_INDEX', len(data_buffers) - 1),
+                ('OFFSET', offset),
+            ]))
+
+        return [
+            ('VALIDITY', [int(x) for x in self.is_valid]),
+            ('VIEWS', views),
+            ('VARIADIC_DATA_BUFFERS', [
+                frombytes(binascii.hexlify(b).upper())
+                for b in data_buffers
+            ]),
+        ]
+
+
+class StringViewColumn(BinaryViewColumn):
+
+    def _encode_value(self, x):
+        return frombytes(x)
+
+
 class FixedSizeBinaryColumn(PrimitiveColumn):
 
     def _encode_value(self, x):
@@ -1568,6 +1656,15 @@ def generate_run_end_encoded_case():
     return _generate_file("run_end_encoded", fields, batch_sizes)
 
 
+def generate_binary_view_case():
+    fields = [
+        BinaryViewField('bv'),
+        StringViewField('sv'),
+    ]
+    batch_sizes = [0, 7, 256]
+    return _generate_file("binary_view", fields, batch_sizes)
+
+
 def generate_nested_large_offsets_case():
     fields = [
         LargeListField('large_list_nullable', get_field('item', 'int32')),
@@ -1763,6 +1860,14 @@ def _temp_path():
         .skip_tester('JS')
         .skip_tester('Rust'),
 
+        generate_binary_view_case()
+        .skip_tester('C++')
+        .skip_tester('C#')
+        .skip_tester('Go')
+        .skip_tester('Java')
+        .skip_tester('JS')
+        .skip_tester('Rust'),
+
         generate_extension_case()
         .skip_tester('C#')
         # TODO: ensure the extension is registered in the C++ entrypoint

diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst
@@ -21,7 +21,7 @@
 Arrow Columnar Format
 *********************
 
-*Version: 1.3*
+*Version: 1.4*
 
 The "Arrow Columnar Format" includes a language-agnostic in-memory
 data structure specification, metadata serialization, and a protocol
@@ -108,6 +108,10 @@ the different physical layouts defined by Arrow:
 * **Variable-size Binary**: a sequence of values each having a variable
   byte length. Two variants of this layout are supported using 32-bit
   and 64-bit length encoding.
+* **Views of Variable-size Binary**: a sequence of values each having a
+  variable byte length. In contrast to Variable-size Binary, the values
+  of this layout are distributed across potentially multiple buffers
+  instead of densely and sequentially packed in a single buffer.
 * **Fixed-size List**: a nested layout where each value has the same
   number of elements taken from a child data type.
 * **Variable-size List**: a nested layout where each value is a
@@ -350,6 +354,51 @@ will be represented as follows: ::
     |----------------|-----------------------|
     | joemark        | unspecified (padding) |
 
+Variable-size Binary View Layout
+--------------------------------
+
+.. versionadded:: Arrow Columnar Format 1.4
+
+Each value in this layout consists of 0 or more bytes. These bytes'
+locations are indicated using a **views** buffer, which may point to one
+of potentially several **data** buffers or may contain the characters
+inline.
+
+The views buffer contains `length` view structures with the following layout:
+
+::
+
+    * Short strings, length <= 12
+      | Bytes 0-3  | Bytes 4-15                            |
+      |------------|---------------------------------------|
+      | length     | data (padded with 0)                  |
+
+    * Long strings, length > 12
+      | Bytes 0-3  | Bytes 4-7  | Bytes 8-11 | Bytes 12-15 |
+      |------------|------------|------------|-------------|
+      | length     | prefix     | buf. index | offset      |
+
+In both the long and short string cases, the first four bytes encode the
+length of the string and can be used to determine how the rest of the view
+should be interpreted.
+
+In the short string case the string's bytes are inlined- stored inside the
+view itself, in the twelve bytes which follow the length.
+
+In the long string case, a buffer index indicates which data buffer
+stores the data bytes and an offset indicates where in that buffer the
+data bytes begin. Buffer index 0 refers to the first data buffer, IE
+the first buffer **after** the validity buffer and the views buffer.
+The half-open range ``[offset, offset + length)`` must be entirely contained
+within the indicated buffer. A copy of the first four bytes of the string is
+stored inline in the prefix, after the length. This prefix enables a
+profitable fast path for string comparisons, which are frequently determined
+within the first four bytes.
+
+All integers (length, buffer index, and offset) are signed.
+
+This layout is adapted from TU Munich's `UmbraDB`_.
+
 .. _variable-size-list-layout:
 
 Variable-size List Layout
@@ -880,19 +929,20 @@ For the avoidance of ambiguity, we provide listing the order and type
 of memory buffers for each layout.
 
 .. csv-table:: Buffer Layouts
-   :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2"
-   :widths: 30, 20, 20, 20
-
-   "Primitive",validity,data,
-   "Variable Binary",validity,offsets,data
-   "List",validity,offsets,
-   "Fixed-size List",validity,,
-   "Struct",validity,,
-   "Sparse Union",type ids,,
-   "Dense Union",type ids,offsets,
-   "Null",,,
-   "Dictionary-encoded",validity,data (indices),
-   "Run-end encoded",,,
+   :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2", "Variadic Buffers"
+   :widths: 30, 20, 20, 20, 20
+
+   "Primitive",validity,data,,
+   "Variable Binary",validity,offsets,data,
+   "Variable Binary View",validity,views,,data
+   "List",validity,offsets,,
+   "Fixed-size List",validity,,,
+   "Struct",validity,,,
+   "Sparse Union",type ids,,,
+   "Dense Union",type ids,offsets,,
+   "Null",,,,
+   "Dictionary-encoded",validity,data (indices),,
+   "Run-end encoded",,,,
 
 Logical Types
 =============
@@ -1071,6 +1121,39 @@ bytes. Since this metadata can be used to communicate in-memory pointer
 addresses between libraries, it is recommended to set ``size`` to the actual
 memory size rather than the padded size.
 
+Variadic buffers
+^^^^^^^^^^^^^^^^
+
+Some types such as Utf8View are represented using a variable number of buffers.
+For each such Field in the pre-ordered flattened logical schema, there will be
+an entry in ``variadicBufferCounts`` to indicate the number of variadic buffers
+which belong to that Field in the current RecordBatch.
+
+For example, consider the schema ::
+
+    col1: Struct<a: Int32, b: BinaryView, c: Float64>
+    col2: Utf8View
+
+This has two fields with variadic buffers, so ``variadicBufferCounts`` will
+have two entries in each RecordBatch. For a RecordBatch of this schema with
+``variadicBufferCounts = [3, 2]``, the flattened buffers would be::
+
+    buffer 0:  col1    validity
+    buffer 1:  col1.a  validity
+    buffer 2:  col1.a  values
+    buffer 3:  col1.b  validity
+    buffer 4:  col1.b  views
+    buffer 5:  col1.b  data
+    buffer 6:  col1.b  data
+    buffer 7:  col1.b  data
+    buffer 8:  col1.c  validity
+    buffer 9:  col1.c  values
+    buffer 10: col2    validity
+    buffer 11: col2    views
+    buffer 12: col2    data
+    buffer 13: col2    data
+
+
 Byte Order (`Endianness`_)
 ---------------------------
 
@@ -1346,3 +1429,4 @@ the Arrow spec.
 .. _Endianness: https://en.wikipedia.org/wiki/Endianness
 .. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates
 .. _Parquet: https://parquet.apache.org/docs/
+.. _UmbraDB: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
diff --git a/format/Message.fbs b/format/Message.fbs
@@ -99,6 +99,22 @@ table RecordBatch {
 
   /// Optional compression of the message body
   compression: BodyCompression;
+
+  /// Some types such as Utf8View are represented using a variable number of buffers.
+  /// For each such Field in the pre-ordered flattened logical schema, there will be
+  /// an entry in variadicBufferCounts to indicate the number of number of variadic
+  /// buffers which belong to that Field in the current RecordBatch.
+  ///
+  /// For example, the schema
+  ///     col1: Struct<a: Int32, b: BinaryView, c: Float64>
+  ///     col2: Utf8View
+  /// contains two Fields with variadic buffers so variadicBufferCounts will have
+  /// two entries, the first counting the variadic buffers of `col1.b` and the
+  /// second counting `col2`'s.
+  ///
+  /// This field may be omitted if and only if the schema contains no Fields with
+  /// a variable number of buffers, such as BinaryView and Utf8View.
+  variadicBufferCounts: [long];
 }
 
 /// For sending dictionary encoding information. Any Field can be

diff --git a/format/Schema.fbs b/format/Schema.fbs
@@ -22,6 +22,7 @@
 /// Version 1.1 - Add Decimal256.
 /// Version 1.2 - Add Interval MONTH_DAY_NANO.
 /// Version 1.3 - Add Run-End Encoded.
+/// Version 1.4 - Add BinaryView, Utf8View, and variadicBufferCounts.
 
 namespace org.apache.arrow.flatbuf;
 
@@ -171,6 +172,27 @@ table LargeUtf8 {
 table LargeBinary {
 }
 
+/// Logically the same as Utf8, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table Utf8View {
+}
+
+/// Logically the same as Binary, but the internal representation uses a header
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table BinaryView {
+}
+
+
 table FixedSizeBinary {
   /// Number of bytes per value
   byteWidth: int;
@@ -427,6 +449,8 @@ union Type {
   LargeUtf8,
   LargeList,
   RunEndEncoded,
+  BinaryView,
+  Utf8View,
 }
 
 /// ----------------------------------------------------------------------