Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: coalesce multiple reads together and don't block on io if there's values available #1466

Merged
merged 8 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions vortex-file/src/read/buffered.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
use std::collections::VecDeque;
use std::io;
use std::io::ErrorKind;
use std::pin::Pin;
use std::sync::{Arc, RwLock};
use std::task::{Context, Poll, Waker};

use futures::Stream;
use futures_util::future::BoxFuture;
use futures_util::{FutureExt, StreamExt};
use vortex_array::ArrayData;
use vortex_error::{vortex_err, vortex_panic, VortexExpect, VortexResult};
use vortex_io::{Dispatch, IoDispatcher, VortexReadAt, VortexReadRanges};

use crate::{LayoutMessageCache, LayoutReader, Message, MessageLocator, MessageRead, RowMask};

/// Maximum number of row-mask reads coalesced into a single dispatched I/O request.
const NUM_TO_COALESCE: usize = 8;

pub trait RowMaskReader<V> {
Copy link
Contributor

@a10y a10y Dec 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all of the types in this file should probably either be non-pub or at most pub(crate) to avoid leaking impl things into public API, with some more imminent usage

fn read_mask(&self, mask: &RowMask) -> VortexResult<Option<MessageRead<V>>>;
}

/// [`RowMaskReader`] adapter that delegates mask reads to a boxed [`LayoutReader`].
pub struct MaskLayoutReader {
    layout: Box<dyn LayoutReader>,
}

impl MaskLayoutReader {
    /// Wrap `layout` so it can serve reads for individual row masks.
    pub fn new(layout: Box<dyn LayoutReader>) -> Self {
        MaskLayoutReader { layout }
    }
}

impl RowMaskReader<ArrayData> for MaskLayoutReader {
    /// Read the given mask by delegating to the wrapped layout's selection logic.
    fn read_mask(&self, mask: &RowMask) -> VortexResult<Option<MessageRead<ArrayData>>> {
        self.layout.read_selection(mask)
    }
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can kill this and just replace it with a blanket impl

impl<T: LayoutReader> RowMaskReader<ArrayData> for T {
    fn read_mask(&self, mask: &RowMask) -> VortexResult<Option<MessageRead<ArrayData>>> {
        self.read_selection(mask)
    }
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nvm


/// Lifecycle of a queued row mask inside [`BufferedLayoutReader`].
enum RowMaskState<V> {
// Mask still needs more data before it can produce a value.
Pending(RowMask),
// Mask has been resolved to a value, ready to be yielded.
Ready(V),
// `read_mask` returned `None` for this mask; it is skipped when draining the queue.
Empty,
}

/// Stream adapter that coalesces the reads of multiple row masks into batched
/// I/O dispatches, yielding values in order as soon as they are available
/// without blocking on in-flight I/O.
pub struct BufferedLayoutReader<R, S, V, RM> {
// Upstream stream of row masks to resolve.
values: S,
// Resolves each mask into a value or a request for more byte ranges.
row_mask_reader: RM,
// The currently dispatched coalesced read, if any.
in_flight: Option<BoxFuture<'static, io::Result<Vec<Message>>>>,
// Masks in flight, in arrival order; drained front-to-back to preserve ordering.
queued: VecDeque<RowMaskState<V>>,
// Range reader used to perform the coalesced byte-range reads.
io_read: VortexReadRanges<R>,
dispatcher: Arc<IoDispatcher>,
// Shared cache where fetched messages are stored for the layout readers.
cache: Arc<RwLock<LayoutMessageCache>>,
}

impl<R, S, V, RM> BufferedLayoutReader<R, S, V, RM>
where
R: VortexReadAt,
S: Stream<Item = VortexResult<RowMask>> + Unpin,
RM: RowMaskReader<V>,
{
/// Create a new buffered reader over `read`.
///
/// `values` supplies the row masks to resolve, and `row_mask_reader` turns
/// each mask into either a value or a set of byte ranges to fetch; fetched
/// messages are written into `cache`.
pub fn new(
    read: R,
    dispatcher: Arc<IoDispatcher>,
    values: S,
    row_mask_reader: RM,
    cache: Arc<RwLock<LayoutMessageCache>>,
) -> Self {
    // 1 << 20 — presumably the coalescing window for nearby ranges;
    // confirm against VortexReadRanges.
    let io_read = VortexReadRanges::new(read, dispatcher.clone(), 1 << 20);
    Self {
        values,
        row_mask_reader,
        in_flight: None,
        queued: VecDeque::new(),
        io_read,
        dispatcher,
        cache,
    }
}

/// Insert freshly fetched messages into the shared layout message cache.
///
/// Panics if the cache lock has been poisoned.
fn store_messages(&self, messages: Vec<Message>) {
    let mut cache = self
        .cache
        .write()
        .unwrap_or_else(|poison| vortex_panic!("Failed to write to message cache: {poison}"));
    for Message(id, buf) in messages {
        cache.set(id, buf);
    }
}

/// Make progress on every queued mask, then pull new masks from the upstream
/// stream, collecting the byte ranges that still need fetching.
///
/// Returns the coalesced list of message locators to read, plus a flag
/// indicating whether the upstream mask stream is exhausted.
fn gather_read_messages(
&mut self,
cx: &mut Context<'_>,
) -> VortexResult<(Vec<MessageLocator>, bool)> {
let mut to_read = Vec::with_capacity(NUM_TO_COALESCE);
let mut read_more_count = 0;

// Poll all queued pending masks to see if we can make progress
for queued_res in self.queued.iter_mut() {
match queued_res {
RowMaskState::Pending(pending_mask) => {
if let Some(pending_read) = self.row_mask_reader.read_mask(pending_mask)? {
match pending_read {
MessageRead::ReadMore(m) => {
// Still blocked on I/O: collect the ranges for the next dispatch.
to_read.extend(m);
read_more_count += 1;
}
MessageRead::Value(v) => {
*queued_res = RowMaskState::Ready(v);
}
}
} else {
// Mask produced no value; mark it so poll_next skips it when draining.
*queued_res = RowMaskState::Empty;
}
}
RowMaskState::Ready(_) => {}
RowMaskState::Empty => {}
}
}

// Pull new masks until NUM_TO_COALESCE reads are gathered, the stream ends,
// or no more masks are immediately available.
let mut exhausted = false;
while read_more_count < NUM_TO_COALESCE {
match self.values.poll_next_unpin(cx) {
Poll::Ready(Some(Ok(next_mask))) => {
if let Some(read_result) = self.row_mask_reader.read_mask(&next_mask)? {
match read_result {
MessageRead::ReadMore(m) => {
self.queued.push_back(RowMaskState::Pending(next_mask));
to_read.extend(m);
read_more_count += 1;
}
MessageRead::Value(v) => {
self.queued.push_back(RowMaskState::Ready(v));
}
}
}
// NOTE(review): a fresh mask whose read_mask returns None is dropped
// here rather than queued as Empty — looks intentional (it yields no
// value), but worth confirming it has no ordering implications.
}
Poll::Ready(Some(Err(e))) => {
return Err(e);
}
Poll::Ready(None) => {
exhausted = true;
break;
}
Poll::Pending => {
// Upstream not ready; the waker is registered via cx, so stop here.
break;
}
}
}
Ok((to_read, exhausted))
}

fn dispatch_messages(
&self,
messages: Vec<MessageLocator>,
waker: Waker,
) -> BoxFuture<'static, io::Result<Vec<Message>>> {
let reader = self.io_read.clone();
self.dispatcher
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need this outer dispatcher.dispatch, since read_byte_ranges is also going through the Dispatcher.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a deadlock without it. The await is never resumed as we propagate the pending to the stream but if stream returns pending it needs to wake itself up so I need something to poll the read and then call wake

.dispatch(move || async move {
let read_messages = reader
.read_byte_ranges(messages.iter().map(|msg| msg.1.to_range()).collect())
.map(move |read_res| {
Ok(messages
.into_iter()
.map(|loc| loc.0)
.zip(read_res?)
.map(|(loc, bytes)| Message(loc, bytes))
.collect())
})
.await;
waker.wake();
read_messages
})
.vortex_expect("Async task dispatch")
.map(|res| res.unwrap_or_else(|e| Err(io::Error::new(ErrorKind::Other, e))))
.boxed()
}
}

impl<R, S, V, RM> Stream for BufferedLayoutReader<R, S, V, RM>
where
R: VortexReadAt + Unpin,
S: Stream<Item = VortexResult<RowMask>> + Unpin,
RM: RowMaskReader<V> + Unpin,
V: Unpin,
{
type Item = VortexResult<V>;

// Drive in-flight I/O, gather/dispatch the next coalesced batch, then yield
// any values that are already available without blocking on the pending read.
fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
let exhausted = if let Some(in_flight) = &mut self.in_flight {
match in_flight.poll_unpin(cx) {
Poll::Ready(msgs) => {
// Read finished: cache the messages so readers can resolve masks,
// then immediately gather and dispatch the next batch (if any).
self.store_messages(
msgs.map_err(|e| vortex_err!("Cancelled in flight read {e}"))?,
);
let (messages, exhausted) = self.gather_read_messages(cx)?;
if !messages.is_empty() {
self.in_flight = Some(self.dispatch_messages(messages, cx.waker().clone()));
} else {
self.in_flight = None;
}
exhausted
}
// If read is pending see if we have any available results
Poll::Pending => false,
}
} else {
// No read in flight: gather work and kick off a dispatch if needed.
let (messages, exhausted) = self.gather_read_messages(cx)?;
if !messages.is_empty() {
self.in_flight = Some(self.dispatch_messages(messages, cx.waker().clone()));
}
exhausted
};

// Drain the queue front-to-back to preserve output order: skip empty masks,
// yield the first ready value, and stop at the first still-pending mask.
while let Some(next_mask) = self.queued.pop_front() {
match next_mask {
RowMaskState::Pending(m) => {
// Front of the queue still waits on I/O; put it back and report Pending
// (the waker was registered by the poll above).
self.queued.push_front(RowMaskState::Pending(m));
return Poll::Pending;
}
RowMaskState::Ready(next_ready) => return Poll::Ready(Some(Ok(next_ready))),
RowMaskState::Empty => continue,
}
}

// Queue fully drained: end the stream only if the upstream masks are exhausted.
if exhausted {
Poll::Ready(None)
} else {
Poll::Pending
}
}
}
6 changes: 3 additions & 3 deletions vortex-file/src/read/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ pub struct VortexReadBuilder<R> {
io_dispatcher: Option<Arc<IoDispatcher>>,
}

impl<R: VortexReadAt> VortexReadBuilder<R> {
impl<R: VortexReadAt + Unpin> VortexReadBuilder<R> {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We shouldn't be so Unpin and instead use pin_project more

pub fn new(read_at: R, layout_serde: LayoutDeserializer) -> Self {
Self {
read_at,
Expand Down Expand Up @@ -167,7 +167,7 @@ impl<R: VortexReadAt> VortexReadBuilder<R> {
// Default: fallback to single-threaded tokio dispatcher.
let io_dispatcher = self.io_dispatcher.unwrap_or_default();

Ok(VortexFileArrayStream::new(
VortexFileArrayStream::try_new(
self.read_at,
layout_reader,
filter_reader,
Expand All @@ -176,7 +176,7 @@ impl<R: VortexReadAt> VortexReadBuilder<R> {
row_count,
row_mask,
io_dispatcher,
))
)
}

async fn size(&self) -> VortexResult<u64> {
Expand Down
77 changes: 41 additions & 36 deletions vortex-file/src/read/layouts/chunked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ impl ChunkedLayoutReader {
BatchRead::ReadMore(m) => {
messages_to_fetch.extend(m);
}
BatchRead::Batch(a) => {
BatchRead::Value(a) => {
*array_slot = ChildRead::Finished(Some(a));
}
}
Expand Down Expand Up @@ -312,10 +312,10 @@ impl LayoutReader for ChunkedLayoutReader {
.filter_map(ChildRead::into_value)
.collect::<Vec<_>>();
match child_arrays.len() {
0 | 1 => Ok(child_arrays.pop().map(BatchRead::Batch)),
0 | 1 => Ok(child_arrays.pop().map(BatchRead::Value)),
_ => {
let dtype = child_arrays[0].dtype().clone();
Ok(Some(BatchRead::Batch(
Ok(Some(BatchRead::Value(
ChunkedArray::try_new(child_arrays, dtype)?.into_array(),
)))
}
Expand All @@ -325,68 +325,73 @@ impl LayoutReader for ChunkedLayoutReader {
}
}

fn read_metadata(&self) -> VortexResult<MetadataRead> {
fn read_metadata(&self) -> VortexResult<Option<MetadataRead>> {
match self.metadata_layout() {
None => Ok(MetadataRead::None),
None => Ok(None),
Some(metadata_layout) => {
if let Some(md) = self.cached_metadata.get() {
return Ok(MetadataRead::Batches(vec![Some(md.clone())]));
return Ok(Some(MetadataRead::Value(vec![Some(md.clone())])));
}

match metadata_layout
.read_selection(&RowMask::new_valid_between(0, self.n_chunks()))?
{
Some(BatchRead::Batch(array)) => {
Some(BatchRead::Value(array)) => {
// We don't care if the write failed
_ = self.cached_metadata.set(array.clone());
Ok(MetadataRead::Batches(vec![Some(array)]))
Ok(Some(MetadataRead::Value(vec![Some(array)])))
}
Some(BatchRead::ReadMore(messages)) => Ok(MetadataRead::ReadMore(messages)),
None => Ok(MetadataRead::None),
Some(BatchRead::ReadMore(messages)) => {
Ok(Some(MetadataRead::ReadMore(messages)))
}
None => Ok(None),
}
}
}
}

fn can_prune(&self, begin: usize, end: usize) -> VortexResult<PruningRead> {
if let Some(chunk_prunability) = self.cached_prunability.get() {
return Ok(PruningRead::CanPrune(self.can_prune_overlapping_chunks(
return Ok(PruningRead::Value(self.can_prune_overlapping_chunks(
chunk_prunability,
begin,
end,
)?));
}

let Some(predicate_expression) = self.scan.expr.as_ref() else {
return Ok(PruningRead::CanPrune(false));
return Ok(PruningRead::Value(false));
};

Ok(match self.read_metadata()? {
MetadataRead::None => PruningRead::CanPrune(false),
MetadataRead::ReadMore(messages) => PruningRead::ReadMore(messages),
MetadataRead::Batches(mut batches) => {
if batches.len() != 1 {
vortex_bail!("chunked layout should have exactly one metadata array");
}
let Some(metadata) = batches.swap_remove(0) else {
vortex_bail!("chunked layout should have exactly one metadata array")
};
let prunability = PruningPredicate::try_new(predicate_expression)
.map(|p| p.evaluate(&metadata))
.transpose()?
.flatten();

match prunability {
Some(chunk_prunability) => {
let is_selection_pruned =
self.can_prune_overlapping_chunks(&chunk_prunability, begin, end)?;
let _ = self.cached_prunability.set(chunk_prunability); // Losing the race is fine
PruningRead::CanPrune(is_selection_pruned)
if let Some(mr) = self.read_metadata()? {
Ok(match mr {
MetadataRead::ReadMore(messages) => PruningRead::ReadMore(messages),
MetadataRead::Value(mut batches) => {
if batches.len() != 1 {
vortex_bail!("chunked layout should have exactly one metadata array");
}
let Some(metadata) = batches.swap_remove(0) else {
vortex_bail!("chunked layout should have exactly one metadata array")
};
let prunability = PruningPredicate::try_new(predicate_expression)
.map(|p| p.evaluate(&metadata))
.transpose()?
.flatten();

match prunability {
Some(chunk_prunability) => {
let is_selection_pruned =
self.can_prune_overlapping_chunks(&chunk_prunability, begin, end)?;
let _ = self.cached_prunability.set(chunk_prunability); // Losing the race is fine
PruningRead::Value(is_selection_pruned)
}
None => PruningRead::Value(false),
}
None => PruningRead::CanPrune(false),
}
}
})
})
} else {
Ok(PruningRead::Value(false))
}
}
}

Expand Down
Loading
Loading