From 537d4e1fa83a899e0a1d5601cd1db6c508f5a046 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato <takahiro.ebato@gmail.com> Date: Sun, 15 Dec 2024 16:20:10 +0900 Subject: [PATCH] Adjust the scan range considering the number of rows --- python/python/tests/test_dataset.py | 17 +++++++++++++++-- rust/lance/src/dataset/scanner.rs | 14 ++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index 8cb9b57c62..edb686c8d3 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -486,9 +486,11 @@ def test_limit_offset(tmp_path: Path, data_storage_version: str): # test just limit assert dataset.to_table(limit=10) == table.slice(0, 10) + assert dataset.to_table(limit=100) == table.slice(0, 100) # test just offset - assert dataset.to_table(offset=10) == table.slice(10, 100) + assert dataset.to_table(offset=0) == table.slice(0, 100) + assert dataset.to_table(offset=10) == table.slice(10, 90) # test both assert dataset.to_table(offset=10, limit=10) == table.slice(10, 10) @@ -503,7 +505,18 @@ def test_limit_offset(tmp_path: Path, data_storage_version: str): assert dataset.to_table(offset=50, limit=25) == table.slice(50, 25) # Limit past the end - assert dataset.to_table(offset=50, limit=100) == table.slice(50, 50) + assert dataset.to_table(limit=101) == table.slice(0, 100) + + # Limit with offset past the end + assert dataset.to_table(offset=50, limit=51) == table.slice(50, 50) + + # Offset past the end + assert dataset.to_table(offset=100) == table.slice(100, 0) # Empty table + assert dataset.to_table(offset=101) == table.slice(100, 0) # Empty table + + # Offset with limit past the end + assert dataset.to_table(offset=100, limit=1) == table.slice(100, 0) # Empty table + assert dataset.to_table(offset=101, limit=1) == table.slice(100, 0) # Empty table # Invalid limit / offset with pytest.raises(ValueError, match="Offset must be non-negative"): diff 
--git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs index b813c633f0..2203545967 100644 --- a/rust/lance/src/dataset/scanner.rs +++ b/rust/lance/src/dataset/scanner.rs @@ -1219,12 +1219,18 @@ impl Scanner { } else { match (self.limit, self.offset) { (None, None) => None, - (Some(limit), None) => Some(0..limit as u64), + (Some(limit), None) => { + let num_rows = self.dataset.count_all_rows().await? as i64; + Some(0..limit.min(num_rows) as u64) + } (None, Some(offset)) => { - let num_rows = self.dataset.count_all_rows().await?; - Some(offset as u64..num_rows as u64) + let num_rows = self.dataset.count_all_rows().await? as i64; + Some(offset.min(num_rows) as u64..num_rows as u64) + } + (Some(limit), Some(offset)) => { + let num_rows = self.dataset.count_all_rows().await? as i64; + Some(offset.min(num_rows) as u64..offset.saturating_add(limit).min(num_rows) as u64) } - (Some(limit), Some(offset)) => Some(offset as u64..(offset + limit) as u64), } }; let mut use_limit_node = true;