From 537d4e1fa83a899e0a1d5601cd1db6c508f5a046 Mon Sep 17 00:00:00 2001
From: Takahiro Ebato <takahiro.ebato@gmail.com>
Date: Sun, 15 Dec 2024 16:20:10 +0900
Subject: [PATCH] Adjust the scan range considering the number of rows

---
 python/python/tests/test_dataset.py | 17 +++++++++++++++--
 rust/lance/src/dataset/scanner.rs   | 14 ++++++++++----
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py
index 8cb9b57c62..edb686c8d3 100644
--- a/python/python/tests/test_dataset.py
+++ b/python/python/tests/test_dataset.py
@@ -486,9 +486,11 @@ def test_limit_offset(tmp_path: Path, data_storage_version: str):
 
     # test just limit
     assert dataset.to_table(limit=10) == table.slice(0, 10)
+    assert dataset.to_table(limit=100) == table.slice(0, 100)
 
     # test just offset
-    assert dataset.to_table(offset=10) == table.slice(10, 100)
+    assert dataset.to_table(offset=0) == table.slice(0, 100)
+    assert dataset.to_table(offset=10) == table.slice(10, 90)
 
     # test both
     assert dataset.to_table(offset=10, limit=10) == table.slice(10, 10)
@@ -503,7 +505,18 @@ def test_limit_offset(tmp_path: Path, data_storage_version: str):
     assert dataset.to_table(offset=50, limit=25) == table.slice(50, 25)
 
     # Limit past the end
-    assert dataset.to_table(offset=50, limit=100) == table.slice(50, 50)
+    assert dataset.to_table(limit=101) == table.slice(0, 100)
+
+    # Limit with offset past the end
+    assert dataset.to_table(offset=50, limit=51) == table.slice(50, 50)
+
+    # Offset past the end
+    assert dataset.to_table(offset=100) == table.slice(100, 0)  # Empty table
+    assert dataset.to_table(offset=101) == table.slice(100, 0)  # Empty table
+
+    # Offset with limit past the end
+    assert dataset.to_table(offset=100, limit=1) == table.slice(100, 0)  # Empty table
+    assert dataset.to_table(offset=101, limit=1) == table.slice(100, 0)  # Empty table
 
     # Invalid limit / offset
     with pytest.raises(ValueError, match="Offset must be non-negative"):
diff --git a/rust/lance/src/dataset/scanner.rs b/rust/lance/src/dataset/scanner.rs
index b813c633f0..2203545967 100644
--- a/rust/lance/src/dataset/scanner.rs
+++ b/rust/lance/src/dataset/scanner.rs
@@ -1219,12 +1219,18 @@ impl Scanner {
         } else {
             match (self.limit, self.offset) {
                 (None, None) => None,
-                (Some(limit), None) => Some(0..limit as u64),
+                (Some(limit), None) => {
+                    let num_rows = self.dataset.count_all_rows().await? as i64;
+                    Some(0..limit.min(num_rows) as u64)
+                }
                 (None, Some(offset)) => {
-                    let num_rows = self.dataset.count_all_rows().await?;
-                    Some(offset as u64..num_rows as u64)
+                    let num_rows = self.dataset.count_all_rows().await? as i64;
+                    Some(offset.min(num_rows) as u64..num_rows as u64)
+                }
+                (Some(limit), Some(offset)) => {
+                    let num_rows = self.dataset.count_all_rows().await? as i64;
+                    Some(offset.min(num_rows) as u64..(offset + limit).min(num_rows) as u64)
                 }
-                (Some(limit), Some(offset)) => Some(offset as u64..(offset + limit) as u64),
             }
         };
         let mut use_limit_node = true;