feat(python): add support for HuggingFace IterableDataset #2599

Open · wants to merge 2 commits into base: main

27 changes: 26 additions & 1 deletion python/python/lance/dataset.py
@@ -2546,6 +2546,25 @@ def write_dataset(
            if schema is None:
                schema = data_obj.features.arrow_schema
            data_obj = data_obj.data.to_batches()
        elif isinstance(data_obj, datasets.DatasetDict):
            raise ValueError(
                "DatasetDict is not yet supported. For now, please "
                "iterate through the DatasetDict and pass in single "
                "Dataset instances (e.g., from dataset_dict.data) to "
                "`write_dataset`."
            )
        elif isinstance(data_obj, datasets.IterableDataset):
            if schema is None:
                schema = data_obj.features.arrow_schema
            # This is hacky because 1) `max_rows_per_group` is a write
            # parameter, not a read parameter, and 2) we are getting rid
            # of row groups anyway. However, it is a required parameter
            # and a reasonable default. If a specific batch size is
            # needed, the user can iterate through the IterableDataset
            # manually and pass in the batches (see test_huggingface.py
            # for a unit test example).
            data_obj = data_obj.iter(batch_size=max_rows_per_group)

    reader = _coerce_reader(data_obj, schema)
    _validate_schema(reader.schema)
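For reference, the workaround the DatasetDict error message points to looks roughly like this; the repository id and output paths are illustrative only, not part of the change:

import lance
import datasets

# Hypothetical dataset id; load_dataset without a split returns a DatasetDict.
dataset_dict = datasets.load_dataset("user/some-dataset")

# DatasetDict itself is rejected by write_dataset, so write each split's
# Dataset separately.
for split_name, split_ds in dataset_dict.items():
    lance.write_dataset(split_ds, f"/tmp/{split_name}.lance")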
@@ -2592,6 +2611,9 @@ def _coerce_reader(
        return data_obj.to_reader()
    elif isinstance(data_obj, pa.RecordBatchReader):
        return data_obj
    elif isinstance(data_obj, dict):
        # dict of columns
        return pa.Table.from_pydict(data_obj, schema=schema)
    elif (
        type(data_obj).__module__.startswith("polars")
        and data_obj.__class__.__name__ == "DataFrame"
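The new dict branch covers the plain dict-of-columns batches that `IterableDataset.iter()` yields. A small sketch of the shape it handles, with values borrowed from the unit test below:

import pyarrow as pa

# A batch as yielded by IterableDataset.iter(batch_size=2):
# column name -> list of values.
batch = {"text": ["Good", "Bad"], "label": [0, 1]}
schema = pa.schema([("text", pa.string()), ("label", pa.int64())])

# Mirrors the dict branch above.
table = pa.Table.from_pydict(batch, schema=schema)
assert table.schema == schema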
@@ -2691,7 +2713,10 @@ def _casting_recordbatch_iter(
    uses float32 for vectors.
    """
    for batch in input_iter:
        if not isinstance(batch, pa.RecordBatch):
        if isinstance(batch, dict):
            # if it's a dict, it's a dict of columns
            batch = pa.RecordBatch.from_pydict(batch, schema)
        elif not isinstance(batch, pa.RecordBatch):
            raise TypeError(f"Expected RecordBatch, got {type(batch)}")
        if batch.schema != schema:
            try:
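Taken together, the changes above let a streaming HuggingFace dataset be written straight to Lance. A minimal usage sketch, assuming the dataset's features resolve to an Arrow schema; the repository id and output paths are hypothetical:

import lance
import pyarrow as pa
import datasets

# Hypothetical streaming load; any IterableDataset whose `features`
# resolve to an Arrow schema should behave the same way.
stream = datasets.load_dataset("user/some-dataset", split="train", streaming=True)

# write_dataset pulls batches via stream.iter(batch_size=max_rows_per_group).
ds = lance.write_dataset(stream, "/tmp/stream.lance", max_rows_per_group=1024)

# For explicit control over the read batch size, seed an empty dataset and
# append each dict batch manually (mirroring test_iterable_dataset below).
schema = stream.features.arrow_schema
seed = pa.Table.from_pydict({name: [] for name in schema.names}, schema=schema)
manual = lance.write_dataset(seed, "/tmp/stream_manual.lance")
for batch in stream.iter(batch_size=256):
    manual = lance.write_dataset(batch, "/tmp/stream_manual.lance", mode="append")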
29 changes: 29 additions & 0 deletions python/python/tests/test_huggingface.py
@@ -4,6 +4,7 @@
from pathlib import Path

import lance
import pyarrow as pa
import pytest

datasets = pytest.importorskip("datasets")
@@ -21,3 +22,31 @@ def test_write_hf_dataset(tmp_path: Path):
    assert ds.count_rows() == 50

    assert ds.schema == hf_ds.features.arrow_schema


def test_iterable_dataset(tmp_path: Path):
    # IterableDataset yields dicts of columns

    def gen():
        yield {"text": "Good", "label": 0}
        yield {"text": "Bad", "label": 1}

    arrow_schema = pa.schema([("text", pa.string()), ("label", pa.int64())])
    features = datasets.Features.from_arrow_schema(arrow_schema)

    iter_ds = datasets.IterableDataset.from_generator(gen, features=features)
    # streaming batch size is controlled by max_rows_per_group
    ds1 = lance.write_dataset(iter_ds, tmp_path / "ds1.lance")
    assert ds1.count_rows() == 2
    assert ds1.schema == iter_ds.features.arrow_schema

    # to manually control the streaming batch size, start from an empty
    # dataset and append each batch yielded by IterableDataset.iter()
    ds2 = lance.write_dataset(
        pa.Table.from_arrays([[], []], schema=arrow_schema), tmp_path / "ds2.lance"
    )
    for batch in iter_ds.iter(batch_size=1):
        # shouldn't fail
        ds2 = lance.write_dataset(batch, tmp_path / "ds2.lance", mode="append")

    assert len(ds1) == len(ds2)
    assert ds1.schema == ds2.schema