feat(python): add support for HuggingFace IterableDataset #2599

Open · wants to merge 2 commits into base: main

27 changes: 26 additions & 1 deletion python/python/lance/dataset.py
@@ -2546,6 +2546,25 @@ def write_dataset(
            if schema is None:
                schema = data_obj.features.arrow_schema
            data_obj = data_obj.data.to_batches()
        elif isinstance(data_obj, datasets.DatasetDict):
            raise ValueError(
                "DatasetDict is not yet supported. For now, please "
                "iterate through the DatasetDict and pass in single "
                "Dataset instances (e.g., from dataset_dict.data) to "
                "`write_dataset`."
            )
        elif isinstance(data_obj, datasets.IterableDataset):
            if schema is None:
                schema = data_obj.features.arrow_schema
            # This is hacky because 1) `max_rows_per_group` is a write
            # parameter, not a read parameter, and 2) we are getting rid
            # of row groups anyway. However, it is a required parameter
            # and a reasonable default. If a specific batch size is
            # needed, the user can iterate through the IterableDataset
            # manually and pass in the batches (see test_huggingface.py
            # for a unit test example).
            data_obj = data_obj.iter(batch_size=max_rows_per_group)

    reader = _coerce_reader(data_obj, schema)
    _validate_schema(reader.schema)
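For reference, the workaround the DatasetDict error message points to looks roughly like this; the repository id and output paths are illustrative only, not part of the change:

import lance
import datasets

# Hypothetical dataset id; load_dataset without a split returns a DatasetDict.
dataset_dict = datasets.load_dataset("user/some-dataset")

# DatasetDict itself is rejected by write_dataset, so write each split's
# Dataset separately.
for split_name, split_ds in dataset_dict.items():
    lance.write_dataset(split_ds, f"/tmp/{split_name}.lance")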
@@ -2592,6 +2611,9 @@ def _coerce_reader(
        return data_obj.to_reader()
    elif isinstance(data_obj, pa.RecordBatchReader):
        return data_obj
    elif isinstance(data_obj, dict):
        # dict of columns
        return pa.Table.from_pydict(data_obj, schema=schema)
    elif (
        type(data_obj).__module__.startswith("polars")
        and data_obj.__class__.__name__ == "DataFrame"
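The new dict branch covers the plain dict-of-columns batches that `IterableDataset.iter()` yields. A small sketch of the shape it handles, with values borrowed from the unit test below:

import pyarrow as pa

# A batch as yielded by IterableDataset.iter(batch_size=2):
# column name -> list of values.
batch = {"text": ["Good", "Bad"], "label": [0, 1]}
schema = pa.schema([("text", pa.string()), ("label", pa.int64())])

# Mirrors the dict branch above.
table = pa.Table.from_pydict(batch, schema=schema)
assert table.schema == schema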
@@ -2691,7 +2713,10 @@ def _casting_recordbatch_iter(
    uses float32 for vectors.
    """
    for batch in input_iter:
        if not isinstance(batch, pa.RecordBatch):
        if isinstance(batch, dict):
            # if it's a dict, it's a dict of columns
            batch = pa.RecordBatch.from_pydict(batch, schema)
        elif not isinstance(batch, pa.RecordBatch):
            raise TypeError(f"Expected RecordBatch, got {type(batch)}")
        if batch.schema != schema:
            try:
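Taken together, the changes above let a streaming HuggingFace dataset be written straight to Lance. A minimal usage sketch, assuming the dataset's features resolve to an Arrow schema; the repository id and output paths are hypothetical:

import lance
import pyarrow as pa
import datasets

# Hypothetical streaming load; any IterableDataset whose `features`
# resolve to an Arrow schema should behave the same way.
stream = datasets.load_dataset("user/some-dataset", split="train", streaming=True)

# write_dataset pulls batches via stream.iter(batch_size=max_rows_per_group).
ds = lance.write_dataset(stream, "/tmp/stream.lance", max_rows_per_group=1024)

# For explicit control over the read batch size, seed an empty dataset and
# append each dict batch manually (mirroring test_iterable_dataset below).
schema = stream.features.arrow_schema
seed = pa.Table.from_pydict({name: [] for name in schema.names}, schema=schema)
manual = lance.write_dataset(seed, "/tmp/stream_manual.lance")
for batch in stream.iter(batch_size=256):
    manual = lance.write_dataset(batch, "/tmp/stream_manual.lance", mode="append")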
29 changes: 29 additions & 0 deletions python/python/tests/test_huggingface.py
@@ -4,6 +4,7 @@
from pathlib import Path

import lance
import pyarrow as pa
import pytest

datasets = pytest.importorskip("datasets")
@@ -21,3 +22,31 @@ def test_write_hf_dataset(tmp_path: Path):
    assert ds.count_rows() == 50

    assert ds.schema == hf_ds.features.arrow_schema


def test_iterable_dataset(tmp_path: Path):
    # IterableDataset yields dicts of columns

    def gen():
        yield {"text": "Good", "label": 0}
        yield {"text": "Bad", "label": 1}

    arrow_schema = pa.schema([("text", pa.string()), ("label", pa.int64())])
    features = datasets.Features.from_arrow_schema(arrow_schema)

    iter_ds = datasets.IterableDataset.from_generator(gen, features=features)
    # streaming batch size is controlled by max_rows_per_group
    ds1 = lance.write_dataset(iter_ds, tmp_path / "ds1.lance")
    assert ds1.count_rows() == 2
    assert ds1.schema == iter_ds.features.arrow_schema

    # to manually control the streaming batch size, start from an empty
    # dataset and append each batch yielded by IterableDataset.iter()
    ds2 = lance.write_dataset(
        pa.Table.from_arrays([[], []], schema=arrow_schema), tmp_path / "ds2.lance"
    )
    for batch in iter_ds.iter(batch_size=1):
        # shouldn't fail
        ds2 = lance.write_dataset(batch, tmp_path / "ds2.lance", mode="append")

    assert len(ds1) == len(ds2)
    assert ds1.schema == ds2.schema