diff --git a/python/deeplake/__init__.py b/python/deeplake/__init__.py index 19173be1d5..40a82c9082 100644 --- a/python/deeplake/__init__.py +++ b/python/deeplake/__init__.py @@ -1,12 +1,20 @@ import os from typing import Callable, Any, Dict +try: + from tqdm import tqdm as progress_bar +except ImportError: + + def progress_bar(iterable, *args, **kwargs): + return iterable + + import numpy import deeplake from ._deeplake import * -__version__ = "4.0.3" +__version__ = "4.1.0" __all__ = [ "__version__", @@ -22,7 +30,6 @@ "ColumnView", "Column", "Version", - "Prefetcher", "DatasetView", "Dataset", "ReadOnlyDataset", @@ -34,6 +41,8 @@ "ColumnAlreadyExistsError", "ColumnDoesNotExistError", "InvalidColumnValueError", + "InvalidPolygonShapeError", + "InvalidLinkDataError", "PushError", "GcsStorageProviderFailed", "History", @@ -42,6 +51,7 @@ "LogNotexistsError", "IncorrectDeeplakePathError", "AuthenticationError", + "BadRequestError", "AuthorizationError", "NotFoundError", "AgreementError", @@ -56,13 +66,15 @@ "InvalidChunkStrategyType", "InvalidSequenceOfSequence", "InvalidTypeAndFormatPair", + "InvalidLinkType", "UnknownType", "InvalidTextType", "UnsupportedPythonType", "UnsupportedSampleCompression", "UnsupportedChunkCompression", "InvalidImageCompression", - "InvalidMaskCompression", + "InvalidSegmentMaskCompression", + "InvalidBinaryMaskCompression", "DtypeMismatch", "UnspecifiedDtype", "DimensionsMismatch", @@ -90,6 +102,8 @@ "StorageInternalError", "WriteFailedError", "QuantizationType", + "InvalidCredsKeyAssignmentError", + "CredsKeyAlreadyAssignedError", "core", "create", "create_async", @@ -122,65 +136,158 @@ def _tensorflow(self) -> Any: from deeplake._tensorflow import _from_dataset + return _from_dataset(self) def _pytorch(self, transform: Callable[[Any], Any] = None): from deeplake._torch import TorchDataset + return TorchDataset(self, transform=transform) DatasetView.pytorch = _pytorch DatasetView.tensorflow = _tensorflow + def load(*args, **kwargs): """ .. deprecated:: 4.0.0 """ - raise Exception(""" + raise Exception( + """ The API for Deep Lake 4.0 has changed significantly, including the `load` method being replaced by `open`. To continue using Deep Lake 3.x, use `pip install "deeplake<4"`. For information on migrating your code, see https://docs.deeplake.ai/latest/details/v3_conversion/ - """.replace("\n", " ").strip()) + """.replace( + "\n", " " + ).strip() + ) + def empty(*args, **kwargs): """ .. deprecated:: 4.0.0 """ - raise Exception(""" + raise Exception( + """ The API for Deep Lake 4.0 has changed significantly, including the `empty` method being replaced by `create`. To continue using Deep Lake 3.x, use `pip install "deeplake<4"`. For information on migrating your code, see https://docs.deeplake.ai/latest/details/v3_conversion/ - """.replace("\n", " ").strip()) + """.replace( + "\n", " " + ).strip() + ) + def convert(src: str, dst: str, dst_creds: Dict[str, str] = None): """ Copies the v3 dataset at src into a new dataset in the new v4 format. 
""" + def commit_data(dataset, message="Committing data"): + dataset.commit() + + def get_raw_columns(source): + return [ + col.name + for col in source.schema.columns + if not col.dtype.is_link + and col.dtype.kind + in { + deeplake.types.TypeKind.Image, + deeplake.types.TypeKind.SegmentMask, + deeplake.types.TypeKind.BinaryMask, + } + ] + + def transfer_non_link_data(source, dest, batch_size): + dl = deeplake._deeplake._Prefetcher( + source, + batch_size=batch_size, + adaptive=True, + raw_columns=set(get_raw_columns(source)), + ) + for counter, batch in enumerate(progress_bar(dl), start=1): + dest.append(batch) + if counter % 100 == 0: + commit_data(dest) + commit_data(dest, "Final commit of non-link data") + + def transfer_with_links(source, dest, links, column_names, batch_size): + iterable_cols = [col for col in column_names if col not in links] + link_sample_info = {link: source[link]._links_info() for link in links} + dest.set_creds_key(link_sample_info[links[0]]["key"]) + pref_ds = source.query(f"SELECT {','.join(iterable_cols)}") + dl = deeplake._deeplake._Prefetcher( + pref_ds, + batch_size=batch_size, + adaptive=True, + raw_columns=set(get_raw_columns(source)), + ) + + for counter, batch in enumerate(progress_bar(dl), start=1): + for link in links: + link_data = link_sample_info[link]["data"] + start_index = (counter - 1) * batch_size + end_index = min((counter) * batch_size, len(link_data)) + batch[link] = link_data[start_index:end_index] + + dest.append(batch) + if counter % 100 == 0: + commit_data(dest) + commit_data(dest, "Final commit of linked data") + source_ds = deeplake.query(f'select * from "{src}"') dest_ds = deeplake.like(source_ds, dst, dst_creds) - dest_ds.commit("Created dataset") + commit_data(dest_ds, "Created dataset") + + column_names = [col.name for col in source_ds.schema.columns] + links = [ + col.name + for col in source_ds.schema.columns + if source_ds.schema[col.name].dtype.is_link + ] + batch_size = 10000 - dl = deeplake.Prefetcher(source_ds, batch_size=10000) - counter = 0 print(f"Transferring {len(source_ds)} rows to {dst}...") - for b in dl: - dest_ds.append(b) - counter += 1 - if counter > 0 and counter % 100 == 0: - dest_ds.commit() - dest_ds.commit() - print(f"Transferring data.... to {dst}... 
DONE") + if not links: + transfer_non_link_data(source_ds, dest_ds, batch_size) + else: + transfer_with_links(source_ds, dest_ds, links, column_names, batch_size) + for column in column_names: + meta = dict(source_ds[column].metadata) + if meta: + for key, value in meta.items(): + dest_ds[column].metadata[key] = value + + commit_data(dest_ds, "Final commit of metadata") + print(f"Data transfer to {dst} complete.") def __register_at_fork(): from ._deeplake import __prepare_atfork, __parent_atfork, __child_atfork UNSAFE_TYPES = ( - Dataset, DatasetView, ReadOnlyDataset, Column, ColumnView, ColumnDefinition, ColumnDefinitionView, Row, RowView, - RowRange, RowRangeView, Schema, SchemaView, Version, History, Prefetcher,Tag,Tags) + Dataset, + DatasetView, + ReadOnlyDataset, + Column, + ColumnView, + ColumnDefinition, + ColumnDefinitionView, + Row, + RowView, + RowRange, + RowRangeView, + Schema, + SchemaView, + Version, + History, + Tag, + Tags, + ) def check_main_globals_for_unsafe_types(): import inspect diff --git a/python/deeplake/__init__.pyi b/python/deeplake/__init__.pyi index 2275e15856..541eef2078 100644 --- a/python/deeplake/__init__.pyi +++ b/python/deeplake/__init__.pyi @@ -20,7 +20,6 @@ __all__ = [ "ColumnView", "Column", "Version", - "Prefetcher", "DatasetView", "Dataset", "ReadOnlyDataset", @@ -32,6 +31,8 @@ __all__ = [ "ColumnAlreadyExistsError", "ColumnDoesNotExistError", "InvalidColumnValueError", + "InvalidPolygonShapeError", + "InvalidLinkDataError", "PushError", "GcsStorageProviderFailed", "History", @@ -40,6 +41,7 @@ __all__ = [ "LogNotexistsError", "IncorrectDeeplakePathError", "AuthenticationError", + "BadRequestError", "AuthorizationError", "NotFoundError", "AgreementError", @@ -54,13 +56,15 @@ __all__ = [ "InvalidChunkStrategyType", "InvalidSequenceOfSequence", "InvalidTypeAndFormatPair", + "InvalidLinkType", "UnknownType", "InvalidTextType", "UnsupportedPythonType", "UnsupportedSampleCompression", "UnsupportedChunkCompression", "InvalidImageCompression", - "InvalidMaskCompression", + "InvalidSegmentMaskCompression", + "InvalidBinaryMaskCompression", "DtypeMismatch", "UnspecifiedDtype", "DimensionsMismatch", @@ -88,6 +92,8 @@ __all__ = [ "StorageInternalError", "WriteFailedError", "QuantizationType", + "InvalidCredsKeyAssignmentError", + "CredsKeyAlreadyAssignedError", "core", "create", "create_async", @@ -117,12 +123,11 @@ __all__ = [ "__parent_atfork", ] - class Future: """ A future that represents a value that will be resolved in the future. - Once the Future is resolved, it will hold the result, and you can retrieve it + Once the Future is resolved, it will hold the result, and you can retrieve it using either a blocking call (`result()`) or via asynchronous mechanisms (`await`). The future will resolve automatically even if you do not explicitly wait for it. @@ -130,10 +135,10 @@ class Future: Methods: result() -> typing.Any: Blocks until the Future is resolved and returns the object. - + __await__() -> typing.Any: Awaits the future asynchronously and returns the object once it's ready. - + is_completed() -> bool: Returns True if the Future is already resolved, False otherwise. """ @@ -152,8 +157,10 @@ class Future: Awaits the resolution of the Future asynchronously. Examples: - >>> result = await future - + ```python + result = await future + ``` + Returns: typing.Any: The result when the Future is resolved. """ @@ -177,10 +184,10 @@ class FutureVoid: Methods: wait() -> None: Blocks until the FutureVoid is resolved and then returns `None`. 
- + __await__() -> None: Awaits the FutureVoid asynchronously and returns `None` once the operation is complete. - + is_completed() -> bool: Returns True if the FutureVoid is already resolved, False otherwise. """ @@ -190,7 +197,9 @@ class FutureVoid: Blocks until the FutureVoid is resolved, then returns `None`. Examples: - >>> future_void.wait() # Blocks until the operation completes. + ```python + future_void.wait() # Blocks until the operation completes. + ``` Returns: None: Indicates the operation has completed. @@ -202,8 +211,10 @@ class FutureVoid: Awaits the resolution of the FutureVoid asynchronously. Examples: - >>> await future_void # Waits for the completion of the async operation. - + ```python + await future_void # Waits for the completion of the async operation. + ``` + Returns: None: Indicates the operation has completed. """ @@ -246,7 +257,6 @@ class Metadata(ReadOnlyMetadata): """ ... - def query(query: str, token: str | None = None) -> DatasetView: """ Executes the given TQL query and returns a DatasetView. @@ -255,7 +265,9 @@ def query(query: str, token: str | None = None) -> DatasetView: or query a single dataset without opening it first. Examples: - >>> r = deeplake.query("select * from \\"al://my_org/dataset\\" where id > 30") + ```python + r = deeplake.query("select * from \\"al://my_org/dataset\\" where id > 30") + ``` """ ... @@ -265,23 +277,24 @@ def query_async(query: str, token: str | None = None) -> Future: Asynchronously executes the given TQL query and returns a Future that will resolve into DatasetView. Examples: - >>> future = deeplake.query_async("select * where category == 'active'") - >>> result = future.result() - >>> for row in result: - >>> print("Id is: ", row["id"]) - - >>> # or use the Future in an await expression - >>> future = deeplake.query_async("select * where category == 'active'") - >>> result = await future - >>> for row in result: - >>> print("Id is: ", row["id"]) + ```python + future = deeplake.query_async("select * where category == 'active'") + result = future.result() + for row in result: + print("Id is: ", row["id"]) + + # or use the Future in an await expression + future = deeplake.query_async("select * where category == 'active'") + result = await future + for row in result: + print("Id is: ", row["id"]) + ``` """ ... class Client: endpoint: str - class Tag: """ Describes a tag within the dataset. @@ -319,7 +332,7 @@ class Tag: """ ... - def open(self) -> ReadOnlyDataset: + def open(self) -> DatasetView: """ Fetches the dataset corresponding to the tag """ @@ -331,7 +344,7 @@ class Tag: """ ... - def __repr__(self) -> str: ... + def __str__(self) -> str: ... class TagView: """ @@ -358,7 +371,7 @@ class TagView: The version that has been tagged """ - def open(self) -> ReadOnlyDataset: + def open(self) -> DatasetView: """ Fetches the dataset corresponding to the tag """ @@ -370,7 +383,7 @@ class TagView: """ ... - def __repr__(self) -> str: ... + def __str__(self) -> str: ... class TagNotFoundError(Exception): pass @@ -403,13 +416,11 @@ class Tags: """ ... - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... def names(self) -> list[str]: """ Return a list of tag names """ - ... class TagsView: @@ -431,19 +442,15 @@ class TagsView: """ ... - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... def names(self) -> list[str]: """ Return a list of tag names """ - ... - class ColumnDefinition: - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... 
@property def name(self) -> str: """ @@ -474,14 +481,12 @@ class ColumnDefinition: """ ... - class ColumnDefinitionView: """ A read-only view of a [deeplake.ColumnDefinition][] """ - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... @property def name(self) -> str: """ @@ -496,42 +501,33 @@ class ColumnDefinitionView: """ ... - class ColumnView: """ Provides access to a column in a dataset. """ - def __getitem__(self, index: int | slice) -> typing.Any: ... - - def get_async(self, index: int | slice) -> Future: ... - + def __getitem__(self, index: int | slice | list | tuple) -> typing.Any: ... + def get_async(self, index: int | slice | list | tuple) -> Future: ... def __len__(self) -> int: ... - - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... + def _links_info(self) -> dict: ... @property def metadata(self) -> ReadOnlyMetadata: ... - @property def name(self) -> str: ... - class Column(ColumnView): def __setitem__(self, index: int | slice, value: typing.Any) -> None: ... def set_async(self, index: int | slice, value: typing.Any) -> FutureVoid: ... - @property def metadata(self) -> Metadata: ... - class Version: """ An atomic change within [deeplake.Dataset][]'s history """ - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... @property def client_timestamp(self) -> datetime.datetime: """ @@ -566,7 +562,6 @@ class Version: """ ... - class Row: """ Provides mutable access to a particular row in a dataset. @@ -580,7 +575,7 @@ class Row: def get_async(self, column: str) -> Future: """ Asynchronously retrieves data for the specified column and returns a Future object. - + Args: column (str): The name of the column to retrieve data for. @@ -588,13 +583,15 @@ class Row: Future: A Future object that will resolve to the value containing the column data. Examples: - >>> future = row.get_async("column_name") - >>> column = future.result() # Blocking call to get the result when it's ready. - + ```python + future = row.get_async("column_name") + column = future.result() # Blocking call to get the result when it's ready. + ``` + Notes: - - The Future will resolve asynchronously, meaning the method will not block execution + - The Future will resolve asynchronously, meaning the method will not block execution while the data is being retrieved. - - You can either wait for the result using `future.result()` (a blocking call) + - You can either wait for the result using `future.result()` (a blocking call) or use the Future in an `await` expression. """ @@ -615,24 +612,24 @@ class Row: FutureVoid: A FutureVoid object that will resolve when the operation is complete. Examples: - >>> future_void = row.set_async("column_name", new_value) - >>> future_void.wait() # Blocks until the operation is complete. - + ```python + future_void = row.set_async("column_name", new_value) + future_void.wait() # Blocks until the operation is complete. + ``` + Notes: - The method sets the value asynchronously and immediately returns a FutureVoid. - - You can either block and wait for the operation to complete using `wait()` + - You can either block and wait for the operation to complete using `wait()` or await the FutureVoid object in an asynchronous context. """ - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... @property def row_id(self) -> int: """ The row_id of the row """ - class RowRange: """ Provides mutable access to a row range in a dataset. 
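The `get_async`/`set_async` pair documented above slots directly into `asyncio` code, since `Future` and `FutureVoid` both support `await`. A minimal sketch, assuming an already-open dataset whose `name` column holds text (the dataset URL and column name here are hypothetical):

```python
import asyncio

import deeplake

async def uppercase_name(ds: deeplake.Dataset) -> None:
    row = ds[0]
    # Future resolves to the column value without blocking the event loop
    name = await row.get_async("name")
    # FutureVoid resolves once the write has been applied
    await row.set_async("name", name.upper())

ds = deeplake.open("al://my_org/dataset")  # hypothetical dataset URL
asyncio.run(uppercase_name(ds))
```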
@@ -652,11 +649,11 @@ class RowRange: """ The value for the given column """ - + def get_async(self, column: str) -> Future: """ Asynchronously retrieves data for the specified column and returns a Future object. - + Args: column (str): The name of the column to retrieve data for. @@ -664,13 +661,15 @@ class RowRange: Future: A Future object that will resolve to the value containing the column data. Examples: - >>> future = row_range.get_async("column_name") - >>> column = future.result() # Blocking call to get the result when it's ready. - + ```python + future = row_range.get_async("column_name") + column = future.result() # Blocking call to get the result when it's ready. + ``` + Notes: - - The Future will resolve asynchronously, meaning the method will not block execution + - The Future will resolve asynchronously, meaning the method will not block execution while the data is being retrieved. - - You can either wait for the result using `future.result()` (a blocking call) + - You can either wait for the result using `future.result()` (a blocking call) or use the Future in an `await` expression. """ @@ -691,17 +690,21 @@ class RowRange: FutureVoid: A FutureVoid object that will resolve when the operation is complete. Examples: - >>> future_void = row_range.set_async("column_name", new_value) - >>> future_void.wait() # Blocks until the operation is complete. - + ```python + future_void = row_range.set_async("column_name", new_value) + future_void.wait() # Blocks until the operation is complete. + ``` + Notes: - The method sets the value asynchronously and immediately returns a FutureVoid. - - You can either block and wait for the operation to complete using `wait()` + - You can either block and wait for the operation to complete using `wait()` or await the FutureVoid object in an asynchronous context. """ - def __repr__(self) -> str: ... - + def summary(self) -> None: + """ + Prints a summary of the RowRange. + """ class RowRangeView: """ @@ -723,10 +726,15 @@ class RowRangeView: The value for the given column """ + def summary(self) -> None: + """ + Prints a summary of the RowRange. + """ + def get_async(self, column: str) -> Future: """ Asynchronously retrieves data for the specified column and returns a Future object. - + Args: column (str): The name of the column to retrieve data for. @@ -734,19 +742,18 @@ class RowRangeView: Future: A Future object that will resolve to the value containing the column data. Examples: - >>> future = row_range_view.get_async("column_name") - >>> column = future.result() # Blocking call to get the result when it's ready. - + ```python + future = row_range_view.get_async("column_name") + column = future.result() # Blocking call to get the result when it's ready. + ``` + Notes: - - The Future will resolve asynchronously, meaning the method will not block execution + - The Future will resolve asynchronously, meaning the method will not block execution while the data is being retrieved. - - You can either wait for the result using `future.result()` (a blocking call) + - You can either wait for the result using `future.result()` (a blocking call) or use the Future in an `await` expression. """ - def __repr__(self) -> str: ... - - class RowView: """ Provides access to a particular row in a dataset. @@ -760,7 +767,7 @@ class RowView: def get_async(self, column: str) -> Future: """ Asynchronously retrieves data for the specified column and returns a Future object. - + Args: column (str): The name of the column to retrieve data for. 
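The same accessors can also be driven synchronously on a row range via `result()` and `wait()`. A sketch, assuming the open dataset `ds` has at least 100 rows and an integer `id` column (both hypothetical):

```python
rows = ds[0:100]                     # RowRange over the first 100 rows
ids = rows.get_async("id").result()  # blocks until the values are ready

future_void = rows.set_async("id", [i + 1 for i in ids])
future_void.wait()                   # blocks until the write completes
rows.summary()                       # prints a summary of the row range
```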
@@ -768,32 +775,31 @@
             Future: A Future object that will resolve to the value containing the column data.
 
         Examples:
-            >>> future = row_view.get_async("column_name")
-            >>> column = future.result()  # Blocking call to get the result when it's ready.
-
+            ```python
+            future = row_view.get_async("column_name")
+            column = future.result()  # Blocking call to get the result when it's ready.
+            ```
+
         Notes:
-            - The Future will resolve asynchronously, meaning the method will not block execution
+            - The Future will resolve asynchronously, meaning the method will not block execution
               while the data is being retrieved.
-            - You can either wait for the result using `future.result()` (a blocking call)
+            - You can either wait for the result using `future.result()` (a blocking call)
               or use the Future in an `await` expression.
         """
 
-    def __repr__(self) -> str: ...
-
+    def __str__(self) -> str: ...
     @property
     def row_id(self) -> int:
         """
         The row_id of the row
         """
 
-
 class DatasetView:
     """
     A DatasetView is a dataset-like structure. It has a defined schema and contains data
     which can be queried.
     """
 
-    def __repr__(self) -> str: ...
-
+    def __str__(self) -> str: ...
     @typing.overload
     def __getitem__(self, offset: int) -> RowView:
         """
         Get a row by offset within the DatasetView.
         """
         ...
 
@@ -808,6 +814,20 @@
         """
         ...
 
+    @typing.overload
+    def __getitem__(self, indices: list) -> RowRangeView:
+        """
+        Get a range of rows by the given list of indices within the DatasetView.
+        """
+        ...
+
+    @typing.overload
+    def __getitem__(self, indices: tuple) -> RowRangeView:
+        """
+        Get a range of rows by the given tuple of indices within the DatasetView.
+        """
+        ...
+
     @typing.overload
     def __getitem__(self, column: str) -> ColumnView:
         """
@@ -816,7 +836,7 @@
         ...
 
     def __getitem__(
-        self, input: int | slice | str
+        self, input: int | slice | list | tuple | str
     ) -> RowView | RowRangeView | ColumnView:
         """
         Returns a subset of data from the DatasetView.
 
@@ -825,32 +845,38 @@
         - `int`: The zero-based offset of the single row to return. Returns a [deeplake.RowView][]
         - `slice`: A slice specifying the range of rows to return. Returns a [deeplake.RowRangeView][]
+        - `list`: A list of indices specifying the rows to return. Returns a [deeplake.RowRangeView][]
+        - `tuple`: A tuple of indices specifying the rows to return. Returns a [deeplake.RowRangeView][]
         - `str`: A string specifying column to return all values from. Returns a [deeplake.ColumnView][]
 
         Examples:
-            >>> ds = deeplake.create("mem://")
-            >>> ds.add_column("id", int)
-            >>> ds.add_column("name", str)
-            >>> ds.append({"id": [1,2,3], "name": ["Mary", "Joe", "Bill"]})
-            >>>
-            >>> row = ds[1]
-            >>> print("Id:", row["id"], "Name:", row["name"])
-            Id: 2 Name: Joe
-            >>> rows = ds[1:2]
-            >>> print(rows["id"])
+            ```python
+            ds = deeplake.create("mem://")
+            ds.add_column("id", int)
+            ds.add_column("name", str)
+            ds.append({"id": [1,2,3], "name": ["Mary", "Joe", "Bill"]})
 
-            >>> column_data = ds["id"]
+            row = ds[1]
+            print("Id:", row["id"], "Name:", row["name"])  # Output: Id: 2 Name: Joe
+            rows = ds[1:2]
+            print(rows["id"])
+            column_data = ds["id"]
+            ```
         """
 
+    def __getstate__(self) -> tuple: ...
+    def __setstate__(self, arg0: tuple) -> None: ...
     def __iter__(self) -> typing.Iterator[RowView]:
         """
         Row based iteration over the dataset.
 
         Examples:
-            >>> for row in ds:
-            >>>     # process row
-            >>>     pass
+            ```python
+            for row in ds:
+                # process row
+                pass
+            ```
         """
         ...
 
@@ -866,18 +892,18 @@
         Prints a summary of the dataset.
         Examples:
-            >>> ds.summary()
-            Dataset(columns=(id,title,embedding), length=51611356)
-            +---------+-------------------------------------------------------+
-            | column  |                          type                         |
-            +---------+-------------------------------------------------------+
-            |   id    |               kind=generic, dtype=int32               |
-            +---------+-------------------------------------------------------+
-            |  title  |                          text                         |
-            +---------+-------------------------------------------------------+
-            |embedding|kind=embedding, dtype=array(dtype=float32, shape=[768])|
-            +---------+-------------------------------------------------------+
+            ```python
+            ds.summary()
+            ```
+            Example Output:
+            ```
+            Dataset length: 5
+            Columns:
+              id       : int64
+              title    : text
+              embedding: embedding(768)
+            ```
         """
         ...
 
@@ -886,9 +912,11 @@
         Executes the given TQL query against the dataset and return the results as a [deeplake.DatasetView][].
 
         Examples:
-            >>> result = ds.query("select * where category == 'active'")
-            >>> for row in result:
-            >>>     print("Id is: ", row["id"])
+            ```python
+            result = ds.query("select * where category == 'active'")
+            for row in result:
+                print("Id is: ", row["id"])
+            ```
         """
         ...
 
@@ -898,16 +926,24 @@
         Asynchronously executes the given TQL query against the dataset and return a future that will resolve into [deeplake.DatasetView][].
 
         Examples:
-            >>> future = ds.query_async("select * where category == 'active'")
-            >>> result = future.result()
-            >>> for row in result:
-            >>>     print("Id is: ", row["id"])
+            ```python
+            future = ds.query_async("select * where category == 'active'")
+            result = future.result()
+            for row in result:
+                print("Id is: ", row["id"])
 
-            >>> # or use the Future in an await expression
-            >>> future = ds.query_async("select * where category == 'active'")
-            >>> result = await future
-            >>> for row in result:
-            >>>     print("Id is: ", row["id"])
+            # or use the Future in an await expression
+            future = ds.query_async("select * where category == 'active'")
+            result = await future
+            for row in result:
+                print("Id is: ", row["id"])
+            ```
+        """
+        ...
+
+    def tag(self, name: str | None = None) -> Tag:
+        """
+        Saves the current view as a tag to its source dataset and returns the tag.
         """
         ...
 
@@ -917,7 +953,6 @@
         The schema of the dataset.
         """
 
-
     def tensorflow(self) -> typing.Any:
         """
         Returns a TensorFlow `tensorflow.data.Dataset` wrapper around this DatasetView.
 
         Raises:
             ImportError: If TensorFlow is not installed
 
         Examples:
-            >>> ds = deeplake.open("path/to/dataset")
-            >>> dl = ds.tensorflow().shuffle(500).batch(32).
-            >>> for i_batch, sample_batched in enumerate(dataloader):
-            >>>     process_batch(sample_batched)
+            ```python
+            ds = deeplake.open("path/to/dataset")
+            dl = ds.tensorflow().shuffle(500).batch(32)
+            for i_batch, sample_batched in enumerate(dl):
+                process_batch(sample_batched)
+            ```
         """
         ...
 
-
     def pytorch(self, transform: typing.Callable[[typing.Any], typing.Any] = None):
         """
         Returns a PyTorch `torch.utils.data. Dataset` wrapper around this dataset.
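The new `list`/`tuple` indexing pairs naturally with `query` and the view-level `tag` method introduced above. A sketch, with a hypothetical dataset path and column names:

```python
import deeplake

ds = deeplake.open("al://my_org/dataset")  # hypothetical path

view = ds.query("select * where category == 'active'")
picked = view[[0, 2, 4]]    # non-contiguous rows -> RowRangeView
ids = picked["id"]          # all "id" values across the picked rows

view.tag("active-subset")   # persist the view as a tag on its source dataset
```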
@@ -948,78 +984,37 @@ class DatasetView:
             ImportError: If pytorch is not installed
 
         Examples:
-            >>> from torch.utils.data import DataLoader
-            >>>
-            >>> ds = deeplake.open("path/to/dataset")
-            >>> dataloader = DataLoader(ds.pytorch(), batch_size=60,
-            >>>                         shuffle=True, num_workers=10)
-            >>> for i_batch, sample_batched in enumerate(dataloader):
-            >>>     process_batch(sample_batched)
+            ```python
+            from torch.utils.data import DataLoader
+
+            ds = deeplake.open("path/to/dataset")
+            dataloader = DataLoader(ds.pytorch(), batch_size=60,
+                                    shuffle=True, num_workers=10)
+            for i_batch, sample_batched in enumerate(dataloader):
+                process_batch(sample_batched)
+            ```
         """
         ...
 
-    def batches(self, batch_size: int, drop_last: bool = False) -> Prefetcher:
+    def batches(self, batch_size: int, drop_last: bool = False) -> typing.Iterable:
         """
-        Return a [deeplake.Prefetcher][] for this DatasetView
+        Returns an iterable of batches, which can be used to more efficiently stream large amounts of data from a DeepLake dataset, such as to a DataLoader and then to the training framework.
 
         Parameters:
             batch_size: Number of rows in each batch
             drop_last: Whether to drop the final batch if it is incomplete
-        """
-        ...
-
-class Prefetcher:
-    """
-    The Prefetcher can be used to more efficiently stream large amounts of data from a DeepLake dataset, such as to the DataLoader then to the training framework.
-
-    Examples:
-        >>> ds = deeplake.open("al://my_org/dataset")
-        >>> fetcher = deeplake.Prefetcher(view, batch_size=2000)
-        >>> for batch in dl:
-        >>>     process_batch(batch["images"])
-
-    """
-    def __init__(
-        self,
-        dataset: DatasetView,
-        batch_size: int = 1,
-        drop_last: bool = False,
-    ) -> None:
-        """
-        Parameters:
-            dataset: The [deeplake.DatasetView][] to stream from
-            batch_size: The numer of rows to return in each iteration
-            drop_last: If true, do not return a non-full final batch
+        Examples:
+            ```python
+            ds = deeplake.open("al://my_org/dataset")
+            batches = ds.batches(batch_size=2000, drop_last=True)
+            for batch in batches:
+                process_batch(batch["images"])
+            ```
         """
         ...
 
-    def __iter__(self) -> Prefetcher:
-        """
-        Iterate over the dataset view
-        """
-        ...
-
-    def __len__(self) -> int:
-        """
-        The number of batches in the Prefetcher
-        """
-        ...
-
-    def __next__(self) -> dict:
-        """
-        Returns the next batch of dataset
-        """
-        ...
-
-    def reset(self) -> None:
-        """
-        Reset the iterator
-        """
-        ...
-
-
 class Dataset(DatasetView):
     """
     Datasets are the primary data structure used in DeepLake. They are used to store and manage data for searching, training, evaluation.
 
     Unlike [deeplake.ReadOnlyDataset][], instances of `Dataset` can be modified.
     """
 
-    def __repr__(self) -> str: ...
-
+    def __str__(self) -> str: ...
     def tag(self, name: str, version: str | None = None) -> Tag:
         """
         Tags a version of the dataset. If no version is given, the current version is tagged.
 
@@ -1044,7 +1038,6 @@
         The collection of [deeplake.Tag][]s within the dataset
         """
-
     name: str
     """
     The name of the dataset. Setting the value will immediately persist the change without requiring a commit().
 
@@ -1108,6 +1101,20 @@
         """
         ...
 
+    @typing.overload
+    def __getitem__(self, indices: list) -> RowRange:
+        """
+        Get a range of rows by the given list of indices within the dataset.
+        """
+        ...
+
+    @typing.overload
+    def __getitem__(self, indices: tuple) -> RowRange:
+        """
+        Get a range of rows by the given tuple of indices within the dataset.
+        """
+        ...
+
+ @typing.overload def __getitem__(self, column: str) -> Column: """ @@ -1115,7 +1122,9 @@ class Dataset(DatasetView): """ ... - def __getitem__(self, input: int | slice | str) -> Row | RowRange | Column: + def __getitem__( + self, input: int | slice | list | tuple | str + ) -> Row | RowRange | Column: """ Returns a subset of data from the Dataset @@ -1123,17 +1132,26 @@ class Dataset(DatasetView): - `int`: The zero-based offset of the single row to return. Returns a [deeplake.Row][] - `slice`: A slice specifying the range of rows to return. Returns a [deeplake.RowRange][] + - `list`: A list of indices specifying the rows to return. Returns a [deeplake.RowRange][] + - `tuple`: A tuple of indices specifying the rows to return. Returns a [deeplake.RowRange][] - `str`: A string specifying column to return all values from. Returns a [deeplake.Column][] Examples: - >>> row = ds[318] + ```python + row = ds[318] - >>> rows = ds[931:1038] + rows = ds[931:1038] - >>> column_data = ds["id"] + rows = ds[931:1038:3] - """ + rows = ds[[1, 3, 5, 7]] + + rows = ds[(1, 3, 5, 7)] + column_data = ds["id"] + ``` + + """ ... def __iter__(self) -> typing.Iterator[Row]: @@ -1141,9 +1159,11 @@ class Dataset(DatasetView): Row based iteration over the dataset. Examples: - >>> for row in ds: - >>> # process row - >>> pass + ```python + for row in ds: + # process row + pass + ``` """ ... @@ -1163,10 +1183,10 @@ class Dataset(DatasetView): """ def add_column( - self, - name: str, - dtype: types.DataType | str | types.Type | type | typing.Callable, - format: formats.DataFormat | None = None, + self, + name: str, + dtype: types.DataType | str | types.Type | type | typing.Callable, + format: formats.DataFormat | None = None, ) -> None: """ Add a new column to the dataset. @@ -1184,26 +1204,25 @@ class Dataset(DatasetView): format (DataFormat, optional): The format of the column, if applicable. Only required when the dtype is [deeplake.types.DataType][]. Examples: - >>> ds.add_column("labels", deeplake.types.Int32) + ```python + ds.add_column("labels", deeplake.types.Int32) - >>> ds.add_column("labels", "int32") + ds.add_column("labels", "int32") - >>> ds.add_column("name", deeplake.types.Text()) + ds.add_column("name", deeplake.types.Text()) - >>> ds.add_column("json_data", deeplake.types.Dict()) + ds.add_column("json_data", deeplake.types.Dict()) - >>> ds.add_column("images", deeplake.types.Image(dtype=deeplake.types.UInt8(), sample_compression="jpeg")) + ds.add_column("images", deeplake.types.Image(dtype=deeplake.types.UInt8(), sample_compression="jpeg")) - >>> ds.add_column("embedding", deeplake.types.Embedding(dtype=deeplake.types.Float32(), dimensions=768)) + ds.add_column("embedding", deeplake.types.Embedding(dtype=deeplake.types.Float32(), dimensions=768)) + ``` Raises: deeplake.ColumnAlreadyExistsError: If a column with the same name already exists. """ - def remove_column( - self, - name: str - ) -> None: + def remove_column(self, name: str) -> None: """ Remove the existing column from the dataset. @@ -1211,17 +1230,15 @@ class Dataset(DatasetView): name: The name of the column to remove Examples: - >>> ds.remove_column("name") + ```python + ds.remove_column("name") + ``` Raises: deeplake.ColumnDoesNotExistsError: If a column with the specified name does not exists. """ - def rename_column( - self, - name: str, - new_name: str - ) -> None: + def rename_column(self, name: str, new_name: str) -> None: """ Renames the existing column in the dataset. 
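Taken together, the column APIs (`add_column`, `rename_column`, `remove_column`) support building and reshaping a schema incrementally. A minimal sketch against an in-memory dataset; the column names and commit message are illustrative:

```python
import numpy as np

import deeplake

ds = deeplake.create("mem://schema-demo")
ds.add_column("text", deeplake.types.Text())
ds.add_column("embedding", deeplake.types.Embedding(dtype=deeplake.types.Float32(), dimensions=768))
ds.append({"text": ["Hello World"] * 4, "embedding": np.random.rand(4, 768)})

ds.rename_column("text", "document")   # columns can be renamed in place...
ds.remove_column("embedding")          # ...or dropped entirely
ds.commit("Reshaped schema")
```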
@@ -1230,7 +1247,9 @@ class Dataset(DatasetView): new_name: The new name to set to column Examples: - >>> ds.rename_column("old_name", "new_name") + ```python + ds.rename_column("old_name", "new_name") + ``` Raises: deeplake.ColumnDoesNotExistsError: If a column with the specified name does not exists. @@ -1239,15 +1258,12 @@ class Dataset(DatasetView): @typing.overload def append(self, data: list[dict[str, typing.Any]]) -> None: ... - @typing.overload def append(self, data: dict[str, typing.Any]) -> None: ... - @typing.overload def append(self, data: DatasetView) -> None: ... - def append( - self, data: list[dict[str, typing.Any]] | dict[str, typing.Any] | DatasetView + self, data: list[dict[str, typing.Any]] | dict[str, typing.Any] | DatasetView ) -> None: """ Adds data to the dataset. @@ -1262,17 +1278,21 @@ class Dataset(DatasetView): data: The data to insert into the dataset. Examples: - >>> ds.append({"name": ["Alice", "Bob"], "age": [25, 30]}) + ```python + ds.append({"name": ["Alice", "Bob"], "age": [25, 30]}) - >>> ds.append([{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]) + ds.append([{"name": "Alice", "age": 25}, {"name": "Bob", "age": 30}]) - >>> ds.append({ - >>> "embedding": np.random.rand(4, 768), - >>> "text": ["Hello World"] * 4}) + ds.append({ + "embedding": np.random.rand(4, 768), + "text": ["Hello World"] * 4}) - >>> ds.append([{"embedding": np.random.rand(768), "text": "Hello World"}] * 4) + ds.append([{"embedding": np.random.rand(768), "text": "Hello World"}] * 4) + ``` - >>> ds.append(deeplake.from_parquet("./file.parquet")) + ```python + ds.append(deeplake.from_parquet("./file.parquet")) + ``` Raises: deeplake.ColumnMissingAppendValueError: If any column is missing from the input data. @@ -1297,9 +1317,13 @@ class Dataset(DatasetView): message (str, optional): A message to store in history describing the changes made in the version Examples: - >>> ds.commit() + ```python + ds.commit() + ``` - >>> ds.commit("Added data from updated documents") + ```python + ds.commit("Added data from updated documents") + ``` """ @@ -1313,16 +1337,25 @@ class Dataset(DatasetView): message (str, optional): A message to store in history describing the changes made in the commit Examples: - >>> ds.commit_async().wait() + ```python + ds.commit_async().wait() + ``` - >>> ds.commit_async("Added data from updated documents").wait() + ```python + ds.commit_async("Added data from updated documents").wait() + ``` - >>> await ds.commit_async() + ```python + await ds.commit_async() + ``` - >>> await ds.commit_async("Added data from updated documents") - - >>> future = ds.commit_async() # then you can check if the future is completed using future.is_completed() + ```python + await ds.commit_async("Added data from updated documents") + ``` + ```python + future = ds.commit_async() # then you can check if the future is completed using future.is_completed() + ``` """ def rollback(self) -> None: @@ -1335,7 +1368,21 @@ class Dataset(DatasetView): Asynchronously reverts any in-progress changes to the dataset you have made. Does not revert any changes that have been committed. """ - def push(self, url: str, creds: dict[str, str] | None = None, token: str | None = None) -> None: + def set_creds_key(self, key: str, token: str | None = None) -> None: + """ + Sets the key used to store the credentials for the dataset. + """ + pass + + @property + def creds_key(self) -> str | None: + """ + The key used to store the credentials for the dataset. 
+ """ + + def push( + self, url: str, creds: dict[str, str] | None = None, token: str | None = None + ) -> None: """ Pushes any new history from this dataset to the dataset at the given url @@ -1347,7 +1394,10 @@ class Dataset(DatasetView): token: Optional deeplake token """ ... - def push_async(self, url: str, creds: dict[str, str] | None = None, token: str | None = None) -> FutureVoid: + + def push_async( + self, url: str, creds: dict[str, str] | None = None, token: str | None = None + ) -> FutureVoid: """ Asynchronously Pushes new any history from this dataset to the dataset at the given url @@ -1360,7 +1410,9 @@ class Dataset(DatasetView): """ ... - def pull(self, url: str, creds: dict[str, str] | None = None, token: str | None = None) -> None: + def pull( + self, url: str, creds: dict[str, str] | None = None, token: str | None = None + ) -> None: """ Pulls any new history from the dataset at the passed url into this dataset. @@ -1372,7 +1424,10 @@ class Dataset(DatasetView): token: Optional deeplake token """ ... - def pull_async(self, url: str, creds: dict[str, str] | None = None, token: str | None = None) -> FutureVoid: + + def pull_async( + self, url: str, creds: dict[str, str] | None = None, token: str | None = None + ) -> FutureVoid: """ Asynchronously pulls any new history from the dataset at the passed url into this dataset. @@ -1385,7 +1440,6 @@ class Dataset(DatasetView): """ ... - @property def history(self) -> History: """ @@ -1399,65 +1453,22 @@ class Dataset(DatasetView): """ ... - class ReadOnlyDataset(DatasetView): - @typing.overload - def __getitem__(self, offset: int) -> RowView: - """ - Get a row by offset within the dataset. - """ - ... - - @typing.overload - def __getitem__(self, range: slice) -> RowRangeView: - """ - Get a range of rows by offset within the dataset. - """ - ... - - @typing.overload - def __getitem__(self, column: str) -> ColumnView: - """ - Get a column by name within the dataset. - """ - ... - - def __getitem__( - self, input: int | slice | str - ) -> RowView | RowRangeView | ColumnView: - """ - Returns a subset of data from the dataset. - - The result will depend on the type of value passed to the `[]` operator. - - - `int`: The zero-based offset of the single row to return. Returns a [deeplake.RowView][] - - `slice`: A slice specifying the range of rows to return. Returns a [deeplake.RowRangeView][] - - `str`: A string specifying column to return all values from. Returns a [deeplake.ColumnView][] - - Examples: - >>> row = ds[318] - - >>> rows = ds[931:1038] - - >>> column_data = ds["id"] - - """ - ... - def __iter__(self) -> typing.Iterator[RowView]: """ Row based iteration over the dataset. Examples: - >>> for row in ds: - >>> # process row - >>> pass + ```python + for row in ds: + # process row + pass + ``` """ ... - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... @property def tags(self) -> TagsView: """ @@ -1529,7 +1540,9 @@ class ReadOnlyDataset(DatasetView): """ ... - def push(self, url: str, creds: dict[str, str] | None = None, token: str | None = None) -> None: + def push( + self, url: str, creds: dict[str, str] | None = None, token: str | None = None + ) -> None: """ Pushes any history from this dataset to the dataset at the given url @@ -1539,7 +1552,10 @@ class ReadOnlyDataset(DatasetView): token: Optional deeplake token """ ... 
- def push_async(self, url: str, creds: dict[str, str] | None = None, token: str | None = None) -> FutureVoid: + + def push_async( + self, url: str, creds: dict[str, str] | None = None, token: str | None = None + ) -> FutureVoid: """ Asynchronously Pushes any history from this dataset to the dataset at the given url @@ -1567,38 +1583,35 @@ class ReadOnlyDataset(DatasetView): class ExpiredTokenError(Exception): pass - class FormatNotSupportedError(Exception): pass - class UnevenColumnsError(Exception): pass - class UnevenUpdateError(Exception): pass - class ColumnMissingAppendValueError(Exception): pass - class ColumnAlreadyExistsError(Exception): pass - class ColumnDoesNotExistError(Exception): pass - class InvalidColumnValueError(Exception): pass +class InvalidPolygonShapeError(Exception): + pass -class GcsStorageProviderFailed(Exception): +class InvalidLinkDataError(Exception): pass +class GcsStorageProviderFailed(Exception): + pass class History: """ @@ -1606,15 +1619,10 @@ class History: """ @typing.overload - def __getitem__(self, offset: int) -> Version: - ... + def __getitem__(self, offset: int) -> Version: ... @typing.overload - def __getitem__(self, version: str) -> Version: - ... - - def __getitem__(self, input: int | str) -> Version: - ... - + def __getitem__(self, version: str) -> Version: ... + def __getitem__(self, input: int | str) -> Version: ... def __iter__(self) -> typing.Iterator[Version]: """ Iterate over the history, starting at the initial version @@ -1627,153 +1635,131 @@ class History: """ ... - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... class InvalidType(Exception): pass - class LogExistsError(Exception): pass - class LogNotexistsError(Exception): pass - class IncorrectDeeplakePathError(Exception): pass - class AuthenticationError(Exception): pass +class BadRequestError(Exception): + pass class AuthorizationError(Exception): pass - class NotFoundError(Exception): pass - class AgreementError(Exception): pass - class AgreementNotAcceptedError(Exception): pass - class NotLoggedInAgreementError(Exception): pass - class JSONKeyNotFound(Exception): pass - class JSONIndexNotFound(Exception): pass - class UnknownFormat(Exception): pass - class UnknownStringType(Exception): pass - class InvalidChunkStrategyType(Exception): pass - class InvalidSequenceOfSequence(Exception): pass - class InvalidTypeAndFormatPair(Exception): pass +class InvalidLinkType(Exception): + pass class UnknownType(Exception): pass - class InvalidTextType(Exception): pass - class UnsupportedPythonType(Exception): pass - class UnsupportedSampleCompression(Exception): pass - class UnsupportedChunkCompression(Exception): pass - class InvalidImageCompression(Exception): pass +class InvalidCredsKeyAssignmentError(Exception): + pass -class InvalidMaskCompression(Exception): +class CredsKeyAlreadyAssignedError(Exception): pass +class InvalidSegmentMaskCompression(Exception): + pass -class DtypeMismatch(Exception): +class InvalidBinaryMaskCompression(Exception): pass +class DtypeMismatch(Exception): + pass class UnspecifiedDtype(Exception): pass - class DimensionsMismatch(Exception): pass - class ShapeIndexOutOfChunk(Exception): pass - class BytePositionIndexOutOfChunk(Exception): pass - class TensorAlreadyExists(Exception): pass - class CanNotCreateTensorWithProvidedCompressions(Exception): pass - class WrongChunkCompression(Exception): pass - class WrongSampleCompression(Exception): pass - class UnknownBoundingBoxCoordinateFormat(Exception): pass - class 
UnknownBoundingBoxPixelFormat(Exception): pass - class InvalidTypeDimensions(Exception): pass - class SchemaView: """ A read-only view of a [deeplake.Dataset][] [deeplake.Schema][]. @@ -1798,8 +1784,7 @@ class SchemaView: """ ... - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... class Schema: """ @@ -1825,35 +1810,32 @@ class Schema: """ ... - def __repr__(self) -> str: ... - + def __str__(self) -> str: ... class StorageAccessDenied(Exception): pass - class StorageKeyAlreadyExists(Exception): pass - class StorageKeyNotFound(Exception): pass - class StorageNetworkConnectionError(Exception): pass - class StorageInternalError(Exception): pass - class WriteFailedError(Exception): pass - -def create(url: str, creds: dict[str, str] | None = None, token: str | None = None, - schema: schemas.SchemaTemplate | None = None) -> Dataset: +def create( + url: str, + creds: dict[str, str] | None = None, + token: str | None = None, + schema: schemas.SchemaTemplate | None = None, +) -> Dataset: """ Creates a new dataset at the given URL. @@ -1882,101 +1864,135 @@ def create(url: str, creds: dict[str, str] | None = None, token: str | None = No schema (dict): The initial schema to use for the dataset. See `deeplake.schema` such as [deeplake.schemas.TextEmbeddings][] for common starting schemas. Examples: - >>> import deeplake - >>> from deeplake import types - >>> - >>> # Create a dataset in your local filesystem: - >>> ds = deeplake.create("directory_path") - >>> ds.add_column("id", types.Int32()) - >>> ds.add_column("url", types.Text()) - >>> ds.add_column("embedding", types.Embedding(768)) - >>> ds.commit() - >>> ds.summary() - Dataset(columns=(id,url,embedding), length=0) - +---------+-------------------------------------------------------+ - | column | type | - +---------+-------------------------------------------------------+ - | id | kind=generic, dtype=int32 | - +---------+-------------------------------------------------------+ - | url | text | - +---------+-------------------------------------------------------+ - |embedding|kind=embedding, dtype=array(dtype=float32, shape=[768])| - +---------+-------------------------------------------------------+ - - - >>> # Create dataset in your app.activeloop.ai organization: - >>> ds = deeplake.create("al://organization_id/dataset_name") - - >>> # Create a dataset stored in your cloud using specified credentials: - >>> ds = deeplake.create("s3://mybucket/my_dataset", - >>> creds = {"aws_access_key_id": ..., ...}) - - >>> # Create dataset stored in your cloud using app.activeloop.ai managed credentials. - >>> ds = deeplake.create("s3://mybucket/my_dataset", - >>> creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") - - >>> # Create dataset stored in your cloud using app.activeloop.ai managed credentials. 
- >>> ds = deeplake.create("azure://bucket/path/to/dataset") - - >>> ds = deeplake.create("gcs://bucket/path/to/dataset") - - >>> ds = deeplake.create("mem://in-memory") - + ```python + import deeplake + from deeplake import types + + # Create a dataset in your local filesystem: + ds = deeplake.create("directory_path") + ds.add_column("id", types.Int32()) + ds.add_column("url", types.Text()) + ds.add_column("embedding", types.Embedding(768)) + ds.commit() + ds.summary() + ``` + Output: + ``` + Dataset length: 0 + Columns: + id : int32 + url : text + embedding: embedding(768) + ``` + + ```python + # Create dataset in your app.activeloop.ai organization: + ds = deeplake.create("al://organization_id/dataset_name") + ``` + + ```python + # Create a dataset stored in your cloud using specified credentials: + ds = deeplake.create("s3://mybucket/my_dataset", + creds = {"aws_access_key_id": ..., ...}) + ``` + + ```python + # Create dataset stored in your cloud using app.activeloop.ai managed credentials. + ds = deeplake.create("s3://mybucket/my_dataset", + creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id") + ``` + + ```python + # Create dataset stored in your cloud using app.activeloop.ai managed credentials. + ds = deeplake.create("azure://bucket/path/to/dataset") + ``` + + ```python + ds = deeplake.create("gcs://bucket/path/to/dataset") + ``` + + ```python + ds = deeplake.create("mem://in-memory") + ``` Raises: ValueError: if a dataset already exists at the given URL """ - -def create_async(url: str, creds: dict[str, str] | None = None, token: str | None = None, - schema: schemas.SchemaTemplate | None = None) -> Future: +def create_async( + url: str, + creds: dict[str, str] | None = None, + token: str | None = None, + schema: schemas.SchemaTemplate | None = None, +) -> Future: """ Asynchronously creates a new dataset at the given URL. See [deeplake.create][] for more information. To open an existing dataset, use [deeplake.open_async][]. - - Examples: - >>> import deeplake - >>> from deeplake import types - >>> - >>> # Asynchronously create a dataset in your local filesystem: - >>> ds = await deeplake.create_async("directory_path") - >>> await ds.add_column("id", types.Int32()) - >>> await ds.add_column("url", types.Text()) - >>> await ds.add_column("embedding", types.Embedding(768)) - >>> await ds.commit() - >>> await ds.summary() # Example of usage in an async context - - >>> # Alternatively, create a dataset using .result(). - >>> future_ds = deeplake.create_async("directory_path") - >>> ds = future_ds.result() # Blocks until the dataset is created - - >>> # Create a dataset in your app.activeloop.ai organization: - >>> ds = await deeplake.create_async("al://organization_id/dataset_name") - - >>> # Create a dataset stored in your cloud using specified credentials: - >>> ds = await deeplake.create_async("s3://mybucket/my_dataset", - >>> creds={"aws_access_key_id": ..., ...}) - - >>> # Create dataset stored in your cloud using app.activeloop.ai managed credentials. - >>> ds = await deeplake.create_async("s3://mybucket/my_dataset", - >>> creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") - - >>> # Create dataset stored in your cloud using app.activeloop.ai managed credentials. 
- >>> ds = await deeplake.create_async("azure://bucket/path/to/dataset") - >>> ds = await deeplake.create_async("gcs://bucket/path/to/dataset") - - >>> ds = await deeplake.create_async("mem://in-memory") + Examples: + ```python + import deeplake + from deeplake import types + + # Asynchronously create a dataset in your local filesystem: + ds = await deeplake.create_async("directory_path") + await ds.add_column("id", types.Int32()) + await ds.add_column("url", types.Text()) + await ds.add_column("embedding", types.Embedding(768)) + await ds.commit() + await ds.summary() # Example of usage in an async context + ``` + + ```python + # Alternatively, create a dataset using .result(). + future_ds = deeplake.create_async("directory_path") + ds = future_ds.result() # Blocks until the dataset is created + ``` + + ```python + # Create a dataset in your app.activeloop.ai organization: + ds = await deeplake.create_async("al://organization_id/dataset_name") + ``` + + ```python + # Create a dataset stored in your cloud using specified credentials: + ds = await deeplake.create_async("s3://mybucket/my_dataset", + creds={"aws_access_key_id": ..., ...}) + ``` + + ```python + # Create dataset stored in your cloud using app.activeloop.ai managed credentials. + ds = await deeplake.create_async("s3://mybucket/my_dataset", + creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") + ``` + + ```python + # Create dataset stored in your cloud using app.activeloop.ai managed credentials. + ds = await deeplake.create_async("azure://bucket/path/to/dataset") + ``` + + ```python + ds = await deeplake.create_async("gcs://bucket/path/to/dataset") + ``` + + ```python + ds = await deeplake.create_async("mem://in-memory") + ``` Raises: ValueError: if a dataset already exists at the given URL (will be raised when the future is awaited) """ -def copy(src: str, dst: str, src_creds: dict[str, str] | None = None, dst_creds: dict[str, str] | None = None, - token: str | None = None,) -> None: +def copy( + src: str, + dst: str, + src_creds: dict[str, str] | None = None, + dst_creds: dict[str, str] | None = None, + token: str | None = None, +) -> None: """ Copies the dataset at the source URL to the destination URL. @@ -1990,15 +2006,15 @@ def copy(src: str, dst: str, src_creds: dict[str, str] | None = None, dst_creds: token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated. Examples: - >>> deeplake.copy("al://organization_id/source_dataset", "al://organization_id/destination_dataset") + ```python + deeplake.copy("al://organization_id/source_dataset", "al://organization_id/destination_dataset") + ``` """ - def delete( - url: str, - creds: dict[str, str] | None = None, - token: str | None = None) -> None: + url: str, creds: dict[str, str] | None = None, token: str | None = None +) -> None: """ Deletes an existing dataset. @@ -2010,7 +2026,7 @@ def delete( """ def exists( - url: str, creds: dict[str, str] | None = None, token: str | None = None + url: str, creds: dict[str, str] | None = None, token: str | None = None ) -> bool: """ Check if a dataset exists at the given URL @@ -2022,7 +2038,7 @@ def exists( """ def open( - url: str, creds: dict[str, str] | None = None, token: str | None = None + url: str, creds: dict[str, str] | None = None, token: str | None = None ) -> Dataset: """ Opens an existing dataset, potenitally for modifying its content. 
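End to end, the module-level lifecycle helpers (`exists`, `copy`, `open`, `create`) compose as follows; all URLs here are hypothetical:

```python
import deeplake

src = "s3://bucket/path/to/dataset"    # hypothetical source
backup = "al://my_org/dataset-backup"  # hypothetical destination

if deeplake.exists(src):
    deeplake.copy(src, backup)  # copy structure and data to the new location
    ds = deeplake.open(src)     # then open the original for modification
else:
    ds = deeplake.create(src)   # or create it if it does not exist yet
```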
@@ -2052,23 +2068,34 @@
         token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated.
 
     Examples:
-        >>> # Load dataset managed by Deep Lake.
-        >>> ds = deeplake.open("al://organization_id/dataset_name")
+        ```python
+        # Load dataset managed by Deep Lake.
+        ds = deeplake.open("al://organization_id/dataset_name")
+        ```
 
-        >>> # Load dataset stored in your cloud using your own credentials.
-        >>> ds = deeplake.open("s3://bucket/my_dataset",
-        >>>     creds = {"aws_access_key_id": ..., ...})
+        ```python
+        # Load dataset stored in your cloud using your own credentials.
+        ds = deeplake.open("s3://bucket/my_dataset",
+            creds = {"aws_access_key_id": ..., ...})
+        ```
 
-        >>> # Load dataset stored in your cloud using Deep Lake managed credentials.
-        >>> ds = deeplake.open("s3://bucket/my_dataset",
-        >>>     ...creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id")
+        ```python
+        # Load dataset stored in your cloud using Deep Lake managed credentials.
+        ds = deeplake.open("s3://bucket/my_dataset",
+            creds = {"creds_key": "managed_creds_key"}, org_id = "my_org_id")
+        ```
 
-        >>> ds = deeplake.open("s3://bucket/path/to/dataset")
+        ```python
+        ds = deeplake.open("s3://bucket/path/to/dataset")
+        ```
 
-        >>> ds = deeplake.open("azure://bucket/path/to/dataset")
-
-        >>> ds = deeplake.open("gcs://bucket/path/to/dataset")
+        ```python
+        ds = deeplake.open("azure://bucket/path/to/dataset")
+        ```
 
+        ```python
+        ds = deeplake.open("gcs://bucket/path/to/dataset")
+        ```
     """
 
 def open_async(
@@ -2080,33 +2107,47 @@
     See [deeplake.open][] for opening the dataset synchronously.
 
     Examples:
-        >>> # Asynchronously load dataset managed by Deep Lake using await.
-        >>> ds = await deeplake.open_async("al://organization_id/dataset_name")
-
-        >>> # Asynchronously load dataset stored in your cloud using your own credentials.
-        >>> ds = await deeplake.open_async("s3://bucket/my_dataset",
-        >>>     creds={"aws_access_key_id": ..., ...})
-
-        >>> # Asynchronously load dataset stored in your cloud using Deep Lake managed credentials.
-        >>> ds = await deeplake.open_async("s3://bucket/my_dataset",
-        >>>     creds={"creds_key": "managed_creds_key"}, org_id="my_org_id")
-
-        >>> ds = await deeplake.open_async("s3://bucket/path/to/dataset")
-
-        >>> ds = await deeplake.open_async("azure://bucket/path/to/dataset")
-
-        >>> ds = await deeplake.open_async("gcs://bucket/path/to/dataset")
-
-        >>> # Alternatively, load the dataset using .result().
-        >>> future_ds = deeplake.open_async("al://organization_id/dataset_name")
-        >>> ds = future_ds.result()  # Blocks until the dataset is loaded
+        ```python
+        # Asynchronously load dataset managed by Deep Lake using await.
+        ds = await deeplake.open_async("al://organization_id/dataset_name")
+        ```
+
+        ```python
+        # Asynchronously load dataset stored in your cloud using your own credentials.
+        ds = await deeplake.open_async("s3://bucket/my_dataset",
+            creds={"aws_access_key_id": ..., ...})
+        ```
+
+        ```python
+        # Asynchronously load dataset stored in your cloud using Deep Lake managed credentials.
+ ds = await deeplake.open_async("s3://bucket/my_dataset", + creds={"creds_key": "managed_creds_key"}, org_id="my_org_id") + ``` + + ```python + ds = await deeplake.open_async("s3://bucket/path/to/dataset") + ``` + + ```python + ds = await deeplake.open_async("azure://bucket/path/to/dataset") + ``` + + ```python + ds = await deeplake.open_async("gcs://bucket/path/to/dataset") + ``` + + ```python + # Alternatively, load the dataset using .result(). + future_ds = deeplake.open_async("al://organization_id/dataset_name") + ds = future_ds.result() # Blocks until the dataset is loaded + ``` """ def like( - src: DatasetView, - dest: str, - creds: dict[str, str] | None = None, - token: str | None = None, + src: DatasetView, + dest: str, + creds: dict[str, str] | None = None, + token: str | None = None, ) -> Dataset: """ Creates a new dataset by copying the ``source`` dataset's structure to a new location. @@ -2126,18 +2167,19 @@ def like( token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated. Examples: - >>> ds = deeplake.like(src="az://bucket/existing/to/dataset", - >>> dest="s3://bucket/new/dataset") + ```python + ds = deeplake.like(src="az://bucket/existing/to/dataset", + dest="s3://bucket/new/dataset") + ``` """ - def connect( - src: str, - dest: str | None = None, - org_id: str | None = None, - creds_key: str | None = None, - token: str | None = None, + src: str, + dest: str | None = None, + org_id: str | None = None, + creds_key: str | None = None, + token: str | None = None, ) -> Dataset: """ Connects an existing dataset your [app.activeloop.ai](https://app.activeloop.ai) account. @@ -2154,25 +2196,34 @@ def connect( token (str, optional): Activeloop token used to fetch the managed credentials. Examples: - >>> ds = deeplake.connect("s3://bucket/path/to/dataset", - >>> "al://my_org/dataset") + ```python + ds = deeplake.connect("s3://bucket/path/to/dataset", + "al://my_org/dataset") + ``` - >>> ds = deeplake.connect("s3://bucket/path/to/dataset", - >>> "al://my_org/dataset", creds_key="my_key") + ```python + ds = deeplake.connect("s3://bucket/path/to/dataset", + "al://my_org/dataset", creds_key="my_key") + ``` - >>> # Connect the dataset as al://my_org/dataset - >>> ds = deeplake.connect("s3://bucket/path/to/dataset", - >>> org_id="my_org") + ```python + # Connect the dataset as al://my_org/dataset + ds = deeplake.connect("s3://bucket/path/to/dataset", + org_id="my_org") + ``` - >>> ds = deeplake.connect("az://bucket/path/to/dataset", - >>> "al://my_org/dataset", creds_key="my_key") + ```python + ds = deeplake.connect("az://bucket/path/to/dataset", + "al://my_org/dataset", creds_key="my_key") + ``` - >>> ds = deeplake.connect("gcs://bucket/path/to/dataset", - >>> "al://my_org/dataset", creds_key="my_key") + ```python + ds = deeplake.connect("gcs://bucket/path/to/dataset", + "al://my_org/dataset", creds_key="my_key") + ``` """ - def disconnect(url: str, token: str | None = None) -> None: """ Disconnect the dataset your Activeloop account. @@ -2187,13 +2238,14 @@ def disconnect(url: str, token: str | None = None) -> None: token (str, optional): Activeloop token to authenticate user. 
Examples: - >>> deeplake.disconnect("al://my_org/dataset_name") + ```python + deeplake.disconnect("al://my_org/dataset_name") + ``` """ - def open_read_only( - url: str, creds: dict[str, str] | None = None, token: str | None = None + url: str, creds: dict[str, str] | None = None, token: str | None = None ) -> ReadOnlyDataset: """ Opens an existing dataset in read-only mode. @@ -2223,30 +2275,39 @@ def open_read_only( token (str, optional): Activeloop token to authenticate user. Examples: - >>> ds = deeplake.open_read_only("directory_path") - >>> ds.summary() - Dataset(columns=(id,url,embedding), length=0) - +---------+-------------------------------------------------------+ - | column | type | - +---------+-------------------------------------------------------+ - | id | kind=generic, dtype=int32 | - +---------+-------------------------------------------------------+ - | url | text | - +---------+-------------------------------------------------------+ - |embedding|kind=embedding, dtype=array(dtype=float32, shape=[768])| - +---------+-------------------------------------------------------+ + ```python + ds = deeplake.open_read_only("directory_path") + ds.summary() + ``` + Example Output: + ``` + Dataset length: 5 + Columns: + id : int32 + url : text + embedding: embedding(768) + ``` - >>> ds = deeplake.open_read_only("file:///path/to/dataset") + ```python + ds = deeplake.open_read_only("file:///path/to/dataset") + ``` - >>> ds = deeplake.open_read_only("s3://bucket/path/to/dataset") + ```python + ds = deeplake.open_read_only("s3://bucket/path/to/dataset") + ``` - >>> ds = deeplake.open_read_only("azure://bucket/path/to/dataset") + ```python + ds = deeplake.open_read_only("azure://bucket/path/to/dataset") + ``` - >>> ds = deeplake.open_read_only("gcs://bucket/path/to/dataset") - - >>> ds = deeplake.open_read_only("mem://in-memory") + ```python + ds = deeplake.open_read_only("gcs://bucket/path/to/dataset") + ``` + ```python + ds = deeplake.open_read_only("mem://in-memory") + ``` """ def open_read_only_async( @@ -2258,22 +2319,36 @@ def open_read_only_async( See [deeplake.open_async][] for opening datasets for modification and [deeplake.open_read_only][] for sync open. Examples: - >>> # Asynchronously open a dataset in read-only mode: - >>> ds = await deeplake.open_read_only_async("directory_path") + ```python + # Asynchronously open a dataset in read-only mode: + ds = await deeplake.open_read_only_async("directory_path") + ``` - >>> # Alternatively, open the dataset using .result(). - >>> future_ds = deeplake.open_read_only_async("directory_path") - >>> ds = future_ds.result() # Blocks until the dataset is loaded + ```python + # Alternatively, open the dataset using .result(). 
+ future_ds = deeplake.open_read_only_async("directory_path") + ds = future_ds.result() # Blocks until the dataset is loaded + ``` - >>> ds = await deeplake.open_read_only_async("file:///path/to/dataset") + ```python + ds = await deeplake.open_read_only_async("file:///path/to/dataset") + ``` - >>> ds = await deeplake.open_read_only_async("s3://bucket/path/to/dataset") + ```python + ds = await deeplake.open_read_only_async("s3://bucket/path/to/dataset") + ``` - >>> ds = await deeplake.open_read_only_async("azure://bucket/path/to/dataset") + ```python + ds = await deeplake.open_read_only_async("azure://bucket/path/to/dataset") + ``` - >>> ds = await deeplake.open_read_only_async("gcs://bucket/path/to/dataset") + ```python + ds = await deeplake.open_read_only_async("gcs://bucket/path/to/dataset") + ``` - >>> ds = await deeplake.open_read_only_async("mem://in-memory") + ```python + ds = await deeplake.open_read_only_async("mem://in-memory") + ``` """ def from_parquet(url: str) -> ReadOnlyDataset: @@ -2284,13 +2359,6 @@ def from_parquet(url: str) -> ReadOnlyDataset: url: The URL of the Parquet dataset. If no protocol is specified, it assumes `file://` """ - -def __child_atfork() -> None: - ... - - -def __parent_atfork() -> None:... - - - -def __prepare_atfork() -> None:... +def __child_atfork() -> None: ... +def __parent_atfork() -> None: ... +def __prepare_atfork() -> None: ... diff --git a/python/deeplake/_tensorflow.py b/python/deeplake/_tensorflow.py index c62b147814..215397b414 100644 --- a/python/deeplake/_tensorflow.py +++ b/python/deeplake/_tensorflow.py @@ -1,9 +1,12 @@ import numpy as np + try: import tensorflow as tf from tensorflow.data import Dataset except ImportError: - raise ImportError("TensorFlow is not installed. Please install tensorflow to use this feature.") + raise ImportError( + "TensorFlow is not installed. Please install tensorflow to use this feature." + ) import deeplake diff --git a/python/deeplake/_torch.py b/python/deeplake/_torch.py index 3713fcd6af..dd4c30c0d2 100644 --- a/python/deeplake/_torch.py +++ b/python/deeplake/_torch.py @@ -1,7 +1,9 @@ try: from torch.utils.data import Dataset except ImportError: - raise ImportError("Torch is not installed. Please install torch to use this feature.") + raise ImportError( + "Torch is not installed. Please install torch to use this feature." + ) import deeplake diff --git a/python/deeplake/core.pyi b/python/deeplake/core.pyi index 1998f64b38..6d8facfbb5 100644 --- a/python/deeplake/core.pyi +++ b/python/deeplake/core.pyi @@ -9,42 +9,23 @@ import typing __all__ = ["Dict", "IndexMapping64", "MemoryBuffer"] class Dict: - def __getstate__(self: dict) -> dict: - ... - def __setstate__(self: dict, arg0: dict) -> None: - ... - def __eq__(self: dict, other: dict | dict) -> bool: - ... - def __getitem__(self: dict, key: str) -> typing.Any: - ... - def __len__(self: dict) -> int: - ... - def __ne__(self: dict, other: dict | dict) -> bool: - ... - def items(self: dict) -> list: - ... - def keys(self: dict) -> list[str]: - ... - def to_dict(self: dict) -> dict: - ... - + def __getstate__(self: dict) -> dict: ... + def __setstate__(self: dict, arg0: dict) -> None: ... + def __eq__(self: dict, other: dict | dict) -> bool: ... + def __getitem__(self: dict, key: str) -> typing.Any: ... + def __len__(self: dict) -> int: ... + def __ne__(self: dict, other: dict | dict) -> bool: ... + def __str__(self: dict) -> str: ... + def items(self: dict) -> list: ... + def keys(self: dict) -> list[str]: ... + def to_dict(self: dict) -> dict: ... 
class IndexMapping64: def __getitem__(self, index: int) -> int: ... - - - def __getstate__(self) -> tuple: - ... - - def __iter__(self) -> typing.Iterator[int]: - ... - - def __len__(self) -> int: - ... - - def __setstate__(self, arg0: tuple) -> None: - ... - + def __getstate__(self) -> tuple: ... + def __iter__(self) -> typing.Iterator[int]: ... + def __len__(self) -> int: ... + def __setstate__(self, arg0: tuple) -> None: ... class MemoryBuffer: def __buffer__(self, flags): diff --git a/python/deeplake/formats.pyi b/python/deeplake/formats.pyi index 8a0aa3b199..8156d63ffd 100644 --- a/python/deeplake/formats.pyi +++ b/python/deeplake/formats.pyi @@ -7,7 +7,7 @@ class DataFormat: Base class for all datafile formats. """ - def __repr__(self) -> str: ... + def __str__(self) -> str: ... def Chunk( sample_compression: str | None = None, chunk_compression: str | None = None diff --git a/python/deeplake/schemas.py b/python/deeplake/schemas.py index 4d3bf173ad..c15dc4e3ab 100644 --- a/python/deeplake/schemas.py +++ b/python/deeplake/schemas.py @@ -5,4 +5,3 @@ "COCOImages", "SchemaTemplate", ] - diff --git a/python/deeplake/schemas.pyi b/python/deeplake/schemas.pyi index 2c4cbd9a3f..0e37e0c2c6 100644 --- a/python/deeplake/schemas.pyi +++ b/python/deeplake/schemas.pyi @@ -8,7 +8,6 @@ __all__ = [ "SchemaTemplate", ] - def TextEmbeddings(embedding_size: int, quantize: bool = False) -> SchemaTemplate: """ A schema for storing embedded text from documents. @@ -25,22 +24,28 @@ def TextEmbeddings(embedding_size: int, quantize: bool = False) -> SchemaTemplat quantize: If true, quantize the embeddings to slightly decrease accuracy while greatly increasing query speed Examples: - >>> # Create a dataset with the standard schema - >>> ds = deeplake.create("ds_path", - >>> schema=deeplake.schemas.TextEmbeddings(768).build()) - - >>> # Customize the schema before creating the dataset - >>> ds = deeplake.create("ds_path", schema=deeplake.schemas.TextEmbeddings(768) - >>> .rename("embedding", "text_embed") - >>> .add("author", types.Text()) - >>> .build()) + ```python + # Create a dataset with the standard schema + ds = deeplake.create("ds_path", + schema=deeplake.schemas.TextEmbeddings(768).build()) + + # Customize the schema before creating the dataset + ds = deeplake.create("ds_path", schema=deeplake.schemas.TextEmbeddings(768) + .rename("embedding", "text_embed") + .add("author", types.Text()) + .build()) + ``` """ ... - -def COCOImages(embedding_size: int, quantize: bool = False, objects: bool = True, keypoints: bool = False, - stuffs: bool = False) -> SchemaTemplate: +def COCOImages( + embedding_size: int, + quantize: bool = False, + objects: bool = True, + keypoints: bool = False, + stuffs: bool = False, +) -> SchemaTemplate: """ A schema for storing COCO-based image data. 
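Since `DataFormat` and the type stubs below now route display through `__str__` rather than `__repr__`, printing a schema built from one of these templates exercises the new implementations. A minimal sketch, assuming the `TextEmbeddings` template above and that `ds.schema.columns` iterates column descriptors as elsewhere in this patch; `"ds_path"` is a placeholder:

```python
import deeplake

# Build the standard text-embeddings schema and print each column's type;
# str() on the dtype goes through the __str__ implementations in this patch.
ds = deeplake.create("ds_path", schema=deeplake.schemas.TextEmbeddings(768).build())
for col in ds.schema.columns:
    print(col.name, str(col.dtype))
```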
@@ -76,35 +81,43 @@ def COCOImages(embedding_size: int, quantize: bool = False, objects: bool = True quantize: If true, quantize the embeddings to slightly decrease accuracy while greatly increasing query speed Examples: - >>> # Create a dataset with the standard schema - >>> ds = deeplake.create("ds_path", - >>> schema=deeplake.schemas.COCOImages(768).build()) - - >>> # Customize the schema before creating the dataset - >>> ds = deeplake.create("ds_path", schema=deeplake.schemas.COCOImages(768, - >>> objects=True, keypoints=True) - >>> .rename("embedding", "image_embed") - >>> .add("author", types.Text()).build()) + ```python + # Create a dataset with the standard schema + ds = deeplake.create("ds_path", + schema=deeplake.schemas.COCOImages(768).build()) + + # Customize the schema before creating the dataset + ds = deeplake.create("ds_path", schema=deeplake.schemas.COCOImages(768, + objects=True, keypoints=True) + .rename("embedding", "image_embed") + .add("author", types.Text()).build()) + ``` """ ... - class SchemaTemplate: """ A template that can be used for creating a new dataset with [deeplake.create][] """ # Temporary workaround. Need to remove `deeplake._deeplake` from the return type. - def __init__(self, - schema: dict[str, deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type]) -> None: + def __init__( + self, + schema: dict[ + str, deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type + ], + ) -> None: """ Constructs a new SchemaTemplate from the given dict """ ... - def add(self, name: str, - dtype: deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type) -> SchemaTemplate: + def add( + self, + name: str, + dtype: deeplake._deeplake.types.DataType | str | deeplake._deeplake.types.Type, + ) -> SchemaTemplate: """ Adds a new column to the template diff --git a/python/deeplake/tql.pyi b/python/deeplake/tql.pyi index 10dee309d2..42af0b12c7 100644 --- a/python/deeplake/tql.pyi +++ b/python/deeplake/tql.pyi @@ -17,10 +17,12 @@ def register_function(function: typing.Callable) -> None: to be used in TQL should accept input arguments as numpy arrays and return numpy array. 
Examples: - >>> def next_number(a): - >>> return a + 1 - >>> - >>> deeplake.tql.register_function(next_number) - >>> - >>> r = ds.query("SELECT * WHERE next_number(column_name) > 10") + ```python + def next_number(a): + return a + 1 + + deeplake.tql.register_function(next_number) + + r = ds.query("SELECT * WHERE next_number(column_name) > 10") + ``` """ diff --git a/python/deeplake/types.py b/python/deeplake/types.py index 6311285d46..008b60117d 100644 --- a/python/deeplake/types.py +++ b/python/deeplake/types.py @@ -3,31 +3,34 @@ __all__ = [ "Array", "BM25", + "Binary", + "BinaryMask", "Bool", + "BoundingBox", + "ClassLabel", "DataType", "Dict", "Embedding", "Float32", "Float64", - "Type", + "Image", "Int16", "Int32", "Int64", "Int8", "Inverted", + "Link", + "Polygon", + "QuantizationType", + "SegmentMask", "Sequence", - "Image", "Struct", "Text", + "TextIndexType", + "Type", + "TypeKind", "UInt16", "UInt32", "UInt64", "UInt8", - "BoundingBox", - "BinaryMask", - "SegmentMask", - "TypeKind", - "TextIndexType", - "QuantizationType", - "Binary", ] diff --git a/python/deeplake/types.pyi b/python/deeplake/types.pyi index fe33f016e4..1b46326859 100644 --- a/python/deeplake/types.pyi +++ b/python/deeplake/types.pyi @@ -5,36 +5,38 @@ import typing __all__ = [ "Array", "BM25", + "Binary", + "BinaryMask", "Bool", + "BoundingBox", + "ClassLabel", "DataType", "Dict", "Embedding", "Float32", "Float64", - "Type", + "Image", "Int16", "Int32", "Int64", "Int8", "Inverted", + "Link", + "Polygon", + "QuantizationType", + "SegmentMask", "Sequence", - "Image", "Struct", "Text", + "TextIndexType", + "Type", + "TypeKind", "UInt16", "UInt32", "UInt64", "UInt8", - "BoundingBox", - "BinaryMask", - "SegmentMask", - "TypeKind", - "TextIndexType", - "QuantizationType", - "Binary", ] - class QuantizationType: Binary: typing.ClassVar[QuantizationType] """ @@ -54,7 +56,6 @@ class QuantizationType: def __repr__(self) -> str: ... def __setstate__(self, state: int) -> None: ... def __str__(self) -> str: ... - @property def name(self) -> str: ... @property @@ -76,64 +77,40 @@ class TextIndexType: __members__: typing.ClassVar[dict[str, TextIndexType]] def __eq__(self, other: typing.Any) -> bool: ... - def __getstate__(self) -> int: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __init__(self, value: int) -> None: ... - def __int__(self) -> int: ... - def __ne__(self, other: typing.Any) -> bool: ... - def __repr__(self) -> str: ... - def __setstate__(self, state: int) -> None: ... - def __str__(self) -> str: ... - @property def name(self) -> str: ... - @property def value(self) -> int: ... - class DataType: """ The base class all specific types extend from. """ - def __eq__(self, other: DataType) -> bool: - ... - - def __ne__(self, other: DataType) -> bool: - ... - - def __repr__(self) -> str: ... - + def __eq__(self, other: DataType) -> bool: ... + def __ne__(self, other: DataType) -> bool: ... + def __str__(self) -> str: ... class Type: """ """ - def __repr__(self) -> str: ... - - def __eq__(self, other: Type) -> bool: - ... - - def __ne__(self, other: Type) -> bool: - ... - + def __str__(self) -> str: ... + def __eq__(self, other: Type) -> bool: ... + def __ne__(self, other: Type) -> bool: ... @property(readonly=True) def data_type(self) -> DataType: ... - @property(readonly=True) # Temporary workaround. Need to remove `deeplake._deeplake` from the return type. def default_format(self) -> deeplake._deeplake.formats.DataFormat: ... 
- @property def id(self) -> str: """ @@ -142,13 +119,15 @@ class Type: ... @property - def is_sequence(self) -> bool: - ... - + def is_sequence(self) -> bool: ... @property - def kind(self) -> TypeKind: - ... - + def is_link(self) -> bool: ... + @property + def is_image(self) -> bool: ... + @property + def is_segment_mask(self) -> bool: ... + @property + def kind(self) -> TypeKind: ... @property def shape(self) -> list[int] | None: """ @@ -156,75 +135,67 @@ class Type: """ ... - class TypeKind: """ Members: - + Generic - + Text - + Dict - + Embedding - + Sequence - + Image - + BoundingBox - + BinaryMask - + SegmentMask + + Polygon + + ClassLabel + + Link """ + BinaryMask: typing.ClassVar[TypeKind] BoundingBox: typing.ClassVar[TypeKind] + ClassLabel: typing.ClassVar[TypeKind] Dict: typing.ClassVar[TypeKind] Embedding: typing.ClassVar[TypeKind] Generic: typing.ClassVar[TypeKind] Image: typing.ClassVar[TypeKind] + Link: typing.ClassVar[TypeKind] + Polygon: typing.ClassVar[TypeKind] SegmentMask: typing.ClassVar[TypeKind] Sequence: typing.ClassVar[TypeKind] Text: typing.ClassVar[TypeKind] __members__: typing.ClassVar[dict[str, TypeKind]] - def __eq__(self, other: typing.Any) -> bool: - ... - def __getstate__(self) -> int: - ... - def __hash__(self) -> int: - ... - def __index__(self) -> int: - ... - def __init__(self, value: int) -> None: - ... - def __int__(self) -> int: - ... - def __ne__(self, other: typing.Any) -> bool: - ... - def __repr__(self) -> str: - ... - def __setstate__(self, state: int) -> None: - ... - def __str__(self) -> str: - ... + def __eq__(self, other: typing.Any) -> bool: ... + def __getstate__(self) -> int: ... + def __hash__(self) -> int: ... + def __index__(self) -> int: ... + def __init__(self, value: int) -> None: ... + def __int__(self) -> int: ... + def __ne__(self, other: typing.Any) -> bool: ... + def __repr__(self) -> str: ... + def __setstate__(self, state: int) -> None: ... + def __str__(self) -> str: ... @property - def name(self) -> str: - ... + def name(self) -> str: ... @property - def value(self) -> int: - ... - + def value(self) -> int: ... @typing.overload def Array(dtype: DataType | str, dimensions: int) -> DataType: ... - - @typing.overload def Array(dtype: DataType | str, shape: list[int]) -> DataType: ... - - def Array(dtype: DataType | str, dimensions: int, shape: list[int]) -> DataType: """ A generic array of data. @@ -235,26 +206,28 @@ def Array(dtype: DataType | str, dimensions: int, shape: list[int]) -> DataType: shape: Constrain the size of each dimension in the array Examples: - >>> # Create a three-dimensional array, where each dimension can have any number of elements - >>> ds.add_column("col1", types.Array("int32", dimensions=3)) - >>> - >>> # Create a three-dimensional array, where each dimension has a known size - >>> ds.add_column("col2", types.Array(types.Float32(), shape=[50, 30, 768])) + ```python + # Create a three-dimensional array, where each dimension can have any number of elements + ds.add_column("col1", types.Array("int32", dimensions=3)) + + # Create a three-dimensional array, where each dimension has a known size + ds.add_column("col2", types.Array(types.Float32(), shape=[50, 30, 768])) + ``` """ ... - def Bool() -> DataType: """ A boolean value Examples: - >>> ds.add_column("col1", types.Bool) - >>> ds.add_column("col2", "bool") + ```python + ds.add_column("col1", types.Bool) + ds.add_column("col2", "bool") + ``` """ ... - def Text(index_type: str | TextIndexType | None = None) -> Type: """ Text data of arbitrary length. 
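The new `Type` properties (`is_link`, `is_image`, `is_segment_mask`) and the added `TypeKind` members (`Link`, `Polygon`, `ClassLabel`) give two equivalent ways to test a column's kind. A short sketch, assuming the schema is indexable by column name; the path and column names are illustrative:

```python
import deeplake
from deeplake import types

ds = deeplake.create("ds_path")  # "ds_path" is a placeholder
ds.add_column("photo", types.Link(types.Image()))
ds.add_column("label", types.ClassLabel("int32"))

# The property-based check and the TypeKind-based check should agree.
photo = ds.schema["photo"].dtype
assert photo.is_link
assert photo.kind == types.TypeKind.Link
```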
@@ -268,15 +241,16 @@ def Text(index_type: str | TextIndexType | None = None) -> Type:
         index_type: How to index the data in the column for faster searching. Default is `None` meaning "do not index"

     Examples:
-        >>> ds.add_column("col1", types.Text)
-        >>> ds.add_column("col2", "text")
-        >>> ds.add_column("col3", str)
-        >>> ds.add_column("col4", types.Text(index_type=types.Inverted))
-        >>> ds.add_column("col4", types.Text(index_type=types.BM25))
+        ```python
+        ds.add_column("col1", types.Text)
+        ds.add_column("col2", "text")
+        ds.add_column("col3", str)
+        ds.add_column("col4", types.Text(index_type=types.Inverted))
+        ds.add_column("col5", types.Text(index_type=types.BM25))
+        ```
     """
     ...

-
 BM25: TextIndexType
 """
 A [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) based index of text data.
@@ -298,16 +272,21 @@ def Dict() -> Type:
     See [deeplake.types.Struct][] for a type that supports defining allowed keys.

     Examples:
-        >>> ds.add_column("col1", types.Dict)
-        >>>
-        >>> ds.append([{"col1", {"a": 1, "b": 2}}])
-        >>> ds.append([{"col1", {"b": 3, "c": 4}}])
+        ```python
+        ds.add_column("col1", types.Dict)
+
+        ds.append([{"col1": {"a": 1, "b": 2}}])
+        ds.append([{"col1": {"b": 3, "c": 4}}])
+        ```
     """
     ...

-
-def Embedding(size: int, dtype: DataType | str = "float32", quantization: QuantizationType | None = None ) -> Type:
+def Embedding(
+    size: int | None = None,
+    dtype: DataType | str = "float32",
+    quantization: QuantizationType | None = None,
+) -> Type:
     """
     A single-dimensional embedding of a given length. See [deeplake.types.Array][] for a multidimensional array.
@@ -317,72 +296,79 @@ def Embedding(size: int, dtype: DataType | str = "float32", quantization: Quanti
         quantization: How to compress the embeddings in the index. Default uses no compression, but can be set to [deeplake.types.QuantizationType.Binary][]

     Examples:
-        >>> ds.add_column("col1", types.Embedding(768))
-        >>> ds.add_column("col2", types.Embedding(768, quantization=types.QuantizationType.Binary))
+        ```python
+        ds.add_column("col1", types.Embedding(768))
+        ds.add_column("col2", types.Embedding(768, quantization=types.QuantizationType.Binary))
+        ```
     """
     ...

-
 def Float32() -> DataType:
     """
     A 32-bit float value

     Examples:
-        >>> ds.add_column("col1", types.Float)
+        ```python
+        ds.add_column("col1", types.Float32)
+        ```
     """
     ...

-
 def Float64() -> DataType:
     """
     A 64-bit float value

     Examples:
-        >>> ds.add_column("col1", types.Float64)
+        ```python
+        ds.add_column("col1", types.Float64)
+        ```
     """
     ...

-
 def Int16() -> DataType:
     """
     A 16-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.Int16)
+        ```python
+        ds.add_column("col1", types.Int16)
+        ```
     """
     ...

-
 def Int32() -> DataType:
     """
     A 32-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.Int32)
+        ```python
+        ds.add_column("col1", types.Int32)
+        ```
     """
     ...

-
 def Int64() -> DataType:
     """
     A 64-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.Int64)
+        ```python
+        ds.add_column("col1", types.Int64)
+        ```
     """
     ...

-
 def Int8() -> DataType:
     """
     An 8-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.Int8)
+        ```python
+        ds.add_column("col1", types.Int8)
+        ```
     """
     ...

-
 def Sequence(nested_type: DataType | str | Type) -> Type:
     """
     A sequence is a list of other data types, where there is an order to the values in the list.
@@ -393,11 +379,12 @@ def Sequence(nested_type: DataType | str | Type) -> Type:
         nested_type: The data type of the values in the sequence. Can be any data type, not just primitive types.
     Examples:
-        >>> ds.add_column("col1", types.Sequence(types.Image(sample_compression="jpeg")))
+        ```python
+        ds.add_column("col1", types.Sequence(types.Image(sample_compression="jpeg")))
+        ```
     """
     ...

-
 def Image(dtype: DataType | str = "uint8", sample_compression: str = "png") -> Type:
     """
     An image of a given format. The value returned will be a multidimensional array of values rather than the raw image bytes.
@@ -419,34 +406,52 @@ def Image(dtype: DataType | str = "uint8", sample_compression: str = "png") -> T
         sample_compression: The on-disk compression/format of the image

     Examples:
-        >>> ds.add_column("col1", types.Sequence(types.Image))
-        >>> ds.add_column("col1", types.Sequence(types.Image(sample_compression="jpg")))
+        ```python
+        ds.add_column("col1", types.Image)
+        ds.add_column("col2", types.Image(sample_compression="jpg"))
+        ```
     """
     ...

+def Link(type: Type) -> Type:
+    """
+    A link to an external resource. The value returned will be a reference to the external resource rather than the raw data.
+
+    Parameters:
+        type: The type of the linked data
+
+    Examples:
+        ```python
+        ds.add_column("col1", types.Link(types.Image()))
+        ```
+    """
+    ...

+def Polygon() -> Type: ...
+def ClassLabel(dtype: DataType | str) -> Type: ...
 def BoundingBox(
-        dtype: DataType | str = "float32",
-        format: str | None = None,
-        bbox_type: str | None = None,
+    dtype: DataType | str = "float32",
+    format: str | None = None,
+    bbox_type: str | None = None,
 ) -> Type:
     """
     Stores an array of values specifying the bounding boxes of an image.

     Parameters:
         dtype: The datatype of values (default float32)
-        format: The bounding box format. Possible values: `ccwh`, `tlwh`, `tlbr`, `unknown`
+        format: The bounding box format. Possible values: `ccwh`, `ltwh`, `ltrb`, `unknown`
         bbox_type: The pixel type. Possible values: `pixel`, `fractional`

     Examples:
-        >>> ds.add_column("col1", types.BoundingBox())
-        >>> ds.add_column("col2", types.BoundingBox(format="tlwh"))
+        ```python
+        ds.add_column("col1", types.BoundingBox())
+        ds.add_column("col2", types.BoundingBox(format="ltwh"))
+        ```
     """
     ...

-
 def BinaryMask(
-        sample_compression: str | None = None, chunk_compression: str | None = None
+    sample_compression: str | None = None, chunk_compression: str | None = None
 ) -> Type:
     """
     In binary mask, pixel value is a boolean for whether there is/is-not an object of a class present.
@@ -458,16 +463,17 @@ def BinaryMask(
         chunk_compression: How to compress all the values stored in a single file. Possible values: lz4, null (default: null)

     Examples:
-        >>> ds.add_column("col1", types.BinaryMask(sample_compression="lz4"))
-        >>> ds.append(np.zeros((512, 512, 5), dtype="bool"))
+        ```python
+        ds.add_column("col1", types.BinaryMask(sample_compression="lz4"))
+        ds.append([{"col1": np.zeros((512, 512, 5), dtype="bool")}])
+        ```
     """
     ...

-
 def SegmentMask(
-        dtype: DataType | str = "uint8",
-        sample_compression: str | None = None,
-        chunk_compression: str | None = None,
+    dtype: DataType | str = "uint8",
+    sample_compression: str | None = None,
+    chunk_compression: str | None = None,
 ) -> Type:
     """
     Segmentation masks are 2D representations of class labels where a numerical class value is encoded in an array of same shape as the image.
@@ -479,12 +485,13 @@ def SegmentMask(
         chunk_compression: How to compress all the values stored in a single file.
             Possible values: lz4, null (default: null)

     Examples:
-        >>> ds.add_column("col1", types.SegmentMask(sample_compression="lz4"))
-        >>> ds.append("col1", np.zeros((512, 512)))
+        ```python
+        ds.add_column("col1", types.SegmentMask(sample_compression="lz4"))
+        ds.append([{"col1": np.zeros((512, 512), dtype="uint8")}])
+        ```
     """
     ...

-
 def Struct(fields: dict[str, DataType | str]) -> DataType:
     """
     Defines a custom datatype with specified keys.
@@ -495,54 +502,60 @@ def Struct(fields: dict[str, DataType | str]) -> DataType:
         fields: A dict where the key is the name of the field, and the value is the datatype definition for it

     Examples:
-        >>> ds.add_column("col1", types.Struct({
-        >>>    "field1": types.Int16(),
-        >>>    "field2": types.Text(),
-        >>> }))
-        >>>
-        >>> ds.append([{"col1": {"field1": 3, "field2": "a"}}])
-        >>> print(ds[0]["col1"]["field1"])
+        ```python
+        ds.add_column("col1", types.Struct({
+           "field1": types.Int16(),
+           "field2": types.Text(),
+        }))
+
+        ds.append([{"col1": {"field1": 3, "field2": "a"}}])
+        print(ds[0]["col1"]["field1"])  # Output: 3
+        ```
     """
     ...

-
 def UInt16() -> DataType:
     """
     An unsigned 16-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.UInt16)
+        ```python
+        ds.add_column("col1", types.UInt16)
+        ```
     """
     ...

-
 def UInt32() -> DataType:
     """
     An unsigned 32-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.UInt16)
+        ```python
+        ds.add_column("col1", types.UInt32)
+        ```
     """
     ...

-
 def UInt64() -> DataType:
     """
     An unsigned 64-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.UInt64)
+        ```python
+        ds.add_column("col1", types.UInt64)
+        ```
     """
     ...

-
 def UInt8() -> DataType:
     """
     An unsigned 8-bit integer value

     Examples:
-        >>> ds.add_column("col1", types.UInt16)
+        ```python
+        ds.add_column("col1", types.UInt8)
+        ```
     """
     ...
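The corrected examples above all follow the same row-wise convention: `ds.append` takes a list of dicts keyed by column name. A compact end-to-end sketch tying the image, mask, and bounding-box types together; the path, column names, and array shapes are illustrative, not part of the stubs:

```python
import numpy as np
import deeplake
from deeplake import types

ds = deeplake.create("ds_path")  # "ds_path" is a placeholder
ds.add_column("image", types.Image(sample_compression="jpeg"))
ds.add_column("mask", types.SegmentMask(sample_compression="lz4"))
ds.add_column("boxes", types.BoundingBox(format="ltwh"))

# One row: an image, a same-shape class mask, and one left-top-width-height box.
ds.append(
    [
        {
            "image": np.zeros((512, 512, 3), dtype="uint8"),
            "mask": np.zeros((512, 512), dtype="uint8"),
            "boxes": np.array([[10.0, 20.0, 100.0, 80.0]], dtype="float32"),
        }
    ]
)
ds.commit("Add one annotated row")
```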