From 18045dd849b042f2c9378312f593f531aeb784e1 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 8 Dec 2021 19:31:54 +0530 Subject: [PATCH 01/58] dataset view sacing --- hub/api/info.py | 3 +++ hub/core/dataset/__init__.py | 6 +++-- hub/core/dataset/dataset.py | 41 +++++++++++++++++++++++++++++++ hub/core/query/filter.py | 16 +++++++++++- hub/core/query/test/test_query.py | 21 +++++++++++++--- hub/util/exceptions.py | 2 +- 6 files changed, 82 insertions(+), 7 deletions(-) diff --git a/hub/api/info.py b/hub/api/info.py index ae843e5c15..bd077da791 100644 --- a/hub/api/info.py +++ b/hub/api/info.py @@ -85,6 +85,9 @@ def __getattribute__(self, name: str) -> Any: def __getitem__(self, key: str): return self._info[key] + def get(self, key: str, default: Optional[Any] = None): + return self._info.get(key, default) + def __str__(self): return self._info.__str__() diff --git a/hub/core/dataset/__init__.py b/hub/core/dataset/__init__.py index ea8d6ff770..9a800d8a62 100644 --- a/hub/core/dataset/__init__.py +++ b/hub/core/dataset/__init__.py @@ -12,7 +12,6 @@ def dataset_factory(path, *args, **kwargs): """Returns a Dataset object from the appropriate class. For example: If `path` is a hub cloud path (prefixed with `hub://`), the returned Dataset object will be of HubCloudDataset. """ - if FORCE_CLASS is not None: clz = FORCE_CLASS elif is_hub_cloud_path(path): @@ -21,5 +20,8 @@ def dataset_factory(path, *args, **kwargs): clz = Dataset if clz in {Dataset, HubCloudDataset}: - return clz(path=path, *args, **kwargs) + ds = clz(path=path, *args, **kwargs) + if "VDS_INDEX" in ds.tensors: + ds = ds._get_view() + return ds raise TypeError(f"Invalid dataset class {clz}") diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 483d895683..a2ad419d4c 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -402,6 +402,10 @@ def commit(self, message: Optional[str] = None) -> None: Returns: str: the commit id of the stored commit that can be used to access the snapshot. """ + if getattr(self, "_is_filterd_view", False): + raise Exception( + "Cannot perform version control operations on a filtered dataset view." + ) commit_id = self.version_state["commit_id"] try_flushing(self) commit(self.version_state, self.storage, message) @@ -426,6 +430,10 @@ def checkout(self, address: str, create: bool = False) -> str: Returns: str: The commit_id of the dataset after checkout. """ + if getattr(self, "_is_filterd_view", False): + raise Exception( + "Cannot perform version control operations on a filtered dataset view." 
+ ) try_flushing(self) checkout(self.version_state, self.storage, address, create) self._info = None @@ -985,3 +993,36 @@ def __result__(self): def __args__(self): return None + + def store(self, path, **ds_args): + if len(self.index.values) > 1: + raise NotImplementedError("Storing sub-sample slices is not supported yet.") + + # TODO + # Process path arg here (add hashes etc) + + ds = hub.dataset(path, **ds_args) + + info = { + "description": "Virtual Datasource", + "virtual-datasource": True, + "source-dataset": self.path, + "source-dataset-version": self.version_state["commit_id"], + } + + query = getattr(self, "_query", None) + if query: + info["query"] = query + with ds: + ds.info.update(info) + ds.create_tensor("VDS_INDEX", dtype="uint64").extend( + list(self.index.values[0].indices(len(self))) + ) + ds._view = self + return ds + + def _get_view(self): + # Only applicable for virtual datasets + ds = hub.dataset(path=self.info["source-dataset"], verbose=False) + ds = ds[self.VDS_INDEX.numpy().reshape(-1).tolist()] + return ds diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 3fef5c9960..6deb22b384 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -6,6 +6,8 @@ from hub.util.compute import get_compute_provider from hub.util.dataset import map_tensor_keys +import inspect + def filter_dataset( dataset: hub.Dataset, @@ -23,7 +25,19 @@ def filter_dataset( else: index_map = filter_inplace(dataset, filter_function, progressbar) - return dataset[index_map] # type: ignore [this is fine] + ds = dataset[index_map] + ds._is_filtered_view = True + if isinstance(filter_function, hub.core.query.DatasetQuery): + query = filter_function._query + else: + try: + query = inspect.getsource(filter_function) + except OSError: + query = getattr( + filter_function, "__name__", filter_function.__class__.__name__ + ) + ds._query = query + return ds # type: ignore [this is fine] def filter_with_compute( diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 02820e0b0d..2894e0c712 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -5,6 +5,8 @@ from hub.core.query import DatasetQuery from hub.core.query.query import EvalGenericTensor, EvalLabelClassTensor +import hub + first_row = {"images": [1, 2, 3], "labels": [0]} second_row = {"images": [6, 7, 5], "labels": [1]} @@ -12,9 +14,8 @@ class_names = ["dog", "cat", "fish"] -@pytest.fixture -def sample_ds(memory_ds): - with memory_ds as ds: +def _populate_data(ds): + with ds: ds.create_tensor("images") ds.create_tensor("labels", htype="class_label", class_names=class_names) @@ -22,6 +23,10 @@ def sample_ds(memory_ds): ds.images.append(row["images"]) ds.labels.append(row["labels"]) + +@pytest.fixture +def sample_ds(memory_ds): + _populate_data(memory_ds) return memory_ds @@ -169,3 +174,13 @@ def filter_result(ds): ) == 3141 ) + + +def test_dataset_view_save(sample_ds): + with hub.dataset(".tests/ds", overwrite=True) as ds: + _populate_data(ds) + view = ds.filter("labels == 'dog'") + view.store(".tests/ds_view", overwrite=True) + view2 = hub.dataset(".tests/ds_view") + for t in view.tensors: + np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) diff --git a/hub/util/exceptions.py b/hub/util/exceptions.py index b7388a9691..16336b4f82 100644 --- a/hub/util/exceptions.py +++ b/hub/util/exceptions.py @@ -69,7 +69,7 @@ def __init__(self, key: str, meta: dict): super().__init__(f"Key '{key}' missing from tensor meta '{str(meta)}'.") -class 
TensorDoesNotExistError(KeyError): +class TensorDoesNotExistError(KeyError, AttributeError): def __init__(self, tensor_name: str): super().__init__(f"Tensor '{tensor_name}' does not exist.") From ef7728eca5fc112bcac87037ec8af01c2be61d34 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 8 Dec 2021 19:48:03 +0530 Subject: [PATCH 02/58] darg --- hub/core/dataset/dataset.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index a2ad419d4c..7f566a25f8 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -401,6 +401,9 @@ def commit(self, message: Optional[str] = None) -> None: Returns: str: the commit id of the stored commit that can be used to access the snapshot. + + Raises: + Exception: if dataset is a filtered view. """ if getattr(self, "_is_filterd_view", False): raise Exception( @@ -429,6 +432,9 @@ def checkout(self, address: str, create: bool = False) -> str: Returns: str: The commit_id of the dataset after checkout. + + Raises: + Exception: if dataset is a filtered view. """ if getattr(self, "_is_filterd_view", False): raise Exception( From 966cb78278203bdb80ede521c37205246e9c9518 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 8 Dec 2021 23:10:49 +0530 Subject: [PATCH 03/58] create new ds always --- hub/core/dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 7f566a25f8..25f0e383ca 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1007,7 +1007,7 @@ def store(self, path, **ds_args): # TODO # Process path arg here (add hashes etc) - ds = hub.dataset(path, **ds_args) + ds = hub.empty(path, **ds_args) info = { "description": "Virtual Datasource", From 0a77d8d24d21af457af18a8a18caf2036e58acba Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 13 Dec 2021 19:16:17 +0530 Subject: [PATCH 04/58] inplace save --- hub/core/dataset/dataset.py | 47 +++++++++++++++++++++++-------- hub/core/query/test/test_query.py | 11 ++++++++ hub/core/storage/gcs.py | 7 +++++ hub/core/storage/local.py | 3 ++ hub/core/storage/s3.py | 14 +++++++-- 5 files changed, 69 insertions(+), 13 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 60c8f23813..8936f203dc 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -9,17 +9,19 @@ from hub.api.info import load_info from hub.client.log import logger from hub.constants import FIRST_COMMIT_ID +from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE from hub.core.fast_forwarding import ffw_dataset_meta from hub.core.index import Index from hub.core.lock import lock, unlock from hub.core.meta.dataset_meta import DatasetMeta -from hub.core.storage import LRUCache, S3Provider +from hub.core.storage import LRUCache, S3Provider, MemoryProvider from hub.core.tensor import Tensor, create_tensor from hub.core.version_control.commit_node import CommitNode # type: ignore from hub.htype import DEFAULT_HTYPE, HTYPE_CONFIGURATIONS, UNSPECIFIED from hub.integrations import dataset_to_tensorflow from hub.util.bugout_reporter import hub_reporter from hub.util.dataset import try_flushing +from hub.util.cache_chain import generate_chain from hub.util.exceptions import ( CouldNotCreateNewDatasetException, InvalidKeyTypeError, @@ -56,6 +58,7 @@ load_meta, ) from tqdm import tqdm # type: ignore +import hashlib class Dataset: @@ -1005,14 +1008,36 @@ def __result__(self): def 
__args__(self): return None - def store(self, path, **ds_args): + def _view_hash(self): + return hashlib.sha1( + ( + f"{self.path}[{':'.join(str(e.value) for e in self.index.values)}]@{self.version_state['commit_id']}&{getattr(self, '_query', None)}" + ).encode() + ).hexdigest() + + def store(self, path: Optional[str] = None, **ds_args): if len(self.index.values) > 1: raise NotImplementedError("Storing sub-sample slices is not supported yet.") - # TODO - # Process path arg here (add hashes etc) - - ds = hub.empty(path, **ds_args) + if path is None: + if isinstance(self, MemoryProvider): + raise NotImplementedError( + "Saving views inplace is not supported for in-memory datasets." + ) + if self.read_only: + raise Exception( + "Cannot save view in read only dataset. Speicify a path to store the view in a different location." + ) + self.flush() + storage = get_base_storage(self.storage).subdir( + f"queries/{self._view_hash()}" + ) + storage = generate_chain( + storage, DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, self.path + ) + ds = hub.Dataset(storage) + else: + ds = hub.empty(path, **ds_args) info = { "description": "Virtual Datasource", @@ -1026,11 +1051,11 @@ def store(self, path, **ds_args): info["query"] = query with ds: ds.info.update(info) - ds.create_tensor("VDS_INDEX", dtype="uint64").extend( - list(self.index.values[0].indices(len(self))) - ) - ds._view = self - return ds + ds.create_tensor("VDS_INDEX", dtype="uint64") + ds.VDS_INDEX.extend(list(self.index.values[0].indices(len(self)))) + + print(f"Virtual dataset stored at {ds.path}") + return ds.path def _get_view(self): # Only applicable for virtual datasets diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 2894e0c712..e73e31c689 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -184,3 +184,14 @@ def test_dataset_view_save(sample_ds): view2 = hub.dataset(".tests/ds_view") for t in view.tensors: np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) + + +def test_inplace_dataset_view_save(s3_ds_generator): + ds = s3_ds_generator() + with ds: + _populate_data(ds) + view = ds.filter("labels == 'dog'") + vds_path = view.store() + view2 = hub.dataset(vds_path) + for t in view.tensors: + np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) diff --git a/hub/core/storage/gcs.py b/hub/core/storage/gcs.py index e1c6381426..86e5e371b5 100644 --- a/hub/core/storage/gcs.py +++ b/hub/core/storage/gcs.py @@ -215,6 +215,13 @@ def __init__(self, root: str, token: Union[str, Dict] = None, project: str = Non ) self._initialize_provider() + def subdir(self, path: str): + return self.__class__( + root=posixpath.join(self.root, path), + token=self.token, + project=self.project, + ) + def _initialize_provider(self): self._set_bucket_and_path() if not self.token: diff --git a/hub/core/storage/local.py b/hub/core/storage/local.py index f3aa05f185..e9d2f947b4 100644 --- a/hub/core/storage/local.py +++ b/hub/core/storage/local.py @@ -26,6 +26,9 @@ def __init__(self, root: str): self.root = root self.files: Optional[Set[str]] = None + def subdir(self, path: str): + return self.__class__(os.path.join(self.root, path)) + def __getitem__(self, path: str): """Gets the object present at the path within the given byte range. 
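
A minimal sketch of how these `subdir` helpers are meant to compose (the `ToyProvider` class below is illustrative only, not a class from the codebase): each storage provider returns a new instance of its own class rooted at a child path, which `Dataset.store()` earlier in this patch uses to keep saved views under `queries/<view-hash>` alongside the source dataset.

    import posixpath

    class ToyProvider:
        # Stand-in provider; only the subdir pattern from the patch matters here.
        def __init__(self, root: str):
            self.root = root

        def subdir(self, path: str):
            # Same shape as the LocalProvider/GCSProvider/S3Provider.subdir
            # methods added in this patch: derive a provider of the same
            # class rooted at a child path.
            return self.__class__(posixpath.join(self.root, path))

    base = ToyProvider("s3://bucket/dataset")
    view_storage = base.subdir("queries/a1b2c3")
    print(view_storage.root)  # s3://bucket/dataset/queries/a1b2c3
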
diff --git a/hub/core/storage/s3.py b/hub/core/storage/s3.py index 9d46adccc7..1f6908297f 100644 --- a/hub/core/storage/s3.py +++ b/hub/core/storage/s3.py @@ -2,6 +2,7 @@ import time import boto3 import botocore # type: ignore +import posixpath from typing import Optional from botocore.session import ComponentLocator from hub.client.client import HubBackendClient @@ -93,6 +94,16 @@ def __init__( self.client_config = hub.config["s3"] self._initialize_s3_parameters() + def subdir(self, path: str): + return self.__class__( + root=posixpath.join(self.root, path), + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token, + aws_region=self.aws_region, + endpoint_url=self.endpoint_url, + ) + def __setitem__(self, path, content): """Sets the object present at the path with the value @@ -311,8 +322,7 @@ def __setstate__(self, state): def _set_bucket_and_path(self): root = self.root.replace("s3://", "") - self.bucket = root.split("/")[0] - self.path = "/".join(root.split("/")[1:]) + self.bucket, self.path = root.split("/", 1) if not self.path.endswith("/"): self.path += "/" From 9f4ad43673db430222ede17d1c1fe34d7ac2f684 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 13 Dec 2021 19:20:59 +0530 Subject: [PATCH 05/58] test --- hub/core/dataset/dataset.py | 2 +- hub/core/query/test/test_query.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 8936f203dc..7fd5ffe722 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1050,7 +1050,7 @@ def store(self, path: Optional[str] = None, **ds_args): if query: info["query"] = query with ds: - ds.info.update(info) + # ds.info.update(info) ds.create_tensor("VDS_INDEX", dtype="uint64") ds.VDS_INDEX.extend(list(self.index.values[0].indices(len(self)))) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index e73e31c689..ec4f80210c 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -14,14 +14,14 @@ class_names = ["dog", "cat", "fish"] -def _populate_data(ds): +def _populate_data(ds, n=1): with ds: ds.create_tensor("images") ds.create_tensor("labels", htype="class_label", class_names=class_names) - - for row in rows: - ds.images.append(row["images"]) - ds.labels.append(row["labels"]) + for _ in range(n): + for row in rows: + ds.images.append(row["images"]) + ds.labels.append(row["labels"]) @pytest.fixture @@ -189,7 +189,7 @@ def test_dataset_view_save(sample_ds): def test_inplace_dataset_view_save(s3_ds_generator): ds = s3_ds_generator() with ds: - _populate_data(ds) + _populate_data(ds, n=2) view = ds.filter("labels == 'dog'") vds_path = view.store() view2 = hub.dataset(vds_path) From 8d448a0749d0dc25dd8b537bba6d5aed1ec31456 Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Sat, 18 Dec 2021 12:23:04 +0530 Subject: [PATCH 06/58] windows issue fix --- hub/core/chunk_engine.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 3a451b66cb..18f02bee9d 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -269,7 +269,10 @@ def num_chunks(self) -> int: def num_samples(self) -> int: if not self.chunk_id_encoder_exists: return 0 - return int(np.uint32(self.chunk_id_encoder.num_samples)) + num = self.chunk_id_encoder.num_samples + # chunk id encoder starts out by putting -1 when there are no samples, 
this gets converted to 2^32 - 1. + # when we call num_samples, it adds 1 to the last entry (2^32 - 1) and returns 2^32, when actual sample count is 0. + return 0 if num == 2 ** 32 else num @property def last_chunk_key(self) -> str: @@ -623,9 +626,10 @@ def validate_num_samples_is_synchronized(self): tensor_meta_length = self.tensor_meta.length # compare chunk ID encoder and tensor meta - chunk_id_num_samples = np.uint32( - self.chunk_id_encoder.num_samples if self.chunk_id_encoder_exists else 0 - ) + + # update this if we change self.num_samples implementation later to use tensor meta length instead of chunk_id_encoder + chunk_id_num_samples = self.num_samples + if tensor_meta_length != chunk_id_num_samples: commit_id = self.version_state["commit_id"] tkey = get_tensor_meta_key(self.key, commit_id) From 94223ac3c9f3ae3e610eb9bbfe521abb1b5a36ee Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Sat, 18 Dec 2021 12:27:22 +0530 Subject: [PATCH 07/58] load compute after checks --- hub/core/transform/transform.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hub/core/transform/transform.py b/hub/core/transform/transform.py index 637ab91e8a..aee1f410d4 100644 --- a/hub/core/transform/transform.py +++ b/hub/core/transform/transform.py @@ -115,8 +115,6 @@ def eval( if num_workers <= 0: scheduler = "serial" num_workers = max(num_workers, 1) - compute_provider = get_compute_provider(scheduler, num_workers) - original_data_in = data_in if isinstance(data_in, hub.Dataset): data_in = get_dataset_with_zero_size_cache(data_in) @@ -140,6 +138,7 @@ def eval( if overwrite: original_data_in.clear_cache() + compute_provider = get_compute_provider(scheduler, num_workers) try: self.run( data_in, From cc1bba5e1a4ea9413c7570187ac2c2484e887dcb Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Sat, 18 Dec 2021 12:30:45 +0530 Subject: [PATCH 08/58] move autoflush restore into finally --- hub/core/transform/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/transform/transform.py b/hub/core/transform/transform.py index aee1f410d4..ecc52714c6 100644 --- a/hub/core/transform/transform.py +++ b/hub/core/transform/transform.py @@ -152,7 +152,7 @@ def eval( raise TransformError(e) finally: compute_provider.close() - target_ds.storage.autoflush = initial_autoflush + target_ds.storage.autoflush = initial_autoflush def _run_with_progbar( self, func: Callable, ret: dict, total: int, desc: Optional[str] = "" From c6c62e6398b775c029896f8ef7c51a152b25c2df Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Sat, 18 Dec 2021 13:32:11 +0530 Subject: [PATCH 09/58] reduce test size for transform --- hub/core/transform/test_transform.py | 94 ++++++++++++++-------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/hub/core/transform/test_transform.py b/hub/core/transform/test_transform.py index 1aca06b1ae..892cc5ba61 100644 --- a/hub/core/transform/test_transform.py +++ b/hub/core/transform/test_transform.py @@ -104,7 +104,7 @@ def check_target_array(ds, index, target): ds.img[index].numpy(), target * np.ones((500, 500, 3)) ) np.testing.assert_array_equal( - ds.label[index].numpy(), target * np.ones((100, 100, 3)) + ds.label[index].numpy(), target * np.ones((1,)) ) @@ -593,24 +593,24 @@ def test_inplace_transform(local_ds_generator): with ds: ds.create_tensor("img") ds.create_tensor("label") - for i in range(100): - if i == 55: + for i in range(10): + if i == 5: ds.img.append(np.zeros((500, 500, 3))) else: ds.img.append(np.ones((500, 500, 3))) - 
ds.label.append(np.ones((100, 100, 3))) + ds.label.append(1) a = ds.commit() - assert len(ds) == 100 - for i in range(100): - if i != 55: + assert len(ds) == 10 + for i in range(10): + if i != 5: check_target_array(ds, i, 1) - ds.img[55] = np.ones((500, 500, 3)) + ds.img[5] = np.ones((500, 500, 3)) b = ds.commit() inplace_transform().eval(ds, num_workers=TRANSFORM_TEST_NUM_WORKERS) - assert ds.img.chunk_engine.num_samples == len(ds) == 200 + assert ds.img.chunk_engine.num_samples == len(ds) == 20 - for i in range(200): + for i in range(20): target = 2 if i % 2 == 0 else 3 check_target_array(ds, i, target) @@ -618,14 +618,14 @@ def test_inplace_transform(local_ds_generator): change = { "img": { "created": False, - "data_added": [0, 200], + "data_added": [0, 20], "data_updated": set(), "data_transformed_in_place": True, "info_updated": False, }, "label": { "created": False, - "data_added": [0, 200], + "data_added": [0, 20], "data_updated": set(), "data_transformed_in_place": True, "info_updated": False, @@ -634,19 +634,19 @@ def test_inplace_transform(local_ds_generator): assert diff == change ds.checkout(b) - assert len(ds) == 100 - for i in range(100): + assert len(ds) == 10 + for i in range(10): check_target_array(ds, i, 1) ds = local_ds_generator() - assert len(ds) == 200 - for i in range(200): + assert len(ds) == 20 + for i in range(20): target = 2 if i % 2 == 0 else 3 check_target_array(ds, i, target) ds.checkout(b) - assert len(ds) == 100 - for i in range(100): + assert len(ds) == 10 + for i in range(10): check_target_array(ds, i, 1) @@ -656,23 +656,23 @@ def test_inplace_transform_without_commit(local_ds_generator): with ds: ds.create_tensor("img") ds.create_tensor("label") - for _ in range(100): + for _ in range(10): ds.img.append(np.ones((500, 500, 3))) - ds.label.append(np.ones((100, 100, 3))) - assert len(ds) == 100 - for i in range(100): + ds.label.append(1) + assert len(ds) == 10 + for i in range(10): check_target_array(ds, i, 1) inplace_transform().eval(ds, num_workers=TRANSFORM_TEST_NUM_WORKERS) - assert ds.img.chunk_engine.num_samples == len(ds) == 200 + assert ds.img.chunk_engine.num_samples == len(ds) == 20 - for i in range(200): + for i in range(20): target = 2 if i % 2 == 0 else 3 check_target_array(ds, i, target) ds = local_ds_generator() - assert len(ds) == 200 - for i in range(200): + assert len(ds) == 20 + for i in range(20): target = 2 if i % 2 == 0 else 3 check_target_array(ds, i, target) @@ -682,53 +682,53 @@ def test_inplace_transform_non_head(local_ds_generator): with ds: ds.create_tensor("img") ds.create_tensor("label") - for _ in range(100): + for _ in range(10): ds.img.append(np.ones((500, 500, 3))) - ds.label.append(np.ones((100, 100, 3))) - assert len(ds) == 100 - for i in range(100): + ds.label.append(1) + assert len(ds) == 10 + for i in range(10): check_target_array(ds, i, 1) a = ds.commit() - for _ in range(50): + for _ in range(5): ds.img.append(np.ones((500, 500, 3))) - ds.label.append(np.ones((100, 100, 3))) - assert len(ds) == 150 - for i in range(150): + ds.label.append(1) + assert len(ds) == 15 + for i in range(15): check_target_array(ds, i, 1) ds.checkout(a) # transforming non-head node - inplace_transform().eval(ds, num_workers=4) + inplace_transform().eval(ds, num_workers=TRANSFORM_TEST_NUM_WORKERS) br = ds.branch - assert len(ds) == 200 - for i in range(200): + assert len(ds) == 20 + for i in range(20): target = 2 if i % 2 == 0 else 3 check_target_array(ds, i, target) ds.checkout(a) - assert len(ds) == 100 - for i in range(100): + assert 
len(ds) == 10 + for i in range(10): check_target_array(ds, i, 1) ds.checkout("main") - assert len(ds) == 150 - for i in range(150): + assert len(ds) == 15 + for i in range(15): check_target_array(ds, i, 1) ds = local_ds_generator() - assert len(ds) == 150 - for i in range(150): + assert len(ds) == 15 + for i in range(15): check_target_array(ds, i, 1) ds.checkout(a) - assert len(ds) == 100 - for i in range(100): + assert len(ds) == 10 + for i in range(10): check_target_array(ds, i, 1) ds.checkout(br) - assert len(ds) == 200 - for i in range(200): + assert len(ds) == 20 + for i in range(20): target = 2 if i % 2 == 0 else 3 check_target_array(ds, i, target) From 2ad9889fdb2dad4cedbc45e83038bfd1e10b496b Mon Sep 17 00:00:00 2001 From: AbhinavTuli Date: Sat, 18 Dec 2021 13:51:11 +0530 Subject: [PATCH 10/58] lint fix --- hub/core/transform/test_transform.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hub/core/transform/test_transform.py b/hub/core/transform/test_transform.py index 892cc5ba61..f618ea47bd 100644 --- a/hub/core/transform/test_transform.py +++ b/hub/core/transform/test_transform.py @@ -103,9 +103,7 @@ def check_target_array(ds, index, target): np.testing.assert_array_equal( ds.img[index].numpy(), target * np.ones((500, 500, 3)) ) - np.testing.assert_array_equal( - ds.label[index].numpy(), target * np.ones((1,)) - ) + np.testing.assert_array_equal(ds.label[index].numpy(), target * np.ones((1,))) @all_schedulers From 1c17976afdd53b8ec76da852fdfc7d08e51bf23d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 15:23:54 +0530 Subject: [PATCH 11/58] async filter --- hub/constants.py | 3 ++ hub/core/dataset/dataset.py | 56 ++++++++++++++++------ hub/core/index/index.py | 10 ++++ hub/core/query/filter.py | 78 ++++++++++++++++++++++++++----- hub/core/query/test/test_query.py | 2 +- hub/util/keys.py | 10 ++++ 6 files changed, 131 insertions(+), 28 deletions(-) diff --git a/hub/constants.py b/hub/constants.py index ce5c1b2578..c48b1d6278 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -112,3 +112,6 @@ CONVERT_GRAYSCALE = True PARTIAL_NUM_SAMPLES = 0.5 + +QUERIES_FILENAME = "queries.json" +QUERIES_LOCK_FILENAME = "queries.lock" diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index aa6ae1616f..81cab52630 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -12,9 +12,11 @@ from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE from hub.core.fast_forwarding import ffw_dataset_meta from hub.core.index import Index -from hub.core.lock import lock_version, unlock_version +from hub.core.lock import lock_version, unlock_version, Lock from hub.core.meta.dataset_meta import DatasetMeta -from hub.core.storage import LRUCache, S3Provider, MemoryProvider, GCSProvider +from hub.core.storage import LRUCache, S3Provider, MemoryProvider # GCSProvider + +GCSProvider = S3Provider from hub.core.tensor import Tensor, create_tensor from hub.core.version_control.commit_node import CommitNode # type: ignore from hub.htype import DEFAULT_HTYPE, HTYPE_CONFIGURATIONS, UNSPECIFIED @@ -40,6 +42,8 @@ get_dataset_meta_key, get_version_control_info_key, tensor_exists, + get_queries_key, + get_queries_lock_key, ) from hub.util.path import get_path_from_storage from hub.util.remove_cache import get_base_storage @@ -59,6 +63,7 @@ ) from tqdm import tqdm # type: ignore import hashlib +import json class Dataset: @@ -104,7 +109,6 @@ def __init__( self.storage = storage self._read_only = read_only self._locked_out = 
False # User requested write access but was denied - base_storage = get_base_storage(storage) self.index: Index = index or Index() self.group_index = group_index self._token = token @@ -1093,17 +1097,31 @@ def store(self, path: Optional[str] = None, **ds_args): "Saving views inplace is not supported for in-memory datasets." ) if self.read_only: - raise Exception( - "Cannot save view in read only dataset. Speicify a path to store the view in a different location." - ) - self.flush() - storage = get_base_storage(self.storage).subdir( - f"queries/{self._view_hash()}" - ) - storage = generate_chain( - storage, DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, self.path - ) - ds = hub.Dataset(storage) + if isinstance(self, hub.core.dataset.HubCloudDataset): + path = f"hub://{self.org_id}/_query_{self._view_hash}" + ds = hub.empty(path, **ds_args) + else: + raise Exception( + "Cannot save view in read only dataset. Speicify a path to store the view in a different location." + ) + else: + self.flush() + hash = self._view_hash() + self.storage.flush() + base_storage = get_base_storage(self.storage) + path = base_storage.subdir(f"queries/{hash}").root + ds = hub.dataset(path, **ds_args) + lock = Lock(base_storage, get_queries_lock_key()) + lock.acquire(timeout=10, force=True) + queries_key = get_queries_key() + try: + queries = json.loads(base_storage[queries_key].decode("utf-8")) + except KeyError: + queries = [] + queries.append(hash) + base_storage[queries_key] = json.dumps(queries).encode("utf-8") + lock.release() + else: ds = hub.empty(path, **ds_args) @@ -1117,8 +1135,9 @@ def store(self, path: Optional[str] = None, **ds_args): query = getattr(self, "_query", None) if query: info["query"] = query + info["source-dataset-index"] = getattr(self, "_source_ds_idx", None) with ds: - # ds.info.update(info) + ds.info.update(info) ds.create_tensor("VDS_INDEX", dtype="uint64") ds.VDS_INDEX.extend(list(self.index.values[0].indices(len(self)))) @@ -1129,4 +1148,11 @@ def _get_view(self): # Only applicable for virtual datasets ds = hub.dataset(path=self.info["source-dataset"], verbose=False) ds = ds[self.VDS_INDEX.numpy().reshape(-1).tolist()] + ds._vds = self return ds + + def _get_empty_vds(self, vds_path=None, query=None, **vds_args): + view = self[:0] + if query: + view._query = query + return view.store(vds_path, **vds_args) diff --git a/hub/core/index/index.py b/hub/core/index/index.py index fd92d92ec6..d7366db8cd 100644 --- a/hub/core/index/index.py +++ b/hub/core/index/index.py @@ -391,3 +391,13 @@ def __str__(self): def __repr__(self): return f"Index(values={self.values})" + + def to_json(self): + ret = [] + for e in self.values: + v = e.value + if isinstance(v, slice): + ret.append({"start": v.start, "stop": v.stop, "step": v.step}) + else: + ret.append(v) + return ret \ No newline at end of file diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 6deb22b384..21891ce527 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -1,10 +1,11 @@ -from typing import Callable, List, Sequence +from typing import Callable, List, Optional, Sequence import hub from hub.core.io import SampleStreaming from hub.util.compute import get_compute_provider from hub.util.dataset import map_tensor_keys +from time import time import inspect @@ -15,18 +16,12 @@ def filter_dataset( num_workers: int = 0, scheduler: str = "threaded", progressbar: bool = True, + save_result: bool = False, + result_path: Optional[str] = None, + result_ds_args: Optional[dict] = None, ) -> hub.Dataset: 
index_map: List[int] - if num_workers > 0: - index_map = filter_with_compute( - dataset, filter_function, num_workers, scheduler, progressbar - ) - else: - index_map = filter_inplace(dataset, filter_function, progressbar) - - ds = dataset[index_map] - ds._is_filtered_view = True if isinstance(filter_function, hub.core.query.DatasetQuery): query = filter_function._query else: @@ -36,7 +31,31 @@ def filter_dataset( query = getattr( filter_function, "__name__", filter_function.__class__.__name__ ) + + vds = dataset._get_empty_vds(result_path, result_ds_args, query=query) if save_result else None + + if num_workers > 0: + index_map = filter_with_compute( + dataset, + filter_function, + num_workers, + scheduler, + progressbar, + vds, + ) + else: + index_map = filter_inplace( + dataset, + filter_function, + progressbar, + vds, + ) + + ds = dataset[index_map] + ds._is_filtered_view = True + ds._query = query + ds._source_ds_idx = dataset.index.to_json() return ds # type: ignore [this is fine] @@ -46,17 +65,33 @@ def filter_with_compute( num_workers: int, scheduler: str, progressbar: bool = True, + vds: Optional[hub.Dataset] = None, + vds_update_frequency: int = 5, # seconds ) -> List[int]: blocks = SampleStreaming(dataset, tensors=map_tensor_keys(dataset)).list_blocks() compute = get_compute_provider(scheduler=scheduler, num_workers=num_workers) + if vds: + vds.autoflush = False + vds.info["total_samples"] = len(dataset) + vds.info["samples_processed"] = 0 + def filter_slice(indices: Sequence[int]): result = list() + + last_update_time = time() for i in indices: if filter_function(dataset[i]): result.append(i) - + if vds: + vds.VDS_INDEX.append(i) + vds.info["samples_processed"] = vds.info["samples_processed"] + 1 + if time() - last_update_time > vds_update_frequency: + vds.flush() + last_update_time = time() + if vds: + vds.autoflush = True return result def pg_filter_slice(pg_callback, indices: Sequence[int]): @@ -85,12 +120,22 @@ def pg_filter_slice(pg_callback, indices: Sequence[int]): def filter_inplace( - dataset: hub.Dataset, filter_function: Callable, progressbar: bool + dataset: hub.Dataset, + filter_function: Callable, + progressbar: bool, + vds: Optional[hub.Dataset] = None, + vds_update_frequency: int = 5, ) -> List[int]: index_map: List[int] = list() it = enumerate(dataset) + if vds: + vds.autoflush = False + vds.info["total_samples"] = len(dataset) + vds.info["samples_processed"] = 0 + + if progressbar: from tqdm import tqdm # type: ignore @@ -99,5 +144,14 @@ def filter_inplace( for i, sample_in in it: if filter_function(sample_in): index_map.append(i) + if vds: + vds.VDS_INDEX.append(i) + vds.info["samples_processed"] = vds.info["samples_processed"] + 1 + if time() - last_update_time > vds_update_frequency: + vds.flush() + last_update_time = time() + + if vds: + vds.autoflush = True return index_map diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index ec4f80210c..c17acae7d9 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -176,7 +176,7 @@ def filter_result(ds): ) -def test_dataset_view_save(sample_ds): +def test_dataset_view_save(): with hub.dataset(".tests/ds", overwrite=True) as ds: _populate_data(ds) view = ds.filter("labels == 'dog'") diff --git a/hub/util/keys.py b/hub/util/keys.py index d55e2bbf8f..269c7e8d39 100644 --- a/hub/util/keys.py +++ b/hub/util/keys.py @@ -13,6 +13,8 @@ TENSOR_COMMIT_DIFF_FILENAME, VERSION_CONTROL_INFO_FILENAME, VERSION_CONTROL_INFO_LOCK_FILENAME, + QUERIES_FILENAME, + 
QUERIES_LOCK_FILENAME, ) @@ -122,3 +124,11 @@ def tensor_exists(key: str, storage, commit_id: str) -> bool: return True except KeyError: return False + + +def get_queries_key() -> str: + return QUERIES_FILENAME + + +def get_queries_lock_key() -> str: + return QUERIES_LOCK_FILENAME From 5d98a8cda45f6da0fbd6c06a94ff61986076d6da Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 15:28:17 +0530 Subject: [PATCH 12/58] fix --- hub/core/dataset/__init__.py | 2 +- hub/core/index/index.py | 2 +- hub/core/query/filter.py | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/hub/core/dataset/__init__.py b/hub/core/dataset/__init__.py index 9a800d8a62..50d7908edd 100644 --- a/hub/core/dataset/__init__.py +++ b/hub/core/dataset/__init__.py @@ -21,7 +21,7 @@ def dataset_factory(path, *args, **kwargs): if clz in {Dataset, HubCloudDataset}: ds = clz(path=path, *args, **kwargs) - if "VDS_INDEX" in ds.tensors: + if ds.info.get("virtual-datasource", False): ds = ds._get_view() return ds raise TypeError(f"Invalid dataset class {clz}") diff --git a/hub/core/index/index.py b/hub/core/index/index.py index d7366db8cd..23ebb4ca82 100644 --- a/hub/core/index/index.py +++ b/hub/core/index/index.py @@ -400,4 +400,4 @@ def to_json(self): ret.append({"start": v.start, "stop": v.stop, "step": v.step}) else: ret.append(v) - return ret \ No newline at end of file + return ret diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 21891ce527..58dd42b778 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -32,7 +32,11 @@ def filter_dataset( filter_function, "__name__", filter_function.__class__.__name__ ) - vds = dataset._get_empty_vds(result_path, result_ds_args, query=query) if save_result else None + vds = ( + dataset._get_empty_vds(result_path, result_ds_args, query=query) + if save_result + else None + ) if num_workers > 0: index_map = filter_with_compute( @@ -87,7 +91,7 @@ def filter_slice(indices: Sequence[int]): if vds: vds.VDS_INDEX.append(i) vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if time() - last_update_time > vds_update_frequency: + if time() - last_update_time > vds_update_frequency: vds.flush() last_update_time = time() if vds: @@ -135,7 +139,6 @@ def filter_inplace( vds.info["total_samples"] = len(dataset) vds.info["samples_processed"] = 0 - if progressbar: from tqdm import tqdm # type: ignore @@ -147,7 +150,7 @@ def filter_inplace( if vds: vds.VDS_INDEX.append(i) vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if time() - last_update_time > vds_update_frequency: + if time() - last_update_time > vds_update_frequency: vds.flush() last_update_time = time() From ee77b3e04153148440e08e406b9efe3e472e68bc Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 17:28:56 +0530 Subject: [PATCH 13/58] Update dataset.py --- hub/core/dataset/dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 81cab52630..e075e85999 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -14,9 +14,7 @@ from hub.core.index import Index from hub.core.lock import lock_version, unlock_version, Lock from hub.core.meta.dataset_meta import DatasetMeta -from hub.core.storage import LRUCache, S3Provider, MemoryProvider # GCSProvider - -GCSProvider = S3Provider +from hub.core.storage import LRUCache, S3Provider, MemoryProvider, GCSProvider from hub.core.tensor import Tensor, create_tensor from 
hub.core.version_control.commit_node import CommitNode # type: ignore from hub.htype import DEFAULT_HTYPE, HTYPE_CONFIGURATIONS, UNSPECIFIED From b978ade876bd202a615361cbf909ff59f152c161 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 17:55:18 +0530 Subject: [PATCH 14/58] Update __init__.py --- hub/core/storage/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hub/core/storage/__init__.py b/hub/core/storage/__init__.py index 1e0935f13f..5644814f26 100644 --- a/hub/core/storage/__init__.py +++ b/hub/core/storage/__init__.py @@ -3,3 +3,4 @@ from hub.core.storage.memory import MemoryProvider from hub.core.storage.local import LocalProvider from hub.core.storage.lru_cache import LRUCache +from hub.core.storage.gcs import GCSProvider From b3b199d8bc828a91a63c9c67da4cffd8fc7b83d5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 20:24:06 +0530 Subject: [PATCH 15/58] streaming query --- hub/core/dataset/dataset.py | 32 +++++++++++++++++++++---------- hub/core/query/filter.py | 20 +++++++++++-------- hub/core/query/test/test_query.py | 6 +++--- 3 files changed, 37 insertions(+), 21 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index e075e85999..ba3d869178 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -703,18 +703,24 @@ def filter( num_workers: int = 0, scheduler: str = "threaded", progressbar: bool = True, + save_result: bool = False, + result_path: Optional[str] = None, + result_ds_args: Optional[dict] = None, ): """Filters the dataset in accordance of filter function `f(x: sample) -> bool` Args: - function(Callable | str): filter function that takes sample as argument and returns True/False + function(Callable | str): Filter function that takes sample as argument and returns True/False if sample should be included in result. Also supports simplified expression evaluations. See hub.core.query.DatasetQuery for more details. - num_workers(int): level of parallelization of filter evaluations. + num_workers(int): Level of parallelization of filter evaluations. `0` indicates in-place for-loop evaluation, multiprocessing is used otherwise. - scheduler(str): scheduler to use for multiprocessing evaluation. + scheduler(str): Scheduler to use for multiprocessing evaluation. `threaded` is default - progressbar(bool): display progress bar while filtering. True is default + progressbar(bool): Display progress bar while filtering. True is default + save_result (bool): If True, result of the filter will be saved to a dataset asynchronously. + result_path (Optional, str): Path to save the filter result. Only applicable if `save_result` is True. + result_ds_args (Optional, dict): Additional args for result dataset. Only applicable if `save_result` is True. 
Returns: View on Dataset with elements, that satisfy filter function @@ -737,6 +743,9 @@ def filter( num_workers=num_workers, scheduler=scheduler, progressbar=progressbar, + save_result=save_result, + result_path=result_path, + result_ds_args=result_ds_args, ) def _get_total_meta(self): @@ -1085,11 +1094,13 @@ def _view_hash(self): ).encode() ).hexdigest() - def store(self, path: Optional[str] = None, **ds_args): + def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): if len(self.index.values) > 1: raise NotImplementedError("Storing sub-sample slices is not supported yet.") if path is None: + if hasattr(self, "_vds"): + return self._vds if _ret_ds else self._vds.path if isinstance(self, MemoryProvider): raise NotImplementedError( "Saving views inplace is not supported for in-memory datasets." @@ -1108,7 +1119,7 @@ def store(self, path: Optional[str] = None, **ds_args): self.storage.flush() base_storage = get_base_storage(self.storage) path = base_storage.subdir(f"queries/{hash}").root - ds = hub.dataset(path, **ds_args) + ds = hub.dataset(path, **ds_args) # type: ignore lock = Lock(base_storage, get_queries_lock_key()) lock.acquire(timeout=10, force=True) queries_key = get_queries_key() @@ -1136,11 +1147,12 @@ def store(self, path: Optional[str] = None, **ds_args): info["source-dataset-index"] = getattr(self, "_source_ds_idx", None) with ds: ds.info.update(info) - ds.create_tensor("VDS_INDEX", dtype="uint64") - ds.VDS_INDEX.extend(list(self.index.values[0].indices(len(self)))) + ds.create_tensor("VDS_INDEX", dtype="uint64").extend( + list(self.index.values[0].indices(len(self))) + ) print(f"Virtual dataset stored at {ds.path}") - return ds.path + return ds if _ret_ds else ds.path def _get_view(self): # Only applicable for virtual datasets @@ -1153,4 +1165,4 @@ def _get_empty_vds(self, vds_path=None, query=None, **vds_args): view = self[:0] if query: view._query = query - return view.store(vds_path, **vds_args) + return view.store(vds_path, _ret_ds=True, **vds_args) diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 58dd42b778..1631232cef 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -33,7 +33,7 @@ def filter_dataset( ) vds = ( - dataset._get_empty_vds(result_path, result_ds_args, query=query) + dataset._get_empty_vds(result_path, query=query, **(result_ds_args or {})) if save_result else None ) @@ -60,6 +60,9 @@ def filter_dataset( ds._query = query ds._source_ds_idx = dataset.index.to_json() + + if vds: + ds._vds = vds return ds # type: ignore [this is fine] @@ -80,20 +83,20 @@ def filter_with_compute( vds.autoflush = False vds.info["total_samples"] = len(dataset) vds.info["samples_processed"] = 0 + last_update_time = {"value": time()} def filter_slice(indices: Sequence[int]): result = list() - last_update_time = time() for i in indices: if filter_function(dataset[i]): result.append(i) if vds: - vds.VDS_INDEX.append(i) + vds.VDS_INDEX.append(i) # type: ignore vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if time() - last_update_time > vds_update_frequency: + if time() - last_update_time["value"] > vds_update_frequency: vds.flush() - last_update_time = time() + last_update_time["value"] = time() if vds: vds.autoflush = True return result @@ -138,6 +141,7 @@ def filter_inplace( vds.autoflush = False vds.info["total_samples"] = len(dataset) vds.info["samples_processed"] = 0 + last_update_time = {"value": time()} if progressbar: from tqdm import tqdm # type: ignore @@ -148,11 +152,11 @@ def 
filter_inplace( if filter_function(sample_in): index_map.append(i) if vds: - vds.VDS_INDEX.append(i) + vds.VDS_INDEX.append(i) # type: ignore vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if time() - last_update_time > vds_update_frequency: + if time() - last_update_time["value"] > vds_update_frequency: vds.flush() - last_update_time = time() + last_update_time["value"] = time() if vds: vds.autoflush = True diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index c17acae7d9..d3d2c6d45d 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -185,12 +185,12 @@ def test_dataset_view_save(): for t in view.tensors: np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) - -def test_inplace_dataset_view_save(s3_ds_generator): +@pytest.mark.parametrize("stream", [True, False]) +def test_inplace_dataset_view_save(s3_ds_generator, stream): ds = s3_ds_generator() with ds: _populate_data(ds, n=2) - view = ds.filter("labels == 'dog'") + view = ds.filter("labels == 'dog'", save_result=stream) vds_path = view.store() view2 = hub.dataset(vds_path) for t in view.tensors: From cebd5c8ca45c828b30171fd9c1912f6291699f51 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 20:24:44 +0530 Subject: [PATCH 16/58] format --- hub/core/query/test/test_query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index d3d2c6d45d..5c150d9a0b 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -185,6 +185,7 @@ def test_dataset_view_save(): for t in view.tensors: np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) + @pytest.mark.parametrize("stream", [True, False]) def test_inplace_dataset_view_save(s3_ds_generator, stream): ds = s3_ds_generator() From e21d5c3cdfb2dc490cf1cd5d2e72eee6023ccf85 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 20 Dec 2021 22:53:13 +0530 Subject: [PATCH 17/58] streaming fixes --- hub/core/dataset/dataset.py | 33 ++++++++++++++++++++++++++----- hub/core/query/filter.py | 22 +++++++++++++-------- hub/core/query/test/test_query.py | 4 +++- hub/core/storage/gcs.py | 6 ++++++ hub/core/storage/s3.py | 15 ++++++++++++++ 5 files changed, 66 insertions(+), 14 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index ba3d869178..ebee69dcf3 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -703,7 +703,7 @@ def filter( num_workers: int = 0, scheduler: str = "threaded", progressbar: bool = True, - save_result: bool = False, + store_result: bool = False, result_path: Optional[str] = None, result_ds_args: Optional[dict] = None, ): @@ -718,9 +718,9 @@ def filter( scheduler(str): Scheduler to use for multiprocessing evaluation. `threaded` is default progressbar(bool): Display progress bar while filtering. True is default - save_result (bool): If True, result of the filter will be saved to a dataset asynchronously. - result_path (Optional, str): Path to save the filter result. Only applicable if `save_result` is True. - result_ds_args (Optional, dict): Additional args for result dataset. Only applicable if `save_result` is True. + store_result (bool): If True, result of the filter will be saved to a dataset asynchronously. + result_path (Optional, str): Path to save the filter result. Only applicable if `store_result` is True. + result_ds_args (Optional, dict): Additional args for result dataset. 
Only applicable if `store_result` is True. Returns: View on Dataset with elements, that satisfy filter function @@ -743,7 +743,7 @@ def filter( num_workers=num_workers, scheduler=scheduler, progressbar=progressbar, - save_result=save_result, + store_result=store_result, result_path=result_path, result_ds_args=result_ds_args, ) @@ -1119,6 +1119,10 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): self.storage.flush() base_storage = get_base_storage(self.storage) path = base_storage.subdir(f"queries/{hash}").root + if hasattr(base_storage, "_args"): + args = base_storage._args() + args.update(ds_args) + ds_args = args ds = hub.dataset(path, **ds_args) # type: ignore lock = Lock(base_storage, get_queries_lock_key()) lock.acquire(timeout=10, force=True) @@ -1166,3 +1170,22 @@ def _get_empty_vds(self, vds_path=None, query=None, **vds_args): if query: view._query = query return view.store(vds_path, _ret_ds=True, **vds_args) + + def _get_query_history(self) -> List[str]: + """ + Internal. Returns a list of hashes which can be passed to Dataset._get_stored_vds to get a dataset view. + """ + try: + queries = json.loads(self.storage[get_queries_key()].decode("utf-8")) + return queries + except KeyError: + return [] + + def _get_stored_vds(self, hash: str): + """ + Internal. + """ + base_storage = get_base_storage(self.storage) + path = base_storage.subdir(f"queries/{hash}").root + ds_args = base_storage._args() if hasattr(base_storage, "_args") else {} + return hub.dataset(path, **ds_args)._vds diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 1631232cef..75905deb80 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -16,7 +16,7 @@ def filter_dataset( num_workers: int = 0, scheduler: str = "threaded", progressbar: bool = True, - save_result: bool = False, + store_result: bool = False, result_path: Optional[str] = None, result_ds_args: Optional[dict] = None, ) -> hub.Dataset: @@ -34,7 +34,7 @@ def filter_dataset( vds = ( dataset._get_empty_vds(result_path, query=query, **(result_ds_args or {})) - if save_result + if store_result else None ) @@ -87,14 +87,17 @@ def filter_with_compute( def filter_slice(indices: Sequence[int]): result = list() - - for i in indices: + num_samples = len(indices) + for idx, i in enumerate(indices): if filter_function(dataset[i]): result.append(i) if vds: vds.VDS_INDEX.append(i) # type: ignore vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if time() - last_update_time["value"] > vds_update_frequency: + if ( + idx == num_samples - 1 + or time() - last_update_time["value"] > vds_update_frequency + ): vds.flush() last_update_time["value"] = time() if vds: @@ -136,7 +139,7 @@ def filter_inplace( index_map: List[int] = list() it = enumerate(dataset) - + num_samples = len(dataset) if vds: vds.autoflush = False vds.info["total_samples"] = len(dataset) @@ -146,7 +149,7 @@ def filter_inplace( if progressbar: from tqdm import tqdm # type: ignore - it = tqdm(it, total=len(dataset)) + it = tqdm(it, total=num_samples) for i, sample_in in it: if filter_function(sample_in): @@ -154,7 +157,10 @@ def filter_inplace( if vds: vds.VDS_INDEX.append(i) # type: ignore vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if time() - last_update_time["value"] > vds_update_frequency: + if ( + i == num_samples - 1 + or time() - last_update_time["value"] > vds_update_frequency + ): vds.flush() last_update_time["value"] = time() diff --git a/hub/core/query/test/test_query.py 
b/hub/core/query/test/test_query.py index 5c150d9a0b..3212d6d4d4 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -191,8 +191,10 @@ def test_inplace_dataset_view_save(s3_ds_generator, stream): ds = s3_ds_generator() with ds: _populate_data(ds, n=2) - view = ds.filter("labels == 'dog'", save_result=stream) + view = ds.filter("labels == 'dog'", store_result=stream) + assert len(ds._get_query_history()) == int(stream) vds_path = view.store() + assert len(ds._get_query_history()) == 1 view2 = hub.dataset(vds_path) for t in view.tensors: np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) diff --git a/hub/core/storage/gcs.py b/hub/core/storage/gcs.py index b7851cf392..e84c67361d 100644 --- a/hub/core/storage/gcs.py +++ b/hub/core/storage/gcs.py @@ -311,3 +311,9 @@ def __setstate__(self, state): self.project = state[3] self.read_only = state[4] self._initialize_provider() + + def _args(self) -> Union[str, Dict]: + """ + Internal. Arguments other than the path required to initialize this storage. + """ + return {"creds": self.token} diff --git a/hub/core/storage/s3.py b/hub/core/storage/s3.py index 475c3007d2..765beab9cb 100644 --- a/hub/core/storage/s3.py +++ b/hub/core/storage/s3.py @@ -421,3 +421,18 @@ def need_to_reload_creds(self, err: botocore.exceptions.ClientError) -> bool: err.response["Error"]["Code"] == "ExpiredToken" and self.loaded_creds_from_environment ) + + def _args(self): + """ + Internal. Arguments other than the path required to initialize this storage. + """ + return { + "creds": { + "aws_access_key_id": self.aws_access_key_id, + "aws_secret_access_key": self.aws_secret_access_key, + "aws_session_token": self.aws_session_token, + "endpoint_url": self.endpoint_url, + "aws_region": self.aws_region, + }, + "token": self.token, + } From 1f1e76266d278047ebd51819bc13c09ba8769e52 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 21 Dec 2021 18:28:20 +0530 Subject: [PATCH 18/58] changes for plat --- hub/core/dataset/__init__.py | 7 ++- hub/core/dataset/dataset.py | 61 +++++++++++-------- hub/core/query/filter.py | 111 ++++++++++++++++++++++------------- 3 files changed, 112 insertions(+), 67 deletions(-) diff --git a/hub/core/dataset/__init__.py b/hub/core/dataset/__init__.py index 50d7908edd..c9d2c8650e 100644 --- a/hub/core/dataset/__init__.py +++ b/hub/core/dataset/__init__.py @@ -2,7 +2,7 @@ from .hub_cloud_dataset import HubCloudDataset from hub.util.path import is_hub_cloud_path - +import hub # NOTE: experimentation helper FORCE_CLASS = None @@ -16,6 +16,11 @@ def dataset_factory(path, *args, **kwargs): clz = FORCE_CLASS elif is_hub_cloud_path(path): clz = HubCloudDataset + if "/.queries/" in path: + path, query_hash = path.split("/.queries/", 1) + return hub.dataset(path, *args, **kwargs)._get_stored_vds( + query_hash, as_view=True + ) else: clz = Dataset diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 9707cb2999..0f13584cc5 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -69,6 +69,7 @@ warn_node_checkout, ) from tqdm import tqdm # type: ignore +from time import time import hashlib import json from collections import defaultdict @@ -1219,6 +1220,9 @@ def __result__(self): def __args__(self): return None + def __bool__(self): + return True + def _view_hash(self): return hashlib.sha1( ( @@ -1227,12 +1231,29 @@ def _view_hash(self): ).hexdigest() def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): + tm = getattr(self, "_created_at", 
time()) if len(self.index.values) > 1: raise NotImplementedError("Storing sub-sample slices is not supported yet.") + if path is None and hasattr(self, "_vds"): + return self._vds if _ret_ds else self._vds.path + + hash = self._view_hash() + info = { + "id": hash, + "description": "Virtual Datasource", + "virtual-datasource": True, + "source-dataset": self.path, + "source-dataset-version": self.version_state["commit_id"], + "created_at": tm, + } + + query = getattr(self, "_query", None) + if query: + info["query"] = query + info["source-dataset-index"] = getattr(self, "_source_ds_idx", None) + if path is None: - if hasattr(self, "_vds"): - return self._vds if _ret_ds else self._vds.path if isinstance(self, MemoryProvider): raise NotImplementedError( "Saving views inplace is not supported for in-memory datasets." @@ -1247,10 +1268,9 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): ) else: self.flush() - hash = self._view_hash() self.storage.flush() base_storage = get_base_storage(self.storage) - path = base_storage.subdir(f"queries/{hash}").root + path = base_storage.subdir(f".queries/{hash}").root if hasattr(base_storage, "_args"): args = base_storage._args() args.update(ds_args) @@ -1263,32 +1283,22 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): queries = json.loads(base_storage[queries_key].decode("utf-8")) except KeyError: queries = [] - queries.append(hash) + queries.append(info) base_storage[queries_key] = json.dumps(queries).encode("utf-8") lock.release() - else: ds = hub.empty(path, **ds_args) - - info = { - "description": "Virtual Datasource", - "virtual-datasource": True, - "source-dataset": self.path, - "source-dataset-version": self.version_state["commit_id"], - } - - query = getattr(self, "_query", None) - if query: - info["query"] = query - info["source-dataset-index"] = getattr(self, "_source_ds_idx", None) with ds: ds.info.update(info) ds.create_tensor("VDS_INDEX", dtype="uint64").extend( list(self.index.values[0].indices(len(self))) ) - - print(f"Virtual dataset stored at {ds.path}") - return ds if _ret_ds else ds.path + if isinstance(self, hub.core.dataset.HubCloudDataset): + path = f"{self.path}/.queries/{hash}" + else: + path = ds.path + print(f"Virtual dataset stored at {path}") + return ds if _ret_ds else path def _get_view(self): # Only applicable for virtual datasets @@ -1313,11 +1323,14 @@ def _get_query_history(self) -> List[str]: except KeyError: return [] - def _get_stored_vds(self, hash: str): + def _get_stored_vds(self, hash: str, as_view: bool = False): """ Internal. 
""" base_storage = get_base_storage(self.storage) - path = base_storage.subdir(f"queries/{hash}").root + path = base_storage.subdir(f".queries/{hash}").root ds_args = base_storage._args() if hasattr(base_storage, "_args") else {} - return hub.dataset(path, **ds_args)._vds + view = hub.dataset(path, **ds_args) + if as_view: + return view + return view._vds diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 75905deb80..a473c13b14 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -8,6 +8,8 @@ from time import time import inspect +import threading +import queue def filter_dataset( @@ -22,6 +24,8 @@ def filter_dataset( ) -> hub.Dataset: index_map: List[int] + tm = time() + if isinstance(filter_function, hub.core.query.DatasetQuery): query = filter_function._query else: @@ -38,34 +42,58 @@ def filter_dataset( else None ) - if num_workers > 0: - index_map = filter_with_compute( - dataset, - filter_function, - num_workers, - scheduler, - progressbar, - vds, - ) - else: - index_map = filter_inplace( - dataset, - filter_function, - progressbar, - vds, - ) + index_map = None + try: + if num_workers > 0: + index_map = filter_with_compute( + dataset, + filter_function, + num_workers, + scheduler, + progressbar, + vds, + ) + else: + index_map = filter_inplace( + dataset, + filter_function, + progressbar, + vds, + ) + except Exception as e: + vds.info["error"] = str(e) + raise (e) ds = dataset[index_map] ds._is_filtered_view = True ds._query = query ds._source_ds_idx = dataset.index.to_json() - + ds._created_at = tm if vds: ds._vds = vds return ds # type: ignore [this is fine] +def _get_vds_thread(vds, queue, num_samples, vds_update_frequency): + def loop(): + processed = 0 + last_flushed_time = time() + while True: + index, include = queue.get() + vds.info["samples_processed"] += 1 + if include: + vds.VDS_INDEX.append(index) + processed += 1 + if processed == num_samples: + vds.flush() + break + if time() - last_flushed_time > vds_update_frequency: + vds.flush() + + return threading.Thread(target=loop) + + def filter_with_compute( dataset: hub.Dataset, filter_function: Callable, @@ -83,25 +111,20 @@ def filter_with_compute( vds.autoflush = False vds.info["total_samples"] = len(dataset) vds.info["samples_processed"] = 0 - last_update_time = {"value": time()} + vds_queue = compute.create_queue() + vds_thread = _get_vds_thread(vds, vds_queue, len(dataset), vds_update_frequency) + vds_thread.start() def filter_slice(indices: Sequence[int]): result = list() - num_samples = len(indices) - for idx, i in enumerate(indices): + for i in indices: if filter_function(dataset[i]): result.append(i) if vds: - vds.VDS_INDEX.append(i) # type: ignore - vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if ( - idx == num_samples - 1 - or time() - last_update_time["value"] > vds_update_frequency - ): - vds.flush() - last_update_time["value"] = time() - if vds: - vds.autoflush = True + vds_queue.put((i, True)) + elif vds: + vds_queue.put((i, False)) + return result def pg_filter_slice(pg_callback, indices: Sequence[int]): @@ -110,6 +133,10 @@ def pg_filter_slice(pg_callback, indices: Sequence[int]): pg_callback(1) if filter_function(dataset[i]): result.append(i) + if vds: + vds_queue.put((i, True)) + elif vds: + vds_queue.put((i, False)) return result @@ -125,7 +152,11 @@ def pg_filter_slice(pg_callback, indices: Sequence[int]): finally: compute.close() - + if vds: + vds.autoflush = True + vds_thread.join() + if hasattr(vds_queue, "close"): + vds_queue.close() return 
index_map @@ -144,8 +175,9 @@ def filter_inplace( vds.autoflush = False vds.info["total_samples"] = len(dataset) vds.info["samples_processed"] = 0 - last_update_time = {"value": time()} - + vds_queue = queue.Queue() + vds_thread = _get_vds_thread(vds, vds_queue, num_samples, vds_update_frequency) + vds_thread.start() if progressbar: from tqdm import tqdm # type: ignore @@ -155,16 +187,11 @@ def filter_inplace( if filter_function(sample_in): index_map.append(i) if vds: - vds.VDS_INDEX.append(i) # type: ignore - vds.info["samples_processed"] = vds.info["samples_processed"] + 1 - if ( - i == num_samples - 1 - or time() - last_update_time["value"] > vds_update_frequency - ): - vds.flush() - last_update_time["value"] = time() - + vds_queue.put((i, True)) + elif vds: + vds_queue.put((i, False)) if vds: vds.autoflush = True + vds_thread.join() return index_map From 11b902b1441e3710fa65d3112b56cdd24cdc67d4 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 22 Dec 2021 18:35:42 +0530 Subject: [PATCH 19/58] some refacs --- hub/core/dataset/__init__.py | 8 ++++++-- hub/core/dataset/dataset.py | 17 +++++++++++++++-- hub/core/query/test/test_query.py | 5 +++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/hub/core/dataset/__init__.py b/hub/core/dataset/__init__.py index c9d2c8650e..5cd425a2fc 100644 --- a/hub/core/dataset/__init__.py +++ b/hub/core/dataset/__init__.py @@ -2,7 +2,6 @@ from .hub_cloud_dataset import HubCloudDataset from hub.util.path import is_hub_cloud_path -import hub # NOTE: experimentation helper FORCE_CLASS = None @@ -16,9 +15,14 @@ def dataset_factory(path, *args, **kwargs): clz = FORCE_CLASS elif is_hub_cloud_path(path): clz = HubCloudDataset + if "/queries/" in path: + path, query_hash = path.split("/queries/", 1) + return dataset_factory( + f"{path}/queries/.queries/{query_hash}", *args, **kwargs + ) if "/.queries/" in path: path, query_hash = path.split("/.queries/", 1) - return hub.dataset(path, *args, **kwargs)._get_stored_vds( + return dataset_factory(path, *args, **kwargs)._get_stored_vds( query_hash, as_view=True ) else: diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 730118f32e..5d30bc2fa5 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -42,6 +42,7 @@ InvalidTensorGroupNameError, LockedException, TensorGroupAlreadyExistsError, + ReadOnlyModeError, ) from hub.util.keys import ( dataset_exists, @@ -68,6 +69,9 @@ load_meta, warn_node_checkout, ) +from hub.client.utils import get_user_name + + from tqdm import tqdm # type: ignore from time import time import hashlib @@ -1260,10 +1264,19 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): ) if self.read_only: if isinstance(self, hub.core.dataset.HubCloudDataset): - path = f"hub://{self.org_id}/_query_{self._view_hash}" + + # This is a special dataset that stores queries that the user ran on datasets they don't have write access to. + username = get_user_name() + if username == "public": + raise ReadOnlyModeError( + "Cannot save view in read only dataset. Speicify a path to store the view in a different location, or login using command `activeloop login` to store the view under your hub account." + ) + queries_ds_path = f"hub://{username}/queries" + queries_ds = hub.dataset(queries_ds_path) + path = f"{queries_ds_path}/{self._view_hash}" ds = hub.empty(path, **ds_args) else: - raise Exception( + raise ReadOnlyModeError( "Cannot save view in read only dataset. Speicify a path to store the view in a different location." 
) else: diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 3212d6d4d4..11e55fec1c 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -187,11 +187,12 @@ def test_dataset_view_save(): @pytest.mark.parametrize("stream", [True, False]) -def test_inplace_dataset_view_save(s3_ds_generator, stream): +@pytest.mark.parametrize("num_workers", [0, 2]) +def test_inplace_dataset_view_save(s3_ds_generator, stream, num_workers): ds = s3_ds_generator() with ds: _populate_data(ds, n=2) - view = ds.filter("labels == 'dog'", store_result=stream) + view = ds.filter("labels == 'dog'", store_result=stream, num_workers=num_workers) assert len(ds._get_query_history()) == int(stream) vds_path = view.store() assert len(ds._get_query_history()) == 1 From bd4b10bf0e3459309e32c6e3e88adc95b2f58832 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 22 Dec 2021 21:40:26 +0530 Subject: [PATCH 20/58] upd --- hub/core/dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 5d30bc2fa5..41388a60b4 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1272,7 +1272,7 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): "Cannot save view in read only dataset. Speicify a path to store the view in a different location, or login using command `activeloop login` to store the view under your hub account." ) queries_ds_path = f"hub://{username}/queries" - queries_ds = hub.dataset(queries_ds_path) + hub.dataset(queries_ds_path) # create if doesn't exist path = f"{queries_ds_path}/{self._view_hash}" ds = hub.empty(path, **ds_args) else: From 5cc5ea68c300d649410ec799e8a5284abaf9051f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 23 Dec 2021 11:36:37 +0530 Subject: [PATCH 21/58] updates --- hub/core/dataset/dataset.py | 54 ++++++++++++++++++++----------------- hub/util/storage.py | 9 ++++--- hub/util/tag.py | 26 +++++++++++++++--- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 41388a60b4..787a3910f2 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1234,14 +1234,8 @@ def _view_hash(self): ).encode() ).hexdigest() - def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): + def _get_view_info(self): tm = getattr(self, "_created_at", time()) - if len(self.index.values) > 1: - raise NotImplementedError("Storing sub-sample slices is not supported yet.") - - if path is None and hasattr(self, "_vds"): - return self._vds if _ret_ds else self._vds.path - hash = self._view_hash() info = { "id": hash, @@ -1256,7 +1250,17 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): if query: info["query"] = query info["source-dataset-index"] = getattr(self, "_source_ds_idx", None) + return info + def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): + if len(self.index.values) > 1: + raise NotImplementedError("Storing sub-sample slices is not supported yet.") + + if path is None and hasattr(self, "_vds"): + return self._vds if _ret_ds else self._vds.path + + info = self._get_view_info() + hash = info["id"] if path is None: if isinstance(self, MemoryProvider): raise NotImplementedError( @@ -1272,8 +1276,10 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): "Cannot save view in read only dataset. 
Speicify a path to store the view in a different location, or login using command `activeloop login` to store the view under your hub account." ) queries_ds_path = f"hub://{username}/queries" - hub.dataset(queries_ds_path) # create if doesn't exist - path = f"{queries_ds_path}/{self._view_hash}" + base_storage = get_base_storage( + hub.dataset(queries_ds_path).storage + ) # create if doesn't exist + path = f"{queries_ds_path}/{hash}" ds = hub.empty(path, **ds_args) else: raise ReadOnlyModeError( @@ -1289,27 +1295,27 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): args.update(ds_args) ds_args = args ds = hub.dataset(path, **ds_args) # type: ignore - lock = Lock(base_storage, get_queries_lock_key()) - lock.acquire(timeout=10, force=True) - queries_key = get_queries_key() - try: - queries = json.loads(base_storage[queries_key].decode("utf-8")) - except KeyError: - queries = [] - queries.append(info) - base_storage[queries_key] = json.dumps(queries).encode("utf-8") - lock.release() + if isinstance(self, hub.core.dataset.HubCloudDataset): + path = f"{self.path}/.queries/{hash}" + lock = Lock(base_storage, get_queries_lock_key()) + lock.acquire(timeout=10, force=True) + queries_key = get_queries_key() + try: + queries = json.loads(base_storage[queries_key].decode("utf-8")) + except KeyError: + queries = [] + info = self._get_view_info() + queries.append(info) + base_storage[queries_key] = json.dumps(queries).encode("utf-8") + lock.release() else: ds = hub.empty(path, **ds_args) with ds: - ds.info.update(info) + ds.info.update(info or self._get_view_info()) ds.create_tensor("VDS_INDEX", dtype="uint64").extend( list(self.index.values[0].indices(len(self))) ) - if isinstance(self, hub.core.dataset.HubCloudDataset): - path = f"{self.path}/.queries/{hash}" - else: - path = ds.path + print(f"Virtual dataset stored at {path}") return ds if _ret_ds else path diff --git a/hub/util/storage.py b/hub/util/storage.py index c07294ec1a..16807839b4 100644 --- a/hub/util/storage.py +++ b/hub/util/storage.py @@ -1,12 +1,13 @@ from hub.core.storage.gcs import GCSProvider from hub.util.cache_chain import generate_chain from hub.constants import LOCAL_CACHE_PREFIX, MB -from hub.util.tag import check_hub_path +from hub.util.tag import process_hub_path from typing import Optional from hub.core.storage.provider import StorageProvider import os from hub.core.storage import LocalProvider, S3Provider, MemoryProvider, LRUCache from hub.client.client import HubBackendClient +import posixpath def storage_provider_from_path( @@ -65,9 +66,7 @@ def storage_provider_from_path( def storage_provider_from_hub_path( path: str, read_only: bool = False, token: str = None ): - check_hub_path(path) - tag = path[6:] - org_id, ds_name = tag.split("/") + path, org_id, ds_name, subdir = process_hub_path(path) client = HubBackendClient(token=token) mode = "r" if read_only else None @@ -80,6 +79,8 @@ def storage_provider_from_hub_path( print("Opening dataset in read-only mode as you don't have write permissions.") read_only = True + url = posixpath.join(url, subdir) + storage = storage_provider_from_path(url, creds, read_only) storage._set_hub_creds_info(path, expiration) return storage diff --git a/hub/util/tag.py b/hub/util/tag.py index b0096ca64d..e49108a03e 100644 --- a/hub/util/tag.py +++ b/hub/util/tag.py @@ -1,8 +1,28 @@ from hub.util.exceptions import InvalidHubPathException +from typing import Tuple -def check_hub_path(path): - """Checks whether tag is in the format hub://username/datasetname.""" 
+def process_hub_path(path: str) -> Tuple[str, str, str, str]: + """Checks whether path is a valid hub path.""" + # Allowed formats: + # hub://org/ds + # hub://org/ds/.queries/hash + # hub://org/queries/hash + # hub//org/queries/.queries/hash + tag = path[6:] - if len(tag.split("/")) != 2: + s = tag.split("/") + if len(s) == 2: + if s[1] == "queries": # Attempting to open queries ds root + raise InvalidHubPathException(path) + return (path, *s, "") + elif len(s) == 3: + if s[1] != "queries": + raise InvalidHubPathException(path) + return (f"hub://{s[0]}/queries/.queries/{s[2]}", *s[:2], f".queries/{s[2]}") + elif len(s) == 4: + if s[2] != ".queries": + raise InvalidHubPathException(path) + return (path, *s[:2], f".queries/{s[3]}") + else: raise InvalidHubPathException(path) From 32416101e0e5f2bea9c3f64507e832cb62412e42 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 19:13:54 +0530 Subject: [PATCH 22/58] fixes --- hub/api/dataset.py | 1 + hub/core/dataset/dataset.py | 65 ++++++++++++++++++------------- hub/core/query/test/test_query.py | 2 +- hub/core/storage/gcs.py | 6 --- hub/core/storage/s3.py | 15 ------- 5 files changed, 39 insertions(+), 50 deletions(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 3bffe40f7b..7b56b103a0 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -83,6 +83,7 @@ def __new__( try: read_only = storage.read_only + assert not read_only return dataset_factory( path=path, storage=cache_chain, diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 2c899038fc..9a6b02fab8 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -11,7 +11,7 @@ from hub.api.info import load_info from hub.client.log import logger from hub.constants import FIRST_COMMIT_ID -from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE +from hub.constants import DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE, MB from hub.core.fast_forwarding import ffw_dataset_meta from hub.core.index import Index from hub.core.lock import lock_version, unlock_version, Lock @@ -1387,39 +1387,27 @@ def _write_queries_json(ds, info): lock.release() def _write_vds(self, vds): - """Writes the indices of this view to a vds""" + """Writes the indices of this view to a vds.""" info = self._get_view_info() with vds: vds.info.update(info) vds.create_tensor("VDS_INDEX", dtype="uint64").extend( list(self.index.values[0].indices(len(self))) ) + idxs = hub.dataset(vds.path)._vds.VDS_INDEX.numpy().reshape(-1).tolist() + exp = list(self.index.values[0].indices(len(self))) + assert idxs == exp, (idxs, exp, vds.path) def _store_view_in_subdir(self): """Stores this view under ".queries" sub directory of same storage.""" info = self._get_view_info() hash = info["id"] - + path = f".queries/{hash}" self.flush() - base_storage = get_base_storage(self.storage) - sub_storage = base_storage.subdir(f".queries/{hash}") - sub_storage.clear() - path = ( - self.path + "/.queries/" + hash - if self.path.startswith("hub://") - else sub_storage.root - ) - vds = hub.Dataset( - generate_chain( - sub_storage, DEFAULT_MEMORY_CACHE_SIZE, DEFAULT_LOCAL_CACHE_SIZE - ), - token=self._token, - path=path, - ) - + get_base_storage(self.storage).subdir(path).clear() + vds = self._sub_ds(path, empty=True) self._write_vds(vds) - Dataset._write_queries_json(self, info) return vds @@ -1503,14 +1491,35 @@ def _get_query_history(self) -> List[str]: except KeyError: return [] - def _get_stored_vds(self, hash: str, as_view: bool = False): + + def 
_sub_ds(self, path, empty=False): + """Loads a nested dataset. Internal. + Note: Virtual datasets are returned as such, they are not converted to views. + + Args: + empty (bool): If True, all contents of the sub directory is cleared before initializing the sub dataset. + + Returns: + Sub dataset + """ + base_storage = get_base_storage(self.storage) + sub_storage = base_storage.subdir(path) + + if self.path.startswith("hub://"): + path = posixpath.join(self.path, path) + cls = hub.core.dataset.HubCloudDataset + else: + path=sub_storage.root + cls = hub.core.dataset.Dataset + + return cls(generate_chain( + sub_storage, DEFAULT_MEMORY_CACHE_SIZE * MB, DEFAULT_LOCAL_CACHE_SIZE * MB + ), path=path, token=self._token) + + + + def _get_stored_vds(self, hash: str): """ Internal. """ - base_storage = get_base_storage(self.storage) - path = base_storage.subdir(f".queries/{hash}").root - ds_args = base_storage._args() if hasattr(base_storage, "_args") else {} - view = hub.dataset(path, **ds_args) - if as_view: - return view - return view._vds + return self._get_sub_ds(".queries/" + hash) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index e8f70f7c29..4c4f72bf1d 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -199,7 +199,7 @@ def test_dataset_view_save(): np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) -@pytest.mark.parametrize("stream", [True, False]) +@pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("num_workers", [0, 2]) def test_inplace_dataset_view_save(s3_ds_generator, stream, num_workers): ds = s3_ds_generator() diff --git a/hub/core/storage/gcs.py b/hub/core/storage/gcs.py index e84c67361d..b7851cf392 100644 --- a/hub/core/storage/gcs.py +++ b/hub/core/storage/gcs.py @@ -311,9 +311,3 @@ def __setstate__(self, state): self.project = state[3] self.read_only = state[4] self._initialize_provider() - - def _args(self) -> Union[str, Dict]: - """ - Internal. Arguments other than the path required to initialize this storage. - """ - return {"creds": self.token} diff --git a/hub/core/storage/s3.py b/hub/core/storage/s3.py index 765beab9cb..475c3007d2 100644 --- a/hub/core/storage/s3.py +++ b/hub/core/storage/s3.py @@ -421,18 +421,3 @@ def need_to_reload_creds(self, err: botocore.exceptions.ClientError) -> bool: err.response["Error"]["Code"] == "ExpiredToken" and self.loaded_creds_from_environment ) - - def _args(self): - """ - Internal. Arguments other than the path required to initialize this storage. - """ - return { - "creds": { - "aws_access_key_id": self.aws_access_key_id, - "aws_secret_access_key": self.aws_secret_access_key, - "aws_session_token": self.aws_session_token, - "endpoint_url": self.endpoint_url, - "aws_region": self.aws_region, - }, - "token": self.token, - } From d38259b48e5cc12e3d85d7fb166e4574a207a52a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 19:15:38 +0530 Subject: [PATCH 23/58] format --- hub/core/dataset/dataset.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 9a6b02fab8..c2912b0106 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1491,7 +1491,6 @@ def _get_query_history(self) -> List[str]: except KeyError: return [] - def _sub_ds(self, path, empty=False): """Loads a nested dataset. Internal. Note: Virtual datasets are returned as such, they are not converted to views. 
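The view-saving machinery built up in the patches above (store(), _store_view_in_subdir(), _get_stored_vds() and the reworked tests) amounts to a save-and-reload round trip for filtered views. A minimal usage sketch of that flow — the dataset path and the "labels == 'dog'" query below are placeholders taken from the tests, and the exact VDS path depends on where the view ends up being stored:

    import hub

    ds = hub.dataset("hub://org/ds")       # hypothetical source dataset
    view = ds.filter("labels == 'dog'")    # filtered view; the query string is kept with it
    vds_path = view.store()                # indices persisted as a virtual dataset (VDS)
    view2 = hub.dataset(vds_path)          # loading the VDS path yields the same view again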
@@ -1509,14 +1508,18 @@ def _sub_ds(self, path, empty=False): path = posixpath.join(self.path, path) cls = hub.core.dataset.HubCloudDataset else: - path=sub_storage.root + path = sub_storage.root cls = hub.core.dataset.Dataset - return cls(generate_chain( - sub_storage, DEFAULT_MEMORY_CACHE_SIZE * MB, DEFAULT_LOCAL_CACHE_SIZE * MB - ), path=path, token=self._token) - - + return cls( + generate_chain( + sub_storage, + DEFAULT_MEMORY_CACHE_SIZE * MB, + DEFAULT_LOCAL_CACHE_SIZE * MB, + ), + path=path, + token=self._token, + ) def _get_stored_vds(self, hash: str): """ From 18e06f19d85f5c36a7cfaaf2cd2974005f5fc98d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 19:17:02 +0530 Subject: [PATCH 24/58] rem debug ln --- hub/api/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/hub/api/dataset.py b/hub/api/dataset.py index 7b56b103a0..3bffe40f7b 100644 --- a/hub/api/dataset.py +++ b/hub/api/dataset.py @@ -83,7 +83,6 @@ def __new__( try: read_only = storage.read_only - assert not read_only return dataset_factory( path=path, storage=cache_chain, From e032b8248f4fc3f92b56c3643d101e6b91535a94 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 19:48:45 +0530 Subject: [PATCH 25/58] simplify --- hub/constants.py | 2 ++ hub/core/dataset/__init__.py | 10 ---------- hub/util/tag.py | 34 ++++++++++++++++++++-------------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/hub/constants.py b/hub/constants.py index c48b1d6278..7973c28d6d 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -115,3 +115,5 @@ QUERIES_FILENAME = "queries.json" QUERIES_LOCK_FILENAME = "queries.lock" + +_ENABLE_HUB_SUB_DATASETS = False diff --git a/hub/core/dataset/__init__.py b/hub/core/dataset/__init__.py index c25765d2f7..7ec4bf361c 100644 --- a/hub/core/dataset/__init__.py +++ b/hub/core/dataset/__init__.py @@ -15,16 +15,6 @@ def dataset_factory(path, *args, **kwargs): clz = FORCE_CLASS elif is_hub_cloud_path(path): clz = HubCloudDataset - if "/queries/" in path: - path, query_hash = path.split("/queries/", 1) - return dataset_factory( - f"{path}/queries/.queries/{query_hash}", *args, **kwargs - ) - if "/.queries/" in path: - path, query_hash = path.split("/.queries/", 1) - return dataset_factory(path, *args, **kwargs)._get_stored_vds( - query_hash, as_view=True - ) else: clz = Dataset diff --git a/hub/util/tag.py b/hub/util/tag.py index e49108a03e..4b9c08e685 100644 --- a/hub/util/tag.py +++ b/hub/util/tag.py @@ -1,5 +1,6 @@ from hub.util.exceptions import InvalidHubPathException from typing import Tuple +import hub def process_hub_path(path: str) -> Tuple[str, str, str, str]: @@ -8,21 +9,26 @@ def process_hub_path(path: str) -> Tuple[str, str, str, str]: # hub://org/ds # hub://org/ds/.queries/hash # hub://org/queries/hash - # hub//org/queries/.queries/hash + # hub://org/queries/.queries/hash + # hub://org/ds/sub_ds1/sub_ds2/sub_ds3/..../sub_ds{n} # Only for internal usage. 
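    # A few illustrative expansions of the formats listed above ("org", "ds" and
    # "somehash" are placeholder names, not real datasets):
    #   hub://org/ds                    -> the dataset root itself (empty subdir)
    #   hub://org/ds/.queries/somehash  -> a view stored under that dataset's ".queries" folder
    #   hub://org/queries/somehash      -> shorthand for hub://org/queries/.queries/somehash,
    #                                      i.e. a view saved to the user's own "queries" dataset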
tag = path[6:] s = tag.split("/") - if len(s) == 2: - if s[1] == "queries": # Attempting to open queries ds root - raise InvalidHubPathException(path) - return (path, *s, "") - elif len(s) == 3: - if s[1] != "queries": - raise InvalidHubPathException(path) - return (f"hub://{s[0]}/queries/.queries/{s[2]}", *s[:2], f".queries/{s[2]}") - elif len(s) == 4: - if s[2] != ".queries": - raise InvalidHubPathException(path) - return (path, *s[:2], f".queries/{s[3]}") - else: + + if len(s) < 2: raise InvalidHubPathException(path) + + path = f"hub://{s[0]}/{s[1]}" + + if len(s) == 3 and s[1] == "queries" and not s[2].startswith("."): + # Special case: expand hub://username/queries/hash to hub://username/queries/.queries/hash + subdir = f"queries/.queries/{s[2]}" + else: + subdir = "/".join(s[2:]) + if len(s) > 2: + if ( + not (len(s) == 4 and s[2] == ".queries") + and not hub.constants._ENABLE_HUB_SUB_DATASETS + ): + raise InvalidHubPathException(path) + return path, s[:2], subdir From 06f6a629224dab60082fce2a62febe3ae8d1773d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 20:01:41 +0530 Subject: [PATCH 26/58] hub tests --- hub/core/query/test/test_query.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 4c4f72bf1d..26aef5e8ed 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -5,6 +5,11 @@ from hub.core.query import DatasetQuery from hub.core.query.query import EvalGenericTensor, EvalLabelClassTensor from hub.core.index import Index +from hub.tests.dataset_fixtures import ( + local_ds_generator, + s3_ds_generator, + hub_ds_generator, +) import hub @@ -201,8 +206,16 @@ def test_dataset_view_save(): @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("num_workers", [0, 2]) -def test_inplace_dataset_view_save(s3_ds_generator, stream, num_workers): - ds = s3_ds_generator() +@pytest.mark.parametrize( + "ds_generator", + [s3_ds_generator, local_ds_generator, hub_ds_generator], + indirect=True, +) +@pytest.mark.parametrize("read_only", [False, True]) +def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only): + ds = ds_generator() + if read_only and not ds.path.startswith("hub://"): + return with ds: _populate_data(ds, n=2) view = ds.filter("labels == 'dog'", store_result=stream, num_workers=num_workers) @@ -210,5 +223,11 @@ def test_inplace_dataset_view_save(s3_ds_generator, stream, num_workers): vds_path = view.store() assert len(ds._get_query_history()) == 1 view2 = hub.dataset(vds_path) + if ds.path.startswith("hub://"): + assert vds_path.startswith("hub://") + if read_only: + assert vds_path[6:].split("/")[1] == "queries" + else: + assert ds.path + "/.queries/" in vds_path for t in view.tensors: np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) From 49198c50d64900e1698d37cba8c119e85c535ea6 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 20:42:20 +0530 Subject: [PATCH 27/58] cleanup --- hub/core/query/test/test_query.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 26aef5e8ed..c4d602e746 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -231,3 +231,7 @@ def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only) assert ds.path + "/.queries/" in vds_path for t in view.tensors: 
np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) + if ds.path.startswith("hub://") and read_only: + # Delete queries ds from testing acc: + org = ds.path[6:].split("/")[1] + hub.delete(f"hub://{org}/queries", large_ok=True) From 7aeec7e0eb821fb37524afac693963505e4fc2ac Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 20:53:44 +0530 Subject: [PATCH 28/58] import fix --- hub/core/query/test/test_query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index c4d602e746..2c3248ec14 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -8,7 +8,7 @@ from hub.tests.dataset_fixtures import ( local_ds_generator, s3_ds_generator, - hub_ds_generator, + hub_cloud_ds_generator, ) import hub @@ -208,7 +208,7 @@ def test_dataset_view_save(): @pytest.mark.parametrize("num_workers", [0, 2]) @pytest.mark.parametrize( "ds_generator", - [s3_ds_generator, local_ds_generator, hub_ds_generator], + [s3_ds_generator, local_ds_generator, hub_cloud_ds_generator], indirect=True, ) @pytest.mark.parametrize("read_only", [False, True]) From 506d28f7411002d34f207b898c39f67d5def9b75 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 21:54:34 +0530 Subject: [PATCH 29/58] fix tag --- hub/util/tag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/util/tag.py b/hub/util/tag.py index 4b9c08e685..3ae75194a0 100644 --- a/hub/util/tag.py +++ b/hub/util/tag.py @@ -31,4 +31,4 @@ def process_hub_path(path: str) -> Tuple[str, str, str, str]: and not hub.constants._ENABLE_HUB_SUB_DATASETS ): raise InvalidHubPathException(path) - return path, s[:2], subdir + return path, *s[:2], subdir From 9aa31759c6e8a1ee94807be24db6583143cc1757 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 22:01:16 +0530 Subject: [PATCH 30/58] format --- hub/util/tag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/util/tag.py b/hub/util/tag.py index 3ae75194a0..cfb8a019d1 100644 --- a/hub/util/tag.py +++ b/hub/util/tag.py @@ -31,4 +31,4 @@ def process_hub_path(path: str) -> Tuple[str, str, str, str]: and not hub.constants._ENABLE_HUB_SUB_DATASETS ): raise InvalidHubPathException(path) - return path, *s[:2], subdir + return (path, *s[:2], subdir) From b8907a3dc69a6ec1de90dd89b736955c74ea3bf9 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 22:13:44 +0530 Subject: [PATCH 31/58] mypy --- hub/core/query/filter.py | 8 ++++---- hub/util/tag.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 0a2e72b08e..d8ae0310bd 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional, Sequence +from typing import Callable, List, Optional, Sequence, Dict from uuid import uuid4 import hub @@ -21,7 +21,7 @@ _UPDATE_FREQUENCY = 5 # seconds -_LAST_UPDATED_TIMES = defaultdict(time) +_LAST_UPDATED_TIMES: Dict = defaultdict(time) def _counter(id): @@ -79,7 +79,7 @@ def filter_dataset( else None ) - index_map = None + index_map = None # type: ignore try: if num_workers > 0: index_map = filter_with_compute( @@ -263,7 +263,7 @@ def filter_inplace( vds.autoflush = False vds.info["total_samples"] = len(dataset) vds.info["samples_processed"] = 0 - vds_queue = Queue() + vds_queue: Queue = Queue() vds_thread = _get_vds_thread(vds, vds_queue, num_samples) vds_thread.start() if progressbar: diff 
--git a/hub/util/tag.py b/hub/util/tag.py index cfb8a019d1..87cefbe528 100644 --- a/hub/util/tag.py +++ b/hub/util/tag.py @@ -31,4 +31,4 @@ def process_hub_path(path: str) -> Tuple[str, str, str, str]: and not hub.constants._ENABLE_HUB_SUB_DATASETS ): raise InvalidHubPathException(path) - return (path, *s[:2], subdir) + return (path, *s[:2], subdir) # type: ignore From b5f9990abd2f4faebaca6567f67f3c9844a62abe Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 29 Dec 2021 23:58:45 +0530 Subject: [PATCH 32/58] fixt fix --- hub/core/query/test/test_query.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 2c3248ec14..f90132d3fc 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -5,11 +5,7 @@ from hub.core.query import DatasetQuery from hub.core.query.query import EvalGenericTensor, EvalLabelClassTensor from hub.core.index import Index -from hub.tests.dataset_fixtures import ( - local_ds_generator, - s3_ds_generator, - hub_cloud_ds_generator, -) +from hub.tests.dataset_fixtures import enabled_persistent_dataset_generators import hub @@ -204,13 +200,9 @@ def test_dataset_view_save(): np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) +@enabled_persistent_dataset_generators @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("num_workers", [0, 2]) -@pytest.mark.parametrize( - "ds_generator", - [s3_ds_generator, local_ds_generator, hub_cloud_ds_generator], - indirect=True, -) @pytest.mark.parametrize("read_only", [False, True]) def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only): ds = ds_generator() From 2f0c3a6d50ae9d0fe610f3f687b8ae857ddf867a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Dec 2021 21:02:30 +0530 Subject: [PATCH 33/58] updates --- hub/api/tests/test_events.py | 2 +- hub/constants.py | 3 + hub/core/dataset/dataset.py | 144 ++++++++++++++++++++++++++--------- hub/core/query/filter.py | 92 +++++++++++----------- hub/core/storage/s3.py | 5 +- 5 files changed, 163 insertions(+), 83 deletions(-) diff --git a/hub/api/tests/test_events.py b/hub/api/tests/test_events.py index 602b373a8f..72a44a5d20 100644 --- a/hub/api/tests/test_events.py +++ b/hub/api/tests/test_events.py @@ -13,7 +13,7 @@ def test_query_progress_event(hub_cloud_ds): ds.labels.append([0]) ds.labels.append([1]) - result = ds.filter("labels == 0", progressbar=False) + result = ds.filter("labels == 0", progressbar=False, store_result=True) assert len(result) == 1 diff --git a/hub/constants.py b/hub/constants.py index 7973c28d6d..8eca737991 100644 --- a/hub/constants.py +++ b/hub/constants.py @@ -117,3 +117,6 @@ QUERIES_LOCK_FILENAME = "queries.lock" _ENABLE_HUB_SUB_DATASETS = False + +# Frequency for sending progress events and writing to vds +QUERY_PROGRESS_UPDATE_FREQUENCY = 5 # seconds diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index c2912b0106..772c341b60 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -28,6 +28,7 @@ from hub.util.bugout_reporter import hub_reporter from hub.util.dataset import try_flushing from hub.util.cache_chain import generate_chain +from hub.util.hash import hash_inputs from hub.util.exceptions import ( CouldNotCreateNewDatasetException, InvalidKeyTypeError, @@ -130,8 +131,6 @@ def __init__( d["_locked_out"] = False # User requested write access but was denied d["is_iteration"] = is_iteration d["is_first_load"] = 
is_first_load = version_state is None - # self.__dict__.update(d) - # d.clear() d["index"] = index or Index() d["group_index"] = group_index d["_token"] = token @@ -145,6 +144,8 @@ def __init__( self._initial_autoflush: List[ bool ] = [] # This is a stack to support nested with contexts + self._is_filtered_view = False + self._view_info = None def _lock_lost_handler(self): """This is called when lock is acquired but lost later on due to slow update.""" @@ -189,19 +190,23 @@ def __getstate__(self) -> Dict[str, Any]: """ if self.path.startswith("mem://"): raise MemoryDatasetCanNotBePickledError - return { - "path": self.path, - "_read_only": self._read_only, - "index": self.index, - "group_index": self.group_index, - "public": self.public, - "storage": self.storage, - "_token": self.token, - "verbose": self.verbose, - "version_state": self.version_state, - "org_id": self.org_id, - "ds_name": self.ds_name, - } + keys = [ + "path", + "_read_only", + "index", + "group_index", + "public", + "storage", + "_token", + "verbose", + "version_state", + "org_id", + "ds_name", + "_is_filtered_view", + "_view_info", + ] + state = {k: getattr(self, k) for k in keys} + return state def __setstate__(self, state: Dict[str, Any]): """Restores dataset from a pickled state. @@ -584,7 +589,7 @@ def commit(self, message: Optional[str] = None) -> str: return self._commit(message) def _commit(self, message: Optional[str] = None, hash: Optional[str] = None) -> str: - if getattr(self, "_is_filterd_view", False): + if self._is_filtered_view: raise Exception( "Cannot perform version control operations on a filtered dataset view." ) @@ -625,7 +630,7 @@ def checkout(self, address: str, create: bool = False) -> Optional[str]: def _checkout( self, address: str, create: bool = False, hash: Optional[str] = None ) -> Optional[str]: - if getattr(self, "_is_filterd_view", False): + if self._is_filtered_view: raise Exception( "Cannot perform version control operations on a filtered dataset view." 
            )
@@ -1343,15 +1348,17 @@ def append(self, sample: Dict[str, Any], skip_ok: bool = False):
                     ) from e2
                 raise e
 
-    def _view_hash(self):
-        return hashlib.sha1(
-            (
-                f"{self.path}[{':'.join(str(e.value) for e in self.index.values)}]@{self.version_state['commit_id']}&{getattr(self, '_query', None)}"
-            ).encode()
-        ).hexdigest()
+    def _view_hash(self) -> str:
+        """Generates a unique hash for a filtered dataset view."""
+        return hash_inputs(
+            self.path,
+            *[e.value for e in self.index.values],
+            self.pending_commit_id,
+            getattr(self, "_query", None),
+        )
 
     def _get_view_info(self):
-        if not hasattr(self, "_view_info"):
+        if self._view_info is None:
             tm = getattr(self, "_created_at", time())
             hash = self._view_hash()
             info = {
@@ -1394,9 +1401,6 @@ def _write_vds(self, vds):
             vds.create_tensor("VDS_INDEX", dtype="uint64").extend(
                 list(self.index.values[0].indices(len(self)))
             )
-            idxs = hub.dataset(vds.path)._vds.VDS_INDEX.numpy().reshape(-1).tolist()
-            exp = list(self.index.values[0].indices(len(self)))
-            assert idxs == exp, (idxs, exp, vds.path)
 
     def _store_view_in_subdir(self):
         """Stores this view under ".queries" sub directory of same storage."""
@@ -1442,7 +1446,40 @@ def _store_view_in_path(self, path, **ds_args):
         self._write_vds(vds)
         return vds
 
-    def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args):
+    def store(self, path: Optional[str] = None, **ds_args) -> str:
+        """Stores a dataset view as a virtual dataset (VDS)
+
+        Args:
+            path (Optional, str): If specified, the VDS will be stored as a standalone dataset at the specified path. If not,
+                the VDS is stored under `.queries` subdirectory of the source dataset's storage. If the user doesn't have
+                write access to the source dataset and the source dataset is a hub cloud dataset, then the VDS is stored
+                under the user's hub account and can be accessed using hub.load(f"hub://{username}/queries/{query_hash}").
+            ds_args (dict): Additional args for creating VDS when path is specified. (See documentation for `hub.dataset()`)
+
+        Returns:
+            (str) Path to the stored VDS.
+        """
+        return self._store(path, False, **ds_args)
+
+    def _store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args):
+        """Stores a dataset view as a virtual dataset (VDS)
+
+        Args:
+            path (Optional, str): If specified, the VDS will be stored as a standalone dataset at the specified path. If not,
+                the VDS is stored under `.queries` subdirectory of the source dataset's storage. If the user doesn't have
+                write access to the source dataset and the source dataset is a hub cloud dataset, then the VDS is stored
+                under the user's hub account and can be accessed using hub.load(f"hub://{username}/queries/{query_hash}").
+            _ret_ds (Optional, str): If True, the VDS is returned as such without converting it to a view. If False, the VDS path is returned.
+                Default False.
+            ds_args (dict): Additional args for creating VDS when path is specified. (See documentation for `hub.dataset()`)
+
+        Returns:
+            If _ret_ds is True, the VDS is returned, else path to the VDS is returned.
+
+        Raises:
+            NotImplementedError: When storing sub-sample slices and saving views inplace for in-memory datasets.
+            ReadOnlyModeError: When attempting to save a view inplace and the user doesn't have write access.
+ """ if len(self.index.values) > 1: raise NotImplementedError("Storing sub-sample slices is not supported yet.") @@ -1469,17 +1506,37 @@ def store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args): return vds.path def _get_view(self): - # Only applicable for virtual datasets - ds = hub.dataset(path=self.info["source-dataset"], verbose=False) + """Returns a view for this VDS. Only works if this Dataset is a virtual dataset. + + Raises: + Exception: If this is not a VDS. + """ + try: + ds = hub.dataset(path=self.info["source-dataset"], verbose=False) + except KeyError: + raise Exception("Dataset._get_view() works only for virtual datasets.") ds = ds[self.VDS_INDEX.numpy().reshape(-1).tolist()] ds._vds = self return ds - def _get_empty_vds(self, vds_path=None, query=None, **vds_args): + def _get_empty_vds( + self, vds_path: Optional[str] = None, query: Optional[str] = None, **vds_args + ): + """Returns an empty VDS with this dataset as the source dataset. Internal. + + Args: + vds_path (Optional, str): If specified, the vds will be stored at this path. Else the vds will be stored + under `.queries` subdirectory. + query (Optional, str): Query string associated with this view. + vds_args (dict): Additional args for creating vds when path is specified. + + Returns: + Empty VDS with this dataset as the source dataset. + """ view = self[:0] if query: view._query = query - return view.store(vds_path, _ret_ds=True, **vds_args) + return view._store(vds_path, _ret_ds=True, **vds_args) def _get_query_history(self) -> List[str]: """ @@ -1491,12 +1548,20 @@ def _get_query_history(self) -> List[str]: except KeyError: return [] - def _sub_ds(self, path, empty=False): + def _sub_ds( + self, + path, + empty=False, + memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE, + local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE, + ): """Loads a nested dataset. Internal. Note: Virtual datasets are returned as such, they are not converted to views. Args: empty (bool): If True, all contents of the sub directory is cleared before initializing the sub dataset. + memory_cache_size (int): Memory cache size for the sub dataset. + local_cache_size (int): Local storage cache size for the sub dataset. Returns: Sub dataset @@ -1514,15 +1579,20 @@ def _sub_ds(self, path, empty=False): return cls( generate_chain( sub_storage, - DEFAULT_MEMORY_CACHE_SIZE * MB, - DEFAULT_LOCAL_CACHE_SIZE * MB, + memory_cache_size * MB, + local_cache_size * MB, ), path=path, token=self._token, ) def _get_stored_vds(self, hash: str): - """ - Internal. + """Returns a vds stored under the `.queries` subdirectory given its hash. + + Args: + hash (str): Hash of the required vds. + + Returns: + VDS with the specified hash. """ return self._get_sub_ds(".queries/" + hash) diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index d8ae0310bd..23adc5a82d 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -6,6 +6,7 @@ from hub.core.io import SampleStreaming from hub.util.compute import get_compute_provider from hub.util.dataset import map_tensor_keys +from hub.constants import QUERY_PROGRESS_UPDATE_FREQUENCY from time import time import inspect @@ -17,20 +18,16 @@ from hub.util.hash import hash_inputs -# Frequency for sending progress events and writing to vds -_UPDATE_FREQUENCY = 5 # seconds - - _LAST_UPDATED_TIMES: Dict = defaultdict(time) def _counter(id): - """A method which returns True only every `_UPDATE_FREQUENCY` seconds for each id. 
+ """A method which returns True only every `QUERY_PROGRESS_UPDATE_FREQUENCY` seconds for each id. Used for sending query progress update events and writing to vds. """ last_updated_time = _LAST_UPDATED_TIMES[id] curr_time = time() - if curr_time - last_updated_time > _UPDATE_FREQUENCY: + if curr_time - last_updated_time > QUERY_PROGRESS_UPDATE_FREQUENCY: _LAST_UPDATED_TIMES[id] = curr_time return True return False @@ -189,9 +186,10 @@ def filter_slice(indices: Sequence[int]): result.append(i) if vds: vds_queue.put((i, True)) + _event_callback() elif vds: vds_queue.put((i, False)) - _event_callback() + _event_callback() return result def pg_filter_slice(pg_callback, indices: Sequence[int]): @@ -201,18 +199,19 @@ def pg_filter_slice(pg_callback, indices: Sequence[int]): result.append(i) if vds: vds_queue.put((i, True)) + _event_callback() elif vds: vds_queue.put((i, False)) + _event_callback() pg_callback(1) - _event_callback() return result result: Sequence[List[int]] idx: List[List[int]] = [block.indices() for block in blocks] - - dataset._send_query_progress( - query_text=query_text, query_id=query_id, start=True, progress=0 - ) + if vds: + dataset._send_query_progress( + query_text=query_text, query_id=query_id, start=True, progress=0 + ) try: if progressbar: @@ -220,21 +219,23 @@ def pg_filter_slice(pg_callback, indices: Sequence[int]): else: result = compute.map(filter_slice, idx) # type: ignore index_map = [k for x in result for k in x] # unfold the result map - dataset._send_query_progress( - query_text=query_text, - query_id=query_id, - end=True, - progress=100, - status="success", - ) + if vds: + dataset._send_query_progress( + query_text=query_text, + query_id=query_id, + end=True, + progress=100, + status="success", + ) except Exception as e: - dataset._send_query_progress( - query_text=query_text, - query_id=query_id, - end=True, - progress=100, - status="failed", - ) + if vds: + dataset._send_query_progress( + query_text=query_text, + query_id=query_id, + end=True, + progress=100, + status="failed", + ) raise FilterError(e) finally: @@ -273,9 +274,10 @@ def filter_inplace( query_id = hash_inputs(dataset.path, dataset.pending_commit_id, query_text) - dataset._send_query_progress( - query_text=query_text, query_id=query_id, start=True, progress=0 - ) + if vds: + dataset._send_query_progress( + query_text=query_text, query_id=query_id, start=True, progress=0 + ) try: for i, sample_in in it: @@ -285,28 +287,30 @@ def filter_inplace( vds_queue.put((i, True)) elif vds: vds_queue.put((i, False)) - if _counter(query_id): + if vds and _counter(query_id): dataset._send_query_progress( query_text=query_text, query_id=query_id, progress=int(i * 100 / num_samples), status="success", ) - dataset._send_query_progress( - query_text=query_text, - query_id=query_id, - end=True, - progress=100, - status="success", - ) + if vds: + dataset._send_query_progress( + query_text=query_text, + query_id=query_id, + end=True, + progress=100, + status="success", + ) except Exception as e: - dataset._send_query_progress( - query_text=query_text, - query_id=query_id, - end=True, - progress=100, - status="failed", - ) + if vds: + dataset._send_query_progress( + query_text=query_text, + query_id=query_id, + end=True, + progress=100, + status="failed", + ) raise (e) finally: if vds: diff --git a/hub/core/storage/s3.py b/hub/core/storage/s3.py index 475c3007d2..7991aa3754 100644 --- a/hub/core/storage/s3.py +++ b/hub/core/storage/s3.py @@ -95,7 +95,7 @@ def __init__( self._initialize_s3_parameters() def 
subdir(self, path: str): - return self.__class__( + sd = self.__class__( root=posixpath.join(self.root, path), aws_access_key_id=self.aws_access_key_id, aws_secret_access_key=self.aws_secret_access_key, @@ -103,6 +103,9 @@ def subdir(self, path: str): aws_region=self.aws_region, endpoint_url=self.endpoint_url, ) + if sd.expiration: + sd._set_hub_creds_info(self.hub_path, self.expiration) + return sd def __setitem__(self, path, content): """Sets the object present at the path with the value From ed277c215d2cbd60dbd5d9b16e2822a0b830b2c0 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Dec 2021 21:07:54 +0530 Subject: [PATCH 34/58] format --- hub/core/dataset/dataset.py | 1 + hub/core/query/filter.py | 3 +++ hub/core/query/test/test_query.py | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index ea2bde042f..ec038e20fa 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1565,6 +1565,7 @@ def _sub_ds( Note: Virtual datasets are returned as such, they are not converted to views. Args: + path (str): Path to sub directory empty (bool): If True, all contents of the sub directory is cleared before initializing the sub dataset. memory_cache_size (int): Memory cache size for the sub dataset. local_cache_size (int): Local storage cache size for the sub dataset. diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 23adc5a82d..c256ec1e15 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -122,6 +122,9 @@ def _get_vds_thread(vds: hub.Dataset, queue: Queue, num_samples: int): where the int is a sample index and the bool is whether or not to include the sample index in the vds. num_samples (int): Total number of samples in the source dataset. 
+ + Returns: + threading.Thread object """ id = str(uuid4().hex) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index f90132d3fc..33e320c79a 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -5,7 +5,6 @@ from hub.core.query import DatasetQuery from hub.core.query.query import EvalGenericTensor, EvalLabelClassTensor from hub.core.index import Index -from hub.tests.dataset_fixtures import enabled_persistent_dataset_generators import hub @@ -200,7 +199,18 @@ def test_dataset_view_save(): np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) -@enabled_persistent_dataset_generators +pytest.mark.parametrize( + "ds_generator", + [ + "local_ds_generator", + "s3_ds_generator", + "gcs_ds_generator", + "hub_cloud_ds_generator", + ], + indirect=True, +) + + @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("num_workers", [0, 2]) @pytest.mark.parametrize("read_only", [False, True]) From 0a05f84e2f465b8cc7a27b1e5b98982101958955 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Dec 2021 21:26:13 +0530 Subject: [PATCH 35/58] mypy --- hub/core/storage/__init__.py | 4 +++- hub/core/storage/s3.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/hub/core/storage/__init__.py b/hub/core/storage/__init__.py index 5644814f26..b030d4c74a 100644 --- a/hub/core/storage/__init__.py +++ b/hub/core/storage/__init__.py @@ -3,4 +3,6 @@ from hub.core.storage.memory import MemoryProvider from hub.core.storage.local import LocalProvider from hub.core.storage.lru_cache import LRUCache -from hub.core.storage.gcs import GCSProvider + +# from hub.core.storage.gcs import GCSProvider +GCSProvider = S3Provider diff --git a/hub/core/storage/s3.py b/hub/core/storage/s3.py index 7991aa3754..8ba9461ae4 100644 --- a/hub/core/storage/s3.py +++ b/hub/core/storage/s3.py @@ -104,7 +104,7 @@ def subdir(self, path: str): endpoint_url=self.endpoint_url, ) if sd.expiration: - sd._set_hub_creds_info(self.hub_path, self.expiration) + sd._set_hub_creds_info(self.hub_path, self.expiration) # type: ignore return sd def __setitem__(self, path, content): From b9b22db2fc7e39efca4b7eabca67ca38c6552f01 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Dec 2021 21:52:19 +0530 Subject: [PATCH 36/58] fixture fix --- hub/core/query/test/test_query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 33e320c79a..e8252ba841 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -200,7 +200,7 @@ def test_dataset_view_save(): pytest.mark.parametrize( - "ds_generator", + "ds_gen", [ "local_ds_generator", "s3_ds_generator", @@ -214,8 +214,8 @@ def test_dataset_view_save(): @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("num_workers", [0, 2]) @pytest.mark.parametrize("read_only", [False, True]) -def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only): - ds = ds_generator() +def test_inplace_dataset_view_save(ds_gen, stream, num_workers, read_only): + ds = ds_gen() if read_only and not ds.path.startswith("hub://"): return with ds: From 2a36b555713bd426cc6c950a396f6e926ef352a5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Dec 2021 22:06:06 +0530 Subject: [PATCH 37/58] darg fix --- hub/core/dataset/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hub/core/dataset/dataset.py 
b/hub/core/dataset/dataset.py
index ec038e20fa..8a6b3e4717 100644
--- a/hub/core/dataset/dataset.py
+++ b/hub/core/dataset/dataset.py
@@ -1475,7 +1475,7 @@ def _store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args):
                 the VDS is stored under `.queries` subdirectory of the source dataset's storage. If the user doesn't have
                 write access to the source dataset and the source dataset is a hub cloud dataset, then the VDS is stored
                 under the user's hub account and can be accessed using hub.load(f"hub://{username}/queries/{query_hash}").
-            _ret_ds (Optional, str): If True, the VDS is returned as such without converting it to a view. If False, the VDS path is returned.
+            _ret_ds bool: If True, the VDS is returned as such without converting it to a view. If False, the VDS path is returned.
                 Default False.
             ds_args (dict): Additional args for creating VDS when path is specified. (See documentation for `hub.dataset()`)
@@ -1514,6 +1514,9 @@ def _store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args):
     def _get_view(self):
         """Returns a view for this VDS. Only works if this Dataset is a virtual dataset.
 
+        Returns:
+            A view of the source dataset based on the indices from VDS.
+
         Raises:
             Exception: If this is not a VDS.
         """

From 94de43053777244ac82bfb5c4253149542c32681 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 30 Dec 2021 22:07:26 +0530
Subject: [PATCH 38/58] rem debug ln

---
 hub/core/storage/__init__.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hub/core/storage/__init__.py b/hub/core/storage/__init__.py
index b030d4c74a..5644814f26 100644
--- a/hub/core/storage/__init__.py
+++ b/hub/core/storage/__init__.py
@@ -3,6 +3,4 @@
 from hub.core.storage.memory import MemoryProvider
 from hub.core.storage.local import LocalProvider
 from hub.core.storage.lru_cache import LRUCache
-
-# from hub.core.storage.gcs import GCSProvider
-GCSProvider = S3Provider
+from hub.core.storage.gcs import GCSProvider

From 900c7c401a65a14b5aa1323fbcbd3912a81b74bf Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 30 Dec 2021 22:26:21 +0530
Subject: [PATCH 39/58] darg

---
 hub/core/dataset/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py
index 8a6b3e4717..a93d8a5821 100644
--- a/hub/core/dataset/dataset.py
+++ b/hub/core/dataset/dataset.py
@@ -1475,7 +1475,7 @@ def _store(self, path: Optional[str] = None, _ret_ds: bool = False, **ds_args):
                 the VDS is stored under `.queries` subdirectory of the source dataset's storage. If the user doesn't have
                 write access to the source dataset and the source dataset is a hub cloud dataset, then the VDS is stored
                 under the user's hub account and can be accessed using hub.load(f"hub://{username}/queries/{query_hash}").
-            _ret_ds bool: If True, the VDS is returned as such without converting it to a view. If False, the VDS path is returned.
+            _ret_ds (bool): If True, the VDS is returned as such without converting it to a view. If False, the VDS path is returned.
                 Default False.
             ds_args (dict): Additional args for creating VDS when path is specified.
(See documentation for `hub.dataset()`) From c7303333a6ae26719e17921123a0d849ab6fd77d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 31 Dec 2021 13:05:30 +0530 Subject: [PATCH 40/58] dbg --- hub/core/dataset/dataset.py | 16 +----- hub/core/query/filter.py | 61 +++++++++++++++-------- hub/core/query/query.py | 8 +-- hub/core/query/test/test_query.py | 83 ++++++------------------------- hub/core/storage/__init__.py | 3 +- 5 files changed, 65 insertions(+), 106 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 1ee99555c0..692b7d67de 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -951,20 +951,8 @@ def filter( from hub.core.query import filter_dataset, query_dataset from hub.core.query import DatasetQuery - if isinstance(function, str): - return query_dataset( - self, - function, - num_workers=num_workers, - scheduler=scheduler, - progressbar=progressbar, - store_result=store_result, - result_path=result_path, - result_ds_args=result_ds_args, - ) - else: - - return filter_dataset( + fn = query_dataset if isinstance(function, str) else filter_dataset + return fn( self, function, num_workers=num_workers, diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index ded45b7af5..8274820469 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional, Sequence, Dict, final +from typing import Callable, List, Optional, Sequence, Dict from uuid import uuid4 import hub @@ -137,6 +137,7 @@ def loop(): if include: vds.VDS_INDEX.append(index) processed += 1 + print(processed, num_samples) if processed == num_samples: vds.flush() _del_counter(id) @@ -342,10 +343,12 @@ def query_dataset( if store_result else None ) - index_map = query_inplace(dataset, query, progressbar, num_workers, scheduler, vds) - return dataset[index_map] # type: ignore [this is fine] + ret = dataset[index_map] # type: ignore [this is fine] + if vds: + ret._vds = vds + return ret def query_inplace( @@ -358,14 +361,14 @@ def query_inplace( ) -> List[int]: num_samples = len(dataset) - compute = get_compute_provider(scheduler=scheduler, num_workers=num_workers) + compute = get_compute_provider(scheduler=scheduler, num_workers=num_workers) if num_workers > 0 else None query_id = hash_inputs(dataset.path, dataset.pending_commit_id, query) if vds: vds.autoflush = False vds.info["total_samples"] = num_samples vds.info["samples_processed"] = 0 - vds_queue = compute.create_queue() + vds_queue = Queue() if num_workers == 0 else compute.create_queue() vds_thread = _get_vds_thread(vds, vds_queue, num_samples) vds_thread.start() dataset._send_query_progress( @@ -373,7 +376,9 @@ def query_inplace( ) num_processed = {"value": 0} + def update_vds(idx, include): + print("update_vds", idx, include) if vds: vds_queue.put((idx, include)) num_processed["value"] += 1 @@ -384,10 +389,12 @@ def update_vds(idx, include): progress=int(num_processed["value"] * 100 / num_samples), status="success", ) + print("update_vds", "num_processed", num_processed) def subquery(dataset_query): + print("subquery()", id(num_processed), num_processed, dataset_query) dataset, query = dataset_query - + print(len(dataset)) if progressbar: from tqdm import tqdm @@ -396,19 +403,23 @@ def subquery(dataset_query): def update(idx, include): bar.update(1) update_vds(idx, include) + try: ds_query = DatasetQuery(dataset, query, update) - return ds_query.execute() + ret = ds_query.execute() finally: bar.close() else: - return 
DatasetQuery(dataset, query, update_vds).execute() - + print("DatasetQuery()") + ret = DatasetQuery(dataset, query, update_vds).execute() + print("after update", num_processed) + return ret def pg_subquery(pg_callback, dataset_query): def update(idx, include): update_vds(idx, include) pg_callback(1) + dataset, query = dataset_query ds_query = DatasetQuery(dataset, query, progress_callback=update) return ds_query.execute() @@ -426,7 +437,7 @@ def update(idx, include): ] if progressbar: - result = compute.map_with_progressbar(pg_subquery, subdatasets, total_length=len(dataset)) # type: ignore + result = compute.map_with_progressbar(pg_subquery, subdatasets, total_length=num_samples) # type: ignore else: result = compute.map(subquery, subdatasets) # type: ignore @@ -435,19 +446,27 @@ def update(idx, include): compute.close() except Exception as e: dataset._send_query_progress( - query_text=query, - query_id=query_id, - end=True, - progress=100, - status="failed", - ) + query_text=query, + query_id=query_id, + end=True, + progress=100, + status="failed", + ) raise e finally: + if vds: + vds.autoflush = True + print("joining...") + vds_thread.join() + print("done.") + if hasattr(vds_queue, "close"): + vds_queue.close() _del_counter(query_id) dataset._send_query_progress( - query_text=query, - query_id=query_id, - end=True, - progress=100, - status="success",) + query_text=query, + query_id=query_id, + end=True, + progress=100, + status="success", + ) return index_map diff --git a/hub/core/query/query.py b/hub/core/query/query.py index 62d22b1d75..49378519f9 100644 --- a/hub/core/query/query.py +++ b/hub/core/query/query.py @@ -29,6 +29,7 @@ def __init__( self._wrappers = self._export_tensors() def execute(self) -> List[int]: + print("execute()", len(self._dataset)) idx_map: List[int] = list() max_size = len(self._dataset) @@ -42,12 +43,13 @@ def execute(self) -> List[int]: tensor: self._wrap_value(tensor, cache[tensor][local_idx]) for tensor in self._tensors } + print("pg_callback()", idx) if eval(self._cquery, p): idx_map.append(int(idx)) - self._pg_callback(idx, True) + self._pg_callback(int(idx), True) else: - self._pg_callback(idx, False) - + self._pg_callback(int(idx), False) + print("execute() Done.") return idx_map def _wrap_value(self, tensor, val): diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 7283dfcc43..708d14b2b4 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -3,8 +3,6 @@ import numpy as np from hub.core.query import DatasetQuery -from hub.core.query.query import EvalGenericTensor, EvalLabelClassTensor -from hub.core.index import Index import hub @@ -25,59 +23,9 @@ def _populate_data(ds, n=1): @pytest.fixture -def sample_ds(memory_ds): - _populate_data(memory_ds) - return memory_ds - - -def test_tensor_functions(sample_ds): - for ind, row in enumerate(rows): - i = EvalGenericTensor(DatasetQuery(sample_ds[ind], ""), sample_ds[ind].images) - i.at_index(Index(ind)) - - l = EvalGenericTensor(DatasetQuery(sample_ds[ind], ""), sample_ds[ind].labels) - l.at_index(Index(ind)) - - assert i.min == min(row["images"]) - assert i.max == max(row["images"]) - assert i.mean == sum(row["images"]) / len(row["images"]) - assert i.shape[0] == len(row["images"]) - assert i.size == len(row["images"]) - assert i[1] == row["images"][1] - - assert l == row["labels"][0] - assert l != row["labels"][0] + 2 - assert l > row["labels"][0] - 1 - assert l < row["labels"][0] + 1 - assert l >= row["labels"][0] - assert l <= 
row["labels"][0] - - -def test_class_label_tensor_function(sample_ds): - eval_object = EvalLabelClassTensor( - DatasetQuery(sample_ds[0], ""), sample_ds[0].labels - ) - eval_object.at_index(Index(0)) - assert eval_object == "dog" - - eval_object = EvalLabelClassTensor( - DatasetQuery(sample_ds[1], ""), sample_ds[1].labels - ) - eval_object.at_index(Index(1)) - assert eval_object == "cat" - - -def test_tensor_subscript(memory_ds): - arr = [[[1], [2]], [[2], [3]], [[4], [5]]] - - memory_ds.create_tensor("images") - memory_ds.images.append(arr) - - i = EvalGenericTensor(DatasetQuery(memory_ds[0], ""), memory_ds[0].images) - i.at_index(Index(0)) - - assert i[2, 1] == arr[2][1] - assert i[1].min == min(arr[1])[0] +def sample_ds(local_ds): + _populate_data(local_ds) + return local_ds @pytest.mark.parametrize( @@ -156,25 +104,26 @@ def test_dataset_view_save(): np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) -pytest.mark.parametrize( - "ds_gen", +@pytest.mark.parametrize( + "ds_generator", [ "local_ds_generator", - "s3_ds_generator", - "gcs_ds_generator", - "hub_cloud_ds_generator", - ], + # "s3_ds_generator", + # "gcs_ds_generator", + # "hub_cloud_ds_generator", + ], indirect=True ) -@pytest.mark.parametrize("stream", [False, True]) -@pytest.mark.parametrize("num_workers", [0, 2]) -@pytest.mark.parametrize("read_only", [False, True]) -def test_inplace_dataset_view_save(ds_gen, stream, num_workers, read_only): - ds = ds_gen() +@pytest.mark.parametrize("stream", [False]) +@pytest.mark.parametrize("num_workers", [2]) +@pytest.mark.parametrize("read_only", [False]) +@pytest.mark.parametrize("progressbar", [True]) +def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only, progressbar): + ds = ds_generator() if read_only and not ds.path.startswith("hub://"): return with ds: _populate_data(ds, n=2) - view = ds.filter("labels == 'dog'", store_result=stream, num_workers=num_workers) + view = ds.filter("labels == 'dog'", store_result=stream, num_workers=num_workers, progressbar=progressbar) assert len(ds._get_query_history()) == int(stream) vds_path = view.store() assert len(ds._get_query_history()) == 1 diff --git a/hub/core/storage/__init__.py b/hub/core/storage/__init__.py index 5644814f26..12e9000145 100644 --- a/hub/core/storage/__init__.py +++ b/hub/core/storage/__init__.py @@ -3,4 +3,5 @@ from hub.core.storage.memory import MemoryProvider from hub.core.storage.local import LocalProvider from hub.core.storage.lru_cache import LRUCache -from hub.core.storage.gcs import GCSProvider +# from hub.core.storage.gcs import GCSProvider +GCSProvider = S3Provider From 6e2d830643823d95f70f4d65c086d6852e578b96 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 31 Dec 2021 13:30:04 +0530 Subject: [PATCH 41/58] updates --- hub/core/query/filter.py | 9 --------- hub/core/query/query.py | 7 ++++--- hub/core/query/test/test_query.py | 22 +++++++--------------- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 8274820469..c525a962de 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -137,7 +137,6 @@ def loop(): if include: vds.VDS_INDEX.append(index) processed += 1 - print(processed, num_samples) if processed == num_samples: vds.flush() _del_counter(id) @@ -378,7 +377,6 @@ def query_inplace( num_processed = {"value": 0} def update_vds(idx, include): - print("update_vds", idx, include) if vds: vds_queue.put((idx, include)) num_processed["value"] += 1 @@ -389,12 +387,9 @@ def 
update_vds(idx, include): progress=int(num_processed["value"] * 100 / num_samples), status="success", ) - print("update_vds", "num_processed", num_processed) def subquery(dataset_query): - print("subquery()", id(num_processed), num_processed, dataset_query) dataset, query = dataset_query - print(len(dataset)) if progressbar: from tqdm import tqdm @@ -410,9 +405,7 @@ def update(idx, include): finally: bar.close() else: - print("DatasetQuery()") ret = DatasetQuery(dataset, query, update_vds).execute() - print("after update", num_processed) return ret def pg_subquery(pg_callback, dataset_query): @@ -456,9 +449,7 @@ def update(idx, include): finally: if vds: vds.autoflush = True - print("joining...") vds_thread.join() - print("done.") if hasattr(vds_queue, "close"): vds_queue.close() _del_counter(query_id) diff --git a/hub/core/query/query.py b/hub/core/query/query.py index 49378519f9..e16c4030a0 100644 --- a/hub/core/query/query.py +++ b/hub/core/query/query.py @@ -32,7 +32,7 @@ def execute(self) -> List[int]: print("execute()", len(self._dataset)) idx_map: List[int] = list() max_size = len(self._dataset) - + num_samples_processed = 0 for f in self._np_access: cache = {tensor: f(tensor) for tensor in self._tensors} for local_idx, idx in enumerate(f("index")): @@ -43,13 +43,14 @@ def execute(self) -> List[int]: tensor: self._wrap_value(tensor, cache[tensor][local_idx]) for tensor in self._tensors } - print("pg_callback()", idx) + num_samples_processed += 1 if eval(self._cquery, p): idx_map.append(int(idx)) self._pg_callback(int(idx), True) else: self._pg_callback(int(idx), False) - print("execute() Done.") + print("execute() Done. num_samples_processed: ", num_samples_processed) + assert num_samples_processed == len(self._dataset) return idx_map def _wrap_value(self, tensor, val): diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 708d14b2b4..5a48b7a5db 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -77,21 +77,13 @@ def test_query_scheduler(local_ds): ds.create_tensor("labels") ds.labels.extend(np.arange(10_000)) - def filter_result(ds): - return ds[0].labels.numpy() - - assert ( - filter_result(ds.filter("labels == 3141", num_workers=2, progressbar=False)) - == 3141 - ) - assert ( - filter_result( - ds.filter( - lambda s: s.labels.numpy() == 3141, num_workers=2, progressbar=False - ) - ) - == 3141 - ) + f1 = "labels == 3141" + f2 = lambda s: s.labels.numpy() == 3141 + + view1 = ds.filter(f1, num_workers=2, progressbar=True) + view2 = ds.filter(f2, num_workers=2, progressbar=True) + + np.testing.assert_array_equal(view1.labels.numpy(), view2.labels.numpy()) def test_dataset_view_save(): From b63497dc31437d281d4be75ed7b1de7d30e33e62 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 17 Jan 2022 18:18:20 +0530 Subject: [PATCH 42/58] fixes --- hub/core/dataset/dataset.py | 2 +- hub/core/query/filter.py | 62 +++++++++++++++---------------- hub/core/query/query.py | 29 +++++++++++---- hub/core/query/test/test_query.py | 35 +++++++---------- 4 files changed, 67 insertions(+), 61 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 9c0a2b405f..462a5538f1 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1438,7 +1438,7 @@ def _store_view_in_user_queries_dataset(self): vds = hub.empty(path, overwrite=True) - self._write_vds() + self._write_vds(vds) Dataset._write_queries_json(queries_ds, info) diff --git a/hub/core/query/filter.py 
b/hub/core/query/filter.py index 6cca40beca..1373139642 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -245,11 +245,12 @@ def pg_filter_slice(pg_callback, indices: Sequence[int]): finally: compute.close() if vds: - vds.autoflush = True - vds_thread.join() if hasattr(vds_queue, "close"): vds_queue.close() _del_counter(query_id) + if vds: + vds.autoflush = True + vds_thread.join() return index_map @@ -317,11 +318,11 @@ def filter_inplace( ) raise (e) finally: - if vds: - vds.autoflush = True - vds_thread.join() _del_counter(query_id) + if vds: + vds.autoflush = True + vds_thread.join() return index_map @@ -343,7 +344,6 @@ def query_dataset( else None ) index_map = query_inplace(dataset, query, progressbar, num_workers, scheduler, vds) - ret = dataset[index_map] # type: ignore [this is fine] if vds: ret._vds = vds @@ -429,29 +429,28 @@ def update(idx, include): ds_query = DatasetQuery(dataset, query, progress_callback=update) return ds_query.execute() - if num_workers == 0: - return subquery(QuerySlice(0, len(dataset), dataset, query)) - - compute = get_compute_provider(scheduler=scheduler, num_workers=num_workers) try: - btch = len(dataset) // num_workers - subdatasets = [ - QuerySlice(idx * btch, btch, dataset, query) - for idx in range(0, num_workers) - ] - if num_workers == 0: - return subquery((dataset, query)) - - if progressbar: - result = compute.map_with_progressbar(pg_subquery, subdatasets, total_length=num_samples) # type: ignore + index_map = subquery(QuerySlice(0, len(dataset), dataset, query)) else: - result = compute.map(subquery, subdatasets) # type: ignore + compute = get_compute_provider(scheduler=scheduler, num_workers=num_workers) - index_map = [ - k + dataset_slice.offset - for x, dataset_slice in zip(result, subdatasets) - for k in x] # unfold the result map + btch = len(dataset) // num_workers + subdatasets = [ + QuerySlice(idx * btch, btch, dataset, query) + for idx in range(0, num_workers) + ] + + + if progressbar: + result = compute.map_with_progressbar(pg_subquery, subdatasets, total_length=num_samples) # type: ignore + else: + result = compute.map(subquery, subdatasets) # type: ignore + + index_map = [ + k + dataset_slice.offset + for x, dataset_slice in zip(result, subdatasets) + for k in x] # unfold the result map except Exception as e: dataset._send_query_progress( query_text=query, @@ -462,12 +461,10 @@ def update(idx, include): ) raise e finally: - compute.close() - if vds: - vds.autoflush = True - vds_thread.join() - if hasattr(vds_queue, "close"): - vds_queue.close() + if vds and hasattr(vds_queue, "close"): + vds_queue.close() + if compute: + compute.close() _del_counter(query_id) dataset._send_query_progress( query_text=query, @@ -476,4 +473,7 @@ def update(idx, include): progress=100, status="success", ) + if vds: + vds.autoflush = True + vds_thread.join() return index_map diff --git a/hub/core/query/query.py b/hub/core/query/query.py index 5da800aad3..0f947d8165 100644 --- a/hub/core/query/query.py +++ b/hub/core/query/query.py @@ -16,12 +16,10 @@ def __init__( self, dataset, query: str, - offset: int = 0, progress_callback: Callable[[int], None] = lambda *_: None, ): self._dataset = dataset self._query = query - self._offset = offset self._pg_callback = progress_callback self._cquery = compile(query, "", "eval") self._tensors = [ @@ -36,11 +34,8 @@ def __init__( self._groups = self._export_groups(self._wrappers) def execute(self) -> List[int]: - print("execute()", len(self._dataset)) idx_map: List[int] = list() max_size = 
len(self._dataset) - num_samples_processed = 0 - offset = self._offset for f in self._np_access: cache = {tensor: f(tensor) for tensor in self._tensors} for local_idx in range(max_size): @@ -49,15 +44,12 @@ def execute(self) -> List[int]: tensor: self._wrap_value(tensor, cache[tensor][local_idx]) for tensor in self._tensors } - num_samples_processed += 1 p.update(self._groups) if eval(self._cquery, p): idx_map.append(global_idx) self._pg_callback(global_idx, True) else: self._pg_callback(global_idx, False) - print("execute() Done. num_samples_processed: ", num_samples_processed) - assert num_samples_processed == len(self._dataset) return idx_map def _wrap_value(self, tensor, val): @@ -175,6 +167,27 @@ def __ge__(self, o: object) -> bool: def __ne__(self, o: object) -> bool: return self.val != o + def __mod__(self, o: object): + return self.val % o + + def __add__(self, o: object): + return self.val + o + + def __sub__(self, o: object): + return self.val - o + + def __div__(self, o: object): + return self.val / o + + def __floordiv__(self, o: object): + return self.val // o + + def __mul__(self, o: object): + return self.val * o + + def __pow__(self, o: object): + return self.val ** o + class GroupTensor: def __init__(self, dataset: Dataset, wrappers, prefix: str) -> None: diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 94405467ac..bdcc53fb9f 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -3,6 +3,8 @@ import numpy as np from hub.core.query import DatasetQuery +from hub.util.remove_cache import get_base_storage +from hub.core.storage import LocalProvider import hub @@ -100,22 +102,23 @@ def test_dataset_view_save(): "ds_generator", [ "local_ds_generator", - # "s3_ds_generator", + "s3_ds_generator", # "gcs_ds_generator", - # "hub_cloud_ds_generator", + "hub_cloud_ds_generator", ], indirect=True ) -@pytest.mark.parametrize("stream", [False]) -@pytest.mark.parametrize("num_workers", [2]) -@pytest.mark.parametrize("read_only", [False]) -@pytest.mark.parametrize("progressbar", [True]) -def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only, progressbar): +@pytest.mark.parametrize("stream", [False, True]) +@pytest.mark.parametrize("num_workers", [0, 2]) +@pytest.mark.parametrize("read_only", [False, True]) +@pytest.mark.parametrize("progressbar", [False, True]) +@pytest.mark.parametrize("query_type", ["string", "function"]) +def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only, progressbar, query_type): ds = ds_generator() if read_only and not ds.path.startswith("hub://"): return - with ds: - _populate_data(ds, n=2) - view = ds.filter("labels == 'dog'", store_result=stream, num_workers=num_workers, progressbar=progressbar) + _populate_data(ds, n=2) + f = "labels == 'dog'" if query_type == "string" else lambda s: s.labels == "dog" + view = ds.filter(f, store_result=stream, num_workers=num_workers, progressbar=progressbar) assert len(ds._get_query_history()) == int(stream) vds_path = view.store() assert len(ds._get_query_history()) == 1 @@ -132,17 +135,7 @@ def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only, # Delete queries ds from testing acc: org = ds.path[6:].split("/")[1] hub.delete(f"hub://{org}/queries", large_ok=True) - def filter_result(ds): - return ds[0].labels.numpy() - - assert ( - ds.filter("labels == 3141", num_workers=2, progressbar=False)[0].labels.numpy() - == 3141 - ) - assert ( - ds.filter(lambda s: s.labels.numpy() 
== 3141, num_workers=2, progressbar=False)[0].labels.numpy() - == 3141 - ) + def test_group(local_ds): From 49905c0ca5d8cd4ed73173614a2760d8a4a90bba Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 17 Jan 2022 18:19:01 +0530 Subject: [PATCH 43/58] imp fix --- hub/core/storage/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hub/core/storage/__init__.py b/hub/core/storage/__init__.py index 12e9000145..5644814f26 100644 --- a/hub/core/storage/__init__.py +++ b/hub/core/storage/__init__.py @@ -3,5 +3,4 @@ from hub.core.storage.memory import MemoryProvider from hub.core.storage.local import LocalProvider from hub.core.storage.lru_cache import LRUCache -# from hub.core.storage.gcs import GCSProvider -GCSProvider = S3Provider +from hub.core.storage.gcs import GCSProvider From 70219dbea39d65173bbb567dd72672d4c5d6c364 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 17 Jan 2022 21:38:53 +0530 Subject: [PATCH 44/58] format --- hub/core/query/filter.py | 16 ++++++++++------ hub/core/query/test/test_query.py | 12 ++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 1373139642..504edc6808 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -360,7 +360,11 @@ def query_inplace( ) -> List[int]: num_samples = len(dataset) - compute = get_compute_provider(scheduler=scheduler, num_workers=num_workers) if num_workers > 0 else None + compute = ( + get_compute_provider(scheduler=scheduler, num_workers=num_workers) + if num_workers > 0 + else None + ) query_id = hash_inputs(dataset.path, dataset.pending_commit_id, query) if vds: @@ -388,7 +392,6 @@ def update_vds(idx, include): status="success", ) - class QuerySlice: def __init__(self, offset, size, dataset, query) -> None: self.offset = offset @@ -425,6 +428,7 @@ def pg_subquery(pg_callback, query_slice): def update(idx, include): update_vds(idx, include) pg_callback(1) + dataset = query_slice.slice_dataset() ds_query = DatasetQuery(dataset, query, progress_callback=update) return ds_query.execute() @@ -441,16 +445,16 @@ def update(idx, include): for idx in range(0, num_workers) ] - if progressbar: result = compute.map_with_progressbar(pg_subquery, subdatasets, total_length=num_samples) # type: ignore else: result = compute.map(subquery, subdatasets) # type: ignore index_map = [ - k + dataset_slice.offset - for x, dataset_slice in zip(result, subdatasets) - for k in x] # unfold the result map + k + dataset_slice.offset + for x, dataset_slice in zip(result, subdatasets) + for k in x + ] # unfold the result map except Exception as e: dataset._send_query_progress( query_text=query, diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index bdcc53fb9f..6d582e9aa6 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -105,20 +105,25 @@ def test_dataset_view_save(): "s3_ds_generator", # "gcs_ds_generator", "hub_cloud_ds_generator", - ], indirect=True + ], + indirect=True, ) @pytest.mark.parametrize("stream", [False, True]) @pytest.mark.parametrize("num_workers", [0, 2]) @pytest.mark.parametrize("read_only", [False, True]) @pytest.mark.parametrize("progressbar", [False, True]) @pytest.mark.parametrize("query_type", ["string", "function"]) -def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only, progressbar, query_type): +def test_inplace_dataset_view_save( + ds_generator, stream, num_workers, read_only, progressbar, query_type +): ds = 
ds_generator() if read_only and not ds.path.startswith("hub://"): return _populate_data(ds, n=2) f = "labels == 'dog'" if query_type == "string" else lambda s: s.labels == "dog" - view = ds.filter(f, store_result=stream, num_workers=num_workers, progressbar=progressbar) + view = ds.filter( + f, store_result=stream, num_workers=num_workers, progressbar=progressbar + ) assert len(ds._get_query_history()) == int(stream) vds_path = view.store() assert len(ds._get_query_history()) == 1 @@ -137,7 +142,6 @@ def test_inplace_dataset_view_save(ds_generator, stream, num_workers, read_only, hub.delete(f"hub://{org}/queries", large_ok=True) - def test_group(local_ds): with local_ds as ds: ds.create_tensor("labels/t1") From 43de6bc3a339769eb61fea9eadf5e96039f144c6 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 18 Jan 2022 11:55:02 +0530 Subject: [PATCH 45/58] events fix --- hub/core/dataset/hub_cloud_dataset.py | 9 +++++++-- hub/core/query/query.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/hub/core/dataset/hub_cloud_dataset.py b/hub/core/dataset/hub_cloud_dataset.py index fb9d97b280..6aea2528b4 100644 --- a/hub/core/dataset/hub_cloud_dataset.py +++ b/hub/core/dataset/hub_cloud_dataset.py @@ -6,6 +6,7 @@ from hub.client.log import logger from hub.util.agreement import handle_dataset_agreement from hub.util.path import is_hub_cloud_path +from hub.util.tag import process_hub_path from warnings import warn import time import hub @@ -53,8 +54,9 @@ def _set_org_and_name(self): if self.is_actually_cloud: if self.org_id is not None: return - split_path = self.path.split("/") - org_id, ds_name = split_path[2], split_path[3] + _, (org_id, ds_name), subdir = process_hub_path(self.path) + if subdir: + ds_name += "/" + subdir else: # if this dataset isn't actually pointing to a datset in the cloud # a.k.a this dataset is trying to simulate a hub cloud dataset @@ -67,6 +69,9 @@ def _set_org_and_name(self): def _register_dataset(self): # called in super()._populate_meta self._set_org_and_name() + if "/" in self.ds_name: + # Sub dataset + return self.client.create_dataset_entry( self.org_id, self.ds_name, diff --git a/hub/core/query/query.py b/hub/core/query/query.py index 0f947d8165..5adbf9e529 100644 --- a/hub/core/query/query.py +++ b/hub/core/query/query.py @@ -16,7 +16,7 @@ def __init__( self, dataset, query: str, - progress_callback: Callable[[int], None] = lambda *_: None, + progress_callback: Callable[[int, bool], None] = lambda *_: None, ): self._dataset = dataset self._query = query From baf38ad86f07d29443b56fd6b3b787821d930f8d Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Tue, 18 Jan 2022 14:10:08 +0530 Subject: [PATCH 46/58] mypy --- hub/core/query/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/query/filter.py b/hub/core/query/filter.py index 504edc6808..25f96effde 100644 --- a/hub/core/query/filter.py +++ b/hub/core/query/filter.py @@ -371,7 +371,7 @@ def query_inplace( vds.autoflush = False vds.info["total_samples"] = num_samples vds.info["samples_processed"] = 0 - vds_queue = Queue() if num_workers == 0 else compute.create_queue() + vds_queue = Queue() if num_workers == 0 else compute.create_queue() # type: ignore vds_thread = _get_vds_thread(vds, vds_queue, num_samples) vds_thread.start() dataset._send_query_progress( From 4290553f71c90f89f3e51e51b8cc6a26c334eb2f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Wed, 19 Jan 2022 16:54:20 +0530 Subject: [PATCH 47/58] smol fix --- hub/core/dataset/hub_cloud_dataset.py | 11 
+++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hub/core/dataset/hub_cloud_dataset.py b/hub/core/dataset/hub_cloud_dataset.py index 6aea2528b4..a022554b67 100644 --- a/hub/core/dataset/hub_cloud_dataset.py +++ b/hub/core/dataset/hub_cloud_dataset.py @@ -54,7 +54,7 @@ def _set_org_and_name(self): if self.is_actually_cloud: if self.org_id is not None: return - _, (org_id, ds_name), subdir = process_hub_path(self.path) + _, org_id, ds_name, subdir = process_hub_path(self.path) if subdir: ds_name += "/" + subdir else: @@ -66,11 +66,13 @@ def _set_org_and_name(self): self.__dict__["org_id"] = org_id self.__dict__["ds_name"] = ds_name + def _is_sub_ds(self): + return "/" in self.ds_name + def _register_dataset(self): # called in super()._populate_meta self._set_org_and_name() - if "/" in self.ds_name: - # Sub dataset + if self._is_sub_ds(): return self.client.create_dataset_entry( self.org_id, @@ -214,7 +216,8 @@ def make_private(self): def delete(self, large_ok=False): super().delete(large_ok=large_ok) - + if self._is_sub_ds(): + return self.client.delete_dataset_entry(self.org_id, self.ds_name) @property From 5234d92f124705cb5d282fa77fe78a47699bb337 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 01:09:21 +0530 Subject: [PATCH 48/58] test fix --- hub/core/query/test/test_query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 6d582e9aa6..5ded0f9681 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -117,6 +117,7 @@ def test_inplace_dataset_view_save( ds_generator, stream, num_workers, read_only, progressbar, query_type ): ds = ds_generator() + ds.read_only = read_only if read_only and not ds.path.startswith("hub://"): return _populate_data(ds, n=2) From 97cf742dce36a1bf335d698679e0832e1773f2a7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 05:14:03 +0530 Subject: [PATCH 49/58] readonly fix --- hub/core/query/test/test_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 5ded0f9681..98fb4af285 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -117,10 +117,10 @@ def test_inplace_dataset_view_save( ds_generator, stream, num_workers, read_only, progressbar, query_type ): ds = ds_generator() - ds.read_only = read_only if read_only and not ds.path.startswith("hub://"): return _populate_data(ds, n=2) + ds.read_only = read_only f = "labels == 'dog'" if query_type == "string" else lambda s: s.labels == "dog" view = ds.filter( f, store_result=stream, num_workers=num_workers, progressbar=progressbar From a4ee4ce9c24ac37c2905e6b23d20b5dacfea6a24 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 09:39:18 +0530 Subject: [PATCH 50/58] locking fix --- hub/core/dataset/dataset.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 462a5538f1..fd25b464b3 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1384,6 +1384,10 @@ def _get_view_info(self): @staticmethod def _write_queries_json(ds, info): base_storage = get_base_storage(ds.storage) + storage_read_only = storage.read_only + if ds._locked_out: + # Ignore storage level lock since we have file level lock + storage.read_only = False lock = Lock(base_storage, get_queries_lock_key()) lock.acquire(timeout=10, force=True) queries_key = 
get_queries_key() @@ -1396,6 +1400,7 @@ def _write_queries_json(ds, info): base_storage[queries_key] = json.dumps(queries).encode("utf-8") finally: lock.release() + storage.read_only = storage_read_only def _write_vds(self, vds): """Writes the indices of this view to a vds.""" From 9c906467ec816b7983e38d1f793ece6f63749790 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 10:35:53 +0530 Subject: [PATCH 51/58] locking fix --- hub/core/dataset/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index fd25b464b3..b68e93f0bb 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1384,10 +1384,10 @@ def _get_view_info(self): @staticmethod def _write_queries_json(ds, info): base_storage = get_base_storage(ds.storage) - storage_read_only = storage.read_only + storage_read_only = base_storage.read_only if ds._locked_out: # Ignore storage level lock since we have file level lock - storage.read_only = False + base_storage.read_only = False lock = Lock(base_storage, get_queries_lock_key()) lock.acquire(timeout=10, force=True) queries_key = get_queries_key() @@ -1400,7 +1400,7 @@ def _write_queries_json(ds, info): base_storage[queries_key] = json.dumps(queries).encode("utf-8") finally: lock.release() - storage.read_only = storage_read_only + base_storage.read_only = storage_read_only def _write_vds(self, vds): """Writes the indices of this view to a vds.""" From 23f782187063376330212c16ac0e9602d406cd94 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 13:04:41 +0530 Subject: [PATCH 52/58] locking fix --- hub/core/query/test/test_query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 98fb4af285..ff9649dccc 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -125,9 +125,9 @@ def test_inplace_dataset_view_save( view = ds.filter( f, store_result=stream, num_workers=num_workers, progressbar=progressbar ) - assert len(ds._get_query_history()) == int(stream) + assert read_only or len(ds._get_query_history()) == int(stream) vds_path = view.store() - assert len(ds._get_query_history()) == 1 + assert read_only or len(ds._get_query_history()) == 1 view2 = hub.dataset(vds_path) if ds.path.startswith("hub://"): assert vds_path.startswith("hub://") From 5bec509695fabdd5666ecb894f9787d4a5dcac33 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 14:22:39 +0530 Subject: [PATCH 53/58] cleanup fix --- hub/core/query/test/test_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index ff9649dccc..36c9eb07a0 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -139,7 +139,7 @@ def test_inplace_dataset_view_save( np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) if ds.path.startswith("hub://") and read_only: # Delete queries ds from testing acc: - org = ds.path[6:].split("/")[1] + org = ds.path[6:].split("/")[0] hub.delete(f"hub://{org}/queries", large_ok=True) From e49e2fc8d08c3ce78c6f6585a70781886860207f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 19:56:00 +0530 Subject: [PATCH 54/58] lock fix --- hub/core/dataset/dataset.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index 
b68e93f0bb..d4b79094a5 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1437,7 +1437,15 @@ def _store_view_in_user_queries_dataset(self): info = self._get_view_info() hash = info["id"] - queries_ds = hub.dataset(f"hub://{username}/queries") # create if doesn't exist + queries_ds_path = f"hub://{username}/queries" + + try: + queries_ds = hub.dataset( + queries_ds_path, verbose=False + ) # create if doesn't exist + except PathNotEmptyException: + hub.delete(queries_ds_path, force=True) + queries_ds = hub.dataset(queries_ds_path, verbose=False) path = f"hub://{username}/queries/{hash}" From f5bdfed8db45d10d38eceb0e2e0b89ecd0b674ad Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 20 Jan 2022 20:27:15 +0530 Subject: [PATCH 55/58] lock fix --- hub/core/dataset/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hub/core/dataset/dataset.py b/hub/core/dataset/dataset.py index d4b79094a5..d646165804 100644 --- a/hub/core/dataset/dataset.py +++ b/hub/core/dataset/dataset.py @@ -1447,6 +1447,8 @@ def _store_view_in_user_queries_dataset(self): hub.delete(queries_ds_path, force=True) queries_ds = hub.dataset(queries_ds_path, verbose=False) + queries_ds._unlock() # we don't need locking as no data will be added to this ds. + path = f"hub://{username}/queries/{hash}" vds = hub.empty(path, overwrite=True) From b4dcec0ccf03d866ea19a4e3fe43a73a9c0fe0bc Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 21 Jan 2022 00:42:50 +0530 Subject: [PATCH 56/58] fix test --- hub/core/query/test/test_query.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 36c9eb07a0..3ee43cb406 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -137,10 +137,6 @@ def test_inplace_dataset_view_save( assert ds.path + "/.queries/" in vds_path for t in view.tensors: np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy()) - if ds.path.startswith("hub://") and read_only: - # Delete queries ds from testing acc: - org = ds.path[6:].split("/")[0] - hub.delete(f"hub://{org}/queries", large_ok=True) def test_group(local_ds): From aba1e611397eeab8af3df5c6345e28f1cb1777c5 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 21 Jan 2022 13:21:44 +0530 Subject: [PATCH 57/58] tag fi --- hub/core/query/test/test_query.py | 7 ++++++- hub/util/tag.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/hub/core/query/test/test_query.py b/hub/core/query/test/test_query.py index 3ee43cb406..43df460e75 100644 --- a/hub/core/query/test/test_query.py +++ b/hub/core/query/test/test_query.py @@ -6,6 +6,7 @@ from hub.util.remove_cache import get_base_storage from hub.core.storage import LocalProvider import hub +from uuid import uuid4 first_row = {"images": [1, 2, 3], "labels": [0]} @@ -121,7 +122,11 @@ def test_inplace_dataset_view_save( return _populate_data(ds, n=2) ds.read_only = read_only - f = "labels == 'dog'" if query_type == "string" else lambda s: s.labels == "dog" + f = ( + f"labels == 'dog'#{uuid4().hex}" + if query_type == "string" + else lambda s: s.labels == "dog" + ) view = ds.filter( f, store_result=stream, num_workers=num_workers, progressbar=progressbar ) diff --git a/hub/util/tag.py b/hub/util/tag.py index 87cefbe528..1ed5247bf9 100644 --- a/hub/util/tag.py +++ b/hub/util/tag.py @@ -22,7 +22,7 @@ def process_hub_path(path: str) -> Tuple[str, str, str, str]: if len(s) == 3 and s[1] == "queries" and not s[2].startswith("."): # Special case: expand 
hub://username/queries/hash to hub://username/queries/.queries/hash - subdir = f"queries/.queries/{s[2]}" + subdir = f".queries/{s[2]}" else: subdir = "/".join(s[2:]) if len(s) > 2: From b0efe2307b118e6af138cc2ea36d9a79139fa633 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Fri, 21 Jan 2022 15:33:29 +0530 Subject: [PATCH 58/58] teardown fix --- hub/tests/path_fixtures.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hub/tests/path_fixtures.py b/hub/tests/path_fixtures.py index dac565652c..aa9f4361a6 100644 --- a/hub/tests/path_fixtures.py +++ b/hub/tests/path_fixtures.py @@ -248,7 +248,13 @@ def hub_cloud_path(request, hub_cloud_dev_token): # clear storage unless flagged otherwise if not is_opt_true(request, KEEP_STORAGE_OPT): - storage_provider_from_hub_path(path, token=hub_cloud_dev_token).clear() + try: + storage_provider_from_hub_path(path, token=hub_cloud_dev_token).clear() + except Exception: + # TODO: investigate flakey `BadRequestException: + # Invalid Request. One or more request parameters is incorrect.` + # (on windows 3.8 only) + pass @pytest.fixture
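
A minimal usage sketch of the view-saving workflow introduced by this series, modeled on test_dataset_view_save and test_inplace_dataset_view_save in hub/core/query/test/test_query.py. The local path ".tests/ds", the tensor names and the label values below are illustrative placeholders, not names introduced by the patches, and the snippet assumes a local (non-cloud) dataset:

    import hub
    import numpy as np

    # Build a tiny local dataset (placeholder path).
    ds = hub.dataset(".tests/ds", overwrite=True)
    with ds:
        ds.create_tensor("images")
        ds.create_tensor(
            "labels", htype="class_label", class_names=["dog", "cat", "fish"]
        )
        ds.images.append([1, 2, 3])
        ds.labels.append([0])  # "dog"
        ds.images.append([6, 7, 5])
        ds.labels.append([1])  # "cat"

    # Filter with a string query; the returned view keeps the query text.
    view = ds.filter("labels == 'dog'", num_workers=0, progressbar=False)

    # Persist the view as a virtual dataset (VDS) and reload it later.
    vds_path = view.store()
    view2 = hub.dataset(vds_path)
    for t in view.tensors:
        np.testing.assert_array_equal(view[t].numpy(), view2[t].numpy())

For a local source dataset the VDS lands under the dataset's .queries subdirectory; for a hub cloud dataset opened read-only, the patches fall back to storing it under the user's hub://{username}/queries dataset, as exercised by test_inplace_dataset_view_save.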