feat: storage option (#78)

* feat: add storage
* ci: add requirement and fix test
* feat: fix test and remove add magic method
* refactor: remove debug print and optional check
* refactor: address comments
* fix: flake8
* refactor: address comments
* feat: add serialize config
* refactor: type check
* fix: set_doc_value_pairs
* refactor: type check
docarray · Jan 28, 2022 · f55fb47 · f55fb47
1 parent 95d9f5a
commit f55fb47
Show file tree

Hide file tree

Showing 21 changed files with 1,006 additions and 98 deletions.
diff --git a/docarray/array/document.py b/docarray/array/document.py
@@ -41,6 +41,10 @@ def __new__(cls, *args, storage: str = 'memory', **kwargs) -> 'DocumentArrayLike
                 from .sqlite import DocumentArraySqlite
 
                 instance = super().__new__(DocumentArraySqlite)
+            elif storage == 'weaviate':
+                from .weaviate import DocumentArrayWeaviate
+
+                instance = super().__new__(DocumentArrayWeaviate)
             else:
                 raise ValueError(f'storage=`{storage}` is not supported.')
         else:

diff --git a/docarray/array/mixins/delitem.py b/docarray/array/mixins/delitem.py
@@ -35,9 +35,14 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
             if (
                 isinstance(index, tuple)
                 and len(index) == 2
-                and isinstance(index[0], (slice, Sequence))
+                and (
+                    isinstance(index[0], (slice, Sequence, str, int))
+                    or index[0] is Ellipsis
+                )
+                and isinstance(index[1], (str, Sequence))
             ):
-                if isinstance(index[0], str) and isinstance(index[1], str):
+                # TODO: add support for cases such as da[1, ['text', 'id']]?
+                if isinstance(index[0], (str, int)) and isinstance(index[1], str):
                     # ambiguity only comes from the second string
                     if index[1] in self:
                         del self[index[0]]

diff --git a/docarray/array/mixins/getitem.py b/docarray/array/mixins/getitem.py
@@ -67,14 +67,24 @@ def __getitem__(
             if (
                 isinstance(index, tuple)
                 and len(index) == 2
-                and isinstance(index[0], (slice, Sequence))
+                and (
+                    isinstance(index[0], (slice, Sequence, str, int))
+                    or index[0] is Ellipsis
+                )
+                and isinstance(index[1], (str, Sequence))
             ):
-                if isinstance(index[0], str) and isinstance(index[1], str):
+                # TODO: add support for cases such as da[1, ['text', 'id']]?
+                if isinstance(index[0], (str, int)) and isinstance(index[1], str):
                     # ambiguity only comes from the second string
                     if index[1] in self:
                         return DocumentArray([self[index[0]], self[index[1]]])
                     else:
-                        return getattr(self[index[0]], index[1])
+                        _docs = self[index[0]]
+                        if not _docs:
+                            return []
+                        if isinstance(_docs, Document):
+                            return getattr(_docs, index[1])
+                        return _docs._get_attributes(index[1])
                 elif isinstance(index[0], (slice, Sequence)):
                     _docs = self[index[0]]
                     _attrs = index[1]

diff --git a/docarray/array/mixins/setitem.py b/docarray/array/mixins/setitem.py
@@ -79,8 +79,59 @@ def __setitem__(
             if (
                 isinstance(index, tuple)
                 and len(index) == 2
-                and isinstance(index[0], (slice, Sequence))
+                and (
+                    isinstance(index[0], (slice, Sequence, str, int))
+                    or index[0] is Ellipsis
+                )
+                and isinstance(index[1], (str, Sequence))
             ):
+                # TODO: this is added because we are still trying to figure out the proper
+                # way to set attributes and to get test_path_syntax_indexing_set to pass.
+                # We may have to refactor the following logic.
+
+                # NOTE: this check is not the proper way to handle this; it is a
+                # temporary hack, written this way to minimize the effect on other
+                # docarray classes and to make this block easier to remove/refactor
+                if self.__class__.__name__ in {
+                    'DocumentArrayWeaviate',
+                    'DocumentArrayInMemory',
+                }:
+                    from ..memory import DocumentArrayInMemory
+
+                    if index[1] in self:
+                        # we first handle the case when second item in index is an id not attr
+                        _docs = DocumentArrayInMemory(
+                            self[index[0]]
+                        ) + DocumentArrayInMemory(self[index[1]])
+                        self._set_doc_value_pairs(_docs, value)
+                        return
+
+                    _docs = self[index[0]]
+
+                    if not _docs:
+                        return
+
+                    if isinstance(_docs, Document):
+                        _docs = DocumentArrayInMemory(_docs)
+                        # because we've augmented docs dimension, we do the same for value
+                        value = (value,)
+
+                    attrs = index[1]
+                    if isinstance(attrs, str):
+                        attrs = (attrs,)
+                        # because we've augmented attrs dimension, we do the same for value
+                        value = (value,)
+
+                    for attr in attrs:
+                        if not hasattr(_docs[0], attr):
+                            raise ValueError(
+                                f'`{attr}` is neither a valid id nor attribute name'
+                            )
+
+                    for _a, _v in zip(attrs, value):
+                        self._set_docs_attrs(_docs, _a, _v)
+                    return
+
                 if isinstance(index[0], str) and isinstance(index[1], str):
                     # ambiguity only comes from the second string
                     if index[1] in self:

diff --git a/docarray/array/storage/base/getsetdel.py b/docarray/array/storage/base/getsetdel.py
@@ -36,20 +36,29 @@ def _get_doc_by_id(self, _id: str) -> 'Document':
 
     def _get_docs_by_slice(self, _slice: slice) -> Iterable['Document']:
         """This function is derived from :meth:`_get_doc_by_offset`
+        Override this function if there is a more efficient logic
 
-        Override this function if there is a more efficient logic"""
+        :param _slice: the slice used for indexing
+        :return: an iterable of document
+        """
         return (self._get_doc_by_offset(o) for o in range(len(self))[_slice])
 
     def _get_docs_by_offsets(self, offsets: Sequence[int]) -> Iterable['Document']:
         """This function is derived from :meth:`_get_doc_by_offset`
+        Override this function if there is a more efficient logic
 
-        Override this function if there is a more efficient logic"""
+        :param offsets: the offsets used for indexing
+        :return: an iterable of document
+        """
         return (self._get_doc_by_offset(o) for o in offsets)
 
     def _get_docs_by_ids(self, ids: Sequence[str]) -> Iterable['Document']:
         """This function is derived from :meth:`_get_doc_by_id`
+        Override this function if there is a more efficient logic
 
-        Override this function if there is a more efficient logic"""
+        :param ids: the ids used for indexing
+        :return: an iterable of document
+        """
         return (self._get_doc_by_id(_id) for _id in ids)
 
     # Delitem APIs
@@ -64,15 +73,17 @@ def _del_doc_by_id(self, _id: str):
 
     def _del_docs_by_slice(self, _slice: slice):
         """This function is derived and may not have the most efficient implementation.
-
-        Override this function if there is a more efficient logic"""
+        Override this function if there is a more efficient logic
+        :param _slice: the slice used for indexing
+        """
         for j in range(len(self))[_slice]:
             self._del_doc_by_offset(j)
 
     def _del_docs_by_mask(self, mask: Sequence[bool]):
         """This function is derived and may not have the most efficient implementation.
-
-        Override this function if there is a more efficient logic"""
+        Override this function if there is a more efficient logic
+        :param mask: the boolean mask used for indexing
+        """
         for idx, m in enumerate(mask):
             if not m:
                 self._del_doc_by_offset(idx)
@@ -98,6 +109,9 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
         """This function is derived and may not have the most efficient implementation.
 
         Override this function if there is a more efficient logic
+        :param _slice: the slice used for indexing
+        :param value: the value docs will be updated to
+        :raises TypeError: error raised when right-hand assignment is not an iterable
         """
         if not isinstance(value, Iterable):
             raise TypeError(
@@ -107,17 +121,26 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
             self._set_doc_by_offset(_offset, val)
 
     def _set_doc_value_pairs(
-        self, docs: Iterable['Document'], values: Iterable['Document']
+        self, docs: Iterable['Document'], values: Sequence['Document']
     ):
         """This function is derived and may not have the most efficient implementation.
 
         Override this function if there is a more efficient logic
+        :param docs: the docs to update
+        :param values: the value docs will be updated to
         """
+        docs = list(docs)
+        if len(docs) != len(values):
+            raise ValueError(
+                f'length of docs to set({len(docs)}) does not match '
+                f'length of values({len(values)})'
+            )
+
         for _d, _v in zip(docs, values):
             _d._data = _v._data
 
         for _d in docs:
-            if _d not in docs:
+            if _d not in self:
                 root_d = self._find_root_doc(_d)
             else:
                 # _d is already on the root-level
@@ -130,6 +153,9 @@ def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
         """This function is derived and may not have the most efficient implementation.
 
         Override this function if there is a more efficient logic
+        :param offset: the offset used for indexing
+        :param attr: the attribute of document to update
+        :param value: the value doc's attr will be updated to
         """
         d = self._get_doc_by_offset(offset)
         if hasattr(d, attr):
@@ -140,14 +166,20 @@ def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any):
         """This function is derived and may not have the most efficient implementation.
 
         Override this function if there is a more efficient logic
+        :param _id: the id used for indexing
+        :param attr: the attribute of document to update
+        :param value: the value doc's attr will be updated to
         """
         d = self._get_doc_by_id(_id)
         if hasattr(d, attr):
             setattr(d, attr, value)
             self._set_doc_by_id(d.id, d)
 
-    def _find_root_doc(self, d: Document):
-        """Find `d`'s root Document in an exhaustive manner"""
+    def _find_root_doc(self, d: Document) -> 'Document':
+        """Find `d`'s root Document in an exhaustive manner
+        :param d: the input document
+        :return: the root of the input document
+        """
         from docarray import DocumentArray
 
         for _d in self:

diff --git a/docarray/array/storage/memory/getsetdel.py b/docarray/array/storage/memory/getsetdel.py
@@ -6,7 +6,7 @@
 )
 
 from ..base.getsetdel import BaseGetSetDelMixin
-from .... import Document
+from .... import Document, DocumentArray
 
 
 class GetSetDelMixin(BaseGetSetDelMixin):
@@ -46,8 +46,15 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
         self._rebuild_id2offset()
 
     def _set_doc_value_pairs(
-        self, docs: Iterable['Document'], values: Iterable['Document']
+        self, docs: Iterable['Document'], values: Sequence['Document']
     ):
+        docs = list(docs)
+        if len(docs) != len(values):
+            raise ValueError(
+                f'length of docs to set({len(docs)}) does not match '
+                f'length of values({len(values)})'
+            )
+
         for _d, _v in zip(docs, values):
             _d._data = _v._data
         self._rebuild_id2offset()
@@ -58,6 +65,17 @@ def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
     def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any):
         setattr(self._data[self._id2offset[_id]], attr, value)
 
+    def _set_docs_attrs(self, docs: 'DocumentArray', attr: str, values: Iterable[Any]):
+        # TODO: remove this function and use _set_doc_attr_by_id instead once
+        # we find a way to do so
+        if attr == 'embedding':
+            docs.embeddings = values
+        elif attr == 'tensor':
+            docs.tensors = values
+        else:
+            for _d, _v in zip(docs, values):
+                setattr(_d, attr, _v)
+
     def _get_doc_by_offset(self, offset: int) -> 'Document':
         return self._data[offset]
 

diff --git a/docarray/array/storage/sqlite/getsetdel.py b/docarray/array/storage/sqlite/getsetdel.py
@@ -118,7 +118,14 @@ def _del_docs_by_mask(self, mask: Sequence[bool]):
         self._commit()
 
     def _set_doc_value_pairs(
-        self, docs: Iterable['Document'], values: Iterable['Document']
+        self, docs: Iterable['Document'], values: Sequence['Document']
     ):
+        docs = list(docs)
+        if len(docs) != len(values):
+            raise ValueError(
+                f'length of docs to set({len(docs)}) does not match '
+                f'length of values({len(values)})'
+            )
+
         for _d, _v in zip(docs, values):
             self._set_doc_by_id(_d.id, _v)
diff --git a/docarray/array/storage/sqlite/seqlike.py b/docarray/array/storage/sqlite/seqlike.py
@@ -97,8 +97,3 @@ def __eq__(self, other):
             and type(self._config) is type(other._config)
             and self._config == other._config
         )
-
-    def __add__(self, other: Union['Document', Sequence['Document']]):
-        v = type(self)(self, storage='sqlite')
-        v.extend(other)
-        return v
diff --git a/docarray/array/storage/weaviate/__init__.py b/docarray/array/storage/weaviate/__init__.py
@@ -0,0 +1,9 @@
+from .backend import BackendMixin
+from .getsetdel import GetSetDelMixin
+from .seqlike import SequenceLikeMixin
+
+__all__ = ['StorageMixins']
+
+
+class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin):
+    ...