Skip to content

Commit

Permalink
feat: storage option (#78)
Browse files Browse the repository at this point in the history
* feat: add storage

* ci: add requirement and fix test

* feat: fix test and remove add magic method

* refactor: remove debug print and optional check

* refactor: address comments

* fix: flake8

* refactor: address comments

* feat: add serialize config

* refactor: type check

* fix: set_doc_value_pairs

* refactor: type check
  • Loading branch information
winstonww authored Jan 28, 2022
1 parent 95d9f5a commit f55fb47
Show file tree
Hide file tree
Showing 21 changed files with 1,006 additions and 98 deletions.
4 changes: 4 additions & 0 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ def __new__(cls, *args, storage: str = 'memory', **kwargs) -> 'DocumentArrayLike
from .sqlite import DocumentArraySqlite

instance = super().__new__(DocumentArraySqlite)
elif storage == 'weaviate':
from .weaviate import DocumentArrayWeaviate

instance = super().__new__(DocumentArrayWeaviate)
else:
raise ValueError(f'storage=`{storage}` is not supported.')
else:
Expand Down
9 changes: 7 additions & 2 deletions docarray/array/mixins/delitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,14 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
if (
isinstance(index, tuple)
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
if isinstance(index[0], str) and isinstance(index[1], str):
# TODO: add support for cases such as da[1, ['text', 'id']]?
if isinstance(index[0], (str, int)) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
del self[index[0]]
Expand Down
16 changes: 13 additions & 3 deletions docarray/array/mixins/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,24 @@ def __getitem__(
if (
isinstance(index, tuple)
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
if isinstance(index[0], str) and isinstance(index[1], str):
# TODO: add support for cases such as da[1, ['text', 'id']]?
if isinstance(index[0], (str, int)) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
return DocumentArray([self[index[0]], self[index[1]]])
else:
return getattr(self[index[0]], index[1])
_docs = self[index[0]]
if not _docs:
return []
if isinstance(_docs, Document):
return getattr(_docs, index[1])
return _docs._get_attributes(index[1])
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]
Expand Down
53 changes: 52 additions & 1 deletion docarray/array/mixins/setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,59 @@ def __setitem__(
if (
isinstance(index, tuple)
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
and (
isinstance(index[0], (slice, Sequence, str, int))
or index[0] is Ellipsis
)
and isinstance(index[1], (str, Sequence))
):
# TODO: this is added because we are still trying to figure out the proper way
# to set attribute and to get test_path_syntax_indexing_set to pass.
# we may have to refactor the following logic

# NOTE: this check is not the proper way to handle this; it is a temporary hack.
# It is written this way to minimize the effect on other docarray classes and
# to make it easier to remove/refactor the following block
if self.__class__.__name__ in {
'DocumentArrayWeaviate',
'DocumentArrayInMemory',
}:
from ..memory import DocumentArrayInMemory

if index[1] in self:
# we first handle the case when second item in index is an id not attr
_docs = DocumentArrayInMemory(
self[index[0]]
) + DocumentArrayInMemory(self[index[1]])
self._set_doc_value_pairs(_docs, value)
return

_docs = self[index[0]]

if not _docs:
return

if isinstance(_docs, Document):
_docs = DocumentArrayInMemory(_docs)
# because we've augmented docs dimension, we do the same for value
value = (value,)

attrs = index[1]
if isinstance(attrs, str):
attrs = (attrs,)
# because we've augmented attrs dimension, we do the same for value
value = (value,)

for attr in attrs:
if not hasattr(_docs[0], attr):
raise ValueError(
f'`{attr}` is neither a valid id nor attribute name'
)

for _a, _v in zip(attrs, value):
self._set_docs_attrs(_docs, _a, _v)
return

if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self:
Expand Down
54 changes: 43 additions & 11 deletions docarray/array/storage/base/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,29 @@ def _get_doc_by_id(self, _id: str) -> 'Document':

def _get_docs_by_slice(self, _slice: slice) -> Iterable['Document']:
    """Lazily fetch the documents selected by a slice.

    This function is derived from :meth:`_get_doc_by_offset`.
    Override this function if there is a more efficient logic.

    :param _slice: the slice used for indexing
    :return: an iterable of document
    """
    # Resolve the slice against the current length once; documents are only
    # fetched when the returned generator is consumed.
    selected_offsets = range(len(self))[_slice]
    return (self._get_doc_by_offset(o) for o in selected_offsets)

def _get_docs_by_offsets(self, offsets: Sequence[int]) -> Iterable['Document']:
    """Lazily fetch the documents located at the given offsets.

    This function is derived from :meth:`_get_doc_by_offset`.
    Override this function if there is a more efficient logic.

    :param offsets: the offsets used for indexing
    :return: an iterable of document
    """
    # One lookup per offset, in the order given; nothing is fetched until
    # the generator is consumed.
    return (self._get_doc_by_offset(position) for position in offsets)

def _get_docs_by_ids(self, ids: Sequence[str]) -> Iterable['Document']:
    """Lazily fetch the documents identified by the given ids.

    This function is derived from :meth:`_get_doc_by_id`.
    Override this function if there is a more efficient logic.

    :param ids: the ids used for indexing
    :return: an iterable of document
    """
    # One lookup per id, in the order given; nothing is fetched until the
    # generator is consumed.
    return (self._get_doc_by_id(doc_id) for doc_id in ids)

# Delitem APIs
Expand All @@ -64,15 +73,17 @@ def _del_doc_by_id(self, _id: str):

def _del_docs_by_slice(self, _slice: slice):
    """Delete the documents selected by a slice, one offset at a time.

    This function is derived and may not have the most efficient implementation.
    Override this function if there is a more efficient logic.

    :param _slice: the slice used for indexing
    """
    # Delete from the highest offset down. Removing an element shifts every
    # later element one position to the left, so walking the offsets in
    # ascending order would remove the wrong documents after the first one.
    for offset in sorted(range(len(self))[_slice], reverse=True):
        self._del_doc_by_offset(offset)

def _del_docs_by_mask(self, mask: Sequence[bool]):
    """Delete documents according to a boolean mask, one offset at a time.

    This function is derived and may not have the most efficient implementation.
    Override this function if there is a more efficient logic.

    :param mask: the boolean mask used for indexing
    """
    # NOTE(review): the positions whose mask entry is falsy are the ones
    # deleted; this polarity is kept as-is — confirm it matches the
    # backend-specific overrides.
    doomed_offsets = [idx for idx, keep in enumerate(mask) if not keep]

    # Delete from the highest offset down so that earlier deletions do not
    # shift the positions of documents still waiting to be removed.
    for idx in reversed(doomed_offsets):
        self._del_doc_by_offset(idx)
Expand All @@ -98,6 +109,9 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
"""This function is derived and may not have the most efficient implementation.
Override this function if there is a more efficient logic
:param _slice: the slice used for indexing
:param value: the value docs will be updated to
:raises TypeError: error raised when right-hand assignment is not an iterable
"""
if not isinstance(value, Iterable):
raise TypeError(
Expand All @@ -107,17 +121,26 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
self._set_doc_by_offset(_offset, val)

def _set_doc_value_pairs(
self, docs: Iterable['Document'], values: Iterable['Document']
self, docs: Iterable['Document'], values: Sequence['Document']
):
"""This function is derived and may not have the most efficient implementation.
Override this function if there is a more efficient logic
:param docs: the docs to update
:param values: the value docs will be updated to
"""
docs = list(docs)
if len(docs) != len(values):
raise ValueError(
f'length of docs to set({len(docs)}) does not match '
f'length of values({len(values)})'
)

for _d, _v in zip(docs, values):
_d._data = _v._data

for _d in docs:
if _d not in docs:
if _d not in self:
root_d = self._find_root_doc(_d)
else:
# _d is already on the root-level
Expand All @@ -130,6 +153,9 @@ def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
"""This function is derived and may not have the most efficient implementation.
Override this function if there is a more efficient logic
:param offset: the offset used for indexing
:param attr: the attribute of document to update
:param value: the value doc's attr will be updated to
"""
d = self._get_doc_by_offset(offset)
if hasattr(d, attr):
Expand All @@ -140,14 +166,20 @@ def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any):
"""This function is derived and may not have the most efficient implementation.
Override this function if there is a more efficient logic
:param _id: the id used for indexing
:param attr: the attribute of document to update
:param value: the value doc's attr will be updated to
"""
d = self._get_doc_by_id(_id)
if hasattr(d, attr):
setattr(d, attr, value)
self._set_doc_by_id(d.id, d)

def _find_root_doc(self, d: Document):
"""Find `d`'s root Document in an exhaustive manner"""
def _find_root_doc(self, d: Document) -> 'Document':
"""Find `d`'s root Document in an exhaustive manner
:param d: the input document
:return: the root of the input document
"""
from docarray import DocumentArray

for _d in self:
Expand Down
22 changes: 20 additions & 2 deletions docarray/array/storage/memory/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
)

from ..base.getsetdel import BaseGetSetDelMixin
from .... import Document
from .... import Document, DocumentArray


class GetSetDelMixin(BaseGetSetDelMixin):
Expand Down Expand Up @@ -46,8 +46,15 @@ def _set_docs_by_slice(self, _slice: slice, value: Sequence['Document']):
self._rebuild_id2offset()

def _set_doc_value_pairs(
    self, docs: Iterable['Document'], values: Sequence['Document']
):
    """Overwrite each doc's content in place with that of its paired value.

    :param docs: the docs to update
    :param values: the value docs will be updated to
    :raises ValueError: when ``docs`` and ``values`` differ in length
    """
    # ``docs`` may be a one-shot iterable; materialize it so it can be both
    # measured and iterated.
    docs = list(docs)
    if len(docs) != len(values):
        raise ValueError(
            f'length of docs to set({len(docs)}) does not match '
            f'length of values({len(values)})'
        )

    for _d, _v in zip(docs, values):
        _d._data = _v._data

    # Refresh the id -> offset index after the swap (presumably because the
    # copied data can carry a different id — confirm).
    self._rebuild_id2offset()
Expand All @@ -58,6 +65,17 @@ def _set_doc_attr_by_offset(self, offset: int, attr: str, value: Any):
def _set_doc_attr_by_id(self, _id: str, attr: str, value: Any):
    """Set attribute ``attr`` to ``value`` on the stored doc whose id is ``_id``.

    :param _id: the id used for indexing
    :param attr: the attribute of document to update
    :param value: the value doc's attr will be updated to
    """
    store = self._data
    # Translate the id into a position in the underlying store first.
    target = store[self._id2offset[_id]]
    setattr(target, attr, value)

def _set_docs_attrs(self, docs: 'DocumentArray', attr: str, values: Iterable[Any]):
    """Assign ``values`` to attribute ``attr`` across ``docs``.

    ``embedding`` and ``tensor`` are written in bulk through
    ``docs.embeddings`` / ``docs.tensors``; any other attribute is set doc by
    doc, pairing docs with values positionally.

    :param docs: the docs to update
    :param attr: the attribute of document to update
    :param values: the values the docs' attr will be updated to
    """
    # TODO: remove this function and use _set_doc_attr_by_id instead once
    # we find a way to do so
    bulk_attr = {'embedding': 'embeddings', 'tensor': 'tensors'}.get(attr)
    if bulk_attr is not None:
        setattr(docs, bulk_attr, values)
        return

    for doc, val in zip(docs, values):
        setattr(doc, attr, val)

def _get_doc_by_offset(self, offset: int) -> 'Document':
    """Return the document stored at position ``offset``.

    :param offset: the offset used for indexing
    :return: the document found at that offset
    """
    found = self._data[offset]
    return found

Expand Down
9 changes: 8 additions & 1 deletion docarray/array/storage/sqlite/getsetdel.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,14 @@ def _del_docs_by_mask(self, mask: Sequence[bool]):
self._commit()

def _set_doc_value_pairs(
    self, docs: Iterable['Document'], values: Sequence['Document']
):
    """Replace each stored doc, looked up by its id, with its paired value.

    :param docs: the docs to update
    :param values: the value docs will be updated to
    :raises ValueError: when ``docs`` and ``values`` differ in length
    """
    # ``docs`` may be a one-shot iterable; materialize it so it can be both
    # measured and iterated. The length check runs before any write.
    docs = list(docs)
    if len(docs) != len(values):
        raise ValueError(
            f'length of docs to set({len(docs)}) does not match '
            f'length of values({len(values)})'
        )

    for _d, _v in zip(docs, values):
        self._set_doc_by_id(_d.id, _v)
5 changes: 0 additions & 5 deletions docarray/array/storage/sqlite/seqlike.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,3 @@ def __eq__(self, other):
and type(self._config) is type(other._config)
and self._config == other._config
)

def __add__(self, other: Union['Document', Sequence['Document']]):
    """Build a new sqlite-backed array from ``self`` and extend it with ``other``.

    :param other: what is passed to ``extend`` on the new array
    :return: the newly built array
    """
    array_cls = type(self)
    combined = array_cls(self, storage='sqlite')
    combined.extend(other)
    return combined
9 changes: 9 additions & 0 deletions docarray/array/storage/weaviate/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .backend import BackendMixin
from .getsetdel import GetSetDelMixin
from .seqlike import SequenceLikeMixin

__all__ = ['StorageMixins']


class StorageMixins(BackendMixin, GetSetDelMixin, SequenceLikeMixin):
    """Bundle of the weaviate storage mixins.

    Combines the backend, get/set/del and sequence-like mixins of this
    package into a single base; it adds no members of its own.
    """
Loading

0 comments on commit f55fb47

Please sign in to comment.