Skip to content

Commit

Permalink
fix: rename pqlite to annlite (#151)
Browse files Browse the repository at this point in the history
  • Loading branch information
numb3r3 authored Mar 4, 2022
1 parent a4e5a38 commit 2196082
Show file tree
Hide file tree
Showing 30 changed files with 246 additions and 212 deletions.
9 changes: 9 additions & 0 deletions docarray/array/annlite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .document import DocumentArray
from .storage.annlite import StorageMixins, AnnliteConfig

__all__ = ['AnnliteConfig', 'DocumentArrayAnnlite']


class DocumentArrayAnnlite(StorageMixins, DocumentArray):
    """DocumentArray flavour that combines the AnnLite ``StorageMixins`` with ``DocumentArray``."""

    def __new__(cls, *args, **kwargs):
        """Allocate the instance; ``args`` and ``kwargs`` are accepted but not forwarded."""
        # Only the class itself is handed to the parent allocator.
        instance = super().__new__(cls)
        return instance
18 changes: 9 additions & 9 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from ..types import DocumentArraySourceType
from .memory import DocumentArrayInMemory
from .sqlite import DocumentArraySqlite
from .pqlite import DocumentArrayPqlite
from .annlite import DocumentArrayAnnlite
from .weaviate import DocumentArrayWeaviate
from .storage.sqlite import SqliteConfig
from .storage.pqlite import PqliteConfig
from .storage.annlite import AnnliteConfig
from .storage.weaviate import WeaviateConfig


Expand Down Expand Up @@ -46,10 +46,10 @@ def __new__(
def __new__(
cls,
_docs: Optional['DocumentArraySourceType'] = None,
storage: str = 'pqlite',
config: Optional[Union['PqliteConfig', Dict]] = None,
) -> 'DocumentArrayPqlite':
"""Create a PQLite-powered DocumentArray object."""
storage: str = 'annlite',
config: Optional[Union['AnnliteConfig', Dict]] = None,
) -> 'DocumentArrayAnnlite':
"""Create a AnnLite-powered DocumentArray object."""
...

def __new__(cls, *args, storage: str = 'memory', **kwargs):
Expand All @@ -62,10 +62,10 @@ def __new__(cls, *args, storage: str = 'memory', **kwargs):
from .sqlite import DocumentArraySqlite

instance = super().__new__(DocumentArraySqlite)
elif storage == 'pqlite':
from .pqlite import DocumentArrayPqlite
elif storage == 'annlite':
from .annlite import DocumentArrayAnnlite

instance = super().__new__(DocumentArrayPqlite)
instance = super().__new__(DocumentArrayAnnlite)
elif storage == 'weaviate':
from .weaviate import DocumentArrayWeaviate

Expand Down
2 changes: 1 addition & 1 deletion docarray/array/mixins/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def find(
result: List['DocumentArray']

if isinstance(_result, list) and isinstance(_result[0], DocumentArray):
# already auto-boxed by the storage backend, e.g. pqlite
# already auto-boxed by the storage backend, e.g. annlite
result = _result
elif (
isinstance(_result, tuple)
Expand Down
9 changes: 0 additions & 9 deletions docarray/array/pqlite.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
from abc import ABC
import numpy as np

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from .... import Document

from .backend import BackendMixin, PqliteConfig
from .backend import BackendMixin, AnnliteConfig
from .find import FindMixin
from .getsetdel import GetSetDelMixin
from .seqlike import SequenceLikeMixin

__all__ = ['StorageMixins', 'PqliteConfig']
__all__ = ['StorageMixins', 'AnnliteConfig']


class StorageMixins(FindMixin, BackendMixin, GetSetDelMixin, SequenceLikeMixin, ABC):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import itertools
import numpy as np
from dataclasses import dataclass, asdict, field
from typing import (
Union,
Expand All @@ -10,9 +11,6 @@
Iterator,
)

import numpy as np
from pqlite import PQLite

from ..base.backend import BaseBackendMixin
from ....helper import dataclass_from_dict

Expand All @@ -21,7 +19,7 @@


@dataclass
class PqliteConfig:
class AnnliteConfig:
n_dim: int
metric: str = 'cosine'
serialize_config: Dict = field(default_factory=dict)
Expand All @@ -31,16 +29,28 @@ class PqliteConfig:
class BackendMixin(BaseBackendMixin):
"""Provide necessary functions to enable this storage backend. """

def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':
if embedding is None:
embedding = np.zeros(self.n_dim, dtype=np.float32)
elif isinstance(embedding, list):
from ....math.ndarray import to_numpy_array

embedding = to_numpy_array(embedding)

if embedding.ndim > 1:
embedding = np.asarray(embedding).squeeze()
return embedding

def _init_storage(
self,
_docs: Optional['DocumentArraySourceType'] = None,
config: Optional[Union[PqliteConfig, Dict]] = None,
config: Optional[Union[AnnliteConfig, Dict]] = None,
**kwargs,
):
if not config:
raise ValueError('Config object must be specified')
elif isinstance(config, dict):
config = dataclass_from_dict(PqliteConfig, config)
config = dataclass_from_dict(AnnliteConfig, config)

self._persist = bool(config.data_path)

Expand All @@ -52,9 +62,11 @@ def _init_storage(
self._config = config

config = asdict(config)
n_dim = config.pop('n_dim')
self.n_dim = config.pop('n_dim')

self._pqlite = PQLite(n_dim, lock=False, **config)
from annlite import AnnLite

self._annlite = AnnLite(self.n_dim, lock=False, **config)
from ... import DocumentArray
from .... import Document

Expand All @@ -74,7 +86,7 @@ def _init_storage(

def __getstate__(self):
state = dict(self.__dict__)
del state['_pqlite']
del state['_annlite']
del state['_offsetmapping']
return state

Expand All @@ -85,21 +97,14 @@ def __setstate__(self, state):
config = asdict(config)
n_dim = config.pop('n_dim')

from pqlite import PQLite
from annlite import AnnLite

self._pqlite = PQLite(n_dim, lock=False, **config)
self._annlite = AnnLite(n_dim, lock=False, **config)

def _get_storage_infos(self) -> Dict:
return {
'Backend': 'PQLite',
'Distance Metric': self._pqlite.metric.name,
'Backend': 'AnnLite',
'Distance Metric': self._annlite.metric.name,
'Data Path': self._config.data_path,
'Serialization Protocol': self._config.serialize_config.get('protocol'),
}

def _map_embedding(self, embedding: 'ArrayType') -> 'ArrayType':
if embedding is None:
embedding = np.zeros(self._pqlite.dim, dtype=np.float32)
elif isinstance(embedding, list):
embedding = np.array(embedding, dtype=np.float32)
return embedding
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def _find(
if n_rows == 1:
query = query.reshape(1, -1)

_, match_docs = self._pqlite._search_documents(
_, match_docs = self._annlite._search_documents(
query, limit=limit, include_metadata=not only_id
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,33 +13,32 @@ class GetSetDelMixin(BaseGetSetDelMixin):
# essential methods start

def _get_doc_by_id(self, _id: str) -> 'Document':
doc = self._pqlite.get_doc_by_id(_id)
doc = self._annlite.get_doc_by_id(_id)
if doc is None:
raise KeyError(f'Can not find Document with id=`{_id}`')
return doc

def _set_doc_by_id(self, _id: str, value: 'Document'):
if _id != value.id:
self._pqlite.delete([_id])
self._del_doc_by_id(_id)

value.embedding = self._map_embedding(value.embedding)
docs = DocumentArrayInMemory([value])
self._pqlite.update(docs)
self._annlite.update(docs)

def _del_doc_by_id(self, _id: str):
self._pqlite.delete([_id])
self._annlite.delete([_id])

def _clear_storage(self):
self._pqlite.clear()
self._annlite.clear()

def _set_docs_by_ids(self, ids, docs: Iterable['Document'], mismatch_ids: Dict):
self._pqlite.delete(list(mismatch_ids.keys()))
docs = DocumentArrayInMemory(docs)
for doc in docs:
for _id, doc in zip(ids, docs):
doc.embedding = self._map_embedding(doc.embedding)
self._pqlite.update(docs)
self._set_doc_by_id(_id, doc)

def _del_docs_by_ids(self, ids):
self._pqlite.delete(ids)
self._annlite.delete(ids)

def _load_offset2ids(self):
self._offsetmapping = OffsetMapping(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pathlib import Path
from typing import Optional, List, Tuple

from pqlite.storage.table import Table
from annlite.storage.table import Table


class OffsetMapping(Table):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ def extend(self, values: Iterable['Document']) -> None:
for doc in docs:
doc.embedding = self._map_embedding(doc.embedding)

self._pqlite.index(docs)
self._annlite.index(docs)
self._offset2ids.extend([doc.id for doc in docs])

def __del__(self) -> None:
if not self._persist:
self._offset2ids.clear()
self._pqlite.clear()
self._annlite.clear()

def __eq__(self, other):
"""In pqlite backend, data are considered as identical if configs point to the same database source"""
"""In annlite backend, data are considered as identical if configs point to the same database source"""
return (
type(self) is type(other)
and type(self._config) is type(other._config)
Expand All @@ -41,7 +41,7 @@ def __bool__(self):
return len(self) > 0

def __repr__(self):
return f'<DocumentArray[PQLite] (length={len(self)}) at {id(self)}>'
return f'<DocumentArray[AnnLite] (length={len(self)}) at {id(self)}>'

def __add__(self, other: Union['Document', Sequence['Document']]):
v = type(self)(self)
Expand All @@ -50,8 +50,8 @@ def __add__(self, other: Union['Document', Sequence['Document']]):

def __contains__(self, x: Union[str, 'Document']):
if isinstance(x, str):
return self._pqlite.get_doc_by_id(x) is not None
return self._annlite.get_doc_by_id(x) is not None
elif isinstance(x, Document):
return self._pqlite.get_doc_by_id(x.id) is not None
return self._annlite.get_doc_by_id(x.id) is not None
else:
return False
40 changes: 40 additions & 0 deletions docs/advanced/document-store/annlite.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Annlite

One can use [AnnLite](https://github.com/jina-ai/annlite) as the document store for DocumentArray. It is useful when one wants faster Document retrieval on embeddings, e.g. with `.match()` or `.find()`.


## Usage

One can instantiate a DocumentArray with Annlite storage like so:

```python
from docarray import DocumentArray

da = DocumentArray(storage='annlite', config={'n_dim': 10})
```

Usage is the same as for an ordinary DocumentArray.

To access a previously persisted DocumentArray, specify its `data_path` in `config`.

```python
from docarray import DocumentArray

da = DocumentArray(storage='annlite', config={'data_path': './data', 'n_dim': 10})

da.summary()
```

Note that specifying `n_dim` is mandatory when using `AnnLite` as a backend for DocumentArray.

All other functions behave the same as with an in-memory DocumentArray.

## Config

The following configs can be set:

| Name | Description | Default |
|---------------------|---------------------------------------------------------------------------------|-----------------------------|
| `n_dim` | Number of dimensions of embeddings to be stored and retrieved | **This is always required** |
| `data_path` | The data folder where the data is located | **A random temp folder** |
| `metric`            | Distance metric to be used during search. Can be 'cosine', 'dot' or 'euclidean' | 'cosine'                    |
1 change: 1 addition & 0 deletions docs/advanced/document-store/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
weaviate
sqlite
qdrant
annlite
```

Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ In addition to `common`, the following dependencies will be installed to enable
| `av` | for video processing and IO |
| `trimesh` | for 3D mesh processing and IO |
| `weaviate-client` | for using Weaviate-based document store |
| `pqlite` | for using PQLite-based document store |
| `annlite` | for using Annlite-based document store |
| `qdrant-client` | for using Qdrant-based document store |
| `strawberry-graphql` | for GraphQL support |
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
'fastapi',
'uvicorn',
'weaviate-client~=3.3.0',
'pqlite>=0.2.1',
'annlite>=0.3.0',
'qdrant-client~=0.6.0',
'strawberry-graphql',
],
Expand All @@ -90,7 +90,7 @@
'jupyterlab',
'transformers==4.16.2',
'weaviate-client~=3.3.0',
'pqlite>=0.2.4',
'annlite>=0.3.0',
],
},
classifiers=[
Expand Down
Loading

0 comments on commit 2196082

Please sign in to comment.