Skip to content

Commit

Permalink
feat: annlite find with filter and no query vector (#401)
Browse files Browse the repository at this point in the history
* feat: use annlite find with only filter

* chore: upgrade annlite version

* docs: remove typo

Co-authored-by: Alaeddine Abdessalem <[email protected]>
  • Loading branch information
davidbp and alaeddine-13 authored Jun 13, 2022
1 parent aad1e1f commit 52fde78
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 5 deletions.
22 changes: 21 additions & 1 deletion docarray/array/storage/annlite/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

if TYPE_CHECKING:
import numpy as np
from .... import DocumentArray

from .... import DocumentArray


class FindMixin:
Expand Down Expand Up @@ -41,3 +42,22 @@ def _find(
)

return match_docs

def _filter(
    self,
    filter: Dict,
    limit: Optional[Union[int, float]] = 20,
    only_id: bool = False,
) -> 'DocumentArray':
    """Select stored Documents using an `Annlite` filter only (no query vector).

    :param filter: the `Annlite` filter handed to the underlying index
    :param limit: passed through to `Annlite` as ``limit``; presumably caps how
        many Documents are returned (confirm against the AnnLite API)
    :param only_id: if set, metadata is not requested from `Annlite`
        (``include_metadata`` is sent as ``False``)
    :return: a `DocumentArray` wrapping whatever the `Annlite` filter call returned
    """
    # Metadata is requested exactly when the caller did not ask for ids only.
    with_metadata = not only_id
    hits = self._annlite.filter(
        filter=filter,
        limit=limit,
        include_metadata=with_metadata,
    )
    return DocumentArray(hits)
57 changes: 56 additions & 1 deletion docs/advanced/document-store/annlite.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,62 @@ The following configs can be set:
Search with `.find` can be restricted by user-defined filters.
Filters can be constructed following the guidelines provided in [the AnnLite source repository](https://github.com/jina-ai/annlite).

### Example of `.find` with a filter
### Example of `.find` with a filter only


Suppose you store Documents with a `price` tag in AnnLite and you want to retrieve all Documents
whose `price` is lower than or equal to some `max_price` value.


You can index such Documents as follows:
```python
from docarray import Document, DocumentArray
import numpy as np

n_dim = 3
da = DocumentArray(
storage='annlite',
config={
'n_dim': n_dim,
'columns': [('price', 'float')],
},
)

with da:
da.extend([Document(id=f'r{i}', tags={'price': i}) for i in range(10)])

print('\nIndexed Prices:\n')
for price in da[:, 'tags__price']:
print(f'\t price={price}')
```

Then you can retrieve all documents whose price is lower than or equal to `max_price` by applying the following
filter:

```python
max_price = 3
n_limit = 4

filter = {'price': {'$lte': max_price}}
results = da.find(filter=filter)

print('\n Returned examples that verify filter "price at most 3":\n')
for price in results[:, 'tags__price']:
print(f'\t price={price}')
```

This would print

```
Returned examples that verify filter "price at most 3":
price=0
price=1
price=2
price=3
```

### Example of `.find` with query vector and filter

Consider Documents with embeddings `[0,0,0]` up to `[9,9,9]`, where the Document with embedding `[i,i,i]`
has a tag `price` with value `i`. We can create such an example with the following code:
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,15 @@
'uvicorn',
'strawberry-graphql',
'weaviate-client~=3.3.0',
'annlite>=0.3.0',
'annlite>=0.3.2',
'qdrant-client~=0.7.3',
'elasticsearch>=8.2.0',
],
'qdrant': [
'qdrant-client~=0.7.3',
],
'annlite': [
'annlite>=0.3.0',
'annlite>=0.3.2',
],
'weaviate': [
'weaviate-client~=3.3.0',
Expand Down Expand Up @@ -100,7 +100,7 @@
'jupyterlab',
'transformers>=4.16.2',
'weaviate-client~=3.3.0',
'annlite>=0.3.0',
'annlite>=0.3.2',
'elasticsearch>=8.2.0',
'jina',
],
Expand Down
11 changes: 11 additions & 0 deletions tests/unit/array/mixins/test_find.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,17 @@ def test_search_pre_filtering(
)
for operator in ['gt', 'gte', 'lt', 'lte']
],
*[
tuple(
[
'annlite',
lambda operator, threshold: {'price': {operator: threshold}},
numeric_operators_annlite,
operator,
]
)
for operator in numeric_operators_annlite.keys()
],
],
)
def test_filtering(storage, filter_gen, operator, numeric_operators, start_storage):
Expand Down

0 comments on commit 52fde78

Please sign in to comment.