Skip to content

Commit

Permalink
feat: push meta data along with docarray 0.16 (#490)
Browse files Browse the repository at this point in the history
* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray (#491)

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* feat: push meta data along with docarray

* chore: login not required

* Revert "chore: login not required"

This reverts commit 7b008c1.

* refactor: reuse common code

* chore: rename client

* docs: document cloud operations

* chore: login not required

* chore: apply suggestions

* refactor: reuse code

* chore: better naming

Co-authored-by: Alaeddine Abdessalem <[email protected]>
  • Loading branch information
hanxiao and alaeddine-13 authored Sep 16, 2022
1 parent ea2a7a8 commit 14526db
Show file tree
Hide file tree
Showing 11 changed files with 281 additions and 48 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/cd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ jobs:
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::docarray"
timeout-minutes: 30
env:
JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
- name: Check codecov file
id: check_files
uses: andstor/file-existence-action@v1
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ jobs:
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::docarray"
timeout-minutes: 30
env:
JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
- name: Check codecov file
id: check_files
uses: andstor/file-existence-action@v1
Expand Down
159 changes: 147 additions & 12 deletions docarray/array/mixins/io/pushpull.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,159 @@
import json
import os
import os.path
import warnings
from collections import Counter
from pathlib import Path
from typing import Dict, Type, TYPE_CHECKING, Optional
from typing import Dict, Type, TYPE_CHECKING, List, Optional, Any

from docarray.helper import get_request_header, __cache_path__
import hubble
from hubble import Client as HubbleClient
from hubble.client.endpoints import EndpointsV2


from docarray.helper import get_request_header, __cache_path__, _get_array_info

if TYPE_CHECKING:
from docarray.typing import T


def _get_length_from_summary(summary: List[Dict]) -> Optional[int]:
"""Get the length from summary."""
for item in summary:
if 'Length' == item['name']:
return item['value']


class PushPullMixin:
"""Transmitting :class:`DocumentArray` via Jina Cloud Service"""

_max_bytes = 4 * 1024 * 1024 * 1024

@staticmethod
def cloud_list(show_table: bool = False) -> List[str]:
    """Return the names of the DocumentArrays listed by the cloud service.

    Requests the artifacts of type ``documentArray`` (sorted on ``createdAt``)
    and collects their names; a table with one row per array is built along
    the way and printed only on request.

    :param show_table: if true, print the table of the arrays.
    :returns: List of available DocumentArray's names.
    """
    from rich import box
    from rich import print as rich_print
    from rich.table import Table

    listing = HubbleClient(jsonify=True).list_artifacts(
        filter={'type': 'documentArray'}, sort={'createdAt': 1}
    )

    overview = Table(
        title=f'You have {listing["meta"]["total"]} DocumentArray on the cloud',
        box=box.SIMPLE,
        highlight=True,
    )
    for header in ('Name', 'Length', 'Access'):
        overview.add_column(header)
    for header in ('Created at', 'Updated at'):
        overview.add_column(header, justify='center')

    names = []
    for artifact in listing['data']:
        artifact_name = artifact['name']
        names.append(artifact_name)

        # the length is looked up in the summary stored in the artifact metadata
        length = _get_length_from_summary(artifact['metaData'].get('summary', []))
        overview.add_row(
            artifact_name,
            str(length),
            artifact['visibility'],
            artifact['createdAt'],
            artifact['updatedAt'],
        )

    if show_table:
        rich_print(overview)
    return names

@staticmethod
def cloud_delete(name: str) -> None:
    """Delete a DocumentArray from the cloud.

    :param name: the name of the DocumentArray to delete.
    """
    client = HubbleClient(jsonify=True)
    client.delete_artifact(name=name)

def _get_raw_summary(self) -> List[Dict[str, Any]]:
    """Build the summary of this array as a list of plain dict entries.

    Every entry has a ``name`` and a ``value``; all entries except
    ``Subindices`` also carry a ``description``.

    :return: the summary entries, in a fixed order.
    """
    is_homo, nested_fields, layout_items, attr_counter, _ = _get_array_info(self)

    summary = [
        {
            'name': 'Type',
            'value': self.__class__.__name__,
            'description': 'The type of the DocumentArray',
        },
        {
            'name': 'Length',
            'value': len(self),
            'description': 'The length of the DocumentArray',
        },
        {
            'name': 'Homogenous Documents',
            'value': is_homo,
            'description': 'Whether all documents are of the same structure, attributes',
        },
        {
            'name': 'Common Attributes',
            # first attribute tuple recorded by the counter, None if it is empty
            'value': next(iter(attr_counter)) if attr_counter else None,
            'description': 'The common attributes of all documents',
        },
        {
            'name': 'Has nested Documents in',
            'value': tuple(nested_fields),
            'description': 'The field that contains nested Documents',
        },
        {
            'name': 'Multimodal dataclass',
            'value': all(doc.is_multimodal for doc in self),
            'description': 'Whether all documents are multimodal',
        },
        {
            'name': 'Subindices',
            'value': tuple(getattr(self, '_subindices', {}).keys()),
        },
        {
            'name': 'Inspect attributes',
            'value': layout_items,
            'description': 'Quick overview of attributes of all documents',
        },
    ]

    # one name/value entry per storage info item; empty list when there is none
    storage_infos = self._get_storage_infos()
    if storage_infos:
        backend_items = [
            {'name': key, 'value': val} for key, val in storage_infos.items()
        ]
    else:
        backend_items = []

    summary.append(
        {
            'name': 'Storage backend',
            'value': backend_items,
            'description': 'Quick overview of the Document Store',
        }
    )
    return summary

def push(
self,
name: str,
show_progress: bool = False,
public: bool = True,
branding: Optional[Dict] = None,
) -> Dict:
"""Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.pull`
Expand All @@ -33,6 +168,7 @@ def push(
:param show_progress: whether to show a progress bar while pushing
:param public: by default anyone can pull a DocumentArray if they know its name.
Setting this to False will allow only the creator to pull it. This feature of course requires you to log in first.
:param branding: a dict of branding information to be sent to Jina Cloud. {"icon": "emoji", "background": "#fff"}
"""
import requests

Expand All @@ -47,11 +183,14 @@ def push(
'name': name,
'type': 'documentArray',
'public': public,
'metaData': json.dumps(
{'summary': self._get_raw_summary(), 'branding': branding},
sort_keys=True,
),
}
)

headers = {'Content-Type': ctype, **get_request_header()}
import hubble

auth_token = hubble.get_token()
if auth_token:
Expand Down Expand Up @@ -98,11 +237,9 @@ def _get_chunk(_batch):
yield _tail

with pbar:
from hubble import Client
from hubble.client.endpoints import EndpointsV2

response = requests.post(
Client()._base_url + EndpointsV2.upload_artifact,
HubbleClient()._base_url + EndpointsV2.upload_artifact,
data=gen(),
headers=headers,
)
Expand Down Expand Up @@ -133,17 +270,12 @@ def pull(

headers = {}

import hubble

auth_token = hubble.get_token()

if auth_token:
headers['Authorization'] = f'token {auth_token}'

from hubble import Client
from hubble.client.endpoints import EndpointsV2

url = Client()._base_url + EndpointsV2.download_artifact + f'?name={name}'
url = HubbleClient()._base_url + EndpointsV2.download_artifact + f'?name={name}'
response = requests.get(url, headers=headers)

if response.ok:
Expand Down Expand Up @@ -183,3 +315,6 @@ def pull(
fp.write(_source.content)

return r

cloud_push = push
cloud_pull = pull
38 changes: 12 additions & 26 deletions docarray/array/mixins/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

import numpy as np

from docarray.helper import _get_array_info


class PlotMixin:
"""Helper functions for plotting the arrays."""
Expand All @@ -37,44 +39,28 @@ def summary(self):
tables = []
console = Console()

all_attrs = self._get_attributes('non_empty_fields')
# remove underscore attribute
all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs]
attr_counter = Counter(all_attrs)
(
is_homo,
_nested_in,
_nested_items,
attr_counter,
all_attrs_names,
) = _get_array_info(self)

table = Table(box=box.SIMPLE, highlight=True)
table.show_header = False
table.add_row('Type', self.__class__.__name__)
table.add_row('Length', str(len(self)))
is_homo = len(attr_counter) == 1
table.add_row('Homogenous Documents', str(is_homo))

all_attrs_names = set(v for k in all_attrs for v in k)
_nested_in = []
if 'chunks' in all_attrs_names:
_nested_in.append('chunks')

if 'matches' in all_attrs_names:
_nested_in.append('matches')

if _nested_in:
table.add_row('Has nested Documents in', str(tuple(_nested_in)))

if is_homo:
table.add_row('Common Attributes', str(list(attr_counter.items())[0][0]))
else:
for _a, _n in attr_counter.most_common():
if _n == 1:
_doc_text = f'{_n} Document has'
else:
_doc_text = f'{_n} Documents have'
if len(_a) == 1:
_text = f'{_doc_text} one attribute'
elif len(_a) == 0:
_text = f'{_doc_text} no attribute'
else:
_text = f'{_doc_text} attributes'
table.add_row(_text, str(_a))

for item in _nested_items:
table.add_row(item['name'], item['value'])

is_multimodal = all(d.is_multimodal for d in self)
table.add_row('Multimodal dataclass', str(is_multimodal))
Expand Down
42 changes: 41 additions & 1 deletion docarray/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import uuid
import warnings
from os.path import expanduser
from typing import Any, Dict, Optional, Sequence, Tuple, Union
from typing import Any, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
from collections import Counter

if TYPE_CHECKING:
from docarray import DocumentArray

__resources_path__ = os.path.join(
os.path.dirname(
Expand Down Expand Up @@ -455,3 +459,39 @@ def _safe_cast_int(value: Union[str, int, float]) -> int:
if isinstance(value, float) and not value.is_integer():
raise ValueError(f"Can't safely cast {value} to an int")
return int(value)


def _get_array_info(da: 'DocumentArray'):
all_attrs = da._get_attributes('non_empty_fields')
# remove underscore attribute
all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs]
attr_counter = Counter(all_attrs)

all_attrs_names = set(v for k in all_attrs for v in k)
_nested_in = []
if 'chunks' in all_attrs_names:
_nested_in.append('chunks')

if 'matches' in all_attrs_names:
_nested_in.append('matches')

is_homo = len(attr_counter) == 1

_nested_items = []
if not is_homo:
for n_attributes, n_docs in attr_counter.most_common():
if n_docs == 1:
_doc_text = f'{n_docs} Document has'
else:
_doc_text = f'{n_docs} Documents have'
if len(n_attributes) == 1:
_text = f'{_doc_text} one attribute'
elif len(n_attributes) == 0:
_text = f'{_doc_text} no attribute'
else:
_text = f'{_doc_text} attributes'
_nested_items.append(
dict(name=_text, value=str(n_attributes), description='')
)

return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names
23 changes: 23 additions & 0 deletions docs/fundamentals/documentarray/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -393,3 +393,26 @@ The maximum size of an upload is 4GB under the `protocol='protobuf'` and `compre

To avoid unnecessary download when upstream DocumentArray is unchanged, you can add `DocumentArray.pull(..., local_cache=True)`.

Furthermore, it is possible to list all `DocumentArray` objects stored on the cloud using:
```python
DocumentArray.cloud_list(show_table=True)
```

```text
You have 1 DocumentArray on the cloud
Name Length Access Created at Updated at
────────────────────────────────────────────────────────────────────────────────
da123 10 public 2022-09-15T07:14:54.256Z 2022-09-15T07:14:54.256Z
['da123']
```

```{tip}
Use the `show_table` parameter to show a table summarizing information about the DocumentArrays in the cloud.
```

It is also possible to delete DocumentArray objects in the cloud using:
```python
DocumentArray.cloud_delete('da123')
```
Loading

0 comments on commit 14526db

Please sign in to comment.