feat: push meta data along with docarray 0.16 (#490)

* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray (#491)
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* feat: push meta data along with docarray
* chore: login not required
* Revert "chore: login not required"
  This reverts commit 7b008c1.
* refactor: reuse common code
* chore: rename client
* docs: document cloud operations
* chore: login not required
* chore: apply suggestions
* refactor: reuse code
* chore: better naming

Co-authored-by: Alaeddine Abdessalem <[email protected]>
docarray · Sep 16, 2022 · 14526db · 14526db
1 parent ea2a7a8
commit 14526db
Show file tree

Hide file tree

Showing 11 changed files with 281 additions and 48 deletions.
diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
@@ -56,6 +56,8 @@ jobs:
             -v -s -m "not gpu" ${{ matrix.test-path }}
           echo "::set-output name=codecov_flag::docarray"
         timeout-minutes: 30
+        env:
+          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
       - name: Check codecov file
         id: check_files
         uses: andstor/file-existence-action@v1

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -145,6 +145,8 @@ jobs:
             -v -s -m "not gpu" ${{ matrix.test-path }}
           echo "::set-output name=codecov_flag::docarray"
         timeout-minutes: 30
+        env:
+          JINA_AUTH_TOKEN: "${{ secrets.JINA_AUTH_TOKEN }}"
       - name: Check codecov file
         id: check_files
         uses: andstor/file-existence-action@v1

diff --git a/docarray/array/mixins/io/pushpull.py b/docarray/array/mixins/io/pushpull.py
@@ -1,24 +1,159 @@
+import json
 import os
+import os.path
 import warnings
+from collections import Counter
 from pathlib import Path
-from typing import Dict, Type, TYPE_CHECKING, Optional
+from typing import Dict, Type, TYPE_CHECKING, List, Optional, Any
 
-from docarray.helper import get_request_header, __cache_path__
+import hubble
+from hubble import Client as HubbleClient
+from hubble.client.endpoints import EndpointsV2
+
+
+from docarray.helper import get_request_header, __cache_path__, _get_array_info
 
 if TYPE_CHECKING:
     from docarray.typing import T
 
 
+def _get_length_from_summary(summary: List[Dict]) -> Optional[int]:
+    """Get the length from summary."""
+    for item in summary:
+        if 'Length' == item['name']:
+            return item['value']
+
+
 class PushPullMixin:
     """Transmitting :class:`DocumentArray` via Jina Cloud Service"""
 
     _max_bytes = 4 * 1024 * 1024 * 1024
 
+    @staticmethod
+    def cloud_list(show_table: bool = False) -> List[str]:
+        """List all available arrays in the cloud.
+
+        :param show_table: if true, show the table of the arrays.
+        :returns: List of available DocumentArray's names.
+        """
+        from rich import print
+
+        result = []
+        from rich.table import Table
+        from rich import box
+
+        resp = HubbleClient(jsonify=True).list_artifacts(
+            filter={'type': 'documentArray'}, sort={'createdAt': 1}
+        )
+
+        table = Table(
+            title=f'You have {resp["meta"]["total"]} DocumentArray on the cloud',
+            box=box.SIMPLE,
+            highlight=True,
+        )
+        table.add_column('Name')
+        table.add_column('Length')
+        table.add_column('Access')
+        table.add_column('Created at', justify='center')
+        table.add_column('Updated at', justify='center')
+
+        for da in resp['data']:
+            result.append(da['name'])
+
+            table.add_row(
+                da['name'],
+                str(_get_length_from_summary(da['metaData'].get('summary', []))),
+                da['visibility'],
+                da['createdAt'],
+                da['updatedAt'],
+            )
+
+        if show_table:
+            print(table)
+        return result
+
+    @staticmethod
+    def cloud_delete(name: str) -> None:
+        """
+        Delete a DocumentArray from the cloud.
+        :param name: the name of the DocumentArray to delete.
+        """
+        HubbleClient(jsonify=True).delete_artifact(name=name)
+
+    def _get_raw_summary(self) -> List[Dict[str, Any]]:
+        (
+            is_homo,
+            _nested_in,
+            _nested_items,
+            attr_counter,
+            all_attrs_names,
+        ) = _get_array_info(self)
+
+        items = [
+            dict(
+                name='Type',
+                value=self.__class__.__name__,
+                description='The type of the DocumentArray',
+            ),
+            dict(
+                name='Length',
+                value=len(self),
+                description='The length of the DocumentArray',
+            ),
+            dict(
+                name='Homogenous Documents',
+                value=is_homo,
+                description='Whether all documents are of the same structure, attributes',
+            ),
+            dict(
+                name='Common Attributes',
+                value=list(attr_counter.items())[0][0] if attr_counter else None,
+                description='The common attributes of all documents',
+            ),
+            dict(
+                name='Has nested Documents in',
+                value=tuple(_nested_in),
+                description='The field that contains nested Documents',
+            ),
+            dict(
+                name='Multimodal dataclass',
+                value=all(d.is_multimodal for d in self),
+                description='Whether all documents are multimodal',
+            ),
+            dict(
+                name='Subindices', value=tuple(getattr(self, '_subindices', {}).keys())
+            ),
+        ]
+
+        items.append(
+            dict(
+                name='Inspect attributes',
+                value=_nested_items,
+                description='Quick overview of attributes of all documents',
+            )
+        )
+
+        storage_infos = self._get_storage_infos()
+        _nested_items = []
+        if storage_infos:
+            for k, v in storage_infos.items():
+                _nested_items.append(dict(name=k, value=v))
+        items.append(
+            dict(
+                name='Storage backend',
+                value=_nested_items,
+                description='Quick overview of the Document Store',
+            )
+        )
+
+        return items
+
     def push(
         self,
         name: str,
         show_progress: bool = False,
         public: bool = True,
+        branding: Optional[Dict] = None,
     ) -> Dict:
         """Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.pull`
 
@@ -33,6 +168,7 @@ def push(
         :param show_progress: whether to show a progress bar while pushing
         :param public: by default anyone can pull a DocumentArray if they know its name.
             Setting this to False will allow only the creator to pull it. This feature of course requires you to log in first.
+        :param branding: a dict of branding information to be sent to Jina Cloud. {"icon": "emoji", "background": "#fff"}
         """
         import requests
 
@@ -47,11 +183,14 @@ def push(
                 'name': name,
                 'type': 'documentArray',
                 'public': public,
+                'metaData': json.dumps(
+                    {'summary': self._get_raw_summary(), 'branding': branding},
+                    sort_keys=True,
+                ),
             }
         )
 
         headers = {'Content-Type': ctype, **get_request_header()}
-        import hubble
 
         auth_token = hubble.get_token()
         if auth_token:
@@ -98,11 +237,9 @@ def _get_chunk(_batch):
             yield _tail
 
         with pbar:
-            from hubble import Client
-            from hubble.client.endpoints import EndpointsV2
 
             response = requests.post(
-                Client()._base_url + EndpointsV2.upload_artifact,
+                HubbleClient()._base_url + EndpointsV2.upload_artifact,
                 data=gen(),
                 headers=headers,
             )
@@ -133,17 +270,12 @@ def pull(
 
         headers = {}
 
-        import hubble
-
         auth_token = hubble.get_token()
 
         if auth_token:
             headers['Authorization'] = f'token {auth_token}'
 
-        from hubble import Client
-        from hubble.client.endpoints import EndpointsV2
-
-        url = Client()._base_url + EndpointsV2.download_artifact + f'?name={name}'
+        url = HubbleClient()._base_url + EndpointsV2.download_artifact + f'?name={name}'
         response = requests.get(url, headers=headers)
 
         if response.ok:
@@ -183,3 +315,6 @@ def pull(
                     fp.write(_source.content)
 
             return r
+
+    cloud_push = push
+    cloud_pull = pull
diff --git a/docarray/array/mixins/plot.py b/docarray/array/mixins/plot.py
@@ -11,6 +11,8 @@
 
 import numpy as np
 
+from docarray.helper import _get_array_info
+
 
 class PlotMixin:
     """Helper functions for plotting the arrays."""
@@ -37,44 +39,28 @@ def summary(self):
         tables = []
         console = Console()
 
-        all_attrs = self._get_attributes('non_empty_fields')
-        # remove underscore attribute
-        all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs]
-        attr_counter = Counter(all_attrs)
+        (
+            is_homo,
+            _nested_in,
+            _nested_items,
+            attr_counter,
+            all_attrs_names,
+        ) = _get_array_info(self)
 
         table = Table(box=box.SIMPLE, highlight=True)
         table.show_header = False
         table.add_row('Type', self.__class__.__name__)
         table.add_row('Length', str(len(self)))
-        is_homo = len(attr_counter) == 1
         table.add_row('Homogenous Documents', str(is_homo))
 
-        all_attrs_names = set(v for k in all_attrs for v in k)
-        _nested_in = []
-        if 'chunks' in all_attrs_names:
-            _nested_in.append('chunks')
-
-        if 'matches' in all_attrs_names:
-            _nested_in.append('matches')
-
         if _nested_in:
             table.add_row('Has nested Documents in', str(tuple(_nested_in)))
 
         if is_homo:
             table.add_row('Common Attributes', str(list(attr_counter.items())[0][0]))
-        else:
-            for _a, _n in attr_counter.most_common():
-                if _n == 1:
-                    _doc_text = f'{_n} Document has'
-                else:
-                    _doc_text = f'{_n} Documents have'
-                if len(_a) == 1:
-                    _text = f'{_doc_text} one attribute'
-                elif len(_a) == 0:
-                    _text = f'{_doc_text} no attribute'
-                else:
-                    _text = f'{_doc_text} attributes'
-                table.add_row(_text, str(_a))
+
+        for item in _nested_items:
+            table.add_row(item['name'], item['value'])
 
         is_multimodal = all(d.is_multimodal for d in self)
         table.add_row('Multimodal dataclass', str(is_multimodal))

diff --git a/docarray/helper.py b/docarray/helper.py
@@ -6,7 +6,11 @@
 import uuid
 import warnings
 from os.path import expanduser
-from typing import Any, Dict, Optional, Sequence, Tuple, Union
+from typing import Any, Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING
+from collections import Counter
+
+if TYPE_CHECKING:
+    from docarray import DocumentArray
 
 __resources_path__ = os.path.join(
     os.path.dirname(
@@ -455,3 +459,39 @@ def _safe_cast_int(value: Union[str, int, float]) -> int:
     if isinstance(value, float) and not value.is_integer():
         raise ValueError(f"Can't safely cast {value} to an int")
     return int(value)
+
+
+def _get_array_info(da: 'DocumentArray'):
+    all_attrs = da._get_attributes('non_empty_fields')
+    # remove underscore attribute
+    all_attrs = [tuple(vv for vv in v if not vv.startswith('_')) for v in all_attrs]
+    attr_counter = Counter(all_attrs)
+
+    all_attrs_names = set(v for k in all_attrs for v in k)
+    _nested_in = []
+    if 'chunks' in all_attrs_names:
+        _nested_in.append('chunks')
+
+    if 'matches' in all_attrs_names:
+        _nested_in.append('matches')
+
+    is_homo = len(attr_counter) == 1
+
+    _nested_items = []
+    if not is_homo:
+        for n_attributes, n_docs in attr_counter.most_common():
+            if n_docs == 1:
+                _doc_text = f'{n_docs} Document has'
+            else:
+                _doc_text = f'{n_docs} Documents have'
+            if len(n_attributes) == 1:
+                _text = f'{_doc_text} one attribute'
+            elif len(n_attributes) == 0:
+                _text = f'{_doc_text} no attribute'
+            else:
+                _text = f'{_doc_text} attributes'
+            _nested_items.append(
+                dict(name=_text, value=str(n_attributes), description='')
+            )
+
+    return is_homo, _nested_in, _nested_items, attr_counter, all_attrs_names
diff --git a/docs/fundamentals/documentarray/serialization.md b/docs/fundamentals/documentarray/serialization.md
@@ -393,3 +393,26 @@ The maximum size of an upload is 4GB under the `protocol='protobuf'` and `compre
 
 To avoid unnecessary download when upstream DocumentArray is unchanged, you can add `DocumentArray.pull(..., local_cache=True)`.
 
+Furthermore, it is possible to list all `DocumentArray` objects stored on the cloud using:
+```python
+DocumentArray.cloud_list(show_table=True)
+```
+
+```text
+                      You have 1 DocumentArray on the cloud                       
+                                                                                  
+  Name     Length   Access          Created at                 Updated at         
+ ──────────────────────────────────────────────────────────────────────────────── 
+  da123    10       public   2022-09-15T07:14:54.256Z   2022-09-15T07:14:54.256Z  
+                                                                                  
+['da123']
+```
+
+```{tip}
+Pass `show_table=True` to print a table summarizing the DocumentArrays in the cloud; the list of names is returned either way.
+```
+
+It is also possible to delete DocumentArray objects in the cloud using:
+```python
+DocumentArray.cloud_delete('da123')
+```