Skip to content

Commit

Permalink
Merge pull request #801 from daltonkell/remote_netcdf_tabledap
Browse files Browse the repository at this point in the history
Remote NetCDF TableDAP
  • Loading branch information
benjwadams authored May 4, 2020
2 parents ebe727e + 7b18688 commit 3dfe1f8
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 12 deletions.
30 changes: 30 additions & 0 deletions compliance_checker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

from ._version import get_versions

from contextlib import contextmanager
from tempfile import NamedTemporaryFile
from typing import BinaryIO, Generator

__version__ = get_versions()["version"]
del get_versions
Expand All @@ -20,3 +23,30 @@ class MemoizedDataset(Dataset):
@lru_cache(128)
def get_variables_by_attributes(self, **kwargs):
    # Memoize lookups: attribute-based variable searches are repeated many
    # times during a compliance run, so cache up to 128 distinct queries.
    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # every MemoizedDataset instance alive for the cache's lifetime --
    # acceptable only if these datasets are short-lived; confirm.
    return super(MemoizedDataset, self).get_variables_by_attributes(**kwargs)

@contextmanager
def tempnc(data: bytes) -> Generator[str, None, None]:
    """
    Write raw bytes to a temporary on-disk NetCDF file and yield its path.

    The file is created with a NamedTemporaryFile and is closed (and thus
    removed) automatically when the context exits, even if the body raises.
    Type aliasing and tempfile creation credit to @ocefpaf
    https://github.com/ioos/compliance-checker/pull/799#discussion_r411420587

    Parameters
    ----------
    data (bytes): raw bytes to store in the NamedTemporaryFile

    Yields
    ------
    str: filesystem path of the temporary ``.nc`` file
    """
    # NamedTemporaryFile is itself a context manager: it guarantees the
    # close/unlink that the previous try/finally did by hand, and there is
    # no need for a None sentinel.
    with NamedTemporaryFile(suffix=".nc", prefix="compliance-checker_") as tmp:
        tmp.write(data)
        # flush so other readers opening tmp.name see the full contents
        tmp.flush()
        yield tmp.name
14 changes: 14 additions & 0 deletions compliance_checker/protocols/erddap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
def is_tabledap(url):
    """
    Heuristically classify a dataset URL as an ERDDAP TableDAP endpoint.

    Parameters
    ----------
    url (str) : URL to dataset

    Returns
    -------
    bool: True when the URL contains the "tabledap" path segment
    """
    # ERDDAP serves tabular datasets under a ".../tabledap/..." path, so a
    # plain substring search is sufficient here.
    return url.find("tabledap") != -1
28 changes: 28 additions & 0 deletions compliance_checker/protocols/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Functions to assist in determining if the URL points to a netCDF file
"""

import requests

def is_netcdf(url):
"""
Expand Down Expand Up @@ -57,3 +58,30 @@ def is_hdf5(file_buffer):
if file_buffer == b"\x89\x48\x44\x46":
return True
return False

def is_remote_netcdf(ds_str):
    """
    Check that a remote path points to a NetCDF resource.

    Parameters
    ----------
    ds_str (str): remote path to a dataset

    Returns
    -------
    bool: True if the server reports Content-Type "application/x-netcdf"
    """

    # Some datasets do not support HEAD requests! The vast majority will,
    # however, support GET requests
    try:
        head_req = requests.head(ds_str, allow_redirects=True, timeout=10)
        head_req.raise_for_status()
    # Catch only request-related failures (connection errors, timeouts,
    # HTTP error statuses). The previous bare ``except:`` also swallowed
    # KeyboardInterrupt/SystemExit and hid programming errors.
    except requests.exceptions.RequestException:
        content_type = None
    else:
        content_type = head_req.headers.get("content-type")

    # if the Content-Type header returned was "application/x-netcdf",
    # or a netCDF file (not OPeNDAP) we can open this into a Dataset
    return content_type == "application/x-netcdf"
32 changes: 32 additions & 0 deletions compliance_checker/protocols/opendap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,39 @@
Functions to assist in determining if the URL is an OPeNDAP endpoint
"""
import requests
import urllib.parse
import urllib.request

def create_DAP_variable_str(url):
    """
    Create a URL-encoded, comma-joined string of variable names for a DAP
    dataset. Works on OPeNDAP datasets.

    Parameters
    ----------
    url (str): endpoint to *DAP dataset

    Returns
    -------
    str
    """

    # fetch the dataset descriptor structure; drop the leading 8-char
    # "Dataset " token
    with urllib.request.urlopen(f"{url}.dds") as resp:
        dds_body = resp.read().decode()[8:]

    # discard the brace lines that open/close the declaration block
    declaration_lines = [line for line in dds_body.split("\n")
                         if "{" not in line and "}" not in line]

    # strip the DDS padding and drop any lines left empty
    declarations = [stripped for stripped in
                    (line.strip(" ") for line in declaration_lines)
                    if stripped]

    # each declaration is "<type> <name>;" -- keep just the name
    names = [decl.split(" ")[-1].strip(";") for decl in declarations]

    # encode as proper URL characters
    return urllib.parse.quote(",".join(names))

def is_opendap(url):
"""
Expand Down
44 changes: 32 additions & 12 deletions compliance_checker/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@
from netCDF4 import Dataset
from owslib.sos import SensorObservationService
from owslib.swe.sensor.sml import SensorML
from compliance_checker.protocols import opendap, netcdf, cdl, erddap
from datetime import datetime
from pkg_resources import working_set

from compliance_checker import MemoizedDataset
from compliance_checker import MemoizedDataset, tempnc
from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value
from compliance_checker.cf.cf import CFBaseCheck
from compliance_checker.protocols import cdl, netcdf, opendap


# Ensure output is encoded as Unicode when checker output is redirected or piped
Expand Down Expand Up @@ -765,20 +766,39 @@ def load_remote_dataset(self, ds_str):
:param str ds_str: URL to the remote resource
"""

if opendap.is_opendap(ds_str):
if "tabledap" in ds_str: # ERDDAP TableDAP request
# modify ds_str to contain the full variable request
variables_str = opendap.create_DAP_variable_str(ds_str)

# join to create a URL to an .ncCF resource
ds_str = "{}.ncCF?{}".format(ds_str, variables_str)

if netcdf.is_remote_netcdf(ds_str):
response = requests.get(ds_str, allow_redirects=True,
timeout=60)
try:
return MemoizedDataset(response.content, memory=response.content)
except OSError as e:
# handle case when netCDF C libs weren't compiled with
# in-memory support by using tempfile
with tempnc(response.content) as _nc:
return MemoizedDataset(_nc)

elif opendap.is_opendap(ds_str):
return Dataset(ds_str)
else:
# Check if the HTTP response is XML, if it is, it's likely SOS so
# we'll attempt to parse the response as SOS
response = requests.get(ds_str, allow_redirects=True)
if "text/xml" in response.headers["content-type"]:
return self.process_doc(response.content)

raise ValueError(
"Unknown service with content-type: {}".format(
response.headers["content-type"]
)
)

# some SOS servers don't seem to support HEAD requests.
# Issue GET instead if we reach here and can't get the response
response = requests.get(ds_str, allow_redirects=True,
timeout=60)
content_type = response.headers.get("content-type")
if content_type == "text/xml":
return self.process_doc(response.content)
else:
raise ValueError("Unknown service with content-type: {}".format(content_type))

def load_local_dataset(self, ds_str):
"""
Expand Down
11 changes: 11 additions & 0 deletions compliance_checker/tests/test_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@

@pytest.mark.integration
class TestProtocols(TestCase):

def test_netcdf_content_type(self):
    """
    Verify that a remote resource served with a Content-Type header of
    "application/x-netcdf" can be read into memory for checks.
    """
    # live ERDDAP TableDAP endpoint that returns a netCDF (.ncCF) response
    endpoint = 'https://gliders.ioos.us/erddap/tabledap/amelia-20180501T0000.ncCF?&time%3E=max(time)-1%20hour'
    suite = CheckSuite()
    dataset = suite.load_dataset(endpoint)
    assert dataset is not None

def test_erddap(self):
"""
Tests that a connection can be made to ERDDAP's GridDAP
Expand Down

0 comments on commit 3dfe1f8

Please sign in to comment.