Skip to content

Commit

Permalink
Merge pull request #801 from daltonkell/remote_netcdf_tabledap
Browse files Browse the repository at this point in the history
Remote NetCDF TableDAP
  • Loading branch information
benjwadams authored May 4, 2020
2 parents ebe727e + 7b18688 commit 3dfe1f8
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 12 deletions.
30 changes: 30 additions & 0 deletions compliance_checker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

from ._version import get_versions

from contextlib import contextmanager
from tempfile import NamedTemporaryFile
from typing import BinaryIO, Generator

__version__ = get_versions()["version"]
del get_versions
Expand All @@ -20,3 +23,30 @@ class MemoizedDataset(Dataset):
@lru_cache(128)
def get_variables_by_attributes(self, **kwargs):
    # Memoize lookups: attribute-based variable searches are repeated many
    # times during a compliance run, so cache up to 128 distinct queries.
    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # every MemoizedDataset instance alive for the cache's lifetime --
    # acceptable only if these datasets are short-lived; confirm.
    return super(MemoizedDataset, self).get_variables_by_attributes(**kwargs)

@contextmanager
def tempnc(data: bytes) -> Generator[str, None, None]:
    """
    Write raw bytes to a temporary on-disk NetCDF file and yield its path.

    The file is created with a NamedTemporaryFile and is closed (and thus
    removed) automatically when the context exits, even if the body raises.
    Type aliasing and tempfile creation credit to @ocefpaf
    https://github.com/ioos/compliance-checker/pull/799#discussion_r411420587

    Parameters
    ----------
    data (bytes): raw bytes to store in the NamedTemporaryFile

    Yields
    ------
    str: filesystem path of the temporary ``.nc`` file
    """
    # NamedTemporaryFile is itself a context manager: it guarantees the
    # close/unlink that the previous try/finally did by hand, and there is
    # no need for a None sentinel.
    with NamedTemporaryFile(suffix=".nc", prefix="compliance-checker_") as tmp:
        tmp.write(data)
        # flush so other readers opening tmp.name see the full contents
        tmp.flush()
        yield tmp.name
14 changes: 14 additions & 0 deletions compliance_checker/protocols/erddap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
def is_tabledap(url):
    """
    Heuristically classify a dataset URL as an ERDDAP TableDAP endpoint.

    Parameters
    ----------
    url (str) : URL to dataset

    Returns
    -------
    bool: True when the URL contains the "tabledap" path segment
    """
    # ERDDAP serves tabular datasets under a ".../tabledap/..." path, so a
    # plain substring search is sufficient here.
    return url.find("tabledap") != -1
28 changes: 28 additions & 0 deletions compliance_checker/protocols/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Functions to assist in determining if the URL points to a netCDF file
"""

import requests

def is_netcdf(url):
"""
Expand Down Expand Up @@ -57,3 +58,30 @@ def is_hdf5(file_buffer):
if file_buffer == b"\x89\x48\x44\x46":
return True
return False

def is_remote_netcdf(ds_str):
    """
    Check that a remote path points to a NetCDF resource.

    Parameters
    ----------
    ds_str (str): remote path to a dataset

    Returns
    -------
    bool: True if the server reports Content-Type "application/x-netcdf"
    """

    # Some datasets do not support HEAD requests! The vast majority will,
    # however, support GET requests
    try:
        head_req = requests.head(ds_str, allow_redirects=True, timeout=10)
        head_req.raise_for_status()
    # Catch only request-related failures (connection errors, timeouts,
    # HTTP error statuses). The previous bare ``except:`` also swallowed
    # KeyboardInterrupt/SystemExit and hid programming errors.
    except requests.exceptions.RequestException:
        content_type = None
    else:
        content_type = head_req.headers.get("content-type")

    # if the Content-Type header returned was "application/x-netcdf",
    # or a netCDF file (not OPeNDAP) we can open this into a Dataset
    return content_type == "application/x-netcdf"
32 changes: 32 additions & 0 deletions compliance_checker/protocols/opendap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,39 @@
Functions to assist in determining if the URL is an OPeNDAP endpoint
"""
import requests
import urllib.parse
import urllib.request

def create_DAP_variable_str(url):
    """
    Create a URL-encoded, comma-joined string of variable names for a DAP
    dataset. Works on OPeNDAP datasets.

    Parameters
    ----------
    url (str): endpoint to *DAP dataset

    Returns
    -------
    str
    """

    # fetch the dataset descriptor structure; drop the leading 8-char
    # "Dataset " token
    with urllib.request.urlopen(f"{url}.dds") as resp:
        dds_body = resp.read().decode()[8:]

    # discard the brace lines that open/close the declaration block
    declaration_lines = [line for line in dds_body.split("\n")
                         if "{" not in line and "}" not in line]

    # strip the DDS padding and drop any lines left empty
    declarations = [stripped for stripped in
                    (line.strip(" ") for line in declaration_lines)
                    if stripped]

    # each declaration is "<type> <name>;" -- keep just the name
    names = [decl.split(" ")[-1].strip(";") for decl in declarations]

    # encode as proper URL characters
    return urllib.parse.quote(",".join(names))

def is_opendap(url):
"""
Expand Down
44 changes: 32 additions & 12 deletions compliance_checker/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@
from netCDF4 import Dataset
from owslib.sos import SensorObservationService
from owslib.swe.sensor.sml import SensorML
from compliance_checker.protocols import opendap, netcdf, cdl, erddap
from datetime import datetime
from pkg_resources import working_set

from compliance_checker import MemoizedDataset
from compliance_checker import MemoizedDataset, tempnc
from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value
from compliance_checker.cf.cf import CFBaseCheck
from compliance_checker.protocols import cdl, netcdf, opendap


# Ensure output is encoded as Unicode when checker output is redirected or piped
Expand Down Expand Up @@ -765,20 +766,39 @@ def load_remote_dataset(self, ds_str):
:param str ds_str: URL to the remote resource
"""

if opendap.is_opendap(ds_str):
if "tabledap" in ds_str: # ERDDAP TableDAP request
# modify ds_str to contain the full variable request
variables_str = opendap.create_DAP_variable_str(ds_str)

# join to create a URL to an .ncCF resource
ds_str = "{}.ncCF?{}".format(ds_str, variables_str)

if netcdf.is_remote_netcdf(ds_str):
response = requests.get(ds_str, allow_redirects=True,
timeout=60)
try:
return MemoizedDataset(response.content, memory=response.content)
except OSError as e:
# handle case when netCDF C libs weren't compiled with
# in-memory support by using tempfile
with tempnc(response.content) as _nc:
return MemoizedDataset(_nc)

elif opendap.is_opendap(ds_str):
return Dataset(ds_str)
else:
# Check if the HTTP response is XML, if it is, it's likely SOS so
# we'll attempt to parse the response as SOS
response = requests.get(ds_str, allow_redirects=True)
if "text/xml" in response.headers["content-type"]:
return self.process_doc(response.content)

raise ValueError(
"Unknown service with content-type: {}".format(
response.headers["content-type"]
)
)

# some SOS servers don't seem to support HEAD requests.
# Issue GET instead if we reach here and can't get the response
response = requests.get(ds_str, allow_redirects=True,
timeout=60)
content_type = response.headers.get("content-type")
if content_type == "text/xml":
return self.process_doc(response.content)
else:
raise ValueError("Unknown service with content-type: {}".format(content_type))

def load_local_dataset(self, ds_str):
"""
Expand Down
11 changes: 11 additions & 0 deletions compliance_checker/tests/test_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@

@pytest.mark.integration
class TestProtocols(TestCase):

def test_netcdf_content_type(self):
    """
    Verify that a remote resource served with a Content-Type header of
    "application/x-netcdf" can be read into memory for checks.
    """
    # live ERDDAP TableDAP endpoint that returns a netCDF (.ncCF) response
    endpoint = 'https://gliders.ioos.us/erddap/tabledap/amelia-20180501T0000.ncCF?&time%3E=max(time)-1%20hour'
    suite = CheckSuite()
    dataset = suite.load_dataset(endpoint)
    assert dataset is not None

def test_erddap(self):
"""
Tests that a connection can be made to ERDDAP's GridDAP
Expand Down

0 comments on commit 3dfe1f8

Please sign in to comment.