Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remote NetCDF TableDAP #801

Merged
merged 4 commits into from
May 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions compliance_checker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

from ._version import get_versions

from contextlib import contextmanager
from tempfile import NamedTemporaryFile
from typing import BinaryIO, Generator

__version__ = get_versions()["version"]
del get_versions
Expand All @@ -20,3 +23,30 @@ class MemoizedDataset(Dataset):
@lru_cache(128)
def get_variables_by_attributes(self, **kwargs):
    """
    Memoized lookup of variables by attribute values.

    Delegates to the parent class implementation and lets ``lru_cache``
    remember up to 128 distinct calls, so repeated queries with the same
    keyword arguments are answered without re-running the lookup.

    Parameters
    ----------
    **kwargs : attribute filters, forwarded unchanged to the parent class.
               Because the call is cached, every value must be hashable.

    Returns
    -------
    whatever the parent class's get_variables_by_attributes returns
    """
    # NOTE(review): lru_cache is assumed to be functools.lru_cache; its
    # import is outside the visible part of the file -- confirm.
    parent = super()
    return parent.get_variables_by_attributes(**kwargs)

@contextmanager
def tempnc(data: bytes) -> Generator[str, None, None]:
    """
    Write raw NetCDF bytes to a named temporary file and yield its path.

    The file is a real on-disk NamedTemporaryFile (suffix ".nc", prefix
    "compliance-checker_"). It is closed, and therefore removed, as soon as
    the ``with`` block using this context manager exits, whether normally or
    through an exception.

    Type aliasing and tempfile creation credit to @ocefpaf
    https://github.com/ioos/compliance-checker/pull/799#discussion_r411420587

    Parameters
    ----------
    data (bytes): raw bytes to store in the NamedTemporaryFile

    Yields
    ------
    str: filesystem path of the temporary file; only valid inside the
         ``with`` block
    """
    # NOTE(review): a reviewer suggested a more explicit name such as
    # create_temporary_ncfile; the name is kept so existing callers and
    # imports of ``tempnc`` keep working.
    with NamedTemporaryFile(suffix=".nc", prefix="compliance-checker_") as tmp:
        tmp.write(data)
        # flush so that a reader opening the file by name sees all the bytes
        tmp.flush()
        yield tmp.name
14 changes: 14 additions & 0 deletions compliance_checker/protocols/erddap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
def is_tabledap(url):
    """
    Report whether a URL refers to an ERDDAP TableDAP dataset.

    The decision is a plain, case-sensitive substring test: the URL is
    considered TableDAP when the text "tabledap" occurs anywhere in it.

    Parameters
    ----------
    url (str) : URL to dataset

    Returns
    -------
    bool
    """
    marker = "tabledap"
    return marker in url
28 changes: 28 additions & 0 deletions compliance_checker/protocols/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
Functions to assist in determining if the URL points to a netCDF file
"""

import requests

def is_netcdf(url):
"""
Expand Down Expand Up @@ -57,3 +58,30 @@ def is_hdf5(file_buffer):
if file_buffer == b"\x89\x48\x44\x46":
return True
return False

def is_remote_netcdf(ds_str):
    """
    Check a remote path points to a NetCDF resource.

    Issues an HTTP HEAD request (following redirects, 10 second timeout) and
    inspects the Content-Type header of the response.

    Parameters
    ----------
    ds_str (str): remote path to a dataset

    Returns
    -------
    bool: True only when the HEAD request succeeds and the media type of the
          response is "application/x-netcdf"; False otherwise, including
          when the request fails.
    """

    # Some servers do not support HEAD requests. A failed HEAD (connection
    # error, timeout, invalid URL, HTTP error status) is treated as "not a
    # netCDF resource" rather than as an error, so the caller can go on to
    # try other protocols. Only requests' own exceptions are caught so that
    # KeyboardInterrupt / SystemExit are not swallowed.
    try:
        head_req = requests.head(ds_str, allow_redirects=True, timeout=10)
        head_req.raise_for_status()
    except requests.exceptions.RequestException:
        return False

    # Compare only the media type: ignore optional parameters such as
    # "; charset=..." and letter case, both of which are allowed to vary.
    content_type = head_req.headers.get("content-type") or ""
    media_type = content_type.split(";", 1)[0].strip().lower()

    # if the Content-Type header returned was "application/x-netcdf",
    # i.e. a netCDF file (not OPeNDAP), we can open this into a Dataset
    return media_type == "application/x-netcdf"
32 changes: 32 additions & 0 deletions compliance_checker/protocols/opendap.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,39 @@
Functions to assist in determining if the URL is an OPeNDAP endpoint
"""
import requests
import urllib.parse
import urllib.request

def create_DAP_variable_str(url, timeout=60):
    """
    Create a URL-encoded string of variables for a given DAP dataset.

    Fetches the dataset's DDS document from ``<url>.dds``, collects the name
    of every declared variable and joins the names with commas. Lines that
    open or close a container (any line containing "{" or "}") are ignored,
    so only the plain ``Type name;`` / ``Type name[dim = N];`` declarations
    contribute a name. Grid section labels are not specially handled.

    Parameters
    ----------
    url (str): endpoint to *DAP dataset (without the ".dds" suffix)
    timeout (float): seconds to wait for the DDS request before giving up

    Returns
    -------
    str: comma-separated variable names, percent-encoded with
         urllib.parse.quote (the commas themselves become "%2C")

    Raises
    ------
    urllib.error.URLError: if the DDS document cannot be retrieved
    """

    # get dds
    with urllib.request.urlopen(f"{url}.dds", timeout=timeout) as resp:
        dds_text = resp.read().decode()

    vars_only = []
    # splitlines() + strip() cope with "\r\n" line endings and tab indents
    for raw_line in dds_text.splitlines():
        line = raw_line.strip()

        # skip blank lines, the "Dataset {" header (also when the keyword
        # sits on a line of its own) and every container open/close line
        if not line or line == "Dataset" or "{" in line or "}" in line:
            continue

        # "Float32 temp[obs = 10];" -> "Float32 temp" -> ["Float32", "temp"]
        # drop the trailing ";" and any array dimensions, then the variable
        # name is the last whitespace-separated token
        tokens = line.rstrip(";").split("[", 1)[0].split()
        if tokens:
            vars_only.append(tokens[-1])

    # encode as proper URL characters
    return urllib.parse.quote(",".join(vars_only))

def is_opendap(url):
"""
Expand Down
44 changes: 32 additions & 12 deletions compliance_checker/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@
from netCDF4 import Dataset
from owslib.sos import SensorObservationService
from owslib.swe.sensor.sml import SensorML
from compliance_checker.protocols import opendap, netcdf, cdl, erddap
from datetime import datetime
from pkg_resources import working_set

from compliance_checker import MemoizedDataset
from compliance_checker import MemoizedDataset, tempnc
from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value
from compliance_checker.cf.cf import CFBaseCheck
from compliance_checker.protocols import cdl, netcdf, opendap


# Ensure output is encoded as Unicode when checker output is redirected or piped
Expand Down Expand Up @@ -765,20 +766,39 @@ def load_remote_dataset(self, ds_str):
:param str ds_str: URL to the remote resource
"""

if opendap.is_opendap(ds_str):
if "tabledap" in ds_str: # ERDDAP TableDAP request
# modify ds_str to contain the full variable request
variables_str = opendap.create_DAP_variable_str(ds_str)

# join to create a URL to an .ncCF resource
ds_str = "{}.ncCF?{}".format(ds_str, variables_str)

if netcdf.is_remote_netcdf(ds_str):
response = requests.get(ds_str, allow_redirects=True,
timeout=60)
try:
return MemoizedDataset(response.content, memory=response.content)
except OSError as e:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice

# handle case when netCDF C libs weren't compiled with
# in-memory support by using tempfile
with tempnc(response.content) as _nc:
return MemoizedDataset(_nc)

elif opendap.is_opendap(ds_str):
return Dataset(ds_str)
else:
# Check if the HTTP response is XML, if it is, it's likely SOS so
# we'll attempt to parse the response as SOS
response = requests.get(ds_str, allow_redirects=True)
if "text/xml" in response.headers["content-type"]:
return self.process_doc(response.content)

raise ValueError(
"Unknown service with content-type: {}".format(
response.headers["content-type"]
)
)

# some SOS servers don't seem to support HEAD requests.
# Issue GET instead if we reach here and can't get the response
response = requests.get(ds_str, allow_redirects=True,
timeout=60)
content_type = response.headers.get("content-type")
if content_type == "text/xml":
return self.process_doc(response.content)
else:
raise ValueError("Unknown service with content-type: {}".format(content_type))

def load_local_dataset(self, ds_str):
"""
Expand Down
11 changes: 11 additions & 0 deletions compliance_checker/tests/test_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@

@pytest.mark.integration
class TestProtocols(TestCase):

def test_netcdf_content_type(self):
    """
    A URL served with a Content-Type header of "application/x-netcdf"
    should load into a dataset object that the checks can run against.
    """
    url = 'https://gliders.ioos.us/erddap/tabledap/amelia-20180501T0000.ncCF?&time%3E=max(time)-1%20hour'
    # load straight from the remote resource; anything other than None
    # means the suite managed to open it
    loaded = CheckSuite().load_dataset(url)
    assert loaded is not None

def test_erddap(self):
"""
Tests that a connection can be made to ERDDAP's GridDAP
Expand Down