Skip to content

Commit

Permalink
ADBC: add spatial support for DuckDB databases and GeoParquet
Browse files Browse the repository at this point in the history
- Automatically load the duckdb_spatial extension when it is installed
  and the dataset is DuckDB or Parquet
- Retrieve geometries (GEOMETRY type) as OGR geometries
- Read GeoParquet metadata to figure out spatial extent, CRS and
  geometry type per geometry column
- Use duckdb_spatial ST_Intersects() for faster spatial filtering
  (when done with OGRLayer::SetSpatialFilter()), potentially
  leveraging DuckDB RTree when it is available.
- Use GeoParquet bounding box column in complement to above
- Pass-through forwarding of WHERE clauses expressed through
  OGRLayer::SetAttributeFilter()
  • Loading branch information
rouault committed Dec 9, 2024
1 parent 04b227e commit e7332ae
Show file tree
Hide file tree
Showing 9 changed files with 1,053 additions and 58 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/linux_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ jobs:
# For cache
mkdir -p .gdal
mkdir -p .duckdb
docker run \
-e CI \
Expand All @@ -355,6 +356,7 @@ jobs:
--add-host=host.docker.internal:host-gateway \
--rm \
-v $(pwd)/.gdal:/.gdal \
-v $(pwd)/.duckdb:/.duckdb \
-v $(pwd):$(pwd) \
--workdir $(pwd)/build-${{ matrix.id }} \
${CONTAINER_NAME_FULL} \
Expand Down
6 changes: 4 additions & 2 deletions apps/test_ogrsf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4057,14 +4057,16 @@ static int64_t CountFeaturesUsingArrowStream(OGRLayer *poLayer,
if (nExpectedFID >= 0 && !bExpectedFIDFound)
{
bOK = false;
printf("ERROR: expected to find feature of id %" PRId64
printf("ERROR: CountFeaturesUsingArrowStream() :"
"expected to find feature of id %" PRId64
", but did not get it\n",
nExpectedFID);
}
if (nUnexpectedFID >= 0 && bUnexpectedFIDFound)
{
bOK = false;
printf("ERROR: expected *not* to find feature of id %" PRId64
printf("ERROR: CountFeaturesUsingArrowStream(): "
"expected *not* to find feature of id %" PRId64
", but did get it\n",
nUnexpectedFID);
}
Expand Down
Binary file not shown.
156 changes: 138 additions & 18 deletions autotest/ogr/ogr_adbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,29 +231,97 @@ def test_ogr_adbc_duckdb_parquet_with_sql_open_option():
###############################################################################


def test_ogr_adbc_duckdb_parquet_with_spatial():
@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"])
def test_ogr_adbc_duckdb_parquet_with_spatial(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL):

if not _has_libduckdb():
pytest.skip("libduckdb.so missing")

if gdaltest.is_travis_branch("ubuntu_2404"):
# Works locally for me when replicating the Dockerfile ...
pytest.skip("fails on ubuntu_2404 for unknown reason")
with gdal.config_option(
"OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL
):
with gdal.OpenEx(
"data/parquet/poly.parquet",
gdal.OF_VECTOR,
allowed_drivers=["ADBC"],
open_options=[
"PRELUDE_STATEMENTS=INSTALL spatial",
]
if OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL == "ON"
else [],
) as ds:
lyr = ds.GetLayer(0)
assert lyr.GetGeomType() == ogr.wkbPolygon
assert lyr.TestCapability(ogr.OLCFastGetExtent)
assert lyr.TestCapability(ogr.OLCFastSpatialFilter)
minx, maxx, miny, maxy = lyr.GetExtent()
assert (minx, maxx, miny, maxy) == (
478315.53125,
481645.3125,
4762880.5,
4765610.5,
)
assert lyr.GetExtent3D() == (
478315.53125,
481645.3125,
4762880.5,
4765610.5,
float("inf"),
float("-inf"),
)
assert lyr.GetSpatialRef().GetAuthorityCode(None) == "27700"
f = lyr.GetNextFeature()
assert f.GetGeometryRef().ExportToWkt().startswith("POLYGON ((")

with gdal.OpenEx(
"data/parquet/poly.parquet",
gdal.OF_VECTOR,
allowed_drivers=["ADBC"],
open_options=[
"PRELUDE_STATEMENTS=INSTALL spatial",
"PRELUDE_STATEMENTS=LOAD spatial",
],
) as ds:
assert lyr.GetFeatureCount() == 10
lyr.SetAttributeFilter("false")

assert lyr.GetFeatureCount() == 0
lyr.SetAttributeFilter("true")

lyr.SetAttributeFilter(None)
assert lyr.GetFeatureCount() == 10
lyr.SetSpatialFilterRect(minx, miny, maxx, maxy)
assert lyr.GetFeatureCount() == 10
lyr.SetSpatialFilterRect(minx, miny, minx, maxy)
assert lyr.GetFeatureCount() < 10
lyr.SetSpatialFilterRect(maxx, miny, maxx, maxy)
assert lyr.GetFeatureCount() < 10
lyr.SetSpatialFilterRect(minx, miny, maxx, miny)
assert lyr.GetFeatureCount() < 10
lyr.SetSpatialFilterRect(minx, maxy, maxx, maxy)
assert lyr.GetFeatureCount() < 10

lyr.SetAttributeFilter("true")
lyr.SetSpatialFilter(None)
assert lyr.GetFeatureCount() == 10
lyr.SetSpatialFilterRect(minx, miny, maxx, maxy)
assert lyr.GetFeatureCount() == 10

lyr.SetAttributeFilter("false")
lyr.SetSpatialFilterRect(minx, miny, maxx, maxy)
assert lyr.GetFeatureCount() == 0


###############################################################################


@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"])
def test_ogr_adbc_duckdb_with_spatial_index(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL):

if not _has_libduckdb():
pytest.skip("libduckdb.so missing")

with gdal.config_option(
"OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL
):
ds = ogr.Open("data/duckdb/poly_with_spatial_index.duckdb")
lyr = ds.GetLayer(0)
with ds.ExecuteSQL(
"SELECT ST_AsText(geometry) FROM read_parquet('data/parquet/poly.parquet')"
"SELECT 1 FROM duckdb_extensions() WHERE extension_name='spatial' AND loaded = true"
) as sql_lyr:
f = sql_lyr.GetNextFeature()
assert f.GetField(0).startswith("POLYGON")
spatial_loaded = sql_lyr.GetNextFeature() is not None
assert lyr.TestCapability(ogr.OLCFastSpatialFilter) == spatial_loaded


###############################################################################
Expand Down Expand Up @@ -325,6 +393,30 @@ def test_ogr_adbc_test_ogrsf_parquet_filename_with_glob():
assert "ERROR" not in ret


###############################################################################
# Run test_ogrsf on a GeoParquet file


@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"])
def test_ogr_adbc_test_ogrsf_geoparquet(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL):
    # Runs the external test_ogrsf utility in read-only mode on the poly
    # GeoParquet file through the ADBC driver, once per value of the
    # OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL configuration option.

    # Nothing can be tested without the DuckDB shared library.
    if not _has_libduckdb():
        pytest.skip("libduckdb.so missing")

    import test_cli_utilities

    # Skip when the test_ogrsf binary cannot be located.
    if test_cli_utilities.get_test_ogrsf_path() is None:
        pytest.skip()

    command_line = (
        test_cli_utilities.get_test_ogrsf_path()
        + f" -ro ADBC:data/parquet/poly.parquet --config OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL={OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL}"
    )
    output = gdaltest.runexternal(command_line)

    # test_ogrsf must report at least one INFO line and no ERROR line.
    assert "INFO" in output
    assert "ERROR" not in output


###############################################################################
# Test DATETIME_AS_STRING=YES GetArrowStream() option

Expand Down Expand Up @@ -359,7 +451,34 @@ def test_ogr_adbc_arrow_stream_numpy_datetime_as_string(tmp_vsimem):
# Run test_ogrsf on a DuckDB dataset


def test_ogr_adbc_test_ogrsf_duckdb():
@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"])
def test_ogr_adbc_test_ogrsf_duckdb(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL):
    # Runs the external test_ogrsf utility in read-only mode on the poly
    # DuckDB database through the ADBC driver, once per value of the
    # OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL configuration option.

    # Nothing can be tested without the DuckDB shared library.
    if not _has_libduckdb():
        pytest.skip("libduckdb.so missing")

    import test_cli_utilities

    # Skip when the test_ogrsf binary cannot be located.
    if test_cli_utilities.get_test_ogrsf_path() is None:
        pytest.skip()

    command_line = (
        test_cli_utilities.get_test_ogrsf_path()
        + f" -ro ADBC:data/duckdb/poly.duckdb --config OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL={OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL}"
    )
    output = gdaltest.runexternal(command_line)

    # test_ogrsf must report at least one INFO line and no ERROR line.
    assert "INFO" in output
    assert "ERROR" not in output


###############################################################################
# Run test_ogrsf on a DuckDB dataset with a spatial index


@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"])
def test_ogr_adbc_test_ogrsf_duckdb_with_spatial_index(
OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL,
):

if not _has_libduckdb():
pytest.skip("libduckdb.so missing")
Expand All @@ -370,7 +489,8 @@ def test_ogr_adbc_test_ogrsf_duckdb():
pytest.skip()

ret = gdaltest.runexternal(
test_cli_utilities.get_test_ogrsf_path() + " -ro ADBC:data/duckdb/poly.duckdb"
test_cli_utilities.get_test_ogrsf_path()
+ f" -ro ADBC:data/duckdb/poly_with_spatial_index.duckdb --config OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL={OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL}"
)

assert "INFO" in ret
Expand Down
15 changes: 14 additions & 1 deletion doc/source/drivers/vector/adbc.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ Consult the `installation instruction <https://arrow.apache.org/adbc/current/dri
for the various ADBC drivers. At time of writing, there are drivers for
SQLite3, PostgreSQL, Snowflake, BigQuery, DuckDB, Flight SQL, etc.

The driver is read-only, and there is no support for spatial data currently.
The driver is read-only.

Spatial support is available when the underlying ADBC driver is DuckDB and
its spatial extension is installed. It covers native spatial DuckDB databases
and GeoParquet datasets.

Connection string
-----------------
Expand Down Expand Up @@ -106,6 +110,15 @@ GDAL ADBC driver as a way of locating and loading the ADBC driver if GDAL was
not built with ADBC Driver Manager support or if an embedding application has
an updated or augmented collection of drivers available.

Filtering
---------

Attribute filters are passed to the underlying ADBC engine.

Spatial filters are passed to DuckDB when it is the underlying ADBC engine,
for DuckDB spatial databases and GeoParquet datasets. The GeoParquet bounding
box column and/or DuckDB native RTree spatial indices are used when available.

Examples
--------

Expand Down
51 changes: 49 additions & 2 deletions ogr/ogrsf_frmts/adbc/ogr_adbc.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,24 @@ class OGRADBCDataset;
class OGRADBCLayer final : public OGRLayer,
public OGRGetNextFeatureThroughRaw<OGRADBCLayer>
{
public:
//! Describe the bbox column of a geometry column.
//! One string per bounding-box bound (X/Y minimum and maximum).
//! NOTE(review): the strings presumably identify the column(s) or
//! sub-fields that hold each bound -- confirm against the implementation.
struct GeomColBBOX
{
std::string osXMin{}; // empty if no bbox column
std::string osYMin{}; // Y minimum bound
std::string osXMax{}; // X maximum bound
std::string osYMax{}; // Y maximum bound
};

private:
friend class OGRADBCDataset;

OGRADBCDataset *m_poDS = nullptr;
const std::string m_osBaseStatement{}; // as provided by user
std::string m_osModifiedBaseStatement{}; // above tuned to use ST_AsWKB()
std::string m_osModifiedSelect{}; // SELECT part of above
std::string m_osAttributeFilter{};
std::unique_ptr<AdbcStatement> m_statement{};
std::unique_ptr<OGRArrowArrayToOGRFeatureAdapterLayer> m_poAdapterLayer{};
std::unique_ptr<OGRArrowArrayStream> m_stream{};
Expand All @@ -103,17 +118,27 @@ class OGRADBCLayer final : public OGRLayer,
GIntBig m_nFeatureID = 0;
bool m_bIsParquetLayer = false;

std::vector<GeomColBBOX>
m_geomColBBOX{}; // same size as GetGeomFieldCount()
std::vector<OGREnvelope3D> m_extents{}; // same size as GetGeomFieldCount()

OGRFeature *GetNextRawFeature();
bool GetArrowStreamInternal(struct ArrowArrayStream *out_stream);
GIntBig GetFeatureCountParquet();

void BuildLayerDefn(bool bInternalUse);
bool ReplaceStatement(const char *pszNewStatement);
bool UpdateStatement();
std::string GetCurrentStatement() const;

CPL_DISALLOW_COPY_ASSIGN(OGRADBCLayer)

public:
OGRADBCLayer(OGRADBCDataset *poDS, const char *pszName,
const char *pszStatement,
std::unique_ptr<AdbcStatement> poStatement,
std::unique_ptr<OGRArrowArrayStream> poStream,
ArrowSchema *schema);
ArrowSchema *schema, bool bInternalUse);
~OGRADBCLayer() override;

OGRFeatureDefn *GetLayerDefn() override
Expand All @@ -128,6 +153,20 @@ class OGRADBCLayer final : public OGRLayer,
bool GetArrowStream(struct ArrowArrayStream *out_stream,
CSLConstList papszOptions = nullptr) override;
GIntBig GetFeatureCount(int bForce) override;

//! Convenience overload without a geometry field index: forwards poGeom
//! to SetSpatialFilter(int, OGRGeometry *) with geometry field index 0.
void SetSpatialFilter(OGRGeometry *poGeom) override
{
SetSpatialFilter(0, poGeom);
}

OGRErr SetAttributeFilter(const char *pszFilter) override;
void SetSpatialFilter(int iGeomField, OGRGeometry *poGeom) override;

OGRErr GetExtent(OGREnvelope *psExtent, int bForce = TRUE) override;
OGRErr GetExtent(int iGeomField, OGREnvelope *psExtent,
int bForce = TRUE) override;
OGRErr GetExtent3D(int iGeomField, OGREnvelope3D *psExtent,
int bForce = TRUE) override;
};

/************************************************************************/
Expand All @@ -143,6 +182,8 @@ class OGRADBCDataset final : public GDALDataset
std::unique_ptr<AdbcConnection> m_connection{};
std::vector<std::unique_ptr<OGRLayer>> m_apoLayers{};
std::string m_osParquetFilename{};
bool m_bIsDuckDB = false;
bool m_bSpatialLoaded = false;

public:
OGRADBCDataset() = default;
Expand All @@ -164,7 +205,13 @@ class OGRADBCDataset final : public GDALDataset
OGRLayer *GetLayerByName(const char *pszName) override;

std::unique_ptr<OGRADBCLayer> CreateLayer(const char *pszStatement,
const char *pszLayerName);
const char *pszLayerName,
bool bInternalUse);

//! Convenience wrapper around CreateLayer(): builds a layer for
//! pszStatement with the fixed layer name "temp" and bInternalUse set to
//! true, and returns whatever CreateLayer() returns.
std::unique_ptr<OGRADBCLayer> CreateInternalLayer(const char *pszStatement)
{
return CreateLayer(pszStatement, "temp", true);
}

OGRLayer *ExecuteSQL(const char *pszStatement, OGRGeometry *poSpatialFilter,
const char *pszDialect) override;
Expand Down
Loading

0 comments on commit e7332ae

Please sign in to comment.