From e7332ae72ce3f9117318b66cf9257f25dec7b3e2 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 7 Dec 2024 17:53:40 +0100 Subject: [PATCH] ADBC: add spatial support for DuckDB databases and GeoParquet - Automate loading duckdb_spatial extension when installed, and when the dataset is DuckDB or Parquet - Retrieve geometries (GEOMETRY type) as OGR geometries - Read GeoParquet metadata to figure out spatial extent, CRS and geometry type per geometry column - Use duckdb_spatial ST_Intersects() for faster spatial filtering (when done with OGRLayer::SetSpatialFilter()), potentially leveraging DuckDB RTree when it is available. - Use GeoParquet bounding box column in complement to above - Passthrough forward of WHERE claused expresse through OGRLayer::SetAttributeFilter() --- .github/workflows/linux_build.yml | 2 + apps/test_ogrsf.cpp | 6 +- .../duckdb/poly_with_spatial_index.duckdb | Bin 0 -> 798720 bytes autotest/ogr/ogr_adbc.py | 156 +++- doc/source/drivers/vector/adbc.rst | 15 +- ogr/ogrsf_frmts/adbc/ogr_adbc.h | 51 +- ogr/ogrsf_frmts/adbc/ogradbcdataset.cpp | 142 +++- ogr/ogrsf_frmts/adbc/ogradbclayer.cpp | 738 +++++++++++++++++- port/cpl_known_config_options.h | 1 + 9 files changed, 1053 insertions(+), 58 deletions(-) create mode 100644 autotest/ogr/data/duckdb/poly_with_spatial_index.duckdb diff --git a/.github/workflows/linux_build.yml b/.github/workflows/linux_build.yml index 678873756a39..8f6e304f5c2e 100644 --- a/.github/workflows/linux_build.yml +++ b/.github/workflows/linux_build.yml @@ -342,6 +342,7 @@ jobs: # For cache mkdir -p .gdal + mkdir -p .duckdb docker run \ -e CI \ @@ -355,6 +356,7 @@ jobs: --add-host=host.docker.internal:host-gateway \ --rm \ -v $(pwd)/.gdal:/.gdal \ + -v $(pwd)/.duckdb:/.duckdb \ -v $(pwd):$(pwd) \ --workdir $(pwd)/build-${{ matrix.id }} \ ${CONTAINER_NAME_FULL} \ diff --git a/apps/test_ogrsf.cpp b/apps/test_ogrsf.cpp index 0b4e72b6e9d4..adbb5b717d52 100644 --- a/apps/test_ogrsf.cpp +++ b/apps/test_ogrsf.cpp @@ -4057,14 +4057,16 @@ static int64_t CountFeaturesUsingArrowStream(OGRLayer *poLayer, if (nExpectedFID >= 0 && !bExpectedFIDFound) { bOK = false; - printf("ERROR: expected to find feature of id %" PRId64 + printf("ERROR: CountFeaturesUsingArrowStream() :" + "expected to find feature of id %" PRId64 ", but did not get it\n", nExpectedFID); } if (nUnexpectedFID >= 0 && bUnexpectedFIDFound) { bOK = false; - printf("ERROR: expected *not* to find feature of id %" PRId64 + printf("ERROR: CountFeaturesUsingArrowStream(): " + "expected *not* to find feature of id %" PRId64 ", but did get it\n", nUnexpectedFID); } diff --git a/autotest/ogr/data/duckdb/poly_with_spatial_index.duckdb b/autotest/ogr/data/duckdb/poly_with_spatial_index.duckdb new file mode 100644 index 0000000000000000000000000000000000000000..f5c7a6246f90ca0d58b4b6e0cf3fb30ef91764f7 GIT binary patch literal 798720 zcmeI*4X|8QeE{(L@uM3%>=krbtoY{+{_9*{4H3Qjw2 z#oCUY6kbb}LCb8ZUr-mn7NB z)!DM_zusiVrj6I!Ft~PGSg<|mNoMvWGtOSN^z7dB{i}QPT(%{LZBM3WIjl{Rby;3L z$stS6e)r1r&dE|8F?9K?OP0NJ={aHZ*9=`Tdu{K=^}SoJy&>CVUY>DUj@jofz4*eV zmo9s6XYRkzPQ3EUD>q)BZPAl&F*7Y_lFYb%ecy(zbm1}CR%v`gA-;b7s_Q$~4~d8F zDSMygUW<>kM1L`B_wF^xob*Vul~b%OC@uj41PBlyK!5-N0t5&g@B-P4zc-n)CY{Z8 zW%KT?b?Lh#oi(Rl;a{6<%NF-0FZ}2mPyNa6-Pt@pjh_7czy9SR*(^7EWYgew1|QBc znAVX&o+kfiFj<=Xk}TyR9Vz3kZcj@Q_w!Hb+wd6QPHrCdfMzc`eg5J_i_Un<_9VPr zyl~N@N%C0I^{r&~e(i4BaDKC=#Xp}&ch`Q~-rbAoX9n40)Q309gy{z~9oqHZ8Lnh` zlFeHODto}^u4tt}c#Lva*=IWK>Q7&{jl8F>%lo_c?CJKtEZCnt?HqYex9u8oOn067 z+DME4IP@7bF124Cj@(>H$J=IY zM_}Iq&aZtNP307m?fNp6Ehl5yoxhW@R+OnNFxK`t+4`%Ao@ow0v)JeVR2$9IqFn`( z<#nu0VWOuZldY*umU^13<(TMGaa_ke(T#Lmg__unoHrBeyRY2G+K=ulzvB9-UcXbl z{KfhZcM42&`|sS_fgd?v4(vJ}VT#fI>$wsCkZvl@F($euC*>G)q7PdB1SYA#SjPgc zb(6Frr%187@~@+3SJa6u@I<%MvF1P4^b@Ni_k9VduYCzw|EmPX+J1(gW29f3DSq{7 zgfGbKop6Njx9pv8gs-RUop8i&m&C$Hk9@=Q3tZ_PSccq>B>gH=N4%*J|5y6usE!1W zq+eg^Sa4r@>z4d>CcnzvIsya;5FkK+0D;LO5C(~Z^Zxb-x&4~`oL~EWyUdNaQ9JfZ zfB*pk1PBlyK!5-N0t5(5PJ#BkKfSNmko)-`-M3-Q#%tHF+S0di!^-Pcuj@_k@87<= z|GsYee{6Qs|CJx?+p=!ub(_*0R$a5QZ$nS-^?nB+v(9dmSBsBXLw^TgUG^ggN9A^V zewLuGC;e4|cIxyNiz)rB$EAO_n5@ZvF(JRvTzUt_t}h(dmoCiyhup2lg$iAr-_tMs zk%d9^tCF`ZNq=$S(xs*IE;?uFdrEuls&w&1`NGojcb|9Bxus>7E?c^^bW*!8C;!2= zVT!js{G`KUN%F0vD?NZ2y~$%;+mdPdFF)-0F^1i{zms%L|6VdP&oe9GkhV5w)7q7* zS8Z8!&BnE{jSYSZH95~2r6NFp009C72oNAZfWQ?7W{-2MEA-f%rku(N-M z*!3niyPv=N`Ny^~>R&_1+K~JCPr-guk@rX7VEdZ!Snmp3{6o9_u^w*QgUx?9gQ>J^ ztX~5j>#tIVi(e3|W z^K`x(*fl=F6vO@e$5HK*atu0-5+b>wqOm}+ZQAx5cE!KCZCCRq5R5zzbe%!3ITxp?hos-6?)VefX zmUdcR>Li)Is8#vp+WzhxuD?6oG|7&vef;U$#=Oe$XQ!{VS*I?`Uh7Yuo5#tk zLR_uBGmp!M$M~)g*LI(hr|;Olz2BFTR^^se{oOBoyj6L2$bNq_-9dbGY?m&p7V@nx zIVaz5b!XU5`Gj&P$Bq!!PCPrr_r`cfh->|^T%USJzCPI;;$;4kJZ`)%#^;B)wminW zLtHu|Zhv&xe{Eri%P)qwxg^BR=7o8Fr9Y4ByDkp-|5=D@TQ17u`fYLjond|QnYjL= zmxS%V6yjuOh#L>a^jEwqUtj)Ch?_AkJs#rH?3n)8SZ|Mn{nd^Naq_`XzDBIqhr(Lg~D77kGm-lzyb?S4Ko30vcPiM7jH9Zaf_Yd=Y zYe&X%yymj7-x(pUE?l0+rC849wU_2`ImWfaL)@4d($_bH?W<3R<8FL6fsMqHD7{4>b)w^Q)!$bPo_PG9aAuc~2 z;^YG%u09pluMOLmej4KP%HWYq569bhOYmyG6w`kuc$ap?^v}L0cqK9Yqwme*+RPZg zeMKHu&y0R|zAulP%i{XSLtMW+#MOtc$k!+B^JGU@U;1uX-~3)o|KD-@6Jq*-5GNlC zapPw({ntX=d_KhGABVX5ix^+HGL-kFnEs~TP`=lN_Er9Jh^r^W_W8jWpBUoW?3n({ z5GSAR3ERIb#P!2MT)!~JpIDu*Z=4sm|K^o>Tz*Gf|Kkvs7KXTbN;O|!dwqz@GecZj z6xTl-JuV4xBMEU5^GjC!QP^M1ule!I^Ef#*q%W-tas8AKH=hdinY-TgXj7~5{E&9> zs=;>AJU?8As#k=3N?qYP)L0SsyQ@32*ULg&TM^@BG5&t&Ps{V$+sE-qyM135#v{$& zjp;87aibLC=8O<0$HeWoE)DJUuo&+Q*Y#@qdQ%v86vrjS@lbKR)N$QR8H(4{X5qXj zogI9d7sT=o^=dmm@XSEvioX8t+jb39K6LfqbvGaU2J7aIGxOuBuMhc@-}JUTZoDzZ z*S$T&v0o`&7}v-8EzLVCUtiuB%P};+b~!e09jLrE?FY_0v{iXvNIB*W)_3Dmp&ZQ* zgnW|EzBSJ`xjMw9FD?jiEJwMtD35EqLjPRb8REvzL!2CadcM9H>!PzYyZ;-61Z0 zI^?@W>;DnzrL;AsuZD4M{c|BM zj&qCS;9@-&+d)TtwCnM~pAS^-UDMw^@9I|N;q`;<>G;8Vtp0u+r+hd#R&NUPglcSG zrCISjToUW~o-lr{$GGwBF#ayJ*FPP`=gszY@Q!fY^~GWPa-n?|kE?jx#rZ+0a2}Tm z=XtRos1^E&QlTFyb#?YLwL(8s?5CQAer&IPZs>WPGM87!^L|S>{`${Czu4Rw;^sv$ z{`(l;6UOJ&uY|bv$M1)Xvi>l;wq@7ZY)q4lceA-}_BlrO8A?`{*?E)onc4L( zyB25To@`8<&4;tkI{@?EQV{X3lIr}I=b5X0Zxv#(b{+nBsDz~ia?|$Lqt;)0O`nwN}%crNu^~*H>9pzS~o-U7z zy$~QkfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7csfrCom F{{b`gRU!ZY literal 0 HcmV?d00001 diff --git a/autotest/ogr/ogr_adbc.py b/autotest/ogr/ogr_adbc.py index 1429d4a1785f..e1c5dc118441 100755 --- a/autotest/ogr/ogr_adbc.py +++ b/autotest/ogr/ogr_adbc.py @@ -231,29 +231,97 @@ def test_ogr_adbc_duckdb_parquet_with_sql_open_option(): ############################################################################### -def test_ogr_adbc_duckdb_parquet_with_spatial(): +@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"]) +def test_ogr_adbc_duckdb_parquet_with_spatial(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL): if not _has_libduckdb(): pytest.skip("libduckdb.so missing") - if gdaltest.is_travis_branch("ubuntu_2404"): - # Works locally for me when replicating the Dockerfile ... - pytest.skip("fails on ubuntu_2404 for unknown reason") + with gdal.config_option( + "OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL + ): + with gdal.OpenEx( + "data/parquet/poly.parquet", + gdal.OF_VECTOR, + allowed_drivers=["ADBC"], + open_options=[ + "PRELUDE_STATEMENTS=INSTALL spatial", + ] + if OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL == "ON" + else [], + ) as ds: + lyr = ds.GetLayer(0) + assert lyr.GetGeomType() == ogr.wkbPolygon + assert lyr.TestCapability(ogr.OLCFastGetExtent) + assert lyr.TestCapability(ogr.OLCFastSpatialFilter) + minx, maxx, miny, maxy = lyr.GetExtent() + assert (minx, maxx, miny, maxy) == ( + 478315.53125, + 481645.3125, + 4762880.5, + 4765610.5, + ) + assert lyr.GetExtent3D() == ( + 478315.53125, + 481645.3125, + 4762880.5, + 4765610.5, + float("inf"), + float("-inf"), + ) + assert lyr.GetSpatialRef().GetAuthorityCode(None) == "27700" + f = lyr.GetNextFeature() + assert f.GetGeometryRef().ExportToWkt().startswith("POLYGON ((") - with gdal.OpenEx( - "data/parquet/poly.parquet", - gdal.OF_VECTOR, - allowed_drivers=["ADBC"], - open_options=[ - "PRELUDE_STATEMENTS=INSTALL spatial", - "PRELUDE_STATEMENTS=LOAD spatial", - ], - ) as ds: + assert lyr.GetFeatureCount() == 10 + lyr.SetAttributeFilter("false") + + assert lyr.GetFeatureCount() == 0 + lyr.SetAttributeFilter("true") + + lyr.SetAttributeFilter(None) + assert lyr.GetFeatureCount() == 10 + lyr.SetSpatialFilterRect(minx, miny, maxx, maxy) + assert lyr.GetFeatureCount() == 10 + lyr.SetSpatialFilterRect(minx, miny, minx, maxy) + assert lyr.GetFeatureCount() < 10 + lyr.SetSpatialFilterRect(maxx, miny, maxx, maxy) + assert lyr.GetFeatureCount() < 10 + lyr.SetSpatialFilterRect(minx, miny, maxx, miny) + assert lyr.GetFeatureCount() < 10 + lyr.SetSpatialFilterRect(minx, maxy, maxx, maxy) + assert lyr.GetFeatureCount() < 10 + + lyr.SetAttributeFilter("true") + lyr.SetSpatialFilter(None) + assert lyr.GetFeatureCount() == 10 + lyr.SetSpatialFilterRect(minx, miny, maxx, maxy) + assert lyr.GetFeatureCount() == 10 + + lyr.SetAttributeFilter("false") + lyr.SetSpatialFilterRect(minx, miny, maxx, maxy) + assert lyr.GetFeatureCount() == 0 + + +############################################################################### + + +@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"]) +def test_ogr_adbc_duckdb_with_spatial_index(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL): + + if not _has_libduckdb(): + pytest.skip("libduckdb.so missing") + + with gdal.config_option( + "OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL + ): + ds = ogr.Open("data/duckdb/poly_with_spatial_index.duckdb") + lyr = ds.GetLayer(0) with ds.ExecuteSQL( - "SELECT ST_AsText(geometry) FROM read_parquet('data/parquet/poly.parquet')" + "SELECT 1 FROM duckdb_extensions() WHERE extension_name='spatial' AND loaded = true" ) as sql_lyr: - f = sql_lyr.GetNextFeature() - assert f.GetField(0).startswith("POLYGON") + spatial_loaded = sql_lyr.GetNextFeature() is not None + assert lyr.TestCapability(ogr.OLCFastSpatialFilter) == spatial_loaded ############################################################################### @@ -325,6 +393,30 @@ def test_ogr_adbc_test_ogrsf_parquet_filename_with_glob(): assert "ERROR" not in ret +############################################################################### +# Run test_ogrsf on a GeoParquet file + + +@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"]) +def test_ogr_adbc_test_ogrsf_geoparquet(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL): + + if not _has_libduckdb(): + pytest.skip("libduckdb.so missing") + + import test_cli_utilities + + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal( + test_cli_utilities.get_test_ogrsf_path() + + f" -ro ADBC:data/parquet/poly.parquet --config OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL={OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL}" + ) + + assert "INFO" in ret + assert "ERROR" not in ret + + ############################################################################### # Test DATETIME_AS_STRING=YES GetArrowStream() option @@ -359,7 +451,34 @@ def test_ogr_adbc_arrow_stream_numpy_datetime_as_string(tmp_vsimem): # Run test_ogrsf on a DuckDB dataset -def test_ogr_adbc_test_ogrsf_duckdb(): +@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"]) +def test_ogr_adbc_test_ogrsf_duckdb(OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL): + + if not _has_libduckdb(): + pytest.skip("libduckdb.so missing") + + import test_cli_utilities + + if test_cli_utilities.get_test_ogrsf_path() is None: + pytest.skip() + + ret = gdaltest.runexternal( + test_cli_utilities.get_test_ogrsf_path() + + f" -ro ADBC:data/duckdb/poly.duckdb --config OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL={OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL}" + ) + + assert "INFO" in ret + assert "ERROR" not in ret + + +############################################################################### +# Run test_ogrsf on a DuckDB dataset + + +@pytest.mark.parametrize("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", ["ON", "OFF"]) +def test_ogr_adbc_test_ogrsf_duckdb_with_spatial_index( + OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL, +): if not _has_libduckdb(): pytest.skip("libduckdb.so missing") @@ -370,7 +489,8 @@ def test_ogr_adbc_test_ogrsf_duckdb(): pytest.skip() ret = gdaltest.runexternal( - test_cli_utilities.get_test_ogrsf_path() + " -ro ADBC:data/duckdb/poly.duckdb" + test_cli_utilities.get_test_ogrsf_path() + + f" -ro ADBC:data/duckdb/poly_with_spatial_index.duckdb --config OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL={OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL}" ) assert "INFO" in ret diff --git a/doc/source/drivers/vector/adbc.rst b/doc/source/drivers/vector/adbc.rst index 40a312323367..abc50b2bb989 100644 --- a/doc/source/drivers/vector/adbc.rst +++ b/doc/source/drivers/vector/adbc.rst @@ -25,7 +25,11 @@ Consult the `installation instruction { + public: + //! Describe the bbox column of a geometry column + struct GeomColBBOX + { + std::string osXMin{}; // empty if no bbox column + std::string osYMin{}; + std::string osXMax{}; + std::string osYMax{}; + }; + + private: friend class OGRADBCDataset; OGRADBCDataset *m_poDS = nullptr; + const std::string m_osBaseStatement{}; // as provided by user + std::string m_osModifiedBaseStatement{}; // above tuned to use ST_AsWKB() + std::string m_osModifiedSelect{}; // SELECT part of above + std::string m_osAttributeFilter{}; std::unique_ptr m_statement{}; std::unique_ptr m_poAdapterLayer{}; std::unique_ptr m_stream{}; @@ -103,17 +118,27 @@ class OGRADBCLayer final : public OGRLayer, GIntBig m_nFeatureID = 0; bool m_bIsParquetLayer = false; + std::vector + m_geomColBBOX{}; // same size as GetGeomFieldCount() + std::vector m_extents{}; // same size as GetGeomFieldCount() + OGRFeature *GetNextRawFeature(); bool GetArrowStreamInternal(struct ArrowArrayStream *out_stream); GIntBig GetFeatureCountParquet(); + void BuildLayerDefn(bool bInternalUse); + bool ReplaceStatement(const char *pszNewStatement); + bool UpdateStatement(); + std::string GetCurrentStatement() const; + CPL_DISALLOW_COPY_ASSIGN(OGRADBCLayer) public: OGRADBCLayer(OGRADBCDataset *poDS, const char *pszName, + const char *pszStatement, std::unique_ptr poStatement, std::unique_ptr poStream, - ArrowSchema *schema); + ArrowSchema *schema, bool bInternalUse); ~OGRADBCLayer() override; OGRFeatureDefn *GetLayerDefn() override @@ -128,6 +153,20 @@ class OGRADBCLayer final : public OGRLayer, bool GetArrowStream(struct ArrowArrayStream *out_stream, CSLConstList papszOptions = nullptr) override; GIntBig GetFeatureCount(int bForce) override; + + void SetSpatialFilter(OGRGeometry *poGeom) override + { + SetSpatialFilter(0, poGeom); + } + + OGRErr SetAttributeFilter(const char *pszFilter) override; + void SetSpatialFilter(int iGeomField, OGRGeometry *poGeom) override; + + OGRErr GetExtent(OGREnvelope *psExtent, int bForce = TRUE) override; + OGRErr GetExtent(int iGeomField, OGREnvelope *psExtent, + int bForce = TRUE) override; + OGRErr GetExtent3D(int iGeomField, OGREnvelope3D *psExtent, + int bForce = TRUE) override; }; /************************************************************************/ @@ -143,6 +182,8 @@ class OGRADBCDataset final : public GDALDataset std::unique_ptr m_connection{}; std::vector> m_apoLayers{}; std::string m_osParquetFilename{}; + bool m_bIsDuckDB = false; + bool m_bSpatialLoaded = false; public: OGRADBCDataset() = default; @@ -164,7 +205,13 @@ class OGRADBCDataset final : public GDALDataset OGRLayer *GetLayerByName(const char *pszName) override; std::unique_ptr CreateLayer(const char *pszStatement, - const char *pszLayerName); + const char *pszLayerName, + bool bInternalUse); + + std::unique_ptr CreateInternalLayer(const char *pszStatement) + { + return CreateLayer(pszStatement, "temp", true); + } OGRLayer *ExecuteSQL(const char *pszStatement, OGRGeometry *poSpatialFilter, const char *pszDialect) override; diff --git a/ogr/ogrsf_frmts/adbc/ogradbcdataset.cpp b/ogr/ogrsf_frmts/adbc/ogradbcdataset.cpp index 66d822167c1a..1256147033ca 100644 --- a/ogr/ogrsf_frmts/adbc/ogradbcdataset.cpp +++ b/ogr/ogrsf_frmts/adbc/ogradbcdataset.cpp @@ -15,6 +15,7 @@ #include "ogradbcdrivercore.h" #include "ogr_mem.h" #include "ogr_p.h" +#include "cpl_error.h" #include "cpl_json.h" #include "gdal_adbc.h" @@ -79,7 +80,8 @@ OGRADBCDataset::~OGRADBCDataset() /************************************************************************/ std::unique_ptr -OGRADBCDataset::CreateLayer(const char *pszStatement, const char *pszLayerName) +OGRADBCDataset::CreateLayer(const char *pszStatement, const char *pszLayerName, + bool bInternalUse) { OGRADBCError error; @@ -164,7 +166,8 @@ OGRADBCDataset::CreateLayer(const char *pszStatement, const char *pszLayerName) } return std::make_unique( - this, pszLayerName, std::move(statement), std::move(stream), &schema); + this, pszLayerName, osStatement.c_str(), std::move(statement), + std::move(stream), &schema, bInternalUse); } /************************************************************************/ @@ -181,7 +184,7 @@ OGRLayer *OGRADBCDataset::ExecuteSQL(const char *pszStatement, pszDialect); } - auto poLayer = CreateLayer(pszStatement, "RESULTSET"); + auto poLayer = CreateLayer(pszStatement, "RESULTSET", false); if (poLayer && poSpatialFilter) { if (poLayer->GetGeomType() == wkbNone) @@ -211,7 +214,7 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) } const char *pszADBCDriverName = CSLFetchNameValue(poOpenInfo->papszOpenOptions, "ADBC_DRIVER"); - const bool bIsDuckDB = OGRADBCDriverIsDuckDB(poOpenInfo); + m_bIsDuckDB = OGRADBCDriverIsDuckDB(poOpenInfo); const bool bIsSQLite3 = (pszADBCDriverName && EQUAL(pszADBCDriverName, "adbc_driver_sqlite")) || OGRADBCDriverIsSQLite3(poOpenInfo); @@ -221,7 +224,7 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) if (!pszADBCDriverName) { - if (bIsDuckDB || bIsParquet) + if (m_bIsDuckDB || bIsParquet) { pszADBCDriverName = #ifdef _WIN32 @@ -249,7 +252,7 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) // Load the driver if (pszADBCDriverName && - (bIsDuckDB || bIsParquet || strstr(pszADBCDriverName, "duckdb"))) + (m_bIsDuckDB || bIsParquet || strstr(pszADBCDriverName, "duckdb"))) { if (OGRADBCLoadDriver(pszADBCDriverName, "duckdb_adbc_init", &m_driver, error) != ADBC_STATUS_OK) @@ -280,7 +283,7 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) // Set options if (pszADBCDriverName && - (bIsDuckDB || bIsParquet || strstr(pszADBCDriverName, "duckdb"))) + (m_bIsDuckDB || bIsParquet || strstr(pszADBCDriverName, "duckdb"))) { if (ADBC_CALL(DatabaseSetOption, &m_database, "path", bIsParquet ? ":memory:" : pszFilename, @@ -346,9 +349,30 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) for (const char *pszStatement : cpl::Iterate(CSLConstList(papszPreludeStatements))) { - CreateLayer(pszStatement, "temp"); + CreateInternalLayer(pszStatement); } CSLDestroy(papszPreludeStatements); + if ((bIsParquet || m_bIsDuckDB) && + CPLTestBool( + CPLGetConfigOption("OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", "ON"))) + { + auto poTmpLayer = + CreateInternalLayer("SELECT 1 FROM duckdb_extensions() WHERE " + "extension_name='spatial' AND loaded = false"); + if (poTmpLayer && std::unique_ptr( + poTmpLayer->GetNextFeature()) != nullptr) + { + CPLErrorStateBackuper oBackuper(CPLQuietErrorHandler); + CreateInternalLayer("LOAD spatial"); + } + + poTmpLayer = + CreateInternalLayer("SELECT 1 FROM duckdb_extensions() WHERE " + "extension_name='spatial' AND loaded = true"); + m_bSpatialLoaded = + poTmpLayer && std::unique_ptr( + poTmpLayer->GetNextFeature()) != nullptr; + } std::string osLayerName = "RESULTSET"; std::string osSQL; @@ -374,18 +398,89 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) { if (pszSQL[0]) { - auto poLayer = CreateLayer(pszSQL, osLayerName.c_str()); + std::unique_ptr poLayer; + if ((bIsParquet || m_bIsDuckDB) && m_bSpatialLoaded) + { + std::string osErrorMsg; + { + CPLErrorStateBackuper oBackuper(CPLQuietErrorHandler); + poLayer = CreateLayer(pszSQL, osLayerName.c_str(), false); + if (!poLayer) + osErrorMsg = CPLGetLastErrorMsg(); + } + if (!poLayer) + { + CPLDebug("ADBC", + "Connecting with 'LOAD spatial' did not work " + "(%s). Retrying without it", + osErrorMsg.c_str()); + ADBC_CALL(ConnectionRelease, m_connection.get(), error); + m_connection.reset(); + + ADBC_CALL(DatabaseRelease, &m_database, error); + memset(&m_database, 0, sizeof(m_database)); + + if (ADBC_CALL(DatabaseNew, &m_database, error) != + ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcDatabaseNew() failed: %s", + error.message()); + return false; + } + if (ADBC_CALL(DatabaseSetOption, &m_database, "path", + ":memory:", error) != ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcDatabaseSetOption() failed: %s", + error.message()); + return false; + } + + if (ADBC_CALL(DatabaseInit, &m_database, error) != + ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcDatabaseInit() failed: %s", + error.message()); + return false; + } + + m_connection = std::make_unique(); + if (ADBC_CALL(ConnectionNew, m_connection.get(), error) != + ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcConnectionNew() failed: %s", + error.message()); + return false; + } + + if (ADBC_CALL(ConnectionInit, m_connection.get(), + &m_database, error) != ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcConnectionInit() failed: %s", + error.message()); + return false; + } + } + } if (!poLayer) - return false; + { + poLayer = CreateLayer(pszSQL, osLayerName.c_str(), false); + if (!poLayer) + return false; + } + poLayer->m_bIsParquetLayer = bIsParquetLayer; m_apoLayers.emplace_back(std::move(poLayer)); } } - else if (bIsDuckDB || bIsSQLite3) + else if (m_bIsDuckDB || bIsSQLite3) { - auto poLayerList = CreateLayer( - "SELECT name FROM sqlite_master WHERE type IN ('table', 'view')", - "LAYERLIST"); + auto poLayerList = CreateInternalLayer( + "SELECT name FROM sqlite_master WHERE type IN ('table', 'view')"); if (!poLayerList || poLayerList->GetLayerDefn()->GetFieldCount() != 1) { return false; @@ -400,7 +495,8 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) CPLSPrintf("SELECT * FROM \"%s\"", OGRDuplicateCharacter(pszLayerName, '"').c_str()); CPLTurnFailureIntoWarning(true); - auto poLayer = CreateLayer(osStatement.c_str(), pszLayerName); + auto poLayer = + CreateLayer(osStatement.c_str(), pszLayerName, false); CPLTurnFailureIntoWarning(false); if (poLayer) { @@ -410,13 +506,12 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) } else if (bIsPostgreSQL) { - auto poLayerList = CreateLayer( + auto poLayerList = CreateInternalLayer( "SELECT n.nspname, c.relname FROM pg_class c " "JOIN pg_namespace n ON c.relnamespace = n.oid " "AND c.relkind in ('r','v','m','f') " "AND n.nspname NOT IN ('pg_catalog', 'information_schema') " - "ORDER BY c.oid", - "LAYERLIST"); + "ORDER BY c.oid"); if (!poLayerList || poLayerList->GetLayerDefn()->GetFieldCount() != 2) { return false; @@ -432,9 +527,9 @@ bool OGRADBCDataset::Open(const GDALOpenInfo *poOpenInfo) OGRDuplicateCharacter(pszTableName, '"').c_str()); CPLTurnFailureIntoWarning(true); - auto poLayer = - CreateLayer(osStatement.c_str(), - CPLSPrintf("%s.%s", pszNamespace, pszTableName)); + auto poLayer = CreateLayer( + osStatement.c_str(), + CPLSPrintf("%s.%s", pszNamespace, pszTableName), false); CPLTurnFailureIntoWarning(false); if (poLayer) { @@ -474,8 +569,9 @@ OGRLayer *OGRADBCDataset::GetLayerByName(const char *pszName) } auto statement = std::make_unique(); - OGRADBCLayer tmpLayer(this, "", std::move(statement), - std::move(objectsStream), &schema); + OGRADBCLayer tmpLayer(this, "", "", std::move(statement), + std::move(objectsStream), &schema, + /* bInternalUse = */ true); const auto tmpLayerDefn = tmpLayer.GetLayerDefn(); if (tmpLayerDefn->GetFieldIndex("catalog_name") < 0 || tmpLayerDefn->GetFieldIndex("catalog_db_schemas") < 0) diff --git a/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp b/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp index 66169b486dca..015abf34f567 100644 --- a/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp +++ b/ogr/ogrsf_frmts/adbc/ogradbclayer.cpp @@ -12,31 +12,434 @@ ****************************************************************************/ #include "ogr_adbc.h" +#include "ogr_spatialref.h" #include "ogr_p.h" +#include "cpl_json.h" + +#include +#include +#include +#include #define ADBC_CALL(func, ...) m_poDS->m_driver.func(__VA_ARGS__) +/************************************************************************/ +/* GetGeometryTypeFromString() */ +/************************************************************************/ + +static OGRwkbGeometryType GetGeometryTypeFromString(const std::string &osType) +{ + OGRwkbGeometryType eGeomType = wkbUnknown; + OGRReadWKTGeometryType(osType.c_str(), &eGeomType); + if (eGeomType == wkbUnknown && !osType.empty()) + { + CPLDebug("ADBC", "Unknown geometry type: %s", osType.c_str()); + } + return eGeomType; +} + /************************************************************************/ /* OGRADBCLayer() */ /************************************************************************/ OGRADBCLayer::OGRADBCLayer(OGRADBCDataset *poDS, const char *pszName, + const char *pszStatement, std::unique_ptr poStatement, std::unique_ptr poStream, - ArrowSchema *schema) - : m_poDS(poDS), m_statement(std::move(poStatement)), - m_stream(std::move(poStream)) + ArrowSchema *schema, bool bInternalUse) + : m_poDS(poDS), m_osBaseStatement(pszStatement), + m_osModifiedBaseStatement(m_osBaseStatement), + m_statement(std::move(poStatement)), m_stream(std::move(poStream)) { SetDescription(pszName); memcpy(&m_schema, schema, sizeof(m_schema)); schema->release = nullptr; - m_poAdapterLayer = - std::make_unique(pszName); + BuildLayerDefn(bInternalUse); +} + +/************************************************************************/ +/* ParseGeometryColumnCovering() */ +/************************************************************************/ + +//! Parse bounding box column definition +static bool ParseGeometryColumnCovering(const CPLJSONObject &oJSONDef, + std::string &osBBOXColumn, + std::string &osXMin, + std::string &osYMin, + std::string &osXMax, + std::string &osYMax) +{ + const auto oCovering = oJSONDef["covering"]; + if (oCovering.IsValid() && + oCovering.GetType() == CPLJSONObject::Type::Object) + { + const auto oBBOX = oCovering["bbox"]; + if (oBBOX.IsValid() && oBBOX.GetType() == CPLJSONObject::Type::Object) + { + const auto oXMin = oBBOX["xmin"]; + const auto oYMin = oBBOX["ymin"]; + const auto oXMax = oBBOX["xmax"]; + const auto oYMax = oBBOX["ymax"]; + if (oXMin.IsValid() && oYMin.IsValid() && oXMax.IsValid() && + oYMax.IsValid() && + oXMin.GetType() == CPLJSONObject::Type::Array && + oYMin.GetType() == CPLJSONObject::Type::Array && + oXMax.GetType() == CPLJSONObject::Type::Array && + oYMax.GetType() == CPLJSONObject::Type::Array) + { + const auto osXMinArray = oXMin.ToArray(); + const auto osYMinArray = oYMin.ToArray(); + const auto osXMaxArray = oXMax.ToArray(); + const auto osYMaxArray = oYMax.ToArray(); + if (osXMinArray.Size() == 2 && osYMinArray.Size() == 2 && + osXMaxArray.Size() == 2 && osYMaxArray.Size() == 2 && + osXMinArray[0].GetType() == CPLJSONObject::Type::String && + osXMinArray[1].GetType() == CPLJSONObject::Type::String && + osYMinArray[0].GetType() == CPLJSONObject::Type::String && + osYMinArray[1].GetType() == CPLJSONObject::Type::String && + osXMaxArray[0].GetType() == CPLJSONObject::Type::String && + osXMaxArray[1].GetType() == CPLJSONObject::Type::String && + osYMaxArray[0].GetType() == CPLJSONObject::Type::String && + osYMaxArray[1].GetType() == CPLJSONObject::Type::String && + osXMinArray[0].ToString() == osYMinArray[0].ToString() && + osXMinArray[0].ToString() == osXMaxArray[0].ToString() && + osXMinArray[0].ToString() == osYMaxArray[0].ToString()) + { + osBBOXColumn = osXMinArray[0].ToString(); + osXMin = osXMinArray[1].ToString(); + osYMin = osYMinArray[1].ToString(); + osXMax = osXMaxArray[1].ToString(); + osYMax = osYMaxArray[1].ToString(); + return true; + } + } + } + } + return false; +} + +/************************************************************************/ +/* ParseGeoParquetColumn() */ +/************************************************************************/ + +static void ParseGeoParquetColumn( + const CPLJSONObject &oColumn, + std::map &oMapType, + std::map &oMapExtent, + std::map + &oMapGeomColumnToCoveringBBOXColumn, + std::map> + &oMapGeomColumnsFromGeoParquet, + std::set &oSetCoveringBBoxColumn) +{ + auto oCrs = oColumn.GetObj("crs"); + if (!oCrs.IsValid()) + { + // WGS 84 is implied if no crs member is found. + auto poSRS = std::make_unique(); + poSRS->SetAxisMappingStrategy(OAMS_TRADITIONAL_GIS_ORDER); + poSRS->importFromEPSG(4326); + oMapGeomColumnsFromGeoParquet[oColumn.GetName()] = std::move(poSRS); + } + else if (oCrs.GetType() == CPLJSONObject::Type::Object) + { + // CRS encoded as PROJJSON (extension) + const auto oType = oCrs["type"]; + if (oType.IsValid() && oType.GetType() == CPLJSONObject::Type::String) + { + const auto osType = oType.ToString(); + if (osType.find("CRS") != std::string::npos) + { + auto poSRS = std::make_unique(); + poSRS->SetAxisMappingStrategy(OAMS_TRADITIONAL_GIS_ORDER); + + if (poSRS->SetFromUserInput(oCrs.ToString().c_str()) == + OGRERR_NONE) + { + oMapGeomColumnsFromGeoParquet[oColumn.GetName()] = + std::move(poSRS); + } + } + } + } + else + { + oMapGeomColumnsFromGeoParquet[oColumn.GetName()] = nullptr; + } + + OGRwkbGeometryType eGeomType = wkbUnknown; + auto oType = oColumn.GetObj("geometry_types"); + if (oType.GetType() == CPLJSONObject::Type::Array) + { + const auto oTypeArray = oType.ToArray(); + if (oTypeArray.Size() == 1) + { + eGeomType = GetGeometryTypeFromString(oTypeArray[0].ToString()); + } + else if (oTypeArray.Size() > 1) + { + const auto PromoteToCollection = [](OGRwkbGeometryType eType) + { + if (eType == wkbPoint) + return wkbMultiPoint; + if (eType == wkbLineString) + return wkbMultiLineString; + if (eType == wkbPolygon) + return wkbMultiPolygon; + return eType; + }; + bool bMixed = false; + bool bHasMulti = false; + bool bHasZ = false; + bool bHasM = false; + const auto eFirstType = OGR_GT_Flatten( + GetGeometryTypeFromString(oTypeArray[0].ToString())); + const auto eFirstTypeCollection = PromoteToCollection(eFirstType); + for (int i = 0; i < oTypeArray.Size(); ++i) + { + const auto eThisGeom = + GetGeometryTypeFromString(oTypeArray[i].ToString()); + if (PromoteToCollection(OGR_GT_Flatten(eThisGeom)) != + eFirstTypeCollection) + { + bMixed = true; + break; + } + bHasZ |= OGR_GT_HasZ(eThisGeom) != FALSE; + bHasM |= OGR_GT_HasM(eThisGeom) != FALSE; + bHasMulti |= (PromoteToCollection(OGR_GT_Flatten(eThisGeom)) == + OGR_GT_Flatten(eThisGeom)); + } + if (!bMixed) + { + if (eFirstTypeCollection == wkbMultiPolygon || + eFirstTypeCollection == wkbMultiLineString) + { + if (bHasMulti) + eGeomType = OGR_GT_SetModifier(eFirstTypeCollection, + bHasZ, bHasM); + else + eGeomType = + OGR_GT_SetModifier(eFirstType, bHasZ, bHasM); + } + } + } + } + + oMapType[oColumn.GetName()] = eGeomType; + + OGREnvelope3D sExtent; + const auto oBBox = oColumn.GetArray("bbox"); + if (oBBox.IsValid() && oBBox.Size() == 4) + { + sExtent.MinX = oBBox[0].ToDouble(); + sExtent.MinY = oBBox[1].ToDouble(); + sExtent.MinZ = std::numeric_limits::infinity(); + sExtent.MaxX = oBBox[2].ToDouble(); + sExtent.MaxY = oBBox[3].ToDouble(); + sExtent.MaxZ = -std::numeric_limits::infinity(); + if (sExtent.MinX <= sExtent.MaxX) + { + oMapExtent[oColumn.GetName()] = sExtent; + } + } + else if (oBBox.IsValid() && oBBox.Size() == 6) + { + sExtent.MinX = oBBox[0].ToDouble(); + sExtent.MinY = oBBox[1].ToDouble(); + sExtent.MinZ = oBBox[2].ToDouble(); + sExtent.MaxX = oBBox[3].ToDouble(); + sExtent.MaxY = oBBox[4].ToDouble(); + sExtent.MaxZ = oBBox[5].ToDouble(); + if (sExtent.MinX <= sExtent.MaxX) + { + oMapExtent[oColumn.GetName()] = sExtent; + } + } + + std::string osBBOXColumn; + std::string osXMin, osYMin, osXMax, osYMax; + if (ParseGeometryColumnCovering(oColumn, osBBOXColumn, osXMin, osYMin, + osXMax, osYMax)) + { + OGRADBCLayer::GeomColBBOX geomColBBOX; + const std::string osPrefix = + std::string("\"") + .append(OGRDuplicateCharacter(osBBOXColumn, '"')) + .append("\".\""); + geomColBBOX.osXMin = std::string(osPrefix) + .append(OGRDuplicateCharacter(osXMin, '"')) + .append("\""); + geomColBBOX.osYMin = std::string(osPrefix) + .append(OGRDuplicateCharacter(osYMin, '"')) + .append("\""); + geomColBBOX.osXMax = std::string(osPrefix) + .append(OGRDuplicateCharacter(osXMax, '"')) + .append("\""); + geomColBBOX.osYMax = std::string(osPrefix) + .append(OGRDuplicateCharacter(osYMax, '"')) + .append("\""); + oMapGeomColumnToCoveringBBOXColumn[oColumn.GetName()] = geomColBBOX; + oSetCoveringBBoxColumn.insert(osBBOXColumn); + } +} + +/************************************************************************/ +/* BuildLayerDefn() */ +/************************************************************************/ + +void OGRADBCLayer::BuildLayerDefn(bool bInternalUse) +{ + // Identify geometry columns for Parquet files, and query them with + // ST_AsWKB() to avoid getting duckdb_spatial own geometry encoding + // (https://github.com/duckdb/duckdb_spatial/blob/a60aa3733741a99c49baaf33390c0f7c8a9598a3/spatial/src/spatial/core/geometry/geometry_serialization.cpp#L11) + std::map> oMapGeomColumns; + std::map oMapType; + std::map oMapExtent; + std::map oMapGeomColumnToCoveringBBOXColumn; + if (!bInternalUse && STARTS_WITH_CI(m_osBaseStatement.c_str(), "SELECT ") && + (m_poDS->m_bIsDuckDB || + (!m_poDS->m_osParquetFilename.empty() && + CPLString(m_osBaseStatement) + .ifind(std::string(" FROM '").append(OGRDuplicateCharacter( + m_poDS->m_osParquetFilename, '\''))) != + std::string::npos))) + { + // Try to read GeoParquet 'geo' metadata + std::map> + oMapGeomColumnsFromGeoParquet; + std::set oSetCoveringBBoxColumn; + + std::string osGeoParquetMD; + if (!m_poDS->m_osParquetFilename.empty()) + { + auto poMetadataLayer = m_poDS->CreateInternalLayer( + std::string("SELECT value FROM parquet_kv_metadata('") + .append(OGRDuplicateCharacter(m_poDS->m_osParquetFilename, + '\'')) + .append("') WHERE key = 'geo'") + .c_str()); + if (poMetadataLayer) + { + auto f = std::unique_ptr( + poMetadataLayer->GetNextFeature()); + if (f) + { + int nBytes = 0; + const GByte *pabyData = f->GetFieldAsBinary(0, &nBytes); + osGeoParquetMD.assign( + reinterpret_cast(pabyData), nBytes); + // CPLDebug("ADBC", "%s", osGeoParquetMD.c_str()); + } + } + } + CPLJSONDocument oDoc; + if (!osGeoParquetMD.empty() && oDoc.LoadMemory(osGeoParquetMD)) + { + const auto oColums = oDoc.GetRoot().GetObj("columns"); + for (const auto &oColumn : oColums.GetChildren()) + { + if (oColumn.GetString("encoding") == "WKB") + { + ParseGeoParquetColumn(oColumn, oMapType, oMapExtent, + oMapGeomColumnToCoveringBBOXColumn, + oMapGeomColumnsFromGeoParquet, + oSetCoveringBBoxColumn); + } + } + } + + auto poDescribeLayer = m_poDS->CreateInternalLayer( + std::string("DESCRIBE ").append(m_osBaseStatement).c_str()); + std::string osNewStatement; + bool bNewStatement = false; + if (poDescribeLayer && + // cppcheck-suppress knownConditionTrueFalse + (m_poDS->m_bIsDuckDB || !oMapGeomColumnsFromGeoParquet.empty())) + { + for (auto &&f : *poDescribeLayer) + { + const char *pszColName = f->GetFieldAsString("column_name"); + if (cpl::contains(oSetCoveringBBoxColumn, pszColName)) + { + bNewStatement = true; + continue; + } + + // f->DumpReadable(stdout); + if (osNewStatement.empty()) + osNewStatement = "SELECT "; + else + osNewStatement += ", "; + + auto oIter = oMapGeomColumnsFromGeoParquet.find(pszColName); + if (oIter != oMapGeomColumnsFromGeoParquet.end()) + { + oMapGeomColumns[pszColName] = std::move(oIter->second); + } + if (EQUAL(f->GetFieldAsString("column_type"), "GEOMETRY") && + m_poDS->m_bSpatialLoaded) + { + bNewStatement = true; + osNewStatement += "ST_AsWKB(\""; + osNewStatement += OGRDuplicateCharacter(pszColName, '"'); + osNewStatement += "\") AS "; + if (oIter == oMapGeomColumnsFromGeoParquet.end()) + oMapGeomColumns[pszColName] = nullptr; + } + osNewStatement += '"'; + osNewStatement += OGRDuplicateCharacter(pszColName, '"'); + osNewStatement += '"'; + } + m_osModifiedSelect = osNewStatement; + osNewStatement += " FROM ("; + osNewStatement += m_osBaseStatement; + osNewStatement += " )"; + } + + if (bNewStatement) + { + // CPLDebug("ADBC", "%s -> %s", m_osBaseStatement.c_str(), osNewStatement.c_str()); + if (ReplaceStatement(osNewStatement.c_str())) + { + m_osModifiedBaseStatement = osNewStatement; + } + else + { + m_osModifiedSelect.clear(); + oMapGeomColumns.clear(); + } + } + } + + m_poAdapterLayer = std::make_unique( + GetDescription()); + for (int i = 0; i < m_schema.n_children; ++i) { - m_poAdapterLayer->CreateFieldFromArrowSchema(m_schema.children[i]); + const char *pszColName = m_schema.children[i]->name; + auto oIter = oMapGeomColumns.find(pszColName); + if (oIter != oMapGeomColumns.end()) + { + OGRGeomFieldDefn oGeomFieldDefn(pszColName, oMapType[pszColName]); + auto poSRS = std::move(oIter->second).release(); + if (poSRS) + { + oGeomFieldDefn.SetSpatialRef(poSRS); + poSRS->Release(); + } + m_poAdapterLayer->m_poLayerDefn->AddGeomFieldDefn(&oGeomFieldDefn); + + m_extents.push_back(oMapExtent[pszColName]); + m_geomColBBOX.push_back( + oMapGeomColumnToCoveringBBOXColumn[pszColName]); + } + else + { + m_poAdapterLayer->CreateFieldFromArrowSchema(m_schema.children[i]); + } } } @@ -53,6 +456,67 @@ OGRADBCLayer::~OGRADBCLayer() m_schema.release(&m_schema); } +/************************************************************************/ +/* ReplaceStatement() */ +/************************************************************************/ + +bool OGRADBCLayer::ReplaceStatement(const char *pszNewStatement) +{ + // CPLDebug("ADBC", "%s", pszNewStatement); + OGRADBCError error; + auto statement = std::make_unique(); + if (ADBC_CALL(StatementNew, m_poDS->m_connection.get(), statement.get(), + error) != ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, "AdbcStatementNew() failed: %s", + error.message()); + ADBC_CALL(StatementRelease, statement.get(), error); + } + else if (ADBC_CALL(StatementSetSqlQuery, statement.get(), pszNewStatement, + error) != ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcStatementSetSqlQuery() failed: %s", error.message()); + error.clear(); + ADBC_CALL(StatementRelease, statement.get(), error); + } + else + { + auto stream = std::make_unique(); + int64_t rows_affected = -1; + ArrowSchema newSchema; + memset(&newSchema, 0, sizeof(newSchema)); + if (ADBC_CALL(StatementExecuteQuery, statement.get(), stream->get(), + &rows_affected, error) != ADBC_STATUS_OK) + { + CPLError(CE_Failure, CPLE_AppDefined, + "AdbcStatementExecuteQuery() failed: %s", error.message()); + error.clear(); + ADBC_CALL(StatementRelease, statement.get(), error); + } + else if (stream->get_schema(&newSchema) != 0) + { + CPLError(CE_Failure, CPLE_AppDefined, "get_schema() failed"); + ADBC_CALL(StatementRelease, statement.get(), error); + } + else + { + if (m_schema.release) + m_schema.release(&m_schema); + memcpy(&m_schema, &newSchema, sizeof(newSchema)); + + if (m_statement) + ADBC_CALL(StatementRelease, m_statement.get(), error); + m_statement = std::move(statement); + + m_stream = std::move(stream); + + return true; + } + } + return false; +} + /************************************************************************/ /* GetNextRawFeature() */ /************************************************************************/ @@ -99,6 +563,16 @@ OGRFeature *OGRADBCLayer::GetNextRawFeature() } auto poFeature = m_poAdapterLayer->m_apoFeatures[m_nIdx++].release(); + const int nGeomFieldCount = + m_poAdapterLayer->m_poLayerDefn->GetFieldCount(); + for (int i = 0; i < nGeomFieldCount; ++i) + { + auto poGeom = poFeature->GetGeomFieldRef(i); + if (poGeom) + poGeom->assignSpatialReference( + m_poAdapterLayer->m_poLayerDefn->GetGeomFieldDefn(i) + ->GetSpatialRef()); + } poFeature->SetFID(m_nFeatureID++); return poFeature; } @@ -119,6 +593,194 @@ void OGRADBCLayer::ResetReading() } } +/************************************************************************/ +/* GetExtent() */ +/************************************************************************/ + +OGRErr OGRADBCLayer::GetExtent(OGREnvelope *psExtent, int bForce) +{ + return GetExtent(0, psExtent, bForce); +} + +/************************************************************************/ +/* GetExtent() */ +/************************************************************************/ + +OGRErr OGRADBCLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, + int bForce) +{ + if (iGeomField < 0 || iGeomField >= GetLayerDefn()->GetGeomFieldCount()) + { + if (iGeomField != 0) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Invalid geometry field index : %d", iGeomField); + } + return OGRERR_FAILURE; + } + + *psExtent = m_extents[iGeomField]; + if (psExtent->IsInit()) + return OGRERR_NONE; + + return GetExtentInternal(iGeomField, psExtent, bForce); +} + +/************************************************************************/ +/* GetExtent3D() */ +/************************************************************************/ + +OGRErr OGRADBCLayer::GetExtent3D(int iGeomField, OGREnvelope3D *psExtent, + int bForce) +{ + if (iGeomField < 0 || iGeomField >= GetLayerDefn()->GetGeomFieldCount()) + { + if (iGeomField != 0) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Invalid geometry field index : %d", iGeomField); + } + return OGRERR_FAILURE; + } + + *psExtent = m_extents[iGeomField]; + if (psExtent->IsInit()) + return OGRERR_NONE; + + return GetExtentInternal(iGeomField, psExtent, bForce); +} + +/************************************************************************/ +/* GetCurrentStatement() */ +/************************************************************************/ + +std::string OGRADBCLayer::GetCurrentStatement() const +{ + if (!m_osModifiedSelect.empty() && + STARTS_WITH_CI(m_osBaseStatement.c_str(), "SELECT * FROM ") && + (!m_osAttributeFilter.empty() || + (m_poFilterGeom && + (!m_geomColBBOX[m_iGeomFieldFilter].osXMin.empty() || + m_poDS->m_bSpatialLoaded)))) + { + std::string osStatement(m_osModifiedSelect); + osStatement.append(" FROM (") + .append(m_osBaseStatement) + .append(") WHERE "); + + bool bAddAnd = false; + if (m_poFilterGeom) + { + const double dfMinX = std::isinf(m_sFilterEnvelope.MinX) + ? -std::numeric_limits::max() + : m_sFilterEnvelope.MinX; + const double dfMinY = std::isinf(m_sFilterEnvelope.MinY) + ? -std::numeric_limits::max() + : m_sFilterEnvelope.MinY; + const double dfMaxX = std::isinf(m_sFilterEnvelope.MaxX) + ? std::numeric_limits::max() + : m_sFilterEnvelope.MaxX; + const double dfMaxY = std::isinf(m_sFilterEnvelope.MaxY) + ? std::numeric_limits::max() + : m_sFilterEnvelope.MaxY; + if (!m_geomColBBOX[m_iGeomFieldFilter].osXMin.empty()) + { + bAddAnd = true; + osStatement.append(m_geomColBBOX[m_iGeomFieldFilter].osXMin) + .append(" <= ") + .append(CPLSPrintf("%.17g", dfMaxX)) + .append(" AND ") + .append(m_geomColBBOX[m_iGeomFieldFilter].osXMax) + .append(" >= ") + .append(CPLSPrintf("%.17g", dfMinX)) + .append(" AND ") + .append(m_geomColBBOX[m_iGeomFieldFilter].osYMin) + .append(" <= ") + .append(CPLSPrintf("%.17g", dfMaxY)) + .append(" AND ") + .append(m_geomColBBOX[m_iGeomFieldFilter].osYMax) + .append(" >= ") + .append(CPLSPrintf("%.17g", dfMinY)); + } + if (m_poDS->m_bSpatialLoaded) + { + if (bAddAnd) + osStatement.append(" AND "); + bAddAnd = true; + osStatement.append("ST_Intersects(\"") + .append(OGRDuplicateCharacter( + m_poAdapterLayer->m_poLayerDefn + ->GetGeomFieldDefn(m_iGeomFieldFilter) + ->GetNameRef(), + '"')) + .append(CPLSPrintf( + "\", ST_MakeEnvelope(%.17g,%.17g,%.17g,%.17g))", dfMinX, + dfMinY, dfMaxX, dfMaxY)); + } + } + if (!m_osAttributeFilter.empty()) + { + if (bAddAnd) + osStatement.append(" AND "); + osStatement.append("("); + osStatement.append(m_osAttributeFilter); + osStatement.append(")"); + } + + return osStatement; + } + else + { + return m_osModifiedBaseStatement; + } +} + +/************************************************************************/ +/* UpdateStatement() */ +/************************************************************************/ + +bool OGRADBCLayer::UpdateStatement() +{ + return ReplaceStatement(GetCurrentStatement().c_str()); +} + +/***********************************************************************/ +/* SetAttributeFilter() */ +/***********************************************************************/ + +OGRErr OGRADBCLayer::SetAttributeFilter(const char *pszFilter) +{ + if (!m_osModifiedSelect.empty() && + STARTS_WITH_CI(m_osBaseStatement.c_str(), "SELECT * FROM ")) + { + m_osAttributeFilter = pszFilter ? pszFilter : ""; + return UpdateStatement() ? OGRERR_NONE : OGRERR_FAILURE; + } + else + { + return OGRLayer::SetAttributeFilter(pszFilter); + } +} + +/************************************************************************/ +/* SetSpatialFilter() */ +/************************************************************************/ + +void OGRADBCLayer::SetSpatialFilter(int iGeomField, OGRGeometry *poGeomIn) + +{ + if (!ValidateGeometryFieldIndexForSetSpatialFilter(iGeomField, poGeomIn)) + return; + + if (iGeomField < GetLayerDefn()->GetGeomFieldCount()) + { + m_iGeomFieldFilter = iGeomField; + if (InstallFilter(poGeomIn)) + ResetReading(); + UpdateStatement(); + } +} + /************************************************************************/ /* TestCapability() */ /************************************************************************/ @@ -127,16 +789,46 @@ int OGRADBCLayer::TestCapability(const char *pszCap) { if (EQUAL(pszCap, OLCFastGetArrowStream)) { - return !m_poFilterGeom && !m_poAttrQuery; + return !m_poFilterGeom && !m_poAttrQuery && m_osAttributeFilter.empty(); } else if (EQUAL(pszCap, OLCFastFeatureCount)) { - return !m_poFilterGeom && !m_poAttrQuery && m_bIsParquetLayer; + return !m_poFilterGeom && !m_poAttrQuery && + m_osAttributeFilter.empty() && m_bIsParquetLayer; } - else + else if (EQUAL(pszCap, OLCFastGetExtent)) { - return false; + return !m_extents.empty() && m_extents[0].IsInit(); + } + else if (EQUAL(pszCap, OLCFastSpatialFilter) && m_iGeomFieldFilter >= 0 && + m_iGeomFieldFilter < GetLayerDefn()->GetGeomFieldCount()) + { + if (m_poDS->m_bSpatialLoaded && m_poDS->m_bIsDuckDB) + { + const char *pszGeomColName = + m_poAdapterLayer->m_poLayerDefn + ->GetGeomFieldDefn(m_iGeomFieldFilter) + ->GetNameRef(); + auto poTmpLayer = m_poDS->CreateInternalLayer(CPLSPrintf( + "SELECT 1 FROM sqlite_master WHERE tbl_name = '%s' AND type = " + "'index' AND (sql LIKE '%%USING RTREE (%s)%%' OR sql LIKE " + "'%%USING RTREE (\"%s\")%%')", + OGRDuplicateCharacter(GetDescription(), '\'').c_str(), + pszGeomColName, + OGRDuplicateCharacter(pszGeomColName, '"').c_str())); + return poTmpLayer && + std::unique_ptr(poTmpLayer->GetNextFeature()); + } + else if (!m_geomColBBOX[m_iGeomFieldFilter].osXMin.empty()) + { + // Let's assume that the presence of a geometry bounding box + // column is sufficient enough to pretend to have fast spatial + // filter capabilities + return true; + } } + + return false; } /************************************************************************/ @@ -196,8 +888,30 @@ bool OGRADBCLayer::GetArrowStreamInternal(struct ArrowArrayStream *out_stream) GIntBig OGRADBCLayer::GetFeatureCount(int bForce) { - if (m_poFilterGeom || m_poAttrQuery) + if (m_poFilterGeom || m_poAttrQuery || !m_osAttributeFilter.empty()) { + if (!m_osModifiedSelect.empty() && + STARTS_WITH_CI(m_osBaseStatement.c_str(), "SELECT * FROM ") && + (!m_poFilterGeom || + !m_geomColBBOX[m_iGeomFieldFilter].osXMin.empty() || + m_poDS->m_bSpatialLoaded)) + { + const std::string osCurStatement = GetCurrentStatement(); + auto poCountLayer = m_poDS->CreateInternalLayer( + std::string("SELECT COUNT(*) FROM (") + .append(osCurStatement) + .append(")") + .c_str()); + if (poCountLayer && + poCountLayer->GetLayerDefn()->GetFieldCount() == 1) + { + auto poFeature = + std::unique_ptr(poCountLayer->GetNextFeature()); + if (poFeature) + return poFeature->GetFieldAsInteger64(0); + } + } + return OGRLayer::GetFeatureCount(bForce); } @@ -249,7 +963,7 @@ GIntBig OGRADBCLayer::GetFeatureCountParquet() const std::string osSQL(CPLSPrintf( "SELECT CAST(SUM(num_rows) AS BIGINT) FROM parquet_file_metadata('%s')", OGRDuplicateCharacter(m_poDS->m_osParquetFilename, '\'').c_str())); - auto poCountLayer = m_poDS->CreateLayer(osSQL.c_str(), "numrows"); + auto poCountLayer = m_poDS->CreateInternalLayer(osSQL.c_str()); if (poCountLayer && poCountLayer->GetLayerDefn()->GetFieldCount() == 1) { auto poFeature = diff --git a/port/cpl_known_config_options.h b/port/cpl_known_config_options.h index 1dfefb8400ea..89c63c882eb6 100644 --- a/port/cpl_known_config_options.h +++ b/port/cpl_known_config_options.h @@ -650,6 +650,7 @@ constexpr static const char* const apszKnownConfigOptions[] = "ODS_RESOLVE_FORMULAS", // from ogrodsdatasource.cpp "OGR2OGR_MIN_FEATURES_FOR_THREADED_REPROJ", // from ogr2ogr_lib.cpp "OGR2OGR_USE_ARROW_API", // from ogr2ogr_lib.cpp + "OGR_ADBC_AUTO_LOAD_DUCKDB_SPATIAL", // from ogradbcdataset.cpp "OGR_API_SPY_FILE", // from ograpispy.cpp "OGR_API_SPY_SNAPSHOT_PATH", // from ograpispy.cpp "OGR_APPLY_GEOM_SET_PRECISION", // from ogr2ogr_lib.cpp, ogrlayer.cpp