From 48e4a28711dcc1559a0ed5fe0bdc3086431614ac Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Wed, 20 Nov 2024 13:07:20 +0100 Subject: [PATCH] /vsicurl/: fix to allow to read Parquet partitioned datasets from public Azure container using /vsicurl/ Fixes #11309 --- autotest/ogr/ogr_parquet.py | 21 +++++++ port/cpl_vsil_curl.cpp | 112 ++++++++++++++++++++++-------------- port/cpl_vsil_curl_class.h | 1 + 3 files changed, 91 insertions(+), 43 deletions(-) diff --git a/autotest/ogr/ogr_parquet.py b/autotest/ogr/ogr_parquet.py index 912ca56c34e3..add20d7952d0 100755 --- a/autotest/ogr/ogr_parquet.py +++ b/autotest/ogr/ogr_parquet.py @@ -3355,6 +3355,27 @@ def test_ogr_parquet_bbox_float32_but_no_covering_in_metadata(use_dataset): ############################################################################### +@gdaltest.enable_exceptions() +@pytest.mark.require_curl +def test_ogr_parquet_overture_from_azure(): + + if not _has_arrow_dataset(): + pytest.skip("Test requires build with ArrowDataset") + + url = "https://overturemapswestus2.blob.core.windows.net/release?comp=list&delimiter=%2F&prefix=2024-11-13.0%2Ftheme%3Ddivisions%2Ftype%3Ddivision_area%2F&restype=container" + if gdaltest.gdalurlopen(url, timeout=5) is None: + pytest.skip(reason=f"{url} is down") + + with ogr.Open( + "PARQUET:/vsicurl/https://overturemapswestus2.blob.core.windows.net/release/2024-11-13.0/theme=divisions/type=division_area" + ) as ds: + lyr = ds.GetLayer(0) + assert lyr.GetFeatureCount() > 0 + + +############################################################################### + + @gdaltest.enable_exceptions() def test_ogr_parquet_write_arrow(tmp_vsimem): diff --git a/port/cpl_vsil_curl.cpp b/port/cpl_vsil_curl.cpp index 6899cf6af6bd..afdc67bb8982 100644 --- a/port/cpl_vsil_curl.cpp +++ b/port/cpl_vsil_curl.cpp @@ -1348,49 +1348,6 @@ vsi_l_offset VSICurlHandle::GetFileSizeOrHeaders(bool bSetError, if (sWriteFuncHeaderData.pBuffer != nullptr && (response_code == 200 || response_code == 206)) { 
- const char *pzETag = - strstr(sWriteFuncHeaderData.pBuffer, "ETag: \""); - if (pzETag) - { - pzETag += strlen("ETag: \""); - const char *pszEndOfETag = strchr(pzETag, '"'); - if (pszEndOfETag) - { - oFileProp.ETag.assign(pzETag, pszEndOfETag - pzETag); - } - } - - // Azure Data Lake Storage - const char *pszPermissions = - strstr(sWriteFuncHeaderData.pBuffer, "x-ms-permissions: "); - if (pszPermissions) - { - pszPermissions += strlen("x-ms-permissions: "); - const char *pszEOL = strstr(pszPermissions, "\r\n"); - if (pszEOL) - { - bool bIsDir = - strstr(sWriteFuncHeaderData.pBuffer, - "x-ms-resource-type: directory\r\n") != nullptr; - bool bIsFile = - strstr(sWriteFuncHeaderData.pBuffer, - "x-ms-resource-type: file\r\n") != nullptr; - if (bIsDir || bIsFile) - { - oFileProp.bIsDirectory = bIsDir; - std::string osPermissions; - osPermissions.assign(pszPermissions, - pszEOL - pszPermissions); - if (bIsDir) - oFileProp.nMode = S_IFDIR; - else - oFileProp.nMode = S_IFREG; - oFileProp.nMode |= - VSICurlParseUnixPermissions(osPermissions.c_str()); - } - } - } - { char **papszHeaders = CSLTokenizeString2(sWriteFuncHeaderData.pBuffer, "\r\n", 0); @@ -1412,6 +1369,44 @@ vsi_l_offset VSICurlHandle::GetFileSizeOrHeaders(bool bSetError, { m_bCached = false; } + + else if (EQUAL(pszKey, "ETag")) + { + std::string osValue(pszValue); + if (osValue.size() >= 2 && osValue.front() == '"' && + osValue.back() == '"') + osValue = osValue.substr(1, osValue.size() - 2); + oFileProp.ETag = osValue; + } + + // Azure Data Lake Storage + else if (EQUAL(pszKey, "x-ms-resource-type")) + { + if (EQUAL(pszValue, "file")) + { + oFileProp.nMode |= S_IFREG; + } + else if (EQUAL(pszValue, "directory")) + { + oFileProp.bIsDirectory = true; + oFileProp.nMode |= S_IFDIR; + } + } + else if (EQUAL(pszKey, "x-ms-permissions")) + { + oFileProp.nMode |= + VSICurlParseUnixPermissions(pszValue); + } + + // 
https://overturemapswestus2.blob.core.windows.net/release/2024-11-13.0/theme%3Ddivisions/type%3Ddivision_area + // returns a x-ms-meta-hdi_isfolder: true header + else if (EQUAL(pszKey, "x-ms-meta-hdi_isfolder") && + EQUAL(pszValue, "true")) + { + oFileProp.bIsAzureFolder = true; + oFileProp.bIsDirectory = true; + oFileProp.nMode |= S_IFDIR; + } } CPLFree(pszKey); } @@ -4893,6 +4888,37 @@ char **VSICurlFilesystemHandlerBase::GetFileList(const char *pszDirname, if (!bListDir) return nullptr; + // Deal with publicly visible Azure directories. + if (STARTS_WITH(osURL.c_str(), "https://")) + { + const char *pszBlobCore = + strstr(osURL.c_str(), ".blob.core.windows.net/"); + if (pszBlobCore) + { + FileProp cachedFileProp; + GetCachedFileProp(osURL.c_str(), cachedFileProp); + if (cachedFileProp.bIsAzureFolder) + { + const char *pszURLWithoutHTTPS = + osURL.c_str() + strlen("https://"); + const std::string osStorageAccount( + pszURLWithoutHTTPS, pszBlobCore - pszURLWithoutHTTPS); + CPLConfigOptionSetter oSetter1("AZURE_NO_SIGN_REQUEST", "YES", + false); + CPLConfigOptionSetter oSetter2("AZURE_STORAGE_ACCOUNT", + osStorageAccount.c_str(), false); + const std::string osVSIAZ(std::string("/vsiaz/").append( + pszBlobCore + strlen(".blob.core.windows.net/"))); + char **papszFileList = VSIReadDirEx(osVSIAZ.c_str(), nMaxFiles); + if (papszFileList) + { + *pbGotFileList = true; + return papszFileList; + } + } + } + } + // HACK (optimization in fact) for MBTiles driver. if (strstr(pszDirname, ".tiles.mapbox.com") != nullptr) return nullptr; diff --git a/port/cpl_vsil_curl_class.h b/port/cpl_vsil_curl_class.h index 4fb1e4ff9bce..6b47050dd638 100644 --- a/port/cpl_vsil_curl_class.h +++ b/port/cpl_vsil_curl_class.h @@ -80,6 +80,7 @@ class FileProp std::string osRedirectURL{}; bool bHasComputedFileSize = false; bool bIsDirectory = false; + bool bIsAzureFolder = false; int nMode = 0; // st_mode member of struct stat bool bS3LikeRedirect = false; std::string ETag{};