# diff --git a/autotest/ogr/ogr_csv.py b/autotest/ogr/ogr_csv.py
# index ef22bab214c9..7cb00c1394ce 100755
# --- a/autotest/ogr/ogr_csv.py
# +++ b/autotest/ogr/ogr_csv.py
# @@ -26,6 +26,7 @@
# Boston, MA 02111-1307, USA.
###############################################################################

import json
import math
import pathlib
import sys

# @@ -3182,6 +3183,285 @@ def test_ogr_csv_invalid_wkt(tmp_vsimem):
#     assert f.GetGeometryRef().ExportToWkt() == "POINT (1 2)"


###############################################################################
# Test schema override open option with CSV driver
#
@pytest.mark.parametrize(
    "open_options, expected_field_types, expected_field_names, expected_warning",
    [
        (
            [],
            [
                ogr.OFTString,
                ogr.OFTInteger,
                ogr.OFTReal,
                ogr.OFTInteger,  # bool subType
                ogr.OFTString,  # int string
                ogr.OFTString,  # real string
                (ogr.OFTString, ogr.OFSTNone),  # json subType
                ogr.OFTString,  # uuid subType
            ],
            [],
            None,
        ),
        # Override string field with integer
        (
            [
                r'OGR_SCHEMA={"layers": [{"name": "test_point", "fields": [{ "name": "str", "type": "Integer" }]}]}'
            ],
            [
                ogr.OFTInteger,  # <-- overridden
                ogr.OFTInteger,
                ogr.OFTReal,
                ogr.OFTInteger,  # bool subType
                ogr.OFTString,  # int string
                ogr.OFTString,  # real string
                ogr.OFTString,  # json subType
                ogr.OFTString,  # uuid subType
            ],
            [],
            None,
        ),
        # Override full schema and JSON subtype
        (
            [
                r'OGR_SCHEMA={ "layers": [{"name": "test_point", "schemaType": "Full", "fields": [{ "name": "json_str", "subType": "JSON", "new_name": "json_str" }]}]}'
            ],
            [
                (ogr.OFTString, ogr.OFSTJSON),  # json subType
            ],
            ["json_str"],
            None,
        ),
        # Test width and precision override
        (
            [
                r'OGR_SCHEMA={ "layers": [{"name": "test_point", "fields": [{ "name": "real", "width": 7, "precision": 3 }]}]}'
            ],
            [
                ogr.OFTString,
                ogr.OFTInteger,
                ogr.OFTReal,
                ogr.OFTInteger,  # bool subType
                ogr.OFTString,  # int string
                ogr.OFTString,  # real string
                (ogr.OFTString, ogr.OFSTNone),  # json subType
                ogr.OFTString,  # uuid subType
            ],
            [],
            None,
        ),
        # Test boolean and short integer subtype
        (
            [
                r'OGR_SCHEMA={ "layers": [{"name": "test_point", "fields": [{ "name": "int", "subType": "Boolean" }, { "name": "real", "type": "Integer", "subType": "Int16" }]}]}'
            ],
            [
                ogr.OFTString,
                (ogr.OFTInteger, ogr.OFSTBoolean),  # bool overridden subType
                (ogr.OFTInteger, ogr.OFSTInt16),  # int16 overridden subType
                ogr.OFTInteger,  # bool subType
                ogr.OFTString,  # int string
                ogr.OFTString,  # real string
                ogr.OFTString,  # json subType
                ogr.OFTString,  # uuid subType
            ],
            [],
            None,
        ),
        # Test invalid schema
        (
            [
                r'OGR_SCHEMA={ "layers": [{"name": "test_point", "fields": [{ "name": "str", "type": "xxxxx" }]}]}'
            ],
            [],
            [],
            "Unsupported field type: xxxxx for field str",
        ),
    ],
)
def test_ogr_csv_schema_override(
    tmp_path, open_options, expected_field_types, expected_field_names, expected_warning
):
    """Open a CSV/CSVT pair with an OGR_SCHEMA open option and check the schema.

    Parameters (supplied by the parametrization above):

    open_options -- empty list, or a single ``OGR_SCHEMA=<json>`` entry.
    expected_field_types -- one entry per expected field: either a bare field
        type, or a ``(type, subtype)`` tuple when the subtype must be checked
        too.  An empty list means that opening is expected to fail.
    expected_field_names -- expected field names; an empty list stands for the
        eight columns of the CSV file, in file order.
    expected_warning -- text that must appear in the last GDAL error message,
        or None.
    """

    csv_data = """str,int,real,bool,int_str,real_str,json_str,uuid_str
"1",2,3.4,1,"2","3.4","{""a"": 1}","12345678-1234-5678-1234-567812345678"
"""

    csvt_data = r"String,Integer,Real,Integer,String,String,String,String"

    csv_file = tmp_path / "test_point.csv"
    csv_file.write_text(csv_data)
    (tmp_path / "test_point.csvt").write_text(csvt_data)

    gdal.ErrorReset()

    # Everything after the first "=" is the JSON payload.  Partitioning (rather
    # than splitting on every "=") keeps a payload that itself contains "=".
    schema = open_options[0].partition("=")[2] if open_options else None

    with gdal.quiet_errors():

        # Validate the JSON schema (only when it is expected to be valid)
        if schema and not expected_warning:
            gdaltest.validate_json(
                json.loads(schema), "ogr_fields_override.schema.json"
            )

        ds = gdal.OpenEx(
            csv_file,
            gdal.OF_VECTOR | gdal.OF_READONLY,
            open_options=open_options,
            allowed_drivers=["CSV"],
        )

        # No expected field at all: the open itself must have failed with the
        # expected message.
        if not expected_field_types:
            last_error = gdal.GetLastErrorMsg()
            assert (
                expected_warning in last_error
            ), f"Warning {expected_warning} not found, got {last_error} instead"
            assert ds is None
            return

        assert ds is not None

        lyr = ds.GetLayer(0)
        assert lyr.GetFeatureCount() == 1

        lyr_defn = lyr.GetLayerDefn()
        assert lyr_defn.GetFieldCount() == len(expected_field_types)

        if not expected_field_names:
            expected_field_names = [
                "str",
                "int",
                "real",
                "bool",
                "int_str",
                "real_str",
                "json_str",
                "uuid_str",
            ]
        assert len(expected_field_names) == len(expected_field_types)

        feat = lyr.GetNextFeature()

        # Check field types (and subtypes where one is given) and names
        for i, (expected, expected_name) in enumerate(
            zip(expected_field_types, expected_field_names)
        ):
            field_defn = feat.GetFieldDefnRef(i)
            if isinstance(expected, tuple):
                expected_type, expected_subtype = expected
                assert field_defn.GetType() == expected_type
                assert field_defn.GetSubType() == expected_subtype
            else:
                assert field_defn.GetType() == expected
            assert field_defn.GetName() == expected_name

        # Test width and precision override
        if open_options and "precision" in open_options[0]:
            assert feat.GetFieldDefnRef(2).GetWidth() == 7
            assert feat.GetFieldDefnRef(2).GetPrecision() == 3

        # Check feature content
        if "int" in expected_field_names:
            int_sub_type = feat.GetFieldDefnRef("int").GetSubType()
            # The expected value is computed first: written inline as
            # "assert x == 1 if cond else 2", the conditional expression binds
            # looser than "==" and the non-boolean case would assert the
            # constant 2, which is always true.
            expected_int = 1 if int_sub_type == ogr.OFSTBoolean else 2
            assert feat.GetFieldAsInteger("int") == expected_int
        if "str" in expected_field_names:
            assert feat.GetFieldAsString("str") == "1"
        if "new_str" in expected_field_names:
            assert feat.GetFieldAsString("new_str") == "1"

        if expected_warning:
            last_error = gdal.GetLastErrorMsg()
            assert (
                expected_warning in last_error
            ), f"Warning {expected_warning} not found, got {last_error} instead"


def test_ogr_schema_override_wkt(tmp_vsimem):
    """Check OGR_SCHEMA overrides that target the WKT column of a CSV file.

    Four opens of the same file are checked: without override, with the WKT
    field forced to Integer, with a "Full" schema listing only "foo", and with
    a "Full" schema listing both "foo" and "WKT".
    """

    filename = str(tmp_vsimem / "test.csv")
    with gdaltest.vsi_open(filename, "wb") as fdest:
        fdest.write(b"id,WKT,foo\n")
        fdest.write(b'1,"POINT (1 2)",bar\n')

    with gdal.quiet_errors():

        # Baseline: no override
        ds = gdal.OpenEx(filename)
        lyr = ds.GetLayer(0)
        f = lyr.GetNextFeature()
        assert f["WKT"] == "POINT (1 2)"
        assert f["foo"] == "bar"
        assert f.GetGeometryRef().ExportToWkt() == "POINT (1 2)"

        # WKT attribute overridden to Integer: the value is unset and an error
        # is reported for the record
        ds = gdal.OpenEx(
            filename,
            open_options=[
                r'OGR_SCHEMA={"layers": [{"name": "test", "fields": [{ "name": "WKT", "type": "Integer" }]}]}'
            ],
        )
        lyr = ds.GetLayer(0)
        f = lyr.GetNextFeature()
        assert f["WKT"] is None
        assert f["foo"] == "bar"
        assert (
            "Invalid value type found in record 1 for field WKT"
            in gdal.GetLastErrorMsg()
        )

        # Full schema with only "foo": reading "WKT" either yields None or
        # raises the KeyError for an unknown field
        ds = gdal.OpenEx(
            filename,
            open_options=[
                r'OGR_SCHEMA={"layers": [{"name": "test", "schemaType": "Full", "fields": [{ "name": "foo" }]}]}'
            ],
        )
        lyr = ds.GetLayer(0)
        f = lyr.GetNextFeature()
        try:
            wkt_value = f["WKT"]
        except KeyError as ex:
            assert "Illegal field requested in GetField()" in str(ex)
        else:
            assert wkt_value is None
        assert f["foo"] == "bar"

        # Full schema listing both fields: same content as the baseline
        ds = gdal.OpenEx(
            filename,
            open_options=[
                r'OGR_SCHEMA={"layers": [{"name": "test", "schemaType": "Full", "fields": [{ "name": "foo" }, { "name": "WKT" }]}]}'
            ],
        )
        lyr = ds.GetLayer(0)
        f = lyr.GetNextFeature()
        assert f["WKT"] == "POINT (1 2)"
        assert f["foo"] == "bar"
        assert f.GetGeometryRef().ExportToWkt() == "POINT (1 2)"


###############################################################################
# diff --git a/doc/source/drivers/vector/csv.rst b/doc/source/drivers/vector/csv.rst
# index f3e20f391f5b..b027c2102a15 100644
# --- a/doc/source/drivers/vector/csv.rst
# +++ b/doc/source/drivers/vector/csv.rst
# @@ -115,8 +115,8 @@ Example
(employee.csv): :: ID,Salary,Name,Comments + 131,11000.0,Jane Lake,Chief Technical Officer 132,55000.0,John Walker,"The ""big"" cheese." - 133,11000.0,Jane Lake,Cleaning Staff Note that the Comments value for the second data record is placed in double quotes because the value contains quotes, and those quotes have @@ -413,6 +413,16 @@ The following open options are supported: Maximum number of bytes for a line (-1=unlimited). +- .. oo:: OGR_SCHEMA + :choices: <filename>|<json string> + :since: 3.11.0 + + Partially or totally overrides the auto-detected schema to use for creating the layer. + The overrides are defined as a JSON list of field definitions. + This can be a filename, a URL, or a JSON string conformant with the `ogr_fields_override.schema.json schema <https://raw.githubusercontent.com/OSGeo/gdal/refs/heads/master/ogr/data/ogr_fields_override.schema.json>`_. + This option takes precedence over any other option and over the .csvt file. + + Creation Issues --------------- @@ -550,7 +560,7 @@ Examples $ cat input.csv WKT,ID,Name "LINESTRING (-900 -1450,-900 100)",0,900W - + $ ogr2ogr -segmentize 400 -lco GEOMETRY=AS_WKT \ -sql "SELECT ID, Name FROM input" output.csv input.csv diff --git a/ogr/ogrsf_frmts/csv/ogr_csv.h b/ogr/ogrsf_frmts/csv/ogr_csv.h index 8e3c30e1f0cd..bcab9127ab9b 100644 --- a/ogr/ogrsf_frmts/csv/ogr_csv.h +++ b/ogr/ogrsf_frmts/csv/ogr_csv.h @@ -274,6 +274,13 @@ class OGRCSVDataSource final : public GDALDataset bool bEnableGeometryFields = false; + bool DealWithOgrSchemaOpenOption(CSLConstList papszOpenOptions); + + /* When OGR_SCHEMA and schemaType=Full, this will contain the list + * of removed fields (if any). 
+ */ + std::vector m_oDeletedFieldIndexes; + public: OGRCSVDataSource(); virtual ~OGRCSVDataSource() override; @@ -309,6 +316,7 @@ class OGRCSVDataSource final : public GDALDataset } static CPLString GetRealExtension(CPLString osFilename); + const std::vector &DeletedFieldIndexes() const; }; #endif // ndef OGR_CSV_H_INCLUDED diff --git a/ogr/ogrsf_frmts/csv/ogrcsvdatasource.cpp b/ogr/ogrsf_frmts/csv/ogrcsvdatasource.cpp index 1f4f49b03823..a81cff8d4ce6 100644 --- a/ogr/ogrsf_frmts/csv/ogrcsvdatasource.cpp +++ b/ogr/ogrsf_frmts/csv/ogrcsvdatasource.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include "cpl_conv.h" #include "cpl_csv.h" @@ -30,6 +31,7 @@ #include "ogr_spatialref.h" #include "ogreditablelayer.h" #include "ogrsf_frmts.h" +#include "ogr_schema_override.h" /************************************************************************/ /* OGRCSVEditableLayerSynchronizer */ @@ -717,6 +719,127 @@ bool OGRCSVDataSource::Open(const char *pszFilename, bool bUpdateIn, return bForceOpen || nNotCSVCount < GetLayerCount(); } +const std::vector &OGRCSVDataSource::DeletedFieldIndexes() const +{ + return m_oDeletedFieldIndexes; +} + +/************************************************************************/ +/* DealWithOgrSchemaOpenOption() */ +/************************************************************************/ +bool OGRCSVDataSource::DealWithOgrSchemaOpenOption( + CSLConstList papszOpenOptionsIn) +{ + std::string osFieldsSchemaOverrideParam = + CSLFetchNameValueDef(papszOpenOptionsIn, "OGR_SCHEMA", ""); + + if (!osFieldsSchemaOverrideParam.empty()) + { + if (bUpdate) + { + CPLError(CE_Failure, CPLE_NotSupported, + "OGR_SCHEMA open option is not supported in update mode."); + return false; + } + + OGRSchemaOverride osSchemaOverride; + if (!osSchemaOverride.LoadFromJSON(osFieldsSchemaOverrideParam) || + !osSchemaOverride.IsValid()) + { + return false; + } + + const auto &oLayerOverrides = osSchemaOverride.GetLayerOverrides(); + for (const auto &oLayer : 
oLayerOverrides) + { + const auto &oLayerName = oLayer.first; + const auto &oLayerFieldOverride = oLayer.second; + const bool bIsFullOverride{oLayerFieldOverride.IsFullOverride()}; + auto oFieldOverrides = oLayerFieldOverride.GetFieldOverrides(); + std::vector aoFields; + + CPLDebug("CSV", "Applying schema override for layer %s", + oLayerName.c_str()); + + // Fail if the layer name does not exist + auto poLayer = GetLayerByName(oLayerName.c_str()); + if (poLayer == nullptr) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Layer %s not found in CSV file", oLayerName.c_str()); + return false; + } + + // Patch field definitions + auto poLayerDefn = poLayer->GetLayerDefn(); + for (int i = 0; i < poLayerDefn->GetFieldCount(); i++) + { + auto poFieldDefn = poLayerDefn->GetFieldDefn(i); + auto oFieldOverride = + oFieldOverrides.find(poFieldDefn->GetNameRef()); + if (oFieldOverride != oFieldOverrides.cend()) + { + if (oFieldOverride->second.GetFieldType().has_value()) + whileUnsealing(poFieldDefn) + ->SetType( + oFieldOverride->second.GetFieldType().value()); + if (oFieldOverride->second.GetFieldWidth().has_value()) + whileUnsealing(poFieldDefn) + ->SetWidth( + oFieldOverride->second.GetFieldWidth().value()); + if (oFieldOverride->second.GetFieldPrecision().has_value()) + whileUnsealing(poFieldDefn) + ->SetPrecision( + oFieldOverride->second.GetFieldPrecision() + .value()); + if (oFieldOverride->second.GetFieldSubType().has_value()) + whileUnsealing(poFieldDefn) + ->SetSubType( + oFieldOverride->second.GetFieldSubType() + .value()); + if (oFieldOverride->second.GetFieldName().has_value()) + whileUnsealing(poFieldDefn) + ->SetName(oFieldOverride->second.GetFieldName() + .value() + .c_str()); + + if (bIsFullOverride) + { + aoFields.push_back(poFieldDefn); + } + oFieldOverrides.erase(oFieldOverride); + } + } + + // Error if any field override is not found + if (!oFieldOverrides.empty()) + { + CPLError(CE_Failure, CPLE_AppDefined, + "Field %s not found in layer %s", + 
oFieldOverrides.cbegin()->first.c_str(), + oLayerName.c_str()); + return false; + } + + // Remove fields not in the override + if (bIsFullOverride) + { + for (int i = poLayerDefn->GetFieldCount() - 1; i >= 0; i--) + { + auto poFieldDefn = poLayerDefn->GetFieldDefn(i); + if (std::find(aoFields.begin(), aoFields.end(), + poFieldDefn) == aoFields.end()) + { + whileUnsealing(poLayerDefn)->DeleteFieldDefn(i); + m_oDeletedFieldIndexes.push_back(i); + } + } + } + } + } + return true; +} + /************************************************************************/ /* OpenTable() */ /************************************************************************/ @@ -896,6 +1019,12 @@ bool OGRCSVDataSource::OpenTable(const char *pszFilename, else { m_apoLayers.emplace_back(std::move(poCSVLayer)); + + if (!DealWithOgrSchemaOpenOption(papszOpenOptionsIn)) + { + m_apoLayers.pop_back(); + return false; + } } return true; diff --git a/ogr/ogrsf_frmts/csv/ogrcsvdriver.cpp b/ogr/ogrsf_frmts/csv/ogrcsvdriver.cpp index 1611cfab692b..3734315a4b3c 100644 --- a/ogr/ogrsf_frmts/csv/ogrcsvdriver.cpp +++ b/ogr/ogrsf_frmts/csv/ogrcsvdriver.cpp @@ -426,6 +426,15 @@ void RegisterOGRCSV() "