From 01ffb6476b70d36a8433eff8baeb4e16cc76b414 Mon Sep 17 00:00:00 2001 From: Daniel Hegberg Date: Thu, 19 Dec 2024 06:26:09 +0700 Subject: [PATCH] Support Null regex override in csv parser options. (#13228) Co-authored-by: Andrew Lamb --- datafusion-cli/Cargo.lock | 1 + datafusion/common/src/config.rs | 3 + datafusion/core/Cargo.toml | 1 + datafusion/core/benches/csv_load.rs | 10 ++ .../core/src/datasource/file_format/csv.rs | 81 +++++++++++++- .../src/datasource/file_format/options.rs | 12 ++- .../csv/aggregate_test_100_with_nulls.csv | 101 ++++++++++++++++++ .../proto/datafusion_common.proto | 9 +- datafusion/proto-common/src/from_proto/mod.rs | 2 + .../proto-common/src/generated/pbjson.rs | 18 ++++ .../proto-common/src/generated/prost.rs | 98 +++++++++-------- datafusion/proto-common/src/to_proto/mod.rs | 1 + .../src/generated/datafusion_proto_common.rs | 11 +- .../proto/src/logical_plan/file_formats.rs | 6 ++ 14 files changed, 292 insertions(+), 62 deletions(-) create mode 100644 datafusion/core/tests/data/csv/aggregate_test_100_with_nulls.csv diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 9552542befd8..bac4192b10ce 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1261,6 +1261,7 @@ dependencies = [ "parking_lot", "parquet", "rand", + "regex", "sqlparser", "tempfile", "tokio", diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3a07a238a4c9..4948833b1f5f 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1655,7 +1655,10 @@ config_namespace! { pub timestamp_format: Option, default = None pub timestamp_tz_format: Option, default = None pub time_format: Option, default = None + // The output format for Nulls in the CSV writer. pub null_value: Option, default = None + // The input regex for Nulls when loading CSVs. + pub null_regex: Option, default = None pub comment: Option, default = None } } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 1995ab4ca075..01bf03f32e8e 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -121,6 +121,7 @@ object_store = { workspace = true } parking_lot = { workspace = true } parquet = { workspace = true, optional = true, default-features = true } rand = { workspace = true } +regex = { workspace = true } sqlparser = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/core/benches/csv_load.rs b/datafusion/core/benches/csv_load.rs index 5f707b31a6a9..2d42121ec9b2 100644 --- a/datafusion/core/benches/csv_load.rs +++ b/datafusion/core/benches/csv_load.rs @@ -75,6 +75,16 @@ fn criterion_benchmark(c: &mut Criterion) { ) }) }); + + group.bench_function("null regex override", |b| { + b.iter(|| { + load_csv( + ctx.clone(), + test_file.path().to_str().unwrap(), + CsvReadOptions::default().null_regex(Some("^NULL$|^$".to_string())), + ) + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index e9a93475d3ce..e8fb3690efbf 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -59,6 +59,7 @@ use datafusion_physical_expr_common::sort_expr::LexRequirement; use futures::stream::BoxStream; use futures::{pin_mut, Stream, StreamExt, TryStreamExt}; use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore}; +use regex::Regex; #[derive(Default)] /// Factory struct used to create [CsvFormatFactory] @@ -218,6 +219,13 @@ impl CsvFormat { self } + /// Set the regex to use for null values in the CSV reader. + /// - default to treat empty values as null. + pub fn with_null_regex(mut self, null_regex: Option) -> Self { + self.options.null_regex = null_regex; + self + } + /// Returns `Some(true)` if the first line is a header, `Some(false)` if /// it is not, and `None` if it is not specified. pub fn has_header(&self) -> Option { @@ -502,6 +510,12 @@ impl CsvFormat { .with_delimiter(self.options.delimiter) .with_quote(self.options.quote); + if let Some(null_regex) = &self.options.null_regex { + let regex = Regex::new(null_regex.as_str()) + .expect("Unable to parse CSV null regex."); + format = format.with_null_regex(regex); + } + if let Some(escape) = self.options.escape { format = format.with_escape(escape); } @@ -813,8 +827,67 @@ mod tests { let state = session_ctx.state(); let projection = None; - let exec = - get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?; + let root = "./tests/data/csv"; + let format = CsvFormat::default().with_has_header(true); + let exec = scan_format( + &state, + &format, + root, + "aggregate_test_100_with_nulls.csv", + projection, + None, + ) + .await?; + + let x: Vec = exec + .schema() + .fields() + .iter() + .map(|f| format!("{}: {:?}", f.name(), f.data_type())) + .collect(); + assert_eq!( + vec![ + "c1: Utf8", + "c2: Int64", + "c3: Int64", + "c4: Int64", + "c5: Int64", + "c6: Int64", + "c7: Int64", + "c8: Int64", + "c9: Int64", + "c10: Utf8", + "c11: Float64", + "c12: Float64", + "c13: Utf8", + "c14: Null", + "c15: Utf8" + ], + x + ); + + Ok(()) + } + + #[tokio::test] + async fn infer_schema_with_null_regex() -> Result<()> { + let session_ctx = SessionContext::new(); + let state = session_ctx.state(); + + let projection = None; + let root = "./tests/data/csv"; + let format = CsvFormat::default() + .with_has_header(true) + .with_null_regex(Some("^NULL$|^$".to_string())); + let exec = scan_format( + &state, + &format, + root, + "aggregate_test_100_with_nulls.csv", + projection, + None, + ) + .await?; let x: Vec = exec .schema() @@ -836,7 +909,9 @@ mod tests { "c10: Utf8", "c11: Float64", "c12: Float64", - "c13: Utf8" + "c13: Utf8", + "c14: Null", + "c15: Null" ], x ); diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index 95576f448c64..2f32479ed2b0 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -87,6 +87,8 @@ pub struct CsvReadOptions<'a> { pub file_compression_type: FileCompressionType, /// Indicates how the file is sorted pub file_sort_order: Vec>, + /// Optional regex to match null values + pub null_regex: Option, } impl Default for CsvReadOptions<'_> { @@ -112,6 +114,7 @@ impl<'a> CsvReadOptions<'a> { file_compression_type: FileCompressionType::UNCOMPRESSED, file_sort_order: vec![], comment: None, + null_regex: None, } } @@ -212,6 +215,12 @@ impl<'a> CsvReadOptions<'a> { self.file_sort_order = file_sort_order; self } + + /// Configure the null parsing regex. + pub fn null_regex(mut self, null_regex: Option) -> Self { + self.null_regex = null_regex; + self + } } /// Options that control the reading of Parquet files. @@ -534,7 +543,8 @@ impl ReadOptions<'_> for CsvReadOptions<'_> { .with_terminator(self.terminator) .with_newlines_in_values(self.newlines_in_values) .with_schema_infer_max_rec(self.schema_infer_max_records) - .with_file_compression_type(self.file_compression_type.to_owned()); + .with_file_compression_type(self.file_compression_type.to_owned()) + .with_null_regex(self.null_regex.clone()); ListingOptions::new(Arc::new(file_format)) .with_file_extension(self.file_extension) diff --git a/datafusion/core/tests/data/csv/aggregate_test_100_with_nulls.csv b/datafusion/core/tests/data/csv/aggregate_test_100_with_nulls.csv new file mode 100644 index 000000000000..0aabb2785250 --- /dev/null +++ b/datafusion/core/tests/data/csv/aggregate_test_100_with_nulls.csv @@ -0,0 +1,101 @@ +c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15 +c,2,1,18109,2033001162,-6513304855495910254,25,43062,1491205016,5863949479783605708,0.110830784,0.9294097332465232,6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW,,NULL +d,5,-40,22614,706441268,-7542719935673075327,155,14337,3373581039,11720144131976083864,0.69632107,0.3114712539863804,C2GT5KVyOPZpgKVl110TyZO0NcJ434,,NULL +b,1,29,-18218,994303988,5983957848665088916,204,9489,3275293996,14857091259186476033,0.53840446,0.17909035118828576,AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz,,NULL +a,1,-85,-15154,1171968280,1919439543497968449,77,52286,774637006,12101411955859039553,0.12285209,0.6864391962767343,0keZ5G8BffGwgF2RwQD59TFzMStxCB,,NULL +b,5,-82,22080,1824882165,7373730676428214987,208,34331,3342719438,3330177516592499461,0.82634634,0.40975383525297016,Ig1QcuKsjHXkproePdERo2w0mYzIqd,,NULL +b,4,-111,-1967,-4229382,1892872227362838079,67,9832,1243785310,8382489916947120498,0.06563997,0.152498292971736,Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH,,NULL +e,3,104,-25136,1738331255,300633854973581194,139,20807,3577318119,13079037564113702254,0.40154034,0.7764360990307122,DuJNG8tufSqW0ZstHqWj3aGvFLMg4A,,NULL +a,3,13,12613,1299719633,2020498574254265315,191,17835,3998790955,14881411008939145569,0.041445434,0.8813167497816289,Amn2K87Db5Es3dFQO9cw9cvpAM6h35,,NULL +d,1,38,18384,-335410409,-1632237090406591229,26,57510,2712615025,1842662804748246269,0.6064476,0.6404495093354053,4HX6feIvmNXBN7XGqgO4YVBkhu8GDI,,NULL +a,4,-38,20744,762932956,308913475857409919,7,45465,1787652631,878137512938218976,0.7459874,0.02182578039211991,ydkwycaISlYSlEq3TlkS2m15I2pcp8,,NULL +d,1,57,28781,-1143802338,2662536767954229885,202,62167,879082834,4338034436871150616,0.7618384,0.42950521730777025,VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4,,NULL +a,4,-54,-2376,434021400,5502271306323260832,113,15777,2502326480,7966148640299601101,0.5720931,0.30585375151301186,KJFcmTVjdkCMv94wYCtfHMFhzyRsmH,,NULL +e,3,112,-6823,-421042466,8535335158538929274,129,32712,3759340273,9916295859593918600,0.6424343,0.6316565296547284,BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE,,NULL +d,2,113,3917,-108973366,-7220140168410319165,197,24380,63044568,4225581724448081782,0.11867094,0.2944158618048994,90gAtmGEeIqUTbo1ZrxCvWtsseukXC,,NULL +b,1,54,-18410,1413111008,-7145106120930085900,249,5382,1842680163,17818611040257178339,0.8881188,0.24899794314659673,6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ,,NULL +c,1,103,-22186,431378678,1346564663822463162,146,12393,3766999078,10901819591635583995,0.064453244,0.7784918983501654,2T3wSlHdEmASmO0xcXHnndkKEt6bz8,,NULL +e,2,49,24495,-587831330,9178511478067509438,129,12757,1289293657,10948666249269100825,0.5610077,0.5991138115095911,bgK1r6v3BCTh0aejJUhkA1Hn6idXGp,,NULL +d,1,-98,13630,-1991133944,1184110014998006843,220,2986,225513085,9634106610243643486,0.89651865,0.1640882545084913,y7C453hRWd4E7ImjNDWlpexB8nUqjh,,NULL +d,3,77,15091,-1302295658,8795481303066536947,154,35477,2093538928,17419098323248948387,0.11952883,0.7035635283169166,O66j6PaYuZhEUtqV6fuU7TyjM2WxC5,,NULL +e,2,97,18167,1593800404,-9112448817105133638,163,45185,3188005828,2792105417953811674,0.38175434,0.4094218353587008,ukOiFGGFnQJDHFgZxHMpvhD3zybF0M,,NULL +e,4,-56,-31500,1544188174,3096047390018154410,220,417,557517119,2774306934041974261,0.15459597,0.19113293583306745,IZTkHMLvIKuiLjhDjYMmIHxh166we4,,NULL +d,1,-99,5613,1213926989,-8863698443222021480,19,18736,4216440507,14933742247195536130,0.6067944,0.33639590659276175,aDxBtor7Icd9C5hnTvvw5NrIre740e,,NULL +a,5,36,-16974,623103518,6834444206535996609,71,29458,141047417,17448660630302620693,0.17100024,0.04429073092078406,OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh,,NULL +e,4,-53,13788,2064155045,-691093532952651300,243,35106,2778168728,9463973906560740422,0.34515214,0.27159190516490006,0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm,,NULL +c,2,-29,25305,-537142430,-7683452043175617798,150,31648,598822671,11759014161799384683,0.8315913,0.946325164889271,9UbObCsVkmYpJGcGrgfK90qOnwb2Lj,,NULL +a,1,-25,15295,383352709,4980135132406487265,231,102,3276123488,12763583666216333412,0.53796273,0.17592486905979987,XemNcT1xp61xcM1Qz3wZ1VECCnq06O,,NULL +c,4,123,16620,852509237,-3087630526856906991,196,33715,3566741189,4546434653720168472,0.07606989,0.819715865079681,8LIh0b6jmDGm87BmIyjdxNIpX4ugjD,,NULL +a,5,-31,-12907,586844478,-4862189775214031241,170,28086,1013876852,11005002152861474932,0.35319167,0.05573662213439634,MeSTAXq8gVxVjbEjgkvU9YLte0X9uE,,NULL +a,2,45,15673,-1899175111,398282800995316041,99,2555,145294611,8554426087132697832,0.17333257,0.6405262429561641,b3b9esRhTzFEawbs6XhpKnD9ojutHB,,NULL +b,3,17,14457,670497898,-2390782464845307388,255,24770,1538863055,12662506238151717757,0.34077626,0.7614304100703713,6x93sxYioWuq5c9Kkk8oTAAORM7cH0,,NULL +e,4,97,-13181,2047637360,6176835796788944083,158,53000,2042457019,9726016502640071617,0.7085086,0.12357539988406441,oHJMNvWuunsIMIWFnYG31RCfkOo2V7,,NULL +c,2,-60,-16312,-1808210365,-3368300253197863813,71,39635,2844041986,7045482583778080653,0.805363,0.6425694115212065,BJqx5WokrmrrezZA0dUbleMYkG5U2O,,NULL +e,1,36,-21481,-928766616,-3471238138418013024,150,52569,2610290479,7788847578701297242,0.2578469,0.7670021786149205,gpo8K5qtYePve6jyPt6xgJx4YOVjms,,NULL +b,5,-5,24896,1955646088,2430204191283109071,118,43655,2424630722,11429640193932435507,0.87989986,0.7328050041291218,JafwVLSVk5AVoXFuzclesQ000EE2k1,,NULL +a,3,13,32064,912707948,3826618523497875379,42,21463,2214035726,10771380284714693539,0.6133468,0.7325106678655877,i6RQVXKUh7MzuGMDaNclUYnFUAireU,,NULL +c,1,41,-4667,-644225469,7049620391314639084,196,48099,2125812933,15419512479294091215,0.5780736,0.9255031346434324,mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS,,NULL +d,2,93,-12642,2053379412,6468763445799074329,147,50842,1000948272,5536487915963301239,0.4279275,0.28534428578703896,lqhzgLsXZ8JhtpeeUWWNbMz8PHI705,,NULL +c,3,73,-9565,-382483011,1765659477910680019,186,1535,1088543984,2906943497598597237,0.680652,0.6009475544728957,Ow5PGpfTm4dXCfTDsXAOTatXRoAydR,,NULL +c,3,-2,-18655,-2141999138,-3154042970870838072,251,34970,3862393166,13062025193350212516,0.034291923,0.7697753383420857,IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr,,NULL +c,3,22,13741,-2098805236,8604102724776612452,45,2516,1362369177,196777795886465166,0.94669616,0.0494924465469434,6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE,,NULL +b,2,63,21456,-2138770630,-2380041687053733364,181,57594,2705709344,13144161537396946288,0.09683716,0.3051364088814128,nYVJnVicpGRqKZibHyBAmtmzBXAFfT,,NULL +d,4,102,-24558,1991172974,-7823479531661596016,14,36599,1534194097,2240998421986827216,0.028003037,0.8824879447595726,0og6hSkhbX8AC1ktFS4kounvTzy8Vo,,NULL +d,1,-8,27138,-1383162419,7682021027078563072,36,64517,2861376515,9904216782086286050,0.80954456,0.9463098243875633,AFGCj7OWlEB5QfniEFgonMq90Tq5uH,,NULL +a,3,17,-22796,1337043149,-1282905594104562444,167,2809,754775609,732272194388185106,0.3884129,0.658671129040488,VDhtJkYjAYPykCgOU9x3v7v3t4SO1a,,NULL +e,2,52,23388,715235348,605432070100399212,165,56980,3314983189,7386391799827871203,0.46076488,0.980809631269599,jQimhdepw3GKmioWUlVSWeBVRKFkY3,,NULL +b,5,68,21576,1188285940,5717755781990389024,224,27600,974297360,9865419128970328044,0.80895734,0.7973920072996036,ioEncce3mPOXD2hWhpZpCPWGATG6GU,,NULL +b,2,31,23127,-800561771,-8706387435232961848,153,27034,1098639440,3343692635488765507,0.35692692,0.5590205548347534,okOkcWflkNXIy4R8LzmySyY1EC3sYd,,NULL +c,1,-24,-24085,-1882293856,7385529783747709716,41,48048,520189543,2402288956117186783,0.39761502,0.3600766362333053,Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u,,NULL +a,4,65,-28462,-1813935549,7602389238442209730,18,363,1865307672,11378396836996498283,0.09130204,0.5593249815276734,WHmjWk2AY4c6m7DA4GitUx6nmb1yYS,,NULL +d,1,125,31106,-1176490478,-4306856842351827308,90,17910,3625286410,17869394731126786457,0.8882508,0.7631239070049998,dVdvo6nUD5FgCgsbOZLds28RyGTpnx,,NULL +b,4,17,-28070,-673237643,1904316899655860234,188,27744,933879086,3732692885824435932,0.41860116,0.40342283197779727,JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ,,NULL +c,2,-106,-1114,-1927628110,1080308211931669384,177,20421,141680161,7464432081248293405,0.56749094,0.565352842229935,Vp3gmWunM5A7wOC9YW2JroFqTWjvTi,,NULL +d,5,-59,2045,-2117946883,1170799768349713170,189,63353,1365198901,2501626630745849169,0.75173044,0.18628859265874176,F7NSTjWvQJyBburN7CXRUlbgp2dIrA,,NULL +d,4,55,-1471,1902023838,1252101628560265705,157,3691,811650497,1524771507450695976,0.2968701,0.5437595540422571,f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX,,NULL +b,2,-60,-21739,-1908480893,-8897292622858103761,59,50009,2525744318,1719090662556698549,0.52930677,0.560333188635217,l7uwDoTepWwnAP0ufqtHJS3CRi7RfP,,NULL +d,3,-76,8809,141218956,-9110406195556445909,58,5494,1824517658,12046662515387914426,0.8557294,0.6668423897406515,Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK,,NULL +e,4,73,-22501,1282464673,2541794052864382235,67,21119,538589788,9575476605699527641,0.48515016,0.296036538664718,4JznSdBajNWhu4hRQwjV1FjTTxY68i,,NULL +b,4,-117,19316,2051224722,-5534418579506232438,133,52046,3023531799,13684453606722360110,0.62608826,0.8506721053047003,mhjME0zBHbrK6NMkytMTQzOssOa1gF,,NULL +a,4,-101,11640,1993193190,2992662416070659899,230,40566,466439833,16778113360088370541,0.3991115,0.574210838214554,NEhyk8uIx4kEULJGa8qIyFjjBcP2G6,,NULL +b,5,62,16337,41423756,-2274773899098124524,121,34206,2307004493,10575647935385523483,0.23794776,0.1754261586710173,qnPOOmslCJaT45buUisMRnM0rc77EK,,NULL +c,4,-79,5281,-237425046,373011991904079451,121,55620,2818832252,2464584078983135763,0.49774808,0.9237877978193884,t6fQUjJejPcjc04wHvHTPe55S65B4V,,NULL +b,2,68,15874,49866617,1179733259727844435,121,23948,3455216719,3898128009708892708,0.6306253,0.9185813970744787,802bgTGl6Bk5TlkPYYTxp5JkKyaYUA,,NULL +c,1,70,27752,1325868318,1241882478563331892,63,61637,473294098,4976799313755010034,0.13801557,0.5081765563442366,Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn,,NULL +e,2,-61,-2888,-1660426473,2553892468492435401,126,35429,4144173353,939909697866979632,0.4405142,0.9231889896940375,BPtQMxnuSPpxMExYV9YkDa6cAN7GP3,,NULL +e,4,74,-12612,-1885422396,1702850374057819332,130,3583,3198969145,10767179755613315144,0.5518061,0.5614503754617461,QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv,,NULL +d,2,122,10130,-168758331,-3179091803916845592,30,794,4061635107,15695681119022625322,0.69592506,0.9748360509016578,OPwBqCEK5PWTjWaiOyL45u2NLTaDWv,,NULL +e,3,71,194,1436496767,-5639533800082367925,158,44507,3105312559,3998472996619161534,0.930117,0.6108938307533,pTeu0WMjBRTaNRT15rLCuEh3tBJVc5,,NULL +c,5,-94,-15880,2025611582,-3348824099853919681,5,40622,4268716378,12849419495718510869,0.34163946,0.4830878559436823,RilTlL1tKkPOUFuzmLydHAVZwv1OGl,,NULL +d,1,-72,25590,1188089983,3090286296481837049,241,832,3542840110,5885937420286765261,0.41980565,0.21535402343780985,wwXqSGKLyBQyPkonlzBNYUJTCo4LRS,,NULL +e,1,71,-5479,-1339586153,-3920238763788954243,123,53012,4229654142,10297218950720052365,0.73473036,0.5773498217058918,cBGc0kSm32ylBDnxogG727C0uhZEYZ,,NULL +e,4,96,-30336,427197269,7506304308750926996,95,48483,3521368277,5437030162957481122,0.58104324,0.42073125331890115,3BEOHQsMEFZ58VcNTOJYShTBpAPzbt,,NULL +a,2,-48,-18025,439738328,-313657814587041987,222,13763,3717551163,9135746610908713318,0.055064857,0.9800193410444061,ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8,,NULL +a,1,-56,8692,2106705285,-7811675384226570375,231,15573,1454057357,677091006469429514,0.42794758,0.2739938529235548,JN0VclewmjwYlSl8386MlWv5rEhWCz,,NULL +e,2,52,-12056,-1090239422,9011500141803970147,238,4168,2013662838,12565360638488684051,0.6694766,0.39144436569161134,xipQ93429ksjNcXPX5326VSg1xJZcW,,NULL +a,1,-5,12636,794623392,2909750622865366631,15,24022,2669374863,4776679784701509574,0.29877836,0.2537253407987472,waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs,,NULL +b,1,12,7652,-1448995523,-5332734971209541785,136,49283,4076864659,15449267433866484283,0.6214579,0.05636955101974106,akiiY5N0I44CMwEnBL6RTBk7BRkxEj,,NULL +e,5,64,-26526,1689098844,8950618259486183091,224,45253,662099130,16127995415060805595,0.2897315,0.5759450483859969,56MZa5O1hVtX4c5sbnCfxuX5kDChqI,,NULL +c,4,-90,-2935,1579876740,6733733506744649678,254,12876,3593959807,4094315663314091142,0.5708688,0.5603062368164834,Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV,,NULL +e,5,-86,32514,-467659022,-8012578250188146150,254,2684,2861911482,2126626171973341689,0.12559289,0.01479305307777301,gxfHWUF8XgY2KdFxigxvNEXe2V2XMl,,NULL +c,2,-117,-30187,-1222533990,-191957437217035800,136,47061,2293105904,12659011877190539078,0.2047385,0.9706712283358269,pLk3i59bZwd5KBZrI1FiweYTd5hteG,,NULL +a,3,14,28162,397430452,-452851601758273256,57,14722,431948861,8164671015278284913,0.40199697,0.07260475960924484,TtDKUZxzVxsq758G6AWPSYuZgVgbcl,,NULL +c,2,29,-3855,1354539333,4742062657200940467,81,53815,3398507249,562977550464243101,0.7124534,0.991517828651004,Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0,,NULL +b,4,-59,25286,1423957796,2646602445954944051,0,61069,3570297463,15100310750150419896,0.49619365,0.04893135681998029,fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG,,NULL +a,1,83,-14704,2143473091,-4387559599038777245,37,829,4015442341,4602675983996931623,0.89542526,0.9567595541247681,ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU,,NULL +a,3,-12,-9168,1489733240,-1569376002217735076,206,33821,3959216334,16060348691054629425,0.9488028,0.9293883502480845,oLZ21P2JEDooxV1pU31cIxQHEeeoLu,,NULL +c,4,3,-30508,659422734,-6455460736227846736,133,59663,2306130875,8622584762448622224,0.16999894,0.4273123318932347,EcCuckwsF3gV1Ecgmh5v4KM8g1ozif,,NULL +a,3,-72,-11122,-2141451704,-2578916903971263854,83,30296,1995343206,17452974532402389080,0.94209343,0.3231750610081745,e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG,,NULL +c,2,-107,-2904,-1011669561,782342092880993439,18,29527,1157161427,4403623840168496677,0.31988364,0.36936304600612724,QYlaIAnJA6r8rlAb6f59wcxvcPcWFf,,NULL +c,5,118,19208,-134213907,-2120241105523909127,86,57751,1229567292,16493024289408725403,0.5536642,0.9723580396501548,TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX,,NULL +c,3,97,29106,-903316089,2874859437662206732,207,42171,3473924576,8188072741116415408,0.32792538,0.2667177795079635,HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g,,NULL +b,3,-101,-13217,-346989627,5456800329302529236,26,54276,243203849,17929716297117857676,0.05422181,0.09465635123783445,MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ,,NULL +a,2,-43,13080,370975815,5881039805148485053,2,20120,2939920218,906367167997372130,0.42733806,0.16301110515739792,m6jD0LBIQWaMfenwRCTANI9eOdyyto,,NULL +a,5,-101,-12484,-842693467,-6140627905445351305,57,57885,2496054700,2243924747182709810,0.59520596,0.9491397432856566,QJYm7YRA3YetcBHI5wkMZeLXVmfuNy,,NULL +b,5,-44,15788,-629486480,5822642169425315613,13,11872,3457053821,2413406423648025909,0.44318348,0.32869374687050157,ALuRhobVWbnQTTWZdSOk0iVe8oYFhW,,NULL +d,4,5,-7688,702611616,6239356364381313700,4,39363,3126475872,35363005357834672,0.3766935,0.061029375346466685,H5j5ZHy1FGesOAHjkQEDYCucbpKWRu,,NULL +e,1,120,10837,-1331533190,6342019705133850847,245,3975,2830981072,16439861276703750332,0.6623719,0.9965400387585364,LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW,,NULL +e,3,-95,13611,2030965207,927403809957470678,119,59134,559847112,10966649192992996919,0.5301289,0.047343434291126085,gTpyQnEODMcpsPnJMZC66gh33i3m0b,,NULL +d,3,123,29533,240273900,1176001466590906949,117,30972,2592330556,12883447461717956514,0.39075065,0.38870280983958583,1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO,,NULL +b,4,47,20690,-1009656194,-2027442591571700798,200,7781,326151275,2881913079548128905,0.57360977,0.2145232647388039,52mKlRE3aHCBZtjECq6sY9OqVf8Dze,,NULL +e,4,30,-16110,61035129,-3356533792537910152,159,299,28774375,13526465947516666293,0.6999775,0.03968347085780355,cq4WSAIFwx3wwTUS5bp1wCe71R6U5I,,NULL \ No newline at end of file diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 2da8b6066742..69626f97fd80 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -421,10 +421,11 @@ message CsvOptions { string timestamp_tz_format = 10; // Optional timestamp with timezone format string time_format = 11; // Optional time format string null_value = 12; // Optional representation of null value - bytes comment = 13; // Optional comment character as a byte - bytes double_quote = 14; // Indicates if quotes are doubled - bytes newlines_in_values = 15; // Indicates if newlines are supported in values - bytes terminator = 16; // Optional terminator character as a byte + string null_regex = 13; // Optional representation of null loading regex + bytes comment = 14; // Optional comment character as a byte + bytes double_quote = 15; // Indicates if quotes are doubled + bytes newlines_in_values = 16; // Indicates if newlines are supported in values + bytes terminator = 17; // Optional terminator character as a byte } // Options controlling CSV format diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 14375c0590a4..eb6976aa0c06 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -882,6 +882,8 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions { .then(|| proto_opts.time_format.clone()), null_value: (!proto_opts.null_value.is_empty()) .then(|| proto_opts.null_value.clone()), + null_regex: (!proto_opts.null_regex.is_empty()) + .then(|| proto_opts.null_regex.clone()), comment: proto_opts.comment.first().copied(), }) } diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 6a75b14d35a8..e88c1497af08 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -1533,6 +1533,9 @@ impl serde::Serialize for CsvOptions { if !self.null_value.is_empty() { len += 1; } + if !self.null_regex.is_empty() { + len += 1; + } if !self.comment.is_empty() { len += 1; } @@ -1594,6 +1597,9 @@ impl serde::Serialize for CsvOptions { if !self.null_value.is_empty() { struct_ser.serialize_field("nullValue", &self.null_value)?; } + if !self.null_regex.is_empty() { + struct_ser.serialize_field("nullRegex", &self.null_regex)?; + } if !self.comment.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -1644,6 +1650,8 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { "timeFormat", "null_value", "nullValue", + "null_regex", + "nullRegex", "comment", "double_quote", "doubleQuote", @@ -1666,6 +1674,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { TimestampTzFormat, TimeFormat, NullValue, + NullRegex, Comment, DoubleQuote, NewlinesInValues, @@ -1703,6 +1712,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { "timestampTzFormat" | "timestamp_tz_format" => Ok(GeneratedField::TimestampTzFormat), "timeFormat" | "time_format" => Ok(GeneratedField::TimeFormat), "nullValue" | "null_value" => Ok(GeneratedField::NullValue), + "nullRegex" | "null_regex" => Ok(GeneratedField::NullRegex), "comment" => Ok(GeneratedField::Comment), "doubleQuote" | "double_quote" => Ok(GeneratedField::DoubleQuote), "newlinesInValues" | "newlines_in_values" => Ok(GeneratedField::NewlinesInValues), @@ -1738,6 +1748,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { let mut timestamp_tz_format__ = None; let mut time_format__ = None; let mut null_value__ = None; + let mut null_regex__ = None; let mut comment__ = None; let mut double_quote__ = None; let mut newlines_in_values__ = None; @@ -1826,6 +1837,12 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { } null_value__ = Some(map_.next_value()?); } + GeneratedField::NullRegex => { + if null_regex__.is_some() { + return Err(serde::de::Error::duplicate_field("nullRegex")); + } + null_regex__ = Some(map_.next_value()?); + } GeneratedField::Comment => { if comment__.is_some() { return Err(serde::de::Error::duplicate_field("comment")); @@ -1873,6 +1890,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { timestamp_tz_format: timestamp_tz_format__.unwrap_or_default(), time_format: time_format__.unwrap_or_default(), null_value: null_value__.unwrap_or_default(), + null_regex: null_regex__.unwrap_or_default(), comment: comment__.unwrap_or_default(), double_quote: double_quote__.unwrap_or_default(), newlines_in_values: newlines_in_values__.unwrap_or_default(), diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index 50a3cff5f568..6b8509775847 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -593,17 +593,20 @@ pub struct CsvOptions { /// Optional representation of null value #[prost(string, tag = "12")] pub null_value: ::prost::alloc::string::String, + /// Optional representation of null loading regex + #[prost(string, tag = "13")] + pub null_regex: ::prost::alloc::string::String, /// Optional comment character as a byte - #[prost(bytes = "vec", tag = "13")] + #[prost(bytes = "vec", tag = "14")] pub comment: ::prost::alloc::vec::Vec, /// Indicates if quotes are doubled - #[prost(bytes = "vec", tag = "14")] + #[prost(bytes = "vec", tag = "15")] pub double_quote: ::prost::alloc::vec::Vec, /// Indicates if newlines are supported in values - #[prost(bytes = "vec", tag = "15")] + #[prost(bytes = "vec", tag = "16")] pub newlines_in_values: ::prost::alloc::vec::Vec, /// Optional terminator character as a byte - #[prost(bytes = "vec", tag = "16")] + #[prost(bytes = "vec", tag = "17")] pub terminator: ::prost::alloc::vec::Vec, } /// Options controlling CSV format @@ -638,27 +641,33 @@ pub struct ParquetColumnSpecificOptions { #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetColumnOptions { #[prost(oneof = "parquet_column_options::BloomFilterEnabledOpt", tags = "1")] - pub bloom_filter_enabled_opt: - ::core::option::Option, + pub bloom_filter_enabled_opt: ::core::option::Option< + parquet_column_options::BloomFilterEnabledOpt, + >, #[prost(oneof = "parquet_column_options::EncodingOpt", tags = "2")] pub encoding_opt: ::core::option::Option, #[prost(oneof = "parquet_column_options::DictionaryEnabledOpt", tags = "3")] - pub dictionary_enabled_opt: - ::core::option::Option, + pub dictionary_enabled_opt: ::core::option::Option< + parquet_column_options::DictionaryEnabledOpt, + >, #[prost(oneof = "parquet_column_options::CompressionOpt", tags = "4")] pub compression_opt: ::core::option::Option, #[prost(oneof = "parquet_column_options::StatisticsEnabledOpt", tags = "5")] - pub statistics_enabled_opt: - ::core::option::Option, + pub statistics_enabled_opt: ::core::option::Option< + parquet_column_options::StatisticsEnabledOpt, + >, #[prost(oneof = "parquet_column_options::BloomFilterFppOpt", tags = "6")] - pub bloom_filter_fpp_opt: - ::core::option::Option, + pub bloom_filter_fpp_opt: ::core::option::Option< + parquet_column_options::BloomFilterFppOpt, + >, #[prost(oneof = "parquet_column_options::BloomFilterNdvOpt", tags = "7")] - pub bloom_filter_ndv_opt: - ::core::option::Option, + pub bloom_filter_ndv_opt: ::core::option::Option< + parquet_column_options::BloomFilterNdvOpt, + >, #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: - ::core::option::Option, + pub max_statistics_size_opt: ::core::option::Option< + parquet_column_options::MaxStatisticsSizeOpt, + >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -763,22 +772,27 @@ pub struct ParquetOptions { #[prost(string, tag = "16")] pub created_by: ::prost::alloc::string::String, #[prost(oneof = "parquet_options::MetadataSizeHintOpt", tags = "4")] - pub metadata_size_hint_opt: - ::core::option::Option, + pub metadata_size_hint_opt: ::core::option::Option< + parquet_options::MetadataSizeHintOpt, + >, #[prost(oneof = "parquet_options::CompressionOpt", tags = "10")] pub compression_opt: ::core::option::Option, #[prost(oneof = "parquet_options::DictionaryEnabledOpt", tags = "11")] - pub dictionary_enabled_opt: - ::core::option::Option, + pub dictionary_enabled_opt: ::core::option::Option< + parquet_options::DictionaryEnabledOpt, + >, #[prost(oneof = "parquet_options::StatisticsEnabledOpt", tags = "13")] - pub statistics_enabled_opt: - ::core::option::Option, + pub statistics_enabled_opt: ::core::option::Option< + parquet_options::StatisticsEnabledOpt, + >, #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: - ::core::option::Option, + pub max_statistics_size_opt: ::core::option::Option< + parquet_options::MaxStatisticsSizeOpt, + >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] - pub column_index_truncate_length_opt: - ::core::option::Option, + pub column_index_truncate_length_opt: ::core::option::Option< + parquet_options::ColumnIndexTruncateLengthOpt, + >, #[prost(oneof = "parquet_options::EncodingOpt", tags = "19")] pub encoding_opt: ::core::option::Option, #[prost(oneof = "parquet_options::BloomFilterFppOpt", tags = "21")] @@ -861,9 +875,7 @@ pub struct ColumnStats { #[prost(message, optional, tag = "4")] pub distinct_count: ::core::option::Option, } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum JoinType { Inner = 0, @@ -910,9 +922,7 @@ impl JoinType { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum JoinConstraint { On = 0, @@ -938,9 +948,7 @@ impl JoinConstraint { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum TimeUnit { Second = 0, @@ -972,9 +980,7 @@ impl TimeUnit { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum IntervalUnit { YearMonth = 0, @@ -1003,9 +1009,7 @@ impl IntervalUnit { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum UnionMode { Sparse = 0, @@ -1031,9 +1035,7 @@ impl UnionMode { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum CompressionTypeVariant { Gzip = 0, @@ -1068,9 +1070,7 @@ impl CompressionTypeVariant { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum JoinSide { LeftSide = 0, @@ -1099,9 +1099,7 @@ impl JoinSide { } } } -#[derive( - Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration, -)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum PrecisionInfo { Exact = 0, diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 1b9583516ced..a7cea607cb6d 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -928,6 +928,7 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions { timestamp_tz_format: opts.timestamp_tz_format.clone().unwrap_or_default(), time_format: opts.time_format.clone().unwrap_or_default(), null_value: opts.null_value.clone().unwrap_or_default(), + null_regex: opts.null_regex.clone().unwrap_or_default(), comment: opts.comment.map_or_else(Vec::new, |h| vec![h]), }) } diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index fa77d23a6ae6..6b8509775847 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -593,17 +593,20 @@ pub struct CsvOptions { /// Optional representation of null value #[prost(string, tag = "12")] pub null_value: ::prost::alloc::string::String, + /// Optional representation of null loading regex + #[prost(string, tag = "13")] + pub null_regex: ::prost::alloc::string::String, /// Optional comment character as a byte - #[prost(bytes = "vec", tag = "13")] + #[prost(bytes = "vec", tag = "14")] pub comment: ::prost::alloc::vec::Vec, /// Indicates if quotes are doubled - #[prost(bytes = "vec", tag = "14")] + #[prost(bytes = "vec", tag = "15")] pub double_quote: ::prost::alloc::vec::Vec, /// Indicates if newlines are supported in values - #[prost(bytes = "vec", tag = "15")] + #[prost(bytes = "vec", tag = "16")] pub newlines_in_values: ::prost::alloc::vec::Vec, /// Optional terminator character as a byte - #[prost(bytes = "vec", tag = "16")] + #[prost(bytes = "vec", tag = "17")] pub terminator: ::prost::alloc::vec::Vec, } /// Options controlling CSV format diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 1e2b12dacc29..62405b2fef21 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -67,6 +67,7 @@ impl CsvOptionsProto { .unwrap_or_default(), time_format: options.time_format.clone().unwrap_or_default(), null_value: options.null_value.clone().unwrap_or_default(), + null_regex: options.null_regex.clone().unwrap_or_default(), comment: options.comment.map_or(vec![], |v| vec![v]), newlines_in_values: options .newlines_in_values @@ -141,6 +142,11 @@ impl From<&CsvOptionsProto> for CsvOptions { } else { Some(proto.null_value.clone()) }, + null_regex: if proto.null_regex.is_empty() { + None + } else { + Some(proto.null_regex.clone()) + }, comment: if !proto.comment.is_empty() { Some(proto.comment[0]) } else {