Merge remote-tracking branch 'apache/main' into alamb/refine_spm-docs
alamb committed Dec 20, 2024
2 parents 19fa523 + 75202b5 commit 1cc4fa9
Showing 107 changed files with 2,903 additions and 966 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -147,7 +147,7 @@ recursive = "0.1.1"
regex = "1.8"
rstest = "0.23.0"
serde_json = "1"
sqlparser = { version = "0.52.0", features = ["visitor"] }
sqlparser = { version = "0.53.0", features = ["visitor"] }
tempfile = "3"
tokio = { version = "1.36", features = ["macros", "rt", "sync"] }
url = "2.2"
9 changes: 5 additions & 4 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default.

14 changes: 12 additions & 2 deletions datafusion-cli/src/exec.rs
@@ -33,11 +33,12 @@ use crate::{
};

use datafusion::common::instant::Instant;
-use datafusion::common::plan_datafusion_err;
+use datafusion::common::{plan_datafusion_err, plan_err};
use datafusion::config::ConfigFileType;
use datafusion::datasource::listing::ListingTableUrl;
use datafusion::error::{DataFusionError, Result};
use datafusion::logical_expr::{DdlStatement, LogicalPlan};
+use datafusion::physical_plan::execution_plan::EmissionType;
use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties};
use datafusion::sql::parser::{DFParser, Statement};
use datafusion::sql::sqlparser::dialect::dialect_from_str;
@@ -234,10 +235,19 @@ pub(super) async fn exec_and_print(
    let df = ctx.execute_logical_plan(plan).await?;
    let physical_plan = df.create_physical_plan().await?;

-    if physical_plan.execution_mode().is_unbounded() {
+    if physical_plan.boundedness().is_unbounded() {
+        if physical_plan.pipeline_behavior() == EmissionType::Final {
+            return plan_err!(
+                "The given query can generate a valid result only once \
+                the source finishes, but the source is unbounded"
+            );
+        }
+        // As the input stream comes, we can generate results.
+        // However, memory safety is not guaranteed.
        let stream = execute_stream(physical_plan, task_ctx.clone())?;
        print_options.print_stream(stream, now).await?;
    } else {
+        // Bounded stream; collected results are printed after all input consumed.
        let schema = physical_plan.schema();
        let results = collect(physical_plan, task_ctx.clone()).await?;
        adjusted.into_inner().print_batches(schema, &results, now)?;
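For context: the new check separates *whether* a plan's input is unbounded (`Boundedness`) from *how* it emits results (`EmissionType`). A minimal sketch of the same decision factored as a standalone helper (the name `can_stream` is illustrative, not part of this change):

```rust
use std::sync::Arc;

use datafusion::physical_plan::execution_plan::EmissionType;
use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties};

/// Bounded plans can always be collected; unbounded plans can only be
/// streamed if they emit results incrementally instead of producing a
/// single final result once the (never-ending) input is exhausted.
fn can_stream(plan: &Arc<dyn ExecutionPlan>) -> bool {
    !plan.boundedness().is_unbounded()
        || plan.pipeline_behavior() != EmissionType::Final
}
```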
9 changes: 5 additions & 4 deletions datafusion-cli/src/object_storage.rs
@@ -472,12 +472,13 @@ mod tests {

    #[tokio::test]
    async fn s3_object_store_builder() -> Result<()> {
-        let access_key_id = "fake_access_key_id";
-        let secret_access_key = "fake_secret_access_key";
+        // "fake" is uppercase to ensure the values are not lowercased when parsed
+        let access_key_id = "FAKE_access_key_id";
+        let secret_access_key = "FAKE_secret_access_key";
        let region = "fake_us-east-2";
        let endpoint = "endpoint33";
-        let session_token = "fake_session_token";
-        let location = "s3://bucket/path/file.parquet";
+        let session_token = "FAKE_session_token";
+        let location = "s3://bucket/path/FAKE/file.parquet";

        let table_url = ListingTableUrl::parse(location)?;
        let scheme = table_url.scheme();
8 changes: 5 additions & 3 deletions datafusion-examples/examples/custom_datasource.rs
@@ -30,10 +30,11 @@ use datafusion::error::Result;
use datafusion::execution::context::TaskContext;
use datafusion::logical_expr::LogicalPlanBuilder;
use datafusion::physical_expr::EquivalenceProperties;
+use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::memory::MemoryStream;
use datafusion::physical_plan::{
-    project_schema, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan,
-    Partitioning, PlanProperties, SendableRecordBatchStream,
+    project_schema, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning,
+    PlanProperties, SendableRecordBatchStream,
};
use datafusion::prelude::*;

@@ -214,7 +215,8 @@ impl CustomExec {
        PlanProperties::new(
            eq_properties,
            Partitioning::UnknownPartitioning(1),
-            ExecutionMode::Bounded,
+            EmissionType::Incremental,
+            Boundedness::Bounded,
        )
    }
}
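For custom `ExecutionPlan` implementations, this is the migration pattern: the single `ExecutionMode` argument to `PlanProperties::new` becomes an (`EmissionType`, `Boundedness`) pair. A sketch of what an unbounded streaming source might declare instead, assuming the `Boundedness::Unbounded { requires_infinite_memory }` variant (the helper function is hypothetical):

```rust
use datafusion::physical_expr::EquivalenceProperties;
use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType};
use datafusion::physical_plan::{Partitioning, PlanProperties};

// Hypothetical properties for a source that never finishes but can
// produce output as input arrives and needs only bounded memory.
fn streaming_plan_properties(eq_properties: EquivalenceProperties) -> PlanProperties {
    PlanProperties::new(
        eq_properties,
        Partitioning::UnknownPartitioning(1),
        EmissionType::Incremental,
        Boundedness::Unbounded {
            requires_infinite_memory: false,
        },
    )
}
```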
1 change: 1 addition & 0 deletions datafusion/common/Cargo.toml
@@ -57,6 +57,7 @@ half = { workspace = true }
hashbrown = { workspace = true }
indexmap = { workspace = true }
libc = "0.2.140"
+log = { workspace = true }
object_store = { workspace = true, optional = true }
parquet = { workspace = true, optional = true, default-features = true }
paste = "1.0.15"
80 changes: 56 additions & 24 deletions datafusion/common/src/config.rs
@@ -19,6 +19,7 @@
use std::any::Any;
use std::collections::{BTreeMap, HashMap};
use std::error::Error;
+use std::fmt::{self, Display};
use std::str::FromStr;

@@ -29,7 +30,9 @@ use crate::{DataFusionError, Result};

/// A macro that wraps a configuration struct and automatically derives
/// [`Default`] and [`ConfigField`] for it, allowing it to be used
-/// in the [`ConfigOptions`] configuration tree
+/// in the [`ConfigOptions`] configuration tree.
+///
+/// `transform` is used to normalize values before parsing.
///
/// For example,
///
@@ -38,7 +41,7 @@ use crate::{DataFusionError, Result};
///     /// Amazing config
///     pub struct MyConfig {
///         /// Field 1 doc
-///         field1: String, default = "".to_string()
+///         field1: String, transform = str::to_lowercase, default = "".to_string()
///
///         /// Field 2 doc
///         field2: usize, default = 232
@@ -67,9 +70,12 @@ use crate::{DataFusionError, Result};
///     fn set(&mut self, key: &str, value: &str) -> Result<()> {
///         let (key, rem) = key.split_once('.').unwrap_or((key, ""));
///         match key {
-///             "field1" => self.field1.set(rem, value),
-///             "field2" => self.field2.set(rem, value),
-///             "field3" => self.field3.set(rem, value),
+///             "field1" => {
+///                 let value = str::to_lowercase(value);
+///                 self.field1.set(rem, value.as_ref())
+///             },
+///             "field2" => self.field2.set(rem, value.as_ref()),
+///             "field3" => self.field3.set(rem, value.as_ref()),
///             _ => _internal_err!(
///                 "Config value \"{}\" not found on MyConfig",
///                 key
@@ -102,15 +108,14 @@ use crate::{DataFusionError, Result};
/// ```
///
/// NB: Misplaced commas may result in nonsensical errors
-///
#[macro_export]
macro_rules! config_namespace {
    (
        $(#[doc = $struct_d:tt])*
        $vis:vis struct $struct_name:ident {
            $(
                $(#[doc = $d:tt])*
-                $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
+                $field_vis:vis $field_name:ident : $field_type:ty, $(warn = $warn:expr,)? $(transform = $transform:expr,)? default = $default:expr
            )*$(,)*
        }
    ) => {
@@ -127,9 +132,14 @@
        impl ConfigField for $struct_name {
            fn set(&mut self, key: &str, value: &str) -> Result<()> {
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
+
                match key {
                    $(
-                        stringify!($field_name) => self.$field_name.set(rem, value),
+                        stringify!($field_name) => {
+                            $(let value = $transform(value);)?
+                            $(log::warn!($warn);)?
+                            self.$field_name.set(rem, value.as_ref())
+                        },
                    )*
                    _ => return _config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -211,12 +221,15 @@ config_namespace! {
        /// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
        pub enable_ident_normalization: bool, default = true

-        /// When set to true, SQL parser will normalize options value (convert value to lowercase)
-        pub enable_options_value_normalization: bool, default = true
+        /// When set to true, SQL parser will normalize options value (convert value to lowercase).
+        /// Note that this option is ignored and will be removed in the future. All case-insensitive values
+        /// are normalized automatically.
+        pub enable_options_value_normalization: bool, warn = "`enable_options_value_normalization` is deprecated and ignored", default = false

        /// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
        /// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
        pub dialect: String, default = "generic".to_string()
+        // no need to lowercase because [`sqlparser::dialect_from_str`] is case-insensitive

        /// If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but
        /// ignore the length. If false, error if a `VARCHAR` with a length is
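With the new `warn` attribute, setting the deprecated flag still succeeds; it only logs the warning, and the value no longer has any effect. A minimal sketch of the observable behavior through `ConfigOptions` (assuming a `log`-compatible logger is initialized so the warning is visible):

```rust
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut options = ConfigOptions::new();
    // Accepted for backwards compatibility; logs
    // "`enable_options_value_normalization` is deprecated and ignored".
    options.set(
        "datafusion.sql_parser.enable_options_value_normalization",
        "true",
    )?;
    Ok(())
}
```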
@@ -431,7 +444,7 @@ config_namespace! {
        ///
        /// Note that this default setting is not the same as
        /// the default parquet writer setting.
-        pub compression: Option<String>, default = Some("zstd(3)".into())
+        pub compression: Option<String>, transform = str::to_lowercase, default = Some("zstd(3)".into())

        /// (writing) Sets if dictionary encoding is enabled. If NULL, uses
        /// default parquet writer setting
@@ -444,7 +457,7 @@ config_namespace! {
        /// Valid values are: "none", "chunk", and "page"
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
-        pub statistics_enabled: Option<String>, default = Some("page".into())
+        pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into())

        /// (writing) Sets max statistics size for any column. If NULL, uses
        /// default parquet writer setting
Expand All @@ -470,7 +483,7 @@ config_namespace! {
        /// delta_byte_array, rle_dictionary, and byte_stream_split.
        /// These values are not case sensitive. If NULL, uses
        /// default parquet writer setting
-        pub encoding: Option<String>, default = None
+        pub encoding: Option<String>, transform = str::to_lowercase, default = None

        /// (writing) Use any available bloom filters when reading parquet files
        pub bloom_filter_on_read: bool, default = true
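The effect of `transform = str::to_lowercase` on these string options is that mixed-case values are normalized at set time rather than stored verbatim. A small sketch (the option key is real; the surrounding `main` is illustrative):

```rust
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut options = ConfigOptions::new();
    // The transform lowercases the value on the way in.
    options.set("datafusion.execution.parquet.compression", "ZSTD(3)")?;
    assert_eq!(
        options.execution.parquet.compression.as_deref(),
        Some("zstd(3)")
    );
    Ok(())
}
```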
@@ -971,29 +984,45 @@ impl<F: ConfigField + Default> ConfigField for Option<F> {
    }
}

+fn default_transform<T>(input: &str) -> Result<T>
+where
+    T: FromStr,
+    <T as FromStr>::Err: Sync + Send + Error + 'static,
+{
+    input.parse().map_err(|e| {
+        DataFusionError::Context(
+            format!(
+                "Error parsing '{}' as {}",
+                input,
+                std::any::type_name::<T>()
+            ),
+            Box::new(DataFusionError::External(Box::new(e))),
+        )
+    })
+}
+
#[macro_export]
macro_rules! config_field {
    ($t:ty) => {
+        config_field!($t, value => default_transform(value)?);
+    };
+
+    ($t:ty, $arg:ident => $transform:expr) => {
        impl ConfigField for $t {
            fn visit<V: Visit>(&self, v: &mut V, key: &str, description: &'static str) {
                v.some(key, self, description)
            }

-            fn set(&mut self, _: &str, value: &str) -> Result<()> {
-                *self = value.parse().map_err(|e| {
-                    DataFusionError::Context(
-                        format!(concat!("Error parsing {} as ", stringify!($t),), value),
-                        Box::new(DataFusionError::External(Box::new(e))),
-                    )
-                })?;
+            fn set(&mut self, _: &str, $arg: &str) -> Result<()> {
+                *self = $transform;
                Ok(())
            }
        }
    };
}

config_field!(String);
-config_field!(bool);
+config_field!(bool, value => default_transform(value.to_lowercase().as_str())?);
config_field!(usize);
config_field!(f64);
config_field!(u64);
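Because `bool` now routes through `default_transform` after a lowercasing step, boolean settings parse case-insensitively ("TRUE", "False", and so on). A small sketch of the effect (real option key; the `main` wrapper is illustrative):

```rust
use datafusion_common::config::ConfigOptions;
use datafusion_common::Result;

fn main() -> Result<()> {
    let mut options = ConfigOptions::new();
    // Previously "TRUE" would fail `bool::from_str`; it now parses.
    options.set("datafusion.execution.collect_statistics", "TRUE")?;
    assert!(options.execution.collect_statistics);
    Ok(())
}
```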
@@ -1508,7 +1537,7 @@ macro_rules! config_namespace_with_hashmap {
        $vis:vis struct $struct_name:ident {
            $(
                $(#[doc = $d:tt])*
-                $field_vis:vis $field_name:ident : $field_type:ty, default = $default:expr
+                $field_vis:vis $field_name:ident : $field_type:ty, $(transform = $transform:expr,)? default = $default:expr
            )*$(,)*
        }
    ) => {
@@ -1527,7 +1556,10 @@
                let (key, rem) = key.split_once('.').unwrap_or((key, ""));
                match key {
                    $(
-                        stringify!($field_name) => self.$field_name.set(rem, value),
+                        stringify!($field_name) => {
+                            $(let value = $transform(value);)?
+                            self.$field_name.set(rem, value.as_ref())
+                        },
                    )*
                    _ => _config_err!(
                        "Config value \"{}\" not found on {}", key, stringify!($struct_name)
@@ -1606,7 +1638,7 @@ config_namespace_with_hashmap! {
        /// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
        /// These values are not case-sensitive. If NULL, uses
        /// default parquet options
-        pub compression: Option<String>, default = None
+        pub compression: Option<String>, transform = str::to_lowercase, default = None

        /// Sets if statistics are enabled for the column
        /// Valid values are: "none", "chunk", and "page"