Skip to content

Commit

Permalink
Merge branch 'apache_main' into feature/preserve-ordering-equivalencies
Browse files Browse the repository at this point in the history
  • Loading branch information
berkaysynnada committed Dec 20, 2024
2 parents 4d37673 + 95d296c commit 4f8a206
Show file tree
Hide file tree
Showing 627 changed files with 19,527 additions and 9,350 deletions.
14 changes: 7 additions & 7 deletions .github/actions/setup-builder/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,18 @@ runs:
- name: Install Build Dependencies
shell: bash
run: |
RETRY="ci/scripts/retry"
"${RETRY}" apt-get update
"${RETRY}" apt-get install -y protobuf-compiler
RETRY=("ci/scripts/retry" timeout 120)
"${RETRY[@]}" apt-get update
"${RETRY[@]}" apt-get install -y protobuf-compiler
- name: Setup Rust toolchain
shell: bash
# rustfmt is needed for the substrait build script
run: |
RETRY="ci/scripts/retry"
RETRY=("ci/scripts/retry" timeout 120)
echo "Installing ${{ inputs.rust-version }}"
"${RETRY}" rustup toolchain install ${{ inputs.rust-version }}
"${RETRY}" rustup default ${{ inputs.rust-version }}
"${RETRY}" rustup component add rustfmt
"${RETRY[@]}" rustup toolchain install ${{ inputs.rust-version }}
"${RETRY[@]}" rustup default ${{ inputs.rust-version }}
"${RETRY[@]}" rustup component add rustfmt
- name: Configure rust runtime env
uses: ./.github/actions/setup-rust-runtime
- name: Fixup git permissions
Expand Down
47 changes: 27 additions & 20 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,12 @@ jobs:
- name: Check datafusion-common without default features
run: cargo check --all-targets --no-default-features -p datafusion-common

- name: Check datafusion-functions
- name: Check datafusion-functions without default features
run: cargo check --all-targets --no-default-features -p datafusion-functions

- name: Check datafusion-substrait without default features
run: cargo check --all-targets --no-default-features -p datafusion-substrait

- name: Check workspace in debug mode
run: cargo check --all-targets --workspace

Expand Down Expand Up @@ -323,22 +326,26 @@ jobs:
env:
POSTGRES_PORT: ${{ job.services.postgres.ports[5432] }}

windows:
name: cargo test (win64)
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
with:
submodules: true
- name: Setup Rust toolchain
uses: ./.github/actions/setup-windows-builder
- name: Run tests (excluding doctests)
shell: bash
run: |
export PATH=$PATH:$HOME/d/protoc/bin
cargo test --lib --tests --bins --features avro,json,backtrace
cd datafusion-cli
cargo test --lib --tests --bins --all-features
# Temporarily commenting out the Windows flow because its build runs enormously slowly
# Waiting for new Windows 2025 github runner
# Details: https://github.com/apache/datafusion/issues/13726
#
# windows:
# name: cargo test (win64)
# runs-on: windows-latest
# steps:
# - uses: actions/checkout@v4
# with:
# submodules: true
# - name: Setup Rust toolchain
# uses: ./.github/actions/setup-windows-builder
# - name: Run tests (excluding doctests)
# shell: bash
# run: |
# export PATH=$PATH:$HOME/d/protoc/bin
# cargo test --lib --tests --bins --features avro,json,backtrace
# cd datafusion-cli
# cargo test --lib --tests --bins --all-features

macos:
name: cargo test (macos)
Expand Down Expand Up @@ -582,9 +589,9 @@ jobs:
#
# To reproduce:
# 1. Install the version of Rust that is failing. Example:
# rustup install 1.79.0
# rustup install 1.80.1
# 2. Run the command that failed with that version. Example:
# cargo +1.79.0 check -p datafusion
# cargo +1.80.1 check -p datafusion
#
# To resolve, either:
# 1. Change your code to use older Rust features,
Expand All @@ -603,4 +610,4 @@ jobs:
run: cargo msrv --output-format json --log-target stdout verify
- name: Check datafusion-cli
working-directory: datafusion-cli
run: cargo msrv --output-format json --log-target stdout verify
run: cargo msrv --output-format json --log-target stdout verify
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@ datafusion/sqllogictests/test_files/tpch/data/*
# Scratch temp dir for sqllogictests
datafusion/sqllogictest/test_files/scratch*

# temp file for core
datafusion/core/*.parquet

# Generated core benchmark data
datafusion/core/benches/data/*

# rat
filtered_rat.txt
rat.txt
30 changes: 16 additions & 14 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ members = [
"datafusion/functions",
"datafusion/functions-aggregate",
"datafusion/functions-aggregate-common",
"datafusion/functions-table",
"datafusion/functions-nested",
"datafusion/functions-window",
"datafusion/functions-window-common",
Expand All @@ -52,6 +53,8 @@ members = [
"datafusion-examples/examples/ffi/ffi_module_loader",
"test-utils",
"benchmarks",
"datafusion/macros",
"datafusion/doc",
]
resolver = "2"

Expand All @@ -62,7 +65,7 @@ homepage = "https://datafusion.apache.org"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/datafusion"
rust-version = "1.79"
rust-version = "1.80.1"
version = "43.0.0"

[workspace.dependencies]
Expand All @@ -74,24 +77,23 @@ version = "43.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
arrow = { version = "53.2.0", features = [
arrow = { version = "53.3.0", features = [
"prettyprint",
] }
arrow-array = { version = "53.2.0", default-features = false, features = [
arrow-array = { version = "53.3.0", default-features = false, features = [
"chrono-tz",
] }
arrow-buffer = { version = "53.2.0", default-features = false }
arrow-flight = { version = "53.2.0", features = [
arrow-buffer = { version = "53.3.0", default-features = false }
arrow-flight = { version = "53.3.0", features = [
"flight-sql-experimental",
] }
arrow-ipc = { version = "53.2.0", default-features = false, features = [
arrow-ipc = { version = "53.3.0", default-features = false, features = [
"lz4",
] }
arrow-ord = { version = "53.2.0", default-features = false }
arrow-schema = { version = "53.2.0", default-features = false }
arrow-string = { version = "53.2.0", default-features = false }
arrow-ord = { version = "53.3.0", default-features = false }
arrow-schema = { version = "53.3.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "=0.4.1"
bigdecimal = "0.4.7"
bytes = "1.4"
chrono = { version = "0.4.38", default-features = false }
ctor = "0.2.0"
Expand All @@ -100,6 +102,7 @@ datafusion = { path = "datafusion/core", version = "43.0.0", default-features =
datafusion-catalog = { path = "datafusion/catalog", version = "43.0.0" }
datafusion-common = { path = "datafusion/common", version = "43.0.0", default-features = false }
datafusion-common-runtime = { path = "datafusion/common-runtime", version = "43.0.0" }
datafusion-doc = { path = "datafusion/doc", version = "43.0.0" }
datafusion-execution = { path = "datafusion/execution", version = "43.0.0" }
datafusion-expr = { path = "datafusion/expr", version = "43.0.0" }
datafusion-expr-common = { path = "datafusion/expr-common", version = "43.0.0" }
Expand All @@ -108,8 +111,10 @@ datafusion-functions = { path = "datafusion/functions", version = "43.0.0" }
datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "43.0.0" }
datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "43.0.0" }
datafusion-functions-nested = { path = "datafusion/functions-nested", version = "43.0.0" }
datafusion-functions-table = { path = "datafusion/functions-table", version = "43.0.0" }
datafusion-functions-window = { path = "datafusion/functions-window", version = "43.0.0" }
datafusion-functions-window-common = { path = "datafusion/functions-window-common", version = "43.0.0" }
datafusion-macros = { path = "datafusion/macros", version = "43.0.0" }
datafusion-optimizer = { path = "datafusion/optimizer", version = "43.0.0", default-features = false }
datafusion-physical-expr = { path = "datafusion/physical-expr", version = "43.0.0", default-features = false }
datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "43.0.0", default-features = false }
Expand All @@ -118,8 +123,6 @@ datafusion-physical-plan = { path = "datafusion/physical-plan", version = "43.0.
datafusion-proto = { path = "datafusion/proto", version = "43.0.0" }
datafusion-proto-common = { path = "datafusion/proto-common", version = "43.0.0" }
datafusion-sql = { path = "datafusion/sql", version = "43.0.0" }
datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "43.0.0" }
datafusion-substrait = { path = "datafusion/substrait", version = "43.0.0" }
doc-comment = "0.3"
env_logger = "0.11"
futures = "0.3"
Expand All @@ -128,10 +131,9 @@ hashbrown = { version = "0.14.5", features = ["raw"] }
indexmap = "2.0.0"
itertools = "0.13"
log = "^0.4"
num_cpus = "1.13.0"
object_store = { version = "0.11.0", default-features = false }
parking_lot = "0.12"
parquet = { version = "53.2.0", default-features = false, features = [
parquet = { version = "53.3.0", default-features = false, features = [
"arrow",
"async",
"object_store",
Expand Down
22 changes: 14 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
[Chat](https://discord.com/channels/885562378132000778/885562378132000781)

<a href="https://datafusion.apache.org/">
<img src="./docs/source/_static/images/2x_bgwhite_original.png" width="512" alt="logo"/>
<img src="https://github.com/apache/datafusion/raw/HEAD/docs/source/_static/images/2x_bgwhite_original.png" width="512" alt="logo"/>
</a>

DataFusion is an extensible query engine written in [Rust] that
Expand Down Expand Up @@ -126,16 +126,22 @@ Optional features:

## Rust Version Compatibility Policy

DataFusion's Minimum Required Stable Rust Version (MSRV) policy is to support stable [4 latest
Rust versions](https://releases.rs) OR the stable minor Rust version as of 4 months, whichever is lower.
The Rust toolchain releases are tracked at [Rust Versions](https://releases.rs) and follow
[semantic versioning](https://semver.org/). A Rust toolchain release can be identified
by a version string like `1.80.0`, or more generally `major.minor.patch`.

DataFusion supports the last 4 stable Rust minor versions released and any such versions released within the last 4 months.

For example, given the releases `1.78.0`, `1.79.0`, `1.80.0`, `1.80.1` and `1.81.0`, DataFusion will support `1.78.0`, which is 3 minor versions prior to the most recent minor version, `1.81`.

If a hotfix is released for the minimum supported Rust version (MSRV), the MSRV will be the minor version with all hotfixes, even if it surpasses the four-month window.
Note: If a Rust hotfix is released for the current MSRV, the MSRV will be updated to the specific minor version that includes all applicable hotfixes; this rule takes precedence over the other policies.

DataFusion enforces MSRV policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)

We enforce this policy using a [MSRV CI Check](https://github.com/search?q=repo%3Aapache%2Fdatafusion+rust-version+language%3ATOML+path%3A%2F%5ECargo.toml%2F&type=code)
## DataFusion API Evolution and Deprecation Guidelines

## DataFusion API evolution policy
Public methods in Apache DataFusion evolve over time: while we try to maintain a
stable API, we also improve the API over time. As a result, we typically
deprecate methods before removing them, according to the [deprecation guidelines].

Public methods in Apache DataFusion are subject to evolve as part of the API lifecycle.
Deprecated methods will be phased out in accordance with the [policy](https://datafusion.apache.org/library-user-guide/api-health.html), ensuring the API is stable and healthy.
[deprecation guidelines]: https://datafusion.apache.org/library-user-guide/api-health.html
1 change: 0 additions & 1 deletion benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ env_logger = { workspace = true }
futures = { workspace = true }
log = { workspace = true }
mimalloc = { version = "0.1", optional = true, default-features = false }
num_cpus = { workspace = true }
parquet = { workspace = true, default-features = true }
serde = { version = "1.0.136", features = ["derive"] }
serde_json = { workspace = true }
Expand Down
40 changes: 22 additions & 18 deletions benchmarks/src/bin/external_aggr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;
use std::sync::OnceLock;
use std::sync::LazyLock;
use structopt::StructOpt;

use arrow::record_batch::RecordBatch;
Expand All @@ -33,12 +33,14 @@ use datafusion::datasource::{MemTable, TableProvider};
use datafusion::error::Result;
use datafusion::execution::memory_pool::FairSpillPool;
use datafusion::execution::memory_pool::{human_readable_size, units};
use datafusion::execution::runtime_env::RuntimeConfig;
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::execution::SessionStateBuilder;
use datafusion::physical_plan::display::DisplayableExecutionPlan;
use datafusion::physical_plan::{collect, displayable};
use datafusion::prelude::*;
use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt};
use datafusion_common::instant::Instant;
use datafusion_common::utils::get_available_parallelism;
use datafusion_common::{exec_datafusion_err, exec_err, DEFAULT_PARQUET_EXTENSION};

#[derive(Debug, StructOpt)]
Expand Down Expand Up @@ -89,7 +91,13 @@ struct QueryResult {
/// Memory limits to run: 64MiB, 32MiB, 16MiB
/// Q2 requires 250MiB for aggregation
/// Memory limits to run: 512MiB, 256MiB, 128MiB, 64MiB, 32MiB
static QUERY_MEMORY_LIMITS: OnceLock<HashMap<usize, Vec<u64>>> = OnceLock::new();
static QUERY_MEMORY_LIMITS: LazyLock<HashMap<usize, Vec<u64>>> = LazyLock::new(|| {
use units::*;
let mut map = HashMap::new();
map.insert(1, vec![64 * MB, 32 * MB, 16 * MB]);
map.insert(2, vec![512 * MB, 256 * MB, 128 * MB, 64 * MB, 32 * MB]);
map
});

impl ExternalAggrConfig {
const AGGR_TABLES: [&'static str; 1] = ["lineitem"];
Expand All @@ -112,16 +120,6 @@ impl ExternalAggrConfig {
"#,
];

fn init_query_memory_limits() -> &'static HashMap<usize, Vec<u64>> {
use units::*;
QUERY_MEMORY_LIMITS.get_or_init(|| {
let mut map = HashMap::new();
map.insert(1, vec![64 * MB, 32 * MB, 16 * MB]);
map.insert(2, vec![512 * MB, 256 * MB, 128 * MB, 64 * MB, 32 * MB]);
map
})
}

/// If `--query` and `--memory-limit` are not specified, run all queries
/// with pre-configured memory limits
/// If only `--query` is specified, run the query with all memory limits
Expand Down Expand Up @@ -159,8 +157,7 @@ impl ExternalAggrConfig {
query_executions.push((query_id, limit));
}
None => {
let memory_limits_table = Self::init_query_memory_limits();
let memory_limits = memory_limits_table.get(&query_id).unwrap();
let memory_limits = QUERY_MEMORY_LIMITS.get(&query_id).unwrap();
for limit in memory_limits {
query_executions.push((query_id, *limit));
}
Expand Down Expand Up @@ -194,10 +191,15 @@ impl ExternalAggrConfig {
let query_name =
format!("Q{query_id}({})", human_readable_size(mem_limit as usize));
let config = self.common.config();
let runtime_config = RuntimeConfig::new()
let runtime_env = RuntimeEnvBuilder::new()
.with_memory_pool(Arc::new(FairSpillPool::new(mem_limit as usize)))
.build_arc()?;
let ctx = SessionContext::new_with_config_rt(config, runtime_config);
let state = SessionStateBuilder::new()
.with_config(config)
.with_runtime_env(runtime_env)
.with_default_features()
.build();
let ctx = SessionContext::from(state);

// register tables
self.register_tables(&ctx).await?;
Expand Down Expand Up @@ -325,7 +327,9 @@ impl ExternalAggrConfig {
}

fn partitions(&self) -> usize {
self.common.partitions.unwrap_or(num_cpus::get())
self.common
.partitions
.unwrap_or(get_available_parallelism())
}

/// Parse memory limit from string to number of bytes
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/src/bin/h2o.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use datafusion::datasource::MemTable;
use datafusion::prelude::CsvReadOptions;
use datafusion::{arrow::util::pretty, error::Result, prelude::SessionContext};
use datafusion_benchmarks::util::BenchmarkRun;
use datafusion_common::utils::get_available_parallelism;
use std::path::PathBuf;
use std::sync::Arc;
use structopt::StructOpt;
Expand Down Expand Up @@ -91,7 +92,7 @@ async fn group_by(opt: &GroupBy) -> Result<()> {
.with_listing_options(ListingOptions::new(Arc::new(CsvFormat::default())))
.with_schema(Arc::new(schema));
let csv = ListingTable::try_new(listing_config)?;
let partition_size = num_cpus::get();
let partition_size = get_available_parallelism();
let memtable =
MemTable::load(Arc::new(csv), Some(partition_size), &ctx.state()).await?;
ctx.register_table("x", Arc::new(memtable))?;
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/src/imdb/run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan;
use datafusion::physical_plan::{collect, displayable};
use datafusion::prelude::*;
use datafusion_common::instant::Instant;
use datafusion_common::utils::get_available_parallelism;
use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION};

use log::info;
Expand Down Expand Up @@ -468,7 +469,9 @@ impl RunOpt {
}

fn partitions(&self) -> usize {
self.common.partitions.unwrap_or(num_cpus::get())
self.common
.partitions
.unwrap_or(get_available_parallelism())
}
}

Expand Down
Loading

0 comments on commit 4f8a206

Please sign in to comment.