diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index e4d941004b..dd8fe7198b 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -41,13 +41,6 @@ jobs: steps: - uses: actions/checkout@v4 - uses: ./.github/actions/cleanup - - name: Configure AWS Credentials for benchmark data - uses: aws-actions/configure-aws-credentials@v4 - if: matrix.benchmark.id == 'clickbench' - with: - audience: sts.amazonaws.com - aws-region: us-west-2 - role-to-assume: arn:aws:iam::375504701696:role/vortex-benchmark - uses: ./.github/actions/setup-rust - uses: spiraldb/actions/.github/actions/setup-uv@0.2.0 @@ -63,11 +56,6 @@ jobs: run: | echo "TMPDIR=/work" >> $GITHUB_ENV - - name: Download Clickbench data - if: matrix.benchmark.id == 'clickbench' - run: - aws s3 cp s3://vortex-bench-dev/clickbench/processed.parquet bench-vortex/data/clickbench/ - - name: Run benchmark shell: bash env: @@ -94,10 +82,6 @@ jobs: | jq --slurp --compact-output '.' >${{ matrix.benchmark.id }}.json cat ${{ matrix.benchmark.id }}.json - - name: Cleanup Raw Data - if: always() - run: - rm -rf bench-vortex/data/ - name: Store benchmark result if: '!cancelled()' uses: benchmark-action/github-action-benchmark@v1 diff --git a/bench-vortex/benches/clickbench.rs b/bench-vortex/benches/clickbench.rs index 3e594906a2..03067d3bb9 100644 --- a/bench-vortex/benches/clickbench.rs +++ b/bench-vortex/benches/clickbench.rs @@ -25,7 +25,7 @@ fn benchmark(c: &mut Criterion) { epoch_ms(ClientEventTime * 1000) AS ClientEventTime, \ epoch_ms(LocalEventTime * 1000) AS LocalEventTime, \ DATE '1970-01-01' + INTERVAL (EventDate) DAYS AS EventDate) \ - FROM read_parquet('https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{idx}.parquet')) TO '{}' (FORMAT 'parquet');", + FROM read_parquet('https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{idx}.parquet', binary_as_string=True)) TO '{}' (FORMAT 'parquet');", output_path.to_str().unwrap() ); Command::new("duckdb") diff --git a/bench-vortex/src/clickbench.rs b/bench-vortex/src/clickbench.rs index cfde337ed0..0c8882d3e6 100644 --- a/bench-vortex/src/clickbench.rs +++ b/bench-vortex/src/clickbench.rs @@ -147,7 +147,7 @@ pub async fn register_vortex_file( let vortex_dir = input_path.parent().unwrap().join("vortex_compressed"); create_dir_all(&vortex_dir).await?; - for idx in 0..1 { + for idx in 0..100 { let parquet_file_path = input_path.join(format!("hits_{idx}.parquet")); let output_path = vortex_dir.join(format!("hits_{idx}.{VORTEX_FILE_EXTENSION}")); idempotent_async(&output_path, |vtx_file| async move { diff --git a/bench-vortex/src/tpch/mod.rs b/bench-vortex/src/tpch/mod.rs index b8a10804cc..3746a0c454 100644 --- a/bench-vortex/src/tpch/mod.rs +++ b/bench-vortex/src/tpch/mod.rs @@ -26,7 +26,6 @@ use vortex_datafusion::memory::VortexMemTableOptions; use vortex_datafusion::persistent::format::VortexFormat; use vortex_datafusion::SessionContextExt; -use crate::clickbench::HITS_SCHEMA; use crate::{idempotent_async, CTX, TARGET_BLOCK_BYTESIZE, TARGET_BLOCK_SIZE}; pub mod dbgen; @@ -337,9 +336,8 @@ async fn register_vortex_file( let table_url = ListingTableUrl::parse(vtx_file.to_str().unwrap())?; let config = ListingTableConfig::new(table_url) .with_listing_options(ListingOptions::new(format as _)) - .with_schema(HITS_SCHEMA.clone().into()); - // .infer_schema(&session.state()) - // .await?; + .infer_schema(&session.state()) + .await?; let listing_table = Arc::new(ListingTable::try_new(config)?);