From 27cc9511cc8e5e0a82c22ea8af7340d0f251f6b3 Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Thu, 22 Feb 2024 12:58:39 +0000 Subject: [PATCH 1/7] init benchmarking for pow poly --- .../benchmark/ivc_bench/ivc.bench.cpp | 8 +++--- .../barretenberg/polynomials/pow.bench.cpp | 27 +++++++++++++++++++ .../cpp/src/barretenberg/polynomials/pow.hpp | 26 +++++++++++++++++- barretenberg/cpp/srs_db/download_grumpkin.sh | 2 +- 4 files changed, 57 insertions(+), 6 deletions(-) create mode 100644 barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp index 95eeb7d5b94..6fddc86057d 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp @@ -168,10 +168,10 @@ BENCHMARK_DEFINE_F(IvcBench, Translator)(benchmark::State& state) ->Arg(1 << 6) BENCHMARK_REGISTER_F(IvcBench, Full)->Unit(benchmark::kMillisecond)->ARGS; -BENCHMARK_REGISTER_F(IvcBench, Accumulate)->Unit(benchmark::kMillisecond)->ARGS; -BENCHMARK_REGISTER_F(IvcBench, Decide)->Unit(benchmark::kMillisecond)->ARGS; -BENCHMARK_REGISTER_F(IvcBench, ECCVM)->Unit(benchmark::kMillisecond)->ARGS; -BENCHMARK_REGISTER_F(IvcBench, Translator)->Unit(benchmark::kMillisecond)->ARGS; +// BENCHMARK_REGISTER_F(IvcBench, Accumulate)->Unit(benchmark::kMillisecond)->ARGS; +// BENCHMARK_REGISTER_F(IvcBench, Decide)->Unit(benchmark::kMillisecond)->ARGS; +// BENCHMARK_REGISTER_F(IvcBench, ECCVM)->Unit(benchmark::kMillisecond)->ARGS; +// BENCHMARK_REGISTER_F(IvcBench, Translator)->Unit(benchmark::kMillisecond)->ARGS; } // namespace diff --git a/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp b/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp new file mode 100644 index 00000000000..44f7b19b457 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp @@ -0,0 +1,27 @@ +#include "barretenberg/polynomials/pow.hpp" +#include "barretenberg/ecc/curves/bn254/fr.hpp" +#include + +using namespace benchmark; +using namespace bb; + +namespace { + +void compute_pow_poly(benchmark::State& state) +{ + // just set up huge vector + auto betas = std::vector{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 }; + + for (auto _ : state) { + auto num_betas = state.range(0); + std::vector cur_betas(betas.begin(), betas.begin() + num_betas); + auto pow = PowPolynomial(cur_betas); + pow.compute_values(); + } +} + +BENCHMARK(compute_pow_poly)->Unit(benchmark::kMillisecond)->Arg(20); + +} // namespace +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp index 9079724c98a..3eb23329004 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp @@ -1,4 +1,5 @@ #pragma once +#include "barretenberg/common/compiler_hints.hpp" #include "barretenberg/common/thread.hpp" #include @@ -121,7 +122,7 @@ template struct PowPolynomial { * @brief Given \vec{β} = {β_0,...,β_{d-1}} compute pow_\vec{β}(i) for i=0,...,2^{d}-1 * */ - void compute_values() + BB_PROFILE void compute_values() { size_t pow_size = 1 << betas.size(); pow_betas = std::vector(pow_size); @@ -136,10 +137,19 @@ template struct PowPolynomial { size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1 size_t iterations_per_thread = pow_size / num_threads; // actual iterations per thread + + // // first generate values for each thread with one thread + // size_t cur_pow = 1; + // while (cur_pow < max_num_threads) { + + // pow_betas[] + // } + // each thread computes a tree parallel_for(num_threads, [&](size_t thread_idx) { size_t start = thread_idx * iterations_per_thread; size_t end = (thread_idx + 1) * iterations_per_thread; for (size_t i = start; i < end; i++) { + // compute a tree actually auto res = FF(1); for (size_t j = i, beta_idx = 0; j > 0; j >>= 1, beta_idx++) { if ((j & 1) == 1) { @@ -149,6 +159,20 @@ template struct PowPolynomial { pow_betas[i] = res; } }); + + // parallel_for(num_threads, [&](size_t thread_idx) { + // size_t start = thread_idx * iterations_per_thread; + // size_t end = (thread_idx + 1) * iterations_per_thread; + // for (size_t i = start; i < end; i++) { + // auto res = FF(1); + // for (size_t j = i, beta_idx = 0; j > 0; j >>= 1, beta_idx++) { + // if ((j & 1) == 1) { + // res *= betas[beta_idx]; + // } + // } + // pow_betas[i] = res; + // } + // }); } }; } // namespace bb \ No newline at end of file diff --git a/barretenberg/cpp/srs_db/download_grumpkin.sh b/barretenberg/cpp/srs_db/download_grumpkin.sh index fb59a1ec806..9d756454cf5 100755 --- a/barretenberg/cpp/srs_db/download_grumpkin.sh +++ b/barretenberg/cpp/srs_db/download_grumpkin.sh @@ -6,6 +6,6 @@ set -eu # Enter build directory sibling to our script folder. cd $(dirname $0)/../build -./bin/grumpkin_srs_gen 1048576 +./bin/grumpkin_srs_gen 524288 mkdir -p ~/.bb-crs ln -s ../srs_db/grumpkin/monomial ~/.bb-crs/monomial \ No newline at end of file From d39ff0e188bd30a1753b53177d0fee6e666bae80 Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Thu, 22 Feb 2024 17:51:49 +0000 Subject: [PATCH 2/7] wip, but seems like impact is low --- .../cpp/src/barretenberg/polynomials/pow.hpp | 37 +++++++++---------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp index 3eb23329004..9aa210b6038 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp @@ -124,6 +124,7 @@ template struct PowPolynomial { */ BB_PROFILE void compute_values() { + BB_OP_COUNT_TIME(); size_t pow_size = 1 << betas.size(); pow_betas = std::vector(pow_size); @@ -137,19 +138,31 @@ template struct PowPolynomial { size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1 size_t iterations_per_thread = pow_size / num_threads; // actual iterations per thread - + static_cast(iterations_per_thread); // // first generate values for each thread with one thread // size_t cur_pow = 1; + // size_t level = betas.size() - 1; // while (cur_pow < max_num_threads) { - - // pow_betas[] + // for (size_t i = 0; i < cur_pow; i++) { + // pow_betas[(i << (level + 1)) + (1 << level)] = pow_betas[(i << (level + 1))] * betas[level]; + // } + // cur_pow++; + // level--; // } - // each thread computes a tree + // // each thread computes a tree at i< 0) { + // for (size_t i =) + // level--; + // } + // }); + parallel_for(num_threads, [&](size_t thread_idx) { size_t start = thread_idx * iterations_per_thread; size_t end = (thread_idx + 1) * iterations_per_thread; for (size_t i = start; i < end; i++) { - // compute a tree actually auto res = FF(1); for (size_t j = i, beta_idx = 0; j > 0; j >>= 1, beta_idx++) { if ((j & 1) == 1) { @@ -159,20 +172,6 @@ template struct PowPolynomial { pow_betas[i] = res; } }); - - // parallel_for(num_threads, [&](size_t thread_idx) { - // size_t start = thread_idx * iterations_per_thread; - // size_t end = (thread_idx + 1) * iterations_per_thread; - // for (size_t i = start; i < end; i++) { - // auto res = FF(1); - // for (size_t j = i, beta_idx = 0; j > 0; j >>= 1, beta_idx++) { - // if ((j & 1) == 1) { - // res *= betas[beta_idx]; - // } - // } - // pow_betas[i] = res; - // } - // }); } }; } // namespace bb \ No newline at end of file From c1dc1675afadb71b25a1fd17a974feb21879f55e Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Thu, 22 Feb 2024 18:23:39 +0000 Subject: [PATCH 3/7] adding issues to two places for future optimizations --- .../commitment_schemes/ipa/ipa.hpp | 3 +++ .../cpp/src/barretenberg/polynomials/pow.hpp | 23 ++++--------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index a0c7f735096..f89b42ca60a 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -235,6 +235,9 @@ template class IPA { // Compute G_zero // First construct s_vec std::vector s_vec(poly_degree); + // TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as its O(nlogn). + // This can be optimized to be linear by computing a tree of products. Its very readable, so we're + // leaving it unoptimized for now. run_loop_in_parallel_if_effective( poly_degree, [&s_vec, &round_challenges_inv, log_poly_degree](size_t start, size_t end) { diff --git a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp index 9aa210b6038..495766d099b 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp @@ -139,26 +139,11 @@ template struct PowPolynomial { num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1 size_t iterations_per_thread = pow_size / num_threads; // actual iterations per thread static_cast(iterations_per_thread); - // // first generate values for each thread with one thread - // size_t cur_pow = 1; - // size_t level = betas.size() - 1; - // while (cur_pow < max_num_threads) { - // for (size_t i = 0; i < cur_pow; i++) { - // pow_betas[(i << (level + 1)) + (1 << level)] = pow_betas[(i << (level + 1))] * betas[level]; - // } - // cur_pow++; - // level--; - // } - // // each thread computes a tree at i< 0) { - // for (size_t i =) - // level--; - // } - // }); + // TODO(https://github.com/AztecProtocol/barretenberg/issues/864): This computation is asymtotically slow as it + // does pow_size * log(pow_size) work. However, in practice, its super efficient because its trivially + // parallelizable and only takes 45ms for the whole 6 iter IVC benchmark. Its also very readable, so we're + // leaving it unoptimized for now. parallel_for(num_threads, [&](size_t thread_idx) { size_t start = thread_idx * iterations_per_thread; size_t end = (thread_idx + 1) * iterations_per_thread; From 221888cad72cf404c24ff804719e107a64893ffc Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Thu, 22 Feb 2024 18:25:07 +0000 Subject: [PATCH 4/7] undo uncomment --- .../src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp b/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp index 6fddc86057d..95eeb7d5b94 100644 --- a/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/benchmark/ivc_bench/ivc.bench.cpp @@ -168,10 +168,10 @@ BENCHMARK_DEFINE_F(IvcBench, Translator)(benchmark::State& state) ->Arg(1 << 6) BENCHMARK_REGISTER_F(IvcBench, Full)->Unit(benchmark::kMillisecond)->ARGS; -// BENCHMARK_REGISTER_F(IvcBench, Accumulate)->Unit(benchmark::kMillisecond)->ARGS; -// BENCHMARK_REGISTER_F(IvcBench, Decide)->Unit(benchmark::kMillisecond)->ARGS; -// BENCHMARK_REGISTER_F(IvcBench, ECCVM)->Unit(benchmark::kMillisecond)->ARGS; -// BENCHMARK_REGISTER_F(IvcBench, Translator)->Unit(benchmark::kMillisecond)->ARGS; +BENCHMARK_REGISTER_F(IvcBench, Accumulate)->Unit(benchmark::kMillisecond)->ARGS; +BENCHMARK_REGISTER_F(IvcBench, Decide)->Unit(benchmark::kMillisecond)->ARGS; +BENCHMARK_REGISTER_F(IvcBench, ECCVM)->Unit(benchmark::kMillisecond)->ARGS; +BENCHMARK_REGISTER_F(IvcBench, Translator)->Unit(benchmark::kMillisecond)->ARGS; } // namespace From dff82f29518dfa0ed088f00c443d78c32d6fca7d Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Thu, 22 Feb 2024 18:25:47 +0000 Subject: [PATCH 5/7] undo grumpkin change --- barretenberg/cpp/srs_db/download_grumpkin.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/barretenberg/cpp/srs_db/download_grumpkin.sh b/barretenberg/cpp/srs_db/download_grumpkin.sh index 9d756454cf5..fb59a1ec806 100755 --- a/barretenberg/cpp/srs_db/download_grumpkin.sh +++ b/barretenberg/cpp/srs_db/download_grumpkin.sh @@ -6,6 +6,6 @@ set -eu # Enter build directory sibling to our script folder. cd $(dirname $0)/../build -./bin/grumpkin_srs_gen 524288 +./bin/grumpkin_srs_gen 1048576 mkdir -p ~/.bb-crs ln -s ../srs_db/grumpkin/monomial ~/.bb-crs/monomial \ No newline at end of file From 978200e32c5678bddec9103ac3517c449c69d422 Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Thu, 22 Feb 2024 18:52:59 +0000 Subject: [PATCH 6/7] build fix --- barretenberg/cpp/src/barretenberg/polynomials/pow.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp index 495766d099b..7dcc2fbddeb 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/pow.hpp @@ -1,5 +1,6 @@ #pragma once #include "barretenberg/common/compiler_hints.hpp" +#include "barretenberg/common/op_count.hpp" #include "barretenberg/common/thread.hpp" #include @@ -138,7 +139,6 @@ template struct PowPolynomial { size_t num_threads = std::min(desired_num_threads, max_num_threads); // fewer than max if justified num_threads = num_threads > 0 ? num_threads : 1; // ensure num threads is >= 1 size_t iterations_per_thread = pow_size / num_threads; // actual iterations per thread - static_cast(iterations_per_thread); // TODO(https://github.com/AztecProtocol/barretenberg/issues/864): This computation is asymtotically slow as it // does pow_size * log(pow_size) work. However, in practice, its super efficient because its trivially From 32b94eb7b5c53b7eab0cddd66331a1af08bcaf1d Mon Sep 17 00:00:00 2001 From: lucasxia01 Date: Fri, 23 Feb 2024 11:31:00 +0000 Subject: [PATCH 7/7] style update --- .../cpp/src/barretenberg/polynomials/pow.bench.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp b/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp index 44f7b19b457..083ff70db5e 100644 --- a/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp +++ b/barretenberg/cpp/src/barretenberg/polynomials/pow.bench.cpp @@ -10,13 +10,13 @@ namespace { void compute_pow_poly(benchmark::State& state) { // just set up huge vector - auto betas = std::vector{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 }; + std::vector betas{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28 }; for (auto _ : state) { - auto num_betas = state.range(0); + int64_t num_betas = state.range(0); std::vector cur_betas(betas.begin(), betas.begin() + num_betas); - auto pow = PowPolynomial(cur_betas); + PowPolynomial pow{ cur_betas }; pow.compute_values(); } }