From a4bd3e14f6cde05f4d59bc48142e9ef4bc78f0ae Mon Sep 17 00:00:00 2001
From: Zachary James Williamson
Date: Tue, 29 Oct 2024 12:33:43 -0700
Subject: [PATCH] feat: 20-30% cost reduction in recursive ipa algorithm (#9420)

eccvm_recursive_verifier_test measurements (size-512 eccvm recursive verification)

Old: 876,214
New: 678,751

The relative performance delta should be much greater for large eccvm instances, as this PR removes an nlogn algorithm.

This PR resolves issue [#857](https://github.com/AztecProtocol/barretenberg/issues/857) and issue [#1023](https://github.com/AztecProtocol/barretenberg/issues/1023) (single batch mul in IPA)

Re: [#1023](https://github.com/AztecProtocol/barretenberg/issues/1023). The code still performs 2 batch muls, but all additional * operator calls have been combined into the batch muls. It is not worth combining the two batch muls themselves, as that would require multiplying a large number of scalar multipliers; in the recursive setting the scalars are bigfield elements, so the extra bigfield::operator* calls would outweigh the saving.

Additional improvements:

- removed unnecessary uses of the `pow` operator in ipa - in the recursive setting these were stdlib::bigfield::pow calls and very expensive
- reduced the number of distinct multiplication calls in ipa::reduce_verify_internal
- cycle_scalar::cycle_scalar(stdlib::bigfield) constructor now more optimally constructs a cycle_scalar out of a bigfield element. The new method leverages the fact that `scalar.lo` and `scalar.hi` are implicitly range-constrained to remove redundant bigfield constructor calls and arithmetic calls, and the process of performing a scalar multiplication applies a modular reduction to the input, which makes the explicit call to `validate_scalar_is_in_field` unnecessary

---------

Co-authored-by: lucasxia01
---
 .../commitment_schemes/ipa/ipa.hpp          | 131 ++++++++++--------
 .../eccvm_recursive_verifier.test.cpp       |   6 +-
 .../goblin_recursive_verifier.test.cpp      |   7 +-
 .../stdlib/primitives/group/cycle_group.cpp | 111 +++++++++++++--
 .../op_queue/ecc_op_queue.hpp               |   4 +-
 5 files changed, 179 insertions(+), 80 deletions(-)

diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
index 859bb7876ff..f877488330e 100644
--- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
+++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp
@@ -369,20 +369,29 @@ template class IPA {
         // Construct vector s
         std::vector s_vec(poly_length, Fr::one());
-        // TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as its
-        // O(nlogn). This can be optimized to be linear by computing a tree of products. Its very readable, so we're
-        // leaving it unoptimized for now.
-        parallel_for_heuristic(
-            poly_length,
-            [&](size_t i) {
-                for (size_t j = (log_poly_degree - 1); j != static_cast(-1); j--) {
-                    auto bit = (i >> j) & 1;
-                    bool b = static_cast(bit);
-                    if (b) {
-                        s_vec[i] *= round_challenges_inv[log_poly_degree - 1 - j];
-                    }
-                }
-            }, thread_heuristics::FF_MULTIPLICATION_COST * log_poly_degree);
+        std::vector s_vec_temporaries(poly_length / 2);
+
+        Fr* previous_round_s = &s_vec_temporaries[0];
+        Fr* current_round_s = &s_vec[0];
+        // if number of rounds is even we need to swap these so that s_vec always contains the result
+        if ((log_poly_degree & 1) == 0)
+        {
+            std::swap(previous_round_s, current_round_s);
+        }
+        previous_round_s[0] = Fr(1);
+        for (size_t i = 0; i < log_poly_degree; ++i)
+        {
+            const size_t round_size = 1 << (i + 1);
+            const Fr round_challenge = round_challenges_inv[i];
+            parallel_for_heuristic(
+                round_size / 2,
+                [&](size_t j) {
+                    current_round_s[j * 2] = previous_round_s[j];
+                    current_round_s[j * 2 + 1] = previous_round_s[j] * round_challenge;
+                }, thread_heuristics::FF_MULTIPLICATION_COST * 2);
+            std::swap(current_round_s, previous_round_s);
+        }
+
         std::span srs_elements = vk->get_monomial_points();
         if (poly_length * 2 > srs_elements.size()) {
@@ -454,28 +463,20 @@ template class IPA {
         const Fr generator_challenge = transcript->template get_challenge("IPA:generator_challenge");
         auto builder = generator_challenge.get_context();
-        Commitment aux_generator = Commitment::one(builder) * generator_challenge;
-        const auto log_poly_degree = numeric::get_msb(static_cast(poly_length));
-
-        // Step 3.
-        // Compute C' = C + f(\beta) ⋅ U
-        GroupElement C_prime = opening_claim.commitment + aux_generator * opening_claim.opening_pair.evaluation;
-
         auto pippenger_size = 2 * log_poly_degree;
         std::vector round_challenges(log_poly_degree);
         std::vector round_challenges_inv(log_poly_degree);
         std::vector msm_elements(pippenger_size);
         std::vector msm_scalars(pippenger_size);
-        // Step 4.
+        // Step 3.
         // Receive all L_i and R_i and prepare for MSM
         for (size_t i = 0; i < log_poly_degree; i++) {
             std::string index = std::to_string(log_poly_degree - i - 1);
             auto element_L = transcript->template receive_from_prover("IPA:L_" + index);
             auto element_R = transcript->template receive_from_prover("IPA:R_" + index);
             round_challenges[i] = transcript->template get_challenge("IPA:round_challenge_" + index);
-
             round_challenges_inv[i] = round_challenges[i].invert();
             msm_elements[2 * i] = element_L;
@@ -484,63 +485,73 @@ template class IPA {
             msm_scalars[2 * i + 1] = round_challenges[i];
         }
-        // Step 5.
-        // Compute C₀ = C' + ∑_{j ∈ [k]} u_j^{-1}L_j + ∑_{j ∈ [k]} u_jR_j
-        GroupElement LR_sums = GroupElement::batch_mul(msm_elements, msm_scalars);
-
-        GroupElement C_zero = C_prime + LR_sums;
-
-        // Step 6.
+        // Step 4.
         // Compute b_zero where b_zero can be computed using the polynomial:
         // g(X) = ∏_{i ∈ [k]} (1 + u_{i-1}^{-1}.X^{2^{i-1}}).
         // b_zero = g(evaluation) = ∏_{i ∈ [k]} (1 + u_{i-1}^{-1}. (evaluation)^{2^{i-1}})
         Fr b_zero = Fr(1);
+        Fr challenge = opening_claim.opening_pair.challenge;
         for (size_t i = 0; i < log_poly_degree; i++) {
-            b_zero *= Fr(1) + (round_challenges_inv[log_poly_degree - 1 - i] *
-                               opening_claim.opening_pair.challenge.pow(1 << i));
+            b_zero *= Fr(1) + (round_challenges_inv[log_poly_degree - 1 - i] * challenge);
+            if (i != log_poly_degree - 1)
+            {
+                challenge = challenge * challenge;
+            }
         }
-        // Step 7.
+
+        // Step 5.
         // Construct vector s
+        // We implement a linear-time algorithm to optimally compute this vector
+        // Note: currently requires an extra vector of size `poly_length / 2` to cache temporaries
+        // this might be optimizable if we care enough, but the size of this poly shouldn't be large relative to the builder polynomial sizes
+        std::vector s_vec_temporaries(poly_length / 2);
         std::vector s_vec(poly_length);
-        // TODO(https://github.com/AztecProtocol/barretenberg/issues/857): This code is not efficient as its
-        // O(nlogn). This can be optimized to be linear by computing a tree of products.
-        for (size_t i = 0; i < poly_length; i++) {
-            Fr s_vec_scalar = Fr(1);
-            for (size_t j = (log_poly_degree - 1); j != static_cast(-1); j--) {
-                auto bit = (i >> j) & 1;
-                bool b = static_cast(bit);
-                if (b) {
-                    s_vec_scalar *= round_challenges_inv[log_poly_degree - 1 - j];
-                }
+        Fr* previous_round_s = &s_vec_temporaries[0];
+        Fr* current_round_s = &s_vec[0];
+        // if number of rounds is even we need to swap these so that s_vec always contains the result
+        if ((log_poly_degree & 1) == 0)
+        {
+            std::swap(previous_round_s, current_round_s);
+        }
+        previous_round_s[0] = Fr(1);
+        for (size_t i = 0; i < log_poly_degree; ++i)
+        {
+            const size_t round_size = 1 << (i + 1);
+            const Fr round_challenge = round_challenges_inv[i];
+            for (size_t j = 0; j < round_size / 2; ++j)
+            {
+                current_round_s[j * 2] = previous_round_s[j];
+                current_round_s[j * 2 + 1] = previous_round_s[j] * round_challenge;
             }
-            s_vec[i] = s_vec_scalar;
+            std::swap(current_round_s, previous_round_s);
         }
-        auto srs_elements = vk->get_monomial_points();
-
-        // TODO(https://github.com/AztecProtocol/barretenberg/issues/1023): Unify the two batch_muls
+        // Step 6.
+        // Receive a₀ from the prover
+        const auto a_zero = transcript->template receive_from_prover("IPA:a_0");
-        // Step 8.
+        // Step 7.
         // Compute G₀
         // Unlike the native verification function, the verifier commitment key only contains the SRS so we can apply
         // batch_mul directly on it.
+        const std::vector srs_elements = vk->get_monomial_points();
         Commitment G_zero = Commitment::batch_mul(srs_elements, s_vec);
-        // Step 9.
-        // Receive a₀ from the prover
-        auto a_zero = transcript->template receive_from_prover("IPA:a_0");
-
-        // Step 10.
-        // Compute C_right
-        GroupElement right_hand_side = G_zero * a_zero + aux_generator * a_zero * b_zero;
-
-        // Step 11.
-        // Check if C_right == C₀
-        C_zero.assert_equal(right_hand_side);
-        return (C_zero.get_value() == right_hand_side.get_value());
+        // Step 8.
+        // Compute R = ∑_{j ∈ [k]} u_j^{-1}L_j + ∑_{j ∈ [k]} u_jR_j - G₀ * a₀ - (f(\beta) + a₀ * b₀) ⋅ U
+        // This is a combination of several IPA relations into a large batch mul
+        // which should be equal to -C
+        msm_elements.emplace_back(-G_zero);
+        msm_elements.emplace_back(-Commitment::one(builder));
+        msm_scalars.emplace_back(a_zero);
+        msm_scalars.emplace_back(generator_challenge * a_zero.madd(b_zero, {opening_claim.opening_pair.evaluation}));
+        GroupElement ipa_relation = GroupElement::batch_mul(msm_elements, msm_scalars);
+        ipa_relation.assert_equal(-opening_claim.commitment);
+
+        return (ipa_relation.get_value() == -opening_claim.commitment.get_value());
     }

 public:
diff --git a/barretenberg/cpp/src/barretenberg/stdlib/eccvm_verifier/eccvm_recursive_verifier.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/eccvm_verifier/eccvm_recursive_verifier.test.cpp
index 5dc17cb76b5..77325bad7e6 100644
--- a/barretenberg/cpp/src/barretenberg/stdlib/eccvm_verifier/eccvm_recursive_verifier.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/stdlib/eccvm_verifier/eccvm_recursive_verifier.test.cpp
@@ -84,7 +84,7 @@ template class ECCVMRecursiveTests : public ::testing
         OuterBuilder outer_circuit;
         RecursiveVerifier verifier{ &outer_circuit, verification_key };
         verifier.verify_proof(proof);
-        info("Recursive Verifier: num gates = ", outer_circuit.num_gates);
+        info("Recursive Verifier: num gates = ", outer_circuit.get_estimated_num_finalized_gates());

         // Check for a failure flag in the recursive verifier circuit
         EXPECT_EQ(outer_circuit.failed(), false) << outer_circuit.err();
@@ -135,10 +135,10 @@ template class ECCVMRecursiveTests : public ::testing
         OuterBuilder outer_circuit;
         RecursiveVerifier verifier{ &outer_circuit, verification_key };
         verifier.verify_proof(proof);
-        info("Recursive Verifier: num gates = ", outer_circuit.num_gates);
+        info("Recursive Verifier: estimated num finalized gates = ", outer_circuit.get_estimated_num_finalized_gates());

         // Check for a failure flag in the recursive verifier circuit
-        EXPECT_EQ(outer_circuit.failed(), true) << outer_circuit.err();
+        EXPECT_FALSE(CircuitChecker::check(outer_circuit));
     }
 };
 using FlavorTypes = testing::Types>;
diff --git a/barretenberg/cpp/src/barretenberg/stdlib/goblin_verifier/goblin_recursive_verifier.test.cpp b/barretenberg/cpp/src/barretenberg/stdlib/goblin_verifier/goblin_recursive_verifier.test.cpp
index bcf05ed2583..10ece46e9e8 100644
--- a/barretenberg/cpp/src/barretenberg/stdlib/goblin_verifier/goblin_recursive_verifier.test.cpp
+++ b/barretenberg/cpp/src/barretenberg/stdlib/goblin_verifier/goblin_recursive_verifier.test.cpp
@@ -113,7 +113,9 @@ TEST_F(GoblinRecursiveVerifierTests, ECCVMFailure)
     // Tamper with the ECCVM proof
     for (auto& val : proof.eccvm_proof) {
-        if (val > 0) { // tamper by finding the first non-zero value and incrementing it by 1
+        if (val > 0) { // tamper with the proof
+                       // by finding the first non-zero value
+                       // and incrementing it by 1
            val += 1;
            break;
        }
@@ -121,9 +123,8 @@ TEST_F(GoblinRecursiveVerifierTests, ECCVMFailure)
     Builder builder;
     GoblinRecursiveVerifier verifier{ &builder, verifier_input };
-    verifier.verify(proof);
-    EXPECT_FALSE(CircuitChecker::check(builder));
+    EXPECT_DEBUG_DEATH(verifier.verify(proof), "(sumcheck_verified && batched_opening_verified)");
 }

 /**
diff --git a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp
index 789c5eb7627..352d4d6b679 100644
--- a/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp
+++ b/barretenberg/cpp/src/barretenberg/stdlib/primitives/group/cycle_group.cpp
@@ -678,23 +678,110 @@ typename cycle_group::cycle_scalar cycle_group::cycle_scalar::
 template cycle_group::cycle_scalar::cycle_scalar(BigScalarField& scalar)
 {
     auto* ctx = get_context() ? get_context() : scalar.get_context();
-    const uint256_t value((scalar.get_value() % uint512_t(ScalarField::modulus)).lo);
-    const uint256_t value_lo = value.slice(0, LO_BITS);
-    const uint256_t value_hi = value.slice(LO_BITS, HI_BITS);
+    if (scalar.is_constant()) {
+        const uint256_t value((scalar.get_value() % uint512_t(ScalarField::modulus)).lo);
+        const uint256_t value_lo = value.slice(0, LO_BITS);
+        const uint256_t value_hi = value.slice(LO_BITS, HI_BITS);
+
         lo = value_lo;
         hi = value_hi;
         // N.B. to be able to call assert equal, these cannot be constants
     } else {
-        lo = witness_t(ctx, value_lo);
-        hi = witness_t(ctx, value_hi);
-        field_t zero = field_t(0);
-        zero.convert_constant_to_fixed_witness(ctx);
-        BigScalarField lo_big(lo, zero);
-        BigScalarField hi_big(hi, zero);
-        BigScalarField res = lo_big + hi_big * BigScalarField((uint256_t(1) << LO_BITS));
-        scalar.assert_equal(res);
-        validate_scalar_is_in_field();
+        // To efficiently convert a bigfield into a cycle scalar,
+        // we are going to explicitly rely on the fact that `scalar.lo` and `scalar.hi`
+        // are implicitly range-constrained to be 128 bits when they are converted into 4-bit lookup window slices

+        // First check: can the scalar actually fit into LO_BITS + HI_BITS?
+        // If it can, we can tolerate the scalar being > ScalarField::modulus, because performing a scalar mul
+        // implicitly performs a modular reduction
+        // If not, call `self_reduce` to cut enough modulus multiples until the above condition is met
+        if (scalar.get_maximum_value() >= (uint512_t(1) << (LO_BITS + HI_BITS))) {
+            scalar.self_reduce();
+        }
+
+        field_t limb0 = scalar.binary_basis_limbs[0].element;
+        field_t limb1 = scalar.binary_basis_limbs[1].element;
+        field_t limb2 = scalar.binary_basis_limbs[2].element;
+        field_t limb3 = scalar.binary_basis_limbs[3].element;
+
+        // The general plan is as follows:
+        // 1. ensure limb0 contains no more than BigScalarField::NUM_LIMB_BITS
+        // 2. define limb1_lo = limb1.slice(0, LO_BITS - BigScalarField::NUM_LIMB_BITS)
+        // 3. define limb1_hi = limb1.slice(LO_BITS - BigScalarField::NUM_LIMB_BITS, )
+        // 4. construct *this.lo out of limb0 and limb1_lo
+        // 5. construct *this.hi out of limb1_hi, limb2 and limb3
+        // This is a lot of logic, but very cheap on constraints.
+        // For fresh bignums that have come out of a MUL operation,
+        // the only "expensive" part is a size (LO_BITS - BigScalarField::NUM_LIMB_BITS) range check
+
+        // to convert into a cycle_scalar, we need to convert 4*68 bit limbs into 2*128 bit limbs
+        // we also need to ensure that the number of bits in cycle_scalar is < LO_BITS + HI_BITS
+        // note: we do not need to validate that the scalar is within the field modulus
+        // because performing a scalar multiplication implicitly performs a modular reduction (ecc group is
+        // multiplicative modulo BigField::modulus)
+
+        uint256_t limb1_max = scalar.binary_basis_limbs[1].maximum_value;
+
+        // Ensure that limb0 only contains at most NUM_LIMB_BITS. If it exceeds this value, slice off the excess and add
+        // it into limb1
+        if (scalar.binary_basis_limbs[0].maximum_value > BigScalarField::DEFAULT_MAXIMUM_LIMB) {
+            const uint256_t limb = limb0.get_value();
+            const uint256_t lo_v = limb.slice(0, BigScalarField::NUM_LIMB_BITS);
+            const uint256_t hi_v = limb >> BigScalarField::NUM_LIMB_BITS;
+            field_t lo = field_t::from_witness(ctx, lo_v);
+            field_t hi = field_t::from_witness(ctx, hi_v);
+
+            uint256_t hi_max = (scalar.binary_basis_limbs[0].maximum_value >> BigScalarField::NUM_LIMB_BITS);
+            const uint64_t hi_bits = hi_max.get_msb() + 1;
+            lo.create_range_constraint(BigScalarField::NUM_LIMB_BITS);
+            hi.create_range_constraint(static_cast(hi_bits));
+            limb0.assert_equal(lo + hi * BigScalarField::shift_1);
+
+            limb1 += hi;
+            limb1_max += hi_max;
+            limb0 = lo;
+        }
+
+        // sanity check that limb[1] is the limb that contributes both to *this.lo and *this.hi
+        ASSERT((BigScalarField::NUM_LIMB_BITS * 2 > LO_BITS) && (BigScalarField::NUM_LIMB_BITS < LO_BITS));
+
+        // limb1 is the tricky one as it contributes to both *this.lo and *this.hi
+        // By this point, we know that limb1 fits in the range `1 << BigScalarField::NUM_LIMB_BITS` to
+        // `(1 << BigScalarField::NUM_LIMB_BITS) + limb1_max`. We need to slice this limb into 2. The first slice
+        // is LO_BITS - BigScalarField::NUM_LIMB_BITS bits (which represents its contribution to *this.lo) and the second
+        // represents the limb's contribution to *this.hi
+        // Step 1: compute the max bit sizes of both slices
+        const size_t lo_bits_in_limb_1 = LO_BITS - BigScalarField::NUM_LIMB_BITS;
+        const size_t hi_bits_in_limb_1 = (static_cast(limb1_max.get_msb()) + 1) - lo_bits_in_limb_1;
+
+        // Step 2: compute the witness values of both slices
+        const uint256_t limb_1 = limb1.get_value();
+        const uint256_t limb_1_hi_multiplicand = (uint256_t(1) << lo_bits_in_limb_1);
+        const uint256_t limb_1_hi_v = limb_1 >> lo_bits_in_limb_1;
+        const uint256_t limb_1_lo_v = limb_1 - (limb_1_hi_v << lo_bits_in_limb_1);
+
+        // Step 3: instantiate both slices as witnesses and validate their sum equals limb1
+        field_t limb_1_lo = field_t::from_witness(ctx, limb_1_lo_v);
+        field_t limb_1_hi = field_t::from_witness(ctx, limb_1_hi_v);
+        limb1.assert_equal(limb_1_hi * limb_1_hi_multiplicand + limb_1_lo);
+
+        // Step 4: apply range constraints to validate both slices represent the expected contributions to *this.lo and
+        // *this.hi
+        limb_1_lo.create_range_constraint(lo_bits_in_limb_1);
+        limb_1_hi.create_range_constraint(hi_bits_in_limb_1);
+
+        // construct *this.lo out of:
+        // a. `limb0` (the first NUM_LIMB_BITS bits of scalar)
+        // b. `limb_1_lo` (the first LO_BITS - NUM_LIMB_BITS bits of limb1)
+        lo = limb0 + (limb_1_lo * BigScalarField::shift_1);
+
+        const uint256_t limb_2_shift = uint256_t(1) << (BigScalarField::NUM_LIMB_BITS - lo_bits_in_limb_1);
+        const uint256_t limb_3_shift =
+            uint256_t(1) << ((BigScalarField::NUM_LIMB_BITS - lo_bits_in_limb_1) + BigScalarField::NUM_LIMB_BITS);
+
+        // construct *this.hi out of limb2, limb3 and the remaining term from limb1 not contributing to `lo`
+        hi = limb_1_hi.add_two(limb2 * limb_2_shift, limb3 * limb_3_shift);
     }
 };
diff --git a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/op_queue/ecc_op_queue.hpp b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/op_queue/ecc_op_queue.hpp
index 91e64b764a0..93ebc0a35d9 100644
--- a/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/op_queue/ecc_op_queue.hpp
+++ b/barretenberg/cpp/src/barretenberg/stdlib_circuit_builders/op_queue/ecc_op_queue.hpp
@@ -485,8 +485,8 @@ class ECCOpQueue {
         // Decompose point coordinates (Fq) into hi-lo chunks (Fr)
         const size_t CHUNK_SIZE = 2 * DEFAULT_NON_NATIVE_FIELD_LIMB_BITS;
-        auto x_256 = uint256_t(point.x);
-        auto y_256 = uint256_t(point.y);
+        uint256_t x_256(point.x);
+        uint256_t y_256(point.y);
         ultra_op.return_is_infinity = point.is_point_at_infinity();
         ultra_op.x_lo = Fr(x_256.slice(0, CHUNK_SIZE));
         ultra_op.x_hi = Fr(x_256.slice(CHUNK_SIZE, CHUNK_SIZE * 2));
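Below is a standalone sketch (not part of the patch) of the linear-time construction of the IPA vector s that both ipa.hpp hunks above implement. The field type `Fr` is a generic stand-in and the helper name `compute_ipa_s_vec` is hypothetical; the real code runs inline inside the IPA verifier and, in the native path, wraps the inner loop in `parallel_for_heuristic`.

```cpp
// Sketch of the linear-time IPA s-vector construction, assuming a generic field type Fr.
#include <cstddef>
#include <utility>
#include <vector>

template <typename Fr>
std::vector<Fr> compute_ipa_s_vec(const std::vector<Fr>& round_challenges_inv, size_t log_n)
{
    const size_t n = size_t(1) << log_n;
    // s[i] = product over the set bits j of i of round_challenges_inv[log_n - 1 - j].
    // Each round doubles the table built so far: entry 2j copies its parent, entry 2j + 1
    // multiplies the parent by the current round challenge, so the full table costs
    // n - 1 field multiplications instead of the O(n log n) per-entry product.
    std::vector<Fr> s_vec(n, Fr(1));
    std::vector<Fr> temporaries(n / 2 + 1, Fr(1));

    Fr* previous_round_s = temporaries.data();
    Fr* current_round_s = s_vec.data();
    // The two buffers are swapped after every round; with an even number of rounds the
    // result would land in the temporary buffer, so start the other way around.
    if ((log_n & 1) == 0) {
        std::swap(previous_round_s, current_round_s);
    }
    previous_round_s[0] = Fr(1);
    for (size_t i = 0; i < log_n; ++i) {
        const size_t round_size = size_t(1) << (i + 1);
        const Fr round_challenge = round_challenges_inv[i];
        for (size_t j = 0; j < round_size / 2; ++j) {
            current_round_s[j * 2] = previous_round_s[j];
            current_round_s[j * 2 + 1] = previous_round_s[j] * round_challenge;
        }
        std::swap(current_round_s, previous_round_s);
    }
    return s_vec;
}
```

The ping-pong buffer keeps the extra memory at `poly_length / 2` entries: each round reads the previous round's table and writes a table twice as large, and the parity swap at the start guarantees the final round always writes into `s_vec`.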