
Commit

merge fix
EkamSinghPandher committed Oct 30, 2024
2 parents d36f304 + 8a00c2b commit 55c6501
Showing 25 changed files with 2,513 additions and 213 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -3,7 +3,7 @@ members = ["field", "maybe_rayon", "plonky2", "starky", "util", "gen", "u32", "e
resolver = "2"

[workspace.dependencies]
cryptography_cuda = { git = "ssh://git@github.com/okx/cryptography_cuda.git", rev = "2a7c42d29ee72d7c2c2da9378ae816384c43cdec" }
cryptography_cuda = { git = "ssh://git@github.com/okx/cryptography_cuda.git", rev = "547192b2ef42dc7519435059c86f88431b8de999" }
ahash = { version = "0.8.7", default-features = false, features = [
"compile-time-rng",
] } # NOTE: Be sure to keep this version the same as the dependency in `hashbrown`.
16 changes: 0 additions & 16 deletions field/src/fft.rs
@@ -1,8 +1,6 @@
use alloc::vec::Vec;
use core::cmp::{max, min};

#[cfg(feature = "cuda")]
use cryptography_cuda::{ntt, types::NTTInputOutputOrder};
use plonky2_util::{log2_strict, reverse_index_bits_in_place};
use unroll::unroll_for_loops;

@@ -34,20 +32,6 @@ pub fn fft_root_table<F: Field>(n: usize) -> FftRootTable<F> {
root_table
}

#[allow(dead_code)]
#[cfg(feature = "cuda")]
fn fft_dispatch_gpu<F: Field>(
input: &mut [F],
zero_factor: Option<usize>,
root_table: Option<&FftRootTable<F>>,
) {
if F::CUDA_SUPPORT {
return ntt(0, input, NTTInputOutputOrder::NN);
} else {
return fft_dispatch_cpu(input, zero_factor, root_table);
}
}

fn fft_dispatch_cpu<F: Field>(
input: &mut [F],
zero_factor: Option<usize>,
6 changes: 5 additions & 1 deletion plonky2/Cargo.toml
@@ -37,7 +37,7 @@ serde = { workspace = true, features = ["rc"] }
static_assertions = { workspace = true }
unroll = { workspace = true }
web-time = { version = "1.0.0", optional = true }
once_cell = { version = "1.18.0" }
once_cell = { version = "1.20.2" }
papi-bindings = { version = "0.5.2" }

# Local dependencies
@@ -80,6 +80,10 @@ harness = false
name = "ffts"
harness = false

[[bench]]
name = "lde"
harness = false

[[bench]]
name = "hashing"
harness = false
59 changes: 59 additions & 0 deletions plonky2/benches/lde.rs
@@ -0,0 +1,59 @@
mod allocator;

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
#[cfg(feature = "cuda")]
use cryptography_cuda::init_cuda_degree_rs;
use plonky2::field::extension::Extendable;
use plonky2::field::goldilocks_field::GoldilocksField;
use plonky2::field::polynomial::PolynomialCoeffs;
use plonky2::fri::oracle::PolynomialBatch;
use plonky2::hash::hash_types::RichField;
use plonky2::plonk::config::{GenericConfig, PoseidonGoldilocksConfig};
use plonky2::util::timing::TimingTree;
use tynm::type_name;

pub(crate) fn bench_batch_lde<
F: RichField + Extendable<D>,
C: GenericConfig<D, F = F>,
const D: usize,
>(
c: &mut Criterion,
) {
const RATE_BITS: usize = 3;

let mut group = c.benchmark_group(&format!("lde<{}>", type_name::<F>()));

#[cfg(feature = "cuda")]
init_cuda_degree_rs(16);

for size_log in [13, 14, 15] {
let orig_size = 1 << (size_log - RATE_BITS);
let lde_size = 1 << size_log;
let batch_size = 1 << 4;

group.bench_with_input(BenchmarkId::from_parameter(lde_size), &lde_size, |b, _| {
let polynomials: Vec<PolynomialCoeffs<F>> = (0..batch_size)
.into_iter()
.map(|_i| PolynomialCoeffs::new(F::rand_vec(orig_size)))
.collect();
let mut timing = TimingTree::new("lde", log::Level::Error);
b.iter(|| {
PolynomialBatch::<F, C, D>::from_coeffs(
polynomials.clone(),
RATE_BITS,
false,
1,
&mut timing,
None,
)
});
});
}
}

fn criterion_benchmark(c: &mut Criterion) {
bench_batch_lde::<GoldilocksField, PoseidonGoldilocksConfig, 2>(c);
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
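
Note (not part of the diff): the [[bench]] entry with harness = false added to plonky2/Cargo.toml above is what lets Criterion supply its own main for this file, so the target runs through the usual cargo bench machinery. Because each batch-LDE iteration is heavy at these sizes, Criterion's default of 100 samples per data point can make the group slow to finish; below is a minimal, self-contained sketch of how such a group could be trimmed with sample_size and measurement_time. It is illustrative only, with a placeholder workload rather than this crate's LDE call.

use std::hint::black_box;
use std::time::Duration;

use criterion::{criterion_group, criterion_main, Criterion};

fn tuned_group(c: &mut Criterion) {
    let mut group = c.benchmark_group("lde_tuned");
    // Fewer samples plus a fixed measurement window keep long-running benches manageable.
    group.sample_size(10); // Criterion's minimum; the default is 100
    group.measurement_time(Duration::from_secs(20));
    group.bench_function("placeholder", |b| b.iter(|| black_box(1u64) + black_box(1u64)));
    group.finish();
}

criterion_group!(benches, tuned_group);
criterion_main!(benches);
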
158 changes: 152 additions & 6 deletions plonky2/src/fri/oracle.rs
@@ -27,6 +27,22 @@ use crate::util::reducing::ReducingFactor;
use crate::util::timing::TimingTree;
use crate::util::{log2_strict, reverse_bits, reverse_index_bits_in_place, transpose};

#[cfg(all(feature = "cuda", any(test, doctest)))]
pub static GPU_INIT: once_cell::sync::Lazy<std::sync::Arc<std::sync::Mutex<u64>>> =
once_cell::sync::Lazy::new(|| std::sync::Arc::new(std::sync::Mutex::new(0)));

#[cfg(all(feature = "cuda", any(test, doctest)))]
fn init_gpu() {
use cryptography_cuda::init_cuda_rs;

let mut init = GPU_INIT.lock().unwrap();
if *init == 0 {
println!("Init GPU!");
init_cuda_rs();
*init = 1;
}
}

/// Four (~64 bit) field elements gives ~128 bit security.
pub const SALT_SIZE: usize = 4;
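
Aside (not part of the diff): the GPU_INIT static above is a Lazy<Arc<Mutex<u64>>> used purely as a run-once flag so that tests initialize CUDA a single time per process. The same guarantee can be expressed with std::sync::Once; a minimal sketch under that assumption, keeping the crate's init_cuda_rs call behind the cuda feature:

use std::sync::Once;

static GPU_ONCE: Once = Once::new();

/// Initialize the GPU at most once per (test) process.
fn init_gpu_once() {
    GPU_ONCE.call_once(|| {
        println!("Init GPU!");
        #[cfg(feature = "cuda")]
        cryptography_cuda::init_cuda_rs();
    });
}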

@@ -307,10 +323,22 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
timing: &mut TimingTree,
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
let pols = polynomials.len();
let degree = polynomials[0].len();
let log_n = log2_strict(degree);
<<<<<<< HEAD

if log_n + rate_bits > 1 && polynomials.len() > 0 {
=======

#[cfg(any(test, doctest))]
init_gpu();

if log_n + rate_bits > 1
&& polynomials.len() > 0
&& pols * (1 << (log_n + rate_bits)) < (1 << 31)
{
>>>>>>> 8a00c2bc54a76355a0bf73dcaabb560d688cab4d
let _num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
@@ -347,17 +375,17 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
}

#[cfg(feature = "cuda")]
pub fn from_coeffs_gpu(
fn from_coeffs_gpu(
polynomials: &[PolynomialCoeffs<F>],
rate_bits: usize,
_blinding: bool,
blinding: bool,
cap_height: usize,
timing: &mut TimingTree,
_fft_root_table: Option<&FftRootTable<F>>,
log_n: usize,
_degree: usize,
) -> MerkleTree<F, <C as GenericConfig<D>>::Hasher> {
// let salt_size = if blinding { SALT_SIZE } else { 0 };
let salt_size = if blinding { SALT_SIZE } else { 0 };
// println!("salt_size: {:?}", salt_size);
let output_domain_size = log_n + rate_bits;

@@ -370,8 +398,9 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
let total_num_of_fft = polynomials.len();
// println!("total_num_of_fft: {:?}", total_num_of_fft);

let num_of_cols = total_num_of_fft + salt_size; // if blinding, extend by salt_size
let total_num_input_elements = total_num_of_fft * (1 << log_n);
let total_num_output_elements = total_num_of_fft * (1 << output_domain_size);
let total_num_output_elements = num_of_cols * (1 << output_domain_size);

let mut gpu_input: Vec<F> = polynomials
.into_iter()
@@ -385,6 +414,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
cfg_lde.is_multi_gpu = true;
cfg_lde.salt_size = salt_size as u32;

let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0 as i32, total_num_output_elements).unwrap();
@@ -415,7 +445,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
}

let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = total_num_of_fft as u32;
cfg_trans.batches = num_of_cols as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;

@@ -440,10 +470,14 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
MerkleTree::new_from_gpu_leaves(
&device_transpose_data,
1 << output_domain_size,
total_num_of_fft,
num_of_cols,
cap_height
)
);

drop(device_transpose_data);
drop(device_output_data);

mt
}

@@ -453,11 +487,123 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
blinding: bool,
fft_root_table: Option<&FftRootTable<F>>,
) -> Vec<Vec<F>> {
#[cfg(all(feature = "cuda", any(test, doctest)))]
init_gpu();

let degree = polynomials[0].len();
// If blinding, salt with two random elements to each leaf vector.
let salt_size = if blinding { SALT_SIZE } else { 0 };
// println!("salt_size: {:?}", salt_size);
<<<<<<< HEAD

=======

#[cfg(all(feature = "cuda", feature = "batch"))]
let num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();
// let num_gpus: usize = 1;
#[cfg(all(feature = "cuda", feature = "batch"))]
println!("get num of gpus: {:?}", num_gpus);
#[cfg(all(feature = "cuda", feature = "batch"))]
let total_num_of_fft = polynomials.len();
// println!("total_num_of_fft: {:?}", total_num_of_fft);
#[cfg(all(feature = "cuda", feature = "batch"))]
let per_device_batch = total_num_of_fft.div_ceil(num_gpus);

#[cfg(all(feature = "cuda", feature = "batch"))]
let chunk_size = total_num_of_fft.div_ceil(num_gpus);

#[cfg(all(feature = "cuda", feature = "batch"))]
if log_n > 10 && polynomials.len() > 0 {
println!("log_n: {:?}", log_n);
let start_lde = std::time::Instant::now();

// let poly_chunk = polynomials;
// let id = 0;
let ret = polynomials
.par_chunks(chunk_size)
.enumerate()
.flat_map(|(id, poly_chunk)| {
println!(
"invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}",
id, per_device_batch
);

let start = std::time::Instant::now();

let input_domain_size = 1 << log2_strict(degree);
let device_input_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(
id as i32,
input_domain_size * polynomials.len(),
)
.unwrap();
let device_input_data = std::sync::RwLock::new(device_input_data);

poly_chunk.par_iter().enumerate().for_each(|(i, p)| {
// println!("copy for index: {:?}", i);
let _guard = device_input_data.read().unwrap();
let _ = _guard.copy_from_host_offset(
p.coeffs.as_slice(),
input_domain_size * i,
input_domain_size,
);
});

println!("data transform elapsed: {:?}", start.elapsed());
let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = per_device_batch as u32;
cfg_lde.extension_rate_bits = rate_bits as u32;
cfg_lde.are_inputs_on_device = true;
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
println!(
"start cuda_malloc with elements: {:?}",
(1 << log_n) * per_device_batch
);
let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch)
.unwrap();

let start = std::time::Instant::now();
lde_batch::<F>(
id,
device_output_data.as_mut_ptr(),
device_input_data.read().unwrap().as_ptr(),
log2_strict(degree),
cfg_lde,
);
println!("real lde_batch elapsed: {:?}", start.elapsed());
let start = std::time::Instant::now();
let nums: Vec<usize> = (0..poly_chunk.len()).collect();
let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; 1 << log_n];
let _ = device_output_data.copy_to_host_offset(
host_data.as_mut_slice(),
(1 << log_n) * i,
1 << log_n,
);
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
r
})
.chain(
(0..salt_size)
.into_par_iter()
.map(|_| F::rand_vec(degree << rate_bits)),
)
.collect();
println!("real lde elapsed: {:?}", start_lde.elapsed());
return ret;
}

>>>>>>> 8a00c2bc54a76355a0bf73dcaabb560d688cab4d
let ret = polynomials
.par_iter()
.map(|p| {
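
For orientation (not part of the diff): the new lde_values CUDA branch above splits the polynomial batch into one contiguous chunk per GPU, with chunk_size = total_num_of_fft.div_ceil(num_gpus), runs lde_batch on each device in parallel, and then chains salt_size random columns when blinding. A CPU-only sketch of just that chunk-per-device layout, using placeholder types and names rather than the crate's API:

use rayon::prelude::*;

/// Split `columns` into `num_devices` contiguous chunks and process the chunks in
/// parallel, tagging each column with the device index it would be sent to.
fn split_across_devices(columns: Vec<Vec<u64>>, num_devices: usize) -> Vec<(usize, Vec<u64>)> {
    assert!(num_devices > 0 && !columns.is_empty());
    let chunk_size = columns.len().div_ceil(num_devices);
    columns
        .par_chunks(chunk_size)
        .enumerate()
        .flat_map(|(device_id, chunk)| {
            // In the real code this is where the chunk is copied to GPU `device_id`
            // and extended via `lde_batch`; here each column just passes through.
            chunk
                .iter()
                .map(|col| (device_id, col.clone()))
                .collect::<Vec<_>>()
        })
        .collect()
}
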
2 changes: 1 addition & 1 deletion plonky2/src/gates/gate.rs
@@ -8,7 +8,7 @@ use core::ops::Range;
use std::sync::Arc;

use hashbrown::HashMap;
use serde::{ Serialize, Serializer};
use serde::{Serialize, Serializer};

use crate::field::batch_util::batch_multiply_inplace;
use crate::field::extension::{Extendable, FieldExtension};
6 changes: 1 addition & 5 deletions plonky2/src/gates/low_degree_interpolation.rs
@@ -84,11 +84,7 @@ impl<F: RichField + Extendable<D>, const D: usize> Gate<F, D> for LowDegreeInter
fn id(&self) -> String {
format!("{self:?}<D={D}>")
}
fn serialize(
&self,
dst: &mut Vec<u8>,
_common_data: &CommonCircuitData<F, D>,
) -> IoResult<()> {
fn serialize(&self, dst: &mut Vec<u8>, _common_data: &CommonCircuitData<F, D>) -> IoResult<()> {
dst.write_usize(self.subgroup_bits)?;
Ok(())
}
13 changes: 10 additions & 3 deletions plonky2/src/hash/arch/x86_64/goldilocks_avx2.rs
@@ -41,9 +41,16 @@ pub fn add_avx_a_sc(a_sc: &__m256i, b: &__m256i) -> __m256i {

#[inline(always)]
pub fn add_avx(a: &__m256i, b: &__m256i) -> __m256i {
let a_sc = shift_avx(a);
// let a_sc = toCanonical_avx_s(&a_s);
add_avx_a_sc(&a_sc, b)
unsafe {
let msb = _mm256_set_epi64x(MSB_1, MSB_1, MSB_1, MSB_1);
let a_sc = _mm256_xor_si256(*a, msb);
let c0_s = _mm256_add_epi64(a_sc, *b);
let p_n = _mm256_set_epi64x(P_N_1, P_N_1, P_N_1, P_N_1);
let mask_ = _mm256_cmpgt_epi64(a_sc, c0_s);
let corr_ = _mm256_and_si256(mask_, p_n);
let c_s = _mm256_add_epi64(c0_s, corr_);
_mm256_xor_si256(c_s, msb)
}
}
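
Context for the change above (not part of the diff): add_avx now inlines a branchless Goldilocks addition. One operand is biased by 2^63 (the xor with MSB_1, presumably 0x8000000000000000), so the unsigned wrap of the addition can be detected with the signed comparison _mm256_cmpgt_epi64 against the biased sum; on a wrap, the amount 2^64 mod p = 2^32 - 1 (presumably the P_N_1 constant) is added back, and a final xor removes the bias. A scalar sketch of the same reduction idea, offered as an illustration rather than the crate's implementation:

/// p = 2^64 - 2^32 + 1, so 2^64 mod p = 2^32 - 1.
const EPSILON: u64 = (1 << 32) - 1;

/// Branchless-style Goldilocks addition; the result may be non-canonical
/// (in [0, 2^64) rather than [0, p)), matching the lazy reduction used above.
fn add_goldilocks(a: u64, b: u64) -> u64 {
    let (sum, wrapped) = a.overflowing_add(b);
    // If the 64-bit add wrapped, compensate by adding 2^64 mod p = EPSILON.
    let (sum, wrapped_again) = sum.overflowing_add(if wrapped { EPSILON } else { 0 });
    // A second wrap is only possible for non-canonical inputs; fold once more.
    sum.wrapping_add(if wrapped_again { EPSILON } else { 0 })
}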

#[inline(always)]
