add point value lde #39

Closed
wants to merge 10 commits into from

Changes from 4 commits
1 change: 1 addition & 0 deletions field/src/polynomial/mod.rs
@@ -152,6 +152,7 @@ impl<F: Field> PolynomialCoeffs<F> {
.collect()
}

/// WIP: We could try a tree-based Estrin's scheme here; it might speed this up.
pub fn eval(&self, x: F) -> F {
self.coeffs
.iter()
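The WIP note above mentions Estrin's scheme; for reference, a minimal sketch of the idea (new illustrative code, not part of this PR, written over plain `f64` rather than the crate's `Field` trait):

```rust
// Estrin's scheme: repeatedly combine adjacent coefficient pairs with the
// current power of x (x, x^2, x^4, ...). The combination tree has depth
// log2(n), and the pairs at each level are independent, which exposes
// instruction-level parallelism that Horner's rule cannot.
fn eval_estrin(coeffs: &[f64], x: f64) -> f64 {
    if coeffs.is_empty() {
        return 0.0;
    }
    let mut level = coeffs.to_vec();
    let mut power = x; // x^(2^k) at level k
    while level.len() > 1 {
        level = level
            .chunks(2)
            .map(|pair| {
                if pair.len() == 2 {
                    // Fold the odd-indexed coefficient in with the current power.
                    pair[0] + pair[1] * power
                } else {
                    pair[0]
                }
            })
            .collect();
        power *= power;
    }
    level[0]
}

fn main() {
    // p(x) = 1 + 2x + 3x^2 at x = 2 is 17.
    assert_eq!(eval_estrin(&[1.0, 2.0, 3.0], 2.0), 17.0);
}
```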
326 changes: 162 additions & 164 deletions plonky2/src/fri/oracle.rs
@@ -74,6 +74,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize> D
impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
PolynomialBatch<F, C, D>
{
#[cfg(not(feature = "cuda"))]
/// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
pub fn from_values(
values: Vec<PolynomialValues<F>>,
@@ -82,6 +83,25 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
cap_height: usize,
timing: &mut TimingTree,
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
Self::from_values_cpu(
values,
rate_bits,
blinding,
cap_height,
timing,
fft_root_table,
)
}

/// Creates a list polynomial commitment for the polynomials interpolating the values in `values`.
pub fn from_values_cpu(
values: Vec<PolynomialValues<F>>,
rate_bits: usize,
blinding: bool,
cap_height: usize,
timing: &mut TimingTree,
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
let coeffs = timed!(
@@ -90,53 +110,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
values.into_par_iter().map(|v| v.ifft()).collect::<Vec<_>>()
);

// #[cfg(all(feature = "cuda", feature = "batch"))]
// let degree = values[0].len();
// #[cfg(all(feature = "cuda", feature = "batch"))]
// let log_n = log2_strict(degree);

// #[cfg(all(feature = "cuda", feature = "batch"))]
// let num_gpus: usize = std::env::var("NUM_OF_GPUS")
// .expect("NUM_OF_GPUS should be set")
// .parse()
// .unwrap();
// // let num_gpus = 1;
// #[cfg(all(feature = "cuda", feature = "batch"))]
// let total_num_of_fft = values.len();
// #[cfg(all(feature = "cuda", feature = "batch"))]
// let per_device_batch = total_num_of_fft.div_ceil(num_gpus);

// #[cfg(all(feature = "cuda", feature = "batch"))]
// let chunk_size = total_num_of_fft.div_ceil(num_gpus);
// #[cfg(all(feature = "cuda", feature = "batch"))]
// println!(
// "invoking intt_batch, total_nums: {:?}, log_n: {:?}, num_gpus: {:?}",
// total_num_of_fft, log_n, num_gpus
// );

// #[cfg(all(feature = "cuda", feature = "batch"))]
// let coeffs = timed!(
// timing,
// "IFFT",
// values
// .par_chunks(chunk_size)
// .enumerate()
// .flat_map(|(id, poly_chunk)| {
// let mut polys_values: Vec<F> =
// poly_chunk.iter().flat_map(|p| p.values.clone()).collect();
// let mut ntt_cfg = NTTConfig::default();
// ntt_cfg.batches = per_device_batch as u32;

// intt_batch(id, polys_values.as_mut_ptr(), log_n, ntt_cfg);
// polys_values
// .chunks(1 << log_n)
// .map(|buffer| PolynomialCoeffs::new(buffer.to_vec()))
// .collect::<Vec<PolynomialCoeffs<F>>>()
// })
// .collect()
// );

Self::from_coeffs(
Self::from_coeffs_cpu(
coeffs,
rate_bits,
blinding,
@@ -146,6 +120,146 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
)
}
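As a sanity check on the `ifft` step above, here is a tiny round-trip example. The import paths assume the upstream plonky2 crate layout; this fork's local `field` crate paths may differ:

```rust
use plonky2::field::goldilocks_field::GoldilocksField as F;
use plonky2::field::polynomial::PolynomialValues;
use plonky2::field::types::Field;

fn main() {
    // Interpolate four evaluations over the size-4 subgroup, as the first
    // step of `from_values_cpu` does for each column.
    let values = PolynomialValues::new(vec![F::ONE, F::TWO, F::ZERO, F::ONE]);
    let coeffs = values.clone().ifft();
    // Evaluating the interpolant back over the same subgroup recovers the values.
    assert_eq!(coeffs.fft().values, values.values);
}
```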

#[cfg(feature = "cuda")]
pub fn from_values(
values: Vec<PolynomialValues<F>>,
rate_bits: usize,
blinding: bool,
cap_height: usize,
timing: &mut TimingTree,
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
let degree = values[0].len();
let log_n = log2_strict(degree);

if log_n + rate_bits > 1 && !values.is_empty() {
// Fail fast if NUM_OF_GPUS is missing; the value itself is re-read
// inside `from_values_gpu`.
let _num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();

Self::from_values_gpu(
values.as_slice(),
rate_bits,
blinding,
cap_height,
timing,
fft_root_table,
log_n,
degree,
)
} else {
Self::from_values_cpu(
values,
rate_bits,
blinding,
cap_height,
timing,
fft_root_table,
)
}
}
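Both branches lean on a `NUM_OF_GPUS` environment variable. A minimal sketch of that contract as this PR uses it (my reading of the code above: the variable must be present and parse as an unsigned integer; nothing here validates it against the actual device count):

```rust
use std::env;

// Mirrors the lookup used in `from_values` / `from_values_gpu`.
fn num_gpus_from_env() -> usize {
    env::var("NUM_OF_GPUS")
        .expect("NUM_OF_GPUS should be set")
        .parse()
        .expect("NUM_OF_GPUS should be an unsigned integer")
}

fn main() {
    env::set_var("NUM_OF_GPUS", "2");
    assert_eq!(num_gpus_from_env(), 2);
}
```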

#[cfg(feature = "cuda")]
pub fn from_values_gpu(
values: &[PolynomialValues<F>],
rate_bits: usize,
_blinding: bool,
cap_height: usize,
timing: &mut TimingTree,
_fft_root_table: Option<&FftRootTable<F>>,
log_n: usize,
_degree: usize,
) -> Self {
let output_domain_size = log_n + rate_bits;

let num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();
let total_num_of_fft = values.len();

let total_num_output_elements = total_num_of_fft * (1 << output_domain_size);

// Flatten all polynomial evaluations into one contiguous host buffer.
let mut gpu_input: Vec<F> = values
.iter()
.flat_map(|v| v.values.iter().cloned())
.collect();

// Batched multi-GPU coset LDE over the evaluation form; keep the expanded output on-device.
let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = total_num_of_fft as u32;
cfg_lde.extension_rate_bits = rate_bits as u32;
cfg_lde.are_inputs_on_device = false;
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
cfg_lde.is_multi_gpu = true;
cfg_lde.is_coeffs = false;

let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0i32, total_num_output_elements).unwrap();
if num_gpus == 1 {
let _ = timed!(
timing,
"LDE on 1 GPU",
lde_batch(
0,
device_output_data.as_mut_ptr(),
gpu_input.as_mut_ptr(),
log_n,
cfg_lde.clone()
)
);
} else {
let _ = timed!(
timing,
"LDE on multi GPU",
lde_batch_multi_gpu::<F>(
device_output_data.as_mut_ptr(),
gpu_input.as_mut_ptr(),
num_gpus,
cfg_lde.clone(),
log_n,
)
);
}

let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = total_num_of_fft as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;

let mut device_transpose_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0i32, total_num_output_elements).unwrap();

let _ = timed!(
timing,
"transpose",
transpose_rev_batch(
0i32,
device_transpose_data.as_mut_ptr(),
device_output_data.as_mut_ptr(),
output_domain_size,
cfg_trans
)
);

timed!(
timing,
"Merkle tree with GPU data",
MerkleTree::new_from_gpu_leaves(
&device_transpose_data,
1 << output_domain_size,
total_num_of_fft,
cap_height
)
)
}
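For intuition about the `transpose_rev_batch` call above, here is a CPU model of the layout change I believe it performs, based on how its output feeds `MerkleTree::new_from_gpu_leaves`: polynomial-major LDE output becomes point-major Merkle leaves, with leaf rows in bit-reversed order. This is an assumption about the kernel's semantics, not its actual code:

```rust
// CPU model: `input` holds `batches` polynomials of length n = 1 << log_n,
// stored contiguously one polynomial after another. The output has one row
// per evaluation point (a Merkle leaf of width `batches`), with rows
// permuted into bit-reversed index order.
fn transpose_rev_cpu<T: Copy>(input: &[T], log_n: usize, batches: usize) -> Vec<Vec<T>> {
    let n = 1usize << log_n;
    assert_eq!(input.len(), n * batches);
    (0..n)
        .map(|row| {
            // Bit-reverse the row index within log_n bits.
            let src = row.reverse_bits() >> (usize::BITS as usize - log_n);
            (0..batches).map(|b| input[b * n + src]).collect()
        })
        .collect()
}

fn main() {
    // Two "polynomials" of length 4: [0, 1, 2, 3] and [10, 11, 12, 13].
    let leaves = transpose_rev_cpu(&[0, 1, 2, 3, 10, 11, 12, 13], 2, 2);
    // Bit-reversed order of indices 0..4 is [0, 2, 1, 3].
    assert_eq!(leaves, vec![vec![0, 10], vec![2, 12], vec![1, 11], vec![3, 13]]);
}
```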

/// Creates a list polynomial commitment for the polynomials `polynomials`.
pub fn from_coeffs_cpu(
polynomials: Vec<PolynomialCoeffs<F>>,
@@ -212,13 +326,7 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
let degree = polynomials[0].len();
let log_n = log2_strict(degree);

#[cfg(any(test, doctest))]
init_gpu();

if log_n + rate_bits > 1
&& polynomials.len() > 0
&& pols * (1 << (log_n + rate_bits)) < (1 << 31)
{
if log_n + rate_bits > 1 && !polynomials.is_empty() {
let _num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
Expand Down Expand Up @@ -320,8 +428,6 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
num_gpus,
cfg_lde.clone(),
log_n,
total_num_input_elements,
total_num_output_elements,
)
);
}
@@ -373,118 +479,10 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
init_gpu();

let degree = polynomials[0].len();
#[cfg(all(feature = "cuda", feature = "batch"))]
let log_n = log2_strict(degree) + rate_bits;

// If blinding, salt with two random elements to each leaf vector.
let salt_size = if blinding { SALT_SIZE } else { 0 };
// println!("salt_size: {:?}", salt_size);

#[cfg(all(feature = "cuda", feature = "batch"))]
let num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();
// let num_gpus: usize = 1;
#[cfg(all(feature = "cuda", feature = "batch"))]
println!("get num of gpus: {:?}", num_gpus);
#[cfg(all(feature = "cuda", feature = "batch"))]
let total_num_of_fft = polynomials.len();
// println!("total_num_of_fft: {:?}", total_num_of_fft);
#[cfg(all(feature = "cuda", feature = "batch"))]
let per_device_batch = total_num_of_fft.div_ceil(num_gpus);

#[cfg(all(feature = "cuda", feature = "batch"))]
let chunk_size = total_num_of_fft.div_ceil(num_gpus);

#[cfg(all(feature = "cuda", feature = "batch"))]
if log_n > 10 && polynomials.len() > 0 {
println!("log_n: {:?}", log_n);
let start_lde = std::time::Instant::now();

// let poly_chunk = polynomials;
// let id = 0;
let ret = polynomials
.par_chunks(chunk_size)
.enumerate()
.flat_map(|(id, poly_chunk)| {
println!(
"invoking ntt_batch, device_id: {:?}, per_device_batch: {:?}",
id, per_device_batch
);

let start = std::time::Instant::now();

let input_domain_size = 1 << log2_strict(degree);
let device_input_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(
id as i32,
input_domain_size * polynomials.len(),
)
.unwrap();
let device_input_data = std::sync::RwLock::new(device_input_data);

poly_chunk.par_iter().enumerate().for_each(|(i, p)| {
// println!("copy for index: {:?}", i);
let _guard = device_input_data.read().unwrap();
let _ = _guard.copy_from_host_offset(
p.coeffs.as_slice(),
input_domain_size * i,
input_domain_size,
);
});

println!("data transform elapsed: {:?}", start.elapsed());
let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = per_device_batch as u32;
cfg_lde.extension_rate_bits = rate_bits as u32;
cfg_lde.are_inputs_on_device = true;
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
println!(
"start cuda_malloc with elements: {:?}",
(1 << log_n) * per_device_batch
);
let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(id as i32, (1 << log_n) * per_device_batch)
.unwrap();

let start = std::time::Instant::now();
lde_batch::<F>(
id,
device_output_data.as_mut_ptr(),
device_input_data.read().unwrap().as_ptr(),
log2_strict(degree),
cfg_lde,
);
println!("real lde_batch elapsed: {:?}", start.elapsed());
let start = std::time::Instant::now();
let nums: Vec<usize> = (0..poly_chunk.len()).collect();
let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; 1 << log_n];
let _ = device_output_data.copy_to_host_offset(
host_data.as_mut_slice(),
(1 << log_n) * i,
1 << log_n,
);
PolynomialValues::new(host_data).values
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
r
})
.chain(
(0..salt_size)
.into_par_iter()
.map(|_| F::rand_vec(degree << rate_bits)),
)
.collect();
println!("real lde elapsed: {:?}", start_lde.elapsed());
return ret;
}

let ret = polynomials
.par_iter()
.map(|p| {