Fix Transpose Data Copy #14

Merged · 3 commits · Mar 26, 2024
2 changes: 1 addition & 1 deletion depends/cryptography_cuda
2 changes: 1 addition & 1 deletion field/Cargo.toml
@@ -35,4 +35,4 @@ quote = "1"
default = ["no_cuda"]
cuda = ["cryptography_cuda/cuda"]
precompile = []
-no_cuda = ["cryptography_cuda/no_cuda"]
+no_cuda = ["cryptography_cuda/no_cuda"]
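
For context (not part of the diff): cuda and no_cuda select mutually exclusive backends of the cryptography_cuda dependency, and no_cuda is the default, so the GPU path is opted into with cargo build --no-default-features --features cuda. Below is a minimal sketch of how such a feature gate is consumed in code; the function name backend() is hypothetical.

// Hypothetical illustration of the cuda/no_cuda feature gate; not from
// this repository.
#[cfg(feature = "cuda")]
fn backend() -> &'static str {
    "cryptography_cuda (GPU)"
}

#[cfg(not(feature = "cuda"))]
fn backend() -> &'static str {
    "CPU (no_cuda)"
}

fn main() {
    println!("LDE backend: {}", backend());
}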
146 changes: 145 additions & 1 deletion plonky2/src/fri/oracle.rs
@@ -4,7 +4,7 @@ use alloc::vec::Vec;
#[cfg(feature = "cuda")]
use cryptography_cuda::{
device::memory::HostOrDeviceSlice, device::stream::CudaStream, intt_batch, lde_batch,
-ntt_batch, types::*,
+ntt_batch, lde_batch_multi_gpu, transpose_rev_batch, types::*,
};
use itertools::Itertools;
use plonky2_field::types::Field;
@@ -140,6 +140,44 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
fft_root_table: Option<&FftRootTable<F>>,
) -> Self {
let degree = polynomials[0].len();

#[cfg(feature = "cuda")]
let log_n = log2_strict(degree);

// GPU fast path: run the batched LDE and the transpose on device, then
// build the Merkle tree from the copied-back leaves.
#[cfg(feature = "cuda")]
if log_n + rate_bits > 1 && !polynomials.is_empty() {
println!("invoke from_coeffs_gpu with log_n: {:?}", log_n);
let lde_values = Self::from_coeffs_gpu(
&polynomials,
rate_bits,
blinding,
cap_height,
timing,
fft_root_table,
log_n,
degree
);

// NUM_OF_GPUS is read here only to fail fast when it is unset; the
// parsed value itself is unused on this path.
let _num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();

let merkle_tree = timed!(
timing,
"build Merkle tree",
MerkleTree::new(lde_values, cap_height)
);

return Self {
polynomials,
merkle_tree,
degree_log: log2_strict(degree),
rate_bits,
blinding,
};
}

let lde_values = timed!(
timing,
"FFT + blinding",
@@ -163,6 +201,112 @@ impl<F: RichField + Extendable<D>, C: GenericConfig<D, F = F>, const D: usize>
}
}

#[cfg(feature = "cuda")]
pub fn from_coeffs_gpu(
polynomials: &[PolynomialCoeffs<F>],
rate_bits: usize,
blinding: bool,
_cap_height: usize,
_timing: &mut TimingTree,
_fft_root_table: Option<&FftRootTable<F>>,
log_n: usize,
_degree: usize,
) -> Vec<Vec<F>> {
// If blinding, salt each leaf vector with two random elements.
let salt_size = if blinding { SALT_SIZE } else { 0 };
println!("salt_size: {:?}", salt_size);
// log2 of the LDE output domain size.
let output_domain_size = log_n + rate_bits;

let num_gpus: usize = std::env::var("NUM_OF_GPUS")
.expect("NUM_OF_GPUS should be set")
.parse()
.unwrap();
// let num_gpus: usize = 1;

println!("get num of gpus: {:?}", num_gpus);
let total_num_of_fft = polynomials.len();
println!("total_num_of_fft: {:?}", total_num_of_fft);

let total_num_input_elements = total_num_of_fft * (1 << log_n);
let total_num_output_elements = total_num_of_fft * (1 << output_domain_size);

let start_lde = std::time::Instant::now();

// Flatten every polynomial's coefficients into one contiguous host buffer.
let mut gpu_input: Vec<F> = polynomials
.iter()
.flat_map(|v| v.coeffs.iter().cloned())
.collect();

// Batched coset LDE across all GPUs: inputs come from the host, outputs
// stay resident on device for the follow-up transpose.
let mut cfg_lde = NTTConfig::default();
cfg_lde.batches = total_num_of_fft as u32;
cfg_lde.extension_rate_bits = rate_bits as u32;
cfg_lde.are_inputs_on_device = false;
cfg_lde.are_outputs_on_device = true;
cfg_lde.with_coset = true;
cfg_lde.is_multi_gpu = true;


// Allocate the LDE output on device 0; results remain on the GPU.
let mut device_output_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0, total_num_output_elements).unwrap();
println!("start lde_batch_multi_gpu");
lde_batch_multi_gpu::<F>(
device_output_data.as_mut_ptr(),
gpu_input.as_mut_ptr(),
num_gpus,
cfg_lde.clone(),
log_n,
total_num_input_elements,
total_num_output_elements,
);

println!("real lde_batch elapsed: {:?}", start_lde.elapsed());

// On-device batched transpose (with bit-reversal, per the kernel name):
// per-polynomial rows of evaluations become per-leaf rows for hashing.
let mut cfg_trans = TransposeConfig::default();
cfg_trans.batches = total_num_of_fft as u32;
cfg_trans.are_inputs_on_device = true;
cfg_trans.are_outputs_on_device = true;

let mut device_transpose_data: HostOrDeviceSlice<'_, F> =
HostOrDeviceSlice::cuda_malloc(0, total_num_output_elements).unwrap();

let start = std::time::Instant::now();

transpose_rev_batch(
0, // device id
device_transpose_data.as_mut_ptr(),
device_output_data.as_mut_ptr(),
output_domain_size,
cfg_trans,
);

println!("real transpose_rev_batch elapsed: {:?}", start.elapsed());

let start = std::time::Instant::now();
// Copy each transposed row back to the host in parallel: row i holds the
// i-th LDE evaluation of every polynomial, i.e. the data for leaf i.
let nums: Vec<usize> = (0..(1 << output_domain_size)).collect();
let r = nums
.par_iter()
.map(|i| {
let mut host_data: Vec<F> = vec![F::ZERO; total_num_of_fft];
device_transpose_data
.copy_to_host_offset(
host_data.as_mut_slice(),
i * total_num_of_fft,
total_num_of_fft,
)
.expect("copy to host error");
host_data
})
.collect::<Vec<Vec<F>>>();
println!("collect data from gpu used: {:?}", start.elapsed());
println!("real lde elapsed: {:?}", start_lde.elapsed());
r
}

fn lde_values(
polynomials: &[PolynomialCoeffs<F>],
rate_bits: usize,
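
To make the copy-back indexing concrete, here is a host-side reference sketch (not from the PR; plain CPU Rust over u64 toy values, and omitting the bit-reversal permutation implied by the transpose_rev_batch name). The LDE produces one row of 1 << output_domain_size evaluations per polynomial; the transpose reorders that row-major buffer so the slice at offset i * total_num_of_fft holds the i-th evaluation of every polynomial, which is exactly the leaf that copy_to_host_offset extracts above.

// Reference model of the device buffers, editorially simplified:
// rows[p][i] = i-th LDE evaluation of polynomial p (the LDE output layout);
// leaves[i][p] = rows[p][i] (the layout after the transpose).
fn transpose_rows_to_leaves(rows: &[Vec<u64>]) -> Vec<Vec<u64>> {
    let batches = rows.len();   // total_num_of_fft
    let domain = rows[0].len(); // 1 << output_domain_size
    let mut leaves = vec![vec![0u64; batches]; domain];
    for p in 0..batches {
        for i in 0..domain {
            leaves[i][p] = rows[p][i];
        }
    }
    leaves
}

fn main() {
    // Two "polynomials" with four evaluations each.
    let rows = vec![vec![1, 2, 3, 4], vec![5, 6, 7, 8]];
    let leaves = transpose_rows_to_leaves(&rows);
    assert_eq!(leaves[2], vec![3, 7]); // leaf 2 = third evaluation of each
    println!("{:?}", leaves);
}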