Skip to content

Commit

Permalink
Implement potential
Browse files Browse the repository at this point in the history
General strategy: modify permutation to
1. sort each group of atoms into a contiguous block
2. hilbert sort each group separately
  • Loading branch information
mcwitt committed Feb 15, 2022
1 parent 980185e commit 945e62a
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 17 deletions.
43 changes: 43 additions & 0 deletions timemachine/cpp/src/kernels/k_nonbonded.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,49 @@ void __global__ k_coords_to_kv(
vals[atom_idx] = atom_idx;
}

// TODO: DRY with k_coords_to_kv
void __global__ k_coords_to_kv_gather(
const int N,
const unsigned int *atom_idxs,
const double *coords,
const double *box,
const unsigned int *bin_to_idx,
unsigned int *keys,
unsigned int *vals) {

const int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx >= N) {
return;
}

const int atom_idx = atom_idxs[idx];

// these coords have to be centered
double bx = box[0 * 3 + 0];
double by = box[1 * 3 + 1];
double bz = box[2 * 3 + 2];

double binWidth = max(max(bx, by), bz) / 255.0;

double x = coords[atom_idx * 3 + 0];
double y = coords[atom_idx * 3 + 1];
double z = coords[atom_idx * 3 + 2];

x -= bx * floor(x / bx);
y -= by * floor(y / by);
z -= bz * floor(z / bz);

unsigned int bin_x = x / binWidth;
unsigned int bin_y = y / binWidth;
unsigned int bin_z = z / binWidth;

keys[idx] = bin_to_idx[bin_x * 256 * 256 + bin_y * 256 + bin_z];
// uncomment below if you want to preserve the atom ordering
// keys[idx] = atom_idx;
vals[idx] = atom_idx;
}

void __global__ k_arange(int N, unsigned int *arr) {
const int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (atom_idx >= N) {
Expand Down
10 changes: 10 additions & 0 deletions timemachine/cpp/src/kernels/k_nonbonded.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@ void __global__ k_coords_to_kv(
unsigned int *keys,
unsigned int *vals);

// variant of k_coords_to_kv allowing the selection of a subset of coordinates
void __global__ k_coords_to_kv_gather(
const int N, // number of atoms in selection
const unsigned int *atom_idxs, // [N] indices of atoms to select
const double *coords,
const double *box,
const unsigned int *bin_to_idx,
unsigned int *keys,
unsigned int *vals);

template <typename RealType>
void __global__ k_check_rebuild_box(const int N, const double *new_box, const double *old_box, int *rebuild) {

Expand Down
52 changes: 36 additions & 16 deletions timemachine/cpp/src/nonbonded_interaction_group.cu
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ NonbondedInteractionGroup<RealType, Interpolated>::NonbondedInteractionGroup(
const double beta,
const double cutoff,
const std::string &kernel_src)
: N_(lambda_offset_idxs.size()), NR_(row_atom_idxs.size()), NC_(N_ - NR_), cutoff_(cutoff), nblist_(N_),
: N_(lambda_offset_idxs.size()), NR_(row_atom_idxs.size()), NC_(N_ - NR_), cutoff_(cutoff), nblist_(NC_, NR_),
beta_(beta), d_sort_storage_(nullptr), d_sort_storage_bytes_(0), nblist_padding_(0.1), disable_hilbert_(false),
kernel_ptrs_({// enumerate over every possible kernel combination
// U: Compute U
Expand Down Expand Up @@ -160,7 +160,13 @@ NonbondedInteractionGroup<RealType, Interpolated>::NonbondedInteractionGroup(

// estimate size needed to do radix sorting, this can use uninitialized data.
cub::DeviceRadixSort::SortPairs(
d_sort_storage_, d_sort_storage_bytes_, d_sort_keys_in_, d_sort_keys_out_, d_sort_vals_in_, d_perm_, N_);
d_sort_storage_,
d_sort_storage_bytes_,
d_sort_keys_in_,
d_sort_keys_out_,
d_sort_vals_in_,
d_perm_,
std::max(NC_, NR_));

gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMalloc(&d_sort_storage_, d_sort_storage_bytes_));
Expand Down Expand Up @@ -215,12 +221,18 @@ void NonbondedInteractionGroup<RealType, Interpolated>::disable_hilbert_sort() {

template <typename RealType, bool Interpolated>
void NonbondedInteractionGroup<RealType, Interpolated>::hilbert_sort(
const double *d_coords, const double *d_box, cudaStream_t stream) {
const int N,
const unsigned int *d_atom_idxs,
const double *d_coords,
const double *d_box,
unsigned int *d_perm,
cudaStream_t stream) {

const int tpb = 32;
const int B = (N_ + tpb - 1) / tpb;
const int B = ceil_divide(N, tpb);

k_coords_to_kv<<<B, tpb, 0, stream>>>(N_, d_coords, d_box, d_bin_to_idx_, d_sort_keys_in_, d_sort_vals_in_);
k_coords_to_kv_gather<<<B, tpb, 0, stream>>>(
N, d_atom_idxs, d_coords, d_box, d_bin_to_idx_, d_sort_keys_in_, d_sort_vals_in_);

gpuErrchk(cudaPeekAtLastError());

Expand All @@ -230,8 +242,8 @@ void NonbondedInteractionGroup<RealType, Interpolated>::hilbert_sort(
d_sort_keys_in_,
d_sort_keys_out_,
d_sort_vals_in_,
d_perm_,
N_,
d_perm,
N,
0, // begin bit
sizeof(*d_sort_keys_in_) * 8, // end bit
stream // cudaStream
Expand Down Expand Up @@ -285,37 +297,43 @@ void NonbondedInteractionGroup<RealType, Interpolated>::execute_device(
// identify which tiles contain interpolated parameters

const int tpb = 32;
const int B = (N + tpb - 1) / tpb;

dim3 dimGrid(B, 3, 1);
const int B = ceil_divide(N_, tpb);

// (ytz) see if we need to rebuild the neighborlist.
// (ytz + jfass): note that this logic needs to change if we use NPT later on since a resize in the box
// can introduce new interactions.
k_check_rebuild_coords_and_box<RealType>
<<<B, tpb, 0, stream>>>(N, d_x, d_nblist_x_, d_box, d_nblist_box_, nblist_padding_, d_rebuild_nblist_);
<<<B, tpb, 0, stream>>>(N_, d_x, d_nblist_x_, d_box, d_nblist_box_, nblist_padding_, d_rebuild_nblist_);

gpuErrchk(cudaPeekAtLastError());

// we can optimize this away by doing the check on the GPU directly.
gpuErrchk(cudaMemcpyAsync(
p_rebuild_nblist_, d_rebuild_nblist_, 1 * sizeof(*p_rebuild_nblist_), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaStreamSynchronize(stream)); // slow!

dim3 dimGrid(B, 3, 1);

if (p_rebuild_nblist_[0] > 0) {

// (ytz): update the permutation index before building neighborlist, as the neighborlist is tied
// to a particular sort order
if (!disable_hilbert_) {
this->hilbert_sort(d_x, d_box, stream);
this->hilbert_sort(NC_, d_col_atom_idxs_, d_x, d_box, d_perm_, stream);
this->hilbert_sort(NR_, d_row_atom_idxs_, d_x, d_box, d_perm_ + NC_, stream);
} else {
k_arange<<<B, tpb, 0, stream>>>(N, d_perm_);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpyAsync(
d_perm_, d_col_atom_idxs_, NC_ * sizeof(*d_col_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
gpuErrchk(cudaMemcpyAsync(
d_perm_ + NC_, d_row_atom_idxs_, NR_ * sizeof(*d_row_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
}

// compute new coordinates, new lambda_idxs, new_plane_idxs
k_permute<<<dimGrid, tpb, 0, stream>>>(N, d_perm_, d_x, d_sorted_x_);
k_permute<<<dimGrid, tpb, 0, stream>>>(N_, d_perm_, d_x, d_sorted_x_);
gpuErrchk(cudaPeekAtLastError());
nblist_.build_nblist_device(N, d_sorted_x_, d_box, cutoff_ + nblist_padding_, stream);

nblist_.build_nblist_device(
NC_, NR_, d_sorted_x_, d_sorted_x_ + 3 * NC_, d_box, cutoff_ + nblist_padding_, stream);
gpuErrchk(cudaMemcpyAsync(
p_ixn_count_, nblist_.get_ixn_count(), 1 * sizeof(*p_ixn_count_), cudaMemcpyDeviceToHost, stream));

Expand All @@ -325,6 +343,7 @@ void NonbondedInteractionGroup<RealType, Interpolated>::execute_device(
// then a particle can interact with multiple periodic copies.
const double db_cutoff = (cutoff_ + nblist_padding_) * 2;
cudaStreamSynchronize(stream);

// Verify that box is orthogonal and the width of the box in all dimensions is greater than twice the cutoff
for (int i = 0; i < 9; i++) {
if (i == 0 || i == 4 || i == 8) {
Expand Down Expand Up @@ -369,6 +388,7 @@ void NonbondedInteractionGroup<RealType, Interpolated>::execute_device(

// update new w coordinates
// (tbd): cache lambda value for equilibrium calculations
// TODO: skip computing w coords for non-interacting atoms?
CUresult result = compute_w_coords_instance_.configure(B, tpb, 0, stream)
.launch(N, lambda, cutoff_, d_lambda_plane_idxs_, d_lambda_offset_idxs_, d_w_, d_dw_dl_);
if (result != 0) {
Expand Down
8 changes: 7 additions & 1 deletion timemachine/cpp/src/nonbonded_interaction_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,13 @@ template <typename RealType, bool Interpolated> class NonbondedInteractionGroup

bool disable_hilbert_;

void hilbert_sort(const double *d_x, const double *d_box, cudaStream_t stream);
void hilbert_sort(
const int N,
const unsigned int *d_atom_idxs,
const double *d_x,
const double *d_box,
unsigned int *d_perm,
cudaStream_t stream);

jitify::JitCache kernel_cache_;
jitify::KernelInstantiation compute_w_coords_instance_;
Expand Down

0 comments on commit 945e62a

Please sign in to comment.