Implement potential

General strategy: modify permutation to 1. sort each group of atoms into a contiguous block 2. hilbert sort each group separately
proteneer · Feb 15, 2022 · 945e62a · 945e62a
1 parent 980185e
commit 945e62a
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 17 deletions.
diff --git a/timemachine/cpp/src/kernels/k_nonbonded.cu b/timemachine/cpp/src/kernels/k_nonbonded.cu
@@ -39,6 +39,49 @@ void __global__ k_coords_to_kv(
     vals[atom_idx] = atom_idx;
 }
 
+// TODO: DRY with k_coords_to_kv
+void __global__ k_coords_to_kv_gather(
+    const int N,
+    const unsigned int *atom_idxs,
+    const double *coords,
+    const double *box,
+    const unsigned int *bin_to_idx,
+    unsigned int *keys,
+    unsigned int *vals) {
+
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx >= N) {
+        return;
+    }
+
+    const int atom_idx = atom_idxs[idx];
+
+    // these coords have to be centered
+    double bx = box[0 * 3 + 0];
+    double by = box[1 * 3 + 1];
+    double bz = box[2 * 3 + 2];
+
+    double binWidth = max(max(bx, by), bz) / 255.0;
+
+    double x = coords[atom_idx * 3 + 0];
+    double y = coords[atom_idx * 3 + 1];
+    double z = coords[atom_idx * 3 + 2];
+
+    x -= bx * floor(x / bx);
+    y -= by * floor(y / by);
+    z -= bz * floor(z / bz);
+
+    unsigned int bin_x = x / binWidth;
+    unsigned int bin_y = y / binWidth;
+    unsigned int bin_z = z / binWidth;
+
+    keys[idx] = bin_to_idx[bin_x * 256 * 256 + bin_y * 256 + bin_z];
+    // uncomment below if you want to preserve the atom ordering
+    // keys[idx] = atom_idx;
+    vals[idx] = atom_idx;
+}
+
 void __global__ k_arange(int N, unsigned int *arr) {
     const int atom_idx = blockIdx.x * blockDim.x + threadIdx.x;
     if (atom_idx >= N) {

diff --git a/timemachine/cpp/src/kernels/k_nonbonded.cuh b/timemachine/cpp/src/kernels/k_nonbonded.cuh
@@ -16,6 +16,16 @@ void __global__ k_coords_to_kv(
     unsigned int *keys,
     unsigned int *vals);
 
+// variant of k_coords_to_kv allowing the selection of a subset of coordinates
+void __global__ k_coords_to_kv_gather(
+    const int N,                   // number of atoms in selection
+    const unsigned int *atom_idxs, // [N] indices of atoms to select
+    const double *coords,
+    const double *box,
+    const unsigned int *bin_to_idx,
+    unsigned int *keys,
+    unsigned int *vals);
+
 template <typename RealType>
 void __global__ k_check_rebuild_box(const int N, const double *new_box, const double *old_box, int *rebuild) {
 

diff --git a/timemachine/cpp/src/nonbonded_interaction_group.cu b/timemachine/cpp/src/nonbonded_interaction_group.cu
@@ -36,7 +36,7 @@ NonbondedInteractionGroup<RealType, Interpolated>::NonbondedInteractionGroup(
     const double beta,
     const double cutoff,
     const std::string &kernel_src)
-    : N_(lambda_offset_idxs.size()), NR_(row_atom_idxs.size()), NC_(N_ - NR_), cutoff_(cutoff), nblist_(N_),
+    : N_(lambda_offset_idxs.size()), NR_(row_atom_idxs.size()), NC_(N_ - NR_), cutoff_(cutoff), nblist_(NC_, NR_),
       beta_(beta), d_sort_storage_(nullptr), d_sort_storage_bytes_(0), nblist_padding_(0.1), disable_hilbert_(false),
       kernel_ptrs_({// enumerate over every possible kernel combination
                     // U: Compute U
@@ -160,7 +160,13 @@ NonbondedInteractionGroup<RealType, Interpolated>::NonbondedInteractionGroup(
 
     // estimate size needed to do radix sorting, this can use uninitialized data.
     cub::DeviceRadixSort::SortPairs(
-        d_sort_storage_, d_sort_storage_bytes_, d_sort_keys_in_, d_sort_keys_out_, d_sort_vals_in_, d_perm_, N_);
+        d_sort_storage_,
+        d_sort_storage_bytes_,
+        d_sort_keys_in_,
+        d_sort_keys_out_,
+        d_sort_vals_in_,
+        d_perm_,
+        std::max(NC_, NR_));
 
     gpuErrchk(cudaPeekAtLastError());
     gpuErrchk(cudaMalloc(&d_sort_storage_, d_sort_storage_bytes_));
@@ -215,12 +221,18 @@ void NonbondedInteractionGroup<RealType, Interpolated>::disable_hilbert_sort() {
 
 template <typename RealType, bool Interpolated>
 void NonbondedInteractionGroup<RealType, Interpolated>::hilbert_sort(
-    const double *d_coords, const double *d_box, cudaStream_t stream) {
+    const int N,
+    const unsigned int *d_atom_idxs,
+    const double *d_coords,
+    const double *d_box,
+    unsigned int *d_perm,
+    cudaStream_t stream) {
 
     const int tpb = 32;
-    const int B = (N_ + tpb - 1) / tpb;
+    const int B = ceil_divide(N, tpb);
 
-    k_coords_to_kv<<<B, tpb, 0, stream>>>(N_, d_coords, d_box, d_bin_to_idx_, d_sort_keys_in_, d_sort_vals_in_);
+    k_coords_to_kv_gather<<<B, tpb, 0, stream>>>(
+        N, d_atom_idxs, d_coords, d_box, d_bin_to_idx_, d_sort_keys_in_, d_sort_vals_in_);
 
     gpuErrchk(cudaPeekAtLastError());
 
@@ -230,8 +242,8 @@ void NonbondedInteractionGroup<RealType, Interpolated>::hilbert_sort(
         d_sort_keys_in_,
         d_sort_keys_out_,
         d_sort_vals_in_,
-        d_perm_,
-        N_,
+        d_perm,
+        N,
         0,                            // begin bit
         sizeof(*d_sort_keys_in_) * 8, // end bit
         stream                        // cudaStream
@@ -285,37 +297,43 @@ void NonbondedInteractionGroup<RealType, Interpolated>::execute_device(
     // identify which tiles contain interpolated parameters
 
     const int tpb = 32;
-    const int B = (N + tpb - 1) / tpb;
-
-    dim3 dimGrid(B, 3, 1);
+    const int B = ceil_divide(N_, tpb);
 
     // (ytz) see if we need to rebuild the neighborlist.
     // (ytz + jfass): note that this logic needs to change if we use NPT later on since a resize in the box
     // can introduce new interactions.
     k_check_rebuild_coords_and_box<RealType>
-        <<<B, tpb, 0, stream>>>(N, d_x, d_nblist_x_, d_box, d_nblist_box_, nblist_padding_, d_rebuild_nblist_);
+        <<<B, tpb, 0, stream>>>(N_, d_x, d_nblist_x_, d_box, d_nblist_box_, nblist_padding_, d_rebuild_nblist_);
+
     gpuErrchk(cudaPeekAtLastError());
 
     // we can optimize this away by doing the check on the GPU directly.
     gpuErrchk(cudaMemcpyAsync(
         p_rebuild_nblist_, d_rebuild_nblist_, 1 * sizeof(*p_rebuild_nblist_), cudaMemcpyDeviceToHost, stream));
     gpuErrchk(cudaStreamSynchronize(stream)); // slow!
 
+    dim3 dimGrid(B, 3, 1);
+
     if (p_rebuild_nblist_[0] > 0) {
 
         // (ytz): update the permutation index before building neighborlist, as the neighborlist is tied
         // to a particular sort order
         if (!disable_hilbert_) {
-            this->hilbert_sort(d_x, d_box, stream);
+            this->hilbert_sort(NC_, d_col_atom_idxs_, d_x, d_box, d_perm_, stream);
+            this->hilbert_sort(NR_, d_row_atom_idxs_, d_x, d_box, d_perm_ + NC_, stream);
         } else {
-            k_arange<<<B, tpb, 0, stream>>>(N, d_perm_);
-            gpuErrchk(cudaPeekAtLastError());
+            gpuErrchk(cudaMemcpyAsync(
+                d_perm_, d_col_atom_idxs_, NC_ * sizeof(*d_col_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
+            gpuErrchk(cudaMemcpyAsync(
+                d_perm_ + NC_, d_row_atom_idxs_, NR_ * sizeof(*d_row_atom_idxs_), cudaMemcpyDeviceToDevice, stream));
         }
 
         // compute new coordinates, new lambda_idxs, new_plane_idxs
-        k_permute<<<dimGrid, tpb, 0, stream>>>(N, d_perm_, d_x, d_sorted_x_);
+        k_permute<<<dimGrid, tpb, 0, stream>>>(N_, d_perm_, d_x, d_sorted_x_);
         gpuErrchk(cudaPeekAtLastError());
-        nblist_.build_nblist_device(N, d_sorted_x_, d_box, cutoff_ + nblist_padding_, stream);
+
+        nblist_.build_nblist_device(
+            NC_, NR_, d_sorted_x_, d_sorted_x_ + 3 * NC_, d_box, cutoff_ + nblist_padding_, stream);
         gpuErrchk(cudaMemcpyAsync(
             p_ixn_count_, nblist_.get_ixn_count(), 1 * sizeof(*p_ixn_count_), cudaMemcpyDeviceToHost, stream));
 
@@ -325,6 +343,7 @@ void NonbondedInteractionGroup<RealType, Interpolated>::execute_device(
         // then a particle can interact with multiple periodic copies.
         const double db_cutoff = (cutoff_ + nblist_padding_) * 2;
         cudaStreamSynchronize(stream);
+
         // Verify that box is orthogonal and the width of the box in all dimensions is greater than twice the cutoff
         for (int i = 0; i < 9; i++) {
             if (i == 0 || i == 4 || i == 8) {
@@ -369,6 +388,7 @@ void NonbondedInteractionGroup<RealType, Interpolated>::execute_device(
 
     // update new w coordinates
     // (tbd): cache lambda value for equilibrium calculations
+    // TODO: skip computing w coords for non-interacting atoms?
     CUresult result = compute_w_coords_instance_.configure(B, tpb, 0, stream)
                           .launch(N, lambda, cutoff_, d_lambda_plane_idxs_, d_lambda_offset_idxs_, d_w_, d_dw_dl_);
     if (result != 0) {

diff --git a/timemachine/cpp/src/nonbonded_interaction_group.hpp b/timemachine/cpp/src/nonbonded_interaction_group.hpp
@@ -77,7 +77,13 @@ template <typename RealType, bool Interpolated> class NonbondedInteractionGroup
 
     bool disable_hilbert_;
 
-    void hilbert_sort(const double *d_x, const double *d_box, cudaStream_t stream);
+    void hilbert_sort(
+        const int N,
+        const unsigned int *d_atom_idxs,
+        const double *d_x,
+        const double *d_box,
+        unsigned int *d_perm,
+        cudaStream_t stream);
 
     jitify::JitCache kernel_cache_;
     jitify::KernelInstantiation compute_w_coords_instance_;