Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

Permalink
Fix issue of test_gru_bidirectional #11219 and add robust code (#11333)
Browse files Browse the repository at this point in the history
* fix param init bug and remove memcpy/memset

* fix bug for bidirection size

* add 100 times loop for test_gru_bidirectional robust checking

* add test_loop_gru_bidirectional

* record number of passed

* remove 1000 times case
  • Loading branch information
Hao Li authored and szha committed Jun 22, 2018
1 parent 529ee85 commit 5550c0a
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 25 deletions.
100 changes: 79 additions & 21 deletions src/operator/rnn_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ void LstmForwardTraining(DType* ws,
const int y_offset = T * N * H * 5;
const int cell_size = N * H;
int idx = 0; // state & cell state's idx;
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
for (int i = 0; i < L; ++i) {
const int input_size = i ? H * D : I;
const int w_size = (input_size + H) * H * 4;
Expand Down Expand Up @@ -172,7 +173,10 @@ void LstmForwardTraining(DType* ws,
}
}
}
memcpy(y_ptr, rs + y_offset, T * N * H * D * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < T * N * H * D; ++i) {
y_ptr[i] = (rs + y_offset)[i];
}
}

template<typename DType>
Expand Down Expand Up @@ -334,9 +338,18 @@ void LstmBackwardSingleLayer(DType* ws,
DType *c_ptr = bid ? rs + T * N * H * 7 : rs;
const Tensor<cpu, 3, DType> c(c_ptr, Shape3(T, N, H));
const Tensor<cpu, 4, DType> ifgo(c_ptr + T * N * H, Shape4(T, N, H, 4));
memset(dwh.dptr_, 0, H * H * 4 * sizeof(DType));
memset(dbx.dptr_, 0, H * 4 * sizeof(DType));
memset(dbh.dptr_, 0, H * 4 * sizeof(DType));
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (req_params != kNullOp && req_params != kAddTo) {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < H * 4 * H; ++i) {
dwh.dptr_[i] = 0;
}
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < 4 * H; ++i) {
dbx.dptr_[i] = 0;
dbh.dptr_[i] = 0;
}
}
Tensor<cpu, 4, DType> difgo(ws, Shape4(T, N, 4, H));
Tensor<cpu, 2, DType> dh(ws + T * N * H * 4, Shape2(N, H));
Tensor<cpu, 2, DType> dc(dh.dptr_ + N * H, Shape2(N, H));
Expand All @@ -348,13 +361,28 @@ void LstmBackwardSingleLayer(DType* ws,
const DType beta2 = 2.0;
const int cell_size = N * H;
if (dhy_ptr != NULL) {
memcpy(dh.dptr_, dhy_ptr, cell_size * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dh.dptr_[i] = dhy_ptr[i];
}

This comment has been minimized.

Copy link
@szha

szha Jun 22, 2018

Member

Should we wrap this into our own memcpy and use it everywhere?

} else {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dh.dptr_[i] = 0;
}
}
if (dcy_ptr != NULL) {
memcpy(dc.dptr_, dcy_ptr, cell_size * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dc.dptr_[i] = dcy_ptr[i];
}
} else {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dc.dptr_[i] = 0;
}
}

const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
for (int i = T - 1; i >= 0; --i) {
int t = bid ? T - 1 - i : i;
int tnext = bid ? t + 1 : t - 1;
Expand Down Expand Up @@ -422,8 +450,11 @@ void LstmBackwardSingleLayer(DType* ws,
} else {
const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf, Shape2(col, T));
const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + col * T, Shape2(col, T));
memset(tmp_dbx.dptr_, 0, col * T * sizeof(DType));
memset(tmp_dbh.dptr_, 0, col * T * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < col * T; ++i) {
tmp_dbx.dptr_[i] = 0;
tmp_dbh.dptr_[i] = 0;
}
for (int t = T - 1; t >= 0; --t) {
#pragma omp parallel for num_threads(omp_threads)
for (int j = 0; j < col; ++j) {
Expand Down Expand Up @@ -618,11 +649,11 @@ void GruForwardInferenceSingleLayer(DType* ws,
// perform the second direction
if (D == 2) {
gemmC1_t = back_gemmC1 + (T - 1 - t) * N * 3 * H;
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1, Shape2(N, D * H));
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1 - H, Shape2(N, D * H));
Tensor<cpu, 3, DType> dback_ht_1_tmp = Tensor<cpu, 3, DType>
(reinterpret_cast<DType*>(tmp_buf), Shape3(D, H, N));
dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N));
linalg_gemm(dback_ht_1_tmp[0], back_wh, dgemmC2, alpha, beta, true, true);
linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true);

#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < N; ++i) {
Expand Down Expand Up @@ -839,11 +870,11 @@ void GruForwardTrainingSingleLayer(DType* ws,
zt = back_gateZ + (T - 1 - t) * N * H;
nt = back_gateN + (T - 1 - t) * N * H;
gemmC1_t = back_gemmC1 + (T - 1 - t) * N * 3 * H;
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1, Shape2(N, D * H));
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1 - H, Shape2(N, D * H));
Tensor<cpu, 3, DType> dback_ht_1_tmp = Tensor<cpu, 3, DType>
(reinterpret_cast<DType*>(tmp_buf), Shape3(D, H, N));
dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N));
linalg_gemm(dback_ht_1_tmp[0], back_wh, dgemmC2, alpha, beta, true, true);
linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true);

DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H;
#pragma omp parallel for num_threads(omp_threads)
Expand Down Expand Up @@ -949,7 +980,11 @@ void GruForwardTraining(DType* ws,
}
wh_l = wx_l + I * 3 * H;
}
memcpy(y_ptr, y_l, T * N * H * D * sizeof(DType));
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < T * N * H * D; ++i) {
y_ptr[i] = y_l[i];
}
}

template <typename DType>
Expand Down Expand Up @@ -1012,6 +1047,17 @@ void GruBackwardSingleLayer(DType* ws,
const Tensor<cpu, 2, DType> back_wx(back_wx_ptr, Shape2(H * 3, I));
const Tensor<cpu, 2, DType> back_wh(back_wh_ptr, Shape2(H * 3, H));
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (req_params != kNullOp && req_params != kAddTo) {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < D * H * 3 * H; ++i) {
dwh[i] = 0;
}
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < D * 3 * H; ++i) {
dbx[i] = 0;
dbh[i] = 0;
}
}
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < N * H; ++i) {
if (dhy_ptr) {
Expand Down Expand Up @@ -1121,8 +1167,11 @@ void GruBackwardSingleLayer(DType* ws,
} else {
const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T));
const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T));
memset(tmp_dbx.dptr_, 0, H * T * 3 * sizeof(DType));
memset(tmp_dbh.dptr_, 0, H * T * 3 * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < H * T * 3; ++i) {
tmp_dbx.dptr_[i] = 0;
tmp_dbh.dptr_[i] = 0;
}

for (int t = T - 1; t >= 0; --t) {
#pragma omp parallel for num_threads(omp_threads)
Expand Down Expand Up @@ -1236,9 +1285,11 @@ void GruBackwardSingleLayer(DType* ws,
} else {
const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T));
const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T));
memset(tmp_dbx.dptr_, 0, H * T * 3 * sizeof(DType));
memset(tmp_dbh.dptr_, 0, H * T * 3 * sizeof(DType));

#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < H * T * 3; ++i) {
tmp_dbx.dptr_[i] = 0;
tmp_dbh.dptr_[i] = 0;
}
for (int t = T - 1; t >= 0; --t) {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < 3 * H; ++i) {
Expand Down Expand Up @@ -1272,7 +1323,10 @@ void GruBackwardSingleLayer(DType* ws,
}
}
if (req_state != kNullOp) {
memcpy(dhx, dht1, N * H * D * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < N * H * D; ++i) {
dhx[i] = dht1[i];
}
}
}

Expand Down Expand Up @@ -1335,6 +1389,7 @@ void GruBackward(DType* ws,
Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(L, D * N, H));
int inputsize = I;
DType* y_tmp = y_l - T * N * H * D;
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
for (int l = L - 1; l >= 0; --l) {
if (l == 0) {
I = inputsize;
Expand All @@ -1349,7 +1404,10 @@ void GruBackward(DType* ws,
dhy_l, gateR_l, gateZ_l, gateN_l, Mnh_l, dx_l, dhx_l,
dwx_l, dwh_l, dbx_l, dbh_l, req_data, req_params, req_state);
if (l > 0) {
memcpy(dy_l, dx_l, T * N * H * D * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < T * N * H * D; ++i) {
dy_l[i] = dx_l[i];
}
gateR_l = gateR_l - T * D * N * H;
gateZ_l = gateZ_l - T * D * N * H;
gateN_l = gateN_l - T * D * N * H;
Expand Down
5 changes: 1 addition & 4 deletions tests/python/unittest/test_operator.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,6 @@ def test_gru_sym():
check_rnn_consistency(fused, stack, T, N, I, H, 'add')
check_rnn_consistency(fused, stack, T, N, I, H, 'null')


@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11219")
@with_seed()
def test_gru_bidirectional():
T, N, I, H = 5, 20, 800, 800
Expand All @@ -134,12 +132,11 @@ def test_gru_bidirectional():
mx.rnn.GRUCell(H, prefix='l1_'),
mx.rnn.GRUCell(H, prefix='r1_'),
output_prefix='bi_gru_1_'))

check_rnn_consistency(fused, stack, T, N, I, H, 'write')
check_rnn_consistency(fused, stack, T, N, I, H, 'add')
check_rnn_consistency(fused, stack, T, N, I, H, 'null')


# Currently, fused LSTM operator doesn't support dropout.
# Will change this test after dropout is supported
@with_seed()
Expand Down

0 comments on commit 5550c0a

Please sign in to comment.