This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Fix test_gru_bidirectional issue #11219 and add robust code #11333

Merged — 6 commits, Jun 22, 2018
100 changes: 79 additions & 21 deletions src/operator/rnn_impl.h
@@ -142,6 +142,7 @@ void LstmForwardTraining(DType* ws,
const int y_offset = T * N * H * 5;
const int cell_size = N * H;
int idx = 0; // state & cell state's idx;
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
for (int i = 0; i < L; ++i) {
const int input_size = i ? H * D : I;
const int w_size = (input_size + H) * H * 4;
@@ -172,7 +173,10 @@ void LstmForwardTraining(DType* ws,
}
}
}
memcpy(y_ptr, rs + y_offset, T * N * H * D * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < T * N * H * D; ++i) {
y_ptr[i] = (rs + y_offset)[i];
}
}

template<typename DType>
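
The hunk above swaps a single-threaded memcpy of the layer output for an element-wise copy parallelized across the recommended number of OpenMP threads. A minimal standalone sketch of that pattern, using a plain float buffer and illustrative names (parallel_copy, n_threads) rather than MXNet code:

#include <omp.h>

// Copy n elements from src to dst, splitting the loop across OpenMP threads.
// Illustrative only; MXNet derives the thread count from its engine settings.
void parallel_copy(float* dst, const float* src, int n, int n_threads) {
#pragma omp parallel for num_threads(n_threads)
  for (int i = 0; i < n; ++i) {
    dst[i] = src[i];
  }
}
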
@@ -334,9 +338,18 @@ void LstmBackwardSingleLayer(DType* ws,
DType *c_ptr = bid ? rs + T * N * H * 7 : rs;
const Tensor<cpu, 3, DType> c(c_ptr, Shape3(T, N, H));
const Tensor<cpu, 4, DType> ifgo(c_ptr + T * N * H, Shape4(T, N, H, 4));
memset(dwh.dptr_, 0, H * H * 4 * sizeof(DType));
memset(dbx.dptr_, 0, H * 4 * sizeof(DType));
memset(dbh.dptr_, 0, H * 4 * sizeof(DType));
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (req_params != kNullOp && req_params != kAddTo) {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < H * 4 * H; ++i) {
dwh.dptr_[i] = 0;
}
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < 4 * H; ++i) {
dbx.dptr_[i] = 0;
dbh.dptr_[i] = 0;
}
}
Tensor<cpu, 4, DType> difgo(ws, Shape4(T, N, 4, H));
Tensor<cpu, 2, DType> dh(ws + T * N * H * 4, Shape2(N, H));
Tensor<cpu, 2, DType> dc(dh.dptr_ + N * H, Shape2(N, H));
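
In the zero-initialization added above, the guard req_params != kNullOp && req_params != kAddTo clears the weight and bias gradient buffers only when they are about to be overwritten: with kNullOp no parameter gradient is requested, and with kAddTo the new gradient must be accumulated into the existing values, so wiping the buffers would lose data. A small sketch of that gating, assuming an enum that mirrors MXNet's OpReqType and an illustrative clear_grad helper:

// Mirrors MXNet's gradient write-request semantics (illustrative sketch).
enum OpReq { kNullOp, kWriteTo, kWriteInplace, kAddTo };

void clear_grad(float* grad, int n, OpReq req, int n_threads) {
  // Clear only when the kernel will overwrite the buffer; kNullOp means the
  // gradient is unused and kAddTo means it must be accumulated, not reset.
  if (req != kNullOp && req != kAddTo) {
#pragma omp parallel for num_threads(n_threads)
    for (int i = 0; i < n; ++i) {
      grad[i] = 0.0f;
    }
  }
}
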
@@ -348,13 +361,28 @@ void LstmBackwardSingleLayer(DType* ws,
const DType beta2 = 2.0;
const int cell_size = N * H;
if (dhy_ptr != NULL) {
memcpy(dh.dptr_, dhy_ptr, cell_size * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dh.dptr_[i] = dhy_ptr[i];
}
} else {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dh.dptr_[i] = 0;
}
}
if (dcy_ptr != NULL) {
memcpy(dc.dptr_, dcy_ptr, cell_size * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dc.dptr_[i] = dcy_ptr[i];
}
} else {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < cell_size; ++i) {
dc.dptr_[i] = 0;
}
}

const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
for (int i = T - 1; i >= 0; --i) {
int t = bid ? T - 1 - i : i;
int tnext = bid ? t + 1 : t - 1;
@@ -422,8 +450,11 @@ void LstmBackwardSingleLayer(DType* ws,
} else {
const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf, Shape2(col, T));
const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + col * T, Shape2(col, T));
memset(tmp_dbx.dptr_, 0, col * T * sizeof(DType));
memset(tmp_dbh.dptr_, 0, col * T * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < col * T; ++i) {
tmp_dbx.dptr_[i] = 0;
tmp_dbh.dptr_[i] = 0;
}
for (int t = T - 1; t >= 0; --t) {
#pragma omp parallel for num_threads(omp_threads)
for (int j = 0; j < col; ++j) {
@@ -618,11 +649,11 @@ void GruForwardInferenceSingleLayer(DType* ws,
// perform the second direction
if (D == 2) {
gemmC1_t = back_gemmC1 + (T - 1 - t) * N * 3 * H;
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1, Shape2(N, D * H));
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1 - H, Shape2(N, D * H));
Tensor<cpu, 3, DType> dback_ht_1_tmp = Tensor<cpu, 3, DType>
(reinterpret_cast<DType*>(tmp_buf), Shape3(D, H, N));
dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N));
linalg_gemm(dback_ht_1_tmp[0], back_wh, dgemmC2, alpha, beta, true, true);
linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true);

#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < N; ++i) {
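
This hunk appears to carry the actual bidirectional fix: for the reverse direction, the previous hidden state of each batch element sits in the second H-wide half of a (N, D*H) row, so the tensor view is anchored one H earlier (back_ht_1 - H) to cover whole rows, and after the reshape to (D, H, N) the gemm reads slice [1] (the backward half) instead of slice [0]. A short sketch of the assumed row-major layout, with an illustrative backward_state helper that is not MXNet code:

// Row-major hidden-state buffer of shape (N, D*H): for batch element n,
// columns [0, H) hold the forward state and, when D == 2, columns [H, 2*H)
// hold the backward state.
inline const float* backward_state(const float* h, int n, int H, int D) {
  return h + n * D * H + (D - 1) * H;  // start of the backward slice for row n
}
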
@@ -839,11 +870,11 @@ void GruForwardTrainingSingleLayer(DType* ws,
zt = back_gateZ + (T - 1 - t) * N * H;
nt = back_gateN + (T - 1 - t) * N * H;
gemmC1_t = back_gemmC1 + (T - 1 - t) * N * 3 * H;
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1, Shape2(N, D * H));
Tensor<cpu, 2, DType> dback_ht_1(back_ht_1 - H, Shape2(N, D * H));
Tensor<cpu, 3, DType> dback_ht_1_tmp = Tensor<cpu, 3, DType>
(reinterpret_cast<DType*>(tmp_buf), Shape3(D, H, N));
dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N));
linalg_gemm(dback_ht_1_tmp[0], back_wh, dgemmC2, alpha, beta, true, true);
linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true);

DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H;
#pragma omp parallel for num_threads(omp_threads)
@@ -949,7 +980,11 @@ void GruForwardTraining(DType* ws,
}
wh_l = wx_l + I * 3 * H;
}
memcpy(y_ptr, y_l, T * N * H * D * sizeof(DType));
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < T * N * H * D; ++i) {
y_ptr[i] = y_l[i];
}
}

template <typename DType>
@@ -1012,6 +1047,17 @@ void GruBackwardSingleLayer(DType* ws,
const Tensor<cpu, 2, DType> back_wx(back_wx_ptr, Shape2(H * 3, I));
const Tensor<cpu, 2, DType> back_wh(back_wh_ptr, Shape2(H * 3, H));
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
if (req_params != kNullOp && req_params != kAddTo) {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < D * H * 3 * H; ++i) {
dwh[i] = 0;
}
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < D * 3 * H; ++i) {
dbx[i] = 0;
dbh[i] = 0;
}
}
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < N * H; ++i) {
if (dhy_ptr) {
@@ -1121,8 +1167,11 @@ void GruBackwardSingleLayer(DType* ws,
} else {
const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T));
const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T));
memset(tmp_dbx.dptr_, 0, H * T * 3 * sizeof(DType));
memset(tmp_dbh.dptr_, 0, H * T * 3 * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < H * T * 3; ++i) {
tmp_dbx.dptr_[i] = 0;
tmp_dbh.dptr_[i] = 0;
}

for (int t = T - 1; t >= 0; --t) {
#pragma omp parallel for num_threads(omp_threads)
@@ -1236,9 +1285,11 @@ void GruBackwardSingleLayer(DType* ws,
} else {
const Tensor<cpu, 2, DType> tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T));
const Tensor<cpu, 2, DType> tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T));
memset(tmp_dbx.dptr_, 0, H * T * 3 * sizeof(DType));
memset(tmp_dbh.dptr_, 0, H * T * 3 * sizeof(DType));

#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < H * T * 3; ++i) {
tmp_dbx.dptr_[i] = 0;
tmp_dbh.dptr_[i] = 0;
}
for (int t = T - 1; t >= 0; --t) {
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < 3 * H; ++i) {
@@ -1272,7 +1323,10 @@ void GruBackwardSingleLayer(DType* ws,
}
}
if (req_state != kNullOp) {
memcpy(dhx, dht1, N * H * D * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < N * H * D; ++i) {
dhx[i] = dht1[i];
}
}
}

@@ -1335,6 +1389,7 @@ void GruBackward(DType* ws,
Tensor<cpu, 3, DType> hx(hx_ptr, Shape3(L, D * N, H));
int inputsize = I;
DType* y_tmp = y_l - T * N * H * D;
const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
for (int l = L - 1; l >= 0; --l) {
if (l == 0) {
I = inputsize;
@@ -1349,7 +1404,10 @@ void GruBackward(DType* ws,
dhy_l, gateR_l, gateZ_l, gateN_l, Mnh_l, dx_l, dhx_l,
dwx_l, dwh_l, dbx_l, dbh_l, req_data, req_params, req_state);
if (l > 0) {
memcpy(dy_l, dx_l, T * N * H * D * sizeof(DType));
#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < T * N * H * D; ++i) {
dy_l[i] = dx_l[i];
}
gateR_l = gateR_l - T * D * N * H;
gateZ_l = gateZ_l - T * D * N * H;
gateN_l = gateN_l - T * D * N * H;
5 changes: 1 addition & 4 deletions tests/python/unittest/test_operator.py
@@ -115,8 +115,6 @@ def test_gru_sym():
check_rnn_consistency(fused, stack, T, N, I, H, 'add')
check_rnn_consistency(fused, stack, T, N, I, H, 'null')


@unittest.skip("test fails intermittently. temporarily disabled till it gets fixed. tracked at https://github.com/apache/incubator-mxnet/issues/11219")
@with_seed()
def test_gru_bidirectional():
T, N, I, H = 5, 20, 800, 800
@@ -134,12 +132,11 @@ def test_gru_bidirectional():
mx.rnn.GRUCell(H, prefix='l1_'),
mx.rnn.GRUCell(H, prefix='r1_'),
output_prefix='bi_gru_1_'))

check_rnn_consistency(fused, stack, T, N, I, H, 'write')
check_rnn_consistency(fused, stack, T, N, I, H, 'add')
check_rnn_consistency(fused, stack, T, N, I, H, 'null')


# Currently, fused LSTM operator doesn't support dropout.
# Will change this test after dropout is supported
@with_seed()