-
Notifications
You must be signed in to change notification settings - Fork 6.8k
[MXNET-537] add_n(dense, csr, dense) = dense and add_n([dense, csr, rsp]*, dense, [dense, csr, rsp]*) = dense on CPU & GPU #11330
Conversation
755682c
to
fedab5c
Compare
f787b49
to
7355c34
Compare
Benchmark results for warp-optimized GPU kernel for elemwise_add/sub(dense, csr):
Benchmark result for add_n(dense, csr, dense) = dense:

import mxnet as mx
import sys
import os
import scipy
import numpy as np
from mxnet.test_utils import rand_ndarray, assert_almost_equal
import time
def measure_cost(repeat, a, b, c, out=None):
    """Return the average wall-clock seconds of one sparse add_n(a, b, c) call.

    Submits `repeat` asynchronous add_n operations, then blocks on every
    result via wait_to_read() so the measured interval includes the actual
    computation, not just op submission.
    """
    t_begin = time.time()
    pending = [mx.nd.sparse.add_n(a, b, c, out=out) for _ in range(repeat)]
    for nd_out in pending:
        nd_out.wait_to_read()
    return (time.time() - t_begin) / repeat
def measure_fallback(repeat, a):
    """Return the average wall-clock seconds of one tostype('default') cast of `a`.

    Submits `repeat` asynchronous dense conversions, then blocks on each one
    with wait_to_read() so conversion cost is actually included in the timing.
    """
    t_begin = time.time()
    converted = [a.tostype('default') for _ in range(repeat)]
    for nd_out in converted:
        nd_out.wait_to_read()
    return (time.time() - t_begin) / repeat
def main():
    """Benchmark add_n(dense, csr, dense) against the equivalent all-dense add_n.

    For several CSR densities, verifies the sparse-aware result against a
    NumPy reference, then reports the dense-path / sparse-path time ratio
    (values > 1 mean the sparse-aware kernel is faster).
    """
    shape = (128, 1000000)
    base = np.random.uniform(size=shape)
    # context = mx.gpu(0)  # toggle to benchmark on GPU instead
    context = mx.cpu()
    lhs_dense = mx.nd.array(base, ctx=context)
    rhs_dense = mx.nd.array(base, ctx=context)
    for density in [0.01, 0.005, 0.001, 0.0005, 0.0001]:
        mx_csr = rand_ndarray(shape=shape, stype='csr', density=density).as_in_context(context)
        mx_csr_dns = mx_csr.tostype('default')
        sparse_cost = 0.0
        dns_cost = 0.0
        mx.nd.waitall()
        # Warmup pass doubling as a correctness check against NumPy.
        check = mx.nd.sparse.add_n(lhs_dense, mx_csr, rhs_dense)
        reference = base + mx_csr_dns.asnumpy() + base
        assert_almost_equal(check.asnumpy(), reference, atol=1e-5, rtol=1e-4)
        mx.nd.waitall()
        for _ in range(20):
            sparse_cost += measure_cost(5, lhs_dense, mx_csr, rhs_dense)
            dns_cost += measure_cost(5, lhs_dense, mx_csr_dns, rhs_dense)
        print("%.2f %%" % (density*100), dns_cost / sparse_cost)


if __name__ == "__main__":
    main()
Benchmark result for add_n(more than 4 inputs with at least 1 dense) = dense (the combination being benchmarked here is add_n(dense, csr, dense, rsp, dense) = dense):

import mxnet as mx
import sys
import os
import scipy
import numpy as np
from mxnet.test_utils import rand_ndarray, assert_almost_equal
import time
def measure_cost(repeat, a, b, c, d, e, out=None):
    """Return the average wall-clock seconds of one five-input sparse add_n call.

    Submits `repeat` asynchronous add_n(a, b, c, d, e) operations, then blocks
    on every result via wait_to_read() so the measured interval covers the
    actual computation rather than just op submission.
    """
    t_begin = time.time()
    pending = [mx.nd.sparse.add_n(a, b, c, d, e, out=out) for _ in range(repeat)]
    for nd_out in pending:
        nd_out.wait_to_read()
    return (time.time() - t_begin) / repeat
def measure_fallback(repeat, a):
    """Return the average wall-clock seconds of one tostype('default') cast of `a`.

    Submits `repeat` asynchronous dense conversions, then blocks on each one
    with wait_to_read() so conversion cost is actually included in the timing.
    """
    t_begin = time.time()
    converted = [a.tostype('default') for _ in range(repeat)]
    for nd_out in converted:
        nd_out.wait_to_read()
    return (time.time() - t_begin) / repeat
def main():
    """Benchmark add_n(dense, csr, dense, rsp, dense) against the all-dense path.

    For several sparsity densities, verifies the mixed-storage result against
    a NumPy reference, then reports the dense-path / sparse-path time ratio
    (values > 1 mean the storage-aware kernel is faster).
    """
    shape = (1000000, 128)
    base = np.random.uniform(size=shape)
    context = mx.gpu(0)
    # context = mx.cpu()  # toggle to benchmark on CPU instead
    dense_a = mx.nd.array(base, ctx=context)
    dense_b = mx.nd.array(base, ctx=context)
    dense_c = mx.nd.array(base, ctx=context)
    for density in [0.01, 0.005, 0.001, 0.0005, 0.0001]:
        mx_csr = rand_ndarray(shape=shape, stype='csr', density=density).as_in_context(context)
        mx_csr_dns = mx_csr.tostype('default')
        mx_rsp = rand_ndarray(shape=shape, stype='row_sparse', density=density).as_in_context(context)
        mx_rsp_dns = mx_rsp.tostype('default')
        sparse_cost = 0.0
        dns_cost = 0.0
        mx.nd.waitall()
        # Warmup pass doubling as a correctness check against NumPy.
        check = mx.nd.sparse.add_n(dense_a, mx_csr, mx_rsp, dense_b, dense_c)
        reference = base + mx_csr_dns.asnumpy() + mx_rsp_dns.asnumpy() + base + base
        assert_almost_equal(check.asnumpy(), reference, atol=1e-5, rtol=1e-4)
        mx.nd.waitall()
        for _ in range(20):
            sparse_cost += measure_cost(5, dense_a, mx_csr, dense_b, mx_rsp, dense_c)
            dns_cost += measure_cost(5, dense_a, mx_csr_dns, dense_b, mx_rsp_dns, dense_c)
        print("%.2f %%" % (density*100), dns_cost / sparse_cost)


if __name__ == "__main__":
    main()
7355c34
to
868aabb
Compare
src/ndarray/ndarray_function.cu
Outdated
MSHADOW_IDX_TYPE_SWITCH(nd_indptr.type_flag_, CType, { // indptr type | ||
if (nd.storage_initialized()) { | ||
Kernel<ElemwiseDnsCsrDnsWarpKernel<kWriteTo, mshadow_op::plus>, gpu>::Launch( | ||
s, 32 * num_rows, out_data.dptr<DType>(), out_data.dptr<DType>(), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggest use a const var with meaningful name instead of 32
dc54bf4
to
c316d1b
Compare
@eric-haibin-lin build passed, should be good for merge.
c316d1b
to
b110958
Compare
b110958
to
d85c334
Compare
d85c334
to
f74eeff
Compare
…sp]*, dense, [dense, csr, rsp]*) = dense on CPU & GPU (apache#11330) * support for add_n(dense, csr, dense) = dense with tests * eliminate magic number
Description
As title
Checklist
Essentials
Changes
Comments
Also comes with an optimized GPU kernel for elemwise_add/sub(dns, csr)/elemwise_add/sub(csr, dns); for benchmark results please see the comments.