diff --git a/CMakeLists.txt b/CMakeLists.txt index 000f23b8..100027e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_AEON "CryptoNight-Lite support" ON) option(WITH_SUMO "CryptoNight-Heavy support" ON) option(WITH_CN_PICO "CryptoNight-Pico support" ON) option(WITH_CN_GPU "CryptoNight-GPU support" ON) +option(WITH_RANDOMX "RandomX support" ON) option(WITH_HTTPD "HTTP REST API" ON) option(WITH_TLS "Enable OpenSSL support" ON) option(WITH_ASM "Enable ASM PoW implementations" ON) @@ -233,6 +234,52 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") find_package(UV REQUIRED) +if (WITH_RANDOMX) + include_directories(src/crypto/randomx) + add_definitions(/DXMRIG_ALGO_RANDOMX) + set(SOURCES_CRYPTO + "${SOURCES_CRYPTO}" + src/crypto/randomx/aes_hash.cpp + src/crypto/randomx/argon2_ref.c + src/crypto/randomx/bytecode_machine.cpp + src/crypto/randomx/dataset.cpp + src/crypto/randomx/soft_aes.cpp + src/crypto/randomx/virtual_memory.cpp + src/crypto/randomx/vm_interpreted.cpp + src/crypto/randomx/allocator.cpp + src/crypto/randomx/randomx.cpp + src/crypto/randomx/superscalar.cpp + src/crypto/randomx/vm_compiled.cpp + src/crypto/randomx/vm_interpreted_light.cpp + src/crypto/randomx/argon2_core.c + src/crypto/randomx/blake2_generator.cpp + src/crypto/randomx/instructions_portable.cpp + src/crypto/randomx/reciprocal.c + src/crypto/randomx/virtual_machine.cpp + src/crypto/randomx/vm_compiled_light.cpp + src/crypto/randomx/blake2/blake2b.c + ) + if (NOT ARCH_ID) + set(ARCH_ID ${CMAKE_HOST_SYSTEM_PROCESSOR}) + endif() + if (CMAKE_C_COMPILER_ID MATCHES MSVC) + enable_language(ASM_MASM) + list(APPEND SOURCES_CRYPTO + src/crypto/randomx/jit_compiler_x86_static.asm + src/crypto/randomx/jit_compiler_x86.cpp + ) + elseif (NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) + list(APPEND SOURCES_CRYPTO + src/crypto/randomx/jit_compiler_x86_static.S + src/crypto/randomx/jit_compiler_x86.cpp + ) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/crypto/randomx/jit_compiler_x86_static.S PROPERTY LANGUAGE C) + endif() +else() + remove_definitions(/DXMRIG_ALGO_RANDOMX) +endif() + include(cmake/flags.cmake) add_definitions(/DCL_TARGET_OPENCL_VERSION=200) diff --git a/src/amd/GpuContext.h b/src/amd/GpuContext.h index bc51fe47..aaa2fb97 100644 --- a/src/amd/GpuContext.h +++ b/src/amd/GpuContext.h @@ -47,6 +47,7 @@ struct GpuContext deviceIdx(0), rawIntensity(0), workSize(0), + bfactor(6), threads(0), stridedIndex(2), memChunk(2), @@ -69,15 +70,25 @@ struct GpuContext freeMem(0), globalMem(0), computeUnits(0), - Nonce(0) + Nonce(0), + rx_variant(xmrig::VARIANT_AUTO), + rx_dataset(nullptr), + rx_scratchpads(nullptr), + rx_hashes(nullptr), + rx_entropy(nullptr), + rx_vm_states(nullptr), + rx_rounding(nullptr) { memset(Kernels, 0, sizeof(Kernels)); + memset(rx_dataset_seedhash, 0, sizeof(rx_dataset_seedhash)); + memset(rx_kernels, 0, sizeof(rx_kernels)); } /*Input vars*/ size_t deviceIdx; size_t rawIntensity; size_t workSize; + size_t bfactor; size_t threads; int stridedIndex; int memChunk; @@ -107,6 +118,16 @@ struct GpuContext xmrig::String name; uint32_t Nonce; + + uint8_t rx_dataset_seedhash[32]; + xmrig::Variant rx_variant; + cl_mem rx_dataset; + cl_mem rx_scratchpads; + cl_mem rx_hashes; + cl_mem rx_entropy; + cl_mem rx_vm_states; + cl_mem rx_rounding; + cl_kernel rx_kernels[32]; }; diff --git a/src/amd/OclCache.cpp b/src/amd/OclCache.cpp index 906fd9c8..cdced26a 100644 --- a/src/amd/OclCache.cpp +++ b/src/amd/OclCache.cpp @@ -68,20 +68,42 @@ cl_int OclCache::wait_build(cl_program program, cl_device_id device) void OclCache::getOptions(xmrig::Algo algo, xmrig::Variant, const GpuContext* ctx, char* options, size_t options_size) { - snprintf(options, options_size, "-DITERATIONS=%u -DMASK=%u -DWORKSIZE=%zu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%zu " - "-DALGO=%d -DUNROLL_FACTOR=%d -DOPENCL_DRIVER_MAJOR=%d -DWORKSIZE_GPU=%zu -cl-fp32-correctly-rounded-divide-sqrt", - xmrig::cn_select_iter(algo, xmrig::VARIANT_AUTO), - xmrig::cn_select_mask(algo), - ctx->workSize, - ctx->stridedIndex, - static_cast(1u << ctx->memChunk), - ctx->compMode, - xmrig::cn_select_memory(algo), - static_cast(algo), - ctx->unrollFactor, - ctx->amdDriverMajorVersion, - worksize(ctx, xmrig::VARIANT_GPU) - ); +# ifdef XMRIG_ALGO_RANDOMX + if (algo == xmrig::RANDOM_X) { + uint32_t workSize; + switch (ctx->workSize) + { + case 2: + case 4: + case 8: + case 16: + workSize = ctx->workSize; + break; + + default: + workSize = 8; + } + + snprintf(options, options_size, "-DWORKERS_PER_HASH=%u", workSize); + } + else +# endif + { + snprintf(options, options_size, "-DITERATIONS=%u -DMASK=%u -DWORKSIZE=%zu -DSTRIDED_INDEX=%d -DMEM_CHUNK_EXPONENT=%d -DCOMP_MODE=%d -DMEMORY=%zu " + "-DALGO=%d -DUNROLL_FACTOR=%d -DOPENCL_DRIVER_MAJOR=%d -DWORKSIZE_GPU=%zu -cl-fp32-correctly-rounded-divide-sqrt", + xmrig::cn_select_iter(algo, xmrig::VARIANT_AUTO), + xmrig::cn_select_mask(algo), + ctx->workSize, + ctx->stridedIndex, + static_cast(1u << ctx->memChunk), + ctx->compMode, + xmrig::cn_select_memory(algo), + static_cast(algo), + ctx->unrollFactor, + ctx->amdDriverMajorVersion, + worksize(ctx, xmrig::VARIANT_GPU) + ); + } } bool OclCache::load() diff --git a/src/amd/OclGPU.cpp b/src/amd/OclGPU.cpp index 06124c75..63823501 100644 --- a/src/amd/OclGPU.cpp +++ b/src/amd/OclGPU.cpp @@ -44,6 +44,7 @@ #include "core/Config.h" #include "crypto/CryptoNight_constants.h" #include "cryptonight.h" +#include "../workers/Workers.h" constexpr const char *kSetKernelArgErr = "Error %s when calling clSetKernelArg for kernel %d, argument %d."; @@ -204,44 +205,87 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch } size_t g_thd = ctx->rawIntensity; - ctx->ExtraBuffers[0] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, xmrig::cn_select_memory(config->algorithm().algo()) * g_thd, nullptr, &ret); - if (ret != CL_SUCCESS) { - LOG_ERR("Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); - return OCL_ERR_API; - } +#ifdef XMRIG_ALGO_RANDOMX + if (config->algorithm().algo() == xmrig::RANDOM_X) { + const size_t dataset_size = randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE; + ctx->rx_dataset = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create RandomX dataset.", err_to_str(ret)); + return OCL_ERR_API; + } - ctx->ExtraBuffers[1] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, nullptr, &ret); - if(ret != CL_SUCCESS) { - LOG_ERR("Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); - return OCL_ERR_API; - } + ctx->rx_scratchpads = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, (xmrig::rx_select_memory(config->algorithm().variant()) + 64) * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create RandomX scratchpads.", err_to_str(ret)); + return OCL_ERR_API; + } - // Blake-256 branches - ctx->ExtraBuffers[2] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); - if (ret != CL_SUCCESS){ - LOG_ERR("Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); - return OCL_ERR_API; - } + ctx->rx_hashes = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, 64 * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create RandomX hashes buffer.", err_to_str(ret)); + return OCL_ERR_API; + } - // Groestl-256 branches - ctx->ExtraBuffers[3] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); - if(ret != CL_SUCCESS) { - LOG_ERR("Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); - return OCL_ERR_API; - } + ctx->rx_entropy = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, (128 + 2560) * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create RandomX entropy buffer.", err_to_str(ret)); + return OCL_ERR_API; + } - // JH-256 branches - ctx->ExtraBuffers[4] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); - if (ret != CL_SUCCESS) { - LOG_ERR("Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); - return OCL_ERR_API; + ctx->rx_vm_states = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, 2560 * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create RandomX VM states buffer.", err_to_str(ret)); + return OCL_ERR_API; + } + + ctx->rx_rounding = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(uint32_t) * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create RandomX rounding buffer.", err_to_str(ret)); + return OCL_ERR_API; + } } + else +#endif + { + ctx->ExtraBuffers[0] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, xmrig::cn_select_memory(config->algorithm().algo()) * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret)); + return OCL_ERR_API; + } - // Skein-512 branches - ctx->ExtraBuffers[5] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); - if (ret != CL_SUCCESS) { - LOG_ERR("Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); - return OCL_ERR_API; + ctx->ExtraBuffers[1] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create hash states buffer.", err_to_str(ret)); + return OCL_ERR_API; + } + + // Blake-256 branches + ctx->ExtraBuffers[2] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create Branch 0 buffer.", err_to_str(ret)); + return OCL_ERR_API; + } + + // Groestl-256 branches + ctx->ExtraBuffers[3] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create Branch 1 buffer.", err_to_str(ret)); + return OCL_ERR_API; + } + + // JH-256 branches + ctx->ExtraBuffers[4] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create Branch 2 buffer.", err_to_str(ret)); + return OCL_ERR_API; + } + + // Skein-512 branches + ctx->ExtraBuffers[5] = OclLib::createBuffer(opencl_ctx, CL_MEM_READ_WRITE, sizeof(cl_uint) * (g_thd + 2), nullptr, &ret); + if (ret != CL_SUCCESS) { + LOG_ERR("Error %s when calling clCreateBuffer to create Branch 3 buffer.", err_to_str(ret)); + return OCL_ERR_API; + } } // Assume we may find up to 0xFF nonces in one run - it's reasonable @@ -256,29 +300,229 @@ size_t InitOpenCLGpu(int index, cl_context opencl_ctx, GpuContext* ctx, const ch return OCL_ERR_API; } - const char *KernelNames[] = { - "cn0", "cn1", "cn2", - "Blake", "Groestl", "JH", "Skein", - "cn1_monero", "cn1_msr", "cn1_xao", "cn1_tube", "cn1_v2_monero", "cn1_v2_half", -# ifndef XMRIG_NO_CN_GPU - "cn0_cn_gpu", "cn00_cn_gpu", "cn1_cn_gpu", "cn2_cn_gpu", -# else - "", "", "", "", -# endif - "cn1_v2_rwz", "cn1_v2_zls", "cn1_v2_double", +#ifdef XMRIG_ALGO_RANDOMX + if (config->algorithm().algo() == xmrig::RANDOM_X) { + const char* KernelNamesRX[] = { + "fillAes1Rx4_scratchpad", "fillAes4Rx4_entropy", "hashAes1Rx4", + "blake2b_initial_hash", "blake2b_hash_registers_32", "blake2b_hash_registers_64", + "init_vm", "execute_vm", "find_shares", + nullptr + }; + + for (int i = 0; KernelNamesRX[i]; ++i) { + if (!KernelNamesRX[i][0]) { + continue; + } + ctx->rx_kernels[i] = OclLib::createKernel(ctx->Program, KernelNamesRX[i], &ret); + if (ret != CL_SUCCESS) { + return OCL_ERR_API; + } + } - nullptr - }; - for (int i = 0; KernelNames[i]; ++i) { - if (!KernelNames[i][0]) { - continue; + // fillAes1Rx4_scratchpad + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[0], 0, sizeof(cl_mem), &ctx->rx_hashes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 0, 0); + return OCL_ERR_API; } - ctx->Kernels[i] = OclLib::createKernel(ctx->Program, KernelNames[i], &ret); - if (ret != CL_SUCCESS) { + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[0], 1, sizeof(cl_mem), &ctx->rx_scratchpads)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 0, 1); + return OCL_ERR_API; + } + + const uint32_t batch_size = g_thd; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[0], 2, sizeof(uint32_t), &batch_size)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 0, 2); + return OCL_ERR_API; + } + + const uint32_t rx_version = (config->algorithm().variant() == xmrig::VARIANT_RX_WOW) ? 103 : 104; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[0], 3, sizeof(uint32_t), &rx_version)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 0, 3); + return OCL_ERR_API; + } + + // fillAes4Rx4_entropy + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[1], 0, sizeof(cl_mem), &ctx->rx_hashes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 1, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[1], 1, sizeof(cl_mem), &ctx->rx_entropy)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 1, 1); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[1], 2, sizeof(uint32_t), &batch_size)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 1, 2); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[1], 3, sizeof(uint32_t), &rx_version)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 1, 3); + return OCL_ERR_API; + } + + // hashAes1Rx4 + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[2], 0, sizeof(cl_mem), &ctx->rx_scratchpads)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 2, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[2], 1, sizeof(cl_mem), &ctx->rx_vm_states)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 2, 1); + return OCL_ERR_API; + } + + const uint32_t hashOffsetBytes = 192; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[2], 2, sizeof(uint32_t), &hashOffsetBytes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 2, 2); + return OCL_ERR_API; + } + + const uint32_t hashStrideBytes = (config->algorithm().variant() == xmrig::VARIANT_RX_LOKI) ? RandomX_LokiConfig.ProgramSize * 8 : RandomX_MoneroConfig.ProgramSize * 8; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[2], 3, sizeof(uint32_t), &hashStrideBytes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 2, 3); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[2], 4, sizeof(uint32_t), &batch_size)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 2, 4); + return OCL_ERR_API; + } + + // blake2b_initial_hash + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[3], 0, sizeof(cl_mem), &ctx->rx_hashes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 3, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[3], 1, sizeof(cl_mem), &ctx->InputBuffer)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 3, 1); + return OCL_ERR_API; + } + + // blockTemplateSize is set in RXSetJob() + // start_nonce is set in RXRunJob() + + // blake2b_hash_registers_32 + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[4], 0, sizeof(cl_mem), &ctx->rx_hashes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 4, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[4], 1, sizeof(cl_mem), &ctx->rx_vm_states)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 4, 1); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[4], 2, sizeof(uint32_t), &hashStrideBytes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 4, 2); + return OCL_ERR_API; + } + + // blake2b_hash_registers_64 + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[5], 0, sizeof(cl_mem), &ctx->rx_hashes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 5, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[5], 1, sizeof(cl_mem), &ctx->rx_vm_states)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 5, 1); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[5], 2, sizeof(uint32_t), &hashStrideBytes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 5, 2); + return OCL_ERR_API; + } + + // init_vm + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[6], 0, sizeof(cl_mem), &ctx->rx_entropy)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 6, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[6], 1, sizeof(cl_mem), &ctx->rx_vm_states)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 6, 1); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[6], 2, sizeof(cl_mem), &ctx->rx_rounding)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 6, 2); + return OCL_ERR_API; + } + + // iteration is set in RXRunJob() + + // execute_vm + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 0, sizeof(cl_mem), &ctx->rx_vm_states)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 0); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 1, sizeof(cl_mem), &ctx->rx_rounding)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 1); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 2, sizeof(cl_mem), &ctx->rx_scratchpads)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 2); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 3, sizeof(cl_mem), &ctx->rx_dataset)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 3); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 4, sizeof(uint32_t), &batch_size)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 4); + return OCL_ERR_API; + } + + // num_iterations, first, last are set in RXRunJob() + + // find_shares + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[8], 0, sizeof(cl_mem), &ctx->rx_hashes)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 8, 0); + return OCL_ERR_API; + } + + // target is set in RXSetJob() + // start_nonce is set in RXRunJob() + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[8], 3, sizeof(cl_mem), &ctx->OutputBuffer)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 8, 3); return OCL_ERR_API; } } + else +#endif + { + const char *KernelNames[] = { + "cn0", "cn1", "cn2", + "Blake", "Groestl", "JH", "Skein", + "cn1_monero", "cn1_msr", "cn1_xao", "cn1_tube", "cn1_v2_monero", "cn1_v2_half", +# ifndef XMRIG_NO_CN_GPU + "cn0_cn_gpu", "cn00_cn_gpu", "cn1_cn_gpu", "cn2_cn_gpu", +# else + "", "", "", "", +# endif + "cn1_v2_rwz", "cn1_v2_zls", "cn1_v2_double", + nullptr + }; + + for (int i = 0; KernelNames[i]; ++i) { + if (!KernelNames[i][0]) { + continue; + } + ctx->Kernels[i] = OclLib::createKernel(ctx->Program, KernelNames[i], &ret); + if (ret != CL_SUCCESS) { + return OCL_ERR_API; + } + } + } ctx->Nonce = 0; return 0; @@ -490,47 +734,92 @@ size_t InitOpenCL(const std::vector &contexts, xmrig::Config *conf return OCL_ERR_API; } - const char *cryptonightCL = - #include "./opencl/cryptonight.cl" - ; - const char *cryptonightCL2 = - #include "./opencl/cryptonight2.cl" - ; - const char *blake256CL = - #include "./opencl/blake256.cl" - ; - const char *groestl256CL = - #include "./opencl/groestl256.cl" - ; - const char *jhCL = - #include "./opencl/jh.cl" - ; - const char *wolfAesCL = - #include "./opencl/wolf-aes.cl" - ; - const char *wolfSkeinCL = - #include "./opencl/wolf-skein.cl" - ; - const char *fastIntMathV2CL = - #include "./opencl/fast_int_math_v2.cl" - ; - const char *fastDivHeavyCL = - #include "./opencl/fast_div_heavy.cl" - ; - const char *cryptonight_gpu = - #include "./opencl/cryptonight_gpu.cl" - ; - - std::string source_code(cryptonightCL); - source_code.append(cryptonightCL2); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_JH"), jhCL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); - source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_CN_GPU"), cryptonight_gpu); + std::string source_code; + +#ifdef XMRIG_ALGO_RANDOMX + if (config->algorithm().algo() == xmrig::RANDOM_X) { + const char* randomx_constants_wow_h = + #include "./opencl/RandomX/randomx_constants_wow.h" + ; + const char* randomx_constants_loki_h = + #include "./opencl/RandomX/randomx_constants_loki.h" + ; + const char* aesCL = + #include "./opencl/RandomX/aes.cl" + ; + const char* fillAes1Rx4CL = + #include "./opencl/RandomX/fillAes1Rx4.cl" + ; + const char* blake2bCL = + #include "./opencl/RandomX/blake2b.cl" + ; + const char* blake2b_double_blockCL = + #include "./opencl/RandomX/blake2b_double_block.cl" + ; + const char* randomx_vmCL = + #include "./opencl/RandomX/randomx_vm.cl" + ; + + switch (config->algorithm().variant()) + { + case xmrig::VARIANT_RX_WOW: + source_code.append(randomx_constants_wow_h); + break; + + case xmrig::VARIANT_RX_LOKI: + source_code.append(randomx_constants_loki_h); + break; + } + + source_code.append(std::regex_replace(aesCL, std::regex("#include \"fillAes1Rx4.cl\""), fillAes1Rx4CL)); + source_code.append(std::regex_replace(blake2bCL, std::regex("#include \"blake2b_double_block.cl\""), blake2b_double_blockCL)); + source_code.append(randomx_vmCL); + } + else +#endif + { + const char *cryptonightCL = + #include "./opencl/cryptonight.cl" + ; + const char *cryptonightCL2 = + #include "./opencl/cryptonight2.cl" + ; + const char *blake256CL = + #include "./opencl/blake256.cl" + ; + const char *groestl256CL = + #include "./opencl/groestl256.cl" + ; + const char *jhCL = + #include "./opencl/jh.cl" + ; + const char *wolfAesCL = + #include "./opencl/wolf-aes.cl" + ; + const char *wolfSkeinCL = + #include "./opencl/wolf-skein.cl" + ; + const char *fastIntMathV2CL = + #include "./opencl/fast_int_math_v2.cl" + ; + const char *fastDivHeavyCL = + #include "./opencl/fast_div_heavy.cl" + ; + const char *cryptonight_gpu = + #include "./opencl/cryptonight_gpu.cl" + ; + + source_code.append(cryptonightCL); + source_code.append(cryptonightCL2); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_AES"), wolfAesCL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_WOLF_SKEIN"), wolfSkeinCL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_JH"), jhCL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_BLAKE256"), blake256CL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_GROESTL256"), groestl256CL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL); + source_code = std::regex_replace(source_code, std::regex("XMRIG_INCLUDE_CN_GPU"), cryptonight_gpu); + } for (size_t i = 0; i < num_gpus; ++i) { if (contexts[i]->stridedIndex == 2 && (contexts[i]->rawIntensity % contexts[i]->workSize) != 0) { @@ -866,6 +1155,198 @@ size_t XMRRunJob(GpuContext *ctx, cl_uint *HashOutput, xmrig::Variant variant) return OCL_ERR_SUCCESS; } +#ifdef XMRIG_ALGO_RANDOMX +size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, const uint8_t* seed_hash, xmrig::Variant variant) +{ + cl_int ret; + randomx_dataset* dataset = Workers::getDataset(seed_hash, variant); + const size_t dataset_size = randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE; + + if ((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0) || (ctx->rx_variant != variant)) { + memcpy(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)); + ctx->rx_variant = variant; + if ((ret = OclLib::enqueueWriteBuffer(ctx->CommandQueues, ctx->rx_dataset, CL_TRUE, 0, dataset_size, randomx_get_dataset_memory(dataset), 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueWriteBuffer to fill RandomX dataset.", err_to_str(ret)); + return OCL_ERR_API; + } + } + + if (input_len < 128) { + memset(input + input_len, 0, 128 - input_len); + } + + if ((ret = OclLib::enqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); + return OCL_ERR_API; + } + + const uint32_t blockTemplateSize = static_cast(input_len); + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[3], 2, sizeof(uint32_t), &blockTemplateSize)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 3, 2); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[8], 1, sizeof(uint64_t), &target)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 8, 1); + return OCL_ERR_API; + } + + return OCL_ERR_SUCCESS; +} + +size_t RXRunJob(GpuContext *ctx, cl_uint *HashOutput, xmrig::Variant variant) +{ + const uint32_t g_intensity = static_cast(ctx->rawIntensity); + + cl_int ret; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[3], 3, sizeof(uint32_t), &ctx->Nonce)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 3, 3); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[8], 2, sizeof(uint32_t), &ctx->Nonce)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 8, 2); + return OCL_ERR_API; + } + + size_t globalWorkSize = g_intensity; + size_t globalWorkSize4 = g_intensity * 4; + size_t globalWorkSize8 = g_intensity * 8; + size_t globalWorkSize16 = g_intensity * 16; + size_t localWorkSize = 64; + size_t localWorkSize32 = 32; + size_t localWorkSize16 = 16; + + uint32_t zero = 0; + if ((ret = OclLib::enqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(uint32_t), &zero, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); + return OCL_ERR_API; + } + + // blake2b_initial_hash + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[3], 1, nullptr, &globalWorkSize, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 3); + return OCL_ERR_API; + } + + // fillAes1Rx4_scratchpad + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[0], 1, nullptr, &globalWorkSize4, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 0); + return OCL_ERR_API; + } + + uint32_t bfactor = static_cast(ctx->bfactor); + if (bfactor > 8) { + bfactor = 8; + } + + for (uint32_t i = 0; i < RandomX_CurrentConfig.ProgramCount; ++i) { + // fillAes4Rx4_entropy + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[1], 1, nullptr, &globalWorkSize4, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[6], 3, sizeof(uint32_t), &i)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 6, 3); + return OCL_ERR_API; + } + + // init_vm + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[6], 1, nullptr, &globalWorkSize8, &localWorkSize32, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 6); + return OCL_ERR_API; + } + + // execute_vm + uint32_t num_iterations = RandomX_CurrentConfig.ProgramIterations >> bfactor; + uint32_t first = 1; + uint32_t last = 0; + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 5, sizeof(uint32_t), &num_iterations)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 5); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 6, sizeof(uint32_t), &first)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 6); + return OCL_ERR_API; + } + + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 7, sizeof(uint32_t), &last)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 7); + return OCL_ERR_API; + } + + for (int j = 0, n = 1 << bfactor; j < n; ++j) { + if (j == n - 1) { + last = 1; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 7, sizeof(uint32_t), &last)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 7); + return OCL_ERR_API; + } + } + + // execute_vm + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[7], 1, nullptr, (ctx->workSize == 16) ? &globalWorkSize16 : &globalWorkSize8, (ctx->workSize == 16) ? &localWorkSize32 : &localWorkSize16, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 7); + return OCL_ERR_API; + } + + if (j == 0) { + first = 0; + if ((ret = OclLib::setKernelArg(ctx->rx_kernels[7], 6, sizeof(uint32_t), &first)) != CL_SUCCESS) { + LOG_ERR(kSetKernelArgErr, err_to_str(ret), 7, 6); + return OCL_ERR_API; + } + } + } + + if (i == RandomX_CurrentConfig.ProgramCount - 1) { + // hashAes1Rx4 + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[2], 1, nullptr, &globalWorkSize4, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 2); + return OCL_ERR_API; + } + + // blake2b_hash_registers_32 + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[4], 1, nullptr, &globalWorkSize, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 4); + return OCL_ERR_API; + } + } else { + // blake2b_hash_registers_64 + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[5], 1, nullptr, &globalWorkSize, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 5); + return OCL_ERR_API; + } + } + } + + if ((ret = OclLib::enqueueNDRangeKernel(ctx->CommandQueues, ctx->rx_kernels[8], 1, nullptr, &globalWorkSize, &localWorkSize, 0, nullptr, nullptr)) != CL_SUCCESS) { + LOG_ERR("Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 8); + return OCL_ERR_API; + } + + if (OclLib::enqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, nullptr, nullptr) != CL_SUCCESS) { + return OCL_ERR_API; + } + + OclLib::finish(ctx->CommandQueues); + + cl_uint& numHashValues = HashOutput[0xFF]; + + // avoid out of memory read, we have only storage for 0xFF results + if (numHashValues > 0xFF) { + numHashValues = 0xFF; + } + + ctx->Nonce += (uint32_t) g_intensity; + + return OCL_ERR_SUCCESS; +} +#endif + void ReleaseOpenCl(GpuContext* ctx) { @@ -874,7 +1355,9 @@ void ReleaseOpenCl(GpuContext* ctx) int buffer_count = sizeof(ctx->ExtraBuffers) / sizeof(ctx->ExtraBuffers[0]); for (int b = 0; b < buffer_count; ++b) { - OclLib::releaseMemObject(ctx->ExtraBuffers[b]); + if (ctx->ExtraBuffers[b]) { + OclLib::releaseMemObject(ctx->ExtraBuffers[b]); + } } OclLib::releaseProgram(ctx->Program); @@ -886,6 +1369,20 @@ void ReleaseOpenCl(GpuContext* ctx) } } + if (ctx->rx_dataset) OclLib::releaseMemObject(ctx->rx_dataset); + if (ctx->rx_scratchpads) OclLib::releaseMemObject(ctx->rx_scratchpads); + if (ctx->rx_hashes) OclLib::releaseMemObject(ctx->rx_hashes); + if (ctx->rx_entropy) OclLib::releaseMemObject(ctx->rx_entropy); + if (ctx->rx_vm_states) OclLib::releaseMemObject(ctx->rx_vm_states); + if (ctx->rx_rounding) OclLib::releaseMemObject(ctx->rx_rounding); + + kernel_count = sizeof(ctx->rx_kernels) / sizeof(ctx->rx_kernels[0]); + for (int k = 0; k < kernel_count; ++k) { + if (ctx->rx_kernels[k]) { + OclLib::releaseKernel(ctx->rx_kernels[k]); + } + } + OclLib::releaseCommandQueue(ctx->CommandQueues); } diff --git a/src/amd/OclGPU.h b/src/amd/OclGPU.h index 2bfc7b47..f9df90e1 100644 --- a/src/amd/OclGPU.h +++ b/src/amd/OclGPU.h @@ -56,6 +56,12 @@ void printPlatforms(); size_t InitOpenCL(const std::vector &contexts, xmrig::Config *config, cl_context *opencl_ctx); size_t XMRSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, xmrig::Variant variant, uint64_t height); size_t XMRRunJob(GpuContext *ctx, cl_uint *HashOutput, xmrig::Variant variant); + +#ifdef XMRIG_ALGO_RANDOMX +size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, const uint8_t* seed_hash, xmrig::Variant variant); +size_t RXRunJob(GpuContext *ctx, cl_uint *HashOutput, xmrig::Variant variant); +#endif + void ReleaseOpenCl(GpuContext* ctx); void ReleaseOpenClContext(cl_context opencl_ctx); #endif /* XMRIG_OCLGPU_H */ diff --git a/src/amd/opencl/RandomX/aes.cl b/src/amd/opencl/RandomX/aes.cl new file mode 100644 index 00000000..36ee37b4 --- /dev/null +++ b/src/amd/opencl/RandomX/aes.cl @@ -0,0 +1,647 @@ +R"===( +/* +Copyright (c) 2019 SChernykh + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +__constant static const uint AES_TABLE[2048] = +{ + 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, + 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, + 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, + 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, + 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, + 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, + 0xecadad41U, 0x67d4d4b3U, 0xfda2a25fU, 0xeaafaf45U, + 0xbf9c9c23U, 0xf7a4a453U, 0x967272e4U, 0x5bc0c09bU, + 0xc2b7b775U, 0x1cfdfde1U, 0xae93933dU, 0x6a26264cU, + 0x5a36366cU, 0x413f3f7eU, 0x02f7f7f5U, 0x4fcccc83U, + 0x5c343468U, 0xf4a5a551U, 0x34e5e5d1U, 0x08f1f1f9U, + 0x937171e2U, 0x73d8d8abU, 0x53313162U, 0x3f15152aU, + 0x0c040408U, 0x52c7c795U, 0x65232346U, 0x5ec3c39dU, + 0x28181830U, 0xa1969637U, 0x0f05050aU, 0xb59a9a2fU, + 0x0907070eU, 0x36121224U, 0x9b80801bU, 0x3de2e2dfU, + 0x26ebebcdU, 0x6927274eU, 0xcdb2b27fU, 0x9f7575eaU, + 0x1b090912U, 0x9e83831dU, 0x742c2c58U, 0x2e1a1a34U, + 0x2d1b1b36U, 0xb26e6edcU, 0xee5a5ab4U, 0xfba0a05bU, + 0xf65252a4U, 0x4d3b3b76U, 0x61d6d6b7U, 0xceb3b37dU, + 0x7b292952U, 0x3ee3e3ddU, 0x712f2f5eU, 0x97848413U, + 0xf55353a6U, 0x68d1d1b9U, 0x00000000U, 0x2cededc1U, + 0x60202040U, 0x1ffcfce3U, 0xc8b1b179U, 0xed5b5bb6U, + 0xbe6a6ad4U, 0x46cbcb8dU, 0xd9bebe67U, 0x4b393972U, + 0xde4a4a94U, 0xd44c4c98U, 0xe85858b0U, 0x4acfcf85U, + 0x6bd0d0bbU, 0x2aefefc5U, 0xe5aaaa4fU, 0x16fbfbedU, + 0xc5434386U, 0xd74d4d9aU, 0x55333366U, 0x94858511U, + 0xcf45458aU, 0x10f9f9e9U, 0x06020204U, 0x817f7ffeU, + 0xf05050a0U, 0x443c3c78U, 0xba9f9f25U, 0xe3a8a84bU, + 0xf35151a2U, 0xfea3a35dU, 0xc0404080U, 0x8a8f8f05U, + 0xad92923fU, 0xbc9d9d21U, 0x48383870U, 0x04f5f5f1U, + 0xdfbcbc63U, 0xc1b6b677U, 0x75dadaafU, 0x63212142U, + 0x30101020U, 0x1affffe5U, 0x0ef3f3fdU, 0x6dd2d2bfU, + 0x4ccdcd81U, 0x140c0c18U, 0x35131326U, 0x2fececc3U, + 0xe15f5fbeU, 0xa2979735U, 0xcc444488U, 0x3917172eU, + 0x57c4c493U, 0xf2a7a755U, 0x827e7efcU, 0x473d3d7aU, + 0xac6464c8U, 0xe75d5dbaU, 0x2b191932U, 0x957373e6U, + 0xa06060c0U, 0x98818119U, 0xd14f4f9eU, 0x7fdcdca3U, + 0x66222244U, 0x7e2a2a54U, 0xab90903bU, 0x8388880bU, + 0xca46468cU, 0x29eeeec7U, 0xd3b8b86bU, 0x3c141428U, + 0x79dedea7U, 0xe25e5ebcU, 0x1d0b0b16U, 0x76dbdbadU, + 0x3be0e0dbU, 0x56323264U, 0x4e3a3a74U, 0x1e0a0a14U, + 0xdb494992U, 0x0a06060cU, 0x6c242448U, 0xe45c5cb8U, + 0x5dc2c29fU, 0x6ed3d3bdU, 0xefacac43U, 0xa66262c4U, + 0xa8919139U, 0xa4959531U, 0x37e4e4d3U, 0x8b7979f2U, + 0x32e7e7d5U, 0x43c8c88bU, 0x5937376eU, 0xb76d6ddaU, + 0x8c8d8d01U, 0x64d5d5b1U, 0xd24e4e9cU, 0xe0a9a949U, + 0xb46c6cd8U, 0xfa5656acU, 0x07f4f4f3U, 0x25eaeacfU, + 0xaf6565caU, 0x8e7a7af4U, 0xe9aeae47U, 0x18080810U, + 0xd5baba6fU, 0x887878f0U, 0x6f25254aU, 0x722e2e5cU, + 0x241c1c38U, 0xf1a6a657U, 0xc7b4b473U, 0x51c6c697U, + 0x23e8e8cbU, 0x7cdddda1U, 0x9c7474e8U, 0x211f1f3eU, + 0xdd4b4b96U, 0xdcbdbd61U, 0x868b8b0dU, 0x858a8a0fU, + 0x907070e0U, 0x423e3e7cU, 0xc4b5b571U, 0xaa6666ccU, + 0xd8484890U, 0x05030306U, 0x01f6f6f7U, 0x120e0e1cU, + 0xa36161c2U, 0x5f35356aU, 0xf95757aeU, 0xd0b9b969U, + 0x91868617U, 0x58c1c199U, 0x271d1d3aU, 0xb99e9e27U, + 0x38e1e1d9U, 0x13f8f8ebU, 0xb398982bU, 0x33111122U, + 0xbb6969d2U, 0x70d9d9a9U, 0x898e8e07U, 0xa7949433U, + 0xb69b9b2dU, 0x221e1e3cU, 0x92878715U, 0x20e9e9c9U, + 0x49cece87U, 0xff5555aaU, 0x78282850U, 0x7adfdfa5U, + 0x8f8c8c03U, 0xf8a1a159U, 0x80898909U, 0x170d0d1aU, + 0xdabfbf65U, 0x31e6e6d7U, 0xc6424284U, 0xb86868d0U, + 0xc3414182U, 0xb0999929U, 0x772d2d5aU, 0x110f0f1eU, + 0xcbb0b07bU, 0xfc5454a8U, 0xd6bbbb6dU, 0x3a16162cU, + 0x6363c6a5U, 0x7c7cf884U, 0x7777ee99U, 0x7b7bf68dU, + 0xf2f2ff0dU, 0x6b6bd6bdU, 0x6f6fdeb1U, 0xc5c59154U, + 0x30306050U, 0x01010203U, 0x6767cea9U, 0x2b2b567dU, + 0xfefee719U, 0xd7d7b562U, 0xabab4de6U, 0x7676ec9aU, + 0xcaca8f45U, 0x82821f9dU, 0xc9c98940U, 0x7d7dfa87U, + 0xfafaef15U, 0x5959b2ebU, 0x47478ec9U, 0xf0f0fb0bU, + 0xadad41ecU, 0xd4d4b367U, 0xa2a25ffdU, 0xafaf45eaU, + 0x9c9c23bfU, 0xa4a453f7U, 0x7272e496U, 0xc0c09b5bU, + 0xb7b775c2U, 0xfdfde11cU, 0x93933daeU, 0x26264c6aU, + 0x36366c5aU, 0x3f3f7e41U, 0xf7f7f502U, 0xcccc834fU, + 0x3434685cU, 0xa5a551f4U, 0xe5e5d134U, 0xf1f1f908U, + 0x7171e293U, 0xd8d8ab73U, 0x31316253U, 0x15152a3fU, + 0x0404080cU, 0xc7c79552U, 0x23234665U, 0xc3c39d5eU, + 0x18183028U, 0x969637a1U, 0x05050a0fU, 0x9a9a2fb5U, + 0x07070e09U, 0x12122436U, 0x80801b9bU, 0xe2e2df3dU, + 0xebebcd26U, 0x27274e69U, 0xb2b27fcdU, 0x7575ea9fU, + 0x0909121bU, 0x83831d9eU, 0x2c2c5874U, 0x1a1a342eU, + 0x1b1b362dU, 0x6e6edcb2U, 0x5a5ab4eeU, 0xa0a05bfbU, + 0x5252a4f6U, 0x3b3b764dU, 0xd6d6b761U, 0xb3b37dceU, + 0x2929527bU, 0xe3e3dd3eU, 0x2f2f5e71U, 0x84841397U, + 0x5353a6f5U, 0xd1d1b968U, 0x00000000U, 0xededc12cU, + 0x20204060U, 0xfcfce31fU, 0xb1b179c8U, 0x5b5bb6edU, + 0x6a6ad4beU, 0xcbcb8d46U, 0xbebe67d9U, 0x3939724bU, + 0x4a4a94deU, 0x4c4c98d4U, 0x5858b0e8U, 0xcfcf854aU, + 0xd0d0bb6bU, 0xefefc52aU, 0xaaaa4fe5U, 0xfbfbed16U, + 0x434386c5U, 0x4d4d9ad7U, 0x33336655U, 0x85851194U, + 0x45458acfU, 0xf9f9e910U, 0x02020406U, 0x7f7ffe81U, + 0x5050a0f0U, 0x3c3c7844U, 0x9f9f25baU, 0xa8a84be3U, + 0x5151a2f3U, 0xa3a35dfeU, 0x404080c0U, 0x8f8f058aU, + 0x92923fadU, 0x9d9d21bcU, 0x38387048U, 0xf5f5f104U, + 0xbcbc63dfU, 0xb6b677c1U, 0xdadaaf75U, 0x21214263U, + 0x10102030U, 0xffffe51aU, 0xf3f3fd0eU, 0xd2d2bf6dU, + 0xcdcd814cU, 0x0c0c1814U, 0x13132635U, 0xececc32fU, + 0x5f5fbee1U, 0x979735a2U, 0x444488ccU, 0x17172e39U, + 0xc4c49357U, 0xa7a755f2U, 0x7e7efc82U, 0x3d3d7a47U, + 0x6464c8acU, 0x5d5dbae7U, 0x1919322bU, 0x7373e695U, + 0x6060c0a0U, 0x81811998U, 0x4f4f9ed1U, 0xdcdca37fU, + 0x22224466U, 0x2a2a547eU, 0x90903babU, 0x88880b83U, + 0x46468ccaU, 0xeeeec729U, 0xb8b86bd3U, 0x1414283cU, + 0xdedea779U, 0x5e5ebce2U, 0x0b0b161dU, 0xdbdbad76U, + 0xe0e0db3bU, 0x32326456U, 0x3a3a744eU, 0x0a0a141eU, + 0x494992dbU, 0x06060c0aU, 0x2424486cU, 0x5c5cb8e4U, + 0xc2c29f5dU, 0xd3d3bd6eU, 0xacac43efU, 0x6262c4a6U, + 0x919139a8U, 0x959531a4U, 0xe4e4d337U, 0x7979f28bU, + 0xe7e7d532U, 0xc8c88b43U, 0x37376e59U, 0x6d6ddab7U, + 0x8d8d018cU, 0xd5d5b164U, 0x4e4e9cd2U, 0xa9a949e0U, + 0x6c6cd8b4U, 0x5656acfaU, 0xf4f4f307U, 0xeaeacf25U, + 0x6565caafU, 0x7a7af48eU, 0xaeae47e9U, 0x08081018U, + 0xbaba6fd5U, 0x7878f088U, 0x25254a6fU, 0x2e2e5c72U, + 0x1c1c3824U, 0xa6a657f1U, 0xb4b473c7U, 0xc6c69751U, + 0xe8e8cb23U, 0xdddda17cU, 0x7474e89cU, 0x1f1f3e21U, + 0x4b4b96ddU, 0xbdbd61dcU, 0x8b8b0d86U, 0x8a8a0f85U, + 0x7070e090U, 0x3e3e7c42U, 0xb5b571c4U, 0x6666ccaaU, + 0x484890d8U, 0x03030605U, 0xf6f6f701U, 0x0e0e1c12U, + 0x6161c2a3U, 0x35356a5fU, 0x5757aef9U, 0xb9b969d0U, + 0x86861791U, 0xc1c19958U, 0x1d1d3a27U, 0x9e9e27b9U, + 0xe1e1d938U, 0xf8f8eb13U, 0x98982bb3U, 0x11112233U, + 0x6969d2bbU, 0xd9d9a970U, 0x8e8e0789U, 0x949433a7U, + 0x9b9b2db6U, 0x1e1e3c22U, 0x87871592U, 0xe9e9c920U, + 0xcece8749U, 0x5555aaffU, 0x28285078U, 0xdfdfa57aU, + 0x8c8c038fU, 0xa1a159f8U, 0x89890980U, 0x0d0d1a17U, + 0xbfbf65daU, 0xe6e6d731U, 0x424284c6U, 0x6868d0b8U, + 0x414182c3U, 0x999929b0U, 0x2d2d5a77U, 0x0f0f1e11U, + 0xb0b07bcbU, 0x5454a8fcU, 0xbbbb6dd6U, 0x16162c3aU, + 0x63c6a563U, 0x7cf8847cU, 0x77ee9977U, 0x7bf68d7bU, + 0xf2ff0df2U, 0x6bd6bd6bU, 0x6fdeb16fU, 0xc59154c5U, + 0x30605030U, 0x01020301U, 0x67cea967U, 0x2b567d2bU, + 0xfee719feU, 0xd7b562d7U, 0xab4de6abU, 0x76ec9a76U, + 0xca8f45caU, 0x821f9d82U, 0xc98940c9U, 0x7dfa877dU, + 0xfaef15faU, 0x59b2eb59U, 0x478ec947U, 0xf0fb0bf0U, + 0xad41ecadU, 0xd4b367d4U, 0xa25ffda2U, 0xaf45eaafU, + 0x9c23bf9cU, 0xa453f7a4U, 0x72e49672U, 0xc09b5bc0U, + 0xb775c2b7U, 0xfde11cfdU, 0x933dae93U, 0x264c6a26U, + 0x366c5a36U, 0x3f7e413fU, 0xf7f502f7U, 0xcc834fccU, + 0x34685c34U, 0xa551f4a5U, 0xe5d134e5U, 0xf1f908f1U, + 0x71e29371U, 0xd8ab73d8U, 0x31625331U, 0x152a3f15U, + 0x04080c04U, 0xc79552c7U, 0x23466523U, 0xc39d5ec3U, + 0x18302818U, 0x9637a196U, 0x050a0f05U, 0x9a2fb59aU, + 0x070e0907U, 0x12243612U, 0x801b9b80U, 0xe2df3de2U, + 0xebcd26ebU, 0x274e6927U, 0xb27fcdb2U, 0x75ea9f75U, + 0x09121b09U, 0x831d9e83U, 0x2c58742cU, 0x1a342e1aU, + 0x1b362d1bU, 0x6edcb26eU, 0x5ab4ee5aU, 0xa05bfba0U, + 0x52a4f652U, 0x3b764d3bU, 0xd6b761d6U, 0xb37dceb3U, + 0x29527b29U, 0xe3dd3ee3U, 0x2f5e712fU, 0x84139784U, + 0x53a6f553U, 0xd1b968d1U, 0x00000000U, 0xedc12cedU, + 0x20406020U, 0xfce31ffcU, 0xb179c8b1U, 0x5bb6ed5bU, + 0x6ad4be6aU, 0xcb8d46cbU, 0xbe67d9beU, 0x39724b39U, + 0x4a94de4aU, 0x4c98d44cU, 0x58b0e858U, 0xcf854acfU, + 0xd0bb6bd0U, 0xefc52aefU, 0xaa4fe5aaU, 0xfbed16fbU, + 0x4386c543U, 0x4d9ad74dU, 0x33665533U, 0x85119485U, + 0x458acf45U, 0xf9e910f9U, 0x02040602U, 0x7ffe817fU, + 0x50a0f050U, 0x3c78443cU, 0x9f25ba9fU, 0xa84be3a8U, + 0x51a2f351U, 0xa35dfea3U, 0x4080c040U, 0x8f058a8fU, + 0x923fad92U, 0x9d21bc9dU, 0x38704838U, 0xf5f104f5U, + 0xbc63dfbcU, 0xb677c1b6U, 0xdaaf75daU, 0x21426321U, + 0x10203010U, 0xffe51affU, 0xf3fd0ef3U, 0xd2bf6dd2U, + 0xcd814ccdU, 0x0c18140cU, 0x13263513U, 0xecc32fecU, + 0x5fbee15fU, 0x9735a297U, 0x4488cc44U, 0x172e3917U, + 0xc49357c4U, 0xa755f2a7U, 0x7efc827eU, 0x3d7a473dU, + 0x64c8ac64U, 0x5dbae75dU, 0x19322b19U, 0x73e69573U, + 0x60c0a060U, 0x81199881U, 0x4f9ed14fU, 0xdca37fdcU, + 0x22446622U, 0x2a547e2aU, 0x903bab90U, 0x880b8388U, + 0x468cca46U, 0xeec729eeU, 0xb86bd3b8U, 0x14283c14U, + 0xdea779deU, 0x5ebce25eU, 0x0b161d0bU, 0xdbad76dbU, + 0xe0db3be0U, 0x32645632U, 0x3a744e3aU, 0x0a141e0aU, + 0x4992db49U, 0x060c0a06U, 0x24486c24U, 0x5cb8e45cU, + 0xc29f5dc2U, 0xd3bd6ed3U, 0xac43efacU, 0x62c4a662U, + 0x9139a891U, 0x9531a495U, 0xe4d337e4U, 0x79f28b79U, + 0xe7d532e7U, 0xc88b43c8U, 0x376e5937U, 0x6ddab76dU, + 0x8d018c8dU, 0xd5b164d5U, 0x4e9cd24eU, 0xa949e0a9U, + 0x6cd8b46cU, 0x56acfa56U, 0xf4f307f4U, 0xeacf25eaU, + 0x65caaf65U, 0x7af48e7aU, 0xae47e9aeU, 0x08101808U, + 0xba6fd5baU, 0x78f08878U, 0x254a6f25U, 0x2e5c722eU, + 0x1c38241cU, 0xa657f1a6U, 0xb473c7b4U, 0xc69751c6U, + 0xe8cb23e8U, 0xdda17cddU, 0x74e89c74U, 0x1f3e211fU, + 0x4b96dd4bU, 0xbd61dcbdU, 0x8b0d868bU, 0x8a0f858aU, + 0x70e09070U, 0x3e7c423eU, 0xb571c4b5U, 0x66ccaa66U, + 0x4890d848U, 0x03060503U, 0xf6f701f6U, 0x0e1c120eU, + 0x61c2a361U, 0x356a5f35U, 0x57aef957U, 0xb969d0b9U, + 0x86179186U, 0xc19958c1U, 0x1d3a271dU, 0x9e27b99eU, + 0xe1d938e1U, 0xf8eb13f8U, 0x982bb398U, 0x11223311U, + 0x69d2bb69U, 0xd9a970d9U, 0x8e07898eU, 0x9433a794U, + 0x9b2db69bU, 0x1e3c221eU, 0x87159287U, 0xe9c920e9U, + 0xce8749ceU, 0x55aaff55U, 0x28507828U, 0xdfa57adfU, + 0x8c038f8cU, 0xa159f8a1U, 0x89098089U, 0x0d1a170dU, + 0xbf65dabfU, 0xe6d731e6U, 0x4284c642U, 0x68d0b868U, + 0x4182c341U, 0x9929b099U, 0x2d5a772dU, 0x0f1e110fU, + 0xb07bcbb0U, 0x54a8fc54U, 0xbb6dd6bbU, 0x162c3a16U, + 0xc6a56363U, 0xf8847c7cU, 0xee997777U, 0xf68d7b7bU, + 0xff0df2f2U, 0xd6bd6b6bU, 0xdeb16f6fU, 0x9154c5c5U, + 0x60503030U, 0x02030101U, 0xcea96767U, 0x567d2b2bU, + 0xe719fefeU, 0xb562d7d7U, 0x4de6ababU, 0xec9a7676U, + 0x8f45cacaU, 0x1f9d8282U, 0x8940c9c9U, 0xfa877d7dU, + 0xef15fafaU, 0xb2eb5959U, 0x8ec94747U, 0xfb0bf0f0U, + 0x41ecadadU, 0xb367d4d4U, 0x5ffda2a2U, 0x45eaafafU, + 0x23bf9c9cU, 0x53f7a4a4U, 0xe4967272U, 0x9b5bc0c0U, + 0x75c2b7b7U, 0xe11cfdfdU, 0x3dae9393U, 0x4c6a2626U, + 0x6c5a3636U, 0x7e413f3fU, 0xf502f7f7U, 0x834fccccU, + 0x685c3434U, 0x51f4a5a5U, 0xd134e5e5U, 0xf908f1f1U, + 0xe2937171U, 0xab73d8d8U, 0x62533131U, 0x2a3f1515U, + 0x080c0404U, 0x9552c7c7U, 0x46652323U, 0x9d5ec3c3U, + 0x30281818U, 0x37a19696U, 0x0a0f0505U, 0x2fb59a9aU, + 0x0e090707U, 0x24361212U, 0x1b9b8080U, 0xdf3de2e2U, + 0xcd26ebebU, 0x4e692727U, 0x7fcdb2b2U, 0xea9f7575U, + 0x121b0909U, 0x1d9e8383U, 0x58742c2cU, 0x342e1a1aU, + 0x362d1b1bU, 0xdcb26e6eU, 0xb4ee5a5aU, 0x5bfba0a0U, + 0xa4f65252U, 0x764d3b3bU, 0xb761d6d6U, 0x7dceb3b3U, + 0x527b2929U, 0xdd3ee3e3U, 0x5e712f2fU, 0x13978484U, + 0xa6f55353U, 0xb968d1d1U, 0x00000000U, 0xc12cededU, + 0x40602020U, 0xe31ffcfcU, 0x79c8b1b1U, 0xb6ed5b5bU, + 0xd4be6a6aU, 0x8d46cbcbU, 0x67d9bebeU, 0x724b3939U, + 0x94de4a4aU, 0x98d44c4cU, 0xb0e85858U, 0x854acfcfU, + 0xbb6bd0d0U, 0xc52aefefU, 0x4fe5aaaaU, 0xed16fbfbU, + 0x86c54343U, 0x9ad74d4dU, 0x66553333U, 0x11948585U, + 0x8acf4545U, 0xe910f9f9U, 0x04060202U, 0xfe817f7fU, + 0xa0f05050U, 0x78443c3cU, 0x25ba9f9fU, 0x4be3a8a8U, + 0xa2f35151U, 0x5dfea3a3U, 0x80c04040U, 0x058a8f8fU, + 0x3fad9292U, 0x21bc9d9dU, 0x70483838U, 0xf104f5f5U, + 0x63dfbcbcU, 0x77c1b6b6U, 0xaf75dadaU, 0x42632121U, + 0x20301010U, 0xe51affffU, 0xfd0ef3f3U, 0xbf6dd2d2U, + 0x814ccdcdU, 0x18140c0cU, 0x26351313U, 0xc32fececU, + 0xbee15f5fU, 0x35a29797U, 0x88cc4444U, 0x2e391717U, + 0x9357c4c4U, 0x55f2a7a7U, 0xfc827e7eU, 0x7a473d3dU, + 0xc8ac6464U, 0xbae75d5dU, 0x322b1919U, 0xe6957373U, + 0xc0a06060U, 0x19988181U, 0x9ed14f4fU, 0xa37fdcdcU, + 0x44662222U, 0x547e2a2aU, 0x3bab9090U, 0x0b838888U, + 0x8cca4646U, 0xc729eeeeU, 0x6bd3b8b8U, 0x283c1414U, + 0xa779dedeU, 0xbce25e5eU, 0x161d0b0bU, 0xad76dbdbU, + 0xdb3be0e0U, 0x64563232U, 0x744e3a3aU, 0x141e0a0aU, + 0x92db4949U, 0x0c0a0606U, 0x486c2424U, 0xb8e45c5cU, + 0x9f5dc2c2U, 0xbd6ed3d3U, 0x43efacacU, 0xc4a66262U, + 0x39a89191U, 0x31a49595U, 0xd337e4e4U, 0xf28b7979U, + 0xd532e7e7U, 0x8b43c8c8U, 0x6e593737U, 0xdab76d6dU, + 0x018c8d8dU, 0xb164d5d5U, 0x9cd24e4eU, 0x49e0a9a9U, + 0xd8b46c6cU, 0xacfa5656U, 0xf307f4f4U, 0xcf25eaeaU, + 0xcaaf6565U, 0xf48e7a7aU, 0x47e9aeaeU, 0x10180808U, + 0x6fd5babaU, 0xf0887878U, 0x4a6f2525U, 0x5c722e2eU, + 0x38241c1cU, 0x57f1a6a6U, 0x73c7b4b4U, 0x9751c6c6U, + 0xcb23e8e8U, 0xa17cddddU, 0xe89c7474U, 0x3e211f1fU, + 0x96dd4b4bU, 0x61dcbdbdU, 0x0d868b8bU, 0x0f858a8aU, + 0xe0907070U, 0x7c423e3eU, 0x71c4b5b5U, 0xccaa6666U, + 0x90d84848U, 0x06050303U, 0xf701f6f6U, 0x1c120e0eU, + 0xc2a36161U, 0x6a5f3535U, 0xaef95757U, 0x69d0b9b9U, + 0x17918686U, 0x9958c1c1U, 0x3a271d1dU, 0x27b99e9eU, + 0xd938e1e1U, 0xeb13f8f8U, 0x2bb39898U, 0x22331111U, + 0xd2bb6969U, 0xa970d9d9U, 0x07898e8eU, 0x33a79494U, + 0x2db69b9bU, 0x3c221e1eU, 0x15928787U, 0xc920e9e9U, + 0x8749ceceU, 0xaaff5555U, 0x50782828U, 0xa57adfdfU, + 0x038f8c8cU, 0x59f8a1a1U, 0x09808989U, 0x1a170d0dU, + 0x65dabfbfU, 0xd731e6e6U, 0x84c64242U, 0xd0b86868U, + 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, + 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U, + 0x50a7f451U, 0x5365417eU, 0xc3a4171aU, 0x965e273aU, + 0xcb6bab3bU, 0xf1459d1fU, 0xab58faacU, 0x9303e34bU, + 0x55fa3020U, 0xf66d76adU, 0x9176cc88U, 0x254c02f5U, + 0xfcd7e54fU, 0xd7cb2ac5U, 0x80443526U, 0x8fa362b5U, + 0x495ab1deU, 0x671bba25U, 0x980eea45U, 0xe1c0fe5dU, + 0x02752fc3U, 0x12f04c81U, 0xa397468dU, 0xc6f9d36bU, + 0xe75f8f03U, 0x959c9215U, 0xeb7a6dbfU, 0xda595295U, + 0x2d83bed4U, 0xd3217458U, 0x2969e049U, 0x44c8c98eU, + 0x6a89c275U, 0x78798ef4U, 0x6b3e5899U, 0xdd71b927U, + 0xb64fe1beU, 0x17ad88f0U, 0x66ac20c9U, 0xb43ace7dU, + 0x184adf63U, 0x82311ae5U, 0x60335197U, 0x457f5362U, + 0xe07764b1U, 0x84ae6bbbU, 0x1ca081feU, 0x942b08f9U, + 0x58684870U, 0x19fd458fU, 0x876cde94U, 0xb7f87b52U, + 0x23d373abU, 0xe2024b72U, 0x578f1fe3U, 0x2aab5566U, + 0x0728ebb2U, 0x03c2b52fU, 0x9a7bc586U, 0xa50837d3U, + 0xf2872830U, 0xb2a5bf23U, 0xba6a0302U, 0x5c8216edU, + 0x2b1ccf8aU, 0x92b479a7U, 0xf0f207f3U, 0xa1e2694eU, + 0xcdf4da65U, 0xd5be0506U, 0x1f6234d1U, 0x8afea6c4U, + 0x9d532e34U, 0xa055f3a2U, 0x32e18a05U, 0x75ebf6a4U, + 0x39ec830bU, 0xaaef6040U, 0x069f715eU, 0x51106ebdU, + 0xf98a213eU, 0x3d06dd96U, 0xae053eddU, 0x46bde64dU, +)===" +R"===( + 0xb58d5491U, 0x055dc471U, 0x6fd40604U, 0xff155060U, + 0x24fb9819U, 0x97e9bdd6U, 0xcc434089U, 0x779ed967U, + 0xbd42e8b0U, 0x888b8907U, 0x385b19e7U, 0xdbeec879U, + 0x470a7ca1U, 0xe90f427cU, 0xc91e84f8U, 0x00000000U, + 0x83868009U, 0x48ed2b32U, 0xac70111eU, 0x4e725a6cU, + 0xfbff0efdU, 0x5638850fU, 0x1ed5ae3dU, 0x27392d36U, + 0x64d90f0aU, 0x21a65c68U, 0xd1545b9bU, 0x3a2e3624U, + 0xb1670a0cU, 0x0fe75793U, 0xd296eeb4U, 0x9e919b1bU, + 0x4fc5c080U, 0xa220dc61U, 0x694b775aU, 0x161a121cU, + 0x0aba93e2U, 0xe52aa0c0U, 0x43e0223cU, 0x1d171b12U, + 0x0b0d090eU, 0xadc78bf2U, 0xb9a8b62dU, 0xc8a91e14U, + 0x8519f157U, 0x4c0775afU, 0xbbdd99eeU, 0xfd607fa3U, + 0x9f2601f7U, 0xbcf5725cU, 0xc53b6644U, 0x347efb5bU, + 0x7629438bU, 0xdcc623cbU, 0x68fcedb6U, 0x63f1e4b8U, + 0xcadc31d7U, 0x10856342U, 0x40229713U, 0x2011c684U, + 0x7d244a85U, 0xf83dbbd2U, 0x1132f9aeU, 0x6da129c7U, + 0x4b2f9e1dU, 0xf330b2dcU, 0xec52860dU, 0xd0e3c177U, + 0x6c16b32bU, 0x99b970a9U, 0xfa489411U, 0x2264e947U, + 0xc48cfca8U, 0x1a3ff0a0U, 0xd82c7d56U, 0xef903322U, + 0xc74e4987U, 0xc1d138d9U, 0xfea2ca8cU, 0x360bd498U, + 0xcf81f5a6U, 0x28de7aa5U, 0x268eb7daU, 0xa4bfad3fU, + 0xe49d3a2cU, 0x0d927850U, 0x9bcc5f6aU, 0x62467e54U, + 0xc2138df6U, 0xe8b8d890U, 0x5ef7392eU, 0xf5afc382U, + 0xbe805d9fU, 0x7c93d069U, 0xa92dd56fU, 0xb31225cfU, + 0x3b99acc8U, 0xa77d1810U, 0x6e639ce8U, 0x7bbb3bdbU, + 0x097826cdU, 0xf418596eU, 0x01b79aecU, 0xa89a4f83U, + 0x656e95e6U, 0x7ee6ffaaU, 0x08cfbc21U, 0xe6e815efU, + 0xd99be7baU, 0xce366f4aU, 0xd4099feaU, 0xd67cb029U, + 0xafb2a431U, 0x31233f2aU, 0x3094a5c6U, 0xc066a235U, + 0x37bc4e74U, 0xa6ca82fcU, 0xb0d090e0U, 0x15d8a733U, + 0x4a9804f1U, 0xf7daec41U, 0x0e50cd7fU, 0x2ff69117U, + 0x8dd64d76U, 0x4db0ef43U, 0x544daaccU, 0xdf0496e4U, + 0xe3b5d19eU, 0x1b886a4cU, 0xb81f2cc1U, 0x7f516546U, + 0x04ea5e9dU, 0x5d358c01U, 0x737487faU, 0x2e410bfbU, + 0x5a1d67b3U, 0x52d2db92U, 0x335610e9U, 0x1347d66dU, + 0x8c61d79aU, 0x7a0ca137U, 0x8e14f859U, 0x893c13ebU, + 0xee27a9ceU, 0x35c961b7U, 0xede51ce1U, 0x3cb1477aU, + 0x59dfd29cU, 0x3f73f255U, 0x79ce1418U, 0xbf37c773U, + 0xeacdf753U, 0x5baafd5fU, 0x146f3ddfU, 0x86db4478U, + 0x81f3afcaU, 0x3ec468b9U, 0x2c342438U, 0x5f40a3c2U, + 0x72c31d16U, 0x0c25e2bcU, 0x8b493c28U, 0x41950dffU, + 0x7101a839U, 0xdeb30c08U, 0x9ce4b4d8U, 0x90c15664U, + 0x6184cb7bU, 0x70b632d5U, 0x745c6c48U, 0x4257b8d0U, + 0xa7f45150U, 0x65417e53U, 0xa4171ac3U, 0x5e273a96U, + 0x6bab3bcbU, 0x459d1ff1U, 0x58faacabU, 0x03e34b93U, + 0xfa302055U, 0x6d76adf6U, 0x76cc8891U, 0x4c02f525U, + 0xd7e54ffcU, 0xcb2ac5d7U, 0x44352680U, 0xa362b58fU, + 0x5ab1de49U, 0x1bba2567U, 0x0eea4598U, 0xc0fe5de1U, + 0x752fc302U, 0xf04c8112U, 0x97468da3U, 0xf9d36bc6U, + 0x5f8f03e7U, 0x9c921595U, 0x7a6dbfebU, 0x595295daU, + 0x83bed42dU, 0x217458d3U, 0x69e04929U, 0xc8c98e44U, + 0x89c2756aU, 0x798ef478U, 0x3e58996bU, 0x71b927ddU, + 0x4fe1beb6U, 0xad88f017U, 0xac20c966U, 0x3ace7db4U, + 0x4adf6318U, 0x311ae582U, 0x33519760U, 0x7f536245U, + 0x7764b1e0U, 0xae6bbb84U, 0xa081fe1cU, 0x2b08f994U, + 0x68487058U, 0xfd458f19U, 0x6cde9487U, 0xf87b52b7U, + 0xd373ab23U, 0x024b72e2U, 0x8f1fe357U, 0xab55662aU, + 0x28ebb207U, 0xc2b52f03U, 0x7bc5869aU, 0x0837d3a5U, + 0x872830f2U, 0xa5bf23b2U, 0x6a0302baU, 0x8216ed5cU, + 0x1ccf8a2bU, 0xb479a792U, 0xf207f3f0U, 0xe2694ea1U, + 0xf4da65cdU, 0xbe0506d5U, 0x6234d11fU, 0xfea6c48aU, + 0x532e349dU, 0x55f3a2a0U, 0xe18a0532U, 0xebf6a475U, + 0xec830b39U, 0xef6040aaU, 0x9f715e06U, 0x106ebd51U, + 0x8a213ef9U, 0x06dd963dU, 0x053eddaeU, 0xbde64d46U, + 0x8d5491b5U, 0x5dc47105U, 0xd406046fU, 0x155060ffU, + 0xfb981924U, 0xe9bdd697U, 0x434089ccU, 0x9ed96777U, + 0x42e8b0bdU, 0x8b890788U, 0x5b19e738U, 0xeec879dbU, + 0x0a7ca147U, 0x0f427ce9U, 0x1e84f8c9U, 0x00000000U, + 0x86800983U, 0xed2b3248U, 0x70111eacU, 0x725a6c4eU, + 0xff0efdfbU, 0x38850f56U, 0xd5ae3d1eU, 0x392d3627U, + 0xd90f0a64U, 0xa65c6821U, 0x545b9bd1U, 0x2e36243aU, + 0x670a0cb1U, 0xe757930fU, 0x96eeb4d2U, 0x919b1b9eU, + 0xc5c0804fU, 0x20dc61a2U, 0x4b775a69U, 0x1a121c16U, + 0xba93e20aU, 0x2aa0c0e5U, 0xe0223c43U, 0x171b121dU, + 0x0d090e0bU, 0xc78bf2adU, 0xa8b62db9U, 0xa91e14c8U, + 0x19f15785U, 0x0775af4cU, 0xdd99eebbU, 0x607fa3fdU, + 0x2601f79fU, 0xf5725cbcU, 0x3b6644c5U, 0x7efb5b34U, + 0x29438b76U, 0xc623cbdcU, 0xfcedb668U, 0xf1e4b863U, + 0xdc31d7caU, 0x85634210U, 0x22971340U, 0x11c68420U, + 0x244a857dU, 0x3dbbd2f8U, 0x32f9ae11U, 0xa129c76dU, + 0x2f9e1d4bU, 0x30b2dcf3U, 0x52860decU, 0xe3c177d0U, + 0x16b32b6cU, 0xb970a999U, 0x489411faU, 0x64e94722U, + 0x8cfca8c4U, 0x3ff0a01aU, 0x2c7d56d8U, 0x903322efU, + 0x4e4987c7U, 0xd138d9c1U, 0xa2ca8cfeU, 0x0bd49836U, + 0x81f5a6cfU, 0xde7aa528U, 0x8eb7da26U, 0xbfad3fa4U, + 0x9d3a2ce4U, 0x9278500dU, 0xcc5f6a9bU, 0x467e5462U, + 0x138df6c2U, 0xb8d890e8U, 0xf7392e5eU, 0xafc382f5U, + 0x805d9fbeU, 0x93d0697cU, 0x2dd56fa9U, 0x1225cfb3U, + 0x99acc83bU, 0x7d1810a7U, 0x639ce86eU, 0xbb3bdb7bU, + 0x7826cd09U, 0x18596ef4U, 0xb79aec01U, 0x9a4f83a8U, + 0x6e95e665U, 0xe6ffaa7eU, 0xcfbc2108U, 0xe815efe6U, + 0x9be7bad9U, 0x366f4aceU, 0x099fead4U, 0x7cb029d6U, + 0xb2a431afU, 0x233f2a31U, 0x94a5c630U, 0x66a235c0U, + 0xbc4e7437U, 0xca82fca6U, 0xd090e0b0U, 0xd8a73315U, + 0x9804f14aU, 0xdaec41f7U, 0x50cd7f0eU, 0xf691172fU, + 0xd64d768dU, 0xb0ef434dU, 0x4daacc54U, 0x0496e4dfU, + 0xb5d19ee3U, 0x886a4c1bU, 0x1f2cc1b8U, 0x5165467fU, + 0xea5e9d04U, 0x358c015dU, 0x7487fa73U, 0x410bfb2eU, + 0x1d67b35aU, 0xd2db9252U, 0x5610e933U, 0x47d66d13U, + 0x61d79a8cU, 0x0ca1377aU, 0x14f8598eU, 0x3c13eb89U, + 0x27a9ceeeU, 0xc961b735U, 0xe51ce1edU, 0xb1477a3cU, + 0xdfd29c59U, 0x73f2553fU, 0xce141879U, 0x37c773bfU, + 0xcdf753eaU, 0xaafd5f5bU, 0x6f3ddf14U, 0xdb447886U, + 0xf3afca81U, 0xc468b93eU, 0x3424382cU, 0x40a3c25fU, + 0xc31d1672U, 0x25e2bc0cU, 0x493c288bU, 0x950dff41U, + 0x01a83971U, 0xb30c08deU, 0xe4b4d89cU, 0xc1566490U, + 0x84cb7b61U, 0xb632d570U, 0x5c6c4874U, 0x57b8d042U, + 0xf45150a7U, 0x417e5365U, 0x171ac3a4U, 0x273a965eU, + 0xab3bcb6bU, 0x9d1ff145U, 0xfaacab58U, 0xe34b9303U, + 0x302055faU, 0x76adf66dU, 0xcc889176U, 0x02f5254cU, + 0xe54ffcd7U, 0x2ac5d7cbU, 0x35268044U, 0x62b58fa3U, + 0xb1de495aU, 0xba25671bU, 0xea45980eU, 0xfe5de1c0U, + 0x2fc30275U, 0x4c8112f0U, 0x468da397U, 0xd36bc6f9U, + 0x8f03e75fU, 0x9215959cU, 0x6dbfeb7aU, 0x5295da59U, + 0xbed42d83U, 0x7458d321U, 0xe0492969U, 0xc98e44c8U, + 0xc2756a89U, 0x8ef47879U, 0x58996b3eU, 0xb927dd71U, + 0xe1beb64fU, 0x88f017adU, 0x20c966acU, 0xce7db43aU, + 0xdf63184aU, 0x1ae58231U, 0x51976033U, 0x5362457fU, + 0x64b1e077U, 0x6bbb84aeU, 0x81fe1ca0U, 0x08f9942bU, + 0x48705868U, 0x458f19fdU, 0xde94876cU, 0x7b52b7f8U, + 0x73ab23d3U, 0x4b72e202U, 0x1fe3578fU, 0x55662aabU, + 0xebb20728U, 0xb52f03c2U, 0xc5869a7bU, 0x37d3a508U, + 0x2830f287U, 0xbf23b2a5U, 0x0302ba6aU, 0x16ed5c82U, + 0xcf8a2b1cU, 0x79a792b4U, 0x07f3f0f2U, 0x694ea1e2U, + 0xda65cdf4U, 0x0506d5beU, 0x34d11f62U, 0xa6c48afeU, + 0x2e349d53U, 0xf3a2a055U, 0x8a0532e1U, 0xf6a475ebU, + 0x830b39ecU, 0x6040aaefU, 0x715e069fU, 0x6ebd5110U, + 0x213ef98aU, 0xdd963d06U, 0x3eddae05U, 0xe64d46bdU, + 0x5491b58dU, 0xc471055dU, 0x06046fd4U, 0x5060ff15U, + 0x981924fbU, 0xbdd697e9U, 0x4089cc43U, 0xd967779eU, + 0xe8b0bd42U, 0x8907888bU, 0x19e7385bU, 0xc879dbeeU, + 0x7ca1470aU, 0x427ce90fU, 0x84f8c91eU, 0x00000000U, + 0x80098386U, 0x2b3248edU, 0x111eac70U, 0x5a6c4e72U, + 0x0efdfbffU, 0x850f5638U, 0xae3d1ed5U, 0x2d362739U, + 0x0f0a64d9U, 0x5c6821a6U, 0x5b9bd154U, 0x36243a2eU, + 0x0a0cb167U, 0x57930fe7U, 0xeeb4d296U, 0x9b1b9e91U, + 0xc0804fc5U, 0xdc61a220U, 0x775a694bU, 0x121c161aU, + 0x93e20abaU, 0xa0c0e52aU, 0x223c43e0U, 0x1b121d17U, + 0x090e0b0dU, 0x8bf2adc7U, 0xb62db9a8U, 0x1e14c8a9U, + 0xf1578519U, 0x75af4c07U, 0x99eebbddU, 0x7fa3fd60U, + 0x01f79f26U, 0x725cbcf5U, 0x6644c53bU, 0xfb5b347eU, + 0x438b7629U, 0x23cbdcc6U, 0xedb668fcU, 0xe4b863f1U, + 0x31d7cadcU, 0x63421085U, 0x97134022U, 0xc6842011U, + 0x4a857d24U, 0xbbd2f83dU, 0xf9ae1132U, 0x29c76da1U, + 0x9e1d4b2fU, 0xb2dcf330U, 0x860dec52U, 0xc177d0e3U, + 0xb32b6c16U, 0x70a999b9U, 0x9411fa48U, 0xe9472264U, + 0xfca8c48cU, 0xf0a01a3fU, 0x7d56d82cU, 0x3322ef90U, + 0x4987c74eU, 0x38d9c1d1U, 0xca8cfea2U, 0xd498360bU, + 0xf5a6cf81U, 0x7aa528deU, 0xb7da268eU, 0xad3fa4bfU, + 0x3a2ce49dU, 0x78500d92U, 0x5f6a9bccU, 0x7e546246U, + 0x8df6c213U, 0xd890e8b8U, 0x392e5ef7U, 0xc382f5afU, + 0x5d9fbe80U, 0xd0697c93U, 0xd56fa92dU, 0x25cfb312U, + 0xacc83b99U, 0x1810a77dU, 0x9ce86e63U, 0x3bdb7bbbU, + 0x26cd0978U, 0x596ef418U, 0x9aec01b7U, 0x4f83a89aU, + 0x95e6656eU, 0xffaa7ee6U, 0xbc2108cfU, 0x15efe6e8U, + 0xe7bad99bU, 0x6f4ace36U, 0x9fead409U, 0xb029d67cU, + 0xa431afb2U, 0x3f2a3123U, 0xa5c63094U, 0xa235c066U, + 0x4e7437bcU, 0x82fca6caU, 0x90e0b0d0U, 0xa73315d8U, + 0x04f14a98U, 0xec41f7daU, 0xcd7f0e50U, 0x91172ff6U, + 0x4d768dd6U, 0xef434db0U, 0xaacc544dU, 0x96e4df04U, + 0xd19ee3b5U, 0x6a4c1b88U, 0x2cc1b81fU, 0x65467f51U, + 0x5e9d04eaU, 0x8c015d35U, 0x87fa7374U, 0x0bfb2e41U, + 0x67b35a1dU, 0xdb9252d2U, 0x10e93356U, 0xd66d1347U, + 0xd79a8c61U, 0xa1377a0cU, 0xf8598e14U, 0x13eb893cU, + 0xa9ceee27U, 0x61b735c9U, 0x1ce1ede5U, 0x477a3cb1U, + 0xd29c59dfU, 0xf2553f73U, 0x141879ceU, 0xc773bf37U, + 0xf753eacdU, 0xfd5f5baaU, 0x3ddf146fU, 0x447886dbU, + 0xafca81f3U, 0x68b93ec4U, 0x24382c34U, 0xa3c25f40U, + 0x1d1672c3U, 0xe2bc0c25U, 0x3c288b49U, 0x0dff4195U, + 0xa8397101U, 0x0c08deb3U, 0xb4d89ce4U, 0x566490c1U, + 0xcb7b6184U, 0x32d570b6U, 0x6c48745cU, 0xb8d04257U, + 0x5150a7f4U, 0x7e536541U, 0x1ac3a417U, 0x3a965e27U, + 0x3bcb6babU, 0x1ff1459dU, 0xacab58faU, 0x4b9303e3U, + 0x2055fa30U, 0xadf66d76U, 0x889176ccU, 0xf5254c02U, + 0x4ffcd7e5U, 0xc5d7cb2aU, 0x26804435U, 0xb58fa362U, + 0xde495ab1U, 0x25671bbaU, 0x45980eeaU, 0x5de1c0feU, + 0xc302752fU, 0x8112f04cU, 0x8da39746U, 0x6bc6f9d3U, + 0x03e75f8fU, 0x15959c92U, 0xbfeb7a6dU, 0x95da5952U, + 0xd42d83beU, 0x58d32174U, 0x492969e0U, 0x8e44c8c9U, + 0x756a89c2U, 0xf478798eU, 0x996b3e58U, 0x27dd71b9U, + 0xbeb64fe1U, 0xf017ad88U, 0xc966ac20U, 0x7db43aceU, + 0x63184adfU, 0xe582311aU, 0x97603351U, 0x62457f53U, + 0xb1e07764U, 0xbb84ae6bU, 0xfe1ca081U, 0xf9942b08U, + 0x70586848U, 0x8f19fd45U, 0x94876cdeU, 0x52b7f87bU, + 0xab23d373U, 0x72e2024bU, 0xe3578f1fU, 0x662aab55U, + 0xb20728ebU, 0x2f03c2b5U, 0x869a7bc5U, 0xd3a50837U, + 0x30f28728U, 0x23b2a5bfU, 0x02ba6a03U, 0xed5c8216U, + 0x8a2b1ccfU, 0xa792b479U, 0xf3f0f207U, 0x4ea1e269U, + 0x65cdf4daU, 0x06d5be05U, 0xd11f6234U, 0xc48afea6U, + 0x349d532eU, 0xa2a055f3U, 0x0532e18aU, 0xa475ebf6U, + 0x0b39ec83U, 0x40aaef60U, 0x5e069f71U, 0xbd51106eU, + 0x3ef98a21U, 0x963d06ddU, 0xddae053eU, 0x4d46bde6U, + 0x91b58d54U, 0x71055dc4U, 0x046fd406U, 0x60ff1550U, + 0x1924fb98U, 0xd697e9bdU, 0x89cc4340U, 0x67779ed9U, + 0xb0bd42e8U, 0x07888b89U, 0xe7385b19U, 0x79dbeec8U, + 0xa1470a7cU, 0x7ce90f42U, 0xf8c91e84U, 0x00000000U, + 0x09838680U, 0x3248ed2bU, 0x1eac7011U, 0x6c4e725aU, + 0xfdfbff0eU, 0x0f563885U, 0x3d1ed5aeU, 0x3627392dU, + 0x0a64d90fU, 0x6821a65cU, 0x9bd1545bU, 0x243a2e36U, + 0x0cb1670aU, 0x930fe757U, 0xb4d296eeU, 0x1b9e919bU, + 0x804fc5c0U, 0x61a220dcU, 0x5a694b77U, 0x1c161a12U, + 0xe20aba93U, 0xc0e52aa0U, 0x3c43e022U, 0x121d171bU, + 0x0e0b0d09U, 0xf2adc78bU, 0x2db9a8b6U, 0x14c8a91eU, + 0x578519f1U, 0xaf4c0775U, 0xeebbdd99U, 0xa3fd607fU, + 0xf79f2601U, 0x5cbcf572U, 0x44c53b66U, 0x5b347efbU, + 0x8b762943U, 0xcbdcc623U, 0xb668fcedU, 0xb863f1e4U, + 0xd7cadc31U, 0x42108563U, 0x13402297U, 0x842011c6U, + 0x857d244aU, 0xd2f83dbbU, 0xae1132f9U, 0xc76da129U, + 0x1d4b2f9eU, 0xdcf330b2U, 0x0dec5286U, 0x77d0e3c1U, + 0x2b6c16b3U, 0xa999b970U, 0x11fa4894U, 0x472264e9U, + 0xa8c48cfcU, 0xa01a3ff0U, 0x56d82c7dU, 0x22ef9033U, + 0x87c74e49U, 0xd9c1d138U, 0x8cfea2caU, 0x98360bd4U, + 0xa6cf81f5U, 0xa528de7aU, 0xda268eb7U, 0x3fa4bfadU, + 0x2ce49d3aU, 0x500d9278U, 0x6a9bcc5fU, 0x5462467eU, + 0xf6c2138dU, 0x90e8b8d8U, 0x2e5ef739U, 0x82f5afc3U, + 0x9fbe805dU, 0x697c93d0U, 0x6fa92dd5U, 0xcfb31225U, + 0xc83b99acU, 0x10a77d18U, 0xe86e639cU, 0xdb7bbb3bU, + 0xcd097826U, 0x6ef41859U, 0xec01b79aU, 0x83a89a4fU, + 0xe6656e95U, 0xaa7ee6ffU, 0x2108cfbcU, 0xefe6e815U, + 0xbad99be7U, 0x4ace366fU, 0xead4099fU, 0x29d67cb0U, + 0x31afb2a4U, 0x2a31233fU, 0xc63094a5U, 0x35c066a2U, + 0x7437bc4eU, 0xfca6ca82U, 0xe0b0d090U, 0x3315d8a7U, + 0xf14a9804U, 0x41f7daecU, 0x7f0e50cdU, 0x172ff691U, + 0x768dd64dU, 0x434db0efU, 0xcc544daaU, 0xe4df0496U, + 0x9ee3b5d1U, 0x4c1b886aU, 0xc1b81f2cU, 0x467f5165U, + 0x9d04ea5eU, 0x015d358cU, 0xfa737487U, 0xfb2e410bU, + 0xb35a1d67U, 0x9252d2dbU, 0xe9335610U, 0x6d1347d6U, + 0x9a8c61d7U, 0x377a0ca1U, 0x598e14f8U, 0xeb893c13U, + 0xceee27a9U, 0xb735c961U, 0xe1ede51cU, 0x7a3cb147U, + 0x9c59dfd2U, 0x553f73f2U, 0x1879ce14U, 0x73bf37c7U, + 0x53eacdf7U, 0x5f5baafdU, 0xdf146f3dU, 0x7886db44U, + 0xca81f3afU, 0xb93ec468U, 0x382c3424U, 0xc25f40a3U, + 0x1672c31dU, 0xbc0c25e2U, 0x288b493cU, 0xff41950dU, + 0x397101a8U, 0x08deb30cU, 0xd89ce4b4U, 0x6490c156U, + 0x7b6184cbU, 0xd570b632U, 0x48745c6cU, 0xd04257b8U, +}; + +)===" +R"===( + +__constant static const uint AES_KEY_FILL[16] = { + 0x6daca553, 0x62716609, 0xdbb5552b, 0xb4f44917, + 0x6d7caf07, 0x846a710d, 0x1725d378, 0x0da1dc4e, + 0x3f1262f1, 0x9f947ec6, 0xf4c0794f, 0x3e20e345, + 0x6aef8135, 0xb1ba317c, 0x16314c88, 0x49169154, +}; + +__constant static const uint AES_STATE_HASH[16] = { + 0x92b52c0d, 0x9fa856de, 0xcc82db47, 0xd7983aad, + 0x338d996e, 0x15c7b798, 0xf59e125a, 0xace78057, + 0x6a770017, 0xae62c7d0, 0x5079506b, 0xe8a07ce4, + 0x630a240c, 0x07ad828d, 0x79a10005, 0x7e994948, +}; + +uint get_byte32(uint a, uint start_bit) { return (a >> start_bit) & 0xFF; } + +#define fillAes_name fillAes1Rx4_scratchpad +#define outputSize RANDOMX_SCRATCHPAD_L3 +#define outputSize0 (outputSize + 64) +#define strided SCRATCHPAD_STRIDED +#define unroll_factor 8 +#define num_rounds 1 + #include "fillAes1Rx4.cl" +#undef num_rounds +#undef unroll_factor +#undef strided +#undef outputSize +#undef outputSize0 +#undef fillAes_name + +#define fillAes_name fillAes4Rx4_entropy +#define outputSize ENTROPY_SIZE +#define outputSize0 outputSize +#define strided 0 +#define unroll_factor 2 +#define num_rounds 4 + #include "fillAes1Rx4.cl" +#undef num_rounds +#undef unroll_factor +#undef strided +#undef outputSize +#undef outputSize0 +#undef fillAes_name + +#define inputSize RANDOMX_SCRATCHPAD_L3 + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void hashAes1Rx4(__global const void* input, __global void* hash, uint hashOffsetBytes, uint hashStrideBytes, uint batch_size) +{ + __local uint T[2048]; + + const uint stride_size = batch_size * 4; + const uint global_index = get_global_id(0); + if (global_index >= stride_size) + return; + + const uint idx = global_index / 4; + const uint sub = global_index % 4; + + for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step) + T[i] = AES_TABLE[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + uint x[4] = { AES_STATE_HASH[sub * 4], AES_STATE_HASH[sub * 4 + 1], AES_STATE_HASH[sub * 4 + 2], AES_STATE_HASH[sub * 4 + 3] }; + + const uint s1 = ((sub & 1) == 0) ? 8 : 24; + const uint s3 = ((sub & 1) == 0) ? 24 : 8; + + __global const uint4* p = SCRATCHPAD_STRIDED ? (((__global uint4*) input) + idx * 4 + sub) : (((__global uint4*) input) + idx * ((inputSize + 64) / sizeof(uint4)) + sub); + + __local const uint* const t0 = ((sub & 1) == 0) ? T : (T + 1024); + __local const uint* const t1 = ((sub & 1) == 0) ? (T + 256) : (T + 1792); + __local const uint* const t2 = ((sub & 1) == 0) ? (T + 512) : (T + 1536); + __local const uint* const t3 = ((sub & 1) == 0) ? (T + 768) : (T + 1280); + + #pragma unroll(8) + for (uint i = 0; i < inputSize / sizeof(uint4); i += 4, p += SCRATCHPAD_STRIDED ? stride_size : 4) + { + uint k[4], y[4]; + *(uint4*)(k) = *p; + y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[0]; + y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[1]; + y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[2]; + y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[3]; + x[0] = y[0]; + x[1] = y[1]; + x[2] = y[2]; + x[3] = y[3]; + } + + uint y[4]; + + y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ 0xf6fa8389; + y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ 0x8b24949f; + y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ 0x90dc56bf; + y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ 0x06890201; + + x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ 0x61b263d1; + x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ 0x51f4e03c; + x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ 0xee1043c6; + x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ 0xed18f99b; + + *((__global uint4*)(hash) + idx * (hashStrideBytes / sizeof(uint4)) + sub + (hashOffsetBytes / sizeof(uint4))) = *(uint4*)(x); +} +)===" diff --git a/src/amd/opencl/RandomX/blake2b.cl b/src/amd/opencl/RandomX/blake2b.cl new file mode 100644 index 00000000..2fbb9564 --- /dev/null +++ b/src/amd/opencl/RandomX/blake2b.cl @@ -0,0 +1,159 @@ +R"===( +/* +Copyright (c) 2019 SChernykh + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +__constant static const uchar blake2b_sigma[12 * 16] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, + 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8, + 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13, + 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9, + 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11, + 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, +}; + +enum Blake2b_IV +{ + iv0 = 0x6a09e667f3bcc908ul, + iv1 = 0xbb67ae8584caa73bul, + iv2 = 0x3c6ef372fe94f82bul, + iv3 = 0xa54ff53a5f1d36f1ul, + iv4 = 0x510e527fade682d1ul, + iv5 = 0x9b05688c2b3e6c1ful, + iv6 = 0x1f83d9abfb41bd6bul, + iv7 = 0x5be0cd19137e2179ul, +}; + +ulong rotr64(ulong a, ulong shift) { return rotate(a, 64 - shift); } + +#define G(r, i, a, b, c, d) \ + do { \ + a = a + b + m[blake2b_sigma[r * 16 + 2 * i + 0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r * 16 + 2 * i + 1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while (0) + +#define ROUND(r) \ + do { \ + G(r, 0, v[0], v[4], v[8], v[12]); \ + G(r, 1, v[1], v[5], v[9], v[13]); \ + G(r, 2, v[2], v[6], v[10], v[14]); \ + G(r, 3, v[3], v[7], v[11], v[15]); \ + G(r, 4, v[0], v[5], v[10], v[15]); \ + G(r, 5, v[1], v[6], v[11], v[12]); \ + G(r, 6, v[2], v[7], v[8], v[13]); \ + G(r, 7, v[3], v[4], v[9], v[14]); \ + } while (0) + +#define BLAKE2B_ROUNDS() ROUND(0);ROUND(1);ROUND(2);ROUND(3);ROUND(4);ROUND(5);ROUND(6);ROUND(7);ROUND(8);ROUND(9);ROUND(10);ROUND(11); + +void blake2b_512_process_single_block(ulong *h, const ulong* m, uint blockTemplateSize) +{ + ulong v[16] = + { + iv0 ^ 0x01010040, iv1, iv2, iv3, iv4 , iv5, iv6, iv7, + iv0 , iv1, iv2, iv3, iv4 ^ blockTemplateSize, iv5, ~iv6, iv7, + }; + + BLAKE2B_ROUNDS(); + + h[0] = v[0] ^ v[ 8] ^ iv0 ^ 0x01010040; + h[1] = v[1] ^ v[ 9] ^ iv1; + h[2] = v[2] ^ v[10] ^ iv2; + h[3] = v[3] ^ v[11] ^ iv3; + h[4] = v[4] ^ v[12] ^ iv4; + h[5] = v[5] ^ v[13] ^ iv5; + h[6] = v[6] ^ v[14] ^ iv6; + h[7] = v[7] ^ v[15] ^ iv7; +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void blake2b_initial_hash(__global void *out, __global const void* blockTemplate, uint blockTemplateSize, uint start_nonce) +{ + const uint global_index = get_global_id(0); + + __global const ulong* p = (__global const ulong*) blockTemplate; + ulong m[16] = { + (blockTemplateSize > 0) ? p[ 0] : 0, + (blockTemplateSize > 8) ? p[ 1] : 0, + (blockTemplateSize > 16) ? p[ 2] : 0, + (blockTemplateSize > 24) ? p[ 3] : 0, + (blockTemplateSize > 32) ? p[ 4] : 0, + (blockTemplateSize > 40) ? p[ 5] : 0, + (blockTemplateSize > 48) ? p[ 6] : 0, + (blockTemplateSize > 56) ? p[ 7] : 0, + (blockTemplateSize > 64) ? p[ 8] : 0, + (blockTemplateSize > 72) ? p[ 9] : 0, + (blockTemplateSize > 80) ? p[10] : 0, + (blockTemplateSize > 88) ? p[11] : 0, + (blockTemplateSize > 96) ? p[12] : 0, + (blockTemplateSize > 104) ? p[13] : 0, + (blockTemplateSize > 112) ? p[14] : 0, + (blockTemplateSize > 120) ? p[15] : 0, + }; + + if (blockTemplateSize % sizeof(ulong)) + m[blockTemplateSize / sizeof(ulong)] &= (ulong)(-1) >> (64 - (blockTemplateSize % sizeof(ulong)) * 8); + + const ulong nonce = start_nonce + global_index; + m[4] = (m[4] & ((ulong)(-1) >> 8)) | (nonce << 56); + m[5] = (m[5] & ((ulong)(-1) << 24)) | (nonce >> 8); + + ulong hash[8]; + blake2b_512_process_single_block(hash, m, blockTemplateSize); + + __global ulong* t = ((__global ulong*) out) + global_index * 8; + t[0] = hash[0]; + t[1] = hash[1]; + t[2] = hash[2]; + t[3] = hash[3]; + t[4] = hash[4]; + t[5] = hash[5]; + t[6] = hash[6]; + t[7] = hash[7]; +} + +#define in_len 256 + +#define out_len 32 +#define blake2b_512_process_double_block_name blake2b_512_process_double_block_32 +#define blake2b_hash_registers_name blake2b_hash_registers_32 + #include "blake2b_double_block.cl" +#undef blake2b_hash_registers_name +#undef blake2b_512_process_double_block_name +#undef out_len + +#define out_len 64 +#define blake2b_512_process_double_block_name blake2b_512_process_double_block_64 +#define blake2b_hash_registers_name blake2b_hash_registers_64 + #include "blake2b_double_block.cl" +#undef blake2b_hash_registers_name +#undef blake2b_512_process_double_block_name +#undef out_len +)===" diff --git a/src/amd/opencl/RandomX/blake2b_double_block.cl b/src/amd/opencl/RandomX/blake2b_double_block.cl new file mode 100644 index 00000000..36d3377f --- /dev/null +++ b/src/amd/opencl/RandomX/blake2b_double_block.cl @@ -0,0 +1,102 @@ +R"===( +/* +Copyright (c) 2019 SChernykh + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +void blake2b_512_process_double_block_name(ulong *out, ulong* m, __global const ulong* in) +{ + ulong v[16] = + { + iv0 ^ (0x01010000u | out_len), iv1, iv2, iv3, iv4 , iv5, iv6, iv7, + iv0 , iv1, iv2, iv3, iv4 ^ 128, iv5, iv6, iv7, + }; + + BLAKE2B_ROUNDS(); + + ulong h[8]; + v[0] = h[0] = v[0] ^ v[8] ^ iv0 ^ (0x01010000u | out_len); + v[1] = h[1] = v[1] ^ v[9] ^ iv1; + v[2] = h[2] = v[2] ^ v[10] ^ iv2; + v[3] = h[3] = v[3] ^ v[11] ^ iv3; + v[4] = h[4] = v[4] ^ v[12] ^ iv4; + v[5] = h[5] = v[5] ^ v[13] ^ iv5; + v[6] = h[6] = v[6] ^ v[14] ^ iv6; + v[7] = h[7] = v[7] ^ v[15] ^ iv7; + v[8] = iv0; + v[9] = iv1; + v[10] = iv2; + v[11] = iv3; + v[12] = iv4 ^ in_len; + v[13] = iv5; + v[14] = ~iv6; + v[15] = iv7; + + m[ 0] = (in_len > 128) ? in[16] : 0; + m[ 1] = (in_len > 136) ? in[17] : 0; + m[ 2] = (in_len > 144) ? in[18] : 0; + m[ 3] = (in_len > 152) ? in[19] : 0; + m[ 4] = (in_len > 160) ? in[20] : 0; + m[ 5] = (in_len > 168) ? in[21] : 0; + m[ 6] = (in_len > 176) ? in[22] : 0; + m[ 7] = (in_len > 184) ? in[23] : 0; + m[ 8] = (in_len > 192) ? in[24] : 0; + m[ 9] = (in_len > 200) ? in[25] : 0; + m[10] = (in_len > 208) ? in[26] : 0; + m[11] = (in_len > 216) ? in[27] : 0; + m[12] = (in_len > 224) ? in[28] : 0; + m[13] = (in_len > 232) ? in[29] : 0; + m[14] = (in_len > 240) ? in[30] : 0; + m[15] = (in_len > 248) ? in[31] : 0; + + if (in_len % sizeof(ulong)) + m[(in_len - 128) / sizeof(ulong)] &= (ulong)(-1) >> (64 - (in_len % sizeof(ulong)) * 8); + + BLAKE2B_ROUNDS(); + + if (out_len > 0) out[0] = h[0] ^ v[0] ^ v[8]; + if (out_len > 8) out[1] = h[1] ^ v[1] ^ v[9]; + if (out_len > 16) out[2] = h[2] ^ v[2] ^ v[10]; + if (out_len > 24) out[3] = h[3] ^ v[3] ^ v[11]; + if (out_len > 32) out[4] = h[4] ^ v[4] ^ v[12]; + if (out_len > 40) out[5] = h[5] ^ v[5] ^ v[13]; + if (out_len > 48) out[6] = h[6] ^ v[6] ^ v[14]; + if (out_len > 56) out[7] = h[7] ^ v[7] ^ v[15]; +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void blake2b_hash_registers_name(__global void *out, __global const void* in, uint inStrideBytes) +{ + const uint global_index = get_global_id(0); + __global const ulong* p = ((__global const ulong*) in) + global_index * (inStrideBytes / sizeof(ulong)); + __global ulong* h = ((__global ulong*) out) + global_index * (out_len / sizeof(ulong)); + + ulong m[16] = { p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15] }; + + ulong hash[8]; + blake2b_512_process_double_block_name(hash, m, p); + + if (out_len > 0) h[0] = hash[0]; + if (out_len > 8) h[1] = hash[1]; + if (out_len > 16) h[2] = hash[2]; + if (out_len > 24) h[3] = hash[3]; + if (out_len > 32) h[4] = hash[4]; + if (out_len > 40) h[5] = hash[5]; + if (out_len > 48) h[6] = hash[6]; + if (out_len > 56) h[7] = hash[7]; +} +)===" diff --git a/src/amd/opencl/RandomX/fillAes1Rx4.cl b/src/amd/opencl/RandomX/fillAes1Rx4.cl new file mode 100644 index 00000000..f06992ca --- /dev/null +++ b/src/amd/opencl/RandomX/fillAes1Rx4.cl @@ -0,0 +1,120 @@ +R"===( +/* +Copyright (c) 2019 SChernykh + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void fillAes_name(__global void* state, __global void* out, uint batch_size, uint rx_version) +{ + __local uint T[2048]; + + const uint stride_size = batch_size * 4; + const uint global_index = get_global_id(0); + if (global_index >= stride_size) + return; + + const uint idx = global_index / 4; + const uint sub = global_index % 4; + + for (uint i = get_local_id(0), step = get_local_size(0); i < 2048; i += step) + T[i] = AES_TABLE[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + +#if num_rounds != 4 + const uint k[4] = { AES_KEY_FILL[sub * 4], AES_KEY_FILL[sub * 4 + 1], AES_KEY_FILL[sub * 4 + 2], AES_KEY_FILL[sub * 4 + 3] }; +#else + const bool b1 = (rx_version < 104); + const bool b2 = (sub < 2); + + uint k[16]; + k[ 0] = b1 ? 0xf890465du : (b2 ? 0x6421aaddu : 0xb5826f73u); + k[ 1] = b1 ? 0x7ffbe4a6u : (b2 ? 0xd1833ddbu : 0xe3d6a7a6u); + k[ 2] = b1 ? 0x141f82b7u : (b2 ? 0x2f546d2bu : 0x3d518b6du); + k[ 3] = b1 ? 0xcf359e95u : (b2 ? 0x99e5d23fu : 0x229effb4u); + k[ 4] = b1 ? 0x6a55c450u : (b2 ? 0xb20e3450u : 0xc7566bf3u); + k[ 5] = b1 ? 0xfee8278au : (b2 ? 0xb6913f55u : 0x9c10b3d9u); + k[ 6] = b1 ? 0xbd5c5ac3u : (b2 ? 0x06f79d53u : 0xe9024d4eu); + k[ 7] = b1 ? 0x6741ffdcu : (b2 ? 0xa5dfcde5u : 0xb272b7d2u); + k[ 8] = b1 ? 0x114c47a4u : (b2 ? 0x5c3ed904u : 0xf273c9e7u); + k[ 9] = b1 ? 0xd524fde4u : (b2 ? 0x515e7bafu : 0xf765a38bu); + k[10] = b1 ? 0xa7279ad2u : (b2 ? 0x0aa4679fu : 0x2ba9660au); + k[11] = b1 ? 0x3d324aacu : (b2 ? 0x171c02bfu : 0xf63befa7u); + k[12] = b1 ? 0x810c3a2au : (b2 ? 0x85623763u : 0x7a7cd609u); + k[13] = b1 ? 0x99a9aeffu : (b2 ? 0xe78f5d08u : 0x915839deu); + k[14] = b1 ? 0x42d3dbd9u : (b2 ? 0xcd673785u : 0x0c06d1fdu); + k[15] = b1 ? 0x76f6db08u : (b2 ? 0xd8ded291u : 0xc0b0762du); +#endif + + __global uint* s = ((__global uint*) state) + idx * (64 / sizeof(uint)) + sub * (16 / sizeof(uint)); + uint x[4] = { s[0], s[1], s[2], s[3] }; + + const uint s1 = (sub & 1) ? 8 : 24; + const uint s3 = (sub & 1) ? 24 : 8; + + __global uint4* p = strided ? (((__global uint4*) out) + idx * 4 + sub) : (((__global uint4*) out) + idx * (outputSize0 / sizeof(uint4)) + sub); + + const __local uint* const t0 = (sub & 1) ? T : (T + 1024); + const __local uint* const t1 = (sub & 1) ? (T + 256) : (T + 1792); + const __local uint* const t2 = (sub & 1) ? (T + 512) : (T + 1536); + const __local uint* const t3 = (sub & 1) ? (T + 768) : (T + 1280); + + #pragma unroll(unroll_factor) + for (uint i = 0; i < outputSize / sizeof(uint4); i += 4, p += strided ? stride_size : 4) + { + uint y[4]; + +#if num_rounds != 4 + y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[0]; + y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[1]; + y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[2]; + y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[3]; + + *p = *(uint4*)(y); + + x[0] = y[0]; + x[1] = y[1]; + x[2] = y[2]; + x[3] = y[3]; +#else + y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[ 0]; + y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[ 1]; + y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[ 2]; + y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[ 3]; + + x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ k[ 4]; + x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ k[ 5]; + x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ k[ 6]; + x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ k[ 7]; + + y[0] = t0[get_byte32(x[0], 0)] ^ t1[get_byte32(x[1], s1)] ^ t2[get_byte32(x[2], 16)] ^ t3[get_byte32(x[3], s3)] ^ k[ 8]; + y[1] = t0[get_byte32(x[1], 0)] ^ t1[get_byte32(x[2], s1)] ^ t2[get_byte32(x[3], 16)] ^ t3[get_byte32(x[0], s3)] ^ k[ 9]; + y[2] = t0[get_byte32(x[2], 0)] ^ t1[get_byte32(x[3], s1)] ^ t2[get_byte32(x[0], 16)] ^ t3[get_byte32(x[1], s3)] ^ k[10]; + y[3] = t0[get_byte32(x[3], 0)] ^ t1[get_byte32(x[0], s1)] ^ t2[get_byte32(x[1], 16)] ^ t3[get_byte32(x[2], s3)] ^ k[11]; + + x[0] = t0[get_byte32(y[0], 0)] ^ t1[get_byte32(y[1], s1)] ^ t2[get_byte32(y[2], 16)] ^ t3[get_byte32(y[3], s3)] ^ k[12]; + x[1] = t0[get_byte32(y[1], 0)] ^ t1[get_byte32(y[2], s1)] ^ t2[get_byte32(y[3], 16)] ^ t3[get_byte32(y[0], s3)] ^ k[13]; + x[2] = t0[get_byte32(y[2], 0)] ^ t1[get_byte32(y[3], s1)] ^ t2[get_byte32(y[0], 16)] ^ t3[get_byte32(y[1], s3)] ^ k[14]; + x[3] = t0[get_byte32(y[3], 0)] ^ t1[get_byte32(y[0], s1)] ^ t2[get_byte32(y[1], 16)] ^ t3[get_byte32(y[2], s3)] ^ k[15]; + + *p = *(uint4*)(x); +#endif + } + *(__global uint4*)(s) = *(uint4*)(x); +} +)===" diff --git a/src/amd/opencl/RandomX/randomx_constants_loki.h b/src/amd/opencl/RandomX/randomx_constants_loki.h new file mode 100644 index 00000000..c56381c9 --- /dev/null +++ b/src/amd/opencl/RandomX/randomx_constants_loki.h @@ -0,0 +1,100 @@ +R"===( +/* +Copyright (c) 2019 SChernykh + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +//Dataset base size in bytes. Must be a power of 2. +#define RANDOMX_DATASET_BASE_SIZE 2147483648 + +//Dataset extra size. Must be divisible by 64. +#define RANDOMX_DATASET_EXTRA_SIZE 33554368 + +//Scratchpad L3 size in bytes. Must be a power of 2. +#define RANDOMX_SCRATCHPAD_L3 2097152 + +//Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. +#define RANDOMX_SCRATCHPAD_L2 262144 + +//Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. +#define RANDOMX_SCRATCHPAD_L1 16384 + +//Jump condition mask size in bits. +#define RANDOMX_JUMP_BITS 8 + +//Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. +#define RANDOMX_JUMP_OFFSET 8 + +//Integer instructions +#define RANDOMX_FREQ_IADD_RS 25 +#define RANDOMX_FREQ_IADD_M 7 +#define RANDOMX_FREQ_ISUB_R 16 +#define RANDOMX_FREQ_ISUB_M 7 +#define RANDOMX_FREQ_IMUL_R 16 +#define RANDOMX_FREQ_IMUL_M 4 +#define RANDOMX_FREQ_IMULH_R 4 +#define RANDOMX_FREQ_IMULH_M 1 +#define RANDOMX_FREQ_ISMULH_R 4 +#define RANDOMX_FREQ_ISMULH_M 1 +#define RANDOMX_FREQ_IMUL_RCP 8 +#define RANDOMX_FREQ_INEG_R 2 +#define RANDOMX_FREQ_IXOR_R 15 +#define RANDOMX_FREQ_IXOR_M 5 +#define RANDOMX_FREQ_IROR_R 8 +#define RANDOMX_FREQ_IROL_R 2 +#define RANDOMX_FREQ_ISWAP_R 4 + +//Floating point instructions +#define RANDOMX_FREQ_FSWAP_R 4 +#define RANDOMX_FREQ_FADD_R 16 +#define RANDOMX_FREQ_FADD_M 5 +#define RANDOMX_FREQ_FSUB_R 16 +#define RANDOMX_FREQ_FSUB_M 5 +#define RANDOMX_FREQ_FSCAL_R 6 +#define RANDOMX_FREQ_FMUL_R 32 +#define RANDOMX_FREQ_FDIV_M 4 +#define RANDOMX_FREQ_FSQRT_R 6 + +//Control instructions +#define RANDOMX_FREQ_CBRANCH 16 +#define RANDOMX_FREQ_CFROUND 1 + +//Store instruction +#define RANDOMX_FREQ_ISTORE 16 + +//No-op instruction +#define RANDOMX_FREQ_NOP 0 + +#define RANDOMX_DATASET_ITEM_SIZE 64 + +#define RANDOMX_PROGRAM_SIZE 320 + +#define SCRATCHPAD_STRIDED 0 + +#define HASH_SIZE 64 +#define ENTROPY_SIZE (128 + RANDOMX_PROGRAM_SIZE * 8) +#define REGISTERS_SIZE 256 +#define IMM_BUF_SIZE (RANDOMX_PROGRAM_SIZE * 4 - REGISTERS_SIZE) +#define IMM_INDEX_COUNT ((IMM_BUF_SIZE / 4) - 2) +#define VM_STATE_SIZE (REGISTERS_SIZE + IMM_BUF_SIZE + RANDOMX_PROGRAM_SIZE * 4) +#define ROUNDING_MODE (RANDOMX_FREQ_CFROUND ? -1 : 0) + +// Scratchpad L1/L2/L3 bits +#define LOC_L1 (32 - 14) +#define LOC_L2 (32 - 18) +#define LOC_L3 (32 - 21) +)===" diff --git a/src/amd/opencl/RandomX/randomx_constants_wow.h b/src/amd/opencl/RandomX/randomx_constants_wow.h new file mode 100644 index 00000000..27cbae94 --- /dev/null +++ b/src/amd/opencl/RandomX/randomx_constants_wow.h @@ -0,0 +1,100 @@ +R"===( +/* +Copyright (c) 2019 SChernykh + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +//Dataset base size in bytes. Must be a power of 2. +#define RANDOMX_DATASET_BASE_SIZE 2147483648 + +//Dataset extra size. Must be divisible by 64. +#define RANDOMX_DATASET_EXTRA_SIZE 33554368 + +//Scratchpad L3 size in bytes. Must be a power of 2. +#define RANDOMX_SCRATCHPAD_L3 1048576 + +//Scratchpad L2 size in bytes. Must be a power of two and less than or equal to RANDOMX_SCRATCHPAD_L3. +#define RANDOMX_SCRATCHPAD_L2 131072 + +//Scratchpad L1 size in bytes. Must be a power of two (minimum 64) and less than or equal to RANDOMX_SCRATCHPAD_L2. +#define RANDOMX_SCRATCHPAD_L1 16384 + +//Jump condition mask size in bits. +#define RANDOMX_JUMP_BITS 8 + +//Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. +#define RANDOMX_JUMP_OFFSET 8 + +//Integer instructions +#define RANDOMX_FREQ_IADD_RS 25 +#define RANDOMX_FREQ_IADD_M 7 +#define RANDOMX_FREQ_ISUB_R 16 +#define RANDOMX_FREQ_ISUB_M 7 +#define RANDOMX_FREQ_IMUL_R 16 +#define RANDOMX_FREQ_IMUL_M 4 +#define RANDOMX_FREQ_IMULH_R 4 +#define RANDOMX_FREQ_IMULH_M 1 +#define RANDOMX_FREQ_ISMULH_R 4 +#define RANDOMX_FREQ_ISMULH_M 1 +#define RANDOMX_FREQ_IMUL_RCP 8 +#define RANDOMX_FREQ_INEG_R 2 +#define RANDOMX_FREQ_IXOR_R 15 +#define RANDOMX_FREQ_IXOR_M 5 +#define RANDOMX_FREQ_IROR_R 10 +#define RANDOMX_FREQ_IROL_R 0 +#define RANDOMX_FREQ_ISWAP_R 4 + +//Floating point instructions +#define RANDOMX_FREQ_FSWAP_R 8 +#define RANDOMX_FREQ_FADD_R 20 +#define RANDOMX_FREQ_FADD_M 5 +#define RANDOMX_FREQ_FSUB_R 20 +#define RANDOMX_FREQ_FSUB_M 5 +#define RANDOMX_FREQ_FSCAL_R 6 +#define RANDOMX_FREQ_FMUL_R 20 +#define RANDOMX_FREQ_FDIV_M 4 +#define RANDOMX_FREQ_FSQRT_R 6 + +//Control instructions +#define RANDOMX_FREQ_CBRANCH 16 +#define RANDOMX_FREQ_CFROUND 1 + +//Store instruction +#define RANDOMX_FREQ_ISTORE 16 + +//No-op instruction +#define RANDOMX_FREQ_NOP 0 + +#define RANDOMX_DATASET_ITEM_SIZE 64 + +#define RANDOMX_PROGRAM_SIZE 256 + +#define SCRATCHPAD_STRIDED 0 + +#define HASH_SIZE 64 +#define ENTROPY_SIZE (128 + RANDOMX_PROGRAM_SIZE * 8) +#define REGISTERS_SIZE 256 +#define IMM_BUF_SIZE (RANDOMX_PROGRAM_SIZE * 4 - REGISTERS_SIZE) +#define IMM_INDEX_COUNT ((IMM_BUF_SIZE / 4) - 2) +#define VM_STATE_SIZE (REGISTERS_SIZE + IMM_BUF_SIZE + RANDOMX_PROGRAM_SIZE * 4) +#define ROUNDING_MODE (RANDOMX_FREQ_CFROUND ? -1 : 0) + +// Scratchpad L1/L2/L3 bits +#define LOC_L1 (32 - 14) +#define LOC_L2 (32 - 17) +#define LOC_L3 (32 - 20) +)===" diff --git a/src/amd/opencl/RandomX/randomx_vm.cl b/src/amd/opencl/RandomX/randomx_vm.cl new file mode 100644 index 00000000..7204467a --- /dev/null +++ b/src/amd/opencl/RandomX/randomx_vm.cl @@ -0,0 +1,2070 @@ +R"===( +/* +Copyright (c) 2019 SChernykh +Portions Copyright (c) 2018-2019 tevador + +This file is part of RandomX OpenCL. + +RandomX OpenCL is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +RandomX OpenCL is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with RandomX OpenCL. If not, see . +*/ + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#define CacheLineSize 64 +#define ScratchpadL3Mask64 (RANDOMX_SCRATCHPAD_L3 - CacheLineSize) +#define CacheLineAlignMask ((RANDOMX_DATASET_BASE_SIZE - 1) & ~(CacheLineSize - 1)) + +#define mantissaSize 52 +#define exponentSize 11 +#define mantissaMask ((1UL << mantissaSize) - 1) +#define exponentMask ((1UL << exponentSize) - 1) +#define exponentBias 1023 +#define constExponentBits 0x300 +#define dynamicExponentBits 4 +#define staticExponentBits 4 +#define dynamicMantissaMask ((1UL << (mantissaSize + dynamicExponentBits)) - 1) + +#define RegistersCount 8 +#define RegisterCountFlt (RegistersCount / 2) +#define ConditionMask ((1 << RANDOMX_JUMP_BITS) - 1) +#define ConditionOffset RANDOMX_JUMP_OFFSET +#define StoreL3Condition 14 +#define DatasetExtraItems (RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE) + +#define RegisterNeedsDisplacement 5 + +// +// VM state: +// +// Bytes 0-255: registers +// Bytes 256-1023: imm32 values (up to 192 values can be stored). IMUL_RCP and CBRANCH use 2 consecutive imm32 values. +// Bytes 1024-2047: up to 256 instructions +// +// Instruction encoding: +// +// Bits 0-2: dst (0-7) +// Bits 3-5: src (0-7) +// Bits 6-13: imm32/64 offset (in DWORDs, 0-191) +// Bit 14: src location (register, scratchpad) +// Bits 15-16: src shift (0-3), ADD/MUL switch for FMA instruction +// Bit 17: src=imm32 +// Bit 18: src=imm64 +// Bit 19: src = -src +// Bits 20-23: opcode (add_rs, add, mul, umul_hi, imul_hi, neg, xor, ror, swap, cbranch, store, fswap, fma, fsqrt, fdiv, cfround) +// Bits 24-27: how many parallel instructions to run starting with this one (1-16) +// Bits 28-31: how many of them are FP instructions (0-8) +// + +#define DST_OFFSET 0 +#define SRC_OFFSET 3 +#define IMM_OFFSET 6 +#define LOC_OFFSET 14 +#define SHIFT_OFFSET 15 +#define SRC_IS_IMM32_OFFSET 17 +#define SRC_IS_IMM64_OFFSET 18 +#define NEGATIVE_SRC_OFFSET 19 +#define OPCODE_OFFSET 20 +#define NUM_INSTS_OFFSET 24 +#define NUM_FP_INSTS_OFFSET 28 + +// ISWAP r0, r0 +#define INST_NOP (8 << OPCODE_OFFSET) + +typedef uchar uint8_t; +typedef ushort uint16_t; +typedef uint uint32_t; +typedef ulong uint64_t; + +typedef int int32_t; +typedef long int64_t; + +double getSmallPositiveFloatBits(uint64_t entropy) +{ + uint64_t exponent = entropy >> 59; //0..31 + uint64_t mantissa = entropy & mantissaMask; + exponent += exponentBias; + exponent &= exponentMask; + exponent <<= mantissaSize; + return as_double(exponent | mantissa); +} + +uint64_t getStaticExponent(uint64_t entropy) +{ + uint64_t exponent = constExponentBits; + exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits; + exponent <<= mantissaSize; + return exponent; +} + +uint64_t getFloatMask(uint64_t entropy) +{ + const uint64_t mask22bit = (1UL << 22) - 1; + return (entropy & mask22bit) | getStaticExponent(entropy); +} + +void set_buffer(__local uint32_t *dst_buf, uint32_t N, const uint32_t value) +{ + uint32_t i = get_local_id(0) * sizeof(uint32_t); + const uint32_t step = get_local_size(0) * sizeof(uint32_t); + __local uint8_t* dst = ((__local uint8_t*)dst_buf) + i; + while (i < sizeof(uint32_t) * N) + { + *(__local uint32_t*)(dst) = value; + dst += step; + i += step; + } +} + +uint64_t imul_rcp_value(uint32_t divisor) +{ + if ((divisor & (divisor - 1)) == 0) + { + return 1UL; + } + + const uint64_t p2exp63 = 1UL << 63; + + uint64_t quotient = p2exp63 / divisor; + uint64_t remainder = p2exp63 % divisor; + + const uint32_t bsr = 31 - clz(divisor); + + for (uint32_t shift = 0; shift <= bsr; ++shift) + { + const bool b = (remainder >= divisor - remainder); + quotient = (quotient << 1) | (b ? 1 : 0); + remainder = (remainder << 1) - (b ? divisor : 0); + } + + return quotient; +} + +)===" +R"===( + +#define set_byte(a, position, value) do { ((uint8_t*)&(a))[(position)] = (value); } while (0) +uint32_t get_byte(uint64_t a, uint32_t position) { return (a >> (position << 3)) & 0xFF; } +#define update_max(value, next_value) do { if ((value) < (next_value)) (value) = (next_value); } while (0) + +__attribute__((reqd_work_group_size(32, 1, 1))) +__kernel void init_vm(__global const void* entropy_data, __global void* vm_states, __global uint32_t* rounding, uint32_t iteration) +{ +#if RANDOMX_PROGRAM_SIZE <= 256 + typedef uint8_t exec_t; +#else + typedef uint16_t exec_t; +#endif + + __local uint32_t execution_plan_buf[RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH * (32 / 8) * sizeof(exec_t) / sizeof(uint32_t)]; + + set_buffer(execution_plan_buf, sizeof(execution_plan_buf) / sizeof(uint32_t), 0); + barrier(CLK_LOCAL_MEM_FENCE); + + const uint32_t global_index = get_global_id(0); + const uint32_t idx = global_index / 8; + const uint32_t sub = global_index % 8; + + __local exec_t* execution_plan = (__local exec_t*)(execution_plan_buf + (get_local_id(0) / 8) * RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH * sizeof(exec_t) / sizeof(uint32_t)); + + __global uint64_t* R = ((__global uint64_t*)vm_states) + idx * VM_STATE_SIZE / sizeof(uint64_t); + R[sub] = 0; + + const __global uint64_t* entropy = ((const __global uint64_t*)entropy_data) + idx * ENTROPY_SIZE / sizeof(uint64_t); + + __global double* A = (__global double*)(R + 24); + A[sub] = getSmallPositiveFloatBits(entropy[sub]); + + if (sub == 0) + { + if (iteration == 0) + rounding[idx] = 0; + + __global uint2* src_program = (__global uint2*)(entropy + 128 / sizeof(uint64_t)); + +#if RANDOMX_PROGRAM_SIZE <= 256 + uint64_t registerLastChanged = 0; + uint64_t registerWasChanged = 0; +#else + int32_t registerLastChanged[8] = { -1, -1, -1, -1, -1, -1, -1, -1 }; +#endif + + // Initialize CBRANCH instructions + for (uint32_t i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) + { + // Clear all src flags (branch target, FP, branch) + *(__global uint32_t*)(src_program + i) &= ~(0xF8U << 8); + + const uint2 src_inst = src_program[i]; + uint2 inst = src_inst; + + uint32_t opcode = inst.x & 0xff; + const uint32_t dst = (inst.x >> 8) & 7; + const uint32_t src = (inst.x >> 16) & 7; + + if (opcode < RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M) + { +#if RANDOMX_PROGRAM_SIZE <= 256 + set_byte(registerLastChanged, dst, i); + set_byte(registerWasChanged, dst, 1); +#else + registerLastChanged[dst] = i; +#endif + continue; + } + opcode -= RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M; + + if (opcode < RANDOMX_FREQ_IMUL_RCP) + { + if (inst.y & (inst.y - 1)) + { +#if RANDOMX_PROGRAM_SIZE <= 256 + set_byte(registerLastChanged, dst, i); + set_byte(registerWasChanged, dst, 1); +#else + registerLastChanged[dst] = i; +#endif + } + continue; + } + opcode -= RANDOMX_FREQ_IMUL_RCP; + + if (opcode < RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R) + { +#if RANDOMX_PROGRAM_SIZE <= 256 + set_byte(registerLastChanged, dst, i); + set_byte(registerWasChanged, dst, 1); +#else + registerLastChanged[dst] = i; +#endif + continue; + } + opcode -= RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R; + + if (opcode < RANDOMX_FREQ_ISWAP_R) + { + if (src != dst) + { +#if RANDOMX_PROGRAM_SIZE <= 256 + set_byte(registerLastChanged, dst, i); + set_byte(registerWasChanged, dst, 1); + set_byte(registerLastChanged, src, i); + set_byte(registerWasChanged, src, 1); +#else + registerLastChanged[dst] = i; + registerLastChanged[src] = i; +#endif + } + continue; + } + opcode -= RANDOMX_FREQ_ISWAP_R; + + if (opcode < RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R) + { + // Mark FP instruction (src |= 0x20) + *(__global uint32_t*)(src_program + i) |= 0x20 << 8; + continue; + } + opcode -= RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R; + + if (opcode < RANDOMX_FREQ_CBRANCH) + { + const uint32_t creg = dst; +#if RANDOMX_PROGRAM_SIZE <= 256 + const uint32_t change = get_byte(registerLastChanged, dst); + const int32_t lastChanged = (get_byte(registerWasChanged, dst) == 0) ? -1 : (int32_t)(change); + + // Store condition register and branch target in CBRANCH instruction + *(__global uint32_t*)(src_program + i) = (src_inst.x & 0xFF0000FFU) | ((creg | ((lastChanged == -1) ? 0x90 : 0x10)) << 8) | (((uint32_t)(lastChanged) & 0xFF) << 16); +#else + const int32_t lastChanged = registerLastChanged[dst]; + + // Store condition register in CBRANCH instruction + *(__global uint32_t*)(src_program + i) = (src_inst.x & 0xFF0000FFU) | ((creg | 0x10) << 8); +#endif + + // Mark branch target instruction (src |= 0x40) + *(__global uint32_t*)(src_program + lastChanged + 1) |= 0x40 << 8; + +#if RANDOMX_PROGRAM_SIZE <= 256 + uint32_t tmp = i | (i << 8); + registerLastChanged = tmp | (tmp << 16); + registerLastChanged = registerLastChanged | (registerLastChanged << 32); + + registerWasChanged = 0x0101010101010101UL; +#else + registerLastChanged[0] = i; + registerLastChanged[1] = i; + registerLastChanged[2] = i; + registerLastChanged[3] = i; + registerLastChanged[4] = i; + registerLastChanged[5] = i; + registerLastChanged[6] = i; + registerLastChanged[7] = i; +#endif + } + } + + uint64_t registerLatency = 0; + uint64_t registerReadCycle = 0; + uint64_t registerLatencyFP = 0; + uint64_t registerReadCycleFP = 0; + uint32_t ScratchpadHighLatency = 0; + uint32_t ScratchpadLatency = 0; + + int32_t first_available_slot = 0; + int32_t first_allowed_slot_cfround = 0; + int32_t last_used_slot = -1; + int32_t last_memory_op_slot = -1; + + uint32_t num_slots_used = 0; + uint32_t num_instructions = 0; + + int32_t first_instruction_slot = -1; + bool first_instruction_fp = false; + + //if (global_index == 0) + //{ + // for (int j = 0; j < RANDOMX_PROGRAM_SIZE; ++j) + // { + // print_inst(src_program[j]); + // printf("\n"); + // } + // printf("\n"); + //} + +)===" +R"===( + + // Schedule instructions + bool update_branch_target_mark = false; + bool first_available_slot_is_branch_target = false; + for (uint32_t i = 0; i < RANDOMX_PROGRAM_SIZE; ++i) + { + const uint2 inst = src_program[i]; + + uint32_t opcode = inst.x & 0xff; + uint32_t dst = (inst.x >> 8) & 7; + const uint32_t src = (inst.x >> 16) & 7; + const uint32_t mod = (inst.x >> 24); + + bool is_branch_target = (inst.x & (0x40 << 8)) != 0; + if (is_branch_target) + { + // If an instruction is a branch target, we can't move it before any previous instructions + first_available_slot = last_used_slot + 1; + + // Mark this slot as a branch target + // Whatever instruction takes this slot will receive branch target flag + first_available_slot_is_branch_target = true; + } + + const uint32_t dst_latency = get_byte(registerLatency, dst); + const uint32_t src_latency = get_byte(registerLatency, src); + const uint32_t reg_read_latency = (dst_latency > src_latency) ? dst_latency : src_latency; + const uint32_t mem_read_latency = ((dst == src) && ((inst.y & ScratchpadL3Mask64) >= RANDOMX_SCRATCHPAD_L2)) ? ScratchpadHighLatency : ScratchpadLatency; + + uint32_t full_read_latency = mem_read_latency; + update_max(full_read_latency, reg_read_latency); + + uint32_t latency = 0; + bool is_memory_op = false; + bool is_memory_store = false; + bool is_nop = false; + bool is_branch = false; + bool is_swap = false; + bool is_src_read = true; + bool is_fp = false; + bool is_cfround = false; + + do { + if (opcode < RANDOMX_FREQ_IADD_RS) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_IADD_RS; + + if (opcode < RANDOMX_FREQ_IADD_M) + { + latency = full_read_latency; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_IADD_M; + + if (opcode < RANDOMX_FREQ_ISUB_R) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_ISUB_R; + + if (opcode < RANDOMX_FREQ_ISUB_M) + { + latency = full_read_latency; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_ISUB_M; + + if (opcode < RANDOMX_FREQ_IMUL_R) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_IMUL_R; + + if (opcode < RANDOMX_FREQ_IMUL_M) + { + latency = full_read_latency; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_IMUL_M; + + if (opcode < RANDOMX_FREQ_IMULH_R) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_IMULH_R; + + if (opcode < RANDOMX_FREQ_IMULH_M) + { + latency = full_read_latency; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_IMULH_M; + + if (opcode < RANDOMX_FREQ_ISMULH_R) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_ISMULH_R; + + if (opcode < RANDOMX_FREQ_ISMULH_M) + { + latency = full_read_latency; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_ISMULH_M; + + if (opcode < RANDOMX_FREQ_IMUL_RCP) + { + is_src_read = false; + if (inst.y & (inst.y - 1)) + latency = dst_latency; + else + is_nop = true; + break; + } + opcode -= RANDOMX_FREQ_IMUL_RCP; + + if (opcode < RANDOMX_FREQ_INEG_R) + { + is_src_read = false; + latency = dst_latency; + break; + } + opcode -= RANDOMX_FREQ_INEG_R; + + if (opcode < RANDOMX_FREQ_IXOR_R) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_IXOR_R; + + if (opcode < RANDOMX_FREQ_IXOR_M) + { + latency = full_read_latency; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_IXOR_M; + + if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R) + { + latency = reg_read_latency; + break; + } + opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R; + + if (opcode < RANDOMX_FREQ_ISWAP_R) + { + is_swap = true; + if (dst != src) + latency = reg_read_latency; + else + is_nop = true; + break; + } + opcode -= RANDOMX_FREQ_ISWAP_R; + + if (opcode < RANDOMX_FREQ_FSWAP_R) + { + latency = get_byte(registerLatencyFP, dst); + is_fp = true; + is_src_read = false; + break; + } + opcode -= RANDOMX_FREQ_FSWAP_R; + + if (opcode < RANDOMX_FREQ_FADD_R) + { + dst %= RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + is_fp = true; + is_src_read = false; + break; + } + opcode -= RANDOMX_FREQ_FADD_R; + + if (opcode < RANDOMX_FREQ_FADD_M) + { + dst %= RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + update_max(latency, src_latency); + update_max(latency, ScratchpadLatency); + is_fp = true; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_FADD_M; + + if (opcode < RANDOMX_FREQ_FSUB_R) + { + dst %= RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + is_fp = true; + is_src_read = false; + break; + } + opcode -= RANDOMX_FREQ_FSUB_R; + + if (opcode < RANDOMX_FREQ_FSUB_M) + { + dst %= RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + update_max(latency, src_latency); + update_max(latency, ScratchpadLatency); + is_fp = true; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_FSUB_M; + + if (opcode < RANDOMX_FREQ_FSCAL_R) + { + dst %= RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + is_fp = true; + is_src_read = false; + break; + } + opcode -= RANDOMX_FREQ_FSCAL_R; + + if (opcode < RANDOMX_FREQ_FMUL_R) + { + dst = (dst % RegisterCountFlt) + RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + is_fp = true; + is_src_read = false; + break; + } + opcode -= RANDOMX_FREQ_FMUL_R; + + if (opcode < RANDOMX_FREQ_FDIV_M) + { + dst = (dst % RegisterCountFlt) + RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + update_max(latency, src_latency); + update_max(latency, ScratchpadLatency); + is_fp = true; + is_memory_op = true; + break; + } + opcode -= RANDOMX_FREQ_FDIV_M; + + if (opcode < RANDOMX_FREQ_FSQRT_R) + { + dst = (dst % RegisterCountFlt) + RegisterCountFlt; + latency = get_byte(registerLatencyFP, dst); + is_fp = true; + is_src_read = false; + break; + } + opcode -= RANDOMX_FREQ_FSQRT_R; + + if (opcode < RANDOMX_FREQ_CBRANCH) + { + is_src_read = false; + is_branch = true; + latency = dst_latency; + + // We can't move CBRANCH before any previous instructions + first_available_slot = last_used_slot + 1; + break; + } + opcode -= RANDOMX_FREQ_CBRANCH; + + if (opcode < RANDOMX_FREQ_CFROUND) + { + latency = src_latency; + is_cfround = true; + break; + } + opcode -= RANDOMX_FREQ_CFROUND; + + if (opcode < RANDOMX_FREQ_ISTORE) + { + latency = reg_read_latency; + update_max(latency, (last_memory_op_slot + WORKERS_PER_HASH) / WORKERS_PER_HASH); + is_memory_op = true; + is_memory_store = true; + break; + } + opcode -= RANDOMX_FREQ_ISTORE; + + is_nop = true; + } while (false); + + if (is_nop) + { + if (is_branch_target) + { + // Mark next non-NOP instruction as the branch target instead of this NOP + update_branch_target_mark = true; + } + continue; + } + + if (update_branch_target_mark) + { + *(__global uint32_t*)(src_program + i) |= 0x40 << 8; + update_branch_target_mark = false; + is_branch_target = true; + } + + int32_t first_allowed_slot = first_available_slot; + update_max(first_allowed_slot, latency * WORKERS_PER_HASH); + if (is_cfround) + update_max(first_allowed_slot, first_allowed_slot_cfround); + else + update_max(first_allowed_slot, get_byte(is_fp ? registerReadCycleFP : registerReadCycle, dst) * WORKERS_PER_HASH); + + if (is_swap) + update_max(first_allowed_slot, get_byte(registerReadCycle, src) * WORKERS_PER_HASH); + + int32_t slot_to_use = last_used_slot + 1; + update_max(slot_to_use, first_allowed_slot); + + if (is_fp) + { + slot_to_use = -1; + for (int32_t j = first_allowed_slot; slot_to_use < 0; ++j) + { + if ((execution_plan[j] == 0) && (execution_plan[j + 1] == 0) && ((j + 1) % WORKERS_PER_HASH)) + { + bool blocked = false; + for (int32_t k = (j / WORKERS_PER_HASH) * WORKERS_PER_HASH; k < j; ++k) + { + if (execution_plan[k] || (k == first_instruction_slot)) + { + const uint32_t inst = src_program[execution_plan[k]].x; + + // If there is an integer instruction which is a branch target or a branch, or this FP instruction is a branch target itself, we can't reorder it to add more FP instructions to this cycle + if (((inst & (0x20 << 8)) == 0) && (((inst & (0x50 << 8)) != 0) || is_branch_target)) + { + blocked = true; + continue; + } + } + } + + if (!blocked) + { + for (int32_t k = (j / WORKERS_PER_HASH) * WORKERS_PER_HASH; k < j; ++k) + { + if (execution_plan[k] || (k == first_instruction_slot)) + { + const uint32_t inst = src_program[execution_plan[k]].x; + if ((inst & (0x20 << 8)) == 0) + { + execution_plan[j] = execution_plan[k]; + execution_plan[j + 1] = execution_plan[k + 1]; + if (first_instruction_slot == k) first_instruction_slot = j; + if (first_instruction_slot == k + 1) first_instruction_slot = j + 1; + slot_to_use = k; + break; + } + } + } + + if (slot_to_use < 0) + { + slot_to_use = j; + } + + break; + } + } + } + } + else + { + for (int32_t j = first_allowed_slot; j <= last_used_slot; ++j) + { + if (execution_plan[j] == 0) + { + slot_to_use = j; + break; + } + } + } + + if (i == 0) + { + first_instruction_slot = slot_to_use; + first_instruction_fp = is_fp; + } + + if (is_cfround) + { + first_allowed_slot_cfround = slot_to_use - (slot_to_use % WORKERS_PER_HASH) + WORKERS_PER_HASH; + } + + ++num_instructions; + + execution_plan[slot_to_use] = i; + ++num_slots_used; + + if (is_fp) + { + execution_plan[slot_to_use + 1] = i; + ++num_slots_used; + } + + const uint32_t next_latency = (slot_to_use / WORKERS_PER_HASH) + 1; + + if (is_src_read) + { + int32_t value = get_byte(registerReadCycle, src); + update_max(value, slot_to_use / WORKERS_PER_HASH); + set_byte(registerReadCycle, src, value); + } + + if (is_memory_op) + { + update_max(last_memory_op_slot, slot_to_use); + } + + if (is_cfround) + { + const uint32_t t = next_latency | (next_latency << 8); + registerLatencyFP = t | (t << 16); + registerLatencyFP = registerLatencyFP | (registerLatencyFP << 32); + } + else if (is_fp) + { + set_byte(registerLatencyFP, dst, next_latency); + + int32_t value = get_byte(registerReadCycleFP, dst); + update_max(value, slot_to_use / WORKERS_PER_HASH); + set_byte(registerReadCycleFP, dst, value); + } + else + { + if (!is_memory_store && !is_nop) + { + set_byte(registerLatency, dst, next_latency); + if (is_swap) + set_byte(registerLatency, src, next_latency); + + int32_t value = get_byte(registerReadCycle, dst); + update_max(value, slot_to_use / WORKERS_PER_HASH); + set_byte(registerReadCycle, dst, value); + } + + if (is_branch) + { + const uint32_t t = next_latency | (next_latency << 8); + registerLatency = t | (t << 16); + registerLatency = registerLatency | (registerLatency << 32); + } + + if (is_memory_store) + { + int32_t value = get_byte(registerReadCycle, dst); + update_max(value, slot_to_use / WORKERS_PER_HASH); + set_byte(registerReadCycle, dst, value); + ScratchpadLatency = (slot_to_use / WORKERS_PER_HASH) + 1; + if ((mod >> 4) >= StoreL3Condition) + ScratchpadHighLatency = (slot_to_use / WORKERS_PER_HASH) + 1; + } + } + + if (execution_plan[first_available_slot] || (first_available_slot == first_instruction_slot)) + { + if (first_available_slot_is_branch_target) + { + src_program[i].x |= 0x40 << 8; + first_available_slot_is_branch_target = false; + } + + if (is_fp) + ++first_available_slot; + + do { + ++first_available_slot; + } while ((first_available_slot < RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH) && (execution_plan[first_available_slot] != 0)); + } + + if (is_branch_target) + { + update_max(first_available_slot, is_fp ? (slot_to_use + 2) : (slot_to_use + 1)); + } + + update_max(last_used_slot, is_fp ? (slot_to_use + 1) : slot_to_use); + while (execution_plan[last_used_slot] || (last_used_slot == first_instruction_slot) || ((last_used_slot == first_instruction_slot + 1) && first_instruction_fp)) + { + ++last_used_slot; + } + --last_used_slot; + + if (is_fp && (last_used_slot >= first_allowed_slot_cfround)) + first_allowed_slot_cfround = last_used_slot + 1; + + //if (global_index == 0) + //{ + // printf("slot_to_use = %d, first_available_slot = %d, last_used_slot = %d\n", slot_to_use, first_available_slot, last_used_slot); + // for (int j = 0; j <= last_used_slot; ++j) + // { + // if (execution_plan[j] || (j == first_instruction_slot) || ((j == first_instruction_slot + 1) && first_instruction_fp)) + // { + // print_inst(src_program[execution_plan[j]]); + // printf(" | "); + // } + // else + // { + // printf(" | "); + // } + // if (((j + 1) % WORKERS_PER_HASH) == 0) printf("\n"); + // } + // printf("\n\n"); + //} + } + + //if (global_index == 0) + //{ + // printf("IPC = %.3f, WPC = %.3f, num_instructions = %u, num_slots_used = %u, first_instruction_slot = %d, last_used_slot = %d, registerLatency = %016llx, registerLatencyFP = %016llx \n", + // num_instructions / static_cast(last_used_slot / WORKERS_PER_HASH + 1), + // num_slots_used / static_cast(last_used_slot / WORKERS_PER_HASH + 1), + // num_instructions, + // num_slots_used, + // first_instruction_slot, + // last_used_slot, + // registerLatency, + // registerLatencyFP + // ); + + // //for (int j = 0; j < RANDOMX_PROGRAM_SIZE; ++j) + // //{ + // // print_inst(src_program[j]); + // // printf("\n"); + // //} + // //printf("\n"); + + // for (int j = 0; j <= last_used_slot; ++j) + // { + // if (execution_plan[j] || (j == first_instruction_slot) || ((j == first_instruction_slot + 1) && first_instruction_fp)) + // { + // print_inst(src_program[execution_plan[j]]); + // printf(" | "); + // } + // else + // { + // printf(" | "); + // } + // if (((j + 1) % WORKERS_PER_HASH) == 0) printf("\n"); + // } + // printf("\n\n"); + //} + + //atomicAdd((uint32_t*)num_vm_cycles, (last_used_slot / WORKERS_PER_HASH) + 1); + //atomicAdd((uint32_t*)(num_vm_cycles) + 1, num_slots_used); + + uint32_t ma = (uint32_t)(entropy[8]) & CacheLineAlignMask; + uint32_t mx = (uint32_t)(entropy[10]) & CacheLineAlignMask; + + uint32_t addressRegisters = (uint32_t)(entropy[12]); + addressRegisters = ((addressRegisters & 1) | (((addressRegisters & 2) ? 3U : 2U) << 8) | (((addressRegisters & 4) ? 5U : 4U) << 16) | (((addressRegisters & 8) ? 7U : 6U) << 24)) * sizeof(uint64_t); + + uint32_t datasetOffset = (entropy[13] & DatasetExtraItems) * CacheLineSize; + + ulong2 eMask = *(__global ulong2*)(entropy + 14); + eMask.x = getFloatMask(eMask.x); + eMask.y = getFloatMask(eMask.y); + + ((__global uint32_t*)(R + 16))[0] = ma; + ((__global uint32_t*)(R + 16))[1] = mx; + ((__global uint32_t*)(R + 16))[2] = addressRegisters; + ((__global uint32_t*)(R + 16))[3] = datasetOffset; + ((__global ulong2*)(R + 18))[0] = eMask; + + __global uint32_t* imm_buf = (__global uint32_t*)(R + REGISTERS_SIZE / sizeof(uint64_t)); + uint32_t imm_index = 0; + int32_t imm_index_fscal_r = -1; + __global uint32_t* compiled_program = (__global uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t)); + +)===" +R"===( + + // Generate opcodes for execute_vm + int32_t branch_target_slot = -1; + int32_t k = -1; + for (int32_t i = 0; i <= last_used_slot; ++i) + { + if (!(execution_plan[i] || (i == first_instruction_slot) || ((i == first_instruction_slot + 1) && first_instruction_fp))) + continue; + + uint32_t num_workers = 1; + uint32_t num_fp_insts = 0; + while ((i + num_workers <= last_used_slot) && ((i + num_workers) % WORKERS_PER_HASH) && (execution_plan[i + num_workers] || (i + num_workers == first_instruction_slot) || ((i + num_workers == first_instruction_slot + 1) && first_instruction_fp))) + { + if ((num_workers & 1) && ((src_program[execution_plan[i + num_workers]].x & (0x20 << 8)) != 0)) + ++num_fp_insts; + ++num_workers; + } + + //if (global_index == 0) + // printf("i = %d, num_workers = %u, num_fp_insts = %u\n", i, num_workers, num_fp_insts); + + num_workers = ((num_workers - 1) << NUM_INSTS_OFFSET) | (num_fp_insts << NUM_FP_INSTS_OFFSET); + + const uint2 src_inst = src_program[execution_plan[i]]; + uint2 inst = src_inst; + + uint32_t opcode = inst.x & 0xff; + const uint32_t dst = (inst.x >> 8) & 7; + const uint32_t src = (inst.x >> 16) & 7; + const uint32_t mod = (inst.x >> 24); + + const bool is_fp = (src_inst.x & (0x20 << 8)) != 0; + if (is_fp && ((i & 1) == 0)) + ++i; + + const bool is_branch_target = (src_inst.x & (0x40 << 8)) != 0; + if (is_branch_target && (branch_target_slot < 0)) + branch_target_slot = k; + + ++k; + + inst.x = INST_NOP; + + if (opcode < RANDOMX_FREQ_IADD_RS) + { + const uint32_t shift = (mod >> 2) % 4; + + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (shift << SHIFT_OFFSET); + + if (dst != RegisterNeedsDisplacement) + { + // Encode regular ADD (opcode 1) + inst.x |= (1 << OPCODE_OFFSET); + } + else + { + // Encode ADD with src and imm32 (opcode 0) + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = inst.y; + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IADD_RS; + + if (opcode < RANDOMX_FREQ_IADD_M) + { + const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (1 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IADD_M; + + if (opcode < RANDOMX_FREQ_ISUB_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET); + if (src == dst) + { + inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET); + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = inst.y; + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_ISUB_R; + + if (opcode < RANDOMX_FREQ_ISUB_M) + { + const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (1 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_ISUB_M; + + if (opcode < RANDOMX_FREQ_IMUL_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (2 << OPCODE_OFFSET); + if (src == dst) + { + inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET); + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = inst.y; + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IMUL_R; + + if (opcode < RANDOMX_FREQ_IMUL_M) + { + const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (2 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IMUL_M; + + if (opcode < RANDOMX_FREQ_IMULH_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (6 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IMULH_R; + + if (opcode < RANDOMX_FREQ_IMULH_M) + { + const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (6 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IMULH_M; + + if (opcode < RANDOMX_FREQ_ISMULH_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (4 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_ISMULH_R; + + if (opcode < RANDOMX_FREQ_ISMULH_M) + { + const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (4 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_ISMULH_M; + + if (opcode < RANDOMX_FREQ_IMUL_RCP) + { + const uint64_t r = imul_rcp_value(inst.y); + if (r == 1) + { + *(compiled_program++) = INST_NOP | num_workers; + continue; + } + + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (2 << OPCODE_OFFSET); + inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM64_OFFSET); + + if (imm_index < IMM_INDEX_COUNT - 1) + { + imm_buf[imm_index] = ((const uint32_t*)&r)[0]; + imm_buf[imm_index + 1] = ((const uint32_t*)&r)[1]; + imm_index += 2; + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IMUL_RCP; + + if (opcode < RANDOMX_FREQ_INEG_R) + { + inst.x = (dst << DST_OFFSET) | (5 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_INEG_R; + + if (opcode < RANDOMX_FREQ_IXOR_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (3 << OPCODE_OFFSET); + if (src == dst) + { + inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET); + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = inst.y; + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IXOR_R; + + if (opcode < RANDOMX_FREQ_IXOR_M) + { + const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (3 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IXOR_M; + + if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (7 << OPCODE_OFFSET); + if (src == dst) + { + inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET); + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = inst.y; + } + if (opcode >= RANDOMX_FREQ_IROR_R) + { + inst.x |= (1 << NEGATIVE_SRC_OFFSET); + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R; + + if (opcode < RANDOMX_FREQ_ISWAP_R) + { + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (8 << OPCODE_OFFSET); + + *(compiled_program++) = ((src != dst) ? inst.x : INST_NOP) | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_ISWAP_R; + + if (opcode < RANDOMX_FREQ_FSWAP_R) + { + inst.x = (dst << DST_OFFSET) | (11 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FSWAP_R; + + if (opcode < RANDOMX_FREQ_FADD_R) + { + inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (12 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FADD_R; + + if (opcode < RANDOMX_FREQ_FADD_M) + { + const uint32_t location = (mod % 4) ? 1 : 2; + inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (12 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FADD_M; + + if (opcode < RANDOMX_FREQ_FSUB_R) + { + inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (12 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FSUB_R; + + if (opcode < RANDOMX_FREQ_FSUB_M) + { + const uint32_t location = (mod % 4) ? 1 : 2; + inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (12 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FSUB_M; + + if (opcode < RANDOMX_FREQ_FSCAL_R) + { + inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (1 << SRC_IS_IMM64_OFFSET) | (3 << OPCODE_OFFSET); + if (imm_index_fscal_r >= 0) + { + inst.x |= (imm_index_fscal_r << IMM_OFFSET); + } + else + { + imm_index_fscal_r = imm_index; + inst.x |= (imm_index << IMM_OFFSET); + + if (imm_index < IMM_INDEX_COUNT - 1) + { + imm_buf[imm_index] = 0; + imm_buf[imm_index + 1] = 0x80F00000UL; + imm_index += 2; + } + } + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FSCAL_R; + + if (opcode < RANDOMX_FREQ_FMUL_R) + { + inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (1 << SHIFT_OFFSET) | (12 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FMUL_R; + + if (opcode < RANDOMX_FREQ_FDIV_M) + { + const uint32_t location = (mod % 4) ? 1 : 2; + inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (15 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FDIV_M; + + if (opcode < RANDOMX_FREQ_FSQRT_R) + { + inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | (14 << OPCODE_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_FSQRT_R; + + if (opcode < RANDOMX_FREQ_CBRANCH) + { + inst.x = (dst << DST_OFFSET) | (9 << OPCODE_OFFSET); + inst.x |= (imm_index << IMM_OFFSET); + + const uint32_t cshift = (mod >> 4) + ConditionOffset; + + uint32_t imm = inst.y | (1U << cshift); + if (cshift > 0) + imm &= ~(1U << (cshift - 1)); + + if (imm_index < IMM_INDEX_COUNT - 1) + { + imm_buf[imm_index] = imm; + imm_buf[imm_index + 1] = cshift | ((uint32_t)(branch_target_slot) << 5); + imm_index += 2; + } + else + { + // Data doesn't fit, skip it + inst.x = INST_NOP; + } + + branch_target_slot = -1; + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_CBRANCH; + + if (opcode < RANDOMX_FREQ_CFROUND) + { + inst.x = (src << SRC_OFFSET) | (13 << OPCODE_OFFSET) | ((inst.y & 63) << IMM_OFFSET); + + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_CFROUND; + + if (opcode < RANDOMX_FREQ_ISTORE) + { + const uint32_t location = ((mod >> 4) >= StoreL3Condition) ? 3 : ((mod % 4) ? 1 : 2); + inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (10 << OPCODE_OFFSET); + inst.x |= imm_index << IMM_OFFSET; + if (imm_index < IMM_INDEX_COUNT) + imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21); + else + inst.x = INST_NOP; + *(compiled_program++) = inst.x | num_workers; + continue; + } + opcode -= RANDOMX_FREQ_ISTORE; + + *(compiled_program++) = inst.x | num_workers; + } + + ((__global uint32_t*)(R + 20))[0] = (uint32_t)(compiled_program - (__global uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t))); + } +} + +void load_buffer(__local uint64_t *dst_buf, size_t N, __global const void* src_buf) +{ + uint32_t i = get_local_id(0) * sizeof(uint64_t); + const uint32_t step = get_local_size(0) * sizeof(uint64_t); + __global const uint8_t* src = ((__global const uint8_t*)src_buf) + get_group_id(0) * sizeof(uint64_t) * N + i; + __local uint8_t* dst = ((__local uint8_t*)dst_buf) + i; + while (i < sizeof(uint64_t) * N) + { + *(__local uint64_t*)(dst) = *(__global uint64_t*)(src); + src += step; + dst += step; + i += step; + } +} + +double load_F_E_groups(int value, uint64_t andMask, uint64_t orMask) +{ + double t = convert_double_rtn(value); + uint64_t x = as_ulong(t); + x &= andMask; + x |= orMask; + return as_double(x); +} + +)===" +R"===( + +// You're one ugly motherfucker! +double fma_soft(double a, double b, double c, uint32_t rounding_mode) +{ + if (rounding_mode == 0) + return fma(a, b, c); + + if ((a == 0.0) || (b == 0.0)) + return c; + + if (b == 1.0) + { + if (c == 0.0) + return a; + + if (c == -a) + { + const uint64_t minus_zero = 1UL << 63; + return (rounding_mode == 1) ? as_double(minus_zero) : 0.0; + } + } + + const uint64_t mantissa_size = 52; + const uint64_t mantissa_mask = (1UL << mantissa_size) - 1; + const uint64_t mantissa_high_bit = 1UL << mantissa_size; + + const uint64_t exponent_size = 11; + const uint64_t exponent_mask = (1 << exponent_size) - 1; + + const uint32_t exponent_a = (as_uint2(a).y >> 20) & exponent_mask; + const uint32_t exponent_b = (as_uint2(b).y >> 20) & exponent_mask; + const uint32_t exponent_c = (as_uint2(c).y >> 20) & exponent_mask; + + if ((exponent_a == 2047) || (exponent_b == 2047) || (exponent_c == 2047)) + { + const uint64_t inf = 2047UL << 52; + return as_double(inf); + } + + const uint64_t mantissa_a = (as_ulong(a) & mantissa_mask) | mantissa_high_bit; + const uint64_t mantissa_b = (as_ulong(b) & mantissa_mask) | mantissa_high_bit; + const uint64_t mantissa_c = (as_ulong(c) & mantissa_mask) | mantissa_high_bit; + + const uint32_t sign_a = as_uint2(a).y >> 31; + const uint32_t sign_b = as_uint2(b).y >> 31; + const uint32_t sign_c = as_uint2(c).y >> 31; + + uint64_t mul_result[2]; + mul_result[0] = mantissa_a * mantissa_b; + mul_result[1] = mul_hi(mantissa_a, mantissa_b); + + uint32_t exp_correction = mul_result[1] >> 41; + uint32_t exponent_mul_result = exponent_a + exponent_b + exp_correction - 1023; + uint32_t sign_mul_result = sign_a ^ sign_b; + + if (exponent_mul_result >= 2047) + { + const uint64_t inf_rnd = (2047UL << 52) - (rounding_mode & 1); + return as_double(inf_rnd); + } + + uint64_t fma_result[2]; + uint64_t t[2]; + uint32_t exponent_fma_result; + + if (exponent_mul_result >= exponent_c) + { + uint32_t shift = 23 - exp_correction; + fma_result[0] = mul_result[0] << shift; + fma_result[1] = (mul_result[1] << shift) | (mul_result[0] >> (64 - shift)); + + int32_t shift2 = (127 - 52) + (int32_t)(exponent_c - exponent_mul_result); + + if (shift2 >= 0) + { + if (shift2 >= 64) + { + t[0] = 0; + t[1] = mantissa_c << (shift2 - 64); + } + else + { + t[0] = mantissa_c << shift2; + t[1] = shift2 ? (mantissa_c >> (64 - shift2)) : 0; + } + } + else + { + t[0] = (shift2 < -52) ? 0 : (mantissa_c >> (-shift2)); + t[1] = 0; + if ((t[0] == 0) && (c != 0.0)) + t[0] = 1; + } + + exponent_fma_result = exponent_mul_result; + } + else + { + t[0] = 0; + t[1] = mantissa_c << 11; + + int32_t shift2 = (127 - 104 - exp_correction) + (int32_t)(exponent_mul_result - exponent_c); + if (shift2 >= 0) + { + fma_result[0] = mul_result[0] << shift2; + fma_result[1] = (mul_result[1] << shift2) | (shift2 ? (mul_result[0] >> (64 - shift2)) : 0); + } + else + { + shift2 = -shift2; + if (shift2 >= 64) + { + shift2 -= 64; + fma_result[0] = (shift2 < 64) ? (mul_result[1] >> shift2) : 0; + fma_result[1] = 0; + if (fma_result[0] == 0) + fma_result[0] = 1; + } + else + { + fma_result[0] = (mul_result[0] >> shift2) | (mul_result[1] << (64 - shift2)); + fma_result[1] = mul_result[1] >> shift2; + } + } + + exponent_fma_result = exponent_c; + } + + uint32_t sign_fma_result; + + if (sign_mul_result == sign_c) + { + fma_result[0] += t[0]; + fma_result[1] += t[1] + ((fma_result[0] < t[0]) ? 1 : 0); + + exp_correction = (fma_result[1] < t[1]) ? 1 : 0; + sign_fma_result = sign_mul_result; + } + else + { + const uint32_t borrow = (fma_result[0] < t[0]) ? 1 : 0; + fma_result[0] -= t[0]; + + t[1] += borrow; + const uint32_t change_sign = (fma_result[1] < t[1]) ? 1 : 0; + fma_result[1] -= t[1]; + + sign_fma_result = sign_mul_result ^ change_sign; + if (change_sign) + { + fma_result[0] = -(int64_t)(fma_result[0]); + fma_result[1] = ~fma_result[1]; + fma_result[1] += fma_result[0] ? 0 : 1; + } + + if (fma_result[1] == 0) + { + if (fma_result[0] == 0) + return 0.0; + + exponent_fma_result -= 64; + fma_result[1] = fma_result[0]; + fma_result[0] = 0; + } + + const uint32_t index = clz(fma_result[1]); + if (index) + { + exponent_fma_result -= index; + fma_result[1] = (fma_result[1] << index) | (fma_result[0] >> (64 - index)); + } + + exp_correction = 0; + } + + const uint32_t shift = 11 + exp_correction; + const uint32_t round_up = (fma_result[0] || (fma_result[1] & ((1 << shift) - 1))) ? 1 : 0; + + fma_result[1] >>= shift; + fma_result[1] &= mantissa_mask; + if (rounding_mode + sign_fma_result == 2) + { + fma_result[1] += round_up; + if (fma_result[1] == mantissa_high_bit) + { + fma_result[1] = 0; + ++exponent_fma_result; + } + } + fma_result[1] |= (uint64_t)(exponent_fma_result + exp_correction) << mantissa_size; + fma_result[1] |= (uint64_t)(sign_fma_result) << 63; + + return as_double(fma_result[1]); +} + +double div_rnd(double a, double b, uint32_t fprc) +{ + double y0 = 1.0 / b; + + // Do 1 Newton-Raphson iteration to get correct rounding + const double t0 = a * y0; + const double t1 = fma(-b, t0, a); + double result = fma_soft(y0, t1, t0, fprc); + + // Check for infinity/NaN + const uint64_t inf = 2047UL << 52; + const uint64_t inf_rnd = inf - (fprc & 1); + + if (((as_ulong(result) >> 52) & 2047) == 2047) result = as_double(inf_rnd); + if (as_ulong(a) == inf) result = a; + + return (a == b) ? 1.0 : result; +} + +double sqrt_rnd(double x, uint32_t fprc) +{ + double y0 = rsqrt(x); + + // First Newton-Raphson iteration + double t0 = y0 * x; + double t1 = y0 * -0.5; + t1 = fma(t1, t0, 0.5); // 0.5 * (1.0 - y0 * y0 * x) + const double y1_x = fma(t0, t1, t0); // y1 * x = 0.5 * y0 * x * (3.0 - y0 * y0 * x) + + // Second Newton-Raphson iteration + y0 *= 0.5; + y0 = fma(y0, t1, y0); // 0.5 * y1 + t1 = fma(-y1_x, y1_x, x); // x * (1.0 - x * y1 * y1) + + double result = fma_soft(t1, y0, y1_x, fprc); // x * 0.5 * y1 * (3.0 - x * y1 * y1) + + // Check for infinity + if (*((uint64_t*) &x) == (2047UL << 52)) result = x; + + return result; +} + +uint32_t inner_loop( + const uint32_t program_length, + __local const uint32_t* compiled_program, + const int32_t sub, + __global uint8_t* scratchpad, + const uint32_t fp_reg_offset, + const uint32_t fp_reg_group_A_offset, + __local uint64_t* R, + __local uint32_t* imm_buf, + const uint32_t batch_size, + uint32_t fprc, + const uint32_t fp_workers_mask, + const uint64_t xexponentMask, + const uint32_t workers_mask +) +{ + const int32_t sub2 = sub >> 1; + imm_buf[IMM_INDEX_COUNT + 1] = fprc; + + #pragma unroll(1) + for (int32_t ip = 0; ip < program_length;) + { + imm_buf[IMM_INDEX_COUNT] = ip; + + uint32_t inst = compiled_program[ip]; + const int32_t num_workers = (inst >> NUM_INSTS_OFFSET) & (WORKERS_PER_HASH - 1); + const int32_t num_fp_insts = (inst >> NUM_FP_INSTS_OFFSET) & (WORKERS_PER_HASH - 1); + const int32_t num_insts = num_workers - num_fp_insts; + + if (sub <= num_workers) + { + const int32_t inst_offset = sub - num_fp_insts; + const bool is_fp = inst_offset < num_fp_insts; + inst = compiled_program[ip + (is_fp ? sub2 : inst_offset)]; + //if ((idx == 0) && (ic == 0)) + //{ + // printf("num_fp_insts = %u, sub = %u, ip = %u, inst = %08x\n", num_fp_insts, sub, ip + ((sub < num_fp_insts * 2) ? (sub / 2) : (sub - num_fp_insts)), inst); + //} + + //asm("// INSTRUCTION DECODING BEGIN"); + + uint32_t opcode = (inst >> OPCODE_OFFSET) & 15; + const uint32_t location = (inst >> LOC_OFFSET) & 1; + + const uint32_t reg_size_shift = is_fp ? 4 : 3; + const uint32_t reg_base_offset = is_fp ? fp_reg_offset : 0; + const uint32_t reg_base_src_offset = is_fp ? fp_reg_group_A_offset : 0; + + uint32_t dst_offset = (inst >> DST_OFFSET) & 7; + dst_offset = reg_base_offset + (dst_offset << reg_size_shift); + + uint32_t src_offset = (inst >> SRC_OFFSET) & 7; + src_offset = (src_offset << 3) + (location ? 0 : reg_base_src_offset); + + __local uint64_t* dst_ptr = (__local uint64_t*)((__local uint8_t*)(R) + dst_offset); + __local uint64_t* src_ptr = (__local uint64_t*)((__local uint8_t*)(R) + src_offset); + + const uint32_t imm_offset = (inst >> IMM_OFFSET) & 255; + __local const uint32_t* imm_ptr = imm_buf + imm_offset; + + uint64_t dst = *dst_ptr; + uint64_t src = *src_ptr; + uint2 imm; + imm.x = imm_ptr[0]; + imm.y = imm_ptr[1]; + + //asm("// INSTRUCTION DECODING END"); + + if (location) + { + //asm("// SCRATCHPAD ACCESS BEGIN"); + + const uint32_t loc_shift = (imm.x >> 21) & 31; + const uint32_t mask = (0xFFFFFFFFU >> loc_shift) - 7; + + const bool is_read = (opcode != 10); + uint32_t addr = is_read ? ((loc_shift == LOC_L3) ? 0 : (uint32_t)(src)) : (uint32_t)(dst); + addr += (int32_t)(imm.x); + addr &= mask; + + __global uint64_t* ptr = (__global uint64_t*)(scratchpad + addr); + + if (is_read) + { + src = *ptr; + } + else + { + *ptr = src; + goto execution_end; + } + + //asm("// SCRATCHPAD ACCESS END"); + } + + { + //asm("// EXECUTION BEGIN"); + + if (inst & (1 << SRC_IS_IMM32_OFFSET)) src = (uint64_t)((int64_t)((int32_t)(imm.x))); + + // Check instruction opcodes (most frequent instructions come first) + if (opcode <= 3) + { + //asm("// IADD_RS, IADD_M, ISUB_R, ISUB_M, IMUL_R, IMUL_M, IMUL_RCP, IXOR_R, IXOR_M, FSCAL_R (109/256) ------>"); + if (inst & (1 << NEGATIVE_SRC_OFFSET)) src = (uint64_t)(-(int64_t)(src)); + if (opcode == 0) dst += (int32_t)(imm.x); + const uint32_t shift = (inst >> SHIFT_OFFSET) & 3; + if (opcode < 2) dst += src << shift; + const uint64_t imm64 = *((uint64_t*) &imm); + if (inst & (1 << SRC_IS_IMM64_OFFSET)) src = imm64; + if (opcode == 2) dst *= src; + if (opcode == 3) dst ^= src; + //asm("// <------ IADD_RS, IADD_M, ISUB_R, ISUB_M, IMUL_R, IMUL_M, IMUL_RCP, IXOR_R, IXOR_M, FSCAL_R (109/256)"); + } + else if (opcode == 12) + { + //asm("// FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256) ------>"); + + if (location) src = as_ulong(convert_double_rtn((int32_t)(src >> ((sub & 1) * 32)))); + if (inst & (1 << NEGATIVE_SRC_OFFSET)) src ^= 0x8000000000000000UL; + + const bool is_mul = (inst & (1 << SHIFT_OFFSET)) != 0; + const double a = as_double(dst); + const double b = as_double(src); + + dst = as_ulong(fma_soft(a, is_mul ? b : 1.0, is_mul ? 0.0 : b, fprc)); + + //asm("// <------ FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256)"); + } + else if (opcode == 9) + { + //asm("// CBRANCH (16/256) ------>"); + dst += (int32_t)(imm.x); + if (((uint32_t)(dst) & (ConditionMask << (imm.y & 31))) == 0) + { + imm_buf[IMM_INDEX_COUNT] = (uint32_t)(((int32_t)(imm.y) >> 5) - num_insts); + } + //asm("// <------ CBRANCH (16/256)"); + } + else if (opcode == 7) + { + //asm("// IROR_R, IROL_R (10/256) ------>"); + uint32_t shift1 = src & 63; +#if RANDOMX_FREQ_IROL_R > 0 + const uint32_t shift2 = 64 - shift1; + const bool is_rol = (inst & (1 << NEGATIVE_SRC_OFFSET)); + dst = (dst >> (is_rol ? shift2 : shift1)) | (dst << (is_rol ? shift1 : shift2)); +#else + dst = (dst >> shift1) | (dst << (64 - shift1)); +#endif + //asm("// <------ IROR_R, IROL_R (10/256)"); + } + else if (opcode == 14) + { + //asm("// FSQRT_R (6/256) ------>"); + dst = as_ulong(sqrt_rnd(as_double(dst), fprc)); + //asm("// <------ FSQRT_R (6/256)"); + } + else if (opcode == 6) + { + //asm("// IMULH_R, IMULH_M (5/256) ------>"); + dst = mul_hi(dst, src); + //asm("// <------ IMULH_R, IMULH_M (5/256)"); + } + else if (opcode == 4) + { + //asm("// ISMULH_R, ISMULH_M (5/256) ------>"); + dst = (uint64_t)(mul_hi((int64_t)(dst), (int64_t)(src))); + //asm("// <------ ISMULH_R, ISMULH_M (5/256)"); + } + else if (opcode == 11) + { + //asm("// FSWAP_R (4/256) ------>"); + dst = *(__local uint64_t*)((__local uint8_t*)(R) + (dst_offset ^ 8)); + //asm("// <------ FSWAP_R (4/256)"); + } + else if (opcode == 8) + { + //asm("// ISWAP_R (4/256) ------>"); + *src_ptr = dst; + dst = src; + //asm("// <------ ISWAP_R (4/256)"); + } + else if (opcode == 15) + { + //asm("// FDIV_M (4/256) ------>"); + src = as_ulong(convert_double_rtn((int32_t)(src >> ((sub & 1) * 32)))); + src &= dynamicMantissaMask; + src |= xexponentMask; + dst = as_ulong(div_rnd(as_double(dst), as_double(src), fprc)); + //asm("// <------ FDIV_M (4/256)"); + } + else if (opcode == 5) + { + //asm("// INEG_R (2/256) ------>"); + dst = (uint64_t)(-(int64_t)(dst)); + //asm("// <------ INEG_R (2/256)"); + } + // CFROUND check will be skipped and removed entirely by the compiler if ROUNDING_MODE >= 0 + else if (ROUNDING_MODE < 0) + { + //asm("// CFROUND (1/256) ------>"); + imm_buf[IMM_INDEX_COUNT + 1] = ((src >> imm_offset) | (src << (64 - imm_offset))) & 3; + //asm("// <------ CFROUND (1/256)"); + goto execution_end; + } + + *dst_ptr = dst; + //asm("// EXECUTION END"); + } + } + + execution_end: + { + //asm("// SYNCHRONIZATION OF INSTRUCTION POINTER AND ROUNDING MODE BEGIN"); + + barrier(CLK_LOCAL_MEM_FENCE); + ip = imm_buf[IMM_INDEX_COUNT]; + fprc = imm_buf[IMM_INDEX_COUNT + 1]; + + //asm("// SYNCHRONIZATION OF INSTRUCTION POINTER AND ROUNDING MODE END"); + + ip += num_insts + 1; + } + } + + return fprc; +} + +)===" +R"===( + +#if WORKERS_PER_HASH == 16 +__attribute__((reqd_work_group_size(32, 1, 1))) +#else +__attribute__((reqd_work_group_size(16, 1, 1))) +#endif +__kernel void execute_vm(__global void* vm_states, __global void* rounding, __global void* scratchpads, __global const void* dataset_ptr, uint32_t batch_size, uint32_t num_iterations, uint32_t first, uint32_t last) +{ + // 2 hashes per warp, 4 KB shared memory for VM states + __local uint64_t vm_states_local[(VM_STATE_SIZE * 2) / sizeof(uint64_t)]; + + load_buffer(vm_states_local, sizeof(vm_states_local) / sizeof(uint64_t), vm_states); + + barrier(CLK_LOCAL_MEM_FENCE); + + enum { IDX_WIDTH = (WORKERS_PER_HASH == 16) ? 16 : 8 }; + + __local uint64_t* R = vm_states_local + (get_local_id(0) / IDX_WIDTH) * VM_STATE_SIZE / sizeof(uint64_t); + __local double* F = (__local double*)(R + 8); + __local double* E = (__local double*)(R + 16); + + const uint32_t global_index = get_global_id(0); + const int32_t idx = global_index / IDX_WIDTH; + const int32_t sub = global_index % IDX_WIDTH; + + uint32_t ma = ((__local uint32_t*)(R + 16))[0]; + uint32_t mx = ((__local uint32_t*)(R + 16))[1]; + + const uint32_t addressRegisters = ((__local uint32_t*)(R + 16))[2]; + __local const uint64_t* readReg0 = (__local uint64_t*)(((__local uint8_t*)R) + (addressRegisters & 0xff)); + __local const uint64_t* readReg1 = (__local uint64_t*)(((__local uint8_t*)R) + ((addressRegisters >> 8) & 0xff)); + __local const uint32_t* readReg2 = (__local uint32_t*)(((__local uint8_t*)R) + ((addressRegisters >> 16) & 0xff)); + __local const uint32_t* readReg3 = (__local uint32_t*)(((__local uint8_t*)R) + (addressRegisters >> 24)); + + const uint32_t datasetOffset = ((__local uint32_t*)(R + 16))[3]; + __global const uint8_t* dataset = ((__global const uint8_t*)dataset_ptr) + datasetOffset; + + const uint32_t fp_reg_offset = 64 + ((global_index & 1) << 3); + const uint32_t fp_reg_group_A_offset = 192 + ((global_index & 1) << 3); + + __local uint64_t* eMask = R + 18; + + const uint32_t program_length = ((__local uint32_t*)(R + 20))[0]; + uint32_t fprc = ((__global uint32_t*)rounding)[idx]; + + uint32_t spAddr0 = first ? mx : 0; + uint32_t spAddr1 = first ? ma : 0; + + __global uint8_t* scratchpad = ((__global uint8_t*)scratchpads) + idx * (uint64_t)(RANDOMX_SCRATCHPAD_L3 + 64); + + const bool f_group = (sub < 4); + + __local double* fe = f_group ? (F + sub * 2) : (E + (sub - 4) * 2); + __local double* f = F + sub; + __local double* e = E + sub; + + const uint64_t andMask = f_group ? (uint64_t)(-1) : dynamicMantissaMask; + const uint64_t orMask1 = f_group ? 0 : eMask[0]; + const uint64_t orMask2 = f_group ? 0 : eMask[1]; + const uint64_t xexponentMask = (sub & 1) ? eMask[1] : eMask[0]; + + __local uint32_t* imm_buf = (__local uint32_t*)(R + REGISTERS_SIZE / sizeof(uint64_t)); + __local const uint32_t* compiled_program = (__local const uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t)); + + const uint32_t workers_mask = ((1 << WORKERS_PER_HASH) - 1) << ((get_local_id(0) / IDX_WIDTH) * IDX_WIDTH); + const uint32_t fp_workers_mask = 3 << (((sub >> 1) << 1) + (get_local_id(0) / IDX_WIDTH) * IDX_WIDTH); + + #pragma unroll(1) + for (int ic = 0; ic < num_iterations; ++ic) + { + __local uint64_t *r; + __global uint64_t *p0, *p1; + if ((WORKERS_PER_HASH <= 8) || (sub < 8)) + { + const uint64_t spMix = *readReg0 ^ *readReg1; + spAddr0 ^= ((const uint32_t*)&spMix)[0]; + spAddr1 ^= ((const uint32_t*)&spMix)[1]; + spAddr0 &= ScratchpadL3Mask64; + spAddr1 &= ScratchpadL3Mask64; + + p0 = (__global uint64_t*)(scratchpad + spAddr0 + sub * 8); + p1 = (__global uint64_t*)(scratchpad + spAddr1 + sub * 8); + + r = R + sub; + *r ^= *p0; + + uint64_t global_mem_data = *p1; + int32_t* q = (int32_t*)&global_mem_data; + + fe[0] = load_F_E_groups(q[0], andMask, orMask1); + fe[1] = load_F_E_groups(q[1], andMask, orMask2); + } + + //if ((global_index == 0) && (ic == 0)) + //{ + // printf("ic = %d (before)\n", ic); + // for (int i = 0; i < 8; ++i) + // printf("f%d = %016llx\n", i, bit_cast(F[i])); + // for (int i = 0; i < 8; ++i) + // printf("e%d = %016llx\n", i, bit_cast(E[i])); + // printf("\n"); + //} + + if ((WORKERS_PER_HASH == IDX_WIDTH) || (sub < WORKERS_PER_HASH)) + fprc = inner_loop(program_length, compiled_program, sub, scratchpad, fp_reg_offset, fp_reg_group_A_offset, R, imm_buf, batch_size, fprc, fp_workers_mask, xexponentMask, workers_mask); + + //if ((global_index == 0) && (ic == RANDOMX_PROGRAM_ITERATIONS - 1)) + //{ + // printf("ic = %d (after)\n", ic); + // for (int i = 0; i < 8; ++i) + // printf("r%d = %016llx\n", i, R[i]); + // for (int i = 0; i < 8; ++i) + // printf("f%d = %016llx\n", i, bit_cast(F[i])); + // for (int i = 0; i < 8; ++i) + // printf("e%d = %016llx\n", i, bit_cast(E[i])); + // printf("\n"); + //} + + if ((WORKERS_PER_HASH <= 8) || (sub < 8)) + { + mx ^= *readReg2 ^ *readReg3; + mx &= CacheLineAlignMask; + + const uint64_t next_r = *r ^ *(__global const uint64_t*)(dataset + ma + sub * 8); + *r = next_r; + + *p1 = next_r; + *p0 = as_ulong(f[0]) ^ as_ulong(e[0]); + + uint32_t tmp = ma; + ma = mx; + mx = tmp; + + spAddr0 = 0; + spAddr1 = 0; + } + } + + //if (global_index == 0) + //{ + // for (int i = 0; i < 8; ++i) + // printf("r%d = %016llx\n", i, R[i]); + // for (int i = 0; i < 8; ++i) + // printf("fe%d = %016llx\n", i, bit_cast(F[i]) ^ bit_cast(E[i])); + // printf("\n"); + //} + + if ((WORKERS_PER_HASH > 8) && (sub >= 8)) + return; + + __global uint64_t* p = ((__global uint64_t*)vm_states) + idx * (VM_STATE_SIZE / sizeof(uint64_t)); + p[sub] = R[sub]; + + if (sub == 0) + { + ((__global uint32_t*)rounding)[idx] = fprc; + } + + if (last) + { + p[sub + 8] = as_ulong(F[sub]) ^ as_ulong(E[sub]); + p[sub + 16] = as_ulong(E[sub]); + } + else if (sub == 0) + { + ((__global uint32_t*)(p + 16))[0] = ma; + ((__global uint32_t*)(p + 16))[1] = mx; + } +} + +__attribute__((reqd_work_group_size(64, 1, 1))) +__kernel void find_shares(__global const uint64_t* hashes, uint64_t target, uint32_t start_nonce, __global uint32_t* shares) +{ + const uint32_t global_index = get_global_id(0); + + if (hashes[global_index * 4 + 3] < target) { + //if (global_index == 0) { + const uint32_t idx = atomic_inc(shares + 0xFF); + if (idx < 0xFF) { + shares[idx] = start_nonce + global_index; + } + } +} + +)===" diff --git a/src/base/net/Pool.cpp b/src/base/net/Pool.cpp index 9d4f2bde..9184dffe 100644 --- a/src/base/net/Pool.cpp +++ b/src/base/net/Pool.cpp @@ -500,6 +500,10 @@ void xmrig::Pool::rebuild() addVariant(VARIANT_RWZ); addVariant(VARIANT_ZLS); addVariant(VARIANT_DOUBLE); +# ifdef XMRIG_ALGO_RANDOMX + addVariant(VARIANT_RX_WOW); + addVariant(VARIANT_RX_LOKI); +# endif addVariant(VARIANT_AUTO); # endif } diff --git a/src/common/crypto/Algorithm.cpp b/src/common/crypto/Algorithm.cpp index f14d034d..3d72be76 100644 --- a/src/common/crypto/Algorithm.cpp +++ b/src/common/crypto/Algorithm.cpp @@ -70,6 +70,9 @@ static AlgoData const algorithms[] = { { "cryptonight/zls", "cn/zls", xmrig::CRYPTONIGHT, xmrig::VARIANT_ZLS }, { "cryptonight/double", "cn/double", xmrig::CRYPTONIGHT, xmrig::VARIANT_DOUBLE }, + { "randomx/wow", "rx/wow", xmrig::RANDOM_X, xmrig::VARIANT_RX_WOW }, + { "randomx/loki", "rx/loki", xmrig::RANDOM_X, xmrig::VARIANT_RX_LOKI }, + # ifndef XMRIG_NO_AEON { "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO }, { "cryptonight-light", "cn-light", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO }, @@ -138,7 +141,9 @@ static const char *variants[] = { "r", "rwz", "zls", - "double" + "double", + "rx/wow", + "rx/loki", }; diff --git a/src/common/net/Client.cpp b/src/common/net/Client.cpp index 7cd09d46..295df0e4 100644 --- a/src/common/net/Client.cpp +++ b/src/common/net/Client.cpp @@ -359,6 +359,14 @@ bool xmrig::Client::parseJob(const rapidjson::Value ¶ms, int *code) } } + if (params.HasMember("seed_hash")) { + const rapidjson::Value &variant = params["seed_hash"]; + + if (variant.IsString()) { + job.setSeedHash(variant.GetString()); + } + } + if (params.HasMember("height")) { const rapidjson::Value &variant = params["height"]; diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index cb6be4e6..550381db 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -66,6 +66,7 @@ xmrig::Job::Job() : m_size(0), m_diff(0), m_target(0), + m_seedHash(), m_blob(), m_height(0) { @@ -80,6 +81,7 @@ xmrig::Job::Job(int poolId, bool nicehash, const Algorithm &algorithm, const Id m_size(0), m_diff(0), m_target(0), + m_seedHash(), m_blob(), m_height(0), m_algorithm(algorithm), @@ -194,6 +196,15 @@ void xmrig::Job::setHeight(uint64_t height) } +bool xmrig::Job::setSeedHash(const char *hash) +{ + if (!hash || (strlen(hash) != sizeof(m_seedHash) * 2)) + return false; + + return fromHex(hash, sizeof(m_seedHash) * 2, m_seedHash); +} + + bool xmrig::Job::fromHex(const char* in, unsigned int len, unsigned char* out) { bool error = false; diff --git a/src/common/net/Job.h b/src/common/net/Job.h index 49ddc7da..c2c44cec 100644 --- a/src/common/net/Job.h +++ b/src/common/net/Job.h @@ -54,12 +54,14 @@ class Job bool setTarget(const char *target); void setAlgorithm(const char *algo); void setHeight(uint64_t height); + bool setSeedHash(const char *hash); inline bool isNicehash() const { return m_nicehash; } inline bool isValid() const { return m_size > 0 && m_diff > 0; } inline bool setId(const char *id) { return m_id.setId(id); } inline const uint32_t *nonce() const { return reinterpret_cast(m_blob + 39); } inline const uint8_t *blob() const { return m_blob; } + inline const uint8_t *seed_hash() const { return m_seedHash; } inline const Algorithm &algorithm() const { return m_algorithm; } inline const Id &clientId() const { return m_clientId; } inline const Id &id() const { return m_id; } @@ -104,6 +106,7 @@ class Job size_t m_size; uint64_t m_diff; uint64_t m_target; + uint8_t m_seedHash[32]; uint8_t m_blob[kMaxBlobSize]; uint64_t m_height; xmrig::Algorithm m_algorithm; diff --git a/src/common/xmrig.h b/src/common/xmrig.h index e8ca8857..cbfe330e 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -36,6 +36,7 @@ enum Algo { CRYPTONIGHT_LITE, /* CryptoNight (1 MB) */ CRYPTONIGHT_HEAVY, /* CryptoNight (4 MB) */ CRYPTONIGHT_PICO, /* CryptoNight (256 KB) */ + RANDOM_X, /* RandomX */ ALGO_MAX }; @@ -79,6 +80,8 @@ enum Variant { VARIANT_RWZ = 14, // CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft) VARIANT_ZLS = 15, // CryptoNight variant 2 with 3/4 iterations (Zelerius) VARIANT_DOUBLE = 16, // CryptoNight variant 2 with double iterations (X-CASH) + VARIANT_RX_WOW = 17, // RandomX (Wownero) + VARIANT_RX_LOKI = 18, // RandomX (Loki) VARIANT_MAX }; diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index 74a47f3e..408d21f6 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -243,6 +243,9 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif cryptonight_single_hash_wrapper, cryptonight_single_hash, + nullptr, nullptr, // VARIANT_RX_WOW + nullptr, nullptr, // VARIANT_RX_LOKI + # ifndef XMRIG_NO_AEON cryptonight_single_hash, cryptonight_single_hash, @@ -265,6 +268,8 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif nullptr, nullptr, // VARIANT_RWZ nullptr, nullptr, // VARIANT_ZLS nullptr, nullptr, // VARIANT_DOUBLE + nullptr, nullptr, // VARIANT_RX_WOW + nullptr, nullptr, // VARIANT_RX_LOKI # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, @@ -274,6 +279,7 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, # endif @@ -303,6 +309,8 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif nullptr, nullptr, // VARIANT_RWZ nullptr, nullptr, // VARIANT_ZLS nullptr, nullptr, // VARIANT_DOUBLE + nullptr, nullptr, // VARIANT_RX_WOW + nullptr, nullptr, // VARIANT_RX_LOKI # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, @@ -312,6 +320,7 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, # endif # ifndef XMRIG_NO_CN_PICO @@ -339,7 +348,10 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif nullptr, nullptr, // VARIANT_RWZ nullptr, nullptr, // VARIANT_ZLS nullptr, nullptr, // VARIANT_DOUBLE - #else + nullptr, nullptr, // VARIANT_RX_WOW + nullptr, nullptr, // VARIANT_RX_LOKI +# else + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, @@ -350,6 +362,16 @@ CryptoNight::cn_hash_fun CryptoNight::fn(xmrig::Algo algorithm, xmrig::AlgoVerif nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, # endif + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, }; static_assert((VARIANT_MAX * 2 * ALGO_MAX) == sizeof(func_table) / sizeof(func_table[0]), "func_table size mismatch"); @@ -421,6 +443,10 @@ bool CryptoNight::selfTest() { } # endif + if (m_algorithm == xmrig::RANDOM_X) { + return true; + } + return false; } diff --git a/src/crypto/CryptoNight_constants.h b/src/crypto/CryptoNight_constants.h index 1bc06a3b..1f228e57 100644 --- a/src/crypto/CryptoNight_constants.h +++ b/src/crypto/CryptoNight_constants.h @@ -75,6 +75,7 @@ inline size_t cn_select_memory(Algo algorithm) switch(algorithm) { case CRYPTONIGHT: + case RANDOM_X: return CRYPTONIGHT_MEMORY; case CRYPTONIGHT_LITE: @@ -93,6 +94,11 @@ inline size_t cn_select_memory(Algo algorithm) return 0; } +inline size_t rx_select_memory(Variant variant) +{ + return ((variant == xmrig::VARIANT_RX_WOW) ? 1048576 : 2097152); +} + template inline constexpr uint32_t cn_select_mask() { return 0; } template<> inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_MASK; } diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp new file mode 100644 index 00000000..5d6cb743 --- /dev/null +++ b/src/crypto/randomx/aes_hash.cpp @@ -0,0 +1,214 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "soft_aes.h" +#include "randomx.h" + +#define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d +#define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e +#define AES_HASH_1R_STATE2 0xe8a07ce4, 0x5079506b, 0xae62c7d0, 0x6a770017 +#define AES_HASH_1R_STATE3 0x7e994948, 0x79a10005, 0x07ad828d, 0x630a240c + +#define AES_HASH_1R_XKEY0 0x06890201, 0x90dc56bf, 0x8b24949f, 0xf6fa8389 +#define AES_HASH_1R_XKEY1 0xed18f99b, 0xee1043c6, 0x51f4e03c, 0x61b263d1 + +/* + Calculate a 512-bit hash of 'input' using 4 lanes of AES. + The input is treated as a set of round keys for the encryption + of the initial state. + + 'inputSize' must be a multiple of 64. + + For a 2 MiB input, this has the same security as 32768-round + AES encryption. + + Hashing throughput: >20 GiB/s per CPU core with hardware AES +*/ +template +void hashAes1Rx4(const void *input, size_t inputSize, void *hash) { + const uint8_t* inptr = (uint8_t*)input; + const uint8_t* inputEnd = inptr + inputSize; + + rx_vec_i128 state0, state1, state2, state3; + rx_vec_i128 in0, in1, in2, in3; + + //intial state + state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0); + state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1); + state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2); + state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3); + + //process 64 bytes at a time in 4 lanes + while (inptr < inputEnd) { + in0 = rx_load_vec_i128((rx_vec_i128*)inptr + 0); + in1 = rx_load_vec_i128((rx_vec_i128*)inptr + 1); + in2 = rx_load_vec_i128((rx_vec_i128*)inptr + 2); + in3 = rx_load_vec_i128((rx_vec_i128*)inptr + 3); + + state0 = aesenc(state0, in0); + state1 = aesdec(state1, in1); + state2 = aesenc(state2, in2); + state3 = aesdec(state3, in3); + + inptr += 64; + } + + //two extra rounds to achieve full diffusion + rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0); + rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1); + + state0 = aesenc(state0, xkey0); + state1 = aesdec(state1, xkey0); + state2 = aesenc(state2, xkey0); + state3 = aesdec(state3, xkey0); + + state0 = aesenc(state0, xkey1); + state1 = aesdec(state1, xkey1); + state2 = aesenc(state2, xkey1); + state3 = aesdec(state3, xkey1); + + //output hash + rx_store_vec_i128((rx_vec_i128*)hash + 0, state0); + rx_store_vec_i128((rx_vec_i128*)hash + 1, state1); + rx_store_vec_i128((rx_vec_i128*)hash + 2, state2); + rx_store_vec_i128((rx_vec_i128*)hash + 3, state3); +} + +template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); +template void hashAes1Rx4(const void *input, size_t inputSize, void *hash); + +#define AES_GEN_1R_KEY0 0xb4f44917, 0xdbb5552b, 0x62716609, 0x6daca553 +#define AES_GEN_1R_KEY1 0x0da1dc4e, 0x1725d378, 0x846a710d, 0x6d7caf07 +#define AES_GEN_1R_KEY2 0x3e20e345, 0xf4c0794f, 0x9f947ec6, 0x3f1262f1 +#define AES_GEN_1R_KEY3 0x49169154, 0x16314c88, 0xb1ba317c, 0x6aef8135 + +/* + Fill 'buffer' with pseudorandom data based on 512-bit 'state'. + The state is encrypted using a single AES round per 16 bytes of output + in 4 lanes. + + 'outputSize' must be a multiple of 64. + + The modified state is written back to 'state' to allow multiple + calls to this function. +*/ +template +void fillAes1Rx4(void *state, size_t outputSize, void *buffer) { + const uint8_t* outptr = (uint8_t*)buffer; + const uint8_t* outputEnd = outptr + outputSize; + + rx_vec_i128 state0, state1, state2, state3; + rx_vec_i128 key0, key1, key2, key3; + + key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0); + key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1); + key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2); + key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3); + + state0 = rx_load_vec_i128((rx_vec_i128*)state + 0); + state1 = rx_load_vec_i128((rx_vec_i128*)state + 1); + state2 = rx_load_vec_i128((rx_vec_i128*)state + 2); + state3 = rx_load_vec_i128((rx_vec_i128*)state + 3); + + while (outptr < outputEnd) { + state0 = aesdec(state0, key0); + state1 = aesenc(state1, key1); + state2 = aesdec(state2, key2); + state3 = aesenc(state3, key3); + + rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0); + rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1); + rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2); + rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3); + + outptr += 64; + } + + rx_store_vec_i128((rx_vec_i128*)state + 0, state0); + rx_store_vec_i128((rx_vec_i128*)state + 1, state1); + rx_store_vec_i128((rx_vec_i128*)state + 2, state2); + rx_store_vec_i128((rx_vec_i128*)state + 3, state3); +} + +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); +template void fillAes1Rx4(void *state, size_t outputSize, void *buffer); + +template +void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { + const uint8_t* outptr = (uint8_t*)buffer; + const uint8_t* outputEnd = outptr + outputSize; + + rx_vec_i128 state0, state1, state2, state3; + rx_vec_i128 key0, key1, key2, key3, key4, key5, key6, key7; + + key0 = RandomX_CurrentConfig.fillAes4Rx4_Key[0]; + key1 = RandomX_CurrentConfig.fillAes4Rx4_Key[1]; + key2 = RandomX_CurrentConfig.fillAes4Rx4_Key[2]; + key3 = RandomX_CurrentConfig.fillAes4Rx4_Key[3]; + key4 = RandomX_CurrentConfig.fillAes4Rx4_Key[4]; + key5 = RandomX_CurrentConfig.fillAes4Rx4_Key[5]; + key6 = RandomX_CurrentConfig.fillAes4Rx4_Key[6]; + key7 = RandomX_CurrentConfig.fillAes4Rx4_Key[7]; + + state0 = rx_load_vec_i128((rx_vec_i128*)state + 0); + state1 = rx_load_vec_i128((rx_vec_i128*)state + 1); + state2 = rx_load_vec_i128((rx_vec_i128*)state + 2); + state3 = rx_load_vec_i128((rx_vec_i128*)state + 3); + + while (outptr < outputEnd) { + state0 = aesdec(state0, key0); + state1 = aesenc(state1, key0); + state2 = aesdec(state2, key4); + state3 = aesenc(state3, key4); + + state0 = aesdec(state0, key1); + state1 = aesenc(state1, key1); + state2 = aesdec(state2, key5); + state3 = aesenc(state3, key5); + + state0 = aesdec(state0, key2); + state1 = aesenc(state1, key2); + state2 = aesdec(state2, key6); + state3 = aesenc(state3, key6); + + state0 = aesdec(state0, key3); + state1 = aesenc(state1, key3); + state2 = aesdec(state2, key7); + state3 = aesenc(state3, key7); + + rx_store_vec_i128((rx_vec_i128*)outptr + 0, state0); + rx_store_vec_i128((rx_vec_i128*)outptr + 1, state1); + rx_store_vec_i128((rx_vec_i128*)outptr + 2, state2); + rx_store_vec_i128((rx_vec_i128*)outptr + 3, state3); + + outptr += 64; + } +} + +template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); +template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/crypto/randomx/aes_hash.hpp b/src/crypto/randomx/aes_hash.hpp new file mode 100644 index 00000000..b4d0e940 --- /dev/null +++ b/src/crypto/randomx/aes_hash.hpp @@ -0,0 +1,40 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +template +void hashAes1Rx4(const void *input, size_t inputSize, void *hash); + +template +void fillAes1Rx4(void *state, size_t outputSize, void *buffer); + +template +void fillAes4Rx4(void *state, size_t outputSize, void *buffer); diff --git a/src/crypto/randomx/allocator.cpp b/src/crypto/randomx/allocator.cpp new file mode 100644 index 00000000..2ddbed98 --- /dev/null +++ b/src/crypto/randomx/allocator.cpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include "allocator.hpp" +#include "intrin_portable.h" +#include "virtual_memory.hpp" +#include "common.hpp" + +namespace randomx { + + template + void* AlignedAllocator::allocMemory(size_t count) { + void *mem = rx_aligned_alloc(count, alignment); + if (mem == nullptr) + throw std::bad_alloc(); + return mem; + } + + template + void AlignedAllocator::freeMemory(void* ptr, size_t count) { + rx_aligned_free(ptr); + } + + template class AlignedAllocator; + + void* LargePageAllocator::allocMemory(size_t count) { + return allocLargePagesMemory(count); + } + + void LargePageAllocator::freeMemory(void* ptr, size_t count) { + freePagedMemory(ptr, count); + }; + +} \ No newline at end of file diff --git a/src/crypto/randomx/allocator.hpp b/src/crypto/randomx/allocator.hpp new file mode 100644 index 00000000..d7aa3f95 --- /dev/null +++ b/src/crypto/randomx/allocator.hpp @@ -0,0 +1,46 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +namespace randomx { + + template + struct AlignedAllocator { + static void* allocMemory(size_t); + static void freeMemory(void*, size_t); + }; + + struct LargePageAllocator { + static void* allocMemory(size_t); + static void freeMemory(void*, size_t); + }; + +} \ No newline at end of file diff --git a/src/crypto/randomx/argon2.h b/src/crypto/randomx/argon2.h new file mode 100644 index 00000000..9d427159 --- /dev/null +++ b/src/crypto/randomx/argon2.h @@ -0,0 +1,229 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#pragma once + +#include +#include +#include + +/* + * Argon2 input parameter restrictions + */ + + /* Minimum and maximum number of lanes (degree of parallelism) */ +#define ARGON2_MIN_LANES UINT32_C(1) +#define ARGON2_MAX_LANES UINT32_C(0xFFFFFF) + +/* Minimum and maximum number of threads */ +#define ARGON2_MIN_THREADS UINT32_C(1) +#define ARGON2_MAX_THREADS UINT32_C(0xFFFFFF) + +/* Number of synchronization points between lanes per pass */ +#define ARGON2_SYNC_POINTS UINT32_C(4) + +/* Minimum and maximum digest size in bytes */ +#define ARGON2_MIN_OUTLEN UINT32_C(4) +#define ARGON2_MAX_OUTLEN UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum number of memory blocks (each of BLOCK_SIZE bytes) */ +#define ARGON2_MIN_MEMORY (2 * ARGON2_SYNC_POINTS) /* 2 blocks per slice */ + +#define ARGON2_MIN(a, b) ((a) < (b) ? (a) : (b)) +/* Max memory size is addressing-space/2, topping at 2^32 blocks (4 TB) */ +#define ARGON2_MAX_MEMORY_BITS \ + ARGON2_MIN(UINT32_C(32), (sizeof(void *) * CHAR_BIT - 10 - 1)) +#define ARGON2_MAX_MEMORY \ + ARGON2_MIN(UINT32_C(0xFFFFFFFF), UINT64_C(1) << ARGON2_MAX_MEMORY_BITS) + +/* Minimum and maximum number of passes */ +#define ARGON2_MIN_TIME UINT32_C(1) +#define ARGON2_MAX_TIME UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum password length in bytes */ +#define ARGON2_MIN_PWD_LENGTH UINT32_C(0) +#define ARGON2_MAX_PWD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum associated data length in bytes */ +#define ARGON2_MIN_AD_LENGTH UINT32_C(0) +#define ARGON2_MAX_AD_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum salt length in bytes */ +#define ARGON2_MIN_SALT_LENGTH UINT32_C(8) +#define ARGON2_MAX_SALT_LENGTH UINT32_C(0xFFFFFFFF) + +/* Minimum and maximum key length in bytes */ +#define ARGON2_MIN_SECRET UINT32_C(0) +#define ARGON2_MAX_SECRET UINT32_C(0xFFFFFFFF) + +/* Flags to determine which fields are securely wiped (default = no wipe). */ +#define ARGON2_DEFAULT_FLAGS UINT32_C(0) +#define ARGON2_FLAG_CLEAR_PASSWORD (UINT32_C(1) << 0) +#define ARGON2_FLAG_CLEAR_SECRET (UINT32_C(1) << 1) + + +/* Error codes */ +typedef enum Argon2_ErrorCodes { + ARGON2_OK = 0, + + ARGON2_OUTPUT_PTR_NULL = -1, + + ARGON2_OUTPUT_TOO_SHORT = -2, + ARGON2_OUTPUT_TOO_LONG = -3, + + ARGON2_PWD_TOO_SHORT = -4, + ARGON2_PWD_TOO_LONG = -5, + + ARGON2_SALT_TOO_SHORT = -6, + ARGON2_SALT_TOO_LONG = -7, + + ARGON2_AD_TOO_SHORT = -8, + ARGON2_AD_TOO_LONG = -9, + + ARGON2_SECRET_TOO_SHORT = -10, + ARGON2_SECRET_TOO_LONG = -11, + + ARGON2_TIME_TOO_SMALL = -12, + ARGON2_TIME_TOO_LARGE = -13, + + ARGON2_MEMORY_TOO_LITTLE = -14, + ARGON2_MEMORY_TOO_MUCH = -15, + + ARGON2_LANES_TOO_FEW = -16, + ARGON2_LANES_TOO_MANY = -17, + + ARGON2_PWD_PTR_MISMATCH = -18, /* NULL ptr with non-zero length */ + ARGON2_SALT_PTR_MISMATCH = -19, /* NULL ptr with non-zero length */ + ARGON2_SECRET_PTR_MISMATCH = -20, /* NULL ptr with non-zero length */ + ARGON2_AD_PTR_MISMATCH = -21, /* NULL ptr with non-zero length */ + + ARGON2_MEMORY_ALLOCATION_ERROR = -22, + + ARGON2_FREE_MEMORY_CBK_NULL = -23, + ARGON2_ALLOCATE_MEMORY_CBK_NULL = -24, + + ARGON2_INCORRECT_PARAMETER = -25, + ARGON2_INCORRECT_TYPE = -26, + + ARGON2_OUT_PTR_MISMATCH = -27, + + ARGON2_THREADS_TOO_FEW = -28, + ARGON2_THREADS_TOO_MANY = -29, + + ARGON2_MISSING_ARGS = -30, + + ARGON2_ENCODING_FAIL = -31, + + ARGON2_DECODING_FAIL = -32, + + ARGON2_THREAD_FAIL = -33, + + ARGON2_DECODING_LENGTH_FAIL = -34, + + ARGON2_VERIFY_MISMATCH = -35 +} argon2_error_codes; + +/* Memory allocator types --- for external allocation */ +typedef int(*allocate_fptr)(uint8_t **memory, size_t bytes_to_allocate); +typedef void(*deallocate_fptr)(uint8_t *memory, size_t bytes_to_allocate); + +/* Argon2 external data structures */ + +/* + ***** + * Context: structure to hold Argon2 inputs: + * output array and its length, + * password and its length, + * salt and its length, + * secret and its length, + * associated data and its length, + * number of passes, amount of used memory (in KBytes, can be rounded up a bit) + * number of parallel threads that will be run. + * All the parameters above affect the output hash value. + * Additionally, two function pointers can be provided to allocate and + * deallocate the memory (if NULL, memory will be allocated internally). + * Also, three flags indicate whether to erase password, secret as soon as they + * are pre-hashed (and thus not needed anymore), and the entire memory + ***** + * Simplest situation: you have output array out[8], password is stored in + * pwd[32], salt is stored in salt[16], you do not have keys nor associated + * data. You need to spend 1 GB of RAM and you run 5 passes of Argon2d with + * 4 parallel lanes. + * You want to erase the password, but you're OK with last pass not being + * erased. You want to use the default memory allocator. + * Then you initialize: + Argon2_Context(out,8,pwd,32,salt,16,NULL,0,NULL,0,5,1<<20,4,4,NULL,NULL,true,false,false,false) + */ +typedef struct Argon2_Context { + uint8_t *out; /* output array */ + uint32_t outlen; /* digest length */ + + uint8_t *pwd; /* password array */ + uint32_t pwdlen; /* password length */ + + uint8_t *salt; /* salt array */ + uint32_t saltlen; /* salt length */ + + uint8_t *secret; /* key array */ + uint32_t secretlen; /* key length */ + + uint8_t *ad; /* associated data array */ + uint32_t adlen; /* associated data length */ + + uint32_t t_cost; /* number of passes */ + uint32_t m_cost; /* amount of memory requested (KB) */ + uint32_t lanes; /* number of lanes */ + uint32_t threads; /* maximum number of threads */ + + uint32_t version; /* version number */ + + allocate_fptr allocate_cbk; /* pointer to memory allocator */ + deallocate_fptr free_cbk; /* pointer to memory deallocator */ + + uint32_t flags; /* array of bool options */ +} argon2_context; + +/* Argon2 primitive type */ +typedef enum Argon2_type { + Argon2_d = 0, + Argon2_i = 1, + Argon2_id = 2 +} argon2_type; + +/* Version of the algorithm */ +typedef enum Argon2_version { + ARGON2_VERSION_10 = 0x10, + ARGON2_VERSION_13 = 0x13, + ARGON2_VERSION_NUMBER = ARGON2_VERSION_13 +} argon2_version; diff --git a/src/crypto/randomx/argon2_core.c b/src/crypto/randomx/argon2_core.c new file mode 100644 index 00000000..e9174222 --- /dev/null +++ b/src/crypto/randomx/argon2_core.c @@ -0,0 +1,516 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + + /*For memory wiping*/ +#ifdef _MSC_VER +#include +#include /* For SecureZeroMemory */ +#endif +#if defined __STDC_LIB_EXT1__ +#define __STDC_WANT_LIB_EXT1__ 1 +#endif +#define VC_GE_2005(version) (version >= 1400) + +#include +#include +#include + +#include "argon2_core.h" +#include "blake2/blake2.h" +#include "blake2/blake2-impl.h" + +#ifdef GENKAT +#include "genkat.h" +#endif + +#if defined(__clang__) +#if __has_attribute(optnone) +#define NOT_OPTIMIZED __attribute__((optnone)) +#endif +#elif defined(__GNUC__) +#define GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 40400 +#define NOT_OPTIMIZED __attribute__((optimize("O0"))) +#endif +#endif +#ifndef NOT_OPTIMIZED +#define NOT_OPTIMIZED +#endif + +/***************Instance and Position constructors**********/ +void rxa2_init_block_value(block *b, uint8_t in) { memset(b->v, in, sizeof(b->v)); } + +void rxa2_copy_block(block *dst, const block *src) { + memcpy(dst->v, src->v, sizeof(uint64_t) * ARGON2_QWORDS_IN_BLOCK); +} + +void rxa2_xor_block(block *dst, const block *src) { + int i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] ^= src->v[i]; + } +} + +static void load_block(block *dst, const void *input) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + dst->v[i] = load64((const uint8_t *)input + i * sizeof(dst->v[i])); + } +} + +static void store_block(void *output, const block *src) { + unsigned i; + for (i = 0; i < ARGON2_QWORDS_IN_BLOCK; ++i) { + store64((uint8_t *)output + i * sizeof(src->v[i]), src->v[i]); + } +} + +/***************Memory functions*****************/ + +int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory, + size_t num, size_t size) { + size_t memory_size = num * size; + if (memory == NULL) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + /* 1. Check for multiplication overflow */ + if (size != 0 && memory_size / size != num) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + /* 2. Try to allocate with appropriate allocator */ + if (context->allocate_cbk) { + (context->allocate_cbk)(memory, memory_size); + } + else { + *memory = (uint8_t*)malloc(memory_size); + } + + if (*memory == NULL) { + return ARGON2_MEMORY_ALLOCATION_ERROR; + } + + return ARGON2_OK; +} + +void rxa2_free_memory(const argon2_context *context, uint8_t *memory, + size_t num, size_t size) { + size_t memory_size = num * size; + rxa2_clear_internal_memory(memory, memory_size); + if (context->free_cbk) { + (context->free_cbk)(memory, memory_size); + } + else { + free(memory); + } +} + +void NOT_OPTIMIZED rxa2_secure_wipe_memory(void *v, size_t n) { +#if defined(_MSC_VER) && VC_GE_2005(_MSC_VER) + SecureZeroMemory(v, n); +#elif defined memset_s + memset_s(v, n, 0, n); +#elif defined(__OpenBSD__) + explicit_bzero(v, n); +#else + static void *(*const volatile memset_sec)(void *, int, size_t) = &memset; + memset_sec(v, 0, n); +#endif +} + +/* Memory clear flag defaults to true. */ +#define FLAG_clear_internal_memory 0 +void rxa2_clear_internal_memory(void *v, size_t n) { + if (FLAG_clear_internal_memory && v) { + rxa2_secure_wipe_memory(v, n); + } +} + +uint32_t rxa2_index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane) { + /* + * Pass 0: + * This lane : all already finished segments plus already constructed + * blocks in this segment + * Other lanes : all already finished segments + * Pass 1+: + * This lane : (SYNC_POINTS - 1) last segments plus already constructed + * blocks in this segment + * Other lanes : (SYNC_POINTS - 1) last segments + */ + uint32_t reference_area_size; + uint64_t relative_position; + uint32_t start_position, absolute_position; + + if (0 == position->pass) { + /* First pass */ + if (0 == position->slice) { + /* First slice */ + reference_area_size = + position->index - 1; /* all but the previous */ + } + else { + if (same_lane) { + /* The same lane => add current segment */ + reference_area_size = + position->slice * instance->segment_length + + position->index - 1; + } + else { + reference_area_size = + position->slice * instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + } + else { + /* Second pass */ + if (same_lane) { + reference_area_size = instance->lane_length - + instance->segment_length + position->index - + 1; + } + else { + reference_area_size = instance->lane_length - + instance->segment_length + + ((position->index == 0) ? (-1) : 0); + } + } + + /* 1.2.4. Mapping pseudo_rand to 0.. and produce + * relative position */ + relative_position = pseudo_rand; + relative_position = relative_position * relative_position >> 32; + relative_position = reference_area_size - 1 - + (reference_area_size * relative_position >> 32); + + /* 1.2.5 Computing starting position */ + start_position = 0; + + if (0 != position->pass) { + start_position = (position->slice == ARGON2_SYNC_POINTS - 1) + ? 0 + : (position->slice + 1) * instance->segment_length; + } + + /* 1.2.6. Computing absolute position */ + absolute_position = (start_position + relative_position) % + instance->lane_length; /* absolute position */ + return absolute_position; +} + +/* Single-threaded version for p=1 case */ +static int fill_memory_blocks_st(argon2_instance_t *instance) { + uint32_t r, s, l; + + for (r = 0; r < instance->passes; ++r) { + for (s = 0; s < ARGON2_SYNC_POINTS; ++s) { + for (l = 0; l < instance->lanes; ++l) { + argon2_position_t position = { r, l, (uint8_t)s, 0 }; + rxa2_fill_segment(instance, position); + } + } +#ifdef GENKAT + internal_kat(instance, r); /* Print all memory blocks */ +#endif + } + return ARGON2_OK; +} + +int rxa2_fill_memory_blocks(argon2_instance_t *instance) { + if (instance == NULL || instance->lanes == 0) { + return ARGON2_INCORRECT_PARAMETER; + } + return fill_memory_blocks_st(instance); +} + +int rxa2_validate_inputs(const argon2_context *context) { + if (NULL == context) { + return ARGON2_INCORRECT_PARAMETER; + } + + if (NULL == context->out) { + return ARGON2_OUTPUT_PTR_NULL; + } + + /* Validate output length */ + if (ARGON2_MIN_OUTLEN > context->outlen) { + return ARGON2_OUTPUT_TOO_SHORT; + } + + if (ARGON2_MAX_OUTLEN < context->outlen) { + return ARGON2_OUTPUT_TOO_LONG; + } + + /* Validate password (required param) */ + if (NULL == context->pwd) { + if (0 != context->pwdlen) { + return ARGON2_PWD_PTR_MISMATCH; + } + } + + if (ARGON2_MIN_PWD_LENGTH > context->pwdlen) { + return ARGON2_PWD_TOO_SHORT; + } + + if (ARGON2_MAX_PWD_LENGTH < context->pwdlen) { + return ARGON2_PWD_TOO_LONG; + } + + /* Validate salt (required param) */ + if (NULL == context->salt) { + if (0 != context->saltlen) { + return ARGON2_SALT_PTR_MISMATCH; + } + } + + if (ARGON2_MIN_SALT_LENGTH > context->saltlen) { + return ARGON2_SALT_TOO_SHORT; + } + + if (ARGON2_MAX_SALT_LENGTH < context->saltlen) { + return ARGON2_SALT_TOO_LONG; + } + + /* Validate secret (optional param) */ + if (NULL == context->secret) { + if (0 != context->secretlen) { + return ARGON2_SECRET_PTR_MISMATCH; + } + } + else { + if (ARGON2_MIN_SECRET > context->secretlen) { + return ARGON2_SECRET_TOO_SHORT; + } + if (ARGON2_MAX_SECRET < context->secretlen) { + return ARGON2_SECRET_TOO_LONG; + } + } + + /* Validate associated data (optional param) */ + if (NULL == context->ad) { + if (0 != context->adlen) { + return ARGON2_AD_PTR_MISMATCH; + } + } + else { + if (ARGON2_MIN_AD_LENGTH > context->adlen) { + return ARGON2_AD_TOO_SHORT; + } + if (ARGON2_MAX_AD_LENGTH < context->adlen) { + return ARGON2_AD_TOO_LONG; + } + } + + /* Validate memory cost */ + if (ARGON2_MIN_MEMORY > context->m_cost) { + return ARGON2_MEMORY_TOO_LITTLE; + } + + if (ARGON2_MAX_MEMORY < context->m_cost) { + return ARGON2_MEMORY_TOO_MUCH; + } + + if (context->m_cost < 8 * context->lanes) { + return ARGON2_MEMORY_TOO_LITTLE; + } + + /* Validate time cost */ + if (ARGON2_MIN_TIME > context->t_cost) { + return ARGON2_TIME_TOO_SMALL; + } + + if (ARGON2_MAX_TIME < context->t_cost) { + return ARGON2_TIME_TOO_LARGE; + } + + /* Validate lanes */ + if (ARGON2_MIN_LANES > context->lanes) { + return ARGON2_LANES_TOO_FEW; + } + + if (ARGON2_MAX_LANES < context->lanes) { + return ARGON2_LANES_TOO_MANY; + } + + /* Validate threads */ + if (ARGON2_MIN_THREADS > context->threads) { + return ARGON2_THREADS_TOO_FEW; + } + + if (ARGON2_MAX_THREADS < context->threads) { + return ARGON2_THREADS_TOO_MANY; + } + + if (NULL != context->allocate_cbk && NULL == context->free_cbk) { + return ARGON2_FREE_MEMORY_CBK_NULL; + } + + if (NULL == context->allocate_cbk && NULL != context->free_cbk) { + return ARGON2_ALLOCATE_MEMORY_CBK_NULL; + } + + return ARGON2_OK; +} + +void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance) { + uint32_t l; + /* Make the first and second block in each lane as G(H0||0||i) or + G(H0||1||i) */ + uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE]; + for (l = 0; l < instance->lanes; ++l) { + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 0); + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH + 4, l); + rxa2_blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, + ARGON2_PREHASH_SEED_LENGTH); + load_block(&instance->memory[l * instance->lane_length + 0], + blockhash_bytes); + + store32(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, 1); + rxa2_blake2b_long(blockhash_bytes, ARGON2_BLOCK_SIZE, blockhash, + ARGON2_PREHASH_SEED_LENGTH); + load_block(&instance->memory[l * instance->lane_length + 1], + blockhash_bytes); + } + rxa2_clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE); +} + +void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, argon2_type type) { + blake2b_state BlakeHash; + uint8_t value[sizeof(uint32_t)]; + + if (NULL == context || NULL == blockhash) { + return; + } + + blake2b_init(&BlakeHash, ARGON2_PREHASH_DIGEST_LENGTH); + + store32(&value, context->lanes); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->outlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->m_cost); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->t_cost); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->version); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, (uint32_t)type); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + store32(&value, context->pwdlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->pwd != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->pwd, + context->pwdlen); + + if (context->flags & ARGON2_FLAG_CLEAR_PASSWORD) { + rxa2_secure_wipe_memory(context->pwd, context->pwdlen); + context->pwdlen = 0; + } + } + + store32(&value, context->saltlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->salt != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->salt, context->saltlen); + } + + store32(&value, context->secretlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->secret != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->secret, + context->secretlen); + + if (context->flags & ARGON2_FLAG_CLEAR_SECRET) { + rxa2_secure_wipe_memory(context->secret, context->secretlen); + context->secretlen = 0; + } + } + + store32(&value, context->adlen); + blake2b_update(&BlakeHash, (const uint8_t *)&value, sizeof(value)); + + if (context->ad != NULL) { + blake2b_update(&BlakeHash, (const uint8_t *)context->ad, + context->adlen); + } + + blake2b_final(&BlakeHash, blockhash, ARGON2_PREHASH_DIGEST_LENGTH); +} + +int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context) { + uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; + int result = ARGON2_OK; + + if (instance == NULL || context == NULL) + return ARGON2_INCORRECT_PARAMETER; + instance->context_ptr = context; + + /* 1. Memory allocation */ + /*result = allocate_memory(context, (uint8_t **)&(instance->memory), instance->memory_blocks, sizeof(block)); + if (result != ARGON2_OK) { + return result; + }*/ + + /* 2. Initial hashing */ + /* H_0 + 8 extra bytes to produce the first blocks */ + /* uint8_t blockhash[ARGON2_PREHASH_SEED_LENGTH]; */ + /* Hashing all inputs */ + rxa2_initial_hash(blockhash, context, instance->type); + /* Zeroing 8 extra bytes */ + rxa2_clear_internal_memory(blockhash + ARGON2_PREHASH_DIGEST_LENGTH, + ARGON2_PREHASH_SEED_LENGTH - + ARGON2_PREHASH_DIGEST_LENGTH); + + /* 3. Creating first blocks, we always have at least two blocks in a slice + */ + rxa2_fill_first_blocks(blockhash, instance); + /* Clearing the hash */ + rxa2_clear_internal_memory(blockhash, ARGON2_PREHASH_SEED_LENGTH); + + return ARGON2_OK; +} diff --git a/src/crypto/randomx/argon2_core.h b/src/crypto/randomx/argon2_core.h new file mode 100644 index 00000000..efd56d99 --- /dev/null +++ b/src/crypto/randomx/argon2_core.h @@ -0,0 +1,254 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef ARGON2_CORE_H +#define ARGON2_CORE_H + +#include +#include "argon2.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +#define CONST_CAST(x) (x)(uintptr_t) + + /**********************Argon2 internal constants*******************************/ + +enum argon2_core_constants { + /* Memory block size in bytes */ + ARGON2_BLOCK_SIZE = 1024, + ARGON2_QWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 8, + ARGON2_OWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 16, + ARGON2_HWORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 32, + ARGON2_512BIT_WORDS_IN_BLOCK = ARGON2_BLOCK_SIZE / 64, + + /* Number of pseudo-random values generated by one call to Blake in Argon2i + to + generate reference block positions */ + ARGON2_ADDRESSES_IN_BLOCK = 128, + + /* Pre-hashing digest length and its extension*/ + ARGON2_PREHASH_DIGEST_LENGTH = 64, + ARGON2_PREHASH_SEED_LENGTH = 72 +}; + +/*************************Argon2 internal data types***********************/ + +/* + * Structure for the (1KB) memory block implemented as 128 64-bit words. + * Memory blocks can be copied, XORed. Internal words can be accessed by [] (no + * bounds checking). + */ +typedef struct block_ { uint64_t v[ARGON2_QWORDS_IN_BLOCK]; } block; + +/*****************Functions that work with the block******************/ + +/* Initialize each byte of the block with @in */ +void rxa2_init_block_value(block *b, uint8_t in); + +/* Copy block @src to block @dst */ +void rxa2_copy_block(block *dst, const block *src); + +/* XOR @src onto @dst bytewise */ +void rxa2_xor_block(block *dst, const block *src); + +/* + * Argon2 instance: memory pointer, number of passes, amount of memory, type, + * and derived values. + * Used to evaluate the number and location of blocks to construct in each + * thread + */ +typedef struct Argon2_instance_t { + block *memory; /* Memory pointer */ + uint32_t version; + uint32_t passes; /* Number of passes */ + uint32_t memory_blocks; /* Number of blocks in memory */ + uint32_t segment_length; + uint32_t lane_length; + uint32_t lanes; + uint32_t threads; + argon2_type type; + int print_internals; /* whether to print the memory blocks */ + argon2_context *context_ptr; /* points back to original context */ +} argon2_instance_t; + +/* + * Argon2 position: where we construct the block right now. Used to distribute + * work between threads. + */ +typedef struct Argon2_position_t { + uint32_t pass; + uint32_t lane; + uint8_t slice; + uint32_t index; +} argon2_position_t; + +/*Struct that holds the inputs for thread handling FillSegment*/ +typedef struct Argon2_thread_data { + argon2_instance_t *instance_ptr; + argon2_position_t pos; +} argon2_thread_data; + +/*************************Argon2 core functions********************************/ + +/* Allocates memory to the given pointer, uses the appropriate allocator as + * specified in the context. Total allocated memory is num*size. + * @param context argon2_context which specifies the allocator + * @param memory pointer to the pointer to the memory + * @param size the size in bytes for each element to be allocated + * @param num the number of elements to be allocated + * @return ARGON2_OK if @memory is a valid pointer and memory is allocated + */ +int rxa2_allocate_memory(const argon2_context *context, uint8_t **memory, + size_t num, size_t size); + +/* + * Frees memory at the given pointer, uses the appropriate deallocator as + * specified in the context. Also cleans the memory using clear_internal_memory. + * @param context argon2_context which specifies the deallocator + * @param memory pointer to buffer to be freed + * @param size the size in bytes for each element to be deallocated + * @param num the number of elements to be deallocated + */ +void rxa2_free_memory(const argon2_context *context, uint8_t *memory, + size_t num, size_t size); + +/* Function that securely cleans the memory. This ignores any flags set + * regarding clearing memory. Usually one just calls clear_internal_memory. + * @param mem Pointer to the memory + * @param s Memory size in bytes + */ +void rxa2_secure_wipe_memory(void *v, size_t n); + +/* Function that securely clears the memory if FLAG_clear_internal_memory is + * set. If the flag isn't set, this function does nothing. + * @param mem Pointer to the memory + * @param s Memory size in bytes + */ +void rxa2_clear_internal_memory(void *v, size_t n); + +/* + * Computes absolute position of reference block in the lane following a skewed + * distribution and using a pseudo-random value as input + * @param instance Pointer to the current instance + * @param position Pointer to the current position + * @param pseudo_rand 32-bit pseudo-random value used to determine the position + * @param same_lane Indicates if the block will be taken from the current lane. + * If so we can reference the current segment + * @pre All pointers must be valid + */ +uint32_t rxa2_index_alpha(const argon2_instance_t *instance, + const argon2_position_t *position, uint32_t pseudo_rand, + int same_lane); + +/* + * Function that validates all inputs against predefined restrictions and return + * an error code + * @param context Pointer to current Argon2 context + * @return ARGON2_OK if everything is all right, otherwise one of error codes + * (all defined in + */ +int rxa2_validate_inputs(const argon2_context *context); + +/* + * Hashes all the inputs into @a blockhash[PREHASH_DIGEST_LENGTH], clears + * password and secret if needed + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param blockhash Buffer for pre-hashing digest + * @param type Argon2 type + * @pre @a blockhash must have at least @a PREHASH_DIGEST_LENGTH bytes + * allocated + */ +void rxa2_initial_hash(uint8_t *blockhash, argon2_context *context, + argon2_type type); + +/* + * Function creates first 2 blocks per lane + * @param instance Pointer to the current instance + * @param blockhash Pointer to the pre-hashing digest + * @pre blockhash must point to @a PREHASH_SEED_LENGTH allocated values + */ +void rxa2_fill_first_blocks(uint8_t *blockhash, const argon2_instance_t *instance); + +/* + * Function allocates memory, hashes the inputs with Blake, and creates first + * two blocks. Returns the pointer to the main memory with 2 blocks per lane + * initialized + * @param context Pointer to the Argon2 internal structure containing memory + * pointer, and parameters for time and space requirements. + * @param instance Current Argon2 instance + * @return Zero if successful, -1 if memory failed to allocate. @context->state + * will be modified if successful. + */ +int rxa2_argon_initialize(argon2_instance_t *instance, argon2_context *context); + +/* + * XORing the last block of each lane, hashing it, making the tag. Deallocates + * the memory. + * @param context Pointer to current Argon2 context (use only the out parameters + * from it) + * @param instance Pointer to current instance of Argon2 + * @pre instance->state must point to necessary amount of memory + * @pre context->out must point to outlen bytes of memory + * @pre if context->free_cbk is not NULL, it should point to a function that + * deallocates memory + */ +void rxa2_finalize(const argon2_context *context, argon2_instance_t *instance); + +/* + * Function that fills the segment using previous segments also from other + * threads + * @param context current context + * @param instance Pointer to the current instance + * @param position Current position + * @pre all block pointers must be valid + */ +void rxa2_fill_segment(const argon2_instance_t *instance, + argon2_position_t position); + +/* + * Function that fills the entire memory t_cost times based on the first two + * blocks in each lane + * @param instance Pointer to the current instance + * @return ARGON2_OK if successful, @context->state + */ +int rxa2_fill_memory_blocks(argon2_instance_t *instance); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/crypto/randomx/argon2_ref.c b/src/crypto/randomx/argon2_ref.c new file mode 100644 index 00000000..018b985b --- /dev/null +++ b/src/crypto/randomx/argon2_ref.c @@ -0,0 +1,214 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include + +#include "argon2.h" +#include "argon2_core.h" + +#include "blake2/blamka-round-ref.h" +#include "blake2/blake2-impl.h" +#include "blake2/blake2.h" + + /* + * Function fills a new memory block and optionally XORs the old block over the new one. + * @next_block must be initialized. + * @param prev_block Pointer to the previous block + * @param ref_block Pointer to the reference block + * @param next_block Pointer to the block to be constructed + * @param with_xor Whether to XOR into the new block (1) or just overwrite (0) + * @pre all block pointers must be valid + */ +static void fill_block(const block *prev_block, const block *ref_block, + block *next_block, int with_xor) { + block blockR, block_tmp; + unsigned i; + + rxa2_copy_block(&blockR, ref_block); + rxa2_xor_block(&blockR, prev_block); + rxa2_copy_block(&block_tmp, &blockR); + /* Now blockR = ref_block + prev_block and block_tmp = ref_block + prev_block */ + if (with_xor) { + /* Saving the next block contents for XOR over: */ + rxa2_xor_block(&block_tmp, next_block); + /* Now blockR = ref_block + prev_block and + block_tmp = ref_block + prev_block + next_block */ + } + + /* Apply Blake2 on columns of 64-bit words: (0,1,...,15) , then + (16,17,..31)... finally (112,113,...127) */ + for (i = 0; i < 8; ++i) { + BLAKE2_ROUND_NOMSG( + blockR.v[16 * i], blockR.v[16 * i + 1], blockR.v[16 * i + 2], + blockR.v[16 * i + 3], blockR.v[16 * i + 4], blockR.v[16 * i + 5], + blockR.v[16 * i + 6], blockR.v[16 * i + 7], blockR.v[16 * i + 8], + blockR.v[16 * i + 9], blockR.v[16 * i + 10], blockR.v[16 * i + 11], + blockR.v[16 * i + 12], blockR.v[16 * i + 13], blockR.v[16 * i + 14], + blockR.v[16 * i + 15]); + } + + /* Apply Blake2 on rows of 64-bit words: (0,1,16,17,...112,113), then + (2,3,18,19,...,114,115).. finally (14,15,30,31,...,126,127) */ + for (i = 0; i < 8; i++) { + BLAKE2_ROUND_NOMSG( + blockR.v[2 * i], blockR.v[2 * i + 1], blockR.v[2 * i + 16], + blockR.v[2 * i + 17], blockR.v[2 * i + 32], blockR.v[2 * i + 33], + blockR.v[2 * i + 48], blockR.v[2 * i + 49], blockR.v[2 * i + 64], + blockR.v[2 * i + 65], blockR.v[2 * i + 80], blockR.v[2 * i + 81], + blockR.v[2 * i + 96], blockR.v[2 * i + 97], blockR.v[2 * i + 112], + blockR.v[2 * i + 113]); + } + + rxa2_copy_block(next_block, &block_tmp); + rxa2_xor_block(next_block, &blockR); +} + +static void next_addresses(block *address_block, block *input_block, + const block *zero_block) { + input_block->v[6]++; + fill_block(zero_block, input_block, address_block, 0); + fill_block(zero_block, address_block, address_block, 0); +} + +void rxa2_fill_segment(const argon2_instance_t *instance, + argon2_position_t position) { + block *ref_block = NULL, *curr_block = NULL; + block address_block, input_block, zero_block; + uint64_t pseudo_rand, ref_index, ref_lane; + uint32_t prev_offset, curr_offset; + uint32_t starting_index; + uint32_t i; + int data_independent_addressing; + + if (instance == NULL) { + return; + } + + data_independent_addressing = + (instance->type == Argon2_i) || + (instance->type == Argon2_id && (position.pass == 0) && + (position.slice < ARGON2_SYNC_POINTS / 2)); + + if (data_independent_addressing) { + rxa2_init_block_value(&zero_block, 0); + rxa2_init_block_value(&input_block, 0); + + input_block.v[0] = position.pass; + input_block.v[1] = position.lane; + input_block.v[2] = position.slice; + input_block.v[3] = instance->memory_blocks; + input_block.v[4] = instance->passes; + input_block.v[5] = instance->type; + } + + starting_index = 0; + + if ((0 == position.pass) && (0 == position.slice)) { + starting_index = 2; /* we have already generated the first two blocks */ + + /* Don't forget to generate the first block of addresses: */ + if (data_independent_addressing) { + next_addresses(&address_block, &input_block, &zero_block); + } + } + + /* Offset of the current block */ + curr_offset = position.lane * instance->lane_length + + position.slice * instance->segment_length + starting_index; + + if (0 == curr_offset % instance->lane_length) { + /* Last block in this lane */ + prev_offset = curr_offset + instance->lane_length - 1; + } + else { + /* Previous block */ + prev_offset = curr_offset - 1; + } + + for (i = starting_index; i < instance->segment_length; + ++i, ++curr_offset, ++prev_offset) { + /*1.1 Rotating prev_offset if needed */ + if (curr_offset % instance->lane_length == 1) { + prev_offset = curr_offset - 1; + } + + /* 1.2 Computing the index of the reference block */ + /* 1.2.1 Taking pseudo-random value from the previous block */ + if (data_independent_addressing) { + if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) { + next_addresses(&address_block, &input_block, &zero_block); + } + pseudo_rand = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK]; + } + else { + pseudo_rand = instance->memory[prev_offset].v[0]; + } + + /* 1.2.2 Computing the lane of the reference block */ + ref_lane = ((pseudo_rand >> 32)) % instance->lanes; + + if ((position.pass == 0) && (position.slice == 0)) { + /* Can not reference other lanes yet */ + ref_lane = position.lane; + } + + /* 1.2.3 Computing the number of possible reference block within the + * lane. + */ + position.index = i; + ref_index = rxa2_index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF, + ref_lane == position.lane); + + /* 2 Creating a new block */ + ref_block = + instance->memory + instance->lane_length * ref_lane + ref_index; + curr_block = instance->memory + curr_offset; + if (ARGON2_VERSION_10 == instance->version) { + /* version 1.2.1 and earlier: overwrite, not XOR */ + fill_block(instance->memory + prev_offset, ref_block, curr_block, 0); + } + else { + if (0 == position.pass) { + fill_block(instance->memory + prev_offset, ref_block, + curr_block, 0); + } + else { + fill_block(instance->memory + prev_offset, ref_block, + curr_block, 1); + } + } + } +} diff --git a/src/crypto/randomx/asm/program_epilogue_linux.inc b/src/crypto/randomx/asm/program_epilogue_linux.inc new file mode 100644 index 00000000..eaacae54 --- /dev/null +++ b/src/crypto/randomx/asm/program_epilogue_linux.inc @@ -0,0 +1,10 @@ + ;# restore callee-saved registers - System V AMD64 ABI + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + + ;# program finished + ret 0 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_epilogue_store.inc b/src/crypto/randomx/asm/program_epilogue_store.inc new file mode 100644 index 00000000..b94fa4d9 --- /dev/null +++ b/src/crypto/randomx/asm/program_epilogue_store.inc @@ -0,0 +1,19 @@ + ;# save VM register values + pop rcx + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + movdqa xmmword ptr [rcx+64], xmm0 + movdqa xmmword ptr [rcx+80], xmm1 + movdqa xmmword ptr [rcx+96], xmm2 + movdqa xmmword ptr [rcx+112], xmm3 + lea rcx, [rcx+64] + movdqa xmmword ptr [rcx+64], xmm4 + movdqa xmmword ptr [rcx+80], xmm5 + movdqa xmmword ptr [rcx+96], xmm6 + movdqa xmmword ptr [rcx+112], xmm7 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_epilogue_win64.inc b/src/crypto/randomx/asm/program_epilogue_win64.inc new file mode 100644 index 00000000..8d70a0a3 --- /dev/null +++ b/src/crypto/randomx/asm/program_epilogue_win64.inc @@ -0,0 +1,24 @@ + ;# restore callee-saved registers - Microsoft x64 calling convention + movdqu xmm15, xmmword ptr [rsp] + movdqu xmm14, xmmword ptr [rsp+16] + movdqu xmm13, xmmword ptr [rsp+32] + movdqu xmm12, xmmword ptr [rsp+48] + movdqu xmm11, xmmword ptr [rsp+64] + add rsp, 80 + movdqu xmm10, xmmword ptr [rsp] + movdqu xmm9, xmmword ptr [rsp+16] + movdqu xmm8, xmmword ptr [rsp+32] + movdqu xmm7, xmmword ptr [rsp+48] + movdqu xmm6, xmmword ptr [rsp+64] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + + ;# program finished + ret diff --git a/src/crypto/randomx/asm/program_loop_load.inc b/src/crypto/randomx/asm/program_loop_load.inc new file mode 100644 index 00000000..374af66a --- /dev/null +++ b/src/crypto/randomx/asm/program_loop_load.inc @@ -0,0 +1,32 @@ + mov rdx, rax + and eax, RANDOMX_SCRATCHPAD_MASK + lea rcx, [rsi+rax] + push rcx + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + ror rdx, 32 + and edx, RANDOMX_SCRATCHPAD_MASK + lea rcx, [rsi+rdx] + push rcx + cvtdq2pd xmm0, qword ptr [rcx+0] + cvtdq2pd xmm1, qword ptr [rcx+8] + cvtdq2pd xmm2, qword ptr [rcx+16] + cvtdq2pd xmm3, qword ptr [rcx+24] + cvtdq2pd xmm4, qword ptr [rcx+32] + cvtdq2pd xmm5, qword ptr [rcx+40] + cvtdq2pd xmm6, qword ptr [rcx+48] + cvtdq2pd xmm7, qword ptr [rcx+56] + andps xmm4, xmm13 + andps xmm5, xmm13 + andps xmm6, xmm13 + andps xmm7, xmm13 + orps xmm4, xmm14 + orps xmm5, xmm14 + orps xmm6, xmm14 + orps xmm7, xmm14 diff --git a/src/crypto/randomx/asm/program_loop_store.inc b/src/crypto/randomx/asm/program_loop_store.inc new file mode 100644 index 00000000..53164cb0 --- /dev/null +++ b/src/crypto/randomx/asm/program_loop_store.inc @@ -0,0 +1,19 @@ + xor eax, eax + pop rcx + mov qword ptr [rcx+0], r8 + mov qword ptr [rcx+8], r9 + mov qword ptr [rcx+16], r10 + mov qword ptr [rcx+24], r11 + mov qword ptr [rcx+32], r12 + mov qword ptr [rcx+40], r13 + mov qword ptr [rcx+48], r14 + mov qword ptr [rcx+56], r15 + pop rcx + xorpd xmm0, xmm4 + xorpd xmm1, xmm5 + xorpd xmm2, xmm6 + xorpd xmm3, xmm7 + movapd xmmword ptr [rcx+0], xmm0 + movapd xmmword ptr [rcx+16], xmm1 + movapd xmmword ptr [rcx+32], xmm2 + movapd xmmword ptr [rcx+48], xmm3 diff --git a/src/crypto/randomx/asm/program_prologue_linux.inc b/src/crypto/randomx/asm/program_prologue_linux.inc new file mode 100644 index 00000000..ffde152c --- /dev/null +++ b/src/crypto/randomx/asm/program_prologue_linux.inc @@ -0,0 +1,34 @@ + ;# callee-saved registers - System V AMD64 ABI + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + + ;# function arguments + mov rbx, rcx ;# loop counter + push rdi ;# RegisterFile& registerFile + mov rcx, rdi + mov rbp, qword ptr [rsi] ;# "mx", "ma" + mov rdi, qword ptr [rsi+8] ;# uint8_t* dataset + mov rsi, rdx ;# uint8_t* scratchpad + + mov rax, rbp + + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] diff --git a/src/crypto/randomx/asm/program_prologue_win64.inc b/src/crypto/randomx/asm/program_prologue_win64.inc new file mode 100644 index 00000000..590a98de --- /dev/null +++ b/src/crypto/randomx/asm/program_prologue_win64.inc @@ -0,0 +1,47 @@ + ;# callee-saved registers - Microsoft x64 calling convention + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm6 + movdqu xmmword ptr [rsp+48], xmm7 + movdqu xmmword ptr [rsp+32], xmm8 + movdqu xmmword ptr [rsp+16], xmm9 + movdqu xmmword ptr [rsp+0], xmm10 + sub rsp, 80 + movdqu xmmword ptr [rsp+64], xmm11 + movdqu xmmword ptr [rsp+48], xmm12 + movdqu xmmword ptr [rsp+32], xmm13 + movdqu xmmword ptr [rsp+16], xmm14 + movdqu xmmword ptr [rsp+0], xmm15 + + ;# function arguments + push rcx ;# RegisterFile& registerFile + mov rbp, qword ptr [rdx] ;# "mx", "ma" + mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset + mov rsi, r8 ;# uint8_t* scratchpad + mov rbx, r9 ;# loop counter + + mov rax, rbp + + ;# zero integer registers + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + + ;# load constant registers + lea rcx, [rcx+120] + movapd xmm8, xmmword ptr [rcx+72] + movapd xmm9, xmmword ptr [rcx+88] + movapd xmm10, xmmword ptr [rcx+104] + movapd xmm11, xmmword ptr [rcx+120] diff --git a/src/crypto/randomx/asm/program_read_dataset.inc b/src/crypto/randomx/asm/program_read_dataset.inc new file mode 100644 index 00000000..b81d0c32 --- /dev/null +++ b/src/crypto/randomx/asm/program_read_dataset.inc @@ -0,0 +1,17 @@ + xor rbp, rax ;# modify "mx" + mov edx, ebp ;# edx = mx + and edx, RANDOMX_DATASET_BASE_MASK + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + mov edx, ebp ;# edx = ma + and edx, RANDOMX_DATASET_BASE_MASK + lea rcx, [rdi+rdx] ;# dataset cache line + xor r8, qword ptr [rcx+0] + xor r9, qword ptr [rcx+8] + xor r10, qword ptr [rcx+16] + xor r11, qword ptr [rcx+24] + xor r12, qword ptr [rcx+32] + xor r13, qword ptr [rcx+40] + xor r14, qword ptr [rcx+48] + xor r15, qword ptr [rcx+56] + \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_read_dataset_sshash_fin.inc b/src/crypto/randomx/asm/program_read_dataset_sshash_fin.inc new file mode 100644 index 00000000..f5a067d2 --- /dev/null +++ b/src/crypto/randomx/asm/program_read_dataset_sshash_fin.inc @@ -0,0 +1,10 @@ + mov rbx, qword ptr [rsp+64] + xor r8, qword ptr [rsp+56] + xor r9, qword ptr [rsp+48] + xor r10, qword ptr [rsp+40] + xor r11, qword ptr [rsp+32] + xor r12, qword ptr [rsp+24] + xor r13, qword ptr [rsp+16] + xor r14, qword ptr [rsp+8] + xor r15, qword ptr [rsp+0] + add rsp, 72 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_read_dataset_sshash_init.inc b/src/crypto/randomx/asm/program_read_dataset_sshash_init.inc new file mode 100644 index 00000000..6fe9525d --- /dev/null +++ b/src/crypto/randomx/asm/program_read_dataset_sshash_init.inc @@ -0,0 +1,17 @@ + sub rsp, 72 + mov qword ptr [rsp+64], rbx + mov qword ptr [rsp+56], r8 + mov qword ptr [rsp+48], r9 + mov qword ptr [rsp+40], r10 + mov qword ptr [rsp+32], r11 + mov qword ptr [rsp+24], r12 + mov qword ptr [rsp+16], r13 + mov qword ptr [rsp+8], r14 + mov qword ptr [rsp+0], r15 + xor rbp, rax ;# modify "mx" + ror rbp, 32 ;# swap "ma" and "mx" + mov ebx, ebp ;# ecx = ma + and ebx, RANDOMX_DATASET_BASE_MASK + shr ebx, 6 ;# ebx = Dataset block number + ;# add ebx, datasetOffset / 64 + ;# call 32768 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_sshash_constants.inc b/src/crypto/randomx/asm/program_sshash_constants.inc new file mode 100644 index 00000000..53dc1755 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_constants.inc @@ -0,0 +1,24 @@ +r0_mul: + ;#/ 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_add: + ;#/ 9298411001130361340 + db 252, 161, 245, 89, 138, 151, 10, 129 +r2_add: + ;#/ 12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_add: + ;#/ 9306329213124626780 + db 92, 73, 34, 191, 28, 185, 38, 129 +r4_add: + ;#/ 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_add: + ;#/ 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_add: + ;#/ 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_add: + ;#/ 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_sshash_load.inc b/src/crypto/randomx/asm/program_sshash_load.inc new file mode 100644 index 00000000..53513569 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_load.inc @@ -0,0 +1,8 @@ + xor r8, qword ptr [rbx+0] + xor r9, qword ptr [rbx+8] + xor r10, qword ptr [rbx+16] + xor r11, qword ptr [rbx+24] + xor r12, qword ptr [rbx+32] + xor r13, qword ptr [rbx+40] + xor r14, qword ptr [rbx+48] + xor r15, qword ptr [rbx+56] \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_sshash_prefetch.inc b/src/crypto/randomx/asm/program_sshash_prefetch.inc new file mode 100644 index 00000000..26efb515 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_prefetch.inc @@ -0,0 +1,4 @@ + and rbx, RANDOMX_CACHE_MASK + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_xmm_constants.inc b/src/crypto/randomx/asm/program_xmm_constants.inc new file mode 100644 index 00000000..296237a4 --- /dev/null +++ b/src/crypto/randomx/asm/program_xmm_constants.inc @@ -0,0 +1,6 @@ +mantissaMask: + db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0 +exp240: + db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +scaleMask: + db 0, 0, 0, 0, 0, 0, 240, 128, 0, 0, 0, 0, 0, 0, 240, 128 \ No newline at end of file diff --git a/src/crypto/randomx/asm/randomx_reciprocal.inc b/src/crypto/randomx/asm/randomx_reciprocal.inc new file mode 100644 index 00000000..e1f22fdc --- /dev/null +++ b/src/crypto/randomx/asm/randomx_reciprocal.inc @@ -0,0 +1,7 @@ + mov edx, 1 + mov r8, rcx + xor eax, eax + bsr rcx, rcx + shl rdx, cl + div r8 + ret \ No newline at end of file diff --git a/src/crypto/randomx/blake2/blake2-impl.h b/src/crypto/randomx/blake2/blake2-impl.h new file mode 100644 index 00000000..617f7c8a --- /dev/null +++ b/src/crypto/randomx/blake2/blake2-impl.h @@ -0,0 +1,76 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef PORTABLE_BLAKE2_IMPL_H +#define PORTABLE_BLAKE2_IMPL_H + +#include + +#include "endian.h" + +static FORCE_INLINE uint64_t load48(const void *src) { + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + return w; +} + +static FORCE_INLINE void store48(void *dst, uint64_t w) { + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +} + +static FORCE_INLINE uint32_t rotr32(const uint32_t w, const unsigned c) { + return (w >> c) | (w << (32 - c)); +} + +static FORCE_INLINE uint64_t rotr64(const uint64_t w, const unsigned c) { + return (w >> c) | (w << (64 - c)); +} + +#endif diff --git a/src/crypto/randomx/blake2/blake2.h b/src/crypto/randomx/blake2/blake2.h new file mode 100644 index 00000000..7ac301ce --- /dev/null +++ b/src/crypto/randomx/blake2/blake2.h @@ -0,0 +1,107 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef PORTABLE_BLAKE2_H +#define PORTABLE_BLAKE2_H + +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2b_constant { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2b_param { + uint8_t digest_length; /* 1 */ + uint8_t key_length; /* 2 */ + uint8_t fanout; /* 3 */ + uint8_t depth; /* 4 */ + uint32_t leaf_length; /* 8 */ + uint64_t node_offset; /* 16 */ + uint8_t node_depth; /* 17 */ + uint8_t inner_length; /* 18 */ + uint8_t reserved[14]; /* 32 */ + uint8_t salt[BLAKE2B_SALTBYTES]; /* 48 */ + uint8_t personal[BLAKE2B_PERSONALBYTES]; /* 64 */ + } blake2b_param; +#pragma pack(pop) + + typedef struct __blake2b_state { + uint64_t h[8]; + uint64_t t[2]; + uint64_t f[2]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + unsigned buflen; + unsigned outlen; + uint8_t last_node; + } blake2b_state; + + /* Ensure param structs have not been wrongly padded */ + /* Poor man's static_assert */ + enum { + blake2_size_check_0 = 1 / !!(CHAR_BIT == 8), + blake2_size_check_2 = + 1 / !!(sizeof(blake2b_param) == sizeof(uint64_t) * CHAR_BIT) + }; + + /* Streaming API */ + int blake2b_init(blake2b_state *S, size_t outlen); + int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, + size_t keylen); + int blake2b_init_param(blake2b_state *S, const blake2b_param *P); + int blake2b_update(blake2b_state *S, const void *in, size_t inlen); + int blake2b_final(blake2b_state *S, void *out, size_t outlen); + + /* Simple API */ + int blake2b(void *out, size_t outlen, const void *in, size_t inlen, + const void *key, size_t keylen); + + /* Argon2 Team - Begin Code */ + int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); + /* Argon2 Team - End Code */ + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c new file mode 100644 index 00000000..5931ee5c --- /dev/null +++ b/src/crypto/randomx/blake2/blake2b.c @@ -0,0 +1,409 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include + +#include "blake2.h" +#include "blake2-impl.h" + +static const uint64_t blake2b_IV[8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; + +static const unsigned int blake2b_sigma[12][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4}, + {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8}, + {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13}, + {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9}, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11}, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10}, + {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5}, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, +}; + +static FORCE_INLINE void blake2b_set_lastnode(blake2b_state *S) { + S->f[1] = (uint64_t)-1; +} + +static FORCE_INLINE void blake2b_set_lastblock(blake2b_state *S) { + if (S->last_node) { + blake2b_set_lastnode(S); + } + S->f[0] = (uint64_t)-1; +} + +static FORCE_INLINE void blake2b_increment_counter(blake2b_state *S, + uint64_t inc) { + S->t[0] += inc; + S->t[1] += (S->t[0] < inc); +} + +static FORCE_INLINE void blake2b_invalidate_state(blake2b_state *S) { + //clear_internal_memory(S, sizeof(*S)); /* wipe */ + blake2b_set_lastblock(S); /* invalidate for further use */ +} + +static FORCE_INLINE void blake2b_init0(blake2b_state *S) { + memset(S, 0, sizeof(*S)); + memcpy(S->h, blake2b_IV, sizeof(S->h)); +} + +int blake2b_init_param(blake2b_state *S, const blake2b_param *P) { + const unsigned char *p = (const unsigned char *)P; + unsigned int i; + + if (NULL == P || NULL == S) { + return -1; + } + + blake2b_init0(S); + /* IV XOR Parameter Block */ + for (i = 0; i < 8; ++i) { + S->h[i] ^= load64(&p[i * sizeof(S->h[i])]); + } + S->outlen = P->digest_length; + return 0; +} + +/* Sequential blake2b initialization */ +int blake2b_init(blake2b_state *S, size_t outlen) { + blake2b_param P; + + if (S == NULL) { + return -1; + } + + if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) { + blake2b_invalidate_state(S); + return -1; + } + + /* Setup Parameter Block for unkeyed BLAKE2 */ + P.digest_length = (uint8_t)outlen; + P.key_length = 0; + P.fanout = 1; + P.depth = 1; + P.leaf_length = 0; + P.node_offset = 0; + P.node_depth = 0; + P.inner_length = 0; + memset(P.reserved, 0, sizeof(P.reserved)); + memset(P.salt, 0, sizeof(P.salt)); + memset(P.personal, 0, sizeof(P.personal)); + + return blake2b_init_param(S, &P); +} + +int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t keylen) { + blake2b_param P; + + if (S == NULL) { + return -1; + } + + if ((outlen == 0) || (outlen > BLAKE2B_OUTBYTES)) { + blake2b_invalidate_state(S); + return -1; + } + + if ((key == 0) || (keylen == 0) || (keylen > BLAKE2B_KEYBYTES)) { + blake2b_invalidate_state(S); + return -1; + } + + /* Setup Parameter Block for keyed BLAKE2 */ + P.digest_length = (uint8_t)outlen; + P.key_length = (uint8_t)keylen; + P.fanout = 1; + P.depth = 1; + P.leaf_length = 0; + P.node_offset = 0; + P.node_depth = 0; + P.inner_length = 0; + memset(P.reserved, 0, sizeof(P.reserved)); + memset(P.salt, 0, sizeof(P.salt)); + memset(P.personal, 0, sizeof(P.personal)); + + if (blake2b_init_param(S, &P) < 0) { + blake2b_invalidate_state(S); + return -1; + } + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + /* Burn the key from stack */ + //clear_internal_memory(block, BLAKE2B_BLOCKBYTES); + } + return 0; +} + +static void blake2b_compress(blake2b_state *S, const uint8_t *block) { + uint64_t m[16]; + uint64_t v[16]; + unsigned int i, r; + + for (i = 0; i < 16; ++i) { + m[i] = load64(block + i * sizeof(m[i])); + } + + for (i = 0; i < 8; ++i) { + v[i] = S->h[i]; + } + + v[8] = blake2b_IV[0]; + v[9] = blake2b_IV[1]; + v[10] = blake2b_IV[2]; + v[11] = blake2b_IV[3]; + v[12] = blake2b_IV[4] ^ S->t[0]; + v[13] = blake2b_IV[5] ^ S->t[1]; + v[14] = blake2b_IV[6] ^ S->f[0]; + v[15] = blake2b_IV[7] ^ S->f[1]; + +#define G(r, i, a, b, c, d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define ROUND(r) \ + do { \ + G(r, 0, v[0], v[4], v[8], v[12]); \ + G(r, 1, v[1], v[5], v[9], v[13]); \ + G(r, 2, v[2], v[6], v[10], v[14]); \ + G(r, 3, v[3], v[7], v[11], v[15]); \ + G(r, 4, v[0], v[5], v[10], v[15]); \ + G(r, 5, v[1], v[6], v[11], v[12]); \ + G(r, 6, v[2], v[7], v[8], v[13]); \ + G(r, 7, v[3], v[4], v[9], v[14]); \ + } while ((void)0, 0) + + for (r = 0; r < 12; ++r) { + ROUND(r); + } + + for (i = 0; i < 8; ++i) { + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + } + +#undef G +#undef ROUND +} + +int blake2b_update(blake2b_state *S, const void *in, size_t inlen) { + const uint8_t *pin = (const uint8_t *)in; + + if (inlen == 0) { + return 0; + } + + /* Sanity check */ + if (S == NULL || in == NULL) { + return -1; + } + + /* Is this a reused state? */ + if (S->f[0] != 0) { + return -1; + } + + if (S->buflen + inlen > BLAKE2B_BLOCKBYTES) { + /* Complete current block */ + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + memcpy(&S->buf[left], pin, fill); + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, S->buf); + S->buflen = 0; + inlen -= fill; + pin += fill; + /* Avoid buffer copies when possible */ + while (inlen > BLAKE2B_BLOCKBYTES) { + blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); + blake2b_compress(S, pin); + inlen -= BLAKE2B_BLOCKBYTES; + pin += BLAKE2B_BLOCKBYTES; + } + } + memcpy(&S->buf[S->buflen], pin, inlen); + S->buflen += (unsigned int)inlen; + return 0; +} + +int blake2b_final(blake2b_state *S, void *out, size_t outlen) { + uint8_t buffer[BLAKE2B_OUTBYTES] = { 0 }; + unsigned int i; + + /* Sanity checks */ + if (S == NULL || out == NULL || outlen < S->outlen) { + return -1; + } + + /* Is this a reused state? */ + if (S->f[0] != 0) { + return -1; + } + + blake2b_increment_counter(S, S->buflen); + blake2b_set_lastblock(S); + memset(&S->buf[S->buflen], 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + + for (i = 0; i < 8; ++i) { /* Output full hash to temp buffer */ + store64(buffer + sizeof(S->h[i]) * i, S->h[i]); + } + + memcpy(out, buffer, S->outlen); + //clear_internal_memory(buffer, sizeof(buffer)); + //clear_internal_memory(S->buf, sizeof(S->buf)); + //clear_internal_memory(S->h, sizeof(S->h)); + return 0; +} + +int blake2b(void *out, size_t outlen, const void *in, size_t inlen, + const void *key, size_t keylen) { + blake2b_state S; + int ret = -1; + + /* Verify parameters */ + if (NULL == in && inlen > 0) { + goto fail; + } + + if (NULL == out || outlen == 0 || outlen > BLAKE2B_OUTBYTES) { + goto fail; + } + + if ((NULL == key && keylen > 0) || keylen > BLAKE2B_KEYBYTES) { + goto fail; + } + + if (keylen > 0) { + if (blake2b_init_key(&S, outlen, key, keylen) < 0) { + goto fail; + } + } + else { + if (blake2b_init(&S, outlen) < 0) { + goto fail; + } + } + + if (blake2b_update(&S, in, inlen) < 0) { + goto fail; + } + ret = blake2b_final(&S, out, outlen); + +fail: + //clear_internal_memory(&S, sizeof(S)); + return ret; +} + +/* Argon2 Team - Begin Code */ +int rxa2_blake2b_long(void *pout, size_t outlen, const void *in, size_t inlen) { + uint8_t *out = (uint8_t *)pout; + blake2b_state blake_state; + uint8_t outlen_bytes[sizeof(uint32_t)] = { 0 }; + int ret = -1; + + if (outlen > UINT32_MAX) { + goto fail; + } + + /* Ensure little-endian byte order! */ + store32(outlen_bytes, (uint32_t)outlen); + +#define TRY(statement) \ + do { \ + ret = statement; \ + if (ret < 0) { \ + goto fail; \ + } \ + } while ((void)0, 0) + + if (outlen <= BLAKE2B_OUTBYTES) { + TRY(blake2b_init(&blake_state, outlen)); + TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(blake2b_update(&blake_state, in, inlen)); + TRY(blake2b_final(&blake_state, out, outlen)); + } + else { + uint32_t toproduce; + uint8_t out_buffer[BLAKE2B_OUTBYTES]; + uint8_t in_buffer[BLAKE2B_OUTBYTES]; + TRY(blake2b_init(&blake_state, BLAKE2B_OUTBYTES)); + TRY(blake2b_update(&blake_state, outlen_bytes, sizeof(outlen_bytes))); + TRY(blake2b_update(&blake_state, in, inlen)); + TRY(blake2b_final(&blake_state, out_buffer, BLAKE2B_OUTBYTES)); + memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce = (uint32_t)outlen - BLAKE2B_OUTBYTES / 2; + + while (toproduce > BLAKE2B_OUTBYTES) { + memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + TRY(blake2b(out_buffer, BLAKE2B_OUTBYTES, in_buffer, + BLAKE2B_OUTBYTES, NULL, 0)); + memcpy(out, out_buffer, BLAKE2B_OUTBYTES / 2); + out += BLAKE2B_OUTBYTES / 2; + toproduce -= BLAKE2B_OUTBYTES / 2; + } + + memcpy(in_buffer, out_buffer, BLAKE2B_OUTBYTES); + TRY(blake2b(out_buffer, toproduce, in_buffer, BLAKE2B_OUTBYTES, NULL, + 0)); + memcpy(out, out_buffer, toproduce); + } +fail: + //clear_internal_memory(&blake_state, sizeof(blake_state)); + return ret; +#undef TRY +} +/* Argon2 Team - End Code */ + diff --git a/src/crypto/randomx/blake2/blamka-round-ref.h b/src/crypto/randomx/blake2/blamka-round-ref.h new file mode 100644 index 00000000..f1fb50bf --- /dev/null +++ b/src/crypto/randomx/blake2/blamka-round-ref.h @@ -0,0 +1,73 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#ifndef BLAKE_ROUND_MKA_H +#define BLAKE_ROUND_MKA_H + +#include "blake2.h" +#include "blake2-impl.h" + + /* designed by the Lyra PHC team */ +static FORCE_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) { + const uint64_t m = UINT64_C(0xFFFFFFFF); + const uint64_t xy = (x & m) * (y & m); + return x + y + 2 * xy; +} + +#define G(a, b, c, d) \ + do { \ + a = fBlaMka(a, b); \ + d = rotr64(d ^ a, 32); \ + c = fBlaMka(c, d); \ + b = rotr64(b ^ c, 24); \ + a = fBlaMka(a, b); \ + d = rotr64(d ^ a, 16); \ + c = fBlaMka(c, d); \ + b = rotr64(b ^ c, 63); \ + } while ((void)0, 0) + +#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \ + v12, v13, v14, v15) \ + do { \ + G(v0, v4, v8, v12); \ + G(v1, v5, v9, v13); \ + G(v2, v6, v10, v14); \ + G(v3, v7, v11, v15); \ + G(v0, v5, v10, v15); \ + G(v1, v6, v11, v12); \ + G(v2, v7, v8, v13); \ + G(v3, v4, v9, v14); \ + } while ((void)0, 0) + +#endif diff --git a/src/crypto/randomx/blake2/endian.h b/src/crypto/randomx/blake2/endian.h new file mode 100644 index 00000000..c7afed26 --- /dev/null +++ b/src/crypto/randomx/blake2/endian.h @@ -0,0 +1,107 @@ +#pragma once +#include +#include + +#if defined(_MSC_VER) +#define FORCE_INLINE __inline +#elif defined(__GNUC__) || defined(__clang__) +#define FORCE_INLINE __inline__ +#else +#define FORCE_INLINE +#endif + + /* Argon2 Team - Begin Code */ + /* + Not an exhaustive list, but should cover the majority of modern platforms + Additionally, the code will always be correct---this is only a performance + tweak. + */ +#if (defined(__BYTE_ORDER__) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ + defined(__LITTLE_ENDIAN__) || defined(__ARMEL__) || defined(__MIPSEL__) || \ + defined(__AARCH64EL__) || defined(__amd64__) || defined(__i386__) || \ + defined(_M_IX86) || defined(_M_X64) || defined(_M_AMD64) || \ + defined(_M_ARM) +#define NATIVE_LITTLE_ENDIAN +#endif + /* Argon2 Team - End Code */ + +static FORCE_INLINE uint32_t load32(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = (const uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static FORCE_INLINE uint64_t load64_native(const void *src) { + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +} + +static FORCE_INLINE uint64_t load64(const void *src) { +#if defined(NATIVE_LITTLE_ENDIAN) + return load64_native(src); +#else + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + w |= (uint64_t)(*p++) << 48; + w |= (uint64_t)(*p++) << 56; + return w; +#endif +} + +static FORCE_INLINE void store32(void *dst, uint32_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static FORCE_INLINE void store64_native(void *dst, uint64_t w) { + memcpy(dst, &w, sizeof w); +} + +static FORCE_INLINE void store64(void *dst, uint64_t w) { +#if defined(NATIVE_LITTLE_ENDIAN) + store64_native(dst, w); +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; + w >>= 8; + *p++ = (uint8_t)w; +#endif +} diff --git a/src/crypto/randomx/blake2_generator.cpp b/src/crypto/randomx/blake2_generator.cpp new file mode 100644 index 00000000..dcf51cec --- /dev/null +++ b/src/crypto/randomx/blake2_generator.cpp @@ -0,0 +1,62 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include "blake2/blake2.h" +#include "blake2/endian.h" +#include "blake2_generator.hpp" + +namespace randomx { + + constexpr int maxSeedSize = 60; + + Blake2Generator::Blake2Generator(const void* seed, size_t seedSize, int nonce) : dataIndex(sizeof(data)) { + memset(data, 0, sizeof(data)); + memcpy(data, seed, seedSize > maxSeedSize ? maxSeedSize : seedSize); + store32(&data[maxSeedSize], nonce); + } + + uint8_t Blake2Generator::getByte() { + checkData(1); + return data[dataIndex++]; + } + + uint32_t Blake2Generator::getInt32() { + checkData(4); + auto ret = load32(&data[dataIndex]); + dataIndex += 4; + return ret; + } + + void Blake2Generator::checkData(const size_t bytesNeeded) { + if (dataIndex + bytesNeeded > sizeof(data)) { + blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); + dataIndex = 0; + } + } +} \ No newline at end of file diff --git a/src/crypto/randomx/blake2_generator.hpp b/src/crypto/randomx/blake2_generator.hpp new file mode 100644 index 00000000..b5ac0801 --- /dev/null +++ b/src/crypto/randomx/blake2_generator.hpp @@ -0,0 +1,46 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +namespace randomx { + + class Blake2Generator { + public: + Blake2Generator(const void* seed, size_t seedSize, int nonce = 0); + uint8_t getByte(); + uint32_t getInt32(); + private: + void checkData(const size_t); + + uint8_t data[64]; + size_t dataIndex; + }; +} \ No newline at end of file diff --git a/src/crypto/randomx/bytecode_machine.cpp b/src/crypto/randomx/bytecode_machine.cpp new file mode 100644 index 00000000..8447318e --- /dev/null +++ b/src/crypto/randomx/bytecode_machine.cpp @@ -0,0 +1,482 @@ +/* +Copyright (c) 2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "bytecode_machine.hpp" +#include "reciprocal.h" + +namespace randomx { + + const int_reg_t BytecodeMachine::zero = 0; + +#define INSTR_CASE(x) case InstructionType::x: \ + exe_ ## x(ibc, pc, scratchpad, config); \ + break; + + void BytecodeMachine::executeInstruction(RANDOMX_EXE_ARGS) { + switch (ibc.type) + { + INSTR_CASE(IADD_RS) + INSTR_CASE(IADD_M) + INSTR_CASE(ISUB_R) + INSTR_CASE(ISUB_M) + INSTR_CASE(IMUL_R) + INSTR_CASE(IMUL_M) + INSTR_CASE(IMULH_R) + INSTR_CASE(IMULH_M) + INSTR_CASE(ISMULH_R) + INSTR_CASE(ISMULH_M) + INSTR_CASE(INEG_R) + INSTR_CASE(IXOR_R) + INSTR_CASE(IXOR_M) + INSTR_CASE(IROR_R) + INSTR_CASE(IROL_R) + INSTR_CASE(ISWAP_R) + INSTR_CASE(FSWAP_R) + INSTR_CASE(FADD_R) + INSTR_CASE(FADD_M) + INSTR_CASE(FSUB_R) + INSTR_CASE(FSUB_M) + INSTR_CASE(FSCAL_R) + INSTR_CASE(FMUL_R) + INSTR_CASE(FDIV_M) + INSTR_CASE(FSQRT_R) + INSTR_CASE(CBRANCH) + INSTR_CASE(CFROUND) + INSTR_CASE(ISTORE) + + case InstructionType::NOP: + break; + + case InstructionType::IMUL_RCP: //executed as IMUL_R + default: + UNREACHABLE; + } + } + + void BytecodeMachine::compileInstruction(RANDOMX_GEN_ARGS) { + int opcode = instr.opcode; + + if (opcode < RandomX_CurrentConfig.CEIL_IADD_RS) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_RS; + ibc.idst = &nreg->r[dst]; + if (dst != RegisterNeedsDisplacement) { + ibc.isrc = &nreg->r[src]; + ibc.shift = instr.getModShift(); + ibc.imm = 0; + } + else { + ibc.isrc = &nreg->r[src]; + ibc.shift = instr.getModShift(); + ibc.imm = signExtend2sCompl(instr.getImm32()); + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IADD_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IADD_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_ISUB_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISUB_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.getImm32()); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_ISUB_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISUB_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IMUL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.getImm32()); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IMUL_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMUL_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_R; + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IMULH_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_ISMULH_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_R; + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_ISMULH_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISMULH_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IMUL_RCP) { + uint64_t divisor = instr.getImm32(); + if (!isPowerOf2(divisor)) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::IMUL_R; + ibc.idst = &nreg->r[dst]; + ibc.imm = randomx_reciprocal(divisor); + ibc.isrc = &ibc.imm; + registerUsage[dst] = i; + } + else { + ibc.type = InstructionType::NOP; + } + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_INEG_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::INEG_R; + ibc.idst = &nreg->r[dst]; + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IXOR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = signExtend2sCompl(instr.getImm32()); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IXOR_M) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IXOR_M; + ibc.idst = &nreg->r[dst]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (src != dst) { + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + ibc.isrc = &zero; + ibc.memMask = ScratchpadL3Mask; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IROR_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROR_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = instr.getImm32(); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_IROL_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::IROL_R; + ibc.idst = &nreg->r[dst]; + if (src != dst) { + ibc.isrc = &nreg->r[src]; + } + else { + ibc.imm = instr.getImm32(); + ibc.isrc = &ibc.imm; + } + registerUsage[dst] = i; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_ISWAP_R) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + if (src != dst) { + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + ibc.type = InstructionType::ISWAP_R; + registerUsage[dst] = i; + registerUsage[src] = i; + } + else { + ibc.type = InstructionType::NOP; + } + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FSWAP_R) { + auto dst = instr.dst % RegistersCount; + ibc.type = InstructionType::FSWAP_R; + if (dst < RegisterCountFlt) + ibc.fdst = &nreg->f[dst]; + else + ibc.fdst = &nreg->e[dst - RegisterCountFlt]; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FADD_R) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; + ibc.type = InstructionType::FADD_R; + ibc.fdst = &nreg->f[dst]; + ibc.fsrc = &nreg->a[src]; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FADD_M) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::FADD_M; + ibc.fdst = &nreg->f[dst]; + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + ibc.imm = signExtend2sCompl(instr.getImm32()); + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FSUB_R) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; + ibc.type = InstructionType::FSUB_R; + ibc.fdst = &nreg->f[dst]; + ibc.fsrc = &nreg->a[src]; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FSUB_M) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::FSUB_M; + ibc.fdst = &nreg->f[dst]; + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + ibc.imm = signExtend2sCompl(instr.getImm32()); + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FSCAL_R) { + auto dst = instr.dst % RegisterCountFlt; + ibc.fdst = &nreg->f[dst]; + ibc.type = InstructionType::FSCAL_R; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FMUL_R) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegisterCountFlt; + ibc.type = InstructionType::FMUL_R; + ibc.fdst = &nreg->e[dst]; + ibc.fsrc = &nreg->a[src]; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FDIV_M) { + auto dst = instr.dst % RegisterCountFlt; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::FDIV_M; + ibc.fdst = &nreg->e[dst]; + ibc.isrc = &nreg->r[src]; + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + ibc.imm = signExtend2sCompl(instr.getImm32()); + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_FSQRT_R) { + auto dst = instr.dst % RegisterCountFlt; + ibc.type = InstructionType::FSQRT_R; + ibc.fdst = &nreg->e[dst]; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_CBRANCH) { + ibc.type = InstructionType::CBRANCH; + //jump condition + int creg = instr.dst % RegistersCount; + ibc.idst = &nreg->r[creg]; + ibc.target = registerUsage[creg]; + int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; + ibc.imm = signExtend2sCompl(instr.getImm32()) | (1ULL << shift); + if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0) //clear the bit below the condition mask - this limits the number of successive jumps to 2 + ibc.imm &= ~(1ULL << (shift - 1)); + ibc.memMask = RandomX_CurrentConfig.ConditionMask_Calculated << shift; + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + registerUsage[j] = i; + } + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_CFROUND) { + auto src = instr.src % RegistersCount; + ibc.isrc = &nreg->r[src]; + ibc.type = InstructionType::CFROUND; + ibc.imm = instr.getImm32() & 63; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_ISTORE) { + auto dst = instr.dst % RegistersCount; + auto src = instr.src % RegistersCount; + ibc.type = InstructionType::ISTORE; + ibc.idst = &nreg->r[dst]; + ibc.isrc = &nreg->r[src]; + ibc.imm = signExtend2sCompl(instr.getImm32()); + if (instr.getModCond() < StoreL3Condition) + ibc.memMask = (instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + else + ibc.memMask = ScratchpadL3Mask; + return; + } + + if (opcode < RandomX_CurrentConfig.CEIL_NOP) { + ibc.type = InstructionType::NOP; + return; + } + + UNREACHABLE; + } +} diff --git a/src/crypto/randomx/bytecode_machine.hpp b/src/crypto/randomx/bytecode_machine.hpp new file mode 100644 index 00000000..3b05f378 --- /dev/null +++ b/src/crypto/randomx/bytecode_machine.hpp @@ -0,0 +1,288 @@ +/* +Copyright (c) 2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include "common.hpp" +#include "intrin_portable.h" +#include "instruction.hpp" +#include "program.hpp" + +namespace randomx { + + //register file in machine byte order + struct NativeRegisterFile { + int_reg_t r[RegistersCount] = { 0 }; + rx_vec_f128 f[RegisterCountFlt]; + rx_vec_f128 e[RegisterCountFlt]; + rx_vec_f128 a[RegisterCountFlt]; + }; + + struct InstructionByteCode { + union { + int_reg_t* idst; + rx_vec_f128* fdst; + }; + union { + const int_reg_t* isrc; + const rx_vec_f128* fsrc; + }; + union { + uint64_t imm; + int64_t simm; + }; + InstructionType type; + union { + int16_t target; + uint16_t shift; + }; + uint32_t memMask; + }; + +#define RANDOMX_EXE_ARGS InstructionByteCode& ibc, int& pc, uint8_t* scratchpad, ProgramConfiguration& config +#define RANDOMX_GEN_ARGS Instruction& instr, int i, InstructionByteCode& ibc + + class BytecodeMachine; + + typedef void(BytecodeMachine::*InstructionGenBytecode)(RANDOMX_GEN_ARGS); + + class BytecodeMachine { + public: + void beginCompilation(NativeRegisterFile& regFile) { + for (unsigned i = 0; i < RegistersCount; ++i) { + registerUsage[i] = -1; + } + nreg = ®File; + } + + void compileProgram(Program& program, InstructionByteCode* bytecode, NativeRegisterFile& regFile) { + beginCompilation(regFile); + for (unsigned i = 0; i < RandomX_CurrentConfig.ProgramSize; ++i) { + auto& instr = program(i); + auto& ibc = bytecode[i]; + compileInstruction(instr, i, ibc); + } + } + + static void executeBytecode(InstructionByteCode* bytecode, uint8_t* scratchpad, ProgramConfiguration& config) { + for (int pc = 0; pc < RandomX_CurrentConfig.ProgramSize; ++pc) { + auto& ibc = bytecode[pc]; + executeInstruction(ibc, pc, scratchpad, config); + } + } + + void compileInstruction(RANDOMX_GEN_ARGS) +#ifdef RANDOMX_GEN_TABLE + { + auto generator = genTable[instr.opcode]; + (this->*generator)(instr, i, ibc); + } +#else + ; +#endif + + static void executeInstruction(RANDOMX_EXE_ARGS); + + static void exe_IADD_RS(RANDOMX_EXE_ARGS) { + *ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm; + } + + static void exe_IADD_M(RANDOMX_EXE_ARGS) { + *ibc.idst += load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_ISUB_R(RANDOMX_EXE_ARGS) { + *ibc.idst -= *ibc.isrc; + } + + static void exe_ISUB_M(RANDOMX_EXE_ARGS) { + *ibc.idst -= load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_IMUL_R(RANDOMX_EXE_ARGS) { + *ibc.idst *= *ibc.isrc; + } + + static void exe_IMUL_M(RANDOMX_EXE_ARGS) { + *ibc.idst *= load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_IMULH_R(RANDOMX_EXE_ARGS) { + *ibc.idst = mulh(*ibc.idst, *ibc.isrc); + } + + static void exe_IMULH_M(RANDOMX_EXE_ARGS) { + *ibc.idst = mulh(*ibc.idst, load64(getScratchpadAddress(ibc, scratchpad))); + } + + static void exe_ISMULH_R(RANDOMX_EXE_ARGS) { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(*ibc.isrc)); + } + + static void exe_ISMULH_M(RANDOMX_EXE_ARGS) { + *ibc.idst = smulh(unsigned64ToSigned2sCompl(*ibc.idst), unsigned64ToSigned2sCompl(load64(getScratchpadAddress(ibc, scratchpad)))); + } + + static void exe_INEG_R(RANDOMX_EXE_ARGS) { + *ibc.idst = ~(*ibc.idst) + 1; //two's complement negative + } + + static void exe_IXOR_R(RANDOMX_EXE_ARGS) { + *ibc.idst ^= *ibc.isrc; + } + + static void exe_IXOR_M(RANDOMX_EXE_ARGS) { + *ibc.idst ^= load64(getScratchpadAddress(ibc, scratchpad)); + } + + static void exe_IROR_R(RANDOMX_EXE_ARGS) { + *ibc.idst = rotr64(*ibc.idst, *ibc.isrc & 63); + } + + static void exe_IROL_R(RANDOMX_EXE_ARGS) { + *ibc.idst = rotl64(*ibc.idst, *ibc.isrc & 63); + } + + static void exe_ISWAP_R(RANDOMX_EXE_ARGS) { + int_reg_t temp = *ibc.isrc; + *(int_reg_t*)ibc.isrc = *ibc.idst; + *ibc.idst = temp; + } + + static void exe_FSWAP_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_swap_vec_f128(*ibc.fdst); + } + + static void exe_FADD_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_add_vec_f128(*ibc.fdst, *ibc.fsrc); + } + + static void exe_FADD_M(RANDOMX_EXE_ARGS) { + rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad)); + *ibc.fdst = rx_add_vec_f128(*ibc.fdst, fsrc); + } + + static void exe_FSUB_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_sub_vec_f128(*ibc.fdst, *ibc.fsrc); + } + + static void exe_FSUB_M(RANDOMX_EXE_ARGS) { + rx_vec_f128 fsrc = rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad)); + *ibc.fdst = rx_sub_vec_f128(*ibc.fdst, fsrc); + } + + static void exe_FSCAL_R(RANDOMX_EXE_ARGS) { + const rx_vec_f128 mask = rx_set1_vec_f128(0x80F0000000000000); + *ibc.fdst = rx_xor_vec_f128(*ibc.fdst, mask); + } + + static void exe_FMUL_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_mul_vec_f128(*ibc.fdst, *ibc.fsrc); + } + + static void exe_FDIV_M(RANDOMX_EXE_ARGS) { + rx_vec_f128 fsrc = maskRegisterExponentMantissa( + config, + rx_cvt_packed_int_vec_f128(getScratchpadAddress(ibc, scratchpad)) + ); + *ibc.fdst = rx_div_vec_f128(*ibc.fdst, fsrc); + } + + static void exe_FSQRT_R(RANDOMX_EXE_ARGS) { + *ibc.fdst = rx_sqrt_vec_f128(*ibc.fdst); + } + + static void exe_CBRANCH(RANDOMX_EXE_ARGS) { + *ibc.idst += ibc.imm; + if ((*ibc.idst & ibc.memMask) == 0) { + pc = ibc.target; + } + } + + static void exe_CFROUND(RANDOMX_EXE_ARGS) { + rx_set_rounding_mode(rotr64(*ibc.isrc, ibc.imm) % 4); + } + + static void exe_ISTORE(RANDOMX_EXE_ARGS) { + store64(scratchpad + ((*ibc.idst + ibc.imm) & ibc.memMask), *ibc.isrc); + } + protected: + static rx_vec_f128 maskRegisterExponentMantissa(ProgramConfiguration& config, rx_vec_f128 x) { + const rx_vec_f128 xmantissaMask = rx_set_vec_f128(dynamicMantissaMask, dynamicMantissaMask); + const rx_vec_f128 xexponentMask = rx_load_vec_f128((const double*)&config.eMask); + x = rx_and_vec_f128(x, xmantissaMask); + x = rx_or_vec_f128(x, xexponentMask); + return x; + } + + private: + static const int_reg_t zero; + int registerUsage[RegistersCount]; + NativeRegisterFile* nreg; + + static void* getScratchpadAddress(InstructionByteCode& ibc, uint8_t* scratchpad) { + uint32_t addr = (*ibc.isrc + ibc.imm) & ibc.memMask; + return scratchpad + addr; + } + +#ifdef RANDOMX_GEN_TABLE + static InstructionGenBytecode genTable[256]; + + void gen_IADD_RS(RANDOMX_GEN_ARGS); + void gen_IADD_M(RANDOMX_GEN_ARGS); + void gen_ISUB_R(RANDOMX_GEN_ARGS); + void gen_ISUB_M(RANDOMX_GEN_ARGS); + void gen_IMUL_R(RANDOMX_GEN_ARGS); + void gen_IMUL_M(RANDOMX_GEN_ARGS); + void gen_IMULH_R(RANDOMX_GEN_ARGS); + void gen_IMULH_M(RANDOMX_GEN_ARGS); + void gen_ISMULH_R(RANDOMX_GEN_ARGS); + void gen_ISMULH_M(RANDOMX_GEN_ARGS); + void gen_IMUL_RCP(RANDOMX_GEN_ARGS); + void gen_INEG_R(RANDOMX_GEN_ARGS); + void gen_IXOR_R(RANDOMX_GEN_ARGS); + void gen_IXOR_M(RANDOMX_GEN_ARGS); + void gen_IROR_R(RANDOMX_GEN_ARGS); + void gen_IROL_R(RANDOMX_GEN_ARGS); + void gen_ISWAP_R(RANDOMX_GEN_ARGS); + void gen_FSWAP_R(RANDOMX_GEN_ARGS); + void gen_FADD_R(RANDOMX_GEN_ARGS); + void gen_FADD_M(RANDOMX_GEN_ARGS); + void gen_FSUB_R(RANDOMX_GEN_ARGS); + void gen_FSUB_M(RANDOMX_GEN_ARGS); + void gen_FSCAL_R(RANDOMX_GEN_ARGS); + void gen_FMUL_R(RANDOMX_GEN_ARGS); + void gen_FDIV_M(RANDOMX_GEN_ARGS); + void gen_FSQRT_R(RANDOMX_GEN_ARGS); + void gen_CBRANCH(RANDOMX_GEN_ARGS); + void gen_CFROUND(RANDOMX_GEN_ARGS); + void gen_ISTORE(RANDOMX_GEN_ARGS); + void gen_NOP(RANDOMX_GEN_ARGS); +#endif + }; +} diff --git a/src/crypto/randomx/common.hpp b/src/crypto/randomx/common.hpp new file mode 100644 index 00000000..7f7ea0ed --- /dev/null +++ b/src/crypto/randomx/common.hpp @@ -0,0 +1,173 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "blake2/endian.h" +#include "configuration.h" +#include "randomx.h" + +namespace randomx { + + //static_assert(RANDOMX_ARGON_MEMORY > 0, "RANDOMX_ARGON_MEMORY must be greater than 0."); + //static_assert((RANDOMX_ARGON_MEMORY & (RANDOMX_ARGON_MEMORY - 1)) == 0, "RANDOMX_ARGON_MEMORY must be a power of 2."); + //static_assert(RANDOMX_DATASET_BASE_SIZE >= 64, "RANDOMX_DATASET_BASE_SIZE must be at least 64."); + //static_assert((RANDOMX_DATASET_BASE_SIZE & (RANDOMX_DATASET_BASE_SIZE - 1)) == 0, "RANDOMX_DATASET_BASE_SIZE must be a power of 2."); + //static_assert(RANDOMX_DATASET_BASE_SIZE <= 4294967296ULL, "RANDOMX_DATASET_BASE_SIZE must not exceed 4294967296."); + //static_assert(RANDOMX_DATASET_EXTRA_SIZE % 64 == 0, "RANDOMX_DATASET_EXTRA_SIZE must be divisible by 64."); + //static_assert((uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE <= 17179869184, "Dataset size must not exceed 16 GiB."); + //static_assert(RANDOMX_PROGRAM_SIZE > 0, "RANDOMX_PROGRAM_SIZE must be greater than 0"); + //static_assert(RANDOMX_PROGRAM_ITERATIONS > 0, "RANDOMX_PROGRAM_ITERATIONS must be greater than 0"); + //static_assert(RANDOMX_PROGRAM_COUNT > 0, "RANDOMX_PROGRAM_COUNT must be greater than 0"); + //static_assert((RANDOMX_SCRATCHPAD_L3 & (RANDOMX_SCRATCHPAD_L3 - 1)) == 0, "RANDOMX_SCRATCHPAD_L3 must be a power of 2."); + //static_assert(RANDOMX_SCRATCHPAD_L3 >= RANDOMX_SCRATCHPAD_L2, "RANDOMX_SCRATCHPAD_L3 must be greater than or equal to RANDOMX_SCRATCHPAD_L2."); + //static_assert((RANDOMX_SCRATCHPAD_L2 & (RANDOMX_SCRATCHPAD_L2 - 1)) == 0, "RANDOMX_SCRATCHPAD_L2 must be a power of 2."); + //static_assert(RANDOMX_SCRATCHPAD_L2 >= RANDOMX_SCRATCHPAD_L1, "RANDOMX_SCRATCHPAD_L2 must be greater than or equal to RANDOMX_SCRATCHPAD_L1."); + //static_assert(RANDOMX_SCRATCHPAD_L1 >= 64, "RANDOMX_SCRATCHPAD_L1 must be at least 64."); + //static_assert((RANDOMX_SCRATCHPAD_L1 & (RANDOMX_SCRATCHPAD_L1 - 1)) == 0, "RANDOMX_SCRATCHPAD_L1 must be a power of 2."); + //static_assert(RANDOMX_CACHE_ACCESSES > 1, "RANDOMX_CACHE_ACCESSES must be greater than 1"); + //static_assert(RANDOMX_SUPERSCALAR_LATENCY > 0, "RANDOMX_SUPERSCALAR_LATENCY must be greater than 0"); + //static_assert(RANDOMX_JUMP_BITS > 0, "RANDOMX_JUMP_BITS must be greater than 0."); + //static_assert(RANDOMX_JUMP_OFFSET >= 0, "RANDOMX_JUMP_OFFSET must be greater than or equal to 0."); + //static_assert(RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET <= 16, "RANDOMX_JUMP_BITS + RANDOMX_JUMP_OFFSET must not exceed 16."); + + /*constexpr int wtSum = RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + \ + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + \ + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M + RANDOMX_FREQ_IMUL_RCP + \ + RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R + RANDOMX_FREQ_ISWAP_R + \ + RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + \ + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R + RANDOMX_FREQ_CBRANCH + \ + RANDOMX_FREQ_CFROUND + RANDOMX_FREQ_ISTORE + RANDOMX_FREQ_NOP;*/ + + //static_assert(wtSum == 256, "Sum of instruction frequencies must be 256."); + + + constexpr uint32_t ArgonBlockSize = 1024; + constexpr int SuperscalarMaxSize = 3 * RANDOMX_SUPERSCALAR_MAX_LATENCY + 2; + constexpr size_t CacheLineSize = RANDOMX_DATASET_ITEM_SIZE; + #define ScratchpadSize RandomX_CurrentConfig.ScratchpadL3_Size + #define CacheLineAlignMask RandomX_CurrentConfig.CacheLineAlignMask_Calculated + #define DatasetExtraItems RandomX_CurrentConfig.DatasetExtraItems_Calculated + constexpr int StoreL3Condition = 14; + + //Prevent some unsafe configurations. +#ifndef RANDOMX_UNSAFE + //static_assert((uint64_t)ArgonBlockSize * RANDOMX_CACHE_ACCESSES * RANDOMX_ARGON_MEMORY + 33554432 >= (uint64_t)RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE, "Unsafe configuration: Memory-time tradeoffs"); + //static_assert((128 + RANDOMX_PROGRAM_SIZE * RANDOMX_FREQ_ISTORE / 256) * (RANDOMX_PROGRAM_COUNT * RANDOMX_PROGRAM_ITERATIONS) >= RANDOMX_SCRATCHPAD_L3, "Unsafe configuration: Insufficient Scratchpad writes"); + //static_assert(RANDOMX_PROGRAM_COUNT > 1, "Unsafe configuration: Program filtering strategies"); + //static_assert(RANDOMX_PROGRAM_SIZE >= 64, "Unsafe configuration: Low program entropy"); + //static_assert(RANDOMX_PROGRAM_ITERATIONS >= 400, "Unsafe configuration: High compilation overhead"); +#endif + +#ifdef TRACE + constexpr bool trace = true; +#else + constexpr bool trace = false; +#endif + +#ifndef UNREACHABLE +#ifdef __GNUC__ +#define UNREACHABLE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE __assume(false) +#else +#define UNREACHABLE +#endif +#endif + +#if defined(_M_X64) || defined(__x86_64__) + #define RANDOMX_HAVE_COMPILER 1 + class JitCompilerX86; + using JitCompiler = JitCompilerX86; +#elif defined(__aarch64__) + #define RANDOMX_HAVE_COMPILER 0 + class JitCompilerA64; + using JitCompiler = JitCompilerA64; +#else + #define RANDOMX_HAVE_COMPILER 0 + class JitCompilerFallback; + using JitCompiler = JitCompilerFallback; +#endif + + using addr_t = uint32_t; + + using int_reg_t = uint64_t; + + struct fpu_reg_t { + double lo; + double hi; + }; + + #define ScratchpadL1Mask RandomX_CurrentConfig.ScratchpadL1Mask_Calculated + #define ScratchpadL1Mask16 RandomX_CurrentConfig.ScratchpadL1Mask16_Calculated + #define ScratchpadL2Mask RandomX_CurrentConfig.ScratchpadL2Mask_Calculated + #define ScratchpadL2Mask16 RandomX_CurrentConfig.ScratchpadL2Mask16_Calculated + #define ScratchpadL3Mask RandomX_CurrentConfig.ScratchpadL3Mask_Calculated + #define ScratchpadL3Mask64 RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated + constexpr int RegistersCount = 8; + constexpr int RegisterCountFlt = RegistersCount / 2; + constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register + constexpr int RegisterNeedsSib = 4; //x86 r12 register + + inline bool isPowerOf2(uint64_t x) { + return (x & (x - 1)) == 0; + } + + constexpr int mantissaSize = 52; + constexpr int exponentSize = 11; + constexpr uint64_t mantissaMask = (1ULL << mantissaSize) - 1; + constexpr uint64_t exponentMask = (1ULL << exponentSize) - 1; + constexpr int exponentBias = 1023; + constexpr int dynamicExponentBits = 4; + constexpr int staticExponentBits = 4; + constexpr uint64_t constExponentBits = 0x300; + constexpr uint64_t dynamicMantissaMask = (1ULL << (mantissaSize + dynamicExponentBits)) - 1; + + struct MemoryRegisters { + addr_t mx, ma; + uint8_t* memory = nullptr; + }; + + //register file in little-endian byte order + struct RegisterFile { + int_reg_t r[RegistersCount]; + fpu_reg_t f[RegisterCountFlt]; + fpu_reg_t e[RegisterCountFlt]; + fpu_reg_t a[RegisterCountFlt]; + }; + + typedef void(ProgramFunc)(RegisterFile&, MemoryRegisters&, uint8_t* /* scratchpad */, uint64_t); + typedef void(DatasetInitFunc)(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); + + typedef void(DatasetDeallocFunc)(randomx_dataset*); + typedef void(CacheDeallocFunc)(randomx_cache*); + typedef void(CacheInitializeFunc)(randomx_cache*, const void*, size_t); +} diff --git a/src/crypto/randomx/configuration.h b/src/crypto/randomx/configuration.h new file mode 100644 index 00000000..678cb2f8 --- /dev/null +++ b/src/crypto/randomx/configuration.h @@ -0,0 +1,47 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +// Increase it if some configs use more cache accesses +#define RANDOMX_CACHE_MAX_ACCESSES 16 + +// Increase it if some configs use larger superscalar latency +#define RANDOMX_SUPERSCALAR_MAX_LATENCY 256 + +// Increase it if some configs use larger cache +#define RANDOMX_CACHE_MAX_SIZE 268435456 + +// Increase it if some configs use larger dataset +#define RANDOMX_DATASET_MAX_SIZE 2181038080 + +// Increase it if some configs use larger programs +#define RANDOMX_PROGRAM_MAX_SIZE 512 + +// Increase it if some configs use larger scratchpad +#define RANDOMX_SCRATCHPAD_L3_MAX_SIZE 2097152 diff --git a/src/crypto/randomx/dataset.cpp b/src/crypto/randomx/dataset.cpp new file mode 100644 index 00000000..3951b55b --- /dev/null +++ b/src/crypto/randomx/dataset.cpp @@ -0,0 +1,189 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#include +#include +#include +#include +#include +#include + +#include "common.hpp" +#include "dataset.hpp" +#include "virtual_memory.hpp" +#include "superscalar.hpp" +#include "blake2_generator.hpp" +#include "reciprocal.h" +#include "blake2/endian.h" +#include "argon2.h" +#include "argon2_core.h" +#include "jit_compiler.hpp" +#include "intrin_portable.h" + +//static_assert(RANDOMX_ARGON_MEMORY % (RANDOMX_ARGON_LANES * ARGON2_SYNC_POINTS) == 0, "RANDOMX_ARGON_MEMORY - invalid value"); +static_assert(ARGON2_BLOCK_SIZE == randomx::ArgonBlockSize, "Unpexpected value of ARGON2_BLOCK_SIZE"); + +namespace randomx { + + template + void deallocCache(randomx_cache* cache) { + if (cache->memory != nullptr) + Allocator::freeMemory(cache->memory, RANDOMX_CACHE_MAX_SIZE); + if (cache->jit != nullptr) + delete cache->jit; + } + + template void deallocCache(randomx_cache* cache); + template void deallocCache(randomx_cache* cache); + + void initCache(randomx_cache* cache, const void* key, size_t keySize) { + uint32_t memory_blocks, segment_length; + argon2_instance_t instance; + argon2_context context; + + context.out = nullptr; + context.outlen = 0; + context.pwd = CONST_CAST(uint8_t *)key; + context.pwdlen = (uint32_t)keySize; + context.salt = CONST_CAST(uint8_t *)RandomX_CurrentConfig.ArgonSalt; + context.saltlen = (uint32_t)strlen(RandomX_CurrentConfig.ArgonSalt); + context.secret = NULL; + context.secretlen = 0; + context.ad = NULL; + context.adlen = 0; + context.t_cost = RandomX_CurrentConfig.ArgonIterations; + context.m_cost = RandomX_CurrentConfig.ArgonMemory; + context.lanes = RandomX_CurrentConfig.ArgonLanes; + context.threads = 1; + context.allocate_cbk = NULL; + context.free_cbk = NULL; + context.flags = ARGON2_DEFAULT_FLAGS; + context.version = ARGON2_VERSION_NUMBER; + + /* 2. Align memory size */ + /* Minimum memory_blocks = 8L blocks, where L is the number of lanes */ + memory_blocks = context.m_cost; + + segment_length = memory_blocks / (context.lanes * ARGON2_SYNC_POINTS); + + instance.version = context.version; + instance.memory = NULL; + instance.passes = context.t_cost; + instance.memory_blocks = memory_blocks; + instance.segment_length = segment_length; + instance.lane_length = segment_length * ARGON2_SYNC_POINTS; + instance.lanes = context.lanes; + instance.threads = context.threads; + instance.type = Argon2_d; + instance.memory = (block*)cache->memory; + + if (instance.threads > instance.lanes) { + instance.threads = instance.lanes; + } + + /* 3. Initialization: Hashing inputs, allocating memory, filling first + * blocks + */ + rxa2_argon_initialize(&instance, &context); + + rxa2_fill_memory_blocks(&instance); + + cache->reciprocalCache.clear(); + randomx::Blake2Generator gen(key, keySize); + for (int i = 0; i < RandomX_CurrentConfig.CacheAccesses; ++i) { + randomx::generateSuperscalar(cache->programs[i], gen); + for (unsigned j = 0; j < cache->programs[i].getSize(); ++j) { + auto& instr = cache->programs[i](j); + if ((SuperscalarInstructionType)instr.opcode == SuperscalarInstructionType::IMUL_RCP) { + auto rcp = randomx_reciprocal(instr.getImm32()); + instr.setImm32(cache->reciprocalCache.size()); + cache->reciprocalCache.push_back(rcp); + } + } + } + } + + void initCacheCompile(randomx_cache* cache, const void* key, size_t keySize) { + initCache(cache, key, keySize); + cache->jit->generateSuperscalarHash(cache->programs, cache->reciprocalCache); + cache->jit->generateDatasetInitCode(); + } + + constexpr uint64_t superscalarMul0 = 6364136223846793005ULL; + constexpr uint64_t superscalarAdd1 = 9298411001130361340ULL; + constexpr uint64_t superscalarAdd2 = 12065312585734608966ULL; + constexpr uint64_t superscalarAdd3 = 9306329213124626780ULL; + constexpr uint64_t superscalarAdd4 = 5281919268842080866ULL; + constexpr uint64_t superscalarAdd5 = 10536153434571861004ULL; + constexpr uint64_t superscalarAdd6 = 3398623926847679864ULL; + constexpr uint64_t superscalarAdd7 = 9549104520008361294ULL; + + static inline uint8_t* getMixBlock(uint64_t registerValue, uint8_t *memory) { + const uint32_t mask = (RandomX_CurrentConfig.ArgonMemory * randomx::ArgonBlockSize) / CacheLineSize - 1; + return memory + (registerValue & mask) * CacheLineSize; + } + + void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t itemNumber) { + int_reg_t rl[8]; + uint8_t* mixBlock; + uint64_t registerValue = itemNumber; + rl[0] = (itemNumber + 1) * superscalarMul0; + rl[1] = rl[0] ^ superscalarAdd1; + rl[2] = rl[0] ^ superscalarAdd2; + rl[3] = rl[0] ^ superscalarAdd3; + rl[4] = rl[0] ^ superscalarAdd4; + rl[5] = rl[0] ^ superscalarAdd5; + rl[6] = rl[0] ^ superscalarAdd6; + rl[7] = rl[0] ^ superscalarAdd7; + for (unsigned i = 0; i < RandomX_CurrentConfig.CacheAccesses; ++i) { + mixBlock = getMixBlock(registerValue, cache->memory); + rx_prefetch_nta(mixBlock); + SuperscalarProgram& prog = cache->programs[i]; + + executeSuperscalar(rl, prog, &cache->reciprocalCache); + + for (unsigned q = 0; q < 8; ++q) + rl[q] ^= load64_native(mixBlock + 8 * q); + + registerValue = rl[prog.getAddressRegister()]; + } + + memcpy(out, &rl, CacheLineSize); + } + + void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startItem, uint32_t endItem) { + for (uint32_t itemNumber = startItem; itemNumber < endItem; ++itemNumber, dataset += CacheLineSize) + initDatasetItem(cache, dataset, itemNumber); + } +} diff --git a/src/crypto/randomx/dataset.hpp b/src/crypto/randomx/dataset.hpp new file mode 100644 index 00000000..42d4f05e --- /dev/null +++ b/src/crypto/randomx/dataset.hpp @@ -0,0 +1,80 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "common.hpp" +#include "superscalar_program.hpp" +#include "allocator.hpp" + +/* Global scope for C binding */ +struct randomx_dataset { + uint8_t* memory = nullptr; + randomx::DatasetDeallocFunc* dealloc; +}; + +/* Global scope for C binding */ +struct randomx_cache { + uint8_t* memory = nullptr; + randomx::CacheDeallocFunc* dealloc; + randomx::JitCompiler* jit; + randomx::CacheInitializeFunc* initialize; + randomx::DatasetInitFunc* datasetInit; + randomx::SuperscalarProgram programs[RANDOMX_CACHE_MAX_ACCESSES]; + std::vector reciprocalCache; + + bool isInitialized() { + return programs[0].getSize() != 0; + } +}; + +//A pointer to a standard-layout struct object points to its initial member +static_assert(std::is_standard_layout(), "randomx_dataset must be a standard-layout struct"); +static_assert(std::is_standard_layout(), "randomx_cache must be a standard-layout struct"); + +namespace randomx { + + using DefaultAllocator = AlignedAllocator; + + template + void deallocDataset(randomx_dataset* dataset) { + if (dataset->memory != nullptr) + Allocator::freeMemory(dataset->memory, RANDOMX_DATASET_MAX_SIZE); + } + + template + void deallocCache(randomx_cache* cache); + + void initCache(randomx_cache*, const void*, size_t); + void initCacheCompile(randomx_cache*, const void*, size_t); + void initDatasetItem(randomx_cache* cache, uint8_t* out, uint64_t blockNumber); + void initDataset(randomx_cache* cache, uint8_t* dataset, uint32_t startBlock, uint32_t endBlock); +} diff --git a/src/crypto/randomx/instruction.hpp b/src/crypto/randomx/instruction.hpp new file mode 100644 index 00000000..1904afc0 --- /dev/null +++ b/src/crypto/randomx/instruction.hpp @@ -0,0 +1,102 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include "blake2/endian.h" + +namespace randomx { + + class Instruction; + + enum class InstructionType : uint16_t { + IADD_RS = 0, + IADD_M = 1, + ISUB_R = 2, + ISUB_M = 3, + IMUL_R = 4, + IMUL_M = 5, + IMULH_R = 6, + IMULH_M = 7, + ISMULH_R = 8, + ISMULH_M = 9, + IMUL_RCP = 10, + INEG_R = 11, + IXOR_R = 12, + IXOR_M = 13, + IROR_R = 14, + IROL_R = 15, + ISWAP_R = 16, + FSWAP_R = 17, + FADD_R = 18, + FADD_M = 19, + FSUB_R = 20, + FSUB_M = 21, + FSCAL_R = 22, + FMUL_R = 23, + FDIV_M = 24, + FSQRT_R = 25, + CBRANCH = 26, + CFROUND = 27, + ISTORE = 28, + NOP = 29, + }; + + class Instruction { + public: + uint32_t getImm32() const { + return load32(&imm32); + } + void setImm32(uint32_t val) { + return store32(&imm32, val); + } + int getModMem() const { + return mod % 4; //bits 0-1 + } + int getModShift() const { + return (mod >> 2) % 4; //bits 2-3 + } + int getModCond() const { + return mod >> 4; //bits 4-7 + } + void setMod(uint8_t val) { + mod = val; + } + + uint8_t opcode; + uint8_t dst; + uint8_t src; + uint8_t mod; + uint32_t imm32; + }; + + static_assert(sizeof(Instruction) == 8, "Invalid size of struct randomx::Instruction"); + static_assert(std::is_standard_layout(), "randomx::Instruction must be a standard-layout struct"); +} diff --git a/src/crypto/randomx/instructions_portable.cpp b/src/crypto/randomx/instructions_portable.cpp new file mode 100644 index 00000000..8c466ebe --- /dev/null +++ b/src/crypto/randomx/instructions_portable.cpp @@ -0,0 +1,193 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include "common.hpp" +#include "intrin_portable.h" +#include "blake2/endian.h" + +#if defined(__SIZEOF_INT128__) + typedef unsigned __int128 uint128_t; + typedef __int128 int128_t; + uint64_t mulh(uint64_t a, uint64_t b) { + return ((uint128_t)a * b) >> 64; + } + int64_t smulh(int64_t a, int64_t b) { + return ((int128_t)a * b) >> 64; + } + #define HAVE_MULH + #define HAVE_SMULH +#endif + +#if defined(_MSC_VER) + #define HAS_VALUE(X) X ## 0 + #define EVAL_DEFINE(X) HAS_VALUE(X) + #include + #include + + uint64_t rotl64(uint64_t x, unsigned int c) { + return _rotl64(x, c); + } + uint64_t rotr64(uint64_t x, unsigned int c) { + return _rotr64(x, c); + } + #define HAVE_ROTL64 + #define HAVE_ROTR64 + + #if EVAL_DEFINE(__MACHINEARM64_X64(1)) + uint64_t mulh(uint64_t a, uint64_t b) { + return __umulh(a, b); + } + #define HAVE_MULH + #endif + + #if EVAL_DEFINE(__MACHINEX64(1)) + int64_t smulh(int64_t a, int64_t b) { + int64_t hi; + _mul128(a, b, &hi); + return hi; + } + #define HAVE_SMULH + #endif + + static void setRoundMode_(uint32_t mode) { + _controlfp(mode, _MCW_RC); + } + #define HAVE_SETROUNDMODE_IMPL +#endif + +#ifndef HAVE_SETROUNDMODE_IMPL + static void setRoundMode_(uint32_t mode) { + fesetround(mode); + } +#endif + +#ifndef HAVE_ROTR64 + uint64_t rotr64(uint64_t a, unsigned int b) { + return (a >> b) | (a << (-b & 63)); + } + #define HAVE_ROTR64 +#endif + +#ifndef HAVE_ROTL64 + uint64_t rotl64(uint64_t a, unsigned int b) { + return (a << b) | (a >> (-b & 63)); + } + #define HAVE_ROTL64 +#endif + +#ifndef HAVE_MULH + #define LO(x) ((x)&0xffffffff) + #define HI(x) ((x)>>32) + uint64_t mulh(uint64_t a, uint64_t b) { + uint64_t ah = HI(a), al = LO(a); + uint64_t bh = HI(b), bl = LO(b); + uint64_t x00 = al * bl; + uint64_t x01 = al * bh; + uint64_t x10 = ah * bl; + uint64_t x11 = ah * bh; + uint64_t m1 = LO(x10) + LO(x01) + HI(x00); + uint64_t m2 = HI(x10) + HI(x01) + LO(x11) + HI(m1); + uint64_t m3 = HI(x11) + HI(m2); + + return (m3 << 32) + LO(m2); + } + #define HAVE_MULH +#endif + +#ifndef HAVE_SMULH + int64_t smulh(int64_t a, int64_t b) { + int64_t hi = mulh(a, b); + if (a < 0LL) hi -= b; + if (b < 0LL) hi -= a; + return hi; + } + #define HAVE_SMULH +#endif + +#ifdef RANDOMX_DEFAULT_FENV + +void rx_reset_float_state() { + setRoundMode_(FE_TONEAREST); + rx_set_double_precision(); //set precision to 53 bits if needed by the platform +} + +void rx_set_rounding_mode(uint32_t mode) { + switch (mode & 3) { + case RoundDown: + setRoundMode_(FE_DOWNWARD); + break; + case RoundUp: + setRoundMode_(FE_UPWARD); + break; + case RoundToZero: + setRoundMode_(FE_TOWARDZERO); + break; + case RoundToNearest: + setRoundMode_(FE_TONEAREST); + break; + default: + UNREACHABLE; + } +} + +#endif + +#ifdef RANDOMX_USE_X87 + +#ifdef _M_IX86 + +void rx_set_double_precision() { + _control87(_PC_53, _MCW_PC); +} + +#elif defined(__i386) + +void rx_set_double_precision() { + uint16_t volatile x87cw; + asm volatile("fstcw %0" : "=m" (x87cw)); + x87cw &= ~0x300; + x87cw |= 0x200; + asm volatile("fldcw %0" : : "m" (x87cw)); +} + +#endif + +#endif //RANDOMX_USE_X87 + +union double_ser_t { + double f; + uint64_t i; +}; + +double loadDoublePortable(const void* addr) { + double_ser_t ds; + ds.i = load64(addr); + return ds.f; +} diff --git a/src/crypto/randomx/intrin_portable.h b/src/crypto/randomx/intrin_portable.h new file mode 100644 index 00000000..83acbe65 --- /dev/null +++ b/src/crypto/randomx/intrin_portable.h @@ -0,0 +1,605 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "blake2/endian.h" + +constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? (-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); +} + +constexpr int64_t unsigned64ToSigned2sCompl(uint64_t x) { + return (-1 == ~0) ? (int64_t)x : (x > INT64_MAX ? (-(int64_t)(UINT64_MAX - x) - 1) : (int64_t)x); +} + +constexpr uint64_t signExtend2sCompl(uint32_t x) { + return (-1 == ~0) ? (int64_t)(int32_t)(x) : (x > INT32_MAX ? (x | 0xffffffff00000000ULL) : (uint64_t)x); +} + +constexpr int RoundToNearest = 0; +constexpr int RoundDown = 1; +constexpr int RoundUp = 2; +constexpr int RoundToZero = 3; + +//MSVC doesn't define __SSE2__, so we have to define it manually if SSE2 is available +#if !defined(__SSE2__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2)) +#define __SSE2__ 1 +#endif + +//MSVC doesn't define __AES__ +#if defined(_MSC_VER) && defined(__SSE2__) +#define __AES__ +#endif + +//the library "sqrt" function provided by MSVC for x86 targets doesn't give +//the correct results, so we have to use inline assembly to call x87 fsqrt directly +#if !defined(__SSE2__) +#if defined(_M_IX86) +inline double __cdecl rx_sqrt(double x) { + __asm { + fld x + fsqrt + } +} +#define rx_sqrt rx_sqrt + +void rx_set_double_precision(); +#define RANDOMX_USE_X87 + +#elif defined(__i386) + +void rx_set_double_precision(); +#define RANDOMX_USE_X87 + +#endif +#endif //__SSE2__ + +#if !defined(rx_sqrt) +#define rx_sqrt sqrt +#endif + +#if !defined(RANDOMX_USE_X87) +#define rx_set_double_precision(x) +#endif + +#ifdef __SSE2__ +#ifdef __GNUC__ +#include +#else +#include +#endif + +typedef __m128i rx_vec_i128; +typedef __m128d rx_vec_f128; + +#define rx_aligned_alloc(a, b) _mm_malloc(a,b) +#define rx_aligned_free(a) _mm_free(a) +#define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) + +#define rx_load_vec_f128 _mm_load_pd +#define rx_store_vec_f128 _mm_store_pd +#define rx_add_vec_f128 _mm_add_pd +#define rx_sub_vec_f128 _mm_sub_pd +#define rx_mul_vec_f128 _mm_mul_pd +#define rx_div_vec_f128 _mm_div_pd +#define rx_sqrt_vec_f128 _mm_sqrt_pd + +FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) { + return _mm_shuffle_pd(a, a, 1); +} + +FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { + return _mm_castsi128_pd(_mm_set_epi64x(x1, x0)); +} + +FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { + return _mm_castsi128_pd(_mm_set1_epi64x(x)); +} + +#define rx_xor_vec_f128 _mm_xor_pd +#define rx_and_vec_f128 _mm_and_pd +#define rx_or_vec_f128 _mm_or_pd + +#ifdef __AES__ + +#define rx_aesenc_vec_i128 _mm_aesenc_si128 +#define rx_aesdec_vec_i128 _mm_aesdec_si128 + +#define HAVE_AES + +#endif //__AES__ + +FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { + return _mm_cvtsi128_si32(a); +} + +FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { + return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); +} + +FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { + return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xaa)); +} + +FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { + return _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xff)); +} + +#define rx_set_int_vec_i128 _mm_set_epi32 +#define rx_xor_vec_i128 _mm_xor_si128 +#define rx_load_vec_i128 _mm_load_si128 +#define rx_store_vec_i128 _mm_store_si128 + +FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { + __m128i ix = _mm_loadl_epi64((const __m128i*)addr); + return _mm_cvtepi32_pd(ix); +} + +constexpr uint32_t rx_mxcsr_default = 0x9FC0; //Flush to zero, denormals are zero, default rounding mode, all exceptions disabled + +FORCE_INLINE void rx_reset_float_state() { + _mm_setcsr(rx_mxcsr_default); +} + +FORCE_INLINE void rx_set_rounding_mode(uint32_t mode) { + _mm_setcsr(rx_mxcsr_default | (mode << 13)); +} + +#elif defined(__PPC64__) && defined(__ALTIVEC__) && defined(__VSX__) //sadly only POWER7 and newer will be able to use SIMD acceleration. Earlier processors cant use doubles or 64 bit integers with SIMD +#include +#include +#include +#include +#undef vector +#undef pixel +#undef bool + +typedef __vector uint8_t __m128i; +typedef __vector uint32_t __m128l; +typedef __vector int __m128li; +typedef __vector uint64_t __m128ll; +typedef __vector double __m128d; + +typedef __m128i rx_vec_i128; +typedef __m128d rx_vec_f128; +typedef union{ + rx_vec_i128 i; + rx_vec_f128 d; + uint64_t u64[2]; + double d64[2]; + uint32_t u32[4]; + int i32[4]; +} vec_u; + +#define rx_aligned_alloc(a, b) malloc(a) +#define rx_aligned_free(a) free(a) +#define rx_prefetch_nta(x) + +/* Splat 64-bit long long to 2 64-bit long longs */ +FORCE_INLINE __m128i vec_splat2sd (int64_t scalar) +{ return (__m128i) vec_splats (scalar); } + +FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { +#if defined(NATIVE_LITTLE_ENDIAN) + return (rx_vec_f128)vec_vsx_ld(0,pd); +#else + vec_u t; + t.u64[0] = load64(pd + 0); + t.u64[1] = load64(pd + 1); + return (rx_vec_f128)t.d; +#endif +} + +FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { +#if defined(NATIVE_LITTLE_ENDIAN) + vec_vsx_st(a,0,(rx_vec_f128*)mem_addr); +#else + vec_u _a; + _a.d = a; + store64(mem_addr + 0, _a.u64[0]); + store64(mem_addr + 1, _a.u64[1]); +#endif +} + +FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) { + return (rx_vec_f128)vec_perm((__m128i)a,(__m128i)a,(__m128i){8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7}); +} + +FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_add(a,b); +} + +FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_sub(a,b); +} + +FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_mul(a,b); +} + +FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_div(a,b); +} + +FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { + return (rx_vec_f128)vec_sqrt(a); +} + +FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) { + return (rx_vec_i128)vec_splat2sd(a); +} + +FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) { + return (rx_vec_f128)a; +} + +FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { + return (rx_vec_f128)(__m128ll){x0,x1}; +} + +FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { + return (rx_vec_f128)vec_splat2sd(x); +} + +FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_xor(a,b); +} + +FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_and(a,b); +} + +FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + return (rx_vec_f128)vec_or(a,b); +} + +#if defined(__CRYPTO__) + +FORCE_INLINE __m128ll vrev(__m128i v){ +#if defined(NATIVE_LITTLE_ENDIAN) + return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}); +#else + return (__m128ll)vec_perm((__m128i)v,(__m128i){0},(__m128i){3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}); +#endif +} + +FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { + __m128ll _v = vrev(v); + __m128ll _rkey = vrev(rkey); + __m128ll result = vrev((__m128i)__builtin_crypto_vcipher(_v,_rkey)); + return (rx_vec_i128)result; +} + +FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { + __m128ll _v = vrev(v); + __m128ll zero = (__m128ll){0}; + __m128ll out = vrev((__m128i)__builtin_crypto_vncipher(_v,zero)); + return (rx_vec_i128)vec_xor((__m128i)out,rkey); +} +#define HAVE_AES + +#endif //__CRYPTO__ + +FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { + vec_u _a; + _a.i = a; + return _a.i32[0]; +} + +FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { + vec_u _a; + _a.i = a; + return _a.i32[1]; +} + +FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { + vec_u _a; + _a.i = a; + return _a.i32[2]; +} + +FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { + vec_u _a; + _a.i = a; + return _a.i32[3]; +} + +FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) { + return (rx_vec_i128)((__m128li){_I0,_I1,_I2,_I3}); +}; + +FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) { + return (rx_vec_i128)vec_xor(_A,_B); +} + +FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *_P) { +#if defined(NATIVE_LITTLE_ENDIAN) + return *_P; +#else + uint32_t* ptr = (uint32_t*)_P; + vec_u c; + c.u32[0] = load32(ptr + 0); + c.u32[1] = load32(ptr + 1); + c.u32[2] = load32(ptr + 2); + c.u32[3] = load32(ptr + 3); + return (rx_vec_i128)c.i; +#endif +} + +FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) { +#if defined(NATIVE_LITTLE_ENDIAN) + *_P = _B; +#else + uint32_t* ptr = (uint32_t*)_P; + vec_u B; + B.i = _B; + store32(ptr + 0, B.u32[0]); + store32(ptr + 1, B.u32[1]); + store32(ptr + 2, B.u32[2]); + store32(ptr + 3, B.u32[3]); +#endif +} + +FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { + vec_u x; + x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); + x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + return (rx_vec_f128)x.d; +} + +#define RANDOMX_DEFAULT_FENV + +void rx_reset_float_state(); + +void rx_set_rounding_mode(uint32_t mode); + +#else //end altivec + +#include +#include +#include +#include + +typedef union { + uint64_t u64[2]; + uint32_t u32[4]; + uint16_t u16[8]; + uint8_t u8[16]; +} rx_vec_i128; + +typedef union { + struct { + double lo; + double hi; + }; + rx_vec_i128 i; +} rx_vec_f128; + +#define rx_aligned_alloc(a, b) malloc(a) +#define rx_aligned_free(a) free(a) +#define rx_prefetch_nta(x) + +FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { + rx_vec_f128 x; + x.i.u64[0] = load64(pd + 0); + x.i.u64[1] = load64(pd + 1); + return x; +} + +FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 a) { + store64(mem_addr + 0, a.i.u64[0]); + store64(mem_addr + 1, a.i.u64[1]); +} + +FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) { + double temp = a.hi; + a.hi = a.lo; + a.lo = temp; + return a; +} + +FORCE_INLINE rx_vec_f128 rx_add_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.lo = a.lo + b.lo; + x.hi = a.hi + b.hi; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_sub_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.lo = a.lo - b.lo; + x.hi = a.hi - b.hi; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_mul_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.lo = a.lo * b.lo; + x.hi = a.hi * b.hi; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_div_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.lo = a.lo / b.lo; + x.hi = a.hi / b.hi; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_sqrt_vec_f128(rx_vec_f128 a) { + rx_vec_f128 x; + x.lo = rx_sqrt(a.lo); + x.hi = rx_sqrt(a.hi); + return x; +} + +FORCE_INLINE rx_vec_i128 rx_set1_long_vec_i128(uint64_t a) { + rx_vec_i128 x; + x.u64[0] = a; + x.u64[1] = a; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_vec_i128_vec_f128(rx_vec_i128 a) { + rx_vec_f128 x; + x.i = a; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) { + rx_vec_f128 v; + v.i.u64[0] = x0; + v.i.u64[1] = x1; + return v; +} + +FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) { + rx_vec_f128 v; + v.i.u64[0] = x; + v.i.u64[1] = x; + return v; +} + + +FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0]; + x.i.u64[1] = a.i.u64[1] ^ b.i.u64[1]; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.i.u64[0] = a.i.u64[0] & b.i.u64[0]; + x.i.u64[1] = a.i.u64[1] & b.i.u64[1]; + return x; +} + +FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) { + rx_vec_f128 x; + x.i.u64[0] = a.i.u64[0] | b.i.u64[0]; + x.i.u64[1] = a.i.u64[1] | b.i.u64[1]; + return x; +} + +FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) { + return a.u32[0]; +} + +FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) { + return a.u32[1]; +} + +FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) { + return a.u32[2]; +} + +FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) { + return a.u32[3]; +} + +FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) { + rx_vec_i128 v; + v.u32[0] = _I0; + v.u32[1] = _I1; + v.u32[2] = _I2; + v.u32[3] = _I3; + return v; +}; + +FORCE_INLINE rx_vec_i128 rx_xor_vec_i128(rx_vec_i128 _A, rx_vec_i128 _B) { + rx_vec_i128 c; + c.u32[0] = _A.u32[0] ^ _B.u32[0]; + c.u32[1] = _A.u32[1] ^ _B.u32[1]; + c.u32[2] = _A.u32[2] ^ _B.u32[2]; + c.u32[3] = _A.u32[3] ^ _B.u32[3]; + return c; +} + +FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const*_P) { +#if defined(NATIVE_LITTLE_ENDIAN) + return *_P; +#else + uint32_t* ptr = (uint32_t*)_P; + rx_vec_i128 c; + c.u32[0] = load32(ptr + 0); + c.u32[1] = load32(ptr + 1); + c.u32[2] = load32(ptr + 2); + c.u32[3] = load32(ptr + 3); + return c; +#endif +} + +FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *_P, rx_vec_i128 _B) { +#if defined(NATIVE_LITTLE_ENDIAN) + *_P = _B; +#else + uint32_t* ptr = (uint32_t*)_P; + store32(ptr + 0, _B.u32[0]); + store32(ptr + 1, _B.u32[1]); + store32(ptr + 2, _B.u32[2]); + store32(ptr + 3, _B.u32[3]); +#endif +} + +FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { + rx_vec_f128 x; + x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); + x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + return x; +} + +#define RANDOMX_DEFAULT_FENV + +void rx_reset_float_state(); + +void rx_set_rounding_mode(uint32_t mode); + +#endif + +#ifndef HAVE_AES +static const char* platformError = "Platform doesn't support hardware AES"; + +#include + +FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { + throw std::runtime_error(platformError); +} + +FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) { + throw std::runtime_error(platformError); +} +#endif + +double loadDoublePortable(const void* addr); +uint64_t mulh(uint64_t, uint64_t); +int64_t smulh(int64_t, int64_t); +uint64_t rotl64(uint64_t, unsigned int); +uint64_t rotr64(uint64_t, unsigned int); diff --git a/src/crypto/randomx/jit_compiler.hpp b/src/crypto/randomx/jit_compiler.hpp new file mode 100644 index 00000000..bd9c2b0e --- /dev/null +++ b/src/crypto/randomx/jit_compiler.hpp @@ -0,0 +1,37 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#if defined(_M_X64) || defined(__x86_64__) +#include "jit_compiler_x86.hpp" +#elif defined(__aarch64__) +#include "jit_compiler_a64.hpp" +#else +#include "jit_compiler_fallback.hpp" +#endif diff --git a/src/crypto/randomx/jit_compiler_a64.hpp b/src/crypto/randomx/jit_compiler_a64.hpp new file mode 100644 index 00000000..58aa25c4 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_a64.hpp @@ -0,0 +1,73 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "common.hpp" + +namespace randomx { + + class Program; + class ProgramConfiguration; + class SuperscalarProgram; + + class JitCompilerA64 { + public: + JitCompilerA64() { + throw std::runtime_error("ARM64 JIT compiler is not implemented yet."); + } + void generateProgram(Program&, ProgramConfiguration&) { + + } + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { + + } + template + void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &) { + + } + void generateDatasetInitCode() { + + } + ProgramFunc* getProgramFunc() { + return nullptr; + } + DatasetInitFunc* getDatasetInitFunc() { + return nullptr; + } + uint8_t* getCode() { + return nullptr; + } + size_t getCodeSize() { + return 0; + } + }; +} \ No newline at end of file diff --git a/src/crypto/randomx/jit_compiler_fallback.hpp b/src/crypto/randomx/jit_compiler_fallback.hpp new file mode 100644 index 00000000..8103a632 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_fallback.hpp @@ -0,0 +1,73 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "common.hpp" + +namespace randomx { + + class Program; + class ProgramConfiguration; + class SuperscalarProgram; + + class JitCompilerFallback { + public: + JitCompilerFallback() { + throw std::runtime_error("JIT compilation is not supported on this platform"); + } + void generateProgram(Program&, ProgramConfiguration&) { + + } + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t) { + + } + template + void generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &) { + + } + void generateDatasetInitCode() { + + } + ProgramFunc* getProgramFunc() { + return nullptr; + } + DatasetInitFunc* getDatasetInitFunc() { + return nullptr; + } + uint8_t* getCode() { + return nullptr; + } + size_t getCodeSize() { + return 0; + } + }; +} \ No newline at end of file diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp new file mode 100644 index 00000000..8870a018 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -0,0 +1,774 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include "jit_compiler_x86.hpp" +#include "jit_compiler_x86_static.hpp" +#include "superscalar.hpp" +#include "program.hpp" +#include "reciprocal.h" +#include "virtual_memory.hpp" + +namespace randomx { + /* + + REGISTER ALLOCATION: + + ; rax -> temporary + ; rbx -> iteration counter "ic" + ; rcx -> temporary + ; rdx -> temporary + ; rsi -> scratchpad pointer + ; rdi -> dataset pointer + ; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits) + ; rsp -> stack pointer + ; r8 -> "r0" + ; r9 -> "r1" + ; r10 -> "r2" + ; r11 -> "r3" + ; r12 -> "r4" + ; r13 -> "r5" + ; r14 -> "r6" + ; r15 -> "r7" + ; xmm0 -> "f0" + ; xmm1 -> "f1" + ; xmm2 -> "f2" + ; xmm3 -> "f3" + ; xmm4 -> "e0" + ; xmm5 -> "e1" + ; xmm6 -> "e2" + ; xmm7 -> "e3" + ; xmm8 -> "a0" + ; xmm9 -> "a1" + ; xmm10 -> "a2" + ; xmm11 -> "a3" + ; xmm12 -> temporary + ; xmm13 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff + ; xmm14 -> E 'or' mask = 0x3*00000000******3*00000000****** + ; xmm15 -> scale mask = 0x81f000000000000081f0000000000000 + + */ + + const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue; + const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin; + const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load; + const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start; + const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset; + const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin; + const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init; + const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store; + const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end; + const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue; + const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end; + const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load; + const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch; + const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end; + const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init; + + const int32_t prologueSize = codeLoopBegin - codePrologue; + const int32_t loopLoadSize = codeProgamStart - codeLoopLoad; + const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset; + const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit; + const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin; + const int32_t loopStoreSize = codeLoopEnd - codeLoopStore; + const int32_t datasetInitSize = codeEpilogue - codeDatasetInit; + const int32_t epilogueSize = codeShhLoad - codeEpilogue; + const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad; + const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch; + const int32_t codeSshInitSize = codeProgramEnd - codeShhInit; + + const int32_t epilogueOffset = CodeSize - epilogueSize; + constexpr int32_t superScalarHashOffset = 32768; + + static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 }; + static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 }; + static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b }; + static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b }; + static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b }; + static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b }; + static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b }; + static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf }; + static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 }; + static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf }; + static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 }; + static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 }; + static const uint8_t REX_81[] = { 0x49, 0x81 }; + static const uint8_t AND_EAX_I = 0x25; + static const uint8_t MOV_EAX_I = 0xb8; + static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 }; + static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 }; + static const uint8_t REX_LEA[] = { 0x4f, 0x8d }; + static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e }; + static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e }; + static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 }; + static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 }; + static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 }; + static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea }; + static const uint8_t REX_SH[] = { 0x49, 0xc1 }; + static const uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f }; + static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 }; + static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 }; + static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 }; + static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 }; + static const uint8_t ADD_R_RAX[] = { 0x4C, 0x03 }; + static const uint8_t XOR_EAX_EAX[] = { 0x33, 0xC0 }; + static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 }; + static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 }; + static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA }; + static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 }; + static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x03, 0xD0 }; + static const uint8_t REX_NEG[] = { 0x49, 0xF7 }; + static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; + static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; + static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 }; + static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 }; + static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 }; + static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 }; + static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 }; + static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 }; + static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c }; + static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 }; + static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 }; + static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f }; + static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e }; + static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 }; + static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x50, 0x0F, 0xAE, 0x14, 0x24, 0x58 }; + static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 }; + static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 }; + static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 }; + static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 }; + static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 }; + static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 }; + static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 }; + static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 }; + static const uint8_t JNZ[] = { 0x0f, 0x85 }; + static const uint8_t JMP = 0xe9; + static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 }; + static const uint8_t REX_XCHG[] = { 0x4d, 0x87 }; + static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 }; + static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f }; + static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 }; + static const uint8_t CALL = 0xe8; + static const uint8_t REX_ADD_I[] = { 0x49, 0x81 }; + static const uint8_t REX_TEST[] = { 0x49, 0xF7 }; + static const uint8_t JZ[] = { 0x0f, 0x84 }; + static const uint8_t RET = 0xc3; + static const uint8_t LEA_32[] = { 0x67, 0x41, 0x8d }; + static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 }; + static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 }; + + static const uint8_t NOP1[] = { 0x90 }; + static const uint8_t NOP2[] = { 0x66, 0x90 }; + static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 }; + static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 }; + static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; + + static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 }; + + size_t JitCompilerX86::getCodeSize() { + return codePos - prologueSize; + } + + JitCompilerX86::JitCompilerX86() { + code = (uint8_t*)allocExecutableMemory(CodeSize); + memcpy(code, codePrologue, prologueSize); + memcpy(code + epilogueOffset, codeEpilogue, epilogueSize); + } + + JitCompilerX86::~JitCompilerX86() { + freePagedMemory(code, CodeSize); + } + + void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + generateProgramPrologue(prog, pcfg); + memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize); + codePos += readDatasetSize; + generateProgramEpilogue(prog); + } + + void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + generateProgramPrologue(prog, pcfg); + emit(RandomX_CurrentConfig.codeReadDatasetLightSshInitTweaked, readDatasetLightInitSize); + emit(ADD_EBX_I); + emit32(datasetOffset / CacheLineSize); + emitByte(CALL); + emit32(superScalarHashOffset - (codePos + 4)); + emit(codeReadDatasetLightSshFin, readDatasetLightFinSize); + generateProgramEpilogue(prog); + } + + template + void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector &reciprocalCache) { + memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); + codePos = superScalarHashOffset + codeSshInitSize; + for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + generateSuperscalarCode(instr, reciprocalCache); + } + emit(codeShhLoad, codeSshLoadSize); + if (j < RandomX_CurrentConfig.CacheAccesses - 1) { + emit(REX_MOV_RR64); + emitByte(0xd8 + prog.getAddressRegister()); + emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize); +#ifdef RANDOMX_ALIGN + int align = (codePos % 16); + while (align != 0) { + int nopSize = 16 - align; + if (nopSize > 8) nopSize = 8; + emit(NOPX[nopSize - 1], nopSize); + align = (codePos % 16); + } +#endif + } + } + emitByte(RET); + } + + template + void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES], std::vector &reciprocalCache); + + void JitCompilerX86::generateDatasetInitCode() { + memcpy(code, codeDatasetInit, datasetInitSize); + } + + void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { + instructionOffsets.clear(); + for (unsigned i = 0; i < 8; ++i) { + registerUsage[i] = -1; + } + codePos = prologueSize; + memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask)); + emit(REX_XOR_RAX_R64); + emitByte(0xc0 + pcfg.readReg0); + emit(REX_XOR_RAX_R64); + emitByte(0xc0 + pcfg.readReg1); + memcpy(code + codePos, RandomX_CurrentConfig.codeLoopLoadTweaked, loopLoadSize); + codePos += loopLoadSize; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction& instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + generateCode(instr, i); + } + emit(REX_MOV_RR); + emitByte(0xc0 + pcfg.readReg2); + emit(REX_XOR_EAX); + emitByte(0xc0 + pcfg.readReg3); + } + + void JitCompilerX86::generateProgramEpilogue(Program& prog) { + memcpy(code + codePos, codeLoopStore, loopStoreSize); + codePos += loopStoreSize; + emit(SUB_EBX); + emit(JNZ); + emit32(prologueSize - codePos - 4); + emitByte(JMP); + emit32(epilogueOffset - codePos - 4); + } + + void JitCompilerX86::generateCode(Instruction& instr, int i) { + instructionOffsets.push_back(codePos); + auto generator = engine[instr.opcode]; + (this->*generator)(instr, i); + } + + void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector &reciprocalCache) { + switch ((SuperscalarInstructionType)instr.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + emit(REX_XOR_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + emit(REX_LEA); + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.getModShift(), instr.src, instr.dst); + break; + case randomx::SuperscalarInstructionType::IMUL_R: + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + break; + case randomx::SuperscalarInstructionType::IROR_C: + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.getImm32() & 63); + break; + case randomx::SuperscalarInstructionType::IADD_C7: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + break; + case randomx::SuperscalarInstructionType::IADD_C8: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP1); +#endif + break; + case randomx::SuperscalarInstructionType::IXOR_C8: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP1); +#endif + break; + case randomx::SuperscalarInstructionType::IADD_C9: + emit(REX_81); + emitByte(0xc0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP2); +#endif + break; + case randomx::SuperscalarInstructionType::IXOR_C9: + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); +#ifdef RANDOMX_ALIGN + emit(NOP2); +#endif + break; + case randomx::SuperscalarInstructionType::IMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + emit(MOV_RAX_I); + emit64(reciprocalCache[instr.getImm32()]); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); + break; + default: + UNREACHABLE; + } + } + + void JitCompilerX86::genAddressReg(Instruction& instr, bool rax = true) { + emit(LEA_32); + emitByte(0x80 + instr.src + (rax ? 0 : 8)); + if (instr.src == RegisterNeedsSib) { + emitByte(0x24); + } + emit32(instr.getImm32()); + if (rax) + emitByte(AND_EAX_I); + else + emit(AND_ECX_I); + emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + + void JitCompilerX86::genAddressRegDst(Instruction& instr) { + emit(LEA_32); + emitByte(0x80 + instr.dst); + if (instr.dst == RegisterNeedsSib) { + emitByte(0x24); + } + emit32(instr.getImm32()); + emitByte(AND_EAX_I); + if (instr.getModCond() < StoreL3Condition) { + emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask); + } + else { + emit32(ScratchpadL3Mask); + } + } + + void JitCompilerX86::genAddressImm(Instruction& instr) { + emit32(instr.getImm32() & ScratchpadL3Mask); + } + + void JitCompilerX86::h_IADD_RS(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_LEA); + if (instr.dst == RegisterNeedsDisplacement) + emitByte(0xac); + else + emitByte(0x04 + 8 * instr.dst); + genSIB(instr.getModShift(), instr.src, instr.dst); + if (instr.dst == RegisterNeedsDisplacement) + emit32(instr.getImm32()); + } + + void JitCompilerX86::h_IADD_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_ADD_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_ADD_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::genSIB(int scale, int index, int base) { + emitByte((scale << 6) | (index << 3) | base); + } + + void JitCompilerX86::h_ISUB_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_SUB_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + } + else { + emit(REX_81); + emitByte(0xe8 + instr.dst); + emit32(instr.getImm32()); + } + } + + void JitCompilerX86::h_ISUB_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_SUB_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_SUB_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMUL_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_IMUL_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + } + else { + emit(REX_IMUL_RRI); + emitByte(0xc0 + 9 * instr.dst); + emit32(instr.getImm32()); + } + } + + void JitCompilerX86::h_IMUL_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_IMUL_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_IMUL_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IMULH_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe0 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_IMULH_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xa6); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_ISMULH_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_R); + emitByte(0xe8 + instr.src); + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_ISMULH_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr, false); + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_IMUL_MEM); + } + else { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.dst); + emit(REX_MUL_M); + emitByte(0xae); + genAddressImm(instr); + } + emit(REX_MOV_R64R); + emitByte(0xc2 + 8 * instr.dst); + } + + void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { + uint64_t divisor = instr.getImm32(); + if (!isPowerOf2(divisor)) { + registerUsage[instr.dst] = i; + emit(MOV_RAX_I); + emit64(randomx_reciprocal_fast(divisor)); + emit(REX_IMUL_RM); + emitByte(0xc0 + 8 * instr.dst); + } + } + + void JitCompilerX86::h_INEG_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + emit(REX_NEG); + emitByte(0xd8 + instr.dst); + } + + void JitCompilerX86::h_IXOR_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_XOR_RR); + emitByte(0xc0 + 8 * instr.dst + instr.src); + } + else { + emit(REX_XOR_RI); + emitByte(0xf0 + instr.dst); + emit32(instr.getImm32()); + } + } + + void JitCompilerX86::h_IXOR_M(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + genAddressReg(instr); + emit(REX_XOR_RM); + emitByte(0x04 + 8 * instr.dst); + emitByte(0x06); + } + else { + emit(REX_XOR_RM); + emitByte(0x86 + 8 * instr.dst); + genAddressImm(instr); + } + } + + void JitCompilerX86::h_IROR_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc8 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc8 + instr.dst); + emitByte(instr.getImm32() & 63); + } + } + + void JitCompilerX86::h_IROL_R(Instruction& instr, int i) { + registerUsage[instr.dst] = i; + if (instr.src != instr.dst) { + emit(REX_MOV_RR); + emitByte(0xc8 + instr.src); + emit(REX_ROT_CL); + emitByte(0xc0 + instr.dst); + } + else { + emit(REX_ROT_I8); + emitByte(0xc0 + instr.dst); + emitByte(instr.getImm32() & 63); + } + } + + void JitCompilerX86::h_ISWAP_R(Instruction& instr, int i) { + if (instr.src != instr.dst) { + registerUsage[instr.dst] = i; + registerUsage[instr.src] = i; + emit(REX_XCHG); + emitByte(0xc0 + instr.src + 8 * instr.dst); + } + } + + void JitCompilerX86::h_FSWAP_R(Instruction& instr, int i) { + emit(SHUFPD); + emitByte(0xc0 + 9 * instr.dst); + emitByte(1); + } + + void JitCompilerX86::h_FADD_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + emit(REX_ADDPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); + } + + void JitCompilerX86::h_FADD_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_ADDPD); + emitByte(0xc4 + 8 * instr.dst); + } + + void JitCompilerX86::h_FSUB_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + emit(REX_SUBPD); + emitByte(0xc0 + instr.src + 8 * instr.dst); + } + + void JitCompilerX86::h_FSUB_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_SUBPD); + emitByte(0xc4 + 8 * instr.dst); + } + + void JitCompilerX86::h_FSCAL_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + emit(REX_XORPS); + emitByte(0xc7 + 8 * instr.dst); + } + + void JitCompilerX86::h_FMUL_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + instr.src %= RegisterCountFlt; + emit(REX_MULPD); + emitByte(0xe0 + instr.src + 8 * instr.dst); + } + + void JitCompilerX86::h_FDIV_M(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + genAddressReg(instr); + emit(REX_CVTDQ2PD_XMM12); + emit(REX_ANDPS_XMM12); + emit(REX_DIVPD); + emitByte(0xe4 + 8 * instr.dst); + } + + void JitCompilerX86::h_FSQRT_R(Instruction& instr, int i) { + instr.dst %= RegisterCountFlt; + emit(SQRTPD); + emitByte(0xe4 + 9 * instr.dst); + } + + void JitCompilerX86::h_CFROUND(Instruction& instr, int i) { + emit(REX_MOV_RR64); + emitByte(0xc0 + instr.src); + int rotate = (13 - (instr.getImm32() & 63)) & 63; + if (rotate != 0) { + emit(ROL_RAX); + emitByte(rotate); + } + emit(AND_OR_MOV_LDMXCSR); + } + + void JitCompilerX86::h_CBRANCH(Instruction& instr, int i) { + int reg = instr.dst; + int target = registerUsage[reg] + 1; + emit(REX_ADD_I); + emitByte(0xc0 + reg); + int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset; + uint32_t imm = instr.getImm32() | (1UL << shift); + if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + emit32(imm); + emit(REX_TEST); + emitByte(0xc0 + reg); + emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift); + emit(JZ); + emit32(instructionOffsets[target] - (codePos + 4)); + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + registerUsage[j] = i; + } + } + + void JitCompilerX86::h_ISTORE(Instruction& instr, int i) { + genAddressRegDst(instr); + emit(REX_MOV_MR); + emitByte(0x04 + 8 * instr.src); + emitByte(0x06); + } + + void JitCompilerX86::h_NOP(Instruction& instr, int i) { + emit(NOP1); + } + + InstructionGeneratorX86 JitCompilerX86::engine[256] = {}; + +} diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp new file mode 100644 index 00000000..a81c8f4e --- /dev/null +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "common.hpp" + +namespace randomx { + + class Program; + class ProgramConfiguration; + class SuperscalarProgram; + class JitCompilerX86; + class Instruction; + + typedef void(JitCompilerX86::*InstructionGeneratorX86)(Instruction&, int); + + constexpr uint32_t CodeSize = 64 * 1024; + + class JitCompilerX86 { + public: + JitCompilerX86(); + ~JitCompilerX86(); + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + template + void generateSuperscalarHash(SuperscalarProgram (&programs)[N], std::vector &); + void generateDatasetInitCode(); + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)code; + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)code; + } + uint8_t* getCode() { + return code; + } + size_t getCodeSize(); + + static InstructionGeneratorX86 engine[256]; + std::vector instructionOffsets; + int registerUsage[RegistersCount]; + uint8_t* code; + int32_t codePos; + + void generateProgramPrologue(Program&, ProgramConfiguration&); + void generateProgramEpilogue(Program&); + void genAddressReg(Instruction&, bool); + void genAddressRegDst(Instruction&); + void genAddressImm(Instruction&); + void genSIB(int scale, int index, int base); + + void generateCode(Instruction&, int); + void generateSuperscalarCode(Instruction &, std::vector &); + + void emitByte(uint8_t val) { + code[codePos] = val; + codePos++; + } + + void emit32(uint32_t val) { + memcpy(code + codePos, &val, sizeof val); + codePos += sizeof val; + } + + void emit64(uint64_t val) { + memcpy(code + codePos, &val, sizeof val); + codePos += sizeof val; + } + + template + void emit(const uint8_t (&src)[N]) { + emit(src, N); + } + + void emit(const uint8_t* src, size_t count) { + memcpy(code + codePos, src, count); + codePos += count; + } + + void h_IADD_RS(Instruction&, int); + void h_IADD_M(Instruction&, int); + void h_ISUB_R(Instruction&, int); + void h_ISUB_M(Instruction&, int); + void h_IMUL_R(Instruction&, int); + void h_IMUL_M(Instruction&, int); + void h_IMULH_R(Instruction&, int); + void h_IMULH_M(Instruction&, int); + void h_ISMULH_R(Instruction&, int); + void h_ISMULH_M(Instruction&, int); + void h_IMUL_RCP(Instruction&, int); + void h_INEG_R(Instruction&, int); + void h_IXOR_R(Instruction&, int); + void h_IXOR_M(Instruction&, int); + void h_IROR_R(Instruction&, int); + void h_IROL_R(Instruction&, int); + void h_ISWAP_R(Instruction&, int); + void h_FSWAP_R(Instruction&, int); + void h_FADD_R(Instruction&, int); + void h_FADD_M(Instruction&, int); + void h_FSUB_R(Instruction&, int); + void h_FSUB_M(Instruction&, int); + void h_FSCAL_R(Instruction&, int); + void h_FMUL_R(Instruction&, int); + void h_FDIV_M(Instruction&, int); + void h_FSQRT_R(Instruction&, int); + void h_CBRANCH(Instruction&, int); + void h_CFROUND(Instruction&, int); + void h_ISTORE(Instruction&, int); + void h_NOP(Instruction&, int); + }; + +} \ No newline at end of file diff --git a/src/crypto/randomx/jit_compiler_x86_static.S b/src/crypto/randomx/jit_compiler_x86_static.S new file mode 100644 index 00000000..9cffaab5 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_x86_static.S @@ -0,0 +1,208 @@ +# Copyright (c) 2018-2019, tevador +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the copyright holder nor the +# names of its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +.intel_syntax noprefix +#if defined(__APPLE__) +.text +#define DECL(x) _##x +#else +.section .text +#define DECL(x) x +#endif + +#if defined(__WIN32__) || defined(__CYGWIN__) +#define WINABI +#endif + +.global DECL(randomx_program_prologue) +.global DECL(randomx_program_loop_begin) +.global DECL(randomx_program_loop_load) +.global DECL(randomx_program_start) +.global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_read_dataset_sshash_init) +.global DECL(randomx_program_read_dataset_sshash_fin) +.global DECL(randomx_program_loop_store) +.global DECL(randomx_program_loop_end) +.global DECL(randomx_dataset_init) +.global DECL(randomx_program_epilogue) +.global DECL(randomx_sshash_load) +.global DECL(randomx_sshash_prefetch) +.global DECL(randomx_sshash_end) +.global DECL(randomx_sshash_init) +.global DECL(randomx_program_end) +.global DECL(randomx_reciprocal_fast) + +#define RANDOMX_SCRATCHPAD_MASK 2097088 +#define RANDOMX_DATASET_BASE_MASK 2147483584 +#define RANDOMX_CACHE_MASK 4194303 + +#define db .byte + +.balign 64 +DECL(randomx_program_prologue): +#if defined(WINABI) + #include "asm/program_prologue_win64.inc" +#else + #include "asm/program_prologue_linux.inc" +#endif + movapd xmm13, xmmword ptr mantissaMask[rip] + movapd xmm14, xmmword ptr exp240[rip] + movapd xmm15, xmmword ptr scaleMask[rip] + jmp DECL(randomx_program_loop_begin) + +.balign 64 + #include "asm/program_xmm_constants.inc" + +.balign 64 +DECL(randomx_program_loop_begin): + nop + +DECL(randomx_program_loop_load): + #include "asm/program_loop_load.inc" + +DECL(randomx_program_start): + nop + +DECL(randomx_program_read_dataset): + #include "asm/program_read_dataset.inc" + +DECL(randomx_program_read_dataset_sshash_init): + #include "asm/program_read_dataset_sshash_init.inc" + +DECL(randomx_program_read_dataset_sshash_fin): + #include "asm/program_read_dataset_sshash_fin.inc" + +DECL(randomx_program_loop_store): + #include "asm/program_loop_store.inc" + +DECL(randomx_program_loop_end): + nop + +.balign 64 +DECL(randomx_dataset_init): + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +#if defined(WINABI) + push rdi + push rsi + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +#else + mov rdi, qword ptr [rdi] ;# cache->memory + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. block index +#endif +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + .byte 232 ;# 0xE8 = call + ;# .set CALL_LOC, + .int 32768 - (call_offset - DECL(randomx_dataset_init)) +call_offset: + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop rax +#if defined(WINABI) + pop rsi + pop rdi +#endif + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret + +.balign 64 +DECL(randomx_program_epilogue): + #include "asm/program_epilogue_store.inc" +#if defined(WINABI) + #include "asm/program_epilogue_win64.inc" +#else + #include "asm/program_epilogue_linux.inc" +#endif + +.balign 64 +DECL(randomx_sshash_load): + #include "asm/program_sshash_load.inc" + +DECL(randomx_sshash_prefetch): + #include "asm/program_sshash_prefetch.inc" + +DECL(randomx_sshash_end): + nop + +.balign 64 +DECL(randomx_sshash_init): + lea r8, [rbx+1] + #include "asm/program_sshash_prefetch.inc" + imul r8, qword ptr r0_mul[rip] + mov r9, qword ptr r1_add[rip] + xor r9, r8 + mov r10, qword ptr r2_add[rip] + xor r10, r8 + mov r11, qword ptr r3_add[rip] + xor r11, r8 + mov r12, qword ptr r4_add[rip] + xor r12, r8 + mov r13, qword ptr r5_add[rip] + xor r13, r8 + mov r14, qword ptr r6_add[rip] + xor r14, r8 + mov r15, qword ptr r7_add[rip] + xor r15, r8 + jmp DECL(randomx_program_end) + +.balign 64 + #include "asm/program_sshash_constants.inc" + +.balign 64 +DECL(randomx_program_end): + nop + +DECL(randomx_reciprocal_fast): +#if !defined(WINABI) + mov rcx, rdi +#endif + #include "asm/randomx_reciprocal.inc" diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm new file mode 100644 index 00000000..5ecfb435 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -0,0 +1,199 @@ +; Copyright (c) 2018-2019, tevador +; +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; * Neither the name of the copyright holder nor the +; names of its contributors may be used to endorse or promote products +; derived from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +IFDEF RAX + +_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE + +PUBLIC randomx_program_prologue +PUBLIC randomx_program_loop_begin +PUBLIC randomx_program_loop_load +PUBLIC randomx_program_start +PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_read_dataset_sshash_init +PUBLIC randomx_program_read_dataset_sshash_fin +PUBLIC randomx_dataset_init +PUBLIC randomx_program_loop_store +PUBLIC randomx_program_loop_end +PUBLIC randomx_program_epilogue +PUBLIC randomx_sshash_load +PUBLIC randomx_sshash_prefetch +PUBLIC randomx_sshash_end +PUBLIC randomx_sshash_init +PUBLIC randomx_program_end +PUBLIC randomx_reciprocal_fast + +RANDOMX_SCRATCHPAD_MASK EQU 2097088 +RANDOMX_DATASET_BASE_MASK EQU 2147483584 +RANDOMX_CACHE_MASK EQU 4194303 + +ALIGN 64 +randomx_program_prologue PROC + include asm/program_prologue_win64.inc + movapd xmm13, xmmword ptr [mantissaMask] + movapd xmm14, xmmword ptr [exp240] + movapd xmm15, xmmword ptr [scaleMask] + jmp randomx_program_loop_begin +randomx_program_prologue ENDP + +ALIGN 64 + include asm/program_xmm_constants.inc + +ALIGN 64 +randomx_program_loop_begin PROC + nop +randomx_program_loop_begin ENDP + +randomx_program_loop_load PROC + include asm/program_loop_load.inc +randomx_program_loop_load ENDP + +randomx_program_start PROC + nop +randomx_program_start ENDP + +randomx_program_read_dataset PROC + include asm/program_read_dataset.inc +randomx_program_read_dataset ENDP + +randomx_program_read_dataset_sshash_init PROC + include asm/program_read_dataset_sshash_init.inc +randomx_program_read_dataset_sshash_init ENDP + +randomx_program_read_dataset_sshash_fin PROC + include asm/program_read_dataset_sshash_fin.inc +randomx_program_read_dataset_sshash_fin ENDP + +randomx_program_loop_store PROC + include asm/program_loop_store.inc +randomx_program_loop_store ENDP + +randomx_program_loop_end PROC + nop +randomx_program_loop_end ENDP + +ALIGN 64 +randomx_dataset_init PROC + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +init_block_loop: + prefetchw byte ptr [rsi] + mov rbx, rbp + db 232 ;# 0xE8 = call + dd 32768 - distance + distance equ $ - offset randomx_dataset_init + mov qword ptr [rsi+0], r8 + mov qword ptr [rsi+8], r9 + mov qword ptr [rsi+16], r10 + mov qword ptr [rsi+24], r11 + mov qword ptr [rsi+32], r12 + mov qword ptr [rsi+40], r13 + mov qword ptr [rsi+48], r14 + mov qword ptr [rsi+56], r15 + add rbp, 1 + add rsi, 64 + cmp rbp, qword ptr [rsp] + jb init_block_loop + pop r9 + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret +randomx_dataset_init ENDP + +ALIGN 64 +randomx_program_epilogue PROC + include asm/program_epilogue_store.inc + include asm/program_epilogue_win64.inc +randomx_program_epilogue ENDP + +ALIGN 64 +randomx_sshash_load PROC + include asm/program_sshash_load.inc +randomx_sshash_load ENDP + +randomx_sshash_prefetch PROC + include asm/program_sshash_prefetch.inc +randomx_sshash_prefetch ENDP + +randomx_sshash_end PROC + nop +randomx_sshash_end ENDP + +ALIGN 64 +randomx_sshash_init PROC + lea r8, [rbx+1] + include asm/program_sshash_prefetch.inc + imul r8, qword ptr [r0_mul] + mov r9, qword ptr [r1_add] + xor r9, r8 + mov r10, qword ptr [r2_add] + xor r10, r8 + mov r11, qword ptr [r3_add] + xor r11, r8 + mov r12, qword ptr [r4_add] + xor r12, r8 + mov r13, qword ptr [r5_add] + xor r13, r8 + mov r14, qword ptr [r6_add] + xor r14, r8 + mov r15, qword ptr [r7_add] + xor r15, r8 + jmp randomx_program_end +randomx_sshash_init ENDP + +ALIGN 64 + include asm/program_sshash_constants.inc + +ALIGN 64 +randomx_program_end PROC + nop +randomx_program_end ENDP + +randomx_reciprocal_fast PROC + include asm/randomx_reciprocal.inc +randomx_reciprocal_fast ENDP + +_RANDOMX_JITX86_STATIC ENDS + +ENDIF + +END \ No newline at end of file diff --git a/src/crypto/randomx/jit_compiler_x86_static.hpp b/src/crypto/randomx/jit_compiler_x86_static.hpp new file mode 100644 index 00000000..ba196862 --- /dev/null +++ b/src/crypto/randomx/jit_compiler_x86_static.hpp @@ -0,0 +1,48 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_program_prologue(); + void randomx_program_loop_begin(); + void randomx_program_loop_load(); + void randomx_program_start(); + void randomx_program_read_dataset(); + void randomx_program_read_dataset_sshash_init(); + void randomx_program_read_dataset_sshash_fin(); + void randomx_program_loop_store(); + void randomx_program_loop_end(); + void randomx_dataset_init(); + void randomx_program_epilogue(); + void randomx_sshash_load(); + void randomx_sshash_prefetch(); + void randomx_sshash_end(); + void randomx_sshash_init(); + void randomx_program_end(); +} diff --git a/src/crypto/randomx/program.hpp b/src/crypto/randomx/program.hpp new file mode 100644 index 00000000..ef8ac748 --- /dev/null +++ b/src/crypto/randomx/program.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "common.hpp" +#include "instruction.hpp" +#include "blake2/endian.h" + +namespace randomx { + + struct ProgramConfiguration { + uint64_t eMask[2]; + uint32_t readReg0, readReg1, readReg2, readReg3; + }; + + class Program { + public: + Instruction& operator()(int pc) { + return programBuffer[pc]; + } + uint64_t getEntropy(int i) { + return load64(&entropyBuffer[i]); + } + uint32_t getSize() { + return RandomX_CurrentConfig.ProgramSize; + } + private: + uint64_t entropyBuffer[16]; + Instruction programBuffer[RANDOMX_PROGRAM_MAX_SIZE]; + }; + + static_assert(sizeof(Program) % 64 == 0, "Invalid size of class randomx::Program"); +} diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp new file mode 100644 index 00000000..dde838b9 --- /dev/null +++ b/src/crypto/randomx/randomx.cpp @@ -0,0 +1,476 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "randomx.h" +#include "dataset.hpp" +#include "vm_interpreted.hpp" +#include "vm_interpreted_light.hpp" +#include "vm_compiled.hpp" +#include "vm_compiled_light.hpp" +#include "blake2/blake2.h" +#include "jit_compiler_x86_static.hpp" +#include + +RandomX_ConfigurationWownero::RandomX_ConfigurationWownero() +{ + ArgonSalt = "RandomWOW\x01"; + ProgramIterations = 1024; + ProgramCount = 16; + ScratchpadL2_Size = 131072; + ScratchpadL3_Size = 1048576; + + RANDOMX_FREQ_IROR_R = 10; + RANDOMX_FREQ_IROL_R = 0; + RANDOMX_FREQ_FSWAP_R = 8; + RANDOMX_FREQ_FADD_R = 20; + RANDOMX_FREQ_FSUB_R = 20; + RANDOMX_FREQ_FMUL_R = 20; + + fillAes4Rx4_Key[0] = rx_set_int_vec_i128(0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d); + fillAes4Rx4_Key[1] = rx_set_int_vec_i128(0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450); + fillAes4Rx4_Key[2] = rx_set_int_vec_i128(0x3d324aac, 0xa7279ad2, 0xd524fde4, 0x114c47a4); + fillAes4Rx4_Key[3] = rx_set_int_vec_i128(0x76f6db08, 0x42d3dbd9, 0x99a9aeff, 0x810c3a2a); + fillAes4Rx4_Key[4] = fillAes4Rx4_Key[0]; + fillAes4Rx4_Key[5] = fillAes4Rx4_Key[1]; + fillAes4Rx4_Key[6] = fillAes4Rx4_Key[2]; + fillAes4Rx4_Key[7] = fillAes4Rx4_Key[3]; +} + +RandomX_ConfigurationLoki::RandomX_ConfigurationLoki() +{ + ArgonIterations = 4; + ArgonLanes = 2; + ArgonSalt = "RandomXL\x12"; + ProgramSize = 320; + ProgramCount = 7; +} + +RandomX_ConfigurationBase::RandomX_ConfigurationBase() + : ArgonMemory(262144) + , ArgonIterations(3) + , ArgonLanes(1) + , ArgonSalt("RandomX\x03") + , CacheAccesses(8) + , SuperscalarLatency(170) + , DatasetBaseSize(2147483648) + , DatasetExtraSize(33554368) + , ScratchpadL1_Size(16384) + , ScratchpadL2_Size(262144) + , ScratchpadL3_Size(2097152) + , ProgramSize(256) + , ProgramIterations(2048) + , ProgramCount(8) + , JumpBits(8) + , JumpOffset(8) + , RANDOMX_FREQ_IADD_RS(25) + , RANDOMX_FREQ_IADD_M(7) + , RANDOMX_FREQ_ISUB_R(16) + , RANDOMX_FREQ_ISUB_M(7) + , RANDOMX_FREQ_IMUL_R(16) + , RANDOMX_FREQ_IMUL_M(4) + , RANDOMX_FREQ_IMULH_R(4) + , RANDOMX_FREQ_IMULH_M(1) + , RANDOMX_FREQ_ISMULH_R(4) + , RANDOMX_FREQ_ISMULH_M(1) + , RANDOMX_FREQ_IMUL_RCP(8) + , RANDOMX_FREQ_INEG_R(2) + , RANDOMX_FREQ_IXOR_R(15) + , RANDOMX_FREQ_IXOR_M(5) + , RANDOMX_FREQ_IROR_R(8) + , RANDOMX_FREQ_IROL_R(2) + , RANDOMX_FREQ_ISWAP_R(4) + , RANDOMX_FREQ_FSWAP_R(4) + , RANDOMX_FREQ_FADD_R(16) + , RANDOMX_FREQ_FADD_M(5) + , RANDOMX_FREQ_FSUB_R(16) + , RANDOMX_FREQ_FSUB_M(5) + , RANDOMX_FREQ_FSCAL_R(6) + , RANDOMX_FREQ_FMUL_R(32) + , RANDOMX_FREQ_FDIV_M(4) + , RANDOMX_FREQ_FSQRT_R(6) + , RANDOMX_FREQ_CBRANCH(16) + , RANDOMX_FREQ_CFROUND(1) + , RANDOMX_FREQ_ISTORE(16) + , RANDOMX_FREQ_NOP(0) +{ + fillAes4Rx4_Key[0] = rx_set_int_vec_i128(0x99e5d23f, 0x2f546d2b, 0xd1833ddb, 0x6421aadd); + fillAes4Rx4_Key[1] = rx_set_int_vec_i128(0xa5dfcde5, 0x06f79d53, 0xb6913f55, 0xb20e3450); + fillAes4Rx4_Key[2] = rx_set_int_vec_i128(0x171c02bf, 0x0aa4679f, 0x515e7baf, 0x5c3ed904); + fillAes4Rx4_Key[3] = rx_set_int_vec_i128(0xd8ded291, 0xcd673785, 0xe78f5d08, 0x85623763); + fillAes4Rx4_Key[4] = rx_set_int_vec_i128(0x229effb4, 0x3d518b6d, 0xe3d6a7a6, 0xb5826f73); + fillAes4Rx4_Key[5] = rx_set_int_vec_i128(0xb272b7d2, 0xe9024d4e, 0x9c10b3d9, 0xc7566bf3); + fillAes4Rx4_Key[6] = rx_set_int_vec_i128(0xf63befa7, 0x2ba9660a, 0xf765a38b, 0xf273c9e7); + fillAes4Rx4_Key[7] = rx_set_int_vec_i128(0xc0b0762d, 0x0c06d1fd, 0x915839de, 0x7a7cd609); + +#if defined(_M_X64) || defined(__x86_64__) + { + const uint8_t* a = (const uint8_t*)&randomx_sshash_prefetch; + const uint8_t* b = (const uint8_t*)&randomx_sshash_end; + memcpy(codeShhPrefetchTweaked, a, b - a); + } + { + const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + memcpy(codeReadDatasetTweaked, a, b - a); + } + { + const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_fin; + memcpy(codeReadDatasetLightSshInitTweaked, a, b - a); + } + { + const uint8_t* a = (const uint8_t*)&randomx_program_loop_load; + const uint8_t* b = (const uint8_t*)&randomx_program_start; + memcpy(codeLoopLoadTweaked, a, b - a); + } +#endif +} + +void RandomX_ConfigurationBase::Apply() +{ +#if defined(_M_X64) || defined(__x86_64__) + *(uint32_t*)(codeShhPrefetchTweaked + 3) = ArgonMemory * 16 - 1; + const uint32_t DatasetBaseMask = DatasetBaseSize - RANDOMX_DATASET_ITEM_SIZE; + *(uint32_t*)(codeReadDatasetTweaked + 7) = DatasetBaseMask; + *(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask; + *(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask; +#endif + + CacheLineAlignMask_Calculated = (DatasetBaseSize - 1) & ~(RANDOMX_DATASET_ITEM_SIZE - 1); + DatasetExtraItems_Calculated = DatasetExtraSize / RANDOMX_DATASET_ITEM_SIZE; + + ScratchpadL1Mask_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) - 1) * 8; + ScratchpadL1Mask16_Calculated = (ScratchpadL1_Size / sizeof(uint64_t) / 2 - 1) * 16; + ScratchpadL2Mask_Calculated = (ScratchpadL2_Size / sizeof(uint64_t) - 1) * 8; + ScratchpadL2Mask16_Calculated = (ScratchpadL2_Size / sizeof(uint64_t) / 2 - 1) * 16; + ScratchpadL3Mask_Calculated = (((ScratchpadL3_Size / sizeof(uint64_t)) - 1) * 8); + ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64; + +#if defined(_M_X64) || defined(__x86_64__) + *(uint32_t*)(codeLoopLoadTweaked + 4) = ScratchpadL3Mask64_Calculated; + *(uint32_t*)(codeLoopLoadTweaked + 50) = ScratchpadL3Mask64_Calculated; +#endif + + ConditionMask_Calculated = (1 << JumpBits) - 1; + + constexpr int CEIL_NULL = 0; + int k = 0; + +#if defined(_M_X64) || defined(__x86_64__) +#define JIT_HANDLE(x, prev) randomx::JitCompilerX86::engine[k] = &randomx::JitCompilerX86::h_##x +#else +#define JIT_HANDLE(x, prev) +#endif + +#define INST_HANDLE(x, prev) \ + CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \ + for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); } + + INST_HANDLE(IADD_RS, NULL); + INST_HANDLE(IADD_M, IADD_RS); + INST_HANDLE(ISUB_R, IADD_M); + INST_HANDLE(ISUB_M, ISUB_R); + INST_HANDLE(IMUL_R, ISUB_M); + INST_HANDLE(IMUL_M, IMUL_R); + INST_HANDLE(IMULH_R, IMUL_M); + INST_HANDLE(IMULH_M, IMULH_R); + INST_HANDLE(ISMULH_R, IMULH_M); + INST_HANDLE(ISMULH_M, ISMULH_R); + INST_HANDLE(IMUL_RCP, ISMULH_M); + INST_HANDLE(INEG_R, IMUL_RCP); + INST_HANDLE(IXOR_R, INEG_R); + INST_HANDLE(IXOR_M, IXOR_R); + INST_HANDLE(IROR_R, IXOR_M); + INST_HANDLE(IROL_R, IROR_R); + INST_HANDLE(ISWAP_R, IROL_R); + INST_HANDLE(FSWAP_R, ISWAP_R); + INST_HANDLE(FADD_R, FSWAP_R); + INST_HANDLE(FADD_M, FADD_R); + INST_HANDLE(FSUB_R, FADD_M); + INST_HANDLE(FSUB_M, FSUB_R); + INST_HANDLE(FSCAL_R, FSUB_M); + INST_HANDLE(FMUL_R, FSCAL_R); + INST_HANDLE(FDIV_M, FMUL_R); + INST_HANDLE(FSQRT_R, FDIV_M); + INST_HANDLE(CBRANCH, FSQRT_R); + INST_HANDLE(CFROUND, CBRANCH); + INST_HANDLE(ISTORE, CFROUND); + INST_HANDLE(NOP, ISTORE); +#undef INST_HANDLE +} + +RandomX_ConfigurationMonero RandomX_MoneroConfig; +RandomX_ConfigurationWownero RandomX_WowneroConfig; +RandomX_ConfigurationLoki RandomX_LokiConfig; + +RandomX_ConfigurationBase RandomX_CurrentConfig; + +extern "C" { + + randomx_cache *randomx_alloc_cache(randomx_flags flags) { + randomx_cache *cache; + + try { + cache = new randomx_cache(); + switch (flags & (RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)) { + case RANDOMX_FLAG_DEFAULT: + cache->dealloc = &randomx::deallocCache; + cache->jit = nullptr; + cache->initialize = &randomx::initCache; + cache->datasetInit = &randomx::initDataset; + cache->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(RANDOMX_CACHE_MAX_SIZE); + break; + + case RANDOMX_FLAG_JIT: + cache->dealloc = &randomx::deallocCache; + cache->jit = new randomx::JitCompiler(); + cache->initialize = &randomx::initCacheCompile; + cache->datasetInit = cache->jit->getDatasetInitFunc(); + cache->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(RANDOMX_CACHE_MAX_SIZE); + break; + + case RANDOMX_FLAG_LARGE_PAGES: + cache->dealloc = &randomx::deallocCache; + cache->jit = nullptr; + cache->initialize = &randomx::initCache; + cache->datasetInit = &randomx::initDataset; + cache->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(RANDOMX_CACHE_MAX_SIZE); + break; + + case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES: + cache->dealloc = &randomx::deallocCache; + cache->jit = new randomx::JitCompiler(); + cache->initialize = &randomx::initCacheCompile; + cache->datasetInit = cache->jit->getDatasetInitFunc(); + cache->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(RANDOMX_CACHE_MAX_SIZE); + break; + + default: + UNREACHABLE; + } + } + catch (std::exception &ex) { + if (cache != nullptr) { + randomx_release_cache(cache); + cache = nullptr; + } + } + + return cache; + } + + void randomx_init_cache(randomx_cache *cache, const void *key, size_t keySize) { + assert(cache != nullptr); + assert(keySize == 0 || key != nullptr); + cache->initialize(cache, key, keySize); + } + + void randomx_release_cache(randomx_cache* cache) { + assert(cache != nullptr); + cache->dealloc(cache); + delete cache; + } + + randomx_dataset *randomx_alloc_dataset(randomx_flags flags) { + randomx_dataset *dataset; + + try { + dataset = new randomx_dataset(); + if (flags & RANDOMX_FLAG_LARGE_PAGES) { + dataset->dealloc = &randomx::deallocDataset; + dataset->memory = (uint8_t*)randomx::LargePageAllocator::allocMemory(RANDOMX_DATASET_MAX_SIZE); + } + else { + dataset->dealloc = &randomx::deallocDataset; + dataset->memory = (uint8_t*)randomx::DefaultAllocator::allocMemory(RANDOMX_DATASET_MAX_SIZE); + } + } + catch (std::exception &ex) { + if (dataset != nullptr) { + randomx_release_dataset(dataset); + dataset = nullptr; + } + } + + return dataset; + } + + #define DatasetItemCount ((RandomX_CurrentConfig.DatasetBaseSize + RandomX_CurrentConfig.DatasetExtraSize) / RANDOMX_DATASET_ITEM_SIZE) + + unsigned long randomx_dataset_item_count() { + return DatasetItemCount; + } + + void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount) { + assert(dataset != nullptr); + assert(cache != nullptr); + assert(startItem < DatasetItemCount && itemCount <= DatasetItemCount); + assert(startItem + itemCount <= DatasetItemCount); + cache->datasetInit(cache, dataset->memory + startItem * randomx::CacheLineSize, startItem, startItem + itemCount); + } + + void *randomx_get_dataset_memory(randomx_dataset *dataset) { + assert(dataset != nullptr); + return dataset->memory; + } + + void randomx_release_dataset(randomx_dataset *dataset) { + assert(dataset != nullptr); + dataset->dealloc(dataset); + delete dataset; + } + + randomx_vm *randomx_create_vm(randomx_flags flags, randomx_cache *cache, randomx_dataset *dataset) { + assert(cache != nullptr || (flags & RANDOMX_FLAG_FULL_MEM)); + assert(cache == nullptr || cache->isInitialized()); + assert(dataset != nullptr || !(flags & RANDOMX_FLAG_FULL_MEM)); + + randomx_vm *vm = nullptr; + + try { + switch (flags & (RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES)) { + case RANDOMX_FLAG_DEFAULT: + vm = new randomx::InterpretedLightVmDefault(); + break; + + case RANDOMX_FLAG_FULL_MEM: + vm = new randomx::InterpretedVmDefault(); + break; + + case RANDOMX_FLAG_JIT: + vm = new randomx::CompiledLightVmDefault(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT: + vm = new randomx::CompiledVmDefault(); + break; + + case RANDOMX_FLAG_HARD_AES: + vm = new randomx::InterpretedLightVmHardAes(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES: + vm = new randomx::InterpretedVmHardAes(); + break; + + case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES: + vm = new randomx::CompiledLightVmHardAes(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES: + vm = new randomx::CompiledVmHardAes(); + break; + + case RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::InterpretedLightVmLargePage(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::InterpretedVmLargePage(); + break; + + case RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::CompiledLightVmLargePage(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::CompiledVmLargePage(); + break; + + case RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::InterpretedLightVmLargePageHardAes(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::InterpretedVmLargePageHardAes(); + break; + + case RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::CompiledLightVmLargePageHardAes(); + break; + + case RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT | RANDOMX_FLAG_HARD_AES | RANDOMX_FLAG_LARGE_PAGES: + vm = new randomx::CompiledVmLargePageHardAes(); + break; + + default: + UNREACHABLE; + } + + if(cache != nullptr) + vm->setCache(cache); + + if(dataset != nullptr) + vm->setDataset(dataset); + + vm->allocate(); + } + catch (std::exception &ex) { + delete vm; + vm = nullptr; + } + + return vm; + } + + void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache) { + assert(machine != nullptr); + assert(cache != nullptr && cache->isInitialized()); + machine->setCache(cache); + } + + void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset) { + assert(machine != nullptr); + assert(dataset != nullptr); + machine->setDataset(dataset); + } + + void randomx_destroy_vm(randomx_vm *machine) { + assert(machine != nullptr); + delete machine; + } + + void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output) { + assert(machine != nullptr); + assert(inputSize == 0 || input != nullptr); + assert(output != nullptr); + alignas(16) uint64_t tempHash[8]; + int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + assert(blakeResult == 0); + machine->initScratchpad(&tempHash); + machine->resetRoundingMode(); + for (int chain = 0; chain < RandomX_CurrentConfig.ProgramCount - 1; ++chain) { + machine->run(&tempHash); + blakeResult = blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + assert(blakeResult == 0); + } + machine->run(&tempHash); + machine->getFinalResult(output, RANDOMX_HASH_SIZE); + } + +} diff --git a/src/crypto/randomx/randomx.h b/src/crypto/randomx/randomx.h new file mode 100644 index 00000000..cd07ac64 --- /dev/null +++ b/src/crypto/randomx/randomx.h @@ -0,0 +1,332 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef RANDOMX_H +#define RANDOMX_H + +#include +#include +#include +#include "intrin_portable.h" + +#define RANDOMX_HASH_SIZE 32 +#define RANDOMX_DATASET_ITEM_SIZE 64 + +#ifndef RANDOMX_EXPORT +#define RANDOMX_EXPORT +#endif + +typedef enum { + RANDOMX_FLAG_DEFAULT = 0, + RANDOMX_FLAG_LARGE_PAGES = 1, + RANDOMX_FLAG_HARD_AES = 2, + RANDOMX_FLAG_FULL_MEM = 4, + RANDOMX_FLAG_JIT = 8, +} randomx_flags; + +typedef struct randomx_dataset randomx_dataset; +typedef struct randomx_cache randomx_cache; +typedef struct randomx_vm randomx_vm; + +struct RandomX_ConfigurationBase +{ + RandomX_ConfigurationBase(); + + void Apply(); + + uint32_t ArgonMemory; + uint32_t ArgonIterations; + uint32_t ArgonLanes; + const char* ArgonSalt; + uint32_t CacheAccesses; + uint32_t SuperscalarLatency; + + uint32_t DatasetBaseSize; + uint32_t DatasetExtraSize; + + uint32_t ScratchpadL1_Size; + uint32_t ScratchpadL2_Size; + uint32_t ScratchpadL3_Size; + + uint32_t ProgramSize; + uint32_t ProgramIterations; + uint32_t ProgramCount; + + uint32_t JumpBits; + uint32_t JumpOffset; + + uint32_t RANDOMX_FREQ_IADD_RS; + uint32_t RANDOMX_FREQ_IADD_M; + uint32_t RANDOMX_FREQ_ISUB_R; + uint32_t RANDOMX_FREQ_ISUB_M; + uint32_t RANDOMX_FREQ_IMUL_R; + uint32_t RANDOMX_FREQ_IMUL_M; + uint32_t RANDOMX_FREQ_IMULH_R; + uint32_t RANDOMX_FREQ_IMULH_M; + uint32_t RANDOMX_FREQ_ISMULH_R; + uint32_t RANDOMX_FREQ_ISMULH_M; + uint32_t RANDOMX_FREQ_IMUL_RCP; + uint32_t RANDOMX_FREQ_INEG_R; + uint32_t RANDOMX_FREQ_IXOR_R; + uint32_t RANDOMX_FREQ_IXOR_M; + uint32_t RANDOMX_FREQ_IROR_R; + uint32_t RANDOMX_FREQ_IROL_R; + uint32_t RANDOMX_FREQ_ISWAP_R; + uint32_t RANDOMX_FREQ_FSWAP_R; + uint32_t RANDOMX_FREQ_FADD_R; + uint32_t RANDOMX_FREQ_FADD_M; + uint32_t RANDOMX_FREQ_FSUB_R; + uint32_t RANDOMX_FREQ_FSUB_M; + uint32_t RANDOMX_FREQ_FSCAL_R; + uint32_t RANDOMX_FREQ_FMUL_R; + uint32_t RANDOMX_FREQ_FDIV_M; + uint32_t RANDOMX_FREQ_FSQRT_R; + uint32_t RANDOMX_FREQ_CBRANCH; + uint32_t RANDOMX_FREQ_CFROUND; + uint32_t RANDOMX_FREQ_ISTORE; + uint32_t RANDOMX_FREQ_NOP; + + rx_vec_i128 fillAes4Rx4_Key[8]; + + uint8_t codeShhPrefetchTweaked[20]; + uint8_t codeReadDatasetTweaked[64]; + uint8_t codeReadDatasetLightSshInitTweaked[68]; + uint8_t codeLoopLoadTweaked[140]; + + uint32_t CacheLineAlignMask_Calculated; + uint32_t DatasetExtraItems_Calculated; + + uint32_t ScratchpadL1Mask_Calculated; + uint32_t ScratchpadL1Mask16_Calculated; + uint32_t ScratchpadL2Mask_Calculated; + uint32_t ScratchpadL2Mask16_Calculated; + uint32_t ScratchpadL3Mask_Calculated; + uint32_t ScratchpadL3Mask64_Calculated; + + uint32_t ConditionMask_Calculated; + + int CEIL_IADD_RS; + int CEIL_IADD_M; + int CEIL_ISUB_R; + int CEIL_ISUB_M; + int CEIL_IMUL_R; + int CEIL_IMUL_M; + int CEIL_IMULH_R; + int CEIL_IMULH_M; + int CEIL_ISMULH_R; + int CEIL_ISMULH_M; + int CEIL_IMUL_RCP; + int CEIL_INEG_R; + int CEIL_IXOR_R; + int CEIL_IXOR_M; + int CEIL_IROR_R; + int CEIL_IROL_R; + int CEIL_ISWAP_R; + int CEIL_FSWAP_R; + int CEIL_FADD_R; + int CEIL_FADD_M; + int CEIL_FSUB_R; + int CEIL_FSUB_M; + int CEIL_FSCAL_R; + int CEIL_FMUL_R; + int CEIL_FDIV_M; + int CEIL_FSQRT_R; + int CEIL_CBRANCH; + int CEIL_CFROUND; + int CEIL_ISTORE; + int CEIL_NOP; +}; + +struct RandomX_ConfigurationMonero : public RandomX_ConfigurationBase {}; +struct RandomX_ConfigurationWownero : public RandomX_ConfigurationBase { RandomX_ConfigurationWownero(); }; +struct RandomX_ConfigurationLoki : public RandomX_ConfigurationBase { RandomX_ConfigurationLoki(); }; + +extern RandomX_ConfigurationMonero RandomX_MoneroConfig; +extern RandomX_ConfigurationWownero RandomX_WowneroConfig; +extern RandomX_ConfigurationLoki RandomX_LokiConfig; + +extern RandomX_ConfigurationBase RandomX_CurrentConfig; + +template +void randomx_apply_config(const T& config) +{ + static_assert(sizeof(T) == sizeof(RandomX_ConfigurationBase), "Invalid RandomX configuration struct size"); + static_assert(std::is_base_of::value, "Incompatible RandomX configuration struct"); + RandomX_CurrentConfig = config; + RandomX_CurrentConfig.Apply(); +} + +#if defined(__cplusplus) +extern "C" { +#endif + +/** + * Creates a randomx_cache structure and allocates memory for RandomX Cache. + * + * @param flags is any combination of these 2 flags (each flag can be set or not set): + * RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages + * RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes + * subsequent Dataset initialization faster + * + * @return Pointer to an allocated randomx_cache structure. + * NULL is returned if memory allocation fails or if the RANDOMX_FLAG_JIT + * is set and JIT compilation is not supported on the current platform. + */ +RANDOMX_EXPORT randomx_cache *randomx_alloc_cache(randomx_flags flags); + +/** + * Initializes the cache memory and SuperscalarHash using the provided key value. + * + * @param cache is a pointer to a previously allocated randomx_cache structure. Must not be NULL. + * @param key is a pointer to memory which contains the key value. Must not be NULL. + * @param keySize is the number of bytes of the key. +*/ +RANDOMX_EXPORT void randomx_init_cache(randomx_cache *cache, const void *key, size_t keySize); + +/** + * Releases all memory occupied by the randomx_cache structure. + * + * @param cache is a pointer to a previously allocated randomx_cache structure. +*/ +RANDOMX_EXPORT void randomx_release_cache(randomx_cache* cache); + +/** + * Creates a randomx_dataset structure and allocates memory for RandomX Dataset. + * + * @param flags is the initialization flags. Only one flag is supported (can be set or not set): + * RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages + * + * @return Pointer to an allocated randomx_dataset structure. + * NULL is returned if memory allocation fails. + */ +RANDOMX_EXPORT randomx_dataset *randomx_alloc_dataset(randomx_flags flags); + +/** + * Gets the number of items contained in the dataset. + * + * @return the number of items contained in the dataset. +*/ +RANDOMX_EXPORT unsigned long randomx_dataset_item_count(void); + +/** + * Initializes dataset items. + * + * Note: In order to use the Dataset, all items from 0 to (randomx_dataset_item_count() - 1) must be initialized. + * This may be done by several calls to this function using non-overlapping item sequences. + * + * @param dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL. + * @param cache is a pointer to a previously allocated and initialized randomx_cache structure. Must not be NULL. + * @param startItem is the item number where intialization should start. + * @param itemCount is the number of items that should be initialized. +*/ +RANDOMX_EXPORT void randomx_init_dataset(randomx_dataset *dataset, randomx_cache *cache, unsigned long startItem, unsigned long itemCount); + +/** + * Returns a pointer to the internal memory buffer of the dataset structure. The size + * of the internal memory buffer is randomx_dataset_item_count() * RANDOMX_DATASET_ITEM_SIZE. + * + * @param dataset is dataset is a pointer to a previously allocated randomx_dataset structure. Must not be NULL. + * + * @return Pointer to the internal memory buffer of the dataset structure. +*/ +RANDOMX_EXPORT void *randomx_get_dataset_memory(randomx_dataset *dataset); + +/** + * Releases all memory occupied by the randomx_dataset structure. + * + * @param dataset is a pointer to a previously allocated randomx_dataset structure. +*/ +RANDOMX_EXPORT void randomx_release_dataset(randomx_dataset *dataset); + +/** + * Creates and initializes a RandomX virtual machine. + * + * @param flags is any combination of these 4 flags (each flag can be set or not set): + * RANDOMX_FLAG_LARGE_PAGES - allocate scratchpad memory in large pages + * RANDOMX_FLAG_HARD_AES - virtual machine will use hardware accelerated AES + * RANDOMX_FLAG_FULL_MEM - virtual machine will use the full dataset + * RANDOMX_FLAG_JIT - virtual machine will use a JIT compiler + * The numeric values of the flags are ordered so that a higher value will provide + * faster hash calculation and a lower numeric value will provide higher portability. + * Using RANDOMX_FLAG_DEFAULT (all flags not set) works on all platforms, but is the slowest. + * @param cache is a pointer to an initialized randomx_cache structure. Can be + * NULL if RANDOMX_FLAG_FULL_MEM is set. + * @param dataset is a pointer to a randomx_dataset structure. Can be NULL + * if RANDOMX_FLAG_FULL_MEM is not set. + * + * @return Pointer to an initialized randomx_vm structure. + * Returns NULL if: + * (1) Scratchpad memory allocation fails. + * (2) The requested initialization flags are not supported on the current platform. + * (3) cache parameter is NULL and RANDOMX_FLAG_FULL_MEM is not set + * (4) dataset parameter is NULL and RANDOMX_FLAG_FULL_MEM is set +*/ +RANDOMX_EXPORT randomx_vm *randomx_create_vm(randomx_flags flags, randomx_cache *cache, randomx_dataset *dataset); + +/** + * Reinitializes a virtual machine with a new Cache. This function should be called anytime + * the Cache is reinitialized with a new key. + * + * @param machine is a pointer to a randomx_vm structure that was initialized + * without RANDOMX_FLAG_FULL_MEM. Must not be NULL. + * @param cache is a pointer to an initialized randomx_cache structure. Must not be NULL. +*/ +RANDOMX_EXPORT void randomx_vm_set_cache(randomx_vm *machine, randomx_cache* cache); + +/** + * Reinitializes a virtual machine with a new Dataset. + * + * @param machine is a pointer to a randomx_vm structure that was initialized + * with RANDOMX_FLAG_FULL_MEM. Must not be NULL. + * @param dataset is a pointer to an initialized randomx_dataset structure. Must not be NULL. +*/ +RANDOMX_EXPORT void randomx_vm_set_dataset(randomx_vm *machine, randomx_dataset *dataset); + +/** + * Releases all memory occupied by the randomx_vm structure. + * + * @param machine is a pointer to a previously created randomx_vm structure. +*/ +RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine); + +/** + * Calculates a RandomX hash value. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/crypto/randomx/reciprocal.c b/src/crypto/randomx/reciprocal.c new file mode 100644 index 00000000..22620f53 --- /dev/null +++ b/src/crypto/randomx/reciprocal.c @@ -0,0 +1,80 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include "reciprocal.h" + +/* + Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64. + divisor must not be 0 or a power of 2 + + Equivalent x86 assembly (divisor in rcx): + + mov edx, 1 + mov r8, rcx + xor eax, eax + bsr rcx, rcx + shl rdx, cl + div r8 + ret + +*/ +uint64_t randomx_reciprocal(uint64_t divisor) { + + assert(divisor != 0); + + const uint64_t p2exp63 = 1ULL << 63; + + uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; + + unsigned bsr = 0; //highest set bit in divisor + + for (uint64_t bit = divisor; bit > 0; bit >>= 1) + bsr++; + + for (unsigned shift = 0; shift < bsr; shift++) { + if (remainder >= divisor - remainder) { + quotient = quotient * 2 + 1; + remainder = remainder * 2 - divisor; + } + else { + quotient = quotient * 2; + remainder = remainder * 2; + } + } + + return quotient; +} + +#if !RANDOMX_HAVE_FAST_RECIPROCAL + +uint64_t randomx_reciprocal_fast(uint64_t divisor) { + return randomx_reciprocal(divisor); +} + +#endif diff --git a/src/crypto/randomx/reciprocal.h b/src/crypto/randomx/reciprocal.h new file mode 100644 index 00000000..8858df2b --- /dev/null +++ b/src/crypto/randomx/reciprocal.h @@ -0,0 +1,48 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +#if defined(_M_X64) || defined(__x86_64__) +#define RANDOMX_HAVE_FAST_RECIPROCAL 1 +#else +#define RANDOMX_HAVE_FAST_RECIPROCAL 0 +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +uint64_t randomx_reciprocal(uint64_t); +uint64_t randomx_reciprocal_fast(uint64_t); + +#if defined(__cplusplus) +} +#endif diff --git a/src/crypto/randomx/soft_aes.cpp b/src/crypto/randomx/soft_aes.cpp new file mode 100644 index 00000000..3e82fa2e --- /dev/null +++ b/src/crypto/randomx/soft_aes.cpp @@ -0,0 +1,364 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "soft_aes.h" + +alignas(16) const uint8_t sbox[256] = { + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16, +}; + +alignas(16) const uint32_t lutEnc0[256] = { + 0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6, 0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591, + 0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56, 0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec, + 0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa, 0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb, + 0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45, 0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b, + 0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c, 0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83, + 0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9, 0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a, + 0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d, 0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f, + 0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df, 0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea, + 0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34, 0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b, + 0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d, 0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413, + 0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1, 0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6, + 0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972, 0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85, + 0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed, 0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511, + 0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe, 0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b, + 0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05, 0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1, + 0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142, 0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf, + 0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3, 0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e, + 0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a, 0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6, + 0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3, 0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b, + 0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428, 0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad, + 0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14, 0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8, + 0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4, 0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2, + 0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda, 0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949, + 0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf, 0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810, + 0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c, 0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697, + 0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e, 0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f, + 0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc, 0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c, + 0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969, 0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27, + 0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122, 0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433, + 0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9, 0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5, + 0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a, 0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0, + 0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e, 0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c, +}; + +alignas(16) const uint32_t lutEnc1[256] = { + 0x6363c6a5, 0x7c7cf884, 0x7777ee99, 0x7b7bf68d, 0xf2f2ff0d, 0x6b6bd6bd, 0x6f6fdeb1, 0xc5c59154, + 0x30306050, 0x01010203, 0x6767cea9, 0x2b2b567d, 0xfefee719, 0xd7d7b562, 0xabab4de6, 0x7676ec9a, + 0xcaca8f45, 0x82821f9d, 0xc9c98940, 0x7d7dfa87, 0xfafaef15, 0x5959b2eb, 0x47478ec9, 0xf0f0fb0b, + 0xadad41ec, 0xd4d4b367, 0xa2a25ffd, 0xafaf45ea, 0x9c9c23bf, 0xa4a453f7, 0x7272e496, 0xc0c09b5b, + 0xb7b775c2, 0xfdfde11c, 0x93933dae, 0x26264c6a, 0x36366c5a, 0x3f3f7e41, 0xf7f7f502, 0xcccc834f, + 0x3434685c, 0xa5a551f4, 0xe5e5d134, 0xf1f1f908, 0x7171e293, 0xd8d8ab73, 0x31316253, 0x15152a3f, + 0x0404080c, 0xc7c79552, 0x23234665, 0xc3c39d5e, 0x18183028, 0x969637a1, 0x05050a0f, 0x9a9a2fb5, + 0x07070e09, 0x12122436, 0x80801b9b, 0xe2e2df3d, 0xebebcd26, 0x27274e69, 0xb2b27fcd, 0x7575ea9f, + 0x0909121b, 0x83831d9e, 0x2c2c5874, 0x1a1a342e, 0x1b1b362d, 0x6e6edcb2, 0x5a5ab4ee, 0xa0a05bfb, + 0x5252a4f6, 0x3b3b764d, 0xd6d6b761, 0xb3b37dce, 0x2929527b, 0xe3e3dd3e, 0x2f2f5e71, 0x84841397, + 0x5353a6f5, 0xd1d1b968, 0x00000000, 0xededc12c, 0x20204060, 0xfcfce31f, 0xb1b179c8, 0x5b5bb6ed, + 0x6a6ad4be, 0xcbcb8d46, 0xbebe67d9, 0x3939724b, 0x4a4a94de, 0x4c4c98d4, 0x5858b0e8, 0xcfcf854a, + 0xd0d0bb6b, 0xefefc52a, 0xaaaa4fe5, 0xfbfbed16, 0x434386c5, 0x4d4d9ad7, 0x33336655, 0x85851194, + 0x45458acf, 0xf9f9e910, 0x02020406, 0x7f7ffe81, 0x5050a0f0, 0x3c3c7844, 0x9f9f25ba, 0xa8a84be3, + 0x5151a2f3, 0xa3a35dfe, 0x404080c0, 0x8f8f058a, 0x92923fad, 0x9d9d21bc, 0x38387048, 0xf5f5f104, + 0xbcbc63df, 0xb6b677c1, 0xdadaaf75, 0x21214263, 0x10102030, 0xffffe51a, 0xf3f3fd0e, 0xd2d2bf6d, + 0xcdcd814c, 0x0c0c1814, 0x13132635, 0xececc32f, 0x5f5fbee1, 0x979735a2, 0x444488cc, 0x17172e39, + 0xc4c49357, 0xa7a755f2, 0x7e7efc82, 0x3d3d7a47, 0x6464c8ac, 0x5d5dbae7, 0x1919322b, 0x7373e695, + 0x6060c0a0, 0x81811998, 0x4f4f9ed1, 0xdcdca37f, 0x22224466, 0x2a2a547e, 0x90903bab, 0x88880b83, + 0x46468cca, 0xeeeec729, 0xb8b86bd3, 0x1414283c, 0xdedea779, 0x5e5ebce2, 0x0b0b161d, 0xdbdbad76, + 0xe0e0db3b, 0x32326456, 0x3a3a744e, 0x0a0a141e, 0x494992db, 0x06060c0a, 0x2424486c, 0x5c5cb8e4, + 0xc2c29f5d, 0xd3d3bd6e, 0xacac43ef, 0x6262c4a6, 0x919139a8, 0x959531a4, 0xe4e4d337, 0x7979f28b, + 0xe7e7d532, 0xc8c88b43, 0x37376e59, 0x6d6ddab7, 0x8d8d018c, 0xd5d5b164, 0x4e4e9cd2, 0xa9a949e0, + 0x6c6cd8b4, 0x5656acfa, 0xf4f4f307, 0xeaeacf25, 0x6565caaf, 0x7a7af48e, 0xaeae47e9, 0x08081018, + 0xbaba6fd5, 0x7878f088, 0x25254a6f, 0x2e2e5c72, 0x1c1c3824, 0xa6a657f1, 0xb4b473c7, 0xc6c69751, + 0xe8e8cb23, 0xdddda17c, 0x7474e89c, 0x1f1f3e21, 0x4b4b96dd, 0xbdbd61dc, 0x8b8b0d86, 0x8a8a0f85, + 0x7070e090, 0x3e3e7c42, 0xb5b571c4, 0x6666ccaa, 0x484890d8, 0x03030605, 0xf6f6f701, 0x0e0e1c12, + 0x6161c2a3, 0x35356a5f, 0x5757aef9, 0xb9b969d0, 0x86861791, 0xc1c19958, 0x1d1d3a27, 0x9e9e27b9, + 0xe1e1d938, 0xf8f8eb13, 0x98982bb3, 0x11112233, 0x6969d2bb, 0xd9d9a970, 0x8e8e0789, 0x949433a7, + 0x9b9b2db6, 0x1e1e3c22, 0x87871592, 0xe9e9c920, 0xcece8749, 0x5555aaff, 0x28285078, 0xdfdfa57a, + 0x8c8c038f, 0xa1a159f8, 0x89890980, 0x0d0d1a17, 0xbfbf65da, 0xe6e6d731, 0x424284c6, 0x6868d0b8, + 0x414182c3, 0x999929b0, 0x2d2d5a77, 0x0f0f1e11, 0xb0b07bcb, 0x5454a8fc, 0xbbbb6dd6, 0x16162c3a, +}; + +alignas(16) const uint32_t lutEnc2[256] = { + 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b, 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5, + 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b, 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76, + 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d, 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0, + 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf, 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0, + 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26, 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc, + 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1, 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15, + 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3, 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a, + 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2, 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75, + 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a, 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0, + 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3, 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784, + 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced, 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b, + 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39, 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf, + 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb, 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485, + 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f, 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8, + 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f, 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5, + 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321, 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2, + 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec, 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917, + 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d, 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573, + 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc, 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388, + 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14, 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db, + 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a, 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c, + 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662, 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79, + 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d, 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9, + 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea, 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808, + 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e, 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6, + 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f, 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a, + 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66, 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e, + 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9, 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e, + 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311, 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794, + 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9, 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf, + 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d, 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868, + 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f, 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16, +}; + +alignas(16) const uint32_t lutEnc3[256] = { + 0xc6a56363, 0xf8847c7c, 0xee997777, 0xf68d7b7b, 0xff0df2f2, 0xd6bd6b6b, 0xdeb16f6f, 0x9154c5c5, + 0x60503030, 0x02030101, 0xcea96767, 0x567d2b2b, 0xe719fefe, 0xb562d7d7, 0x4de6abab, 0xec9a7676, + 0x8f45caca, 0x1f9d8282, 0x8940c9c9, 0xfa877d7d, 0xef15fafa, 0xb2eb5959, 0x8ec94747, 0xfb0bf0f0, + 0x41ecadad, 0xb367d4d4, 0x5ffda2a2, 0x45eaafaf, 0x23bf9c9c, 0x53f7a4a4, 0xe4967272, 0x9b5bc0c0, + 0x75c2b7b7, 0xe11cfdfd, 0x3dae9393, 0x4c6a2626, 0x6c5a3636, 0x7e413f3f, 0xf502f7f7, 0x834fcccc, + 0x685c3434, 0x51f4a5a5, 0xd134e5e5, 0xf908f1f1, 0xe2937171, 0xab73d8d8, 0x62533131, 0x2a3f1515, + 0x080c0404, 0x9552c7c7, 0x46652323, 0x9d5ec3c3, 0x30281818, 0x37a19696, 0x0a0f0505, 0x2fb59a9a, + 0x0e090707, 0x24361212, 0x1b9b8080, 0xdf3de2e2, 0xcd26ebeb, 0x4e692727, 0x7fcdb2b2, 0xea9f7575, + 0x121b0909, 0x1d9e8383, 0x58742c2c, 0x342e1a1a, 0x362d1b1b, 0xdcb26e6e, 0xb4ee5a5a, 0x5bfba0a0, + 0xa4f65252, 0x764d3b3b, 0xb761d6d6, 0x7dceb3b3, 0x527b2929, 0xdd3ee3e3, 0x5e712f2f, 0x13978484, + 0xa6f55353, 0xb968d1d1, 0x00000000, 0xc12ceded, 0x40602020, 0xe31ffcfc, 0x79c8b1b1, 0xb6ed5b5b, + 0xd4be6a6a, 0x8d46cbcb, 0x67d9bebe, 0x724b3939, 0x94de4a4a, 0x98d44c4c, 0xb0e85858, 0x854acfcf, + 0xbb6bd0d0, 0xc52aefef, 0x4fe5aaaa, 0xed16fbfb, 0x86c54343, 0x9ad74d4d, 0x66553333, 0x11948585, + 0x8acf4545, 0xe910f9f9, 0x04060202, 0xfe817f7f, 0xa0f05050, 0x78443c3c, 0x25ba9f9f, 0x4be3a8a8, + 0xa2f35151, 0x5dfea3a3, 0x80c04040, 0x058a8f8f, 0x3fad9292, 0x21bc9d9d, 0x70483838, 0xf104f5f5, + 0x63dfbcbc, 0x77c1b6b6, 0xaf75dada, 0x42632121, 0x20301010, 0xe51affff, 0xfd0ef3f3, 0xbf6dd2d2, + 0x814ccdcd, 0x18140c0c, 0x26351313, 0xc32fecec, 0xbee15f5f, 0x35a29797, 0x88cc4444, 0x2e391717, + 0x9357c4c4, 0x55f2a7a7, 0xfc827e7e, 0x7a473d3d, 0xc8ac6464, 0xbae75d5d, 0x322b1919, 0xe6957373, + 0xc0a06060, 0x19988181, 0x9ed14f4f, 0xa37fdcdc, 0x44662222, 0x547e2a2a, 0x3bab9090, 0x0b838888, + 0x8cca4646, 0xc729eeee, 0x6bd3b8b8, 0x283c1414, 0xa779dede, 0xbce25e5e, 0x161d0b0b, 0xad76dbdb, + 0xdb3be0e0, 0x64563232, 0x744e3a3a, 0x141e0a0a, 0x92db4949, 0x0c0a0606, 0x486c2424, 0xb8e45c5c, + 0x9f5dc2c2, 0xbd6ed3d3, 0x43efacac, 0xc4a66262, 0x39a89191, 0x31a49595, 0xd337e4e4, 0xf28b7979, + 0xd532e7e7, 0x8b43c8c8, 0x6e593737, 0xdab76d6d, 0x018c8d8d, 0xb164d5d5, 0x9cd24e4e, 0x49e0a9a9, + 0xd8b46c6c, 0xacfa5656, 0xf307f4f4, 0xcf25eaea, 0xcaaf6565, 0xf48e7a7a, 0x47e9aeae, 0x10180808, + 0x6fd5baba, 0xf0887878, 0x4a6f2525, 0x5c722e2e, 0x38241c1c, 0x57f1a6a6, 0x73c7b4b4, 0x9751c6c6, + 0xcb23e8e8, 0xa17cdddd, 0xe89c7474, 0x3e211f1f, 0x96dd4b4b, 0x61dcbdbd, 0x0d868b8b, 0x0f858a8a, + 0xe0907070, 0x7c423e3e, 0x71c4b5b5, 0xccaa6666, 0x90d84848, 0x06050303, 0xf701f6f6, 0x1c120e0e, + 0xc2a36161, 0x6a5f3535, 0xaef95757, 0x69d0b9b9, 0x17918686, 0x9958c1c1, 0x3a271d1d, 0x27b99e9e, + 0xd938e1e1, 0xeb13f8f8, 0x2bb39898, 0x22331111, 0xd2bb6969, 0xa970d9d9, 0x07898e8e, 0x33a79494, + 0x2db69b9b, 0x3c221e1e, 0x15928787, 0xc920e9e9, 0x8749cece, 0xaaff5555, 0x50782828, 0xa57adfdf, + 0x038f8c8c, 0x59f8a1a1, 0x09808989, 0x1a170d0d, 0x65dabfbf, 0xd731e6e6, 0x84c64242, 0xd0b86868, + 0x82c34141, 0x29b09999, 0x5a772d2d, 0x1e110f0f, 0x7bcbb0b0, 0xa8fc5454, 0x6dd6bbbb, 0x2c3a1616, +}; + +alignas(16) const uint32_t lutDec0[256] = { + 0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a, 0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b, + 0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5, 0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5, + 0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d, 0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b, + 0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295, 0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e, + 0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927, 0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d, + 0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362, 0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9, + 0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52, 0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566, + 0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3, 0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed, + 0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e, 0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4, + 0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4, 0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd, + 0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d, 0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060, + 0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967, 0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879, + 0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000, 0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c, + 0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36, 0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624, + 0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b, 0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c, + 0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12, 0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14, + 0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3, 0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b, + 0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8, 0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684, + 0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7, 0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177, + 0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947, 0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322, + 0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498, 0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f, + 0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54, 0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382, + 0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf, 0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb, + 0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83, 0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef, + 0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029, 0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235, + 0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733, 0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117, + 0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4, 0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546, + 0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb, 0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d, + 0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb, 0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a, + 0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773, 0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478, + 0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2, 0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff, + 0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664, 0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0, +}; + +alignas(16) const uint32_t lutDec1[256] = { + 0xa7f45150, 0x65417e53, 0xa4171ac3, 0x5e273a96, 0x6bab3bcb, 0x459d1ff1, 0x58faacab, 0x03e34b93, + 0xfa302055, 0x6d76adf6, 0x76cc8891, 0x4c02f525, 0xd7e54ffc, 0xcb2ac5d7, 0x44352680, 0xa362b58f, + 0x5ab1de49, 0x1bba2567, 0x0eea4598, 0xc0fe5de1, 0x752fc302, 0xf04c8112, 0x97468da3, 0xf9d36bc6, + 0x5f8f03e7, 0x9c921595, 0x7a6dbfeb, 0x595295da, 0x83bed42d, 0x217458d3, 0x69e04929, 0xc8c98e44, + 0x89c2756a, 0x798ef478, 0x3e58996b, 0x71b927dd, 0x4fe1beb6, 0xad88f017, 0xac20c966, 0x3ace7db4, + 0x4adf6318, 0x311ae582, 0x33519760, 0x7f536245, 0x7764b1e0, 0xae6bbb84, 0xa081fe1c, 0x2b08f994, + 0x68487058, 0xfd458f19, 0x6cde9487, 0xf87b52b7, 0xd373ab23, 0x024b72e2, 0x8f1fe357, 0xab55662a, + 0x28ebb207, 0xc2b52f03, 0x7bc5869a, 0x0837d3a5, 0x872830f2, 0xa5bf23b2, 0x6a0302ba, 0x8216ed5c, + 0x1ccf8a2b, 0xb479a792, 0xf207f3f0, 0xe2694ea1, 0xf4da65cd, 0xbe0506d5, 0x6234d11f, 0xfea6c48a, + 0x532e349d, 0x55f3a2a0, 0xe18a0532, 0xebf6a475, 0xec830b39, 0xef6040aa, 0x9f715e06, 0x106ebd51, + 0x8a213ef9, 0x06dd963d, 0x053eddae, 0xbde64d46, 0x8d5491b5, 0x5dc47105, 0xd406046f, 0x155060ff, + 0xfb981924, 0xe9bdd697, 0x434089cc, 0x9ed96777, 0x42e8b0bd, 0x8b890788, 0x5b19e738, 0xeec879db, + 0x0a7ca147, 0x0f427ce9, 0x1e84f8c9, 0x00000000, 0x86800983, 0xed2b3248, 0x70111eac, 0x725a6c4e, + 0xff0efdfb, 0x38850f56, 0xd5ae3d1e, 0x392d3627, 0xd90f0a64, 0xa65c6821, 0x545b9bd1, 0x2e36243a, + 0x670a0cb1, 0xe757930f, 0x96eeb4d2, 0x919b1b9e, 0xc5c0804f, 0x20dc61a2, 0x4b775a69, 0x1a121c16, + 0xba93e20a, 0x2aa0c0e5, 0xe0223c43, 0x171b121d, 0x0d090e0b, 0xc78bf2ad, 0xa8b62db9, 0xa91e14c8, + 0x19f15785, 0x0775af4c, 0xdd99eebb, 0x607fa3fd, 0x2601f79f, 0xf5725cbc, 0x3b6644c5, 0x7efb5b34, + 0x29438b76, 0xc623cbdc, 0xfcedb668, 0xf1e4b863, 0xdc31d7ca, 0x85634210, 0x22971340, 0x11c68420, + 0x244a857d, 0x3dbbd2f8, 0x32f9ae11, 0xa129c76d, 0x2f9e1d4b, 0x30b2dcf3, 0x52860dec, 0xe3c177d0, + 0x16b32b6c, 0xb970a999, 0x489411fa, 0x64e94722, 0x8cfca8c4, 0x3ff0a01a, 0x2c7d56d8, 0x903322ef, + 0x4e4987c7, 0xd138d9c1, 0xa2ca8cfe, 0x0bd49836, 0x81f5a6cf, 0xde7aa528, 0x8eb7da26, 0xbfad3fa4, + 0x9d3a2ce4, 0x9278500d, 0xcc5f6a9b, 0x467e5462, 0x138df6c2, 0xb8d890e8, 0xf7392e5e, 0xafc382f5, + 0x805d9fbe, 0x93d0697c, 0x2dd56fa9, 0x1225cfb3, 0x99acc83b, 0x7d1810a7, 0x639ce86e, 0xbb3bdb7b, + 0x7826cd09, 0x18596ef4, 0xb79aec01, 0x9a4f83a8, 0x6e95e665, 0xe6ffaa7e, 0xcfbc2108, 0xe815efe6, + 0x9be7bad9, 0x366f4ace, 0x099fead4, 0x7cb029d6, 0xb2a431af, 0x233f2a31, 0x94a5c630, 0x66a235c0, + 0xbc4e7437, 0xca82fca6, 0xd090e0b0, 0xd8a73315, 0x9804f14a, 0xdaec41f7, 0x50cd7f0e, 0xf691172f, + 0xd64d768d, 0xb0ef434d, 0x4daacc54, 0x0496e4df, 0xb5d19ee3, 0x886a4c1b, 0x1f2cc1b8, 0x5165467f, + 0xea5e9d04, 0x358c015d, 0x7487fa73, 0x410bfb2e, 0x1d67b35a, 0xd2db9252, 0x5610e933, 0x47d66d13, + 0x61d79a8c, 0x0ca1377a, 0x14f8598e, 0x3c13eb89, 0x27a9ceee, 0xc961b735, 0xe51ce1ed, 0xb1477a3c, + 0xdfd29c59, 0x73f2553f, 0xce141879, 0x37c773bf, 0xcdf753ea, 0xaafd5f5b, 0x6f3ddf14, 0xdb447886, + 0xf3afca81, 0xc468b93e, 0x3424382c, 0x40a3c25f, 0xc31d1672, 0x25e2bc0c, 0x493c288b, 0x950dff41, + 0x01a83971, 0xb30c08de, 0xe4b4d89c, 0xc1566490, 0x84cb7b61, 0xb632d570, 0x5c6c4874, 0x57b8d042, +}; + +alignas(16) const uint32_t lutDec2[256] = { + 0xf45150a7, 0x417e5365, 0x171ac3a4, 0x273a965e, 0xab3bcb6b, 0x9d1ff145, 0xfaacab58, 0xe34b9303, + 0x302055fa, 0x76adf66d, 0xcc889176, 0x02f5254c, 0xe54ffcd7, 0x2ac5d7cb, 0x35268044, 0x62b58fa3, + 0xb1de495a, 0xba25671b, 0xea45980e, 0xfe5de1c0, 0x2fc30275, 0x4c8112f0, 0x468da397, 0xd36bc6f9, + 0x8f03e75f, 0x9215959c, 0x6dbfeb7a, 0x5295da59, 0xbed42d83, 0x7458d321, 0xe0492969, 0xc98e44c8, + 0xc2756a89, 0x8ef47879, 0x58996b3e, 0xb927dd71, 0xe1beb64f, 0x88f017ad, 0x20c966ac, 0xce7db43a, + 0xdf63184a, 0x1ae58231, 0x51976033, 0x5362457f, 0x64b1e077, 0x6bbb84ae, 0x81fe1ca0, 0x08f9942b, + 0x48705868, 0x458f19fd, 0xde94876c, 0x7b52b7f8, 0x73ab23d3, 0x4b72e202, 0x1fe3578f, 0x55662aab, + 0xebb20728, 0xb52f03c2, 0xc5869a7b, 0x37d3a508, 0x2830f287, 0xbf23b2a5, 0x0302ba6a, 0x16ed5c82, + 0xcf8a2b1c, 0x79a792b4, 0x07f3f0f2, 0x694ea1e2, 0xda65cdf4, 0x0506d5be, 0x34d11f62, 0xa6c48afe, + 0x2e349d53, 0xf3a2a055, 0x8a0532e1, 0xf6a475eb, 0x830b39ec, 0x6040aaef, 0x715e069f, 0x6ebd5110, + 0x213ef98a, 0xdd963d06, 0x3eddae05, 0xe64d46bd, 0x5491b58d, 0xc471055d, 0x06046fd4, 0x5060ff15, + 0x981924fb, 0xbdd697e9, 0x4089cc43, 0xd967779e, 0xe8b0bd42, 0x8907888b, 0x19e7385b, 0xc879dbee, + 0x7ca1470a, 0x427ce90f, 0x84f8c91e, 0x00000000, 0x80098386, 0x2b3248ed, 0x111eac70, 0x5a6c4e72, + 0x0efdfbff, 0x850f5638, 0xae3d1ed5, 0x2d362739, 0x0f0a64d9, 0x5c6821a6, 0x5b9bd154, 0x36243a2e, + 0x0a0cb167, 0x57930fe7, 0xeeb4d296, 0x9b1b9e91, 0xc0804fc5, 0xdc61a220, 0x775a694b, 0x121c161a, + 0x93e20aba, 0xa0c0e52a, 0x223c43e0, 0x1b121d17, 0x090e0b0d, 0x8bf2adc7, 0xb62db9a8, 0x1e14c8a9, + 0xf1578519, 0x75af4c07, 0x99eebbdd, 0x7fa3fd60, 0x01f79f26, 0x725cbcf5, 0x6644c53b, 0xfb5b347e, + 0x438b7629, 0x23cbdcc6, 0xedb668fc, 0xe4b863f1, 0x31d7cadc, 0x63421085, 0x97134022, 0xc6842011, + 0x4a857d24, 0xbbd2f83d, 0xf9ae1132, 0x29c76da1, 0x9e1d4b2f, 0xb2dcf330, 0x860dec52, 0xc177d0e3, + 0xb32b6c16, 0x70a999b9, 0x9411fa48, 0xe9472264, 0xfca8c48c, 0xf0a01a3f, 0x7d56d82c, 0x3322ef90, + 0x4987c74e, 0x38d9c1d1, 0xca8cfea2, 0xd498360b, 0xf5a6cf81, 0x7aa528de, 0xb7da268e, 0xad3fa4bf, + 0x3a2ce49d, 0x78500d92, 0x5f6a9bcc, 0x7e546246, 0x8df6c213, 0xd890e8b8, 0x392e5ef7, 0xc382f5af, + 0x5d9fbe80, 0xd0697c93, 0xd56fa92d, 0x25cfb312, 0xacc83b99, 0x1810a77d, 0x9ce86e63, 0x3bdb7bbb, + 0x26cd0978, 0x596ef418, 0x9aec01b7, 0x4f83a89a, 0x95e6656e, 0xffaa7ee6, 0xbc2108cf, 0x15efe6e8, + 0xe7bad99b, 0x6f4ace36, 0x9fead409, 0xb029d67c, 0xa431afb2, 0x3f2a3123, 0xa5c63094, 0xa235c066, + 0x4e7437bc, 0x82fca6ca, 0x90e0b0d0, 0xa73315d8, 0x04f14a98, 0xec41f7da, 0xcd7f0e50, 0x91172ff6, + 0x4d768dd6, 0xef434db0, 0xaacc544d, 0x96e4df04, 0xd19ee3b5, 0x6a4c1b88, 0x2cc1b81f, 0x65467f51, + 0x5e9d04ea, 0x8c015d35, 0x87fa7374, 0x0bfb2e41, 0x67b35a1d, 0xdb9252d2, 0x10e93356, 0xd66d1347, + 0xd79a8c61, 0xa1377a0c, 0xf8598e14, 0x13eb893c, 0xa9ceee27, 0x61b735c9, 0x1ce1ede5, 0x477a3cb1, + 0xd29c59df, 0xf2553f73, 0x141879ce, 0xc773bf37, 0xf753eacd, 0xfd5f5baa, 0x3ddf146f, 0x447886db, + 0xafca81f3, 0x68b93ec4, 0x24382c34, 0xa3c25f40, 0x1d1672c3, 0xe2bc0c25, 0x3c288b49, 0x0dff4195, + 0xa8397101, 0x0c08deb3, 0xb4d89ce4, 0x566490c1, 0xcb7b6184, 0x32d570b6, 0x6c48745c, 0xb8d04257, +}; + +alignas(16) const uint32_t lutDec3[256] = { + 0x5150a7f4, 0x7e536541, 0x1ac3a417, 0x3a965e27, 0x3bcb6bab, 0x1ff1459d, 0xacab58fa, 0x4b9303e3, + 0x2055fa30, 0xadf66d76, 0x889176cc, 0xf5254c02, 0x4ffcd7e5, 0xc5d7cb2a, 0x26804435, 0xb58fa362, + 0xde495ab1, 0x25671bba, 0x45980eea, 0x5de1c0fe, 0xc302752f, 0x8112f04c, 0x8da39746, 0x6bc6f9d3, + 0x03e75f8f, 0x15959c92, 0xbfeb7a6d, 0x95da5952, 0xd42d83be, 0x58d32174, 0x492969e0, 0x8e44c8c9, + 0x756a89c2, 0xf478798e, 0x996b3e58, 0x27dd71b9, 0xbeb64fe1, 0xf017ad88, 0xc966ac20, 0x7db43ace, + 0x63184adf, 0xe582311a, 0x97603351, 0x62457f53, 0xb1e07764, 0xbb84ae6b, 0xfe1ca081, 0xf9942b08, + 0x70586848, 0x8f19fd45, 0x94876cde, 0x52b7f87b, 0xab23d373, 0x72e2024b, 0xe3578f1f, 0x662aab55, + 0xb20728eb, 0x2f03c2b5, 0x869a7bc5, 0xd3a50837, 0x30f28728, 0x23b2a5bf, 0x02ba6a03, 0xed5c8216, + 0x8a2b1ccf, 0xa792b479, 0xf3f0f207, 0x4ea1e269, 0x65cdf4da, 0x06d5be05, 0xd11f6234, 0xc48afea6, + 0x349d532e, 0xa2a055f3, 0x0532e18a, 0xa475ebf6, 0x0b39ec83, 0x40aaef60, 0x5e069f71, 0xbd51106e, + 0x3ef98a21, 0x963d06dd, 0xddae053e, 0x4d46bde6, 0x91b58d54, 0x71055dc4, 0x046fd406, 0x60ff1550, + 0x1924fb98, 0xd697e9bd, 0x89cc4340, 0x67779ed9, 0xb0bd42e8, 0x07888b89, 0xe7385b19, 0x79dbeec8, + 0xa1470a7c, 0x7ce90f42, 0xf8c91e84, 0x00000000, 0x09838680, 0x3248ed2b, 0x1eac7011, 0x6c4e725a, + 0xfdfbff0e, 0x0f563885, 0x3d1ed5ae, 0x3627392d, 0x0a64d90f, 0x6821a65c, 0x9bd1545b, 0x243a2e36, + 0x0cb1670a, 0x930fe757, 0xb4d296ee, 0x1b9e919b, 0x804fc5c0, 0x61a220dc, 0x5a694b77, 0x1c161a12, + 0xe20aba93, 0xc0e52aa0, 0x3c43e022, 0x121d171b, 0x0e0b0d09, 0xf2adc78b, 0x2db9a8b6, 0x14c8a91e, + 0x578519f1, 0xaf4c0775, 0xeebbdd99, 0xa3fd607f, 0xf79f2601, 0x5cbcf572, 0x44c53b66, 0x5b347efb, + 0x8b762943, 0xcbdcc623, 0xb668fced, 0xb863f1e4, 0xd7cadc31, 0x42108563, 0x13402297, 0x842011c6, + 0x857d244a, 0xd2f83dbb, 0xae1132f9, 0xc76da129, 0x1d4b2f9e, 0xdcf330b2, 0x0dec5286, 0x77d0e3c1, + 0x2b6c16b3, 0xa999b970, 0x11fa4894, 0x472264e9, 0xa8c48cfc, 0xa01a3ff0, 0x56d82c7d, 0x22ef9033, + 0x87c74e49, 0xd9c1d138, 0x8cfea2ca, 0x98360bd4, 0xa6cf81f5, 0xa528de7a, 0xda268eb7, 0x3fa4bfad, + 0x2ce49d3a, 0x500d9278, 0x6a9bcc5f, 0x5462467e, 0xf6c2138d, 0x90e8b8d8, 0x2e5ef739, 0x82f5afc3, + 0x9fbe805d, 0x697c93d0, 0x6fa92dd5, 0xcfb31225, 0xc83b99ac, 0x10a77d18, 0xe86e639c, 0xdb7bbb3b, + 0xcd097826, 0x6ef41859, 0xec01b79a, 0x83a89a4f, 0xe6656e95, 0xaa7ee6ff, 0x2108cfbc, 0xefe6e815, + 0xbad99be7, 0x4ace366f, 0xead4099f, 0x29d67cb0, 0x31afb2a4, 0x2a31233f, 0xc63094a5, 0x35c066a2, + 0x7437bc4e, 0xfca6ca82, 0xe0b0d090, 0x3315d8a7, 0xf14a9804, 0x41f7daec, 0x7f0e50cd, 0x172ff691, + 0x768dd64d, 0x434db0ef, 0xcc544daa, 0xe4df0496, 0x9ee3b5d1, 0x4c1b886a, 0xc1b81f2c, 0x467f5165, + 0x9d04ea5e, 0x015d358c, 0xfa737487, 0xfb2e410b, 0xb35a1d67, 0x9252d2db, 0xe9335610, 0x6d1347d6, + 0x9a8c61d7, 0x377a0ca1, 0x598e14f8, 0xeb893c13, 0xceee27a9, 0xb735c961, 0xe1ede51c, 0x7a3cb147, + 0x9c59dfd2, 0x553f73f2, 0x1879ce14, 0x73bf37c7, 0x53eacdf7, 0x5f5baafd, 0xdf146f3d, 0x7886db44, + 0xca81f3af, 0xb93ec468, 0x382c3424, 0xc25f40a3, 0x1672c31d, 0xbc0c25e2, 0x288b493c, 0xff41950d, + 0x397101a8, 0x08deb30c, 0xd89ce4b4, 0x6490c156, 0x7b6184cb, 0xd570b632, 0x48745c6c, 0xd04257b8, +}; + +rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key) { + uint32_t s0, s1, s2, s3; + + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); + + rx_vec_i128 out = rx_set_int_vec_i128( + (lutEnc0[s0 & 0xff] ^ lutEnc1[(s3 >> 8) & 0xff] ^ lutEnc2[(s2 >> 16) & 0xff] ^ lutEnc3[s1 >> 24]), + (lutEnc0[s1 & 0xff] ^ lutEnc1[(s0 >> 8) & 0xff] ^ lutEnc2[(s3 >> 16) & 0xff] ^ lutEnc3[s2 >> 24]), + (lutEnc0[s2 & 0xff] ^ lutEnc1[(s1 >> 8) & 0xff] ^ lutEnc2[(s0 >> 16) & 0xff] ^ lutEnc3[s3 >> 24]), + (lutEnc0[s3 & 0xff] ^ lutEnc1[(s2 >> 8) & 0xff] ^ lutEnc2[(s1 >> 16) & 0xff] ^ lutEnc3[s0 >> 24]) + ); + + return rx_xor_vec_i128(out, key); +} + +rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key) { + uint32_t s0, s1, s2, s3; + + s0 = rx_vec_i128_w(in); + s1 = rx_vec_i128_z(in); + s2 = rx_vec_i128_y(in); + s3 = rx_vec_i128_x(in); + + rx_vec_i128 out = rx_set_int_vec_i128( + (lutDec0[s0 & 0xff] ^ lutDec1[(s1 >> 8) & 0xff] ^ lutDec2[(s2 >> 16) & 0xff] ^ lutDec3[s3 >> 24]), + (lutDec0[s1 & 0xff] ^ lutDec1[(s2 >> 8) & 0xff] ^ lutDec2[(s3 >> 16) & 0xff] ^ lutDec3[s0 >> 24]), + (lutDec0[s2 & 0xff] ^ lutDec1[(s3 >> 8) & 0xff] ^ lutDec2[(s0 >> 16) & 0xff] ^ lutDec3[s1 >> 24]), + (lutDec0[s3 & 0xff] ^ lutDec1[(s0 >> 8) & 0xff] ^ lutDec2[(s1 >> 16) & 0xff] ^ lutDec3[s2 >> 24]) + ); + + return rx_xor_vec_i128(out, key); +} diff --git a/src/crypto/randomx/soft_aes.h b/src/crypto/randomx/soft_aes.h new file mode 100644 index 00000000..254f8d63 --- /dev/null +++ b/src/crypto/randomx/soft_aes.h @@ -0,0 +1,46 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "intrin_portable.h" + +rx_vec_i128 soft_aesenc(rx_vec_i128 in, rx_vec_i128 key); + +rx_vec_i128 soft_aesdec(rx_vec_i128 in, rx_vec_i128 key); + +template +inline rx_vec_i128 aesenc(rx_vec_i128 in, rx_vec_i128 key) { + return soft ? soft_aesenc(in, key) : rx_aesenc_vec_i128(in, key); +} + +template +inline rx_vec_i128 aesdec(rx_vec_i128 in, rx_vec_i128 key) { + return soft ? soft_aesdec(in, key) : rx_aesdec_vec_i128(in, key); +} \ No newline at end of file diff --git a/src/crypto/randomx/superscalar.cpp b/src/crypto/randomx/superscalar.cpp new file mode 100644 index 00000000..da605622 --- /dev/null +++ b/src/crypto/randomx/superscalar.cpp @@ -0,0 +1,891 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "configuration.h" +#include "program.hpp" +#include "blake2/endian.h" +#include "superscalar.hpp" +#include "intrin_portable.h" +#include "reciprocal.h" + +namespace randomx { + + static bool isMultiplication(SuperscalarInstructionType type) { + return type == SuperscalarInstructionType::IMUL_R || type == SuperscalarInstructionType::IMULH_R || type == SuperscalarInstructionType::ISMULH_R || type == SuperscalarInstructionType::IMUL_RCP; + } + + //uOPs (micro-ops) are represented only by the execution port they can go to + namespace ExecutionPort { + using type = int; + constexpr type Null = 0; + constexpr type P0 = 1; + constexpr type P1 = 2; + constexpr type P5 = 4; + constexpr type P01 = P0 | P1; + constexpr type P05 = P0 | P5; + constexpr type P015 = P0 | P1 | P5; + } + + //Macro-operation as output of the x86 decoder + //Usually one macro-op = one x86 instruction, but 2 instructions are sometimes fused into 1 macro-op + //Macro-op can consist of 1 or 2 uOPs. + class MacroOp { + public: + MacroOp(const char* name, int size) + : name_(name), size_(size), latency_(0), uop1_(ExecutionPort::Null), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop) + : name_(name), size_(size), latency_(latency), uop1_(uop), uop2_(ExecutionPort::Null) {} + MacroOp(const char* name, int size, int latency, ExecutionPort::type uop1, ExecutionPort::type uop2) + : name_(name), size_(size), latency_(latency), uop1_(uop1), uop2_(uop2) {} + MacroOp(const MacroOp& parent, bool dependent) + : name_(parent.name_), size_(parent.size_), latency_(parent.latency_), uop1_(parent.uop1_), uop2_(parent.uop2_), dependent_(dependent) {} + const char* getName() const { + return name_; + } + int getSize() const { + return size_; + } + int getLatency() const { + return latency_; + } + ExecutionPort::type getUop1() const { + return uop1_; + } + ExecutionPort::type getUop2() const { + return uop2_; + } + bool isSimple() const { + return uop2_ == ExecutionPort::Null; + } + bool isEliminated() const { + return uop1_ == ExecutionPort::Null; + } + bool isDependent() const { + return dependent_; + } + static const MacroOp Add_rr; + static const MacroOp Add_ri; + static const MacroOp Lea_sib; + static const MacroOp Sub_rr; + static const MacroOp Imul_rr; + static const MacroOp Imul_r; + static const MacroOp Mul_r; + static const MacroOp Mov_rr; + static const MacroOp Mov_ri64; + static const MacroOp Xor_rr; + static const MacroOp Xor_ri; + static const MacroOp Ror_rcl; + static const MacroOp Ror_ri; + static const MacroOp TestJz_fused; + static const MacroOp Xor_self; + static const MacroOp Cmp_ri; + static const MacroOp Setcc_r; + private: + const char* name_; + int size_; + int latency_; + ExecutionPort::type uop1_; + ExecutionPort::type uop2_; + bool dependent_ = false; + }; + + //Size: 3 bytes + const MacroOp MacroOp::Add_rr = MacroOp("add r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Sub_rr = MacroOp("sub r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_rr = MacroOp("xor r,r", 3, 1, ExecutionPort::P015); + const MacroOp MacroOp::Imul_r = MacroOp("imul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mul_r = MacroOp("mul r", 3, 4, ExecutionPort::P1, ExecutionPort::P5); + const MacroOp MacroOp::Mov_rr = MacroOp("mov r,r", 3); + + //Size: 4 bytes + const MacroOp MacroOp::Lea_sib = MacroOp("lea r,r+r*s", 4, 1, ExecutionPort::P01); + const MacroOp MacroOp::Imul_rr = MacroOp("imul r,r", 4, 3, ExecutionPort::P1); + const MacroOp MacroOp::Ror_ri = MacroOp("ror r,i", 4, 1, ExecutionPort::P05); + + //Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes) + const MacroOp MacroOp::Add_ri = MacroOp("add r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Xor_ri = MacroOp("xor r,i", 7, 1, ExecutionPort::P015); + + //Size: 10 bytes + const MacroOp MacroOp::Mov_ri64 = MacroOp("mov rax,i64", 10, 1, ExecutionPort::P015); + + //Unused: + const MacroOp MacroOp::Ror_rcl = MacroOp("ror r,cl", 3, 1, ExecutionPort::P0, ExecutionPort::P5); + const MacroOp MacroOp::Xor_self = MacroOp("xor rcx,rcx", 3); + const MacroOp MacroOp::Cmp_ri = MacroOp("cmp r,i", 7, 1, ExecutionPort::P015); + const MacroOp MacroOp::Setcc_r = MacroOp("setcc cl", 3, 1, ExecutionPort::P05); + const MacroOp MacroOp::TestJz_fused = MacroOp("testjz r,i", 13, 0, ExecutionPort::P5); + + const MacroOp IMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Mul_r, MacroOp::Mov_rr }; + const MacroOp ISMULH_R_ops_array[] = { MacroOp::Mov_rr, MacroOp::Imul_r, MacroOp::Mov_rr }; + const MacroOp IMUL_RCP_ops_array[] = { MacroOp::Mov_ri64, MacroOp(MacroOp::Imul_rr, true) }; + + class SuperscalarInstructionInfo { + public: + const char* getName() const { + return name_; + } + int getSize() const { + return ops_.size(); + } + bool isSimple() const { + return getSize() == 1; + } + int getLatency() const { + return latency_; + } + const MacroOp& getOp(int index) const { + return ops_[index]; + } + SuperscalarInstructionType getType() const { + return type_; + } + int getResultOp() const { + return resultOp_; + } + int getDstOp() const { + return dstOp_; + } + int getSrcOp() const { + return srcOp_; + } + static const SuperscalarInstructionInfo ISUB_R; + static const SuperscalarInstructionInfo IXOR_R; + static const SuperscalarInstructionInfo IADD_RS; + static const SuperscalarInstructionInfo IMUL_R; + static const SuperscalarInstructionInfo IROR_C; + static const SuperscalarInstructionInfo IADD_C7; + static const SuperscalarInstructionInfo IXOR_C7; + static const SuperscalarInstructionInfo IADD_C8; + static const SuperscalarInstructionInfo IXOR_C8; + static const SuperscalarInstructionInfo IADD_C9; + static const SuperscalarInstructionInfo IXOR_C9; + static const SuperscalarInstructionInfo IMULH_R; + static const SuperscalarInstructionInfo ISMULH_R; + static const SuperscalarInstructionInfo IMUL_RCP; + static const SuperscalarInstructionInfo NOP; + private: + const char* name_; + SuperscalarInstructionType type_; + std::vector ops_; + int latency_; + int resultOp_ = 0; + int dstOp_ = 0; + int srcOp_; + + SuperscalarInstructionInfo(const char* name) + : name_(name), type_(SuperscalarInstructionType::INVALID), latency_(0) {} + SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp& op, int srcOp) + : name_(name), type_(type), latency_(op.getLatency()), srcOp_(srcOp) { + ops_.push_back(MacroOp(op)); + } + template + SuperscalarInstructionInfo(const char* name, SuperscalarInstructionType type, const MacroOp(&arr)[N], int resultOp, int dstOp, int srcOp) + : name_(name), type_(type), latency_(0), resultOp_(resultOp), dstOp_(dstOp), srcOp_(srcOp) { + for (unsigned i = 0; i < N; ++i) { + ops_.push_back(MacroOp(arr[i])); + latency_ += ops_.back().getLatency(); + } + static_assert(N > 1, "Invalid array size"); + } + }; + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISUB_R = SuperscalarInstructionInfo("ISUB_R", SuperscalarInstructionType::ISUB_R, MacroOp::Sub_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_R = SuperscalarInstructionInfo("IXOR_R", SuperscalarInstructionType::IXOR_R, MacroOp::Xor_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_RS = SuperscalarInstructionInfo("IADD_RS", SuperscalarInstructionType::IADD_RS, MacroOp::Lea_sib, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_R = SuperscalarInstructionInfo("IMUL_R", SuperscalarInstructionType::IMUL_R, MacroOp::Imul_rr, 0); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IROR_C = SuperscalarInstructionInfo("IROR_C", SuperscalarInstructionType::IROR_C, MacroOp::Ror_ri, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C7 = SuperscalarInstructionInfo("IADD_C7", SuperscalarInstructionType::IADD_C7, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C7 = SuperscalarInstructionInfo("IXOR_C7", SuperscalarInstructionType::IXOR_C7, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C8 = SuperscalarInstructionInfo("IADD_C8", SuperscalarInstructionType::IADD_C8, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C8 = SuperscalarInstructionInfo("IXOR_C8", SuperscalarInstructionType::IXOR_C8, MacroOp::Xor_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IADD_C9 = SuperscalarInstructionInfo("IADD_C9", SuperscalarInstructionType::IADD_C9, MacroOp::Add_ri, -1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IXOR_C9 = SuperscalarInstructionInfo("IXOR_C9", SuperscalarInstructionType::IXOR_C9, MacroOp::Xor_ri, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMULH_R = SuperscalarInstructionInfo("IMULH_R", SuperscalarInstructionType::IMULH_R, IMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::ISMULH_R = SuperscalarInstructionInfo("ISMULH_R", SuperscalarInstructionType::ISMULH_R, ISMULH_R_ops_array, 1, 0, 1); + const SuperscalarInstructionInfo SuperscalarInstructionInfo::IMUL_RCP = SuperscalarInstructionInfo("IMUL_RCP", SuperscalarInstructionType::IMUL_RCP, IMUL_RCP_ops_array, 1, 1, -1); + + const SuperscalarInstructionInfo SuperscalarInstructionInfo::NOP = SuperscalarInstructionInfo("NOP"); + + //these are some of the options how to split a 16-byte window into 3 or 4 x86 instructions. + //RandomX uses instructions with a native size of 3 (sub, xor, mul, mov), 4 (lea, mul), 7 (xor, add immediate) or 10 bytes (mov 64-bit immediate). + //Slots with sizes of 8 or 9 bytes need to be padded with a nop instruction. + const int buffer0[] = { 4, 8, 4 }; + const int buffer1[] = { 7, 3, 3, 3 }; + const int buffer2[] = { 3, 7, 3, 3 }; + const int buffer3[] = { 4, 9, 3 }; + const int buffer4[] = { 4, 4, 4, 4 }; + const int buffer5[] = { 3, 3, 10 }; + + class DecoderBuffer { + public: + static const DecoderBuffer Default; + template + DecoderBuffer(const char* name, int index, const int(&arr)[N]) + : name_(name), index_(index), counts_(arr), opsCount_(N) {} + const int* getCounts() const { + return counts_; + } + int getSize() const { + return opsCount_; + } + int getIndex() const { + return index_; + } + const char* getName() const { + return name_; + } + const DecoderBuffer* fetchNext(SuperscalarInstructionType instrType, int cycle, int mulCount, Blake2Generator& gen) const { + //If the current RandomX instruction is "IMULH", the next fetch configuration must be 3-3-10 + //because the full 128-bit multiplication instruction is 3 bytes long and decodes to 2 uOPs on Intel CPUs. + //Intel CPUs can decode at most 4 uOPs per cycle, so this requires a 2-1-1 configuration for a total of 3 macro ops. + if (instrType == SuperscalarInstructionType::IMULH_R || instrType == SuperscalarInstructionType::ISMULH_R) + return &decodeBuffer3310; + + //To make sure that the multiplication port is saturated, a 4-4-4-4 configuration is generated if the number of multiplications + //is lower than the number of cycles. + if (mulCount < cycle + 1) + return &decodeBuffer4444; + + //If the current RandomX instruction is "IMUL_RCP", the next buffer must begin with a 4-byte slot for multiplication. + if(instrType == SuperscalarInstructionType::IMUL_RCP) + return (gen.getByte() & 1) ? &decodeBuffer484 : &decodeBuffer493; + + //Default: select a random fetch configuration. + return fetchNextDefault(gen); + } + private: + const char* name_; + int index_; + const int* counts_; + int opsCount_; + DecoderBuffer() : index_(-1) {} + static const DecoderBuffer decodeBuffer484; + static const DecoderBuffer decodeBuffer7333; + static const DecoderBuffer decodeBuffer3733; + static const DecoderBuffer decodeBuffer493; + static const DecoderBuffer decodeBuffer4444; + static const DecoderBuffer decodeBuffer3310; + static const DecoderBuffer* decodeBuffers[4]; + const DecoderBuffer* fetchNextDefault(Blake2Generator& gen) const { + return decodeBuffers[gen.getByte() & 3]; + } + }; + + const DecoderBuffer DecoderBuffer::decodeBuffer484 = DecoderBuffer("4,8,4", 0, buffer0); + const DecoderBuffer DecoderBuffer::decodeBuffer7333 = DecoderBuffer("7,3,3,3", 1, buffer1); + const DecoderBuffer DecoderBuffer::decodeBuffer3733 = DecoderBuffer("3,7,3,3", 2, buffer2); + const DecoderBuffer DecoderBuffer::decodeBuffer493 = DecoderBuffer("4,9,3", 3, buffer3); + const DecoderBuffer DecoderBuffer::decodeBuffer4444 = DecoderBuffer("4,4,4,4", 4, buffer4); + const DecoderBuffer DecoderBuffer::decodeBuffer3310 = DecoderBuffer("3,3,10", 5, buffer5); + + const DecoderBuffer* DecoderBuffer::decodeBuffers[4] = { + &DecoderBuffer::decodeBuffer484, + &DecoderBuffer::decodeBuffer7333, + &DecoderBuffer::decodeBuffer3733, + &DecoderBuffer::decodeBuffer493, + }; + + const DecoderBuffer DecoderBuffer::Default = DecoderBuffer(); + + const SuperscalarInstructionInfo* slot_3[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R }; + const SuperscalarInstructionInfo* slot_3L[] = { &SuperscalarInstructionInfo::ISUB_R, &SuperscalarInstructionInfo::IXOR_R, &SuperscalarInstructionInfo::IMULH_R, &SuperscalarInstructionInfo::ISMULH_R }; + const SuperscalarInstructionInfo* slot_4[] = { &SuperscalarInstructionInfo::IROR_C, &SuperscalarInstructionInfo::IADD_RS }; + const SuperscalarInstructionInfo* slot_7[] = { &SuperscalarInstructionInfo::IXOR_C7, &SuperscalarInstructionInfo::IADD_C7 }; + const SuperscalarInstructionInfo* slot_8[] = { &SuperscalarInstructionInfo::IXOR_C8, &SuperscalarInstructionInfo::IADD_C8 }; + const SuperscalarInstructionInfo* slot_9[] = { &SuperscalarInstructionInfo::IXOR_C9, &SuperscalarInstructionInfo::IADD_C9 }; + const SuperscalarInstructionInfo* slot_10 = &SuperscalarInstructionInfo::IMUL_RCP; + + static bool selectRegister(std::vector& availableRegisters, Blake2Generator& gen, int& reg) { + int index; + if (availableRegisters.size() == 0) + return false; + + if (availableRegisters.size() > 1) { + index = gen.getInt32() % availableRegisters.size(); + } + else { + index = 0; + } + reg = availableRegisters[index]; + return true; + } + + class RegisterInfo { + public: + RegisterInfo() : latency(0), lastOpGroup(SuperscalarInstructionType::INVALID), lastOpPar(-1), value(0) {} + int latency; + SuperscalarInstructionType lastOpGroup; + int lastOpPar; + int value; + }; + + //"SuperscalarInstruction" consists of one or more macro-ops + class SuperscalarInstruction { + public: + void toInstr(Instruction& instr) { //translate to a RandomX instruction format + instr.opcode = (int)getType(); + instr.dst = dst_; + instr.src = src_ >= 0 ? src_ : dst_; + instr.setMod(mod_); + instr.setImm32(imm32_); + } + + void createForSlot(Blake2Generator& gen, int slotSize, int fetchType, bool isLast, bool isFirst) { + switch (slotSize) + { + case 3: + //if this is the last slot, we can also select "IMULH" instructions + if (isLast) { + create(slot_3L[gen.getByte() & 3], gen); + } + else { + create(slot_3[gen.getByte() & 1], gen); + } + break; + case 4: + //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions + if (fetchType == 4 && !isLast) { + create(&SuperscalarInstructionInfo::IMUL_R, gen); + } + else { + create(slot_4[gen.getByte() & 1], gen); + } + break; + case 7: + create(slot_7[gen.getByte() & 1], gen); + break; + case 8: + create(slot_8[gen.getByte() & 1], gen); + break; + case 9: + create(slot_9[gen.getByte() & 1], gen); + break; + case 10: + create(slot_10, gen); + break; + default: + UNREACHABLE; + } + } + + void create(const SuperscalarInstructionInfo* info, Blake2Generator& gen) { + info_ = info; + reset(); + switch (info->getType()) + { + case SuperscalarInstructionType::ISUB_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IADD_RS; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IXOR_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IXOR_R; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IADD_RS: { + mod_ = gen.getByte(); + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IADD_RS; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IMUL_R: { + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IMUL_R; + groupParIsSource_ = true; + } break; + + case SuperscalarInstructionType::IROR_C: { + mod_ = 0; + do { + imm32_ = gen.getByte() & 63; + } while (imm32_ == 0); + opGroup_ = SuperscalarInstructionType::IROR_C; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: { + mod_ = 0; + imm32_ = gen.getInt32(); + opGroup_ = SuperscalarInstructionType::IADD_C7; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: { + mod_ = 0; + imm32_ = gen.getInt32(); + opGroup_ = SuperscalarInstructionType::IXOR_C7; + opGroupPar_ = -1; + } break; + + case SuperscalarInstructionType::IMULH_R: { + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::IMULH_R; + opGroupPar_ = gen.getInt32(); + } break; + + case SuperscalarInstructionType::ISMULH_R: { + canReuse_ = true; + mod_ = 0; + imm32_ = 0; + opGroup_ = SuperscalarInstructionType::ISMULH_R; + opGroupPar_ = gen.getInt32(); + } break; + + case SuperscalarInstructionType::IMUL_RCP: { + mod_ = 0; + do { + imm32_ = gen.getInt32(); + } while ((imm32_ & (imm32_ - 1)) == 0); + opGroup_ = SuperscalarInstructionType::IMUL_RCP; + opGroupPar_ = -1; + } break; + + default: + break; + } + } + + bool selectDestination(int cycle, bool allowChainedMul, RegisterInfo (®isters)[8], Blake2Generator& gen) { + /*if (allowChainedMultiplication && opGroup_ == SuperscalarInstructionType::IMUL_R) + std::cout << "Selecting destination with chained MUL enabled" << std::endl;*/ + std::vector availableRegisters; + //Conditions for the destination register: + // * value must be ready at the required cycle + // * cannot be the same as the source register unless the instruction allows it + // - this avoids optimizable instructions such as "xor r, r" or "sub r, r" + // * register cannot be multiplied twice in a row unless allowChainedMul is true + // - this avoids accumulation of trailing zeroes in registers due to excessive multiplication + // - allowChainedMul is set to true if an attempt to find source/destination registers failed (this is quite rare, but prevents a catastrophic failure of the generator) + // * either the last instruction applied to the register or its source must be different than this instruction + // - this avoids optimizable instruction sequences such as "xor r1, r2; xor r1, r2" or "ror r, C1; ror r, C2" or "add r, C1; add r, C2" + // * register r5 cannot be the destination of the IADD_RS instruction (limitation of the x86 lea instruction) + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle && (canReuse_ || i != src_) && (allowChainedMul || opGroup_ != SuperscalarInstructionType::IMUL_R || registers[i].lastOpGroup != SuperscalarInstructionType::IMUL_R) && (registers[i].lastOpGroup != opGroup_ || registers[i].lastOpPar != opGroupPar_) && (info_->getType() != SuperscalarInstructionType::IADD_RS || i != RegisterNeedsDisplacement)) + availableRegisters.push_back(i); + } + return selectRegister(availableRegisters, gen, dst_); + } + + bool selectSource(int cycle, RegisterInfo(®isters)[8], Blake2Generator& gen) { + std::vector availableRegisters; + //all registers that are ready at the cycle + for (unsigned i = 0; i < 8; ++i) { + if (registers[i].latency <= cycle) + availableRegisters.push_back(i); + } + //if there are only 2 available registers for IADD_RS and one of them is r5, select it as the source because it cannot be the destination + if (availableRegisters.size() == 2 && info_->getType() == SuperscalarInstructionType::IADD_RS) { + if (availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement) { + opGroupPar_ = src_ = RegisterNeedsDisplacement; + return true; + } + } + if (selectRegister(availableRegisters, gen, src_)) { + if (groupParIsSource_) + opGroupPar_ = src_; + return true; + } + return false; + } + + SuperscalarInstructionType getType() { + return info_->getType(); + } + int getSource() { + return src_; + } + int getDestination() { + return dst_; + } + SuperscalarInstructionType getGroup() { + return opGroup_; + } + int getGroupPar() { + return opGroupPar_; + } + + const SuperscalarInstructionInfo& getInfo() const { + return *info_; + } + + static const SuperscalarInstruction Null; + + private: + const SuperscalarInstructionInfo* info_; + int src_ = -1; + int dst_ = -1; + int mod_; + uint32_t imm32_; + SuperscalarInstructionType opGroup_; + int opGroupPar_; + bool canReuse_ = false; + bool groupParIsSource_ = false; + + void reset() { + src_ = dst_ = -1; + canReuse_ = groupParIsSource_ = false; + } + + SuperscalarInstruction(const SuperscalarInstructionInfo* info) : info_(info) { + } + }; + + const SuperscalarInstruction SuperscalarInstruction::Null = SuperscalarInstruction(&SuperscalarInstructionInfo::NOP); + + constexpr int CYCLE_MAP_SIZE = RANDOMX_SUPERSCALAR_MAX_LATENCY + 4; + constexpr int LOOK_FORWARD_CYCLES = 4; + constexpr int MAX_THROWAWAY_COUNT = 256; + + template + static int scheduleUop(ExecutionPort::type uop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle) { + //The scheduling here is done optimistically by checking port availability in order P5 -> P0 -> P1 to not overload + //port P1 (multiplication) by instructions that can go to any port. + for (; cycle < RandomX_CurrentConfig.SuperscalarLatency + 4; ++cycle) { + if ((uop & ExecutionPort::P5) != 0 && !portBusy[cycle][2]) { + if (commit) { + if (trace) std::cout << "; P5 at cycle " << cycle << std::endl; + portBusy[cycle][2] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P0) != 0 && !portBusy[cycle][0]) { + if (commit) { + if (trace) std::cout << "; P0 at cycle " << cycle << std::endl; + portBusy[cycle][0] = uop; + } + return cycle; + } + if ((uop & ExecutionPort::P1) != 0 && !portBusy[cycle][1]) { + if (commit) { + if (trace) std::cout << "; P1 at cycle " << cycle << std::endl; + portBusy[cycle][1] = uop; + } + return cycle; + } + } + return -1; + } + + template + static int scheduleMop(const MacroOp& mop, ExecutionPort::type(&portBusy)[CYCLE_MAP_SIZE][3], int cycle, int depCycle) { + //if this macro-op depends on the previous one, increase the starting cycle if needed + //this handles an explicit dependency chain in IMUL_RCP + if (mop.isDependent()) { + cycle = (cycle > depCycle) ? cycle : depCycle; + } + //move instructions are eliminated and don't need an execution unit + if (mop.isEliminated()) { + if (commit) + if (trace) std::cout << "; (eliminated)" << std::endl; + return cycle; + } + else if (mop.isSimple()) { + //this macro-op has only one uOP + return scheduleUop(mop.getUop1(), portBusy, cycle); + } + else { + //macro-ops with 2 uOPs are scheduled conservatively by requiring both uOPs to execute in the same cycle + for (; cycle < RandomX_CurrentConfig.SuperscalarLatency + 4; ++cycle) { + + int cycle1 = scheduleUop(mop.getUop1(), portBusy, cycle); + int cycle2 = scheduleUop(mop.getUop2(), portBusy, cycle); + + if (cycle1 == cycle2) { + if (commit) { + scheduleUop(mop.getUop1(), portBusy, cycle1); + scheduleUop(mop.getUop2(), portBusy, cycle2); + } + return cycle1; + } + } + } + + return -1; + } + + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen) { + + ExecutionPort::type portBusy[CYCLE_MAP_SIZE][3]; + memset(portBusy, 0, sizeof(portBusy)); + RegisterInfo registers[8]; + + const DecoderBuffer* decodeBuffer = &DecoderBuffer::Default; + SuperscalarInstruction currentInstruction = SuperscalarInstruction::Null; + int macroOpIndex = 0; + int codeSize = 0; + int macroOpCount = 0; + int cycle = 0; + int depCycle = 0; + int retireCycle = 0; + bool portsSaturated = false; + int programSize = 0; + int mulCount = 0; + int decodeCycle; + int throwAwayCount = 0; + + //decode instructions for RANDOMX_SUPERSCALAR_LATENCY cycles or until an execution port is saturated. + //Each decode cycle decodes 16 bytes of x86 code. + //Since a decode cycle produces on average 3.45 macro-ops and there are only 3 ALU ports, execution ports are always + //saturated first. The cycle limit is present only to guarantee loop termination. + //Program size is limited to SuperscalarMaxSize instructions. + for (decodeCycle = 0; decodeCycle < RandomX_CurrentConfig.SuperscalarLatency && !portsSaturated && programSize < 3 * RandomX_CurrentConfig.SuperscalarLatency + 2; ++decodeCycle) { + + //select a decode configuration + decodeBuffer = decodeBuffer->fetchNext(currentInstruction.getType(), decodeCycle, mulCount, gen); + if (trace) std::cout << "; ------------- fetch cycle " << cycle << " (" << decodeBuffer->getName() << ")" << std::endl; + + int bufferIndex = 0; + + //fill all instruction slots in the current decode buffer + while (bufferIndex < decodeBuffer->getSize()) { + int topCycle = cycle; + + //if we have issued all macro-ops for the current RandomX instruction, create a new instruction + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + if (portsSaturated || programSize >= 3 * RandomX_CurrentConfig.SuperscalarLatency + 2) + break; + //select an instruction so that the first macro-op fits into the current slot + currentInstruction.createForSlot(gen, decodeBuffer->getCounts()[bufferIndex], decodeBuffer->getIndex(), decodeBuffer->getSize() == bufferIndex + 1, bufferIndex == 0); + macroOpIndex = 0; + if (trace) std::cout << "; " << currentInstruction.getInfo().getName() << std::endl; + } + const MacroOp& mop = currentInstruction.getInfo().getOp(macroOpIndex); + if (trace) std::cout << mop.getName() << " "; + + //calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution + int scheduleCycle = scheduleMop(mop, portBusy, cycle, depCycle); + if (scheduleCycle < 0) { + if (trace) std::cout << "Unable to map operation '" << mop.getName() << "' to execution port (cycle " << cycle << ")" << std::endl; + //__debugbreak(); + portsSaturated = true; + break; + } + + //find a source register (if applicable) that will be ready when this instruction executes + if (macroOpIndex == currentInstruction.getInfo().getSrcOp()) { + int forward; + //if no suitable operand is ready, look up to LOOK_FORWARD_CYCLES forward + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectSource(scheduleCycle, registers, gen); ++forward) { + if (trace) std::cout << "; src STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + //if no register was found, throw the instruction away and try another one + if (forward == LOOK_FORWARD_CYCLES) { + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; + continue; + } + //abort this decode buffer + if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - source registers not available for operation " << currentInstruction.getInfo().getName() << std::endl; + currentInstruction = SuperscalarInstruction::Null; + break; + } + if (trace) std::cout << "; src = r" << currentInstruction.getSource() << std::endl; + } + //find a destination register that will be ready when this instruction executes + if (macroOpIndex == currentInstruction.getInfo().getDstOp()) { + int forward; + for (forward = 0; forward < LOOK_FORWARD_CYCLES && !currentInstruction.selectDestination(scheduleCycle, throwAwayCount > 0, registers, gen); ++forward) { + if (trace) std::cout << "; dst STALL at cycle " << cycle << std::endl; + ++scheduleCycle; + ++cycle; + } + if (forward == LOOK_FORWARD_CYCLES) { //throw instruction away + if (throwAwayCount < MAX_THROWAWAY_COUNT) { + throwAwayCount++; + macroOpIndex = currentInstruction.getInfo().getSize(); + if (trace) std::cout << "; THROW away " << currentInstruction.getInfo().getName() << std::endl; + //cycle = topCycle; + continue; + } + //abort this decode buffer + if (trace) std::cout << "Aborting at cycle " << cycle << " with decode buffer " << decodeBuffer->getName() << " - destination registers not available" << std::endl; + currentInstruction = SuperscalarInstruction::Null; + break; + } + if (trace) std::cout << "; dst = r" << currentInstruction.getDestination() << std::endl; + } + throwAwayCount = 0; + + //recalculate when the instruction can be scheduled for execution based on operand availability + scheduleCycle = scheduleMop(mop, portBusy, scheduleCycle, scheduleCycle); + + //calculate when the result will be ready + depCycle = scheduleCycle + mop.getLatency(); + + //if this instruction writes the result, modify register information + // RegisterInfo.latency - which cycle the register will be ready + // RegisterInfo.lastOpGroup - the last operation that was applied to the register + // RegisterInfo.lastOpPar - the last operation source value (-1 = constant, 0-7 = register) + if (macroOpIndex == currentInstruction.getInfo().getResultOp()) { + int dst = currentInstruction.getDestination(); + RegisterInfo& ri = registers[dst]; + retireCycle = depCycle; + ri.latency = retireCycle; + ri.lastOpGroup = currentInstruction.getGroup(); + ri.lastOpPar = currentInstruction.getGroupPar(); + if (trace) std::cout << "; RETIRED at cycle " << retireCycle << std::endl; + } + codeSize += mop.getSize(); + bufferIndex++; + macroOpIndex++; + macroOpCount++; + + //terminating condition + if (scheduleCycle >= RandomX_CurrentConfig.SuperscalarLatency) { + portsSaturated = true; + } + cycle = topCycle; + + //when all macro-ops of the current instruction have been issued, add the instruction into the program + if (macroOpIndex >= currentInstruction.getInfo().getSize()) { + currentInstruction.toInstr(prog(programSize++)); + mulCount += isMultiplication(currentInstruction.getType()); + } + } + ++cycle; + } + + double ipc = (macroOpCount / (double)retireCycle); + + memset(prog.asicLatencies, 0, sizeof(prog.asicLatencies)); + + //Calculate ASIC latency: + //Assumes 1 cycle latency for all operations and unlimited parallelization. + for (int i = 0; i < programSize; ++i) { + Instruction& instr = prog(i); + int latDst = prog.asicLatencies[instr.dst] + 1; + int latSrc = instr.dst != instr.src ? prog.asicLatencies[instr.src] + 1 : 0; + prog.asicLatencies[instr.dst] = (latDst > latSrc) ? latDst : latSrc; + } + + //address register is the register with the highest ASIC latency + int asicLatencyMax = 0; + int addressReg = 0; + for (int i = 0; i < 8; ++i) { + if (prog.asicLatencies[i] > asicLatencyMax) { + asicLatencyMax = prog.asicLatencies[i]; + addressReg = i; + } + prog.cpuLatencies[i] = registers[i].latency; + } + + prog.setSize(programSize); + prog.setAddressRegister(addressReg); + + prog.cpuLatency = retireCycle; + prog.asicLatency = asicLatencyMax; + prog.codeSize = codeSize; + prog.macroOps = macroOpCount; + prog.decodeCycles = decodeCycle; + prog.ipc = ipc; + prog.mulCount = mulCount; + + + /*if(INFO) std::cout << "; ALU port utilization:" << std::endl; + if (INFO) std::cout << "; (* = in use, _ = idle)" << std::endl; + + int portCycles = 0; + for (int i = 0; i < RandomX_Config.SuperscalarLatency + 4; ++i) { + std::cout << "; " << std::setw(3) << i << " "; + for (int j = 0; j < 3; ++j) { + std::cout << (portBusy[i][j] ? '*' : '_'); + portCycles += !!portBusy[i][j]; + } + std::cout << std::endl; + }*/ + } + + void executeSuperscalar(int_reg_t(&r)[8], SuperscalarProgram& prog, std::vector *reciprocals) { + for (unsigned j = 0; j < prog.getSize(); ++j) { + Instruction& instr = prog(j); + switch ((SuperscalarInstructionType)instr.opcode) + { + case SuperscalarInstructionType::ISUB_R: + r[instr.dst] -= r[instr.src]; + break; + case SuperscalarInstructionType::IXOR_R: + r[instr.dst] ^= r[instr.src]; + break; + case SuperscalarInstructionType::IADD_RS: + r[instr.dst] += r[instr.src] << instr.getModShift(); + break; + case SuperscalarInstructionType::IMUL_R: + r[instr.dst] *= r[instr.src]; + break; + case SuperscalarInstructionType::IROR_C: + r[instr.dst] = rotr64(r[instr.dst], instr.getImm32()); + break; + case SuperscalarInstructionType::IADD_C7: + case SuperscalarInstructionType::IADD_C8: + case SuperscalarInstructionType::IADD_C9: + r[instr.dst] += signExtend2sCompl(instr.getImm32()); + break; + case SuperscalarInstructionType::IXOR_C7: + case SuperscalarInstructionType::IXOR_C8: + case SuperscalarInstructionType::IXOR_C9: + r[instr.dst] ^= signExtend2sCompl(instr.getImm32()); + break; + case SuperscalarInstructionType::IMULH_R: + r[instr.dst] = mulh(r[instr.dst], r[instr.src]); + break; + case SuperscalarInstructionType::ISMULH_R: + r[instr.dst] = smulh(r[instr.dst], r[instr.src]); + break; + case SuperscalarInstructionType::IMUL_RCP: + if (reciprocals != nullptr) + r[instr.dst] *= (*reciprocals)[instr.getImm32()]; + else + r[instr.dst] *= randomx_reciprocal(instr.getImm32()); + break; + default: + UNREACHABLE; + } + } + } +} diff --git a/src/crypto/randomx/superscalar.hpp b/src/crypto/randomx/superscalar.hpp new file mode 100644 index 00000000..bc101c45 --- /dev/null +++ b/src/crypto/randomx/superscalar.hpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include "superscalar_program.hpp" +#include "blake2_generator.hpp" + +namespace randomx { + // Intel Ivy Bridge reference + enum class SuperscalarInstructionType { //uOPs (decode) execution ports latency code size + ISUB_R = 0, //1 p015 1 3 (sub) + IXOR_R = 1, //1 p015 1 3 (xor) + IADD_RS = 2, //1 p01 1 4 (lea) + IMUL_R = 3, //1 p1 3 4 (imul) + IROR_C = 4, //1 p05 1 4 (ror) + IADD_C7 = 5, //1 p015 1 7 (add) + IXOR_C7 = 6, //1 p015 1 7 (xor) + IADD_C8 = 7, //1+0 p015 1 7+1 (add+nop) + IXOR_C8 = 8, //1+0 p015 1 7+1 (xor+nop) + IADD_C9 = 9, //1+0 p015 1 7+2 (add+nop) + IXOR_C9 = 10, //1+0 p015 1 7+2 (xor+nop) + IMULH_R = 11, //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+mul+mov) + ISMULH_R = 12, //1+2+1 0+(p1,p5)+0 3 3+3+3 (mov+imul+mov) + IMUL_RCP = 13, //1+1 p015+p1 4 10+4 (mov+imul) + + COUNT = 14, + INVALID = -1 + }; + + void generateSuperscalar(SuperscalarProgram& prog, Blake2Generator& gen); + void executeSuperscalar(uint64_t(&r)[8], SuperscalarProgram& prog, std::vector *reciprocals = nullptr); +} \ No newline at end of file diff --git a/src/crypto/randomx/superscalar_program.hpp b/src/crypto/randomx/superscalar_program.hpp new file mode 100644 index 00000000..dbb0b163 --- /dev/null +++ b/src/crypto/randomx/superscalar_program.hpp @@ -0,0 +1,73 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "instruction.hpp" +#include "common.hpp" + +namespace randomx { + + class SuperscalarProgram { + public: + Instruction& operator()(int pc) { + return programBuffer[pc]; + } + uint32_t getSize() { + return size; + } + void setSize(uint32_t val) { + size = val; + } + int getAddressRegister() { + return addrReg; + } + void setAddressRegister(int val) { + addrReg = val; + } + + Instruction programBuffer[SuperscalarMaxSize]; + uint32_t size +#ifndef NDEBUG + = 0 +#endif + ; + int addrReg; + double ipc; + int codeSize; + int macroOps; + int decodeCycles; + int cpuLatency; + int asicLatency; + int mulCount; + int cpuLatencies[8]; + int asicLatencies[8]; + }; + +} \ No newline at end of file diff --git a/src/crypto/randomx/virtual_machine.cpp b/src/crypto/randomx/virtual_machine.cpp new file mode 100644 index 00000000..f7bc9ce4 --- /dev/null +++ b/src/crypto/randomx/virtual_machine.cpp @@ -0,0 +1,137 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include "virtual_machine.hpp" +#include "common.hpp" +#include "aes_hash.hpp" +#include "blake2/blake2.h" +#include "intrin_portable.h" +#include "allocator.hpp" + +randomx_vm::~randomx_vm() { + +} + +void randomx_vm::resetRoundingMode() { + rx_reset_float_state(); +} + +namespace randomx { + + static inline uint64_t getSmallPositiveFloatBits(uint64_t entropy) { + auto exponent = entropy >> 59; //0..31 + auto mantissa = entropy & mantissaMask; + exponent += exponentBias; + exponent &= exponentMask; + exponent <<= mantissaSize; + return exponent | mantissa; + } + + static inline uint64_t getStaticExponent(uint64_t entropy) { + auto exponent = constExponentBits; + exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits; + exponent <<= mantissaSize; + return exponent; + } + + static inline uint64_t getFloatMask(uint64_t entropy) { + constexpr uint64_t mask22bit = (1ULL << 22) - 1; + return (entropy & mask22bit) | getStaticExponent(entropy); + } + +} + +void randomx_vm::initialize() { + store64(®.a[0].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(0))); + store64(®.a[0].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(1))); + store64(®.a[1].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(2))); + store64(®.a[1].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(3))); + store64(®.a[2].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(4))); + store64(®.a[2].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(5))); + store64(®.a[3].lo, randomx::getSmallPositiveFloatBits(program.getEntropy(6))); + store64(®.a[3].hi, randomx::getSmallPositiveFloatBits(program.getEntropy(7))); + mem.ma = program.getEntropy(8) & CacheLineAlignMask; + mem.mx = program.getEntropy(10); + auto addressRegisters = program.getEntropy(12); + config.readReg0 = 0 + (addressRegisters & 1); + addressRegisters >>= 1; + config.readReg1 = 2 + (addressRegisters & 1); + addressRegisters >>= 1; + config.readReg2 = 4 + (addressRegisters & 1); + addressRegisters >>= 1; + config.readReg3 = 6 + (addressRegisters & 1); + datasetOffset = (program.getEntropy(13) % (DatasetExtraItems + 1)) * randomx::CacheLineSize; + store64(&config.eMask[0], randomx::getFloatMask(program.getEntropy(14))); + store64(&config.eMask[1], randomx::getFloatMask(program.getEntropy(15))); +} + +namespace randomx { + + alignas(16) volatile static rx_vec_i128 aesDummy; + + template + VmBase::~VmBase() { + Allocator::freeMemory(scratchpad, RANDOMX_SCRATCHPAD_L3_MAX_SIZE); + } + + template + void VmBase::allocate() { + if (datasetPtr == nullptr) + throw std::invalid_argument("Cache/Dataset not set"); + if (!softAes) { //if hardware AES is not supported, it's better to fail now than to return a ticking bomb + rx_vec_i128 tmp = rx_load_vec_i128((const rx_vec_i128*)&aesDummy); + tmp = rx_aesenc_vec_i128(tmp, tmp); + rx_store_vec_i128((rx_vec_i128*)&aesDummy, tmp); + } + scratchpad = (uint8_t*)Allocator::allocMemory(RANDOMX_SCRATCHPAD_L3_MAX_SIZE); + } + + template + void VmBase::getFinalResult(void* out, size_t outSize) { + hashAes1Rx4(scratchpad, ScratchpadSize, ®.a); + blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + } + + template + void VmBase::initScratchpad(void* seed) { + fillAes1Rx4(seed, ScratchpadSize, scratchpad); + } + + template + void VmBase::generateProgram(void* seed) { + fillAes4Rx4(seed, 128 + RandomX_CurrentConfig.ProgramSize * 8, &program); + } + + template class VmBase, false>; + template class VmBase, true>; + template class VmBase; + template class VmBase; +} \ No newline at end of file diff --git a/src/crypto/randomx/virtual_machine.hpp b/src/crypto/randomx/virtual_machine.hpp new file mode 100644 index 00000000..488994df --- /dev/null +++ b/src/crypto/randomx/virtual_machine.hpp @@ -0,0 +1,83 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "common.hpp" +#include "program.hpp" + +/* Global namespace for C binding */ +class randomx_vm { +public: + virtual ~randomx_vm() = 0; + virtual void allocate() = 0; + virtual void getFinalResult(void* out, size_t outSize) = 0; + virtual void setDataset(randomx_dataset* dataset) { } + virtual void setCache(randomx_cache* cache) { } + virtual void initScratchpad(void* seed) = 0; + virtual void run(void* seed) = 0; + void resetRoundingMode(); + randomx::RegisterFile *getRegisterFile() { + return ® + } + const void* getScratchpad() { + return scratchpad; + } + const randomx::Program& getProgram() + { + return program; + } +protected: + void initialize(); + alignas(64) randomx::Program program; + alignas(64) randomx::RegisterFile reg; + alignas(16) randomx::ProgramConfiguration config; + randomx::MemoryRegisters mem; + uint8_t* scratchpad; + union { + randomx_cache* cachePtr = nullptr; + randomx_dataset* datasetPtr; + }; + uint64_t datasetOffset; +}; + +namespace randomx { + + template + class VmBase : public randomx_vm { + public: + ~VmBase() override; + void allocate() override; + void initScratchpad(void* seed) override; + void getFinalResult(void* out, size_t outSize) override; + protected: + void generateProgram(void* seed); + }; + +} \ No newline at end of file diff --git a/src/crypto/randomx/virtual_memory.cpp b/src/crypto/randomx/virtual_memory.cpp new file mode 100644 index 00000000..925e8e86 --- /dev/null +++ b/src/crypto/randomx/virtual_memory.cpp @@ -0,0 +1,105 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "virtual_memory.hpp" + +#include + +#if defined(_WIN32) || defined(__CYGWIN__) +#include +#else +#ifdef __APPLE__ +#include +#endif +#include +#include +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif + +#if defined(_WIN32) || defined(__CYGWIN__) +std::string getErrorMessage(const char* function) { + LPSTR messageBuffer = nullptr; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, GetLastError(), MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&messageBuffer, 0, NULL); + std::string message(messageBuffer, size); + LocalFree(messageBuffer); + return std::string(function) + std::string(": ") + message; +} +#endif + +void* allocExecutableMemory(std::size_t bytes) { + void* mem; +#if defined(_WIN32) || defined(__CYGWIN__) + mem = VirtualAlloc(nullptr, bytes, MEM_COMMIT, PAGE_EXECUTE_READWRITE); + if (mem == nullptr) + throw std::runtime_error(getErrorMessage("allocExecutableMemory - VirtualAlloc")); +#else + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + throw std::runtime_error("allocExecutableMemory - mmap failed"); +#endif + return mem; +} + +constexpr std::size_t align(std::size_t pos, std::size_t align) { + return ((pos - 1) / align + 1) * align; +} + +void* allocLargePagesMemory(std::size_t bytes) { + void* mem; +#if defined(_WIN32) || defined(__CYGWIN__) + auto pageMinimum = GetLargePageMinimum(); + if (pageMinimum > 0) + mem = VirtualAlloc(NULL, align(bytes, pageMinimum), MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE); + else + throw std::runtime_error("allocLargePagesMemory - Large pages are not supported"); + if (mem == nullptr) + throw std::runtime_error(getErrorMessage("allocLargePagesMemory - VirtualAlloc")); +#else +#ifdef __APPLE__ + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0); +#elif defined(__FreeBSD__) + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER, -1, 0); +#else + mem = mmap(nullptr, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); +#endif + if (mem == MAP_FAILED) + throw std::runtime_error("allocLargePagesMemory - mmap failed"); +#endif + return mem; +} + +void freePagedMemory(void* ptr, std::size_t bytes) { +#if defined(_WIN32) || defined(__CYGWIN__) + VirtualFree(ptr, 0, MEM_RELEASE); +#else + munmap(ptr, bytes); +#endif +} diff --git a/src/crypto/randomx/virtual_memory.hpp b/src/crypto/randomx/virtual_memory.hpp new file mode 100644 index 00000000..d3b31db1 --- /dev/null +++ b/src/crypto/randomx/virtual_memory.hpp @@ -0,0 +1,35 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include + +void* allocExecutableMemory(std::size_t); +void* allocLargePagesMemory(std::size_t); +void freePagedMemory(void*, std::size_t); diff --git a/src/crypto/randomx/vm_compiled.cpp b/src/crypto/randomx/vm_compiled.cpp new file mode 100644 index 00000000..7f621a33 --- /dev/null +++ b/src/crypto/randomx/vm_compiled.cpp @@ -0,0 +1,60 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "vm_compiled.hpp" +#include "common.hpp" + +namespace randomx { + + static_assert(sizeof(MemoryRegisters) == 2 * sizeof(addr_t) + sizeof(uintptr_t), "Invalid alignment of struct randomx::MemoryRegisters"); + static_assert(sizeof(RegisterFile) == 256, "Invalid alignment of struct randomx::RegisterFile"); + + template + void CompiledVm::setDataset(randomx_dataset* dataset) { + datasetPtr = dataset; + } + + template + void CompiledVm::run(void* seed) { + VmBase::generateProgram(seed); + randomx_vm::initialize(); + compiler.generateProgram(program, config); + mem.memory = datasetPtr->memory + datasetOffset; + execute(); + } + + template + void CompiledVm::execute() { + compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations); + } + + template class CompiledVm, false>; + template class CompiledVm, true>; + template class CompiledVm; + template class CompiledVm; +} \ No newline at end of file diff --git a/src/crypto/randomx/vm_compiled.hpp b/src/crypto/randomx/vm_compiled.hpp new file mode 100644 index 00000000..856f00d8 --- /dev/null +++ b/src/crypto/randomx/vm_compiled.hpp @@ -0,0 +1,72 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include "virtual_machine.hpp" +#include "jit_compiler.hpp" +#include "allocator.hpp" +#include "dataset.hpp" + +namespace randomx { + + template + class CompiledVm : public VmBase { + public: + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(CompiledVm)); + } + void setDataset(randomx_dataset* dataset) override; + void run(void* seed) override; + + using VmBase::mem; + using VmBase::program; + using VmBase::config; + using VmBase::reg; + using VmBase::scratchpad; + using VmBase::datasetPtr; + using VmBase::datasetOffset; + protected: + void execute(); + + JitCompiler compiler; + }; + + using CompiledVmDefault = CompiledVm, true>; + using CompiledVmHardAes = CompiledVm, false>; + using CompiledVmLargePage = CompiledVm; + using CompiledVmLargePageHardAes = CompiledVm; +} diff --git a/src/crypto/randomx/vm_compiled_light.cpp b/src/crypto/randomx/vm_compiled_light.cpp new file mode 100644 index 00000000..c083f4aa --- /dev/null +++ b/src/crypto/randomx/vm_compiled_light.cpp @@ -0,0 +1,54 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "vm_compiled_light.hpp" +#include "common.hpp" +#include + +namespace randomx { + + template + void CompiledLightVm::setCache(randomx_cache* cache) { + cachePtr = cache; + mem.memory = cache->memory; + compiler.generateSuperscalarHash(cache->programs, cache->reciprocalCache); + } + + template + void CompiledLightVm::run(void* seed) { + VmBase::generateProgram(seed); + randomx_vm::initialize(); + compiler.generateProgramLight(program, config, datasetOffset); + CompiledVm::execute(); + } + + template class CompiledLightVm, false>; + template class CompiledLightVm, true>; + template class CompiledLightVm; + template class CompiledLightVm; +} \ No newline at end of file diff --git a/src/crypto/randomx/vm_compiled_light.hpp b/src/crypto/randomx/vm_compiled_light.hpp new file mode 100644 index 00000000..6af82bbe --- /dev/null +++ b/src/crypto/randomx/vm_compiled_light.hpp @@ -0,0 +1,64 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "vm_compiled.hpp" + +namespace randomx { + + template + class CompiledLightVm : public CompiledVm { + public: + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(CompiledLightVm)); + } + void setCache(randomx_cache* cache) override; + void setDataset(randomx_dataset* dataset) override { } + void run(void* seed) override; + + using CompiledVm::mem; + using CompiledVm::compiler; + using CompiledVm::program; + using CompiledVm::config; + using CompiledVm::cachePtr; + using CompiledVm::datasetOffset; + }; + + using CompiledLightVmDefault = CompiledLightVm, true>; + using CompiledLightVmHardAes = CompiledLightVm, false>; + using CompiledLightVmLargePage = CompiledLightVm; + using CompiledLightVmLargePageHardAes = CompiledLightVm; +} \ No newline at end of file diff --git a/src/crypto/randomx/vm_interpreted.cpp b/src/crypto/randomx/vm_interpreted.cpp new file mode 100644 index 00000000..236d3efe --- /dev/null +++ b/src/crypto/randomx/vm_interpreted.cpp @@ -0,0 +1,125 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "vm_interpreted.hpp" +#include "dataset.hpp" +#include "intrin_portable.h" +#include "reciprocal.h" + +namespace randomx { + + template + void InterpretedVm::setDataset(randomx_dataset* dataset) { + datasetPtr = dataset; + mem.memory = dataset->memory; + } + + template + void InterpretedVm::run(void* seed) { + VmBase::generateProgram(seed); + randomx_vm::initialize(); + execute(); + } + + template + void InterpretedVm::execute() { + + NativeRegisterFile nreg; + + for(unsigned i = 0; i < RegisterCountFlt; ++i) + nreg.a[i] = rx_load_vec_f128(®.a[i].lo); + + compileProgram(program, bytecode, nreg); + + uint32_t spAddr0 = mem.mx; + uint32_t spAddr1 = mem.ma; + + for(unsigned ic = 0; ic < RandomX_CurrentConfig.ProgramIterations; ++ic) { + uint64_t spMix = nreg.r[config.readReg0] ^ nreg.r[config.readReg1]; + spAddr0 ^= spMix; + spAddr0 &= ScratchpadL3Mask64; + spAddr1 ^= spMix >> 32; + spAddr1 &= ScratchpadL3Mask64; + + for (unsigned i = 0; i < RegistersCount; ++i) + nreg.r[i] ^= load64(scratchpad + spAddr0 + 8 * i); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + nreg.f[i] = rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * i); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + nreg.e[i] = maskRegisterExponentMantissa(config, rx_cvt_packed_int_vec_f128(scratchpad + spAddr1 + 8 * (RegisterCountFlt + i))); + + executeBytecode(bytecode, scratchpad, config); + + mem.mx ^= nreg.r[config.readReg2] ^ nreg.r[config.readReg3]; + mem.mx &= CacheLineAlignMask; + datasetPrefetch(datasetOffset + mem.mx); + datasetRead(datasetOffset + mem.ma, nreg.r); + std::swap(mem.mx, mem.ma); + + for (unsigned i = 0; i < RegistersCount; ++i) + store64(scratchpad + spAddr1 + 8 * i, nreg.r[i]); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + nreg.f[i] = rx_xor_vec_f128(nreg.f[i], nreg.e[i]); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + rx_store_vec_f128((double*)(scratchpad + spAddr0 + 16 * i), nreg.f[i]); + + spAddr0 = 0; + spAddr1 = 0; + } + + for (unsigned i = 0; i < RegistersCount; ++i) + store64(®.r[i], nreg.r[i]); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + rx_store_vec_f128(®.f[i].lo, nreg.f[i]); + + for (unsigned i = 0; i < RegisterCountFlt; ++i) + rx_store_vec_f128(®.e[i].lo, nreg.e[i]); + } + + template + void InterpretedVm::datasetRead(uint64_t address, int_reg_t(&r)[RegistersCount]) { + uint64_t* datasetLine = (uint64_t*)(mem.memory + address); + for (int i = 0; i < RegistersCount; ++i) + r[i] ^= datasetLine[i]; + } + + template + void InterpretedVm::datasetPrefetch(uint64_t address) { + rx_prefetch_nta(mem.memory + address); + } + + template class InterpretedVm, false>; + template class InterpretedVm, true>; + template class InterpretedVm; + template class InterpretedVm; +} \ No newline at end of file diff --git a/src/crypto/randomx/vm_interpreted.hpp b/src/crypto/randomx/vm_interpreted.hpp new file mode 100644 index 00000000..99c88852 --- /dev/null +++ b/src/crypto/randomx/vm_interpreted.hpp @@ -0,0 +1,75 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include "common.hpp" +#include "virtual_machine.hpp" +#include "bytecode_machine.hpp" +#include "intrin_portable.h" +#include "allocator.hpp" + +namespace randomx { + + template + class InterpretedVm : public VmBase, public BytecodeMachine { + public: + using VmBase::mem; + using VmBase::scratchpad; + using VmBase::program; + using VmBase::config; + using VmBase::reg; + using VmBase::datasetPtr; + using VmBase::datasetOffset; + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(InterpretedVm)); + } + void run(void* seed) override; + void setDataset(randomx_dataset* dataset) override; + protected: + virtual void datasetRead(uint64_t blockNumber, int_reg_t(&r)[RegistersCount]); + virtual void datasetPrefetch(uint64_t blockNumber); + private: + void execute(); + + InstructionByteCode bytecode[RANDOMX_PROGRAM_MAX_SIZE]; + }; + + using InterpretedVmDefault = InterpretedVm, true>; + using InterpretedVmHardAes = InterpretedVm, false>; + using InterpretedVmLargePage = InterpretedVm; + using InterpretedVmLargePageHardAes = InterpretedVm; +} \ No newline at end of file diff --git a/src/crypto/randomx/vm_interpreted_light.cpp b/src/crypto/randomx/vm_interpreted_light.cpp new file mode 100644 index 00000000..c54b32f6 --- /dev/null +++ b/src/crypto/randomx/vm_interpreted_light.cpp @@ -0,0 +1,55 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "vm_interpreted_light.hpp" +#include "dataset.hpp" + +namespace randomx { + + template + void InterpretedLightVm::setCache(randomx_cache* cache) { + cachePtr = cache; + mem.memory = cache->memory; + } + + template + void InterpretedLightVm::datasetRead(uint64_t address, int_reg_t(&r)[8]) { + uint32_t itemNumber = address / CacheLineSize; + int_reg_t rl[8]; + + initDatasetItem(cachePtr, (uint8_t*)rl, itemNumber); + + for (unsigned q = 0; q < 8; ++q) + r[q] ^= rl[q]; + } + + template class InterpretedLightVm, false>; + template class InterpretedLightVm, true>; + template class InterpretedLightVm; + template class InterpretedLightVm; +} diff --git a/src/crypto/randomx/vm_interpreted_light.hpp b/src/crypto/randomx/vm_interpreted_light.hpp new file mode 100644 index 00000000..02d678f6 --- /dev/null +++ b/src/crypto/randomx/vm_interpreted_light.hpp @@ -0,0 +1,61 @@ +/* +Copyright (c) 2018-2019, tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include "vm_interpreted.hpp" + +namespace randomx { + + template + class InterpretedLightVm : public InterpretedVm { + public: + using VmBase::mem; + using VmBase::cachePtr; + void* operator new(size_t size) { + void* ptr = AlignedAllocator::allocMemory(size); + if (ptr == nullptr) + throw std::bad_alloc(); + return ptr; + } + void operator delete(void* ptr) { + AlignedAllocator::freeMemory(ptr, sizeof(InterpretedLightVm)); + } + void setDataset(randomx_dataset* dataset) override { } + void setCache(randomx_cache* cache) override; + protected: + void datasetRead(uint64_t address, int_reg_t(&r)[8]) override; + void datasetPrefetch(uint64_t address) override { } + }; + + using InterpretedLightVmDefault = InterpretedLightVm, true>; + using InterpretedLightVmHardAes = InterpretedLightVm, false>; + using InterpretedLightVmLargePage = InterpretedLightVm; + using InterpretedLightVmLargePageHardAes = InterpretedLightVm; +} diff --git a/src/workers/OclThread.cpp b/src/workers/OclThread.cpp index c615f1cb..543e4769 100644 --- a/src/workers/OclThread.cpp +++ b/src/workers/OclThread.cpp @@ -45,6 +45,7 @@ static const char *kMemChunk = "mem_chunk"; static const char *kStridedIndex = "strided_index"; static const char *kUnroll = "unroll"; static const char *kWorksize = "worksize"; +static const char *kBFactor = "bfactor"; } @@ -64,6 +65,7 @@ xmrig::OclThread::OclThread(const rapidjson::Value &object) : setIndex(Json::getUint(object, kIndex)); setIntensity(Json::getUint(object, kIntensity)); setWorksize(Json::getUint(object, kWorksize)); + setBFactor(Json::getUint(object, kBFactor, 6)); setAffinity(Json::getInt64(object, kAffineToCpu, -1)); setMemChunk(Json::getInt(object, kMemChunk, m_ctx->memChunk)); setUnrollFactor(Json::getInt(object, kUnroll, m_ctx->unrollFactor)); @@ -195,6 +197,12 @@ void xmrig::OclThread::setWorksize(size_t worksize) } +void xmrig::OclThread::setBFactor(size_t bfactor) +{ + m_ctx->bfactor = bfactor; +} + + #ifdef APP_DEBUG void xmrig::OclThread::print() const { diff --git a/src/workers/OclThread.h b/src/workers/OclThread.h index f8ae0aa5..265f10cc 100644 --- a/src/workers/OclThread.h +++ b/src/workers/OclThread.h @@ -70,6 +70,7 @@ class OclThread : public xmrig::IThread void setThreadsCountByGPU(size_t threads); void setUnrollFactor(int unrollFactor); void setWorksize(size_t worksize); + void setBFactor(size_t bfactor); protected: # ifdef APP_DEBUG diff --git a/src/workers/OclWorker.cpp b/src/workers/OclWorker.cpp index 3b248553..6e9860a4 100644 --- a/src/workers/OclWorker.cpp +++ b/src/workers/OclWorker.cpp @@ -93,7 +93,15 @@ void OclWorker::start() const int64_t t = xmrig::steadyTimestamp(); - XMRRunJob(m_ctx, results, m_job.algorithm().variant()); +# ifdef XMRIG_ALGO_RANDOMX + if (m_job.algorithm().algo() == xmrig::RANDOM_X) { + RXRunJob(m_ctx, results, m_job.algorithm().variant()); + } + else +# endif + { + XMRRunJob(m_ctx, results, m_job.algorithm().variant()); + } for (size_t i = 0; i < results[0xFF]; i++) { *m_job.nonce() = results[i]; @@ -239,7 +247,15 @@ void OclWorker::setJob() { memcpy(m_blob, m_job.blob(), sizeof(m_blob)); - XMRSetJob(m_ctx, m_blob, m_job.size(), m_job.target(), m_job.algorithm().variant(), m_job.height()); +# ifdef XMRIG_ALGO_RANDOMX + if (m_job.algorithm().algo() == xmrig::RANDOM_X) { + RXSetJob(m_ctx, m_blob, m_job.size(), m_job.target(), m_job.seed_hash(), m_job.algorithm().variant()); + } + else +# endif + { + XMRSetJob(m_ctx, m_blob, m_job.size(), m_job.target(), m_job.algorithm().variant(), m_job.height()); + } } diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp index 47e73baf..97b66780 100644 --- a/src/workers/Workers.cpp +++ b/src/workers/Workers.cpp @@ -29,6 +29,7 @@ #include "amd/OclGPU.h" #include "api/Api.h" #include "common/log/Log.h" +#include "common/cpu/Cpu.h" #include "core/Config.h" #include "core/Controller.h" #include "crypto/CryptoNight.h" @@ -61,6 +62,16 @@ xmrig::Controller *Workers::m_controller = nullptr; xmrig::IJobResultListener *Workers::m_listener = nullptr; xmrig::Job Workers::m_job; +#ifdef XMRIG_ALGO_RANDOMX +uv_rwlock_t Workers::m_rx_dataset_lock; +randomx_cache *Workers::m_rx_cache = nullptr; +randomx_dataset *Workers::m_rx_dataset = nullptr; +randomx_vm *Workers::m_rx_vm = nullptr; +uint8_t Workers::m_rx_seed_hash[32] = {}; +xmrig::Variant Workers::m_rx_variant = xmrig::VARIANT_MAX; +std::atomic Workers::m_rx_dataset_init_thread_counter = {}; +#endif + struct JobBaton { @@ -200,6 +211,10 @@ bool Workers::start(xmrig::Controller *controller) m_threadsCount = threads.size(); m_hashrate = new Hashrate(m_threadsCount, controller); +# ifdef XMRIG_ALGO_RANDOMX + uv_rwlock_init(&m_rx_dataset_lock); +# endif + uv_mutex_init(&m_mutex); uv_rwlock_init(&m_rwlock); @@ -326,13 +341,61 @@ void Workers::onResult(uv_async_t *handle) return; } - cryptonight_ctx *ctx; - MemInfo info = Mem::create(&ctx, baton->jobs[0].algorithm().algo(), 1); + cryptonight_ctx *ctx = nullptr; + MemInfo info; for (const xmrig::Job &job : baton->jobs) { xmrig::JobResult result(job); - if (CryptoNight::hash(job, result, ctx)) { + bool ok; + +# ifdef XMRIG_ALGO_RANDOMX + if (job.algorithm().algo() == xmrig::RANDOM_X) { + uv_rwlock_wrlock(&m_rx_dataset_lock); + + if (m_rx_variant != job.algorithm().variant()) { + m_rx_variant = job.algorithm().variant(); + + switch (job.algorithm().variant()) { + case xmrig::VARIANT_RX_WOW: + randomx_apply_config(RandomX_WowneroConfig); + break; + case xmrig::VARIANT_RX_LOKI: + randomx_apply_config(RandomX_LokiConfig); + break; + default: + randomx_apply_config(RandomX_MoneroConfig); + break; + } + } + + if (!m_rx_vm) { + int flags = RANDOMX_FLAG_LARGE_PAGES | RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_JIT; + if (!xmrig::Cpu::info()->hasAES()) { + flags |= RANDOMX_FLAG_HARD_AES; + } + + m_rx_vm = randomx_create_vm(static_cast(flags), nullptr, m_rx_dataset); + if (!m_rx_vm) { + m_rx_vm = randomx_create_vm(static_cast(flags - RANDOMX_FLAG_LARGE_PAGES), nullptr, m_rx_dataset); + } + } + randomx_calculate_hash(m_rx_vm, job.blob(), job.size(), result.result); + + uv_rwlock_wrunlock(&m_rx_dataset_lock); + + ok = *reinterpret_cast(result.result + 24) < job.target(); + } + else +# endif + { + if (!ctx) { + info = Mem::create(&ctx, baton->jobs[0].algorithm().algo(), 1); + } + ok = CryptoNight::hash(job, result, ctx); + } + + if (ok) { baton->results.push_back(result); } else { @@ -340,7 +403,9 @@ void Workers::onResult(uv_async_t *handle) } } - Mem::release(&ctx, 1, info); + if (ctx) { + Mem::release(&ctx, 1, info); + } }, [](uv_work_t* req, int status) { JobBaton *baton = static_cast(req->data); @@ -379,3 +444,72 @@ void Workers::start(IWorker *worker) { worker->start(); } + +#ifdef XMRIG_ALGO_RANDOMX +randomx_dataset* Workers::getDataset(const uint8_t* seed_hash, xmrig::Variant variant) +{ + uv_rwlock_wrlock(&m_rx_dataset_lock); + + // Check if we need to update cache and dataset + if (m_rx_dataset && ((memcmp(m_rx_seed_hash, seed_hash, sizeof(m_rx_seed_hash)) == 0) && (m_rx_variant == variant))) { + uv_rwlock_wrunlock(&m_rx_dataset_lock); + return m_rx_dataset; + } + + if (!m_rx_dataset) { + randomx_dataset* dataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES); + if (!dataset) { + dataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT); + } + m_rx_cache = randomx_alloc_cache(static_cast(RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)); + if (!m_rx_cache) { + m_rx_cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); + } + m_rx_dataset = dataset; + } + + const uint32_t num_threads = std::thread::hardware_concurrency(); + LOG_INFO("Started updating RandomX dataset (%u threads)", num_threads); + + if (m_rx_variant != variant) { + switch (variant) { + case xmrig::VARIANT_RX_WOW: + randomx_apply_config(RandomX_WowneroConfig); + break; + case xmrig::VARIANT_RX_LOKI: + randomx_apply_config(RandomX_LokiConfig); + break; + default: + randomx_apply_config(RandomX_MoneroConfig); + break; + } + m_rx_variant = variant; + } + + if (memcmp(m_rx_seed_hash, seed_hash, sizeof(m_rx_seed_hash)) != 0) { + memcpy(m_rx_seed_hash, seed_hash, sizeof(m_rx_seed_hash)); + randomx_init_cache(m_rx_cache, m_rx_seed_hash, sizeof(m_rx_seed_hash)); + } + + std::vector threads; + threads.reserve(num_threads); + for (uint32_t i = 0; i < num_threads; ++i) { + const uint32_t a = (randomx_dataset_item_count() * i) / num_threads; + const uint32_t b = (randomx_dataset_item_count() * (i + 1)) / num_threads; + threads.emplace_back(randomx_init_dataset, m_rx_dataset, m_rx_cache, a, b - a); + } + for (uint32_t i = 0; i < num_threads; ++i) { + threads[i].join(); + } + + if (m_rx_vm) { + randomx_vm_set_dataset(m_rx_vm, m_rx_dataset); + } + + LOG_INFO("Finished updating RandomX dataset (%u threads)", num_threads); + + uv_rwlock_wrunlock(&m_rx_dataset_lock); + + return m_rx_dataset; +} +#endif diff --git a/src/workers/Workers.h b/src/workers/Workers.h index 0682b48e..98cfac05 100644 --- a/src/workers/Workers.h +++ b/src/workers/Workers.h @@ -31,6 +31,10 @@ #include #include +#ifdef XMRIG_ALGO_RANDOMX +# include +#endif + #if defined(__APPLE__) # include #else @@ -79,6 +83,10 @@ class Workers static void threadsSummary(rapidjson::Document &doc); # endif +# ifdef XMRIG_ALGO_RANDOMX + static randomx_dataset* getDataset(const uint8_t* seed_hash, xmrig::Variant variant); +# endif + private: static void onReady(void *arg); static void onResult(uv_async_t *handle); @@ -101,6 +109,16 @@ class Workers static xmrig::Controller *m_controller; static xmrig::IJobResultListener *m_listener; static xmrig::Job m_job; + +# ifdef XMRIG_ALGO_RANDOMX + static uv_rwlock_t m_rx_dataset_lock; + static randomx_cache *m_rx_cache; + static randomx_dataset *m_rx_dataset; + static randomx_vm *m_rx_vm; + static uint8_t m_rx_seed_hash[32]; + static xmrig::Variant m_rx_variant; + static std::atomic m_rx_dataset_init_thread_counter; +# endif };