deps: update zlib to 1.3.0.1-motley-24c07df

PR-URL: #52199 Reviewed-By: Marco Ippolito <[email protected]> Reviewed-By: Luigi Pinca <[email protected]>
nodejs · Mar 27, 2024 · 38161c3 · 38161c3
1 parent af48641
commit 38161c3
Show file tree

Hide file tree

Showing 8 changed files with 183 additions and 26 deletions.
diff --git a/deps/zlib/CMakeLists.txt b/deps/zlib/CMakeLists.txt
@@ -74,6 +74,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS)
 
     SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto")
   endif()
+
+  if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
+    add_definitions(-DRISCV_RVV)
+    add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
+    add_definitions(-DADLER32_SIMD_RVV)
+    #TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
+    # Required by CPU features detection code.
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")
+  endif()
+
 endif()
 
 #
@@ -180,20 +190,28 @@ set(ZLIB_SRCS
 # Update list of source files if optimizations were enabled
 #============================================================================
 if (ENABLE_SIMD_OPTIMIZATIONS)
-  list(REMOVE_ITEM ZLIB_SRCS inflate.c)
-
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
-  list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)
-
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
-  list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
+  if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
+    message("RISCVV: Add optimizations.")
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
+  else()
+    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
+
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h)
+
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c)
+  endif()
 endif()
 
 # parse the full version number from zlib.h and include in ZLIB_FULL_VERSION

diff --git a/deps/zlib/adler32.c b/deps/zlib/adler32.c
@@ -58,20 +58,24 @@
 #endif
 
 #include "cpu_features.h"
-#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV)
 #include "adler32_simd.h"
 #endif
 
 /* ========================================================================= */
 uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
     unsigned long sum2;
     unsigned n;
-
+    /* TODO(cavalcantii): verify if this lengths are optimal for current CPUs. */
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
+    || defined(ADLER32_SIMD_RVV)
 #if defined(ADLER32_SIMD_SSSE3)
     if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
-        return adler32_simd_(adler, buf, len);
 #elif defined(ADLER32_SIMD_NEON)
     if (buf != Z_NULL && len >= 64)
+#elif defined(ADLER32_SIMD_RVV)
+    if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv)
+#endif
         return adler32_simd_(adler, buf, len);
 #endif
 
@@ -90,7 +94,8 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) {
         return adler | (sum2 << 16);
     }
 
-#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
+#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \
+    || defined(RISCV_RVV)
     /*
      * Use SIMD to compute the adler32. Since this function can be
      * freely used, check CPU features here. zlib convention is to

diff --git a/deps/zlib/adler32_simd.c b/deps/zlib/adler32_simd.c
@@ -41,6 +41,9 @@
  * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
  * of the adler s1 s2 of uint32_t type (see adler32.c).
  */
+/* Copyright (C) 2023 SiFive, Inc. All rights reserved.
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
 
 #include "adler32_simd.h"
 
@@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
     return s1 | (s2 << 16);
 }
 
+#elif defined(ADLER32_SIMD_RVV)
+#include <riscv_vector.h>
+/* adler32_rvv.c - RVV version of Adler-32
+ * RVV 1.0 code contributed by Alex Chiang <[email protected]>
+ * on https://github.com/zlib-ng/zlib-ng/pull/1532
+ * Port from Simon Hosie's fork:
+ * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1
+ */
+
+uint32_t ZLIB_INTERNAL adler32_simd_(  /* RVV */
+    uint32_t adler,
+    const unsigned char *buf,
+    unsigned long len)
+{
+    /* split Adler-32 into component sums */
+    uint32_t sum2 = (adler >> 16) & 0xffff;
+    adler &= 0xffff;
+
+    size_t left = len;
+    size_t vl = __riscv_vsetvlmax_e8m1();
+    vl = vl > 256 ? 256 : vl;
+    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
+    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
+    vuint16m2_t v_buf16_accu;
+
+    /*
+     * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator.
+     * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit
+     * accumulators to boost performance.
+     *
+     * The block_size is the largest multiple of vl that <= 256, because overflow would occur when
+     * vl > 256 (255 * 256 <= UINT16_MAX).
+     *
+     * We accumulate 8-bit data into a 16-bit accumulator and then
+     * move the data into the 32-bit accumulator at the last iteration.
+     */
+    size_t block_size = (256 / vl) * vl;
+    size_t nmax_limit = (NMAX / block_size);
+    size_t cnt = 0;
+    while (left >= block_size) {
+        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+        size_t subprob = block_size;
+        while (subprob > 0) {
+            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+            buf += vl;
+            subprob -= vl;
+        }
+        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
+        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+        left -= block_size;
+        /* do modulo once each block of NMAX size */
+        if (++cnt >= nmax_limit) {
+            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+            cnt = 0;
+        }
+    }
+    /* the left len <= 256 now, we can use 16-bit accum safely */
+    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
+    size_t res = left;
+    while (left >= vl) {
+        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
+        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
+        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
+        buf += vl;
+        left -= vl;
+    }
+    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
+    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
+    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
+
+    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
+    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
+    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);
+
+    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
+
+    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
+    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
+    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);
+
+    sum2 += (sum2_sum + adler * (len - left));
+
+    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
+    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
+    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);
+
+    adler += adler_sum;
+
+    while (left--) {
+        adler += *buf++;
+        sum2 += adler;
+    }
+
+    sum2 %= BASE;
+    adler %= BASE;
+
+    return adler | (sum2 << 16);
+}
+
 #endif  /* ADLER32_SIMD_SSSE3 */
diff --git a/deps/zlib/cpu_features.c b/deps/zlib/cpu_features.c
@@ -33,9 +33,13 @@ int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
 int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
 int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
 
+int ZLIB_INTERNAL riscv_cpu_enable_rvv = 0;
+int ZLIB_INTERNAL riscv_cpu_enable_vclmul = 0;
+
 #ifndef CPU_NO_SIMD
 
-#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
+#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
+    defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS)
 #include <pthread.h>
 #endif
 
@@ -62,7 +66,10 @@ int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0;
 static void _cpu_check_features(void);
 #endif
 
-#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS)
+#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \
+    defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || \
+    defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS) || \
+    defined(RISCV_RVV)
 #if !defined(ARMV8_OS_MACOS)
 // _cpu_check_features() doesn't need to do anything on mac/arm since all
 // features are known at build time, so don't call it.
@@ -184,6 +191,23 @@ static void _cpu_check_features(void)
     x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040;
 #endif
 }
+#endif // x86 & NO_SIMD
+
+#elif defined(RISCV_RVV)
+#include <sys/auxv.h>
+
+#ifndef ZLIB_HWCAP_RVV
+#define ZLIB_HWCAP_RVV (1 << ('v' - 'a'))
 #endif
-#endif
-#endif
+
+/* TODO(cavalcantii)
+ * - add support for Android@RISCV i.e. __riscv_hwprobe().
+ * - detect vclmul (crypto extensions).
+ */
+static void _cpu_check_features(void)
+{
+  unsigned long features = getauxval(AT_HWCAP);
+  riscv_cpu_enable_rvv = !!(features & ZLIB_HWCAP_RVV);
+}
+#endif // ARM | x86 | RISCV
+#endif // NO SIMD CPU
diff --git a/deps/zlib/cpu_features.h b/deps/zlib/cpu_features.h
@@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3;
 extern int x86_cpu_enable_simd;
 extern int x86_cpu_enable_avx512;
 
+extern int riscv_cpu_enable_rvv;
+extern int riscv_cpu_enable_vclmul;
+
 void cpu_check_features(void);
diff --git a/deps/zlib/crc32.c b/deps/zlib/crc32.c
@@ -706,7 +706,8 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
      * place to cache CPU features if needed for those later, more
      * interesting crc32() calls.
      */
-#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
+#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
+    || defined(RISCV_RVV)
     /*
      * Since this routine can be freely used, check CPU features here.
      */
@@ -1085,7 +1086,8 @@ unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
     /* Some bots compile with optimizations disabled, others will emulate
      * ARM on x86 and other weird combinations.
      */
-#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
+#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \
+    || defined(RISCV_RVV)
     /* We got to verify CPU features, so exploit the common usage pattern
      * of calling this function with Z_NULL for an initial valid crc value.
      * This allows to cache the result of the feature check and avoid extraneous

diff --git a/deps/zlib/deflate.c b/deps/zlib/deflate.c
@@ -401,7 +401,8 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method,
     // for all wrapper formats (e.g. RAW, ZLIB, GZIP).
     // Feature detection is not triggered while using RAW mode (i.e. we never
     // call crc32() with a NULL buffer).
-#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
+#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) \
+    || defined(RISCV_RVV)
     cpu_check_features();
 #endif
 

diff --git a/src/zlib_version.h b/src/zlib_version.h
@@ -2,5 +2,5 @@
 // Refer to tools/dep_updaters/update-zlib.sh
 #ifndef SRC_ZLIB_VERSION_H_
 #define SRC_ZLIB_VERSION_H_
-#define ZLIB_VERSION "1.3.0.1-motley-24342f6"
+#define ZLIB_VERSION "1.3.0.1-motley-24c07df"
 #endif  // SRC_ZLIB_VERSION_H_