-
Notifications
You must be signed in to change notification settings - Fork 30k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
deps: update zlib to 1.3.0.1-motley-24c07df
PR-URL: #52199 Reviewed-By: Marco Ippolito <[email protected]> Reviewed-By: Luigi Pinca <[email protected]>
- Loading branch information
1 parent
af48641
commit 38161c3
Showing
8 changed files
with
183 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,6 +41,9 @@ | |
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates | ||
* of the adler s1 s2 of uint32_t type (see adler32.c). | ||
*/ | ||
/* Copyright (C) 2023 SiFive, Inc. All rights reserved. | ||
* For conditions of distribution and use, see copyright notice in zlib.h | ||
*/ | ||
|
||
#include "adler32_simd.h" | ||
|
||
|
@@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ | |
return s1 | (s2 << 16); | ||
} | ||
|
||
#elif defined(ADLER32_SIMD_RVV) | ||
#include <riscv_vector.h> | ||
/* adler32_rvv.c - RVV version of Adler-32 | ||
* RVV 1.0 code contributed by Alex Chiang <[email protected]> | ||
* on https://github.com/zlib-ng/zlib-ng/pull/1532 | ||
* Port from Simon Hosie's fork: | ||
* https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1 | ||
*/ | ||
|
||
/*
 * adler32_simd_ (RVV): update an Adler-32 checksum using RISC-V Vector
 * intrinsics.
 *
 *   adler - running checksum; the low 16 bits hold the byte sum (s1), the
 *           high 16 bits hold the sum of byte sums (s2).
 *   buf   - input bytes.
 *   len   - number of bytes to consume from buf.
 *
 * Returns the updated checksum, s1 | (s2 << 16), with both halves reduced
 * modulo BASE.
 *
 * Layout: the input is consumed in rows of vl bytes, so lane j sees bytes
 * j, j + vl, j + 2*vl, ...  Per lane we keep
 *   v_buf32_accu        - sum of the bytes seen so far, and
 *   v_adler32_prev_accu - sum, over rows, of the byte sum seen before the row.
 * The scalar s1/s2 are recovered from those two vectors at the end.
 *
 * All vector arithmetic is on 32-bit lanes and wraps modulo 2^32, which is
 * not a multiple of BASE, so every accumulator is reduced modulo BASE before
 * it can wrap.
 */
uint32_t ZLIB_INTERNAL adler32_simd_(  /* RVV */
    uint32_t adler,
    const unsigned char *buf,
    unsigned long len)
{
    /* split Adler-32 into component sums */
    uint32_t sum2 = (adler >> 16) & 0xffff;
    adler &= 0xffff;

    size_t left = len;
    size_t vl = __riscv_vsetvlmax_e8m1();
    vl = vl > 256 ? 256 : vl;
    vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl);
    vuint16m2_t v_buf16_accu;

    /*
     * We accumulate 8-bit data, and to prevent overflow we ultimately need a
     * 32-bit accumulator. Widening every byte straight to 32 bits is slow, so
     * bytes are first summed into a 16-bit accumulator and folded into the
     * 32-bit one once per block.
     *
     * block_size is the largest multiple of vl that is <= 256: a 16-bit lane
     * can absorb at most 256 bytes without overflowing
     * (255 * 256 <= UINT16_MAX).
     */
    size_t block_size = (256 / vl) * vl;
    size_t nmax_limit = (NMAX / block_size);
    size_t cnt = 0;
    while (left >= block_size) {
        v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
        size_t subprob = block_size;
        while (subprob > 0) {
            vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
            /* add the in-block byte sum seen *before* this row, then absorb the row */
            v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
            v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
            buf += vl;
            subprob -= vl;
        }
        /* bytes from earlier blocks precede every row of this block */
        v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl);
        v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
        left -= block_size;
        /*
         * Do the modulo once per NMAX-sized run of blocks. Both accumulators
         * must be reduced: v_buf32_accu feeds the multiply-accumulate above,
         * so letting it grow with the input length would eventually wrap
         * v_adler32_prev_accu between reductions.
         */
        if (++cnt >= nmax_limit) {
            v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
            v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);
            cnt = 0;
        }
    }
    /* fewer than block_size (<= 256) bytes remain, so a 16-bit accumulator is safe */
    v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl);
    size_t res = left;
    while (left >= vl) {
        vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl);
        v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl);
        v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl);
        buf += vl;
        left -= vl;
    }
    /* res / vl is the number of full rows consumed by the loop above */
    v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl);
    v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl);
    v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl);
    v_buf32_accu = __riscv_vremu_vx_u32m4(v_buf32_accu, BASE, vl);

    /*
     * A byte in lane j of a row is followed by (vl - j) - 1 bytes of its own
     * row plus vl bytes for every later row, hence the per-lane weight
     * (vl - j) on the byte sum and vl on the "previous rows" sum.
     */
    vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl);
    vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl);
    vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl);

    v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl);
    /* reduce each lane first so the horizontal add below cannot wrap */
    v_sum32_accu = __riscv_vremu_vx_u32m4(v_sum32_accu, BASE, vl);

    vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl);
    uint32_t sum2_sum = __riscv_vmv_x_s_u32m1_u32(v_sum2_sum);

    /*
     * The incoming s1 contributes once per vector-processed byte. Combine in
     * 64 bits with the byte count reduced modulo BASE: adler * count does not
     * fit in 32 bits once count exceeds 64 KiB.
     */
    uint64_t sum2_wide = (uint64_t)sum2 + sum2_sum
                       + (uint64_t)adler * (uint64_t)((len - left) % BASE);
    sum2 = (uint32_t)(sum2_wide % BASE);

    vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl);
    v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl);
    uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum);

    /* reduce before the scalar tail so sum2 += adler stays small */
    adler = (adler + adler_sum) % BASE;

    /* fewer than vl bytes remain; finish them one at a time */
    while (left--) {
        adler += *buf++;
        sum2 += adler;
    }

    sum2 %= BASE;
    adler %= BASE;

    return adler | (sum2 << 16);
}
|
||
#endif /* ADLER32_SIMD_SSSE3 */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters