Skip to content

Commit

Permalink
arm64: Use optimized memcmp
Browse files Browse the repository at this point in the history
This is an optimized memcmp for AArch64.  This is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, the inputs were mutually aligned and
unaligned using a byte loop.  The new version combines all these cases,
while small inputs of less than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes >6 times faster) on large inputs.
  • Loading branch information
Wilco Dijkstra authored and Ristovski committed Nov 18, 2023
1 parent 7a8a822 commit 7ea39c5
Showing 1 changed file with 118 additions and 234 deletions.
352 changes: 118 additions & 234 deletions arch/arm64/lib/memcmp.S
Original file line number Diff line number Diff line change
@@ -1,247 +1,131 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 ARM Ltd.
* Copyright (C) 2013 Linaro.
* Copyright (c) 2017 ARM Ltd
* All rights reserved.
*
* This code is based on glibc cortex strings work originally authored by Linaro
* be found @
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the company may not be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
* files/head:/src/aarch64/
* THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
*
* ARMv8-a, AArch64, unaligned accesses.
*/

/* includes here */
#include <linux/linkage.h>
#include <asm/assembler.h>

/*
* compare memory areas(when two memory areas' offset are different,
* alignment handled by the hardware)
*
* Parameters:
* x0 - const memory area 1 pointer
* x1 - const memory area 2 pointer
* x2 - the maximal compare byte length
* Returns:
* x0 - a compare result, maybe less than, equal to, or greater than ZERO
*/

/* Parameters and result. */
src1 .req x0
src2 .req x1
limit .req x2
result .req x0
#define src1 x0
#define src2 x1
#define limit x2
#define result w0

/* Internal variables. */
data1 .req x3
data1w .req w3
data2 .req x4
data2w .req w4
has_nul .req x5
diff .req x6
endloop .req x7
tmp1 .req x8
tmp2 .req x9
tmp3 .req x10
pos .req x11
limit_wd .req x12
mask .req x13

SYM_FUNC_START_WEAK_PI(memcmp)
cbz limit, .Lret0
eor tmp1, src1, src2
tst tmp1, #7
b.ne .Lmisaligned8
ands tmp1, src1, #7
b.ne .Lmutual_align
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
/*
* The input source addresses are at alignment boundary.
* Directly compare eight bytes each time.
*/
.Lloop_aligned:
ldr data1, [src1], #8
ldr data2, [src2], #8
.Lstart_realigned:
subs limit_wd, limit_wd, #1
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, cs /* Last Dword or differences. */
cbz endloop, .Lloop_aligned

/* Not reached the limit, must have found a diff. */
tbz limit_wd, #63, .Lnot_limit

/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
ands limit, limit, #7
b.eq .Lnot_limit
/*
* The remained bytes less than 8. It is needed to extract valid data
* from last eight bytes of the intended memory range.
*/
lsl limit, limit, #3 /* bytes-> bits. */
mov mask, #~0
CPU_BE( lsr mask, mask, limit )
CPU_LE( lsl mask, mask, limit )
bic data1, data1, mask
bic data2, data2, mask

orr diff, diff, mask
b .Lnot_limit

.Lmutual_align:
/*
* Sources are mutually aligned, but are not currently at an
* alignment boundary. Round down the addresses and then mask off
* the bytes that precede the start point.
*/
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
ldr data2, [src2], #8
/*
* We can not add limit with alignment offset(tmp1) here. Since the
* addition probably make the limit overflown.
*/
sub limit_wd, limit, #1/*limit != 0, so no underflow.*/
and tmp3, limit_wd, #7
lsr limit_wd, limit_wd, #3
add tmp3, tmp3, tmp1
add limit_wd, limit_wd, tmp3, lsr #3
add limit, limit, tmp1/* Adjust the limit for the extra. */

lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
neg tmp1, tmp1/* Bits to alignment -64. */
mov tmp2, #~0
/*mask off the non-intended bytes before the start address.*/
CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
/* Little-endian. Early bytes are at LSB. */
CPU_LE( lsr tmp2, tmp2, tmp1 )

orr data1, data1, tmp2
orr data2, data2, tmp2
b .Lstart_realigned

/*src1 and src2 have different alignment offset.*/
.Lmisaligned8:
cmp limit, #8
b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/

and tmp1, src1, #7
neg tmp1, tmp1
add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
and tmp2, src2, #7
neg tmp2, tmp2
add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
subs tmp3, tmp1, tmp2
csel pos, tmp1, tmp2, hi /*Choose the maximum.*/

sub limit, limit, pos
/*compare the proceeding bytes in the first 8 byte segment.*/
.Ltinycmp:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs pos, pos, #1
ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
b.eq .Ltinycmp
cbnz pos, 1f /*diff occurred before the last byte.*/
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define tmp1 x5

/* Small inputs of less than 8 bytes are handled separately. This allows the
main code to be sped up using unaligned loads since there are now at least
8 bytes to be compared. If the first 8 bytes are equal, align src1.
This ensures each iteration does at most one unaligned access even if both
src1 and src2 are unaligned, and mutually aligned inputs behave as if
aligned. After the main loop, process the last 8 bytes using unaligned
accesses. */

.p2align 6
WEAK(memcmp)
subs limit, limit, 8
b.lo .Lless8

/* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
and tmp1, src1, 7
add limit, limit, tmp1
cmp data1, data2
bne .Lreturn

/* Align src1 and adjust src2 with bytes not yet done. */
sub src1, src1, tmp1
sub src2, src2, tmp1

subs limit, limit, 8
b.ls .Llast_bytes

/* Loop performing 8 bytes per iteration using aligned src1.
Limit is pre-decremented by 8 and must be larger than zero.
Exit if <= 8 bytes left to do or if the data is not equal. */
.p2align 4
.Lloop8:
ldr data1, [src1], 8
ldr data2, [src2], 8
subs limit, limit, 8
ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
b.eq .Lloop8

cmp data1, data2
bne .Lreturn

/* Compare last 1-8 bytes using unaligned access. */
.Llast_bytes:
ldr data1, [src1, limit]
ldr data2, [src2, limit]

/* Compare data bytes and set return value to 0, -1 or 1. */
.Lreturn:
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
.Lret_eq:
cset result, ne
cneg result, result, lo
ret

.p2align 4
/* Compare up to 8 bytes. Limit is [-8..-1]. */
.Lless8:
adds limit, limit, 4
b.lo .Lless4
ldr data1w, [src1], 4
ldr data2w, [src2], 4
cmp data1w, data2w
b.eq .Lstart_align
1:
sub result, data1, data2
ret

.Lstart_align:
lsr limit_wd, limit, #3
cbz limit_wd, .Lremain8

ands xzr, src1, #7
b.eq .Lrecal_offset
/*process more leading bytes to make src1 aligned...*/
add src1, src1, tmp3 /*backwards src1 to alignment boundary*/
add src2, src2, tmp3
sub limit, limit, tmp3
lsr limit_wd, limit, #3
cbz limit_wd, .Lremain8
/*load 8 bytes from aligned SRC1..*/
ldr data1, [src1], #8
ldr data2, [src2], #8

subs limit_wd, limit_wd, #1
eor diff, data1, data2 /*Non-zero if differences found.*/
csinv endloop, diff, xzr, ne
cbnz endloop, .Lunequal_proc
/*How far is the current SRC2 from the alignment boundary...*/
and tmp3, tmp3, #7

.Lrecal_offset:/*src1 is aligned now..*/
neg pos, tmp3
.Lloopcmp_proc:
/*
* Divide the eight bytes into two parts. First,backwards the src2
* to an alignment boundary,load eight bytes and compare from
* the SRC2 alignment boundary. If all 8 bytes are equal,then start
* the second part's comparison. Otherwise finish the comparison.
* This special handle can garantee all the accesses are in the
* thread/task space in avoid to overrange access.
*/
ldr data1, [src1,pos]
ldr data2, [src2,pos]
eor diff, data1, data2 /* Non-zero if differences found. */
cbnz diff, .Lnot_limit

/*The second part process*/
ldr data1, [src1], #8
ldr data2, [src2], #8
eor diff, data1, data2 /* Non-zero if differences found. */
subs limit_wd, limit_wd, #1
csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
cbz endloop, .Lloopcmp_proc
.Lunequal_proc:
cbz diff, .Lremain8

/* There is difference occurred in the latest comparison. */
.Lnot_limit:
/*
* For little endian,reverse the low significant equal bits into MSB,then
* following CLZ can find how many equal bits exist.
*/
CPU_LE( rev diff, diff )
CPU_LE( rev data1, data1 )
CPU_LE( rev data2, data2 )

/*
* The MS-non-zero bit of DIFF marks either the first bit
* that is different, or the end of the significant data.
* Shifting left now will bring the critical information into the
* top bits.
*/
clz pos, diff
lsl data1, data1, pos
lsl data2, data2, pos
/*
* We need to zero-extend (char is unsigned) the value and then
* perform a signed subtraction.
*/
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret

.Lremain8:
/* Limit % 8 == 0 =>. all data are equal.*/
ands limit, limit, #7
b.eq .Lret0

.Ltiny8proc:
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1

ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */
b.eq .Ltiny8proc
sub result, data1, data2
ret
.Lret0:
mov result, #0
b.ne .Lreturn
sub limit, limit, 4
.Lless4:
adds limit, limit, 4
beq .Lret_eq
.Lbyte_loop:
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq .Lbyte_loop
sub result, data1w, data2w
ret
SYM_FUNC_END_PI(memcmp)
EXPORT_SYMBOL_NOKASAN(memcmp)
ENDPIPROC(memcmp)

0 comments on commit 7ea39c5

Please sign in to comment.