Skip to content

Commit

Permalink
windows: match the aligned alloc to avoid Windows crash, see 463414c
Browse files Browse the repository at this point in the history
  • Loading branch information
mratsim committed Dec 3, 2024
1 parent 0aeb09a commit 1b3a486
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 40 deletions.
32 changes: 16 additions & 16 deletions constantine/math/elliptic/ec_multi_scalar_mul.nim
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff](
const numBuckets = 1 shl c - 1 # bucket 0 is unused
const numWindows = bits.ceilDiv_vartime(c)

let miniMSMs = allocHeapArray(EC, numWindows)
let buckets = allocHeapArray(EC, numBuckets)
let miniMSMs = allocHeapArrayAligned(EC, numWindows, alignment = 64)
let buckets = allocHeapArrayAligned(EC, numBuckets, alignment = 64)

# Algorithm
# ---------
Expand Down Expand Up @@ -91,8 +91,8 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff](

# Cleanup
# -------
buckets.freeHeap()
miniMSMs.freeHeap()
buckets.freeHeapAligned()
miniMSMs.freeHeapAligned()

func multiScalarMul_reference_dispatch_vartime[bits: static int, EC, ECaff](
r: var EC,
Expand Down Expand Up @@ -151,7 +151,7 @@ func multiScalarMul_reference_vartime*[F, EC, ECaff](
coefs_big.batchFromField(coefs, n)
r.multiScalarMul_reference_vartime(coefs_big, points, n)

freeHeapAligned(coefs_big)
coefs_big.freeHeapAligned()

func multiScalarMul_reference_vartime*[EC, ECaff](
r: var EC,
Expand Down Expand Up @@ -264,7 +264,7 @@ func msmImpl_vartime[bits: static int, EC, ECaff](
# -----
const numBuckets = 1 shl (c-1)

let buckets = allocHeapArray(EC, numBuckets)
let buckets = allocHeapArrayAligned(EC, numBuckets, alignment = 64)
for i in 0 ..< numBuckets:
buckets[i].setNeutral()

Expand Down Expand Up @@ -293,7 +293,7 @@ func msmImpl_vartime[bits: static int, EC, ECaff](

# Cleanup
# -------
buckets.freeHeap()
buckets.freeHeapAligned()

# Multi scalar multiplication with batched affine additions
# -----------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -357,8 +357,8 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff](
# Setup
# -----
const (numBuckets, queueLen) = c.deriveSchedulerConstants()
let buckets = allocHeap(Buckets[numBuckets, EC, ECaff])
let sched = allocHeap(Scheduler[numBuckets, queueLen, EC, ECaff])
let buckets = allocHeapAligned(Buckets[numBuckets, EC, ECaff], alignment = 64)
let sched = allocHeapAligned(Scheduler[numBuckets, queueLen, EC, ECaff], alignment = 64)
sched.init(points, buckets, 0, numBuckets.int32)

# Algorithm
Expand Down Expand Up @@ -389,8 +389,8 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff](

# Cleanup
# -------
sched.freeHeap()
buckets.freeHeap()
sched.freeHeapAligned()
buckets.freeHeapAligned()

# Endomorphism acceleration
# -----------------------------------------------------------------------------------------------------------------------
Expand All @@ -410,8 +410,8 @@ proc applyEndomorphism[bits: static int, ECaff](
else: ECaff.G

const L = ECaff.getScalarField().bits().computeEndoRecodedLength(M)
let splitCoefs = allocHeapArray(array[M, BigInt[L]], N)
let endoBasis = allocHeapArray(array[M, ECaff], N)
let splitCoefs = allocHeapArrayAligned(array[M, BigInt[L]], N, alignment = 64)
let endoBasis = allocHeapArrayAligned(array[M, ECaff], N, alignment = 64)

for i in 0 ..< N:
var negatePoints {.noinit.}: array[M, SecretBool]
Expand Down Expand Up @@ -448,8 +448,8 @@ template withEndo[coefsBits: static int, EC, ECaff](
# Given that bits and N changed, we are able to use a bigger `c`
# but it has no significant impact on performance
msmProc(r, endoCoefs, endoPoints, endoN, c)
freeHeap(endoCoefs)
freeHeap(endoPoints)
endoCoefs.freeHeapAligned()
endoPoints.freeHeapAligned()
else:
msmProc(r, coefs, points, N, c)

Expand Down Expand Up @@ -555,7 +555,7 @@ func multiScalarMul_vartime*[F, EC, ECaff](
coefs_big.batchFromField(coefs, n)
r.multiScalarMul_vartime(coefs_big, points, n)

freeHeapAligned(coefs_big)
coefs_big.freeHeapAligned()

func multiScalarMul_vartime*[EC, ECaff](
r: var EC,
Expand Down
48 changes: 24 additions & 24 deletions constantine/math/elliptic/ec_multi_scalar_mul_parallel.nim
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,10 @@ proc msmImpl_vartime_parallel[bits: static int, EC, ECaff](
# Instead of storing the result in futures, risking them being scattered in memory
# we store them in a contiguous array, and the synchronizing future just returns a bool.
# top window is done on this thread
let miniMSMsResults = allocHeapArray(EC, numFullWindows)
let miniMSMsResults = allocHeapArrayAligned(EC, numFullWindows, alignment = 64)
let miniMSMsReady = allocStackArray(FlowVar[bool], numFullWindows)

let bucketsMatrix = allocHeapArray(EC, numBuckets*numWindows)
let bucketsMatrix = allocHeapArrayAligned(EC, numBuckets*numWindows, alignment = 64)

# Algorithm
# ---------
Expand Down Expand Up @@ -204,8 +204,8 @@ proc msmImpl_vartime_parallel[bits: static int, EC, ECaff](

# Cleanup
# -------
miniMSMsResults.freeHeap()
bucketsMatrix.freeHeap()
miniMSMsResults.freeHeapAligned()
bucketsMatrix.freeHeapAligned()

# Parallel MSM Affine - bucket accumulation
# -----------------------------------------
Expand All @@ -218,8 +218,8 @@ proc bucketAccumReduce_serial[bits: static int, EC, ECaff](
N: int) =

const (numBuckets, queueLen) = c.deriveSchedulerConstants()
let buckets = allocHeap(Buckets[numBuckets, EC, ECaff])
let sched = allocHeap(Scheduler[numBuckets, queueLen, EC, ECaff])
let buckets = allocHeapAligned(Buckets[numBuckets, EC, ECaff], alignment = 64)
let sched = allocHeapAligned(Scheduler[numBuckets, queueLen, EC, ECaff], alignment = 64)
sched.init(points, buckets, 0, numBuckets.int32)

# 1. Bucket Accumulation
Expand All @@ -230,8 +230,8 @@ proc bucketAccumReduce_serial[bits: static int, EC, ECaff](

# Cleanup
# ----------------
sched.freeHeap()
buckets.freeHeap()
sched.freeHeapAligned()
buckets.freeHeapAligned()

proc bucketAccumReduce_parallel[bits: static int, EC, ECaff](
tp: Threadpool,
Expand All @@ -253,8 +253,8 @@ proc bucketAccumReduce_parallel[bits: static int, EC, ECaff](
let chunkSize = int32(numBuckets) shr log2_vartime(cast[uint32](numChunks)) # Both are power of 2 so exact division
let chunksReadiness = allocStackArray(FlowVar[bool], numChunks-1) # Last chunk is done on this thread

let buckets = allocHeap(Buckets[numBuckets, EC, ECaff])
let scheds = allocHeapArray(Scheduler[numBuckets, queueLen, EC, ECaff], numChunks)
let buckets = allocHeapAligned(Buckets[numBuckets, EC, ECaff], alignment = 64)
let scheds = allocHeapArrayAligned(Scheduler[numBuckets, queueLen, EC, ECaff], numChunks, alignment = 64)

block: # 1. Bucket Accumulation
for chunkID in 0'i32 ..< numChunks-1:
Expand Down Expand Up @@ -307,8 +307,8 @@ proc bucketAccumReduce_parallel[bits: static int, EC, ECaff](

# Cleanup
# ----------------
scheds.freeHeap()
buckets.freeHeap()
scheds.freeHeapAligned()
buckets.freeHeapAligned()

# Parallel MSM Affine - window-level only
# ---------------------------------------
Expand All @@ -328,7 +328,7 @@ proc msmAffine_vartime_parallel[bits: static int, EC, ECaff](
# Instead of storing the result in futures, risking them being scattered in memory
# we store them in a contiguous array, and the synchronizing future just returns a bool.
# top window is done on this thread
let miniMSMsResults = allocHeapArray(EC, numFullWindows)
let miniMSMsResults = allocHeapArrayAligned(EC, numFullWindows, alignment = 64)
let miniMSMsReady = allocStackArray(Flowvar[bool], numFullWindows)

# Algorithm
Expand Down Expand Up @@ -365,13 +365,13 @@ proc msmAffine_vartime_parallel[bits: static int, EC, ECaff](
elif excess == 0: kFullWindow
else: kTopWindow

let buckets = allocHeapArray(EC, numBuckets)
let buckets = allocHeapArrayAligned(EC, numBuckets, alignment = 64)
bucketAccumReduce_withInit(
r,
buckets,
bitIndex = top, msmKind, c,
coefs, points, N)
freeHeapAligned(buckets)
buckets.freeHeapAligned()

# 3. Final reduction
for w in countdown(numFullWindows-1, 0):
Expand All @@ -382,7 +382,7 @@ proc msmAffine_vartime_parallel[bits: static int, EC, ECaff](

# Cleanup
# -------
miniMSMsResults.freeHeap()
miniMSMsResults.freeHeapAligned()

proc msmAffine_vartime_parallel_split[bits: static int, EC, ECaff](
tp: Threadpool,
Expand Down Expand Up @@ -410,7 +410,7 @@ proc msmAffine_vartime_parallel_split[bits: static int, EC, ECaff](
return

let chunkingDescriptor = balancedChunksPrioNumber(0, N, msmParallelism)
let splitMSMsResults = allocHeapArray(typeof(r[]), msmParallelism-1)
let splitMSMsResults = allocHeapArrayAligned(typeof(r[]), msmParallelism-1, alignment = 64)
let splitMSMsReady = allocStackArray(Flowvar[bool], msmParallelism-1)

for (i, start, len) in items(chunkingDescriptor):
Expand All @@ -429,7 +429,7 @@ proc msmAffine_vartime_parallel_split[bits: static int, EC, ECaff](
discard sync splitMSMsReady[i]
r[] ~+= splitMSMsResults[i]

freeHeap(splitMSMsResults)
splitMSMsResults.freeHeapAligned()

proc applyEndomorphism_parallel[bits: static int, ECaff](
tp: Threadpool,
Expand All @@ -447,8 +447,8 @@ proc applyEndomorphism_parallel[bits: static int, ECaff](
else: ECaff.G

const L = ECaff.getScalarField().bits().computeEndoRecodedLength(M)
let splitCoefs = allocHeapArray(array[M, BigInt[L]], N)
let endoBasis = allocHeapArray(array[M, ECaff], N)
let splitCoefs = allocHeapArrayAligned(array[M, BigInt[L]], N, alignment = 64)
let endoBasis = allocHeapArrayAligned(array[M, ECaff], N, alignment = 64)

syncScope:
tp.parallelFor i in 0 ..< N:
Expand Down Expand Up @@ -489,8 +489,8 @@ template withEndo[coefsBits: static int, EC, ECaff](
# Given that bits and N changed, we are able to use a bigger `c`
# but it has no significant impact on performance
msmProc(tp, r, endoCoefs, endoPoints, endoN, c)
freeHeap(endoCoefs)
freeHeap(endoPoints)
endoCoefs.freeHeapAligned()
endoPoints.freeHeapAligned()
else:
msmProc(tp, r, coefs, points, N, c)

Expand All @@ -512,8 +512,8 @@ template withEndo[coefsBits: static int, EC, ECaff](
# Given that bits and N changed, we are able to use a bigger `c`
# but it has no significant impact on performance
msmProc(tp, r, endoCoefs, endoPoints, endoN, c, useParallelBuckets)
freeHeap(endoCoefs)
freeHeap(endoPoints)
endoCoefs.freeHeapAligned()
endoPoints.freeHeapAligned()
else:
msmProc(tp, r, coefs, points, N, c, useParallelBuckets)

Expand Down

0 comments on commit 1b3a486

Please sign in to comment.