Skip to content

Commit

Permalink
[SW] Update main files for better Cache/SPM configurations.
Browse files: browse the repository at this point in the history
  • Loading branch information
DiyouS committed Dec 3, 2024
1 parent 0d7dee2 commit c33aee3
Show file tree
Hide file tree
Showing 11 changed files with 124 additions and 51 deletions.
2 changes: 1 addition & 1 deletion sw/spatzBenchmarks/dp-faxpy-cache/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -44,7 +44,7 @@ int main() {

if (cid == 0) {
// Init the cache
l1d_init(32);
l1d_init(16);
}

// Wait for all cores to finish
Expand Down
2 changes: 1 addition & 1 deletion sw/spatzBenchmarks/dp-faxpy/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -44,7 +44,7 @@ int main() {

if (cid == 0) {
// Init the cache
l1d_init(32);
l1d_init(120);
}

// Reset timer
Expand Down
86 changes: 56 additions & 30 deletions sw/spatzBenchmarks/dp-fdotp/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,6 +23,8 @@
#include DATAHEADER
#include "kernel/fdotp.c"

#define USE_CACHE

double *a;
double *b;
double *result;
Expand All @@ -41,9 +43,14 @@ static inline int fp_check(const double a, const double b) {
int main() {
const unsigned int num_cores = snrt_cluster_core_num();
const unsigned int cid = snrt_cluster_core_idx();
const int measure_iter = 1;

#ifdef USE_CACHE
uint32_t spm_size = 16;
#else
uint32_t spm_size = 120;
#endif

uint32_t spm_size = 32;

if (cid == 0) {
// Init the cache
l1d_init(spm_size);
Expand All @@ -54,10 +61,20 @@ int main() {

// Reset timer
unsigned int timer = (unsigned int)-1;
unsigned int timer_tmp = 0;

const unsigned int dim = dotp_l.M / num_cores;

// Allocate the matrices
#ifdef USE_CACHE
if (cid == 0) {
result = (double *)snrt_l1alloc(num_cores * sizeof(double));
}

double *a_int = dotp_A_dram + dim * cid;
double *b_int = dotp_B_dram + dim * cid;

#else
if (cid == 0) {
a = (double *)snrt_l1alloc(dotp_l.M * sizeof(double));
b = (double *)snrt_l1alloc(dotp_l.M * sizeof(double));
Expand All @@ -78,42 +95,51 @@ int main() {
double *a_int = a + dim * cid;
double *b_int = b + dim * cid;

#endif


// Wait for all cores to finish
snrt_cluster_hw_barrier();

// Start dump
if (cid == 0)
start_kernel();
for (int iter = 0; iter < measure_iter; iter ++) {
// Start dump
if (cid == 0)
start_kernel();

// Start timer
if (cid == 0)
timer = benchmark_get_cycle();
// Start timer
if (cid == 0)
timer_tmp = benchmark_get_cycle();

// Calculate dotp
double acc;
acc = fdotp_v64b(a_int, b_int, dim);
result[cid] = acc;
// Calculate dotp
double acc;
acc = fdotp_v64b(a_int, b_int, dim);
result[cid] = acc;

// Wait for all cores to finish
snrt_cluster_hw_barrier();
// Wait for all cores to finish
snrt_cluster_hw_barrier();

// Final reduction
if (cid == 0) {
for (unsigned int i = 1; i < num_cores; ++i)
acc += result[i];
result[0] = acc;
}
// Final reduction
if (cid == 0) {
for (unsigned int i = 1; i < num_cores; ++i)
acc += result[i];
result[0] = acc;
}

// Wait for all cores to finish
snrt_cluster_hw_barrier();
// Wait for all cores to finish
snrt_cluster_hw_barrier();

// End dump
if (cid == 0)
stop_kernel();
// End dump
if (cid == 0)
stop_kernel();

// End timer and check if new best runtime
if (cid == 0)
timer = benchmark_get_cycle() - timer;
// End timer and check if new best runtime
if (cid == 0) {
timer_tmp = benchmark_get_cycle() - timer_tmp;
timer = (timer < timer_tmp) ? timer : timer_tmp;
}

snrt_cluster_hw_barrier();
}

// Check and display results
if (cid == 0) {
Expand All @@ -127,8 +153,8 @@ int main() {
}

if (cid == 0)
if (fp_check(result[0], dotp_result)) {
printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result);
if (fp_check(result[0], dotp_result*measure_iter)) {
printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result*measure_iter);
return -1;
}

Expand Down
2 changes: 1 addition & 1 deletion sw/spatzBenchmarks/dp-fft-cache/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -42,7 +42,7 @@ int main() {

if (cid == 0) {
// Init the cache with half-half
l1d_init(32);
l1d_init(16);
}
// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand Down
7 changes: 5 additions & 2 deletions sw/spatzBenchmarks/dp-fft/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -47,7 +47,7 @@ int main() {

if (cid == 0) {
// Init the cache
l1d_init(32);
l1d_init(120);
}

// log2(nfft).
Expand All @@ -66,6 +66,9 @@ int main() {
bitrev = (uint16_t *)snrt_l1alloc((NFFT / 4) * sizeof(uint16_t));
}

timer = benchmark_get_cycle();


// Initialize the matrices
if (cid == 0) {
snrt_dma_start_1d(samples, samples_dram, 2 * NFFT * sizeof(double));
Expand All @@ -90,7 +93,7 @@ int main() {
snrt_cluster_hw_barrier();

// Start timer
timer = benchmark_get_cycle();
// timer = benchmark_get_cycle();

// Start dump
if (cid == 0)
Expand Down
17 changes: 16 additions & 1 deletion sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,6 +23,8 @@
#include DATAHEADER
#include "kernel/dp-fmatmul.c"

#define USE_CACHE

#ifndef KERNEL_SIZE
#define KERNEL_SIZE 4
#endif
Expand Down Expand Up @@ -61,20 +63,31 @@ int main() {
unsigned int m_start, m_end;
unsigned int p_start, p_end;
unsigned int kernel_size;

#ifdef USE_CACHE
uint32_t spm_size = 16;
#else
uint32_t spm_size = 120;

#endif
if (cid == 0) {
// Init the cache
l1d_init(spm_size);
}

// Wait for all cores to finish
snrt_cluster_hw_barrier();

// Allocate the matrices in the local tile
if (cid == 0) {
#ifdef USE_CACHE
a = gemm_A_dram;
b = gemm_B_dram;
c = gemm_C_dram;
#else
a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double));
b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
#endif
}

// Reset timer
Expand All @@ -93,12 +106,14 @@ int main() {
snrt_cluster_hw_barrier();

// Initialize matrices
#ifndef USE_CACHE
if (cid == 0) {
snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double));
snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double));
snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
snrt_dma_wait_all();
}
#endif

// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand Down
16 changes: 15 additions & 1 deletion sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,6 +23,8 @@
#include DATAHEADER
#include "kernel/dp-fmatmul.c"

// #define USE_CACHE

#ifndef KERNEL_SIZE
#define KERNEL_SIZE 8
#endif
Expand Down Expand Up @@ -53,8 +55,12 @@ int verify_matrix(double *matrix, const double *checksum,
int main() {
const unsigned int num_cores = snrt_cluster_core_num();
const unsigned int cid = snrt_cluster_core_idx();

#ifdef USE_CACHE
uint32_t spm_size = 16;
#else
uint32_t spm_size = 120;

#endif
if (cid == 0) {
// Init the cache
l1d_init(spm_size);
Expand All @@ -72,9 +78,15 @@ int main() {

// Allocate the matrices in the local tile
if (cid == 0) {
#ifdef USE_CACHE
a = gemm_A_dram;
b = gemm_B_dram;
c = gemm_C_dram;
#else
a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double));
b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
#endif
}

// Reset timer
Expand All @@ -93,12 +105,14 @@ int main() {
snrt_cluster_hw_barrier();

// Initialize matrices
#ifndef USE_CACHE
if (cid == 0) {
snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double));
snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double));
snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
snrt_dma_wait_all();
}
#endif

// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand Down
17 changes: 16 additions & 1 deletion sw/spatzBenchmarks/dp-fmatmul/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -23,6 +23,8 @@
#include DATAHEADER
#include "kernel/dp-fmatmul.c"

#define USE_CACHE

double *a;
double *b;
double *c;
Expand All @@ -49,8 +51,13 @@ int verify_matrix(double *matrix, const double *checksum,
int main() {
const unsigned int num_cores = snrt_cluster_core_num();
const unsigned int cid = snrt_cluster_core_idx();

#ifdef USE_CACHE
uint32_t spm_size = 32;
#else
uint32_t spm_size = 120;

#endif

if (cid == 0) {
// Init the cache
l1d_init(spm_size);
Expand All @@ -68,9 +75,15 @@ int main() {

// Allocate the matrices in the local tile
if (cid == 0) {
#ifdef USE_CACHE
a = gemm_A_dram;
b = gemm_B_dram;
c = gemm_C_dram;
#else
a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double));
b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
#endif
}

// Reset timer
Expand All @@ -89,12 +102,14 @@ int main() {
snrt_cluster_hw_barrier();

// Initialize matrices
#ifndef USE_CACHE
if (cid == 0) {
snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double));
snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double));
snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
snrt_dma_wait_all();
}
#endif

// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand Down
8 changes: 4 additions & 4 deletions sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -124,7 +124,7 @@ int main() {
#ifndef USE_CACHE
uint32_t spm_size = 120; // 120 KB out of 128 KB
#else
uint32_t spm_size = 32; // Reserve small portion for SPM
uint32_t spm_size = 16; // Reserve small portion for stack only
#endif

if (cid == 0) {
Expand All @@ -142,7 +142,7 @@ int main() {
b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double));
c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double));
}
#else
#else
a = gemm_A_dram;
b = gemm_B_dram;
c = gemm_C_dram;
Expand Down Expand Up @@ -176,7 +176,7 @@ int main() {
snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double));
snrt_dma_wait_all();
}
#endif
#endif

// Wait for all cores to finish
snrt_cluster_hw_barrier();
Expand Down Expand Up @@ -251,7 +251,7 @@ int main() {
for (unsigned int j = 0; j < gemm_l.N; j++) {
checksum += c[i * gemm_l.N + j];
}
printf("Checksum[%d]=%f\n", i, checksum);
// printf("Checksum[%d]=%f\n", i, checksum);
double diff = checksum - (double)gemm_checksum[i];
if (diff < 0)
diff = -diff;
Expand Down
Loading

0 comments on commit c33aee3

Please sign in to comment.