-
-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathmandelbrot-simd.c
127 lines (110 loc) · 4.57 KB
/
mandelbrot-simd.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#define LIMIT_SQUARED 4.0
#define MAXIMUM_ITERATIONS 50
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef double doublex2 __attribute__((vector_size(16)));
typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
static inline void calcSum(doublex2 *r, doublex2 *i, doublex2 *sum,
const doublex2 init_r[4], const doublex2 init_i) {
for (uint64_t x_Minor = 0; x_Minor < 4; x_Minor++) {
doublex2 r2 = r[x_Minor] * r[x_Minor];
doublex2 i2 = i[x_Minor] * i[x_Minor];
doublex2 ri = r[x_Minor] * i[x_Minor];
sum[x_Minor] = r2 + i2;
r[x_Minor] = r2 - i2 + init_r[x_Minor];
i[x_Minor] = ri + ri + init_i;
}
}
static inline bool vec_gt(const doublex2 *v) {
const doublex2 f = {LIMIT_SQUARED, LIMIT_SQUARED};
const uint64x2_t r = (v[0] > f) & (v[1] > f) & (v[2] > f) & (v[3] > f);
return r[0] && r[1];
}
static inline uint8_t clrPixels_gt(const doublex2 *v) {
const doublex2 f = {LIMIT_SQUARED, LIMIT_SQUARED};
const uint64x2_t r0 = v[0] <= f;
const uint64x2_t r1 = v[1] <= f;
const uint64x2_t r2 = v[2] <= f;
const uint64x2_t r3 = v[3] <= f;
return (r0[0] & 0x1) << 7 | (r0[1] & 0x1) << 6 | (r1[0] & 0x1) << 5 |
(r1[1] & 0x1) << 4 | (r2[0] & 0x1) << 3 | (r2[1] & 0x1) << 2 |
(r3[0] & 0x1) << 1 | (r3[1] & 0x1) << 0;
}
static inline uint8_t mand8(const doublex2 init_r[4], const doublex2 init_i) {
doublex2 pixel_Group_r[4], pixel_Group_i[4];
for (uint64_t x_Minor = 0; x_Minor < 4; x_Minor++) {
pixel_Group_r[x_Minor] = init_r[x_Minor];
pixel_Group_i[x_Minor] = init_i;
}
doublex2 sum[4];
for (unsigned j = 0; j < 6; j++) {
for (unsigned k = 0; k < 8; k++) {
calcSum(pixel_Group_r, pixel_Group_i, sum, init_r, init_i);
}
if (vec_gt(sum)) {
return 0x00;
}
}
calcSum(pixel_Group_r, pixel_Group_i, sum, init_r, init_i);
calcSum(pixel_Group_r, pixel_Group_i, sum, init_r, init_i);
return clrPixels_gt(sum);
}
static inline uint64_t mand64(const doublex2 init_r[4], const doublex2 init_i) {
uint64_t sixtyfour_Pixels = 0;
for (uint64_t byte = 0; byte < 8; byte++) {
const uint64_t eight_Pixels = mand8(init_r + 4 * byte, init_i);
sixtyfour_Pixels =
(sixtyfour_Pixels >> UINT64_C(8)) | (eight_Pixels << UINT64_C(56));
}
return sixtyfour_Pixels;
}
int main(int argc, char **argv) {
const uint64_t image_Width_And_Height =
(__builtin_expect(atoi(argv[1]), 15000) + 7) / 8 * 8;
uint8_t *const pixels =
malloc(image_Width_And_Height * image_Width_And_Height / 8);
doublex2 initial_r[image_Width_And_Height / 2];
double initial_i[image_Width_And_Height];
for (uint64_t xy = 0; xy < image_Width_And_Height; xy++) {
initial_r[xy / 2] =
2.0 / image_Width_And_Height * (doublex2){xy, xy + 1} - 1.5;
initial_i[xy] = 2.0 / image_Width_And_Height * xy - 1.0;
}
if (image_Width_And_Height % 64) {
// process 8 pixels (one byte) at a time
for (uint64_t y = 0; y < image_Width_And_Height; y++) {
const doublex2 prefetched_Initial_i = {initial_i[y], initial_i[y]};
const size_t rowStart = y * image_Width_And_Height / 8;
for (uint64_t x_Major = 0; x_Major < image_Width_And_Height;
x_Major += 8) {
const doublex2 *prefetched_Initial_r = &initial_r[x_Major / 2];
pixels[rowStart + x_Major / 8] =
mand8(prefetched_Initial_r, prefetched_Initial_i);
}
}
} else {
// process 64 pixels (8 bytes) at a time
for (uint64_t y = 0; y < image_Width_And_Height; y++) {
const doublex2 prefetched_Initial_i = {initial_i[y], initial_i[y]};
const size_t rowStart = y * image_Width_And_Height / 8;
for (uint64_t x_Major = 0; x_Major < image_Width_And_Height;
x_Major += 8) {
const doublex2 *prefetched_Initial_r = &initial_r[x_Major / 2];
const uint64_t sixtyfour_Pixels =
mand64(prefetched_Initial_r, prefetched_Initial_i);
memcpy(&pixels[rowStart + x_Major / 8], &sixtyfour_Pixels, 8);
}
}
}
fprintf(stdout, "P4\n%" PRIu64 " %" PRIu64 "\n", image_Width_And_Height,
image_Width_And_Height);
fwrite(pixels, image_Width_And_Height * image_Width_And_Height / 8, 1,
stdout);
free(pixels);
return 0;
}