///
/// @file LoadBalancerS2.cpp
/// @brief The LoadBalancerS2 assigns work to the individual
/// threads in the computation of the special leaves in the
/// Lagarias-Miller-Odlyzko, Deleglise-Rivat and Gourdon
/// prime counting algorithms. This load balancer is used by
/// the S2_hard(x, y) and D(x, y) functions.
///
/// Simply parallelizing the computation of the special leaves
/// in the Lagarias-Miller-Odlyzko algorithm by subdividing
/// the sieve interval into equally sized subintervals (one
/// per thread) does not scale because the distribution of
/// the special leaves is highly skewed: most special leaves
/// are in the first few segments whereas later on there are
/// very few special leaves.
///
/// This LoadBalancerS2 gradually increases the number of
/// segments to sieve as long as the expected runtime of the
/// sieve distance is smaller than the expected finish time
/// of the algorithm. Near the end the LoadBalancerS2 will
/// gradually decrease the number of segments to sieve in
/// order to prevent a single thread from running much longer
/// than all the other threads.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#include <LoadBalancerS2.hpp>
#include <primecount-config.hpp>
#include <primecount-internal.hpp>
#include <StatusS2.hpp>
#include <Sieve.hpp>
#include <imath.hpp>
#include <int128_t.hpp>
#include <min.hpp>
#include <stdint.h>

namespace primecount {

LoadBalancerS2::LoadBalancerS2(maxint_t x,
                               int64_t sieve_limit,
                               maxint_t sum_approx,
                               int threads,
                               bool is_print) :
  sieve_limit_(sieve_limit),
  sum_approx_(sum_approx),
  time_(get_time()),
  is_print_(is_print),
  status_(x)
{
  lock_.init(threads);

  // The best performance is usually achieved using a sieve
  // array size that matches your CPU's L1 data cache size
  // (per core) or that is slightly larger than your L1 cache
  // size but smaller than your L2 cache size (per core).
  // Also, the segment_size must be >= sqrt(sieve_limit).
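  // Hypothetical example, assuming L1D_CACHE_SIZE = 32768
  // (32 KiB): sieve_bytes = 65536 and max_size_ becomes
  // 65536 * 30 = 1966080, unless sqrt(sieve_limit) is larger.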
  int64_t sieve_bytes = L1D_CACHE_SIZE * 2;
  int64_t numbers_per_byte = 30;
  int64_t sqrt_limit = isqrt(sieve_limit);
  max_size_ = max(sieve_bytes * numbers_per_byte, sqrt_limit);

  if (threads == 1 &&
      !is_print)
  {
    // When a single thread is used (and printing is disabled)
    // we can set segment_size to its maximum size as load
    // balancing is only useful for multi-threading.
    segment_size_ = max_size_;

    // Currently our Sieve.cpp does not rebalance its counters
    // data structure. However, if we process the computation
    // in chunks then the sieve gets recreated for each new
    // chunk which rebalances the counters. Therefore we limit
    // the number of segments here.
    segments_ = 100;
  }
  else
  {
    // Start with a tiny segment size of x^(1/4) as
    // most special leaves are in the first few
    // segments and as we need to ensure that all
    // threads are assigned an equal amount of work.
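    // Hypothetical example: for x = 10^20 the initial
    // segment_size below is 10^5 (before the minimum size
    // clamp and rounding applied further down).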
    segment_size_ = isqrt(isqrt(x));
    segments_ = 1;
  }

  int64_t min_size = 1 << 9;
  segment_size_ = max(min_size, segment_size_);
  segment_size_ = Sieve::get_segment_size(segment_size_);
}

maxint_t LoadBalancerS2::get_sum() const
{
  return sum_;
}

bool LoadBalancerS2::get_work(ThreadData& thread)
{
  LockGuard lockGuard(lock_);
  sum_ += thread.sum;

  if (is_print_)
  {
    uint64_t dist = thread.segments * thread.segment_size;
    uint64_t high = thread.low + dist;
    status_.print(high, sieve_limit_, sum_, sum_approx_);
  }

  update_load_balancing(thread);

  thread.low = low_;
  thread.segments = segments_;
  thread.segment_size = segment_size_;
  thread.sum = 0;
  thread.secs = 0;
  thread.init_secs = 0;
  low_ += segments_ * segment_size_;

  bool is_work = thread.low < sieve_limit_;
  return is_work;
}

void LoadBalancerS2::update_load_balancing(const ThreadData& thread)
{
  if (thread.low > max_low_)
  {
    max_low_ = thread.low;
    segments_ = thread.segments;

    // We only start increasing the segment_size and segments
    // per thread once the first special leaves have been
    // found. Near the start there is a very large number of
    // leaves and we don't want a single thread to compute
    // them all by itself (which would cause scaling issues).
    if (sum_ == 0)
      return;

    // Slowly increase the segment size until it reaches
    // sqrt(sieve_limit). Most special leaves are located
    // around y, hence we need to be careful to not assign too
    // much work to a single thread in this region.
    if (segment_size_ < max_size_)
      update_segment_size();
    else
      update_number_of_segments(thread);
  }
}

/// Slowly increase segment_size until it reaches sqrt(sieve_limit)
void LoadBalancerS2::update_segment_size()
{
  segment_size_ += segment_size_ / 16;
  segment_size_ = min(segment_size_, max_size_);
  segment_size_ = Sieve::get_segment_size(segment_size_);
}

/// Increase or decrease the number of segments per thread
/// based on the remaining runtime.
///
void LoadBalancerS2::update_number_of_segments(const ThreadData& thread)
{
  // Near the end it is important that threads run only for
  // a short amount of time in order to ensure that all
  // threads finish nearly at the same time. Since the
  // remaining time is just a rough estimation we want to be
  // very conservative so we divide the remaining time by 3.
  double rem_secs = remaining_secs() / 3;

  // If the previous thread runtime is larger than the
  // estimated remaining time, the factor calculated below
  // will be < 1 and we will reduce the number of segments
  // per thread. Otherwise, if the factor is > 1, we will
  // increase the number of segments per thread.
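  // Hypothetical example: if the previous thread ran for
  // 2 seconds and remaining_secs() returned 12 seconds, then
  // rem_secs = 4 and the factor below is 4 / 2 = 2, i.e. the
  // next chunk of work is roughly doubled (subject to the
  // limits further down).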
  double min_secs = 0.001;
  double divider = max(min_secs, thread.secs);
  double factor = rem_secs / divider;

  // For small and medium computations the thread runtime
  // should be about 5000x the thread initialization time.
  // However for very large computations we want to further
  // reduce the thread runtimes in order to increase the
  // backup frequency. If the thread runtime is > 6 hours
  // we reduce the thread runtime to about 200x the thread
  // initialization time.
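  // Hypothetical example: with thread.init_secs = 43.2 the
  // formula below yields (3600 * 6) / 43.2 = 500, which lies
  // within [200, 5000], i.e. a target thread runtime of about
  // 500 * 43.2 = 21600 seconds (6 hours).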
  double init_secs = max(min_secs, thread.init_secs);
  double init_factor = in_between(200, (3600 * 6) / init_secs, 5000);

  // Reduce the thread runtime if it is much larger than
  // its initialization time. This increases the number of
  // backups without deteriorating performance as we make
  // sure that the thread runtime is still much larger than
  // the thread initialization time.
  if (thread.secs > min_secs &&
      thread.secs > thread.init_secs * init_factor)
  {
    double old = factor;
    double next_runtime = thread.init_secs * init_factor;
    factor = next_runtime / thread.secs;
    factor = min(factor, old);
  }

  // Near the end, when the remaining time gets close to 0,
  // the load balancer tends to reduce the number of segments
  // per thread towards 0 as well, which is very bad for
  // performance. The condition below fixes this issue and
  // ensures that the thread runtime is always at least 20x
  // the thread initialization time.
  if (thread.secs > 0 &&
      thread.secs * factor < thread.init_secs * 20)
  {
    double next_runtime = thread.init_secs * 20;
    double current_runtime = thread.secs;
    factor = next_runtime / current_runtime;
  }

  // Since the distribution of the special leaves is highly
  // skewed (at the beginning) we want to increase the number
  // of segments per thread very slowly: if the previously
  // sieved interval contained no special leaves but the next
  // interval contains many special leaves, then sieving the
  // next interval might take orders of magnitude more time
  // even if the interval size is identical.
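  // E.g. a computed factor of 3.4 is clamped below to 2.0,
  // so the amount of work per thread at most doubles (and at
  // most halves) per load balancing update.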
  factor = in_between(0.5, factor, 2.0);
  double next_runtime = thread.secs * factor;

  if (next_runtime < min_secs)
    segments_ *= 2;
  else
  {
    double new_segments = std::round(segments_ * factor);
    segments_ = (int64_t) new_segments;
    segments_ = max(segments_, 1);
  }
}

/// Remaining seconds till finished
double LoadBalancerS2::remaining_secs() const
{
  double percent = status_.getPercent(low_, sieve_limit_, sum_, sum_approx_);
  percent = in_between(10, percent, 100);
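  // Hypothetical example: if 25% of the work has been done
  // after 60 seconds, the estimated total runtime is
  // 60 * (100 / 25) = 240 seconds, so about 180 seconds remain.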
  double total_secs = get_time() - time_;
  double secs = total_secs * (100 / percent) - total_secs;
  return secs;
}

} // namespace