Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
pippenger_arena_layout.hpp
Go to the documentation of this file.
1// Per-worker arena layout for the round-parallel Pippenger MSM (Zone W slab).
2//
3// Canonical source of truth for the per-worker byte walk that was previously
4// duplicated across `compute_arena_bytes_for_msm`, the live allocator inside
5// `pippenger_round_parallel`, and `pippenger_bn254_arena_layout_fits_for_test`.
6// The historical arena drift bugs (cluster_offsets miscount, wasm
7// aligned_local overflow, NO_GLV abort, t1 abort) all traced to disagreements
8// between those copies; this struct removes that class by computing the layout
9// once.
10//
11// The constructor's layout walk mirrors the live allocator's `layout_add`
12// sequence exactly, including alignment slop. The sizer's previous
13// arithmetic-only formula did not honour per-allocation alignment, so it
14// systematically under-counted by a few bytes per slab; the struct fixes that
15// by construction.
16//
17// Phase A and Stage 6 fields overlay the same per-worker bytes because the
18// parallel_for invocations are disjoint (Phase A runs on the first window
19// batch, Stage 6 runs per batch thereafter, and never on the same worker
20// concurrently). `per_worker_union_bytes = max(ts_fixed, pa_layout)`.
21
22#pragma once
23
25
26#include <algorithm>
27#include <array>
28#include <bit>
29#include <cstddef>
30#include <cstdint>
31#include <utility>
32
34
35// ============================================================================
36// Round-parallel internals exposed to the test suite.
37//
38// `pippenger_bn254_arena_layout_fits_for_test` is a TU-local helper that walks
39// the actual Zone P / Zone W / Zone S allocator for representative inputs and
40// asserts the result fits in `compute_arena_bytes_for_msm`'s promise. Its body
41// lives in `scalar_multiplication.test.cpp`, which means the helpers it needs
42// (`choose_window_bits`, `build_var_window_schedule`, `ChunkOutput`,
43// `DEDUP_MAX_*`, `VAR_WINDOW_MAX_WINDOWS`, `compute_arena_bytes_for_msm`) need
44// header-visible declarations.
45// ============================================================================
46
47// Per-window count cap shared by `VariableWindowSchedule` arrays and the live
48// allocator's `window_sums_storage` slot.
49inline constexpr size_t VAR_WINDOW_MAX_WINDOWS = 128;
50
51// Dedup pre-pass caps. DEDUP_MAX_CLUSTERS bounds `extra_points` at ≤ 1 MB;
52// DEDUP_MAX_MEMBERS bounds the per-worker `cluster_members` slab.
53inline constexpr size_t DEDUP_MAX_CLUSTERS = 16384;
54inline constexpr size_t DEDUP_MAX_MEMBERS = 32768;
55
56// Uniform window schedule produced by `build_var_window_schedule`. Holds the
57// per-window `c` value and bucket count for downstream sizing/dispatch.
59 size_t num_windows = 0;
60 std::array<uint8_t, VAR_WINDOW_MAX_WINDOWS> window_bits_per_window{}; // window_bits_w for each w
61 std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> bit_base{}; // B_w = Σ_{k<w} c_k, B_0 = 0
62 // 2^(window_bits_w - 1) + 1. uint32_t: window_bits = 17 gives 65537, one past uint16_t, and the
63 // cost model can pick window_bits up to 18 for very large MSMs (n approaching the 2^26 SRS cap).
64 std::array<uint32_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{};
65};
66
67// Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell).
68template <typename Curve> struct ChunkOutput {
69 typename Curve::Element R{};
70 typename Curve::Element L{};
71 uint32_t lo = 0;
72 uint32_t hi = 0;
73 uint8_t empty = 1;
74};
75
76// Pick the optimal window size `c`. Native uses a cost model
77// `rounds * (n + 15 * buckets)`; WASM uses a closed-form `target_load` formula.
78[[nodiscard]] inline uint32_t choose_window_bits(size_t num_points,
79 size_t num_bits,
80 size_t n_input,
81 size_t num_logical_threads) noexcept
82{
83 constexpr uint32_t MAX_C = 20;
84 uint32_t best = 2;
85
86#ifdef __wasm__
87 static_cast<void>(num_bits);
88 const size_t target_load = (n_input > 4096) ? (num_logical_threads * 2 / 3) : (num_logical_threads / 3);
89 if (target_load == 0 || num_points <= target_load) {
90 best = 2;
91 } else {
92 const size_t ratio = num_points / target_load;
93 const uint32_t lg = static_cast<uint32_t>(numeric::get_msb(ratio));
94 best = lg + 1;
95 if (best < 2) {
96 best = 2;
97 } else if (best >= MAX_C) {
98 best = MAX_C - 1;
99 }
100 }
101#else
102 static_cast<void>(n_input);
103 static_cast<void>(num_logical_threads);
104 uint64_t best_cost = static_cast<uint64_t>(-1);
105 for (uint32_t window_bits = 2; window_bits < MAX_C; ++window_bits) {
106 const uint64_t rounds = (num_bits + 2 + window_bits - 1) / window_bits;
107 const uint64_t buckets = (uint64_t{ 1 } << (window_bits - 1)) + 1;
108 const uint64_t n = num_points;
109 constexpr uint64_t BUCKET_ACC_COST = 15;
110 const uint64_t cost = rounds * (n + (buckets * BUCKET_ACC_COST));
111 if (cost < best_cost) {
112 best_cost = cost;
113 best = window_bits;
114 }
115 }
116#endif
117
118 return best;
119}
120
121// Build a uniform window schedule for the given bit budget and chosen `c`. Every window
122// is `window_bits` wide except the final one, which takes the remaining bits. The +2 on
123// the bit budget accommodates the carry-less top bit of the Constantine recoder.
124inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t window_bits) noexcept
125{
127
128 size_t bits_remaining = num_bits + 2;
129 size_t bit_offset = 0;
130 size_t w = 0;
131 while (bits_remaining > 0 && w < VAR_WINDOW_MAX_WINDOWS) {
132 const size_t window_bits_w = std::min<size_t>(window_bits, bits_remaining);
133 sched.bit_base[w] = static_cast<uint16_t>(bit_offset);
134 sched.window_bits_per_window[w] = static_cast<uint8_t>(window_bits_w);
135 sched.num_buckets[w] = static_cast<uint32_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
136 bit_offset += window_bits_w;
137 bits_remaining -= window_bits_w;
138 ++w;
139 }
140 sched.num_windows = w;
141 return sched;
142}
143
144// Maximum number of independent additions batched per modular inversion in the
145// affine-arithmetic group ops (used by Stage 6a/6b). Sizes per-worker
146// `points_to_add`, `inversion_scratch`, and `pair_dest` arrays.
147inline constexpr size_t BATCH_CAPACITY = 256;
148
149// Phase A's chunked tree-reduce limit. Capped so the per-worker scratch slab
150// (chunk_pts + chunk_ids) stays under ~128 KB.
151inline constexpr size_t DEDUP_MAX_CHUNK_MEMBERS = 2048;
152
153inline constexpr size_t MIN_BATCH_CAPACITY = 32;
154inline constexpr size_t MIN_AFFINE_THREAD_RATIO = 2;
155inline constexpr size_t SUBCHUNK_ENTRIES_CAP = 2048;
156inline constexpr size_t BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL;
157
158// Per-bucket-chunk metadata produced by Stage 6a, consumed by Stage 6b's
159// cross-thread reduce.
160// lo, hi — lowest / highest non-empty digit in the chunk (inclusive)
161// buckets_padded — next power of two ≥ (hi - lo + 1)
162// empty — 1 iff the chunk had no entries (Stage 6b skips it)
164 uint32_t lo = 0;
165 uint32_t hi = 0;
166 uint32_t buckets_padded = 0;
167 uint8_t empty = 1;
168};
169
170template <typename Curve> struct PerWorkerArenaLayout {
172 using BaseField = typename Curve::BaseField;
173
174 // Caps shared between sizer and allocator. Centralised here so the two
175 // sites can't diverge.
176 static constexpr size_t PHASE_A_DIRTY_SLOTS_CAP = 4096; // HT_SIZE
177 static constexpr size_t PHASE_A_BUCKET_REP_CAP = 256; // loose cap
178 static constexpr size_t PHASE_A_STAGED_CAP = 1024; // loose cap
180 static constexpr size_t WORKER_SLAB_ALIGN = alignof(AffineElement);
181
182 // Computed byte sizes (filled by constructor's layout walk).
183 size_t ts_fixed_layout = 0; // ThreadScratch wpb-independent fields, with align slop
184 size_t pa_layout = 0; // PhaseAScratch fields, with align slop
185 size_t per_worker_union_bytes = 0; // = align_up(max(ts_fixed_layout, pa_layout), WORKER_SLAB_ALIGN)
186 size_t per_worker_per_wpb_layout = 0; // Stage 6 wpb-dependent tail
187 size_t per_worker_bytes = 0; // = align_up(union + tail, WORKER_SLAB_ALIGN)
188
189 // Constructor performs the canonical layout walk. `windows_per_batch` and
190 // `dense_stride_est` may be zero — only the wpb-independent parts then
191 // have meaningful values, useful for the sizer's pre-wpb-solve step.
192 PerWorkerArenaLayout(size_t chunk_capacity,
193 size_t global_max_overflow_per_window,
194 bool dedup_active,
195 size_t phase_a_cluster_members_cap,
196 size_t phase_a_cluster_offsets_cap,
197 size_t windows_per_batch,
198 size_t dense_stride_est) noexcept
199 {
200 auto align_up = [](size_t off, size_t align) -> size_t { return (off + align - 1) & ~(align - 1); };
201 auto layout_add = [&](size_t& off, size_t bytes, size_t align) { off = align_up(off, align) + bytes; };
202
203 // ThreadScratch fixed (curr_pts / curr_buckets / points_to_add /
204 // inversion_scratch / pair_dest / overflow_slots / overflow_pts).
205 layout_add(ts_fixed_layout, sizeof(AffineElement) * chunk_capacity, alignof(AffineElement));
206 layout_add(ts_fixed_layout, sizeof(uint32_t) * chunk_capacity, alignof(uint32_t));
207 layout_add(ts_fixed_layout, sizeof(AffineElement) * 2 * BATCH_CAPACITY, alignof(AffineElement));
208 layout_add(ts_fixed_layout, sizeof(BaseField) * BATCH_CAPACITY, alignof(BaseField));
209 layout_add(ts_fixed_layout, sizeof(uint32_t) * BATCH_CAPACITY, alignof(uint32_t));
210 layout_add(ts_fixed_layout, sizeof(uint32_t) * global_max_overflow_per_window, alignof(uint32_t));
211 layout_add(ts_fixed_layout, sizeof(AffineElement) * global_max_overflow_per_window, alignof(AffineElement));
212
213 // PhaseA (cluster_members / cluster_offsets / dirty_slots / bucket_rep
214 // / staged / chunk_pts / chunk_ids). Only allocated when dedup_active.
215 if (dedup_active) {
216 layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_members_cap, alignof(uint32_t));
217 layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_offsets_cap, alignof(uint32_t));
218 layout_add(pa_layout, sizeof(uint16_t) * PHASE_A_DIRTY_SLOTS_CAP, alignof(uint16_t));
219 layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_BUCKET_REP_CAP, alignof(uint32_t));
220 layout_add(pa_layout,
223 layout_add(pa_layout, sizeof(AffineElement) * PHASE_A_CHUNK_CAP, alignof(AffineElement));
224 layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_CHUNK_CAP, alignof(uint32_t));
225 }
226
228
229 // Stage 6 wpb-dependent tail (dense_buckets / is_present / pair
230 // scratch / chunk_infos). Skipped when windows_per_batch == 0 (sizer's
231 // pre-wpb-solve call).
232 if (windows_per_batch != 0) {
233 const size_t dense_total = windows_per_batch * dense_stride_est;
234 const size_t dense_pair_max = dense_total / 2;
235 layout_add(per_worker_per_wpb_layout, sizeof(AffineElement) * dense_total, alignof(AffineElement));
236 layout_add(per_worker_per_wpb_layout, sizeof(uint8_t) * dense_total, alignof(uint8_t));
237 layout_add(per_worker_per_wpb_layout,
238 sizeof(std::pair<uint32_t, uint32_t>) * dense_pair_max,
240 layout_add(per_worker_per_wpb_layout, sizeof(uint32_t) * dense_pair_max, alignof(uint32_t));
241 layout_add(per_worker_per_wpb_layout, sizeof(BaseField) * dense_pair_max, alignof(BaseField));
242 layout_add(per_worker_per_wpb_layout,
243 sizeof(AffineBucketChunkInfo) * windows_per_batch,
244 alignof(AffineBucketChunkInfo));
245 }
246
248 }
249};
250
251// Stride upper bound for `s.dense_buckets`: next_pow2(⌈(B-1)/T⌉), with a floor of 2.
252[[nodiscard]] inline size_t compute_dense_stride(size_t B_eff, size_t num_threads) noexcept
253{
254 const size_t per_thread = (B_eff > 1) ? ((B_eff - 1 + num_threads - 1) / num_threads) : size_t{ 1 };
255 return std::max<size_t>(2, std::bit_ceil(per_thread));
256}
257
258// Upper bound on Σ_t buckets_per_thread[t][w] per window: B + T - 1 (adjacent threads
259// may share one boundary bucket). Returns 0 when B_eff == 0.
260[[nodiscard]] inline size_t compute_bucket_partials_max(size_t B_eff, size_t num_threads) noexcept
261{
262 return (B_eff > 0) ? (B_eff - 1 + num_threads - 1) : size_t{ 0 };
263}
264
265// Per-OS-thread Stage 6a seam overflow capacity (per-window upper bound).
266[[nodiscard]] inline size_t compute_global_max_overflow_per_window(size_t n,
267 size_t num_threads,
268 size_t subchunk_entries_cap) noexcept
269{
270 const size_t global_max_chunk_len = (n + num_threads - 1) / num_threads;
271 return (global_max_chunk_len + subchunk_entries_cap - 1) / subchunk_entries_cap;
272}
273
274// Per-window byte cost for one window in a windows-per-batch slab. Identical formula
275// at three sites (sizer outer, sizer per-schedule lambda, live allocator); centralised
276// here so they cannot drift.
277//
278// schedule = 4·n
279// HIST slot = max(4·t·B, sizeof(ChunkOutput)·t + 96·t) [H ∪ O overlay]
280// DENSE slot = 65 · bucket_partials_max(B, t) [bucket_partials_dense + present]
281// bucket_start = 8·(B+1)
282// chunk arrays = 8·(t+1) + 8·(t+1) + 8·t + 8·t + 8·t + 16·worker + 8·t
283// dense_buckets = 87·worker·stride [s.dense_buckets + aux]
284template <typename Curve>
285[[nodiscard]] inline size_t compute_per_window_bytes(
286 size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept
287{
288 const size_t bucket_partials_max = compute_bucket_partials_max(B_eff, num_threads);
289 const size_t hist_h_bytes_pw = size_t{ 4 } * num_threads * B_eff;
290 const size_t hist_o_bytes_pw = (sizeof(ChunkOutput<Curve>) * num_threads) + (size_t{ 96 } * num_threads);
291 const size_t hist_slot_bytes_pw = std::max(hist_h_bytes_pw, hist_o_bytes_pw);
292 const size_t dense_slot_bytes_pw = size_t{ 65 } * bucket_partials_max;
293 return (size_t{ 4 } * n) + hist_slot_bytes_pw + dense_slot_bytes_pw + (size_t{ 8 } * (B_eff + 1)) +
294 (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * num_threads) +
295 (size_t{ 8 } * num_threads) + (size_t{ 8 } * num_threads) + (size_t{ 16 } * worker_total) +
296 (size_t{ 8 } * num_threads) + (size_t{ 87 } * worker_total * dense_stride);
297}
298
299// Phase-1 prologue bytes living in the per-MSM arena (msb_per_scalar, glv_scalars,
300// glv_points, per_thread_msb_hist). Two-copy duplicate eliminated.
301[[nodiscard]] inline size_t compute_phase_one_prologue_bytes(size_t n,
302 bool use_glv,
303 bool inline_glv_double,
304 size_t profile_threads) noexcept
305{
306 return n // msb_per_scalar
307 + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) // glv_scalars_storage
308 + (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) // glv_points_storage
309 + (profile_threads * size_t{ 1024 }); // per_thread_msb_hist
310}
311
315};
316
317// Phase A per-worker caps. `members_cap = min(DEDUP_MAX_MEMBERS, n)` is tight (each
318// scalar contributes ≤ 1 cluster_member entry). `offsets_cap = cids_per_thread + 2`
319// covers the leading-zero sentinel + post-last terminator.
320[[nodiscard]] inline PhaseACaps compute_phase_a_caps(size_t n, size_t num_threads) noexcept
321{
322 return { std::min(DEDUP_MAX_MEMBERS, n), (DEDUP_MAX_CLUSTERS / num_threads) + 2 };
323}
324
325// Solve `wpb · per_window_bytes ≤ available_budget`, clamped to W_R and ≥ 1.
326// Mirrors the three identical wpb-pickers in the sizer and live allocator.
327[[nodiscard]] inline size_t solve_wpb(size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept
328{
329 if (W_R == 0) {
330 return 1;
331 }
332 if (per_window_bytes == 0 || available_budget == 0) {
333 return std::max<size_t>(1, W_R);
334 }
335 return std::min(std::max<size_t>(1, available_budget / per_window_bytes), W_R);
336}
337
338} // namespace bb::scalar_multiplication::round_parallel_detail
typename Group::element Element
Definition grumpkin.hpp:63
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64
constexpr T get_msb(const T in)
Definition get_msb.hpp:50
size_t compute_global_max_overflow_per_window(size_t n, size_t num_threads, size_t subchunk_entries_cap) noexcept
size_t compute_phase_one_prologue_bytes(size_t n, bool use_glv, bool inline_glv_double, size_t profile_threads) noexcept
size_t solve_wpb(size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept
size_t compute_per_window_bytes(size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept
size_t compute_bucket_partials_max(size_t B_eff, size_t num_threads) noexcept
PhaseACaps compute_phase_a_caps(size_t n, size_t num_threads) noexcept
size_t compute_dense_stride(size_t B_eff, size_t num_threads) noexcept
uint32_t choose_window_bits(size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads) noexcept
VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t window_bits) noexcept
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
PerWorkerArenaLayout(size_t chunk_capacity, size_t global_max_overflow_per_window, bool dedup_active, size_t phase_a_cluster_members_cap, size_t phase_a_cluster_offsets_cap, size_t windows_per_batch, size_t dense_stride_est) noexcept