OpenSWMM Engine  6.0.0-alpha.1
Data-oriented, plugin-extensible SWMM Engine (6.0.0-alpha.1)
Loading...
Searching...
No Matches
SIMD.hpp
Go to the documentation of this file.
1
34#ifndef OPENSWMM_ENGINE_SIMD_HPP
35#define OPENSWMM_ENGINE_SIMD_HPP
36
37#ifndef OPENSWMM_RESTRICT
38# if defined(_MSC_VER)
39# define OPENSWMM_RESTRICT __restrict
40# else
41# define OPENSWMM_RESTRICT __restrict__
42# endif
43#endif
44
45// ============================================================================
46// Cross-compiler vectorisation hint
47// ============================================================================
48
49#ifndef OPENSWMM_IVDEP
50# if defined(_MSC_VER)
51# define OPENSWMM_IVDEP __pragma(loop(ivdep))
52# elif defined(__GNUC__) || defined(__clang__)
53# define OPENSWMM_IVDEP _Pragma("GCC ivdep")
54# else
55# define OPENSWMM_IVDEP
56# endif
57#endif
58
59#include <cstddef>
60#include <cmath>
61#include <algorithm>
62#include <cassert>
63
64// ============================================================================
65// Platform detection
66// ============================================================================
67
68#if !defined(OPENSWMM_SIMD_SCALAR)
69# if defined(__AVX2__)
70# include <immintrin.h>
71# define OPENSWMM_SIMD_AVX2 1
72# define OPENSWMM_SIMD_WIDTH 4
73# elif defined(__ARM_NEON) || defined(__aarch64__)
74# include <arm_neon.h>
75# define OPENSWMM_SIMD_NEON 1
76# define OPENSWMM_SIMD_WIDTH 2
77# else
78# define OPENSWMM_SIMD_SCALAR 1
79# define OPENSWMM_SIMD_WIDTH 1
80# endif
81#else
82# if !defined(OPENSWMM_SIMD_WIDTH)
83# define OPENSWMM_SIMD_WIDTH 1
84# endif
85#endif
86
87// Exactly one SIMD backend must be active
88static_assert(
89 (0
90#if defined(OPENSWMM_SIMD_AVX2)
91 + 1
92#endif
93#if defined(OPENSWMM_SIMD_NEON)
94 + 1
95#endif
96#if defined(OPENSWMM_SIMD_SCALAR)
97 + 1
98#endif
99 ) == 1,
100 "Exactly one SIMD backend must be active (AVX2, NEON, or SCALAR)"
101);
102
103namespace openswmm::simd {
104
105// ============================================================================
106// Element-wise operations (arrays of doubles)
107// ============================================================================
108
119inline void add(
120 const double* OPENSWMM_RESTRICT a,
121 const double* OPENSWMM_RESTRICT b,
122 double* OPENSWMM_RESTRICT dst,
123 std::size_t n
124) noexcept {
125#if defined(OPENSWMM_SIMD_AVX2)
126 const std::size_t nv = n / 4;
127 const std::size_t nr = n % 4;
128 for (std::size_t i = 0; i < nv; ++i) {
129 __m256d va = _mm256_loadu_pd(a + i * 4);
130 __m256d vb = _mm256_loadu_pd(b + i * 4);
131 _mm256_storeu_pd(dst + i * 4, _mm256_add_pd(va, vb));
132 }
133 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] + b[i];
134 (void)nr;
135#elif defined(OPENSWMM_SIMD_NEON)
136 const std::size_t nv = n / 2;
137 for (std::size_t i = 0; i < nv; ++i) {
138 float64x2_t va = vld1q_f64(a + i * 2);
139 float64x2_t vb = vld1q_f64(b + i * 2);
140 vst1q_f64(dst + i * 2, vaddq_f64(va, vb));
141 }
142 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] + b[i];
143#else
144 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] + b[i];
145#endif
146}
147
151inline void multiply(
152 const double* OPENSWMM_RESTRICT a,
153 const double* OPENSWMM_RESTRICT b,
154 double* OPENSWMM_RESTRICT dst,
155 std::size_t n
156) noexcept {
157#if defined(OPENSWMM_SIMD_AVX2)
158 const std::size_t nv = n / 4;
159 for (std::size_t i = 0; i < nv; ++i) {
160 __m256d va = _mm256_loadu_pd(a + i * 4);
161 __m256d vb = _mm256_loadu_pd(b + i * 4);
162 _mm256_storeu_pd(dst + i * 4, _mm256_mul_pd(va, vb));
163 }
164 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] * b[i];
165#elif defined(OPENSWMM_SIMD_NEON)
166 const std::size_t nv = n / 2;
167 for (std::size_t i = 0; i < nv; ++i) {
168 float64x2_t va = vld1q_f64(a + i * 2);
169 float64x2_t vb = vld1q_f64(b + i * 2);
170 vst1q_f64(dst + i * 2, vmulq_f64(va, vb));
171 }
172 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] * b[i];
173#else
174 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] * b[i];
175#endif
176}
177
185inline double min(const double* a, std::size_t n) noexcept {
186 assert(n >= 1);
187 double result = a[0];
188#if defined(OPENSWMM_SIMD_AVX2)
189 const std::size_t nv = n / 4;
190 if (nv > 0) {
191 __m256d vmin = _mm256_loadu_pd(a);
192 for (std::size_t i = 1; i < nv; ++i) {
193 vmin = _mm256_min_pd(vmin, _mm256_loadu_pd(a + i * 4));
194 }
195 // Horizontal reduction
196 __m128d hi = _mm256_extractf128_pd(vmin, 1);
197 __m128d lo = _mm256_castpd256_pd128(vmin);
198 __m128d m2 = _mm_min_pd(lo, hi);
199 __m128d m1 = _mm_min_pd(m2, _mm_shuffle_pd(m2, m2, 1));
200 result = _mm_cvtsd_f64(m1);
201 }
202 for (std::size_t i = nv * 4; i < n; ++i) result = std::min(result, a[i]);
203#elif defined(OPENSWMM_SIMD_NEON)
204 const std::size_t nv = n / 2;
205 if (nv > 0) {
206 float64x2_t vmin = vld1q_f64(a);
207 for (std::size_t i = 1; i < nv; ++i) {
208 vmin = vminq_f64(vmin, vld1q_f64(a + i * 2));
209 }
210 // Horizontal reduction: min of 2 lanes
211 result = std::min(vgetq_lane_f64(vmin, 0), vgetq_lane_f64(vmin, 1));
212 }
213 for (std::size_t i = nv * 2; i < n; ++i) result = std::min(result, a[i]);
214#else
215 for (std::size_t i = 1; i < n; ++i) result = std::min(result, a[i]);
216#endif
217 return result;
218}
219
223inline double max(const double* a, std::size_t n) noexcept {
224 assert(n >= 1);
225 double result = a[0];
226#if defined(OPENSWMM_SIMD_AVX2)
227 const std::size_t nv = n / 4;
228 if (nv > 0) {
229 __m256d vmax = _mm256_loadu_pd(a);
230 for (std::size_t i = 1; i < nv; ++i) {
231 vmax = _mm256_max_pd(vmax, _mm256_loadu_pd(a + i * 4));
232 }
233 // Horizontal reduction
234 __m128d hi = _mm256_extractf128_pd(vmax, 1);
235 __m128d lo = _mm256_castpd256_pd128(vmax);
236 __m128d m2 = _mm_max_pd(lo, hi);
237 __m128d m1 = _mm_max_pd(m2, _mm_shuffle_pd(m2, m2, 1));
238 result = _mm_cvtsd_f64(m1);
239 }
240 for (std::size_t i = nv * 4; i < n; ++i) result = std::max(result, a[i]);
241#elif defined(OPENSWMM_SIMD_NEON)
242 const std::size_t nv = n / 2;
243 if (nv > 0) {
244 float64x2_t vmax = vld1q_f64(a);
245 for (std::size_t i = 1; i < nv; ++i) {
246 vmax = vmaxq_f64(vmax, vld1q_f64(a + i * 2));
247 }
248 result = std::max(vgetq_lane_f64(vmax, 0), vgetq_lane_f64(vmax, 1));
249 }
250 for (std::size_t i = nv * 2; i < n; ++i) result = std::max(result, a[i]);
251#else
252 for (std::size_t i = 1; i < n; ++i) result = std::max(result, a[i]);
253#endif
254 return result;
255}
256
265inline void clamp(double* a, double lo, double hi, std::size_t n) noexcept {
266#if defined(OPENSWMM_SIMD_AVX2)
267 __m256d vlo = _mm256_set1_pd(lo);
268 __m256d vhi = _mm256_set1_pd(hi);
269 const std::size_t nv = n / 4;
270 for (std::size_t i = 0; i < nv; ++i) {
271 __m256d v = _mm256_loadu_pd(a + i * 4);
272 v = _mm256_max_pd(vlo, _mm256_min_pd(vhi, v));
273 _mm256_storeu_pd(a + i * 4, v);
274 }
275 for (std::size_t i = nv * 4; i < n; ++i) {
276 a[i] = std::max(lo, std::min(hi, a[i]));
277 }
278#elif defined(OPENSWMM_SIMD_NEON)
279 float64x2_t vlo = vdupq_n_f64(lo);
280 float64x2_t vhi = vdupq_n_f64(hi);
281 const std::size_t nv = n / 2;
282 for (std::size_t i = 0; i < nv; ++i) {
283 float64x2_t v = vld1q_f64(a + i * 2);
284 v = vmaxq_f64(vlo, vminq_f64(vhi, v));
285 vst1q_f64(a + i * 2, v);
286 }
287 for (std::size_t i = nv * 2; i < n; ++i) {
288 a[i] = std::max(lo, std::min(hi, a[i]));
289 }
290#else
291 for (std::size_t i = 0; i < n; ++i) {
292 a[i] = std::max(lo, std::min(hi, a[i]));
293 }
294#endif
295}
296
308inline double dot(
309 const double* OPENSWMM_RESTRICT a,
310 const double* OPENSWMM_RESTRICT b,
311 std::size_t n
312) noexcept {
313 double result = 0.0;
314#if defined(OPENSWMM_SIMD_AVX2)
315 const std::size_t nv = n / 4;
316 if (nv > 0) {
317 __m256d vsum = _mm256_setzero_pd();
318 for (std::size_t i = 0; i < nv; ++i) {
319 __m256d va = _mm256_loadu_pd(a + i * 4);
320 __m256d vb = _mm256_loadu_pd(b + i * 4);
321 vsum = _mm256_fmadd_pd(va, vb, vsum); // FMA: vsum += va * vb
322 }
323 // Horizontal sum
324 __m128d hi = _mm256_extractf128_pd(vsum, 1);
325 __m128d lo = _mm256_castpd256_pd128(vsum);
326 __m128d s2 = _mm_add_pd(lo, hi);
327 __m128d s1 = _mm_add_pd(s2, _mm_shuffle_pd(s2, s2, 1));
328 result = _mm_cvtsd_f64(s1);
329 }
330 for (std::size_t i = nv * 4; i < n; ++i) result += a[i] * b[i];
331#elif defined(OPENSWMM_SIMD_NEON)
332 const std::size_t nv = n / 2;
333 if (nv > 0) {
334 float64x2_t vsum = vdupq_n_f64(0.0);
335 for (std::size_t i = 0; i < nv; ++i) {
336 float64x2_t va = vld1q_f64(a + i * 2);
337 float64x2_t vb = vld1q_f64(b + i * 2);
338 vsum = vfmaq_f64(vsum, va, vb); // FMA: vsum += va * vb
339 }
340 // Horizontal sum: add both lanes
341 result = vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1);
342 }
343 for (std::size_t i = nv * 2; i < n; ++i) result += a[i] * b[i];
344#else
345 for (std::size_t i = 0; i < n; ++i) result += a[i] * b[i];
346#endif
347 return result;
348}
349
353constexpr std::size_t lane_width() noexcept {
354 return OPENSWMM_SIMD_WIDTH;
355}
356
362inline void sqrt_array(
363 const double* OPENSWMM_RESTRICT a,
364 double* OPENSWMM_RESTRICT dst,
365 std::size_t n
366) noexcept {
368 for (std::size_t i = 0; i < n; ++i) dst[i] = std::sqrt(a[i]);
369}
370
374inline void fabs_array(
375 const double* OPENSWMM_RESTRICT a,
376 double* OPENSWMM_RESTRICT dst,
377 std::size_t n
378) noexcept {
380 for (std::size_t i = 0; i < n; ++i) dst[i] = std::fabs(a[i]);
381}
382
386inline void fma_array(
387 const double* OPENSWMM_RESTRICT a,
388 const double* OPENSWMM_RESTRICT b,
389 const double* OPENSWMM_RESTRICT c,
390 double* OPENSWMM_RESTRICT dst,
391 std::size_t n
392) noexcept {
394 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] * b[i] + c[i];
395}
396
397} /* namespace openswmm::simd */
398
399// ============================================================================
400// Fast math: fixed-exponent pow() replacements using std::sqrt / std::cbrt
401//
402// std::pow(x, y) dispatches through exp(y*log(x)) — ~60-80 cycles on ARM.
403// std::sqrt / std::cbrt are ~10-15 cycle intrinsics. For exponents that
404// factor through half- and third-powers these closed forms are dramatically
405// cheaper in the hot loops (Manning friction, weir/orifice equations,
406// submergence corrections).
407// ============================================================================
408
410
412inline double pow3_2(double x) noexcept {
413 return x * std::sqrt(x);
414}
415
417inline double pow5_2(double x) noexcept {
418 return x * x * std::sqrt(x);
419}
420
422inline double pow5_3(double x) noexcept {
423 return x * std::cbrt(x * x);
424}
425
427inline double pow4_3(double x) noexcept {
428 return x * std::cbrt(x);
429}
430
432inline double pow2_3(double x) noexcept {
433 return std::cbrt(x * x);
434}
435
436#if defined(OPENSWMM_SIMD_NEON)
437// ============================================================================
438// NEON 2-wide vectorised cube root via Newton-Raphson refinement.
439//
440// Seeding strategy: cast to float32x2, use single-precision cbrt via
441// vrecpe/NR, widen back to float64x2 for three double-precision NR
442// corrections. Convergence: each NR step cubes the relative error, so
443// after 3 double steps the residual is < 1 ULP for normalised doubles.
444//
445// Used by batch_pow4_3 and batch_pow2_3 to compute Manning exponents on
446// pairs of hydraulic-radius values without calling std::cbrt twice.
447// ============================================================================
448
453inline float64x2_t vcbrtq_f64(float64x2_t x) noexcept {
454 // --- Step 1: float32 seed ---
455 // Narrow to float32, compute a rough cbrt via bit-manipulation seed
456 // (the classic Kahan integer cbrt trick generalised to float):
457 // ieee754(float) cbrt seed: reinterpret as int32, scale exponent by 1/3.
458 float32x2_t xf = vcvt_f32_f64(x); // narrow to float32
459 int32x2_t xi = vreinterpret_s32_f32(xf);
460 // Magic constant for float cbrt seed: (1/3) * (2^23) * (2 - 127/3 + bias)
461 // = 0x2A2A2A2B (approx). Gives ~5-bit accurate seed.
462 xi = vadd_s32(vshr_n_s32(vsub_s32(xi, vdup_n_s32(0x3F800000)), 2),
463 vdup_n_s32(0x3F800000 + (0x3F800000 / 3)));
464 float32x2_t cr = vreinterpret_f32_s32(xi);
465
466 // --- Step 2: two float32 NR steps: cr = cr - (cr - x/cr²) / 3 ---
467 float32x2_t cr2 = vmul_f32(cr, cr);
468 cr = vadd_f32(cr, vmul_f32(vdup_n_f32(0.333333333f),
469 vsub_f32(vdiv_f32(xf, cr2), cr)));
470 cr2 = vmul_f32(cr, cr);
471 cr = vadd_f32(cr, vmul_f32(vdup_n_f32(0.333333333f),
472 vsub_f32(vdiv_f32(xf, cr2), cr)));
473
474 // --- Step 3: widen seed to double and run three double-precision NR ---
475 float64x2_t c = vcvt_f64_f32(cr); // widen to float64
476
477 // NR: c_next = c - (c³ - x) / (3 c²)
478 // = (2c³ + x) / (3c²) [Horner-friendly form]
479 float64x2_t three = vdupq_n_f64(3.0);
480 float64x2_t onethird = vdupq_n_f64(0.333333333333333333);
481 for (int i = 0; i < 3; ++i) {
482 float64x2_t c2 = vmulq_f64(c, c);
483 float64x2_t c3 = vmulq_f64(c2, c);
484 // c = c - (c³ - x) / (3 * c²)
485 c = vsubq_f64(c, vmulq_f64(onethird,
486 vdivq_f64(vsubq_f64(c3, x),
487 vmulq_f64(three, c2))));
488 }
489 return c;
490}
491
496inline float64x2_t batch_pow4_3(float64x2_t x) noexcept {
497 return vmulq_f64(x, vcbrtq_f64(x));
498}
499
504inline float64x2_t batch_pow2_3(float64x2_t x) noexcept {
505 return vcbrtq_f64(vmulq_f64(x, x));
506}
507
508#endif /* OPENSWMM_SIMD_NEON */
509
510} /* namespace openswmm::fastmath */
511
512#endif /* OPENSWMM_ENGINE_SIMD_HPP */
#define OPENSWMM_IVDEP
Definition SIMD.hpp:55
#define OPENSWMM_SIMD_WIDTH
Scalar fallback.
Definition SIMD.hpp:79
#define OPENSWMM_RESTRICT
Definition XSectBatch.hpp:41
Definition SIMD.hpp:409
double pow4_3(double x) noexcept
pow(x, 4/3) = x * cbrt(x). Manning friction.
Definition SIMD.hpp:427
double pow3_2(double x) noexcept
pow(x, 3/2) = x * sqrt(x). Weir TRANSVERSE / TRAPEZOIDAL.
Definition SIMD.hpp:412
double pow2_3(double x) noexcept
pow(x, 2/3) = cbrt(x²). Manning normal-flow.
Definition SIMD.hpp:432
double pow5_2(double x) noexcept
pow(x, 5/2) = x² * sqrt(x). Weir V-NOTCH.
Definition SIMD.hpp:417
double pow5_3(double x) noexcept
pow(x, 5/3) = x * cbrt(x²). Weir SIDEFLOW (legacy 1.67 exponent).
Definition SIMD.hpp:422
Definition SIMD.hpp:103
constexpr std::size_t lane_width() noexcept
Returns the SIMD lane width (doubles per register on this platform).
Definition SIMD.hpp:353
void multiply(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise multiplication: dst[i] = a[i] * b[i].
Definition SIMD.hpp:151
double min(const double *a, std::size_t n) noexcept
Find the minimum value in an array.
Definition SIMD.hpp:185
double max(const double *a, std::size_t n) noexcept
Find the maximum value in an array.
Definition SIMD.hpp:223
void sqrt_array(const double *OPENSWMM_RESTRICT a, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise sqrt: dst[i] = sqrt(a[i]). Written as a simple loop; compilers auto-vectorize to platfor...
Definition SIMD.hpp:362
void fabs_array(const double *OPENSWMM_RESTRICT a, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise fabs: dst[i] = fabs(a[i]).
Definition SIMD.hpp:374
void add(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise addition: dst[i] = a[i] + b[i].
Definition SIMD.hpp:119
double dot(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, std::size_t n) noexcept
Dot product of two arrays: sum(a[i] * b[i]).
Definition SIMD.hpp:308
void fma_array(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, const double *OPENSWMM_RESTRICT c, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise fused multiply-add: dst[i] = a[i] * b[i] + c[i].
Definition SIMD.hpp:386
void clamp(double *a, double lo, double hi, std::size_t n) noexcept
Clamp all elements of an array to [lo, hi].
Definition SIMD.hpp:265