34#ifndef OPENSWMM_ENGINE_SIMD_HPP
35#define OPENSWMM_ENGINE_SIMD_HPP
37#ifndef OPENSWMM_RESTRICT
39# define OPENSWMM_RESTRICT __restrict
41# define OPENSWMM_RESTRICT __restrict__
51# define OPENSWMM_IVDEP __pragma(loop(ivdep))
52# elif defined(__GNUC__) || defined(__clang__)
53# define OPENSWMM_IVDEP _Pragma("GCC ivdep")
55# define OPENSWMM_IVDEP
68#if !defined(OPENSWMM_SIMD_SCALAR)
70# include <immintrin.h>
71# define OPENSWMM_SIMD_AVX2 1
72# define OPENSWMM_SIMD_WIDTH 4
73# elif defined(__ARM_NEON) || defined(__aarch64__)
75# define OPENSWMM_SIMD_NEON 1
76# define OPENSWMM_SIMD_WIDTH 2
78# define OPENSWMM_SIMD_SCALAR 1
79# define OPENSWMM_SIMD_WIDTH 1
82# if !defined(OPENSWMM_SIMD_WIDTH)
83# define OPENSWMM_SIMD_WIDTH 1
90#if defined(OPENSWMM_SIMD_AVX2)
93#if defined(OPENSWMM_SIMD_NEON)
96#if defined(OPENSWMM_SIMD_SCALAR)
100 "Exactly one SIMD backend must be active (AVX2, NEON, or SCALAR)"
125#if defined(OPENSWMM_SIMD_AVX2)
126 const std::size_t nv = n / 4;
127 const std::size_t nr = n % 4;
128 for (std::size_t i = 0; i < nv; ++i) {
129 __m256d va = _mm256_loadu_pd(a + i * 4);
130 __m256d vb = _mm256_loadu_pd(b + i * 4);
131 _mm256_storeu_pd(dst + i * 4, _mm256_add_pd(va, vb));
133 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] + b[i];
135#elif defined(OPENSWMM_SIMD_NEON)
136 const std::size_t nv = n / 2;
137 for (std::size_t i = 0; i < nv; ++i) {
138 float64x2_t va = vld1q_f64(a + i * 2);
139 float64x2_t vb = vld1q_f64(b + i * 2);
140 vst1q_f64(dst + i * 2, vaddq_f64(va, vb));
142 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] + b[i];
144 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] + b[i];
157#if defined(OPENSWMM_SIMD_AVX2)
158 const std::size_t nv = n / 4;
159 for (std::size_t i = 0; i < nv; ++i) {
160 __m256d va = _mm256_loadu_pd(a + i * 4);
161 __m256d vb = _mm256_loadu_pd(b + i * 4);
162 _mm256_storeu_pd(dst + i * 4, _mm256_mul_pd(va, vb));
164 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] * b[i];
165#elif defined(OPENSWMM_SIMD_NEON)
166 const std::size_t nv = n / 2;
167 for (std::size_t i = 0; i < nv; ++i) {
168 float64x2_t va = vld1q_f64(a + i * 2);
169 float64x2_t vb = vld1q_f64(b + i * 2);
170 vst1q_f64(dst + i * 2, vmulq_f64(va, vb));
172 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] * b[i];
174 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] * b[i];
185inline double min(
const double* a, std::size_t n)
noexcept {
187 double result = a[0];
188#if defined(OPENSWMM_SIMD_AVX2)
189 const std::size_t nv = n / 4;
191 __m256d vmin = _mm256_loadu_pd(a);
192 for (std::size_t i = 1; i < nv; ++i) {
193 vmin = _mm256_min_pd(vmin, _mm256_loadu_pd(a + i * 4));
196 __m128d hi = _mm256_extractf128_pd(vmin, 1);
197 __m128d lo = _mm256_castpd256_pd128(vmin);
198 __m128d m2 = _mm_min_pd(lo, hi);
199 __m128d m1 = _mm_min_pd(m2, _mm_shuffle_pd(m2, m2, 1));
200 result = _mm_cvtsd_f64(m1);
202 for (std::size_t i = nv * 4; i < n; ++i) result = std::min(result, a[i]);
203#elif defined(OPENSWMM_SIMD_NEON)
204 const std::size_t nv = n / 2;
206 float64x2_t vmin = vld1q_f64(a);
207 for (std::size_t i = 1; i < nv; ++i) {
208 vmin = vminq_f64(vmin, vld1q_f64(a + i * 2));
211 result = std::min(vgetq_lane_f64(vmin, 0), vgetq_lane_f64(vmin, 1));
213 for (std::size_t i = nv * 2; i < n; ++i) result = std::min(result, a[i]);
215 for (std::size_t i = 1; i < n; ++i) result = std::min(result, a[i]);
223inline double max(
const double* a, std::size_t n)
noexcept {
225 double result = a[0];
226#if defined(OPENSWMM_SIMD_AVX2)
227 const std::size_t nv = n / 4;
229 __m256d vmax = _mm256_loadu_pd(a);
230 for (std::size_t i = 1; i < nv; ++i) {
231 vmax = _mm256_max_pd(vmax, _mm256_loadu_pd(a + i * 4));
234 __m128d hi = _mm256_extractf128_pd(vmax, 1);
235 __m128d lo = _mm256_castpd256_pd128(vmax);
236 __m128d m2 = _mm_max_pd(lo, hi);
237 __m128d m1 = _mm_max_pd(m2, _mm_shuffle_pd(m2, m2, 1));
238 result = _mm_cvtsd_f64(m1);
240 for (std::size_t i = nv * 4; i < n; ++i) result = std::max(result, a[i]);
241#elif defined(OPENSWMM_SIMD_NEON)
242 const std::size_t nv = n / 2;
244 float64x2_t vmax = vld1q_f64(a);
245 for (std::size_t i = 1; i < nv; ++i) {
246 vmax = vmaxq_f64(vmax, vld1q_f64(a + i * 2));
248 result = std::max(vgetq_lane_f64(vmax, 0), vgetq_lane_f64(vmax, 1));
250 for (std::size_t i = nv * 2; i < n; ++i) result = std::max(result, a[i]);
252 for (std::size_t i = 1; i < n; ++i) result = std::max(result, a[i]);
265inline void clamp(
double* a,
double lo,
double hi, std::size_t n)
noexcept {
266#if defined(OPENSWMM_SIMD_AVX2)
267 __m256d vlo = _mm256_set1_pd(lo);
268 __m256d vhi = _mm256_set1_pd(hi);
269 const std::size_t nv = n / 4;
270 for (std::size_t i = 0; i < nv; ++i) {
271 __m256d v = _mm256_loadu_pd(a + i * 4);
272 v = _mm256_max_pd(vlo, _mm256_min_pd(vhi, v));
273 _mm256_storeu_pd(a + i * 4, v);
275 for (std::size_t i = nv * 4; i < n; ++i) {
276 a[i] = std::max(lo, std::min(hi, a[i]));
278#elif defined(OPENSWMM_SIMD_NEON)
279 float64x2_t vlo = vdupq_n_f64(lo);
280 float64x2_t vhi = vdupq_n_f64(hi);
281 const std::size_t nv = n / 2;
282 for (std::size_t i = 0; i < nv; ++i) {
283 float64x2_t v = vld1q_f64(a + i * 2);
284 v = vmaxq_f64(vlo, vminq_f64(vhi, v));
285 vst1q_f64(a + i * 2, v);
287 for (std::size_t i = nv * 2; i < n; ++i) {
288 a[i] = std::max(lo, std::min(hi, a[i]));
291 for (std::size_t i = 0; i < n; ++i) {
292 a[i] = std::max(lo, std::min(hi, a[i]));
314#if defined(OPENSWMM_SIMD_AVX2)
315 const std::size_t nv = n / 4;
317 __m256d vsum = _mm256_setzero_pd();
318 for (std::size_t i = 0; i < nv; ++i) {
319 __m256d va = _mm256_loadu_pd(a + i * 4);
320 __m256d vb = _mm256_loadu_pd(b + i * 4);
321 vsum = _mm256_fmadd_pd(va, vb, vsum);
324 __m128d hi = _mm256_extractf128_pd(vsum, 1);
325 __m128d lo = _mm256_castpd256_pd128(vsum);
326 __m128d s2 = _mm_add_pd(lo, hi);
327 __m128d s1 = _mm_add_pd(s2, _mm_shuffle_pd(s2, s2, 1));
328 result = _mm_cvtsd_f64(s1);
330 for (std::size_t i = nv * 4; i < n; ++i) result += a[i] * b[i];
331#elif defined(OPENSWMM_SIMD_NEON)
332 const std::size_t nv = n / 2;
334 float64x2_t vsum = vdupq_n_f64(0.0);
335 for (std::size_t i = 0; i < nv; ++i) {
336 float64x2_t va = vld1q_f64(a + i * 2);
337 float64x2_t vb = vld1q_f64(b + i * 2);
338 vsum = vfmaq_f64(vsum, va, vb);
341 result = vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1);
343 for (std::size_t i = nv * 2; i < n; ++i) result += a[i] * b[i];
345 for (std::size_t i = 0; i < n; ++i) result += a[i] * b[i];
368 for (std::size_t i = 0; i < n; ++i) dst[i] = std::sqrt(a[i]);
380 for (std::size_t i = 0; i < n; ++i) dst[i] = std::fabs(a[i]);
394 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] * b[i] + c[i];
413 return x * std::sqrt(x);
418 return x * x * std::sqrt(x);
423 return x * std::cbrt(x * x);
428 return x * std::cbrt(x);
433 return std::cbrt(x * x);
436#if defined(OPENSWMM_SIMD_NEON)
453inline float64x2_t vcbrtq_f64(float64x2_t x)
noexcept {
458 float32x2_t xf = vcvt_f32_f64(x);
459 int32x2_t xi = vreinterpret_s32_f32(xf);
462 xi = vadd_s32(vshr_n_s32(vsub_s32(xi, vdup_n_s32(0x3F800000)), 2),
463 vdup_n_s32(0x3F800000 + (0x3F800000 / 3)));
464 float32x2_t cr = vreinterpret_f32_s32(xi);
467 float32x2_t cr2 = vmul_f32(cr, cr);
468 cr = vadd_f32(cr, vmul_f32(vdup_n_f32(0.333333333f),
469 vsub_f32(vdiv_f32(xf, cr2), cr)));
470 cr2 = vmul_f32(cr, cr);
471 cr = vadd_f32(cr, vmul_f32(vdup_n_f32(0.333333333f),
472 vsub_f32(vdiv_f32(xf, cr2), cr)));
475 float64x2_t c = vcvt_f64_f32(cr);
479 float64x2_t three = vdupq_n_f64(3.0);
480 float64x2_t onethird = vdupq_n_f64(0.333333333333333333);
481 for (
int i = 0; i < 3; ++i) {
482 float64x2_t c2 = vmulq_f64(c, c);
483 float64x2_t c3 = vmulq_f64(c2, c);
485 c = vsubq_f64(c, vmulq_f64(onethird,
486 vdivq_f64(vsubq_f64(c3, x),
487 vmulq_f64(three, c2))));
496inline float64x2_t batch_pow4_3(float64x2_t x)
noexcept {
497 return vmulq_f64(x, vcbrtq_f64(x));
504inline float64x2_t batch_pow2_3(float64x2_t x)
noexcept {
505 return vcbrtq_f64(vmulq_f64(x, x));
#define OPENSWMM_IVDEP
Definition SIMD.hpp:55
#define OPENSWMM_SIMD_WIDTH
Scalar fallback.
Definition SIMD.hpp:79
#define OPENSWMM_RESTRICT
Definition XSectBatch.hpp:41
double pow4_3(double x) noexcept
pow(x, 4/3) = x * cbrt(x). Manning friction.
Definition SIMD.hpp:427
double pow3_2(double x) noexcept
pow(x, 3/2) = x * sqrt(x). Weir TRANSVERSE / TRAPEZOIDAL.
Definition SIMD.hpp:412
double pow2_3(double x) noexcept
pow(x, 2/3) = cbrt(x²). Manning normal-flow.
Definition SIMD.hpp:432
double pow5_2(double x) noexcept
pow(x, 5/2) = x² * sqrt(x). Weir V-NOTCH.
Definition SIMD.hpp:417
double pow5_3(double x) noexcept
pow(x, 5/3) = x * cbrt(x²). Weir SIDEFLOW (legacy 1.67 exponent).
Definition SIMD.hpp:422
constexpr std::size_t lane_width() noexcept
Returns the SIMD lane width (doubles per register on this platform).
Definition SIMD.hpp:353
void multiply(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise multiplication: dst[i] = a[i] * b[i].
Definition SIMD.hpp:151
double min(const double *a, std::size_t n) noexcept
Find the minimum value in an array.
Definition SIMD.hpp:185
double max(const double *a, std::size_t n) noexcept
Find the maximum value in an array.
Definition SIMD.hpp:223
void sqrt_array(const double *OPENSWMM_RESTRICT a, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise sqrt: dst[i] = sqrt(a[i]). Written as a simple loop; compilers auto-vectorize to platfor...
Definition SIMD.hpp:362
void fabs_array(const double *OPENSWMM_RESTRICT a, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise fabs: dst[i] = fabs(a[i]).
Definition SIMD.hpp:374
void add(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise addition: dst[i] = a[i] + b[i].
Definition SIMD.hpp:119
double dot(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, std::size_t n) noexcept
Dot product of two arrays: sum(a[i] * b[i]).
Definition SIMD.hpp:308
void fma_array(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, const double *OPENSWMM_RESTRICT c, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise fused multiply-add: dst[i] = a[i] * b[i] + c[i].
Definition SIMD.hpp:386
void clamp(double *a, double lo, double hi, std::size_t n) noexcept
Clamp all elements of an array to [lo, hi].
Definition SIMD.hpp:265