34#ifndef OPENSWMM_ENGINE_SIMD_HPP
35#define OPENSWMM_ENGINE_SIMD_HPP
37#ifndef OPENSWMM_RESTRICT
39# define OPENSWMM_RESTRICT __restrict
41# define OPENSWMM_RESTRICT __restrict__
55# include <immintrin.h>
56# define OPENSWMM_SIMD_AVX2 1
57# define OPENSWMM_SIMD_WIDTH 4
58#elif defined(__ARM_NEON) || defined(__aarch64__)
60# define OPENSWMM_SIMD_NEON 1
61# define OPENSWMM_SIMD_WIDTH 2
63# define OPENSWMM_SIMD_SCALAR 1
64# define OPENSWMM_SIMD_WIDTH 1
89#if defined(OPENSWMM_SIMD_AVX2)
90 const std::size_t nv = n / 4;
91 const std::size_t nr = n % 4;
92 for (std::size_t i = 0; i < nv; ++i) {
93 __m256d va = _mm256_loadu_pd(a + i * 4);
94 __m256d vb = _mm256_loadu_pd(b + i * 4);
95 _mm256_storeu_pd(dst + i * 4, _mm256_add_pd(va, vb));
97 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] + b[i];
99#elif defined(OPENSWMM_SIMD_NEON)
100 const std::size_t nv = n / 2;
101 for (std::size_t i = 0; i < nv; ++i) {
102 float64x2_t va = vld1q_f64(a + i * 2);
103 float64x2_t vb = vld1q_f64(b + i * 2);
104 vst1q_f64(dst + i * 2, vaddq_f64(va, vb));
106 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] + b[i];
108 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] + b[i];
121#if defined(OPENSWMM_SIMD_AVX2)
122 const std::size_t nv = n / 4;
123 for (std::size_t i = 0; i < nv; ++i) {
124 __m256d va = _mm256_loadu_pd(a + i * 4);
125 __m256d vb = _mm256_loadu_pd(b + i * 4);
126 _mm256_storeu_pd(dst + i * 4, _mm256_mul_pd(va, vb));
128 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] * b[i];
129#elif defined(OPENSWMM_SIMD_NEON)
130 const std::size_t nv = n / 2;
131 for (std::size_t i = 0; i < nv; ++i) {
132 float64x2_t va = vld1q_f64(a + i * 2);
133 float64x2_t vb = vld1q_f64(b + i * 2);
134 vst1q_f64(dst + i * 2, vmulq_f64(va, vb));
136 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] * b[i];
138 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] * b[i];
149inline double min(
const double* a, std::size_t n)
noexcept {
151 double result = a[0];
152#if defined(OPENSWMM_SIMD_AVX2)
153 const std::size_t nv = n / 4;
155 __m256d vmin = _mm256_loadu_pd(a);
156 for (std::size_t i = 1; i < nv; ++i) {
157 vmin = _mm256_min_pd(vmin, _mm256_loadu_pd(a + i * 4));
160 __m128d hi = _mm256_extractf128_pd(vmin, 1);
161 __m128d lo = _mm256_castpd256_pd128(vmin);
162 __m128d m2 = _mm_min_pd(lo, hi);
163 __m128d m1 = _mm_min_pd(m2, _mm_shuffle_pd(m2, m2, 1));
164 result = _mm_cvtsd_f64(m1);
166 for (std::size_t i = nv * 4; i < n; ++i) result = std::min(result, a[i]);
168 for (std::size_t i = 1; i < n; ++i) result = std::min(result, a[i]);
176inline double max(
const double* a, std::size_t n)
noexcept {
178 double result = a[0];
179 for (std::size_t i = 1; i < n; ++i) result = std::max(result, a[i]);
191inline void clamp(
double* a,
double lo,
double hi, std::size_t n)
noexcept {
192#if defined(OPENSWMM_SIMD_AVX2)
193 __m256d vlo = _mm256_set1_pd(lo);
194 __m256d vhi = _mm256_set1_pd(hi);
195 const std::size_t nv = n / 4;
196 for (std::size_t i = 0; i < nv; ++i) {
197 __m256d v = _mm256_loadu_pd(a + i * 4);
198 v = _mm256_max_pd(vlo, _mm256_min_pd(vhi, v));
199 _mm256_storeu_pd(a + i * 4, v);
201 for (std::size_t i = nv * 4; i < n; ++i) {
202 a[i] = std::max(lo, std::min(hi, a[i]));
205 for (std::size_t i = 0; i < n; ++i) {
206 a[i] = std::max(lo, std::min(hi, a[i]));
228#if defined(OPENSWMM_SIMD_AVX2)
229 const std::size_t nv = n / 4;
231 __m256d vsum = _mm256_setzero_pd();
232 for (std::size_t i = 0; i < nv; ++i) {
233 __m256d va = _mm256_loadu_pd(a + i * 4);
234 __m256d vb = _mm256_loadu_pd(b + i * 4);
235 vsum = _mm256_fmadd_pd(va, vb, vsum);
238 __m128d hi = _mm256_extractf128_pd(vsum, 1);
239 __m128d lo = _mm256_castpd256_pd128(vsum);
240 __m128d s2 = _mm_add_pd(lo, hi);
241 __m128d s1 = _mm_add_pd(s2, _mm_shuffle_pd(s2, s2, 1));
242 result = _mm_cvtsd_f64(s1);
244 for (std::size_t i = nv * 4; i < n; ++i) result += a[i] * b[i];
246 for (std::size_t i = 0; i < n; ++i) result += a[i] * b[i];
#define OPENSWMM_SIMD_WIDTH
Scalar fallback.
Definition SIMD.hpp:64
#define OPENSWMM_RESTRICT
Definition XSectBatch.hpp:41
constexpr std::size_t lane_width() noexcept
Returns the SIMD lane width (doubles per register on this platform).
Definition SIMD.hpp:254
void multiply(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise multiplication: dst[i] = a[i] * b[i].
Definition SIMD.hpp:115
double min(const double *a, std::size_t n) noexcept
Find the minimum value in an array.
Definition SIMD.hpp:149
double max(const double *a, std::size_t n) noexcept
Find the maximum value in an array.
Definition SIMD.hpp:176
void add(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise addition: dst[i] = a[i] + b[i].
Definition SIMD.hpp:83
double dot(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, std::size_t n) noexcept
Dot product of two arrays: sum(a[i] * b[i]).
Definition SIMD.hpp:222
void clamp(double *a, double lo, double hi, std::size_t n) noexcept
Clamp all elements of an array to [lo, hi].
Definition SIMD.hpp:191