OpenSWMM Engine  6.0.0-alpha.1
Data-oriented, plugin-extensible SWMM Engine (6.0.0-alpha.1)
Loading...
Searching...
No Matches
SIMD.hpp
Go to the documentation of this file.
1
34#ifndef OPENSWMM_ENGINE_SIMD_HPP
35#define OPENSWMM_ENGINE_SIMD_HPP
36
37#ifndef OPENSWMM_RESTRICT
38# if defined(_MSC_VER)
39# define OPENSWMM_RESTRICT __restrict
40# else
41# define OPENSWMM_RESTRICT __restrict__
42# endif
43#endif
44
45#include <cstddef>
46#include <cmath>
47#include <algorithm>
48#include <cassert>
49
50// ============================================================================
51// Platform detection
52// ============================================================================
53
54#if defined(__AVX2__)
55# include <immintrin.h>
56# define OPENSWMM_SIMD_AVX2 1
57# define OPENSWMM_SIMD_WIDTH 4
58#elif defined(__ARM_NEON) || defined(__aarch64__)
59# include <arm_neon.h>
60# define OPENSWMM_SIMD_NEON 1
61# define OPENSWMM_SIMD_WIDTH 2
62#else
63# define OPENSWMM_SIMD_SCALAR 1
64# define OPENSWMM_SIMD_WIDTH 1
65#endif
66
67namespace openswmm::simd {
68
69// ============================================================================
70// Element-wise operations (arrays of doubles)
71// ============================================================================
72
83inline void add(
84 const double* OPENSWMM_RESTRICT a,
85 const double* OPENSWMM_RESTRICT b,
86 double* OPENSWMM_RESTRICT dst,
87 std::size_t n
88) noexcept {
89#if defined(OPENSWMM_SIMD_AVX2)
90 const std::size_t nv = n / 4;
91 const std::size_t nr = n % 4;
92 for (std::size_t i = 0; i < nv; ++i) {
93 __m256d va = _mm256_loadu_pd(a + i * 4);
94 __m256d vb = _mm256_loadu_pd(b + i * 4);
95 _mm256_storeu_pd(dst + i * 4, _mm256_add_pd(va, vb));
96 }
97 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] + b[i];
98 (void)nr;
99#elif defined(OPENSWMM_SIMD_NEON)
100 const std::size_t nv = n / 2;
101 for (std::size_t i = 0; i < nv; ++i) {
102 float64x2_t va = vld1q_f64(a + i * 2);
103 float64x2_t vb = vld1q_f64(b + i * 2);
104 vst1q_f64(dst + i * 2, vaddq_f64(va, vb));
105 }
106 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] + b[i];
107#else
108 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] + b[i];
109#endif
110}
111
115inline void multiply(
116 const double* OPENSWMM_RESTRICT a,
117 const double* OPENSWMM_RESTRICT b,
118 double* OPENSWMM_RESTRICT dst,
119 std::size_t n
120) noexcept {
121#if defined(OPENSWMM_SIMD_AVX2)
122 const std::size_t nv = n / 4;
123 for (std::size_t i = 0; i < nv; ++i) {
124 __m256d va = _mm256_loadu_pd(a + i * 4);
125 __m256d vb = _mm256_loadu_pd(b + i * 4);
126 _mm256_storeu_pd(dst + i * 4, _mm256_mul_pd(va, vb));
127 }
128 for (std::size_t i = nv * 4; i < n; ++i) dst[i] = a[i] * b[i];
129#elif defined(OPENSWMM_SIMD_NEON)
130 const std::size_t nv = n / 2;
131 for (std::size_t i = 0; i < nv; ++i) {
132 float64x2_t va = vld1q_f64(a + i * 2);
133 float64x2_t vb = vld1q_f64(b + i * 2);
134 vst1q_f64(dst + i * 2, vmulq_f64(va, vb));
135 }
136 for (std::size_t i = nv * 2; i < n; ++i) dst[i] = a[i] * b[i];
137#else
138 for (std::size_t i = 0; i < n; ++i) dst[i] = a[i] * b[i];
139#endif
140}
141
149inline double min(const double* a, std::size_t n) noexcept {
150 assert(n >= 1);
151 double result = a[0];
152#if defined(OPENSWMM_SIMD_AVX2)
153 const std::size_t nv = n / 4;
154 if (nv > 0) {
155 __m256d vmin = _mm256_loadu_pd(a);
156 for (std::size_t i = 1; i < nv; ++i) {
157 vmin = _mm256_min_pd(vmin, _mm256_loadu_pd(a + i * 4));
158 }
159 // Horizontal reduction
160 __m128d hi = _mm256_extractf128_pd(vmin, 1);
161 __m128d lo = _mm256_castpd256_pd128(vmin);
162 __m128d m2 = _mm_min_pd(lo, hi);
163 __m128d m1 = _mm_min_pd(m2, _mm_shuffle_pd(m2, m2, 1));
164 result = _mm_cvtsd_f64(m1);
165 }
166 for (std::size_t i = nv * 4; i < n; ++i) result = std::min(result, a[i]);
167#else
168 for (std::size_t i = 1; i < n; ++i) result = std::min(result, a[i]);
169#endif
170 return result;
171}
172
176inline double max(const double* a, std::size_t n) noexcept {
177 assert(n >= 1);
178 double result = a[0];
179 for (std::size_t i = 1; i < n; ++i) result = std::max(result, a[i]);
180 return result;
181}
182
191inline void clamp(double* a, double lo, double hi, std::size_t n) noexcept {
192#if defined(OPENSWMM_SIMD_AVX2)
193 __m256d vlo = _mm256_set1_pd(lo);
194 __m256d vhi = _mm256_set1_pd(hi);
195 const std::size_t nv = n / 4;
196 for (std::size_t i = 0; i < nv; ++i) {
197 __m256d v = _mm256_loadu_pd(a + i * 4);
198 v = _mm256_max_pd(vlo, _mm256_min_pd(vhi, v));
199 _mm256_storeu_pd(a + i * 4, v);
200 }
201 for (std::size_t i = nv * 4; i < n; ++i) {
202 a[i] = std::max(lo, std::min(hi, a[i]));
203 }
204#else
205 for (std::size_t i = 0; i < n; ++i) {
206 a[i] = std::max(lo, std::min(hi, a[i]));
207 }
208#endif
209}
210
222inline double dot(
223 const double* OPENSWMM_RESTRICT a,
224 const double* OPENSWMM_RESTRICT b,
225 std::size_t n
226) noexcept {
227 double result = 0.0;
228#if defined(OPENSWMM_SIMD_AVX2)
229 const std::size_t nv = n / 4;
230 if (nv > 0) {
231 __m256d vsum = _mm256_setzero_pd();
232 for (std::size_t i = 0; i < nv; ++i) {
233 __m256d va = _mm256_loadu_pd(a + i * 4);
234 __m256d vb = _mm256_loadu_pd(b + i * 4);
235 vsum = _mm256_fmadd_pd(va, vb, vsum); // FMA: vsum += va * vb
236 }
237 // Horizontal sum
238 __m128d hi = _mm256_extractf128_pd(vsum, 1);
239 __m128d lo = _mm256_castpd256_pd128(vsum);
240 __m128d s2 = _mm_add_pd(lo, hi);
241 __m128d s1 = _mm_add_pd(s2, _mm_shuffle_pd(s2, s2, 1));
242 result = _mm_cvtsd_f64(s1);
243 }
244 for (std::size_t i = nv * 4; i < n; ++i) result += a[i] * b[i];
245#else
246 for (std::size_t i = 0; i < n; ++i) result += a[i] * b[i];
247#endif
248 return result;
249}
250
254constexpr std::size_t lane_width() noexcept {
255 return OPENSWMM_SIMD_WIDTH;
256}
257
258} /* namespace openswmm::simd */
259
260#endif /* OPENSWMM_ENGINE_SIMD_HPP */
#define OPENSWMM_SIMD_WIDTH
Scalar fallback.
Definition SIMD.hpp:64
#define OPENSWMM_RESTRICT
Definition XSectBatch.hpp:41
Definition SIMD.hpp:67
constexpr std::size_t lane_width() noexcept
Returns the SIMD lane width (doubles per register on this platform).
Definition SIMD.hpp:254
void multiply(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise multiplication: dst[i] = a[i] * b[i].
Definition SIMD.hpp:115
double min(const double *a, std::size_t n) noexcept
Find the minimum value in an array.
Definition SIMD.hpp:149
double max(const double *a, std::size_t n) noexcept
Find the maximum value in an array.
Definition SIMD.hpp:176
void add(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, double *OPENSWMM_RESTRICT dst, std::size_t n) noexcept
Element-wise addition: dst[i] = a[i] + b[i].
Definition SIMD.hpp:83
double dot(const double *OPENSWMM_RESTRICT a, const double *OPENSWMM_RESTRICT b, std::size_t n) noexcept
Dot product of two arrays: sum(a[i] * b[i]).
Definition SIMD.hpp:222
void clamp(double *a, double lo, double hi, std::size_t n) noexcept
Clamp all elements of an array to [lo, hi].
Definition SIMD.hpp:191