Monte Carlo Benchmarking Engine
High-performance SIMD Monte Carlo engine (AVX2/NEON) with custom memory allocators and perf logging.
 
Loading...
Searching...
No Matches
montecarlo.hpp
Go to the documentation of this file.
1// ========================================
2// montecarlo.hpp - SIMD Monte Carlo Engine
3// ========================================
179
180
181#pragma once
182
183#include "pool.hpp"
184#include <chrono>
185#include <iostream>
186#include <random>
187#include <thread>
188#include <memory>
189
190#if defined(USE_AVX)
191 #include <immintrin.h>
192#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
193 #define USE_NEON
194 #include <arm_neon.h>
195#else
196 #error "No SIMD instruction set supported. Compile with USE_AVX or USE_NEON"
197#endif
198
206inline bool isInsideCircle(double x, double y) {
207 return (x*x + y*y) <= 1.0;
208}
209
210#ifdef USE_AVX
217inline int countInsideCircle_AVX(__m256d x, __m256d y) {
218 __m256d x2 = _mm256_mul_pd(x, x);
219 __m256d y2 = _mm256_mul_pd(y, y);
220 __m256d dist2 = _mm256_add_pd(x2, y2);
221 __m256d ones = _mm256_set1_pd(1.0);
222 __m256d cmp = _mm256_cmp_pd(dist2, ones, _CMP_LE_OQ);
223 return __builtin_popcount(_mm256_movemask_pd(cmp));
224}
225#endif
226
227#ifdef USE_NEON
234inline int countInsideCircle_NEON(float64x2_t x, float64x2_t y) {
235 float64x2_t x2 = vmulq_f64(x, x);
236 float64x2_t y2 = vmulq_f64(y, y);
237 float64x2_t dist2 = vaddq_f64(x2, y2);
238 float64x2_t ones = vdupq_n_f64(1.0);
239 uint64x2_t cmp = vcleq_f64(dist2, ones);
240 return static_cast<int>(vgetq_lane_u64(cmp, 0) != 0) + static_cast<int>(vgetq_lane_u64(cmp, 1) != 0);
241}
242#endif
243
249int monteCarloPI_SEQUENTIAl(int numberOfTrials) {
250 std::random_device rd {};
251 std::default_random_engine engine {rd()};
252 std::uniform_real_distribution<double> darts{0.0, 1.0};
253
254 int hits = 0;
255 for (int i = 0; i < numberOfTrials; ++i) {
256 double dartX = darts(engine);
257 double dartY = darts(engine);
258 if (isInsideCircle(dartX, dartY)) ++hits;
259 }
260 return hits;
261}
262
268inline int* monteCarloPI_HEAP(int numberOfTrials) {
269 std::random_device rd {};
270 std::default_random_engine engine {rd()};
271 std::uniform_real_distribution<double> darts{0.0, 1.0};
272
273 int hits = 0;
274 for (int i = 0; i < numberOfTrials; ++i) {
275 double dartX = darts(engine);
276 double dartY = darts(engine);
277 if (isInsideCircle(dartX, dartY)) ++hits;
278 }
279 return new int{hits};
280}
281
287inline int* monteCarloPI_POOL(int numberOfTrials) {
288 thread_local PoolAllocator pool(64 * 1024);
289 pool.reset();
290
291 int* hits = pool.allocate<int>();
292 if (!hits) {
293 std::cerr << "[ERROR] PoolAllocator ran out of memory!\n";
294 std::exit(EXIT_FAILURE);
295 }
296 *hits = 0;
297
298 std::random_device rd;
299 std::default_random_engine engine{rd()};
300 std::uniform_real_distribution<double> darts{0.0, 1.0};
301
302 for (int i = 0; i < numberOfTrials; ++i) {
303 double dartX = darts(engine);
304 double dartY = darts(engine);
305 if (isInsideCircle(dartX, dartY)) ++(*hits);
306 }
307 return hits;
308}
309
315inline int* monteCarloPI_SIMD(int numberOfTrials) {
316 thread_local PoolAllocator pool(64 * 1024);
317 pool.reset();
318
319 int* hits = pool.allocate<int>();
320 if (!hits) {
321 std::cerr << "[ERROR] PoolAllocator ran out of memory!\n";
322 std::exit(EXIT_FAILURE);
323 }
324 *hits = 0;
325
326 thread_local std::mt19937_64 engine(std::random_device{}());
327 std::uniform_real_distribution<double> dist(0.0, 1.0);
328
329 int batch;
330 int loopEnd;
331
332 #ifdef USE_AVX
333 batch = 4;
334 alignas(32) double randX[batch], randY[batch];
335 loopEnd = numberOfTrials - (numberOfTrials % batch);
336
337 for (int i = 0; i < loopEnd; i+= batch) {
338 for (int j = 0; j < batch; ++j) {
339 randX[j] = dist(engine);
340 randY[j] = dist(engine);
341 }
342 __m256d dartX = _mm256_load_pd(randX);
343 __m256d dartY = _mm256_load_pd(randY);
344 *hits += countInsideCircle_AVX(dartX, dartY);
345 }
346 #elif defined(USE_NEON)
347 batch = 2;
348 alignas(16) double randX[batch], randY[batch];
349 loopEnd = numberOfTrials - (numberOfTrials % batch);
350
351 for (int i = 0; i < loopEnd; i += batch) {
352 for (int j = 0; j < batch; ++j) {
353 randX[j] = dist(engine);
354 randY[j] = dist(engine);
355 }
356 float64x2_t dartX = vld1q_f64(randX);
357 float64x2_t dartY = vld1q_f64(randY);
358 *hits += countInsideCircle_NEON(dartX, dartY);
359 }
360 #endif
361
362 for (int i = loopEnd; i < numberOfTrials; ++i) {
363 double dartX = dist(engine);
364 double dartY = dist(engine);
365 if (isInsideCircle(dartX, dartY)) ++(*hits);
366 }
367 return hits;
368}
int * monteCarloPI_SIMD(int numberOfTrials)
Estimates π using SIMD acceleration (AVX2 or NEON) and pool-allocated result storage.
int monteCarloPI_SEQUENTIAl(int numberOfTrials)
Estimates π using sequential dart throwing.
int * monteCarloPI_POOL(int numberOfTrials)
Estimates π using a thread-local memory pool (bump allocator).
bool isInsideCircle(double x, double y)
Checks if a 2D point lies inside the unit circle.
int * monteCarloPI_HEAP(int numberOfTrials)
Estimates π using heap-allocated result storage.
Fixed-size aligned pool allocator for high-performance simulations.
Fast aligned bump allocator for multithreaded simulations.
Definition pool.hpp:137
void reset()
Reset the allocator to reuse buffer (memory).
Definition pool.hpp:183
T * allocate(std::size_t align=alignof(T))
Allocates memory for type T with specified alignment (default = alignof(T)).
Definition pool.hpp:168