191 #include <immintrin.h>
192#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
194 #include <arm_neon.h>
196 #error "No SIMD instruction set supported. Compile with USE_AVX or USE_NEON"
207 return (x*x + y*y) <= 1.0;
217inline int countInsideCircle_AVX(__m256d x, __m256d y) {
218 __m256d x2 = _mm256_mul_pd(x, x);
219 __m256d y2 = _mm256_mul_pd(y, y);
220 __m256d dist2 = _mm256_add_pd(x2, y2);
221 __m256d ones = _mm256_set1_pd(1.0);
222 __m256d cmp = _mm256_cmp_pd(dist2, ones, _CMP_LE_OQ);
223 return __builtin_popcount(_mm256_movemask_pd(cmp));
234inline int countInsideCircle_NEON(float64x2_t x, float64x2_t y) {
235 float64x2_t x2 = vmulq_f64(x, x);
236 float64x2_t y2 = vmulq_f64(y, y);
237 float64x2_t dist2 = vaddq_f64(x2, y2);
238 float64x2_t ones = vdupq_n_f64(1.0);
239 uint64x2_t cmp = vcleq_f64(dist2, ones);
240 return static_cast<int>(vgetq_lane_u64(cmp, 0) != 0) +
static_cast<int>(vgetq_lane_u64(cmp, 1) != 0);
250 std::random_device rd {};
251 std::default_random_engine engine {rd()};
252 std::uniform_real_distribution<double> darts{0.0, 1.0};
255 for (
int i = 0; i < numberOfTrials; ++i) {
256 double dartX = darts(engine);
257 double dartY = darts(engine);
269 std::random_device rd {};
270 std::default_random_engine engine {rd()};
271 std::uniform_real_distribution<double> darts{0.0, 1.0};
274 for (
int i = 0; i < numberOfTrials; ++i) {
275 double dartX = darts(engine);
276 double dartY = darts(engine);
279 return new int{hits};
293 std::cerr <<
"[ERROR] PoolAllocator ran out of memory!\n";
294 std::exit(EXIT_FAILURE);
298 std::random_device rd;
299 std::default_random_engine engine{rd()};
300 std::uniform_real_distribution<double> darts{0.0, 1.0};
302 for (
int i = 0; i < numberOfTrials; ++i) {
303 double dartX = darts(engine);
304 double dartY = darts(engine);
321 std::cerr <<
"[ERROR] PoolAllocator ran out of memory!\n";
322 std::exit(EXIT_FAILURE);
326 thread_local std::mt19937_64 engine(std::random_device{}());
327 std::uniform_real_distribution<double> dist(0.0, 1.0);
334 alignas(32)
double randX[batch], randY[batch];
335 loopEnd = numberOfTrials - (numberOfTrials % batch);
337 for (
int i = 0; i < loopEnd; i+= batch) {
338 for (
int j = 0; j < batch; ++j) {
339 randX[j] = dist(engine);
340 randY[j] = dist(engine);
342 __m256d dartX = _mm256_load_pd(randX);
343 __m256d dartY = _mm256_load_pd(randY);
344 *hits += countInsideCircle_AVX(dartX, dartY);
346 #elif defined(USE_NEON)
348 alignas(16)
double randX[batch], randY[batch];
349 loopEnd = numberOfTrials - (numberOfTrials % batch);
351 for (
int i = 0; i < loopEnd; i += batch) {
352 for (
int j = 0; j < batch; ++j) {
353 randX[j] = dist(engine);
354 randY[j] = dist(engine);
356 float64x2_t dartX = vld1q_f64(randX);
357 float64x2_t dartY = vld1q_f64(randY);
358 *hits += countInsideCircle_NEON(dartX, dartY);
362 for (
int i = loopEnd; i < numberOfTrials; ++i) {
363 double dartX = dist(engine);
364 double dartY = dist(engine);
int * monteCarloPI_SIMD(int numberOfTrials)
Estimates π using SIMD acceleration (AVX2 or NEON) and pool-allocated result storage.
int monteCarloPI_SEQUENTIAl(int numberOfTrials)
Estimates π using sequential dart throwing.
int * monteCarloPI_POOL(int numberOfTrials)
Estimates π using a thread-local memory pool (bump allocator).
bool isInsideCircle(double x, double y)
Checks if a 2D point lies inside the unit circle.
int * monteCarloPI_HEAP(int numberOfTrials)
Estimates π using heap-allocated result storage.
Fixed-size aligned pool allocator for high-performance simulations.
Fast aligned bump allocator for multithreaded simulations.
void reset()
Reset the allocator to reuse buffer (memory).
T * allocate(std::size_t align=alignof(T))
Allocates memory for type T with specified alignment (default = alignof(T)).