montecarlo-benchmarking-engine/html/montecarlo_8hpp_source.html

// ========================================

// montecarlo.hpp - SIMD Monte Carlo Engine

// ========================================


#pragma once


#include "pool.hpp"

#include <chrono>

#include <iostream>

#include <random>

#include <thread>

#include <memory>


#if defined(USE_AVX)

    #include <immintrin.h>

#elif defined(__ARM_NEON) || defined(__ARM_NEON__)

    #define USE_NEON

    #include <arm_neon.h>

#else

    #error "No SIMD instruction set supported. Compile with USE_AVX or USE_NEON"

#endif


/**
 * @brief Tests whether the point (x, y) lies inside or on the unit circle.
 *
 * @param x Horizontal coordinate of the point.
 * @param y Vertical coordinate of the point.
 * @return true when x*x + y*y <= 1.0, false otherwise (including NaN inputs,
 *         for which the comparison is false).
 */
inline bool isInsideCircle(double x, double y) {
    // Compare squared distances so no square root is needed.
    const double distanceSquared = x * x + y * y;
    return distanceSquared <= 1.0;
}


#ifdef USE_AVX

/**
 * @brief Counts how many of four packed points lie inside or on the unit circle.
 *
 * Lane i of @p x and lane i of @p y together form one point.
 *
 * @param x Four packed x coordinates.
 * @param y Four packed y coordinates.
 * @return Number of lanes (0..4) for which x*x + y*y <= 1.0.
 */
inline int countInsideCircle_AVX(__m256d x, __m256d y) {
    // Squared distance from the origin, computed per lane.
    const __m256d distSquared = _mm256_add_pd(_mm256_mul_pd(x, x), _mm256_mul_pd(y, y));

    // Ordered, non-signalling "<=" compare: lanes holding NaN compare false.
    const __m256d insideMask = _mm256_cmp_pd(distSquared, _mm256_set1_pd(1.0), _CMP_LE_OQ);

    // One sign bit per lane is packed into the low four bits; count the set bits.
    const int laneBits = _mm256_movemask_pd(insideMask);
    return __builtin_popcount(laneBits);
}

#endif


#ifdef USE_NEON

/**
 * @brief Counts how many of two packed points lie inside or on the unit circle.
 *
 * Lane i of @p x and lane i of @p y together form one point.
 *
 * @param x Two packed x coordinates.
 * @param y Two packed y coordinates.
 * @return Number of lanes (0..2) for which x*x + y*y <= 1.0.
 */
inline int countInsideCircle_NEON(float64x2_t x, float64x2_t y) {
    // Squared distance from the origin, computed per lane.
    const float64x2_t distSquared = vaddq_f64(vmulq_f64(x, x), vmulq_f64(y, y));

    // Each lane of the mask is non-zero where the comparison holds.
    const uint64x2_t insideMask = vcleq_f64(distSquared, vdupq_n_f64(1.0));

    const int firstLaneHit = (vgetq_lane_u64(insideMask, 0) != 0) ? 1 : 0;
    const int secondLaneHit = (vgetq_lane_u64(insideMask, 1) != 0) ? 1 : 0;
    return firstLaneHit + secondLaneHit;
}

#endif


/**
 * @brief Throws `numberOfTrials` random darts at the unit square and counts
 *        those that land inside the unit circle.
 *
 * Both coordinates are drawn from a uniform distribution over [0, 1) using a
 * std::default_random_engine seeded from std::random_device on every call.
 * The function returns the raw hit count, not π itself; callers can estimate
 * π as 4.0 * hits / numberOfTrials.
 *
 * Declared `inline` like every other function in this header so that including
 * the header from more than one translation unit does not produce
 * multiple-definition link errors.
 *
 * @param numberOfTrials Number of darts to throw; values <= 0 yield 0.
 * @return Number of darts for which isInsideCircle() returned true.
 */
inline int monteCarloPI_SEQUENTIAl(int numberOfTrials) {
    std::random_device rd {};
    std::default_random_engine engine {rd()};
    std::uniform_real_distribution<double> darts{0.0, 1.0};

    int hits = 0;
    for (int i = 0; i < numberOfTrials; ++i) {
        // Draw x before y in separate statements so the order in which the
        // engine is consumed is well defined.
        const double dartX = darts(engine);
        const double dartY = darts(engine);
        if (isInsideCircle(dartX, dartY)) ++hits;
    }
    return hits;
}


/**
 * @brief Same dart-throwing experiment as the sequential variant, but the hit
 *        count is returned in a freshly heap-allocated int.
 *
 * Coordinates are drawn uniformly from [0, 1) with a std::default_random_engine
 * seeded from std::random_device on every call.
 *
 * @param numberOfTrials Number of darts to throw; values <= 0 yield a count of 0.
 * @return Pointer to an int created with `new` that holds the hit count.
 *         The caller owns it and must release it with `delete`.
 */
inline int* monteCarloPI_HEAP(int numberOfTrials) {
    std::random_device seedSource{};
    std::default_random_engine rng{seedSource()};
    std::uniform_real_distribution<double> unitInterval{0.0, 1.0};

    int insideCount = 0;
    for (int remaining = numberOfTrials; remaining > 0; --remaining) {
        // x is drawn before y; keep them as separate statements.
        const double px = unitInterval(rng);
        const double py = unitInterval(rng);
        insideCount += isInsideCircle(px, py) ? 1 : 0;
    }

    return new int{insideCount};
}


/**
 * @brief Dart-throwing experiment whose hit counter lives in a thread-local
 *        PoolAllocator instead of on the heap.
 *
 * The pool is constructed once per thread with the argument 64 * 1024 and is
 * reset at the start of every call before the counter is allocated from it.
 *
 * @param numberOfTrials Number of darts to throw; values <= 0 yield a count of 0.
 * @return Pointer to the hit count inside the thread-local pool.
 *         NOTE(review): because the pool is reset on every call, the pointer
 *         presumably stays valid only until the next call on the same thread —
 *         confirm against PoolAllocator::reset in pool.hpp.
 * @note Prints an error and terminates the process with
 *       std::exit(EXIT_FAILURE) when the pool returns a null pointer.
 */
inline int* monteCarloPI_POOL(int numberOfTrials) {
    thread_local PoolAllocator pool(64 * 1024);
    pool.reset();

    int* const counter = pool.allocate<int>();
    if (counter == nullptr) {
        std::cerr << "[ERROR] PoolAllocator ran out of memory!\n";
        std::exit(EXIT_FAILURE);
    }
    *counter = 0;

    std::random_device seedSource;
    std::default_random_engine rng{seedSource()};
    std::uniform_real_distribution<double> unitInterval{0.0, 1.0};

    for (int remaining = numberOfTrials; remaining > 0; --remaining) {
        // x is drawn before y; keep them as separate statements.
        const double px = unitInterval(rng);
        const double py = unitInterval(rng);
        if (isInsideCircle(px, py)) {
            *counter += 1;
        }
    }

    return counter;
}


/**
 * @brief Dart-throwing experiment that tests darts in SIMD batches and stores
 *        the hit count in a thread-local PoolAllocator.
 *
 * Coordinates are drawn one at a time from a thread-local std::mt19937_64
 * (seeded once per thread from std::random_device), staged in small aligned
 * arrays, and then tested 4 at a time with AVX or 2 at a time with NEON.
 * Trials that do not fill a complete batch are handled by a scalar tail loop.
 *
 * The batch size is a compile-time constant so the staging buffers are
 * ordinary fixed-size arrays rather than variable-length arrays, which are
 * not part of standard C++.
 *
 * @param numberOfTrials Number of darts to throw; values <= 0 yield a count of 0.
 * @return Pointer to the hit count inside the thread-local pool.
 *         NOTE(review): because the pool is reset on every call, the pointer
 *         presumably stays valid only until the next call on the same thread —
 *         confirm against PoolAllocator::reset in pool.hpp.
 * @note Prints an error and terminates the process with
 *       std::exit(EXIT_FAILURE) when the pool returns a null pointer.
 */
inline int* monteCarloPI_SIMD(int numberOfTrials) {
    thread_local PoolAllocator pool(64 * 1024);
    pool.reset();

    int* hits = pool.allocate<int>();
    if (!hits) {
        std::cerr << "[ERROR] PoolAllocator ran out of memory!\n";
        std::exit(EXIT_FAILURE);
    }
    *hits = 0;

    thread_local std::mt19937_64 engine(std::random_device{}());
    std::uniform_real_distribution<double> dist(0.0, 1.0);

    // Number of darts tested per SIMD step, plus staging buffers aligned for
    // the matching vector load.
#if defined(USE_AVX)
    constexpr int batch = 4;
    alignas(32) double randX[batch];
    alignas(32) double randY[batch];
#elif defined(USE_NEON)
    constexpr int batch = 2;
    alignas(16) double randX[batch];
    alignas(16) double randY[batch];
#else
    // Scalar fallback. The header's include section already rejects builds
    // without a SIMD backend, so this only keeps the function well-formed.
    constexpr int batch = 1;
    double randX[batch];
    double randY[batch];
#endif

    // Largest multiple of `batch` that does not exceed numberOfTrials.
    const int loopEnd = numberOfTrials - (numberOfTrials % batch);

    for (int i = 0; i < loopEnd; i += batch) {
        // x and y are drawn alternately, one dart at a time.
        for (int j = 0; j < batch; ++j) {
            randX[j] = dist(engine);
            randY[j] = dist(engine);
        }
#if defined(USE_AVX)
        *hits += countInsideCircle_AVX(_mm256_load_pd(randX), _mm256_load_pd(randY));
#elif defined(USE_NEON)
        *hits += countInsideCircle_NEON(vld1q_f64(randX), vld1q_f64(randY));
#else
        if (isInsideCircle(randX[0], randY[0])) ++(*hits);
#endif
    }

    // Scalar tail: the remaining (numberOfTrials % batch) darts.
    for (int i = loopEnd; i < numberOfTrials; ++i) {
        const double dartX = dist(engine);
        const double dartY = dist(engine);
        if (isInsideCircle(dartX, dartY)) ++(*hits);
    }

    return hits;
}


monteCarloPI_SIMD
int * monteCarloPI_SIMD(int numberOfTrials)
Estimates π using SIMD acceleration (AVX or NEON) and pool-allocated result storage.
Definition montecarlo.hpp:315

monteCarloPI_SEQUENTIAl
int monteCarloPI_SEQUENTIAl(int numberOfTrials)
Estimates π using sequential dart throwing.
Definition montecarlo.hpp:249

monteCarloPI_POOL
int * monteCarloPI_POOL(int numberOfTrials)
Estimates π using a thread-local memory pool (bump allocator).
Definition montecarlo.hpp:287

isInsideCircle
bool isInsideCircle(double x, double y)
Checks if a 2D point lies inside the unit circle.
Definition montecarlo.hpp:206

monteCarloPI_HEAP
int * monteCarloPI_HEAP(int numberOfTrials)
Estimates π using heap-allocated result storage.
Definition montecarlo.hpp:268

pool.hpp
Fixed-size aligned pool allocator for high-performance simulations.

PoolAllocator
Fast aligned bump allocator for multithreaded simulations.
Definition pool.hpp:137

PoolAllocator::reset
void reset()
Reset the allocator to reuse buffer (memory).
Definition pool.hpp:183

PoolAllocator::allocate
T * allocate(std::size_t align=alignof(T))
Allocates memory for type T with specified alignment (default = alignof(T)).
Definition pool.hpp:168