Single core performance, Sphere benchmark, SandyBridge
Performance in millions of particle-timesteps / second
Nparticles | CPU (mpi) | Kokkos/OMP (mpi) | Kokkos/serial (mpi) |
8000 | 15.69 (1) | 10.11 (1) | 14.84 (1) |
16000 | 25.87 (1) | 10.1 (1) | 22 (1) |
8000 | mpirun -n 1 -N 1 --bind-to core spa_chama_cpu -v x 8 -v y 10 -v z 10 -v t 100 -in in.sphere.steps -log log.sparta.date=23Dec17.model=sphere.machine=chama.pkg=cpu.kind=core.size=8K.node=1.mpi=1 |
16000 | mpirun -n 1 -N 1 --bind-to core spa_chama_cpu -v x 16 -v y 10 -v z 10 -v t 100 -in in.sphere.steps -log log.sparta.date=23Dec17.model=sphere.machine=chama.pkg=cpu.kind=core.size=16K.node=1.mpi=1 |
8000 | mpirun -n 1 -N 1 --bind-to socket spa_chama_kokkos_omp -sf kk -k on t 1 -pk kokkos reduction parallel/reduce comm classic -v x 8 -v y 10 -v z 10 -v t 100 -in in.sphere.steps -log log.sparta.date=23Dec17.model=sphere.machine=chama.pkg=kokkos_omp.kind=core.size=8K.node=1.mpi=1.thread=1 |
16000 | mpirun -n 1 -N 1 --bind-to socket spa_chama_kokkos_omp -sf kk -k on t 1 -pk kokkos reduction parallel/reduce comm classic -v x 16 -v y 10 -v z 10 -v t 100 -in in.sphere.steps -log log.sparta.date=23Dec17.model=sphere.machine=chama.pkg=kokkos_omp.kind=core.size=16K.node=1.mpi=1.thread=1 |
8000 | mpirun -n 1 -N 1 --bind-to core spa_chama_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 8 -v y 10 -v z 10 -v t 100 -in in.sphere.steps -log log.sparta.date=23Dec17.model=sphere.machine=chama.pkg=kokkos_serial.kind=core.size=8K.node=1.mpi=1 |
16000 | mpirun -n 1 -N 1 --bind-to core spa_chama_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 16 -v y 10 -v z 10 -v t 100 -in in.sphere.steps -log log.sparta.date=23Dec17.model=sphere.machine=chama.pkg=kokkos_serial.kind=core.size=16K.node=1.mpi=1 |