Single node performance, Collide benchmark, Haswell
Performance in millions of particle-timesteps / second

Nparticles CPU (mpi,hyper) Kokkos/OMP (mpi,hyper,thread) Kokkos/serial (mpi,hyper)
32000 452.3 (32,1) 323.5 (32,2,2) 377.1 (32,1)
64000 518.8 (64,2) 430.9 (32,2,2) 462.3 (32,1)
128000 614 (64,2) 504.4 (32,2,2) 523.1 (64,2)
256000 675.4 (64,2) 551.7 (32,2,2) 607 (64,2)
512000 616.7 (64,2) 515.4 (32,2,2) 539.3 (64,2)
1024000 288.4 (64,2) 256.9 (32,2,2) 255.2 (64,2)
2048000 244.4 (64,2) 242.2 (32,2,2) 245.3 (64,2)
4096000 220.1 (64,2) 215.7 (32,2,2) 220.1 (64,2)
8192000 185.4 (64,2) 179.2 (32,2,2) 187.7 (64,2)
16384000 153.1 (64,2) 154.6 (32,2,2) 160.9 (64,2)
32768000 132.9 (64,2) 134.5 (32,2,2) 141.6 (64,2)
65536000 123.1 (64,2) 123.4 (32,2,2) 130.2 (64,2)
131072000 116.6 (64,2) 112.4 (32,2,2) 121.2 (64,2)

Run commands and logfile links for column CPU

32000 srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=rank -c 2 ./spa_mutrino_cpu -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=32K.node=1.mpi=32.hyper=1
64000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=64K.node=1.mpi=64.hyper=2
128000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=128K.node=1.mpi=64.hyper=2
256000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=256K.node=1.mpi=64.hyper=2
512000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=512K.node=1.mpi=64.hyper=2
1024000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=1M.node=1.mpi=64.hyper=2
2048000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=2M.node=1.mpi=64.hyper=2
4096000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=4M.node=1.mpi=64.hyper=2
8192000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=8M.node=1.mpi=64.hyper=2
16384000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=16M.node=1.mpi=64.hyper=2
32768000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=32M.node=1.mpi=64.hyper=2
65536000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=64M.node=1.mpi=64.hyper=2
131072000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_cpu -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu.kind=node.size=128M.node=1.mpi=64.hyper=2

Run commands and logfile links for column Kokkos/OMP

32000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=32K.node=1.mpi=32.thread=2.hyper=2
64000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=64K.node=1.mpi=32.thread=2.hyper=2
128000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=128K.node=1.mpi=32.thread=2.hyper=2
256000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=256K.node=1.mpi=32.thread=2.hyper=2
512000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=512K.node=1.mpi=32.thread=2.hyper=2
1024000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=1M.node=1.mpi=32.thread=2.hyper=2
2048000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=2M.node=1.mpi=32.thread=2.hyper=2
4096000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=4M.node=1.mpi=32.thread=2.hyper=2
8192000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=8M.node=1.mpi=32.thread=2.hyper=2
16384000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=16M.node=1.mpi=32.thread=2.hyper=2
32768000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=32M.node=1.mpi=32.thread=2.hyper=2
65536000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=64M.node=1.mpi=32.thread=2.hyper=2
131072000 setenv OMP_NUM_THREADS 2; srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=cores -c 2 ./spa_mutrino_kokkos_omp -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_omp.kind=node.size=128M.node=1.mpi=32.thread=2.hyper=2

Run commands and logfile links for column Kokkos/serial

32000 srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=rank -c 2 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=32K.node=1.mpi=32.hyper=1
64000 srun -n 32 -C haswell --ntasks-per-node 32 --cpu_bind=rank -c 2 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=64K.node=1.mpi=32.hyper=1
128000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=128K.node=1.mpi=64.hyper=2
256000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=256K.node=1.mpi=64.hyper=2
512000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=512K.node=1.mpi=64.hyper=2
1024000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=1M.node=1.mpi=64.hyper=2
2048000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=2M.node=1.mpi=64.hyper=2
4096000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=4M.node=1.mpi=64.hyper=2
8192000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=8M.node=1.mpi=64.hyper=2
16384000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=16M.node=1.mpi=64.hyper=2
32768000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=32M.node=1.mpi=64.hyper=2
65536000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=64M.node=1.mpi=64.hyper=2
131072000 srun -n 64 -C haswell --ntasks-per-node 64 --cpu_bind=rank -c 1 ./spa_mutrino_kokkos_serial -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial.kind=node.size=128M.node=1.mpi=64.hyper=2