Single node performance, Free benchmark, P100
Performance in millions of particle-timesteps / second
| Nparticles | Kokkos/Cuda-1 (mpi) | Kokkos/Cuda-2 (mpi) | Kokkos/Cuda-4 (mpi) | |
| 32000 | 89.93 (1) | 59.19 (2) | 45.34 (4) | |
| 64000 | 143.9 (1) | 101.4 (2) | 83.43 (4) | |
| 128000 | 206.4 (1) | 169 (2) | 151.7 (4) | |
| 256000 | 256.9 (1) | 271.8 (2) | 258.6 (4) | |
| 512000 | 269.5 (1) | 382 (2) | 417.4 (4) | |
| 1024000 | 258.2 (1) | 457.1 (2) | 632.5 (4) | |
| 2048000 | 247.8 (1) | 475.6 (2) | 792.3 (4) | |
| 4096000 | 238.1 (1) | 471.1 (2) | 858.7 (4) | |
| 8192000 | 232.3 (1) | 463.6 (2) | 879.6 (4) | |
| 16384000 | 230.1 (1) | 457 (2) | 879.9 (4) | |
| 32768000 | 227.9 (1) | 454.6 (2) | 872.9 (4) | |
| 65536000 | None | 451.6 (2) | 865.4 (4) | |
| 131072000 | None | None | 853.6 (4) |
| 32000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=1.gpu=1 |
| 64000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=1.gpu=1 |
| 128000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=1.gpu=1 |
| 256000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=1.gpu=1 |
| 512000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=1.gpu=1 |
| 1024000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=1.gpu=1 |
| 2048000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=1.gpu=1 |
| 4096000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=1.gpu=1 |
| 8192000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=1.gpu=1 |
| 16384000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=1.gpu=1 |
| 32768000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=1.gpu=1 |
| 65536000 | None |
| 131072000 | None |
| 32000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2 |
| 64000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2 |
| 128000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2 |
| 256000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2 |
| 512000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2 |
| 1024000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2 |
| 2048000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2 |
| 4096000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2 |
| 8192000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2 |
| 16384000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2 |
| 32768000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2 |
| 65536000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=2.gpu=2 |
| 131072000 | None |
| 32000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4 |
| 64000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4 |
| 128000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4 |
| 256000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4 |
| 512000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4 |
| 1024000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4 |
| 2048000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4 |
| 4096000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4 |
| 8192000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4 |
| 16384000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4 |
| 32768000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4 |
| 65536000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4 |
| 131072000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 320 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128M.node=1.mpi=4.gpu=4 |