Single node performance, Free benchmark, P100
Performance in millions of particle-timesteps / second
Nparticles | Kokkos/Cuda-1 (mpi) | Kokkos/Cuda-2 (mpi) | Kokkos/Cuda-4 (mpi) | |
32000 | 89.93 (1) | 59.19 (2) | 45.34 (4) | |
64000 | 143.9 (1) | 101.4 (2) | 83.43 (4) | |
128000 | 206.4 (1) | 169 (2) | 151.7 (4) | |
256000 | 256.9 (1) | 271.8 (2) | 258.6 (4) | |
512000 | 269.5 (1) | 382 (2) | 417.4 (4) | |
1024000 | 258.2 (1) | 457.1 (2) | 632.5 (4) | |
2048000 | 247.8 (1) | 475.6 (2) | 792.3 (4) | |
4096000 | 238.1 (1) | 471.1 (2) | 858.7 (4) | |
8192000 | 232.3 (1) | 463.6 (2) | 879.6 (4) | |
16384000 | 230.1 (1) | 457 (2) | 879.9 (4) | |
32768000 | 227.9 (1) | 454.6 (2) | 872.9 (4) | |
65536000 | None | 451.6 (2) | 865.4 (4) | |
131072000 | None | None | 853.6 (4) |
32000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=1.gpu=1 |
64000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=1.gpu=1 |
128000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=1.gpu=1 |
256000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=1.gpu=1 |
512000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=1.gpu=1 |
1024000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=1.gpu=1 |
2048000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=1.gpu=1 |
4096000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=1.gpu=1 |
8192000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=1.gpu=1 |
16384000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=1.gpu=1 |
32768000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=1.gpu=1 |
65536000 | None |
131072000 | None |
32000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2 |
64000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2 |
128000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2 |
256000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2 |
512000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2 |
1024000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2 |
2048000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2 |
4096000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2 |
8192000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2 |
16384000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2 |
32768000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2 |
65536000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=2.gpu=2 |
131072000 | None |
32000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4 |
64000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4 |
128000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4 |
256000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4 |
512000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4 |
1024000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4 |
2048000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4 |
4096000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4 |
8192000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4 |
16384000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4 |
32768000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4 |
65536000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4 |
131072000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 320 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128M.node=1.mpi=4.gpu=4 |