Single node performance, Free benchmark, P100
Performance in millions of particle-timesteps / second

Nparticles Kokkos/Cuda-1 (mpi) Kokkos/Cuda-2 (mpi) Kokkos/Cuda-4 (mpi)
32000 89.93 (1) 59.19 (2) 45.34 (4)
64000 143.9 (1) 101.4 (2) 83.43 (4)
128000 206.4 (1) 169 (2) 151.7 (4)
256000 256.9 (1) 271.8 (2) 258.6 (4)
512000 269.5 (1) 382 (2) 417.4 (4)
1024000 258.2 (1) 457.1 (2) 632.5 (4)
2048000 247.8 (1) 475.6 (2) 792.3 (4)
4096000 238.1 (1) 471.1 (2) 858.7 (4)
8192000 232.3 (1) 463.6 (2) 879.6 (4)
16384000 230.1 (1) 457 (2) 879.9 (4)
32768000 227.9 (1) 454.6 (2) 872.9 (4)
65536000 None 451.6 (2) 865.4 (4)
131072000 None None 853.6 (4)

Run commands and logfile links for column Kokkos/Cuda-1

32000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=1.gpu=1
64000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=1.gpu=1
128000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=1.gpu=1
256000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=1.gpu=1
512000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=1.gpu=1
1024000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=1.gpu=1
2048000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=1.gpu=1
4096000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=1.gpu=1
8192000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=1.gpu=1
16384000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=1.gpu=1
32768000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=1.gpu=1
65536000 None
131072000 None

Run commands and logfile links for column Kokkos/Cuda-2

32000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2
64000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2
128000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2
256000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2
512000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2
1024000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2
2048000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2
4096000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2
8192000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2
16384000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2
32768000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2
65536000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=2.gpu=2
131072000 None

Run commands and logfile links for column Kokkos/Cuda-4

32000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4
64000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4
128000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4
256000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4
512000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4
1024000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4
2048000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4
4096000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4
8192000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4
16384000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4
32768000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4
65536000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4
131072000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 320 -v t 100 -in in.free.steps -log log.sparta.date=23Dec17.model=free.machine=ride100.pkg=kokkos_cuda.kind=node.size=128M.node=1.mpi=4.gpu=4