Single node performance, Sphere benchmark, K80
Performance in millions of particle-timesteps / second

Nparticles Kokkos/Cuda-1 (mpi) Kokkos/Cuda-2 (mpi)
32000 13.67 (2) 12.07 (4)
64000 27.56 (2) 27.11 (4)
128000 49.92 (2) 45.55 (4)
256000 93.53 (2) 91.73 (4)
512000 134.2 (2) 167.9 (4)
1024000 159.6 (2) 230.5 (4)
2048000 188.6 (2) 301.1 (4)
4096000 202.4 (2) 363.1 (4)
8192000 174.9 (2) 339.8 (4)
16384000 194.5 (2) 333.4 (4)
32768000 190.8 (2) 374.7 (4)
65536000 None 288.8 (4)

Run commands and logfile links for column Kokkos/Cuda-1

32000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2
64000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2
128000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2
256000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2
512000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2
1024000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2
2048000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2
4096000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2
8192000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2
16384000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2
32768000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2
65536000 None

Run commands and logfile links for column Kokkos/Cuda-2

32000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4
64000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4
128000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4
256000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4
512000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4
1024000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4
2048000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4
4096000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4
8192000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4
16384000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4
32768000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4
65536000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride80.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4