Single-node performance, Collide benchmark, P100
Performance in millions of particle-timesteps per second

Nparticles Kokkos/Cuda-1 (mpi) Kokkos/Cuda-2 (mpi) Kokkos/Cuda-4 (mpi)
32000 53.9 (1) 42.07 (2) 34.93 (4)
64000 92.84 (1) 73.57 (2) 64.65 (4)
128000 145.2 (1) 126.3 (2) 118.5 (4)
256000 203.4 (1) 210.8 (2) 208.3 (4)
512000 253.3 (1) 319.4 (2) 344.4 (4)
1024000 284 (1) 415.1 (2) 526.9 (4)
2048000 308.2 (1) 492.1 (2) 694.3 (4)
4096000 302.7 (1) 549.7 (2) 845.3 (4)
8192000 300.3 (1) 558.7 (2) 964.5 (4)
16384000 302 (1) 534.3 (2) 1002 (4)
32768000 303.5 (1) 540.5 (2) 987 (4)
65536000 None 527.5 (2) 998.9 (4)
131072000 None None 942.2 (4)

Run commands and logfile links for column Kokkos/Cuda-1

32000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=1.gpu=1
64000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=1.gpu=1
128000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=1.gpu=1
256000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=1.gpu=1
512000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=1.gpu=1
1024000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=1.gpu=1
2048000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=1.gpu=1
4096000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=1.gpu=1
8192000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=1.gpu=1
16384000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=1.gpu=1
32768000 mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=1.gpu=1
65536000 None
131072000 None

Run commands and logfile links for column Kokkos/Cuda-2

32000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2
64000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2
128000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2
256000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2
512000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2
1024000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2
2048000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2
4096000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2
8192000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2
16384000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2
32768000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2
65536000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=2.gpu=2
131072000 None

Run commands and logfile links for column Kokkos/Cuda-4

32000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4
64000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4
128000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4
256000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4
512000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4
1024000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4
2048000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4
4096000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4
8192000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4
16384000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4
32768000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4
65536000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4
131072000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride100.pkg=kokkos_cuda.kind=node.size=128M.node=1.mpi=4.gpu=4