Single node performance, Collide benchmark, K80
Performance in millions of particle-timesteps / second

Nparticles Kokkos/Cuda-1 (mpi) Kokkos/Cuda-2 (mpi)
32000 20.03 (2) 16.46 (4)
64000 34.69 (2) 30.72 (4)
128000 62.77 (2) 56.22 (4)
256000 98.92 (2) 102.9 (4)
512000 124.5 (2) 155.3 (4)
1024000 140.1 (2) 213.1 (4)
2048000 142.8 (2) 253 (4)
4096000 147.5 (2) 266.8 (4)
8192000 145.8 (2) 282.3 (4)
16384000 145.1 (2) 277.4 (4)
32768000 143.3 (2) 267.9 (4)
65536000 None 268.2 (4)

Run commands and logfile links for column Kokkos/Cuda-1

32000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2
64000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2
128000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2
256000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2
512000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2
1024000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2
2048000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2
4096000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2
8192000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2
16384000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2
32768000 mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2
65536000 None

Run commands and logfile links for column Kokkos/Cuda-2

32000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4
64000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4
128000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4
256000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4
512000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4
1024000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4
2048000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4
4096000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4
8192000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4
16384000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4
32768000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4
65536000 mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4