Single node performance, Collide benchmark, K80
Performance in millions of particle-timesteps / second
| Nparticles | Kokkos/Cuda-1 (mpi) | Kokkos/Cuda-2 (mpi) | |
| 32000 | 20.03 (2) | 16.46 (4) | |
| 64000 | 34.69 (2) | 30.72 (4) | |
| 128000 | 62.77 (2) | 56.22 (4) | |
| 256000 | 98.92 (2) | 102.9 (4) | |
| 512000 | 124.5 (2) | 155.3 (4) | |
| 1024000 | 140.1 (2) | 213.1 (4) | |
| 2048000 | 142.8 (2) | 253 (4) | |
| 4096000 | 147.5 (2) | 266.8 (4) | |
| 8192000 | 145.8 (2) | 282.3 (4) | |
| 16384000 | 145.1 (2) | 277.4 (4) | |
| 32768000 | 143.3 (2) | 267.9 (4) | |
| 65536000 | None | 268.2 (4) |
| 32000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2 |
| 64000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2 |
| 128000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2 |
| 256000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2 |
| 512000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2 |
| 1024000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2 |
| 2048000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2 |
| 4096000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2 |
| 8192000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2 |
| 16384000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2 |
| 32768000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2 |
| 65536000 | None |
| 32000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4 |
| 64000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4 |
| 128000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4 |
| 256000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4 |
| 512000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4 |
| 1024000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4 |
| 2048000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4 |
| 4096000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4 |
| 8192000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4 |
| 16384000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4 |
| 32768000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4 |
| 65536000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride80_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.gpu.steps -log log.sparta.date=23Dec17.model=collide.machine=ride80.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4 |