Single node performance, Sphere benchmark, P100
Performance in millions of particle-timesteps per second; the number in parentheses after each value is the MPI task count used for that run
Nparticles | Kokkos/Cuda-1 (mpi) | Kokkos/Cuda-2 (mpi) | Kokkos/Cuda-4 (mpi) | |
32000 | 25.52 (1) | 24.98 (2) | 23.43 (4) | |
64000 | 50.94 (1) | 48.98 (2) | 47.9 (4) | |
128000 | 96.21 (1) | 90.6 (2) | 90.01 (4) | |
256000 | 163.2 (1) | 175.2 (2) | 168.2 (4) | |
512000 | 240.6 (1) | 296.5 (2) | 322.3 (4) | |
1024000 | 305.5 (1) | 403.4 (2) | 485.6 (4) | |
2048000 | 354.8 (1) | 550.5 (2) | 739.3 (4) | |
4096000 | 383.6 (1) | 658.1 (2) | 1023 (4) | |
8192000 | 382.1 (1) | 648.7 (2) | 1108 (4) | |
16384000 | 371.8 (1) | 675.4 (2) | 1211 (4) | |
32768000 | None | 676.9 (2) | 1270 (4) | |
65536000 | None | None | 1055 (4) | |
32000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=1.gpu=1 |
64000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=1.gpu=1 |
128000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=1.gpu=1 |
256000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=1.gpu=1 |
512000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=1.gpu=1 |
1024000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=1.gpu=1 |
2048000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=1.gpu=1 |
4096000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=1.gpu=1 |
8192000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=1.gpu=1 |
16384000 | mpirun -np 1 --npernode 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 1 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=1.gpu=1 |
32768000 | None |
65536000 | None |
32000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=2.gpu=2 |
64000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=2.gpu=2 |
128000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=2.gpu=2 |
256000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=2.gpu=2 |
512000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=2.gpu=2 |
1024000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=2.gpu=2 |
2048000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=2.gpu=2 |
4096000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=2.gpu=2 |
8192000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=2.gpu=2 |
16384000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=2.gpu=2 |
32768000 | mpirun -np 2 --npersocket 1 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 2 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=2.gpu=2 |
65536000 | None |
32000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 10 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=32K.node=1.mpi=4.gpu=4 |
64000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 16 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=64K.node=1.mpi=4.gpu=4 |
128000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 20 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=128K.node=1.mpi=4.gpu=4 |
256000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 20 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=256K.node=1.mpi=4.gpu=4 |
512000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 32 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=512K.node=1.mpi=4.gpu=4 |
1024000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 40 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=1M.node=1.mpi=4.gpu=4 |
2048000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 40 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=2M.node=1.mpi=4.gpu=4 |
4096000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 64 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=4M.node=1.mpi=4.gpu=4 |
8192000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 80 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=8M.node=1.mpi=4.gpu=4 |
16384000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 80 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=16M.node=1.mpi=4.gpu=4 |
32768000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 128 -v y 160 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=32M.node=1.mpi=4.gpu=4 |
65536000 | mpirun -np 4 --npersocket 2 --bind-to core spa_ride100_kokkos_cuda -sf kk -k on g 4 -pk kokkos reduction atomic comm threaded -v x 256 -v y 160 -v z 160 -v t 100 -in in.sphere.gpu.steps -log log.sparta.date=23Dec17.model=sphere.machine=ride100.pkg=kokkos_cuda.kind=node.size=64M.node=1.mpi=4.gpu=4 |