Weak scaling performance, Collide benchmark, KNL, 16M particles/node
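Performance in millions of particle-timesteps / second / node
Values in parentheses give the run configuration: mpi = MPI tasks per node, thread = OpenMP threads per MPI task, hyper = hardware threads used per core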

Nodes CPU/KNL (mpi,hyper) Kokkos/KNL (mpi,thread,hyper) Kokkos/serial/KNL (mpi,hyper)
1 258.5 (256,4) 276.4 (128,2,4) 282.7 (256,4)
2 251.2 (256,4) 269.7 (128,2,4) 270.3 (256,4)
4 250.4 (256,4) 262.9 (64,4,4) 268.8 (256,4)
8 248 (256,4) 256.2 (128,2,4) 268.5 (256,4)
16 245.8 (256,4) 251.9 (128,2,4) 266.6 (256,4)
32 241.1 (256,4) 247.2 (128,2,4) 262.6 (256,4)
64 230.3 (256,4) 246.8 (128,2,4) 260.8 (256,4)

Run commands and logfile links for column CPU/KNL

1 srun -n 256 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=1.mpi=256.hyper=4
2 srun -n 512 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=2.mpi=256.hyper=4
4 srun -n 1024 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=4.mpi=256.hyper=4
8 srun -n 2048 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=8.mpi=256.hyper=4
16 srun -n 4096 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 256 -v y 320 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=16.mpi=256.hyper=4
32 srun -n 8192 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 512 -v y 320 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=32.mpi=256.hyper=4
64 srun -n 16384 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_knl -v x 512 -v y 320 -v z 640 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=cpu_knl.kind=weak.size=16M.node=64.mpi=256.hyper=4

Run commands and logfile links for column Kokkos/KNL

1 setenv OMP_NUM_THREADS 2; srun -n 128 -C knl --ntasks-per-node 128 --cpu_bind=threads -c 2 ./spa_mutrino_kokkos_knl -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=1.mpi=128.thread=2.hyper=4
2 setenv OMP_NUM_THREADS 2; srun -n 256 -C knl --ntasks-per-node 128 --cpu_bind=threads -c 2 ./spa_mutrino_kokkos_knl -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=2.mpi=128.thread=2.hyper=4
4 setenv OMP_NUM_THREADS 4; srun -n 256 -C knl --ntasks-per-node 64 --cpu_bind=cores -c 4 ./spa_mutrino_kokkos_knl -sf kk -k on t 4 -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=4.mpi=64.thread=4.hyper=4
8 setenv OMP_NUM_THREADS 2; srun -n 1024 -C knl --ntasks-per-node 128 --cpu_bind=threads -c 2 ./spa_mutrino_kokkos_knl -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=8.mpi=128.thread=2.hyper=4
16 setenv OMP_NUM_THREADS 2; srun -n 2048 -C knl --ntasks-per-node 128 --cpu_bind=threads -c 2 ./spa_mutrino_kokkos_knl -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 320 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=16.mpi=128.thread=2.hyper=4
32 setenv OMP_NUM_THREADS 2; srun -n 4096 -C knl --ntasks-per-node 128 --cpu_bind=threads -c 2 ./spa_mutrino_kokkos_knl -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 512 -v y 320 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=32.mpi=128.thread=2.hyper=4
64 setenv OMP_NUM_THREADS 2; srun -n 8192 -C knl --ntasks-per-node 128 --cpu_bind=threads -c 2 ./spa_mutrino_kokkos_knl -sf kk -k on t 2 -pk kokkos reduction parallel/reduce comm classic -v x 512 -v y 320 -v z 640 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_knl.kind=weak.size=16M.node=64.mpi=128.thread=2.hyper=4

Run commands and logfile links for column Kokkos/serial/KNL

1 srun -n 256 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 80 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=1.mpi=256.hyper=4
2 srun -n 512 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 128 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=2.mpi=256.hyper=4
4 srun -n 1024 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 160 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=4.mpi=256.hyper=4
8 srun -n 2048 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 160 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=8.mpi=256.hyper=4
16 srun -n 4096 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 256 -v y 320 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=16.mpi=256.hyper=4
32 srun -n 8192 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 512 -v y 320 -v z 320 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=32.mpi=256.hyper=4
64 srun -n 16384 -C knl --ntasks-per-node 256 --cpu_bind=threads -c 1 ./spa_mutrino_kokkos_serial_knl -sf kk -k on -pk kokkos reduction parallel/reduce comm classic -v x 512 -v y 320 -v z 640 -v t 100 -in in.collide.steps -log log.sparta.date=23Dec17.model=collide.machine=mutrino.pkg=kokkos_serial_knl.kind=weak.size=16M.node=64.mpi=256.hyper=4