Generating new blocksizes to LIBCUSMM

Samuel Andermatt samuel.a... at student.ethz.ch
Wed Oct 29 09:51:01 UTC 2014


I attach the most recent parameters file from the development version (20 
times more parameters than the 2.5.1 version). Maybe this can help. I often add 
new blocksizes to the file and so far it has always worked, but I am using the 
development version.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.cp2k.org/archives/cp2k-user/attachments/20141029/1d5baf83/attachment.htm>
-------------- next part --------------
# *****************************************************************************
# * CP2K: A general program to perform molecular dynamics simulations         *
# * Copyright (C) 2000 - 2014 the CP2K developers group                       *
# *****************************************************************************

[
  Kernel_dnt_medium(m=13, n=16, k=8, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 148.468 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 218.026 GFlop/s
  Kernel_dnt_tiny(m=8, n=4, k=13, split_thread=32, threads=128, grouping=16, minblocks=1) , # 55.0971 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 125.593 GFlop/s
  Kernel_dnt_medium(m=8, n=16, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 177.658 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 127.127 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 66.6945 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 196.169 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=16, tile_m=3, tile_n=2, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 332.873 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 162.657 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=23, tile_m=2, tile_n=2, w=6, v=12, threads=160, grouping=16, minblocks=8) , # 307.93 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=17, tile_m=2, tile_n=3, w=6, v=12, threads=160, grouping=16, minblocks=8) , # 275.462 GFlop/s
  Kernel_dnt_medium(m=23, n=26, k=9, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 239.736 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=5, tile_m=5, tile_n=2, w=2, v=18, threads=96, grouping=16, minblocks=12) , # 190.562 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=26, tile_m=2, tile_n=3, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 289.631 GFlop/s
  Kernel_dnt_medium(m=22, n=23, k=16, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 295.869 GFlop/s
  Kernel_dnt_largeDB(m=22, n=6, k=24, tile_m=2, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=1) , # 152.082 GFlop/s
  Kernel_dnt_largeDB(m=8, n=32, k=23, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 224.463 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 107.426 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 129.899 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 76.7893 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 93.856 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 147.658 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 125.062 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 178.719 GFlop/s
  Kernel_dnt_largeDB(m=16, n=29, k=55, tile_m=2, tile_n=4, w=6, v=24, threads=96, grouping=16, minblocks=12) , # 360.672 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=32, tile_m=2, tile_n=4, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 407.594 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 220.321 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 60.4434 GFlop/s
  Kernel_dnt_tiny(m=8, n=8, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 72.9942 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 127.421 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 54.048 GFlop/s
  Kernel_dnt_largeDB(m=9, n=16, k=32, tile_m=1, tile_n=2, w=14, v=16, threads=128, grouping=16, minblocks=12) , # 179.16 GFlop/s
  Kernel_dnt_tiny(m=6, n=4, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 17.3648 GFlop/s
  Kernel_dnt_medium(m=26, n=22, k=8, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 217.745 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 197.735 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 111.233 GFlop/s
  Kernel_dnt_medium(m=17, n=24, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 165.872 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 111.101 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=13, tile_m=2, tile_n=2, w=4, v=22, threads=96, grouping=16, minblocks=12) , # 223.224 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=13, split_thread=32, threads=128, grouping=16, minblocks=1) , # 50.7414 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 98.0009 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=9, tile_m=4, tile_n=2, threads=128, grouping=16, minblocks=8) , # 240.63 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=6, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 111.035 GFlop/s
  Kernel_dnt_largeDB(m=9, n=23, k=26, tile_m=2, tile_n=1, w=10, v=12, threads=128, grouping=16, minblocks=12) , # 199.492 GFlop/s
  Kernel_dnt_largeDB(m=8, n=6, k=32, tile_m=1, tile_n=1, w=16, v=6, threads=128, grouping=16, minblocks=12) , # 112.447 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=17, tile_m=2, tile_n=4, w=6, v=8, threads=96, grouping=16, minblocks=12) , # 263.517 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 107.115 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=32, tile_m=2, tile_n=3, w=4, v=14, threads=96, grouping=16, minblocks=12) , # 340.251 GFlop/s
  Kernel_dnt_largeDB(m=16, n=64, k=9, tile_m=2, tile_n=4, w=4, v=40, threads=128, grouping=16, minblocks=4) , # 281.238 GFlop/s
  Kernel_dnt_largeDB(m=144, n=12, k=12, tile_m=2, tile_n=4, w=6, v=8, threads=288, grouping=16, minblocks=4) , # 268.209 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 117.022 GFlop/s
  Kernel_dnt_largeDB(m=22, n=22, k=64, tile_m=3, tile_n=3, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 397.616 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=16, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 254.265 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 119.067 GFlop/s
  Kernel_dnt_largeDB(m=22, n=9, k=26, tile_m=2, tile_n=2, w=10, v=8, threads=96, grouping=16, minblocks=12) , # 197.52 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=17, tile_m=2, tile_n=4, w=8, v=10, threads=96, grouping=16, minblocks=12) , # 349.834 GFlop/s
  Kernel_dnt_small(m=8, n=24, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 99.7498 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=6, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 175.112 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 99.4373 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=6, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 152.914 GFlop/s
  Kernel_dnt_small(m=13, n=4, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 59.7354 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 81.9869 GFlop/s
  Kernel_dnt_small(m=9, n=5, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 65.8594 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 109.914 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 80.9386 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 39.9008 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 64.9733 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 32.8542 GFlop/s
  Kernel_dnt_largeDB(m=26, n=5, k=24, tile_m=1, tile_n=1, w=12, v=4, threads=160, grouping=16, minblocks=12) , # 141.37 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=22, tile_m=2, tile_n=3, w=6, v=16, threads=96, grouping=16, minblocks=12) , # 271.196 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=22, tile_m=2, tile_n=4, w=6, v=28, threads=96, grouping=16, minblocks=12) , # 328.078 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=32, tile_m=3, tile_n=2, w=14, v=24, threads=128, grouping=16, minblocks=8) , # 360.604 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=13, tile_m=2, tile_n=4, w=6, v=12, threads=96, grouping=16, minblocks=12) , # 328.765 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 96.638 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 203.566 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=8, tile_m=3, tile_n=2, w=4, v=28, threads=96, grouping=16, minblocks=12) , # 230.442 GFlop/s
  Kernel_dnt_tiny(m=16, n=8, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 105.373 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 94.6116 GFlop/s
  Kernel_dnt_largeDB(m=22, n=5, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=12) , # 146.023 GFlop/s
  Kernel_dnt_largeDB(m=4, n=17, k=24, tile_m=1, tile_n=1, w=12, v=14, threads=128, grouping=16, minblocks=12) , # 107.141 GFlop/s
  Kernel_dnt_largeDB(m=24, n=6, k=22, tile_m=1, tile_n=2, w=8, v=6, threads=96, grouping=16, minblocks=4) , # 154.487 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=13, tile_m=2, tile_n=4, w=6, v=20, threads=128, grouping=16, minblocks=8) , # 353.58 GFlop/s
  Kernel_dnt_largeDB(m=8, n=13, k=23, tile_m=1, tile_n=1, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 145.317 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=4) , # 283.731 GFlop/s
  Kernel_dnt_small(m=8, n=23, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 105.176 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=8, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 110.66 GFlop/s
  Kernel_dnt_largeDB(m=5, n=26, k=23, tile_m=1, tile_n=2, w=6, v=26, threads=96, grouping=16, minblocks=4) , # 133.003 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=9, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 241.517 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 102.229 GFlop/s
  Kernel_dnt_small(m=16, n=16, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 205.186 GFlop/s
  Kernel_dnt_largeDB(m=13, n=169, k=13, tile_m=4, tile_n=3, w=6, v=96, threads=256, grouping=16, minblocks=1) , # 241.485 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 27.0068 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=24, tile_m=3, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=8) , # 396.116 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=24, tile_m=3, tile_n=2, w=12, v=32, threads=128, grouping=16, minblocks=8) , # 384.75 GFlop/s
  Kernel_dnt_small(m=8, n=26, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 103.296 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 155.546 GFlop/s
  Kernel_dnt_medium(m=16, n=14, k=14, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 215.62 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 199.591 GFlop/s
  Kernel_dnt_largeDB(m=23, n=13, k=22, tile_m=2, tile_n=2, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 226.737 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=9, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 155.464 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=17, tile_m=2, tile_n=2, w=8, v=18, threads=128, grouping=16, minblocks=12) , # 265.788 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 128.998 GFlop/s
  Kernel_dnt_largeDB(m=9, n=17, k=24, tile_m=2, tile_n=1, w=8, v=10, threads=96, grouping=16, minblocks=1) , # 166.06 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 27.2512 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=23, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 167.944 GFlop/s
  Kernel_dnt_largeDB(m=24, n=17, k=16, tile_m=2, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=12) , # 267.336 GFlop/s
  Kernel_dnt_largeDB(m=24, n=16, k=24, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 312.346 GFlop/s
  Kernel_dnt_largeDB(m=22, n=6, k=23, tile_m=2, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 144.445 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 150.001 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=26, tile_m=2, tile_n=4, w=12, v=22, threads=128, grouping=16, minblocks=8) , # 385.585 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=4) , # 131.777 GFlop/s
  Kernel_dnt_largeDB(m=23, n=17, k=24, tile_m=2, tile_n=2, w=10, v=10, threads=128, grouping=16, minblocks=12) , # 274.855 GFlop/s
  Kernel_dnt_medium(m=6, n=24, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.271 GFlop/s
  Kernel_dnt_small(m=8, n=32, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 120.164 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 97.7841 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=32, tile_m=3, tile_n=1, w=8, v=22, threads=128, grouping=16, minblocks=12) , # 249.642 GFlop/s
  Kernel_dnt_largeDB(m=29, n=55, k=55, tile_m=3, tile_n=5, w=6, v=30, threads=128, grouping=16, minblocks=1) , # 465.271 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=23, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=8) , # 135.879 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 192.82 GFlop/s
  Kernel_dnt_largeDB(m=4, n=24, k=24, tile_m=1, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=4) , # 126.228 GFlop/s
  Kernel_dnt_largeDB(m=17, n=4, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=96, grouping=16, minblocks=1) , # 116.496 GFlop/s
  Kernel_dnt_largeDB(m=14, n=16, k=29, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 232.968 GFlop/s
  Kernel_dnt_largeDB(m=9, n=81, k=9, tile_m=5, tile_n=2, w=2, v=52, threads=96, grouping=16, minblocks=12) , # 202.245 GFlop/s
  Kernel_dnt_largeDB(m=17, n=17, k=22, tile_m=2, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=12) , # 235.625 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=4, tile_m=3, tile_n=2, w=2, v=12, threads=96, grouping=16, minblocks=12) , # 151.001 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 188.6 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=17, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 143.132 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 14.511 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=4, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 138.141 GFlop/s
  Kernel_dnt_largeDB(m=24, n=6, k=32, tile_m=2, tile_n=1, w=16, v=6, threads=128, grouping=16, minblocks=12) , # 171.985 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 82.1029 GFlop/s
  Kernel_dnt_small(m=8, n=32, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 126.86 GFlop/s
  Kernel_dnt_medium(m=24, n=6, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 115.359 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 72.3602 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=26, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 305.955 GFlop/s
  Kernel_dnt_largeDB(m=6, n=26, k=32, tile_m=1, tile_n=1, w=12, v=26, threads=160, grouping=16, minblocks=12) , # 172.981 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 90.3877 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 192.731 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 180.861 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=22, tile_m=2, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 171.029 GFlop/s
  Kernel_dnt_medium(m=32, n=23, k=6, tile_m=2, tile_n=3, threads=160, grouping=16, minblocks=8) , # 219.29 GFlop/s
  Kernel_dnt_largeDB(m=17, n=13, k=32, tile_m=2, tile_n=2, w=16, v=8, threads=96, grouping=16, minblocks=12) , # 238.862 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 63.9439 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 214.841 GFlop/s
  Kernel_dnt_medium(m=22, n=24, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 190.547 GFlop/s
  Kernel_dnt_largeDB(m=16, n=24, k=26, tile_m=2, tile_n=3, w=10, v=24, threads=96, grouping=16, minblocks=12) , # 309.598 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 50.5155 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=26, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 119.54 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=17, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 324.51 GFlop/s
  Kernel_dnt_largeDB(m=9, n=24, k=23, tile_m=2, tile_n=2, w=10, v=24, threads=128, grouping=16, minblocks=12) , # 207.575 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 225.673 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 59.0194 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=22, tile_m=2, tile_n=3, w=4, v=20, threads=128, grouping=16, minblocks=12) , # 368.406 GFlop/s
  Kernel_dnt_largeDB(m=13, n=9, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=12) , # 167.98 GFlop/s
  Kernel_dnt_medium(m=16, n=26, k=4, tile_m=1, tile_n=5, threads=128, grouping=16, minblocks=12) , # 145.64 GFlop/s
  Kernel_dnt_largeDB(m=81, n=9, k=9, tile_m=3, tile_n=3, w=4, v=6, threads=128, grouping=16, minblocks=8) , # 189.642 GFlop/s
  Kernel_dnt_largeDB(m=13, n=23, k=26, tile_m=2, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 235.319 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 143.712 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=32, tile_m=2, tile_n=3, w=14, v=14, threads=128, grouping=16, minblocks=8) , # 341.076 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=17, tile_m=2, tile_n=1, w=8, v=10, threads=96, grouping=16, minblocks=4) , # 175.381 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 75.0572 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=26, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 333.812 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=22, tile_m=3, tile_n=1, w=8, v=20, threads=128, grouping=16, minblocks=12) , # 230.249 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 164.48 GFlop/s
  Kernel_dnt_small(m=13, n=8, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 71.3921 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 139.231 GFlop/s
  Kernel_dnt_largeDB(m=22, n=8, k=32, tile_m=3, tile_n=2, w=16, v=8, threads=96, grouping=16, minblocks=12) , # 205.185 GFlop/s
  Kernel_dnt_medium(m=32, n=22, k=5, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 195.309 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 88.9991 GFlop/s
  Kernel_dnt_small(m=8, n=13, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 98.14 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=1) , # 222.329 GFlop/s
  Kernel_dnt_largeDB(m=6, n=23, k=32, tile_m=2, tile_n=1, w=8, v=16, threads=96, grouping=16, minblocks=8) , # 163.779 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=24, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 311.437 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 98.1766 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=8, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 165.821 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 246.715 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 87.1416 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 51.5816 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 212.052 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 105.056 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 64.6405 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 102.189 GFlop/s
  Kernel_dnt_medium(m=8, n=16, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 91.6492 GFlop/s
  Kernel_dnt_largeDB(m=17, n=23, k=26, tile_m=3, tile_n=2, w=10, v=14, threads=96, grouping=16, minblocks=12) , # 264.933 GFlop/s
  Kernel_dnt_tiny(m=5, n=6, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 31.0513 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 243.482 GFlop/s
  Kernel_dnt_tiny(m=4, n=8, k=9, split_thread=32, threads=128, grouping=16, minblocks=1) , # 48.4776 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 223.837 GFlop/s
  Kernel_dnt_medium(m=9, n=16, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 125.152 GFlop/s
  Kernel_dnt_medium(m=26, n=23, k=9, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 221.968 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 78.998 GFlop/s
  Kernel_dnt_largeDB(m=9, n=26, k=16, tile_m=1, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=1) , # 192.481 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=23, tile_m=2, tile_n=4, w=8, v=28, threads=128, grouping=16, minblocks=8) , # 365.224 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=22, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 144.101 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 152.933 GFlop/s
  Kernel_dnt_small(m=13, n=9, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 87.8396 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=5, tile_m=5, tile_n=2, w=2, v=14, threads=96, grouping=16, minblocks=12) , # 191.213 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 106.81 GFlop/s
  Kernel_dnt_largeDB(m=23, n=17, k=23, tile_m=3, tile_n=2, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 252.781 GFlop/s
  Kernel_dnt_largeDB(m=6, n=24, k=16, tile_m=1, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 155.64 GFlop/s
  Kernel_dnt_largeDB(m=5, n=22, k=32, tile_m=1, tile_n=1, w=16, v=22, threads=192, grouping=16, minblocks=8) , # 143.787 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=24, tile_m=2, tile_n=3, w=12, v=26, threads=128, grouping=16, minblocks=8) , # 312.05 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 123.841 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 50.8013 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=23, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 116.314 GFlop/s
  Kernel_dnt_small(m=6, n=6, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 74.696 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 86.0905 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 71.0928 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 114.392 GFlop/s
  Kernel_dnt_small(m=23, n=4, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 63.3487 GFlop/s
  Kernel_dnt_medium(m=24, n=23, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 292.274 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=23, tile_m=2, tile_n=3, w=4, v=14, threads=96, grouping=16, minblocks=12) , # 314.354 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 119.531 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=22, tile_m=2, tile_n=3, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 306.524 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=26, tile_m=1, tile_n=1, threads=224, grouping=16, minblocks=8) , # 155.587 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=13, split_thread=32, threads=96, grouping=16, minblocks=1) , # 30.4696 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=6, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 183.2 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 77.9372 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 67.6162 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=16, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 172.49 GFlop/s
  Kernel_dnt_largeDB(m=8, n=6, k=24, tile_m=1, tile_n=1, w=12, v=6, threads=128, grouping=16, minblocks=1) , # 99.2986 GFlop/s
  Kernel_dnt_largeDB(m=9, n=16, k=64, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 207.542 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=22, tile_m=2, tile_n=3, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 307.973 GFlop/s
  Kernel_dnt_largeDB(m=26, n=8, k=22, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 191.661 GFlop/s
  Kernel_dnt_largeDB(m=23, n=9, k=24, tile_m=2, tile_n=2, w=12, v=6, threads=128, grouping=16, minblocks=12) , # 205.963 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 98.0181 GFlop/s
  Kernel_dnt_medium(m=24, n=8, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 157.455 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=8) , # 242.184 GFlop/s
  Kernel_dnt_small(m=5, n=17, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 67.0057 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 66.887 GFlop/s
  Kernel_dnt_largeDB(m=32, n=8, k=24, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 231.024 GFlop/s
  Kernel_dnt_largeDB(m=17, n=17, k=24, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 240.597 GFlop/s
  Kernel_dnt_small(m=6, n=32, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 112.623 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=24, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 149.11 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=9, tile_m=1, tile_n=5, threads=96, grouping=16, minblocks=12) , # 187.291 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=17, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 275.019 GFlop/s
  Kernel_dnt_largeDB(m=17, n=23, k=23, tile_m=3, tile_n=2, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 257.187 GFlop/s
  Kernel_dnt_largeDB(m=22, n=13, k=26, tile_m=3, tile_n=2, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 236.465 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 29.4563 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 75.903 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=32, tile_m=2, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 351.318 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=24, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 358.655 GFlop/s
  Kernel_dnt_tiny(m=6, n=4, k=9, split_thread=32, threads=96, grouping=16, minblocks=1) , # 30.9154 GFlop/s
  Kernel_dnt_medium(m=26, n=22, k=13, tile_m=2, tile_n=4, threads=96, grouping=16, minblocks=8) , # 262.523 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=23, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 209.048 GFlop/s
  Kernel_dnt_medium(m=29, n=16, k=14, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 277.48 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=17, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 71.4887 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 144.081 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=26, split_thread=32, threads=128, grouping=16, minblocks=1) , # 46.0797 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 26.0691 GFlop/s
  Kernel_dnt_largeDB(m=64, n=64, k=9, tile_m=4, tile_n=4, w=4, v=16, threads=256, grouping=16, minblocks=1) , # 271.502 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 72.194 GFlop/s
  Kernel_dnt_small(m=9, n=9, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 71.0707 GFlop/s
  Kernel_dnt_small(m=16, n=32, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=4) , # 149.795 GFlop/s
  Kernel_dnt_largeDB(m=9, n=23, k=23, tile_m=2, tile_n=2, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 198.019 GFlop/s
  Kernel_dnt_tiny(m=4, n=8, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 34.4605 GFlop/s
  Kernel_dnt_largeDB(m=22, n=64, k=22, tile_m=6, tile_n=2, w=4, v=32, threads=128, grouping=16, minblocks=1) , # 383.067 GFlop/s
  Kernel_dnt_largeDB(m=11, n=11, k=121, tile_m=3, tile_n=2, w=16, v=6, threads=96, grouping=16, minblocks=12) , # 206.461 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 123.866 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=1) , # 179.675 GFlop/s
  Kernel_dnt_medium(m=23, n=32, k=6, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 221.286 GFlop/s
  Kernel_dnt_largeDB(m=26, n=13, k=26, tile_m=3, tile_n=2, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 260.053 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 227.299 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 110.639 GFlop/s
  Kernel_dnt_medium(m=6, n=32, k=17, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 172.856 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 136.359 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=17, tile_m=3, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 281.315 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=17, tile_m=2, tile_n=3, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 301.954 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=24, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 337.228 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 43.6465 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 94.7326 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 117.722 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 71.0281 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=6, tile_m=4, tile_n=2, w=2, v=20, threads=96, grouping=16, minblocks=12) , # 216.291 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=23, tile_m=2, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=12) , # 192.48 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=23, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 142.897 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 253.057 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=24, tile_m=2, tile_n=3, w=12, v=32, threads=128, grouping=16, minblocks=8) , # 374.035 GFlop/s
  Kernel_dnt_largeDB(m=17, n=26, k=22, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 277.137 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 79.1819 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=4, tile_m=4, tile_n=1, threads=96, grouping=16, minblocks=12) , # 116.674 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=17, split_thread=32, threads=128, grouping=16, minblocks=1) , # 52.168 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 174.284 GFlop/s
  Kernel_dnt_medium(m=9, n=16, k=22, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 167.882 GFlop/s
  Kernel_dnt_largeDB(m=9, n=8, k=16, tile_m=1, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=1) , # 106.792 GFlop/s
  Kernel_dnt_medium(m=11, n=11, k=11, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 148.474 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=23, tile_m=3, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 328.197 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=13, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 123.606 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 109.711 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 117.985 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=23, tile_m=4, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 363.288 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 230.437 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=26, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 152.263 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 55.1226 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=16, tile_m=2, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=1) , # 186.548 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=9, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 153.966 GFlop/s
  Kernel_dnt_largeDB(m=5, n=26, k=26, tile_m=1, tile_n=2, w=10, v=26, threads=96, grouping=16, minblocks=4) , # 136.283 GFlop/s
  Kernel_dnt_largeDB(m=22, n=22, k=22, tile_m=2, tile_n=2, w=6, v=22, threads=160, grouping=16, minblocks=8) , # 299.905 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=14, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 253.252 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 112.051 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=22, tile_m=2, tile_n=3, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 383.032 GFlop/s
  Kernel_dnt_small(m=13, n=16, k=4, tile_m=2, tile_n=2, threads=64, grouping=16, minblocks=4) , # 100.974 GFlop/s
  Kernel_dnt_largeDB(m=24, n=17, k=23, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 270.301 GFlop/s
  Kernel_dnt_largeDB(m=64, n=16, k=9, tile_m=2, tile_n=4, w=4, v=10, threads=128, grouping=16, minblocks=8) , # 283.571 GFlop/s
  Kernel_dnt_largeDB(m=29, n=29, k=29, tile_m=4, tile_n=2, w=10, v=16, threads=128, grouping=16, minblocks=8) , # 387.473 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=23, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 337.924 GFlop/s
  Kernel_dnt_largeDB(m=17, n=16, k=16, tile_m=3, tile_n=1, w=4, v=10, threads=96, grouping=16, minblocks=4) , # 206.356 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 82.9676 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 74.465 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=64, tile_m=2, tile_n=2, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 349.601 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 146.004 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=16, split_thread=32, threads=128, grouping=16, minblocks=1) , # 52.9062 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=24, split_thread=32, threads=128, grouping=16, minblocks=1) , # 62.8983 GFlop/s
  Kernel_dnt_largeDB(m=4, n=17, k=23, tile_m=1, tile_n=1, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 99.7952 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 195.482 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 122.699 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=8, tile_m=3, tile_n=2, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 225.962 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=22, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 273.938 GFlop/s
  Kernel_dnt_largeDB(m=13, n=9, k=16, tile_m=1, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=1) , # 147.281 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=23, tile_m=2, tile_n=4, w=8, v=16, threads=128, grouping=16, minblocks=8) , # 370.605 GFlop/s
  Kernel_dnt_small(m=17, n=6, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 69.3811 GFlop/s
  Kernel_dnt_small(m=17, n=9, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 77.9497 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 107.42 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 255.027 GFlop/s
  Kernel_dnt_medium(m=22, n=26, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 196.805 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 150.841 GFlop/s
  Kernel_dnt_largeDB(m=24, n=8, k=16, tile_m=1, tile_n=2, w=8, v=6, threads=96, grouping=16, minblocks=4) , # 196.938 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 67.3989 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 94.7041 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 78.1446 GFlop/s
  Kernel_dnt_largeDB(m=64, n=16, k=16, tile_m=2, tile_n=4, w=8, v=10, threads=128, grouping=16, minblocks=8) , # 379.404 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=9, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 177.331 GFlop/s
  Kernel_dnt_largeDB(m=5, n=24, k=17, tile_m=1, tile_n=1, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 128.385 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 140.85 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=8, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 207.226 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 112.777 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 73.8541 GFlop/s
  Kernel_dnt_largeDB(m=32, n=5, k=24, tile_m=2, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 157.474 GFlop/s
  Kernel_dnt_largeDB(m=32, n=4, k=32, tile_m=1, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=1) , # 141.034 GFlop/s
  Kernel_dnt_largeDB(m=17, n=8, k=24, tile_m=2, tile_n=3, w=12, v=6, threads=96, grouping=16, minblocks=12) , # 158.743 GFlop/s
  Kernel_dnt_small(m=5, n=9, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 49.4117 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=128, grouping=16, minblocks=8) , # 317.704 GFlop/s
  Kernel_dnt_largeDB(m=10, n=10, k=100, tile_m=2, tile_n=2, w=20, v=10, threads=96, grouping=16, minblocks=12) , # 193.259 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 217.014 GFlop/s
  Kernel_dnt_largeDB(m=64, n=64, k=16, tile_m=4, tile_n=4, w=6, v=16, threads=256, grouping=16, minblocks=1) , # 363.539 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 115.513 GFlop/s
  Kernel_dnt_largeDB(m=22, n=17, k=22, tile_m=2, tile_n=4, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 256.316 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 194.858 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=8, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 118.582 GFlop/s
  Kernel_dnt_largeDB(m=225, n=15, k=15, tile_m=3, tile_n=3, w=4, v=12, threads=384, grouping=16, minblocks=1) , # 248.307 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=4, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 111.31 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=22, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=1) , # 152.464 GFlop/s
  Kernel_dnt_medium(m=32, n=23, k=13, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 337.911 GFlop/s
  Kernel_dnt_largeDB(m=26, n=17, k=16, tile_m=2, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=12) , # 281.091 GFlop/s
  Kernel_dnt_small(m=26, n=16, k=8, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 201.911 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=24, tile_m=2, tile_n=1, w=8, v=12, threads=96, grouping=16, minblocks=8) , # 196.5 GFlop/s
  Kernel_dnt_medium(m=6, n=32, k=8, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 136.523 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=6, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 80.5021 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=24, tile_m=2, tile_n=3, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 338.308 GFlop/s
  Kernel_dnt_largeDB(m=16, n=17, k=26, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 235.825 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 72.5704 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 171.063 GFlop/s
  Kernel_dnt_small(m=9, n=5, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 53.1872 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 110.912 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=22, tile_m=2, tile_n=4, w=6, v=10, threads=96, grouping=16, minblocks=12) , # 333.461 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 114.163 GFlop/s
  Kernel_dnt_largeDB(m=6, n=9, k=16, tile_m=1, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 84.561 GFlop/s
  Kernel_dnt_tiny(m=6, n=8, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 56.4219 GFlop/s
  Kernel_dnt_largeDB(m=32, n=9, k=16, tile_m=2, tile_n=2, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 219.794 GFlop/s
  Kernel_dnt_largeDB(m=16, n=256, k=16, tile_m=2, tile_n=6, w=6, v=168, threads=384, grouping=16, minblocks=1) , # 309.179 GFlop/s
  Kernel_dnt_medium(m=121, n=11, k=11, tile_m=5, tile_n=3, threads=128, grouping=16, minblocks=1) , # 233.211 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=32, tile_m=2, tile_n=3, w=12, v=6, threads=96, grouping=16, minblocks=12) , # 305.311 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=24, split_thread=32, threads=128, grouping=16, minblocks=1) , # 59.2859 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=23, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 144.441 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 181.465 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=9, tile_m=3, tile_n=3, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 275.596 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 57.6518 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=6, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 181.421 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 115.635 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 143.929 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=16, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 305.622 GFlop/s
  Kernel_dnt_tiny(m=5, n=23, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 66.9592 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 57.5751 GFlop/s
  Kernel_dnt_largeDB(m=16, n=13, k=22, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 210.521 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=9, tile_m=2, tile_n=4, w=4, v=20, threads=128, grouping=16, minblocks=1) , # 297.07 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 189.61 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=12) , # 155.289 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 290.16 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 117.999 GFlop/s
  Kernel_dnt_largeDB(m=9, n=5, k=16, tile_m=1, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=12) , # 70.7421 GFlop/s
  Kernel_dnt_small(m=9, n=6, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 58.9838 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 217.955 GFlop/s
  Kernel_dnt_largeDB(m=26, n=17, k=32, tile_m=2, tile_n=2, w=16, v=12, threads=160, grouping=16, minblocks=8) , # 314.433 GFlop/s
  Kernel_dnt_tiny(m=6, n=6, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 41.3431 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 120.103 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=5, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 72.9937 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=24, tile_m=3, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 345.783 GFlop/s
  Kernel_dnt_largeDB(m=17, n=26, k=8, tile_m=2, tile_n=2, w=4, v=26, threads=128, grouping=16, minblocks=12) , # 205.713 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 125.969 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 77.7652 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 174.994 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 148.462 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 107.39 GFlop/s
  Kernel_dnt_small(m=9, n=16, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 76.4401 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 179.544 GFlop/s
  Kernel_dnt_medium(m=26, n=23, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 154.026 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=8, tile_m=3, tile_n=3, w=4, v=28, threads=128, grouping=16, minblocks=8) , # 245.658 GFlop/s
  Kernel_dnt_medium(m=24, n=6, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 85.2808 GFlop/s
  Kernel_dnt_small(m=22, n=8, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 94.6278 GFlop/s
  Kernel_dnt_largeDB(m=14, n=29, k=14, tile_m=2, tile_n=2, w=4, v=20, threads=128, grouping=16, minblocks=12) , # 231.308 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 136.463 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 78.7224 GFlop/s
  Kernel_dnt_largeDB(m=26, n=6, k=24, tile_m=1, tile_n=1, w=12, v=6, threads=160, grouping=16, minblocks=12) , # 167.712 GFlop/s
  Kernel_dnt_medium(m=23, n=22, k=4, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=8) , # 146.324 GFlop/s
  Kernel_dnt_medium(m=6, n=24, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 142.641 GFlop/s
  Kernel_dnt_tiny(m=16, n=5, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 68.9342 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 63.6804 GFlop/s
  Kernel_dnt_tiny(m=8, n=16, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 77.4084 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=17, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 84.4738 GFlop/s
  Kernel_dnt_largeDB(m=9, n=24, k=32, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 225.901 GFlop/s
  Kernel_dnt_largeDB(m=13, n=13, k=169, tile_m=2, tile_n=2, w=18, v=8, threads=96, grouping=16, minblocks=12) , # 258.16 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 59.3358 GFlop/s
  Kernel_dnt_small(m=32, n=8, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 143.901 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 118.278 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=32, tile_m=2, tile_n=1, w=8, v=10, threads=96, grouping=16, minblocks=8) , # 221.005 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=6, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 154.74 GFlop/s
  Kernel_dnt_largeDB(m=16, n=14, k=29, tile_m=2, tile_n=1, w=8, v=14, threads=128, grouping=16, minblocks=12) , # 231.585 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 146.774 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 138.391 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=6, split_thread=32, threads=64, grouping=16, minblocks=1) , # 23.5514 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=26, tile_m=2, tile_n=3, w=4, v=18, threads=128, grouping=16, minblocks=12) , # 361.325 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 136.959 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 123.976 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=17, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 156.458 GFlop/s
  Kernel_dnt_largeDB(m=17, n=22, k=32, tile_m=2, tile_n=2, w=16, v=22, threads=128, grouping=16, minblocks=8) , # 279.426 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 28.7395 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 216.881 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 98.7893 GFlop/s
  Kernel_dnt_tiny(m=24, n=5, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 70.448 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 99.4363 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 284.219 GFlop/s
  Kernel_dnt_small(m=32, n=6, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 97.4182 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=23, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=4) , # 138.103 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=22, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 171.276 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 195.973 GFlop/s
  Kernel_dnt_small(m=16, n=9, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 102.023 GFlop/s
  Kernel_dnt_tiny(m=5, n=22, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 64.6247 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 131.17 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=9, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 160.328 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=23, tile_m=2, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=1) , # 199.689 GFlop/s
  Kernel_dnt_small(m=17, n=17, k=5, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=8) , # 144.444 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=5, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 195.723 GFlop/s
  Kernel_dnt_largeDB(m=16, n=24, k=22, tile_m=2, tile_n=3, w=8, v=18, threads=96, grouping=16, minblocks=12) , # 298.156 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=4) , # 224.809 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 78.1791 GFlop/s
  Kernel_dnt_tiny(m=4, n=16, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 56.7665 GFlop/s
  Kernel_dnt_medium(m=15, n=15, k=15, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 226.439 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=22, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=4) , # 135.963 GFlop/s
  Kernel_dnt_largeDB(m=26, n=9, k=26, tile_m=2, tile_n=1, w=10, v=6, threads=128, grouping=16, minblocks=12) , # 212.628 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 92.322 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=17, split_thread=32, threads=128, grouping=16, minblocks=1) , # 34.9185 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=17, tile_m=2, tile_n=3, w=6, v=20, threads=160, grouping=16, minblocks=8) , # 300.411 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 116.875 GFlop/s
  Kernel_dnt_medium(m=16, n=26, k=8, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 203.682 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 127.087 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=23, tile_m=2, tile_n=2, w=6, v=24, threads=160, grouping=16, minblocks=8) , # 332.175 GFlop/s
  Kernel_dnt_largeDB(m=64, n=22, k=22, tile_m=6, tile_n=2, w=4, v=12, threads=128, grouping=16, minblocks=4) , # 383.088 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=1) , # 145.765 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=4, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 77.9597 GFlop/s
  Kernel_dnt_largeDB(m=5, n=8, k=32, tile_m=1, tile_n=1, w=16, v=8, threads=128, grouping=16, minblocks=12) , # 94.1701 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.972 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=26, tile_m=3, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 237.184 GFlop/s
  Kernel_dnt_medium(m=17, n=32, k=5, tile_m=3, tile_n=1, threads=192, grouping=16, minblocks=8) , # 168.196 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=16, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 415.932 GFlop/s
  Kernel_dnt_tiny(m=8, n=13, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 63.3347 GFlop/s
  Kernel_dnt_largeDB(m=26, n=16, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 289.704 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=22, tile_m=2, tile_n=1, w=8, v=12, threads=96, grouping=16, minblocks=8) , # 187.535 GFlop/s
  Kernel_dnt_medium(m=6, n=32, k=26, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=1) , # 182.386 GFlop/s
  Kernel_dnt_small(m=22, n=22, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=4) , # 163.707 GFlop/s
  Kernel_dnt_small(m=4, n=26, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 58.9419 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 146.616 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=5, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 141.12 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 129.203 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 54.6039 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 100.125 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=16, tile_m=2, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=8) , # 184.93 GFlop/s
  Kernel_dnt_medium(m=22, n=32, k=5, tile_m=2, tile_n=2, threads=192, grouping=16, minblocks=8) , # 199.383 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 275.819 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=8, tile_m=3, tile_n=3, threads=128, grouping=16, minblocks=8) , # 252.144 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=32, tile_m=3, tile_n=2, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 367.949 GFlop/s
  Kernel_dnt_largeDB(m=24, n=13, k=23, tile_m=2, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 245.584 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 214.408 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 123.944 GFlop/s
  Kernel_dnt_medium(m=26, n=23, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 273.966 GFlop/s
  Kernel_dnt_largeDB(m=256, n=16, k=16, tile_m=2, tile_n=6, w=6, v=10, threads=384, grouping=16, minblocks=1) , # 309.19 GFlop/s
  Kernel_dnt_largeDB(m=16, n=23, k=24, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 295.553 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 83.1265 GFlop/s
  Kernel_dnt_largeDB(m=55, n=29, k=29, tile_m=3, tile_n=5, w=10, v=26, threads=160, grouping=16, minblocks=4) , # 383.513 GFlop/s
  Kernel_dnt_medium(m=13, n=8, k=22, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 163.585 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=24, tile_m=2, tile_n=4, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 385.833 GFlop/s
  Kernel_dnt_largeDB(m=16, n=55, k=29, tile_m=2, tile_n=4, w=8, v=40, threads=128, grouping=16, minblocks=8) , # 387.674 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=9, tile_m=5, tile_n=2, w=2, v=14, threads=96, grouping=16, minblocks=12) , # 254.073 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 86.2825 GFlop/s
  Kernel_dnt_medium(m=23, n=22, k=16, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 286.218 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 141.119 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 159.925 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=4, tile_m=5, tile_n=2, w=2, v=14, threads=96, grouping=16, minblocks=12) , # 170.104 GFlop/s
  Kernel_dnt_largeDB(m=24, n=9, k=26, tile_m=2, tile_n=2, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 211.244 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 94.8452 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 111.759 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 57.4498 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 187.859 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=17, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 123.868 GFlop/s
  Kernel_dnt_largeDB(m=8, n=16, k=24, tile_m=1, tile_n=1, w=12, v=16, threads=128, grouping=16, minblocks=12) , # 183.951 GFlop/s
  Kernel_dnt_largeDB(m=15, n=225, k=15, tile_m=3, tile_n=3, w=4, v=150, threads=384, grouping=16, minblocks=1) , # 258.751 GFlop/s
  Kernel_dnt_largeDB(m=23, n=13, k=32, tile_m=3, tile_n=2, w=12, v=8, threads=96, grouping=16, minblocks=12) , # 258.236 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=8, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 183.429 GFlop/s
  Kernel_dnt_largeDB(m=22, n=17, k=24, tile_m=2, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=12) , # 259.941 GFlop/s
  Kernel_dnt_largeDB(m=23, n=9, k=26, tile_m=2, tile_n=2, w=10, v=8, threads=96, grouping=16, minblocks=12) , # 201.066 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=17, tile_m=2, tile_n=3, w=6, v=16, threads=160, grouping=16, minblocks=8) , # 315.978 GFlop/s
  Kernel_dnt_tiny(m=6, n=5, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 30.7386 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=24, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 139.574 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=22, tile_m=2, tile_n=3, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 317.821 GFlop/s
  Kernel_dnt_largeDB(m=64, n=22, k=64, tile_m=2, tile_n=6, w=8, v=2, threads=128, grouping=16, minblocks=4) , # 507.361 GFlop/s
  Kernel_dnt_medium(m=8, n=22, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 142.267 GFlop/s
  Kernel_dnt_medium(m=23, n=26, k=8, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 223.371 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=24, tile_m=2, tile_n=2, w=12, v=24, threads=128, grouping=16, minblocks=1) , # 276.151 GFlop/s
  Kernel_dnt_medium(m=10, n=10, k=10, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 116.741 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 131.211 GFlop/s
  Kernel_dnt_largeDB(m=8, n=32, k=24, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 231.134 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=16, tile_m=2, tile_n=4, w=8, v=22, threads=128, grouping=16, minblocks=8) , # 336.964 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=32, tile_m=2, tile_n=4, w=8, v=10, threads=96, grouping=16, minblocks=12) , # 438.697 GFlop/s
  Kernel_dnt_largeDB(m=17, n=5, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=96, grouping=16, minblocks=4) , # 138.231 GFlop/s
  Kernel_dnt_largeDB(m=17, n=6, k=8, tile_m=1, tile_n=1, w=4, v=6, threads=128, grouping=16, minblocks=1) , # 87.7851 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=5, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 87.4288 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 121.912 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=6, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 168.045 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 119.94 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 74.9473 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 105.637 GFlop/s
  Kernel_dnt_largeDB(m=13, n=16, k=32, tile_m=2, tile_n=2, w=16, v=16, threads=96, grouping=16, minblocks=12) , # 232.99 GFlop/s
  Kernel_dnt_largeDB(m=23, n=8, k=24, tile_m=1, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=1) , # 198.683 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 220.147 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 149.521 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 239.748 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 101.297 GFlop/s
  Kernel_dnt_small(m=32, n=16, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 151.321 GFlop/s
  Kernel_dnt_tiny(m=6, n=4, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 20.891 GFlop/s
  Kernel_dnt_medium(m=26, n=22, k=9, tile_m=4, tile_n=2, threads=128, grouping=16, minblocks=8) , # 220.047 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 51.8175 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 61.8717 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 65.4323 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=17, tile_m=2, tile_n=3, w=6, v=20, threads=160, grouping=16, minblocks=8) , # 285.804 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=16, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 149.132 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=8, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 240.006 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=26, tile_m=2, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=4) , # 187.132 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=36, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 90.6049 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 106.972 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=23, tile_m=2, tile_n=3, w=8, v=16, threads=128, grouping=16, minblocks=8) , # 390.4 GFlop/s
  Kernel_dnt_largeDB(m=6, n=13, k=16, tile_m=1, tile_n=1, w=8, v=12, threads=128, grouping=16, minblocks=8) , # 109.832 GFlop/s
  Kernel_dnt_largeDB(m=64, n=22, k=9, tile_m=6, tile_n=2, w=4, v=12, threads=128, grouping=16, minblocks=1) , # 271.107 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 107.717 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 239.666 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.619 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=24, tile_m=2, tile_n=1, w=10, v=4, threads=96, grouping=16, minblocks=8) , # 175.852 GFlop/s
  Kernel_dnt_largeDB(m=16, n=8, k=16, tile_m=1, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=4) , # 166.502 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 126.35 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=16, tile_m=2, tile_n=4, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 358.681 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=4) , # 284.987 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 248.219 GFlop/s
  Kernel_dnt_small(m=8, n=24, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 111.391 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=5, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 165.435 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 130.403 GFlop/s
  Kernel_dnt_medium(m=22, n=23, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 270.773 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=5, tile_m=1, tile_n=3, threads=96, grouping=16, minblocks=12) , # 135.825 GFlop/s
  Kernel_dnt_small(m=13, n=4, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 56.8225 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 80.7153 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 100.382 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 157.793 GFlop/s
  Kernel_dnt_largeDB(m=26, n=8, k=16, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 185.8 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 39.5654 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 112.176 GFlop/s
  Kernel_dnt_largeDB(m=17, n=26, k=26, tile_m=3, tile_n=2, w=10, v=14, threads=96, grouping=16, minblocks=12) , # 288.933 GFlop/s
  Kernel_dnt_tiny(m=4, n=8, k=4, split_thread=32, threads=96, grouping=16, minblocks=1) , # 24.016 GFlop/s
  Kernel_dnt_small(m=9, n=9, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 123.42 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=23, tile_m=2, tile_n=2, w=6, v=24, threads=128, grouping=16, minblocks=12) , # 344.677 GFlop/s
  Kernel_dnt_largeDB(m=9, n=16, k=26, tile_m=1, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=8) , # 173.529 GFlop/s
  Kernel_dnt_largeDB(m=6, n=22, k=32, tile_m=2, tile_n=3, w=16, v=16, threads=96, grouping=16, minblocks=12) , # 160.565 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 88.8231 GFlop/s
  Kernel_dnt_largeDB(m=6, n=16, k=16, tile_m=1, tile_n=1, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 131.447 GFlop/s
  Kernel_dnt_largeDB(m=5, n=24, k=32, tile_m=1, tile_n=1, w=12, v=24, threads=160, grouping=16, minblocks=12) , # 151.834 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=17, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 264.544 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=6, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 170.444 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 90.0009 GFlop/s
  Kernel_dnt_largeDB(m=24, n=6, k=23, tile_m=2, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=1) , # 156.025 GFlop/s
  Kernel_dnt_largeDB(m=22, n=8, k=22, tile_m=2, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 185.428 GFlop/s
  Kernel_dnt_largeDB(m=9, n=9, k=64, tile_m=1, tile_n=1, w=14, v=6, threads=128, grouping=16, minblocks=12) , # 160.483 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=22, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 162.424 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 46.2498 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 109.247 GFlop/s
  Kernel_dnt_medium(m=8, n=23, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 102.42 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 99.3344 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 99.2729 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=22, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 138.328 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 117.51 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 131.294 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 223.508 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 106.256 GFlop/s
  Kernel_dnt_small(m=13, n=13, k=6, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 128.884 GFlop/s
  Kernel_dnt_medium(m=32, n=17, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 144.864 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=8, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 260.398 GFlop/s
  Kernel_dnt_tiny(m=4, n=8, k=13, split_thread=32, threads=128, grouping=16, minblocks=1) , # 54.9661 GFlop/s
  Kernel_dnt_largeDB(m=6, n=26, k=24, tile_m=1, tile_n=1, w=12, v=26, threads=160, grouping=16, minblocks=12) , # 167.824 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=32, tile_m=2, tile_n=2, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 296.36 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=32, tile_m=3, tile_n=2, w=8, v=22, threads=96, grouping=16, minblocks=12) , # 356.942 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=13, tile_m=2, tile_n=4, w=4, v=28, threads=128, grouping=16, minblocks=8) , # 304.826 GFlop/s
  Kernel_dnt_largeDB(m=22, n=6, k=16, tile_m=2, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=1) , # 143.963 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 153.077 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=32, tile_m=2, tile_n=4, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 360.103 GFlop/s
  Kernel_dnt_medium(m=32, n=17, k=5, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=1) , # 169.7 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 151.999 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 115.668 GFlop/s
  Kernel_dnt_medium(m=6, n=24, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 94.4076 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 91.9906 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=24, tile_m=2, tile_n=3, w=12, v=26, threads=128, grouping=16, minblocks=8) , # 321.263 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=22, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=4) , # 135.568 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=13, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=8) , # 140.095 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 91.5958 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 203.746 GFlop/s
  Kernel_dnt_largeDB(m=17, n=17, k=23, tile_m=2, tile_n=2, w=6, v=16, threads=128, grouping=16, minblocks=12) , # 237.176 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=4, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 150.444 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 196.824 GFlop/s
  Kernel_dnt_medium(m=9, n=16, k=13, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 138.713 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 142.579 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=17, tile_m=3, tile_n=3, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 334.409 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 29.8584 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 147.572 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 161.969 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=26, tile_m=2, tile_n=4, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 298.512 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 124.559 GFlop/s
  Kernel_dnt_medium(m=8, n=32, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 143.186 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=4) , # 145.376 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=5, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 132.674 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=24, tile_m=2, tile_n=3, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 321.113 GFlop/s
  Kernel_dnt_largeDB(m=22, n=17, k=32, tile_m=2, tile_n=2, w=16, v=16, threads=128, grouping=16, minblocks=8) , # 275.263 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 208.282 GFlop/s
  Kernel_dnt_small(m=9, n=13, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 86.6047 GFlop/s
  Kernel_dnt_medium(m=32, n=23, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 291.61 GFlop/s
  Kernel_dnt_largeDB(m=26, n=13, k=23, tile_m=3, tile_n=2, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 251.925 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 68.9966 GFlop/s
  Kernel_dnt_largeDB(m=16, n=13, k=23, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 214.991 GFlop/s
  Kernel_dnt_medium(m=22, n=23, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 177.922 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=4, tile_m=3, tile_n=2, w=2, v=12, threads=96, grouping=16, minblocks=12) , # 146.407 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=22, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=4) , # 124.959 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 209.101 GFlop/s
  Kernel_dnt_largeDB(m=13, n=26, k=23, tile_m=2, tile_n=2, w=6, v=26, threads=96, grouping=16, minblocks=12) , # 255.001 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 62.4159 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=13, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 147.358 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 77.8557 GFlop/s
  Kernel_dnt_largeDB(m=26, n=5, k=32, tile_m=2, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 146.982 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=23, tile_m=2, tile_n=3, w=6, v=32, threads=160, grouping=16, minblocks=8) , # 359.94 GFlop/s
  Kernel_dnt_tiny(m=5, n=5, k=4, split_thread=32, threads=64, grouping=16, minblocks=1) , # 22.8495 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=6, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 88.3131 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=26, tile_m=2, tile_n=3, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 345.182 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 129.272 GFlop/s
  Kernel_dnt_medium(m=16, n=26, k=5, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 161.73 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=24, tile_m=2, tile_n=3, w=12, v=24, threads=128, grouping=16, minblocks=8) , # 342.541 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 157.699 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=17, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 167.344 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=17, tile_m=2, tile_n=2, w=6, v=32, threads=96, grouping=16, minblocks=12) , # 212.232 GFlop/s
  Kernel_dnt_largeDB(m=4, n=32, k=24, tile_m=1, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 137.129 GFlop/s
  Kernel_dnt_tiny(m=13, n=8, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 64.3202 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 228.053 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 98.4389 GFlop/s
  Kernel_dnt_largeDB(m=23, n=16, k=23, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 284.497 GFlop/s
  Kernel_dnt_largeDB(m=5, n=13, k=32, tile_m=1, tile_n=1, w=16, v=10, threads=96, grouping=16, minblocks=4) , # 114.466 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 95.0394 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=8) , # 247.453 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 95.4329 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=22, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 120.919 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=16, tile_m=3, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=8) , # 358.411 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 104.903 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 106.005 GFlop/s
  Kernel_dnt_tiny(m=8, n=16, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 102.56 GFlop/s
  Kernel_dnt_medium(m=23, n=24, k=4, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=4) , # 151.249 GFlop/s
  Kernel_dnt_tiny(m=5, n=6, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 25.4954 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 316.953 GFlop/s
  Kernel_dnt_largeDB(m=24, n=13, k=26, tile_m=3, tile_n=2, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 252.69 GFlop/s
  Kernel_dnt_largeDB(m=13, n=24, k=26, tile_m=3, tile_n=2, w=10, v=24, threads=96, grouping=16, minblocks=12) , # 253.677 GFlop/s
  Kernel_dnt_medium(m=9, n=16, k=8, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=8) , # 114.152 GFlop/s
  Kernel_dnt_largeDB(m=9, n=17, k=32, tile_m=2, tile_n=1, w=16, v=10, threads=96, grouping=16, minblocks=12) , # 180.16 GFlop/s
  Kernel_dnt_medium(m=26, n=23, k=8, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 222.858 GFlop/s
  Kernel_dnt_largeDB(m=24, n=17, k=24, tile_m=2, tile_n=2, w=6, v=16, threads=128, grouping=16, minblocks=12) , # 282.786 GFlop/s
  Kernel_dnt_largeDB(m=24, n=16, k=16, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 283.257 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 142.411 GFlop/s
  Kernel_dnt_largeDB(m=8, n=13, k=32, tile_m=1, tile_n=1, w=16, v=10, threads=128, grouping=16, minblocks=1) , # 160.083 GFlop/s
  Kernel_dnt_largeDB(m=6, n=23, k=24, tile_m=2, tile_n=1, w=8, v=16, threads=96, grouping=16, minblocks=1) , # 153.564 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 253.335 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=32, tile_m=2, tile_n=3, w=4, v=14, threads=96, grouping=16, minblocks=12) , # 352.432 GFlop/s
  Kernel_dnt_largeDB(m=6, n=24, k=17, tile_m=1, tile_n=2, w=8, v=24, threads=96, grouping=16, minblocks=4) , # 140.277 GFlop/s
  Kernel_dnt_largeDB(m=16, n=6, k=26, tile_m=1, tile_n=1, w=10, v=6, threads=96, grouping=16, minblocks=4) , # 149.231 GFlop/s
  Kernel_dnt_largeDB(m=22, n=22, k=24, tile_m=2, tile_n=2, w=8, v=22, threads=192, grouping=16, minblocks=8) , # 310.483 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.095 GFlop/s
  Kernel_dnt_largeDB(m=4, n=24, k=16, tile_m=1, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=1) , # 117.543 GFlop/s
  Kernel_dnt_largeDB(m=13, n=17, k=26, tile_m=2, tile_n=2, w=12, v=10, threads=96, grouping=16, minblocks=12) , # 211.202 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=24, tile_m=2, tile_n=3, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 291.318 GFlop/s
  Kernel_dnt_small(m=6, n=6, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 73.6249 GFlop/s
  Kernel_dnt_largeDB(m=17, n=16, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 240.279 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 59.6814 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=26, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=1) , # 165.846 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=24, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 194.026 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=22, tile_m=2, tile_n=3, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 321.363 GFlop/s
  Kernel_dnt_small(m=6, n=5, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 49.4221 GFlop/s
  Kernel_dnt_medium(m=23, n=32, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 327.89 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=23, tile_m=2, tile_n=2, w=6, v=22, threads=160, grouping=16, minblocks=8) , # 306.629 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=23, tile_m=3, tile_n=2, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 267.658 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 84.6147 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 128.156 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 191.689 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=64, tile_m=3, tile_n=2, w=16, v=6, threads=96, grouping=16, minblocks=12) , # 213.459 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=17, tile_m=2, tile_n=3, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 282.016 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 52.8509 GFlop/s
  Kernel_dnt_largeDB(m=5, n=22, k=8, tile_m=1, tile_n=1, w=4, v=22, threads=128, grouping=16, minblocks=8) , # 92.3688 GFlop/s
  Kernel_dnt_largeDB(m=5, n=23, k=32, tile_m=1, tile_n=1, w=16, v=22, threads=192, grouping=16, minblocks=8) , # 145.364 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 141.609 GFlop/s
  Kernel_dnt_largeDB(m=22, n=4, k=32, tile_m=1, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 128.16 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 195.592 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 200.731 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 152.251 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=24, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 212.921 GFlop/s
  Kernel_dnt_largeDB(m=16, n=13, k=26, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 210.224 GFlop/s
  Kernel_dnt_largeDB(m=16, n=17, k=23, tile_m=1, tile_n=3, w=6, v=14, threads=128, grouping=16, minblocks=12) , # 235.716 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 24.0285 GFlop/s
  Kernel_dnt_largeDB(m=13, n=26, k=24, tile_m=2, tile_n=3, w=12, v=14, threads=96, grouping=16, minblocks=12) , # 262.921 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 101.216 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=22, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 205.221 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 85.9047 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 34.6329 GFlop/s
  Kernel_dnt_medium(m=16, n=32, k=5, tile_m=2, tile_n=2, threads=192, grouping=16, minblocks=8) , # 166.138 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 147.212 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=16, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 177.57 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=23, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 127.16 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=8, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 301.015 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=26, tile_m=2, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 290.449 GFlop/s
  Kernel_dnt_largeDB(m=5, n=32, k=32, tile_m=2, tile_n=1, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 162.861 GFlop/s
  Kernel_dnt_largeDB(m=9, n=13, k=16, tile_m=1, tile_n=1, w=8, v=12, threads=128, grouping=16, minblocks=8) , # 148.198 GFlop/s
  Kernel_dnt_medium(m=13, n=8, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 117.265 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 172.625 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=23, tile_m=4, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 374.412 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=9, tile_m=2, tile_n=3, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 239.093 GFlop/s
  Kernel_dnt_largeDB(m=4, n=32, k=32, tile_m=1, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=4) , # 141.559 GFlop/s
  Kernel_dnt_small(m=17, n=4, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 55.1151 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=5, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 142.332 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 216.799 GFlop/s
  Kernel_dnt_largeDB(m=5, n=26, k=32, tile_m=1, tile_n=1, w=12, v=26, threads=160, grouping=16, minblocks=12) , # 146.495 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=22, tile_m=2, tile_n=3, w=8, v=18, threads=96, grouping=16, minblocks=12) , # 325.781 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 62.3945 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=24, tile_m=2, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=12) , # 265.276 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 104.38 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 100.426 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 83.121 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=6, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 145.86 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=22, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 140.428 GFlop/s
  Kernel_dnt_medium(m=23, n=24, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 291.1 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=22, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 190.243 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 63.7683 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=5, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 136.278 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 21.2783 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 185.07 GFlop/s
  Kernel_dnt_medium(m=9, n=16, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 162.59 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 100.321 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=22, tile_m=3, tile_n=2, w=6, v=32, threads=128, grouping=16, minblocks=8) , # 305.08 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=5, tile_m=5, tile_n=1, threads=96, grouping=16, minblocks=12) , # 155.889 GFlop/s
  Kernel_dnt_largeDB(m=8, n=9, k=32, tile_m=1, tile_n=1, w=12, v=6, threads=128, grouping=16, minblocks=12) , # 131.311 GFlop/s
  Kernel_dnt_largeDB(m=24, n=5, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=8) , # 153.309 GFlop/s
  Kernel_dnt_largeDB(m=55, n=55, k=55, tile_m=5, tile_n=5, w=6, v=26, threads=128, grouping=16, minblocks=1) , # 588.05 GFlop/s
  Kernel_dnt_medium(m=32, n=26, k=13, tile_m=4, tile_n=2, threads=128, grouping=16, minblocks=4) , # 298.845 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 67.2537 GFlop/s
  Kernel_dnt_largeDB(m=6, n=23, k=17, tile_m=2, tile_n=1, w=8, v=12, threads=96, grouping=16, minblocks=4) , # 130.847 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 90.287 GFlop/s
  Kernel_dnt_largeDB(m=6, n=24, k=24, tile_m=2, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=8) , # 163.247 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 128.018 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=32, tile_m=2, tile_n=4, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 362.722 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 308.901 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=16, tile_m=3, tile_n=2, w=6, v=26, threads=160, grouping=16, minblocks=8) , # 268.495 GFlop/s
  Kernel_dnt_small(m=12, n=12, k=12, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 154.228 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 51.8037 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 91.5999 GFlop/s
  Kernel_dnt_small(m=6, n=6, k=8, tile_m=1, tile_n=1, threads=64, grouping=16, minblocks=12) , # 51.4584 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 149.14 GFlop/s
  Kernel_dnt_medium(m=16, n=9, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 166.142 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=22, tile_m=2, tile_n=3, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 332.21 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 72.2008 GFlop/s
  Kernel_dnt_largeDB(m=4, n=8, k=24, tile_m=1, tile_n=1, w=12, v=8, threads=96, grouping=16, minblocks=1) , # 72.6651 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 125.487 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 113.492 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=8, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 182.54 GFlop/s
  Kernel_dnt_tiny(m=8, n=8, k=4, split_thread=32, threads=96, grouping=16, minblocks=1) , # 52.7483 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=26, tile_m=2, tile_n=3, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 327.373 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=6, tile_m=5, tile_n=2, w=2, v=18, threads=96, grouping=16, minblocks=12) , # 222.341 GFlop/s
  Kernel_dnt_largeDB(m=16, n=22, k=23, tile_m=2, tile_n=3, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 277.81 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 107.347 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 143.348 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=22, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 133.965 GFlop/s
  Kernel_dnt_largeDB(m=9, n=22, k=24, tile_m=2, tile_n=2, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 194.866 GFlop/s
  Kernel_dnt_tiny(m=17, n=6, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 60.0659 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 123.069 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=32, tile_m=2, tile_n=1, w=8, v=10, threads=96, grouping=16, minblocks=1) , # 214.467 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=22, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 142.021 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 93.1439 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=8, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 147.094 GFlop/s
  Kernel_dnt_largeDB(m=23, n=9, k=32, tile_m=2, tile_n=2, w=10, v=6, threads=96, grouping=16, minblocks=12) , # 210.534 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 137.955 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 138.949 GFlop/s
  Kernel_dnt_medium(m=24, n=8, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 189.416 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 78.5843 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=26, tile_m=2, tile_n=4, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 409.018 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=4, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 99.2194 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=6, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 129.406 GFlop/s
  Kernel_dnt_medium(m=32, n=8, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 214.735 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=26, tile_m=2, tile_n=1, w=8, v=12, threads=96, grouping=16, minblocks=8) , # 204.676 GFlop/s
  Kernel_dnt_largeDB(m=17, n=23, k=32, tile_m=2, tile_n=2, w=16, v=16, threads=160, grouping=16, minblocks=8) , # 282.093 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=8, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 169.907 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=16, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 228.835 GFlop/s
  Kernel_dnt_largeDB(m=5, n=24, k=16, tile_m=1, tile_n=1, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 138.648 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=26, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 351.889 GFlop/s
  Kernel_dnt_largeDB(m=16, n=17, k=32, tile_m=2, tile_n=3, w=12, v=6, threads=96, grouping=16, minblocks=12) , # 255.137 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 235.336 GFlop/s
  Kernel_dnt_small(m=9, n=4, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 54.6432 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 114.289 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=24, tile_m=2, tile_n=3, w=6, v=14, threads=96, grouping=16, minblocks=12) , # 333.041 GFlop/s
  Kernel_dnt_medium(m=26, n=22, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 176.08 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 53.5197 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 163.24 GFlop/s
  Kernel_dnt_medium(m=17, n=24, k=5, tile_m=1, tile_n=5, threads=128, grouping=16, minblocks=12) , # 160.205 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=23, tile_m=2, tile_n=2, w=6, v=18, threads=128, grouping=16, minblocks=12) , # 284.848 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 125.713 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 130.85 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=23, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 137.919 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 286.552 GFlop/s
  Kernel_dnt_largeDB(m=55, n=16, k=16, tile_m=3, tile_n=3, w=8, v=12, threads=128, grouping=16, minblocks=8) , # 322.352 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 74.4677 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=9, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 223.126 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=23, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 450.644 GFlop/s
  Kernel_dnt_medium(m=6, n=32, k=9, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 148.067 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 107.639 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=9, tile_m=3, tile_n=2, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 220.434 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=16, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 123.335 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=23, tile_m=2, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=12) , # 344.66 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 59.3286 GFlop/s
  Kernel_dnt_largeDB(m=32, n=4, k=24, tile_m=1, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 137.282 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 101.712 GFlop/s
  Kernel_dnt_small(m=6, n=8, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 57.8262 GFlop/s
  Kernel_dnt_small(m=24, n=5, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 80.4972 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=32, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 98.6452 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=16, tile_m=3, tile_n=2, w=8, v=32, threads=128, grouping=16, minblocks=8) , # 321.452 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=13, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 137.164 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 66.7055 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 172.205 GFlop/s
  Kernel_dnt_largeDB(m=13, n=23, k=16, tile_m=2, tile_n=2, w=8, v=22, threads=128, grouping=16, minblocks=12) , # 215.76 GFlop/s
  Kernel_dnt_largeDB(m=23, n=16, k=32, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 317.92 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 47.6694 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=5, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 162.28 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=22, tile_m=2, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=8) , # 183.582 GFlop/s
  Kernel_dnt_largeDB(m=4, n=22, k=32, tile_m=1, tile_n=1, w=8, v=20, threads=96, grouping=16, minblocks=8) , # 127.401 GFlop/s
  Kernel_dnt_small(m=5, n=23, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 74.1593 GFlop/s
  Kernel_dnt_small(m=22, n=4, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 61.4088 GFlop/s
  Kernel_dnt_largeDB(m=55, n=29, k=16, tile_m=3, tile_n=5, w=8, v=20, threads=128, grouping=16, minblocks=1) , # 339.155 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 112.692 GFlop/s
  Kernel_dnt_largeDB(m=22, n=8, k=26, tile_m=1, tile_n=2, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 190.52 GFlop/s
  Kernel_dnt_largeDB(m=26, n=16, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 272.563 GFlop/s
  Kernel_dnt_largeDB(m=23, n=16, k=26, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 297.501 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=24, tile_m=2, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=12) , # 195.033 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 63.7826 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=32, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 356.001 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 268.121 GFlop/s
  Kernel_dnt_largeDB(m=24, n=9, k=23, tile_m=2, tile_n=1, w=10, v=6, threads=128, grouping=16, minblocks=12) , # 204.604 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 73.9933 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 144.56 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 126.292 GFlop/s
  Kernel_dnt_small(m=6, n=8, k=16, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 81.6668 GFlop/s
  Kernel_dnt_medium(m=8, n=26, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 117.769 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=4, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 62.9356 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 126.088 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 87.0744 GFlop/s
  Kernel_dnt_largeDB(m=4, n=6, k=32, tile_m=1, tile_n=1, w=16, v=6, threads=96, grouping=16, minblocks=1) , # 66.9879 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=24, split_thread=32, threads=128, grouping=16, minblocks=1) , # 50.9804 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=16, split_thread=32, threads=128, grouping=16, minblocks=1) , # 54.8645 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=17, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 228.962 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=17, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 177.212 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=9, tile_m=5, tile_n=2, w=2, v=18, threads=96, grouping=16, minblocks=12) , # 256.077 GFlop/s
  Kernel_dnt_largeDB(m=13, n=8, k=32, tile_m=2, tile_n=2, w=16, v=8, threads=96, grouping=16, minblocks=12) , # 157.473 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 99.159 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 101.742 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 132.002 GFlop/s
  Kernel_dnt_largeDB(m=4, n=24, k=22, tile_m=1, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 123.01 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 64.8851 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 180.998 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=26, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=1) , # 145.469 GFlop/s
  Kernel_dnt_largeDB(m=24, n=8, k=24, tile_m=2, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 209.127 GFlop/s
  Kernel_dnt_largeDB(m=9, n=6, k=16, tile_m=1, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 87.0012 GFlop/s
  Kernel_dnt_largeDB(m=17, n=13, k=26, tile_m=2, tile_n=2, w=12, v=10, threads=96, grouping=16, minblocks=12) , # 213.018 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=32, tile_m=2, tile_n=3, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 366.046 GFlop/s
  Kernel_dnt_small(m=16, n=13, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 116.398 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=32, tile_m=3, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 356.369 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=24, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 116.045 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=17, tile_m=2, tile_n=3, w=8, v=14, threads=96, grouping=16, minblocks=12) , # 294.427 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=6, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 158.896 GFlop/s
  Kernel_dnt_medium(m=36, n=6, k=6, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 113.312 GFlop/s
  Kernel_dnt_small(m=6, n=5, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 58.085 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=17, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=4) , # 117.606 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 153.149 GFlop/s
  Kernel_dnt_largeDB(m=5, n=8, k=24, tile_m=1, tile_n=1, w=12, v=8, threads=128, grouping=16, minblocks=4) , # 84.7498 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=22, tile_m=3, tile_n=2, w=8, v=28, threads=128, grouping=16, minblocks=8) , # 382.529 GFlop/s
  Kernel_dnt_largeDB(m=55, n=55, k=16, tile_m=5, tile_n=5, w=6, v=26, threads=128, grouping=16, minblocks=1) , # 361.626 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 61.4172 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=22, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 140.116 GFlop/s
  Kernel_dnt_largeDB(m=13, n=24, k=32, tile_m=3, tile_n=2, w=12, v=24, threads=96, grouping=16, minblocks=12) , # 272.912 GFlop/s
  Kernel_dnt_small(m=6, n=22, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 68.9181 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=23, tile_m=1, tile_n=1, threads=224, grouping=16, minblocks=8) , # 166.022 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=4, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=8) , # 160.698 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=5, tile_m=3, tile_n=2, w=2, v=16, threads=128, grouping=16, minblocks=12) , # 202.909 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 204.892 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 153.614 GFlop/s
  Kernel_dnt_medium(m=32, n=23, k=5, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 202.087 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=8, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 148.898 GFlop/s
  Kernel_dnt_largeDB(m=26, n=17, k=24, tile_m=2, tile_n=2, w=6, v=14, threads=128, grouping=16, minblocks=12) , # 298.462 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=8, tile_m=2, tile_n=1, w=4, v=24, threads=96, grouping=16, minblocks=1) , # 143.632 GFlop/s
  Kernel_dnt_largeDB(m=29, n=29, k=55, tile_m=2, tile_n=4, w=8, v=22, threads=128, grouping=16, minblocks=8) , # 431.556 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 86.3631 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=26, tile_m=3, tile_n=2, w=6, v=32, threads=96, grouping=16, minblocks=12) , # 274.165 GFlop/s
  Kernel_dnt_largeDB(m=16, n=24, k=23, tile_m=2, tile_n=3, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 302.868 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 42.5108 GFlop/s
  Kernel_dnt_largeDB(m=22, n=13, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 214.711 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 129.521 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=23, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=1) , # 136.381 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 106.858 GFlop/s
  Kernel_dnt_largeDB(m=32, n=9, k=24, tile_m=1, tile_n=3, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 239.49 GFlop/s
  Kernel_dnt_largeDB(m=17, n=22, k=24, tile_m=3, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 262.797 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=16, split_thread=32, threads=96, grouping=16, minblocks=1) , # 35.9461 GFlop/s
  Kernel_dnt_tiny(m=5, n=5, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 37.8978 GFlop/s
  Kernel_dnt_largeDB(m=23, n=6, k=26, tile_m=1, tile_n=2, w=8, v=4, threads=96, grouping=16, minblocks=1) , # 151.108 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 49.1063 GFlop/s
  Kernel_dnt_medium(m=16, n=26, k=9, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 211.795 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=26, tile_m=2, tile_n=2, w=6, v=24, threads=128, grouping=16, minblocks=12) , # 342.564 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 148.581 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 177.746 GFlop/s
  Kernel_dnt_small(m=6, n=17, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 68.6498 GFlop/s
  Kernel_dnt_largeDB(m=22, n=64, k=16, tile_m=3, tile_n=4, w=4, v=40, threads=128, grouping=16, minblocks=4) , # 364.597 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=4, tile_m=3, tile_n=2, w=2, v=16, threads=96, grouping=16, minblocks=12) , # 148.299 GFlop/s
  Kernel_dnt_largeDB(m=16, n=8, k=26, tile_m=1, tile_n=1, w=12, v=8, threads=128, grouping=16, minblocks=12) , # 178.92 GFlop/s
  Kernel_dnt_largeDB(m=24, n=6, k=26, tile_m=2, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=12) , # 159.791 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=17, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 408.083 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=26, tile_m=2, tile_n=4, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 392.455 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=6, tile_m=3, tile_n=2, w=2, v=12, threads=128, grouping=16, minblocks=12) , # 213.72 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 86.341 GFlop/s
  Kernel_dnt_medium(m=8, n=23, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 147.697 GFlop/s
  Kernel_dnt_small(m=22, n=22, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=4) , # 187.588 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=29, tile_m=2, tile_n=2, w=14, v=16, threads=128, grouping=16, minblocks=12) , # 270.638 GFlop/s
  Kernel_dnt_largeDB(m=24, n=9, k=32, tile_m=2, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 227.824 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 60.9248 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 83.0769 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 91.3193 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 75.5435 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=26, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=4) , # 117.406 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 133.366 GFlop/s
  Kernel_dnt_largeDB(m=8, n=8, k=64, tile_m=1, tile_n=1, w=16, v=8, threads=128, grouping=16, minblocks=12) , # 155.798 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 85.9489 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 49.3499 GFlop/s
  Kernel_dnt_largeDB(m=8, n=4, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=4) , # 84.7409 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=26, tile_m=3, tile_n=2, w=10, v=32, threads=128, grouping=16, minblocks=8) , # 381.494 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=17, tile_m=2, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=12) , # 174.79 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=13, tile_m=1, tile_n=6, threads=96, grouping=16, minblocks=12) , # 129.778 GFlop/s
  Kernel_dnt_small(m=5, n=6, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 37.5652 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 272.907 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 270.155 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 121.156 GFlop/s
  Kernel_dnt_largeDB(m=24, n=13, k=22, tile_m=2, tile_n=3, w=6, v=12, threads=96, grouping=16, minblocks=12) , # 244.295 GFlop/s
  Kernel_dnt_largeDB(m=16, n=24, k=32, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 328.287 GFlop/s
  Kernel_dnt_largeDB(m=4, n=17, k=32, tile_m=1, tile_n=1, w=16, v=6, threads=128, grouping=16, minblocks=4) , # 113.471 GFlop/s
  Kernel_dnt_largeDB(m=9, n=16, k=23, tile_m=1, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=8) , # 166.582 GFlop/s
  Kernel_dnt_medium(m=196, n=14, k=14, tile_m=6, tile_n=2, threads=256, grouping=16, minblocks=1) , # 243.838 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 122.26 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=16, tile_m=3, tile_n=2, w=8, v=32, threads=128, grouping=16, minblocks=8) , # 332.643 GFlop/s
  Kernel_dnt_small(m=6, n=16, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 79.6159 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=16, tile_m=2, tile_n=4, w=8, v=28, threads=128, grouping=16, minblocks=8) , # 335.651 GFlop/s
  Kernel_dnt_medium(m=24, n=6, k=13, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 144.87 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=4, tile_m=4, tile_n=2, w=2, v=12, threads=96, grouping=16, minblocks=12) , # 168.649 GFlop/s
  Kernel_dnt_small(m=4, n=32, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 72.6159 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 159.739 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 147.402 GFlop/s
  Kernel_dnt_largeDB(m=23, n=8, k=26, tile_m=1, tile_n=2, w=10, v=8, threads=128, grouping=16, minblocks=12) , # 196.116 GFlop/s
  Kernel_dnt_small(m=16, n=16, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 142.372 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 97.5907 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 62.5018 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 222.018 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 78.0896 GFlop/s
  Kernel_dnt_medium(m=8, n=26, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 147.311 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 199.157 GFlop/s
  Kernel_dnt_largeDB(m=6, n=32, k=32, tile_m=2, tile_n=1, w=8, v=18, threads=128, grouping=16, minblocks=12) , # 192.489 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 122.921 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=16, tile_m=3, tile_n=3, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 346.718 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=8, tile_m=3, tile_n=2, w=4, v=20, threads=96, grouping=16, minblocks=12) , # 239.901 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 281.835 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 76.8119 GFlop/s
  Kernel_dnt_tiny(m=6, n=5, k=9, split_thread=32, threads=96, grouping=16, minblocks=1) , # 37.5717 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=9, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 290.689 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 63.6387 GFlop/s
  Kernel_dnt_largeDB(m=23, n=8, k=32, tile_m=1, tile_n=2, w=12, v=8, threads=96, grouping=16, minblocks=1) , # 208.86 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=26, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 150.145 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 57.2607 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=17, tile_m=4, tile_n=2, w=6, v=22, threads=160, grouping=16, minblocks=8) , # 335.578 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=13, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=8) , # 138.705 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 127.673 GFlop/s
  Kernel_dnt_largeDB(m=24, n=8, k=22, tile_m=1, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 199.914 GFlop/s
  Kernel_dnt_largeDB(m=13, n=6, k=16, tile_m=1, tile_n=1, w=8, v=6, threads=128, grouping=16, minblocks=4) , # 112.36 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 270.21 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 156.989 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 80.5774 GFlop/s
  Kernel_dnt_largeDB(m=26, n=13, k=32, tile_m=2, tile_n=2, w=12, v=10, threads=96, grouping=16, minblocks=12) , # 278.982 GFlop/s
  Kernel_dnt_largeDB(m=64, n=16, k=22, tile_m=2, tile_n=4, w=8, v=8, threads=128, grouping=16, minblocks=8) , # 408.89 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 42.3413 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=23, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=8) , # 147.238 GFlop/s
  Kernel_dnt_largeDB(m=64, n=9, k=16, tile_m=2, tile_n=3, w=4, v=6, threads=96, grouping=16, minblocks=12) , # 263.151 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=17, tile_m=2, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 311.324 GFlop/s
  Kernel_dnt_medium(m=16, n=9, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 98.1233 GFlop/s
  Kernel_dnt_medium(m=32, n=16, k=5, tile_m=2, tile_n=2, threads=192, grouping=16, minblocks=8) , # 166.987 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=16, tile_m=2, tile_n=3, w=6, v=20, threads=160, grouping=16, minblocks=8) , # 278.305 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 251.891 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=9, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 255.794 GFlop/s
  Kernel_dnt_largeDB(m=5, n=16, k=24, tile_m=1, tile_n=1, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 131.765 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=16, tile_m=2, tile_n=2, w=8, v=18, threads=128, grouping=16, minblocks=12) , # 273.171 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 83.6127 GFlop/s
  Kernel_dnt_largeDB(m=13, n=22, k=26, tile_m=3, tile_n=2, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 241.066 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 90.7465 GFlop/s
  Kernel_dnt_largeDB(m=12, n=12, k=144, tile_m=3, tile_n=2, w=16, v=8, threads=96, grouping=16, minblocks=12) , # 249.09 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=23, tile_m=2, tile_n=2, w=6, v=10, threads=128, grouping=16, minblocks=12) , # 288.889 GFlop/s
  Kernel_dnt_small(m=17, n=8, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 73.5398 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=22, tile_m=2, tile_n=3, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 317.15 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=16, tile_m=1, tile_n=4, threads=96, grouping=16, minblocks=12) , # 136.353 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 104.947 GFlop/s
  Kernel_dnt_medium(m=29, n=14, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 258.248 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=23, tile_m=2, tile_n=1, w=10, v=4, threads=96, grouping=16, minblocks=4) , # 173.179 GFlop/s
  Kernel_dnt_medium(m=16, n=8, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 167.3 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=13, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 125.068 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 125.232 GFlop/s
  Kernel_dnt_medium(m=13, n=8, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 106.433 GFlop/s
  Kernel_dnt_largeDB(m=8, n=64, k=8, tile_m=1, tile_n=4, w=4, v=48, threads=128, grouping=16, minblocks=12) , # 205.107 GFlop/s
  Kernel_dnt_largeDB(m=26, n=17, k=22, tile_m=3, tile_n=2, w=8, v=14, threads=96, grouping=16, minblocks=12) , # 275.4 GFlop/s
  Kernel_dnt_largeDB(m=23, n=16, k=16, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 260.11 GFlop/s
  Kernel_dnt_largeDB(m=26, n=4, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=8) , # 130.708 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 147.182 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 82.5655 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=4, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=4) , # 113.047 GFlop/s
  Kernel_dnt_largeDB(m=9, n=24, k=24, tile_m=2, tile_n=2, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 215.864 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=8) , # 202.923 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 73.5383 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 184.285 GFlop/s
  Kernel_dnt_largeDB(m=32, n=6, k=23, tile_m=2, tile_n=1, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 179.563 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=26, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=1) , # 138.881 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=24, tile_m=2, tile_n=2, w=12, v=24, threads=192, grouping=16, minblocks=8) , # 283.245 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=9, tile_m=2, tile_n=3, w=4, v=20, threads=96, grouping=16, minblocks=12) , # 239.724 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 73.6718 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=8, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 109.787 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=32, tile_m=2, tile_n=3, w=14, v=16, threads=128, grouping=16, minblocks=8) , # 358.776 GFlop/s
  Kernel_dnt_tiny(m=4, n=8, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 28.6828 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 126.933 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 254.23 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 130.503 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=17, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 300.284 GFlop/s
  Kernel_dnt_small(m=16, n=8, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 124.195 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 263.178 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 172.254 GFlop/s
  Kernel_dnt_largeDB(m=22, n=8, k=23, tile_m=1, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 187.194 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=16, tile_m=3, tile_n=2, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 348.283 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 108.16 GFlop/s
  Kernel_dnt_largeDB(m=13, n=4, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=12) , # 107.82 GFlop/s
  Kernel_dnt_small(m=13, n=5, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 64.2414 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 83.5916 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=26, tile_m=3, tile_n=3, w=6, v=16, threads=96, grouping=16, minblocks=12) , # 397.018 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 87.9086 GFlop/s
  Kernel_dnt_largeDB(m=8, n=16, k=16, tile_m=1, tile_n=1, w=8, v=16, threads=128, grouping=16, minblocks=1) , # 165.868 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 227.532 GFlop/s
  Kernel_dnt_largeDB(m=17, n=16, k=23, tile_m=3, tile_n=1, w=6, v=16, threads=128, grouping=16, minblocks=12) , # 235.699 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 121.05 GFlop/s
  Kernel_dnt_largeDB(m=23, n=13, k=24, tile_m=2, tile_n=3, w=12, v=8, threads=96, grouping=16, minblocks=12) , # 241.459 GFlop/s
  Kernel_dnt_largeDB(m=9, n=9, k=81, tile_m=1, tile_n=1, w=14, v=6, threads=128, grouping=16, minblocks=12) , # 164.81 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 280.577 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 112.586 GFlop/s
  Kernel_dnt_largeDB(m=8, n=9, k=24, tile_m=1, tile_n=1, w=12, v=6, threads=128, grouping=16, minblocks=8) , # 125.923 GFlop/s
  Kernel_dnt_largeDB(m=32, n=6, k=26, tile_m=2, tile_n=1, w=6, v=6, threads=96, grouping=16, minblocks=1) , # 181.915 GFlop/s
  Kernel_dnt_small(m=6, n=16, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 94.2819 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 39.1691 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=32, tile_m=2, tile_n=2, w=16, v=24, threads=128, grouping=16, minblocks=8) , # 301.526 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 226.28 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=4, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.375 GFlop/s
  Kernel_dnt_largeDB(m=24, n=16, k=26, tile_m=2, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 311.766 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 147.12 GFlop/s
  Kernel_dnt_largeDB(m=7, n=7, k=49, tile_m=1, tile_n=1, w=18, v=4, threads=128, grouping=16, minblocks=12) , # 120.76 GFlop/s
  Kernel_dnt_medium(m=8, n=32, k=16, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 216.01 GFlop/s
  Kernel_dnt_largeDB(m=4, n=32, k=26, tile_m=1, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 135.217 GFlop/s
  Kernel_dnt_small(m=32, n=4, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 71.9933 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 135.055 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 169.191 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 88.9708 GFlop/s
  Kernel_dnt_medium(m=8, n=22, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 98.4831 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 130.249 GFlop/s
  Kernel_dnt_largeDB(m=16, n=6, k=32, tile_m=1, tile_n=1, w=16, v=6, threads=128, grouping=16, minblocks=12) , # 163.734 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 84.6639 GFlop/s
  Kernel_dnt_medium(m=23, n=24, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 176.59 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=26, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 124.519 GFlop/s
  Kernel_dnt_largeDB(m=9, n=26, k=26, tile_m=1, tile_n=2, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 212.781 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 65.4188 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 189.636 GFlop/s
  Kernel_dnt_medium(m=24, n=23, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 196.204 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=32, tile_m=2, tile_n=4, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 308.645 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 174.489 GFlop/s
  Kernel_dnt_small(m=9, n=4, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 43.0718 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=32, tile_m=2, tile_n=3, w=14, v=26, threads=128, grouping=16, minblocks=8) , # 379.883 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=23, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 123.364 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 138.92 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=17, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 153.384 GFlop/s
  Kernel_dnt_medium(m=26, n=22, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 196.809 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 36.6409 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=24, tile_m=2, tile_n=3, w=12, v=26, threads=128, grouping=16, minblocks=8) , # 356.594 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 20.6833 GFlop/s
  Kernel_dnt_medium(m=17, n=24, k=9, tile_m=1, tile_n=5, threads=96, grouping=16, minblocks=12) , # 203.363 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 109.498 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 106.018 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=8, tile_m=2, tile_n=2, w=4, v=24, threads=128, grouping=16, minblocks=12) , # 234.188 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 130.008 GFlop/s
  Kernel_dnt_largeDB(m=55, n=55, k=29, tile_m=5, tile_n=5, w=10, v=30, threads=128, grouping=16, minblocks=1) , # 408.729 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 217.359 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 172.574 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=4) , # 132.131 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=24, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 129.224 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=24, tile_m=2, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=12) , # 292.866 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=16, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 163.688 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 81.0562 GFlop/s
  Kernel_dnt_medium(m=14, n=196, k=14, tile_m=2, tile_n=6, threads=256, grouping=16, minblocks=1) , # 278.411 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 221.618 GFlop/s
  Kernel_dnt_largeDB(m=29, n=55, k=16, tile_m=4, tile_n=4, w=8, v=38, threads=128, grouping=16, minblocks=4) , # 326.811 GFlop/s
  Kernel_dnt_small(m=24, n=8, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 102.382 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=8, tile_m=4, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 272.379 GFlop/s
  Kernel_dnt_largeDB(m=26, n=13, k=22, tile_m=2, tile_n=2, w=6, v=10, threads=96, grouping=16, minblocks=12) , # 250.837 GFlop/s
  Kernel_dnt_medium(m=8, n=24, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 185.726 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 110.942 GFlop/s
  Kernel_dnt_largeDB(m=6, n=32, k=13, tile_m=1, tile_n=2, w=6, v=16, threads=96, grouping=16, minblocks=8) , # 157.238 GFlop/s
  Kernel_dnt_small(m=5, n=24, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 79.6991 GFlop/s
  Kernel_dnt_small(m=23, n=5, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 76.0417 GFlop/s
  Kernel_dnt_medium(m=22, n=23, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 165.555 GFlop/s
  Kernel_dnt_medium(m=24, n=22, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 171.925 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 49.8578 GFlop/s
  Kernel_dnt_medium(m=22, n=24, k=4, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=8) , # 146.658 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=23, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 125.45 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 228.978 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 187.441 GFlop/s
  Kernel_dnt_largeDB(m=13, n=26, k=22, tile_m=2, tile_n=2, w=6, v=26, threads=96, grouping=16, minblocks=12) , # 251.077 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 103.968 GFlop/s
  Kernel_dnt_small(m=6, n=8, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 70.6689 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=17, tile_m=2, tile_n=3, w=6, v=14, threads=96, grouping=16, minblocks=12) , # 247.911 GFlop/s
  Kernel_dnt_tiny(m=5, n=5, k=5, split_thread=32, threads=64, grouping=16, minblocks=1) , # 27.9623 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 135.569 GFlop/s
  Kernel_dnt_largeDB(m=24, n=13, k=32, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 271.274 GFlop/s
  Kernel_dnt_largeDB(m=9, n=8, k=32, tile_m=1, tile_n=1, w=12, v=8, threads=128, grouping=16, minblocks=12) , # 132.924 GFlop/s
  Kernel_dnt_small(m=9, n=9, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 97.3911 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=6, tile_m=4, tile_n=2, w=2, v=12, threads=96, grouping=16, minblocks=4) , # 218.903 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 141.729 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=24, tile_m=2, tile_n=3, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 331.615 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=16, tile_m=3, tile_n=1, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 215.364 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=22, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 439.077 GFlop/s
  Kernel_dnt_medium(m=10, n=100, k=10, tile_m=2, tile_n=4, threads=128, grouping=16, minblocks=4) , # 250.611 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 99.1383 GFlop/s
  Kernel_dnt_largeDB(m=4, n=9, k=32, tile_m=1, tile_n=1, w=12, v=6, threads=128, grouping=16, minblocks=12) , # 79.1474 GFlop/s
  Kernel_dnt_largeDB(m=17, n=6, k=32, tile_m=2, tile_n=2, w=16, v=6, threads=96, grouping=16, minblocks=12) , # 150.065 GFlop/s
  Kernel_dnt_largeDB(m=23, n=16, k=22, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 279.926 GFlop/s
  Kernel_dnt_largeDB(m=13, n=16, k=24, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 214.166 GFlop/s
  Kernel_dnt_largeDB(m=13, n=17, k=32, tile_m=2, tile_n=2, w=16, v=12, threads=96, grouping=16, minblocks=12) , # 237.799 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=17, tile_m=3, tile_n=2, w=8, v=22, threads=128, grouping=16, minblocks=8) , # 348.821 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=26, tile_m=2, tile_n=2, w=8, v=26, threads=128, grouping=16, minblocks=12) , # 200.465 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 98.7557 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=4, tile_m=5, tile_n=1, threads=96, grouping=16, minblocks=12) , # 131.562 GFlop/s
  Kernel_dnt_tiny(m=5, n=6, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 20.9988 GFlop/s
  Kernel_dnt_largeDB(m=24, n=4, k=32, tile_m=1, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 132.646 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 116.819 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=26, tile_m=2, tile_n=4, w=8, v=24, threads=128, grouping=16, minblocks=8) , # 370.104 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 102.375 GFlop/s
  Kernel_dnt_largeDB(m=6, n=26, k=16, tile_m=2, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=12) , # 148.541 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=24, tile_m=3, tile_n=2, w=8, v=28, threads=96, grouping=16, minblocks=12) , # 335.43 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=24, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=8) , # 143.882 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 282.576 GFlop/s
  Kernel_dnt_medium(m=24, n=6, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 93.5526 GFlop/s
  Kernel_dnt_tiny(m=13, n=9, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 68.1998 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 145.287 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 255.563 GFlop/s
  Kernel_dnt_largeDB(m=14, n=14, k=196, tile_m=2, tile_n=2, w=16, v=12, threads=96, grouping=16, minblocks=12) , # 289.084 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=32, tile_m=2, tile_n=1, w=6, v=6, threads=96, grouping=16, minblocks=12) , # 179.384 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=26, tile_m=2, tile_n=3, w=4, v=18, threads=128, grouping=16, minblocks=12) , # 314.535 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=4, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 98.6797 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 112.537 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 115.374 GFlop/s
  Kernel_dnt_largeDB(m=13, n=26, k=32, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 277.278 GFlop/s
  Kernel_dnt_tiny(m=8, n=4, k=4, split_thread=32, threads=96, grouping=16, minblocks=1) , # 23.9735 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 131.89 GFlop/s
  Kernel_dnt_largeDB(m=29, n=29, k=16, tile_m=4, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=8) , # 321.264 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=5, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 136.165 GFlop/s
  Kernel_dnt_largeDB(m=9, n=22, k=64, tile_m=2, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 239.402 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 116.283 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=16, tile_m=3, tile_n=2, w=6, v=12, threads=160, grouping=16, minblocks=8) , # 276.664 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=24, tile_m=2, tile_n=3, w=12, v=22, threads=128, grouping=16, minblocks=8) , # 312.951 GFlop/s
  Kernel_dnt_largeDB(m=26, n=9, k=32, tile_m=2, tile_n=1, w=12, v=8, threads=128, grouping=16, minblocks=12) , # 228.573 GFlop/s
  Kernel_dnt_medium(m=8, n=22, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=8) , # 141.743 GFlop/s
  Kernel_dnt_largeDB(m=8, n=17, k=24, tile_m=2, tile_n=1, w=12, v=6, threads=96, grouping=16, minblocks=12) , # 164.338 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=22, tile_m=3, tile_n=2, w=10, v=24, threads=96, grouping=16, minblocks=12) , # 265.399 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 220.011 GFlop/s
  Kernel_dnt_largeDB(m=9, n=22, k=22, tile_m=2, tile_n=2, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 183.705 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 129.955 GFlop/s
  Kernel_dnt_largeDB(m=6, n=22, k=24, tile_m=2, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=4) , # 150.931 GFlop/s
  Kernel_dnt_largeDB(m=6, n=17, k=32, tile_m=1, tile_n=1, w=14, v=8, threads=128, grouping=16, minblocks=12) , # 147.119 GFlop/s
  Kernel_dnt_largeDB(m=16, n=5, k=26, tile_m=1, tile_n=1, w=12, v=4, threads=96, grouping=16, minblocks=1) , # 127.182 GFlop/s
  Kernel_dnt_tiny(m=16, n=4, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 56.0623 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=6, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 117.471 GFlop/s
  Kernel_dnt_medium(m=16, n=9, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 128.209 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=32, tile_m=2, tile_n=2, w=14, v=14, threads=128, grouping=16, minblocks=12) , # 276.949 GFlop/s
  Kernel_dnt_small(m=16, n=13, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 101.905 GFlop/s
  Kernel_dnt_largeDB(m=5, n=32, k=26, tile_m=2, tile_n=1, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 153.425 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 90.4024 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 155.536 GFlop/s
  Kernel_dnt_tiny(m=4, n=24, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 58.0271 GFlop/s
  Kernel_dnt_largeDB(m=32, n=8, k=26, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 229.882 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=6, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 153.953 GFlop/s
  Kernel_dnt_medium(m=6, n=32, k=4, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 88.5988 GFlop/s
  Kernel_dnt_largeDB(m=5, n=24, k=26, tile_m=1, tile_n=1, w=10, v=24, threads=128, grouping=16, minblocks=8) , # 144.829 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=22, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 234.804 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=26, tile_m=2, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=12) , # 347.137 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 89.287 GFlop/s
  Kernel_dnt_largeDB(m=29, n=16, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 295.779 GFlop/s
  Kernel_dnt_largeDB(m=4, n=24, k=23, tile_m=1, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=8) , # 121.888 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 88.2854 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=23, tile_m=3, tile_n=2, w=6, v=26, threads=160, grouping=16, minblocks=8) , # 313.656 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=6, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 181.813 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 29.7245 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=4, split_thread=32, threads=96, grouping=16, minblocks=1) , # 17.7217 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=13, tile_m=2, tile_n=2, w=6, v=26, threads=128, grouping=16, minblocks=12) , # 235.299 GFlop/s
  Kernel_dnt_medium(m=16, n=32, k=6, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 189.982 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=16, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 317.599 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 176.836 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=22, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 126.399 GFlop/s
  Kernel_dnt_largeDB(m=13, n=8, k=24, tile_m=1, tile_n=2, w=12, v=6, threads=96, grouping=16, minblocks=1) , # 151.064 GFlop/s
  Kernel_dnt_largeDB(m=6, n=24, k=32, tile_m=2, tile_n=1, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 171.227 GFlop/s
  Kernel_dnt_largeDB(m=16, n=8, k=22, tile_m=1, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=4) , # 175.569 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 126.365 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=17, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 144.57 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=22, tile_m=2, tile_n=3, w=10, v=12, threads=128, grouping=16, minblocks=8) , # 361.851 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 60.4699 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=23, tile_m=2, tile_n=3, w=6, v=16, threads=96, grouping=16, minblocks=12) , # 275.98 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 69.4042 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 132.184 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 113.32 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=23, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 311.315 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=23, tile_m=2, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 331.261 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 56.9443 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=22, tile_m=3, tile_n=2, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 321.992 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 106.3 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 133.637 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 29.9379 GFlop/s
  Kernel_dnt_largeDB(m=13, n=16, k=17, tile_m=2, tile_n=2, w=6, v=16, threads=96, grouping=16, minblocks=1) , # 185.642 GFlop/s
  Kernel_dnt_largeDB(m=26, n=8, k=26, tile_m=2, tile_n=2, w=10, v=8, threads=96, grouping=16, minblocks=12) , # 201.388 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=5, tile_m=1, tile_n=3, threads=96, grouping=16, minblocks=8) , # 132.846 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 135.026 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 214.624 GFlop/s
  Kernel_dnt_medium(m=22, n=32, k=6, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 213.238 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 66.7856 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 49.3659 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=9, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 207.496 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 53.1228 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 148.585 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=32, tile_m=3, tile_n=3, w=8, v=10, threads=96, grouping=16, minblocks=12) , # 401.738 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 17.8383 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=26, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 154.232 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 226.085 GFlop/s
  Kernel_dnt_largeDB(m=9, n=22, k=32, tile_m=2, tile_n=2, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 207.422 GFlop/s
  Kernel_dnt_largeDB(m=9, n=16, k=16, tile_m=1, tile_n=2, w=8, v=10, threads=96, grouping=16, minblocks=1) , # 161.565 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=1) , # 150.43 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 170.247 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=17, tile_m=3, tile_n=2, w=8, v=28, threads=96, grouping=16, minblocks=12) , # 292.771 GFlop/s
  Kernel_dnt_medium(m=16, n=8, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 150.445 GFlop/s
  Kernel_dnt_largeDB(m=16, n=23, k=23, tile_m=2, tile_n=2, w=10, v=16, threads=128, grouping=16, minblocks=12) , # 282.297 GFlop/s
  Kernel_dnt_medium(m=24, n=6, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 120.439 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=4, tile_m=4, tile_n=2, w=2, v=20, threads=128, grouping=16, minblocks=1) , # 181.09 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 187.276 GFlop/s
  Kernel_dnt_largeDB(m=4, n=8, k=32, tile_m=1, tile_n=1, w=16, v=8, threads=128, grouping=16, minblocks=1) , # 83.7638 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=24, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 151.977 GFlop/s
  Kernel_dnt_largeDB(m=4, n=32, k=8, tile_m=1, tile_n=1, w=4, v=32, threads=128, grouping=16, minblocks=8) , # 101.579 GFlop/s
  Kernel_dnt_largeDB(m=6, n=23, k=16, tile_m=2, tile_n=1, w=8, v=16, threads=96, grouping=16, minblocks=4) , # 145.791 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=8, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 192.178 GFlop/s
  Kernel_dnt_largeDB(m=5, n=26, k=24, tile_m=1, tile_n=1, w=12, v=26, threads=160, grouping=16, minblocks=12) , # 142.549 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 316.266 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=32, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 308.173 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=17, tile_m=2, tile_n=3, w=6, v=14, threads=160, grouping=16, minblocks=8) , # 274.845 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 43.884 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 131.042 GFlop/s
  Kernel_dnt_largeDB(m=23, n=13, k=23, tile_m=3, tile_n=2, w=6, v=8, threads=96, grouping=16, minblocks=12) , # 232.991 GFlop/s
  Kernel_dnt_small(m=6, n=6, k=9, tile_m=1, tile_n=1, threads=64, grouping=16, minblocks=4) , # 54.7869 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 184.007 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 161.29 GFlop/s
  Kernel_dnt_medium(m=5, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 69.8459 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=26, split_thread=32, threads=96, grouping=16, minblocks=1) , # 52.2776 GFlop/s
  Kernel_dnt_largeDB(m=15, n=15, k=225, tile_m=2, tile_n=2, w=16, v=10, threads=96, grouping=16, minblocks=12) , # 311.643 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=22, tile_m=2, tile_n=3, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 350.804 GFlop/s
  Kernel_dnt_tiny(m=6, n=5, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 25.6146 GFlop/s
  Kernel_dnt_tiny(m=8, n=8, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 62.8469 GFlop/s
  Kernel_dnt_medium(m=23, n=32, k=5, tile_m=3, tile_n=2, threads=192, grouping=16, minblocks=8) , # 200.317 GFlop/s
  Kernel_dnt_small(m=5, n=8, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 59.2125 GFlop/s
  Kernel_dnt_medium(m=11, n=121, k=11, tile_m=3, tile_n=2, threads=256, grouping=16, minblocks=4) , # 264.615 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=9, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 211.963 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 101.258 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 158.407 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 101.84 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=23, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 135.677 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 111.977 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=23, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 144.504 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=9, tile_m=2, tile_n=3, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 222.676 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 257.375 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=55, tile_m=2, tile_n=2, w=14, v=16, threads=128, grouping=16, minblocks=12) , # 303.184 GFlop/s
  Kernel_dnt_medium(m=22, n=26, k=8, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 217.596 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=16, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 136.417 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 118.252 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=5, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 111.697 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 93.8914 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 72.9839 GFlop/s
  Kernel_dnt_medium(m=32, n=8, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 213.369 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 215.17 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=17, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 231.149 GFlop/s
  Kernel_dnt_largeDB(m=64, n=64, k=64, tile_m=3, tile_n=6, w=16, v=32, threads=256, grouping=16, minblocks=1) , # 563.614 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 78.7409 GFlop/s
  Kernel_dnt_tiny(m=6, n=4, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 25.3088 GFlop/s
  Kernel_dnt_largeDB(m=9, n=8, k=24, tile_m=1, tile_n=1, w=12, v=8, threads=128, grouping=16, minblocks=8) , # 127.47 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 43.8484 GFlop/s
  Kernel_dnt_medium(m=17, n=24, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 149.438 GFlop/s
  Kernel_dnt_largeDB(m=23, n=6, k=23, tile_m=1, tile_n=2, w=8, v=6, threads=96, grouping=16, minblocks=8) , # 148.475 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 304.306 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 122.251 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=13, tile_m=2, tile_n=2, w=6, v=24, threads=128, grouping=16, minblocks=12) , # 290.526 GFlop/s
  Kernel_dnt_largeDB(m=9, n=23, k=24, tile_m=2, tile_n=2, w=10, v=12, threads=128, grouping=16, minblocks=12) , # 203.27 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=17, tile_m=3, tile_n=2, w=6, v=24, threads=160, grouping=16, minblocks=8) , # 300.196 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 138.221 GFlop/s
  Kernel_dnt_largeDB(m=17, n=8, k=26, tile_m=2, tile_n=3, w=12, v=8, threads=96, grouping=16, minblocks=12) , # 158.262 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=6, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 150.539 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 144.455 GFlop/s
  Kernel_dnt_largeDB(m=24, n=17, k=32, tile_m=2, tile_n=2, w=16, v=16, threads=128, grouping=16, minblocks=4) , # 299.296 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 97.9858 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 110.269 GFlop/s
  Kernel_dnt_largeDB(m=22, n=9, k=24, tile_m=2, tile_n=2, w=12, v=6, threads=128, grouping=16, minblocks=12) , # 200.245 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=6, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 176.832 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=26, tile_m=2, tile_n=1, w=10, v=22, threads=128, grouping=16, minblocks=12) , # 191.916 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 204.952 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 179.063 GFlop/s
  Kernel_dnt_largeDB(m=6, n=32, k=22, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 177.946 GFlop/s
  Kernel_dnt_largeDB(m=5, n=24, k=8, tile_m=1, tile_n=1, w=4, v=24, threads=128, grouping=16, minblocks=12) , # 100.583 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=8, tile_m=3, tile_n=2, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 217.558 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=9, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 222.138 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=64, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 357.117 GFlop/s
  Kernel_dnt_largeDB(m=16, n=17, k=24, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 241.704 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=24, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 128.616 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 205.3 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 46.7754 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 150.275 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=17, tile_m=3, tile_n=3, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 322.465 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 83.4097 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 61.4407 GFlop/s
  Kernel_dnt_largeDB(m=13, n=13, k=32, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 200.726 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=17, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 217.951 GFlop/s
  Kernel_dnt_small(m=32, n=13, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 135.558 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=9, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 220.336 GFlop/s
  Kernel_dnt_tiny(m=16, n=8, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 78.0533 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=6, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 76.2473 GFlop/s
  Kernel_dnt_largeDB(m=24, n=8, k=32, tile_m=2, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=1) , # 218.41 GFlop/s
  Kernel_dnt_largeDB(m=24, n=6, k=16, tile_m=1, tile_n=2, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 155.633 GFlop/s
  Kernel_dnt_medium(m=13, n=8, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 151.357 GFlop/s
  Kernel_dnt_largeDB(m=55, n=16, k=29, tile_m=3, tile_n=3, w=6, v=8, threads=128, grouping=16, minblocks=8) , # 369.726 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 137.477 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 278.933 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 109.504 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 77.3562 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=17, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 121.911 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 107.25 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 272.615 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=6, tile_m=5, tile_n=2, w=2, v=14, threads=96, grouping=16, minblocks=12) , # 220.557 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 209.088 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 86.1888 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=16, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 120.717 GFlop/s
  Kernel_dnt_largeDB(m=4, n=24, k=32, tile_m=1, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=4) , # 131.226 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 38.8056 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 94.6073 GFlop/s
  Kernel_dnt_medium(m=6, n=8, k=17, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 75.7991 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 43.2513 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=8, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 279.051 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 174.528 GFlop/s
  Kernel_dnt_largeDB(m=16, n=13, k=32, tile_m=2, tile_n=2, w=16, v=12, threads=128, grouping=16, minblocks=12) , # 234.974 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 64.3579 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=23, split_thread=32, threads=128, grouping=16, minblocks=1) , # 50.2012 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=17, split_thread=32, threads=128, grouping=16, minblocks=1) , # 54.3192 GFlop/s
  Kernel_dnt_largeDB(m=5, n=32, k=23, tile_m=1, tile_n=2, w=6, v=32, threads=96, grouping=16, minblocks=8) , # 153.738 GFlop/s
  Kernel_dnt_largeDB(m=13, n=22, k=32, tile_m=3, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 252.527 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=13, tile_m=2, tile_n=4, threads=96, grouping=16, minblocks=8) , # 309.16 GFlop/s
  Kernel_dnt_largeDB(m=13, n=6, k=32, tile_m=1, tile_n=1, w=16, v=6, threads=96, grouping=16, minblocks=12) , # 138.459 GFlop/s
  Kernel_dnt_small(m=9, n=16, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 101.254 GFlop/s
  Kernel_dnt_largeDB(m=9, n=17, k=26, tile_m=1, tile_n=2, w=10, v=10, threads=96, grouping=16, minblocks=4) , # 167.993 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 167.762 GFlop/s
  Kernel_dnt_largeDB(m=24, n=16, k=22, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 305.434 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=17, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 142.699 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 152.877 GFlop/s
  Kernel_dnt_largeDB(m=5, n=32, k=24, tile_m=1, tile_n=2, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 156.997 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=23, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 157.04 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 150.565 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 53.2937 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 188.882 GFlop/s
  Kernel_dnt_largeDB(m=55, n=16, k=55, tile_m=5, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=8) , # 426.451 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 120.304 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 102.487 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 104.769 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 139.051 GFlop/s
  Kernel_dnt_largeDB(m=32, n=9, k=32, tile_m=2, tile_n=2, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 249.931 GFlop/s
  Kernel_dnt_medium(m=32, n=8, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 174.913 GFlop/s
  Kernel_dnt_small(m=26, n=4, k=4, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 59.2665 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 82.5687 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 149.841 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=8, split_thread=32, threads=128, grouping=16, minblocks=1) , # 31.7227 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=16, tile_m=2, tile_n=3, w=4, v=14, threads=96, grouping=16, minblocks=12) , # 298.281 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=8, tile_m=2, tile_n=2, w=4, v=12, threads=128, grouping=16, minblocks=12) , # 237.248 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 153.276 GFlop/s
  Kernel_dnt_largeDB(m=32, n=5, k=23, tile_m=2, tile_n=1, w=6, v=2, threads=96, grouping=16, minblocks=4) , # 154.957 GFlop/s
  Kernel_dnt_largeDB(m=9, n=64, k=64, tile_m=3, tile_n=2, w=10, v=64, threads=128, grouping=16, minblocks=8) , # 319.273 GFlop/s
  Kernel_dnt_medium(m=23, n=26, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 177.384 GFlop/s
  Kernel_dnt_medium(m=17, n=24, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 236.192 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 165.895 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=6, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 169.805 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=8) , # 255.121 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 128.978 GFlop/s
  Kernel_dnt_largeDB(m=13, n=22, k=23, tile_m=2, tile_n=2, w=8, v=22, threads=96, grouping=16, minblocks=12) , # 227.833 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 151.642 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=9, tile_m=1, tile_n=5, threads=96, grouping=16, minblocks=12) , # 192.159 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 50.793 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 77.9158 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 179.196 GFlop/s
  Kernel_dnt_largeDB(m=4, n=16, k=32, tile_m=1, tile_n=1, w=16, v=16, threads=128, grouping=16, minblocks=12) , # 119.269 GFlop/s
  Kernel_dnt_largeDB(m=29, n=14, k=29, tile_m=2, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=8) , # 274.397 GFlop/s
  Kernel_dnt_medium(m=16, n=9, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 82.0886 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=17, tile_m=6, tile_n=1, threads=96, grouping=16, minblocks=8) , # 145.146 GFlop/s
  Kernel_dnt_medium(m=32, n=23, k=4, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 170.118 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 85.3823 GFlop/s
  Kernel_dnt_medium(m=8, n=24, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=8) , # 154.155 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=8, tile_m=3, tile_n=2, w=4, v=12, threads=128, grouping=16, minblocks=12) , # 231.867 GFlop/s
  Kernel_dnt_largeDB(m=16, n=24, k=24, tile_m=2, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 313.429 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 34.5786 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 114.731 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 76.4154 GFlop/s
  Kernel_dnt_largeDB(m=4, n=23, k=24, tile_m=1, tile_n=1, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 119.172 GFlop/s
  Kernel_dnt_largeDB(m=22, n=22, k=32, tile_m=2, tile_n=2, w=16, v=22, threads=128, grouping=16, minblocks=4) , # 345.889 GFlop/s
  Kernel_dnt_small(m=6, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 48.547 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 125.677 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=16, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 132.816 GFlop/s
  Kernel_dnt_largeDB(m=32, n=9, k=23, tile_m=2, tile_n=3, w=6, v=6, threads=96, grouping=16, minblocks=12) , # 230.82 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 90.6552 GFlop/s
  Kernel_dnt_medium(m=13, n=32, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 236.128 GFlop/s
  Kernel_dnt_tiny(m=5, n=5, k=9, split_thread=32, threads=96, grouping=16, minblocks=1) , # 41.6535 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 79.8984 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 44.6668 GFlop/s
  Kernel_dnt_medium(m=16, n=26, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 166.297 GFlop/s
  Kernel_dnt_largeDB(m=13, n=23, k=24, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 243.006 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=13, tile_m=2, tile_n=2, w=6, v=8, threads=128, grouping=16, minblocks=12) , # 244.245 GFlop/s
  Kernel_dnt_tiny(m=6, n=17, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 60.5441 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 116.617 GFlop/s
  Kernel_dnt_largeDB(m=23, n=5, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=8) , # 148.711 GFlop/s
  Kernel_dnt_largeDB(m=9, n=26, k=32, tile_m=1, tile_n=2, w=12, v=26, threads=128, grouping=16, minblocks=12) , # 227.989 GFlop/s
  Kernel_dnt_largeDB(m=26, n=16, k=24, tile_m=2, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=1) , # 283.395 GFlop/s
  Kernel_dnt_medium(m=8, n=23, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 147.212 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=32, tile_m=2, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=8) , # 211.83 GFlop/s
  Kernel_dnt_largeDB(m=6, n=32, k=24, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 184.552 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=17, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 84.3988 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=26, tile_m=2, tile_n=3, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 325.286 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 44.3641 GFlop/s
  Kernel_dnt_largeDB(m=6, n=24, k=26, tile_m=1, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=4) , # 158.81 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 87.1602 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=13, tile_m=3, tile_n=2, w=6, v=20, threads=160, grouping=16, minblocks=8) , # 306.849 GFlop/s
  Kernel_dnt_largeDB(m=6, n=8, k=24, tile_m=1, tile_n=1, w=12, v=8, threads=128, grouping=16, minblocks=1) , # 99.5961 GFlop/s
  Kernel_dnt_largeDB(m=17, n=23, k=24, tile_m=2, tile_n=2, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 274.723 GFlop/s
  Kernel_dnt_largeDB(m=17, n=16, k=32, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 257.664 GFlop/s
  Kernel_dnt_small(m=5, n=5, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 58.79 GFlop/s
  Kernel_dnt_tiny(m=5, n=6, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 39.6543 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 119.245 GFlop/s
  Kernel_dnt_medium(m=26, n=26, k=4, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=4) , # 175.533 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=23, tile_m=3, tile_n=2, w=10, v=32, threads=128, grouping=16, minblocks=8) , # 369.126 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=14, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 197.885 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=17, tile_m=2, tile_n=4, w=8, v=24, threads=128, grouping=16, minblocks=8) , # 325.543 GFlop/s
  Kernel_dnt_largeDB(m=17, n=22, k=22, tile_m=3, tile_n=2, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 245.288 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=1) , # 147.417 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=13, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 172.352 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 73.6794 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=17, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 157.425 GFlop/s
  Kernel_dnt_medium(m=16, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 147.837 GFlop/s
  Kernel_dnt_largeDB(m=13, n=24, k=16, tile_m=2, tile_n=2, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 223.93 GFlop/s
  Kernel_dnt_medium(m=23, n=26, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 202.501 GFlop/s
  Kernel_dnt_small(m=16, n=16, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 161.476 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=22, tile_m=2, tile_n=3, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 305.508 GFlop/s
  Kernel_dnt_largeDB(m=24, n=9, k=24, tile_m=2, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 216.095 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 72.4236 GFlop/s
  Kernel_dnt_largeDB(m=9, n=6, k=24, tile_m=1, tile_n=1, w=12, v=6, threads=128, grouping=16, minblocks=4) , # 108.241 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 175.389 GFlop/s
  Kernel_dnt_largeDB(m=13, n=17, k=23, tile_m=2, tile_n=2, w=6, v=16, threads=128, grouping=16, minblocks=12) , # 216.451 GFlop/s
  Kernel_dnt_largeDB(m=9, n=26, k=23, tile_m=1, tile_n=2, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 205.283 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 83.3214 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 75.6469 GFlop/s
  Kernel_dnt_tiny(m=8, n=4, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 45.8575 GFlop/s
  Kernel_dnt_medium(m=8, n=26, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 157.749 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=22, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 140.116 GFlop/s
  Kernel_dnt_largeDB(m=8, n=16, k=26, tile_m=2, tile_n=1, w=10, v=6, threads=96, grouping=16, minblocks=1) , # 180.88 GFlop/s
  Kernel_dnt_small(m=5, n=6, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 52.8571 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=17, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 331.579 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 71.3946 GFlop/s
  Kernel_dnt_medium(m=32, n=17, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 237.103 GFlop/s
  Kernel_dnt_tiny(m=6, n=5, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 39.7844 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 139.438 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=8, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 289.124 GFlop/s
  Kernel_dnt_largeDB(m=29, n=16, k=55, tile_m=2, tile_n=4, w=6, v=6, threads=96, grouping=16, minblocks=12) , # 360.532 GFlop/s
  Kernel_dnt_small(m=5, n=8, k=16, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 67.8873 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 280.206 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 151.035 GFlop/s
  Kernel_dnt_medium(m=6, n=4, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 63.2716 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=32, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 483.614 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 48.1233 GFlop/s
  Kernel_dnt_largeDB(m=8, n=32, k=26, tile_m=2, tile_n=2, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 230.134 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 87.6399 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 103.505 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 76.9026 GFlop/s
  Kernel_dnt_medium(m=16, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 134.168 GFlop/s
  Kernel_dnt_medium(m=16, n=9, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 149.02 GFlop/s
  Kernel_dnt_medium(m=22, n=26, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 276.614 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 147.827 GFlop/s
  Kernel_dnt_small(m=5, n=32, k=6, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 95.4478 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 83.3154 GFlop/s
  Kernel_dnt_largeDB(m=32, n=8, k=22, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 222.523 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=16, tile_m=2, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 198.469 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 51.2778 GFlop/s
  Kernel_dnt_largeDB(m=17, n=17, k=26, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 240.047 GFlop/s
  Kernel_dnt_medium(m=5, n=24, k=22, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=1) , # 147.78 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 94.7903 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=16, tile_m=2, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 315.493 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=8, tile_m=3, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 231.681 GFlop/s
  Kernel_dnt_largeDB(m=22, n=13, k=24, tile_m=2, tile_n=3, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 233.378 GFlop/s
  Kernel_dnt_medium(m=32, n=16, k=6, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 189.814 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 92.0325 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 246.559 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=24, split_thread=32, threads=128, grouping=16, minblocks=1) , # 50.5776 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=22, tile_m=2, tile_n=3, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 335.764 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 35.3897 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=32, split_thread=32, threads=128, grouping=16, minblocks=1) , # 47.0079 GFlop/s
  Kernel_dnt_tiny(m=24, n=4, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 58.8119 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=22, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=8) , # 273.854 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=23, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 322.727 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 108.128 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 111.971 GFlop/s
  Kernel_dnt_largeDB(m=16, n=29, k=29, tile_m=2, tile_n=2, w=6, v=24, threads=128, grouping=16, minblocks=12) , # 312.556 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 118.17 GFlop/s
  Kernel_dnt_medium(m=13, n=8, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 99.8992 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 201.354 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 54.1312 GFlop/s
  Kernel_dnt_largeDB(m=26, n=13, k=24, tile_m=2, tile_n=2, w=12, v=10, threads=96, grouping=16, minblocks=12) , # 262.097 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=17, tile_m=2, tile_n=1, w=8, v=10, threads=96, grouping=16, minblocks=1) , # 178.959 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=26, tile_m=2, tile_n=3, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 342.548 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=8, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 183.327 GFlop/s
  Kernel_dnt_largeDB(m=16, n=64, k=16, tile_m=2, tile_n=4, w=8, v=40, threads=128, grouping=16, minblocks=8) , # 374.147 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 84.3077 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=4, tile_m=3, tile_n=2, w=2, v=16, threads=128, grouping=16, minblocks=12) , # 188.795 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=9, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 189.732 GFlop/s
  Kernel_dnt_medium(m=23, n=22, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 177.939 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=8, tile_m=3, tile_n=2, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 239.385 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=26, tile_m=2, tile_n=3, w=4, v=20, threads=128, grouping=16, minblocks=12) , # 379.484 GFlop/s
  Kernel_dnt_largeDB(m=17, n=26, k=24, tile_m=3, tile_n=2, w=8, v=26, threads=96, grouping=16, minblocks=12) , # 292.828 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 77.4137 GFlop/s
  Kernel_dnt_largeDB(m=5, n=17, k=24, tile_m=1, tile_n=1, w=12, v=14, threads=128, grouping=16, minblocks=12) , # 128.46 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=9, split_thread=32, threads=128, grouping=16, minblocks=1) , # 30.6105 GFlop/s
  Kernel_dnt_largeDB(m=12, n=144, k=12, tile_m=3, tile_n=3, w=6, v=80, threads=256, grouping=16, minblocks=4) , # 273.287 GFlop/s
  Kernel_dnt_largeDB(m=13, n=23, k=22, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 227.875 GFlop/s
  Kernel_dnt_largeDB(m=32, n=6, k=32, tile_m=2, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 191.164 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=13, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 153.58 GFlop/s
  Kernel_dnt_medium(m=16, n=8, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 121.754 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=24, tile_m=2, tile_n=4, w=12, v=14, threads=128, grouping=16, minblocks=8) , # 382.529 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=4, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 148.753 GFlop/s
  Kernel_dnt_medium(m=13, n=8, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 139.52 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=17, tile_m=2, tile_n=4, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 337.15 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 112.934 GFlop/s
  Kernel_dnt_small(m=17, n=5, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 67.5261 GFlop/s
  Kernel_dnt_medium(m=8, n=23, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 118.044 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 84.7969 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=1) , # 191.25 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 68.783 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 118.024 GFlop/s
  Kernel_dnt_small(m=6, n=6, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 65.899 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 158.986 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 126.451 GFlop/s
  Kernel_dnt_medium(m=8, n=16, k=17, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 166.714 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=17, tile_m=3, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 309.544 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=22, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 235.476 GFlop/s
  Kernel_dnt_largeDB(m=23, n=6, k=32, tile_m=2, tile_n=3, w=16, v=6, threads=96, grouping=16, minblocks=12) , # 164.982 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 121.134 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=24, tile_m=3, tile_n=3, w=8, v=10, threads=96, grouping=16, minblocks=12) , # 376.995 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=22, split_thread=32, threads=128, grouping=16, minblocks=1) , # 62.9255 GFlop/s
  Kernel_dnt_small(m=13, n=13, k=4, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 99.127 GFlop/s
  Kernel_dnt_largeDB(m=13, n=24, k=23, tile_m=2, tile_n=2, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 244.933 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=23, tile_m=1, tile_n=1, threads=224, grouping=16, minblocks=8) , # 162.634 GFlop/s
  Kernel_dnt_largeDB(m=32, n=13, k=32, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 317.465 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=32, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 117.743 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 99.4404 GFlop/s
  Kernel_dnt_small(m=5, n=8, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 48.3135 GFlop/s
  Kernel_dnt_largeDB(m=6, n=26, k=26, tile_m=1, tile_n=2, w=10, v=26, threads=96, grouping=16, minblocks=12) , # 160.058 GFlop/s
  Kernel_dnt_largeDB(m=16, n=8, k=32, tile_m=1, tile_n=1, w=16, v=8, threads=128, grouping=16, minblocks=1) , # 194.196 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=8, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 213.06 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 253.21 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=256, tile_m=2, tile_n=2, w=14, v=12, threads=128, grouping=16, minblocks=12) , # 341.89 GFlop/s
  Kernel_dnt_medium(m=8, n=32, k=17, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 213.609 GFlop/s
  Kernel_dnt_medium(m=14, n=16, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 230.421 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 80.8594 GFlop/s
  Kernel_dnt_small(m=8, n=22, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 102.251 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 98.5145 GFlop/s
  Kernel_dnt_small(m=6, n=24, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 77.1222 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 127.777 GFlop/s
  Kernel_dnt_tiny(m=23, n=5, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 66.3956 GFlop/s
  Kernel_dnt_medium(m=22, n=26, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 150.577 GFlop/s
  Kernel_dnt_small(m=13, n=6, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 77.0262 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=9, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 229.052 GFlop/s
  Kernel_dnt_largeDB(m=26, n=8, k=32, tile_m=2, tile_n=2, w=14, v=6, threads=128, grouping=16, minblocks=12) , # 214.072 GFlop/s
  Kernel_dnt_medium(m=32, n=8, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 206.098 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 185.538 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 93.3249 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 144.145 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=13, split_thread=32, threads=128, grouping=16, minblocks=1) , # 45.439 GFlop/s
  Kernel_dnt_medium(m=16, n=24, k=6, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 184.054 GFlop/s
  Kernel_dnt_small(m=9, n=4, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 40.9728 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=32, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 99.3711 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=13, tile_m=2, tile_n=2, w=6, v=12, threads=128, grouping=16, minblocks=12) , # 292.252 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 69.4004 GFlop/s
  Kernel_dnt_largeDB(m=32, n=5, k=26, tile_m=1, tile_n=1, w=10, v=4, threads=160, grouping=16, minblocks=12) , # 157.283 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 63.9309 GFlop/s
  Kernel_dnt_largeDB(m=6, n=16, k=32, tile_m=1, tile_n=1, w=16, v=16, threads=128, grouping=16, minblocks=12) , # 162.722 GFlop/s
  Kernel_dnt_largeDB(m=64, n=16, k=64, tile_m=2, tile_n=4, w=8, v=10, threads=128, grouping=16, minblocks=8) , # 501.245 GFlop/s
  Kernel_dnt_largeDB(m=32, n=6, k=22, tile_m=2, tile_n=1, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 177.592 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=8, tile_m=2, tile_n=2, w=4, v=24, threads=128, grouping=16, minblocks=12) , # 196.795 GFlop/s
  Kernel_dnt_largeDB(m=16, n=17, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 202.68 GFlop/s
  Kernel_dnt_largeDB(m=5, n=16, k=32, tile_m=1, tile_n=1, w=16, v=16, threads=128, grouping=16, minblocks=12) , # 139.49 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 156.829 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=8) , # 109.232 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=24, tile_m=2, tile_n=2, w=8, v=18, threads=128, grouping=16, minblocks=12) , # 301.847 GFlop/s
  Kernel_dnt_medium(m=8, n=32, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 175.277 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=9, tile_m=2, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 239.327 GFlop/s
  Kernel_dnt_small(m=6, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 52.5703 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=16, tile_m=2, tile_n=4, w=8, v=14, threads=96, grouping=16, minblocks=12) , # 373.707 GFlop/s
  Kernel_dnt_largeDB(m=17, n=6, k=24, tile_m=1, tile_n=1, w=12, v=6, threads=128, grouping=16, minblocks=12) , # 142.037 GFlop/s
  Kernel_dnt_medium(m=9, n=64, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 213.899 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=26, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 323.835 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 70.6265 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=22, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 275.664 GFlop/s
  Kernel_dnt_small(m=22, n=5, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 73.3009 GFlop/s
  Kernel_dnt_medium(m=24, n=8, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 110.447 GFlop/s
  Kernel_dnt_tiny(m=9, n=13, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 68.1072 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=64, tile_m=2, tile_n=2, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 312.221 GFlop/s
  Kernel_dnt_tiny(m=7, n=7, k=7, split_thread=32, threads=96, grouping=16, minblocks=1) , # 62.8401 GFlop/s
  Kernel_dnt_medium(m=5, n=17, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 54.7705 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 248.672 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 110.721 GFlop/s
  Kernel_dnt_tiny(m=5, n=24, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 70.2727 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=6, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 76.9185 GFlop/s
  Kernel_dnt_medium(m=22, n=23, k=4, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=4) , # 147.314 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=26, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 343.473 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 42.8558 GFlop/s
  Kernel_dnt_medium(m=22, n=24, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 172.584 GFlop/s
  Kernel_dnt_largeDB(m=22, n=13, k=22, tile_m=3, tile_n=2, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 221.172 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 59.5034 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=23, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 119.87 GFlop/s
  Kernel_dnt_largeDB(m=9, n=24, k=16, tile_m=1, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=1) , # 191.231 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 100.301 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=32, tile_m=2, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=12) , # 213.624 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=16, tile_m=2, tile_n=2, w=8, v=24, threads=128, grouping=16, minblocks=8) , # 260.668 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=24, tile_m=2, tile_n=3, w=12, v=20, threads=160, grouping=16, minblocks=8) , # 341.77 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 83.055 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 63.6068 GFlop/s
  Kernel_dnt_small(m=9, n=9, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 102.257 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=16, tile_m=2, tile_n=2, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 326.65 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 93.47 GFlop/s
  Kernel_dnt_largeDB(m=9, n=64, k=22, tile_m=3, tile_n=2, w=6, v=32, threads=96, grouping=16, minblocks=12) , # 285.617 GFlop/s
  Kernel_dnt_small(m=8, n=17, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 91.8413 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 241.006 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 66.0702 GFlop/s
  Kernel_dnt_small(m=13, n=8, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 81.3107 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 106.787 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=32, tile_m=2, tile_n=4, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 419.821 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=8, tile_m=4, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 264.277 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 58.1283 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 123.059 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=4, tile_m=3, tile_n=2, w=2, v=16, threads=128, grouping=16, minblocks=12) , # 180.711 GFlop/s
  Kernel_dnt_medium(m=13, n=4, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 103.008 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 165.569 GFlop/s
  Kernel_dnt_small(m=6, n=6, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 29.0092 GFlop/s
  Kernel_dnt_largeDB(m=8, n=17, k=32, tile_m=2, tile_n=3, w=16, v=16, threads=96, grouping=16, minblocks=12) , # 172.147 GFlop/s
  Kernel_dnt_small(m=8, n=16, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 121.25 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=26, tile_m=2, tile_n=3, w=10, v=20, threads=96, grouping=16, minblocks=12) , # 348.405 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 216.852 GFlop/s
  Kernel_dnt_largeDB(m=24, n=13, k=24, tile_m=3, tile_n=2, w=12, v=10, threads=96, grouping=16, minblocks=12) , # 253.032 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 106.081 GFlop/s
  Kernel_dnt_largeDB(m=64, n=64, k=22, tile_m=4, tile_n=4, w=8, v=16, threads=256, grouping=16, minblocks=1) , # 410.791 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 113.671 GFlop/s
  Kernel_dnt_tiny(m=8, n=8, k=8, split_thread=32, threads=128, grouping=16, minblocks=1) , # 90.7763 GFlop/s
  Kernel_dnt_medium(m=26, n=23, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 202.375 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 207.061 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=17, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 146.817 GFlop/s
  Kernel_dnt_largeDB(m=24, n=17, k=22, tile_m=2, tile_n=4, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 278.395 GFlop/s
  Kernel_dnt_medium(m=32, n=24, k=5, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=1) , # 192.606 GFlop/s
  Kernel_dnt_small(m=13, n=9, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 77.8963 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 113.174 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 121.92 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=22, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 163.987 GFlop/s
  Kernel_dnt_largeDB(m=16, n=6, k=24, tile_m=1, tile_n=1, w=12, v=6, threads=96, grouping=16, minblocks=8) , # 151.996 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 83.0577 GFlop/s
  Kernel_dnt_largeDB(m=22, n=22, k=26, tile_m=2, tile_n=2, w=8, v=14, threads=192, grouping=16, minblocks=8) , # 317.329 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 86.2243 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=24, tile_m=2, tile_n=4, w=8, v=10, threads=96, grouping=16, minblocks=12) , # 411.108 GFlop/s
  Kernel_dnt_largeDB(m=13, n=17, k=24, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 214.46 GFlop/s
  Kernel_dnt_tiny(m=8, n=4, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 28.4815 GFlop/s
  Kernel_dnt_small(m=32, n=8, k=4, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 125.786 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 160.165 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=4, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 116.716 GFlop/s
  Kernel_dnt_largeDB(m=8, n=32, k=22, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 222.573 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=22, tile_m=2, tile_n=3, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 362.792 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 16.1738 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=22, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 200.407 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=5, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 145.967 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 106.15 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 87.9872 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=23, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 157.854 GFlop/s
  Kernel_dnt_medium(m=17, n=24, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 260.23 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 237.976 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 81.0577 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 131.844 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 129.614 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=24, tile_m=1, tile_n=1, threads=224, grouping=16, minblocks=8) , # 168.798 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 104.824 GFlop/s
  Kernel_dnt_small(m=16, n=9, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=4) , # 120.113 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=6, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 73.2559 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=26, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 144.189 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 107.128 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 94.4818 GFlop/s
  Kernel_dnt_medium(m=6, n=32, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 107.109 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 124.782 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=23, tile_m=3, tile_n=2, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 341.553 GFlop/s
  Kernel_dnt_medium(m=24, n=22, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 284.038 GFlop/s
  Kernel_dnt_largeDB(m=13, n=5, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=96, grouping=16, minblocks=8) , # 115.955 GFlop/s
  Kernel_dnt_medium(m=9, n=4, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 35.1352 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 43.4494 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=23, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 205.862 GFlop/s
  Kernel_dnt_small(m=32, n=5, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 82.539 GFlop/s
  Kernel_dnt_largeDB(m=26, n=9, k=24, tile_m=2, tile_n=1, w=12, v=4, threads=128, grouping=16, minblocks=12) , # 218.442 GFlop/s
  Kernel_dnt_medium(m=8, n=22, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 172.127 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=22, tile_m=2, tile_n=3, w=6, v=20, threads=160, grouping=16, minblocks=8) , # 308.631 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=24, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 457.257 GFlop/s
  Kernel_dnt_small(m=5, n=4, k=23, tile_m=1, tile_n=1, threads=64, grouping=16, minblocks=4) , # 37.32 GFlop/s
  Kernel_dnt_small(m=5, n=5, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 50.9857 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 24.1362 GFlop/s
  Kernel_dnt_largeDB(m=24, n=16, k=32, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 330.08 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=5, split_thread=32, threads=96, grouping=16, minblocks=1) , # 21.2885 GFlop/s
  Kernel_dnt_largeDB(m=22, n=9, k=64, tile_m=2, tile_n=2, w=12, v=8, threads=96, grouping=16, minblocks=12) , # 239.452 GFlop/s
  Kernel_dnt_medium(m=13, n=22, k=4, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 117.05 GFlop/s
  Kernel_dnt_largeDB(m=64, n=22, k=16, tile_m=2, tile_n=6, w=8, v=14, threads=128, grouping=16, minblocks=1) , # 369.789 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=24, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 141.952 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=16, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 319.963 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=24, tile_m=3, tile_n=1, w=8, v=20, threads=128, grouping=16, minblocks=12) , # 239.302 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=24, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 292.083 GFlop/s
  Kernel_dnt_medium(m=16, n=8, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 178.527 GFlop/s
  Kernel_dnt_largeDB(m=16, n=29, k=16, tile_m=2, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 294.66 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 157.619 GFlop/s
  Kernel_dnt_tiny(m=8, n=13, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 86.1038 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=32, tile_m=2, tile_n=4, w=12, v=22, threads=128, grouping=16, minblocks=8) , # 403.53 GFlop/s
  Kernel_dnt_medium(m=8, n=23, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 178.045 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 64.4064 GFlop/s
  Kernel_dnt_largeDB(m=16, n=5, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=12) , # 138.029 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=22, tile_m=3, tile_n=2, w=10, v=8, threads=96, grouping=16, minblocks=12) , # 301.99 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 99.1141 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=23, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 330.118 GFlop/s
  Kernel_dnt_largeDB(m=16, n=16, k=26, tile_m=2, tile_n=2, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 263.235 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 71.391 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=4, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 116.442 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 36.6767 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 64.7027 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 210.814 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=6, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=12) , # 122.677 GFlop/s
  Kernel_dnt_medium(m=8, n=24, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 123.551 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=9, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 301.38 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=4, tile_m=1, tile_n=3, threads=96, grouping=16, minblocks=12) , # 109.311 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=16, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 135.751 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 233.808 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=4, split_thread=32, threads=96, grouping=16, minblocks=1) , # 14.8372 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 92.0394 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 114.794 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=16, tile_m=3, tile_n=2, w=4, v=26, threads=128, grouping=16, minblocks=12) , # 304.612 GFlop/s
  Kernel_dnt_largeDB(m=16, n=23, k=22, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 279.549 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 140 GFlop/s
  Kernel_dnt_largeDB(m=22, n=6, k=32, tile_m=3, tile_n=2, w=16, v=4, threads=96, grouping=16, minblocks=12) , # 163.029 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=5, tile_m=4, tile_n=2, w=2, v=20, threads=128, grouping=16, minblocks=8) , # 205.088 GFlop/s
  Kernel_dnt_medium(m=22, n=8, k=17, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 184.837 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=26, tile_m=2, tile_n=4, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 378.454 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 114.088 GFlop/s
  Kernel_dnt_largeDB(m=6, n=13, k=32, tile_m=1, tile_n=1, w=16, v=8, threads=128, grouping=16, minblocks=12) , # 134.667 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=23, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 147.626 GFlop/s
  Kernel_dnt_largeDB(m=8, n=22, k=23, tile_m=2, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=1) , # 185.959 GFlop/s
  Kernel_dnt_medium(m=6, n=24, k=22, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 154.822 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=1) , # 317.077 GFlop/s
  Kernel_dnt_medium(m=8, n=22, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 113.706 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 35.7692 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 72.8839 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=32, tile_m=3, tile_n=2, w=8, v=28, threads=128, grouping=16, minblocks=8) , # 417.571 GFlop/s
  Kernel_dnt_medium(m=8, n=26, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 183.678 GFlop/s
  Kernel_dnt_largeDB(m=8, n=16, k=22, tile_m=1, tile_n=1, w=8, v=16, threads=128, grouping=16, minblocks=1) , # 174.831 GFlop/s
  Kernel_dnt_largeDB(m=16, n=17, k=17, tile_m=3, tile_n=2, w=6, v=16, threads=128, grouping=16, minblocks=12) , # 210.423 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 74.4863 GFlop/s
  Kernel_dnt_medium(m=32, n=17, k=13, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 285.362 GFlop/s
  Kernel_dnt_largeDB(m=26, n=26, k=23, tile_m=2, tile_n=3, w=6, v=26, threads=160, grouping=16, minblocks=8) , # 348.123 GFlop/s
  Kernel_dnt_tiny(m=6, n=5, k=4, split_thread=32, threads=96, grouping=16, minblocks=1) , # 21.0382 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 163.56 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=24, tile_m=2, tile_n=3, w=12, v=12, threads=160, grouping=16, minblocks=8) , # 320.822 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=16, tile_m=3, tile_n=2, w=4, v=12, threads=128, grouping=16, minblocks=12) , # 289.65 GFlop/s
  Kernel_dnt_medium(m=49, n=7, k=7, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=8) , # 145.44 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=4, tile_m=5, tile_n=2, w=2, v=18, threads=96, grouping=16, minblocks=12) , # 170.647 GFlop/s
  Kernel_dnt_largeDB(m=17, n=23, k=8, tile_m=2, tile_n=2, w=4, v=22, threads=128, grouping=16, minblocks=12) , # 183.993 GFlop/s
  Kernel_dnt_largeDB(m=22, n=9, k=32, tile_m=2, tile_n=2, w=10, v=8, threads=96, grouping=16, minblocks=12) , # 207.544 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=32, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 150.503 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=16, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 129.951 GFlop/s
  Kernel_dnt_largeDB(m=14, n=29, k=16, tile_m=2, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 267.108 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 82.6919 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 111.224 GFlop/s
  Kernel_dnt_largeDB(m=6, n=22, k=16, tile_m=1, tile_n=2, w=8, v=14, threads=96, grouping=16, minblocks=4) , # 143.533 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=8, tile_m=3, tile_n=2, w=4, v=20, threads=96, grouping=16, minblocks=12) , # 218.859 GFlop/s
  Kernel_dnt_largeDB(m=22, n=17, k=16, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 245.41 GFlop/s
  Kernel_dnt_medium(m=22, n=26, k=9, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 231.7 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=26, tile_m=3, tile_n=2, w=8, v=20, threads=96, grouping=16, minblocks=12) , # 372.117 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=4, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 101.985 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 193.774 GFlop/s
  Kernel_dnt_largeDB(m=17, n=23, k=22, tile_m=2, tile_n=2, w=10, v=22, threads=128, grouping=16, minblocks=12) , # 263.902 GFlop/s
  Kernel_dnt_largeDB(m=169, n=13, k=13, tile_m=3, tile_n=4, w=6, v=10, threads=256, grouping=16, minblocks=1) , # 221.427 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=17, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 202.974 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 35.8024 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 121.218 GFlop/s
  Kernel_dnt_medium(m=23, n=6, k=22, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 153.003 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 89.6464 GFlop/s
  Kernel_dnt_medium(m=8, n=32, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 193.414 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 122.011 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=8, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 240.36 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 136.931 GFlop/s
  Kernel_dnt_medium(m=6, n=17, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 122.369 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 157.597 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 93.8793 GFlop/s
  Kernel_dnt_medium(m=22, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 119.067 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.565 GFlop/s
  Kernel_dnt_largeDB(m=17, n=8, k=32, tile_m=3, tile_n=2, w=16, v=8, threads=96, grouping=16, minblocks=12) , # 178.979 GFlop/s
  Kernel_dnt_medium(m=5, n=13, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 94.5206 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 194.247 GFlop/s
  Kernel_dnt_largeDB(m=6, n=32, k=23, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 179.578 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=13, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 223.261 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=8, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 191.191 GFlop/s
  Kernel_dnt_largeDB(m=32, n=4, k=26, tile_m=1, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 135.188 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 131.35 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 196.752 GFlop/s
  Kernel_dnt_medium(m=17, n=22, k=5, tile_m=1, tile_n=5, threads=128, grouping=16, minblocks=12) , # 148.4 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 107.414 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 81.2228 GFlop/s
  Kernel_dnt_medium(m=4, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 63.1415 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=13, split_thread=32, threads=128, grouping=16, minblocks=1) , # 42.687 GFlop/s
  Kernel_dnt_medium(m=23, n=22, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 270.777 GFlop/s
  Kernel_dnt_largeDB(m=9, n=16, k=24, tile_m=1, tile_n=2, w=12, v=14, threads=128, grouping=16, minblocks=12) , # 170.479 GFlop/s
  Kernel_dnt_medium(m=9, n=8, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 103.155 GFlop/s
  Kernel_dnt_medium(m=17, n=32, k=9, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 233.382 GFlop/s
  Kernel_dnt_medium(m=16, n=8, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 91.3267 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=8, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 202.085 GFlop/s
  Kernel_dnt_medium(m=24, n=6, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 148.725 GFlop/s
  Kernel_dnt_largeDB(m=22, n=8, k=24, tile_m=2, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=8) , # 197.07 GFlop/s
  Kernel_dnt_medium(m=32, n=22, k=13, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 325.002 GFlop/s
  Kernel_dnt_largeDB(m=16, n=55, k=16, tile_m=2, tile_n=4, w=8, v=32, threads=128, grouping=16, minblocks=8) , # 331.33 GFlop/s
  Kernel_dnt_largeDB(m=8, n=13, k=16, tile_m=1, tile_n=1, w=8, v=10, threads=128, grouping=16, minblocks=12) , # 135.239 GFlop/s
  Kernel_dnt_largeDB(m=26, n=16, k=22, tile_m=2, tile_n=4, w=10, v=10, threads=96, grouping=16, minblocks=12) , # 283.175 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 113.713 GFlop/s
  Kernel_dnt_largeDB(m=23, n=16, k=24, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 297.891 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=4, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 72.6709 GFlop/s
  Kernel_dnt_largeDB(m=17, n=17, k=32, tile_m=3, tile_n=2, w=14, v=12, threads=96, grouping=16, minblocks=12) , # 260.215 GFlop/s
  Kernel_dnt_largeDB(m=22, n=23, k=32, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 332.066 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=8, tile_m=2, tile_n=2, threads=192, grouping=16, minblocks=8) , # 227.684 GFlop/s
  Kernel_dnt_small(m=16, n=16, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 205.549 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 85.686 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 88.3985 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 78.6654 GFlop/s
  Kernel_dnt_medium(m=4, n=22, k=17, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 121.506 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 33.1999 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=5, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 104.199 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=23, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=8) , # 141.081 GFlop/s
  Kernel_dnt_medium(m=26, n=5, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 83.8041 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 126.065 GFlop/s
  Kernel_dnt_medium(m=8, n=16, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 150.649 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=9, tile_m=3, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 280.794 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=5, tile_m=1, tile_n=6, threads=96, grouping=16, minblocks=12) , # 155.547 GFlop/s
  Kernel_dnt_medium(m=24, n=24, k=4, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=8) , # 171.861 GFlop/s
  Kernel_dnt_medium(m=4, n=8, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 67.3422 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=22, split_thread=32, threads=128, grouping=16, minblocks=1) , # 48.5357 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 141.808 GFlop/s
  Kernel_dnt_medium(m=32, n=17, k=6, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 186.207 GFlop/s
  Kernel_dnt_small(m=8, n=8, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 110.805 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 121.474 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 32.9231 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=22, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 164.867 GFlop/s
  Kernel_dnt_medium(m=16, n=23, k=4, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 143.468 GFlop/s
  Kernel_dnt_medium(m=24, n=17, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 269.754 GFlop/s
  Kernel_dnt_largeDB(m=24, n=16, k=23, tile_m=2, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 305.948 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=22, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 147.34 GFlop/s
  Kernel_dnt_medium(m=24, n=5, k=16, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 143.808 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=22, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 157.352 GFlop/s
  Kernel_dnt_largeDB(m=6, n=24, k=8, tile_m=2, tile_n=1, w=4, v=24, threads=96, grouping=16, minblocks=8) , # 110.004 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 105.06 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=9, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 202.404 GFlop/s
  Kernel_dnt_medium(m=5, n=22, k=24, tile_m=1, tile_n=1, threads=192, grouping=16, minblocks=8) , # 138.483 GFlop/s
  Kernel_dnt_largeDB(m=24, n=8, k=26, tile_m=2, tile_n=1, w=10, v=8, threads=128, grouping=16, minblocks=12) , # 205.917 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 101.406 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=26, tile_m=2, tile_n=1, w=10, v=4, threads=96, grouping=16, minblocks=1) , # 180.319 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 168.934 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 87.2119 GFlop/s
  Kernel_dnt_medium(m=32, n=8, k=9, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 185.745 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=24, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 222.453 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=5, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 60.5944 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=32, tile_m=2, tile_n=3, w=12, v=32, threads=128, grouping=16, minblocks=8) , # 392.581 GFlop/s
  Kernel_dnt_medium(m=16, n=14, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 230.5 GFlop/s
  Kernel_dnt_small(m=16, n=13, k=8, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 146.46 GFlop/s
  Kernel_dnt_medium(m=24, n=23, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 176.59 GFlop/s
  Kernel_dnt_largeDB(m=4, n=13, k=32, tile_m=1, tile_n=1, w=16, v=10, threads=128, grouping=16, minblocks=12) , # 106.789 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=9, split_thread=32, threads=128, grouping=16, minblocks=1) , # 33.4619 GFlop/s
  Kernel_dnt_small(m=23, n=8, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 96.1136 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 56.5091 GFlop/s
  Kernel_dnt_largeDB(m=4, n=16, k=26, tile_m=1, tile_n=1, w=12, v=16, threads=96, grouping=16, minblocks=8) , # 109.271 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=9, tile_m=2, tile_n=2, w=4, v=8, threads=128, grouping=16, minblocks=12) , # 241.881 GFlop/s
  Kernel_dnt_medium(m=32, n=5, k=22, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=1) , # 158.723 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 78.5539 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 83.6633 GFlop/s
  Kernel_dnt_medium(m=5, n=9, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 43.5479 GFlop/s
  Kernel_dnt_medium(m=23, n=26, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 153.591 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=5, split_thread=32, threads=64, grouping=16, minblocks=1) , # 17.5227 GFlop/s
  Kernel_dnt_medium(m=24, n=26, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 192.911 GFlop/s
  Kernel_dnt_medium(m=22, n=6, k=13, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 137.639 GFlop/s
  Kernel_dnt_medium(m=24, n=4, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 125.42 GFlop/s
  Kernel_dnt_medium(m=23, n=8, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 148.553 GFlop/s
  Kernel_dnt_largeDB(m=13, n=22, k=22, tile_m=3, tile_n=2, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 225.453 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=8, tile_m=1, tile_n=4, threads=96, grouping=16, minblocks=12) , # 132.298 GFlop/s
  Kernel_dnt_largeDB(m=9, n=23, k=32, tile_m=1, tile_n=2, w=14, v=12, threads=128, grouping=16, minblocks=12) , # 209.275 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 85.5436 GFlop/s
  Kernel_dnt_largeDB(m=16, n=26, k=22, tile_m=2, tile_n=4, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 285.563 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=16, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 177.395 GFlop/s
  Kernel_dnt_largeDB(m=29, n=55, k=29, tile_m=5, tile_n=3, w=10, v=44, threads=160, grouping=16, minblocks=4) , # 384.341 GFlop/s
  Kernel_dnt_medium(m=5, n=32, k=16, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=4) , # 148.849 GFlop/s
  Kernel_dnt_largeDB(m=16, n=64, k=64, tile_m=2, tile_n=4, w=8, v=44, threads=128, grouping=16, minblocks=8) , # 500.346 GFlop/s
  Kernel_dnt_largeDB(m=26, n=17, k=26, tile_m=2, tile_n=2, w=6, v=8, threads=128, grouping=16, minblocks=12) , # 287.445 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 126.591 GFlop/s
  Kernel_dnt_largeDB(m=32, n=8, k=32, tile_m=2, tile_n=2, w=12, v=8, threads=96, grouping=16, minblocks=12) , # 239.979 GFlop/s
  Kernel_dnt_largeDB(m=23, n=17, k=32, tile_m=2, tile_n=2, w=16, v=14, threads=192, grouping=16, minblocks=8) , # 281.666 GFlop/s
  Kernel_dnt_medium(m=100, n=10, k=10, tile_m=2, tile_n=2, threads=256, grouping=16, minblocks=4) , # 226.917 GFlop/s
  Kernel_dnt_medium(m=16, n=13, k=17, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=1) , # 191.644 GFlop/s
  Kernel_dnt_medium(m=24, n=22, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 190.105 GFlop/s
  Kernel_dnt_largeDB(m=22, n=24, k=9, tile_m=2, tile_n=3, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 230.841 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=32, split_thread=32, threads=128, grouping=16, minblocks=1) , # 56.7134 GFlop/s
  Kernel_dnt_medium(m=4, n=26, k=16, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.965 GFlop/s
  Kernel_dnt_largeDB(m=4, n=26, k=32, tile_m=1, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=8) , # 131.498 GFlop/s
  Kernel_dnt_largeDB(m=32, n=16, k=16, tile_m=2, tile_n=2, w=8, v=12, threads=128, grouping=16, minblocks=12) , # 329.973 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=17, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 241.87 GFlop/s
  Kernel_dnt_small(m=6, n=4, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 42.7597 GFlop/s
  Kernel_dnt_medium(m=32, n=4, k=17, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=8) , # 132.452 GFlop/s
  Kernel_dnt_medium(m=6, n=9, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 110.172 GFlop/s
  Kernel_dnt_largeDB(m=32, n=9, k=22, tile_m=1, tile_n=3, w=6, v=6, threads=128, grouping=16, minblocks=12) , # 226.772 GFlop/s
  Kernel_dnt_largeDB(m=22, n=64, k=64, tile_m=3, tile_n=4, w=8, v=24, threads=128, grouping=16, minblocks=1) , # 504.56 GFlop/s
  Kernel_dnt_tiny(m=5, n=5, k=6, split_thread=32, threads=64, grouping=16, minblocks=1) , # 33.673 GFlop/s
  Kernel_dnt_small(m=23, n=6, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 71.0708 GFlop/s
  Kernel_dnt_small(m=9, n=9, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 119.074 GFlop/s
  Kernel_dnt_medium(m=22, n=22, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 192.65 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=26, tile_m=2, tile_n=3, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 342.458 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 216.145 GFlop/s
  Kernel_dnt_largeDB(m=9, n=32, k=23, tile_m=3, tile_n=1, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 232.646 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=6, tile_m=3, tile_n=2, w=2, v=16, threads=96, grouping=16, minblocks=12) , # 182.522 GFlop/s
  Kernel_dnt_largeDB(m=24, n=6, k=24, tile_m=2, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 163.99 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 150.647 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=4, tile_m=3, tile_n=2, w=2, v=12, threads=128, grouping=16, minblocks=12) , # 174.057 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 105.704 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 266.036 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=9, tile_m=1, tile_n=3, threads=96, grouping=16, minblocks=12) , # 179.436 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=32, tile_m=2, tile_n=3, w=14, v=16, threads=128, grouping=16, minblocks=8) , # 328.943 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 42.4259 GFlop/s
  Kernel_dnt_largeDB(m=4, n=22, k=24, tile_m=1, tile_n=1, w=8, v=22, threads=96, grouping=16, minblocks=12) , # 121.22 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 189.324 GFlop/s
  Kernel_dnt_largeDB(m=14, n=29, k=29, tile_m=2, tile_n=2, w=10, v=20, threads=160, grouping=16, minblocks=8) , # 272.491 GFlop/s
  Kernel_dnt_largeDB(m=32, n=5, k=32, tile_m=2, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 163.6 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 102.308 GFlop/s
  Kernel_dnt_medium(m=26, n=8, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 147.207 GFlop/s
  Kernel_dnt_medium(m=23, n=24, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 196.262 GFlop/s
  Kernel_dnt_small(m=5, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 60.0103 GFlop/s
  Kernel_dnt_tiny(m=4, n=8, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 45.9955 GFlop/s
  Kernel_dnt_largeDB(m=64, n=9, k=22, tile_m=2, tile_n=3, w=6, v=4, threads=96, grouping=16, minblocks=12) , # 286.988 GFlop/s
  Kernel_dnt_largeDB(m=13, n=24, k=24, tile_m=3, tile_n=2, w=12, v=24, threads=96, grouping=16, minblocks=12) , # 256.27 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=22, tile_m=3, tile_n=2, w=10, v=32, threads=128, grouping=16, minblocks=8) , # 360.996 GFlop/s
  Kernel_dnt_medium(m=6, n=16, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 63.5995 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 150.084 GFlop/s
  Kernel_dnt_largeDB(m=22, n=64, k=9, tile_m=3, tile_n=4, w=4, v=40, threads=128, grouping=16, minblocks=4) , # 278.224 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=22, tile_m=2, tile_n=4, w=8, v=28, threads=128, grouping=16, minblocks=8) , # 358.85 GFlop/s
  Kernel_dnt_largeDB(m=24, n=17, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 278.266 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=32, tile_m=3, tile_n=2, w=14, v=22, threads=128, grouping=16, minblocks=8) , # 331.192 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 85.0733 GFlop/s
  Kernel_dnt_medium(m=8, n=6, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 32.5636 GFlop/s
  Kernel_dnt_largeDB(m=6, n=23, k=26, tile_m=2, tile_n=1, w=10, v=16, threads=128, grouping=16, minblocks=12) , # 149.513 GFlop/s
  Kernel_dnt_largeDB(m=23, n=17, k=22, tile_m=2, tile_n=2, w=10, v=10, threads=128, grouping=16, minblocks=12) , # 265.383 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=6, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 158.152 GFlop/s
  Kernel_dnt_small(m=16, n=16, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=8) , # 180.677 GFlop/s
  Kernel_dnt_largeDB(m=22, n=26, k=23, tile_m=2, tile_n=2, w=6, v=26, threads=160, grouping=16, minblocks=8) , # 307.774 GFlop/s
  Kernel_dnt_largeDB(m=22, n=13, k=32, tile_m=2, tile_n=3, w=12, v=10, threads=96, grouping=16, minblocks=12) , # 250.128 GFlop/s
  Kernel_dnt_largeDB(m=9, n=13, k=32, tile_m=1, tile_n=1, w=16, v=6, threads=128, grouping=16, minblocks=12) , # 167.552 GFlop/s
  Kernel_dnt_largeDB(m=9, n=26, k=22, tile_m=1, tile_n=2, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 201.724 GFlop/s
  Kernel_dnt_medium(m=6, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 85.2118 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 88.5829 GFlop/s
  Kernel_dnt_tiny(m=8, n=4, k=9, split_thread=32, threads=128, grouping=16, minblocks=1) , # 48.5061 GFlop/s
  Kernel_dnt_largeDB(m=6, n=8, k=32, tile_m=1, tile_n=1, w=16, v=8, threads=96, grouping=16, minblocks=8) , # 110.732 GFlop/s
  Kernel_dnt_largeDB(m=17, n=16, k=24, tile_m=3, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 245.438 GFlop/s
  Kernel_dnt_small(m=5, n=6, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 58.1624 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 68.454 GFlop/s
  Kernel_dnt_medium(m=4, n=17, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 99.368 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=1) , # 210.09 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=26, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 200.476 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=8, tile_m=2, tile_n=3, w=4, v=14, threads=96, grouping=16, minblocks=12) , # 231.895 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=13, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=12) , # 146.013 GFlop/s
  Kernel_dnt_medium(m=26, n=22, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 149.972 GFlop/s
  Kernel_dnt_medium(m=23, n=26, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 286.414 GFlop/s
  Kernel_dnt_medium(m=24, n=16, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 177.041 GFlop/s
  Kernel_dnt_largeDB(m=13, n=32, k=8, tile_m=2, tile_n=2, w=4, v=32, threads=128, grouping=16, minblocks=12) , # 200.207 GFlop/s
  Kernel_dnt_largeDB(m=55, n=29, k=55, tile_m=3, tile_n=5, w=6, v=20, threads=128, grouping=16, minblocks=4) , # 466.603 GFlop/s
  Kernel_dnt_small(m=22, n=6, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 69.8662 GFlop/s
  Kernel_dnt_largeDB(m=16, n=55, k=55, tile_m=2, tile_n=4, w=8, v=24, threads=128, grouping=16, minblocks=8) , # 430.218 GFlop/s
  Kernel_dnt_medium(m=9, n=22, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 170.487 GFlop/s
  Kernel_dnt_largeDB(m=26, n=23, k=17, tile_m=2, tile_n=3, w=6, v=12, threads=160, grouping=16, minblocks=8) , # 285.445 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 121.148 GFlop/s
  Kernel_dnt_largeDB(m=23, n=22, k=23, tile_m=2, tile_n=3, w=10, v=12, threads=96, grouping=16, minblocks=12) , # 314.138 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=5, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 165.072 GFlop/s
  Kernel_dnt_medium(m=22, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 145.653 GFlop/s
  Kernel_dnt_medium(m=24, n=8, k=8, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 152.162 GFlop/s
  Kernel_dnt_medium(m=4, n=24, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 115.396 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=32, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 118.358 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=22, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 165.859 GFlop/s
  Kernel_dnt_medium(m=8, n=4, k=16, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 64.3498 GFlop/s
  Kernel_dnt_largeDB(m=32, n=8, k=23, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 223.88 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=17, tile_m=2, tile_n=1, w=8, v=10, threads=96, grouping=16, minblocks=1) , # 187.951 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 102.173 GFlop/s
  Kernel_dnt_largeDB(m=5, n=17, k=32, tile_m=1, tile_n=1, w=14, v=6, threads=128, grouping=16, minblocks=12) , # 135.01 GFlop/s
  Kernel_dnt_largeDB(m=24, n=22, k=9, tile_m=3, tile_n=2, w=4, v=12, threads=96, grouping=16, minblocks=12) , # 233.468 GFlop/s
  Kernel_dnt_largeDB(m=16, n=24, k=16, tile_m=2, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 283.218 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 71.163 GFlop/s
  Kernel_dnt_largeDB(m=4, n=23, k=32, tile_m=1, tile_n=1, w=8, v=20, threads=96, grouping=16, minblocks=4) , # 127.587 GFlop/s
  Kernel_dnt_largeDB(m=13, n=26, k=26, tile_m=2, tile_n=2, w=10, v=20, threads=96, grouping=16, minblocks=12) , # 260.673 GFlop/s
  Kernel_dnt_tiny(m=6, n=4, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 34.4029 GFlop/s
  Kernel_dnt_largeDB(m=5, n=9, k=16, tile_m=1, tile_n=1, w=8, v=8, threads=96, grouping=16, minblocks=1) , # 71.2271 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 77.1216 GFlop/s
  Kernel_dnt_largeDB(m=17, n=22, k=23, tile_m=3, tile_n=2, w=8, v=22, threads=96, grouping=16, minblocks=12) , # 245.432 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=23, tile_m=2, tile_n=2, w=6, v=26, threads=160, grouping=16, minblocks=8) , # 332.247 GFlop/s
  Kernel_dnt_largeDB(m=5, n=16, k=26, tile_m=1, tile_n=1, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 128.942 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=9, split_thread=32, threads=128, grouping=16, minblocks=1) , # 36.1752 GFlop/s
  Kernel_dnt_small(m=24, n=4, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 67.0164 GFlop/s
  Kernel_dnt_small(m=9, n=9, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 60.3803 GFlop/s
  Kernel_dnt_largeDB(m=13, n=22, k=24, tile_m=3, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 239.227 GFlop/s
  Kernel_dnt_largeDB(m=13, n=23, k=32, tile_m=3, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 256.302 GFlop/s
  Kernel_dnt_largeDB(m=9, n=23, k=22, tile_m=2, tile_n=2, w=10, v=14, threads=128, grouping=16, minblocks=12) , # 194.385 GFlop/s
  Kernel_dnt_medium(m=29, n=14, k=14, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 242.078 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 115.643 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=26, tile_m=2, tile_n=4, w=8, v=20, threads=128, grouping=16, minblocks=8) , # 453.859 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=22, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 198.116 GFlop/s
  Kernel_dnt_medium(m=17, n=4, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 44.4021 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=16, tile_m=2, tile_n=1, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 188.608 GFlop/s
  Kernel_dnt_largeDB(m=13, n=22, k=16, tile_m=2, tile_n=2, w=8, v=22, threads=96, grouping=16, minblocks=12) , # 206.302 GFlop/s
  Kernel_dnt_largeDB(m=6, n=32, k=16, tile_m=2, tile_n=1, w=8, v=32, threads=128, grouping=16, minblocks=12) , # 170.847 GFlop/s
  Kernel_dnt_medium(m=24, n=22, k=16, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 308.103 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 102.115 GFlop/s
  Kernel_dnt_largeDB(m=9, n=24, k=26, tile_m=1, tile_n=2, w=10, v=24, threads=128, grouping=16, minblocks=12) , # 210.582 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=8, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 188.223 GFlop/s
  Kernel_dnt_largeDB(m=16, n=23, k=26, tile_m=2, tile_n=2, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 291.977 GFlop/s
  Kernel_dnt_medium(m=26, n=9, k=13, tile_m=4, tile_n=1, threads=96, grouping=16, minblocks=12) , # 189.529 GFlop/s
  Kernel_dnt_largeDB(m=26, n=8, k=23, tile_m=2, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 194.119 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 182.062 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=22, tile_m=2, tile_n=1, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 190.782 GFlop/s
  Kernel_dnt_largeDB(m=17, n=22, k=8, tile_m=2, tile_n=2, w=4, v=22, threads=128, grouping=16, minblocks=12) , # 182.307 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=16, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 253.516 GFlop/s
  Kernel_dnt_medium(m=5, n=5, k=24, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 68.8446 GFlop/s
  Kernel_dnt_largeDB(m=17, n=26, k=23, tile_m=3, tile_n=2, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 285.309 GFlop/s
  Kernel_dnt_tiny(m=4, n=6, k=16, split_thread=32, threads=128, grouping=16, minblocks=1) , # 62.3605 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 29.5849 GFlop/s
  Kernel_dnt_largeDB(m=13, n=23, k=23, tile_m=2, tile_n=3, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 233.26 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 100.035 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 72.8888 GFlop/s
  Kernel_dnt_medium(m=7, n=49, k=7, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 147.937 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=24, tile_m=4, tile_n=2, w=12, v=14, threads=128, grouping=16, minblocks=8) , # 386.132 GFlop/s
  Kernel_dnt_largeDB(m=13, n=8, k=16, tile_m=1, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=8) , # 138.215 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 109.971 GFlop/s
  Kernel_dnt_largeDB(m=32, n=22, k=22, tile_m=3, tile_n=2, w=10, v=20, threads=128, grouping=16, minblocks=8) , # 351.551 GFlop/s
  Kernel_dnt_medium(m=4, n=32, k=13, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 124.107 GFlop/s
  Kernel_dnt_medium(m=17, n=9, k=26, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 168.829 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=9, tile_m=1, tile_n=2, threads=96, grouping=16, minblocks=1) , # 126.842 GFlop/s
  Kernel_dnt_largeDB(m=78, n=78, k=78, tile_m=5, tile_n=5, w=8, v=26, threads=256, grouping=16, minblocks=1) , # 589.26 GFlop/s
  Kernel_dnt_medium(m=64, n=8, k=8, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 213.145 GFlop/s
  Kernel_dnt_medium(m=16, n=16, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 240.903 GFlop/s
  Kernel_dnt_largeDB(m=24, n=9, k=16, tile_m=2, tile_n=2, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 192.434 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=22, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 110.275 GFlop/s
  Kernel_dnt_medium(m=13, n=16, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 114.075 GFlop/s
  Kernel_dnt_medium(m=8, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 80.7721 GFlop/s
  Kernel_dnt_largeDB(m=23, n=17, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 264.804 GFlop/s
  Kernel_dnt_medium(m=14, n=16, k=14, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 213.982 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 128.35 GFlop/s
  Kernel_dnt_largeDB(m=23, n=24, k=16, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 312.142 GFlop/s
  Kernel_dnt_medium(m=17, n=16, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 207.944 GFlop/s
  Kernel_dnt_largeDB(m=23, n=13, k=26, tile_m=3, tile_n=2, w=12, v=12, threads=96, grouping=16, minblocks=12) , # 239.493 GFlop/s
  Kernel_dnt_medium(m=24, n=13, k=6, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=12) , # 160.137 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=17, split_thread=32, threads=128, grouping=16, minblocks=1) , # 51.922 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=23, split_thread=32, threads=128, grouping=16, minblocks=1) , # 60.6683 GFlop/s
  Kernel_dnt_small(m=13, n=13, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 114.055 GFlop/s
  Kernel_dnt_largeDB(m=13, n=24, k=22, tile_m=2, tile_n=3, w=6, v=12, threads=96, grouping=16, minblocks=12) , # 243.549 GFlop/s
  Kernel_dnt_largeDB(m=32, n=6, k=24, tile_m=2, tile_n=1, w=8, v=6, threads=128, grouping=16, minblocks=12) , # 184.834 GFlop/s
  Kernel_dnt_tiny(m=5, n=8, k=8, split_thread=32, threads=96, grouping=16, minblocks=1) , # 47.0366 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=17, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 260.436 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=22, tile_m=2, tile_n=4, w=8, v=22, threads=128, grouping=16, minblocks=8) , # 363.449 GFlop/s
  Kernel_dnt_largeDB(m=9, n=22, k=26, tile_m=2, tile_n=2, w=10, v=22, threads=96, grouping=16, minblocks=12) , # 194.953 GFlop/s
  Kernel_dnt_small(m=17, n=6, k=6, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 76.6485 GFlop/s
  Kernel_dnt_small(m=6, n=23, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 70.9369 GFlop/s
  Kernel_dnt_medium(m=17, n=8, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 117.865 GFlop/s
  Kernel_dnt_medium(m=6, n=24, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 85.3145 GFlop/s
  Kernel_dnt_medium(m=22, n=26, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 174.989 GFlop/s
  Kernel_dnt_medium(m=24, n=9, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 164.779 GFlop/s
  Kernel_dnt_medium(m=13, n=6, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 127.181 GFlop/s
  Kernel_dnt_medium(m=13, n=17, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 135.183 GFlop/s
  Kernel_dnt_largeDB(m=9, n=26, k=24, tile_m=1, tile_n=2, w=8, v=14, threads=128, grouping=16, minblocks=12) , # 215.418 GFlop/s
  Kernel_dnt_largeDB(m=26, n=17, k=8, tile_m=2, tile_n=2, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 203.653 GFlop/s
  Kernel_dnt_largeDB(m=8, n=23, k=23, tile_m=2, tile_n=1, w=8, v=12, threads=96, grouping=16, minblocks=8) , # 189.238 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=24, tile_m=2, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=4) , # 207.228 GFlop/s
  Kernel_dnt_medium(m=23, n=4, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 86.3208 GFlop/s
  Kernel_dnt_medium(m=23, n=5, k=16, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 138.278 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=24, tile_m=3, tile_n=2, w=8, v=12, threads=96, grouping=16, minblocks=12) , # 347.719 GFlop/s
  Kernel_dnt_largeDB(m=5, n=26, k=16, tile_m=1, tile_n=2, w=8, v=24, threads=128, grouping=16, minblocks=1) , # 124.739 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 118.855 GFlop/s
  Kernel_dnt_largeDB(m=32, n=17, k=26, tile_m=2, tile_n=3, w=6, v=6, threads=96, grouping=16, minblocks=12) , # 323.24 GFlop/s
  Kernel_dnt_small(m=6, n=9, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 58.965 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 139.461 GFlop/s
  Kernel_dnt_medium(m=6, n=36, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 119.79 GFlop/s
  Kernel_dnt_medium(m=16, n=22, k=8, tile_m=2, tile_n=3, threads=96, grouping=16, minblocks=12) , # 205.457 GFlop/s
  Kernel_dnt_medium(m=8, n=32, k=9, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 183.653 GFlop/s
  Kernel_dnt_medium(m=26, n=24, k=4, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 166.334 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=17, tile_m=2, tile_n=4, w=8, v=10, threads=96, grouping=16, minblocks=12) , # 364.783 GFlop/s
  Kernel_dnt_medium(m=6, n=13, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 124.698 GFlop/s
  Kernel_dnt_largeDB(m=23, n=4, k=32, tile_m=1, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 128.586 GFlop/s
  Kernel_dnt_medium(m=16, n=4, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 76.3591 GFlop/s
  Kernel_dnt_medium(m=9, n=32, k=5, tile_m=3, tile_n=1, threads=96, grouping=16, minblocks=12) , # 130.153 GFlop/s
  Kernel_dnt_medium(m=23, n=9, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 184.356 GFlop/s
  Kernel_dnt_tiny(m=22, n=5, k=4, split_thread=32, threads=128, grouping=16, minblocks=1) , # 64.238 GFlop/s
  Kernel_dnt_medium(m=24, n=8, k=6, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 123.269 GFlop/s
  Kernel_dnt_small(m=9, n=13, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 77.1093 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 94.256 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 285.713 GFlop/s
  Kernel_dnt_largeDB(m=26, n=13, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 244.889 GFlop/s
  Kernel_dnt_medium(m=17, n=17, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 227.787 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 34.7964 GFlop/s
  Kernel_dnt_medium(m=22, n=13, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 183.073 GFlop/s
  Kernel_dnt_largeDB(m=9, n=4, k=16, tile_m=1, tile_n=1, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 60.1671 GFlop/s
  Kernel_dnt_small(m=9, n=5, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 49.3598 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=22, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=1) , # 118.425 GFlop/s
  Kernel_dnt_largeDB(m=4, n=26, k=26, tile_m=1, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=12) , # 122.071 GFlop/s
  Kernel_dnt_largeDB(m=29, n=16, k=29, tile_m=2, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=8) , # 310.193 GFlop/s
  Kernel_dnt_medium(m=32, n=9, k=17, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 215.226 GFlop/s
  Kernel_dnt_largeDB(m=23, n=26, k=32, tile_m=2, tile_n=3, w=14, v=26, threads=128, grouping=16, minblocks=8) , # 340.048 GFlop/s
  Kernel_dnt_largeDB(m=24, n=32, k=23, tile_m=3, tile_n=2, w=8, v=16, threads=128, grouping=16, minblocks=8) , # 388.857 GFlop/s
  Kernel_dnt_largeDB(m=17, n=26, k=32, tile_m=2, tile_n=2, w=16, v=18, threads=160, grouping=16, minblocks=8) , # 317.652 GFlop/s
  Kernel_dnt_small(m=5, n=16, k=8, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 79.5016 GFlop/s
  Kernel_dnt_largeDB(m=8, n=32, k=32, tile_m=2, tile_n=1, w=8, v=16, threads=128, grouping=16, minblocks=12) , # 238.795 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=17, tile_m=2, tile_n=2, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 322.698 GFlop/s
  Kernel_dnt_small(m=8, n=26, k=4, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 98.6441 GFlop/s
  Kernel_dnt_largeDB(m=32, n=24, k=8, tile_m=2, tile_n=4, w=4, v=14, threads=96, grouping=16, minblocks=12) , # 279.775 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 67.1608 GFlop/s
  Kernel_dnt_largeDB(m=23, n=32, k=32, tile_m=3, tile_n=2, w=12, v=16, threads=128, grouping=16, minblocks=8) , # 398.604 GFlop/s
  Kernel_dnt_largeDB(m=6, n=16, k=22, tile_m=1, tile_n=1, w=10, v=16, threads=96, grouping=16, minblocks=8) , # 140.994 GFlop/s
  Kernel_dnt_medium(m=14, n=14, k=29, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=4) , # 218.838 GFlop/s
  Kernel_dnt_largeDB(m=16, n=4, k=32, tile_m=1, tile_n=1, w=16, v=4, threads=128, grouping=16, minblocks=12) , # 119.202 GFlop/s
  Kernel_dnt_medium(m=17, n=32, k=13, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 261.656 GFlop/s
  Kernel_dnt_largeDB(m=26, n=32, k=32, tile_m=4, tile_n=2, w=8, v=28, threads=128, grouping=16, minblocks=8) , # 398.695 GFlop/s
  Kernel_dnt_largeDB(m=16, n=29, k=14, tile_m=2, tile_n=2, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 264.624 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=8, tile_m=3, tile_n=3, w=4, v=16, threads=128, grouping=16, minblocks=1) , # 288.605 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=4, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 89.3747 GFlop/s
  Kernel_dnt_medium(m=32, n=22, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 280.289 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=9, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 206.587 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=4, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=12) , # 133.402 GFlop/s
  Kernel_dnt_tiny(m=16, n=6, k=6, split_thread=32, threads=128, grouping=16, minblocks=1) , # 81.9641 GFlop/s
  Kernel_dnt_medium(m=9, n=6, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 63.1913 GFlop/s
  Kernel_dnt_largeDB(m=13, n=16, k=26, tile_m=2, tile_n=2, w=12, v=16, threads=96, grouping=16, minblocks=12) , # 210.017 GFlop/s
  Kernel_dnt_medium(m=9, n=24, k=8, tile_m=1, tile_n=3, threads=128, grouping=16, minblocks=12) , # 141.521 GFlop/s
  Kernel_dnt_tiny(m=6, n=6, k=5, split_thread=32, threads=64, grouping=16, minblocks=1) , # 34.7398 GFlop/s
  Kernel_dnt_largeDB(m=8, n=26, k=24, tile_m=2, tile_n=1, w=8, v=26, threads=128, grouping=16, minblocks=12) , # 201.264 GFlop/s
  Kernel_dnt_medium(m=8, n=16, k=9, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 120.932 GFlop/s
  Kernel_dnt_medium(m=17, n=23, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 153.311 GFlop/s
  Kernel_dnt_medium(m=22, n=32, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 317.355 GFlop/s
  Kernel_dnt_medium(m=17, n=26, k=9, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 205.14 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=17, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 219.365 GFlop/s
  Kernel_dnt_medium(m=6, n=5, k=23, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 71.6165 GFlop/s
  Kernel_dnt_tiny(m=4, n=5, k=26, split_thread=32, threads=128, grouping=16, minblocks=1) , # 46.1546 GFlop/s
  Kernel_dnt_largeDB(m=64, n=9, k=64, tile_m=2, tile_n=3, w=10, v=8, threads=128, grouping=16, minblocks=8) , # 319.492 GFlop/s
  Kernel_dnt_medium(m=13, n=23, k=5, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 136.973 GFlop/s
  Kernel_dnt_medium(m=9, n=16, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=12) , # 90.0694 GFlop/s
  Kernel_dnt_medium(m=8, n=8, k=9, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 96.8478 GFlop/s
  Kernel_dnt_medium(m=26, n=23, k=5, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 179.309 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=32, split_thread=32, threads=128, grouping=16, minblocks=1) , # 46.6154 GFlop/s
  Kernel_dnt_largeDB(m=17, n=32, k=26, tile_m=3, tile_n=2, w=8, v=28, threads=96, grouping=16, minblocks=12) , # 337.328 GFlop/s
  Kernel_dnt_largeDB(m=5, n=23, k=26, tile_m=1, tile_n=1, w=10, v=22, threads=128, grouping=16, minblocks=12) , # 136.154 GFlop/s
  Kernel_dnt_small(m=24, n=6, k=4, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 76.9814 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=26, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=4) , # 169.329 GFlop/s
  Kernel_dnt_medium(m=4, n=9, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 69.4121 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 122.006 GFlop/s
  Kernel_dnt_medium(m=26, n=6, k=23, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 166.214 GFlop/s
  Kernel_dnt_medium(m=23, n=22, k=5, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 165.527 GFlop/s
  Kernel_dnt_medium(m=5, n=26, k=5, tile_m=1, tile_n=1, threads=160, grouping=16, minblocks=12) , # 71.6372 GFlop/s
  Kernel_dnt_medium(m=22, n=17, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 224.16 GFlop/s
  Kernel_dnt_medium(m=13, n=5, k=24, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 114.338 GFlop/s
  Kernel_dnt_medium(m=9, n=26, k=17, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 191.617 GFlop/s
  Kernel_dnt_tiny(m=8, n=4, k=6, split_thread=32, threads=96, grouping=16, minblocks=1) , # 34.2353 GFlop/s
  Kernel_dnt_small(m=32, n=8, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=1) , # 129.29 GFlop/s
  Kernel_dnt_medium(m=17, n=13, k=4, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 100.145 GFlop/s
  Kernel_dnt_medium(m=26, n=4, k=9, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=12) , # 93.2779 GFlop/s
  Kernel_dnt_largeDB(m=8, n=16, k=32, tile_m=1, tile_n=1, w=16, v=16, threads=128, grouping=16, minblocks=4) , # 193.463 GFlop/s
  Kernel_dnt_largeDB(m=24, n=24, k=23, tile_m=2, tile_n=3, w=8, v=16, threads=96, grouping=16, minblocks=12) , # 369.879 GFlop/s
  Kernel_dnt_tiny(m=4, n=4, k=5, split_thread=32, threads=64, grouping=16, minblocks=1) , # 19.7765 GFlop/s
  Kernel_dnt_medium(m=13, n=13, k=23, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=8) , # 192.756 GFlop/s
  Kernel_dnt_medium(m=13, n=24, k=4, tile_m=3, tile_n=1, threads=128, grouping=16, minblocks=12) , # 126.462 GFlop/s
  Kernel_dnt_medium(m=9, n=17, k=6, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 98.6733 GFlop/s
  Kernel_dnt_largeDB(m=26, n=22, k=26, tile_m=2, tile_n=3, w=4, v=14, threads=128, grouping=16, minblocks=12) , # 316.347 GFlop/s
  Kernel_dnt_medium(m=5, n=8, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 85.8051 GFlop/s
  Kernel_dnt_medium(m=8, n=17, k=22, tile_m=1, tile_n=1, threads=224, grouping=16, minblocks=8) , # 150.659 GFlop/s
  Kernel_dnt_tiny(m=5, n=4, k=9, split_thread=32, threads=96, grouping=16, minblocks=1) , # 26.1494 GFlop/s
  Kernel_dnt_largeDB(m=17, n=24, k=16, tile_m=2, tile_n=2, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 260.019 GFlop/s
  Kernel_dnt_medium(m=9, n=23, k=5, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 111.899 GFlop/s
  Kernel_dnt_medium(m=17, n=6, k=16, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 129.192 GFlop/s
  Kernel_dnt_medium(m=6, n=22, k=26, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 151.978 GFlop/s
  Kernel_dnt_medium(m=23, n=23, k=6, tile_m=3, tile_n=2, threads=160, grouping=16, minblocks=8) , # 200.307 GFlop/s
  Kernel_dnt_largeDB(m=16, n=5, k=24, tile_m=1, tile_n=1, w=12, v=4, threads=96, grouping=16, minblocks=12) , # 129.534 GFlop/s
  Kernel_dnt_largeDB(m=9, n=64, k=16, tile_m=3, tile_n=2, w=6, v=48, threads=96, grouping=16, minblocks=12) , # 262.958 GFlop/s
  Kernel_dnt_largeDB(m=22, n=17, k=26, tile_m=3, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 256.136 GFlop/s
  Kernel_dnt_small(m=5, n=22, k=5, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=8) , # 71.6059 GFlop/s
  Kernel_dnt_medium(m=24, n=8, k=13, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=12) , # 185.221 GFlop/s
  Kernel_dnt_medium(m=26, n=17, k=6, tile_m=2, tile_n=2, threads=160, grouping=16, minblocks=8) , # 174.981 GFlop/s
  Kernel_dnt_largeDB(m=8, n=24, k=22, tile_m=2, tile_n=1, w=8, v=24, threads=96, grouping=16, minblocks=12) , # 199.801 GFlop/s
  Kernel_dnt_largeDB(m=26, n=6, k=32, tile_m=1, tile_n=1, w=12, v=6, threads=160, grouping=16, minblocks=12) , # 174.271 GFlop/s
  Kernel_dnt_small(m=17, n=17, k=4, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=4) , # 123.512 GFlop/s
  Kernel_dnt_medium(m=32, n=6, k=13, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=4) , # 172.995 GFlop/s
  Kernel_dnt_medium(m=64, n=9, k=9, tile_m=2, tile_n=3, threads=128, grouping=16, minblocks=8) , # 240.201 GFlop/s
  Kernel_dnt_largeDB(m=24, n=23, k=22, tile_m=2, tile_n=3, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 335.112 GFlop/s
  Kernel_dnt_medium(m=22, n=24, k=13, tile_m=3, tile_n=2, threads=128, grouping=16, minblocks=8) , # 283.717 GFlop/s
  Kernel_dnt_medium(m=4, n=16, k=5, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 53.0611 GFlop/s
  Kernel_dnt_small(m=6, n=4, k=13, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=1) , # 40.7212 GFlop/s
  Kernel_dnt_largeDB(m=32, n=9, k=26, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 237.643 GFlop/s
  Kernel_dnt_largeDB(m=17, n=22, k=26, tile_m=3, tile_n=2, w=12, v=22, threads=96, grouping=16, minblocks=12) , # 257.52 GFlop/s
  Kernel_dnt_small(m=5, n=4, k=22, tile_m=1, tile_n=1, threads=64, grouping=16, minblocks=1) , # 37.0424 GFlop/s
  Kernel_dnt_largeDB(m=23, n=6, k=24, tile_m=1, tile_n=2, w=8, v=4, threads=96, grouping=16, minblocks=8) , # 154.711 GFlop/s
  Kernel_dnt_largeDB(m=24, n=26, k=16, tile_m=3, tile_n=2, w=8, v=26, threads=128, grouping=16, minblocks=8) , # 296.427 GFlop/s
  Kernel_dnt_medium(m=23, n=17, k=6, tile_m=3, tile_n=2, threads=96, grouping=16, minblocks=12) , # 160.956 GFlop/s
  Kernel_dnt_medium(m=5, n=16, k=17, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=8) , # 121.579 GFlop/s
  Kernel_dnt_largeDB(m=16, n=32, k=24, tile_m=2, tile_n=2, w=8, v=24, threads=128, grouping=16, minblocks=12) , # 358.185 GFlop/s
  Kernel_dnt_largeDB(m=26, n=24, k=22, tile_m=2, tile_n=3, w=4, v=16, threads=128, grouping=16, minblocks=12) , # 333.184 GFlop/s
  Kernel_dnt_medium(m=32, n=13, k=8, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=12) , # 218.093 GFlop/s
  Kernel_dnt_largeDB(m=32, n=32, k=6, tile_m=4, tile_n=2, w=2, v=20, threads=128, grouping=16, minblocks=8) , # 233.277 GFlop/s
  Kernel_dnt_largeDB(m=17, n=9, k=32, tile_m=2, tile_n=3, w=16, v=6, threads=96, grouping=16, minblocks=12) , # 184.866 GFlop/s
  Kernel_dnt_largeDB(m=23, n=23, k=17, tile_m=3, tile_n=2, w=4, v=16, threads=96, grouping=16, minblocks=12) , # 292.211 GFlop/s
  Kernel_dnt_largeDB(m=16, n=9, k=16, tile_m=2, tile_n=1, w=8, v=6, threads=96, grouping=16, minblocks=12) , # 161.845 GFlop/s
  Kernel_dnt_largeDB(m=16, n=8, k=24, tile_m=2, tile_n=1, w=12, v=8, threads=96, grouping=16, minblocks=4) , # 185.133 GFlop/s
  Kernel_dnt_largeDB(m=16, n=23, k=32, tile_m=2, tile_n=3, w=12, v=6, threads=96, grouping=16, minblocks=12) , # 313.606 GFlop/s
  Kernel_dnt_medium(m=9, n=13, k=23, tile_m=1, tile_n=1, threads=128, grouping=16, minblocks=1) , # 159.911 GFlop/s
  Kernel_dnt_medium(m=22, n=9, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 176.971 GFlop/s
  Kernel_dnt_largeDB(m=32, n=23, k=24, tile_m=2, tile_n=4, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 395.474 GFlop/s
  Kernel_dnt_medium(m=8, n=13, k=5, tile_m=2, tile_n=1, threads=96, grouping=16, minblocks=8) , # 75.8181 GFlop/s
  Kernel_dnt_medium(m=26, n=16, k=5, tile_m=5, tile_n=1, threads=96, grouping=16, minblocks=12) , # 155.57 GFlop/s
  Kernel_dnt_medium(m=26, n=13, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 164.444 GFlop/s
  Kernel_dnt_medium(m=23, n=16, k=13, tile_m=2, tile_n=2, threads=128, grouping=16, minblocks=8) , # 234.514 GFlop/s
  Kernel_dnt_medium(m=4, n=13, k=22, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 106.482 GFlop/s
  Kernel_dnt_medium(m=16, n=17, k=13, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 215.752 GFlop/s
  Kernel_dnt_largeDB(m=13, n=4, k=16, tile_m=1, tile_n=1, w=8, v=4, threads=128, grouping=16, minblocks=12) , # 82.3205 GFlop/s
  Kernel_dnt_largeDB(m=16, n=64, k=22, tile_m=2, tile_n=4, w=8, v=40, threads=128, grouping=16, minblocks=8) , # 409.267 GFlop/s
  Kernel_dnt_medium(m=9, n=5, k=26, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 93.5918 GFlop/s
  Kernel_dnt_medium(m=4, n=23, k=4, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 59.7783 GFlop/s
  Kernel_dnt_largeDB(m=22, n=16, k=23, tile_m=2, tile_n=2, w=10, v=16, threads=96, grouping=16, minblocks=12) , # 279.173 GFlop/s
  Kernel_dnt_medium(m=13, n=26, k=6, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 165.607 GFlop/s
  Kernel_dnt_largeDB(m=26, n=8, k=24, tile_m=2, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 203.357 GFlop/s
  Kernel_dnt_largeDB(m=29, n=29, k=14, tile_m=2, tile_n=4, w=4, v=16, threads=128, grouping=16, minblocks=8) , # 309.036 GFlop/s
  Kernel_dnt_largeDB(m=22, n=32, k=4, tile_m=3, tile_n=2, w=2, v=16, threads=128, grouping=16, minblocks=12) , # 173.713 GFlop/s
  Kernel_dnt_small(m=5, n=16, k=6, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 68.1547 GFlop/s
  Kernel_dnt_medium(m=23, n=13, k=8, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 173.078 GFlop/s
  Kernel_dnt_largeDB(m=24, n=13, k=16, tile_m=2, tile_n=2, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 243.961 GFlop/s
  Kernel_dnt_medium(m=9, n=9, k=24, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 147.669 GFlop/s
  Kernel_dnt_medium(m=22, n=16, k=4, tile_m=2, tile_n=2, threads=96, grouping=16, minblocks=12) , # 141.362 GFlop/s
  Kernel_dnt_medium(m=8, n=9, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=4) , # 84.8613 GFlop/s
  Kernel_dnt_largeDB(m=6, n=16, k=24, tile_m=1, tile_n=1, w=12, v=16, threads=96, grouping=16, minblocks=1) , # 151.414 GFlop/s
  Kernel_dnt_largeDB(m=26, n=16, k=32, tile_m=2, tile_n=2, w=16, v=16, threads=128, grouping=16, minblocks=1) , # 312.21 GFlop/s
  Kernel_dnt_medium(m=6, n=26, k=9, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 124.78 GFlop/s
  Kernel_dnt_medium(m=5, n=23, k=17, tile_m=1, tile_n=2, threads=128, grouping=16, minblocks=4) , # 125.344 GFlop/s
  Kernel_dnt_largeDB(m=13, n=8, k=23, tile_m=1, tile_n=1, w=8, v=8, threads=128, grouping=16, minblocks=12) , # 147.91 GFlop/s
  Kernel_dnt_medium(m=13, n=9, k=13, tile_m=2, tile_n=1, threads=128, grouping=16, minblocks=12) , # 130.505 GFlop/s
  Kernel_dnt_largeDB(m=23, n=8, k=23, tile_m=1, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=12) , # 192.208 GFlop/s
  Kernel_dnt_largeDB(m=24, n=8, k=23, tile_m=1, tile_n=2, w=8, v=8, threads=96, grouping=16, minblocks=1) , # 200.887 GFlop/s
  Kernel_dnt_largeDB(m=32, n=26, k=8, tile_m=3, tile_n=3, w=4, v=22, threads=128, grouping=16, minblocks=8) , # 246.38 GFlop/s
  Kernel_dnt_largeDB(m=96, n=96, k=96, tile_m=6, tile_n=3, w=14, v=48, threads=512, grouping=16, minblocks=1) , # 614.588 GFlop/s
  Kernel_dnt_medium(m=17, n=5, k=8, tile_m=1, tile_n=1, threads=96, grouping=16, minblocks=12) , # 78.5404 GFlop/s
  Kernel_dnt_medium(m=6, n=23, k=22, tile_m=2, tile_n=1, threads=160, grouping=16, minblocks=8) , # 145.676 GFlop/s
  Kernel_dnt_medium(m=6, n=24, k=23, tile_m=1, tile_n=2, threads=160, grouping=16, minblocks=8) , # 156.783 GFlop/s
  Kernel_dnt_largeDB(m=16, n=6, k=16, tile_m=1, tile_n=1, w=8, v=6, threads=128, grouping=16, minblocks=4) , # 134.069 GFlop/s
]

#EOF


More information about the CP2K-user mailing list