clusterperf November 24, 2018

basic+infud
echo100                3.9 us     send 30B message, receive 100B message median
echo100.min            3.8 us     send 30B message, receive 100B message minimum
echo100.9              5.0 us     send 30B message, receive 100B message 90%
echo100.99             5.6 us     send 30B message, receive 100B message 99%
echo100.999            9.4 us     send 30B message, receive 100B message 99.9%
echoBw100             22.6 MB/s   bandwidth receiving 100B messages
echo1K                 5.1 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               5.3 us     send 30B message, receive 1KB message 90%
echo1K.99              6.5 us     send 30B message, receive 1KB message 99%
echo1K.999             9.6 us     send 30B message, receive 1KB message 99.9%
echoBw1K             183.8 MB/s   bandwidth receiving 1KB messages
echo10K                9.4 us     send 30B message, receive 10KB message median
echo10K.min            9.3 us     send 30B message, receive 10KB message minimum
echo10K.9             10.2 us     send 30B message, receive 10KB message 90%
echo10K.99            11.0 us     send 30B message, receive 10KB message 99%
echo10K.999           14.3 us     send 30B message, receive 10KB message 99.9%
echoBw10K            997.6 MB/s   bandwidth receiving 10KB messages
echo100K              38.9 us     send 30B message, receive 100KB message median
echo100K.min          37.9 us     send 30B message, receive 100KB message minimum
echo100K.9            39.7 us     send 30B message, receive 100KB message 90%
echo100K.99           41.7 us     send 30B message, receive 100KB message 99%
echo100K.999         144.6 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.418 GB/s   bandwidth receiving 100KB messages
echo1M               338.6 us     send 30B message, receive 1MB message median
echo1M.min           335.8 us     send 30B message, receive 1MB message minimum
echo1M.9             341.4 us     send 30B message, receive 1MB message 90%
echo1M.99            347.7 us     send 30B message, receive 1MB message 99%
echo1M.999           446.1 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.874 GB/s   bandwidth receiving 1MB messages
basic+infud
basic.read100          5.3 us     read random 100B object (30B key) median
basic.read100.min      5.0 us     read random 100B object (30B key) minimum
basic.read100.9        5.8 us     read random 100B object (30B key) 90%
basic.read100.99       7.2 us     read random 100B object (30B key) 99%
basic.read100.999     12.0 us     read random 100B object (30B key) 99.9%
basic.readBw100       17.5 MB/s   bandwidth reading 100B objects (30B key)
basic.read1K           6.4 us     read random 1KB object (30B key) median
basic.read1K.min       6.2 us     read random 1KB object (30B key) minimum
basic.read1K.9         7.0 us     read random 1KB object (30B key) 90%
basic.read1K.99        8.3 us     read random 1KB object (30B key) 99%
basic.read1K.999      13.4 us     read random 1KB object (30B key) 99.9%
basic.readBw1K       148.3 MB/s   bandwidth reading 1KB objects (30B key)
basic.read10K         10.9 us     read random 10KB object (30B key) median
basic.read10K.min     10.5 us     read random 10KB object (30B key) minimum
basic.read10K.9       11.8 us     read random 10KB object (30B key) 90%
basic.read10K.99      13.6 us     read random 10KB object (30B key) 99%
basic.read10K.999     18.5 us     read random 10KB object (30B key) 99.9%
basic.readBw10K      876.3 MB/s   bandwidth reading 10KB objects (30B key)
basic.read100K        41.3 us     read random 100KB object (30B key) median
basic.read100K.min    40.0 us     read random 100KB object (30B key) minimum
basic.read100K.9      42.5 us     read random 100KB object (30B key) 90%
basic.read100K.99     45.8 us     read random 100KB object (30B key) 99%
basic.read100K.999    52.4 us     read random 100KB object (30B key) 99.9%
basic.readBw100K     2.295 GB/s   bandwidth reading 100KB objects (30B key)
basic.read1M         348.4 us     read random 1MB object (30B key) median
basic.read1M.min     344.0 us     read random 1MB object (30B key) minimum
basic.read1M.9       353.4 us     read random 1MB object (30B key) 90%
basic.read1M.99      369.2 us     read random 1MB object (30B key) 99%
basic.read1M.999     460.3 us     read random 1MB object (30B key) 99.9%
basic.readBw1M       2.792 GB/s   bandwidth reading 1MB objects (30B key)
basic.write100        16.2 us     write random 100B object (30B key) median
basic.write100.min    15.4 us     write random 100B object (30B key) minimum
basic.write100.9      17.1 us     write random 100B object (30B key) 90%
basic.write100.99     26.3 us     write random 100B object (30B key) 99%
basic.write100.999    78.1 us     write random 100B object (30B key) 99.9%
basic.writeBw100       5.7 MB/s   bandwidth writing 100B objects (30B key)
basic.write1K         18.8 us     write random 1KB object (30B key) median
basic.write1K.min     17.9 us     write random 1KB object (30B key) minimum
basic.write1K.9       19.8 us     write random 1KB object (30B key) 90%
basic.write1K.99      30.7 us     write random 1KB object (30B key) 99%
basic.write1K.999    114.3 us     write random 1KB object (30B key) 99.9%
basic.writeBw1K       50.1 MB/s   bandwidth writing 1KB objects (30B key)
basic.write10K        39.0 us     write random 10KB object (30B key) median
basic.write10K.min    37.0 us     write random 10KB object (30B key) minimum
basic.write10K.9      41.6 us     write random 10KB object (30B key) 90%
basic.write10K.99    194.6 us     write random 10KB object (30B key) 99%
basic.write10K.999   379.7 us     write random 10KB object (30B key) 99.9%
basic.writeBw10K     212.7 MB/s   bandwidth writing 10KB objects (30B key)
basic.write100K      223.2 us     write random 100KB object (30B key) median
basic.write100K.min  214.2 us     write random 100KB object (30B key) minimum
basic.write100K.9    403.9 us     write random 100KB object (30B key) 90%
basic.write100K.99   574.7 us     write random 100KB object (30B key) 99%
basic.write100K.999    9.7 ms     write random 100KB object (30B key) 99.9%
basic.writeBw100K    312.6 MB/s   bandwidth writing 100KB objects (30B key)
basic.write1M          2.1 ms     write random 1MB object (30B key) median
basic.write1M.min      2.0 ms     write random 1MB object (30B key) minimum
basic.write1M.9       11.9 ms     write random 1MB object (30B key) 90%
basic.write1M.99      21.9 ms     write random 1MB object (30B key) 99%
basic.write1M.999     42.8 ms     write random 1MB object (30B key) 99.9%
basic.writeBw1M      275.5 MB/s   bandwidth writing 1MB objects (30B key)
homa+infud
echo100                3.9 us     send 30B message, receive 100B message median
echo100.min            3.8 us     send 30B message, receive 100B message minimum
echo100.9              5.0 us     send 30B message, receive 100B message 90%
echo100.99             5.6 us     send 30B message, receive 100B message 99%
echo100.999            9.7 us     send 30B message, receive 100B message 99.9%
echoBw100             22.4 MB/s   bandwidth receiving 100B messages
echo1K                 5.1 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               5.2 us     send 30B message, receive 1KB message 90%
echo1K.99              6.5 us     send 30B message, receive 1KB message 99%
echo1K.999             9.6 us     send 30B message, receive 1KB message 99.9%
echoBw1K             183.9 MB/s   bandwidth receiving 1KB messages
echo10K                9.4 us     send 30B message, receive 10KB message median
echo10K.min            9.3 us     send 30B message, receive 10KB message minimum
echo10K.9             10.1 us     send 30B message, receive 10KB message 90%
echo10K.99            10.9 us     send 30B message, receive 10KB message 99%
echo10K.999           14.1 us     send 30B message, receive 10KB message 99.9%
echoBw10K            999.1 MB/s   bandwidth receiving 10KB messages
echo100K              39.5 us     send 30B message, receive 100KB message median
echo100K.min          38.1 us     send 30B message, receive 100KB message minimum
echo100K.9            40.6 us     send 30B message, receive 100KB message 90%
echo100K.99           43.7 us     send 30B message, receive 100KB message 99%
echo100K.999          48.1 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.387 GB/s   bandwidth receiving 100KB messages
echo1M               348.6 us     send 30B message, receive 1MB message median
echo1M.min           336.7 us     send 30B message, receive 1MB message minimum
echo1M.9             354.7 us     send 30B message, receive 1MB message 90%
echo1M.99            364.3 us     send 30B message, receive 1MB message 99%
echo1M.999           463.8 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.790 GB/s   bandwidth receiving 1MB messages
homa+infud
basic.read100          5.2 us     read random 100B object (30B key) median
basic.read100.min      4.7 us     read random 100B object (30B key) minimum
basic.read100.9        5.4 us     read random 100B object (30B key) 90%
basic.read100.99       6.4 us     read random 100B object (30B key) 99%
basic.read100.999     12.0 us     read random 100B object (30B key) 99.9%
basic.readBw100       18.1 MB/s   bandwidth reading 100B objects (30B key)
basic.read1K           6.1 us     read random 1KB object (30B key) median
basic.read1K.min       5.9 us     read random 1KB object (30B key) minimum
basic.read1K.9         6.7 us     read random 1KB object (30B key) 90%
basic.read1K.99        7.3 us     read random 1KB object (30B key) 99%
basic.read1K.999      10.9 us     read random 1KB object (30B key) 99.9%
basic.readBw1K       157.2 MB/s   bandwidth reading 1KB objects (30B key)
basic.read10K         10.5 us     read random 10KB object (30B key) median
basic.read10K.min     10.2 us     read random 10KB object (30B key) minimum
basic.read10K.9       11.3 us     read random 10KB object (30B key) 90%
basic.read10K.99      12.3 us     read random 10KB object (30B key) 99%
basic.read10K.999     98.0 us     read random 10KB object (30B key) 99.9%
basic.readBw10K      902.6 MB/s   bandwidth reading 10KB objects (30B key)
basic.read100K        41.9 us     read random 100KB object (30B key) median
basic.read100K.min    39.8 us     read random 100KB object (30B key) minimum
basic.read100K.9      43.5 us     read random 100KB object (30B key) 90%
basic.read100K.99     46.3 us     read random 100KB object (30B key) 99%
basic.read100K.999    81.4 us     read random 100KB object (30B key) 99.9%
basic.readBw100K     2.262 GB/s   bandwidth reading 100KB objects (30B key)
basic.read1M         358.3 us     read random 1MB object (30B key) median
basic.read1M.min     345.2 us     read random 1MB object (30B key) minimum
basic.read1M.9       365.4 us     read random 1MB object (30B key) 90%
basic.read1M.99      379.9 us     read random 1MB object (30B key) 99%
basic.read1M.999     458.6 us     read random 1MB object (30B key) 99.9%
basic.readBw1M       2.716 GB/s   bandwidth reading 1MB objects (30B key)
basic.write100        14.7 us     write random 100B object (30B key) median
basic.write100.min    13.9 us     write random 100B object (30B key) minimum
basic.write100.9      15.4 us     write random 100B object (30B key) 90%
basic.write100.99     24.0 us     write random 100B object (30B key) 99%
basic.write100.999   115.4 us     write random 100B object (30B key) 99.9%
basic.writeBw100       6.2 MB/s   bandwidth writing 100B objects (30B key)
basic.write1K         16.6 us     write random 1KB object (30B key) median
basic.write1K.min     15.9 us     write random 1KB object (30B key) minimum
basic.write1K.9       17.6 us     write random 1KB object (30B key) 90%
basic.write1K.99      28.3 us     write random 1KB object (30B key) 99%
basic.write1K.999    123.0 us     write random 1KB object (30B key) 99.9%
basic.writeBw1K       55.7 MB/s   bandwidth writing 1KB objects (30B key)
basic.write10K        36.4 us     write random 10KB object (30B key) median
basic.write10K.min    34.6 us     write random 10KB object (30B key) minimum
basic.write10K.9      37.9 us     write random 10KB object (30B key) 90%
basic.write10K.99    183.6 us     write random 10KB object (30B key) 99%
basic.write10K.999   389.8 us     write random 10KB object (30B key) 99.9%
basic.writeBw10K     193.9 MB/s   bandwidth writing 10KB objects (30B key)
basic.write100K      217.7 us     write random 100KB object (30B key) median
basic.write100K.min  207.5 us     write random 100KB object (30B key) minimum
basic.write100K.9    363.0 us     write random 100KB object (30B key) 90%
basic.write100K.99   603.2 us     write random 100KB object (30B key) 99%
basic.write100K.999   45.4 ms     write random 100KB object (30B key) 99.9%
basic.writeBw100K    204.5 MB/s   bandwidth writing 100KB objects (30B key)
basic.write1M          2.1 ms     write random 1MB object (30B key) median
basic.write1M.min      2.1 ms     write random 1MB object (30B key) minimum
basic.write1M.9       19.7 ms     write random 1MB object (30B key) 90%
basic.write1M.99      49.7 ms     write random 1MB object (30B key) 99%
basic.writeBw1M      177.5 MB/s   bandwidth writing 1MB objects (30B key)

infrc
echo100                3.6 us     send 30B message, receive 100B message median
echo100.min            3.5 us     send 30B message, receive 100B message minimum
echo100.9              4.2 us     send 30B message, receive 100B message 90%
echo100.99             5.5 us     send 30B message, receive 100B message 99%
echo100.999            7.4 us     send 30B message, receive 100B message 99.9%
echoBw100             24.8 MB/s   bandwidth receiving 100B messages
echo1K                 4.8 us     send 30B message, receive 1KB message median
echo1K.min             4.8 us     send 30B message, receive 1KB message minimum
echo1K.9               5.5 us     send 30B message, receive 1KB message 90%
echo1K.99              6.1 us     send 30B message, receive 1KB message 99%
echo1K.999             8.0 us     send 30B message, receive 1KB message 99.9%
echoBw1K             192.5 MB/s   bandwidth receiving 1KB messages
echo10K                7.8 us     send 30B message, receive 10KB message median
echo10K.min            7.6 us     send 30B message, receive 10KB message minimum
echo10K.9              8.3 us     send 30B message, receive 10KB message 90%
echo10K.99             9.0 us     send 30B message, receive 10KB message 99%
echo10K.999           10.6 us     send 30B message, receive 10KB message 99.9%
echoBw10K            1.196 GB/s   bandwidth receiving 10KB messages
echo100K              37.7 us     send 30B message, receive 100KB message median
echo100K.min          36.2 us     send 30B message, receive 100KB message minimum
echo100K.9            38.1 us     send 30B message, receive 100KB message 90%
echo100K.99           38.8 us     send 30B message, receive 100KB message 99%
echo100K.999          40.5 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.530 GB/s   bandwidth receiving 100KB messages
echo1M               332.2 us     send 30B message, receive 1MB message median
echo1M.min           330.4 us     send 30B message, receive 1MB message minimum
echo1M.9             332.8 us     send 30B message, receive 1MB message 90%
echo1M.99            333.5 us     send 30B message, receive 1MB message 99%
echo1M.999           335.6 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.939 GB/s   bandwidth receiving 1MB messages
infrc
basic.read100          4.5 us     read random 100B object (30B key) median
basic.read100.min      4.2 us     read random 100B object (30B key) minimum
basic.read100.9        5.2 us     read random 100B object (30B key) 90%
basic.read100.99       6.0 us     read random 100B object (30B key) 99%
basic.read100.999      8.3 us     read random 100B object (30B key) 99.9%
basic.readBw100       20.1 MB/s   bandwidth reading 100B objects (30B key)
basic.read1K           5.7 us     read random 1KB object (30B key) median
basic.read1K.min       5.5 us     read random 1KB object (30B key) minimum
basic.read1K.9         6.4 us     read random 1KB object (30B key) 90%
basic.read1K.99        7.1 us     read random 1KB object (30B key) 99%
basic.read1K.999       9.3 us     read random 1KB object (30B key) 99.9%
basic.readBw1K       165.9 MB/s   bandwidth reading 1KB objects (30B key)
basic.read10K          8.9 us     read random 10KB object (30B key) median
basic.read10K.min      8.2 us     read random 10KB object (30B key) minimum
basic.read10K.9        9.5 us     read random 10KB object (30B key) 90%
basic.read10K.99      10.3 us     read random 10KB object (30B key) 99%
basic.read10K.999     13.1 us     read random 10KB object (30B key) 99.9%
basic.readBw10K      1.060 GB/s   bandwidth reading 10KB objects (30B key)
basic.read100K        39.0 us     read random 100KB object (30B key) median
basic.read100K.min    36.5 us     read random 100KB object (30B key) minimum
basic.read100K.9      39.7 us     read random 100KB object (30B key) 90%
basic.read100K.99     40.7 us     read random 100KB object (30B key) 99%
basic.read100K.999    45.9 us     read random 100KB object (30B key) 99.9%
basic.readBw100K     2.443 GB/s   bandwidth reading 100KB objects (30B key)
basic.read1M         338.1 us     read random 1MB object (30B key) median
basic.read1M.min     332.7 us     read random 1MB object (30B key) minimum
basic.read1M.9       341.7 us     read random 1MB object (30B key) 90%
basic.read1M.99      343.5 us     read random 1MB object (30B key) 99%
basic.read1M.999     378.8 us     read random 1MB object (30B key) 99.9%
basic.readBw1M       2.888 GB/s   bandwidth reading 1MB objects (30B key)
basic.write100        13.8 us     write random 100B object (30B key) median
basic.write100.min    12.5 us     write random 100B object (30B key) minimum
basic.write100.9      14.7 us     write random 100B object (30B key) 90%
basic.write100.99     19.9 us     write random 100B object (30B key) 99%
basic.write100.999    87.4 us     write random 100B object (30B key) 99.9%
basic.writeBw100       6.7 MB/s   bandwidth writing 100B objects (30B key)
basic.write1K         16.5 us     write random 1KB object (30B key) median
basic.write1K.min     15.1 us     write random 1KB object (30B key) minimum
basic.write1K.9       17.4 us     write random 1KB object (30B key) 90%
basic.write1K.99      21.4 us     write random 1KB object (30B key) 99%
basic.write1K.999    112.7 us     write random 1KB object (30B key) 99.9%
basic.writeBw1K       57.6 MB/s   bandwidth writing 1KB objects (30B key)
basic.write10K        33.3 us     write random 10KB object (30B key) median
basic.write10K.min    31.6 us     write random 10KB object (30B key) minimum
basic.write10K.9      34.9 us     write random 10KB object (30B key) 90%
basic.write10K.99    131.6 us     write random 10KB object (30B key) 99%
basic.write10K.999   445.9 us     write random 10KB object (30B key) 99.9%
basic.writeBw10K     204.7 MB/s   bandwidth writing 10KB objects (30B key)
basic.write100K      227.0 us     write random 100KB object (30B key) median
basic.write100K.min  214.2 us     write random 100KB object (30B key) minimum
basic.write100K.9    272.8 us     write random 100KB object (30B key) 90%
basic.write100K.99   598.7 us     write random 100KB object (30B key) 99%
basic.write100K.999   50.0 ms     write random 100KB object (30B key) 99.9%
basic.writeBw100K    196.8 MB/s   bandwidth writing 100KB objects (30B key)
basic.write1M          2.2 ms     write random 1MB object (30B key) median
basic.write1M.min      2.1 ms     write random 1MB object (30B key) minimum
basic.write1M.9       16.4 ms     write random 1MB object (30B key) 90%
basic.write1M.99      42.5 ms     write random 1MB object (30B key) 99%
basic.writeBw1M      200.9 MB/s   bandwidth writing 1MB objects (30B key)

Notable changes/optimizations compared to the previous result:

  1. Enable servers to use hugepage memory to allocate LargeBlockOfMemory (commit 4232cee).
  2. Increase InfUdDriver's MTU from 2048 to 4096 (commit a2e0b8a).
  3. Increase ConnectX-2's PCIe MaxReadReq from 512 to 4096 via the command "setpci -s $(lspci | grep Mellanox | awk '{print $1}') 68.w=5020".
  4. Implement the selective signaling optimization in InfUdDriver (commit 1e639f4).

The hugepage optimization is used to reduce the address translation burden on the NIC to improve the throughput of the basic benchmark; it doesn't affect the performance numbers of echo_basic. Note that the read bandwidth is still slightly lower than the echo bandwidth because, in echo_basic, we send out a new RPC before destroying the old one to reduce the network idle time.

To keep the client's downlink always busy, we can use "--concurrentOps 2" to allow at most 2 outstanding RPCs in the echo_basic benchmark. Note that the resulting ~3GB/s is consistent with the maximum TX throughput of UD measured with ib_send_bw.

basic+infud, concurrentOps = 2
echo100                4.0 us     send 30B message, receive 100B message median
echo100.min            3.8 us     send 30B message, receive 100B message minimum
echo100.9              5.2 us     send 30B message, receive 100B message 90%
echo100.99             5.8 us     send 30B message, receive 100B message 99%
echo100.999           10.0 us     send 30B message, receive 100B message 99.9%
echoBw100             42.7 MB/s   bandwidth receiving 100B messages
echo1K                 5.3 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               6.1 us     send 30B message, receive 1KB message 90%
echo1K.99              7.1 us     send 30B message, receive 1KB message 99%
echo1K.999            11.2 us     send 30B message, receive 1KB message 99.9%
echoBw1K             346.3 MB/s   bandwidth receiving 1KB messages
echo10K                9.6 us     send 30B message, receive 10KB message median
echo10K.min            9.3 us     send 30B message, receive 10KB message minimum
echo10K.9             10.4 us     send 30B message, receive 10KB message 90%
echo10K.99            11.2 us     send 30B message, receive 10KB message 99%
echo10K.999           15.7 us     send 30B message, receive 10KB message 99.9%
echoBw10K            1.914 GB/s   bandwidth receiving 10KB messages
echo100K              64.4 us     send 30B message, receive 100KB message median
echo100K.min          46.0 us     send 30B message, receive 100KB message minimum
echo100K.9            67.9 us     send 30B message, receive 100KB message 90%
echo100K.99           73.0 us     send 30B message, receive 100KB message 99%
echo100K.999         130.8 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.948 GB/s   bandwidth receiving 100KB messages
echo1M               660.1 us     send 30B message, receive 1MB message median
echo1M.min           530.8 us     send 30B message, receive 1MB message minimum
echo1M.9             706.8 us     send 30B message, receive 1MB message 90%
echo1M.99            747.1 us     send 30B message, receive 1MB message 99%
echo1M.999           783.6 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.956 GB/s   bandwidth receiving 1MB messages


To test the maximum RX throughput, we can use "--concurrentOps 2 --servers 2" to have 2 servers echoing messages simultaneously in the echo_basic benchmark. The resulting 3.24GB/s also matches the numbers measured with ib_send_bw.

basic+infud, concurrentOps = 2, servers = 2
echo100                4.3 us     send 30B message, receive 100B message median
echo100.min            3.7 us     send 30B message, receive 100B message minimum
echo100.9              5.1 us     send 30B message, receive 100B message 90%
echo100.99             5.9 us     send 30B message, receive 100B message 99%
echo100.999           11.2 us     send 30B message, receive 100B message 99.9%
echoBw100             40.4 MB/s   bandwidth receiving 100B messages
echo1K                 5.2 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               5.8 us     send 30B message, receive 1KB message 90%
echo1K.99              6.9 us     send 30B message, receive 1KB message 99%
echo1K.999            10.5 us     send 30B message, receive 1KB message 99.9%
echoBw1K             342.5 MB/s   bandwidth receiving 1KB messages
echo10K                9.7 us     send 30B message, receive 10KB message median
echo10K.min            9.3 us     send 30B message, receive 10KB message minimum
echo10K.9             11.3 us     send 30B message, receive 10KB message 90%
echo10K.99            12.5 us     send 30B message, receive 10KB message 99%
echo10K.999           16.3 us     send 30B message, receive 10KB message 99.9%
echoBw10K            1.854 GB/s   bandwidth receiving 10KB messages
echo100K              65.1 us     send 30B message, receive 100KB message median
echo100K.min          38.4 us     send 30B message, receive 100KB message minimum
echo100K.9            66.1 us     send 30B message, receive 100KB message 90%
echo100K.99           67.4 us     send 30B message, receive 100KB message 99%
echo100K.999         156.4 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.903 GB/s   bandwidth receiving 100KB messages
echo1M               601.4 us     send 30B message, receive 1MB message median
echo1M.min           424.4 us     send 30B message, receive 1MB message minimum
echo1M.9             607.8 us     send 30B message, receive 1MB message 90%
echo1M.99            613.4 us     send 30B message, receive 1MB message 99%
echo1M.999           693.3 us     send 30B message, receive 1MB message 99.9%
echoBw1M             3.241 GB/s   bandwidth receiving 1MB messages

Out of the other three optimizations, 2 & 3 are crucial for improving InfUdDriver's maximum bandwidth utilization. For example, increasing the PCIe's MaxReadReq reduces the number of DMAs the NIC has to issue to read the packet payload. If we reduce MaxReadReq back to 512 but keep the other two optimizations, we have:

basic+infud
echo100                3.9 us     send 30B message, receive 100B message median
echo100.min            3.8 us     send 30B message, receive 100B message minimum
echo100.9              5.1 us     send 30B message, receive 100B message 90%
echo100.99             5.6 us     send 30B message, receive 100B message 99%
echo100.999            8.5 us     send 30B message, receive 100B message 99.9%
echoBw100             22.4 MB/s   bandwidth receiving 100B messages
echo1K                 5.1 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               5.3 us     send 30B message, receive 1KB message 90%
echo1K.99              6.5 us     send 30B message, receive 1KB message 99%
echo1K.999             9.4 us     send 30B message, receive 1KB message 99.9%
echoBw1K             182.0 MB/s   bandwidth receiving 1KB messages
echo10K                9.5 us     send 30B message, receive 10KB message median
echo10K.min            9.3 us     send 30B message, receive 10KB message minimum
echo10K.9             10.2 us     send 30B message, receive 10KB message 90%
echo10K.99            11.0 us     send 30B message, receive 10KB message 99%
echo10K.999           14.7 us     send 30B message, receive 10KB message 99.9%
echoBw10K            990.1 MB/s   bandwidth receiving 10KB messages
echo100K              39.6 us     send 30B message, receive 100KB message median
echo100K.min          38.6 us     send 30B message, receive 100KB message minimum
echo100K.9            40.2 us     send 30B message, receive 100KB message 90%
echo100K.99           42.1 us     send 30B message, receive 100KB message 99%
echo100K.999          47.6 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.396 GB/s   bandwidth receiving 100KB messages
echo1M               352.9 us     send 30B message, receive 1MB message median
echo1M.min           349.8 us     send 30B message, receive 1MB message minimum
echo1M.9             355.4 us     send 30B message, receive 1MB message 90%
echo1M.99            361.5 us     send 30B message, receive 1MB message 99%
echo1M.999           457.5 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.760 GB/s   bandwidth receiving 1MB messages
homa+infud
echo100                3.9 us     send 30B message, receive 100B message median
echo100.min            3.8 us     send 30B message, receive 100B message minimum
echo100.9              5.1 us     send 30B message, receive 100B message 90%
echo100.99             5.7 us     send 30B message, receive 100B message 99%
echo100.999           10.8 us     send 30B message, receive 100B message 99.9%
echoBw100             22.2 MB/s   bandwidth receiving 100B messages
echo1K                 5.1 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               5.3 us     send 30B message, receive 1KB message 90%
echo1K.99              6.5 us     send 30B message, receive 1KB message 99%
echo1K.999             9.6 us     send 30B message, receive 1KB message 99.9%
echoBw1K             182.7 MB/s   bandwidth receiving 1KB messages
echo10K                9.4 us     send 30B message, receive 10KB message median
echo10K.min            9.3 us     send 30B message, receive 10KB message minimum
echo10K.9             10.2 us     send 30B message, receive 10KB message 90%
echo10K.99            11.0 us     send 30B message, receive 10KB message 99%
echo10K.999           14.3 us     send 30B message, receive 10KB message 99.9%
echoBw10K            997.2 MB/s   bandwidth receiving 10KB messages
echo100K              40.1 us     send 30B message, receive 100KB message median
echo100K.min          38.7 us     send 30B message, receive 100KB message minimum
echo100K.9            41.3 us     send 30B message, receive 100KB message 90%
echo100K.99           44.7 us     send 30B message, receive 100KB message 99%
echo100K.999          49.1 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.353 GB/s   bandwidth receiving 100KB messages
echo1M               363.1 us     send 30B message, receive 1MB message median
echo1M.min           351.4 us     send 30B message, receive 1MB message minimum
echo1M.9             367.6 us     send 30B message, receive 1MB message 90%
echo1M.99            375.2 us     send 30B message, receive 1MB message 99%
echo1M.999           470.5 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.688 GB/s   bandwidth receiving 1MB messages


Similarly, if we reduce InfUdDriver's MTU to 2048 but keep the other changes, we have:

basic+infud
echo100                3.8 us     send 30B message, receive 100B message median
echo100.min            3.8 us     send 30B message, receive 100B message minimum
echo100.9              4.5 us     send 30B message, receive 100B message 90%
echo100.99             5.2 us     send 30B message, receive 100B message 99%
echo100.999            9.5 us     send 30B message, receive 100B message 99.9%
echoBw100             23.0 MB/s   bandwidth receiving 100B messages
echo1K                 5.1 us     send 30B message, receive 1KB message median
echo1K.min             5.0 us     send 30B message, receive 1KB message minimum
echo1K.9               5.2 us     send 30B message, receive 1KB message 90%
echo1K.99              6.4 us     send 30B message, receive 1KB message 99%
echo1K.999             9.3 us     send 30B message, receive 1KB message 99.9%
echoBw1K             185.0 MB/s   bandwidth receiving 1KB messages
echo10K                8.8 us     send 30B message, receive 10KB message median
echo10K.min            8.7 us     send 30B message, receive 10KB message minimum
echo10K.9              9.8 us     send 30B message, receive 10KB message 90%
echo10K.99            10.7 us     send 30B message, receive 10KB message 99%
echo10K.999           13.6 us     send 30B message, receive 10KB message 99.9%
echoBw10K            1.027 GB/s   bandwidth receiving 10KB messages
echo100K              40.2 us     send 30B message, receive 100KB message median
echo100K.min          38.9 us     send 30B message, receive 100KB message minimum
echo100K.9            41.1 us     send 30B message, receive 100KB message 90%
echo100K.99           43.2 us     send 30B message, receive 100KB message 99%
echo100K.999          49.0 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.352 GB/s   bandwidth receiving 100KB messages
echo1M               366.9 us     send 30B message, receive 1MB message median
echo1M.min           362.0 us     send 30B message, receive 1MB message minimum
echo1M.9             369.9 us     send 30B message, receive 1MB message 90%
echo1M.99            375.9 us     send 30B message, receive 1MB message 99%
echo1M.999           470.9 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.656 GB/s   bandwidth receiving 1MB messages
homa+infud
echo100                4.0 us     send 30B message, receive 100B message median
echo100.min            3.9 us     send 30B message, receive 100B message minimum
echo100.9              4.6 us     send 30B message, receive 100B message 90%
echo100.99             5.5 us     send 30B message, receive 100B message 99%
echo100.999           11.0 us     send 30B message, receive 100B message 99.9%
echoBw100             22.2 MB/s   bandwidth receiving 100B messages
echo1K                 5.1 us     send 30B message, receive 1KB message median
echo1K.min             5.1 us     send 30B message, receive 1KB message minimum
echo1K.9               5.5 us     send 30B message, receive 1KB message 90%
echo1K.99              6.7 us     send 30B message, receive 1KB message 99%
echo1K.999             9.9 us     send 30B message, receive 1KB message 99.9%
echoBw1K             180.7 MB/s   bandwidth receiving 1KB messages
echo10K                8.9 us     send 30B message, receive 10KB message median
echo10K.min            8.7 us     send 30B message, receive 10KB message minimum
echo10K.9              9.9 us     send 30B message, receive 10KB message 90%
echo10K.99            10.8 us     send 30B message, receive 10KB message 99%
echo10K.999           13.9 us     send 30B message, receive 10KB message 99.9%
echoBw10K            1.017 GB/s   bandwidth receiving 10KB messages
echo100K              41.5 us     send 30B message, receive 100KB message median
echo100K.min          39.3 us     send 30B message, receive 100KB message minimum
echo100K.9            43.3 us     send 30B message, receive 100KB message 90%
echo100K.99           47.9 us     send 30B message, receive 100KB message 99%
echo100K.999          60.8 us     send 30B message, receive 100KB message 99.9%
echoBw100K           2.267 GB/s   bandwidth receiving 100KB messages
echo1M               385.5 us     send 30B message, receive 1MB message median
echo1M.min           367.6 us     send 30B message, receive 1MB message minimum
echo1M.9             394.5 us     send 30B message, receive 1MB message 90%
echo1M.99            423.7 us     send 30B message, receive 1MB message 99%
echo1M.999           500.6 us     send 30B message, receive 1MB message 99.9%
echoBw1M             2.521 GB/s   bandwidth receiving 1MB messages


Finally, the selective signalling optimization reduces 1) the CPU overhead of transmitting a packet and 2) the overhead incurred by the HCA in generating Work Completion records. However, since our network throughput is currently limited by PCIe bandwidth rather than by CPU processing time, removing this optimization does not have a visible impact on the maximum read/echo throughput.