...
- Determined in userspace by the following:
/*
 * Userspace pointer-chasing latency microbenchmark.
 *
 * The buffer returned by getbuf() is treated as an array of 64-bit slots.
 * Every slot is filled with the address of another, randomly chosen slot,
 * and the program then repeatedly follows that chain from a random starting
 * slot, timing each walk with rdtsc(). Because each load depends on the
 * previous one, the average ticks per access approximates the latency of a
 * dependent load over a data set of maxmem bytes.
 *
 * getbuf() and rdtsc() are defined elsewhere; getbuf() is expected to return
 * at least maxmem writable bytes (NOTE(review): its failure convention is not
 * visible here, so no error check is attempted), and rdtsc() is presumed to
 * return a monotonically increasing tick counter.
 */

/* Size in bytes of the region that is linked and walked. */
const size_t maxmem = 1 * 1024 * 1024 * 1024;

int main(void)
{
    /* Upper bound on the number of dependent loads in a single walk. */
    const uint64_t loops = 100 * 1000 * 1000;
    const uint64_t walks = 100;
    uint64_t *buf = getbuf();
    uint64_t b, i, j, total;

    memset(buf, 0, maxmem);
    size_t maxidx = maxmem / sizeof(buf[0]);

    /* Randomly link our pointers: each slot holds the address of another slot. */
    for (size_t k = 0; k < maxidx; k++) {
        size_t idx = (size_t)random() % maxidx;
        buf[k] = (uint64_t)(uintptr_t)&buf[idx];
    }

    total = 0;
    for (j = 0; j < walks; j++) {
        uint64_t *p = &buf[(size_t)random() % maxidx];

        b = rdtsc();
        for (i = 0; i < loops; i++) {
            /*
             * Bit 0 marks a slot as already visited during this walk. Slot
             * addresses are uint64_t-aligned, so the bit is otherwise zero.
             */
            if (*p & 0x1) {
                printf("encountered loop after %" PRIu64 " walks; ", i);
                break;
            }
            /* Read the successor before tagging the slot as visited. */
            uint64_t *next = (uint64_t *)(uintptr_t)*p;
            *p |= 0x1;
            p = next;
        }
        uint64_t diff = rdtsc() - b;

        /* Guard the divisions below against a walk with zero accesses. */
        if (i == 0)
            i = 1;

        printf("walk %" PRIu64 " did %" PRIu64 " accesses in %" PRIu64
               " average ticks\n", j, i, (diff / i));
        total += (diff / i);

        /* Clean up the visited marks; touching every slot also wrecks the cache. */
        for (size_t k = 0; k < maxidx; k++)
            buf[k] &= ~(uint64_t)0x1;
    }

    printf("average of all walks: %" PRIu64 " ticks\n", total / j);
    return 0;
}
- Here getbuf() returns a 1GB region of virtual address space, sized by maxmem.
- Note that Phenom and Nehalem have about 2-3MB of L1 and L2 data TLB coverage. The Xeon is likely similar, if not less.
- All chips have < 10MB cache, so > 99% of the data set is uncached.
...