...
- Determined in userspace by the following:
/* Size in bytes of the region returned by getbuf(). */
const size_t maxmem = 1 * 1024 * 1024 * 1024;

/*
 * Pointer-chasing memory-latency microbenchmark.
 *
 * Every 64-bit slot of the buffer is filled with the address of another,
 * randomly chosen slot.  Each of the 100 trials then starts at a random slot
 * and follows the stored pointers, tagging visited slots by setting bit 0,
 * until it reaches a slot that is already tagged (a cycle) or hits the step
 * limit.  The average number of rdtsc() ticks per step is printed per trial.
 *
 * getbuf() and rdtsc() are provided elsewhere.
 * NOTE(review): assumes getbuf() returns at least maxmem bytes with 8-byte
 * alignment -- confirm against its definition.
 *
 * Returns 0.
 */
int main()
{
	uint64_t *buf = getbuf();
	/* NOTE(review): upper bound on the steps of one walk; confirm value. */
	const int loops = 100 * 1000 * 1000;
	uint64_t b, i, j;

	memset(buf, 0, maxmem);
	size_t maxidx = maxmem / sizeof(buf[0]);

	/* randomly link our pointers */
	for (i = 0; i < maxidx; i++) {
		int idx = random() % maxidx;

		buf[i] = (uint64_t)&buf[idx];
	}

	for (j = 0; j < 100; j++) {
		uint64_t *p = &buf[random() % maxidx];

		b = rdtsc();
		for (i = 0; i < loops; i++) {
			/* Bit 0 set means this slot was already visited. */
			if (*p & 0x1) {
				printf("encountered loop after %" PRIu64 " walks; ", i);
				break;
			}
			/* Read the link before tagging the slot as visited. */
			uint64_t *next = (uint64_t *)*p;

			*p |= 0x1;
			p = next;
		}
		uint64_t diff = rdtsc() - b;

		/* Avoid dividing by zero when the start slot was already tagged. */
		if (i == 0)
			i = 1;
		printf("%" PRIu64 " avg. ticks / walk\n", diff / i);

		/* clean up & wreck the cache */
		for (i = 0; i < maxidx; i++)
			buf[i] &= ~(uint64_t)0x1;
	}
	return 0;
}
- Where getbuf() returns a 1GB region of va, sized by maxmem (maxmem = 1 * 1024 * 1024 * 1024).
- Note that Phenom and Nehalem have about 2-3MB of L1 and L2 data TLB coverage. The Xeon is likely similar, if not less.
- All chips have < 10MB cache, so > 99% of the data set is uncached.
...