...
Processor | MHz | Architecture | Page Size | Ticks/Access | NSec/access |
---|---|---|---|---|---|
Phenom 9850 Quad-Core | 2500 | K10 | 4k | 529 491 | 212 196 |
Phenom 9850 Quad-Core | 2500 | K10 | 1g | 244 260 | 98 104 |
Xeon Dual-Core 3060 | 2400 | Core 2 | 4k | 262 357 | 109 149 |
Xeon Dual-Core 3060 | 2400 | Core 2 | 4m | 193 241 | 80 100 |
Core i7-920 | 2667 | Nehalem | 4k | 99 381 | 37 143 |
Core i7-920 | 2667 | Nehalem | 2m | 63 213 | 24 80 |
- Determined in userspace by the following:
/*
 * Code Block: pointer-chasing memory access benchmark.
 *
 * Fills a maxmem-byte buffer with a random pointer graph (every 8-byte
 * element holds the address of a randomly chosen element of the same
 * buffer), then performs up to 100 timed walks through it and prints the
 * average number of rdtsc() ticks per access.
 *
 * getbuf() and rdtsc() are defined elsewhere; getbuf() must supply at
 * least maxmem bytes.
 */
const size_t maxmem = 1 * 1024 * 1024 * 1024;

int main()
{
	const int loops = 100 * 1000 * 1000;	/* upper bound on steps per walk */
	uint64_t *buf = getbuf();
	uint64_t b, i, j, total;

	memset(buf, 0, maxmem);
	size_t maxidx = maxmem / sizeof(buf[0]);

	/* randomly link our pointers */
	for (i = 0; i < maxidx; i++) {
		int idx = random() % maxidx;
		buf[i] = (uint64_t)&buf[idx];
	}

	total = 0;
	for (j = 0; j < 100; j++) {
		/* each walk starts at a random element */
		uint64_t *p = &buf[random() % maxidx];

		b = rdtsc();
		for (i = 0; i < loops; i++) {
			/* low bit set: element already visited in this walk */
			if (*p & 0x1)
				break;
			/* read the link before tagging the element as visited */
			uint64_t *next = (uint64_t *)*p;
			*p |= 0x1;
			p = next;
		}
		uint64_t diff = rdtsc() - b;

		/* no access was made: nothing to average, stop walking */
		if (i == 0)
			break;
		printf("walk %" PRIu64 " did %" PRIu64 " accesses in %" PRIu64 " average ticks\n", j, i, (diff / i));
		total += (diff / i);

		/* clean up & wreck the cache: clear every visited mark */
		for (i = 0; i < maxidx; i++)
			buf[i] &= ~0x1;
	}

	/*
	 * j is the number of walks that contributed to total. If the first
	 * walk already bailed out, j is 0 and dividing by it would be
	 * undefined, so the summary is skipped in that case.
	 */
	if (j > 0)
		printf("average of all walks: %" PRIu64 " ticks\n", total / j);
	return 0;
}
- Here getbuf() returns a 1GB region of virtual address space, sized by maxmem.
- Note that Phenom and Nehalem have about 23MB of L1 and L2 data TLB coverage. The Xeon's coverage is likely similar, if somewhat smaller.
- All chips have < 10MB cache, so > 99% of the data set is uncached.
...